summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
authorJonathan Corbet <corbet@lwn.net>2019-07-22 13:42:10 -0600
committerJonathan Corbet <corbet@lwn.net>2019-07-22 13:42:10 -0600
commite27a24210aa17b8a0cd462865130fe73afd7e001 (patch)
treeb2d7b8311d280244fad3166792e8f447081a8bc9 /Documentation
parent224d5fd43d250f850d64fb6d668114aff29d7022 (diff)
parent5f9e832c137075045d15cd6899ab0505cfb2ca4b (diff)
downloadlinux-e27a24210aa17b8a0cd462865130fe73afd7e001.tar.gz
linux-e27a24210aa17b8a0cd462865130fe73afd7e001.tar.bz2
linux-e27a24210aa17b8a0cd462865130fe73afd7e001.zip
Merge tag 'v5.3-rc1' into docs-next
Pull in all of the massive docs changes from elsewhere.
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra2
-rw-r--r--Documentation/ABI/obsolete/sysfs-gpio2
-rw-r--r--Documentation/ABI/removed/sysfs-class-rfkill2
-rw-r--r--Documentation/ABI/stable/sysfs-class-infiniband17
-rw-r--r--Documentation/ABI/stable/sysfs-class-rfkill2
-rw-r--r--Documentation/ABI/stable/sysfs-devices-node2
-rw-r--r--Documentation/ABI/stable/sysfs-driver-mlxreg-io65
-rw-r--r--Documentation/ABI/testing/debugfs-cec-error-inj2
-rw-r--r--Documentation/ABI/testing/debugfs-cros-ec56
-rw-r--r--Documentation/ABI/testing/debugfs-driver-habanalabs18
-rw-r--r--Documentation/ABI/testing/debugfs-wilco-ec16
-rw-r--r--Documentation/ABI/testing/ima_policy6
-rw-r--r--Documentation/ABI/testing/procfs-diskstats2
-rw-r--r--Documentation/ABI/testing/procfs-smaps_rollup14
-rw-r--r--Documentation/ABI/testing/pstore4
-rw-r--r--Documentation/ABI/testing/sysfs-block2
-rw-r--r--Documentation/ABI/testing/sysfs-block-device2
-rw-r--r--Documentation/ABI/testing/sysfs-bus-css23
-rw-r--r--Documentation/ABI/testing/sysfs-bus-event_source-devices-format4
-rw-r--r--Documentation/ABI/testing/sysfs-bus-i2c-devices-hm635212
-rw-r--r--Documentation/ABI/testing/sysfs-bus-iio7
-rw-r--r--Documentation/ABI/testing/sysfs-bus-iio-cros-ec10
-rw-r--r--Documentation/ABI/testing/sysfs-bus-iio-distance-srf084
-rw-r--r--Documentation/ABI/testing/sysfs-bus-iio-frequency-adf437144
-rw-r--r--Documentation/ABI/testing/sysfs-bus-iio-proximity-as39354
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats24
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci-devices-cciss44
-rw-r--r--Documentation/ABI/testing/sysfs-bus-siox22
-rw-r--r--Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg22
-rw-r--r--Documentation/ABI/testing/sysfs-class-backlight-driver-lm35336
-rw-r--r--Documentation/ABI/testing/sysfs-class-cxl6
-rw-r--r--Documentation/ABI/testing/sysfs-class-devfreq2
-rw-r--r--Documentation/ABI/testing/sysfs-class-led-driver-lm35338
-rw-r--r--Documentation/ABI/testing/sysfs-class-leds-gt683r4
-rw-r--r--Documentation/ABI/testing/sysfs-class-net-phydev8
-rw-r--r--Documentation/ABI/testing/sysfs-class-net-qmi4
-rw-r--r--Documentation/ABI/testing/sysfs-class-power32
-rw-r--r--Documentation/ABI/testing/sysfs-class-power-wilco30
-rw-r--r--Documentation/ABI/testing/sysfs-class-powercap4
-rw-r--r--Documentation/ABI/testing/sysfs-class-switchtec2
-rw-r--r--Documentation/ABI/testing/sysfs-class-uwb_rc6
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu27
-rw-r--r--Documentation/ABI/testing/sysfs-driver-altera-cvp2
-rw-r--r--Documentation/ABI/testing/sysfs-driver-habanalabs42
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid12
-rw-r--r--Documentation/ABI/testing/sysfs-driver-hid-roccat-kone2
-rw-r--r--Documentation/ABI/testing/sysfs-driver-ppi2
-rw-r--r--Documentation/ABI/testing/sysfs-driver-st2
-rw-r--r--Documentation/ABI/testing/sysfs-driver-wacom2
-rw-r--r--Documentation/ABI/testing/sysfs-fs-f2fs8
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-fscaps2
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-iommu_groups9
-rw-r--r--Documentation/ABI/testing/sysfs-kernel-vmcoreinfo2
-rw-r--r--Documentation/ABI/testing/sysfs-platform-asus-laptop2
-rw-r--r--Documentation/ABI/testing/sysfs-platform-asus-wmi10
-rw-r--r--Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl4
-rw-r--r--Documentation/ABI/testing/sysfs-platform-wilco-ec40
-rw-r--r--Documentation/ABI/testing/sysfs-power2
-rw-r--r--Documentation/COPYING-logo (renamed from Documentation/logo.txt)0
-rw-r--r--Documentation/DMA-API-HOWTO.txt2
-rw-r--r--Documentation/EDID/howto.rst58
-rw-r--r--Documentation/PCI/MSI-HOWTO.txt270
-rw-r--r--Documentation/PCI/PCIEBUS-HOWTO.txt198
-rw-r--r--Documentation/PCI/acpi-info.rst192
-rw-r--r--Documentation/PCI/acpi-info.txt187
-rw-r--r--Documentation/PCI/endpoint/index.rst13
-rw-r--r--Documentation/PCI/endpoint/pci-endpoint-cfs.rst118
-rw-r--r--Documentation/PCI/endpoint/pci-endpoint-cfs.txt105
-rw-r--r--Documentation/PCI/endpoint/pci-endpoint.rst231
-rw-r--r--Documentation/PCI/endpoint/pci-endpoint.txt215
-rw-r--r--Documentation/PCI/endpoint/pci-test-function.rst103
-rw-r--r--Documentation/PCI/endpoint/pci-test-function.txt87
-rw-r--r--Documentation/PCI/endpoint/pci-test-howto.rst235
-rw-r--r--Documentation/PCI/endpoint/pci-test-howto.txt206
-rw-r--r--Documentation/PCI/index.rst18
-rw-r--r--Documentation/PCI/msi-howto.rst287
-rw-r--r--Documentation/PCI/pci-error-recovery.rst424
-rw-r--r--Documentation/PCI/pci-error-recovery.txt413
-rw-r--r--Documentation/PCI/pci-iov-howto.rst172
-rw-r--r--Documentation/PCI/pci-iov-howto.txt147
-rw-r--r--Documentation/PCI/pci.rst578
-rw-r--r--Documentation/PCI/pci.txt636
-rw-r--r--Documentation/PCI/pcieaer-howto.rst311
-rw-r--r--Documentation/PCI/pcieaer-howto.txt267
-rw-r--r--Documentation/PCI/picebus-howto.rst220
-rw-r--r--Documentation/RCU/rcuref.txt21
-rw-r--r--Documentation/RCU/stallwarn.txt2
-rw-r--r--Documentation/RCU/whatisRCU.txt8
-rw-r--r--Documentation/accelerators/ocxl.rst178
-rw-r--r--Documentation/accounting/cgroupstats.rst31
-rw-r--r--Documentation/accounting/cgroupstats.txt27
-rw-r--r--Documentation/accounting/delay-accounting.rst126
-rw-r--r--Documentation/accounting/delay-accounting.txt117
-rw-r--r--Documentation/accounting/index.rst14
-rw-r--r--Documentation/accounting/psi.rst182
-rw-r--r--Documentation/accounting/psi.txt180
-rw-r--r--Documentation/accounting/taskstats-struct.rst199
-rw-r--r--Documentation/accounting/taskstats-struct.txt180
-rw-r--r--Documentation/accounting/taskstats.rst180
-rw-r--r--Documentation/accounting/taskstats.txt181
-rw-r--r--Documentation/admin-guide/LSM/LoadPin.rst10
-rw-r--r--Documentation/admin-guide/aoe/aoe.rst150
-rw-r--r--Documentation/admin-guide/aoe/autoload.sh (renamed from Documentation/aoe/autoload.sh)0
-rw-r--r--Documentation/admin-guide/aoe/examples.rst (renamed from Documentation/aoe/examples.rst)0
-rw-r--r--Documentation/admin-guide/aoe/index.rst17
-rw-r--r--Documentation/admin-guide/aoe/status.sh (renamed from Documentation/aoe/status.sh)0
-rw-r--r--Documentation/admin-guide/aoe/todo.rst (renamed from Documentation/aoe/todo.rst)0
-rw-r--r--Documentation/admin-guide/aoe/udev-install.sh (renamed from Documentation/aoe/udev-install.sh)0
-rw-r--r--Documentation/admin-guide/aoe/udev.txt26
-rw-r--r--Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg (renamed from Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg)0
-rw-r--r--Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg (renamed from Documentation/blockdev/drbd/DRBD-data-packets.svg)0
-rw-r--r--Documentation/admin-guide/blockdev/drbd/conn-states-8.dot (renamed from Documentation/blockdev/drbd/conn-states-8.dot)0
-rw-r--r--Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst42
-rw-r--r--Documentation/admin-guide/blockdev/drbd/disk-states-8.dot (renamed from Documentation/blockdev/drbd/disk-states-8.dot)0
-rw-r--r--Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot (renamed from Documentation/blockdev/drbd/drbd-connection-state-overview.dot)0
-rw-r--r--Documentation/admin-guide/blockdev/drbd/figures.rst30
-rw-r--r--Documentation/admin-guide/blockdev/drbd/index.rst19
-rw-r--r--Documentation/admin-guide/blockdev/drbd/node-states-8.dot13
-rw-r--r--Documentation/admin-guide/blockdev/floppy.rst255
-rw-r--r--Documentation/admin-guide/blockdev/index.rst16
-rw-r--r--Documentation/admin-guide/blockdev/nbd.rst31
-rw-r--r--Documentation/admin-guide/blockdev/paride.rst439
-rw-r--r--Documentation/admin-guide/blockdev/ramdisk.rst177
-rw-r--r--Documentation/admin-guide/blockdev/zram.rst422
-rw-r--r--Documentation/admin-guide/btmrvl.rst (renamed from Documentation/btmrvl.txt)0
-rw-r--r--Documentation/admin-guide/bug-hunting.rst4
-rw-r--r--Documentation/admin-guide/cgroup-v1/blkio-controller.rst302
-rw-r--r--Documentation/admin-guide/cgroup-v1/cgroups.rst695
-rw-r--r--Documentation/admin-guide/cgroup-v1/cpuacct.rst50
-rw-r--r--Documentation/admin-guide/cgroup-v1/cpusets.rst866
-rw-r--r--Documentation/admin-guide/cgroup-v1/devices.rst132
-rw-r--r--Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst127
-rw-r--r--Documentation/admin-guide/cgroup-v1/hugetlb.rst50
-rw-r--r--Documentation/admin-guide/cgroup-v1/index.rst28
-rw-r--r--Documentation/admin-guide/cgroup-v1/memcg_test.rst355
-rw-r--r--Documentation/admin-guide/cgroup-v1/memory.rst1003
-rw-r--r--Documentation/admin-guide/cgroup-v1/net_cls.rst44
-rw-r--r--Documentation/admin-guide/cgroup-v1/net_prio.rst57
-rw-r--r--Documentation/admin-guide/cgroup-v1/pids.rst92
-rw-r--r--Documentation/admin-guide/cgroup-v1/rdma.rst117
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst26
-rw-r--r--Documentation/admin-guide/clearing-warn-once.rst (renamed from Documentation/clearing-warn-once.txt)0
-rw-r--r--Documentation/admin-guide/cpu-load.rst (renamed from Documentation/cpu-load.txt)0
-rw-r--r--Documentation/admin-guide/cputopology.rst177
-rw-r--r--Documentation/admin-guide/device-mapper/cache-policies.rst (renamed from Documentation/device-mapper/cache-policies.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/cache.rst (renamed from Documentation/device-mapper/cache.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/delay.rst (renamed from Documentation/device-mapper/delay.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-crypt.rst (renamed from Documentation/device-mapper/dm-crypt.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-dust.txt (renamed from Documentation/device-mapper/dm-dust.txt)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-flakey.rst (renamed from Documentation/device-mapper/dm-flakey.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-init.rst (renamed from Documentation/device-mapper/dm-init.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-integrity.rst (renamed from Documentation/device-mapper/dm-integrity.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-io.rst (renamed from Documentation/device-mapper/dm-io.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-log.rst (renamed from Documentation/device-mapper/dm-log.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-queue-length.rst (renamed from Documentation/device-mapper/dm-queue-length.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-raid.rst (renamed from Documentation/device-mapper/dm-raid.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-service-time.rst (renamed from Documentation/device-mapper/dm-service-time.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-uevent.rst (renamed from Documentation/device-mapper/dm-uevent.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/dm-zoned.rst (renamed from Documentation/device-mapper/dm-zoned.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/era.rst (renamed from Documentation/device-mapper/era.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/index.rst42
-rw-r--r--Documentation/admin-guide/device-mapper/kcopyd.rst (renamed from Documentation/device-mapper/kcopyd.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/linear.rst (renamed from Documentation/device-mapper/linear.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/log-writes.rst (renamed from Documentation/device-mapper/log-writes.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/persistent-data.rst (renamed from Documentation/device-mapper/persistent-data.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/snapshot.rst196
-rw-r--r--Documentation/admin-guide/device-mapper/statistics.rst225
-rw-r--r--Documentation/admin-guide/device-mapper/striped.rst (renamed from Documentation/device-mapper/striped.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/switch.rst (renamed from Documentation/device-mapper/switch.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/thin-provisioning.rst (renamed from Documentation/device-mapper/thin-provisioning.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/unstriped.rst (renamed from Documentation/device-mapper/unstriped.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/verity.rst (renamed from Documentation/device-mapper/verity.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/writecache.rst (renamed from Documentation/device-mapper/writecache.rst)0
-rw-r--r--Documentation/admin-guide/device-mapper/zero.rst (renamed from Documentation/device-mapper/zero.rst)0
-rw-r--r--Documentation/admin-guide/devices.txt4
-rw-r--r--Documentation/admin-guide/efi-stub.rst (renamed from Documentation/efi-stub.txt)0
-rw-r--r--Documentation/admin-guide/gpio/index.rst17
-rw-r--r--Documentation/admin-guide/gpio/sysfs.rst (renamed from Documentation/gpio/sysfs.rst)0
-rw-r--r--Documentation/admin-guide/highuid.rst (renamed from Documentation/highuid.txt)0
-rw-r--r--Documentation/admin-guide/hw-vuln/l1tf.rst2
-rw-r--r--Documentation/admin-guide/hw_random.rst (renamed from Documentation/hw_random.txt)0
-rw-r--r--Documentation/admin-guide/index.rst29
-rw-r--r--Documentation/admin-guide/iostats.rst (renamed from Documentation/iostats.txt)0
-rw-r--r--Documentation/admin-guide/kdump/gdbmacros.txt (renamed from Documentation/kdump/gdbmacros.txt)0
-rw-r--r--Documentation/admin-guide/kdump/index.rst20
-rw-r--r--Documentation/admin-guide/kdump/kdump.rst (renamed from Documentation/kdump/kdump.rst)0
-rw-r--r--Documentation/admin-guide/kdump/vmcoreinfo.rst (renamed from Documentation/kdump/vmcoreinfo.rst)0
-rw-r--r--Documentation/admin-guide/kernel-parameters.rst2
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt153
-rw-r--r--Documentation/admin-guide/kernel-per-CPU-kthreads.rst356
-rw-r--r--Documentation/admin-guide/laptops/asus-laptop.rst271
-rw-r--r--Documentation/admin-guide/laptops/disk-shock-protection.rst151
-rw-r--r--Documentation/admin-guide/laptops/index.rst17
-rw-r--r--Documentation/admin-guide/laptops/laptop-mode.rst781
-rw-r--r--Documentation/admin-guide/laptops/lg-laptop.rst84
-rw-r--r--Documentation/admin-guide/laptops/sony-laptop.rst174
-rw-r--r--Documentation/admin-guide/laptops/sonypi.rst158
-rw-r--r--Documentation/admin-guide/laptops/thinkpad-acpi.rst1562
-rw-r--r--Documentation/admin-guide/laptops/toshiba_haps.rst87
-rw-r--r--Documentation/admin-guide/lcd-panel-cgram.rst27
-rw-r--r--Documentation/admin-guide/ldm.rst (renamed from Documentation/ldm.txt)0
-rw-r--r--Documentation/admin-guide/lockup-watchdogs.rst (renamed from Documentation/lockup-watchdogs.txt)0
-rw-r--r--Documentation/admin-guide/mm/cma_debugfs.rst25
-rw-r--r--Documentation/admin-guide/mm/index.rst3
-rw-r--r--Documentation/admin-guide/mm/ksm.rst2
-rw-r--r--Documentation/admin-guide/mm/numa_memory_policy.rst2
-rw-r--r--Documentation/admin-guide/namespaces/compatibility-list.rst43
-rw-r--r--Documentation/admin-guide/namespaces/index.rst11
-rw-r--r--Documentation/admin-guide/namespaces/resource-control.rst18
-rw-r--r--Documentation/admin-guide/numastat.rst (renamed from Documentation/numastat.txt)0
-rw-r--r--Documentation/admin-guide/perf/arm-ccn.rst61
-rw-r--r--Documentation/admin-guide/perf/arm_dsu_pmu.rst29
-rw-r--r--Documentation/admin-guide/perf/hisi-pmu.rst60
-rw-r--r--Documentation/admin-guide/perf/index.rst16
-rw-r--r--Documentation/admin-guide/perf/qcom_l2_pmu.rst39
-rw-r--r--Documentation/admin-guide/perf/qcom_l3_pmu.rst26
-rw-r--r--Documentation/admin-guide/perf/thunderx2-pmu.rst42
-rw-r--r--Documentation/admin-guide/perf/xgene-pmu.rst49
-rw-r--r--Documentation/admin-guide/pnp.rst (renamed from Documentation/pnp.txt)0
-rw-r--r--Documentation/admin-guide/rapidio.rst (renamed from Documentation/driver-api/rapidio.rst)0
-rw-r--r--Documentation/admin-guide/rtc.rst (renamed from Documentation/rtc.txt)0
-rw-r--r--Documentation/admin-guide/svga.rst (renamed from Documentation/svga.txt)0
-rw-r--r--Documentation/admin-guide/sysctl/abi.rst67
-rw-r--r--Documentation/admin-guide/sysctl/fs.rst384
-rw-r--r--Documentation/admin-guide/sysctl/index.rst98
-rw-r--r--Documentation/admin-guide/sysctl/kernel.rst1177
-rw-r--r--Documentation/admin-guide/sysctl/net.rst461
-rw-r--r--Documentation/admin-guide/sysctl/sunrpc.rst25
-rw-r--r--Documentation/admin-guide/sysctl/user.rst78
-rw-r--r--Documentation/admin-guide/sysctl/vm.rst964
-rw-r--r--Documentation/admin-guide/video-output.rst (renamed from Documentation/video-output.txt)0
-rw-r--r--Documentation/admin-guide/xfs.rst466
-rw-r--r--Documentation/aoe/aoe.rst150
-rw-r--r--Documentation/aoe/index.rst19
-rw-r--r--Documentation/aoe/udev.txt26
-rw-r--r--Documentation/arm/Booting218
-rw-r--r--Documentation/arm/IXP4xx172
-rw-r--r--Documentation/arm/Interrupts167
-rw-r--r--Documentation/arm/Marvell/README395
-rw-r--r--Documentation/arm/Microchip/README169
-rw-r--r--Documentation/arm/Netwinder78
-rw-r--r--Documentation/arm/OMAP/DSS362
-rw-r--r--Documentation/arm/OMAP/README11
-rw-r--r--Documentation/arm/OMAP/omap_pm154
-rw-r--r--Documentation/arm/Porting135
-rw-r--r--Documentation/arm/README204
-rw-r--r--Documentation/arm/SA1100/ADSBitsy43
-rw-r--r--Documentation/arm/SA1100/Assabet300
-rw-r--r--Documentation/arm/SA1100/Brutus66
-rw-r--r--Documentation/arm/SA1100/CERF29
-rw-r--r--Documentation/arm/SA1100/FreeBird21
-rw-r--r--Documentation/arm/SA1100/GraphicsClient98
-rw-r--r--Documentation/arm/SA1100/GraphicsMaster53
-rw-r--r--Documentation/arm/SA1100/HUW_WEBPANEL17
-rw-r--r--Documentation/arm/SA1100/Itsy39
-rw-r--r--Documentation/arm/SA1100/LART14
-rw-r--r--Documentation/arm/SA1100/PLEB11
-rw-r--r--Documentation/arm/SA1100/Pangolin23
-rw-r--r--Documentation/arm/SA1100/Tifon7
-rw-r--r--Documentation/arm/SA1100/Yopy2
-rw-r--r--Documentation/arm/SA1100/empeg2
-rw-r--r--Documentation/arm/SA1100/nanoEngine11
-rw-r--r--Documentation/arm/SA1100/serial_UART47
-rw-r--r--Documentation/arm/SPEAr/overview.txt63
-rw-r--r--Documentation/arm/Samsung-S3C24XX/CPUfreq.txt75
-rw-r--r--Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt58
-rw-r--r--Documentation/arm/Samsung-S3C24XX/GPIO.txt171
-rw-r--r--Documentation/arm/Samsung-S3C24XX/H1940.txt40
-rw-r--r--Documentation/arm/Samsung-S3C24XX/NAND.txt30
-rw-r--r--Documentation/arm/Samsung-S3C24XX/Overview.txt318
-rw-r--r--Documentation/arm/Samsung-S3C24XX/S3C2412.txt120
-rw-r--r--Documentation/arm/Samsung-S3C24XX/S3C2413.txt21
-rw-r--r--Documentation/arm/Samsung-S3C24XX/SMDK2440.txt56
-rw-r--r--Documentation/arm/Samsung-S3C24XX/Suspend.txt137
-rw-r--r--Documentation/arm/Samsung-S3C24XX/USB-Host.txt93
-rw-r--r--Documentation/arm/Samsung/Bootloader-interface.txt68
-rw-r--r--Documentation/arm/Samsung/GPIO.txt40
-rw-r--r--Documentation/arm/Samsung/Overview.txt86
-rw-r--r--Documentation/arm/Setup129
-rw-r--r--Documentation/arm/VFP/release-notes.txt55
-rw-r--r--Documentation/arm/arm.rst214
-rw-r--r--Documentation/arm/booting.rst237
-rw-r--r--Documentation/arm/cluster-pm-race-avoidance.rst533
-rw-r--r--Documentation/arm/cluster-pm-race-avoidance.txt498
-rw-r--r--Documentation/arm/firmware.rst72
-rw-r--r--Documentation/arm/firmware.txt70
-rw-r--r--Documentation/arm/index.rst80
-rw-r--r--Documentation/arm/interrupts.rst169
-rw-r--r--Documentation/arm/ixp4xx.rst173
-rw-r--r--Documentation/arm/kernel_mode_neon.rst124
-rw-r--r--Documentation/arm/kernel_mode_neon.txt121
-rw-r--r--Documentation/arm/kernel_user_helpers.rst268
-rw-r--r--Documentation/arm/kernel_user_helpers.txt267
-rw-r--r--Documentation/arm/keystone/Overview.txt55
-rw-r--r--Documentation/arm/keystone/knav-qmss.rst60
-rw-r--r--Documentation/arm/keystone/knav-qmss.txt56
-rw-r--r--Documentation/arm/keystone/overview.rst74
-rw-r--r--Documentation/arm/marvel.rst488
-rw-r--r--Documentation/arm/mem_alignment58
-rw-r--r--Documentation/arm/mem_alignment.rst63
-rw-r--r--Documentation/arm/memory.rst93
-rw-r--r--Documentation/arm/memory.txt88
-rw-r--r--Documentation/arm/microchip.rst204
-rw-r--r--Documentation/arm/netwinder.rst85
-rw-r--r--Documentation/arm/nwfpe/NOTES29
-rw-r--r--Documentation/arm/nwfpe/README70
-rw-r--r--Documentation/arm/nwfpe/README.FPE156
-rw-r--r--Documentation/arm/nwfpe/TODO67
-rw-r--r--Documentation/arm/nwfpe/index.rst13
-rw-r--r--Documentation/arm/nwfpe/netwinder-fpe.rst162
-rw-r--r--Documentation/arm/nwfpe/notes.rst32
-rw-r--r--Documentation/arm/nwfpe/nwfpe.rst74
-rw-r--r--Documentation/arm/nwfpe/todo.rst72
-rw-r--r--Documentation/arm/omap/dss.rst372
-rw-r--r--Documentation/arm/omap/index.rst12
-rw-r--r--Documentation/arm/omap/omap.rst18
-rw-r--r--Documentation/arm/omap/omap_pm.rst165
-rw-r--r--Documentation/arm/porting.rst137
-rw-r--r--Documentation/arm/pxa/mfp.rst288
-rw-r--r--Documentation/arm/pxa/mfp.txt286
-rw-r--r--Documentation/arm/sa1100/adsbitsy.rst51
-rw-r--r--Documentation/arm/sa1100/assabet.rst301
-rw-r--r--Documentation/arm/sa1100/brutus.rst69
-rw-r--r--Documentation/arm/sa1100/cerf.rst35
-rw-r--r--Documentation/arm/sa1100/freebird.rst25
-rw-r--r--Documentation/arm/sa1100/graphicsclient.rst102
-rw-r--r--Documentation/arm/sa1100/graphicsmaster.rst60
-rw-r--r--Documentation/arm/sa1100/huw_webpanel.rst21
-rw-r--r--Documentation/arm/sa1100/index.rst25
-rw-r--r--Documentation/arm/sa1100/itsy.rst47
-rw-r--r--Documentation/arm/sa1100/lart.rst15
-rw-r--r--Documentation/arm/sa1100/nanoengine.rst11
-rw-r--r--Documentation/arm/sa1100/pangolin.rst29
-rw-r--r--Documentation/arm/sa1100/pleb.rst13
-rw-r--r--Documentation/arm/sa1100/serial_uart.rst51
-rw-r--r--Documentation/arm/sa1100/tifon.rst7
-rw-r--r--Documentation/arm/sa1100/yopy.rst5
-rw-r--r--Documentation/arm/samsung-s3c24xx/cpufreq.rst76
-rw-r--r--Documentation/arm/samsung-s3c24xx/eb2410itx.rst59
-rw-r--r--Documentation/arm/samsung-s3c24xx/gpio.rst172
-rw-r--r--Documentation/arm/samsung-s3c24xx/h1940.rst41
-rw-r--r--Documentation/arm/samsung-s3c24xx/index.rst20
-rw-r--r--Documentation/arm/samsung-s3c24xx/nand.rst30
-rw-r--r--Documentation/arm/samsung-s3c24xx/overview.rst319
-rw-r--r--Documentation/arm/samsung-s3c24xx/s3c2412.rst121
-rw-r--r--Documentation/arm/samsung-s3c24xx/s3c2413.rst22
-rw-r--r--Documentation/arm/samsung-s3c24xx/smdk2440.rst57
-rw-r--r--Documentation/arm/samsung-s3c24xx/suspend.rst137
-rw-r--r--Documentation/arm/samsung-s3c24xx/usb-host.rst91
-rw-r--r--Documentation/arm/samsung/bootloader-interface.rst81
-rwxr-xr-xDocumentation/arm/samsung/clksrc-change-registers.awk (renamed from Documentation/arm/Samsung/clksrc-change-registers.awk)0
-rw-r--r--Documentation/arm/samsung/gpio.rst41
-rw-r--r--Documentation/arm/samsung/index.rst12
-rw-r--r--Documentation/arm/samsung/overview.rst89
-rw-r--r--Documentation/arm/setup.rst108
-rw-r--r--Documentation/arm/sh-mobile/.gitignore (renamed from Documentation/arm/SH-Mobile/.gitignore)0
-rw-r--r--Documentation/arm/spear/overview.rst66
-rw-r--r--Documentation/arm/sti/overview.rst36
-rw-r--r--Documentation/arm/sti/overview.txt33
-rw-r--r--Documentation/arm/sti/stih407-overview.rst19
-rw-r--r--Documentation/arm/sti/stih407-overview.txt18
-rw-r--r--Documentation/arm/sti/stih415-overview.rst14
-rw-r--r--Documentation/arm/sti/stih415-overview.txt12
-rw-r--r--Documentation/arm/sti/stih416-overview.rst13
-rw-r--r--Documentation/arm/sti/stih416-overview.txt12
-rw-r--r--Documentation/arm/sti/stih418-overview.rst21
-rw-r--r--Documentation/arm/sti/stih418-overview.txt20
-rw-r--r--Documentation/arm/stm32/overview.rst2
-rw-r--r--Documentation/arm/stm32/stm32f429-overview.rst7
-rw-r--r--Documentation/arm/stm32/stm32f746-overview.rst7
-rw-r--r--Documentation/arm/stm32/stm32f769-overview.rst7
-rw-r--r--Documentation/arm/stm32/stm32h743-overview.rst7
-rw-r--r--Documentation/arm/stm32/stm32mp157-overview.rst3
-rw-r--r--Documentation/arm/sunxi.rst150
-rw-r--r--Documentation/arm/sunxi/README102
-rw-r--r--Documentation/arm/sunxi/clocks.rst57
-rw-r--r--Documentation/arm/sunxi/clocks.txt56
-rw-r--r--Documentation/arm/swp_emulation27
-rw-r--r--Documentation/arm/swp_emulation.rst27
-rw-r--r--Documentation/arm/tcm.rst161
-rw-r--r--Documentation/arm/tcm.txt155
-rw-r--r--Documentation/arm/uefi.rst67
-rw-r--r--Documentation/arm/uefi.txt60
-rw-r--r--Documentation/arm/vfp/release-notes.rst57
-rw-r--r--Documentation/arm/vlocks.rst212
-rw-r--r--Documentation/arm/vlocks.txt211
-rw-r--r--Documentation/arm64/booting.rst2
-rw-r--r--Documentation/arm64/elf_hwcaps.rst8
-rw-r--r--Documentation/arm64/index.rst2
-rw-r--r--Documentation/arm64/silicon-errata.rst2
-rw-r--r--Documentation/arm64/sve.rst16
-rw-r--r--Documentation/atomic_t.txt26
-rw-r--r--Documentation/auxdisplay/lcd-panel-cgram.txt24
-rw-r--r--Documentation/backlight/lp855x-driver.txt66
-rw-r--r--Documentation/block/bfq-iosched.rst597
-rw-r--r--Documentation/block/bfq-iosched.txt583
-rw-r--r--Documentation/block/biodoc.rst1164
-rw-r--r--Documentation/block/biodoc.txt1077
-rw-r--r--Documentation/block/biovecs.rst146
-rw-r--r--Documentation/block/biovecs.txt144
-rw-r--r--Documentation/block/capability.rst18
-rw-r--r--Documentation/block/capability.txt15
-rw-r--r--Documentation/block/cmdline-partition.rst53
-rw-r--r--Documentation/block/cmdline-partition.txt46
-rw-r--r--Documentation/block/data-integrity.rst291
-rw-r--r--Documentation/block/data-integrity.txt281
-rw-r--r--Documentation/block/deadline-iosched.rst72
-rw-r--r--Documentation/block/deadline-iosched.txt75
-rw-r--r--Documentation/block/index.rst25
-rw-r--r--Documentation/block/ioprio.rst182
-rw-r--r--Documentation/block/ioprio.txt183
-rw-r--r--Documentation/block/kyber-iosched.rst15
-rw-r--r--Documentation/block/kyber-iosched.txt14
-rw-r--r--Documentation/block/null_blk.rst126
-rw-r--r--Documentation/block/null_blk.txt99
-rw-r--r--Documentation/block/pr.rst119
-rw-r--r--Documentation/block/pr.txt119
-rw-r--r--Documentation/block/queue-sysfs.rst254
-rw-r--r--Documentation/block/queue-sysfs.txt231
-rw-r--r--Documentation/block/request.rst99
-rw-r--r--Documentation/block/request.txt88
-rw-r--r--Documentation/block/stat.rst93
-rw-r--r--Documentation/block/stat.txt86
-rw-r--r--Documentation/block/switching-sched.rst39
-rw-r--r--Documentation/block/switching-sched.txt37
-rw-r--r--Documentation/block/writeback_cache_control.rst86
-rw-r--r--Documentation/block/writeback_cache_control.txt86
-rw-r--r--Documentation/blockdev/drbd/README.txt16
-rw-r--r--Documentation/blockdev/drbd/data-structure-v9.txt38
-rw-r--r--Documentation/blockdev/drbd/node-states-8.dot14
-rw-r--r--Documentation/blockdev/floppy.txt245
-rw-r--r--Documentation/blockdev/nbd.txt31
-rw-r--r--Documentation/blockdev/paride.txt417
-rw-r--r--Documentation/blockdev/ramdisk.txt174
-rw-r--r--Documentation/blockdev/zram.txt355
-rw-r--r--Documentation/bpf/bpf_design_QA.rst30
-rw-r--r--Documentation/bpf/index.rst1
-rw-r--r--Documentation/bpf/prog_cgroup_sockopt.rst93
-rw-r--r--Documentation/bus-devices/ti-gpmc.txt122
-rw-r--r--Documentation/cdrom/index.rst2
-rw-r--r--Documentation/cgroup-v1/blkio-controller.txt375
-rw-r--r--Documentation/cgroup-v1/cgroups.txt677
-rw-r--r--Documentation/cgroup-v1/cpuacct.txt49
-rw-r--r--Documentation/cgroup-v1/cpusets.txt839
-rw-r--r--Documentation/cgroup-v1/devices.txt116
-rw-r--r--Documentation/cgroup-v1/freezer-subsystem.txt123
-rw-r--r--Documentation/cgroup-v1/hugetlb.txt45
-rw-r--r--Documentation/cgroup-v1/memcg_test.txt280
-rw-r--r--Documentation/cgroup-v1/memory.txt892
-rw-r--r--Documentation/cgroup-v1/net_cls.txt39
-rw-r--r--Documentation/cgroup-v1/net_prio.txt55
-rw-r--r--Documentation/cgroup-v1/pids.txt88
-rw-r--r--Documentation/cgroup-v1/rdma.txt109
-rw-r--r--Documentation/cma/debugfs.txt21
-rw-r--r--Documentation/connector/connector.txt196
-rw-r--r--Documentation/console/console.txt145
-rw-r--r--Documentation/core-api/circular-buffers.rst2
-rw-r--r--Documentation/core-api/gcc-plugins.rst (renamed from Documentation/gcc-plugins.txt)0
-rw-r--r--Documentation/core-api/index.rst1
-rw-r--r--Documentation/core-api/kernel-api.rst2
-rw-r--r--Documentation/core-api/printk-formats.rst2
-rw-r--r--Documentation/core-api/timekeeping.rst12
-rw-r--r--Documentation/cpu-freq/core.txt2
-rw-r--r--Documentation/cputopology.txt159
-rw-r--r--Documentation/crypto/api-samples.rst176
-rw-r--r--Documentation/crypto/api-skcipher.rst2
-rw-r--r--Documentation/crypto/architecture.rst4
-rw-r--r--Documentation/crypto/crypto_engine.rst111
-rw-r--r--Documentation/dev-tools/kmemleak.rst48
-rw-r--r--Documentation/dev-tools/sparse.rst5
-rw-r--r--Documentation/device-mapper/index.rst44
-rw-r--r--Documentation/device-mapper/snapshot.rst180
-rw-r--r--Documentation/device-mapper/statistics.rst225
-rw-r--r--Documentation/devicetree/bindings/Makefile2
-rw-r--r--Documentation/devicetree/bindings/arm/al,alpine.txt16
-rw-r--r--Documentation/devicetree/bindings/arm/al,alpine.yaml21
-rw-r--r--Documentation/devicetree/bindings/arm/amlogic.txt142
-rw-r--r--Documentation/devicetree/bindings/arm/amlogic.yaml144
-rw-r--r--Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt28
-rw-r--r--Documentation/devicetree/bindings/arm/arm,scmi.txt2
-rw-r--r--Documentation/devicetree/bindings/arm/arm-boards2
-rw-r--r--Documentation/devicetree/bindings/arm/atmel-at91.txt73
-rw-r--r--Documentation/devicetree/bindings/arm/atmel-at91.yaml134
-rw-r--r--Documentation/devicetree/bindings/arm/axxia.txt12
-rw-r--r--Documentation/devicetree/bindings/arm/axxia.yaml19
-rw-r--r--Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt4
-rw-r--r--Documentation/devicetree/bindings/arm/coresight.txt8
-rw-r--r--Documentation/devicetree/bindings/arm/cpus.yaml487
-rw-r--r--Documentation/devicetree/bindings/arm/digicolor.txt6
-rw-r--r--Documentation/devicetree/bindings/arm/digicolor.yaml16
-rw-r--r--Documentation/devicetree/bindings/arm/emtrion.txt12
-rw-r--r--Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt37
-rw-r--r--Documentation/devicetree/bindings/arm/fsl.yaml44
-rw-r--r--Documentation/devicetree/bindings/arm/idle-states.txt15
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek.txt89
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek.yaml91
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt1
-rw-r--r--Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt2
-rw-r--r--Documentation/devicetree/bindings/arm/moxart.txt12
-rw-r--r--Documentation/devicetree/bindings/arm/moxart.yaml19
-rw-r--r--Documentation/devicetree/bindings/arm/nxp/lpc32xx.txt8
-rw-r--r--Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml25
-rw-r--r--Documentation/devicetree/bindings/arm/omap/omap.txt3
-rw-r--r--Documentation/devicetree/bindings/arm/psci.txt111
-rw-r--r--Documentation/devicetree/bindings/arm/psci.yaml163
-rw-r--r--Documentation/devicetree/bindings/arm/qcom.yaml14
-rw-r--r--Documentation/devicetree/bindings/arm/rda.txt17
-rw-r--r--Documentation/devicetree/bindings/arm/rda.yaml20
-rw-r--r--Documentation/devicetree/bindings/arm/renesas.yaml8
-rw-r--r--Documentation/devicetree/bindings/arm/rockchip.yaml13
-rw-r--r--Documentation/devicetree/bindings/arm/stm32/mlahb.txt37
-rw-r--r--Documentation/devicetree/bindings/arm/stm32/stm32.txt10
-rw-r--r--Documentation/devicetree/bindings/arm/stm32/stm32.yaml31
-rw-r--r--Documentation/devicetree/bindings/arm/sunxi.yaml2
-rw-r--r--Documentation/devicetree/bindings/arm/ti/k3.txt3
-rw-r--r--Documentation/devicetree/bindings/arm/xen.txt2
-rw-r--r--Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml80
-rw-r--r--Documentation/devicetree/bindings/bus/sunxi-rsb.txt47
-rw-r--r--Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml141
-rw-r--r--Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt1
-rw-r--r--Documentation/devicetree/bindings/clock/at91-clock.txt7
-rw-r--r--Documentation/devicetree/bindings/clock/brcm,bcm63xx-clocks.txt22
-rw-r--r--Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt1
-rw-r--r--Documentation/devicetree/bindings/clock/mvebu-core-clock.txt1
-rw-r--r--Documentation/devicetree/bindings/clock/qcom,gpucc.txt4
-rw-r--r--Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt7
-rw-r--r--Documentation/devicetree/bindings/clock/silabs,si5341.txt162
-rw-r--r--Documentation/devicetree/bindings/clock/sunxi-ccu.txt62
-rw-r--r--Documentation/devicetree/bindings/common-properties.txt17
-rw-r--r--Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt37
-rw-r--r--Documentation/devicetree/bindings/crypto/atmel-crypto.txt13
-rw-r--r--Documentation/devicetree/bindings/csky/pmu.txt38
-rw-r--r--Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml100
-rw-r--r--Documentation/devicetree/bindings/display/arm,komeda.txt23
-rw-r--r--Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt4
-rw-r--r--Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt19
-rw-r--r--Documentation/devicetree/bindings/display/bridge/sii902x.txt42
-rw-r--r--Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt6
-rw-r--r--Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt1
-rw-r--r--Documentation/devicetree/bindings/display/ingenic,lcd.txt44
-rw-r--r--Documentation/devicetree/bindings/display/msm/dpu.txt10
-rw-r--r--Documentation/devicetree/bindings/display/msm/dsi.txt1
-rw-r--r--Documentation/devicetree/bindings/display/panel/armadeus,st0700-adapt.txt9
-rw-r--r--Documentation/devicetree/bindings/display/panel/edt,et-series.txt16
-rw-r--r--Documentation/devicetree/bindings/display/panel/evervision,vgg804821.txt12
-rw-r--r--Documentation/devicetree/bindings/display/panel/friendlyarm,hd702e.txt32
-rw-r--r--Documentation/devicetree/bindings/display/panel/koe,tx14d24vm1bpa.txt42
-rw-r--r--Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2045-53ts.txt11
-rw-r--r--Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2587-53ts.txt14
-rw-r--r--Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.txt33
-rw-r--r--Documentation/devicetree/bindings/display/panel/tfc,s9700rtwv43tr-01b.txt15
-rw-r--r--Documentation/devicetree/bindings/display/panel/vl050_8048nt_c01.txt12
-rw-r--r--Documentation/devicetree/bindings/display/renesas,du.txt2
-rw-r--r--Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt8
-rw-r--r--Documentation/devicetree/bindings/display/simple-framebuffer.yaml25
-rw-r--r--Documentation/devicetree/bindings/display/st,stm32-ltdc.txt3
-rw-r--r--Documentation/devicetree/bindings/display/sunxi/sun6i-dsi.txt93
-rw-r--r--Documentation/devicetree/bindings/dma/8250_mtk_dma.txt33
-rw-r--r--Documentation/devicetree/bindings/dma/arm-pl330.txt3
-rw-r--r--Documentation/devicetree/bindings/dma/fsl-edma.txt44
-rw-r--r--Documentation/devicetree/bindings/dma/fsl-qdma.txt1
-rw-r--r--Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt54
-rw-r--r--Documentation/devicetree/bindings/dma/sun6i-dma.txt9
-rw-r--r--Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt19
-rw-r--r--Documentation/devicetree/bindings/gpio/gpio-davinci.txt18
-rw-r--r--Documentation/devicetree/bindings/gpio/pl061-gpio.txt10
-rw-r--r--Documentation/devicetree/bindings/gpio/pl061-gpio.yaml69
-rw-r--r--Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt20
-rw-r--r--Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt1
-rw-r--r--Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt25
-rw-r--r--Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml65
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mt7621.txt25
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt64
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-ocores.txt9
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-omap.txt1
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-stm32.txt2
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-sun6i-p2wi.txt41
-rw-r--r--Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml124
-rw-r--r--Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt2
-rw-r--r--Documentation/devicetree/bindings/i3c/i3c.txt4
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml72
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml63
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adxl345.txt39
-rw-r--r--Documentation/devicetree/bindings/iio/accel/adxl372.txt33
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt75
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml160
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7780.txt48
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml87
-rw-r--r--Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt2
-rw-r--r--Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt1
-rw-r--r--Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.txt12
-rw-r--r--Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.yaml39
-rw-r--r--Documentation/devicetree/bindings/iio/frequency/adf4371.yaml63
-rw-r--r--Documentation/devicetree/bindings/iio/light/isl29018.txt27
-rw-r--r--Documentation/devicetree/bindings/iio/light/isl29018.yaml56
-rw-r--r--Documentation/devicetree/bindings/iio/light/tsl2583.txt25
-rw-r--r--Documentation/devicetree/bindings/iio/light/tsl2583.yaml46
-rw-r--r--Documentation/devicetree/bindings/iio/light/tsl2772.txt42
-rw-r--r--Documentation/devicetree/bindings/iio/light/tsl2772.yaml83
-rw-r--r--Documentation/devicetree/bindings/input/elan_i2c.txt11
-rw-r--r--Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt1
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt29
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt1
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt20
-rw-r--r--Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt43
-rw-r--r--Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt2
-rw-r--r--Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml21
-rw-r--r--Documentation/devicetree/bindings/leds/leds-lm36274.txt85
-rw-r--r--Documentation/devicetree/bindings/leds/leds-lm3697.txt73
-rw-r--r--Documentation/devicetree/bindings/leds/leds-spi-byte.txt44
-rw-r--r--Documentation/devicetree/bindings/mailbox/omap-mailbox.txt59
-rw-r--r--Documentation/devicetree/bindings/media/allegro.txt43
-rw-r--r--Documentation/devicetree/bindings/media/amlogic,vdec.txt71
-rw-r--r--Documentation/devicetree/bindings/media/imx7-csi.txt9
-rw-r--r--Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt50
-rw-r--r--Documentation/devicetree/bindings/media/st,stm32-dcmi.txt2
-rw-r--r--Documentation/devicetree/bindings/media/sun6i-csi.txt1
-rw-r--r--Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt1
-rw-r--r--Documentation/devicetree/bindings/mfd/atmel-usart.txt20
-rw-r--r--Documentation/devicetree/bindings/mfd/cros-ec.txt5
-rw-r--r--Documentation/devicetree/bindings/mfd/lp87565.txt36
-rw-r--r--Documentation/devicetree/bindings/mfd/madera.txt8
-rw-r--r--Documentation/devicetree/bindings/mfd/rk808.txt44
-rw-r--r--Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt102
-rw-r--r--Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt10
-rw-r--r--Documentation/devicetree/bindings/mfd/ti-lmu.txt88
-rw-r--r--Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt11
-rw-r--r--Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt23
-rw-r--r--Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt58
-rw-r--r--Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml98
-rw-r--r--Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt4
-rw-r--r--Documentation/devicetree/bindings/mmc/mmc-controller.yaml374
-rw-r--r--Documentation/devicetree/bindings/mmc/mmc.txt178
-rw-r--r--Documentation/devicetree/bindings/mmc/renesas,sdhi.txt111
-rw-r--r--Documentation/devicetree/bindings/mmc/sdhci-am654.txt9
-rw-r--r--Documentation/devicetree/bindings/mmc/sdhci-sprd.txt26
-rw-r--r--Documentation/devicetree/bindings/mmc/sunxi-mmc.txt52
-rw-r--r--Documentation/devicetree/bindings/mmc/tmio_mmc.txt120
-rw-r--r--Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml2
-rw-r--r--Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt5
-rw-r--r--Documentation/devicetree/bindings/mtd/cadence-quadspi.txt5
-rw-r--r--Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt13
-rw-r--r--Documentation/devicetree/bindings/mtd/nand-controller.yaml1
-rw-r--r--Documentation/devicetree/bindings/mtd/stm32-quadspi.txt43
-rw-r--r--Documentation/devicetree/bindings/mtd/ti,am654-hbmc.txt51
-rw-r--r--Documentation/devicetree/bindings/mux/mmio-mux.txt60
-rw-r--r--Documentation/devicetree/bindings/mux/reg-mux.txt129
-rw-r--r--Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml56
-rw-r--r--Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml70
-rw-r--r--Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt19
-rw-r--r--Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt27
-rw-r--r--Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt27
-rw-r--r--Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml65
-rw-r--r--Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml321
-rw-r--r--Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt1
-rw-r--r--Documentation/devicetree/bindings/net/can/rcar_can.txt13
-rw-r--r--Documentation/devicetree/bindings/net/can/rcar_canfd.txt16
-rw-r--r--Documentation/devicetree/bindings/net/dsa/ksz.txt2
-rw-r--r--Documentation/devicetree/bindings/net/dsa/marvell.txt7
-rw-r--r--Documentation/devicetree/bindings/net/dsa/qca8k.txt6
-rw-r--r--Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt58
-rw-r--r--Documentation/devicetree/bindings/net/dwmac-sun8i.txt201
-rw-r--r--Documentation/devicetree/bindings/net/ethernet-controller.yaml206
-rw-r--r--Documentation/devicetree/bindings/net/ethernet-phy.yaml177
-rw-r--r--Documentation/devicetree/bindings/net/ethernet.txt68
-rw-r--r--Documentation/devicetree/bindings/net/fixed-link.txt55
-rw-r--r--Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt7
-rw-r--r--Documentation/devicetree/bindings/net/keystone-netcp.txt44
-rw-r--r--Documentation/devicetree/bindings/net/macb.txt3
-rw-r--r--Documentation/devicetree/bindings/net/marvell-bluetooth.txt25
-rw-r--r--Documentation/devicetree/bindings/net/marvell-orion-mdio.txt2
-rw-r--r--Documentation/devicetree/bindings/net/mdio.txt38
-rw-r--r--Documentation/devicetree/bindings/net/mdio.yaml74
-rw-r--r--Documentation/devicetree/bindings/net/mediatek-bluetooth.txt17
-rw-r--r--Documentation/devicetree/bindings/net/mediatek-net.txt14
-rw-r--r--Documentation/devicetree/bindings/net/phy.txt80
-rw-r--r--Documentation/devicetree/bindings/net/qca,ar71xx.txt45
-rw-r--r--Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt4
-rw-r--r--Documentation/devicetree/bindings/net/snps,dwmac.yaml411
-rw-r--r--Documentation/devicetree/bindings/net/socfpga-dwmac.txt10
-rw-r--r--Documentation/devicetree/bindings/net/stmmac.txt179
-rw-r--r--Documentation/devicetree/bindings/net/ti,dp83867.txt14
-rw-r--r--Documentation/devicetree/bindings/net/wiznet,w5x00.txt50
-rw-r--r--Documentation/devicetree/bindings/net/xilinx_axienet.txt29
-rw-r--r--Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml51
-rw-r--r--Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt29
-rw-r--r--Documentation/devicetree/bindings/nvmem/imx-ocotp.txt1
-rw-r--r--Documentation/devicetree/bindings/pci/83xx-512x-pci.txt1
-rw-r--r--Documentation/devicetree/bindings/pci/mobiveil-pcie.txt2
-rw-r--r--Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt8
-rw-r--r--Documentation/devicetree/bindings/pci/pci.txt3
-rw-r--r--Documentation/devicetree/bindings/pci/qcom,pcie.txt25
-rw-r--r--Documentation/devicetree/bindings/pci/rcar-pci.txt1
-rw-r--r--Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt21
-rw-r--r--Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml57
-rw-r--r--Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt29
-rw-r--r--Documentation/devicetree/bindings/phy/mxs-usb-phy.txt3
-rw-r--r--Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt12
-rw-r--r--Documentation/devicetree/bindings/phy/phy-bindings.txt2
-rw-r--r--Documentation/devicetree/bindings/phy/phy-pxa-usb.txt18
-rw-r--r--Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt42
-rw-r--r--Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt13
-rw-r--r--Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt2
-rw-r--r--Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml83
-rw-r--r--Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml133
-rw-r--r--Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt34
-rw-r--r--Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt2
-rw-r--r--Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.txt39
-rw-r--r--Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt44
-rw-r--r--Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt16
-rw-r--r--Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt2
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt2
-rw-r--r--Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt107
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt172
-rw-r--r--Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt3
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt16
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt6
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt8
-rw-r--r--Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt190
-rw-r--r--Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt208
-rw-r--r--Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml271
-rw-r--r--Documentation/devicetree/bindings/power/qcom,rpmpd.txt2
-rw-r--r--Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.txt26
-rw-r--r--Documentation/devicetree/bindings/power/reset/qcom,pon.txt1
-rw-r--r--Documentation/devicetree/bindings/property-units.txt34
-rw-r--r--Documentation/devicetree/bindings/ptp/ptp-qoriq.txt2
-rw-r--r--Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml57
-rw-r--r--Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt5
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-sifive.txt33
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt9
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-stm32.txt3
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-sun4i.txt24
-rw-r--r--Documentation/devicetree/bindings/regulator/arizona-regulator.txt3
-rw-r--r--Documentation/devicetree/bindings/regulator/fixed-regulator.yaml5
-rw-r--r--Documentation/devicetree/bindings/regulator/gpio-regulator.txt57
-rw-r--r--Documentation/devicetree/bindings/regulator/gpio-regulator.yaml118
-rw-r--r--Documentation/devicetree/bindings/regulator/max8660.txt47
-rw-r--r--Documentation/devicetree/bindings/regulator/max8660.yaml77
-rw-r--r--Documentation/devicetree/bindings/regulator/pv88060.txt2
-rw-r--r--Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt22
-rw-r--r--Documentation/devicetree/bindings/regulator/regulator.txt140
-rw-r--r--Documentation/devicetree/bindings/regulator/regulator.yaml200
-rw-r--r--Documentation/devicetree/bindings/regulator/slg51000.txt88
-rw-r--r--Documentation/devicetree/bindings/regulator/st,stm32-booster.txt18
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt125
-rw-r--r--Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt140
-rw-r--r--Documentation/devicetree/bindings/remoteproc/stm32-rproc.txt63
-rw-r--r--Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt18
-rw-r--r--Documentation/devicetree/bindings/reset/fsl,imx7-src.txt2
-rw-r--r--Documentation/devicetree/bindings/riscv/cpus.yaml149
-rw-r--r--Documentation/devicetree/bindings/riscv/sifive.yaml25
-rw-r--r--Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt1
-rw-r--r--Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml43
-rw-r--r--Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml134
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc.txt73
-rw-r--r--Documentation/devicetree/bindings/rtc/rtc.yaml50
-rw-r--r--Documentation/devicetree/bindings/rtc/sun6i-rtc.txt46
-rw-r--r--Documentation/devicetree/bindings/rtc/sunxi-rtc.txt17
-rw-r--r--Documentation/devicetree/bindings/rtc/trivial-rtc.yaml92
-rw-r--r--Documentation/devicetree/bindings/serial/8250.txt19
-rw-r--r--Documentation/devicetree/bindings/serial/mtk-uart.txt13
-rw-r--r--Documentation/devicetree/bindings/serial/omap_serial.txt1
-rw-r--r--Documentation/devicetree/bindings/serial/st,stm32-usart.txt1
-rw-r--r--Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt10
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt81
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt6
-rw-r--r--Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt5
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml132
-rw-r--r--Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml120
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt6
-rw-r--r--Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt55
-rw-r--r--Documentation/devicetree/bindings/sound/cs42l73.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/cs42xx8.txt6
-rw-r--r--Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt3
-rw-r--r--Documentation/devicetree/bindings/sound/madera.txt67
-rw-r--r--Documentation/devicetree/bindings/sound/max98357a.txt4
-rw-r--r--Documentation/devicetree/bindings/sound/rt1011.txt32
-rwxr-xr-xDocumentation/devicetree/bindings/sound/rt1308.txt17
-rw-r--r--Documentation/devicetree/bindings/sound/st,stm32-i2s.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/st,stm32-sai.txt2
-rw-r--r--Documentation/devicetree/bindings/sound/sun4i-i2s.txt45
-rw-r--r--Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt42
-rw-r--r--Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml87
-rw-r--r--Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml107
-rw-r--r--Documentation/devicetree/bindings/spi/spi-bus.txt112
-rw-r--r--Documentation/devicetree/bindings/spi/spi-controller.yaml161
-rw-r--r--Documentation/devicetree/bindings/spi/spi-gpio.txt43
-rw-r--r--Documentation/devicetree/bindings/spi/spi-gpio.yaml72
-rw-r--r--Documentation/devicetree/bindings/spi/spi-pl022.yaml165
-rw-r--r--Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt5
-rw-r--r--Documentation/devicetree/bindings/spi/spi-sun4i.txt23
-rw-r--r--Documentation/devicetree/bindings/spi/spi-sun6i.txt44
-rw-r--r--Documentation/devicetree/bindings/spi/spi-synquacer.txt27
-rw-r--r--Documentation/devicetree/bindings/spi/spi_pl022.txt70
-rw-r--r--Documentation/devicetree/bindings/timer/nxp,sysctr-timer.txt25
-rw-r--r--Documentation/devicetree/bindings/timer/renesas,cmt.txt6
-rw-r--r--Documentation/devicetree/bindings/trivial-devices.yaml4
-rw-r--r--Documentation/devicetree/bindings/usb/dwc2.txt3
-rw-r--r--Documentation/devicetree/bindings/usb/dwc3.txt2
-rw-r--r--Documentation/devicetree/bindings/usb/generic-ehci.yaml3
-rw-r--r--Documentation/devicetree/bindings/usb/renesas,usb3.txt (renamed from Documentation/devicetree/bindings/usb/renesas_usb3.txt)0
-rw-r--r--Documentation/devicetree/bindings/usb/renesas,usbhs.txt57
-rw-r--r--Documentation/devicetree/bindings/usb/renesas_usbhs.txt55
-rw-r--r--Documentation/devicetree/bindings/usb/s3c2410-usb.txt2
-rw-r--r--Documentation/devicetree/bindings/vendor-prefixes.yaml90
-rw-r--r--Documentation/devicetree/bindings/virtio/iommu.txt66
-rw-r--r--Documentation/devicetree/bindings/virtio/mmio.txt30
-rw-r--r--Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt24
-rw-r--r--Documentation/devicetree/bindings/watchdog/renesas,wdt.txt (renamed from Documentation/devicetree/bindings/watchdog/renesas-wdt.txt)0
-rw-r--r--Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt1
-rw-r--r--Documentation/devicetree/booting-without-of.txt4
-rw-r--r--Documentation/dontdiff1
-rw-r--r--Documentation/driver-api/80211/mac80211-advanced.rst3
-rw-r--r--Documentation/driver-api/backlight/lp855x-driver.rst81
-rw-r--r--Documentation/driver-api/bt8xxgpio.rst (renamed from Documentation/bt8xxgpio.txt)0
-rw-r--r--Documentation/driver-api/connector.rst156
-rw-r--r--Documentation/driver-api/console.rst152
-rw-r--r--Documentation/driver-api/dcdbas.rst (renamed from Documentation/dcdbas.txt)0
-rw-r--r--Documentation/driver-api/dell_rbu.rst (renamed from Documentation/dell_rbu.txt)0
-rw-r--r--Documentation/driver-api/dmaengine/dmatest.rst21
-rw-r--r--Documentation/driver-api/driver-model/binding.rst98
-rw-r--r--Documentation/driver-api/driver-model/bus.rst146
-rw-r--r--Documentation/driver-api/driver-model/class.rst149
-rw-r--r--Documentation/driver-api/driver-model/design-patterns.rst116
-rw-r--r--Documentation/driver-api/driver-model/device.rst109
-rw-r--r--Documentation/driver-api/driver-model/devres.rst418
-rw-r--r--Documentation/driver-api/driver-model/driver.rst223
-rw-r--r--Documentation/driver-api/driver-model/index.rst24
-rw-r--r--Documentation/driver-api/driver-model/overview.rst124
-rw-r--r--Documentation/driver-api/driver-model/platform.rst246
-rw-r--r--Documentation/driver-api/driver-model/porting.rst448
-rw-r--r--Documentation/driver-api/early-userspace/buffer-format.rst119
-rw-r--r--Documentation/driver-api/early-userspace/early_userspace_support.rst154
-rw-r--r--Documentation/driver-api/early-userspace/index.rst18
-rw-r--r--Documentation/driver-api/edid.rst58
-rw-r--r--Documentation/driver-api/eisa.rst230
-rw-r--r--Documentation/driver-api/gpio/consumer.rst4
-rw-r--r--Documentation/driver-api/gpio/driver.rst18
-rw-r--r--Documentation/driver-api/index.rst45
-rw-r--r--Documentation/driver-api/interconnect.rst93
-rw-r--r--Documentation/driver-api/ipmb.rst105
-rw-r--r--Documentation/driver-api/isa.rst (renamed from Documentation/isa.txt)0
-rw-r--r--Documentation/driver-api/isapnp.rst (renamed from Documentation/isapnp.txt)0
-rw-r--r--Documentation/driver-api/lightnvm-pblk.rst (renamed from Documentation/lightnvm/pblk.txt)0
-rw-r--r--Documentation/driver-api/md/index.rst12
-rw-r--r--Documentation/driver-api/md/md-cluster.rst385
-rw-r--r--Documentation/driver-api/md/raid5-cache.rst111
-rw-r--r--Documentation/driver-api/md/raid5-ppl.rst47
-rw-r--r--Documentation/driver-api/mei/hdcp.rst32
-rw-r--r--Documentation/driver-api/mei/iamt.rst101
-rw-r--r--Documentation/driver-api/mei/index.rst23
-rw-r--r--Documentation/driver-api/mei/mei-client-bus.rst168
-rw-r--r--Documentation/driver-api/mei/mei.rst176
-rw-r--r--Documentation/driver-api/mei/nfc.rst28
-rw-r--r--Documentation/driver-api/memory-devices/index.rst18
-rw-r--r--Documentation/driver-api/memory-devices/ti-emif.rst64
-rw-r--r--Documentation/driver-api/memory-devices/ti-gpmc.rst179
-rw-r--r--Documentation/driver-api/men-chameleon-bus.rst (renamed from Documentation/men-chameleon-bus.txt)0
-rw-r--r--Documentation/driver-api/mmc/index.rst13
-rw-r--r--Documentation/driver-api/mmc/mmc-async-req.rst98
-rw-r--r--Documentation/driver-api/mmc/mmc-dev-attrs.rst91
-rw-r--r--Documentation/driver-api/mmc/mmc-dev-parts.rst41
-rw-r--r--Documentation/driver-api/mmc/mmc-tools.rst37
-rw-r--r--Documentation/driver-api/mtd/index.rst12
-rw-r--r--Documentation/driver-api/mtd/intel-spi.rst90
-rw-r--r--Documentation/driver-api/mtd/nand_ecc.rst763
-rw-r--r--Documentation/driver-api/mtd/spi-nor.rst66
-rw-r--r--Documentation/driver-api/nfc/index.rst11
-rw-r--r--Documentation/driver-api/nfc/nfc-hci.rst311
-rw-r--r--Documentation/driver-api/nfc/nfc-pn544.rst34
-rw-r--r--Documentation/driver-api/ntb.rst263
-rw-r--r--Documentation/driver-api/nvdimm/btt.rst285
-rw-r--r--Documentation/driver-api/nvdimm/index.rst12
-rw-r--r--Documentation/driver-api/nvdimm/nvdimm.rst887
-rw-r--r--Documentation/driver-api/nvdimm/security.rst143
-rw-r--r--Documentation/driver-api/nvmem.rst189
-rw-r--r--Documentation/driver-api/parport-lowlevel.rst (renamed from Documentation/parport-lowlevel.txt)0
-rw-r--r--Documentation/driver-api/phy/index.rst18
-rw-r--r--Documentation/driver-api/phy/phy.rst (renamed from Documentation/phy.txt)0
-rw-r--r--Documentation/driver-api/phy/samsung-usb2.rst137
-rw-r--r--Documentation/driver-api/pm/devices.rst6
-rw-r--r--Documentation/driver-api/pps.rst2
-rw-r--r--Documentation/driver-api/pti_intel_mid.rst106
-rw-r--r--Documentation/driver-api/ptp.rst2
-rw-r--r--Documentation/driver-api/pwm.rst165
-rw-r--r--Documentation/driver-api/rapidio/index.rst15
-rw-r--r--Documentation/driver-api/rapidio/mport_cdev.rst110
-rw-r--r--Documentation/driver-api/rapidio/rapidio.rst362
-rw-r--r--Documentation/driver-api/rapidio/rio_cm.rst135
-rw-r--r--Documentation/driver-api/rapidio/sysfs.rst7
-rw-r--r--Documentation/driver-api/rapidio/tsi721.rst112
-rw-r--r--Documentation/driver-api/rfkill.rst (renamed from Documentation/rfkill.txt)0
-rw-r--r--Documentation/driver-api/s390-drivers.rst4
-rw-r--r--Documentation/driver-api/serial/cyclades_z.rst (renamed from Documentation/serial/cyclades_z.rst)0
-rw-r--r--Documentation/driver-api/serial/driver.rst549
-rw-r--r--Documentation/driver-api/serial/index.rst32
-rw-r--r--Documentation/driver-api/serial/moxa-smartio.rst (renamed from Documentation/serial/moxa-smartio.rst)0
-rw-r--r--Documentation/driver-api/serial/n_gsm.rst (renamed from Documentation/serial/n_gsm.rst)0
-rw-r--r--Documentation/driver-api/serial/rocket.rst (renamed from Documentation/serial/rocket.rst)0
-rw-r--r--Documentation/driver-api/serial/serial-iso7816.rst (renamed from Documentation/serial/serial-iso7816.rst)0
-rw-r--r--Documentation/driver-api/serial/serial-rs485.rst (renamed from Documentation/serial/serial-rs485.rst)0
-rw-r--r--Documentation/driver-api/serial/tty.rst (renamed from Documentation/serial/tty.rst)0
-rw-r--r--Documentation/driver-api/sgi-ioc4.rst (renamed from Documentation/sgi-ioc4.txt)0
-rw-r--r--Documentation/driver-api/sm501.rst (renamed from Documentation/SM501.txt)0
-rw-r--r--Documentation/driver-api/smsc_ece1099.rst (renamed from Documentation/smsc_ece1099.txt)0
-rw-r--r--Documentation/driver-api/soundwire/locking.rst4
-rw-r--r--Documentation/driver-api/switchtec.rst102
-rw-r--r--Documentation/driver-api/sync_file.rst (renamed from Documentation/sync_file.txt)0
-rw-r--r--Documentation/driver-api/uio-howto.rst4
-rw-r--r--Documentation/driver-api/usb/power-management.rst2
-rw-r--r--Documentation/driver-api/vfio-mediated-device.rst414
-rw-r--r--Documentation/driver-api/vfio.rst (renamed from Documentation/vfio.txt)0
-rw-r--r--Documentation/driver-api/xilinx/eemi.rst (renamed from Documentation/xilinx/eemi.rst)0
-rw-r--r--Documentation/driver-api/xilinx/index.rst16
-rw-r--r--Documentation/driver-api/xillybus.rst (renamed from Documentation/xillybus.txt)0
-rw-r--r--Documentation/driver-api/zorro.rst (renamed from Documentation/zorro.txt)0
-rw-r--r--Documentation/driver-model/binding.txt98
-rw-r--r--Documentation/driver-model/bus.txt143
-rw-r--r--Documentation/driver-model/class.txt147
-rw-r--r--Documentation/driver-model/design-patterns.txt116
-rw-r--r--Documentation/driver-model/device.txt106
-rw-r--r--Documentation/driver-model/devres.txt412
-rw-r--r--Documentation/driver-model/driver.txt215
-rw-r--r--Documentation/driver-model/overview.txt123
-rw-r--r--Documentation/driver-model/platform.txt244
-rw-r--r--Documentation/driver-model/porting.txt447
-rw-r--r--Documentation/early-userspace/README151
-rw-r--r--Documentation/early-userspace/buffer-format.txt112
-rw-r--r--Documentation/eisa.txt230
-rw-r--r--Documentation/extcon/intel-int3496.txt27
-rw-r--r--Documentation/fault-injection/index.rst2
-rw-r--r--Documentation/fault-injection/nvme-fault-injection.rst58
-rw-r--r--Documentation/fb/fbcon.rst4
-rw-r--r--Documentation/fb/index.rst2
-rw-r--r--Documentation/fb/modedb.rst14
-rw-r--r--Documentation/fb/vesafb.rst2
-rw-r--r--Documentation/filesystems/Locking14
-rw-r--r--Documentation/filesystems/coda.txt11
-rw-r--r--Documentation/filesystems/dax.txt2
-rw-r--r--Documentation/filesystems/debugfs.txt2
-rw-r--r--Documentation/filesystems/ext2.txt8
-rw-r--r--Documentation/filesystems/f2fs.txt133
-rw-r--r--Documentation/filesystems/fscrypt.rst43
-rw-r--r--Documentation/filesystems/nfs/nfsroot.txt2
-rw-r--r--Documentation/filesystems/porting15
-rw-r--r--Documentation/filesystems/proc.txt87
-rw-r--r--Documentation/filesystems/ramfs-rootfs-initramfs.txt4
-rw-r--r--Documentation/filesystems/sysfs.txt2
-rw-r--r--Documentation/filesystems/tmpfs.txt2
-rw-r--r--Documentation/filesystems/xfs-self-describing-metadata.txt8
-rw-r--r--Documentation/filesystems/xfs.txt470
-rw-r--r--Documentation/firmware-guide/acpi/enumeration.rst2
-rw-r--r--Documentation/firmware-guide/acpi/extcon-intel-int3496.rst33
-rw-r--r--Documentation/firmware-guide/acpi/index.rst1
-rw-r--r--Documentation/fmc/API.txt47
-rw-r--r--Documentation/fmc/FMC-and-SDB.txt88
-rw-r--r--Documentation/fmc/carrier.txt311
-rw-r--r--Documentation/fmc/fmc-chardev.txt64
-rw-r--r--Documentation/fmc/fmc-fakedev.txt36
-rw-r--r--Documentation/fmc/fmc-trivial.txt17
-rw-r--r--Documentation/fmc/fmc-write-eeprom.txt98
-rw-r--r--Documentation/fmc/identifiers.txt168
-rw-r--r--Documentation/fmc/mezzanine.txt123
-rw-r--r--Documentation/fmc/parameters.txt56
-rw-r--r--Documentation/fpga/index.rst2
-rw-r--r--Documentation/gpio/index.rst17
-rw-r--r--Documentation/gpu/amdgpu.rst24
-rw-r--r--Documentation/gpu/drivers.rst1
-rw-r--r--Documentation/gpu/drm-client.rst3
-rw-r--r--Documentation/gpu/drm-kms-helpers.rst15
-rw-r--r--Documentation/gpu/drm-mm.rst34
-rw-r--r--Documentation/gpu/drm-uapi.rst19
-rw-r--r--Documentation/gpu/i915.rst87
-rw-r--r--Documentation/gpu/mcde.rst8
-rw-r--r--Documentation/gpu/todo.rst55
-rw-r--r--Documentation/hid/hid-alps.rst180
-rw-r--r--Documentation/hid/hid-alps.txt139
-rw-r--r--Documentation/hid/hid-sensor.rst242
-rw-r--r--Documentation/hid/hid-sensor.txt224
-rw-r--r--Documentation/hid/hid-transport.rst359
-rw-r--r--Documentation/hid/hid-transport.txt317
-rw-r--r--Documentation/hid/hiddev.rst251
-rw-r--r--Documentation/hid/hiddev.txt205
-rw-r--r--Documentation/hid/hidraw.rst138
-rw-r--r--Documentation/hid/hidraw.txt119
-rw-r--r--Documentation/hid/index.rst18
-rw-r--r--Documentation/hid/intel-ish-hid.rst485
-rw-r--r--Documentation/hid/intel-ish-hid.txt454
-rw-r--r--Documentation/hid/uhid.rst193
-rw-r--r--Documentation/hid/uhid.txt187
-rw-r--r--Documentation/hwmon/pxe161090
-rw-r--r--Documentation/hwmon/submitting-patches.rst2
-rw-r--r--Documentation/hwspinlock.txt81
-rw-r--r--Documentation/i2c/busses/i2c-i80120
-rw-r--r--Documentation/ia64/IRQ-redir.txt69
-rw-r--r--Documentation/ia64/README43
-rw-r--r--Documentation/ia64/aliasing.rst246
-rw-r--r--Documentation/ia64/aliasing.txt221
-rw-r--r--Documentation/ia64/efirtc.rst144
-rw-r--r--Documentation/ia64/efirtc.txt128
-rw-r--r--Documentation/ia64/err_inject.rst1067
-rw-r--r--Documentation/ia64/err_inject.txt1068
-rw-r--r--Documentation/ia64/fsys.rst303
-rw-r--r--Documentation/ia64/fsys.txt286
-rw-r--r--Documentation/ia64/ia64.rst49
-rw-r--r--Documentation/ia64/index.rst18
-rw-r--r--Documentation/ia64/irq-redir.rst80
-rw-r--r--Documentation/ia64/mca.rst198
-rw-r--r--Documentation/ia64/mca.txt194
-rw-r--r--Documentation/ia64/serial.rst165
-rw-r--r--Documentation/ia64/serial.txt151
-rw-r--r--Documentation/ia64/xen.rst206
-rw-r--r--Documentation/ia64/xen.txt183
-rw-r--r--Documentation/ide/index.rst2
-rw-r--r--Documentation/iio/ep93xx_adc.rst40
-rw-r--r--Documentation/iio/ep93xx_adc.txt29
-rw-r--r--Documentation/iio/iio_configfs.rst101
-rw-r--r--Documentation/iio/iio_configfs.txt93
-rw-r--r--Documentation/iio/index.rst12
-rw-r--r--Documentation/index.rst35
-rw-r--r--Documentation/infiniband/core_locking.rst118
-rw-r--r--Documentation/infiniband/core_locking.txt112
-rw-r--r--Documentation/infiniband/index.rst23
-rw-r--r--Documentation/infiniband/ipoib.rst115
-rw-r--r--Documentation/infiniband/ipoib.txt105
-rw-r--r--Documentation/infiniband/opa_vnic.rst159
-rw-r--r--Documentation/infiniband/opa_vnic.txt153
-rw-r--r--Documentation/infiniband/sysfs.rst6
-rw-r--r--Documentation/infiniband/sysfs.txt4
-rw-r--r--Documentation/infiniband/tag_matching.rst69
-rw-r--r--Documentation/infiniband/tag_matching.txt64
-rw-r--r--Documentation/infiniband/user_mad.rst166
-rw-r--r--Documentation/infiniband/user_mad.txt153
-rw-r--r--Documentation/infiniband/user_verbs.rst75
-rw-r--r--Documentation/infiniband/user_verbs.txt69
-rw-r--r--Documentation/input/input.rst2
-rw-r--r--Documentation/interconnect/interconnect.rst95
-rw-r--r--Documentation/ioctl/botching-up-ioctls.rst225
-rw-r--r--Documentation/ioctl/botching-up-ioctls.txt224
-rw-r--r--Documentation/ioctl/cdrom.rst1233
-rw-r--r--Documentation/ioctl/cdrom.txt967
-rw-r--r--Documentation/ioctl/hdio.rst1342
-rw-r--r--Documentation/ioctl/hdio.txt1071
-rw-r--r--Documentation/ioctl/index.rst16
-rw-r--r--Documentation/ioctl/ioctl-decoding.rst31
-rw-r--r--Documentation/ioctl/ioctl-decoding.txt24
-rw-r--r--Documentation/ioctl/ioctl-number.rst361
-rw-r--r--Documentation/ioctl/ioctl-number.txt350
-rw-r--r--Documentation/isdn/HiSax.cert96
-rw-r--r--Documentation/isdn/INTERFACE759
-rw-r--r--Documentation/isdn/INTERFACE.fax163
-rw-r--r--Documentation/isdn/README599
-rw-r--r--Documentation/isdn/README.FAQ26
-rw-r--r--Documentation/isdn/README.HiSax659
-rw-r--r--Documentation/isdn/README.audio138
-rw-r--r--Documentation/isdn/README.concap259
-rw-r--r--Documentation/isdn/README.diversion127
-rw-r--r--Documentation/isdn/README.fax45
-rw-r--r--Documentation/isdn/README.gigaset36
-rw-r--r--Documentation/isdn/README.hfc-pci41
-rw-r--r--Documentation/isdn/README.syncppp58
-rw-r--r--Documentation/isdn/README.x25184
-rw-r--r--Documentation/isdn/syncPPP.FAQ224
-rw-r--r--Documentation/kbuild/headers_install.rst7
-rw-r--r--Documentation/kbuild/index.rst2
-rw-r--r--Documentation/kbuild/issues.rst20
-rw-r--r--Documentation/kbuild/kbuild.rst17
-rw-r--r--Documentation/kbuild/kconfig-language.rst12
-rw-r--r--Documentation/kbuild/kconfig.rst8
-rw-r--r--Documentation/kbuild/makefiles.rst35
-rw-r--r--Documentation/kdump/index.rst21
-rw-r--r--Documentation/kernel-hacking/locking.rst2
-rw-r--r--Documentation/kernel-per-CPU-kthreads.txt356
-rw-r--r--Documentation/laptops/asus-laptop.txt257
-rw-r--r--Documentation/laptops/disk-shock-protection.txt149
-rw-r--r--Documentation/laptops/laptop-mode.txt782
-rw-r--r--Documentation/laptops/lg-laptop.rst85
-rw-r--r--Documentation/laptops/sony-laptop.txt144
-rw-r--r--Documentation/laptops/sonypi.txt152
-rw-r--r--Documentation/laptops/thinkpad-acpi.txt1487
-rw-r--r--Documentation/laptops/toshiba_haps.txt76
-rw-r--r--Documentation/leds/index.rst25
-rw-r--r--Documentation/leds/leds-blinkm.rst84
-rw-r--r--Documentation/leds/leds-blinkm.txt80
-rw-r--r--Documentation/leds/leds-class-flash.rst90
-rw-r--r--Documentation/leds/leds-class-flash.txt73
-rw-r--r--Documentation/leds/leds-class.rst125
-rw-r--r--Documentation/leds/leds-class.txt122
-rw-r--r--Documentation/leds/leds-lm3556.rst137
-rw-r--r--Documentation/leds/leds-lm3556.txt85
-rw-r--r--Documentation/leds/leds-lp3944.rst59
-rw-r--r--Documentation/leds/leds-lp3944.txt50
-rw-r--r--Documentation/leds/leds-lp5521.rst115
-rw-r--r--Documentation/leds/leds-lp5521.txt101
-rw-r--r--Documentation/leds/leds-lp5523.rst147
-rw-r--r--Documentation/leds/leds-lp5523.txt130
-rw-r--r--Documentation/leds/leds-lp5562.rst137
-rw-r--r--Documentation/leds/leds-lp5562.txt120
-rw-r--r--Documentation/leds/leds-lp55xx.rst224
-rw-r--r--Documentation/leds/leds-lp55xx.txt194
-rw-r--r--Documentation/leds/leds-mlxcpld.rst118
-rw-r--r--Documentation/leds/leds-mlxcpld.txt110
-rw-r--r--Documentation/leds/ledtrig-oneshot.rst44
-rw-r--r--Documentation/leds/ledtrig-oneshot.txt43
-rw-r--r--Documentation/leds/ledtrig-transient.rst167
-rw-r--r--Documentation/leds/ledtrig-transient.txt152
-rw-r--r--Documentation/leds/ledtrig-usbport.rst46
-rw-r--r--Documentation/leds/ledtrig-usbport.txt41
-rw-r--r--Documentation/leds/uleds.rst37
-rw-r--r--Documentation/leds/uleds.txt36
-rw-r--r--Documentation/livepatch/index.rst2
-rw-r--r--Documentation/locking/index.rst24
-rw-r--r--Documentation/locking/lockdep-design.rst394
-rw-r--r--Documentation/locking/lockdep-design.txt333
-rw-r--r--Documentation/locking/lockstat.rst204
-rw-r--r--Documentation/locking/lockstat.txt183
-rw-r--r--Documentation/locking/locktorture.rst170
-rw-r--r--Documentation/locking/locktorture.txt145
-rw-r--r--Documentation/locking/mutex-design.rst152
-rw-r--r--Documentation/locking/mutex-design.txt142
-rw-r--r--Documentation/locking/rt-mutex-design.rst574
-rw-r--r--Documentation/locking/rt-mutex-design.txt559
-rw-r--r--Documentation/locking/rt-mutex.rst77
-rw-r--r--Documentation/locking/rt-mutex.txt73
-rw-r--r--Documentation/locking/spinlocks.rst177
-rw-r--r--Documentation/locking/spinlocks.txt167
-rw-r--r--Documentation/locking/ww-mutex-design.rst393
-rw-r--r--Documentation/locking/ww-mutex-design.txt383
-rw-r--r--Documentation/m68k/index.rst17
-rw-r--r--Documentation/m68k/kernel-options.rst911
-rw-r--r--Documentation/m68k/kernel-options.txt884
-rw-r--r--Documentation/md/md-cluster.txt325
-rw-r--r--Documentation/md/raid5-cache.txt109
-rw-r--r--Documentation/md/raid5-ppl.txt45
-rw-r--r--Documentation/media/kapi/dtv-core.rst6
-rw-r--r--Documentation/media/kapi/v4l2-controls.rst206
-rw-r--r--Documentation/media/uapi/cec/cec-api.rst2
-rw-r--r--Documentation/media/uapi/cec/cec-ioc-g-mode.rst3
-rw-r--r--Documentation/media/uapi/cec/cec-ioc-receive.rst15
-rw-r--r--Documentation/media/uapi/mediactl/media-ioc-enum-links.rst7
-rw-r--r--Documentation/media/uapi/rc/rc-tables.rst30
-rw-r--r--Documentation/media/uapi/v4l/biblio.rst9
-rw-r--r--Documentation/media/uapi/v4l/ext-ctrls-codec.rst625
-rw-r--r--Documentation/media/uapi/v4l/extended-controls.rst15
-rw-r--r--Documentation/media/uapi/v4l/field-order.rst17
-rw-r--r--Documentation/media/uapi/v4l/pixfmt-compressed.rst25
-rw-r--r--Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst15
-rw-r--r--Documentation/media/uapi/v4l/pixfmt-v4l2.rst13
-rw-r--r--Documentation/media/uapi/v4l/vidioc-qbuf.rst8
-rw-r--r--Documentation/media/uapi/v4l/vidioc-queryctrl.rst30
-rw-r--r--Documentation/media/v4l-drivers/index.rst1
-rw-r--r--Documentation/media/v4l-drivers/vimc.dot22
-rw-r--r--Documentation/media/v4l-drivers/vimc.rst98
-rw-r--r--Documentation/media/v4l-drivers/vivid.rst5
-rw-r--r--Documentation/media/videodev2.h.rst.exceptions5
-rw-r--r--Documentation/memory-barriers.txt2
-rw-r--r--Documentation/memory-devices/ti-emif.txt57
-rw-r--r--Documentation/mic/index.rst2
-rw-r--r--Documentation/misc-devices/eeprom96
-rw-r--r--Documentation/misc-devices/eeprom.rst107
-rw-r--r--Documentation/misc-devices/ics932s40131
-rw-r--r--Documentation/misc-devices/ics932s401.rst36
-rw-r--r--Documentation/misc-devices/index.rst5
-rw-r--r--Documentation/misc-devices/isl2900362
-rw-r--r--Documentation/misc-devices/isl29003.rst75
-rw-r--r--Documentation/misc-devices/lis3lv02d93
-rw-r--r--Documentation/misc-devices/lis3lv02d.rst99
-rw-r--r--Documentation/misc-devices/max6875110
-rw-r--r--Documentation/misc-devices/max6875.rst136
-rw-r--r--Documentation/misc-devices/mei/mei-client-bus.txt141
-rw-r--r--Documentation/misc-devices/mei/mei.txt266
-rw-r--r--Documentation/mmc/mmc-async-req.txt87
-rw-r--r--Documentation/mmc/mmc-dev-attrs.txt77
-rw-r--r--Documentation/mmc/mmc-dev-parts.txt40
-rw-r--r--Documentation/mmc/mmc-tools.txt34
-rw-r--r--Documentation/mtd/intel-spi.txt88
-rw-r--r--Documentation/mtd/nand_ecc.txt714
-rw-r--r--Documentation/mtd/spi-nor.txt65
-rw-r--r--Documentation/namespaces/compatibility-list.txt39
-rw-r--r--Documentation/namespaces/resource-control.txt14
-rw-r--r--Documentation/netlabel/index.rst2
-rw-r--r--Documentation/networking/af_xdp.rst24
-rw-r--r--Documentation/networking/bonding.txt16
-rw-r--r--Documentation/networking/device_drivers/amazon/ena.txt5
-rw-r--r--Documentation/networking/device_drivers/aquantia/atlantic.txt439
-rw-r--r--Documentation/networking/device_drivers/google/gve.rst123
-rw-r--r--Documentation/networking/device_drivers/index.rst2
-rw-r--r--Documentation/networking/device_drivers/mellanox/mlx5.rst192
-rw-r--r--Documentation/networking/dsa/b53.rst183
-rw-r--r--Documentation/networking/dsa/configuration.rst292
-rw-r--r--Documentation/networking/dsa/index.rst2
-rw-r--r--Documentation/networking/ip-sysctl.txt62
-rw-r--r--Documentation/networking/mpls-sysctl.txt2
-rw-r--r--Documentation/networking/phy.rst45
-rw-r--r--Documentation/networking/rds.txt2
-rw-r--r--Documentation/networking/sfp-phylink.rst5
-rw-r--r--Documentation/networking/tls-offload.rst73
-rw-r--r--Documentation/nfc/nfc-hci.txt290
-rw-r--r--Documentation/nfc/nfc-pn544.txt32
-rw-r--r--Documentation/ntb.txt236
-rw-r--r--Documentation/nvdimm/btt.txt273
-rw-r--r--Documentation/nvdimm/nvdimm.txt815
-rw-r--r--Documentation/nvdimm/security.txt141
-rw-r--r--Documentation/nvmem/nvmem.txt183
-rw-r--r--Documentation/pcmcia/index.rst2
-rw-r--r--Documentation/perf/arm-ccn.txt59
-rw-r--r--Documentation/perf/arm_dsu_pmu.txt28
-rw-r--r--Documentation/perf/hisi-pmu.txt53
-rw-r--r--Documentation/perf/qcom_l2_pmu.txt38
-rw-r--r--Documentation/perf/qcom_l3_pmu.txt25
-rw-r--r--Documentation/perf/thunderx2-pmu.txt41
-rw-r--r--Documentation/perf/xgene-pmu.txt48
-rw-r--r--Documentation/phy/samsung-usb2.txt135
-rw-r--r--Documentation/pi-futex.txt2
-rw-r--r--Documentation/power/apm-acpi.rst36
-rw-r--r--Documentation/power/apm-acpi.txt32
-rw-r--r--Documentation/power/basic-pm-debugging.rst269
-rw-r--r--Documentation/power/basic-pm-debugging.txt254
-rw-r--r--Documentation/power/charger-manager.rst205
-rw-r--r--Documentation/power/charger-manager.txt200
-rw-r--r--Documentation/power/drivers-testing.rst51
-rw-r--r--Documentation/power/drivers-testing.txt46
-rw-r--r--Documentation/power/energy-model.rst147
-rw-r--r--Documentation/power/energy-model.txt144
-rw-r--r--Documentation/power/freezing-of-tasks.rst244
-rw-r--r--Documentation/power/freezing-of-tasks.txt231
-rw-r--r--Documentation/power/index.rst46
-rw-r--r--Documentation/power/interface.rst79
-rw-r--r--Documentation/power/interface.txt77
-rw-r--r--Documentation/power/opp.rst379
-rw-r--r--Documentation/power/opp.txt342
-rw-r--r--Documentation/power/pci.rst1135
-rw-r--r--Documentation/power/pci.txt1094
-rw-r--r--Documentation/power/pm_qos_interface.rst227
-rw-r--r--Documentation/power/pm_qos_interface.txt212
-rw-r--r--Documentation/power/power_supply_class.rst288
-rw-r--r--Documentation/power/power_supply_class.txt231
-rw-r--r--Documentation/power/powercap/powercap.rst257
-rw-r--r--Documentation/power/powercap/powercap.txt236
-rw-r--r--Documentation/power/regulator/consumer.rst229
-rw-r--r--Documentation/power/regulator/consumer.txt218
-rw-r--r--Documentation/power/regulator/design.rst38
-rw-r--r--Documentation/power/regulator/design.txt33
-rw-r--r--Documentation/power/regulator/machine.rst97
-rw-r--r--Documentation/power/regulator/machine.txt96
-rw-r--r--Documentation/power/regulator/overview.rst178
-rw-r--r--Documentation/power/regulator/overview.txt171
-rw-r--r--Documentation/power/regulator/regulator.rst32
-rw-r--r--Documentation/power/regulator/regulator.txt30
-rw-r--r--Documentation/power/runtime_pm.rst940
-rw-r--r--Documentation/power/runtime_pm.txt928
-rw-r--r--Documentation/power/s2ram.rst87
-rw-r--r--Documentation/power/s2ram.txt85
-rw-r--r--Documentation/power/suspend-and-cpuhotplug.rst286
-rw-r--r--Documentation/power/suspend-and-cpuhotplug.txt274
-rw-r--r--Documentation/power/suspend-and-interrupts.rst137
-rw-r--r--Documentation/power/suspend-and-interrupts.txt135
-rw-r--r--Documentation/power/swsusp-and-swap-files.rst63
-rw-r--r--Documentation/power/swsusp-and-swap-files.txt60
-rw-r--r--Documentation/power/swsusp-dmcrypt.rst140
-rw-r--r--Documentation/power/swsusp-dmcrypt.txt138
-rw-r--r--Documentation/power/swsusp.rst501
-rw-r--r--Documentation/power/swsusp.txt446
-rw-r--r--Documentation/power/tricks.rst29
-rw-r--r--Documentation/power/tricks.txt27
-rw-r--r--Documentation/power/userland-swsusp.rst191
-rw-r--r--Documentation/power/userland-swsusp.txt170
-rw-r--r--Documentation/power/video.rst213
-rw-r--r--Documentation/power/video.txt185
-rw-r--r--Documentation/powerpc/firmware-assisted-dump.txt2
-rw-r--r--Documentation/powerpc/vcpudispatch_stats.txt68
-rw-r--r--Documentation/process/changes.rst22
-rw-r--r--Documentation/process/submit-checklist.rst2
-rw-r--r--Documentation/process/submitting-drivers.rst2
-rw-r--r--Documentation/pti/pti_intel_mid.txt99
-rw-r--r--Documentation/pwm.txt158
-rw-r--r--Documentation/rapidio/mport_cdev.txt107
-rw-r--r--Documentation/rapidio/rapidio.txt351
-rw-r--r--Documentation/rapidio/rio_cm.txt119
-rw-r--r--Documentation/rapidio/sysfs.txt3
-rw-r--r--Documentation/rapidio/tsi721.txt97
-rw-r--r--Documentation/rbtree.txt6
-rw-r--r--Documentation/remoteproc.txt14
-rw-r--r--Documentation/riscv/boot-image-header.txt50
-rw-r--r--Documentation/riscv/index.rst2
-rw-r--r--Documentation/s390/3270.rst298
-rw-r--r--Documentation/s390/3270.txt271
-rw-r--r--Documentation/s390/CommonIO125
-rw-r--r--Documentation/s390/DASD73
-rw-r--r--Documentation/s390/Debugging390.txt2142
-rw-r--r--Documentation/s390/cds.rst530
-rw-r--r--Documentation/s390/cds.txt472
-rw-r--r--Documentation/s390/common_io.rst140
-rw-r--r--Documentation/s390/dasd.rst84
-rw-r--r--Documentation/s390/debugging390.rst2613
-rw-r--r--Documentation/s390/driver-model.rst328
-rw-r--r--Documentation/s390/driver-model.txt287
-rw-r--r--Documentation/s390/index.rst28
-rw-r--r--Documentation/s390/monreader.rst212
-rw-r--r--Documentation/s390/monreader.txt197
-rw-r--r--Documentation/s390/qeth.rst64
-rw-r--r--Documentation/s390/qeth.txt50
-rw-r--r--Documentation/s390/s390dbf.rst487
-rw-r--r--Documentation/s390/s390dbf.txt667
-rw-r--r--Documentation/s390/text_files.rst11
-rw-r--r--Documentation/s390/vfio-ap.rst866
-rw-r--r--Documentation/s390/vfio-ap.txt837
-rw-r--r--Documentation/s390/vfio-ccw.rst326
-rw-r--r--Documentation/s390/vfio-ccw.txt300
-rw-r--r--Documentation/s390/zfcpdump.rst50
-rw-r--r--Documentation/s390/zfcpdump.txt48
-rw-r--r--Documentation/scheduler/index.rst2
-rw-r--r--Documentation/scheduler/sched-deadline.rst2
-rw-r--r--Documentation/scheduler/sched-design-CFS.rst2
-rw-r--r--Documentation/scheduler/sched-energy.rst6
-rw-r--r--Documentation/scheduler/sched-pelt.c3
-rw-r--r--Documentation/scheduler/sched-rt-group.rst2
-rw-r--r--Documentation/scsi/osst.txt218
-rw-r--r--Documentation/scsi/ufs.txt7
-rw-r--r--Documentation/security/IMA-templates.rst7
-rw-r--r--Documentation/security/index.rst5
-rw-r--r--Documentation/security/keys/core.rst91
-rw-r--r--Documentation/security/keys/request-key.rst50
-rw-r--r--Documentation/security/lsm-development.rst (renamed from Documentation/security/LSM.rst)0
-rw-r--r--Documentation/security/lsm.rst (renamed from Documentation/lsm.txt)0
-rw-r--r--Documentation/security/sak.rst (renamed from Documentation/SAK.txt)0
-rw-r--r--Documentation/security/siphash.rst (renamed from Documentation/siphash.txt)0
-rw-r--r--Documentation/security/tpm/index.rst1
-rw-r--r--Documentation/security/tpm/xen-tpmfront.rst124
-rw-r--r--Documentation/security/tpm/xen-tpmfront.txt113
-rw-r--r--Documentation/serial/driver.rst549
-rw-r--r--Documentation/serial/index.rst32
-rw-r--r--Documentation/sparc/index.rst2
-rw-r--r--Documentation/switchtec.txt102
-rw-r--r--Documentation/sysctl/README76
-rw-r--r--Documentation/sysctl/abi.txt54
-rw-r--r--Documentation/sysctl/fs.txt374
-rw-r--r--Documentation/sysctl/kernel.txt1145
-rw-r--r--Documentation/sysctl/net.txt422
-rw-r--r--Documentation/sysctl/sunrpc.txt20
-rw-r--r--Documentation/sysctl/user.txt66
-rw-r--r--Documentation/sysctl/vm.txt946
-rw-r--r--Documentation/target/index.rst2
-rw-r--r--Documentation/thermal/cpu-cooling-api.rst107
-rw-r--r--Documentation/thermal/cpu-cooling-api.txt92
-rw-r--r--Documentation/thermal/exynos_thermal77
-rw-r--r--Documentation/thermal/exynos_thermal.rst90
-rw-r--r--Documentation/thermal/exynos_thermal_emulation53
-rw-r--r--Documentation/thermal/exynos_thermal_emulation.rst61
-rw-r--r--Documentation/thermal/index.rst18
-rw-r--r--Documentation/thermal/intel_powerclamp.rst320
-rw-r--r--Documentation/thermal/intel_powerclamp.txt317
-rw-r--r--Documentation/thermal/nouveau_thermal82
-rw-r--r--Documentation/thermal/nouveau_thermal.rst96
-rw-r--r--Documentation/thermal/power_allocator.rst271
-rw-r--r--Documentation/thermal/power_allocator.txt247
-rw-r--r--Documentation/thermal/sysfs-api.rst798
-rw-r--r--Documentation/thermal/sysfs-api.txt636
-rw-r--r--Documentation/thermal/x86_pkg_temperature_thermal47
-rw-r--r--Documentation/thermal/x86_pkg_temperature_thermal.rst55
-rw-r--r--Documentation/timers/index.rst2
-rw-r--r--Documentation/trace/coresight-cpu-debug.txt2
-rw-r--r--Documentation/trace/kprobetrace.rst42
-rw-r--r--Documentation/trace/uprobetracer.rst10
-rw-r--r--Documentation/translations/it_IT/kernel-hacking/locking.rst2
-rw-r--r--Documentation/translations/it_IT/process/submit-checklist.rst2
-rw-r--r--Documentation/translations/ko_KR/memory-barriers.txt2
-rw-r--r--Documentation/translations/zh_CN/arm/Booting4
-rw-r--r--Documentation/translations/zh_CN/arm/kernel_user_helpers.txt4
-rw-r--r--Documentation/translations/zh_CN/arm64/booting.txt2
-rw-r--r--Documentation/translations/zh_CN/filesystems/sysfs.txt2
-rw-r--r--Documentation/translations/zh_CN/gpio.txt4
-rw-r--r--Documentation/translations/zh_CN/oops-tracing.txt4
-rw-r--r--Documentation/translations/zh_CN/process/submit-checklist.rst2
-rw-r--r--Documentation/translations/zh_CN/process/submitting-drivers.rst2
-rw-r--r--Documentation/translations/zh_CN/sparse.txt4
-rw-r--r--Documentation/usb/acm.rst (renamed from Documentation/usb/acm.txt)0
-rw-r--r--Documentation/usb/authorization.rst (renamed from Documentation/usb/authorization.txt)0
-rw-r--r--Documentation/usb/chipidea.rst (renamed from Documentation/usb/chipidea.txt)0
-rw-r--r--Documentation/usb/dwc3.rst (renamed from Documentation/usb/dwc3.txt)0
-rw-r--r--Documentation/usb/ehci.rst (renamed from Documentation/usb/ehci.txt)0
-rw-r--r--Documentation/usb/functionfs.rst (renamed from Documentation/usb/functionfs.txt)0
-rw-r--r--Documentation/usb/gadget-testing.rst934
-rw-r--r--Documentation/usb/gadget-testing.txt934
-rw-r--r--Documentation/usb/gadget_configfs.rst (renamed from Documentation/usb/gadget_configfs.txt)0
-rw-r--r--Documentation/usb/gadget_hid.rst (renamed from Documentation/usb/gadget_hid.txt)0
-rw-r--r--Documentation/usb/gadget_multi.rst (renamed from Documentation/usb/gadget_multi.txt)0
-rw-r--r--Documentation/usb/gadget_printer.rst (renamed from Documentation/usb/gadget_printer.txt)0
-rw-r--r--Documentation/usb/gadget_serial.rst (renamed from Documentation/usb/gadget_serial.txt)0
-rw-r--r--Documentation/usb/index.rst39
-rw-r--r--Documentation/usb/iuu_phoenix.rst (renamed from Documentation/usb/iuu_phoenix.txt)0
-rw-r--r--Documentation/usb/mass-storage.rst (renamed from Documentation/usb/mass-storage.txt)0
-rw-r--r--Documentation/usb/misc_usbsevseg.rst (renamed from Documentation/usb/misc_usbsevseg.txt)0
-rw-r--r--Documentation/usb/mtouchusb.rst (renamed from Documentation/usb/mtouchusb.txt)0
-rw-r--r--Documentation/usb/ohci.rst (renamed from Documentation/usb/ohci.txt)0
-rw-r--r--Documentation/usb/rio.rst (renamed from Documentation/usb/rio.txt)0
-rw-r--r--Documentation/usb/text_files.rst29
-rw-r--r--Documentation/usb/usb-help.rst (renamed from Documentation/usb/usb-help.txt)0
-rw-r--r--Documentation/usb/usb-serial.rst (renamed from Documentation/usb/usb-serial.txt)0
-rw-r--r--Documentation/usb/usbip_protocol.rst (renamed from Documentation/usb/usbip_protocol.txt)0
-rw-r--r--Documentation/usb/usbmon.rst (renamed from Documentation/usb/usbmon.txt)0
-rw-r--r--Documentation/usb/wusb-design-overview.rst (renamed from Documentation/usb/WUSB-Design-overview.txt)0
-rw-r--r--Documentation/userspace-api/accelerators/ocxl.rst176
-rw-r--r--Documentation/userspace-api/index.rst1
-rw-r--r--Documentation/vfio-mediated-device.txt414
-rw-r--r--Documentation/virtual/index.rst18
-rw-r--r--Documentation/virtual/kvm/api.txt81
-rw-r--r--Documentation/virtual/kvm/arm/psci.txt31
-rw-r--r--Documentation/virtual/kvm/cpuid.rst107
-rw-r--r--Documentation/virtual/kvm/cpuid.txt83
-rw-r--r--Documentation/virtual/kvm/hypercalls.txt11
-rw-r--r--Documentation/virtual/kvm/index.rst11
-rw-r--r--Documentation/virtual/kvm/locking.txt4
-rw-r--r--Documentation/virtual/kvm/msr.txt9
-rw-r--r--Documentation/virtual/paravirt_ops.rst35
-rw-r--r--Documentation/virtual/paravirt_ops.txt32
-rw-r--r--Documentation/vm/hmm.rst166
-rw-r--r--Documentation/vm/memory-model.rst40
-rw-r--r--Documentation/vm/numa.rst4
-rw-r--r--Documentation/vm/page_migration.rst2
-rw-r--r--Documentation/vm/unevictable-lru.rst4
-rw-r--r--Documentation/w1/w1.netlink2
-rw-r--r--Documentation/watchdog/hpwdt.rst4
-rw-r--r--Documentation/watchdog/index.rst2
-rw-r--r--Documentation/watchdog/watchdog-parameters.rst11
-rw-r--r--Documentation/x86/exception-tables.rst2
-rw-r--r--Documentation/x86/index.rst2
-rw-r--r--Documentation/x86/intel-iommu.rst (renamed from Documentation/Intel-IOMMU.txt)0
-rw-r--r--Documentation/x86/intel_txt.rst (renamed from Documentation/intel_txt.txt)0
-rw-r--r--Documentation/x86/topology.rst6
-rw-r--r--Documentation/x86/x86_64/fake-numa-for-cpusets.rst4
-rw-r--r--Documentation/xilinx/index.rst17
-rw-r--r--Documentation/xtensa/atomctl.rst51
-rw-r--r--Documentation/xtensa/atomctl.txt44
-rw-r--r--Documentation/xtensa/booting.rst22
-rw-r--r--Documentation/xtensa/booting.txt19
-rw-r--r--Documentation/xtensa/index.rst12
-rw-r--r--Documentation/xtensa/mmu.rst195
-rw-r--r--Documentation/xtensa/mmu.txt189
1449 files changed, 84344 insertions, 72959 deletions
diff --git a/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra
index 16020b31ae64..5d41ebadf15e 100644
--- a/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra
+++ b/Documentation/ABI/obsolete/sysfs-driver-hid-roccat-pyra
@@ -5,7 +5,7 @@ Description: It is possible to switch the cpi setting of the mouse with the
press of a button.
When read, this file returns the raw number of the actual cpi
setting reported by the mouse. This number has to be further
- processed to receive the real dpi value.
+ processed to receive the real dpi value:
VALUE DPI
1 400
diff --git a/Documentation/ABI/obsolete/sysfs-gpio b/Documentation/ABI/obsolete/sysfs-gpio
index 40d41ea1a3f5..e0d4e5e2dd90 100644
--- a/Documentation/ABI/obsolete/sysfs-gpio
+++ b/Documentation/ABI/obsolete/sysfs-gpio
@@ -11,7 +11,7 @@ Description:
Kernel code may export it for complete or partial access.
GPIOs are identified as they are inside the kernel, using integers in
- the range 0..INT_MAX. See Documentation/gpio for more information.
+ the range 0..INT_MAX. See Documentation/admin-guide/gpio for more information.
/sys/class/gpio
/export ... asks the kernel to export a GPIO to userspace
diff --git a/Documentation/ABI/removed/sysfs-class-rfkill b/Documentation/ABI/removed/sysfs-class-rfkill
index 3ce6231f20b2..9c08c7f98ffb 100644
--- a/Documentation/ABI/removed/sysfs-class-rfkill
+++ b/Documentation/ABI/removed/sysfs-class-rfkill
@@ -1,6 +1,6 @@
rfkill - radio frequency (RF) connector kill switch support
-For details to this subsystem look at Documentation/rfkill.txt.
+For details to this subsystem look at Documentation/driver-api/rfkill.rst.
What: /sys/class/rfkill/rfkill[0-9]+/claim
Date: 09-Jul-2007
diff --git a/Documentation/ABI/stable/sysfs-class-infiniband b/Documentation/ABI/stable/sysfs-class-infiniband
index 17211ceb9bf4..aed21b8916a2 100644
--- a/Documentation/ABI/stable/sysfs-class-infiniband
+++ b/Documentation/ABI/stable/sysfs-class-infiniband
@@ -423,23 +423,6 @@ Description:
(e.g. driver restart on the VM which owns the VF).
-sysfs interface for NetEffect RNIC Low-Level iWARP driver (nes)
----------------------------------------------------------------
-
-What: /sys/class/infiniband/nesX/hw_rev
-What: /sys/class/infiniband/nesX/hca_type
-What: /sys/class/infiniband/nesX/board_id
-Date: Feb, 2008
-KernelVersion: v2.6.25
-Contact: linux-rdma@vger.kernel.org
-Description:
- hw_rev: (RO) Hardware revision number
-
- hca_type: (RO) Host Channel Adapter type (NEX020)
-
- board_id: (RO) Manufacturing board id
-
-
sysfs interface for Chelsio T4/T5 RDMA driver (cxgb4)
-----------------------------------------------------
diff --git a/Documentation/ABI/stable/sysfs-class-rfkill b/Documentation/ABI/stable/sysfs-class-rfkill
index 80151a409d67..5b154f922643 100644
--- a/Documentation/ABI/stable/sysfs-class-rfkill
+++ b/Documentation/ABI/stable/sysfs-class-rfkill
@@ -1,6 +1,6 @@
rfkill - radio frequency (RF) connector kill switch support
-For details to this subsystem look at Documentation/rfkill.txt.
+For details to this subsystem look at Documentation/driver-api/rfkill.rst.
For the deprecated /sys/class/rfkill/*/claim knobs of this interface look in
Documentation/ABI/removed/sysfs-class-rfkill.
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index f7ce68fbd4b9..df8413cf1468 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -61,7 +61,7 @@ Date: October 2002
Contact: Linux Memory Management list <linux-mm@kvack.org>
Description:
The node's hit/miss statistics, in units of pages.
- See Documentation/numastat.txt
+ See Documentation/admin-guide/numastat.rst
What: /sys/devices/system/node/nodeX/distance
Date: October 2002
diff --git a/Documentation/ABI/stable/sysfs-driver-mlxreg-io b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
index 156319fc5b80..8ca498447aeb 100644
--- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io
+++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io
@@ -1,5 +1,4 @@
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
- asic_health
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_health
Date: June 2018
KernelVersion: 4.19
@@ -9,9 +8,8 @@ Description: This file shows ASIC health status. The possible values are:
The files are read only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
- cpld1_version
- cpld2_version
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld1_version
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld2_version
Date: June 2018
KernelVersion: 4.19
Contact: Vadim Pasternak <vadimpmellanox.com>
@@ -20,8 +18,7 @@ Description: These files show with which CPLD versions have been burned
The files are read only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
- fan_dir
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/fan_dir
Date: December 2018
KernelVersion: 5.0
@@ -32,8 +29,7 @@ Description: This file shows the system fans direction:
The files are read only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
- jtag_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable
Date: November 2018
KernelVersion: 5.0
@@ -43,8 +39,7 @@ Description: These files show with which CPLD versions have been burned
The files are read only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
- jtag_enable
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable
Date: November 2018
KernelVersion: 5.0
@@ -87,16 +82,15 @@ Description: These files allow asserting system power cycling, switching
The files are write only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
- reset_aux_pwr_or_ref
- reset_asic_thermal
- reset_hotswap_or_halt
- reset_hotswap_or_wd
- reset_fw_reset
- reset_long_pb
- reset_main_pwr_fail
- reset_short_pb
- reset_sw_reset
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_aux_pwr_or_ref
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_asic_thermal
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_hotswap_or_halt
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_hotswap_or_wd
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_fw_reset
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_long_pb
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_main_pwr_fail
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_short_pb
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sw_reset
Date: June 2018
KernelVersion: 4.19
Contact: Vadim Pasternak <vadimpmellanox.com>
@@ -110,11 +104,10 @@ Description: These files show the system reset cause, as following: power
The files are read only.
-What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/
- reset_comex_pwr_fail
- reset_from_comex
- reset_system
- reset_voltmon_upgrade_fail
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_pwr_fail
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_comex
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_system
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_voltmon_upgrade_fail
Date: November 2018
KernelVersion: 5.0
@@ -127,3 +120,23 @@ Description: These files show the system reset cause, as following: ComEx
the last reset cause.
The files are read only.
+
+Date: June 2019
+KernelVersion: 5.3
+Contact: Vadim Pasternak <vadimpmellanox.com>
+Description: These files show the system reset cause, as following:
+ COMEX thermal shutdown; wathchdog power off or reset was derived
+ by one of the next components: COMEX, switch board or by Small Form
+ Factor mezzanine, reset requested from ASIC, reset cuased by BIOS
+ reload. Value 1 in file means this is reset cause, 0 - otherwise.
+ Only one of the above causes could be 1 at the same time, representing
+ only last reset cause.
+
+ The files are read only.
+
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_thermal
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_wd
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_asic
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_reload_bios
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sff_wd
+What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_swb_wd
diff --git a/Documentation/ABI/testing/debugfs-cec-error-inj b/Documentation/ABI/testing/debugfs-cec-error-inj
index 122b65c5fe62..4c3596c6d25b 100644
--- a/Documentation/ABI/testing/debugfs-cec-error-inj
+++ b/Documentation/ABI/testing/debugfs-cec-error-inj
@@ -1,6 +1,6 @@
What: /sys/kernel/debug/cec/*/error-inj
Date: March 2018
-Contact: Hans Verkuil <hans.verkuil@cisco.com>
+Contact: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Description:
The CEC Framework allows for CEC error injection commands through
diff --git a/Documentation/ABI/testing/debugfs-cros-ec b/Documentation/ABI/testing/debugfs-cros-ec
new file mode 100644
index 000000000000..1fe0add99a2a
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-cros-ec
@@ -0,0 +1,56 @@
+What: /sys/kernel/debug/<cros-ec-device>/console_log
+Date: September 2017
+KernelVersion: 4.13
+Description:
+ If the EC supports the CONSOLE_READ command type, this file
+ can be used to grab the EC logs. The kernel polls for the log
+ and keeps its own buffer but userspace should grab this and
+ write it out to some logs.
+
+What: /sys/kernel/debug/<cros-ec-device>/panicinfo
+Date: September 2017
+KernelVersion: 4.13
+Description:
+ This file dumps the EC panic information from the previous
+ reboot. This file will only exist if the PANIC_INFO command
+ type is supported by the EC.
+
+What: /sys/kernel/debug/<cros-ec-device>/pdinfo
+Date: June 2018
+KernelVersion: 4.17
+Description:
+ This file provides the port role, muxes and power debug
+ information for all the USB PD/type-C ports available. If
+ the are no ports available, this file will be just an empty
+ file.
+
+What: /sys/kernel/debug/<cros-ec-device>/uptime
+Date: June 2019
+KernelVersion: 5.3
+Description:
+ A u32 providing the time since EC booted in ms. This is
+ is used for synchronizing the AP host time with the EC
+ log. An error is returned if the command is not supported
+ by the EC or there is a communication problem.
+
+What: /sys/kernel/debug/<cros-ec-device>/last_resume_result
+Date: June 2019
+KernelVersion: 5.3
+Description:
+ Some ECs have a feature where they will track transitions to
+ the (Intel) processor's SLP_S0 line, in order to detect cases
+ where a system failed to go into S0ix. When the system resumes,
+ an EC with this feature will return a summary of SLP_S0
+ transitions that occurred. The last_resume_result file returns
+ the most recent response from the AP's resume message to the EC.
+
+ The bottom 31 bits contain a count of the number of SLP_S0
+ transitions that occurred since the suspend message was
+ received. Bit 31 is set if the EC attempted to wake the
+ system due to a timeout when watching for SLP_S0 transitions.
+ Callers can use this to detect a wake from the EC due to
+ S0ix timeouts. The result will be zero if no suspend
+ transitions have been attempted, or the EC does not support
+ this feature.
+
+ Output will be in the format: "0x%08x\n".
diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
index 2f5b80be07a3..f0ac14b70ecb 100644
--- a/Documentation/ABI/testing/debugfs-driver-habanalabs
+++ b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -3,7 +3,10 @@ Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
Description: Sets the device address to be used for read or write through
- PCI bar. The acceptable value is a string that starts with "0x"
+ PCI bar, or the device VA of a host mapped memory to be read or
+ written directly from the host. The latter option is allowed
+ only when the IOMMU is disabled.
+ The acceptable value is a string that starts with "0x"
What: /sys/kernel/debug/habanalabs/hl<n>/command_buffers
Date: Jan 2019
@@ -33,10 +36,12 @@ Contact: oded.gabbay@gmail.com
Description: Allows the root user to read or write directly through the
device's PCI bar. Writing to this file generates a write
transaction while reading from the file generates a read
- transcation. This custom interface is needed (instead of using
+ transaction. This custom interface is needed (instead of using
the generic Linux user-space PCI mapping) because the DDR bar
is very small compared to the DDR memory and only the driver can
- move the bar before and after the transaction
+ move the bar before and after the transaction.
+ If the IOMMU is disabled, it also allows the root user to read
+ or write from the host a device VA of a host mapped memory
What: /sys/kernel/debug/habanalabs/hl<n>/device
Date: Jan 2019
@@ -46,6 +51,13 @@ Description: Enables the root user to set the device to specific state.
Valid values are "disable", "enable", "suspend", "resume".
User can read this property to see the valid values
+What: /sys/kernel/debug/habanalabs/hl<n>/engines
+Date: Jul 2019
+KernelVersion: 5.3
+Contact: oded.gabbay@gmail.com
+Description: Displays the status registers values of the device engines and
+ their derived idle status
+
What: /sys/kernel/debug/habanalabs/hl<n>/i2c_addr
Date: Jan 2019
KernelVersion: 5.1
diff --git a/Documentation/ABI/testing/debugfs-wilco-ec b/Documentation/ABI/testing/debugfs-wilco-ec
index 73a5a66ddca6..9d8d9d2def5b 100644
--- a/Documentation/ABI/testing/debugfs-wilco-ec
+++ b/Documentation/ABI/testing/debugfs-wilco-ec
@@ -23,11 +23,9 @@ Description:
For writing, bytes 0-1 indicate the message type, one of enum
wilco_ec_msg_type. Byte 2+ consist of the data passed in the
- request, starting at MBOX[0]
-
- At least three bytes are required for writing, two for the type
- and at least a single byte of data. Only the first
- EC_MAILBOX_DATA_SIZE bytes of MBOX will be used.
+ request, starting at MBOX[0]. At least three bytes are required
+ for writing, two for the type and at least a single byte of
+ data.
Example:
// Request EC info type 3 (EC firmware build date)
@@ -40,7 +38,7 @@ Description:
$ cat /sys/kernel/debug/wilco_ec/raw
00 00 31 32 2f 32 31 2f 31 38 00 38 00 01 00 2f 00 ..12/21/18.8...
- Note that the first 32 bytes of the received MBOX[] will be
- printed, even if some of the data is junk. It is up to you to
- know how many of the first bytes of data are the actual
- response.
+ Note that the first 16 bytes of the received MBOX[] will be
+ printed, even if some of the data is junk, and skipping bytes
+ 17 to 32. It is up to you to know how many of the first bytes of
+ data are the actual response.
diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy
index 74c6702de74e..fc376a323908 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -24,11 +24,11 @@ Description:
[euid=] [fowner=] [fsname=]]
lsm: [[subj_user=] [subj_role=] [subj_type=]
[obj_user=] [obj_role=] [obj_type=]]
- option: [[appraise_type=]] [permit_directio]
-
+ option: [[appraise_type=]] [template=] [permit_directio]
base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
[FIRMWARE_CHECK]
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
+ [KEXEC_CMDLINE]
mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND]
[[^]MAY_EXEC]
fsmagic:= hex value
@@ -38,6 +38,8 @@ Description:
fowner:= decimal value
lsm: are LSM specific
option: appraise_type:= [imasig]
+ template:= name of a defined IMA template type
+ (eg, ima-ng). Only valid when action is "measure".
pcr:= decimal value
default policy:
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index abac31d216de..2c44b4f1b060 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -29,4 +29,4 @@ Description:
17 - sectors discarded
18 - time spent discarding
- For more details refer to Documentation/iostats.txt
+ For more details refer to Documentation/admin-guide/iostats.rst
diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup
index 0a54ed0d63c9..274df44d8b1b 100644
--- a/Documentation/ABI/testing/procfs-smaps_rollup
+++ b/Documentation/ABI/testing/procfs-smaps_rollup
@@ -3,18 +3,28 @@ Date: August 2017
Contact: Daniel Colascione <dancol@google.com>
Description:
This file provides pre-summed memory information for a
- process. The format is identical to /proc/pid/smaps,
+ process. The format is almost identical to /proc/pid/smaps,
except instead of an entry for each VMA in a process,
smaps_rollup has a single entry (tagged "[rollup]")
for which each field is the sum of the corresponding
fields from all the maps in /proc/pid/smaps.
- For more details, see the procfs man page.
+ Additionally, the fields Pss_Anon, Pss_File and Pss_Shmem
+ are not present in /proc/pid/smaps. These fields represent
+ the sum of the Pss field of each type (anon, file, shmem).
+ For more details, see Documentation/filesystems/proc.txt
+ and the procfs man page.
Typical output looks like this:
00100000-ff709000 ---p 00000000 00:00 0 [rollup]
+ Size: 1192 kB
+ KernelPageSize: 4 kB
+ MMUPageSize: 4 kB
Rss: 884 kB
Pss: 385 kB
+ Pss_Anon: 301 kB
+ Pss_File: 80 kB
+ Pss_Shmem: 4 kB
Shared_Clean: 696 kB
Shared_Dirty: 0 kB
Private_Clean: 120 kB
diff --git a/Documentation/ABI/testing/pstore b/Documentation/ABI/testing/pstore
index 5fca9f5e10a3..d45209abdb1b 100644
--- a/Documentation/ABI/testing/pstore
+++ b/Documentation/ABI/testing/pstore
@@ -1,6 +1,6 @@
-Where: /sys/fs/pstore/... (or /dev/pstore/...)
+What: /sys/fs/pstore/... (or /dev/pstore/...)
Date: March 2011
-Kernel Version: 2.6.39
+KernelVersion: 2.6.39
Contact: tony.luck@intel.com
Description: Generic interface to platform dependent persistent storage.
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index dfad7427817c..f8c7c7126bb1 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -15,7 +15,7 @@ Description:
9 - I/Os currently in progress
10 - time spent doing I/Os (ms)
11 - weighted time spent doing I/Os (ms)
- For more details refer Documentation/iostats.txt
+ For more details refer Documentation/admin-guide/iostats.rst
What: /sys/block/<disk>/<part>/stat
diff --git a/Documentation/ABI/testing/sysfs-block-device b/Documentation/ABI/testing/sysfs-block-device
index 82ef6eab042d..17f2bc7dd261 100644
--- a/Documentation/ABI/testing/sysfs-block-device
+++ b/Documentation/ABI/testing/sysfs-block-device
@@ -45,7 +45,7 @@ Description:
- Values below -2 are rejected with -EINVAL
For more information, see
- Documentation/laptops/disk-shock-protection.txt
+ Documentation/admin-guide/laptops/disk-shock-protection.rst
What: /sys/block/*/device/ncq_prio_enable
diff --git a/Documentation/ABI/testing/sysfs-bus-css b/Documentation/ABI/testing/sysfs-bus-css
index 2979c40c10e9..966f8504bd7b 100644
--- a/Documentation/ABI/testing/sysfs-bus-css
+++ b/Documentation/ABI/testing/sysfs-bus-css
@@ -33,3 +33,26 @@ Description: Contains the PIM/PAM/POM values, as reported by the
in sync with the values current in the channel subsystem).
Note: This is an I/O-subchannel specific attribute.
Users: s390-tools, HAL
+
+What: /sys/bus/css/devices/.../driver_override
+Date: June 2019
+Contact: Cornelia Huck <cohuck@redhat.com>
+ linux-s390@vger.kernel.org
+Description: This file allows the driver for a device to be specified. When
+ specified, only a driver with a name matching the value written
+ to driver_override will have an opportunity to bind to the
+ device. The override is specified by writing a string to the
+ driver_override file (echo vfio-ccw > driver_override) and
+ may be cleared with an empty string (echo > driver_override).
+ This returns the device to standard matching rules binding.
+ Writing to driver_override does not automatically unbind the
+ device from its current driver or make any attempt to
+ automatically load the specified driver. If no driver with a
+ matching name is currently loaded in the kernel, the device
+ will not bind to any driver. This also allows devices to
+ opt-out of driver binding using a driver_override name such as
+ "none". Only a single driver may be specified in the override,
+ there is no support for parsing delimiters.
+ Note that unlike the mechanism of the same name for pci, this
+ file does not allow to override basic matching rules. I.e.,
+ the driver must still match the subchannel type of the device.
diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
index 77f47ff5ee02..5bb793ec926c 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-format
@@ -1,6 +1,6 @@
-Where: /sys/bus/event_source/devices/<dev>/format
+What: /sys/bus/event_source/devices/<dev>/format
Date: January 2012
-Kernel Version: 3.3
+KernelVersion: 3.3
Contact: Jiri Olsa <jolsa@redhat.com>
Description:
Attribute group to describe the magic bits that go into
diff --git a/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352 b/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352
index feb2e4a87075..4a251b7f11e4 100644
--- a/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352
+++ b/Documentation/ABI/testing/sysfs-bus-i2c-devices-hm6352
@@ -1,20 +1,20 @@
-Where: /sys/bus/i2c/devices/.../heading0_input
+What: /sys/bus/i2c/devices/.../heading0_input
Date: April 2010
-Kernel Version: 2.6.36?
+KernelVersion: 2.6.36?
Contact: alan.cox@intel.com
Description: Reports the current heading from the compass as a floating
point value in degrees.
-Where: /sys/bus/i2c/devices/.../power_state
+What: /sys/bus/i2c/devices/.../power_state
Date: April 2010
-Kernel Version: 2.6.36?
+KernelVersion: 2.6.36?
Contact: alan.cox@intel.com
Description: Sets the power state of the device. 0 sets the device into
sleep mode, 1 wakes it up.
-Where: /sys/bus/i2c/devices/.../calibration
+What: /sys/bus/i2c/devices/.../calibration
Date: April 2010
-Kernel Version: 2.6.36?
+KernelVersion: 2.6.36?
Contact: alan.cox@intel.com
Description: Sets the calibration on or off (1 = on, 0 = off). See the
chip data sheet.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 6aef7dbbde44..680451695422 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -61,8 +61,11 @@ What: /sys/bus/iio/devices/triggerX/sampling_frequency_available
KernelVersion: 2.6.35
Contact: linux-iio@vger.kernel.org
Description:
- When the internal sampling clock can only take a small
- discrete set of values, this file lists those available.
+ When the internal sampling clock can only take a specific set of
+ frequencies, we can specify the available values with:
+ - a small discrete set of values like "0 2 4 6 8"
+ - a range with minimum, step and maximum frequencies like
+ "[min step max]"
What: /sys/bus/iio/devices/iio:deviceX/oversampling_ratio
KernelVersion: 2.6.38
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-cros-ec b/Documentation/ABI/testing/sysfs-bus-iio-cros-ec
index 0e95c2ca105c..6158f831c761 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-cros-ec
+++ b/Documentation/ABI/testing/sysfs-bus-iio-cros-ec
@@ -18,11 +18,11 @@ Description:
values are 'base' and 'lid'.
What: /sys/bus/iio/devices/iio:deviceX/id
-Date: Septembre 2017
+Date: September 2017
KernelVersion: 4.14
Contact: linux-iio@vger.kernel.org
Description:
- This attribute is exposed by the CrOS EC legacy accelerometer
- driver and represents the sensor ID as exposed by the EC. This
- ID is used by the Android sensor service hardware abstraction
- layer (sensor HAL) through the Android container on ChromeOS.
+ This attribute is exposed by the CrOS EC sensors driver and
+ represents the sensor ID as exposed by the EC. This ID is used
+ by the Android sensor service hardware abstraction layer (sensor
+ HAL) through the Android container on ChromeOS.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08 b/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08
index 0a1ca1487fa9..a133fd8d081a 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08
+++ b/Documentation/ABI/testing/sysfs-bus-iio-distance-srf08
@@ -1,4 +1,4 @@
-What /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
+What: /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
Date: January 2017
KernelVersion: 4.11
Contact: linux-iio@vger.kernel.org
@@ -6,7 +6,7 @@ Description:
Show or set the gain boost of the amp, from 0-31 range.
default 31
-What /sys/bus/iio/devices/iio:deviceX/sensor_max_range
+What: /sys/bus/iio/devices/iio:deviceX/sensor_max_range
Date: January 2017
KernelVersion: 4.11
Contact: linux-iio@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-frequency-adf4371 b/Documentation/ABI/testing/sysfs-bus-iio-frequency-adf4371
new file mode 100644
index 000000000000..302de64cb424
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-iio-frequency-adf4371
@@ -0,0 +1,44 @@
+What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_frequency
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Stores the PLL frequency in Hz for channel Y.
+ Reading returns the actual frequency in Hz.
+ The ADF4371 has an integrated VCO with fundamendal output
+ frequency ranging from 4000000000 Hz 8000000000 Hz.
+
+ out_altvoltage0_frequency:
+ A divide by 1, 2, 4, 8, 16, 32 or circuit generates
+ frequencies from 62500000 Hz to 8000000000 Hz.
+ out_altvoltage1_frequency:
+ This channel duplicates the channel 0 frequency
+ out_altvoltage2_frequency:
+ A frequency doubler generates frequencies from
+ 8000000000 Hz to 16000000000 Hz.
+ out_altvoltage3_frequency:
+ A frequency quadrupler generates frequencies from
+ 16000000000 Hz to 32000000000 Hz.
+
+ Note: writes to one of the channels will affect the frequency of
+ all the other channels, since it involves changing the VCO
+ fundamental output frequency.
+
+What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_name
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ Reading returns the datasheet name for channel Y:
+
+ out_altvoltage0_name: RF8x
+ out_altvoltage1_name: RFAUX8x
+ out_altvoltage2_name: RF16x
+ out_altvoltage3_name: RF32x
+
+What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_powerdown
+KernelVersion:
+Contact: linux-iio@vger.kernel.org
+Description:
+ This attribute allows the user to power down the PLL and it's
+ RFOut buffers.
+ Writing 1 causes the specified channel to power down.
+ Clearing returns to normal operation.
diff --git a/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935 b/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935
index 9a17ab5036a4..c59d95346341 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935
+++ b/Documentation/ABI/testing/sysfs-bus-iio-proximity-as3935
@@ -1,4 +1,4 @@
-What /sys/bus/iio/devices/iio:deviceX/in_proximity_input
+What: /sys/bus/iio/devices/iio:deviceX/in_proximity_input
Date: March 2014
KernelVersion: 3.15
Contact: Matt Ranostay <matt.ranostay@konsulko.com>
@@ -6,7 +6,7 @@ Description:
Get the current distance in meters of storm (1km steps)
1000-40000 = distance in meters
-What /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
+What: /sys/bus/iio/devices/iio:deviceX/sensor_sensitivity
Date: March 2014
KernelVersion: 3.15
Contact: Matt Ranostay <matt.ranostay@konsulko.com>
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
index 4b0318c99507..3c9a8c4a25eb 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
@@ -9,9 +9,9 @@ errors may be "seen" / reported by the link partner and not the
problematic endpoint itself (which may report all counters as 0 as it never
saw any problems).
-Where: /sys/bus/pci/devices/<dev>/aer_dev_correctable
+What: /sys/bus/pci/devices/<dev>/aer_dev_correctable
Date: July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: List of correctable errors seen and reported by this
PCI device using ERR_COR. Note that since multiple errors may
@@ -31,9 +31,9 @@ Header Log Overflow 0
TOTAL_ERR_COR 2
-------------------------------------------------------------------------
-Where: /sys/bus/pci/devices/<dev>/aer_dev_fatal
+What: /sys/bus/pci/devices/<dev>/aer_dev_fatal
Date: July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: List of uncorrectable fatal errors seen and reported by this
PCI device using ERR_FATAL. Note that since multiple errors may
@@ -62,9 +62,9 @@ TLP Prefix Blocked Error 0
TOTAL_ERR_FATAL 0
-------------------------------------------------------------------------
-Where: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal
+What: /sys/bus/pci/devices/<dev>/aer_dev_nonfatal
Date: July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: List of uncorrectable nonfatal errors seen and reported by this
PCI device using ERR_NONFATAL. Note that since multiple errors
@@ -103,20 +103,20 @@ collectors) that are AER capable. These indicate the number of error messages as
device, so these counters include them and are thus cumulative of all the error
messages on the PCI hierarchy originating at that root port.
-Where: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_cor
+What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_cor
Date: July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: Total number of ERR_COR messages reported to rootport.
-Where: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_fatal
+What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_fatal
Date: July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: Total number of ERR_FATAL messages reported to rootport.
-Where: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_nonfatal
+What: /sys/bus/pci/devices/<dev>/aer_stats/aer_rootport_total_err_nonfatal
Date: July 2018
-Kernel Version: 4.19.0
+KernelVersion: 4.19.0
Contact: linux-pci@vger.kernel.org, rajatja@google.com
Description: Total number of ERR_NONFATAL messages reported to rootport.
diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
index 53d99edd1d75..92a94e1068c2 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
@@ -1,68 +1,68 @@
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/model
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/model
Date: March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
Contact: iss_storagedev@hp.com
Description: Displays the SCSI INQUIRY page 0 model for logical drive
Y of controller X.
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
Date: March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
Contact: iss_storagedev@hp.com
Description: Displays the SCSI INQUIRY page 0 revision for logical
drive Y of controller X.
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
Date: March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
Contact: iss_storagedev@hp.com
Description: Displays the SCSI INQUIRY page 83 serial number for logical
drive Y of controller X.
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
Date: March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
Contact: iss_storagedev@hp.com
Description: Displays the SCSI INQUIRY page 0 vendor for logical drive
Y of controller X.
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
Date: March 2009
-Kernel Version: 2.6.30
+KernelVersion: 2.6.30
Contact: iss_storagedev@hp.com
Description: A symbolic link to /sys/block/cciss!cXdY
-Where: /sys/bus/pci/devices/<dev>/ccissX/rescan
+What: /sys/bus/pci/devices/<dev>/ccissX/rescan
Date: August 2009
-Kernel Version: 2.6.31
+KernelVersion: 2.6.31
Contact: iss_storagedev@hp.com
Description: Kicks of a rescan of the controller to discover logical
drive topology changes.
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/lunid
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/lunid
Date: August 2009
-Kernel Version: 2.6.31
+KernelVersion: 2.6.31
Contact: iss_storagedev@hp.com
Description: Displays the 8-byte LUN ID used to address logical
drive Y of controller X.
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/raid_level
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/raid_level
Date: August 2009
-Kernel Version: 2.6.31
+KernelVersion: 2.6.31
Contact: iss_storagedev@hp.com
Description: Displays the RAID level of logical drive Y of
controller X.
-Where: /sys/bus/pci/devices/<dev>/ccissX/cXdY/usage_count
+What: /sys/bus/pci/devices/<dev>/ccissX/cXdY/usage_count
Date: August 2009
-Kernel Version: 2.6.31
+KernelVersion: 2.6.31
Contact: iss_storagedev@hp.com
Description: Displays the usage count (number of opens) of logical drive Y
of controller X.
-Where: /sys/bus/pci/devices/<dev>/ccissX/resettable
+What: /sys/bus/pci/devices/<dev>/ccissX/resettable
Date: February 2011
-Kernel Version: 2.6.38
+KernelVersion: 2.6.38
Contact: iss_storagedev@hp.com
Description: Value of 1 indicates the controller can honor the reset_devices
kernel parameter. Value of 0 indicates reset_devices cannot be
@@ -71,9 +71,9 @@ Description: Value of 1 indicates the controller can honor the reset_devices
a dump device, as kdump requires resetting the device in order
to work reliably.
-Where: /sys/bus/pci/devices/<dev>/ccissX/transport_mode
+What: /sys/bus/pci/devices/<dev>/ccissX/transport_mode
Date: July 2011
-Kernel Version: 3.0
+KernelVersion: 3.0
Contact: iss_storagedev@hp.com
Description: Value of "simple" indicates that the controller has been placed
in "simple mode". Value of "performant" indicates that the
diff --git a/Documentation/ABI/testing/sysfs-bus-siox b/Documentation/ABI/testing/sysfs-bus-siox
index fed7c3765a4e..c2a403f20b90 100644
--- a/Documentation/ABI/testing/sysfs-bus-siox
+++ b/Documentation/ABI/testing/sysfs-bus-siox
@@ -1,6 +1,6 @@
What: /sys/bus/siox/devices/siox-X/active
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
On reading represents the current state of the bus. If it
contains a "0" the bus is stopped and connected devices are
@@ -12,7 +12,7 @@ Description:
What: /sys/bus/siox/devices/siox-X/device_add
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Write-only file. Write
@@ -27,13 +27,13 @@ Description:
What: /sys/bus/siox/devices/siox-X/device_remove
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Write-only file. A single write removes the last device in the siox chain.
What: /sys/bus/siox/devices/siox-X/poll_interval_ns
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Defines the interval between two poll cycles in nano seconds.
Note this is rounded to jiffies on writing. On reading the current value
@@ -41,33 +41,33 @@ Description:
What: /sys/bus/siox/devices/siox-X-Y/connected
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Read-only value. "0" means the Yth device on siox bus X isn't "connected" i.e.
communication with it is not ensured. "1" signals a working connection.
What: /sys/bus/siox/devices/siox-X-Y/inbytes
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Read-only value reporting the inbytes value provided to siox-X/device_add
What: /sys/bus/siox/devices/siox-X-Y/status_errors
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Counts the number of time intervals when the read status byte doesn't yield the
expected value.
What: /sys/bus/siox/devices/siox-X-Y/type
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Read-only value reporting the type value provided to siox-X/device_add.
What: /sys/bus/siox/devices/siox-X-Y/watchdog
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Read-only value reporting if the watchdog of the siox device is
active. "0" means the watchdog is not active and the device is expected to
@@ -75,13 +75,13 @@ Description:
What: /sys/bus/siox/devices/siox-X-Y/watchdog_errors
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Read-only value reporting the number to time intervals when the
watchdog was active.
What: /sys/bus/siox/devices/siox-X-Y/outbytes
KernelVersion: 4.16
-Contact: Gavin Schenk <g.schenk@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Contact: Thorsten Scherer <t.scherer@eckelmann.de>, Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Description:
Read-only value reporting the outbytes value provided to siox-X/device_add.
diff --git a/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
index 70d00dfa443d..9ade80f81f96 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
+++ b/Documentation/ABI/testing/sysfs-bus-usb-devices-usbsevseg
@@ -1,14 +1,14 @@
-Where: /sys/bus/usb/.../powered
+What: /sys/bus/usb/.../powered
Date: August 2008
-Kernel Version: 2.6.26
+KernelVersion: 2.6.26
Contact: Harrison Metzger <harrisonmetz@gmail.com>
Description: Controls whether the device's display will powered.
A value of 0 is off and a non-zero value is on.
-Where: /sys/bus/usb/.../mode_msb
-Where: /sys/bus/usb/.../mode_lsb
+What: /sys/bus/usb/.../mode_msb
+What: /sys/bus/usb/.../mode_lsb
Date: August 2008
-Kernel Version: 2.6.26
+KernelVersion: 2.6.26
Contact: Harrison Metzger <harrisonmetz@gmail.com>
Description: Controls the devices display mode.
For a 6 character display the values are
@@ -16,24 +16,24 @@ Description: Controls the devices display mode.
for an 8 character display the values are
MSB 0x08; LSB 0xFF.
-Where: /sys/bus/usb/.../textmode
+What: /sys/bus/usb/.../textmode
Date: August 2008
-Kernel Version: 2.6.26
+KernelVersion: 2.6.26
Contact: Harrison Metzger <harrisonmetz@gmail.com>
Description: Controls the way the device interprets its text buffer.
raw: each character controls its segment manually
hex: each character is between 0-15
ascii: each character is between '0'-'9' and 'A'-'F'.
-Where: /sys/bus/usb/.../text
+What: /sys/bus/usb/.../text
Date: August 2008
-Kernel Version: 2.6.26
+KernelVersion: 2.6.26
Contact: Harrison Metzger <harrisonmetz@gmail.com>
Description: The text (or data) for the device to display
-Where: /sys/bus/usb/.../decimals
+What: /sys/bus/usb/.../decimals
Date: August 2008
-Kernel Version: 2.6.26
+KernelVersion: 2.6.26
Contact: Harrison Metzger <harrisonmetz@gmail.com>
Description: Controls the decimal places on the device.
To set the nth decimal place, give this field
diff --git a/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
index 77cf7ac949af..c0e0a9ae7b3d 100644
--- a/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
+++ b/Documentation/ABI/testing/sysfs-class-backlight-driver-lm3533
@@ -4,7 +4,7 @@ KernelVersion: 3.5
Contact: Johan Hovold <jhovold@gmail.com>
Description:
Get the ALS output channel used as input in
- ALS-current-control mode (0, 1), where
+ ALS-current-control mode (0, 1), where:
0 - out_current0 (backlight 0)
1 - out_current1 (backlight 1)
@@ -28,7 +28,7 @@ Date: April 2012
KernelVersion: 3.5
Contact: Johan Hovold <jhovold@gmail.com>
Description:
- Set the brightness-mapping mode (0, 1), where
+ Set the brightness-mapping mode (0, 1), where:
0 - exponential mode
1 - linear mode
@@ -38,7 +38,7 @@ Date: April 2012
KernelVersion: 3.5
Contact: Johan Hovold <jhovold@gmail.com>
Description:
- Set the PWM-input control mask (5 bits), where
+ Set the PWM-input control mask (5 bits), where:
bit 5 - PWM-input enabled in Zone 4
bit 4 - PWM-input enabled in Zone 3
diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index bbbabffc682a..7970e3713e70 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -1,6 +1,6 @@
-Note: Attributes that are shared between devices are stored in the directory
-pointed to by the symlink device/.
-Example: The real path of the attribute /sys/class/cxl/afu0.0s/irqs_max is
+Please note that attributes that are shared between devices are stored in
+the directory pointed to by the symlink device/.
+For example, the real path of the attribute /sys/class/cxl/afu0.0s/irqs_max is
/sys/class/cxl/afu0.0s/device/irqs_max, i.e. /sys/class/cxl/afu0.0/irqs_max.
diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq
index ee39acacf6f8..01196e19afca 100644
--- a/Documentation/ABI/testing/sysfs-class-devfreq
+++ b/Documentation/ABI/testing/sysfs-class-devfreq
@@ -47,7 +47,7 @@ Description:
What: /sys/class/devfreq/.../trans_stat
Date: October 2012
Contact: MyungJoo Ham <myungjoo.ham@samsung.com>
-Descrtiption:
+Description:
This ABI shows the statistics of devfreq behavior on a
specific device. It shows the time spent in each state and
the number of transitions between states.
diff --git a/Documentation/ABI/testing/sysfs-class-led-driver-lm3533 b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
index 620ebb3b9baa..e4c89b261546 100644
--- a/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
+++ b/Documentation/ABI/testing/sysfs-class-led-driver-lm3533
@@ -4,7 +4,7 @@ KernelVersion: 3.5
Contact: Johan Hovold <jhovold@gmail.com>
Description:
Set the ALS output channel to use as input in
- ALS-current-control mode (1, 2), where
+ ALS-current-control mode (1, 2), where:
1 - out_current1
2 - out_current2
@@ -22,7 +22,7 @@ Date: April 2012
KernelVersion: 3.5
Contact: Johan Hovold <jhovold@gmail.com>
Description:
- Set the pattern generator fall and rise times (0..7), where
+ Set the pattern generator fall and rise times (0..7), where:
0 - 2048 us
1 - 262 ms
@@ -45,7 +45,7 @@ Date: April 2012
KernelVersion: 3.5
Contact: Johan Hovold <jhovold@gmail.com>
Description:
- Set the brightness-mapping mode (0, 1), where
+ Set the brightness-mapping mode (0, 1), where:
0 - exponential mode
1 - linear mode
@@ -55,7 +55,7 @@ Date: April 2012
KernelVersion: 3.5
Contact: Johan Hovold <jhovold@gmail.com>
Description:
- Set the PWM-input control mask (5 bits), where
+ Set the PWM-input control mask (5 bits), where:
bit 5 - PWM-input enabled in Zone 4
bit 4 - PWM-input enabled in Zone 3
diff --git a/Documentation/ABI/testing/sysfs-class-leds-gt683r b/Documentation/ABI/testing/sysfs-class-leds-gt683r
index e4fae6026e79..6adab27f646e 100644
--- a/Documentation/ABI/testing/sysfs-class-leds-gt683r
+++ b/Documentation/ABI/testing/sysfs-class-leds-gt683r
@@ -5,7 +5,7 @@ Contact: Janne Kanniainen <janne.kanniainen@gmail.com>
Description:
Set the mode of LEDs. You should notice that changing the mode
of one LED will update the mode of its two sibling devices as
- well.
+ well. Possible values are:
0 - normal
1 - audio
@@ -13,4 +13,4 @@ Description:
Normal: LEDs are fully on when enabled
Audio: LEDs brightness depends on sound level
- Breathing: LEDs brightness varies at human breathing rate \ No newline at end of file
+ Breathing: LEDs brightness varies at human breathing rate
diff --git a/Documentation/ABI/testing/sysfs-class-net-phydev b/Documentation/ABI/testing/sysfs-class-net-phydev
index 2a5723343aba..206cbf538b59 100644
--- a/Documentation/ABI/testing/sysfs-class-net-phydev
+++ b/Documentation/ABI/testing/sysfs-class-net-phydev
@@ -41,3 +41,11 @@ Description:
xgmii, moca, qsgmii, trgmii, 1000base-x, 2500base-x, rxaui,
xaui, 10gbase-kr, unknown
+What: /sys/class/mdio_bus/<bus>/<device>/phy_standalone
+Date: May 2019
+KernelVersion: 5.3
+Contact: netdev@vger.kernel.org
+Description:
+ Boolean value indicating whether the PHY device is used in
+ standalone mode, without a net_device associated, by PHYLINK.
+ Attribute created only when this is the case.
diff --git a/Documentation/ABI/testing/sysfs-class-net-qmi b/Documentation/ABI/testing/sysfs-class-net-qmi
index 7122d6264c49..c310db4ccbc2 100644
--- a/Documentation/ABI/testing/sysfs-class-net-qmi
+++ b/Documentation/ABI/testing/sysfs-class-net-qmi
@@ -29,7 +29,7 @@ Contact: Bjørn Mork <bjorn@mork.no>
Description:
Unsigned integer.
- Write a number ranging from 1 to 127 to add a qmap mux
+ Write a number ranging from 1 to 254 to add a qmap mux
based network device, supported by recent Qualcomm based
modems.
@@ -46,5 +46,5 @@ Contact: Bjørn Mork <bjorn@mork.no>
Description:
Unsigned integer.
- Write a number ranging from 1 to 127 to delete a previously
+ Write a number ranging from 1 to 254 to delete a previously
created qmap mux based network device.
diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power
index b77e30b9014e..27edc06e2495 100644
--- a/Documentation/ABI/testing/sysfs-class-power
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -376,10 +376,42 @@ Description:
supply. Normally this is configured based on the type of
connection made (e.g. A configured SDP should output a maximum
of 500mA so the input current limit is set to the same value).
+ Use preferably input_power_limit, and for problems that can be
+ solved using power limit use input_current_limit.
Access: Read, Write
Valid values: Represented in microamps
+What: /sys/class/power_supply/<supply_name>/input_voltage_limit
+Date: May 2019
+Contact: linux-pm@vger.kernel.org
+Description:
+ This entry configures the incoming VBUS voltage limit currently
+ set in the supply. Normally this is configured based on
+ system-level knowledge or user input (e.g. This is part of the
+ Pixel C's thermal management strategy to effectively limit the
+ input power to 5V when the screen is on to meet Google's skin
+ temperature targets). Note that this feature should not be
+ used for safety critical things.
+ Use preferably input_power_limit, and for problems that can be
+ solved using power limit use input_voltage_limit.
+
+ Access: Read, Write
+ Valid values: Represented in microvolts
+
+What: /sys/class/power_supply/<supply_name>/input_power_limit
+Date: May 2019
+Contact: linux-pm@vger.kernel.org
+Description:
+ This entry configures the incoming power limit currently set
+ in the supply. Normally this is configured based on
+ system-level knowledge or user input. Use preferably this
+ feature to limit the incoming power and use current/voltage
+ limit only for problems that can be solved using power limit.
+
+ Access: Read, Write
+ Valid values: Represented in microwatts
+
What: /sys/class/power_supply/<supply_name>/online,
Date: May 2007
Contact: linux-pm@vger.kernel.org
diff --git a/Documentation/ABI/testing/sysfs-class-power-wilco b/Documentation/ABI/testing/sysfs-class-power-wilco
new file mode 100644
index 000000000000..da1d6ffe5e3c
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power-wilco
@@ -0,0 +1,30 @@
+What: /sys/class/power_supply/wilco-charger/charge_type
+Date: April 2019
+KernelVersion: 5.2
+Description:
+ What charging algorithm to use:
+
+ Standard: Fully charges battery at a standard rate.
+ Adaptive: Battery settings adaptively optimized based on
+ typical battery usage pattern.
+ Fast: Battery charges over a shorter period.
+ Trickle: Extends battery lifespan, intended for users who
+ primarily use their Chromebook while connected to AC.
+ Custom: A low and high threshold percentage is specified.
+ Charging begins when level drops below
+ charge_control_start_threshold, and ceases when
+ level is above charge_control_end_threshold.
+
+What: /sys/class/power_supply/wilco-charger/charge_control_start_threshold
+Date: April 2019
+KernelVersion: 5.2
+Description:
+ Used when charge_type="Custom", as described above. Measured in
+ percentages. The valid range is [50, 95].
+
+What: /sys/class/power_supply/wilco-charger/charge_control_end_threshold
+Date: April 2019
+KernelVersion: 5.2
+Description:
+ Used when charge_type="Custom", as described above. Measured in
+ percentages. The valid range is [55, 100].
diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap
index db3b3ff70d84..ca491ec4e693 100644
--- a/Documentation/ABI/testing/sysfs-class-powercap
+++ b/Documentation/ABI/testing/sysfs-class-powercap
@@ -5,7 +5,7 @@ Contact: linux-pm@vger.kernel.org
Description:
The powercap/ class sub directory belongs to the power cap
subsystem. Refer to
- Documentation/power/powercap/powercap.txt for details.
+ Documentation/power/powercap/powercap.rst for details.
What: /sys/class/powercap/<control type>
Date: September 2013
@@ -147,6 +147,6 @@ What: /sys/class/powercap/.../<power zone>/enabled
Date: September 2013
KernelVersion: 3.13
Contact: linux-pm@vger.kernel.org
-Description
+Description:
This allows to enable/disable power capping at power zone level.
This applies to current power zone and its children.
diff --git a/Documentation/ABI/testing/sysfs-class-switchtec b/Documentation/ABI/testing/sysfs-class-switchtec
index 48cb4c15e430..76c7a661a595 100644
--- a/Documentation/ABI/testing/sysfs-class-switchtec
+++ b/Documentation/ABI/testing/sysfs-class-switchtec
@@ -1,6 +1,6 @@
switchtec - Microsemi Switchtec PCI Switch Management Endpoint
-For details on this subsystem look at Documentation/switchtec.txt.
+For details on this subsystem look at Documentation/driver-api/switchtec.rst.
What: /sys/class/switchtec
Date: 05-Jan-2017
diff --git a/Documentation/ABI/testing/sysfs-class-uwb_rc b/Documentation/ABI/testing/sysfs-class-uwb_rc
index 85f4875d16ac..a0578751c1e3 100644
--- a/Documentation/ABI/testing/sysfs-class-uwb_rc
+++ b/Documentation/ABI/testing/sysfs-class-uwb_rc
@@ -125,12 +125,6 @@ Description:
The EUI-48 of this device in colon separated hex
octets.
-What: /sys/class/uwb_rc/uwbN/<EUI-48>/BPST
-Date: July 2008
-KernelVersion: 2.6.27
-Contact: linux-usb@vger.kernel.org
-Description:
-
What: /sys/class/uwb_rc/uwbN/<EUI-48>/IEs
Date: July 2008
KernelVersion: 2.6.27
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 87478ac6c2af..5f7d7b14fa44 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -34,7 +34,7 @@ Description: CPU topology files that describe kernel limits related to
present: cpus that have been identified as being present in
the system.
- See Documentation/cputopology.txt for more information.
+ See Documentation/admin-guide/cputopology.rst for more information.
What: /sys/devices/system/cpu/probe
@@ -103,7 +103,7 @@ Description: CPU topology files that describe a logical CPU's relationship
thread_siblings_list: human-readable list of cpu#'s hardware
threads within the same core as cpu#
- See Documentation/cputopology.txt for more information.
+ See Documentation/admin-guide/cputopology.rst for more information.
What: /sys/devices/system/cpu/cpuidle/current_driver
@@ -539,3 +539,26 @@ Description: Intel Energy and Performance Bias Hint (EPB)
This attribute is present for all online CPUs supporting the
Intel EPB feature.
+
+What: /sys/devices/system/cpu/umwait_control
+ /sys/devices/system/cpu/umwait_control/enable_c02
+ /sys/devices/system/cpu/umwait_control/max_time
+Date: May 2019
+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description: Umwait control
+
+ enable_c02: Read/write interface to control umwait C0.2 state
+ Read returns C0.2 state status:
+ 0: C0.2 is disabled
+ 1: C0.2 is enabled
+
+ Write 'y' or '1' or 'on' to enable C0.2 state.
+ Write 'n' or '0' or 'off' to disable C0.2 state.
+
+ The interface is case insensitive.
+
+ max_time: Read/write interface to control umwait maximum time
+ in TSC-quanta that the CPU can reside in either C0.1
+ or C0.2 state. The time is an unsigned 32-bit number.
+ Note that a value of zero means there is no limit.
+ Low order two bits must be zero.
diff --git a/Documentation/ABI/testing/sysfs-driver-altera-cvp b/Documentation/ABI/testing/sysfs-driver-altera-cvp
index 8cde64a71edb..fbd8078fd7ad 100644
--- a/Documentation/ABI/testing/sysfs-driver-altera-cvp
+++ b/Documentation/ABI/testing/sysfs-driver-altera-cvp
@@ -1,6 +1,6 @@
What: /sys/bus/pci/drivers/altera-cvp/chkcfg
Date: May 2017
-Kernel Version: 4.13
+KernelVersion: 4.13
Contact: Anatolij Gustschin <agust@denx.de>
Description:
Contains either 1 or 0 and controls if configuration
diff --git a/Documentation/ABI/testing/sysfs-driver-habanalabs b/Documentation/ABI/testing/sysfs-driver-habanalabs
index 78b2bcf316a3..f433fc6db3c6 100644
--- a/Documentation/ABI/testing/sysfs-driver-habanalabs
+++ b/Documentation/ABI/testing/sysfs-driver-habanalabs
@@ -62,18 +62,20 @@ What: /sys/class/habanalabs/hl<n>/ic_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
-Description: Allows the user to set the maximum clock frequency of the
- Interconnect fabric. Writes to this parameter affect the device
- only when the power management profile is set to "manual" mode.
- The device IC clock might be set to lower value then the
+Description: Allows the user to set the maximum clock frequency, in Hz, of
+ the Interconnect fabric. Writes to this parameter affect the
+ device only when the power management profile is set to "manual"
+ mode. The device IC clock might be set to lower value than the
maximum. The user should read the ic_clk_curr to see the actual
- frequency value of the IC
+ frequency value of the IC. This property is valid only for the
+ Goya ASIC family
What: /sys/class/habanalabs/hl<n>/ic_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
-Description: Displays the current clock frequency of the Interconnect fabric
+Description: Displays the current clock frequency, in Hz, of the Interconnect
+ fabric. This property is valid only for the Goya ASIC family
What: /sys/class/habanalabs/hl<n>/infineon_ver
Date: Jan 2019
@@ -92,18 +94,20 @@ What: /sys/class/habanalabs/hl<n>/mme_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
-Description: Allows the user to set the maximum clock frequency of the
- MME compute engine. Writes to this parameter affect the device
- only when the power management profile is set to "manual" mode.
- The device MME clock might be set to lower value then the
+Description: Allows the user to set the maximum clock frequency, in Hz, of
+ the MME compute engine. Writes to this parameter affect the
+ device only when the power management profile is set to "manual"
+ mode. The device MME clock might be set to lower value than the
maximum. The user should read the mme_clk_curr to see the actual
- frequency value of the MME
+ frequency value of the MME. This property is valid only for the
+ Goya ASIC family
What: /sys/class/habanalabs/hl<n>/mme_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
-Description: Displays the current clock frequency of the MME compute engine
+Description: Displays the current clock frequency, in Hz, of the MME compute
+ engine. This property is valid only for the Goya ASIC family
What: /sys/class/habanalabs/hl<n>/pci_addr
Date: Jan 2019
@@ -163,18 +167,20 @@ What: /sys/class/habanalabs/hl<n>/tpc_clk
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
-Description: Allows the user to set the maximum clock frequency of the
- TPC compute engines. Writes to this parameter affect the device
- only when the power management profile is set to "manual" mode.
- The device TPC clock might be set to lower value then the
+Description: Allows the user to set the maximum clock frequency, in Hz, of
+ the TPC compute engines. Writes to this parameter affect the
+ device only when the power management profile is set to "manual"
+ mode. The device TPC clock might be set to lower value than the
maximum. The user should read the tpc_clk_curr to see the actual
- frequency value of the TPC
+ frequency value of the TPC. This property is valid only for
+ Goya ASIC family
What: /sys/class/habanalabs/hl<n>/tpc_clk_curr
Date: Jan 2019
KernelVersion: 5.1
Contact: oded.gabbay@gmail.com
-Description: Displays the current clock frequency of the TPC compute engines
+Description: Displays the current clock frequency, in Hz, of the TPC compute
+ engines. This property is valid only for the Goya ASIC family
What: /sys/class/habanalabs/hl<n>/uboot_ver
Date: Jan 2019
diff --git a/Documentation/ABI/testing/sysfs-driver-hid b/Documentation/ABI/testing/sysfs-driver-hid
index 48942cacb0bf..a59533410871 100644
--- a/Documentation/ABI/testing/sysfs-driver-hid
+++ b/Documentation/ABI/testing/sysfs-driver-hid
@@ -1,6 +1,6 @@
-What: For USB devices : /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
- For BT devices : /sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
- Symlink : /sys/class/hidraw/hidraw<num>/device/report_descriptor
+What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
+What: /sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/report_descriptor
+What: /sys/class/hidraw/hidraw<num>/device/report_descriptor
Date: Jan 2011
KernelVersion: 2.0.39
Contact: Alan Ott <alan@signal11.us>
@@ -9,9 +9,9 @@ Description: When read, this file returns the device's raw binary HID
This file cannot be written.
Users: HIDAPI library (http://www.signal11.us/oss/hidapi)
-What: For USB devices : /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
- For BT devices : /sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
- Symlink : /sys/class/hidraw/hidraw<num>/device/country
+What: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
+What: /sys/class/bluetooth/hci<addr>/<hid-bus>:<vendor-id>:<product-id>.<num>/country
+What: /sys/class/hidraw/hidraw<num>/device/country
Date: February 2015
KernelVersion: 3.19
Contact: Olivier Gay <ogay@logitech.com>
diff --git a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
index 3ca3971109bf..8f7982c70d72 100644
--- a/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
+++ b/Documentation/ABI/testing/sysfs-driver-hid-roccat-kone
@@ -5,7 +5,7 @@ Description: It is possible to switch the dpi setting of the mouse with the
press of a button.
When read, this file returns the raw number of the actual dpi
setting reported by the mouse. This number has to be further
- processed to receive the real dpi value.
+ processed to receive the real dpi value:
VALUE DPI
1 800
diff --git a/Documentation/ABI/testing/sysfs-driver-ppi b/Documentation/ABI/testing/sysfs-driver-ppi
index 9921ef285899..1a56fc507689 100644
--- a/Documentation/ABI/testing/sysfs-driver-ppi
+++ b/Documentation/ABI/testing/sysfs-driver-ppi
@@ -1,6 +1,6 @@
What: /sys/class/tpm/tpmX/ppi/
Date: August 2012
-Kernel Version: 3.6
+KernelVersion: 3.6
Contact: xiaoyan.zhang@intel.com
Description:
This folder includes the attributes related with PPI (Physical
diff --git a/Documentation/ABI/testing/sysfs-driver-st b/Documentation/ABI/testing/sysfs-driver-st
index ba5d77008a85..88cab66fd77f 100644
--- a/Documentation/ABI/testing/sysfs-driver-st
+++ b/Documentation/ABI/testing/sysfs-driver-st
@@ -1,6 +1,6 @@
What: /sys/bus/scsi/drivers/st/debug_flag
Date: October 2015
-Kernel Version: ?.?
+KernelVersion: ?.?
Contact: shane.seymour@hpe.com
Description:
This file allows you to turn debug output from the st driver
diff --git a/Documentation/ABI/testing/sysfs-driver-wacom b/Documentation/ABI/testing/sysfs-driver-wacom
index 2aa5503ee200..afc48fc163b5 100644
--- a/Documentation/ABI/testing/sysfs-driver-wacom
+++ b/Documentation/ABI/testing/sysfs-driver-wacom
@@ -1,6 +1,6 @@
What: /sys/bus/hid/devices/<bus>:<vid>:<pid>.<n>/speed
Date: April 2010
-Kernel Version: 2.6.35
+KernelVersion: 2.6.35
Contact: linux-bluetooth@vger.kernel.org
Description:
The /sys/bus/hid/devices/<bus>:<vid>:<pid>.<n>/speed file
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 91822ce25831..dca326e0ee3e 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -243,3 +243,11 @@ Description:
- Del: echo '[h/c]!extension' > /sys/fs/f2fs/<disk>/extension_list
- [h] means add/del hot file extension
- [c] means add/del cold file extension
+
+What: /sys/fs/f2fs/<disk>/unusable
+Date April 2019
+Contact: "Daniel Rosenberg" <drosen@google.com>
+Description:
+ If checkpoint=disable, it displays the number of blocks that are unusable.
+ If checkpoint=enable it displays the enumber of blocks that would be unusable
+ if checkpoint=disable were to be set.
diff --git a/Documentation/ABI/testing/sysfs-kernel-fscaps b/Documentation/ABI/testing/sysfs-kernel-fscaps
index 50a3033b5e15..bcff34665192 100644
--- a/Documentation/ABI/testing/sysfs-kernel-fscaps
+++ b/Documentation/ABI/testing/sysfs-kernel-fscaps
@@ -2,7 +2,7 @@ What: /sys/kernel/fscaps
Date: February 2011
KernelVersion: 2.6.38
Contact: Ludwig Nussel <ludwig.nussel@suse.de>
-Description
+Description:
Shows whether file system capabilities are honored
when executing a binary
diff --git a/Documentation/ABI/testing/sysfs-kernel-iommu_groups b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
index 35c64e00b35c..017f5bc3920c 100644
--- a/Documentation/ABI/testing/sysfs-kernel-iommu_groups
+++ b/Documentation/ABI/testing/sysfs-kernel-iommu_groups
@@ -24,3 +24,12 @@ Description: /sys/kernel/iommu_groups/reserved_regions list IOVA
region is described on a single line: the 1st field is
the base IOVA, the second is the end IOVA and the third
field describes the type of the region.
+
+What: /sys/kernel/iommu_groups/reserved_regions
+Date: June 2019
+KernelVersion: v5.3
+Contact: Eric Auger <eric.auger@redhat.com>
+Description: In case an RMRR is used only by graphics or USB devices
+ it is now exposed as "direct-relaxable" instead of "direct".
+ In device assignment use case, for instance, those RMRR
+ are considered to be relaxable and safe.
diff --git a/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo b/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo
index 7bd81168e063..1f1087a5f075 100644
--- a/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo
+++ b/Documentation/ABI/testing/sysfs-kernel-vmcoreinfo
@@ -4,7 +4,7 @@ KernelVersion: 2.6.24
Contact: Ken'ichi Ohmichi <oomichi@mxs.nes.nec.co.jp>
Kexec Mailing List <kexec@lists.infradead.org>
Vivek Goyal <vgoyal@redhat.com>
-Description
+Description:
Shows physical address and size of vmcoreinfo ELF note.
First value contains physical address of note in hex and
second value contains the size of note in hex. This ELF
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-laptop b/Documentation/ABI/testing/sysfs-platform-asus-laptop
index cd9d667c3da2..8b0e8205a6a2 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-asus-laptop
@@ -31,7 +31,7 @@ Description:
To control the LED display, use the following :
echo 0x0T000DDD > /sys/devices/platform/asus_laptop/
where T control the 3 letters display, and DDD the 3 digits display.
- The DDD table can be found in Documentation/laptops/asus-laptop.txt
+ The DDD table can be found in Documentation/admin-guide/laptops/asus-laptop.rst
What: /sys/devices/platform/asus_laptop/bluetooth
Date: January 2007
diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 019e1e29370e..9e99f2909612 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -36,3 +36,13 @@ KernelVersion: 3.5
Contact: "AceLan Kao" <acelan.kao@canonical.com>
Description:
Resume on lid open. 1 means on, 0 means off.
+
+What: /sys/devices/platform/<platform>/fan_boost_mode
+Date: Sep 2019
+KernelVersion: 5.3
+Contact: "Yurii Pavlovskyi" <yurii.pavlovskyi@gmail.com>
+Description:
+ Fan boost mode:
+ * 0 - normal,
+ * 1 - overboost,
+ * 2 - silent
diff --git a/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl b/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl
index 3c3514815cd5..c394b808be19 100644
--- a/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl
+++ b/Documentation/ABI/testing/sysfs-platform-i2c-demux-pinctrl
@@ -1,7 +1,7 @@
What: /sys/devices/platform/<i2c-demux-name>/available_masters
Date: January 2016
KernelVersion: 4.6
-Contact: Wolfram Sang <wsa@the-dreams.de>
+Contact: Wolfram Sang <wsa+renesas@sang-engineering.com>
Description:
Reading the file will give you a list of masters which can be
selected for a demultiplexed bus. The format is
@@ -12,7 +12,7 @@ Description:
What: /sys/devices/platform/<i2c-demux-name>/current_master
Date: January 2016
KernelVersion: 4.6
-Contact: Wolfram Sang <wsa@the-dreams.de>
+Contact: Wolfram Sang <wsa+renesas@sang-engineering.com>
Description:
This file selects/shows the active I2C master for a demultiplexed
bus. It uses the <index> value from the file 'available_masters'.
diff --git a/Documentation/ABI/testing/sysfs-platform-wilco-ec b/Documentation/ABI/testing/sysfs-platform-wilco-ec
new file mode 100644
index 000000000000..8827a734f933
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-platform-wilco-ec
@@ -0,0 +1,40 @@
+What: /sys/bus/platform/devices/GOOG000C\:00/boot_on_ac
+Date: April 2019
+KernelVersion: 5.3
+Description:
+ Boot on AC is a policy which makes the device boot from S5
+ when AC power is connected. This is useful for users who
+ want to run their device headless or with a dock.
+
+ Input should be parseable by kstrtou8() to 0 or 1.
+
+What: /sys/bus/platform/devices/GOOG000C\:00/build_date
+Date: May 2019
+KernelVersion: 5.3
+Description:
+ Display Wilco Embedded Controller firmware build date.
+ Output will a MM/DD/YY string.
+
+What: /sys/bus/platform/devices/GOOG000C\:00/build_revision
+Date: May 2019
+KernelVersion: 5.3
+Description:
+ Display Wilco Embedded Controller build revision.
+ Output will a version string be similar to the example below:
+ d2592cae0
+
+What: /sys/bus/platform/devices/GOOG000C\:00/model_number
+Date: May 2019
+KernelVersion: 5.3
+Description:
+ Display Wilco Embedded Controller model number.
+ Output will a version string be similar to the example below:
+ 08B6
+
+What: /sys/bus/platform/devices/GOOG000C\:00/version
+Date: May 2019
+KernelVersion: 5.3
+Description:
+ Display Wilco Embedded Controller firmware version.
+ The format of the string is x.y.z. Where x is major, y is minor
+ and z is the build number. For example: 95.00.06
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 18b7dc929234..3c5130355011 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -300,4 +300,4 @@ Description:
attempt.
Using this sysfs file will override any values that were
- set using the kernel command line for disk offset. \ No newline at end of file
+ set using the kernel command line for disk offset.
diff --git a/Documentation/logo.txt b/Documentation/COPYING-logo
index 296f0f7f67eb..296f0f7f67eb 100644
--- a/Documentation/logo.txt
+++ b/Documentation/COPYING-logo
diff --git a/Documentation/DMA-API-HOWTO.txt b/Documentation/DMA-API-HOWTO.txt
index cb712a02f59f..358d495456d1 100644
--- a/Documentation/DMA-API-HOWTO.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@@ -212,7 +212,7 @@ The standard 64-bit addressing device would do something like this::
If the device only supports 32-bit addressing for descriptors in the
coherent allocations, but supports full 64-bits for streaming mappings
-it would look like this:
+it would look like this::
if (dma_set_mask(dev, DMA_BIT_MASK(64))) {
dev_warn(dev, "mydev: No suitable DMA available\n");
diff --git a/Documentation/EDID/howto.rst b/Documentation/EDID/howto.rst
deleted file mode 100644
index 725fd49a88ca..000000000000
--- a/Documentation/EDID/howto.rst
+++ /dev/null
@@ -1,58 +0,0 @@
-:orphan:
-
-====
-EDID
-====
-
-In the good old days when graphics parameters were configured explicitly
-in a file called xorg.conf, even broken hardware could be managed.
-
-Today, with the advent of Kernel Mode Setting, a graphics board is
-either correctly working because all components follow the standards -
-or the computer is unusable, because the screen remains dark after
-booting or it displays the wrong area. Cases when this happens are:
-- The graphics board does not recognize the monitor.
-- The graphics board is unable to detect any EDID data.
-- The graphics board incorrectly forwards EDID data to the driver.
-- The monitor sends no or bogus EDID data.
-- A KVM sends its own EDID data instead of querying the connected monitor.
-Adding the kernel parameter "nomodeset" helps in most cases, but causes
-restrictions later on.
-
-As a remedy for such situations, the kernel configuration item
-CONFIG_DRM_LOAD_EDID_FIRMWARE was introduced. It allows to provide an
-individually prepared or corrected EDID data set in the /lib/firmware
-directory from where it is loaded via the firmware interface. The code
-(see drivers/gpu/drm/drm_edid_load.c) contains built-in data sets for
-commonly used screen resolutions (800x600, 1024x768, 1280x1024, 1600x1200,
-1680x1050, 1920x1080) as binary blobs, but the kernel source tree does
-not contain code to create these data. In order to elucidate the origin
-of the built-in binary EDID blobs and to facilitate the creation of
-individual data for a specific misbehaving monitor, commented sources
-and a Makefile environment are given here.
-
-To create binary EDID and C source code files from the existing data
-material, simply type "make".
-
-If you want to create your own EDID file, copy the file 1024x768.S,
-replace the settings with your own data and add a new target to the
-Makefile. Please note that the EDID data structure expects the timing
-values in a different way as compared to the standard X11 format.
-
-X11:
- HTimings:
- hdisp hsyncstart hsyncend htotal
- VTimings:
- vdisp vsyncstart vsyncend vtotal
-
-EDID::
-
- #define XPIX hdisp
- #define XBLANK htotal-hdisp
- #define XOFFSET hsyncstart-hdisp
- #define XPULSE hsyncend-hsyncstart
-
- #define YPIX vdisp
- #define YBLANK vtotal-vdisp
- #define YOFFSET vsyncstart-vdisp
- #define YPULSE vsyncend-vsyncstart
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
deleted file mode 100644
index 618e13d5e276..000000000000
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ /dev/null
@@ -1,270 +0,0 @@
- The MSI Driver Guide HOWTO
- Tom L Nguyen tom.l.nguyen@intel.com
- 10/03/2003
- Revised Feb 12, 2004 by Martine Silbermann
- email: Martine.Silbermann@hp.com
- Revised Jun 25, 2004 by Tom L Nguyen
- Revised Jul 9, 2008 by Matthew Wilcox <willy@linux.intel.com>
- Copyright 2003, 2008 Intel Corporation
-
-1. About this guide
-
-This guide describes the basics of Message Signaled Interrupts (MSIs),
-the advantages of using MSI over traditional interrupt mechanisms, how
-to change your driver to use MSI or MSI-X and some basic diagnostics to
-try if a device doesn't support MSIs.
-
-
-2. What are MSIs?
-
-A Message Signaled Interrupt is a write from the device to a special
-address which causes an interrupt to be received by the CPU.
-
-The MSI capability was first specified in PCI 2.2 and was later enhanced
-in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X
-capability was also introduced with PCI 3.0. It supports more interrupts
-per device than MSI and allows interrupts to be independently configured.
-
-Devices may support both MSI and MSI-X, but only one can be enabled at
-a time.
-
-
-3. Why use MSIs?
-
-There are three reasons why using MSIs can give an advantage over
-traditional pin-based interrupts.
-
-Pin-based PCI interrupts are often shared amongst several devices.
-To support this, the kernel must call each interrupt handler associated
-with an interrupt, which leads to reduced performance for the system as
-a whole. MSIs are never shared, so this problem cannot arise.
-
-When a device writes data to memory, then raises a pin-based interrupt,
-it is possible that the interrupt may arrive before all the data has
-arrived in memory (this becomes more likely with devices behind PCI-PCI
-bridges). In order to ensure that all the data has arrived in memory,
-the interrupt handler must read a register on the device which raised
-the interrupt. PCI transaction ordering rules require that all the data
-arrive in memory before the value may be returned from the register.
-Using MSIs avoids this problem as the interrupt-generating write cannot
-pass the data writes, so by the time the interrupt is raised, the driver
-knows that all the data has arrived in memory.
-
-PCI devices can only support a single pin-based interrupt per function.
-Often drivers have to query the device to find out what event has
-occurred, slowing down interrupt handling for the common case. With
-MSIs, a device can support more interrupts, allowing each interrupt
-to be specialised to a different purpose. One possible design gives
-infrequent conditions (such as errors) their own interrupt which allows
-the driver to handle the normal interrupt handling path more efficiently.
-Other possible designs include giving one interrupt to each packet queue
-in a network card or each port in a storage controller.
-
-
-4. How to use MSIs
-
-PCI devices are initialised to use pin-based interrupts. The device
-driver has to set up the device to use MSI or MSI-X. Not all machines
-support MSIs correctly, and for those machines, the APIs described below
-will simply fail and the device will continue to use pin-based interrupts.
-
-4.1 Include kernel support for MSIs
-
-To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
-option enabled. This option is only available on some architectures,
-and it may depend on some other options also being set. For example,
-on x86, you must also enable X86_UP_APIC or SMP in order to see the
-CONFIG_PCI_MSI option.
-
-4.2 Using MSI
-
-Most of the hard work is done for the driver in the PCI layer. The driver
-simply has to request that the PCI layer set up the MSI capability for this
-device.
-
-To automatically use MSI or MSI-X interrupt vectors, use the following
-function:
-
- int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
- unsigned int max_vecs, unsigned int flags);
-
-which allocates up to max_vecs interrupt vectors for a PCI device. It
-returns the number of vectors allocated or a negative error. If the device
-has a requirements for a minimum number of vectors the driver can pass a
-min_vecs argument set to this limit, and the PCI core will return -ENOSPC
-if it can't meet the minimum number of vectors.
-
-The flags argument is used to specify which type of interrupt can be used
-by the device and the driver (PCI_IRQ_LEGACY, PCI_IRQ_MSI, PCI_IRQ_MSIX).
-A convenient short-hand (PCI_IRQ_ALL_TYPES) is also available to ask for
-any possible kind of interrupt. If the PCI_IRQ_AFFINITY flag is set,
-pci_alloc_irq_vectors() will spread the interrupts around the available CPUs.
-
-To get the Linux IRQ numbers passed to request_irq() and free_irq() and the
-vectors, use the following function:
-
- int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
-
-Any allocated resources should be freed before removing the device using
-the following function:
-
- void pci_free_irq_vectors(struct pci_dev *dev);
-
-If a device supports both MSI-X and MSI capabilities, this API will use the
-MSI-X facilities in preference to the MSI facilities. MSI-X supports any
-number of interrupts between 1 and 2048. In contrast, MSI is restricted to
-a maximum of 32 interrupts (and must be a power of two). In addition, the
-MSI interrupt vectors must be allocated consecutively, so the system might
-not be able to allocate as many vectors for MSI as it could for MSI-X. On
-some platforms, MSI interrupts must all be targeted at the same set of CPUs
-whereas MSI-X interrupts can all be targeted at different CPUs.
-
-If a device supports neither MSI-X or MSI it will fall back to a single
-legacy IRQ vector.
-
-The typical usage of MSI or MSI-X interrupts is to allocate as many vectors
-as possible, likely up to the limit supported by the device. If nvec is
-larger than the number supported by the device it will automatically be
-capped to the supported limit, so there is no need to query the number of
-vectors supported beforehand:
-
- nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_ALL_TYPES)
- if (nvec < 0)
- goto out_err;
-
-If a driver is unable or unwilling to deal with a variable number of MSI
-interrupts it can request a particular number of interrupts by passing that
-number to pci_alloc_irq_vectors() function as both 'min_vecs' and
-'max_vecs' parameters:
-
- ret = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_ALL_TYPES);
- if (ret < 0)
- goto out_err;
-
-The most notorious example of the request type described above is enabling
-the single MSI mode for a device. It could be done by passing two 1s as
-'min_vecs' and 'max_vecs':
-
- ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
- if (ret < 0)
- goto out_err;
-
-Some devices might not support using legacy line interrupts, in which case
-the driver can specify that only MSI or MSI-X is acceptable:
-
- nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_MSI | PCI_IRQ_MSIX);
- if (nvec < 0)
- goto out_err;
-
-4.3 Legacy APIs
-
-The following old APIs to enable and disable MSI or MSI-X interrupts should
-not be used in new code:
-
- pci_enable_msi() /* deprecated */
- pci_disable_msi() /* deprecated */
- pci_enable_msix_range() /* deprecated */
- pci_enable_msix_exact() /* deprecated */
- pci_disable_msix() /* deprecated */
-
-Additionally there are APIs to provide the number of supported MSI or MSI-X
-vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these
-should be avoided in favor of letting pci_alloc_irq_vectors() cap the
-number of vectors. If you have a legitimate special use case for the count
-of vectors we might have to revisit that decision and add a
-pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently.
-
-4.4 Considerations when using MSIs
-
-4.4.1 Spinlocks
-
-Most device drivers have a per-device spinlock which is taken in the
-interrupt handler. With pin-based interrupts or a single MSI, it is not
-necessary to disable interrupts (Linux guarantees the same interrupt will
-not be re-entered). If a device uses multiple interrupts, the driver
-must disable interrupts while the lock is held. If the device sends
-a different interrupt, the driver will deadlock trying to recursively
-acquire the spinlock. Such deadlocks can be avoided by using
-spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
-and acquire the lock (see Documentation/kernel-hacking/locking.rst).
-
-4.5 How to tell whether MSI/MSI-X is enabled on a device
-
-Using 'lspci -v' (as root) may show some devices with "MSI", "Message
-Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities
-has an 'Enable' flag which is followed with either "+" (enabled)
-or "-" (disabled).
-
-
-5. MSI quirks
-
-Several PCI chipsets or devices are known not to support MSIs.
-The PCI stack provides three ways to disable MSIs:
-
-1. globally
-2. on all devices behind a specific bridge
-3. on a single device
-
-5.1. Disabling MSIs globally
-
-Some host chipsets simply don't support MSIs properly. If we're
-lucky, the manufacturer knows this and has indicated it in the ACPI
-FADT table. In this case, Linux automatically disables MSIs.
-Some boards don't include this information in the table and so we have
-to detect them ourselves. The complete list of these is found near the
-quirk_disable_all_msi() function in drivers/pci/quirks.c.
-
-If you have a board which has problems with MSIs, you can pass pci=nomsi
-on the kernel command line to disable MSIs on all devices. It would be
-in your best interests to report the problem to linux-pci@vger.kernel.org
-including a full 'lspci -v' so we can add the quirks to the kernel.
-
-5.2. Disabling MSIs below a bridge
-
-Some PCI bridges are not able to route MSIs between busses properly.
-In this case, MSIs must be disabled on all devices behind the bridge.
-
-Some bridges allow you to enable MSIs by changing some bits in their
-PCI configuration space (especially the Hypertransport chipsets such
-as the nVidia nForce and Serverworks HT2000). As with host chipsets,
-Linux mostly knows about them and automatically enables MSIs if it can.
-If you have a bridge unknown to Linux, you can enable
-MSIs in configuration space using whatever method you know works, then
-enable MSIs on that bridge by doing:
-
- echo 1 > /sys/bus/pci/devices/$bridge/msi_bus
-
-where $bridge is the PCI address of the bridge you've enabled (eg
-0000:00:0e.0).
-
-To disable MSIs, echo 0 instead of 1. Changing this value should be
-done with caution as it could break interrupt handling for all devices
-below this bridge.
-
-Again, please notify linux-pci@vger.kernel.org of any bridges that need
-special handling.
-
-5.3. Disabling MSIs on a single device
-
-Some devices are known to have faulty MSI implementations. Usually this
-is handled in the individual device driver, but occasionally it's necessary
-to handle this with a quirk. Some drivers have an option to disable use
-of MSI. While this is a convenient workaround for the driver author,
-it is not good practice, and should not be emulated.
-
-5.4. Finding why MSIs are disabled on a device
-
-From the above three sections, you can see that there are many reasons
-why MSIs may not be enabled for a given device. Your first step should
-be to examine your dmesg carefully to determine whether MSIs are enabled
-for your machine. You should also check your .config to be sure you
-have enabled CONFIG_PCI_MSI.
-
-Then, 'lspci -t' gives the list of bridges above a device. Reading
-/sys/bus/pci/devices/*/msi_bus will tell you whether MSIs are enabled (1)
-or disabled (0). If 0 is found in any of the msi_bus files belonging
-to bridges between the PCI root and the device, MSIs are disabled.
-
-It is also worth checking the device driver to see whether it supports MSIs.
-For example, it may contain calls to pci_irq_alloc_vectors() with the
-PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
diff --git a/Documentation/PCI/PCIEBUS-HOWTO.txt b/Documentation/PCI/PCIEBUS-HOWTO.txt
deleted file mode 100644
index 15f0bb3b5045..000000000000
--- a/Documentation/PCI/PCIEBUS-HOWTO.txt
+++ /dev/null
@@ -1,198 +0,0 @@
- The PCI Express Port Bus Driver Guide HOWTO
- Tom L Nguyen tom.l.nguyen@intel.com
- 11/03/2004
-
-1. About this guide
-
-This guide describes the basics of the PCI Express Port Bus driver
-and provides information on how to enable the service drivers to
-register/unregister with the PCI Express Port Bus Driver.
-
-2. Copyright 2004 Intel Corporation
-
-3. What is the PCI Express Port Bus Driver
-
-A PCI Express Port is a logical PCI-PCI Bridge structure. There
-are two types of PCI Express Port: the Root Port and the Switch
-Port. The Root Port originates a PCI Express link from a PCI Express
-Root Complex and the Switch Port connects PCI Express links to
-internal logical PCI buses. The Switch Port, which has its secondary
-bus representing the switch's internal routing logic, is called the
-switch's Upstream Port. The switch's Downstream Port is bridging from
-switch's internal routing bus to a bus representing the downstream
-PCI Express link from the PCI Express Switch.
-
-A PCI Express Port can provide up to four distinct functions,
-referred to in this document as services, depending on its port type.
-PCI Express Port's services include native hotplug support (HP),
-power management event support (PME), advanced error reporting
-support (AER), and virtual channel support (VC). These services may
-be handled by a single complex driver or be individually distributed
-and handled by corresponding service drivers.
-
-4. Why use the PCI Express Port Bus Driver?
-
-In existing Linux kernels, the Linux Device Driver Model allows a
-physical device to be handled by only a single driver. The PCI
-Express Port is a PCI-PCI Bridge device with multiple distinct
-services. To maintain a clean and simple solution each service
-may have its own software service driver. In this case several
-service drivers will compete for a single PCI-PCI Bridge device.
-For example, if the PCI Express Root Port native hotplug service
-driver is loaded first, it claims a PCI-PCI Bridge Root Port. The
-kernel therefore does not load other service drivers for that Root
-Port. In other words, it is impossible to have multiple service
-drivers load and run on a PCI-PCI Bridge device simultaneously
-using the current driver model.
-
-To enable multiple service drivers running simultaneously requires
-having a PCI Express Port Bus driver, which manages all populated
-PCI Express Ports and distributes all provided service requests
-to the corresponding service drivers as required. Some key
-advantages of using the PCI Express Port Bus driver are listed below:
-
- - Allow multiple service drivers to run simultaneously on
- a PCI-PCI Bridge Port device.
-
- - Allow service drivers implemented in an independent
- staged approach.
-
- - Allow one service driver to run on multiple PCI-PCI Bridge
- Port devices.
-
- - Manage and distribute resources of a PCI-PCI Bridge Port
- device to requested service drivers.
-
-5. Configuring the PCI Express Port Bus Driver vs. Service Drivers
-
-5.1 Including the PCI Express Port Bus Driver Support into the Kernel
-
-Including the PCI Express Port Bus driver depends on whether the PCI
-Express support is included in the kernel config. The kernel will
-automatically include the PCI Express Port Bus driver as a kernel
-driver when the PCI Express support is enabled in the kernel.
-
-5.2 Enabling Service Driver Support
-
-PCI device drivers are implemented based on Linux Device Driver Model.
-All service drivers are PCI device drivers. As discussed above, it is
-impossible to load any service driver once the kernel has loaded the
-PCI Express Port Bus Driver. To meet the PCI Express Port Bus Driver
-Model requires some minimal changes on existing service drivers that
-imposes no impact on the functionality of existing service drivers.
-
-A service driver is required to use the two APIs shown below to
-register its service with the PCI Express Port Bus driver (see
-section 5.2.1 & 5.2.2). It is important that a service driver
-initializes the pcie_port_service_driver data structure, included in
-header file /include/linux/pcieport_if.h, before calling these APIs.
-Failure to do so will result an identity mismatch, which prevents
-the PCI Express Port Bus driver from loading a service driver.
-
-5.2.1 pcie_port_service_register
-
-int pcie_port_service_register(struct pcie_port_service_driver *new)
-
-This API replaces the Linux Driver Model's pci_register_driver API. A
-service driver should always calls pcie_port_service_register at
-module init. Note that after service driver being loaded, calls
-such as pci_enable_device(dev) and pci_set_master(dev) are no longer
-necessary since these calls are executed by the PCI Port Bus driver.
-
-5.2.2 pcie_port_service_unregister
-
-void pcie_port_service_unregister(struct pcie_port_service_driver *new)
-
-pcie_port_service_unregister replaces the Linux Driver Model's
-pci_unregister_driver. It's always called by service driver when a
-module exits.
-
-5.2.3 Sample Code
-
-Below is sample service driver code to initialize the port service
-driver data structure.
-
-static struct pcie_port_service_id service_id[] = { {
- .vendor = PCI_ANY_ID,
- .device = PCI_ANY_ID,
- .port_type = PCIE_RC_PORT,
- .service_type = PCIE_PORT_SERVICE_AER,
- }, { /* end: all zeroes */ }
-};
-
-static struct pcie_port_service_driver root_aerdrv = {
- .name = (char *)device_name,
- .id_table = &service_id[0],
-
- .probe = aerdrv_load,
- .remove = aerdrv_unload,
-
- .suspend = aerdrv_suspend,
- .resume = aerdrv_resume,
-};
-
-Below is a sample code for registering/unregistering a service
-driver.
-
-static int __init aerdrv_service_init(void)
-{
- int retval = 0;
-
- retval = pcie_port_service_register(&root_aerdrv);
- if (!retval) {
- /*
- * FIX ME
- */
- }
- return retval;
-}
-
-static void __exit aerdrv_service_exit(void)
-{
- pcie_port_service_unregister(&root_aerdrv);
-}
-
-module_init(aerdrv_service_init);
-module_exit(aerdrv_service_exit);
-
-6. Possible Resource Conflicts
-
-Since all service drivers of a PCI-PCI Bridge Port device are
-allowed to run simultaneously, below lists a few of possible resource
-conflicts with proposed solutions.
-
-6.1 MSI and MSI-X Vector Resource
-
-Once MSI or MSI-X interrupts are enabled on a device, it stays in this
-mode until they are disabled again. Since service drivers of the same
-PCI-PCI Bridge port share the same physical device, if an individual
-service driver enables or disables MSI/MSI-X mode it may result
-unpredictable behavior.
-
-To avoid this situation all service drivers are not permitted to
-switch interrupt mode on its device. The PCI Express Port Bus driver
-is responsible for determining the interrupt mode and this should be
-transparent to service drivers. Service drivers need to know only
-the vector IRQ assigned to the field irq of struct pcie_device, which
-is passed in when the PCI Express Port Bus driver probes each service
-driver. Service drivers should use (struct pcie_device*)dev->irq to
-call request_irq/free_irq. In addition, the interrupt mode is stored
-in the field interrupt_mode of struct pcie_device.
-
-6.3 PCI Memory/IO Mapped Regions
-
-Service drivers for PCI Express Power Management (PME), Advanced
-Error Reporting (AER), Hot-Plug (HP) and Virtual Channel (VC) access
-PCI configuration space on the PCI Express port. In all cases the
-registers accessed are independent of each other. This patch assumes
-that all service drivers will be well behaved and not overwrite
-other service driver's configuration settings.
-
-6.4 PCI Config Registers
-
-Each service driver runs its PCI config operations on its own
-capability structure except the PCI Express capability structure, in
-which Root Control register and Device Control register are shared
-between PME and AER. This patch assumes that all service drivers
-will be well behaved and not overwrite other service driver's
-configuration settings.
diff --git a/Documentation/PCI/acpi-info.rst b/Documentation/PCI/acpi-info.rst
new file mode 100644
index 000000000000..060217081c79
--- /dev/null
+++ b/Documentation/PCI/acpi-info.rst
@@ -0,0 +1,192 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================================
+ACPI considerations for PCI host bridges
+========================================
+
+The general rule is that the ACPI namespace should describe everything the
+OS might use unless there's another way for the OS to find it [1, 2].
+
+For example, there's no standard hardware mechanism for enumerating PCI
+host bridges, so the ACPI namespace must describe each host bridge, the
+method for accessing PCI config space below it, the address space windows
+the host bridge forwards to PCI (using _CRS), and the routing of legacy
+INTx interrupts (using _PRT).
+
+PCI devices, which are below the host bridge, generally do not need to be
+described via ACPI. The OS can discover them via the standard PCI
+enumeration mechanism, using config accesses to discover and identify
+devices and read and size their BARs. However, ACPI may describe PCI
+devices if it provides power management or hotplug functionality for them
+or if the device has INTx interrupts connected by platform interrupt
+controllers and a _PRT is needed to describe those connections.
+
+ACPI resource description is done via _CRS objects of devices in the ACPI
+namespace [2].   The _CRS is like a generalized PCI BAR: the OS can read
+_CRS and figure out what resource is being consumed even if it doesn't have
+a driver for the device [3].  That's important because it means an old OS
+can work correctly even on a system with new devices unknown to the OS.
+The new devices might not do anything, but the OS can at least make sure no
+resources conflict with them.
+
+Static tables like MCFG, HPET, ECDT, etc., are *not* mechanisms for
+reserving address space. The static tables are for things the OS needs to
+know early in boot, before it can parse the ACPI namespace. If a new table
+is defined, an old OS needs to operate correctly even though it ignores the
+table. _CRS allows that because it is generic and understood by the old
+OS; a static table does not.
+
+If the OS is expected to manage a non-discoverable device described via
+ACPI, that device will have a specific _HID/_CID that tells the OS what
+driver to bind to it, and the _CRS tells the OS and the driver where the
+device's registers are.
+
+PCI host bridges are PNP0A03 or PNP0A08 devices.  Their _CRS should
+describe all the address space they consume.  This includes all the windows
+they forward down to the PCI bus, as well as registers of the host bridge
+itself that are not forwarded to PCI.  The host bridge registers include
+things like secondary/subordinate bus registers that determine the bus
+range below the bridge, window registers that describe the apertures, etc.
+These are all device-specific, non-architected things, so the only way a
+PNP0A03/PNP0A08 driver can manage them is via _PRS/_CRS/_SRS, which contain
+the device-specific details.  The host bridge registers also include ECAM
+space, since it is consumed by the host bridge.
+
+ACPI defines a Consumer/Producer bit to distinguish the bridge registers
+("Consumer") from the bridge apertures ("Producer") [4, 5], but early
+BIOSes didn't use that bit correctly. The result is that the current ACPI
+spec defines Consumer/Producer only for the Extended Address Space
+descriptors; the bit should be ignored in the older QWord/DWord/Word
+Address Space descriptors. Consequently, OSes have to assume all
+QWord/DWord/Word descriptors are windows.
+
+Prior to the addition of Extended Address Space descriptors, the failure of
+Consumer/Producer meant there was no way to describe bridge registers in
+the PNP0A03/PNP0A08 device itself. The workaround was to describe the
+bridge registers (including ECAM space) in PNP0C02 catch-all devices [6].
+With the exception of ECAM, the bridge register space is device-specific
+anyway, so the generic PNP0A03/PNP0A08 driver (pci_root.c) has no need to
+know about it.  
+
+New architectures should be able to use "Consumer" Extended Address Space
+descriptors in the PNP0A03 device for bridge registers, including ECAM,
+although a strict interpretation of [6] might prohibit this. Old x86 and
+ia64 kernels assume all address space descriptors, including "Consumer"
+Extended Address Space ones, are windows, so it would not be safe to
+describe bridge registers this way on those architectures.
+
+PNP0C02 "motherboard" devices are basically a catch-all.  There's no
+programming model for them other than "don't use these resources for
+anything else."  So a PNP0C02 _CRS should claim any address space that is
+(1) not claimed by _CRS under any other device object in the ACPI namespace
+and (2) should not be assigned by the OS to something else.
+
+The PCIe spec requires the Enhanced Configuration Access Method (ECAM)
+unless there's a standard firmware interface for config access, e.g., the
+ia64 SAL interface [7]. A host bridge consumes ECAM memory address space
+and converts memory accesses into PCI configuration accesses. The spec
+defines the ECAM address space layout and functionality; only the base of
+the address space is device-specific. An ACPI OS learns the base address
+from either the static MCFG table or a _CBA method in the PNP0A03 device.
+
+The MCFG table must describe the ECAM space of non-hot pluggable host
+bridges [8]. Since MCFG is a static table and can't be updated by hotplug,
+a _CBA method in the PNP0A03 device describes the ECAM space of a
+hot-pluggable host bridge [9]. Note that for both MCFG and _CBA, the base
+address always corresponds to bus 0, even if the bus range below the bridge
+(which is reported via _CRS) doesn't start at 0.
+
+
+[1] ACPI 6.2, sec 6.1:
+ For any device that is on a non-enumerable type of bus (for example, an
+ ISA bus), OSPM enumerates the devices' identifier(s) and the ACPI
+ system firmware must supply an _HID object ... for each device to
+ enable OSPM to do that.
+
+[2] ACPI 6.2, sec 3.7:
+ The OS enumerates motherboard devices simply by reading through the
+ ACPI Namespace looking for devices with hardware IDs.
+
+ Each device enumerated by ACPI includes ACPI-defined objects in the
+ ACPI Namespace that report the hardware resources the device could
+ occupy [_PRS], an object that reports the resources that are currently
+ used by the device [_CRS], and objects for configuring those resources
+ [_SRS]. The information is used by the Plug and Play OS (OSPM) to
+ configure the devices.
+
+[3] ACPI 6.2, sec 6.2:
+ OSPM uses device configuration objects to configure hardware resources
+ for devices enumerated via ACPI. Device configuration objects provide
+ information about current and possible resource requirements, the
+ relationship between shared resources, and methods for configuring
+ hardware resources.
+
+ When OSPM enumerates a device, it calls _PRS to determine the resource
+ requirements of the device. It may also call _CRS to find the current
+ resource settings for the device. Using this information, the Plug and
+ Play system determines what resources the device should consume and
+ sets those resources by calling the device’s _SRS control method.
+
+ In ACPI, devices can consume resources (for example, legacy keyboards),
+ provide resources (for example, a proprietary PCI bridge), or do both.
+ Unless otherwise specified, resources for a device are assumed to be
+ taken from the nearest matching resource above the device in the device
+ hierarchy.
+
+[4] ACPI 6.2, sec 6.4.3.5.1, 2, 3, 4:
+ QWord/DWord/Word Address Space Descriptor (.1, .2, .3)
+ General Flags: Bit [0] Ignored
+
+ Extended Address Space Descriptor (.4)
+ General Flags: Bit [0] Consumer/Producer:
+
+ * 1 – This device consumes this resource
+ * 0 – This device produces and consumes this resource
+
+[5] ACPI 6.2, sec 19.6.43:
+ ResourceUsage specifies whether the Memory range is consumed by
+ this device (ResourceConsumer) or passed on to child devices
+ (ResourceProducer). If nothing is specified, then
+ ResourceConsumer is assumed.
+
+[6] PCI Firmware 3.2, sec 4.1.2:
+ If the operating system does not natively comprehend reserving the
+ MMCFG region, the MMCFG region must be reserved by firmware. The
+ address range reported in the MCFG table or by _CBA method (see Section
+ 4.1.3) must be reserved by declaring a motherboard resource. For most
+ systems, the motherboard resource would appear at the root of the ACPI
+ namespace (under \_SB) in a node with a _HID of EISAID (PNP0C02), and
+ the resources in this case should not be claimed in the root PCI bus’s
+ _CRS. The resources can optionally be returned in Int15 E820 or
+ EFIGetMemoryMap as reserved memory but must always be reported through
+ ACPI as a motherboard resource.
+
+[7] PCI Express 4.0, sec 7.2.2:
+ For systems that are PC-compatible, or that do not implement a
+ processor-architecture-specific firmware interface standard that allows
+ access to the Configuration Space, the ECAM is required as defined in
+ this section.
+
+[8] PCI Firmware 3.2, sec 4.1.2:
+ The MCFG table is an ACPI table that is used to communicate the base
+ addresses corresponding to the non-hot removable PCI Segment Groups
+ range within a PCI Segment Group available to the operating system at
+ boot. This is required for the PC-compatible systems.
+
+ The MCFG table is only used to communicate the base addresses
+ corresponding to the PCI Segment Groups available to the system at
+ boot.
+
+[9] PCI Firmware 3.2, sec 4.1.3:
+ The _CBA (Memory mapped Configuration Base Address) control method is
+ an optional ACPI object that returns the 64-bit memory mapped
+ configuration base address for the hot plug capable host bridge. The
+ base address returned by _CBA is processor-relative address. The _CBA
+ control method evaluates to an Integer.
+
+ This control method appears under a host bridge object. When the _CBA
+ method appears under an active host bridge object, the operating system
+ evaluates this structure to identify the memory mapped configuration
+ base address corresponding to the PCI Segment Group for the bus number
+ range specified in _CRS method. An ACPI name space object that contains
+ the _CBA method must also contain a corresponding _SEG method.
diff --git a/Documentation/PCI/acpi-info.txt b/Documentation/PCI/acpi-info.txt
deleted file mode 100644
index 3ffa3b03970e..000000000000
--- a/Documentation/PCI/acpi-info.txt
+++ /dev/null
@@ -1,187 +0,0 @@
- ACPI considerations for PCI host bridges
-
-The general rule is that the ACPI namespace should describe everything the
-OS might use unless there's another way for the OS to find it [1, 2].
-
-For example, there's no standard hardware mechanism for enumerating PCI
-host bridges, so the ACPI namespace must describe each host bridge, the
-method for accessing PCI config space below it, the address space windows
-the host bridge forwards to PCI (using _CRS), and the routing of legacy
-INTx interrupts (using _PRT).
-
-PCI devices, which are below the host bridge, generally do not need to be
-described via ACPI. The OS can discover them via the standard PCI
-enumeration mechanism, using config accesses to discover and identify
-devices and read and size their BARs. However, ACPI may describe PCI
-devices if it provides power management or hotplug functionality for them
-or if the device has INTx interrupts connected by platform interrupt
-controllers and a _PRT is needed to describe those connections.
-
-ACPI resource description is done via _CRS objects of devices in the ACPI
-namespace [2].   The _CRS is like a generalized PCI BAR: the OS can read
-_CRS and figure out what resource is being consumed even if it doesn't have
-a driver for the device [3].  That's important because it means an old OS
-can work correctly even on a system with new devices unknown to the OS.
-The new devices might not do anything, but the OS can at least make sure no
-resources conflict with them.
-
-Static tables like MCFG, HPET, ECDT, etc., are *not* mechanisms for
-reserving address space. The static tables are for things the OS needs to
-know early in boot, before it can parse the ACPI namespace. If a new table
-is defined, an old OS needs to operate correctly even though it ignores the
-table. _CRS allows that because it is generic and understood by the old
-OS; a static table does not.
-
-If the OS is expected to manage a non-discoverable device described via
-ACPI, that device will have a specific _HID/_CID that tells the OS what
-driver to bind to it, and the _CRS tells the OS and the driver where the
-device's registers are.
-
-PCI host bridges are PNP0A03 or PNP0A08 devices.  Their _CRS should
-describe all the address space they consume.  This includes all the windows
-they forward down to the PCI bus, as well as registers of the host bridge
-itself that are not forwarded to PCI.  The host bridge registers include
-things like secondary/subordinate bus registers that determine the bus
-range below the bridge, window registers that describe the apertures, etc.
-These are all device-specific, non-architected things, so the only way a
-PNP0A03/PNP0A08 driver can manage them is via _PRS/_CRS/_SRS, which contain
-the device-specific details.  The host bridge registers also include ECAM
-space, since it is consumed by the host bridge.
-
-ACPI defines a Consumer/Producer bit to distinguish the bridge registers
-("Consumer") from the bridge apertures ("Producer") [4, 5], but early
-BIOSes didn't use that bit correctly. The result is that the current ACPI
-spec defines Consumer/Producer only for the Extended Address Space
-descriptors; the bit should be ignored in the older QWord/DWord/Word
-Address Space descriptors. Consequently, OSes have to assume all
-QWord/DWord/Word descriptors are windows.
-
-Prior to the addition of Extended Address Space descriptors, the failure of
-Consumer/Producer meant there was no way to describe bridge registers in
-the PNP0A03/PNP0A08 device itself. The workaround was to describe the
-bridge registers (including ECAM space) in PNP0C02 catch-all devices [6].
-With the exception of ECAM, the bridge register space is device-specific
-anyway, so the generic PNP0A03/PNP0A08 driver (pci_root.c) has no need to
-know about it.  
-
-New architectures should be able to use "Consumer" Extended Address Space
-descriptors in the PNP0A03 device for bridge registers, including ECAM,
-although a strict interpretation of [6] might prohibit this. Old x86 and
-ia64 kernels assume all address space descriptors, including "Consumer"
-Extended Address Space ones, are windows, so it would not be safe to
-describe bridge registers this way on those architectures.
-
-PNP0C02 "motherboard" devices are basically a catch-all.  There's no
-programming model for them other than "don't use these resources for
-anything else."  So a PNP0C02 _CRS should claim any address space that is
-(1) not claimed by _CRS under any other device object in the ACPI namespace
-and (2) should not be assigned by the OS to something else.
-
-The PCIe spec requires the Enhanced Configuration Access Method (ECAM)
-unless there's a standard firmware interface for config access, e.g., the
-ia64 SAL interface [7]. A host bridge consumes ECAM memory address space
-and converts memory accesses into PCI configuration accesses. The spec
-defines the ECAM address space layout and functionality; only the base of
-the address space is device-specific. An ACPI OS learns the base address
-from either the static MCFG table or a _CBA method in the PNP0A03 device.
-
-The MCFG table must describe the ECAM space of non-hot pluggable host
-bridges [8]. Since MCFG is a static table and can't be updated by hotplug,
-a _CBA method in the PNP0A03 device describes the ECAM space of a
-hot-pluggable host bridge [9]. Note that for both MCFG and _CBA, the base
-address always corresponds to bus 0, even if the bus range below the bridge
-(which is reported via _CRS) doesn't start at 0.
-
-
-[1] ACPI 6.2, sec 6.1:
- For any device that is on a non-enumerable type of bus (for example, an
- ISA bus), OSPM enumerates the devices' identifier(s) and the ACPI
- system firmware must supply an _HID object ... for each device to
- enable OSPM to do that.
-
-[2] ACPI 6.2, sec 3.7:
- The OS enumerates motherboard devices simply by reading through the
- ACPI Namespace looking for devices with hardware IDs.
-
- Each device enumerated by ACPI includes ACPI-defined objects in the
- ACPI Namespace that report the hardware resources the device could
- occupy [_PRS], an object that reports the resources that are currently
- used by the device [_CRS], and objects for configuring those resources
- [_SRS]. The information is used by the Plug and Play OS (OSPM) to
- configure the devices.
-
-[3] ACPI 6.2, sec 6.2:
- OSPM uses device configuration objects to configure hardware resources
- for devices enumerated via ACPI. Device configuration objects provide
- information about current and possible resource requirements, the
- relationship between shared resources, and methods for configuring
- hardware resources.
-
- When OSPM enumerates a device, it calls _PRS to determine the resource
- requirements of the device. It may also call _CRS to find the current
- resource settings for the device. Using this information, the Plug and
- Play system determines what resources the device should consume and
- sets those resources by calling the device’s _SRS control method.
-
- In ACPI, devices can consume resources (for example, legacy keyboards),
- provide resources (for example, a proprietary PCI bridge), or do both.
- Unless otherwise specified, resources for a device are assumed to be
- taken from the nearest matching resource above the device in the device
- hierarchy.
-
-[4] ACPI 6.2, sec 6.4.3.5.1, 2, 3, 4:
- QWord/DWord/Word Address Space Descriptor (.1, .2, .3)
- General Flags: Bit [0] Ignored
-
- Extended Address Space Descriptor (.4)
- General Flags: Bit [0] Consumer/Producer:
- 1–This device consumes this resource
- 0–This device produces and consumes this resource
-
-[5] ACPI 6.2, sec 19.6.43:
- ResourceUsage specifies whether the Memory range is consumed by
- this device (ResourceConsumer) or passed on to child devices
- (ResourceProducer). If nothing is specified, then
- ResourceConsumer is assumed.
-
-[6] PCI Firmware 3.2, sec 4.1.2:
- If the operating system does not natively comprehend reserving the
- MMCFG region, the MMCFG region must be reserved by firmware. The
- address range reported in the MCFG table or by _CBA method (see Section
- 4.1.3) must be reserved by declaring a motherboard resource. For most
- systems, the motherboard resource would appear at the root of the ACPI
- namespace (under \_SB) in a node with a _HID of EISAID (PNP0C02), and
- the resources in this case should not be claimed in the root PCI bus’s
- _CRS. The resources can optionally be returned in Int15 E820 or
- EFIGetMemoryMap as reserved memory but must always be reported through
- ACPI as a motherboard resource.
-
-[7] PCI Express 4.0, sec 7.2.2:
- For systems that are PC-compatible, or that do not implement a
- processor-architecture-specific firmware interface standard that allows
- access to the Configuration Space, the ECAM is required as defined in
- this section.
-
-[8] PCI Firmware 3.2, sec 4.1.2:
- The MCFG table is an ACPI table that is used to communicate the base
- addresses corresponding to the non-hot removable PCI Segment Groups
- range within a PCI Segment Group available to the operating system at
- boot. This is required for the PC-compatible systems.
-
- The MCFG table is only used to communicate the base addresses
- corresponding to the PCI Segment Groups available to the system at
- boot.
-
-[9] PCI Firmware 3.2, sec 4.1.3:
- The _CBA (Memory mapped Configuration Base Address) control method is
- an optional ACPI object that returns the 64-bit memory mapped
- configuration base address for the hot plug capable host bridge. The
- base address returned by _CBA is processor-relative address. The _CBA
- control method evaluates to an Integer.
-
- This control method appears under a host bridge object. When the _CBA
- method appears under an active host bridge object, the operating system
- evaluates this structure to identify the memory mapped configuration
- base address corresponding to the PCI Segment Group for the bus number
- range specified in _CRS method. An ACPI name space object that contains
- the _CBA method must also contain a corresponding _SEG method.
diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst
new file mode 100644
index 000000000000..d114ea74b444
--- /dev/null
+++ b/Documentation/PCI/endpoint/index.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+PCI Endpoint Framework
+======================
+
+.. toctree::
+ :maxdepth: 2
+
+ pci-endpoint
+ pci-endpoint-cfs
+ pci-test-function
+ pci-test-howto
diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.rst b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst
new file mode 100644
index 000000000000..b6d39cdec56e
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-endpoint-cfs.rst
@@ -0,0 +1,118 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================
+Configuring PCI Endpoint Using CONFIGFS
+=======================================
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the
+PCI endpoint function and to bind the endpoint function
+with the endpoint controller. (For introducing other mechanisms to
+configure the PCI Endpoint Function refer to [1]).
+
+Mounting configfs
+=================
+
+The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs
+directory. configfs can be mounted using the following command::
+
+ mount -t configfs none /sys/kernel/config
+
+Directory Structure
+===================
+
+The pci_ep configfs has two directories at its root: controllers and
+functions. Every EPC device present in the system will have an entry in
+the *controllers* directory and and every EPF driver present in the system
+will have an entry in the *functions* directory.
+::
+
+ /sys/kernel/config/pci_ep/
+ .. controllers/
+ .. functions/
+
+Creating EPF Device
+===================
+
+Every registered EPF driver will be listed in controllers directory. The
+entries corresponding to EPF driver will be created by the EPF core.
+::
+
+ /sys/kernel/config/pci_ep/functions/
+ .. <EPF Driver1>/
+ ... <EPF Device 11>/
+ ... <EPF Device 21>/
+ .. <EPF Driver2>/
+ ... <EPF Device 12>/
+ ... <EPF Device 22>/
+
+In order to create a <EPF device> of the type probed by <EPF Driver>, the
+user has to create a directory inside <EPF DriverN>.
+
+Every <EPF device> directory consists of the following entries that can be
+used to configure the standard configuration header of the endpoint function.
+(These entries are created by the framework when any new <EPF Device> is
+created)
+::
+
+ .. <EPF Driver1>/
+ ... <EPF Device 11>/
+ ... vendorid
+ ... deviceid
+ ... revid
+ ... progif_code
+ ... subclass_code
+ ... baseclass_code
+ ... cache_line_size
+ ... subsys_vendor_id
+ ... subsys_id
+ ... interrupt_pin
+
+EPC Device
+==========
+
+Every registered EPC device will be listed in controllers directory. The
+entries corresponding to EPC device will be created by the EPC core.
+::
+
+ /sys/kernel/config/pci_ep/controllers/
+ .. <EPC Device1>/
+ ... <Symlink EPF Device11>/
+ ... <Symlink EPF Device12>/
+ ... start
+ .. <EPC Device2>/
+ ... <Symlink EPF Device21>/
+ ... <Symlink EPF Device22>/
+ ... start
+
+The <EPC Device> directory will have a list of symbolic links to
+<EPF Device>. These symbolic links should be created by the user to
+represent the functions present in the endpoint device.
+
+The <EPC Device> directory will also have a *start* field. Once
+"1" is written to this field, the endpoint device will be ready to
+establish the link with the host. This is usually done after
+all the EPF devices are created and linked with the EPC device.
+::
+
+ | controllers/
+ | <Directory: EPC name>/
+ | <Symbolic Link: Function>
+ | start
+ | functions/
+ | <Directory: EPF driver>/
+ | <Directory: EPF device>/
+ | vendorid
+ | deviceid
+ | revid
+ | progif_code
+ | subclass_code
+ | baseclass_code
+ | cache_line_size
+ | subsys_vendor_id
+ | subsys_id
+ | interrupt_pin
+ | function
+
+[1] :doc:`pci-endpoint`
diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt b/Documentation/PCI/endpoint/pci-endpoint-cfs.txt
deleted file mode 100644
index d740f29960a4..000000000000
--- a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt
+++ /dev/null
@@ -1,105 +0,0 @@
- CONFIGURING PCI ENDPOINT USING CONFIGFS
- Kishon Vijay Abraham I <kishon@ti.com>
-
-The PCI Endpoint Core exposes configfs entry (pci_ep) to configure the
-PCI endpoint function and to bind the endpoint function
-with the endpoint controller. (For introducing other mechanisms to
-configure the PCI Endpoint Function refer to [1]).
-
-*) Mounting configfs
-
-The PCI Endpoint Core layer creates pci_ep directory in the mounted configfs
-directory. configfs can be mounted using the following command.
-
- mount -t configfs none /sys/kernel/config
-
-*) Directory Structure
-
-The pci_ep configfs has two directories at its root: controllers and
-functions. Every EPC device present in the system will have an entry in
-the *controllers* directory and and every EPF driver present in the system
-will have an entry in the *functions* directory.
-
-/sys/kernel/config/pci_ep/
- .. controllers/
- .. functions/
-
-*) Creating EPF Device
-
-Every registered EPF driver will be listed in controllers directory. The
-entries corresponding to EPF driver will be created by the EPF core.
-
-/sys/kernel/config/pci_ep/functions/
- .. <EPF Driver1>/
- ... <EPF Device 11>/
- ... <EPF Device 21>/
- .. <EPF Driver2>/
- ... <EPF Device 12>/
- ... <EPF Device 22>/
-
-In order to create a <EPF device> of the type probed by <EPF Driver>, the
-user has to create a directory inside <EPF DriverN>.
-
-Every <EPF device> directory consists of the following entries that can be
-used to configure the standard configuration header of the endpoint function.
-(These entries are created by the framework when any new <EPF Device> is
-created)
-
- .. <EPF Driver1>/
- ... <EPF Device 11>/
- ... vendorid
- ... deviceid
- ... revid
- ... progif_code
- ... subclass_code
- ... baseclass_code
- ... cache_line_size
- ... subsys_vendor_id
- ... subsys_id
- ... interrupt_pin
-
-*) EPC Device
-
-Every registered EPC device will be listed in controllers directory. The
-entries corresponding to EPC device will be created by the EPC core.
-
-/sys/kernel/config/pci_ep/controllers/
- .. <EPC Device1>/
- ... <Symlink EPF Device11>/
- ... <Symlink EPF Device12>/
- ... start
- .. <EPC Device2>/
- ... <Symlink EPF Device21>/
- ... <Symlink EPF Device22>/
- ... start
-
-The <EPC Device> directory will have a list of symbolic links to
-<EPF Device>. These symbolic links should be created by the user to
-represent the functions present in the endpoint device.
-
-The <EPC Device> directory will also have a *start* field. Once
-"1" is written to this field, the endpoint device will be ready to
-establish the link with the host. This is usually done after
-all the EPF devices are created and linked with the EPC device.
-
-
- | controllers/
- | <Directory: EPC name>/
- | <Symbolic Link: Function>
- | start
- | functions/
- | <Directory: EPF driver>/
- | <Directory: EPF device>/
- | vendorid
- | deviceid
- | revid
- | progif_code
- | subclass_code
- | baseclass_code
- | cache_line_size
- | subsys_vendor_id
- | subsys_id
- | interrupt_pin
- | function
-
-[1] -> Documentation/PCI/endpoint/pci-endpoint.txt
diff --git a/Documentation/PCI/endpoint/pci-endpoint.rst b/Documentation/PCI/endpoint/pci-endpoint.rst
new file mode 100644
index 000000000000..0e2311b5617b
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-endpoint.rst
@@ -0,0 +1,231 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+This document is a guide to use the PCI Endpoint Framework in order to create
+endpoint controller driver, endpoint function driver, and using configfs
+interface to bind the function driver to the controller driver.
+
+Introduction
+============
+
+Linux has a comprehensive PCI subsystem to support PCI controllers that
+operates in Root Complex mode. The subsystem has capability to scan PCI bus,
+assign memory resources and IRQ resources, load PCI driver (based on
+vendor ID, device ID), support other services like hot-plug, power management,
+advanced error reporting and virtual channels.
+
+However the PCI controller IP integrated in some SoCs is capable of operating
+either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will
+add endpoint mode support in Linux. This will help to run Linux in an
+EP system which can have a wide variety of use cases from testing or
+validation, co-processor accelerator, etc.
+
+PCI Endpoint Core
+=================
+
+The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller
+library, the Endpoint Function library, and the configfs layer to bind the
+endpoint function with the endpoint controller.
+
+PCI Endpoint Controller(EPC) Library
+------------------------------------
+
+The EPC library provides APIs to be used by the controller that can operate
+in endpoint mode. It also provides APIs to be used by function driver/library
+in order to implement a particular endpoint function.
+
+APIs for the PCI controller Driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI controller driver.
+
+* devm_pci_epc_create()/pci_epc_create()
+
+ The PCI controller driver should implement the following ops:
+
+ * write_header: ops to populate configuration space header
+ * set_bar: ops to configure the BAR
+ * clear_bar: ops to reset the BAR
+ * alloc_addr_space: ops to allocate in PCI controller address space
+ * free_addr_space: ops to free the allocated address space
+ * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
+ * start: ops to start the PCI link
+ * stop: ops to stop the PCI link
+
+ The PCI controller driver can then create a new EPC device by invoking
+ devm_pci_epc_create()/pci_epc_create().
+
+* devm_pci_epc_destroy()/pci_epc_destroy()
+
+ The PCI controller driver can destroy the EPC device created by either
+ devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or
+ pci_epc_destroy().
+
+* pci_epc_linkup()
+
+ In order to notify all the function devices that the EPC device to which
+ they are linked has established a link with the host, the PCI controller
+ driver should invoke pci_epc_linkup().
+
+* pci_epc_mem_init()
+
+ Initialize the pci_epc_mem structure used for allocating EPC addr space.
+
+* pci_epc_mem_exit()
+
+ Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init().
+
+
+APIs for the PCI Endpoint Function Driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI endpoint function driver.
+
+* pci_epc_write_header()
+
+ The PCI endpoint function driver should use pci_epc_write_header() to
+ write the standard configuration header to the endpoint controller.
+
+* pci_epc_set_bar()
+
+ The PCI endpoint function driver should use pci_epc_set_bar() to configure
+ the Base Address Register in order for the host to assign PCI addr space.
+ Register space of the function driver is usually configured
+ using this API.
+
+* pci_epc_clear_bar()
+
+ The PCI endpoint function driver should use pci_epc_clear_bar() to reset
+ the BAR.
+
+* pci_epc_raise_irq()
+
+ The PCI endpoint function driver should use pci_epc_raise_irq() to raise
+ Legacy Interrupt, MSI or MSI-X Interrupt.
+
+* pci_epc_mem_alloc_addr()
+
+ The PCI endpoint function driver should use pci_epc_mem_alloc_addr(), to
+ allocate memory address from EPC addr space which is required to access
+ RC's buffer
+
+* pci_epc_mem_free_addr()
+
+ The PCI endpoint function driver should use pci_epc_mem_free_addr() to
+ free the memory space allocated using pci_epc_mem_alloc_addr().
+
+Other APIs
+~~~~~~~~~~
+
+There are other APIs provided by the EPC library. These are used for binding
+the EPF device with EPC device. pci-ep-cfs.c can be used as reference for
+using these APIs.
+
+* pci_epc_get()
+
+ Get a reference to the PCI endpoint controller based on the device name of
+ the controller.
+
+* pci_epc_put()
+
+ Release the reference to the PCI endpoint controller obtained using
+ pci_epc_get()
+
+* pci_epc_add_epf()
+
+ Add a PCI endpoint function to a PCI endpoint controller. A PCIe device
+ can have up to 8 functions according to the specification.
+
+* pci_epc_remove_epf()
+
+ Remove the PCI endpoint function from PCI endpoint controller.
+
+* pci_epc_start()
+
+ The PCI endpoint function driver should invoke pci_epc_start() once it
+ has configured the endpoint function and wants to start the PCI link.
+
+* pci_epc_stop()
+
+ The PCI endpoint function driver should invoke pci_epc_stop() to stop
+ the PCI LINK.
+
+
+PCI Endpoint Function(EPF) Library
+----------------------------------
+
+The EPF library provides APIs to be used by the function driver and the EPC
+library to provide endpoint mode functionality.
+
+APIs for the PCI Endpoint Function Driver
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI endpoint function driver.
+
+* pci_epf_register_driver()
+
+ The PCI Endpoint Function driver should implement the following ops:
+ * bind: ops to perform when a EPC device has been bound to EPF device
+ * unbind: ops to perform when a binding has been lost between a EPC
+ device and EPF device
+ * linkup: ops to perform when the EPC device has established a
+ connection with a host system
+
+ The PCI Function driver can then register the PCI EPF driver by using
+ pci_epf_register_driver().
+
+* pci_epf_unregister_driver()
+
+ The PCI Function driver can unregister the PCI EPF driver by using
+ pci_epf_unregister_driver().
+
+* pci_epf_alloc_space()
+
+ The PCI Function driver can allocate space for a particular BAR using
+ pci_epf_alloc_space().
+
+* pci_epf_free_space()
+
+ The PCI Function driver can free the allocated space
+ (using pci_epf_alloc_space) by invoking pci_epf_free_space().
+
+APIs for the PCI Endpoint Controller Library
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists the APIs that the PCI Endpoint core provides to be used
+by the PCI endpoint controller library.
+
+* pci_epf_linkup()
+
+ The PCI endpoint controller library invokes pci_epf_linkup() when the
+ EPC device has established the connection to the host.
+
+Other APIs
+~~~~~~~~~~
+
+There are other APIs provided by the EPF library. These are used to notify
+the function driver when the EPF device is bound to the EPC device.
+pci-ep-cfs.c can be used as reference for using these APIs.
+
+* pci_epf_create()
+
+ Create a new PCI EPF device by passing the name of the PCI EPF device.
+ This name will be used to bind the the EPF device to a EPF driver.
+
+* pci_epf_destroy()
+
+ Destroy the created PCI EPF device.
+
+* pci_epf_bind()
+
+ pci_epf_bind() should be invoked when the EPF device has been bound to
+ a EPC device.
+
+* pci_epf_unbind()
+
+ pci_epf_unbind() should be invoked when the binding between EPC device
+ and EPF device is lost.
diff --git a/Documentation/PCI/endpoint/pci-endpoint.txt b/Documentation/PCI/endpoint/pci-endpoint.txt
deleted file mode 100644
index e86a96b66a6a..000000000000
--- a/Documentation/PCI/endpoint/pci-endpoint.txt
+++ /dev/null
@@ -1,215 +0,0 @@
- PCI ENDPOINT FRAMEWORK
- Kishon Vijay Abraham I <kishon@ti.com>
-
-This document is a guide to use the PCI Endpoint Framework in order to create
-endpoint controller driver, endpoint function driver, and using configfs
-interface to bind the function driver to the controller driver.
-
-1. Introduction
-
-Linux has a comprehensive PCI subsystem to support PCI controllers that
-operates in Root Complex mode. The subsystem has capability to scan PCI bus,
-assign memory resources and IRQ resources, load PCI driver (based on
-vendor ID, device ID), support other services like hot-plug, power management,
-advanced error reporting and virtual channels.
-
-However the PCI controller IP integrated in some SoCs is capable of operating
-either in Root Complex mode or Endpoint mode. PCI Endpoint Framework will
-add endpoint mode support in Linux. This will help to run Linux in an
-EP system which can have a wide variety of use cases from testing or
-validation, co-processor accelerator, etc.
-
-2. PCI Endpoint Core
-
-The PCI Endpoint Core layer comprises 3 components: the Endpoint Controller
-library, the Endpoint Function library, and the configfs layer to bind the
-endpoint function with the endpoint controller.
-
-2.1 PCI Endpoint Controller(EPC) Library
-
-The EPC library provides APIs to be used by the controller that can operate
-in endpoint mode. It also provides APIs to be used by function driver/library
-in order to implement a particular endpoint function.
-
-2.1.1 APIs for the PCI controller Driver
-
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI controller driver.
-
-*) devm_pci_epc_create()/pci_epc_create()
-
- The PCI controller driver should implement the following ops:
- * write_header: ops to populate configuration space header
- * set_bar: ops to configure the BAR
- * clear_bar: ops to reset the BAR
- * alloc_addr_space: ops to allocate in PCI controller address space
- * free_addr_space: ops to free the allocated address space
- * raise_irq: ops to raise a legacy, MSI or MSI-X interrupt
- * start: ops to start the PCI link
- * stop: ops to stop the PCI link
-
- The PCI controller driver can then create a new EPC device by invoking
- devm_pci_epc_create()/pci_epc_create().
-
-*) devm_pci_epc_destroy()/pci_epc_destroy()
-
- The PCI controller driver can destroy the EPC device created by either
- devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or
- pci_epc_destroy().
-
-*) pci_epc_linkup()
-
- In order to notify all the function devices that the EPC device to which
- they are linked has established a link with the host, the PCI controller
- driver should invoke pci_epc_linkup().
-
-*) pci_epc_mem_init()
-
- Initialize the pci_epc_mem structure used for allocating EPC addr space.
-
-*) pci_epc_mem_exit()
-
- Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init().
-
-2.1.2 APIs for the PCI Endpoint Function Driver
-
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI endpoint function driver.
-
-*) pci_epc_write_header()
-
- The PCI endpoint function driver should use pci_epc_write_header() to
- write the standard configuration header to the endpoint controller.
-
-*) pci_epc_set_bar()
-
- The PCI endpoint function driver should use pci_epc_set_bar() to configure
- the Base Address Register in order for the host to assign PCI addr space.
- Register space of the function driver is usually configured
- using this API.
-
-*) pci_epc_clear_bar()
-
- The PCI endpoint function driver should use pci_epc_clear_bar() to reset
- the BAR.
-
-*) pci_epc_raise_irq()
-
- The PCI endpoint function driver should use pci_epc_raise_irq() to raise
- Legacy Interrupt, MSI or MSI-X Interrupt.
-
-*) pci_epc_mem_alloc_addr()
-
- The PCI endpoint function driver should use pci_epc_mem_alloc_addr(), to
- allocate memory address from EPC addr space which is required to access
- RC's buffer
-
-*) pci_epc_mem_free_addr()
-
- The PCI endpoint function driver should use pci_epc_mem_free_addr() to
- free the memory space allocated using pci_epc_mem_alloc_addr().
-
-2.1.3 Other APIs
-
-There are other APIs provided by the EPC library. These are used for binding
-the EPF device with EPC device. pci-ep-cfs.c can be used as reference for
-using these APIs.
-
-*) pci_epc_get()
-
- Get a reference to the PCI endpoint controller based on the device name of
- the controller.
-
-*) pci_epc_put()
-
- Release the reference to the PCI endpoint controller obtained using
- pci_epc_get()
-
-*) pci_epc_add_epf()
-
- Add a PCI endpoint function to a PCI endpoint controller. A PCIe device
- can have up to 8 functions according to the specification.
-
-*) pci_epc_remove_epf()
-
- Remove the PCI endpoint function from PCI endpoint controller.
-
-*) pci_epc_start()
-
- The PCI endpoint function driver should invoke pci_epc_start() once it
- has configured the endpoint function and wants to start the PCI link.
-
-*) pci_epc_stop()
-
- The PCI endpoint function driver should invoke pci_epc_stop() to stop
- the PCI LINK.
-
-2.2 PCI Endpoint Function(EPF) Library
-
-The EPF library provides APIs to be used by the function driver and the EPC
-library to provide endpoint mode functionality.
-
-2.2.1 APIs for the PCI Endpoint Function Driver
-
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI endpoint function driver.
-
-*) pci_epf_register_driver()
-
- The PCI Endpoint Function driver should implement the following ops:
- * bind: ops to perform when a EPC device has been bound to EPF device
- * unbind: ops to perform when a binding has been lost between a EPC
- device and EPF device
- * linkup: ops to perform when the EPC device has established a
- connection with a host system
-
- The PCI Function driver can then register the PCI EPF driver by using
- pci_epf_register_driver().
-
-*) pci_epf_unregister_driver()
-
- The PCI Function driver can unregister the PCI EPF driver by using
- pci_epf_unregister_driver().
-
-*) pci_epf_alloc_space()
-
- The PCI Function driver can allocate space for a particular BAR using
- pci_epf_alloc_space().
-
-*) pci_epf_free_space()
-
- The PCI Function driver can free the allocated space
- (using pci_epf_alloc_space) by invoking pci_epf_free_space().
-
-2.2.2 APIs for the PCI Endpoint Controller Library
-This section lists the APIs that the PCI Endpoint core provides to be used
-by the PCI endpoint controller library.
-
-*) pci_epf_linkup()
-
- The PCI endpoint controller library invokes pci_epf_linkup() when the
- EPC device has established the connection to the host.
-
-2.2.2 Other APIs
-There are other APIs provided by the EPF library. These are used to notify
-the function driver when the EPF device is bound to the EPC device.
-pci-ep-cfs.c can be used as reference for using these APIs.
-
-*) pci_epf_create()
-
- Create a new PCI EPF device by passing the name of the PCI EPF device.
- This name will be used to bind the the EPF device to a EPF driver.
-
-*) pci_epf_destroy()
-
- Destroy the created PCI EPF device.
-
-*) pci_epf_bind()
-
- pci_epf_bind() should be invoked when the EPF device has been bound to
- a EPC device.
-
-*) pci_epf_unbind()
-
- pci_epf_unbind() should be invoked when the binding between EPC device
- and EPF device is lost.
diff --git a/Documentation/PCI/endpoint/pci-test-function.rst b/Documentation/PCI/endpoint/pci-test-function.rst
new file mode 100644
index 000000000000..3c8521d7aa31
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-test-function.rst
@@ -0,0 +1,103 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+PCI Test Function
+=================
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+Traditionally PCI RC has always been validated by using standard
+PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards.
+However with the addition of EP-core in linux kernel, it is possible
+to configure a PCI controller that can operate in EP mode to work as
+a test device.
+
+The PCI endpoint test device is a virtual device (defined in software)
+used to test the endpoint functionality and serve as a sample driver
+for other PCI endpoint devices (to use the EP framework).
+
+The PCI endpoint test device has the following registers:
+
+ 1) PCI_ENDPOINT_TEST_MAGIC
+ 2) PCI_ENDPOINT_TEST_COMMAND
+ 3) PCI_ENDPOINT_TEST_STATUS
+ 4) PCI_ENDPOINT_TEST_SRC_ADDR
+ 5) PCI_ENDPOINT_TEST_DST_ADDR
+ 6) PCI_ENDPOINT_TEST_SIZE
+ 7) PCI_ENDPOINT_TEST_CHECKSUM
+ 8) PCI_ENDPOINT_TEST_IRQ_TYPE
+ 9) PCI_ENDPOINT_TEST_IRQ_NUMBER
+
+* PCI_ENDPOINT_TEST_MAGIC
+
+This register will be used to test BAR0. A known pattern will be written
+and read back from MAGIC register to verify BAR0.
+
+* PCI_ENDPOINT_TEST_COMMAND
+
+This register will be used by the host driver to indicate the function
+that the endpoint device must perform.
+
+======== ================================================================
+Bitfield Description
+======== ================================================================
+Bit 0 raise legacy IRQ
+Bit 1 raise MSI IRQ
+Bit 2 raise MSI-X IRQ
+Bit 3 read command (read data from RC buffer)
+Bit 4 write command (write data to RC buffer)
+Bit 5 copy command (copy data from one RC buffer to another RC buffer)
+======== ================================================================
+
+* PCI_ENDPOINT_TEST_STATUS
+
+This register reflects the status of the PCI endpoint device.
+
+======== ==============================
+Bitfield Description
+======== ==============================
+Bit 0 read success
+Bit 1 read fail
+Bit 2 write success
+Bit 3 write fail
+Bit 4 copy success
+Bit 5 copy fail
+Bit 6 IRQ raised
+Bit 7 source address is invalid
+Bit 8 destination address is invalid
+======== ==============================
+
+* PCI_ENDPOINT_TEST_SRC_ADDR
+
+This register contains the source address (RC buffer address) for the
+COPY/READ command.
+
+* PCI_ENDPOINT_TEST_DST_ADDR
+
+This register contains the destination address (RC buffer address) for
+the COPY/WRITE command.
+
+* PCI_ENDPOINT_TEST_IRQ_TYPE
+
+This register contains the interrupt type (Legacy/MSI) triggered
+for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands.
+
+Possible types:
+
+====== ==
+Legacy 0
+MSI 1
+MSI-X 2
+====== ==
+
+* PCI_ENDPOINT_TEST_IRQ_NUMBER
+
+This register contains the triggered ID interrupt.
+
+Admissible values:
+
+====== ===========
+Legacy 0
+MSI [1 .. 32]
+MSI-X [1 .. 2048]
+====== ===========
diff --git a/Documentation/PCI/endpoint/pci-test-function.txt b/Documentation/PCI/endpoint/pci-test-function.txt
deleted file mode 100644
index 5916f1f592bb..000000000000
--- a/Documentation/PCI/endpoint/pci-test-function.txt
+++ /dev/null
@@ -1,87 +0,0 @@
- PCI TEST
- Kishon Vijay Abraham I <kishon@ti.com>
-
-Traditionally PCI RC has always been validated by using standard
-PCI cards like ethernet PCI cards or USB PCI cards or SATA PCI cards.
-However with the addition of EP-core in linux kernel, it is possible
-to configure a PCI controller that can operate in EP mode to work as
-a test device.
-
-The PCI endpoint test device is a virtual device (defined in software)
-used to test the endpoint functionality and serve as a sample driver
-for other PCI endpoint devices (to use the EP framework).
-
-The PCI endpoint test device has the following registers:
-
- 1) PCI_ENDPOINT_TEST_MAGIC
- 2) PCI_ENDPOINT_TEST_COMMAND
- 3) PCI_ENDPOINT_TEST_STATUS
- 4) PCI_ENDPOINT_TEST_SRC_ADDR
- 5) PCI_ENDPOINT_TEST_DST_ADDR
- 6) PCI_ENDPOINT_TEST_SIZE
- 7) PCI_ENDPOINT_TEST_CHECKSUM
- 8) PCI_ENDPOINT_TEST_IRQ_TYPE
- 9) PCI_ENDPOINT_TEST_IRQ_NUMBER
-
-*) PCI_ENDPOINT_TEST_MAGIC
-
-This register will be used to test BAR0. A known pattern will be written
-and read back from MAGIC register to verify BAR0.
-
-*) PCI_ENDPOINT_TEST_COMMAND:
-
-This register will be used by the host driver to indicate the function
-that the endpoint device must perform.
-
-Bitfield Description:
- Bit 0 : raise legacy IRQ
- Bit 1 : raise MSI IRQ
- Bit 2 : raise MSI-X IRQ
- Bit 3 : read command (read data from RC buffer)
- Bit 4 : write command (write data to RC buffer)
- Bit 5 : copy command (copy data from one RC buffer to another
- RC buffer)
-
-*) PCI_ENDPOINT_TEST_STATUS
-
-This register reflects the status of the PCI endpoint device.
-
-Bitfield Description:
- Bit 0 : read success
- Bit 1 : read fail
- Bit 2 : write success
- Bit 3 : write fail
- Bit 4 : copy success
- Bit 5 : copy fail
- Bit 6 : IRQ raised
- Bit 7 : source address is invalid
- Bit 8 : destination address is invalid
-
-*) PCI_ENDPOINT_TEST_SRC_ADDR
-
-This register contains the source address (RC buffer address) for the
-COPY/READ command.
-
-*) PCI_ENDPOINT_TEST_DST_ADDR
-
-This register contains the destination address (RC buffer address) for
-the COPY/WRITE command.
-
-*) PCI_ENDPOINT_TEST_IRQ_TYPE
-
-This register contains the interrupt type (Legacy/MSI) triggered
-for the READ/WRITE/COPY and raise IRQ (Legacy/MSI) commands.
-
-Possible types:
- - Legacy : 0
- - MSI : 1
- - MSI-X : 2
-
-*) PCI_ENDPOINT_TEST_IRQ_NUMBER
-
-This register contains the triggered ID interrupt.
-
-Admissible values:
- - Legacy : 0
- - MSI : [1 .. 32]
- - MSI-X : [1 .. 2048]
diff --git a/Documentation/PCI/endpoint/pci-test-howto.rst b/Documentation/PCI/endpoint/pci-test-howto.rst
new file mode 100644
index 000000000000..909f770a07d6
--- /dev/null
+++ b/Documentation/PCI/endpoint/pci-test-howto.rst
@@ -0,0 +1,235 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================
+PCI Test User Guide
+===================
+
+:Author: Kishon Vijay Abraham I <kishon@ti.com>
+
+This document is a guide to help users use pci-epf-test function driver
+and pci_endpoint_test host driver for testing PCI. The list of steps to
+be followed in the host side and EP side is given below.
+
+Endpoint Device
+===============
+
+Endpoint Controller Devices
+---------------------------
+
+To find the list of endpoint controller devices in the system::
+
+ # ls /sys/class/pci_epc/
+ 51000000.pcie_ep
+
+If PCI_ENDPOINT_CONFIGFS is enabled::
+
+ # ls /sys/kernel/config/pci_ep/controllers
+ 51000000.pcie_ep
+
+
+Endpoint Function Drivers
+-------------------------
+
+To find the list of endpoint function drivers in the system::
+
+ # ls /sys/bus/pci-epf/drivers
+ pci_epf_test
+
+If PCI_ENDPOINT_CONFIGFS is enabled::
+
+ # ls /sys/kernel/config/pci_ep/functions
+ pci_epf_test
+
+
+Creating pci-epf-test Device
+----------------------------
+
+PCI endpoint function device can be created using the configfs. To create
+pci-epf-test device, the following commands can be used::
+
+ # mount -t configfs none /sys/kernel/config
+ # cd /sys/kernel/config/pci_ep/
+ # mkdir functions/pci_epf_test/func1
+
+The "mkdir func1" above creates the pci-epf-test function device that will
+be probed by pci_epf_test driver.
+
+The PCI endpoint framework populates the directory with the following
+configurable fields::
+
+ # ls functions/pci_epf_test/func1
+ baseclass_code interrupt_pin progif_code subsys_id
+ cache_line_size msi_interrupts revid subsys_vendorid
+ deviceid msix_interrupts subclass_code vendorid
+
+The PCI endpoint function driver populates these entries with default values
+when the device is bound to the driver. The pci-epf-test driver populates
+vendorid with 0xffff and interrupt_pin with 0x0001::
+
+ # cat functions/pci_epf_test/func1/vendorid
+ 0xffff
+ # cat functions/pci_epf_test/func1/interrupt_pin
+ 0x0001
+
+
+Configuring pci-epf-test Device
+-------------------------------
+
+The user can configure the pci-epf-test device using configfs entry. In order
+to change the vendorid and the number of MSI interrupts used by the function
+device, the following commands can be used::
+
+ # echo 0x104c > functions/pci_epf_test/func1/vendorid
+ # echo 0xb500 > functions/pci_epf_test/func1/deviceid
+ # echo 16 > functions/pci_epf_test/func1/msi_interrupts
+ # echo 8 > functions/pci_epf_test/func1/msix_interrupts
+
+
+Binding pci-epf-test Device to EP Controller
+--------------------------------------------
+
+In order for the endpoint function device to be useful, it has to be bound to
+a PCI endpoint controller driver. Use the configfs to bind the function
+device to one of the controller driver present in the system::
+
+ # ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/
+
+Once the above step is completed, the PCI endpoint is ready to establish a link
+with the host.
+
+
+Start the Link
+--------------
+
+In order for the endpoint device to establish a link with the host, the _start_
+field should be populated with '1'::
+
+ # echo 1 > controllers/51000000.pcie_ep/start
+
+
+RootComplex Device
+==================
+
+lspci Output
+------------
+
+Note that the devices listed here correspond to the value populated in 1.4
+above::
+
+ 00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01)
+ 01:00.0 Unassigned class [ff00]: Texas Instruments Device b500
+
+
+Using Endpoint Test function Device
+-----------------------------------
+
+pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
+tests. To compile this tool the following commands should be used::
+
+ # cd <kernel-dir>
+ # make -C tools/pci
+
+or if you desire to compile and install in your system::
+
+ # cd <kernel-dir>
+ # make -C tools/pci install
+
+The tool and script will be located in <rootfs>/usr/bin/
+
+
+pcitest.sh Output
+~~~~~~~~~~~~~~~~~
+::
+
+ # pcitest.sh
+ BAR tests
+
+ BAR0: OKAY
+ BAR1: OKAY
+ BAR2: OKAY
+ BAR3: OKAY
+ BAR4: NOT OKAY
+ BAR5: NOT OKAY
+
+ Interrupt tests
+
+ SET IRQ TYPE TO LEGACY: OKAY
+ LEGACY IRQ: NOT OKAY
+ SET IRQ TYPE TO MSI: OKAY
+ MSI1: OKAY
+ MSI2: OKAY
+ MSI3: OKAY
+ MSI4: OKAY
+ MSI5: OKAY
+ MSI6: OKAY
+ MSI7: OKAY
+ MSI8: OKAY
+ MSI9: OKAY
+ MSI10: OKAY
+ MSI11: OKAY
+ MSI12: OKAY
+ MSI13: OKAY
+ MSI14: OKAY
+ MSI15: OKAY
+ MSI16: OKAY
+ MSI17: NOT OKAY
+ MSI18: NOT OKAY
+ MSI19: NOT OKAY
+ MSI20: NOT OKAY
+ MSI21: NOT OKAY
+ MSI22: NOT OKAY
+ MSI23: NOT OKAY
+ MSI24: NOT OKAY
+ MSI25: NOT OKAY
+ MSI26: NOT OKAY
+ MSI27: NOT OKAY
+ MSI28: NOT OKAY
+ MSI29: NOT OKAY
+ MSI30: NOT OKAY
+ MSI31: NOT OKAY
+ MSI32: NOT OKAY
+ SET IRQ TYPE TO MSI-X: OKAY
+ MSI-X1: OKAY
+ MSI-X2: OKAY
+ MSI-X3: OKAY
+ MSI-X4: OKAY
+ MSI-X5: OKAY
+ MSI-X6: OKAY
+ MSI-X7: OKAY
+ MSI-X8: OKAY
+ MSI-X9: NOT OKAY
+ MSI-X10: NOT OKAY
+ MSI-X11: NOT OKAY
+ MSI-X12: NOT OKAY
+ MSI-X13: NOT OKAY
+ MSI-X14: NOT OKAY
+ MSI-X15: NOT OKAY
+ MSI-X16: NOT OKAY
+ [...]
+ MSI-X2047: NOT OKAY
+ MSI-X2048: NOT OKAY
+
+ Read Tests
+
+ SET IRQ TYPE TO MSI: OKAY
+ READ ( 1 bytes): OKAY
+ READ ( 1024 bytes): OKAY
+ READ ( 1025 bytes): OKAY
+ READ (1024000 bytes): OKAY
+ READ (1024001 bytes): OKAY
+
+ Write Tests
+
+ WRITE ( 1 bytes): OKAY
+ WRITE ( 1024 bytes): OKAY
+ WRITE ( 1025 bytes): OKAY
+ WRITE (1024000 bytes): OKAY
+ WRITE (1024001 bytes): OKAY
+
+ Copy Tests
+
+ COPY ( 1 bytes): OKAY
+ COPY ( 1024 bytes): OKAY
+ COPY ( 1025 bytes): OKAY
+ COPY (1024000 bytes): OKAY
+ COPY (1024001 bytes): OKAY
diff --git a/Documentation/PCI/endpoint/pci-test-howto.txt b/Documentation/PCI/endpoint/pci-test-howto.txt
deleted file mode 100644
index 040479f437a5..000000000000
--- a/Documentation/PCI/endpoint/pci-test-howto.txt
+++ /dev/null
@@ -1,206 +0,0 @@
- PCI TEST USERGUIDE
- Kishon Vijay Abraham I <kishon@ti.com>
-
-This document is a guide to help users use pci-epf-test function driver
-and pci_endpoint_test host driver for testing PCI. The list of steps to
-be followed in the host side and EP side is given below.
-
-1. Endpoint Device
-
-1.1 Endpoint Controller Devices
-
-To find the list of endpoint controller devices in the system:
-
- # ls /sys/class/pci_epc/
- 51000000.pcie_ep
-
-If PCI_ENDPOINT_CONFIGFS is enabled
- # ls /sys/kernel/config/pci_ep/controllers
- 51000000.pcie_ep
-
-1.2 Endpoint Function Drivers
-
-To find the list of endpoint function drivers in the system:
-
- # ls /sys/bus/pci-epf/drivers
- pci_epf_test
-
-If PCI_ENDPOINT_CONFIGFS is enabled
- # ls /sys/kernel/config/pci_ep/functions
- pci_epf_test
-
-1.3 Creating pci-epf-test Device
-
-PCI endpoint function device can be created using the configfs. To create
-pci-epf-test device, the following commands can be used
-
- # mount -t configfs none /sys/kernel/config
- # cd /sys/kernel/config/pci_ep/
- # mkdir functions/pci_epf_test/func1
-
-The "mkdir func1" above creates the pci-epf-test function device that will
-be probed by pci_epf_test driver.
-
-The PCI endpoint framework populates the directory with the following
-configurable fields.
-
- # ls functions/pci_epf_test/func1
- baseclass_code interrupt_pin progif_code subsys_id
- cache_line_size msi_interrupts revid subsys_vendorid
- deviceid msix_interrupts subclass_code vendorid
-
-The PCI endpoint function driver populates these entries with default values
-when the device is bound to the driver. The pci-epf-test driver populates
-vendorid with 0xffff and interrupt_pin with 0x0001
-
- # cat functions/pci_epf_test/func1/vendorid
- 0xffff
- # cat functions/pci_epf_test/func1/interrupt_pin
- 0x0001
-
-1.4 Configuring pci-epf-test Device
-
-The user can configure the pci-epf-test device using configfs entry. In order
-to change the vendorid and the number of MSI interrupts used by the function
-device, the following commands can be used.
-
- # echo 0x104c > functions/pci_epf_test/func1/vendorid
- # echo 0xb500 > functions/pci_epf_test/func1/deviceid
- # echo 16 > functions/pci_epf_test/func1/msi_interrupts
- # echo 8 > functions/pci_epf_test/func1/msix_interrupts
-
-1.5 Binding pci-epf-test Device to EP Controller
-
-In order for the endpoint function device to be useful, it has to be bound to
-a PCI endpoint controller driver. Use the configfs to bind the function
-device to one of the controller driver present in the system.
-
- # ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/
-
-Once the above step is completed, the PCI endpoint is ready to establish a link
-with the host.
-
-1.6 Start the Link
-
-In order for the endpoint device to establish a link with the host, the _start_
-field should be populated with '1'.
-
- # echo 1 > controllers/51000000.pcie_ep/start
-
-2. RootComplex Device
-
-2.1 lspci Output
-
-Note that the devices listed here correspond to the value populated in 1.4 above
-
- 00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01)
- 01:00.0 Unassigned class [ff00]: Texas Instruments Device b500
-
-2.2 Using Endpoint Test function Device
-
-pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint
-tests. To compile this tool the following commands should be used:
-
- # cd <kernel-dir>
- # make -C tools/pci
-
-or if you desire to compile and install in your system:
-
- # cd <kernel-dir>
- # make -C tools/pci install
-
-The tool and script will be located in <rootfs>/usr/bin/
-
-2.2.1 pcitest.sh Output
- # pcitest.sh
- BAR tests
-
- BAR0: OKAY
- BAR1: OKAY
- BAR2: OKAY
- BAR3: OKAY
- BAR4: NOT OKAY
- BAR5: NOT OKAY
-
- Interrupt tests
-
- SET IRQ TYPE TO LEGACY: OKAY
- LEGACY IRQ: NOT OKAY
- SET IRQ TYPE TO MSI: OKAY
- MSI1: OKAY
- MSI2: OKAY
- MSI3: OKAY
- MSI4: OKAY
- MSI5: OKAY
- MSI6: OKAY
- MSI7: OKAY
- MSI8: OKAY
- MSI9: OKAY
- MSI10: OKAY
- MSI11: OKAY
- MSI12: OKAY
- MSI13: OKAY
- MSI14: OKAY
- MSI15: OKAY
- MSI16: OKAY
- MSI17: NOT OKAY
- MSI18: NOT OKAY
- MSI19: NOT OKAY
- MSI20: NOT OKAY
- MSI21: NOT OKAY
- MSI22: NOT OKAY
- MSI23: NOT OKAY
- MSI24: NOT OKAY
- MSI25: NOT OKAY
- MSI26: NOT OKAY
- MSI27: NOT OKAY
- MSI28: NOT OKAY
- MSI29: NOT OKAY
- MSI30: NOT OKAY
- MSI31: NOT OKAY
- MSI32: NOT OKAY
- SET IRQ TYPE TO MSI-X: OKAY
- MSI-X1: OKAY
- MSI-X2: OKAY
- MSI-X3: OKAY
- MSI-X4: OKAY
- MSI-X5: OKAY
- MSI-X6: OKAY
- MSI-X7: OKAY
- MSI-X8: OKAY
- MSI-X9: NOT OKAY
- MSI-X10: NOT OKAY
- MSI-X11: NOT OKAY
- MSI-X12: NOT OKAY
- MSI-X13: NOT OKAY
- MSI-X14: NOT OKAY
- MSI-X15: NOT OKAY
- MSI-X16: NOT OKAY
- [...]
- MSI-X2047: NOT OKAY
- MSI-X2048: NOT OKAY
-
- Read Tests
-
- SET IRQ TYPE TO MSI: OKAY
- READ ( 1 bytes): OKAY
- READ ( 1024 bytes): OKAY
- READ ( 1025 bytes): OKAY
- READ (1024000 bytes): OKAY
- READ (1024001 bytes): OKAY
-
- Write Tests
-
- WRITE ( 1 bytes): OKAY
- WRITE ( 1024 bytes): OKAY
- WRITE ( 1025 bytes): OKAY
- WRITE (1024000 bytes): OKAY
- WRITE (1024001 bytes): OKAY
-
- Copy Tests
-
- COPY ( 1 bytes): OKAY
- COPY ( 1024 bytes): OKAY
- COPY ( 1025 bytes): OKAY
- COPY (1024000 bytes): OKAY
- COPY (1024001 bytes): OKAY
diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst
new file mode 100644
index 000000000000..f4c6121868c3
--- /dev/null
+++ b/Documentation/PCI/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================
+Linux PCI Bus Subsystem
+=======================
+
+.. toctree::
+ :maxdepth: 2
+ :numbered:
+
+ pci
+ picebus-howto
+ pci-iov-howto
+ msi-howto
+ acpi-info
+ pci-error-recovery
+ pcieaer-howto
+ endpoint/index
diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst
new file mode 100644
index 000000000000..994cbb660ade
--- /dev/null
+++ b/Documentation/PCI/msi-howto.rst
@@ -0,0 +1,287 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+==========================
+The MSI Driver Guide HOWTO
+==========================
+
+:Authors: Tom L Nguyen; Martine Silbermann; Matthew Wilcox
+
+:Copyright: 2003, 2008 Intel Corporation
+
+About this guide
+================
+
+This guide describes the basics of Message Signaled Interrupts (MSIs),
+the advantages of using MSI over traditional interrupt mechanisms, how
+to change your driver to use MSI or MSI-X and some basic diagnostics to
+try if a device doesn't support MSIs.
+
+
+What are MSIs?
+==============
+
+A Message Signaled Interrupt is a write from the device to a special
+address which causes an interrupt to be received by the CPU.
+
+The MSI capability was first specified in PCI 2.2 and was later enhanced
+in PCI 3.0 to allow each interrupt to be masked individually. The MSI-X
+capability was also introduced with PCI 3.0. It supports more interrupts
+per device than MSI and allows interrupts to be independently configured.
+
+Devices may support both MSI and MSI-X, but only one can be enabled at
+a time.
+
+
+Why use MSIs?
+=============
+
+There are three reasons why using MSIs can give an advantage over
+traditional pin-based interrupts.
+
+Pin-based PCI interrupts are often shared amongst several devices.
+To support this, the kernel must call each interrupt handler associated
+with an interrupt, which leads to reduced performance for the system as
+a whole. MSIs are never shared, so this problem cannot arise.
+
+When a device writes data to memory, then raises a pin-based interrupt,
+it is possible that the interrupt may arrive before all the data has
+arrived in memory (this becomes more likely with devices behind PCI-PCI
+bridges). In order to ensure that all the data has arrived in memory,
+the interrupt handler must read a register on the device which raised
+the interrupt. PCI transaction ordering rules require that all the data
+arrive in memory before the value may be returned from the register.
+Using MSIs avoids this problem as the interrupt-generating write cannot
+pass the data writes, so by the time the interrupt is raised, the driver
+knows that all the data has arrived in memory.
+
+PCI devices can only support a single pin-based interrupt per function.
+Often drivers have to query the device to find out what event has
+occurred, slowing down interrupt handling for the common case. With
+MSIs, a device can support more interrupts, allowing each interrupt
+to be specialised to a different purpose. One possible design gives
+infrequent conditions (such as errors) their own interrupt which allows
+the driver to handle the normal interrupt handling path more efficiently.
+Other possible designs include giving one interrupt to each packet queue
+in a network card or each port in a storage controller.
+
+
+How to use MSIs
+===============
+
+PCI devices are initialised to use pin-based interrupts. The device
+driver has to set up the device to use MSI or MSI-X. Not all machines
+support MSIs correctly, and for those machines, the APIs described below
+will simply fail and the device will continue to use pin-based interrupts.
+
+Include kernel support for MSIs
+-------------------------------
+
+To support MSI or MSI-X, the kernel must be built with the CONFIG_PCI_MSI
+option enabled. This option is only available on some architectures,
+and it may depend on some other options also being set. For example,
+on x86, you must also enable X86_UP_APIC or SMP in order to see the
+CONFIG_PCI_MSI option.
+
+Using MSI
+---------
+
+Most of the hard work is done for the driver in the PCI layer. The driver
+simply has to request that the PCI layer set up the MSI capability for this
+device.
+
+To automatically use MSI or MSI-X interrupt vectors, use the following
+function::
+
+ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+ unsigned int max_vecs, unsigned int flags);
+
+which allocates up to max_vecs interrupt vectors for a PCI device. It
+returns the number of vectors allocated or a negative error. If the device
+has a requirements for a minimum number of vectors the driver can pass a
+min_vecs argument set to this limit, and the PCI core will return -ENOSPC
+if it can't meet the minimum number of vectors.
+
+The flags argument is used to specify which type of interrupt can be used
+by the device and the driver (PCI_IRQ_LEGACY, PCI_IRQ_MSI, PCI_IRQ_MSIX).
+A convenient short-hand (PCI_IRQ_ALL_TYPES) is also available to ask for
+any possible kind of interrupt. If the PCI_IRQ_AFFINITY flag is set,
+pci_alloc_irq_vectors() will spread the interrupts around the available CPUs.
+
+To get the Linux IRQ numbers passed to request_irq() and free_irq() and the
+vectors, use the following function::
+
+ int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
+
+Any allocated resources should be freed before removing the device using
+the following function::
+
+ void pci_free_irq_vectors(struct pci_dev *dev);
+
+If a device supports both MSI-X and MSI capabilities, this API will use the
+MSI-X facilities in preference to the MSI facilities. MSI-X supports any
+number of interrupts between 1 and 2048. In contrast, MSI is restricted to
+a maximum of 32 interrupts (and must be a power of two). In addition, the
+MSI interrupt vectors must be allocated consecutively, so the system might
+not be able to allocate as many vectors for MSI as it could for MSI-X. On
+some platforms, MSI interrupts must all be targeted at the same set of CPUs
+whereas MSI-X interrupts can all be targeted at different CPUs.
+
+If a device supports neither MSI-X or MSI it will fall back to a single
+legacy IRQ vector.
+
+The typical usage of MSI or MSI-X interrupts is to allocate as many vectors
+as possible, likely up to the limit supported by the device. If nvec is
+larger than the number supported by the device it will automatically be
+capped to the supported limit, so there is no need to query the number of
+vectors supported beforehand::
+
+ nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_ALL_TYPES)
+ if (nvec < 0)
+ goto out_err;
+
+If a driver is unable or unwilling to deal with a variable number of MSI
+interrupts it can request a particular number of interrupts by passing that
+number to pci_alloc_irq_vectors() function as both 'min_vecs' and
+'max_vecs' parameters::
+
+ ret = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_ALL_TYPES);
+ if (ret < 0)
+ goto out_err;
+
+The most notorious example of the request type described above is enabling
+the single MSI mode for a device. It could be done by passing two 1s as
+'min_vecs' and 'max_vecs'::
+
+ ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
+ if (ret < 0)
+ goto out_err;
+
+Some devices might not support using legacy line interrupts, in which case
+the driver can specify that only MSI or MSI-X is acceptable::
+
+ nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_MSI | PCI_IRQ_MSIX);
+ if (nvec < 0)
+ goto out_err;
+
+Legacy APIs
+-----------
+
+The following old APIs to enable and disable MSI or MSI-X interrupts should
+not be used in new code::
+
+ pci_enable_msi() /* deprecated */
+ pci_disable_msi() /* deprecated */
+ pci_enable_msix_range() /* deprecated */
+ pci_enable_msix_exact() /* deprecated */
+ pci_disable_msix() /* deprecated */
+
+Additionally there are APIs to provide the number of supported MSI or MSI-X
+vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these
+should be avoided in favor of letting pci_alloc_irq_vectors() cap the
+number of vectors. If you have a legitimate special use case for the count
+of vectors we might have to revisit that decision and add a
+pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently.
+
+Considerations when using MSIs
+------------------------------
+
+Spinlocks
+~~~~~~~~~
+
+Most device drivers have a per-device spinlock which is taken in the
+interrupt handler. With pin-based interrupts or a single MSI, it is not
+necessary to disable interrupts (Linux guarantees the same interrupt will
+not be re-entered). If a device uses multiple interrupts, the driver
+must disable interrupts while the lock is held. If the device sends
+a different interrupt, the driver will deadlock trying to recursively
+acquire the spinlock. Such deadlocks can be avoided by using
+spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
+and acquire the lock (see Documentation/kernel-hacking/locking.rst).
+
+How to tell whether MSI/MSI-X is enabled on a device
+----------------------------------------------------
+
+Using 'lspci -v' (as root) may show some devices with "MSI", "Message
+Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities
+has an 'Enable' flag which is followed with either "+" (enabled)
+or "-" (disabled).
+
+
+MSI quirks
+==========
+
+Several PCI chipsets or devices are known not to support MSIs.
+The PCI stack provides three ways to disable MSIs:
+
+1. globally
+2. on all devices behind a specific bridge
+3. on a single device
+
+Disabling MSIs globally
+-----------------------
+
+Some host chipsets simply don't support MSIs properly. If we're
+lucky, the manufacturer knows this and has indicated it in the ACPI
+FADT table. In this case, Linux automatically disables MSIs.
+Some boards don't include this information in the table and so we have
+to detect them ourselves. The complete list of these is found near the
+quirk_disable_all_msi() function in drivers/pci/quirks.c.
+
+If you have a board which has problems with MSIs, you can pass pci=nomsi
+on the kernel command line to disable MSIs on all devices. It would be
+in your best interests to report the problem to linux-pci@vger.kernel.org
+including a full 'lspci -v' so we can add the quirks to the kernel.
+
+Disabling MSIs below a bridge
+-----------------------------
+
+Some PCI bridges are not able to route MSIs between busses properly.
+In this case, MSIs must be disabled on all devices behind the bridge.
+
+Some bridges allow you to enable MSIs by changing some bits in their
+PCI configuration space (especially the Hypertransport chipsets such
+as the nVidia nForce and Serverworks HT2000). As with host chipsets,
+Linux mostly knows about them and automatically enables MSIs if it can.
+If you have a bridge unknown to Linux, you can enable
+MSIs in configuration space using whatever method you know works, then
+enable MSIs on that bridge by doing::
+
+ echo 1 > /sys/bus/pci/devices/$bridge/msi_bus
+
+where $bridge is the PCI address of the bridge you've enabled (eg
+0000:00:0e.0).
+
+To disable MSIs, echo 0 instead of 1. Changing this value should be
+done with caution as it could break interrupt handling for all devices
+below this bridge.
+
+Again, please notify linux-pci@vger.kernel.org of any bridges that need
+special handling.
+
+Disabling MSIs on a single device
+---------------------------------
+
+Some devices are known to have faulty MSI implementations. Usually this
+is handled in the individual device driver, but occasionally it's necessary
+to handle this with a quirk. Some drivers have an option to disable use
+of MSI. While this is a convenient workaround for the driver author,
+it is not good practice, and should not be emulated.
+
+Finding why MSIs are disabled on a device
+-----------------------------------------
+
+From the above three sections, you can see that there are many reasons
+why MSIs may not be enabled for a given device. Your first step should
+be to examine your dmesg carefully to determine whether MSIs are enabled
+for your machine. You should also check your .config to be sure you
+have enabled CONFIG_PCI_MSI.
+
+Then, 'lspci -t' gives the list of bridges above a device. Reading
+`/sys/bus/pci/devices/*/msi_bus` will tell you whether MSIs are enabled (1)
+or disabled (0). If 0 is found in any of the msi_bus files belonging
+to bridges between the PCI root and the device, MSIs are disabled.
+
+It is also worth checking the device driver to see whether it supports MSIs.
+For example, it may contain calls to pci_irq_alloc_vectors() with the
+PCI_IRQ_MSI or PCI_IRQ_MSIX flags.
diff --git a/Documentation/PCI/pci-error-recovery.rst b/Documentation/PCI/pci-error-recovery.rst
new file mode 100644
index 000000000000..83db42092935
--- /dev/null
+++ b/Documentation/PCI/pci-error-recovery.rst
@@ -0,0 +1,424 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+PCI Error Recovery
+==================
+
+
+:Authors: - Linas Vepstas <linasvepstas@gmail.com>
+ - Richard Lary <rlary@us.ibm.com>
+ - Mike Mason <mmlnx@us.ibm.com>
+
+
+Many PCI bus controllers are able to detect a variety of hardware
+PCI errors on the bus, such as parity errors on the data and address
+buses, as well as SERR and PERR errors. Some of the more advanced
+chipsets are able to deal with these errors; these include PCI-E chipsets,
+and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
+pSeries boxes. A typical action taken is to disconnect the affected device,
+halting all I/O to it. The goal of a disconnection is to avoid system
+corruption; for example, to halt system memory corruption due to DMA's
+to "wild" addresses. Typically, a reconnection mechanism is also
+offered, so that the affected PCI device(s) are reset and put back
+into working condition. The reset phase requires coordination
+between the affected device drivers and the PCI controller chip.
+This document describes a generic API for notifying device drivers
+of a bus disconnection, and then performing error recovery.
+This API is currently implemented in the 2.6.16 and later kernels.
+
+Reporting and recovery is performed in several steps. First, when
+a PCI hardware error has resulted in a bus disconnect, that event
+is reported as soon as possible to all affected device drivers,
+including multiple instances of a device driver on multi-function
+cards. This allows device drivers to avoid deadlocking in spinloops,
+waiting for some i/o-space register to change, when it never will.
+It also gives the drivers a chance to defer incoming I/O as
+needed.
+
+Next, recovery is performed in several stages. Most of the complexity
+is forced by the need to handle multi-function devices, that is,
+devices that have multiple device drivers associated with them.
+In the first stage, each driver is allowed to indicate what type
+of reset it desires, the choices being a simple re-enabling of I/O
+or requesting a slot reset.
+
+If any driver requests a slot reset, that is what will be done.
+
+After a reset and/or a re-enabling of I/O, all drivers are
+again notified, so that they may then perform any device setup/config
+that may be required. After these have all completed, a final
+"resume normal operations" event is sent out.
+
+The biggest reason for choosing a kernel-based implementation rather
+than a user-space implementation was the need to deal with bus
+disconnects of PCI devices attached to storage media, and, in particular,
+disconnects from devices holding the root file system. If the root
+file system is disconnected, a user-space mechanism would have to go
+through a large number of contortions to complete recovery. Almost all
+of the current Linux file systems are not tolerant of disconnection
+from/reconnection to their underlying block device. By contrast,
+bus errors are easy to manage in the device driver. Indeed, most
+device drivers already handle very similar recovery procedures;
+for example, the SCSI-generic layer already provides significant
+mechanisms for dealing with SCSI bus errors and SCSI bus resets.
+
+
+Detailed Design
+===============
+
+Design and implementation details below, based on a chain of
+public email discussions with Ben Herrenschmidt, circa 5 April 2005.
+
+The error recovery API support is exposed to the driver in the form of
+a structure of function pointers pointed to by a new field in struct
+pci_driver. A driver that fails to provide the structure is "non-aware",
+and the actual recovery steps taken are platform dependent. The
+arch/powerpc implementation will simulate a PCI hotplug remove/add.
+
+This structure has the form::
+
+ struct pci_error_handlers
+ {
+ int (*error_detected)(struct pci_dev *dev, enum pci_channel_state);
+ int (*mmio_enabled)(struct pci_dev *dev);
+ int (*slot_reset)(struct pci_dev *dev);
+ void (*resume)(struct pci_dev *dev);
+ };
+
+The possible channel states are::
+
+ enum pci_channel_state {
+ pci_channel_io_normal, /* I/O channel is in normal state */
+ pci_channel_io_frozen, /* I/O to channel is blocked */
+ pci_channel_io_perm_failure, /* PCI card is dead */
+ };
+
+Possible return values are::
+
+ enum pci_ers_result {
+ PCI_ERS_RESULT_NONE, /* no result/none/not supported in device driver */
+ PCI_ERS_RESULT_CAN_RECOVER, /* Device driver can recover without slot reset */
+ PCI_ERS_RESULT_NEED_RESET, /* Device driver wants slot to be reset. */
+ PCI_ERS_RESULT_DISCONNECT, /* Device has completely failed, is unrecoverable */
+ PCI_ERS_RESULT_RECOVERED, /* Device driver is fully recovered and operational */
+ };
+
+A driver does not have to implement all of these callbacks; however,
+if it implements any, it must implement error_detected(). If a callback
+is not implemented, the corresponding feature is considered unsupported.
+For example, if mmio_enabled() and resume() aren't there, then it
+is assumed that the driver is not doing any direct recovery and requires
+a slot reset. Typically a driver will want to know about
+a slot_reset().
+
+The actual steps taken by a platform to recover from a PCI error
+event will be platform-dependent, but will follow the general
+sequence described below.
+
+STEP 0: Error Event
+-------------------
+A PCI bus error is detected by the PCI hardware. On powerpc, the slot
+is isolated, in that all I/O is blocked: all reads return 0xffffffff,
+all writes are ignored.
+
+
+STEP 1: Notification
+--------------------
+Platform calls the error_detected() callback on every instance of
+every driver affected by the error.
+
+At this point, the device might not be accessible anymore, depending on
+the platform (the slot will be isolated on powerpc). The driver may
+already have "noticed" the error because of a failing I/O, but this
+is the proper "synchronization point", that is, it gives the driver
+a chance to cleanup, waiting for pending stuff (timers, whatever, etc...)
+to complete; it can take semaphores, schedule, etc... everything but
+touch the device. Within this function and after it returns, the driver
+shouldn't do any new IOs. Called in task context. This is sort of a
+"quiesce" point. See note about interrupts at the end of this doc.
+
+All drivers participating in this system must implement this call.
+The driver must return one of the following result codes:
+
+ - PCI_ERS_RESULT_CAN_RECOVER
+ Driver returns this if it thinks it might be able to recover
+ the HW by just banging IOs or if it wants to be given
+ a chance to extract some diagnostic information (see
+ mmio_enable, below).
+ - PCI_ERS_RESULT_NEED_RESET
+ Driver returns this if it can't recover without a
+ slot reset.
+ - PCI_ERS_RESULT_DISCONNECT
+ Driver returns this if it doesn't want to recover at all.
+
+The next step taken will depend on the result codes returned by the
+drivers.
+
+If all drivers on the segment/slot return PCI_ERS_RESULT_CAN_RECOVER,
+then the platform should re-enable IOs on the slot (or do nothing in
+particular, if the platform doesn't isolate slots), and recovery
+proceeds to STEP 2 (MMIO Enable).
+
+If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET),
+then recovery proceeds to STEP 4 (Slot Reset).
+
+If the platform is unable to recover the slot, the next step
+is STEP 6 (Permanent Failure).
+
+.. note::
+
+ The current powerpc implementation assumes that a device driver will
+ *not* schedule or semaphore in this routine; the current powerpc
+ implementation uses one kernel thread to notify all devices;
+ thus, if one device sleeps/schedules, all devices are affected.
+ Doing better requires complex multi-threaded logic in the error
+ recovery implementation (e.g. waiting for all notification threads
+ to "join" before proceeding with recovery.) This seems excessively
+ complex and not worth implementing.
+
+ The current powerpc implementation doesn't much care if the device
+ attempts I/O at this point, or not. I/O's will fail, returning
+ a value of 0xff on read, and writes will be dropped. If more than
+ EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
+ assumes that the device driver has gone into an infinite loop
+ and prints an error to syslog. A reboot is then required to
+ get the device working again.
+
+STEP 2: MMIO Enabled
+--------------------
+The platform re-enables MMIO to the device (but typically not the
+DMA), and then calls the mmio_enabled() callback on all affected
+device drivers.
+
+This is the "early recovery" call. IOs are allowed again, but DMA is
+not, with some restrictions. This is NOT a callback for the driver to
+start operations again, only to peek/poke at the device, extract diagnostic
+information, if any, and eventually do things like trigger a device local
+reset or some such, but not restart operations. This callback is made if
+all drivers on a segment agree that they can try to recover and if no automatic
+link reset was performed by the HW. If the platform can't just re-enable IOs
+without a slot reset or a link reset, it will not call this callback, and
+instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
+
+.. note::
+
+ The following is proposed; no platform implements this yet:
+ Proposal: All I/O's should be done _synchronously_ from within
+ this callback, errors triggered by them will be returned via
+ the normal pci_check_whatever() API, no new error_detected()
+ callback will be issued due to an error happening here. However,
+ such an error might cause IOs to be re-blocked for the whole
+ segment, and thus invalidate the recovery that other devices
+ on the same segment might have done, forcing the whole segment
+ into one of the next states, that is, link reset or slot reset.
+
+The driver should return one of the following result codes:
+ - PCI_ERS_RESULT_RECOVERED
+ Driver returns this if it thinks the device is fully
+ functional and thinks it is ready to start
+ normal driver operations again. There is no
+ guarantee that the driver will actually be
+ allowed to proceed, as another driver on the
+ same segment might have failed and thus triggered a
+ slot reset on platforms that support it.
+
+ - PCI_ERS_RESULT_NEED_RESET
+ Driver returns this if it thinks the device is not
+ recoverable in its current state and it needs a slot
+ reset to proceed.
+
+ - PCI_ERS_RESULT_DISCONNECT
+ Same as above. Total failure, no recovery even after
+ reset driver dead. (To be defined more precisely)
+
+The next step taken depends on the results returned by the drivers.
+If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform
+proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
+
+If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
+proceeds to STEP 4 (Slot Reset)
+
+STEP 3: Link Reset
+------------------
+The platform resets the link. This is a PCI-Express specific step
+and is done whenever a fatal error has been detected that can be
+"solved" by resetting the link.
+
+STEP 4: Slot Reset
+------------------
+
+In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
+the platform will perform a slot reset on the requesting PCI device(s).
+The actual steps taken by a platform to perform a slot reset
+will be platform-dependent. Upon completion of slot reset, the
+platform will call the device slot_reset() callback.
+
+Powerpc platforms implement two levels of slot reset:
+soft reset(default) and fundamental(optional) reset.
+
+Powerpc soft reset consists of asserting the adapter #RST line and then
+restoring the PCI BAR's and PCI configuration header to a state
+that is equivalent to what it would be after a fresh system
+power-on followed by power-on BIOS/system firmware initialization.
+Soft reset is also known as hot-reset.
+
+Powerpc fundamental reset is supported by PCI Express cards only
+and results in device's state machines, hardware logic, port states and
+configuration registers to initialize to their default conditions.
+
+For most PCI devices, a soft reset will be sufficient for recovery.
+Optional fundamental reset is provided to support a limited number
+of PCI Express devices for which a soft reset is not sufficient
+for recovery.
+
+If the platform supports PCI hotplug, then the reset might be
+performed by toggling the slot electrical power off/on.
+
+It is important for the platform to restore the PCI config space
+to the "fresh poweron" state, rather than the "last state". After
+a slot reset, the device driver will almost always use its standard
+device initialization routines, and an unusual config space setup
+may result in hung devices, kernel panics, or silent data corruption.
+
+This call gives drivers the chance to re-initialize the hardware
+(re-download firmware, etc.). At this point, the driver may assume
+that the card is in a fresh state and is fully functional. The slot
+is unfrozen and the driver has full access to PCI config space,
+memory mapped I/O space and DMA. Interrupts (Legacy, MSI, or MSI-X)
+will also be available.
+
+Drivers should not restart normal I/O processing operations
+at this point. If all device drivers report success on this
+callback, the platform will call resume() to complete the sequence,
+and let the driver restart normal I/O processing.
+
+A driver can still return a critical failure for this function if
+it can't get the device operational after reset. If the platform
+previously tried a soft reset, it might now try a hard reset (power
+cycle) and then call slot_reset() again. It the device still can't
+be recovered, there is nothing more that can be done; the platform
+will typically report a "permanent failure" in such a case. The
+device will be considered "dead" in this case.
+
+Drivers for multi-function cards will need to coordinate among
+themselves as to which driver instance will perform any "one-shot"
+or global device initialization. For example, the Symbios sym53cxx2
+driver performs device init only from PCI function 0::
+
+ + if (PCI_FUNC(pdev->devfn) == 0)
+ + sym_reset_scsi_bus(np, 0);
+
+Result codes:
+ - PCI_ERS_RESULT_DISCONNECT
+ Same as above.
+
+Drivers for PCI Express cards that require a fundamental reset must
+set the needs_freset bit in the pci_dev structure in their probe function.
+For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
+PCI card types::
+
+ + /* Set EEH reset type to fundamental if required by hba */
+ + if (IS_QLA24XX(ha) || IS_QLA25XX(ha) || IS_QLA81XX(ha))
+ + pdev->needs_freset = 1;
+ +
+
+Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent
+Failure).
+
+.. note::
+
+ The current powerpc implementation does not try a power-cycle
+ reset if the driver returned PCI_ERS_RESULT_DISCONNECT.
+ However, it probably should.
+
+
+STEP 5: Resume Operations
+-------------------------
+The platform will call the resume() callback on all affected device
+drivers if all drivers on the segment have returned
+PCI_ERS_RESULT_RECOVERED from one of the 3 previous callbacks.
+The goal of this callback is to tell the driver to restart activity,
+that everything is back and running. This callback does not return
+a result code.
+
+At this point, if a new error happens, the platform will restart
+a new error recovery sequence.
+
+STEP 6: Permanent Failure
+-------------------------
+A "permanent failure" has occurred, and the platform cannot recover
+the device. The platform will call error_detected() with a
+pci_channel_state value of pci_channel_io_perm_failure.
+
+The device driver should, at this point, assume the worst. It should
+cancel all pending I/O, refuse all new I/O, returning -EIO to
+higher layers. The device driver should then clean up all of its
+memory and remove itself from kernel operations, much as it would
+during system shutdown.
+
+The platform will typically notify the system operator of the
+permanent failure in some way. If the device is hotplug-capable,
+the operator will probably want to remove and replace the device.
+Note, however, not all failures are truly "permanent". Some are
+caused by over-heating, some by a poorly seated card. Many
+PCI error events are caused by software bugs, e.g. DMA's to
+wild addresses or bogus split transactions due to programming
+errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
+for additional detail on real-life experience of the causes of
+software errors.
+
+
+Conclusion; General Remarks
+---------------------------
+The way the callbacks are called is platform policy. A platform with
+no slot reset capability may want to just "ignore" drivers that can't
+recover (disconnect them) and try to let other cards on the same segment
+recover. Keep in mind that in most real life cases, though, there will
+be only one driver per segment.
+
+Now, a note about interrupts. If you get an interrupt and your
+device is dead or has been isolated, there is a problem :)
+The current policy is to turn this into a platform policy.
+That is, the recovery API only requires that:
+
+ - There is no guarantee that interrupt delivery can proceed from any
+ device on the segment starting from the error detection and until the
+ slot_reset callback is called, at which point interrupts are expected
+ to be fully operational.
+
+ - There is no guarantee that interrupt delivery is stopped, that is,
+ a driver that gets an interrupt after detecting an error, or that detects
+ an error within the interrupt handler such that it prevents proper
+ ack'ing of the interrupt (and thus removal of the source) should just
+ return IRQ_NOTHANDLED. It's up to the platform to deal with that
+ condition, typically by masking the IRQ source during the duration of
+ the error handling. It is expected that the platform "knows" which
+ interrupts are routed to error-management capable slots and can deal
+ with temporarily disabling that IRQ number during error processing (this
+ isn't terribly complex). That means some IRQ latency for other devices
+ sharing the interrupt, but there is simply no other way. High end
+ platforms aren't supposed to share interrupts between many devices
+ anyway :)
+
+.. note::
+
+ Implementation details for the powerpc platform are discussed in
+ the file Documentation/powerpc/eeh-pci-error-recovery.txt
+
+ As of this writing, there is a growing list of device drivers with
+ patches implementing error recovery. Not all of these patches are in
+ mainline yet. These may be used as "examples":
+
+ - drivers/scsi/ipr
+ - drivers/scsi/sym53c8xx_2
+ - drivers/scsi/qla2xxx
+ - drivers/scsi/lpfc
+ - drivers/next/bnx2.c
+ - drivers/next/e100.c
+ - drivers/net/e1000
+ - drivers/net/e1000e
+ - drivers/net/ixgb
+ - drivers/net/ixgbe
+ - drivers/net/cxgb3
+ - drivers/net/s2io.c
+ - drivers/net/qlge
diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt
deleted file mode 100644
index 0b6bb3ef449e..000000000000
--- a/Documentation/PCI/pci-error-recovery.txt
+++ /dev/null
@@ -1,413 +0,0 @@
-
- PCI Error Recovery
- ------------------
- February 2, 2006
-
- Current document maintainer:
- Linas Vepstas <linasvepstas@gmail.com>
- updated by Richard Lary <rlary@us.ibm.com>
- and Mike Mason <mmlnx@us.ibm.com> on 27-Jul-2009
-
-
-Many PCI bus controllers are able to detect a variety of hardware
-PCI errors on the bus, such as parity errors on the data and address
-buses, as well as SERR and PERR errors. Some of the more advanced
-chipsets are able to deal with these errors; these include PCI-E chipsets,
-and the PCI-host bridges found on IBM Power4, Power5 and Power6-based
-pSeries boxes. A typical action taken is to disconnect the affected device,
-halting all I/O to it. The goal of a disconnection is to avoid system
-corruption; for example, to halt system memory corruption due to DMA's
-to "wild" addresses. Typically, a reconnection mechanism is also
-offered, so that the affected PCI device(s) are reset and put back
-into working condition. The reset phase requires coordination
-between the affected device drivers and the PCI controller chip.
-This document describes a generic API for notifying device drivers
-of a bus disconnection, and then performing error recovery.
-This API is currently implemented in the 2.6.16 and later kernels.
-
-Reporting and recovery is performed in several steps. First, when
-a PCI hardware error has resulted in a bus disconnect, that event
-is reported as soon as possible to all affected device drivers,
-including multiple instances of a device driver on multi-function
-cards. This allows device drivers to avoid deadlocking in spinloops,
-waiting for some i/o-space register to change, when it never will.
-It also gives the drivers a chance to defer incoming I/O as
-needed.
-
-Next, recovery is performed in several stages. Most of the complexity
-is forced by the need to handle multi-function devices, that is,
-devices that have multiple device drivers associated with them.
-In the first stage, each driver is allowed to indicate what type
-of reset it desires, the choices being a simple re-enabling of I/O
-or requesting a slot reset.
-
-If any driver requests a slot reset, that is what will be done.
-
-After a reset and/or a re-enabling of I/O, all drivers are
-again notified, so that they may then perform any device setup/config
-that may be required. After these have all completed, a final
-"resume normal operations" event is sent out.
-
-The biggest reason for choosing a kernel-based implementation rather
-than a user-space implementation was the need to deal with bus
-disconnects of PCI devices attached to storage media, and, in particular,
-disconnects from devices holding the root file system. If the root
-file system is disconnected, a user-space mechanism would have to go
-through a large number of contortions to complete recovery. Almost all
-of the current Linux file systems are not tolerant of disconnection
-from/reconnection to their underlying block device. By contrast,
-bus errors are easy to manage in the device driver. Indeed, most
-device drivers already handle very similar recovery procedures;
-for example, the SCSI-generic layer already provides significant
-mechanisms for dealing with SCSI bus errors and SCSI bus resets.
-
-
-Detailed Design
----------------
-Design and implementation details below, based on a chain of
-public email discussions with Ben Herrenschmidt, circa 5 April 2005.
-
-The error recovery API support is exposed to the driver in the form of
-a structure of function pointers pointed to by a new field in struct
-pci_driver. A driver that fails to provide the structure is "non-aware",
-and the actual recovery steps taken are platform dependent. The
-arch/powerpc implementation will simulate a PCI hotplug remove/add.
-
-This structure has the form:
-struct pci_error_handlers
-{
- int (*error_detected)(struct pci_dev *dev, enum pci_channel_state);
- int (*mmio_enabled)(struct pci_dev *dev);
- int (*slot_reset)(struct pci_dev *dev);
- void (*resume)(struct pci_dev *dev);
-};
-
-The possible channel states are:
-enum pci_channel_state {
- pci_channel_io_normal, /* I/O channel is in normal state */
- pci_channel_io_frozen, /* I/O to channel is blocked */
- pci_channel_io_perm_failure, /* PCI card is dead */
-};
-
-Possible return values are:
-enum pci_ers_result {
- PCI_ERS_RESULT_NONE, /* no result/none/not supported in device driver */
- PCI_ERS_RESULT_CAN_RECOVER, /* Device driver can recover without slot reset */
- PCI_ERS_RESULT_NEED_RESET, /* Device driver wants slot to be reset. */
- PCI_ERS_RESULT_DISCONNECT, /* Device has completely failed, is unrecoverable */
- PCI_ERS_RESULT_RECOVERED, /* Device driver is fully recovered and operational */
-};
-
-A driver does not have to implement all of these callbacks; however,
-if it implements any, it must implement error_detected(). If a callback
-is not implemented, the corresponding feature is considered unsupported.
-For example, if mmio_enabled() and resume() aren't there, then it
-is assumed that the driver is not doing any direct recovery and requires
-a slot reset. Typically a driver will want to know about
-a slot_reset().
-
-The actual steps taken by a platform to recover from a PCI error
-event will be platform-dependent, but will follow the general
-sequence described below.
-
-STEP 0: Error Event
--------------------
-A PCI bus error is detected by the PCI hardware. On powerpc, the slot
-is isolated, in that all I/O is blocked: all reads return 0xffffffff,
-all writes are ignored.
-
-
-STEP 1: Notification
---------------------
-Platform calls the error_detected() callback on every instance of
-every driver affected by the error.
-
-At this point, the device might not be accessible anymore, depending on
-the platform (the slot will be isolated on powerpc). The driver may
-already have "noticed" the error because of a failing I/O, but this
-is the proper "synchronization point", that is, it gives the driver
-a chance to cleanup, waiting for pending stuff (timers, whatever, etc...)
-to complete; it can take semaphores, schedule, etc... everything but
-touch the device. Within this function and after it returns, the driver
-shouldn't do any new IOs. Called in task context. This is sort of a
-"quiesce" point. See note about interrupts at the end of this doc.
-
-All drivers participating in this system must implement this call.
-The driver must return one of the following result codes:
- - PCI_ERS_RESULT_CAN_RECOVER:
- Driver returns this if it thinks it might be able to recover
- the HW by just banging IOs or if it wants to be given
- a chance to extract some diagnostic information (see
- mmio_enable, below).
- - PCI_ERS_RESULT_NEED_RESET:
- Driver returns this if it can't recover without a
- slot reset.
- - PCI_ERS_RESULT_DISCONNECT:
- Driver returns this if it doesn't want to recover at all.
-
-The next step taken will depend on the result codes returned by the
-drivers.
-
-If all drivers on the segment/slot return PCI_ERS_RESULT_CAN_RECOVER,
-then the platform should re-enable IOs on the slot (or do nothing in
-particular, if the platform doesn't isolate slots), and recovery
-proceeds to STEP 2 (MMIO Enable).
-
-If any driver requested a slot reset (by returning PCI_ERS_RESULT_NEED_RESET),
-then recovery proceeds to STEP 4 (Slot Reset).
-
-If the platform is unable to recover the slot, the next step
-is STEP 6 (Permanent Failure).
-
->>> The current powerpc implementation assumes that a device driver will
->>> *not* schedule or semaphore in this routine; the current powerpc
->>> implementation uses one kernel thread to notify all devices;
->>> thus, if one device sleeps/schedules, all devices are affected.
->>> Doing better requires complex multi-threaded logic in the error
->>> recovery implementation (e.g. waiting for all notification threads
->>> to "join" before proceeding with recovery.) This seems excessively
->>> complex and not worth implementing.
-
->>> The current powerpc implementation doesn't much care if the device
->>> attempts I/O at this point, or not. I/O's will fail, returning
->>> a value of 0xff on read, and writes will be dropped. If more than
->>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH
->>> assumes that the device driver has gone into an infinite loop
->>> and prints an error to syslog. A reboot is then required to
->>> get the device working again.
-
-STEP 2: MMIO Enabled
--------------------
-The platform re-enables MMIO to the device (but typically not the
-DMA), and then calls the mmio_enabled() callback on all affected
-device drivers.
-
-This is the "early recovery" call. IOs are allowed again, but DMA is
-not, with some restrictions. This is NOT a callback for the driver to
-start operations again, only to peek/poke at the device, extract diagnostic
-information, if any, and eventually do things like trigger a device local
-reset or some such, but not restart operations. This callback is made if
-all drivers on a segment agree that they can try to recover and if no automatic
-link reset was performed by the HW. If the platform can't just re-enable IOs
-without a slot reset or a link reset, it will not call this callback, and
-instead will have gone directly to STEP 3 (Link Reset) or STEP 4 (Slot Reset)
-
->>> The following is proposed; no platform implements this yet:
->>> Proposal: All I/O's should be done _synchronously_ from within
->>> this callback, errors triggered by them will be returned via
->>> the normal pci_check_whatever() API, no new error_detected()
->>> callback will be issued due to an error happening here. However,
->>> such an error might cause IOs to be re-blocked for the whole
->>> segment, and thus invalidate the recovery that other devices
->>> on the same segment might have done, forcing the whole segment
->>> into one of the next states, that is, link reset or slot reset.
-
-The driver should return one of the following result codes:
- - PCI_ERS_RESULT_RECOVERED
- Driver returns this if it thinks the device is fully
- functional and thinks it is ready to start
- normal driver operations again. There is no
- guarantee that the driver will actually be
- allowed to proceed, as another driver on the
- same segment might have failed and thus triggered a
- slot reset on platforms that support it.
-
- - PCI_ERS_RESULT_NEED_RESET
- Driver returns this if it thinks the device is not
- recoverable in its current state and it needs a slot
- reset to proceed.
-
- - PCI_ERS_RESULT_DISCONNECT
- Same as above. Total failure, no recovery even after
- reset driver dead. (To be defined more precisely)
-
-The next step taken depends on the results returned by the drivers.
-If all drivers returned PCI_ERS_RESULT_RECOVERED, then the platform
-proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
-
-If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
-proceeds to STEP 4 (Slot Reset)
-
-STEP 3: Link Reset
-------------------
-The platform resets the link. This is a PCI-Express specific step
-and is done whenever a fatal error has been detected that can be
-"solved" by resetting the link.
-
-STEP 4: Slot Reset
-------------------
-
-In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
-the platform will perform a slot reset on the requesting PCI device(s).
-The actual steps taken by a platform to perform a slot reset
-will be platform-dependent. Upon completion of slot reset, the
-platform will call the device slot_reset() callback.
-
-Powerpc platforms implement two levels of slot reset:
-soft reset(default) and fundamental(optional) reset.
-
-Powerpc soft reset consists of asserting the adapter #RST line and then
-restoring the PCI BAR's and PCI configuration header to a state
-that is equivalent to what it would be after a fresh system
-power-on followed by power-on BIOS/system firmware initialization.
-Soft reset is also known as hot-reset.
-
-Powerpc fundamental reset is supported by PCI Express cards only
-and results in device's state machines, hardware logic, port states and
-configuration registers to initialize to their default conditions.
-
-For most PCI devices, a soft reset will be sufficient for recovery.
-Optional fundamental reset is provided to support a limited number
-of PCI Express devices for which a soft reset is not sufficient
-for recovery.
-
-If the platform supports PCI hotplug, then the reset might be
-performed by toggling the slot electrical power off/on.
-
-It is important for the platform to restore the PCI config space
-to the "fresh poweron" state, rather than the "last state". After
-a slot reset, the device driver will almost always use its standard
-device initialization routines, and an unusual config space setup
-may result in hung devices, kernel panics, or silent data corruption.
-
-This call gives drivers the chance to re-initialize the hardware
-(re-download firmware, etc.). At this point, the driver may assume
-that the card is in a fresh state and is fully functional. The slot
-is unfrozen and the driver has full access to PCI config space,
-memory mapped I/O space and DMA. Interrupts (Legacy, MSI, or MSI-X)
-will also be available.
-
-Drivers should not restart normal I/O processing operations
-at this point. If all device drivers report success on this
-callback, the platform will call resume() to complete the sequence,
-and let the driver restart normal I/O processing.
-
-A driver can still return a critical failure for this function if
-it can't get the device operational after reset. If the platform
-previously tried a soft reset, it might now try a hard reset (power
-cycle) and then call slot_reset() again. It the device still can't
-be recovered, there is nothing more that can be done; the platform
-will typically report a "permanent failure" in such a case. The
-device will be considered "dead" in this case.
-
-Drivers for multi-function cards will need to coordinate among
-themselves as to which driver instance will perform any "one-shot"
-or global device initialization. For example, the Symbios sym53cxx2
-driver performs device init only from PCI function 0:
-
-+ if (PCI_FUNC(pdev->devfn) == 0)
-+ sym_reset_scsi_bus(np, 0);
-
- Result codes:
- - PCI_ERS_RESULT_DISCONNECT
- Same as above.
-
-Drivers for PCI Express cards that require a fundamental reset must
-set the needs_freset bit in the pci_dev structure in their probe function.
-For example, the QLogic qla2xxx driver sets the needs_freset bit for certain
-PCI card types:
-
-+ /* Set EEH reset type to fundamental if required by hba */
-+ if (IS_QLA24XX(ha) || IS_QLA25XX(ha) || IS_QLA81XX(ha))
-+ pdev->needs_freset = 1;
-+
-
-Platform proceeds either to STEP 5 (Resume Operations) or STEP 6 (Permanent
-Failure).
-
->>> The current powerpc implementation does not try a power-cycle
->>> reset if the driver returned PCI_ERS_RESULT_DISCONNECT.
->>> However, it probably should.
-
-
-STEP 5: Resume Operations
--------------------------
-The platform will call the resume() callback on all affected device
-drivers if all drivers on the segment have returned
-PCI_ERS_RESULT_RECOVERED from one of the 3 previous callbacks.
-The goal of this callback is to tell the driver to restart activity,
-that everything is back and running. This callback does not return
-a result code.
-
-At this point, if a new error happens, the platform will restart
-a new error recovery sequence.
-
-STEP 6: Permanent Failure
--------------------------
-A "permanent failure" has occurred, and the platform cannot recover
-the device. The platform will call error_detected() with a
-pci_channel_state value of pci_channel_io_perm_failure.
-
-The device driver should, at this point, assume the worst. It should
-cancel all pending I/O, refuse all new I/O, returning -EIO to
-higher layers. The device driver should then clean up all of its
-memory and remove itself from kernel operations, much as it would
-during system shutdown.
-
-The platform will typically notify the system operator of the
-permanent failure in some way. If the device is hotplug-capable,
-the operator will probably want to remove and replace the device.
-Note, however, not all failures are truly "permanent". Some are
-caused by over-heating, some by a poorly seated card. Many
-PCI error events are caused by software bugs, e.g. DMA's to
-wild addresses or bogus split transactions due to programming
-errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
-for additional detail on real-life experience of the causes of
-software errors.
-
-
-Conclusion; General Remarks
----------------------------
-The way the callbacks are called is platform policy. A platform with
-no slot reset capability may want to just "ignore" drivers that can't
-recover (disconnect them) and try to let other cards on the same segment
-recover. Keep in mind that in most real life cases, though, there will
-be only one driver per segment.
-
-Now, a note about interrupts. If you get an interrupt and your
-device is dead or has been isolated, there is a problem :)
-The current policy is to turn this into a platform policy.
-That is, the recovery API only requires that:
-
- - There is no guarantee that interrupt delivery can proceed from any
-device on the segment starting from the error detection and until the
-slot_reset callback is called, at which point interrupts are expected
-to be fully operational.
-
- - There is no guarantee that interrupt delivery is stopped, that is,
-a driver that gets an interrupt after detecting an error, or that detects
-an error within the interrupt handler such that it prevents proper
-ack'ing of the interrupt (and thus removal of the source) should just
-return IRQ_NOTHANDLED. It's up to the platform to deal with that
-condition, typically by masking the IRQ source during the duration of
-the error handling. It is expected that the platform "knows" which
-interrupts are routed to error-management capable slots and can deal
-with temporarily disabling that IRQ number during error processing (this
-isn't terribly complex). That means some IRQ latency for other devices
-sharing the interrupt, but there is simply no other way. High end
-platforms aren't supposed to share interrupts between many devices
-anyway :)
-
->>> Implementation details for the powerpc platform are discussed in
->>> the file Documentation/powerpc/eeh-pci-error-recovery.txt
-
->>> As of this writing, there is a growing list of device drivers with
->>> patches implementing error recovery. Not all of these patches are in
->>> mainline yet. These may be used as "examples":
->>>
->>> drivers/scsi/ipr
->>> drivers/scsi/sym53c8xx_2
->>> drivers/scsi/qla2xxx
->>> drivers/scsi/lpfc
->>> drivers/next/bnx2.c
->>> drivers/next/e100.c
->>> drivers/net/e1000
->>> drivers/net/e1000e
->>> drivers/net/ixgb
->>> drivers/net/ixgbe
->>> drivers/net/cxgb3
->>> drivers/net/s2io.c
->>> drivers/net/qlge
-
-The End
--------
diff --git a/Documentation/PCI/pci-iov-howto.rst b/Documentation/PCI/pci-iov-howto.rst
new file mode 100644
index 000000000000..b9fd003206f1
--- /dev/null
+++ b/Documentation/PCI/pci-iov-howto.rst
@@ -0,0 +1,172 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+====================================
+PCI Express I/O Virtualization Howto
+====================================
+
+:Copyright: |copy| 2009 Intel Corporation
+:Authors: - Yu Zhao <yu.zhao@intel.com>
+ - Donald Dutile <ddutile@redhat.com>
+
+Overview
+========
+
+What is SR-IOV
+--------------
+
+Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
+capability which makes one physical device appear as multiple virtual
+devices. The physical device is referred to as Physical Function (PF)
+while the virtual devices are referred to as Virtual Functions (VF).
+Allocation of the VF can be dynamically controlled by the PF via
+registers encapsulated in the capability. By default, this feature is
+not enabled and the PF behaves as traditional PCIe device. Once it's
+turned on, each VF's PCI configuration space can be accessed by its own
+Bus, Device and Function Number (Routing ID). And each VF also has PCI
+Memory Space, which is used to map its register set. VF device driver
+operates on the register set so it can be functional and appear as a
+real existing PCI device.
+
+User Guide
+==========
+
+How can I enable SR-IOV capability
+----------------------------------
+
+Multiple methods are available for SR-IOV enablement.
+In the first method, the device driver (PF driver) will control the
+enabling and disabling of the capability via API provided by SR-IOV core.
+If the hardware has SR-IOV capability, loading its PF driver would
+enable it and all VFs associated with the PF. Some PF drivers require
+a module parameter to be set to determine the number of VFs to enable.
+In the second method, a write to the sysfs file sriov_numvfs will
+enable and disable the VFs associated with a PCIe PF. This method
+enables per-PF, VF enable/disable values versus the first method,
+which applies to all PFs of the same device. Additionally, the
+PCI SRIOV core support ensures that enable/disable operations are
+valid to reduce duplication in multiple drivers for the same
+checks, e.g., check numvfs == 0 if enabling VFs, ensure
+numvfs <= totalvfs.
+The second method is the recommended method for new/future VF devices.
+
+How can I use the Virtual Functions
+-----------------------------------
+
+The VF is treated as hot-plugged PCI devices in the kernel, so they
+should be able to work in the same way as real PCI devices. The VF
+requires device driver that is same as a normal PCI device's.
+
+Developer Guide
+===============
+
+SR-IOV API
+----------
+
+To enable SR-IOV capability:
+
+(a) For the first method, in the driver::
+
+ int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
+
+'nr_virtfn' is number of VFs to be enabled.
+
+(b) For the second method, from sysfs::
+
+ echo 'nr_virtfn' > \
+ /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
+
+To disable SR-IOV capability:
+
+(a) For the first method, in the driver::
+
+ void pci_disable_sriov(struct pci_dev *dev);
+
+(b) For the second method, from sysfs::
+
+ echo 0 > \
+ /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
+
+To enable auto probing VFs by a compatible driver on the host, run
+command below before enabling SR-IOV capabilities. This is the
+default behavior.
+::
+
+ echo 1 > \
+ /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
+
+To disable auto probing VFs by a compatible driver on the host, run
+command below before enabling SR-IOV capabilities. Updating this
+entry will not affect VFs which are already probed.
+::
+
+ echo 0 > \
+ /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
+
+Usage example
+-------------
+
+Following piece of code illustrates the usage of the SR-IOV API.
+::
+
+ static int dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
+ {
+ pci_enable_sriov(dev, NR_VIRTFN);
+
+ ...
+
+ return 0;
+ }
+
+ static void dev_remove(struct pci_dev *dev)
+ {
+ pci_disable_sriov(dev);
+
+ ...
+ }
+
+ static int dev_suspend(struct pci_dev *dev, pm_message_t state)
+ {
+ ...
+
+ return 0;
+ }
+
+ static int dev_resume(struct pci_dev *dev)
+ {
+ ...
+
+ return 0;
+ }
+
+ static void dev_shutdown(struct pci_dev *dev)
+ {
+ ...
+ }
+
+ static int dev_sriov_configure(struct pci_dev *dev, int numvfs)
+ {
+ if (numvfs > 0) {
+ ...
+ pci_enable_sriov(dev, numvfs);
+ ...
+ return numvfs;
+ }
+ if (numvfs == 0) {
+ ....
+ pci_disable_sriov(dev);
+ ...
+ return 0;
+ }
+ }
+
+ static struct pci_driver dev_driver = {
+ .name = "SR-IOV Physical Function driver",
+ .id_table = dev_id_table,
+ .probe = dev_probe,
+ .remove = dev_remove,
+ .suspend = dev_suspend,
+ .resume = dev_resume,
+ .shutdown = dev_shutdown,
+ .sriov_configure = dev_sriov_configure,
+ };
diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt
deleted file mode 100644
index d2a84151e99c..000000000000
--- a/Documentation/PCI/pci-iov-howto.txt
+++ /dev/null
@@ -1,147 +0,0 @@
- PCI Express I/O Virtualization Howto
- Copyright (C) 2009 Intel Corporation
- Yu Zhao <yu.zhao@intel.com>
-
- Update: November 2012
- -- sysfs-based SRIOV enable-/disable-ment
- Donald Dutile <ddutile@redhat.com>
-
-1. Overview
-
-1.1 What is SR-IOV
-
-Single Root I/O Virtualization (SR-IOV) is a PCI Express Extended
-capability which makes one physical device appear as multiple virtual
-devices. The physical device is referred to as Physical Function (PF)
-while the virtual devices are referred to as Virtual Functions (VF).
-Allocation of the VF can be dynamically controlled by the PF via
-registers encapsulated in the capability. By default, this feature is
-not enabled and the PF behaves as traditional PCIe device. Once it's
-turned on, each VF's PCI configuration space can be accessed by its own
-Bus, Device and Function Number (Routing ID). And each VF also has PCI
-Memory Space, which is used to map its register set. VF device driver
-operates on the register set so it can be functional and appear as a
-real existing PCI device.
-
-2. User Guide
-
-2.1 How can I enable SR-IOV capability
-
-Multiple methods are available for SR-IOV enablement.
-In the first method, the device driver (PF driver) will control the
-enabling and disabling of the capability via API provided by SR-IOV core.
-If the hardware has SR-IOV capability, loading its PF driver would
-enable it and all VFs associated with the PF. Some PF drivers require
-a module parameter to be set to determine the number of VFs to enable.
-In the second method, a write to the sysfs file sriov_numvfs will
-enable and disable the VFs associated with a PCIe PF. This method
-enables per-PF, VF enable/disable values versus the first method,
-which applies to all PFs of the same device. Additionally, the
-PCI SRIOV core support ensures that enable/disable operations are
-valid to reduce duplication in multiple drivers for the same
-checks, e.g., check numvfs == 0 if enabling VFs, ensure
-numvfs <= totalvfs.
-The second method is the recommended method for new/future VF devices.
-
-2.2 How can I use the Virtual Functions
-
-The VF is treated as hot-plugged PCI devices in the kernel, so they
-should be able to work in the same way as real PCI devices. The VF
-requires device driver that is same as a normal PCI device's.
-
-3. Developer Guide
-
-3.1 SR-IOV API
-
-To enable SR-IOV capability:
-(a) For the first method, in the driver:
- int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
- 'nr_virtfn' is number of VFs to be enabled.
-(b) For the second method, from sysfs:
- echo 'nr_virtfn' > \
- /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
-
-To disable SR-IOV capability:
-(a) For the first method, in the driver:
- void pci_disable_sriov(struct pci_dev *dev);
-(b) For the second method, from sysfs:
- echo 0 > \
- /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs
-
-To enable auto probing VFs by a compatible driver on the host, run
-command below before enabling SR-IOV capabilities. This is the
-default behavior.
- echo 1 > \
- /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
-
-To disable auto probing VFs by a compatible driver on the host, run
-command below before enabling SR-IOV capabilities. Updating this
-entry will not affect VFs which are already probed.
- echo 0 > \
- /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe
-
-3.2 Usage example
-
-Following piece of code illustrates the usage of the SR-IOV API.
-
-static int dev_probe(struct pci_dev *dev, const struct pci_device_id *id)
-{
- pci_enable_sriov(dev, NR_VIRTFN);
-
- ...
-
- return 0;
-}
-
-static void dev_remove(struct pci_dev *dev)
-{
- pci_disable_sriov(dev);
-
- ...
-}
-
-static int dev_suspend(struct pci_dev *dev, pm_message_t state)
-{
- ...
-
- return 0;
-}
-
-static int dev_resume(struct pci_dev *dev)
-{
- ...
-
- return 0;
-}
-
-static void dev_shutdown(struct pci_dev *dev)
-{
- ...
-}
-
-static int dev_sriov_configure(struct pci_dev *dev, int numvfs)
-{
- if (numvfs > 0) {
- ...
- pci_enable_sriov(dev, numvfs);
- ...
- return numvfs;
- }
- if (numvfs == 0) {
- ....
- pci_disable_sriov(dev);
- ...
- return 0;
- }
-}
-
-static struct pci_driver dev_driver = {
- .name = "SR-IOV Physical Function driver",
- .id_table = dev_id_table,
- .probe = dev_probe,
- .remove = dev_remove,
- .suspend = dev_suspend,
- .resume = dev_resume,
- .shutdown = dev_shutdown,
- .sriov_configure = dev_sriov_configure,
-};
diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst
new file mode 100644
index 000000000000..6864f9a70f5f
--- /dev/null
+++ b/Documentation/PCI/pci.rst
@@ -0,0 +1,578 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+How To Write Linux PCI Drivers
+==============================
+
+:Authors: - Martin Mares <mj@ucw.cz>
+ - Grant Grundler <grundler@parisc-linux.org>
+
+The world of PCI is vast and full of (mostly unpleasant) surprises.
+Since each CPU architecture implements different chip-sets and PCI devices
+have different requirements (erm, "features"), the result is the PCI support
+in the Linux kernel is not as trivial as one would wish. This short paper
+tries to introduce all potential driver authors to Linux APIs for
+PCI device drivers.
+
+A more complete resource is the third edition of "Linux Device Drivers"
+by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
+LDD3 is available for free (under Creative Commons License) from:
+http://lwn.net/Kernel/LDD3/.
+
+However, keep in mind that all documents are subject to "bit rot".
+Refer to the source code if things are not working as described here.
+
+Please send questions/comments/patches about Linux PCI API to the
+"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
+
+
+Structure of PCI drivers
+========================
+PCI drivers "discover" PCI devices in a system via pci_register_driver().
+Actually, it's the other way around. When the PCI generic code discovers
+a new device, the driver with a matching "description" will be notified.
+Details on this below.
+
+pci_register_driver() leaves most of the probing for devices to
+the PCI layer and supports online insertion/removal of devices [thus
+supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
+pci_register_driver() call requires passing in a table of function
+pointers and thus dictates the high level structure of a driver.
+
+Once the driver knows about a PCI device and takes ownership, the
+driver generally needs to perform the following initialization:
+
+ - Enable the device
+ - Request MMIO/IOP resources
+ - Set the DMA mask size (for both coherent and streaming DMA)
+ - Allocate and initialize shared control data (pci_allocate_coherent())
+ - Access device configuration space (if needed)
+ - Register IRQ handler (request_irq())
+ - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+ - Enable DMA/processing engines
+
+When done using the device, and perhaps the module needs to be unloaded,
+the driver needs to take the follow steps:
+
+ - Disable the device from generating IRQs
+ - Release the IRQ (free_irq())
+ - Stop all DMA activity
+ - Release DMA buffers (both streaming and coherent)
+ - Unregister from other subsystems (e.g. scsi or netdev)
+ - Release MMIO/IOP resources
+ - Disable the device
+
+Most of these topics are covered in the following sections.
+For the rest look at LDD3 or <linux/pci.h> .
+
+If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
+the PCI functions described below are defined as inline functions either
+completely empty or just returning an appropriate error codes to avoid
+lots of ifdefs in the drivers.
+
+
+pci_register_driver() call
+==========================
+
+PCI device drivers call ``pci_register_driver()`` during their
+initialization with a pointer to a structure describing the driver
+(``struct pci_driver``):
+
+.. kernel-doc:: include/linux/pci.h
+ :functions: pci_driver
+
+The ID table is an array of ``struct pci_device_id`` entries ending with an
+all-zero entry. Definitions with static const are generally preferred.
+
+.. kernel-doc:: include/linux/mod_devicetable.h
+ :functions: pci_device_id
+
+Most drivers only need ``PCI_DEVICE()`` or ``PCI_DEVICE_CLASS()`` to set up
+a pci_device_id table.
+
+New PCI IDs may be added to a device driver pci_ids table at runtime
+as shown below::
+
+ echo "vendor device subvendor subdevice class class_mask driver_data" > \
+ /sys/bus/pci/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The vendor and device fields are mandatory, the others are optional. Users
+need pass only as many optional fields as necessary:
+
+ - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
+ - class and classmask fields default to 0
+ - driver_data defaults to 0UL.
+
+Note that driver_data must match the value used by any of the pci_device_id
+entries defined in the driver. This makes the driver_data field mandatory
+if all the pci_device_id entries have a non-zero driver_data value.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCI devices listed in its (newly updated) pci_ids list.
+
+When the driver exits, it just calls pci_unregister_driver() and the PCI layer
+automatically calls the remove hook for all devices handled by the driver.
+
+
+"Attributes" for driver functions/data
+--------------------------------------
+
+Please mark the initialization and cleanup functions where appropriate
+(the corresponding macros are defined in <linux/init.h>):
+
+ ====== =================================================
+ __init Initialization code. Thrown away after the driver
+ initializes.
+ __exit Exit code. Ignored for non-modular drivers.
+ ====== =================================================
+
+Tips on when/where to use the above attributes:
+ - The module_init()/module_exit() functions (and all
+ initialization functions called _only_ from these)
+ should be marked __init/__exit.
+
+ - Do not mark the struct pci_driver.
+
+ - Do NOT mark a function if you are not sure which mark to use.
+ Better to not mark the function than mark the function wrong.
+
+
+How to find PCI devices manually
+================================
+
+PCI drivers should have a really good reason for not using the
+pci_register_driver() interface to search for PCI devices.
+The main reason PCI devices are controlled by multiple drivers
+is because one PCI device implements several different HW services.
+E.g. combined serial/parallel port/floppy controller.
+
+A manual search may be performed using the following constructs:
+
+Searching by vendor and device ID::
+
+ struct pci_dev *dev = NULL;
+ while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
+ configure_device(dev);
+
+Searching by class ID (iterate in a similar way)::
+
+ pci_get_class(CLASS_ID, dev)
+
+Searching by both vendor/device and subsystem vendor/device ID::
+
+ pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
+
+You can use the constant PCI_ANY_ID as a wildcard replacement for
+VENDOR_ID or DEVICE_ID. This allows searching for any device from a
+specific vendor, for example.
+
+These functions are hotplug-safe. They increment the reference count on
+the pci_dev that they return. You must eventually (possibly at module unload)
+decrement the reference count on these devices by calling pci_dev_put().
+
+
+Device Initialization Steps
+===========================
+
+As noted in the introduction, most PCI drivers need the following steps
+for device initialization:
+
+ - Enable the device
+ - Request MMIO/IOP resources
+ - Set the DMA mask size (for both coherent and streaming DMA)
+ - Allocate and initialize shared control data (pci_allocate_coherent())
+ - Access device configuration space (if needed)
+ - Register IRQ handler (request_irq())
+ - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+ - Enable DMA/processing engines.
+
+The driver can access PCI config space registers at any time.
+(Well, almost. When running BIST, config space can go away...but
+that will just result in a PCI Bus Master Abort and config reads
+will return garbage).
+
+
+Enable the PCI device
+---------------------
+Before touching any device registers, the driver needs to enable
+the PCI device by calling pci_enable_device(). This will:
+
+ - wake up the device if it was in suspended state,
+ - allocate I/O and memory regions of the device (if BIOS did not),
+ - allocate an IRQ (if BIOS did not).
+
+.. note::
+ pci_enable_device() can fail! Check the return value.
+
+.. warning::
+ OS BUG: we don't check resource allocations before enabling those
+ resources. The sequence would make more sense if we called
+ pci_request_resources() before calling pci_enable_device().
+ Currently, the device drivers can't detect the bug when when two
+ devices have been allocated the same range. This is not a common
+ problem and unlikely to get fixed soon.
+
+ This has been discussed before but not changed as of 2.6.19:
+ http://lkml.org/lkml/2006/3/2/194
+
+
+pci_set_master() will enable DMA by setting the bus master bit
+in the PCI_COMMAND register. It also fixes the latency timer value if
+it's set to something bogus by the BIOS. pci_clear_master() will
+disable DMA by clearing the bus master bit.
+
+If the PCI device can use the PCI Memory-Write-Invalidate transaction,
+call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval
+and also ensures that the cache line size register is set correctly.
+Check the return value of pci_set_mwi() as not all architectures
+or chip-sets may support Memory-Write-Invalidate. Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
+
+
+Request MMIO/IOP resources
+--------------------------
+Memory (MMIO), and I/O port addresses should NOT be read directly
+from the PCI device config space. Use the values in the pci_dev structure
+as the PCI "bus address" might have been remapped to a "host physical"
+address by the arch/chip-set specific kernel support.
+
+See Documentation/io-mapping.txt for how to access device registers
+or device memory.
+
+The device driver needs to call pci_request_region() to verify
+no other device is already using the same address resource.
+Conversely, drivers should call pci_release_region() AFTER
+calling pci_disable_device().
+The idea is to prevent two devices colliding on the same address range.
+
+.. tip::
+ See OS BUG comment above. Currently (2.6.19), The driver can only
+ determine MMIO and IO Port resource availability _after_ calling
+ pci_enable_device().
+
+Generic flavors of pci_request_region() are request_mem_region()
+(for MMIO ranges) and request_region() (for IO Port ranges).
+Use these for address resources that are not described by "normal" PCI
+BARs.
+
+Also see pci_request_selected_regions() below.
+
+
+Set the DMA mask size
+---------------------
+.. note::
+ If anything below doesn't make sense, please refer to
+ Documentation/DMA-API.txt. This section is just a reminder that
+ drivers need to indicate DMA capabilities of the device and is not
+ an authoritative source for DMA interfaces.
+
+While all drivers should explicitly indicate the DMA capability
+(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
+32-bit bus master capability for streaming data need the driver
+to "register" this capability by calling pci_set_dma_mask() with
+appropriate parameters. In general this allows more efficient DMA
+on systems where System RAM exists above 4G _physical_ address.
+
+Drivers for all PCI-X and PCIe compliant devices must call
+pci_set_dma_mask() as they are 64-bit DMA devices.
+
+Similarly, drivers must also "register" this capability if the device
+can directly address "consistent memory" in System RAM above 4G physical
+address by calling pci_set_consistent_dma_mask().
+Again, this includes drivers for all PCI-X and PCIe compliant devices.
+Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
+64-bit DMA capable for payload ("streaming") data but not control
+("consistent") data.
+
+
+Setup shared control data
+-------------------------
+Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+memory. See Documentation/DMA-API.txt for a full description of
+the DMA APIs. This section is just a reminder that it needs to be done
+before enabling DMA on the device.
+
+
+Initialize device registers
+---------------------------
+Some drivers will need specific "capability" fields programmed
+or other "vendor specific" register initialized or reset.
+E.g. clearing pending interrupts.
+
+
+Register IRQ handler
+--------------------
+While calling request_irq() is the last step described here,
+this is often just another intermediate step to initialize a device.
+This step can often be deferred until the device is opened for use.
+
+All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
+and use the devid to map IRQs to devices (remember that all PCI IRQ lines
+can be shared).
+
+request_irq() will associate an interrupt handler and device handle
+with an interrupt number. Historically interrupt numbers represent
+IRQ lines which run from the PCI device to the Interrupt controller.
+With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
+
+request_irq() also enables the interrupt. Make sure the device is
+quiesced and does not have any interrupts pending before registering
+the interrupt handler.
+
+MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
+which deliver interrupts to the CPU via a DMA write to a Local APIC.
+The fundamental difference between MSI and MSI-X is how multiple
+"vectors" get allocated. MSI requires contiguous blocks of vectors
+while MSI-X can allocate several individual ones.
+
+MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
+PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
+causes the PCI support to program CPU vector data into the PCI device
+capability registers. Many architectures, chip-sets, or BIOSes do NOT
+support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
+the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
+specify PCI_IRQ_LEGACY as well.
+
+Drivers that have different interrupt handlers for MSI/MSI-X and
+legacy INTx should chose the right one based on the msi_enabled
+and msix_enabled flags in the pci_dev structure after calling
+pci_alloc_irq_vectors.
+
+There are (at least) two really good reasons for using MSI:
+
+1) MSI is an exclusive interrupt vector by definition.
+ This means the interrupt handler doesn't have to verify
+ its device caused the interrupt.
+
+2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
+ to be visible to the host CPU(s) when the MSI is delivered. This
+ is important for both data coherency and avoiding stale control data.
+ This guarantee allows the driver to omit MMIO reads to flush
+ the DMA stream.
+
+See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
+of MSI/MSI-X usage.
+
+
+PCI device shutdown
+===================
+
+When a PCI device driver is being unloaded, most of the following
+steps need to be performed:
+
+ - Disable the device from generating IRQs
+ - Release the IRQ (free_irq())
+ - Stop all DMA activity
+ - Release DMA buffers (both streaming and consistent)
+ - Unregister from other subsystems (e.g. scsi or netdev)
+ - Disable device from responding to MMIO/IO Port addresses
+ - Release MMIO/IO Port resource(s)
+
+
+Stop IRQs on the device
+-----------------------
+How to do this is chip/device specific. If it's not done, it opens
+the possibility of a "screaming interrupt" if (and only if)
+the IRQ is shared with another device.
+
+When the shared IRQ handler is "unhooked", the remaining devices
+using the same IRQ line will still need the IRQ enabled. Thus if the
+"unhooked" device asserts IRQ line, the system will respond assuming
+it was one of the remaining devices asserted the IRQ line. Since none
+of the other devices will handle the IRQ, the system will "hang" until
+it decides the IRQ isn't going to get handled and masks the IRQ (100,000
+iterations later). Once the shared IRQ is masked, the remaining devices
+will stop functioning properly. Not a nice situation.
+
+This is another reason to use MSI or MSI-X if it's available.
+MSI and MSI-X are defined to be exclusive interrupts and thus
+are not susceptible to the "screaming interrupt" problem.
+
+
+Release the IRQ
+---------------
+Once the device is quiesced (no more IRQs), one can call free_irq().
+This function will return control once any pending IRQs are handled,
+"unhook" the drivers IRQ handler from that IRQ, and finally release
+the IRQ if no one else is using it.
+
+
+Stop all DMA activity
+---------------------
+It's extremely important to stop all DMA operations BEFORE attempting
+to deallocate DMA control data. Failure to do so can result in memory
+corruption, hangs, and on some chip-sets a hard crash.
+
+Stopping DMA after stopping the IRQs can avoid races where the
+IRQ handler might restart DMA engines.
+
+While this step sounds obvious and trivial, several "mature" drivers
+didn't get this step right in the past.
+
+
+Release DMA buffers
+-------------------
+Once DMA is stopped, clean up streaming DMA first.
+I.e. unmap data buffers and return buffers to "upstream"
+owners if there is one.
+
+Then clean up "consistent" buffers which contain the control data.
+
+See Documentation/DMA-API.txt for details on unmapping interfaces.
+
+
+Unregister from other subsystems
+--------------------------------
+Most low level PCI device drivers support some other subsystem
+like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
+driver isn't losing resources from that other subsystem.
+If this happens, typically the symptom is an Oops (panic) when
+the subsystem attempts to call into a driver that has been unloaded.
+
+
+Disable Device from responding to MMIO/IO Port addresses
+--------------------------------------------------------
+io_unmap() MMIO or IO Port resources and then call pci_disable_device().
+This is the symmetric opposite of pci_enable_device().
+Do not access device registers after calling pci_disable_device().
+
+
+Release MMIO/IO Port Resource(s)
+--------------------------------
+Call pci_release_region() to mark the MMIO or IO Port range as available.
+Failure to do so usually results in the inability to reload the driver.
+
+
+How to access PCI config space
+==============================
+
+You can use `pci_(read|write)_config_(byte|word|dword)` to access the config
+space of a device represented by `struct pci_dev *`. All these functions return
+0 when successful or an error code (`PCIBIOS_...`) which can be translated to a
+text string by pcibios_strerror. Most drivers expect that accesses to valid PCI
+devices don't fail.
+
+If you don't have a struct pci_dev available, you can call
+`pci_bus_(read|write)_config_(byte|word|dword)` to access a given device
+and function on that bus.
+
+If you access fields in the standard portion of the config header, please
+use symbolic names of locations and bits declared in <linux/pci.h>.
+
+If you need to access Extended PCI Capability registers, just call
+pci_find_capability() for the particular capability and it will find the
+corresponding register block for you.
+
+
+Other interesting functions
+===========================
+
+============================= ================================================
+pci_get_domain_bus_and_slot() Find pci_dev corresponding to given domain,
+ bus and slot and number. If the device is
+ found, its reference count is increased.
+pci_set_power_state() Set PCI Power Management state (0=D0 ... 3=D3)
+pci_find_capability() Find specified capability in device's capability
+ list.
+pci_resource_start() Returns bus start address for a given PCI region
+pci_resource_end() Returns bus end address for a given PCI region
+pci_resource_len() Returns the byte length of a PCI region
+pci_set_drvdata() Set private driver data pointer for a pci_dev
+pci_get_drvdata() Return private driver data pointer for a pci_dev
+pci_set_mwi() Enable Memory-Write-Invalidate transactions.
+pci_clear_mwi() Disable Memory-Write-Invalidate transactions.
+============================= ================================================
+
+
+Miscellaneous hints
+===================
+
+When displaying PCI device names to the user (for example when a driver wants
+to tell the user what card has it found), please use pci_name(pci_dev).
+
+Always refer to the PCI devices by a pointer to the pci_dev structure.
+All PCI layer functions use this identification and it's the only
+reasonable one. Don't use bus/slot/function numbers except for very
+special purposes -- on systems with multiple primary buses their semantics
+can be pretty complex.
+
+Don't try to turn on Fast Back to Back writes in your driver. All devices
+on the bus need to be capable of doing it, so this is something which needs
+to be handled by platform and generic code, not individual drivers.
+
+
+Vendor and device identifications
+=================================
+
+Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
+are shared across multiple drivers. You can add private definitions in
+your driver if they're helpful, or just use plain hex constants.
+
+The device IDs are arbitrary hex numbers (vendor controlled) and normally used
+only in a single location, the pci_device_id table.
+
+Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
+There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
+and https://github.com/pciutils/pciids.
+
+
+Obsolete functions
+==================
+
+There are several functions which you might come across when trying to
+port an old driver to the new PCI interface. They are no longer present
+in the kernel as they aren't compatible with hotplug or PCI domains or
+having sane locking.
+
+================= ===========================================
+pci_find_device() Superseded by pci_get_device()
+pci_find_subsys() Superseded by pci_get_subsys()
+pci_find_slot() Superseded by pci_get_domain_bus_and_slot()
+pci_get_slot() Superseded by pci_get_domain_bus_and_slot()
+================= ===========================================
+
+The alternative is the traditional PCI device driver that walks PCI
+device lists. This is still possible but discouraged.
+
+
+MMIO Space and "Write Posting"
+==============================
+
+Converting a driver from using I/O Port space to using MMIO space
+often requires some additional changes. Specifically, "write posting"
+needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
+already do this. I/O Port space guarantees write transactions reach the PCI
+device before the CPU can continue. Writes to MMIO space allow the CPU
+to continue before the transaction reaches the PCI device. HW weenies
+call this "Write Posting" because the write completion is "posted" to
+the CPU before the transaction has reached its destination.
+
+Thus, timing sensitive code should add readl() where the CPU is
+expected to wait before doing other work. The classic "bit banging"
+sequence works fine for I/O Port space::
+
+ for (i = 8; --i; val >>= 1) {
+ outb(val & 1, ioport_reg); /* write bit */
+ udelay(10);
+ }
+
+The same sequence for MMIO space should be::
+
+ for (i = 8; --i; val >>= 1) {
+ writeb(val & 1, mmio_reg); /* write bit */
+ readb(safe_mmio_reg); /* flush posted write */
+ udelay(10);
+ }
+
+It is important that "safe_mmio_reg" not have any side effects that
+interferes with the correct operation of the device.
+
+Another case to watch out for is when resetting a PCI device. Use PCI
+Configuration space reads to flush the writel(). This will gracefully
+handle the PCI master abort on all platforms if the PCI device is
+expected to not respond to a readl(). Most x86 platforms will allow
+MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
+(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt
deleted file mode 100644
index badb26ac33dc..000000000000
--- a/Documentation/PCI/pci.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-
- How To Write Linux PCI Drivers
-
- by Martin Mares <mj@ucw.cz> on 07-Feb-2000
- updated by Grant Grundler <grundler@parisc-linux.org> on 23-Dec-2006
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The world of PCI is vast and full of (mostly unpleasant) surprises.
-Since each CPU architecture implements different chip-sets and PCI devices
-have different requirements (erm, "features"), the result is the PCI support
-in the Linux kernel is not as trivial as one would wish. This short paper
-tries to introduce all potential driver authors to Linux APIs for
-PCI device drivers.
-
-A more complete resource is the third edition of "Linux Device Drivers"
-by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
-LDD3 is available for free (under Creative Commons License) from:
-
- http://lwn.net/Kernel/LDD3/
-
-However, keep in mind that all documents are subject to "bit rot".
-Refer to the source code if things are not working as described here.
-
-Please send questions/comments/patches about Linux PCI API to the
-"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
-
-
-
-0. Structure of PCI drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PCI drivers "discover" PCI devices in a system via pci_register_driver().
-Actually, it's the other way around. When the PCI generic code discovers
-a new device, the driver with a matching "description" will be notified.
-Details on this below.
-
-pci_register_driver() leaves most of the probing for devices to
-the PCI layer and supports online insertion/removal of devices [thus
-supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
-pci_register_driver() call requires passing in a table of function
-pointers and thus dictates the high level structure of a driver.
-
-Once the driver knows about a PCI device and takes ownership, the
-driver generally needs to perform the following initialization:
-
- Enable the device
- Request MMIO/IOP resources
- Set the DMA mask size (for both coherent and streaming DMA)
- Allocate and initialize shared control data (pci_allocate_coherent())
- Access device configuration space (if needed)
- Register IRQ handler (request_irq())
- Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
- Enable DMA/processing engines
-
-When done using the device, and perhaps the module needs to be unloaded,
-the driver needs to take the follow steps:
- Disable the device from generating IRQs
- Release the IRQ (free_irq())
- Stop all DMA activity
- Release DMA buffers (both streaming and coherent)
- Unregister from other subsystems (e.g. scsi or netdev)
- Release MMIO/IOP resources
- Disable the device
-
-Most of these topics are covered in the following sections.
-For the rest look at LDD3 or <linux/pci.h> .
-
-If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
-the PCI functions described below are defined as inline functions either
-completely empty or just returning an appropriate error codes to avoid
-lots of ifdefs in the drivers.
-
-
-
-1. pci_register_driver() call
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI device drivers call pci_register_driver() during their
-initialization with a pointer to a structure describing the driver
-(struct pci_driver):
-
- field name Description
- ---------- ------------------------------------------------------
- id_table Pointer to table of device ID's the driver is
- interested in. Most drivers should export this
- table using MODULE_DEVICE_TABLE(pci,...).
-
- probe This probing function gets called (during execution
- of pci_register_driver() for already existing
- devices or later if a new device gets inserted) for
- all PCI devices which match the ID table and are not
- "owned" by the other drivers yet. This function gets
- passed a "struct pci_dev *" for each device whose
- entry in the ID table matches the device. The probe
- function returns zero when the driver chooses to
- take "ownership" of the device or an error code
- (negative number) otherwise.
- The probe function always gets called from process
- context, so it can sleep.
-
- remove The remove() function gets called whenever a device
- being handled by this driver is removed (either during
- deregistration of the driver or when it's manually
- pulled out of a hot-pluggable slot).
- The remove function always gets called from process
- context, so it can sleep.
-
- suspend Put device into low power state.
- suspend_late Put device into low power state.
-
- resume_early Wake device from low power state.
- resume Wake device from low power state.
-
- (Please see Documentation/power/pci.txt for descriptions
- of PCI Power Management and the related functions.)
-
- shutdown Hook into reboot_notifier_list (kernel/sys.c).
- Intended to stop any idling DMA operations.
- Useful for enabling wake-on-lan (NIC) or changing
- the power state of a device before reboot.
- e.g. drivers/net/e100.c.
-
- err_handler See Documentation/PCI/pci-error-recovery.txt
-
-
-The ID table is an array of struct pci_device_id entries ending with an
-all-zero entry. Definitions with static const are generally preferred.
-
-Each entry consists of:
-
- vendor,device Vendor and device ID to match (or PCI_ANY_ID)
-
- subvendor, Subsystem vendor and device ID to match (or PCI_ANY_ID)
- subdevice,
-
- class Device class, subclass, and "interface" to match.
- See Appendix D of the PCI Local Bus Spec or
- include/linux/pci_ids.h for a full list of classes.
- Most drivers do not need to specify class/class_mask
- as vendor/device is normally sufficient.
-
- class_mask limit which sub-fields of the class field are compared.
- See drivers/scsi/sym53c8xx_2/ for example of usage.
-
- driver_data Data private to the driver.
- Most drivers don't need to use driver_data field.
- Best practice is to use driver_data as an index
- into a static list of equivalent device types,
- instead of using it as a pointer.
-
-
-Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up
-a pci_device_id table.
-
-New PCI IDs may be added to a device driver pci_ids table at runtime
-as shown below:
-
-echo "vendor device subvendor subdevice class class_mask driver_data" > \
-/sys/bus/pci/drivers/{driver}/new_id
-
-All fields are passed in as hexadecimal values (no leading 0x).
-The vendor and device fields are mandatory, the others are optional. Users
-need pass only as many optional fields as necessary:
- o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
- o class and classmask fields default to 0
- o driver_data defaults to 0UL.
-
-Note that driver_data must match the value used by any of the pci_device_id
-entries defined in the driver. This makes the driver_data field mandatory
-if all the pci_device_id entries have a non-zero driver_data value.
-
-Once added, the driver probe routine will be invoked for any unclaimed
-PCI devices listed in its (newly updated) pci_ids list.
-
-When the driver exits, it just calls pci_unregister_driver() and the PCI layer
-automatically calls the remove hook for all devices handled by the driver.
-
-
-1.1 "Attributes" for driver functions/data
-
-Please mark the initialization and cleanup functions where appropriate
-(the corresponding macros are defined in <linux/init.h>):
-
- __init Initialization code. Thrown away after the driver
- initializes.
- __exit Exit code. Ignored for non-modular drivers.
-
-Tips on when/where to use the above attributes:
- o The module_init()/module_exit() functions (and all
- initialization functions called _only_ from these)
- should be marked __init/__exit.
-
- o Do not mark the struct pci_driver.
-
- o Do NOT mark a function if you are not sure which mark to use.
- Better to not mark the function than mark the function wrong.
-
-
-
-2. How to find PCI devices manually
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI drivers should have a really good reason for not using the
-pci_register_driver() interface to search for PCI devices.
-The main reason PCI devices are controlled by multiple drivers
-is because one PCI device implements several different HW services.
-E.g. combined serial/parallel port/floppy controller.
-
-A manual search may be performed using the following constructs:
-
-Searching by vendor and device ID:
-
- struct pci_dev *dev = NULL;
- while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
- configure_device(dev);
-
-Searching by class ID (iterate in a similar way):
-
- pci_get_class(CLASS_ID, dev)
-
-Searching by both vendor/device and subsystem vendor/device ID:
-
- pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev).
-
-You can use the constant PCI_ANY_ID as a wildcard replacement for
-VENDOR_ID or DEVICE_ID. This allows searching for any device from a
-specific vendor, for example.
-
-These functions are hotplug-safe. They increment the reference count on
-the pci_dev that they return. You must eventually (possibly at module unload)
-decrement the reference count on these devices by calling pci_dev_put().
-
-
-
-3. Device Initialization Steps
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-As noted in the introduction, most PCI drivers need the following steps
-for device initialization:
-
- Enable the device
- Request MMIO/IOP resources
- Set the DMA mask size (for both coherent and streaming DMA)
- Allocate and initialize shared control data (pci_allocate_coherent())
- Access device configuration space (if needed)
- Register IRQ handler (request_irq())
- Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
- Enable DMA/processing engines.
-
-The driver can access PCI config space registers at any time.
-(Well, almost. When running BIST, config space can go away...but
-that will just result in a PCI Bus Master Abort and config reads
-will return garbage).
-
-
-3.1 Enable the PCI device
-~~~~~~~~~~~~~~~~~~~~~~~~~
-Before touching any device registers, the driver needs to enable
-the PCI device by calling pci_enable_device(). This will:
- o wake up the device if it was in suspended state,
- o allocate I/O and memory regions of the device (if BIOS did not),
- o allocate an IRQ (if BIOS did not).
-
-NOTE: pci_enable_device() can fail! Check the return value.
-
-[ OS BUG: we don't check resource allocations before enabling those
- resources. The sequence would make more sense if we called
- pci_request_resources() before calling pci_enable_device().
- Currently, the device drivers can't detect the bug when when two
- devices have been allocated the same range. This is not a common
- problem and unlikely to get fixed soon.
-
- This has been discussed before but not changed as of 2.6.19:
- http://lkml.org/lkml/2006/3/2/194
-]
-
-pci_set_master() will enable DMA by setting the bus master bit
-in the PCI_COMMAND register. It also fixes the latency timer value if
-it's set to something bogus by the BIOS. pci_clear_master() will
-disable DMA by clearing the bus master bit.
-
-If the PCI device can use the PCI Memory-Write-Invalidate transaction,
-call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval
-and also ensures that the cache line size register is set correctly.
-Check the return value of pci_set_mwi() as not all architectures
-or chip-sets may support Memory-Write-Invalidate. Alternatively,
-if Mem-Wr-Inval would be nice to have but is not required, call
-pci_try_set_mwi() to have the system do its best effort at enabling
-Mem-Wr-Inval.
-
-
-3.2 Request MMIO/IOP resources
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Memory (MMIO), and I/O port addresses should NOT be read directly
-from the PCI device config space. Use the values in the pci_dev structure
-as the PCI "bus address" might have been remapped to a "host physical"
-address by the arch/chip-set specific kernel support.
-
-See Documentation/io-mapping.txt for how to access device registers
-or device memory.
-
-The device driver needs to call pci_request_region() to verify
-no other device is already using the same address resource.
-Conversely, drivers should call pci_release_region() AFTER
-calling pci_disable_device().
-The idea is to prevent two devices colliding on the same address range.
-
-[ See OS BUG comment above. Currently (2.6.19), The driver can only
- determine MMIO and IO Port resource availability _after_ calling
- pci_enable_device(). ]
-
-Generic flavors of pci_request_region() are request_mem_region()
-(for MMIO ranges) and request_region() (for IO Port ranges).
-Use these for address resources that are not described by "normal" PCI
-BARs.
-
-Also see pci_request_selected_regions() below.
-
-
-3.3 Set the DMA mask size
-~~~~~~~~~~~~~~~~~~~~~~~~~
-[ If anything below doesn't make sense, please refer to
- Documentation/DMA-API.txt. This section is just a reminder that
- drivers need to indicate DMA capabilities of the device and is not
- an authoritative source for DMA interfaces. ]
-
-While all drivers should explicitly indicate the DMA capability
-(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
-32-bit bus master capability for streaming data need the driver
-to "register" this capability by calling pci_set_dma_mask() with
-appropriate parameters. In general this allows more efficient DMA
-on systems where System RAM exists above 4G _physical_ address.
-
-Drivers for all PCI-X and PCIe compliant devices must call
-pci_set_dma_mask() as they are 64-bit DMA devices.
-
-Similarly, drivers must also "register" this capability if the device
-can directly address "consistent memory" in System RAM above 4G physical
-address by calling pci_set_consistent_dma_mask().
-Again, this includes drivers for all PCI-X and PCIe compliant devices.
-Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
-64-bit DMA capable for payload ("streaming") data but not control
-("consistent") data.
-
-
-3.4 Setup shared control data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
-memory. See Documentation/DMA-API.txt for a full description of
-the DMA APIs. This section is just a reminder that it needs to be done
-before enabling DMA on the device.
-
-
-3.5 Initialize device registers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers will need specific "capability" fields programmed
-or other "vendor specific" register initialized or reset.
-E.g. clearing pending interrupts.
-
-
-3.6 Register IRQ handler
-~~~~~~~~~~~~~~~~~~~~~~~~
-While calling request_irq() is the last step described here,
-this is often just another intermediate step to initialize a device.
-This step can often be deferred until the device is opened for use.
-
-All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
-and use the devid to map IRQs to devices (remember that all PCI IRQ lines
-can be shared).
-
-request_irq() will associate an interrupt handler and device handle
-with an interrupt number. Historically interrupt numbers represent
-IRQ lines which run from the PCI device to the Interrupt controller.
-With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
-
-request_irq() also enables the interrupt. Make sure the device is
-quiesced and does not have any interrupts pending before registering
-the interrupt handler.
-
-MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
-which deliver interrupts to the CPU via a DMA write to a Local APIC.
-The fundamental difference between MSI and MSI-X is how multiple
-"vectors" get allocated. MSI requires contiguous blocks of vectors
-while MSI-X can allocate several individual ones.
-
-MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
-PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
-causes the PCI support to program CPU vector data into the PCI device
-capability registers. Many architectures, chip-sets, or BIOSes do NOT
-support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
-the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
-specify PCI_IRQ_LEGACY as well.
-
-Drivers that have different interrupt handlers for MSI/MSI-X and
-legacy INTx should chose the right one based on the msi_enabled
-and msix_enabled flags in the pci_dev structure after calling
-pci_alloc_irq_vectors.
-
-There are (at least) two really good reasons for using MSI:
-1) MSI is an exclusive interrupt vector by definition.
- This means the interrupt handler doesn't have to verify
- its device caused the interrupt.
-
-2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
- to be visible to the host CPU(s) when the MSI is delivered. This
- is important for both data coherency and avoiding stale control data.
- This guarantee allows the driver to omit MMIO reads to flush
- the DMA stream.
-
-See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
-of MSI/MSI-X usage.
-
-
-
-4. PCI device shutdown
-~~~~~~~~~~~~~~~~~~~~~~~
-
-When a PCI device driver is being unloaded, most of the following
-steps need to be performed:
-
- Disable the device from generating IRQs
- Release the IRQ (free_irq())
- Stop all DMA activity
- Release DMA buffers (both streaming and consistent)
- Unregister from other subsystems (e.g. scsi or netdev)
- Disable device from responding to MMIO/IO Port addresses
- Release MMIO/IO Port resource(s)
-
-
-4.1 Stop IRQs on the device
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-How to do this is chip/device specific. If it's not done, it opens
-the possibility of a "screaming interrupt" if (and only if)
-the IRQ is shared with another device.
-
-When the shared IRQ handler is "unhooked", the remaining devices
-using the same IRQ line will still need the IRQ enabled. Thus if the
-"unhooked" device asserts IRQ line, the system will respond assuming
-it was one of the remaining devices asserted the IRQ line. Since none
-of the other devices will handle the IRQ, the system will "hang" until
-it decides the IRQ isn't going to get handled and masks the IRQ (100,000
-iterations later). Once the shared IRQ is masked, the remaining devices
-will stop functioning properly. Not a nice situation.
-
-This is another reason to use MSI or MSI-X if it's available.
-MSI and MSI-X are defined to be exclusive interrupts and thus
-are not susceptible to the "screaming interrupt" problem.
-
-
-4.2 Release the IRQ
-~~~~~~~~~~~~~~~~~~~
-Once the device is quiesced (no more IRQs), one can call free_irq().
-This function will return control once any pending IRQs are handled,
-"unhook" the drivers IRQ handler from that IRQ, and finally release
-the IRQ if no one else is using it.
-
-
-4.3 Stop all DMA activity
-~~~~~~~~~~~~~~~~~~~~~~~~~
-It's extremely important to stop all DMA operations BEFORE attempting
-to deallocate DMA control data. Failure to do so can result in memory
-corruption, hangs, and on some chip-sets a hard crash.
-
-Stopping DMA after stopping the IRQs can avoid races where the
-IRQ handler might restart DMA engines.
-
-While this step sounds obvious and trivial, several "mature" drivers
-didn't get this step right in the past.
-
-
-4.4 Release DMA buffers
-~~~~~~~~~~~~~~~~~~~~~~~
-Once DMA is stopped, clean up streaming DMA first.
-I.e. unmap data buffers and return buffers to "upstream"
-owners if there is one.
-
-Then clean up "consistent" buffers which contain the control data.
-
-See Documentation/DMA-API.txt for details on unmapping interfaces.
-
-
-4.5 Unregister from other subsystems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Most low level PCI device drivers support some other subsystem
-like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
-driver isn't losing resources from that other subsystem.
-If this happens, typically the symptom is an Oops (panic) when
-the subsystem attempts to call into a driver that has been unloaded.
-
-
-4.6 Disable Device from responding to MMIO/IO Port addresses
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-io_unmap() MMIO or IO Port resources and then call pci_disable_device().
-This is the symmetric opposite of pci_enable_device().
-Do not access device registers after calling pci_disable_device().
-
-
-4.7 Release MMIO/IO Port Resource(s)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Call pci_release_region() to mark the MMIO or IO Port range as available.
-Failure to do so usually results in the inability to reload the driver.
-
-
-
-5. How to access PCI config space
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can use pci_(read|write)_config_(byte|word|dword) to access the config
-space of a device represented by struct pci_dev *. All these functions return 0
-when successful or an error code (PCIBIOS_...) which can be translated to a text
-string by pcibios_strerror. Most drivers expect that accesses to valid PCI
-devices don't fail.
-
-If you don't have a struct pci_dev available, you can call
-pci_bus_(read|write)_config_(byte|word|dword) to access a given device
-and function on that bus.
-
-If you access fields in the standard portion of the config header, please
-use symbolic names of locations and bits declared in <linux/pci.h>.
-
-If you need to access Extended PCI Capability registers, just call
-pci_find_capability() for the particular capability and it will find the
-corresponding register block for you.
-
-
-
-6. Other interesting functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-pci_get_domain_bus_and_slot() Find pci_dev corresponding to given domain,
- bus and slot and number. If the device is
- found, its reference count is increased.
-pci_set_power_state() Set PCI Power Management state (0=D0 ... 3=D3)
-pci_find_capability() Find specified capability in device's capability
- list.
-pci_resource_start() Returns bus start address for a given PCI region
-pci_resource_end() Returns bus end address for a given PCI region
-pci_resource_len() Returns the byte length of a PCI region
-pci_set_drvdata() Set private driver data pointer for a pci_dev
-pci_get_drvdata() Return private driver data pointer for a pci_dev
-pci_set_mwi() Enable Memory-Write-Invalidate transactions.
-pci_clear_mwi() Disable Memory-Write-Invalidate transactions.
-
-
-
-7. Miscellaneous hints
-~~~~~~~~~~~~~~~~~~~~~~
-
-When displaying PCI device names to the user (for example when a driver wants
-to tell the user what card has it found), please use pci_name(pci_dev).
-
-Always refer to the PCI devices by a pointer to the pci_dev structure.
-All PCI layer functions use this identification and it's the only
-reasonable one. Don't use bus/slot/function numbers except for very
-special purposes -- on systems with multiple primary buses their semantics
-can be pretty complex.
-
-Don't try to turn on Fast Back to Back writes in your driver. All devices
-on the bus need to be capable of doing it, so this is something which needs
-to be handled by platform and generic code, not individual drivers.
-
-
-
-8. Vendor and device identifications
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
-are shared across multiple drivers. You can add private definitions in
-your driver if they're helpful, or just use plain hex constants.
-
-The device IDs are arbitrary hex numbers (vendor controlled) and normally used
-only in a single location, the pci_device_id table.
-
-Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
-There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
-and https://github.com/pciutils/pciids.
-
-
-
-9. Obsolete functions
-~~~~~~~~~~~~~~~~~~~~~
-
-There are several functions which you might come across when trying to
-port an old driver to the new PCI interface. They are no longer present
-in the kernel as they aren't compatible with hotplug or PCI domains or
-having sane locking.
-
-pci_find_device() Superseded by pci_get_device()
-pci_find_subsys() Superseded by pci_get_subsys()
-pci_find_slot() Superseded by pci_get_domain_bus_and_slot()
-pci_get_slot() Superseded by pci_get_domain_bus_and_slot()
-
-
-The alternative is the traditional PCI device driver that walks PCI
-device lists. This is still possible but discouraged.
-
-
-
-10. MMIO Space and "Write Posting"
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Converting a driver from using I/O Port space to using MMIO space
-often requires some additional changes. Specifically, "write posting"
-needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
-already do this. I/O Port space guarantees write transactions reach the PCI
-device before the CPU can continue. Writes to MMIO space allow the CPU
-to continue before the transaction reaches the PCI device. HW weenies
-call this "Write Posting" because the write completion is "posted" to
-the CPU before the transaction has reached its destination.
-
-Thus, timing sensitive code should add readl() where the CPU is
-expected to wait before doing other work. The classic "bit banging"
-sequence works fine for I/O Port space:
-
- for (i = 8; --i; val >>= 1) {
- outb(val & 1, ioport_reg); /* write bit */
- udelay(10);
- }
-
-The same sequence for MMIO space should be:
-
- for (i = 8; --i; val >>= 1) {
- writeb(val & 1, mmio_reg); /* write bit */
- readb(safe_mmio_reg); /* flush posted write */
- udelay(10);
- }
-
-It is important that "safe_mmio_reg" not have any side effects that
-interferes with the correct operation of the device.
-
-Another case to watch out for is when resetting a PCI device. Use PCI
-Configuration space reads to flush the writel(). This will gracefully
-handle the PCI master abort on all platforms if the PCI device is
-expected to not respond to a readl(). Most x86 platforms will allow
-MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
-(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail").
-
diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst
new file mode 100644
index 000000000000..18bdefaafd1a
--- /dev/null
+++ b/Documentation/PCI/pcieaer-howto.rst
@@ -0,0 +1,311 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===========================================================
+The PCI Express Advanced Error Reporting Driver Guide HOWTO
+===========================================================
+
+:Authors: - T. Long Nguyen <tom.l.nguyen@intel.com>
+ - Yanmin Zhang <yanmin.zhang@intel.com>
+
+:Copyright: |copy| 2006 Intel Corporation
+
+Overview
+===========
+
+About this guide
+----------------
+
+This guide describes the basics of the PCI Express Advanced Error
+Reporting (AER) driver and provides information on how to use it, as
+well as how to enable the drivers of endpoint devices to conform with
+PCI Express AER driver.
+
+
+What is the PCI Express AER Driver?
+-----------------------------------
+
+PCI Express error signaling can occur on the PCI Express link itself
+or on behalf of transactions initiated on the link. PCI Express
+defines two error reporting paradigms: the baseline capability and
+the Advanced Error Reporting capability. The baseline capability is
+required of all PCI Express components providing a minimum defined
+set of error reporting requirements. Advanced Error Reporting
+capability is implemented with a PCI Express advanced error reporting
+extended capability structure providing more robust error reporting.
+
+The PCI Express AER driver provides the infrastructure to support PCI
+Express Advanced Error Reporting capability. The PCI Express AER
+driver provides three basic functions:
+
+ - Gathers the comprehensive error information if errors occurred.
+ - Reports error to the users.
+ - Performs error recovery actions.
+
+AER driver only attaches root ports which support PCI-Express AER
+capability.
+
+
+User Guide
+==========
+
+Include the PCI Express AER Root Driver into the Linux Kernel
+-------------------------------------------------------------
+
+The PCI Express AER Root driver is a Root Port service driver attached
+to the PCI Express Port Bus driver. If a user wants to use it, the driver
+has to be compiled. Option CONFIG_PCIEAER supports this capability. It
+depends on CONFIG_PCIEPORTBUS, so pls. set CONFIG_PCIEPORTBUS=y and
+CONFIG_PCIEAER = y.
+
+Load PCI Express AER Root Driver
+--------------------------------
+
+Some systems have AER support in firmware. Enabling Linux AER support at
+the same time the firmware handles AER may result in unpredictable
+behavior. Therefore, Linux does not handle AER events unless the firmware
+grants AER control to the OS via the ACPI _OSC method. See the PCI FW 3.0
+Specification for details regarding _OSC usage.
+
+AER error output
+----------------
+
+When a PCIe AER error is captured, an error message will be output to
+console. If it's a correctable error, it is output as a warning.
+Otherwise, it is printed as an error. So users could choose different
+log level to filter out correctable error messages.
+
+Below shows an example::
+
+ 0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID)
+ 0000:50:00.0: device [8086:0329] error status/mask=00100000/00000000
+ 0000:50:00.0: [20] Unsupported Request (First)
+ 0000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100
+
+In the example, 'Requester ID' means the ID of the device who sends
+the error message to root port. Pls. refer to pci express specs for
+other fields.
+
+AER Statistics / Counters
+-------------------------
+
+When PCIe AER errors are captured, the counters / statistics are also exposed
+in the form of sysfs attributes which are documented at
+Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
+
+Developer Guide
+===============
+
+To enable AER aware support requires a software driver to configure
+the AER capability structure within its device and to provide callbacks.
+
+To support AER better, developers need understand how AER does work
+firstly.
+
+PCI Express errors are classified into two types: correctable errors
+and uncorrectable errors. This classification is based on the impacts
+of those errors, which may result in degraded performance or function
+failure.
+
+Correctable errors pose no impacts on the functionality of the
+interface. The PCI Express protocol can recover without any software
+intervention or any loss of data. These errors are detected and
+corrected by hardware. Unlike correctable errors, uncorrectable
+errors impact functionality of the interface. Uncorrectable errors
+can cause a particular transaction or a particular PCI Express link
+to be unreliable. Depending on those error conditions, uncorrectable
+errors are further classified into non-fatal errors and fatal errors.
+Non-fatal errors cause the particular transaction to be unreliable,
+but the PCI Express link itself is fully functional. Fatal errors, on
+the other hand, cause the link to be unreliable.
+
+When AER is enabled, a PCI Express device will automatically send an
+error message to the PCIe root port above it when the device captures
+an error. The Root Port, upon receiving an error reporting message,
+internally processes and logs the error message in its PCI Express
+capability structure. Error information being logged includes storing
+the error reporting agent's requestor ID into the Error Source
+Identification Registers and setting the error bits of the Root Error
+Status Register accordingly. If AER error reporting is enabled in Root
+Error Command Register, the Root Port generates an interrupt if an
+error is detected.
+
+Note that the errors as described above are related to the PCI Express
+hierarchy and links. These errors do not include any device specific
+errors because device specific errors will still get sent directly to
+the device driver.
+
+Configure the AER capability structure
+--------------------------------------
+
+AER aware drivers of PCI Express component need change the device
+control registers to enable AER. They also could change AER registers,
+including mask and severity registers. Helper function
+pci_enable_pcie_error_reporting could be used to enable AER. See
+section 3.3.
+
+Provide callbacks
+-----------------
+
+callback reset_link to reset pci express link
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This callback is used to reset the pci express physical link when a
+fatal error happens. The root port aer service driver provides a
+default reset_link function, but different upstream ports might
+have different specifications to reset pci express link, so all
+upstream ports should provide their own reset_link functions.
+
+In struct pcie_port_service_driver, a new pointer, reset_link, is
+added.
+::
+
+ pci_ers_result_t (*reset_link) (struct pci_dev *dev);
+
+Section 3.2.2.2 provides more detailed info on when to call
+reset_link.
+
+PCI error-recovery callbacks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The PCI Express AER Root driver uses error callbacks to coordinate
+with downstream device drivers associated with a hierarchy in question
+when performing error recovery actions.
+
+Data struct pci_driver has a pointer, err_handler, to point to
+pci_error_handlers who consists of a couple of callback function
+pointers. AER driver follows the rules defined in
+pci-error-recovery.txt except pci express specific parts (e.g.
+reset_link). Pls. refer to pci-error-recovery.txt for detailed
+definitions of the callbacks.
+
+Below sections specify when to call the error callback functions.
+
+Correctable errors
+~~~~~~~~~~~~~~~~~~
+
+Correctable errors pose no impacts on the functionality of
+the interface. The PCI Express protocol can recover without any
+software intervention or any loss of data. These errors do not
+require any recovery actions. The AER driver clears the device's
+correctable error status register accordingly and logs these errors.
+
+Non-correctable (non-fatal and fatal) errors
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If an error message indicates a non-fatal error, performing link reset
+at upstream is not required. The AER driver calls error_detected(dev,
+pci_channel_io_normal) to all drivers associated within a hierarchy in
+question. for example::
+
+ EndPoint<==>DownstreamPort B<==>UpstreamPort A<==>RootPort
+
+If Upstream port A captures an AER error, the hierarchy consists of
+Downstream port B and EndPoint.
+
+A driver may return PCI_ERS_RESULT_CAN_RECOVER,
+PCI_ERS_RESULT_DISCONNECT, or PCI_ERS_RESULT_NEED_RESET, depending on
+whether it can recover or the AER driver calls mmio_enabled as next.
+
+If an error message indicates a fatal error, kernel will broadcast
+error_detected(dev, pci_channel_io_frozen) to all drivers within
+a hierarchy in question. Then, performing link reset at upstream is
+necessary. As different kinds of devices might use different approaches
+to reset link, AER port service driver is required to provide the
+function to reset link. Firstly, kernel looks for if the upstream
+component has an aer driver. If it has, kernel uses the reset_link
+callback of the aer driver. If the upstream component has no aer driver
+and the port is downstream port, we will perform a hot reset as the
+default by setting the Secondary Bus Reset bit of the Bridge Control
+register associated with the downstream port. As for upstream ports,
+they should provide their own aer service drivers with reset_link
+function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
+reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
+to mmio_enabled.
+
+helper functions
+----------------
+::
+
+ int pci_enable_pcie_error_reporting(struct pci_dev *dev);
+
+pci_enable_pcie_error_reporting enables the device to send error
+messages to root port when an error is detected. Note that devices
+don't enable the error reporting by default, so device drivers need
+call this function to enable it.
+
+::
+
+ int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+
+pci_disable_pcie_error_reporting disables the device to send error
+messages to root port when an error is detected.
+
+::
+
+ int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);`
+
+pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
+error status register.
+
+Frequent Asked Questions
+------------------------
+
+Q:
+ What happens if a PCI Express device driver does not provide an
+ error recovery handler (pci_driver->err_handler is equal to NULL)?
+
+A:
+ The devices attached with the driver won't be recovered. If the
+ error is fatal, kernel will print out warning messages. Please refer
+ to section 3 for more information.
+
+Q:
+ What happens if an upstream port service driver does not provide
+ callback reset_link?
+
+A:
+ Fatal error recovery will fail if the errors are reported by the
+ upstream ports who are attached by the service driver.
+
+Q:
+ How does this infrastructure deal with driver that is not PCI
+ Express aware?
+
+A:
+ This infrastructure calls the error callback functions of the
+ driver when an error happens. But if the driver is not aware of
+ PCI Express, the device might not report its own errors to root
+ port.
+
+Q:
+ What modifications will that driver need to make it compatible
+ with the PCI Express AER Root driver?
+
+A:
+ It could call the helper functions to enable AER in devices and
+ cleanup uncorrectable status register. Pls. refer to section 3.3.
+
+
+Software error injection
+========================
+
+Debugging PCIe AER error recovery code is quite difficult because it
+is hard to trigger real hardware errors. Software based error
+injection can be used to fake various kinds of PCIe errors.
+
+First you should enable PCIe AER software error injection in kernel
+configuration, that is, following item should be in your .config.
+
+CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
+
+After reboot with new kernel or insert the module, a device file named
+/dev/aer_inject should be created.
+
+Then, you need a user space tool named aer-inject, which can be gotten
+from:
+
+ https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
+
+More information about aer-inject can be found in the document comes
+with its source code.
diff --git a/Documentation/PCI/pcieaer-howto.txt b/Documentation/PCI/pcieaer-howto.txt
deleted file mode 100644
index 48ce7903e3c6..000000000000
--- a/Documentation/PCI/pcieaer-howto.txt
+++ /dev/null
@@ -1,267 +0,0 @@
- The PCI Express Advanced Error Reporting Driver Guide HOWTO
- T. Long Nguyen <tom.l.nguyen@intel.com>
- Yanmin Zhang <yanmin.zhang@intel.com>
- 07/29/2006
-
-
-1. Overview
-
-1.1 About this guide
-
-This guide describes the basics of the PCI Express Advanced Error
-Reporting (AER) driver and provides information on how to use it, as
-well as how to enable the drivers of endpoint devices to conform with
-PCI Express AER driver.
-
-1.2 Copyright (C) Intel Corporation 2006.
-
-1.3 What is the PCI Express AER Driver?
-
-PCI Express error signaling can occur on the PCI Express link itself
-or on behalf of transactions initiated on the link. PCI Express
-defines two error reporting paradigms: the baseline capability and
-the Advanced Error Reporting capability. The baseline capability is
-required of all PCI Express components providing a minimum defined
-set of error reporting requirements. Advanced Error Reporting
-capability is implemented with a PCI Express advanced error reporting
-extended capability structure providing more robust error reporting.
-
-The PCI Express AER driver provides the infrastructure to support PCI
-Express Advanced Error Reporting capability. The PCI Express AER
-driver provides three basic functions:
-
-- Gathers the comprehensive error information if errors occurred.
-- Reports error to the users.
-- Performs error recovery actions.
-
-AER driver only attaches root ports which support PCI-Express AER
-capability.
-
-
-2. User Guide
-
-2.1 Include the PCI Express AER Root Driver into the Linux Kernel
-
-The PCI Express AER Root driver is a Root Port service driver attached
-to the PCI Express Port Bus driver. If a user wants to use it, the driver
-has to be compiled. Option CONFIG_PCIEAER supports this capability. It
-depends on CONFIG_PCIEPORTBUS, so pls. set CONFIG_PCIEPORTBUS=y and
-CONFIG_PCIEAER = y.
-
-2.2 Load PCI Express AER Root Driver
-
-Some systems have AER support in firmware. Enabling Linux AER support at
-the same time the firmware handles AER may result in unpredictable
-behavior. Therefore, Linux does not handle AER events unless the firmware
-grants AER control to the OS via the ACPI _OSC method. See the PCI FW 3.0
-Specification for details regarding _OSC usage.
-
-2.3 AER error output
-
-When a PCIe AER error is captured, an error message will be output to
-console. If it's a correctable error, it is output as a warning.
-Otherwise, it is printed as an error. So users could choose different
-log level to filter out correctable error messages.
-
-Below shows an example:
-0000:50:00.0: PCIe Bus Error: severity=Uncorrected (Fatal), type=Transaction Layer, id=0500(Requester ID)
-0000:50:00.0: device [8086:0329] error status/mask=00100000/00000000
-0000:50:00.0: [20] Unsupported Request (First)
-0000:50:00.0: TLP Header: 04000001 00200a03 05010000 00050100
-
-In the example, 'Requester ID' means the ID of the device who sends
-the error message to root port. Pls. refer to pci express specs for
-other fields.
-
-2.4 AER Statistics / Counters
-
-When PCIe AER errors are captured, the counters / statistics are also exposed
-in the form of sysfs attributes which are documented at
-Documentation/ABI/testing/sysfs-bus-pci-devices-aer_stats
-
-3. Developer Guide
-
-To enable AER aware support requires a software driver to configure
-the AER capability structure within its device and to provide callbacks.
-
-To support AER better, developers need understand how AER does work
-firstly.
-
-PCI Express errors are classified into two types: correctable errors
-and uncorrectable errors. This classification is based on the impacts
-of those errors, which may result in degraded performance or function
-failure.
-
-Correctable errors pose no impacts on the functionality of the
-interface. The PCI Express protocol can recover without any software
-intervention or any loss of data. These errors are detected and
-corrected by hardware. Unlike correctable errors, uncorrectable
-errors impact functionality of the interface. Uncorrectable errors
-can cause a particular transaction or a particular PCI Express link
-to be unreliable. Depending on those error conditions, uncorrectable
-errors are further classified into non-fatal errors and fatal errors.
-Non-fatal errors cause the particular transaction to be unreliable,
-but the PCI Express link itself is fully functional. Fatal errors, on
-the other hand, cause the link to be unreliable.
-
-When AER is enabled, a PCI Express device will automatically send an
-error message to the PCIe root port above it when the device captures
-an error. The Root Port, upon receiving an error reporting message,
-internally processes and logs the error message in its PCI Express
-capability structure. Error information being logged includes storing
-the error reporting agent's requestor ID into the Error Source
-Identification Registers and setting the error bits of the Root Error
-Status Register accordingly. If AER error reporting is enabled in Root
-Error Command Register, the Root Port generates an interrupt if an
-error is detected.
-
-Note that the errors as described above are related to the PCI Express
-hierarchy and links. These errors do not include any device specific
-errors because device specific errors will still get sent directly to
-the device driver.
-
-3.1 Configure the AER capability structure
-
-AER aware drivers of PCI Express component need change the device
-control registers to enable AER. They also could change AER registers,
-including mask and severity registers. Helper function
-pci_enable_pcie_error_reporting could be used to enable AER. See
-section 3.3.
-
-3.2. Provide callbacks
-
-3.2.1 callback reset_link to reset pci express link
-
-This callback is used to reset the pci express physical link when a
-fatal error happens. The root port aer service driver provides a
-default reset_link function, but different upstream ports might
-have different specifications to reset pci express link, so all
-upstream ports should provide their own reset_link functions.
-
-In struct pcie_port_service_driver, a new pointer, reset_link, is
-added.
-
-pci_ers_result_t (*reset_link) (struct pci_dev *dev);
-
-Section 3.2.2.2 provides more detailed info on when to call
-reset_link.
-
-3.2.2 PCI error-recovery callbacks
-
-The PCI Express AER Root driver uses error callbacks to coordinate
-with downstream device drivers associated with a hierarchy in question
-when performing error recovery actions.
-
-Data struct pci_driver has a pointer, err_handler, to point to
-pci_error_handlers who consists of a couple of callback function
-pointers. AER driver follows the rules defined in
-pci-error-recovery.txt except pci express specific parts (e.g.
-reset_link). Pls. refer to pci-error-recovery.txt for detailed
-definitions of the callbacks.
-
-Below sections specify when to call the error callback functions.
-
-3.2.2.1 Correctable errors
-
-Correctable errors pose no impacts on the functionality of
-the interface. The PCI Express protocol can recover without any
-software intervention or any loss of data. These errors do not
-require any recovery actions. The AER driver clears the device's
-correctable error status register accordingly and logs these errors.
-
-3.2.2.2 Non-correctable (non-fatal and fatal) errors
-
-If an error message indicates a non-fatal error, performing link reset
-at upstream is not required. The AER driver calls error_detected(dev,
-pci_channel_io_normal) to all drivers associated within a hierarchy in
-question. for example,
-EndPoint<==>DownstreamPort B<==>UpstreamPort A<==>RootPort.
-If Upstream port A captures an AER error, the hierarchy consists of
-Downstream port B and EndPoint.
-
-A driver may return PCI_ERS_RESULT_CAN_RECOVER,
-PCI_ERS_RESULT_DISCONNECT, or PCI_ERS_RESULT_NEED_RESET, depending on
-whether it can recover or the AER driver calls mmio_enabled as next.
-
-If an error message indicates a fatal error, kernel will broadcast
-error_detected(dev, pci_channel_io_frozen) to all drivers within
-a hierarchy in question. Then, performing link reset at upstream is
-necessary. As different kinds of devices might use different approaches
-to reset link, AER port service driver is required to provide the
-function to reset link. Firstly, kernel looks for if the upstream
-component has an aer driver. If it has, kernel uses the reset_link
-callback of the aer driver. If the upstream component has no aer driver
-and the port is downstream port, we will perform a hot reset as the
-default by setting the Secondary Bus Reset bit of the Bridge Control
-register associated with the downstream port. As for upstream ports,
-they should provide their own aer service drivers with reset_link
-function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and
-reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes
-to mmio_enabled.
-
-3.3 helper functions
-
-3.3.1 int pci_enable_pcie_error_reporting(struct pci_dev *dev);
-pci_enable_pcie_error_reporting enables the device to send error
-messages to root port when an error is detected. Note that devices
-don't enable the error reporting by default, so device drivers need
-call this function to enable it.
-
-3.3.2 int pci_disable_pcie_error_reporting(struct pci_dev *dev);
-pci_disable_pcie_error_reporting disables the device to send error
-messages to root port when an error is detected.
-
-3.3.3 int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
-pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable
-error status register.
-
-3.4 Frequent Asked Questions
-
-Q: What happens if a PCI Express device driver does not provide an
-error recovery handler (pci_driver->err_handler is equal to NULL)?
-
-A: The devices attached with the driver won't be recovered. If the
-error is fatal, kernel will print out warning messages. Please refer
-to section 3 for more information.
-
-Q: What happens if an upstream port service driver does not provide
-callback reset_link?
-
-A: Fatal error recovery will fail if the errors are reported by the
-upstream ports who are attached by the service driver.
-
-Q: How does this infrastructure deal with driver that is not PCI
-Express aware?
-
-A: This infrastructure calls the error callback functions of the
-driver when an error happens. But if the driver is not aware of
-PCI Express, the device might not report its own errors to root
-port.
-
-Q: What modifications will that driver need to make it compatible
-with the PCI Express AER Root driver?
-
-A: It could call the helper functions to enable AER in devices and
-cleanup uncorrectable status register. Pls. refer to section 3.3.
-
-
-4. Software error injection
-
-Debugging PCIe AER error recovery code is quite difficult because it
-is hard to trigger real hardware errors. Software based error
-injection can be used to fake various kinds of PCIe errors.
-
-First you should enable PCIe AER software error injection in kernel
-configuration, that is, following item should be in your .config.
-
-CONFIG_PCIEAER_INJECT=y or CONFIG_PCIEAER_INJECT=m
-
-After reboot with new kernel or insert the module, a device file named
-/dev/aer_inject should be created.
-
-Then, you need a user space tool named aer-inject, which can be gotten
-from:
- https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
-
-More information about aer-inject can be found in the document comes
-with its source code.
diff --git a/Documentation/PCI/picebus-howto.rst b/Documentation/PCI/picebus-howto.rst
new file mode 100644
index 000000000000..f882ff62c51f
--- /dev/null
+++ b/Documentation/PCI/picebus-howto.rst
@@ -0,0 +1,220 @@
+.. SPDX-License-Identifier: GPL-2.0
+.. include:: <isonum.txt>
+
+===========================================
+The PCI Express Port Bus Driver Guide HOWTO
+===========================================
+
+:Author: Tom L Nguyen tom.l.nguyen@intel.com 11/03/2004
+:Copyright: |copy| 2004 Intel Corporation
+
+About this guide
+================
+
+This guide describes the basics of the PCI Express Port Bus driver
+and provides information on how to enable the service drivers to
+register/unregister with the PCI Express Port Bus Driver.
+
+
+What is the PCI Express Port Bus Driver
+=======================================
+
+A PCI Express Port is a logical PCI-PCI Bridge structure. There
+are two types of PCI Express Port: the Root Port and the Switch
+Port. The Root Port originates a PCI Express link from a PCI Express
+Root Complex and the Switch Port connects PCI Express links to
+internal logical PCI buses. The Switch Port, which has its secondary
+bus representing the switch's internal routing logic, is called the
+switch's Upstream Port. The switch's Downstream Port is bridging from
+switch's internal routing bus to a bus representing the downstream
+PCI Express link from the PCI Express Switch.
+
+A PCI Express Port can provide up to four distinct functions,
+referred to in this document as services, depending on its port type.
+PCI Express Port's services include native hotplug support (HP),
+power management event support (PME), advanced error reporting
+support (AER), and virtual channel support (VC). These services may
+be handled by a single complex driver or be individually distributed
+and handled by corresponding service drivers.
+
+Why use the PCI Express Port Bus Driver?
+========================================
+
+In existing Linux kernels, the Linux Device Driver Model allows a
+physical device to be handled by only a single driver. The PCI
+Express Port is a PCI-PCI Bridge device with multiple distinct
+services. To maintain a clean and simple solution each service
+may have its own software service driver. In this case several
+service drivers will compete for a single PCI-PCI Bridge device.
+For example, if the PCI Express Root Port native hotplug service
+driver is loaded first, it claims a PCI-PCI Bridge Root Port. The
+kernel therefore does not load other service drivers for that Root
+Port. In other words, it is impossible to have multiple service
+drivers load and run on a PCI-PCI Bridge device simultaneously
+using the current driver model.
+
+To enable multiple service drivers running simultaneously requires
+having a PCI Express Port Bus driver, which manages all populated
+PCI Express Ports and distributes all provided service requests
+to the corresponding service drivers as required. Some key
+advantages of using the PCI Express Port Bus driver are listed below:
+
+ - Allow multiple service drivers to run simultaneously on
+ a PCI-PCI Bridge Port device.
+
+ - Allow service drivers implemented in an independent
+ staged approach.
+
+ - Allow one service driver to run on multiple PCI-PCI Bridge
+ Port devices.
+
+ - Manage and distribute resources of a PCI-PCI Bridge Port
+ device to requested service drivers.
+
+Configuring the PCI Express Port Bus Driver vs. Service Drivers
+===============================================================
+
+Including the PCI Express Port Bus Driver Support into the Kernel
+-----------------------------------------------------------------
+
+Including the PCI Express Port Bus driver depends on whether the PCI
+Express support is included in the kernel config. The kernel will
+automatically include the PCI Express Port Bus driver as a kernel
+driver when the PCI Express support is enabled in the kernel.
+
+Enabling Service Driver Support
+-------------------------------
+
+PCI device drivers are implemented based on Linux Device Driver Model.
+All service drivers are PCI device drivers. As discussed above, it is
+impossible to load any service driver once the kernel has loaded the
+PCI Express Port Bus Driver. To meet the PCI Express Port Bus Driver
+Model requires some minimal changes on existing service drivers that
+imposes no impact on the functionality of existing service drivers.
+
+A service driver is required to use the two APIs shown below to
+register its service with the PCI Express Port Bus driver (see
+section 5.2.1 & 5.2.2). It is important that a service driver
+initializes the pcie_port_service_driver data structure, included in
+header file /include/linux/pcieport_if.h, before calling these APIs.
+Failure to do so will result an identity mismatch, which prevents
+the PCI Express Port Bus driver from loading a service driver.
+
+pcie_port_service_register
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+
+ int pcie_port_service_register(struct pcie_port_service_driver *new)
+
+This API replaces the Linux Driver Model's pci_register_driver API. A
+service driver should always calls pcie_port_service_register at
+module init. Note that after service driver being loaded, calls
+such as pci_enable_device(dev) and pci_set_master(dev) are no longer
+necessary since these calls are executed by the PCI Port Bus driver.
+
+pcie_port_service_unregister
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+
+ void pcie_port_service_unregister(struct pcie_port_service_driver *new)
+
+pcie_port_service_unregister replaces the Linux Driver Model's
+pci_unregister_driver. It's always called by service driver when a
+module exits.
+
+Sample Code
+~~~~~~~~~~~
+
+Below is sample service driver code to initialize the port service
+driver data structure.
+::
+
+ static struct pcie_port_service_id service_id[] = { {
+ .vendor = PCI_ANY_ID,
+ .device = PCI_ANY_ID,
+ .port_type = PCIE_RC_PORT,
+ .service_type = PCIE_PORT_SERVICE_AER,
+ }, { /* end: all zeroes */ }
+ };
+
+ static struct pcie_port_service_driver root_aerdrv = {
+ .name = (char *)device_name,
+ .id_table = &service_id[0],
+
+ .probe = aerdrv_load,
+ .remove = aerdrv_unload,
+
+ .suspend = aerdrv_suspend,
+ .resume = aerdrv_resume,
+ };
+
+Below is a sample code for registering/unregistering a service
+driver.
+::
+
+ static int __init aerdrv_service_init(void)
+ {
+ int retval = 0;
+
+ retval = pcie_port_service_register(&root_aerdrv);
+ if (!retval) {
+ /*
+ * FIX ME
+ */
+ }
+ return retval;
+ }
+
+ static void __exit aerdrv_service_exit(void)
+ {
+ pcie_port_service_unregister(&root_aerdrv);
+ }
+
+ module_init(aerdrv_service_init);
+ module_exit(aerdrv_service_exit);
+
+Possible Resource Conflicts
+===========================
+
+Since all service drivers of a PCI-PCI Bridge Port device are
+allowed to run simultaneously, below lists a few of possible resource
+conflicts with proposed solutions.
+
+MSI and MSI-X Vector Resource
+-----------------------------
+
+Once MSI or MSI-X interrupts are enabled on a device, it stays in this
+mode until they are disabled again. Since service drivers of the same
+PCI-PCI Bridge port share the same physical device, if an individual
+service driver enables or disables MSI/MSI-X mode it may result
+unpredictable behavior.
+
+To avoid this situation all service drivers are not permitted to
+switch interrupt mode on its device. The PCI Express Port Bus driver
+is responsible for determining the interrupt mode and this should be
+transparent to service drivers. Service drivers need to know only
+the vector IRQ assigned to the field irq of struct pcie_device, which
+is passed in when the PCI Express Port Bus driver probes each service
+driver. Service drivers should use (struct pcie_device*)dev->irq to
+call request_irq/free_irq. In addition, the interrupt mode is stored
+in the field interrupt_mode of struct pcie_device.
+
+PCI Memory/IO Mapped Regions
+----------------------------
+
+Service drivers for PCI Express Power Management (PME), Advanced
+Error Reporting (AER), Hot-Plug (HP) and Virtual Channel (VC) access
+PCI configuration space on the PCI Express port. In all cases the
+registers accessed are independent of each other. This patch assumes
+that all service drivers will be well behaved and not overwrite
+other service driver's configuration settings.
+
+PCI Config Registers
+--------------------
+
+Each service driver runs its PCI config operations on its own
+capability structure except the PCI Express capability structure, in
+which Root Control register and Device Control register are shared
+between PME and AER. This patch assumes that all service drivers
+will be well behaved and not overwrite other service driver's
+configuration settings.
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 613033ff2b9b..5e6429d66c24 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -12,6 +12,7 @@ please read on.
Reference counting on elements of lists which are protected by traditional
reader/writer spinlocks or semaphores are straightforward:
+CODE LISTING A:
1. 2.
add() search_and_reference()
{ {
@@ -28,7 +29,8 @@ add() search_and_reference()
release_referenced() delete()
{ {
... write_lock(&list_lock);
- atomic_dec(&el->rc, relfunc) ...
+ if(atomic_dec_and_test(&el->rc)) ...
+ kfree(el);
... remove_element
} write_unlock(&list_lock);
...
@@ -44,6 +46,7 @@ search_and_reference() could potentially hold reference to an element which
has already been deleted from the list/array. Use atomic_inc_not_zero()
in this scenario as follows:
+CODE LISTING B:
1. 2.
add() search_and_reference()
{ {
@@ -79,6 +82,7 @@ search_and_reference() code path. In such cases, the
atomic_dec_and_test() may be moved from delete() to el_free()
as follows:
+CODE LISTING C:
1. 2.
add() search_and_reference()
{ {
@@ -114,6 +118,17 @@ element can therefore safely be freed. This in turn guarantees that if
any reader finds the element, that reader may safely acquire a reference
without checking the value of the reference counter.
+A clear advantage of the RCU-based pattern in listing C over the one
+in listing B is that any call to search_and_reference() that locates
+a given object will succeed in obtaining a reference to that object,
+even given a concurrent invocation of delete() for that same object.
+Similarly, a clear advantage of both listings B and C over listing A is
+that a call to delete() is not delayed even if there are an arbitrarily
+large number of calls to search_and_reference() searching for the same
+object that delete() was invoked on. Instead, all that is delayed is
+the eventual invocation of kfree(), which is usually not a problem on
+modern computer systems, even the small ones.
+
In cases where delete() can sleep, synchronize_rcu() can be called from
delete(), so that el_free() can be subsumed into delete as follows:
@@ -130,3 +145,7 @@ delete()
kfree(el);
...
}
+
+As additional examples in the kernel, the pattern in listing C is used by
+reference counting of struct pid, while the pattern in listing B is used by
+struct posix_acl.
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1ab70c37921f..13e88fc00f01 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -153,7 +153,7 @@ rcupdate.rcu_task_stall_timeout
This boot/sysfs parameter controls the RCU-tasks stall warning
interval. A value of zero or less suppresses RCU-tasks stall
warnings. A positive value sets the stall-warning interval
- in jiffies. An RCU-tasks stall warning starts with the line:
+ in seconds. An RCU-tasks stall warning starts with the line:
INFO: rcu_tasks detected stalls on tasks:
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 981651a8b65d..7e1a8721637a 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -212,7 +212,7 @@ synchronize_rcu()
rcu_assign_pointer()
- typeof(p) rcu_assign_pointer(p, typeof(p) v);
+ void rcu_assign_pointer(p, typeof(p) v);
Yes, rcu_assign_pointer() -is- implemented as a macro, though it
would be cool to be able to declare a function in this manner.
@@ -220,9 +220,9 @@ rcu_assign_pointer()
The updater uses this function to assign a new value to an
RCU-protected pointer, in order to safely communicate the change
- in value from the updater to the reader. This function returns
- the new value, and also executes any memory-barrier instructions
- required for a given CPU architecture.
+ in value from the updater to the reader. This macro does not
+ evaluate to an rvalue, but it does execute any memory-barrier
+ instructions required for a given CPU architecture.
Perhaps just as important, it serves to document (1) which
pointers are protected by RCU and (2) the point at which a
diff --git a/Documentation/accelerators/ocxl.rst b/Documentation/accelerators/ocxl.rst
deleted file mode 100644
index b1cea19a90f5..000000000000
--- a/Documentation/accelerators/ocxl.rst
+++ /dev/null
@@ -1,178 +0,0 @@
-:orphan:
-
-========================================================
-OpenCAPI (Open Coherent Accelerator Processor Interface)
-========================================================
-
-OpenCAPI is an interface between processors and accelerators. It aims
-at being low-latency and high-bandwidth. The specification is
-developed by the `OpenCAPI Consortium <http://opencapi.org/>`_.
-
-It allows an accelerator (which could be a FPGA, ASICs, ...) to access
-the host memory coherently, using virtual addresses. An OpenCAPI
-device can also host its own memory, that can be accessed from the
-host.
-
-OpenCAPI is known in linux as 'ocxl', as the open, processor-agnostic
-evolution of 'cxl' (the driver for the IBM CAPI interface for
-powerpc), which was named that way to avoid confusion with the ISDN
-CAPI subsystem.
-
-
-High-level view
-===============
-
-OpenCAPI defines a Data Link Layer (DL) and Transaction Layer (TL), to
-be implemented on top of a physical link. Any processor or device
-implementing the DL and TL can start sharing memory.
-
-::
-
- +-----------+ +-------------+
- | | | |
- | | | Accelerated |
- | Processor | | Function |
- | | +--------+ | Unit | +--------+
- | |--| Memory | | (AFU) |--| Memory |
- | | +--------+ | | +--------+
- +-----------+ +-------------+
- | |
- +-----------+ +-------------+
- | TL | | TLX |
- +-----------+ +-------------+
- | |
- +-----------+ +-------------+
- | DL | | DLX |
- +-----------+ +-------------+
- | |
- | PHY |
- +---------------------------------------+
-
-
-
-Device discovery
-================
-
-OpenCAPI relies on a PCI-like configuration space, implemented on the
-device. So the host can discover AFUs by querying the config space.
-
-OpenCAPI devices in Linux are treated like PCI devices (with a few
-caveats). The firmware is expected to abstract the hardware as if it
-was a PCI link. A lot of the existing PCI infrastructure is reused:
-devices are scanned and BARs are assigned during the standard PCI
-enumeration. Commands like 'lspci' can therefore be used to see what
-devices are available.
-
-The configuration space defines the AFU(s) that can be found on the
-physical adapter, such as its name, how many memory contexts it can
-work with, the size of its MMIO areas, ...
-
-
-
-MMIO
-====
-
-OpenCAPI defines two MMIO areas for each AFU:
-
-* the global MMIO area, with registers pertinent to the whole AFU.
-* a per-process MMIO area, which has a fixed size for each context.
-
-
-
-AFU interrupts
-==============
-
-OpenCAPI includes the possibility for an AFU to send an interrupt to a
-host process. It is done through a 'intrp_req' defined in the
-Transaction Layer, specifying a 64-bit object handle which defines the
-interrupt.
-
-The driver allows a process to allocate an interrupt and obtain its
-64-bit object handle, that can be passed to the AFU.
-
-
-
-char devices
-============
-
-The driver creates one char device per AFU found on the physical
-device. A physical device may have multiple functions and each
-function can have multiple AFUs. At the time of this writing though,
-it has only been tested with devices exporting only one AFU.
-
-Char devices can be found in /dev/ocxl/ and are named as:
-/dev/ocxl/<AFU name>.<location>.<index>
-
-where <AFU name> is a max 20-character long name, as found in the
-config space of the AFU.
-<location> is added by the driver and can help distinguish devices
-when a system has more than one instance of the same OpenCAPI device.
-<index> is also to help distinguish AFUs in the unlikely case where a
-device carries multiple copies of the same AFU.
-
-
-
-Sysfs class
-===========
-
-An ocxl class is added for the devices representing the AFUs. See
-/sys/class/ocxl. The layout is described in
-Documentation/ABI/testing/sysfs-class-ocxl
-
-
-
-User API
-========
-
-open
-----
-
-Based on the AFU definition found in the config space, an AFU may
-support working with more than one memory context, in which case the
-associated char device may be opened multiple times by different
-processes.
-
-
-ioctl
------
-
-OCXL_IOCTL_ATTACH:
-
- Attach the memory context of the calling process to the AFU so that
- the AFU can access its memory.
-
-OCXL_IOCTL_IRQ_ALLOC:
-
- Allocate an AFU interrupt and return an identifier.
-
-OCXL_IOCTL_IRQ_FREE:
-
- Free a previously allocated AFU interrupt.
-
-OCXL_IOCTL_IRQ_SET_FD:
-
- Associate an event fd to an AFU interrupt so that the user process
- can be notified when the AFU sends an interrupt.
-
-OCXL_IOCTL_GET_METADATA:
-
- Obtains configuration information from the card, such at the size of
- MMIO areas, the AFU version, and the PASID for the current context.
-
-OCXL_IOCTL_ENABLE_P9_WAIT:
-
- Allows the AFU to wake a userspace thread executing 'wait'. Returns
- information to userspace to allow it to configure the AFU. Note that
- this is only available on POWER9.
-
-OCXL_IOCTL_GET_FEATURES:
-
- Reports on which CPU features that affect OpenCAPI are usable from
- userspace.
-
-
-mmap
-----
-
-A process can mmap the per-process MMIO area for interactions with the
-AFU.
diff --git a/Documentation/accounting/cgroupstats.rst b/Documentation/accounting/cgroupstats.rst
new file mode 100644
index 000000000000..b9afc48f4ea2
--- /dev/null
+++ b/Documentation/accounting/cgroupstats.rst
@@ -0,0 +1,31 @@
+==================
+Control Groupstats
+==================
+
+Control Groupstats is inspired by the discussion at
+http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
+suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
+
+Per cgroup statistics infrastructure re-uses code from the taskstats
+interface. A new set of cgroup operations are registered with commands
+and attributes specific to cgroups. It should be very easy to
+extend per cgroup statistics, by adding members to the cgroupstats
+structure.
+
+The current model for cgroupstats is a pull, a push model (to post
+statistics on interesting events), should be very easy to add. Currently
+user space requests for statistics by passing the cgroup path.
+Statistics about the state of all the tasks in the cgroup is returned to
+user space.
+
+NOTE: We currently rely on delay accounting for extracting information
+about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
+information will not be available.
+
+To extract cgroup statistics a utility very similar to getdelays.c
+has been developed, the sample output of the utility is shown below::
+
+ ~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup/a"
+ sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
+ ~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup"
+ sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/accounting/cgroupstats.txt b/Documentation/accounting/cgroupstats.txt
deleted file mode 100644
index d16a9849e60e..000000000000
--- a/Documentation/accounting/cgroupstats.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Control Groupstats is inspired by the discussion at
-http://lkml.org/lkml/2007/4/11/187 and implements per cgroup statistics as
-suggested by Andrew Morton in http://lkml.org/lkml/2007/4/11/263.
-
-Per cgroup statistics infrastructure re-uses code from the taskstats
-interface. A new set of cgroup operations are registered with commands
-and attributes specific to cgroups. It should be very easy to
-extend per cgroup statistics, by adding members to the cgroupstats
-structure.
-
-The current model for cgroupstats is a pull, a push model (to post
-statistics on interesting events), should be very easy to add. Currently
-user space requests for statistics by passing the cgroup path.
-Statistics about the state of all the tasks in the cgroup is returned to
-user space.
-
-NOTE: We currently rely on delay accounting for extracting information
-about tasks blocked on I/O. If CONFIG_TASK_DELAY_ACCT is disabled, this
-information will not be available.
-
-To extract cgroup statistics a utility very similar to getdelays.c
-has been developed, the sample output of the utility is shown below
-
-~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup/a"
-sleeping 1, blocked 0, running 1, stopped 0, uninterruptible 0
-~/balbir/cgroupstats # ./getdelays -C "/sys/fs/cgroup"
-sleeping 155, blocked 0, running 1, stopped 0, uninterruptible 2
diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst
new file mode 100644
index 000000000000..7cc7f5852da0
--- /dev/null
+++ b/Documentation/accounting/delay-accounting.rst
@@ -0,0 +1,126 @@
+================
+Delay accounting
+================
+
+Tasks encounter delays in execution when they wait
+for some kernel resource to become available e.g. a
+runnable task may wait for a free CPU to run on.
+
+The per-task delay accounting functionality measures
+the delays experienced by a task while
+
+a) waiting for a CPU (while being runnable)
+b) completion of synchronous block I/O initiated by the task
+c) swapping in pages
+d) memory reclaim
+
+and makes these statistics available to userspace through
+the taskstats interface.
+
+Such delays provide feedback for setting a task's cpu priority,
+io priority and rss limit values appropriately. Long delays for
+important tasks could be a trigger for raising its corresponding priority.
+
+The functionality, through its use of the taskstats interface, also provides
+delay statistics aggregated for all tasks (or threads) belonging to a
+thread group (corresponding to a traditional Unix process). This is a commonly
+needed aggregation that is more efficiently done by the kernel.
+
+Userspace utilities, particularly resource management applications, can also
+aggregate delay statistics into arbitrary groups. To enable this, delay
+statistics of a task are available both during its lifetime as well as on its
+exit, ensuring continuous and complete monitoring can be done.
+
+
+Interface
+---------
+
+Delay accounting uses the taskstats interface which is described
+in detail in a separate document in this directory. Taskstats returns a
+generic data structure to userspace corresponding to per-pid and per-tgid
+statistics. The delay accounting functionality populates specific fields of
+this structure. See
+
+ include/linux/taskstats.h
+
+for a description of the fields pertaining to delay accounting.
+It will generally be in the form of counters returning the cumulative
+delay seen for cpu, sync block I/O, swapin, memory reclaim etc.
+
+Taking the difference of two successive readings of a given
+counter (say cpu_delay_total) for a task will give the delay
+experienced by the task waiting for the corresponding resource
+in that interval.
+
+When a task exits, records containing the per-task statistics
+are sent to userspace without requiring a command. If it is the last exiting
+task of a thread group, the per-tgid statistics are also sent. More details
+are given in the taskstats interface description.
+
+The getdelays.c userspace utility in tools/accounting directory allows simple
+commands to be run and the corresponding delay statistics to be displayed. It
+also serves as an example of using the taskstats interface.
+
+Usage
+-----
+
+Compile the kernel with::
+
+ CONFIG_TASK_DELAY_ACCT=y
+ CONFIG_TASKSTATS=y
+
+Delay accounting is enabled by default at boot up.
+To disable, add::
+
+ nodelayacct
+
+to the kernel boot options. The rest of the instructions
+below assume this has not been done.
+
+After the system has booted up, use a utility
+similar to getdelays.c to access the delays
+seen by a given task or a task group (tgid).
+The utility also allows a given command to be
+executed and the corresponding delays to be
+seen.
+
+General format of the getdelays command::
+
+ getdelays [-t tgid] [-p pid] [-c cmd...]
+
+
+Get delays, since system boot, for pid 10::
+
+ # ./getdelays -p 10
+ (output similar to next case)
+
+Get sum of delays, since system boot, for all pids with tgid 5::
+
+ # ./getdelays -t 5
+
+
+ CPU count real total virtual total delay total
+ 7876 92005750 100000000 24001500
+ IO count delay total
+ 0 0
+ SWAP count delay total
+ 0 0
+ RECLAIM count delay total
+ 0 0
+
+Get delays seen in executing a given simple command::
+
+ # ./getdelays -c ls /
+
+ bin data1 data3 data5 dev home media opt root srv sys usr
+ boot data2 data4 data6 etc lib mnt proc sbin subdomain tmp var
+
+
+ CPU count real total virtual total delay total
+ 6 4000250 4000000 0
+ IO count delay total
+ 0 0
+ SWAP count delay total
+ 0 0
+ RECLAIM count delay total
+ 0 0
diff --git a/Documentation/accounting/delay-accounting.txt b/Documentation/accounting/delay-accounting.txt
deleted file mode 100644
index 042ea59b5853..000000000000
--- a/Documentation/accounting/delay-accounting.txt
+++ /dev/null
@@ -1,117 +0,0 @@
-Delay accounting
-----------------
-
-Tasks encounter delays in execution when they wait
-for some kernel resource to become available e.g. a
-runnable task may wait for a free CPU to run on.
-
-The per-task delay accounting functionality measures
-the delays experienced by a task while
-
-a) waiting for a CPU (while being runnable)
-b) completion of synchronous block I/O initiated by the task
-c) swapping in pages
-d) memory reclaim
-
-and makes these statistics available to userspace through
-the taskstats interface.
-
-Such delays provide feedback for setting a task's cpu priority,
-io priority and rss limit values appropriately. Long delays for
-important tasks could be a trigger for raising its corresponding priority.
-
-The functionality, through its use of the taskstats interface, also provides
-delay statistics aggregated for all tasks (or threads) belonging to a
-thread group (corresponding to a traditional Unix process). This is a commonly
-needed aggregation that is more efficiently done by the kernel.
-
-Userspace utilities, particularly resource management applications, can also
-aggregate delay statistics into arbitrary groups. To enable this, delay
-statistics of a task are available both during its lifetime as well as on its
-exit, ensuring continuous and complete monitoring can be done.
-
-
-Interface
----------
-
-Delay accounting uses the taskstats interface which is described
-in detail in a separate document in this directory. Taskstats returns a
-generic data structure to userspace corresponding to per-pid and per-tgid
-statistics. The delay accounting functionality populates specific fields of
-this structure. See
- include/linux/taskstats.h
-for a description of the fields pertaining to delay accounting.
-It will generally be in the form of counters returning the cumulative
-delay seen for cpu, sync block I/O, swapin, memory reclaim etc.
-
-Taking the difference of two successive readings of a given
-counter (say cpu_delay_total) for a task will give the delay
-experienced by the task waiting for the corresponding resource
-in that interval.
-
-When a task exits, records containing the per-task statistics
-are sent to userspace without requiring a command. If it is the last exiting
-task of a thread group, the per-tgid statistics are also sent. More details
-are given in the taskstats interface description.
-
-The getdelays.c userspace utility in tools/accounting directory allows simple
-commands to be run and the corresponding delay statistics to be displayed. It
-also serves as an example of using the taskstats interface.
-
-Usage
------
-
-Compile the kernel with
- CONFIG_TASK_DELAY_ACCT=y
- CONFIG_TASKSTATS=y
-
-Delay accounting is enabled by default at boot up.
-To disable, add
- nodelayacct
-to the kernel boot options. The rest of the instructions
-below assume this has not been done.
-
-After the system has booted up, use a utility
-similar to getdelays.c to access the delays
-seen by a given task or a task group (tgid).
-The utility also allows a given command to be
-executed and the corresponding delays to be
-seen.
-
-General format of the getdelays command
-
-getdelays [-t tgid] [-p pid] [-c cmd...]
-
-
-Get delays, since system boot, for pid 10
-# ./getdelays -p 10
-(output similar to next case)
-
-Get sum of delays, since system boot, for all pids with tgid 5
-# ./getdelays -t 5
-
-
-CPU count real total virtual total delay total
- 7876 92005750 100000000 24001500
-IO count delay total
- 0 0
-SWAP count delay total
- 0 0
-RECLAIM count delay total
- 0 0
-
-Get delays seen in executing a given simple command
-# ./getdelays -c ls /
-
-bin data1 data3 data5 dev home media opt root srv sys usr
-boot data2 data4 data6 etc lib mnt proc sbin subdomain tmp var
-
-
-CPU count real total virtual total delay total
- 6 4000250 4000000 0
-IO count delay total
- 0 0
-SWAP count delay total
- 0 0
-RECLAIM count delay total
- 0 0
diff --git a/Documentation/accounting/index.rst b/Documentation/accounting/index.rst
new file mode 100644
index 000000000000..9369d8bf32be
--- /dev/null
+++ b/Documentation/accounting/index.rst
@@ -0,0 +1,14 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Accounting
+==========
+
+.. toctree::
+ :maxdepth: 1
+
+ cgroupstats
+ delay-accounting
+ psi
+ taskstats
+ taskstats-struct
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
new file mode 100644
index 000000000000..621111ce5740
--- /dev/null
+++ b/Documentation/accounting/psi.rst
@@ -0,0 +1,182 @@
+================================
+PSI - Pressure Stall Information
+================================
+
+:Date: April, 2018
+:Author: Johannes Weiner <hannes@cmpxchg.org>
+
+When CPU, memory or IO devices are contended, workloads experience
+latency spikes, throughput losses, and run the risk of OOM kills.
+
+Without an accurate measure of such contention, users are forced to
+either play it safe and under-utilize their hardware resources, or
+roll the dice and frequently suffer the disruptions resulting from
+excessive overcommit.
+
+The psi feature identifies and quantifies the disruptions caused by
+such resource crunches and the time impact it has on complex workloads
+or even entire systems.
+
+Having an accurate measure of productivity losses caused by resource
+scarcity aids users in sizing workloads to hardware--or provisioning
+hardware according to workload demand.
+
+As psi aggregates this information in realtime, systems can be managed
+dynamically using techniques such as load shedding, migrating jobs to
+other systems or data centers, or strategically pausing or killing low
+priority or restartable batch jobs.
+
+This allows maximizing hardware utilization without sacrificing
+workload health or risking major disruptions such as OOM kills.
+
+Pressure interface
+==================
+
+Pressure information for each resource is exported through the
+respective file in /proc/pressure/ -- cpu, memory, and io.
+
+The format for CPU is as such::
+
+ some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+and for memory and IO::
+
+ some avg10=0.00 avg60=0.00 avg300=0.00 total=0
+ full avg10=0.00 avg60=0.00 avg300=0.00 total=0
+
+The "some" line indicates the share of time in which at least some
+tasks are stalled on a given resource.
+
+The "full" line indicates the share of time in which all non-idle
+tasks are stalled on a given resource simultaneously. In this state
+actual CPU cycles are going to waste, and a workload that spends
+extended time in this state is considered to be thrashing. This has
+severe impact on performance, and it's useful to distinguish this
+situation from a state where some tasks are stalled but the CPU is
+still doing productive work. As such, time spent in this subset of the
+stall state is tracked separately and exported in the "full" averages.
+
+The ratios (in %) are tracked as recent trends over ten, sixty, and
+three hundred second windows, which gives insight into short term events
+as well as medium and long term trends. The total absolute stall time
+(in us) is tracked and exported as well, to allow detection of latency
+spikes which wouldn't necessarily make a dent in the time averages,
+or to average trends over custom time frames.
+
+Monitoring for pressure thresholds
+==================================
+
+Users can register triggers and use poll() to be woken up when resource
+pressure exceeds certain thresholds.
+
+A trigger describes the maximum cumulative stall time over a specific
+time window, e.g. 100ms of total stall time within any 500ms window to
+generate a wakeup event.
+
+To register a trigger user has to open psi interface file under
+/proc/pressure/ representing the resource to be monitored and write the
+desired threshold and time window. The open file descriptor should be
+used to wait for trigger events using select(), poll() or epoll().
+The following format is used::
+
+ <some|full> <stall amount in us> <time window in us>
+
+For example writing "some 150000 1000000" into /proc/pressure/memory
+would add 150ms threshold for partial memory stall measured within
+1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
+would add 50ms threshold for full io stall measured within 1sec time window.
+
+Triggers can be set on more than one psi metric and more than one trigger
+for the same psi metric can be specified. However for each trigger a separate
+file descriptor is required to be able to poll it separately from others,
+therefore for each trigger a separate open() syscall should be made even
+when opening the same psi interface file.
+
+Monitors activate only when system enters stall state for the monitored
+psi metric and deactivates upon exit from the stall state. While system is
+in the stall state psi signal growth is monitored at a rate of 10 times per
+tracking window.
+
+The kernel accepts window sizes ranging from 500ms to 10s, therefore min
+monitoring update interval is 50ms and max is 1s. Min limit is set to
+prevent overly frequent polling. Max limit is chosen as a high enough number
+after which monitors are most likely not needed and psi averages can be used
+instead.
+
+When activated, psi monitor stays active for at least the duration of one
+tracking window to avoid repeated activations/deactivations when system is
+bouncing in and out of the stall state.
+
+Notifications to the userspace are rate-limited to one per tracking window.
+
+The trigger will de-register when the file descriptor used to define the
+trigger is closed.
+
+Userspace monitor usage example
+===============================
+
+::
+
+ #include <errno.h>
+ #include <fcntl.h>
+ #include <stdio.h>
+ #include <poll.h>
+ #include <string.h>
+ #include <unistd.h>
+
+ /*
+ * Monitor memory partial stall with 1s tracking window size
+ * and 150ms threshold.
+ */
+ int main() {
+ const char trig[] = "some 150000 1000000";
+ struct pollfd fds;
+ int n;
+
+ fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
+ if (fds.fd < 0) {
+ printf("/proc/pressure/memory open error: %s\n",
+ strerror(errno));
+ return 1;
+ }
+ fds.events = POLLPRI;
+
+ if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
+ printf("/proc/pressure/memory write error: %s\n",
+ strerror(errno));
+ return 1;
+ }
+
+ printf("waiting for events...\n");
+ while (1) {
+ n = poll(&fds, 1, -1);
+ if (n < 0) {
+ printf("poll error: %s\n", strerror(errno));
+ return 1;
+ }
+ if (fds.revents & POLLERR) {
+ printf("got POLLERR, event source is gone\n");
+ return 0;
+ }
+ if (fds.revents & POLLPRI) {
+ printf("event triggered!\n");
+ } else {
+ printf("unknown event received: 0x%x\n", fds.revents);
+ return 1;
+ }
+ }
+
+ return 0;
+ }
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each subdirectory in the cgroupfs mountpoint contains
+cpu.pressure, memory.pressure, and io.pressure files; the format is
+the same as the /proc/pressure/ files.
+
+Per-cgroup psi monitors can be specified and used the same way as
+system-wide ones.
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
deleted file mode 100644
index 5cbe5659e3b7..000000000000
--- a/Documentation/accounting/psi.txt
+++ /dev/null
@@ -1,180 +0,0 @@
-================================
-PSI - Pressure Stall Information
-================================
-
-:Date: April, 2018
-:Author: Johannes Weiner <hannes@cmpxchg.org>
-
-When CPU, memory or IO devices are contended, workloads experience
-latency spikes, throughput losses, and run the risk of OOM kills.
-
-Without an accurate measure of such contention, users are forced to
-either play it safe and under-utilize their hardware resources, or
-roll the dice and frequently suffer the disruptions resulting from
-excessive overcommit.
-
-The psi feature identifies and quantifies the disruptions caused by
-such resource crunches and the time impact it has on complex workloads
-or even entire systems.
-
-Having an accurate measure of productivity losses caused by resource
-scarcity aids users in sizing workloads to hardware--or provisioning
-hardware according to workload demand.
-
-As psi aggregates this information in realtime, systems can be managed
-dynamically using techniques such as load shedding, migrating jobs to
-other systems or data centers, or strategically pausing or killing low
-priority or restartable batch jobs.
-
-This allows maximizing hardware utilization without sacrificing
-workload health or risking major disruptions such as OOM kills.
-
-Pressure interface
-==================
-
-Pressure information for each resource is exported through the
-respective file in /proc/pressure/ -- cpu, memory, and io.
-
-The format for CPU is as such:
-
-some avg10=0.00 avg60=0.00 avg300=0.00 total=0
-
-and for memory and IO:
-
-some avg10=0.00 avg60=0.00 avg300=0.00 total=0
-full avg10=0.00 avg60=0.00 avg300=0.00 total=0
-
-The "some" line indicates the share of time in which at least some
-tasks are stalled on a given resource.
-
-The "full" line indicates the share of time in which all non-idle
-tasks are stalled on a given resource simultaneously. In this state
-actual CPU cycles are going to waste, and a workload that spends
-extended time in this state is considered to be thrashing. This has
-severe impact on performance, and it's useful to distinguish this
-situation from a state where some tasks are stalled but the CPU is
-still doing productive work. As such, time spent in this subset of the
-stall state is tracked separately and exported in the "full" averages.
-
-The ratios (in %) are tracked as recent trends over ten, sixty, and
-three hundred second windows, which gives insight into short term events
-as well as medium and long term trends. The total absolute stall time
-(in us) is tracked and exported as well, to allow detection of latency
-spikes which wouldn't necessarily make a dent in the time averages,
-or to average trends over custom time frames.
-
-Monitoring for pressure thresholds
-==================================
-
-Users can register triggers and use poll() to be woken up when resource
-pressure exceeds certain thresholds.
-
-A trigger describes the maximum cumulative stall time over a specific
-time window, e.g. 100ms of total stall time within any 500ms window to
-generate a wakeup event.
-
-To register a trigger user has to open psi interface file under
-/proc/pressure/ representing the resource to be monitored and write the
-desired threshold and time window. The open file descriptor should be
-used to wait for trigger events using select(), poll() or epoll().
-The following format is used:
-
-<some|full> <stall amount in us> <time window in us>
-
-For example writing "some 150000 1000000" into /proc/pressure/memory
-would add 150ms threshold for partial memory stall measured within
-1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
-would add 50ms threshold for full io stall measured within 1sec time window.
-
-Triggers can be set on more than one psi metric and more than one trigger
-for the same psi metric can be specified. However for each trigger a separate
-file descriptor is required to be able to poll it separately from others,
-therefore for each trigger a separate open() syscall should be made even
-when opening the same psi interface file.
-
-Monitors activate only when system enters stall state for the monitored
-psi metric and deactivates upon exit from the stall state. While system is
-in the stall state psi signal growth is monitored at a rate of 10 times per
-tracking window.
-
-The kernel accepts window sizes ranging from 500ms to 10s, therefore min
-monitoring update interval is 50ms and max is 1s. Min limit is set to
-prevent overly frequent polling. Max limit is chosen as a high enough number
-after which monitors are most likely not needed and psi averages can be used
-instead.
-
-When activated, psi monitor stays active for at least the duration of one
-tracking window to avoid repeated activations/deactivations when system is
-bouncing in and out of the stall state.
-
-Notifications to the userspace are rate-limited to one per tracking window.
-
-The trigger will de-register when the file descriptor used to define the
-trigger is closed.
-
-Userspace monitor usage example
-===============================
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <poll.h>
-#include <string.h>
-#include <unistd.h>
-
-/*
- * Monitor memory partial stall with 1s tracking window size
- * and 150ms threshold.
- */
-int main() {
- const char trig[] = "some 150000 1000000";
- struct pollfd fds;
- int n;
-
- fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
- if (fds.fd < 0) {
- printf("/proc/pressure/memory open error: %s\n",
- strerror(errno));
- return 1;
- }
- fds.events = POLLPRI;
-
- if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
- printf("/proc/pressure/memory write error: %s\n",
- strerror(errno));
- return 1;
- }
-
- printf("waiting for events...\n");
- while (1) {
- n = poll(&fds, 1, -1);
- if (n < 0) {
- printf("poll error: %s\n", strerror(errno));
- return 1;
- }
- if (fds.revents & POLLERR) {
- printf("got POLLERR, event source is gone\n");
- return 0;
- }
- if (fds.revents & POLLPRI) {
- printf("event triggered!\n");
- } else {
- printf("unknown event received: 0x%x\n", fds.revents);
- return 1;
- }
- }
-
- return 0;
-}
-
-Cgroup2 interface
-=================
-
-In a system with a CONFIG_CGROUP=y kernel and the cgroup2 filesystem
-mounted, pressure stall information is also tracked for tasks grouped
-into cgroups. Each subdirectory in the cgroupfs mountpoint contains
-cpu.pressure, memory.pressure, and io.pressure files; the format is
-the same as the /proc/pressure/ files.
-
-Per-cgroup psi monitors can be specified and used the same way as
-system-wide ones.
diff --git a/Documentation/accounting/taskstats-struct.rst b/Documentation/accounting/taskstats-struct.rst
new file mode 100644
index 000000000000..ca90fd489c9a
--- /dev/null
+++ b/Documentation/accounting/taskstats-struct.rst
@@ -0,0 +1,199 @@
+====================
+The struct taskstats
+====================
+
+This document contains an explanation of the struct taskstats fields.
+
+There are three different groups of fields in the struct taskstats:
+
+1) Common and basic accounting fields
+ If CONFIG_TASKSTATS is set, the taskstats interface is enabled and
+ the common fields and basic accounting fields are collected for
+ delivery at do_exit() of a task.
+2) Delay accounting fields
+ These fields are placed between::
+
+ /* Delay accounting fields start */
+
+ and::
+
+ /* Delay accounting fields end */
+
+ Their values are collected if CONFIG_TASK_DELAY_ACCT is set.
+3) Extended accounting fields
+ These fields are placed between::
+
+ /* Extended accounting fields start */
+
+ and::
+
+ /* Extended accounting fields end */
+
+ Their values are collected if CONFIG_TASK_XACCT is set.
+
+4) Per-task and per-thread context switch count statistics
+
+5) Time accounting for SMT machines
+
+6) Extended delay accounting fields for memory reclaim
+
+Future extension should add fields to the end of the taskstats struct, and
+should not change the relative position of each field within the struct.
+
+::
+
+ struct taskstats {
+
+1) Common and basic accounting fields::
+
+ /* The version number of this struct. This field is always set to
+ * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
+ * Each time the struct is changed, the value should be incremented.
+ */
+ __u16 version;
+
+ /* The exit code of a task. */
+ __u32 ac_exitcode; /* Exit status */
+
+ /* The accounting flags of a task as defined in <linux/acct.h>
+ * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
+ */
+ __u8 ac_flag; /* Record flags */
+
+ /* The value of task_nice() of a task. */
+ __u8 ac_nice; /* task_nice */
+
+ /* The name of the command that started this task. */
+ char ac_comm[TS_COMM_LEN]; /* Command name */
+
+ /* The scheduling discipline as set in task->policy field. */
+ __u8 ac_sched; /* Scheduling discipline */
+
+ __u8 ac_pad[3];
+ __u32 ac_uid; /* User ID */
+ __u32 ac_gid; /* Group ID */
+ __u32 ac_pid; /* Process ID */
+ __u32 ac_ppid; /* Parent process ID */
+
+ /* The time when a task begins, in [secs] since 1970. */
+ __u32 ac_btime; /* Begin time [sec since 1970] */
+
+ /* The elapsed time of a task, in [usec]. */
+ __u64 ac_etime; /* Elapsed time [usec] */
+
+ /* The user CPU time of a task, in [usec]. */
+ __u64 ac_utime; /* User CPU time [usec] */
+
+ /* The system CPU time of a task, in [usec]. */
+ __u64 ac_stime; /* System CPU time [usec] */
+
+ /* The minor page fault count of a task, as set in task->min_flt. */
+ __u64 ac_minflt; /* Minor Page Fault Count */
+
+ /* The major page fault count of a task, as set in task->maj_flt. */
+ __u64 ac_majflt; /* Major Page Fault Count */
+
+
+2) Delay accounting fields::
+
+ /* Delay accounting fields start
+ *
+ * All values, until the comment "Delay accounting fields end" are
+ * available only if delay accounting is enabled, even though the last
+ * few fields are not delays
+ *
+ * xxx_count is the number of delay values recorded
+ * xxx_delay_total is the corresponding cumulative delay in nanoseconds
+ *
+ * xxx_delay_total wraps around to zero on overflow
+ * xxx_count incremented regardless of overflow
+ */
+
+ /* Delay waiting for cpu, while runnable
+ * count, delay_total NOT updated atomically
+ */
+ __u64 cpu_count;
+ __u64 cpu_delay_total;
+
+ /* Following four fields atomically updated using task->delays->lock */
+
+ /* Delay waiting for synchronous block I/O to complete
+ * does not account for delays in I/O submission
+ */
+ __u64 blkio_count;
+ __u64 blkio_delay_total;
+
+ /* Delay waiting for page fault I/O (swap in only) */
+ __u64 swapin_count;
+ __u64 swapin_delay_total;
+
+ /* cpu "wall-clock" running time
+ * On some architectures, value will adjust for cpu time stolen
+ * from the kernel in involuntary waits due to virtualization.
+ * Value is cumulative, in nanoseconds, without a corresponding count
+ * and wraps around to zero silently on overflow
+ */
+ __u64 cpu_run_real_total;
+
+ /* cpu "virtual" running time
+ * Uses time intervals seen by the kernel i.e. no adjustment
+ * for kernel's involuntary waits due to virtualization.
+ * Value is cumulative, in nanoseconds, without a corresponding count
+ * and wraps around to zero silently on overflow
+ */
+ __u64 cpu_run_virtual_total;
+ /* Delay accounting fields end */
+ /* version 1 ends here */
+
+
+3) Extended accounting fields::
+
+ /* Extended accounting fields start */
+
+ /* Accumulated RSS usage in duration of a task, in MBytes-usecs.
+ * The current rss usage is added to this counter every time
+ * a tick is charged to a task's system time. So, at the end we
+ * will have memory usage multiplied by system time. Thus an
+ * average usage per system time unit can be calculated.
+ */
+ __u64 coremem; /* accumulated RSS usage in MB-usec */
+
+ /* Accumulated virtual memory usage in duration of a task.
+ * Same as acct_rss_mem1 above except that we keep track of VM usage.
+ */
+ __u64 virtmem; /* accumulated VM usage in MB-usec */
+
+ /* High watermark of RSS usage in duration of a task, in KBytes. */
+ __u64 hiwater_rss; /* High-watermark of RSS usage */
+
+ /* High watermark of VM usage in duration of a task, in KBytes. */
+ __u64 hiwater_vm; /* High-water virtual memory usage */
+
+ /* The following four fields are I/O statistics of a task. */
+ __u64 read_char; /* bytes read */
+ __u64 write_char; /* bytes written */
+ __u64 read_syscalls; /* read syscalls */
+ __u64 write_syscalls; /* write syscalls */
+
+ /* Extended accounting fields end */
+
+4) Per-task and per-thread statistics::
+
+ __u64 nvcsw; /* Context voluntary switch counter */
+ __u64 nivcsw; /* Context involuntary switch counter */
+
+5) Time accounting for SMT machines::
+
+ __u64 ac_utimescaled; /* utime scaled on frequency etc */
+ __u64 ac_stimescaled; /* stime scaled on frequency etc */
+ __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
+
+6) Extended delay accounting fields for memory reclaim::
+
+ /* Delay waiting for memory reclaim */
+ __u64 freepages_count;
+ __u64 freepages_delay_total;
+
+::
+
+ }
diff --git a/Documentation/accounting/taskstats-struct.txt b/Documentation/accounting/taskstats-struct.txt
deleted file mode 100644
index e7512c061c15..000000000000
--- a/Documentation/accounting/taskstats-struct.txt
+++ /dev/null
@@ -1,180 +0,0 @@
-The struct taskstats
---------------------
-
-This document contains an explanation of the struct taskstats fields.
-
-There are three different groups of fields in the struct taskstats:
-
-1) Common and basic accounting fields
- If CONFIG_TASKSTATS is set, the taskstats interface is enabled and
- the common fields and basic accounting fields are collected for
- delivery at do_exit() of a task.
-2) Delay accounting fields
- These fields are placed between
- /* Delay accounting fields start */
- and
- /* Delay accounting fields end */
- Their values are collected if CONFIG_TASK_DELAY_ACCT is set.
-3) Extended accounting fields
- These fields are placed between
- /* Extended accounting fields start */
- and
- /* Extended accounting fields end */
- Their values are collected if CONFIG_TASK_XACCT is set.
-
-4) Per-task and per-thread context switch count statistics
-
-5) Time accounting for SMT machines
-
-6) Extended delay accounting fields for memory reclaim
-
-Future extension should add fields to the end of the taskstats struct, and
-should not change the relative position of each field within the struct.
-
-
-struct taskstats {
-
-1) Common and basic accounting fields:
- /* The version number of this struct. This field is always set to
- * TAKSTATS_VERSION, which is defined in <linux/taskstats.h>.
- * Each time the struct is changed, the value should be incremented.
- */
- __u16 version;
-
- /* The exit code of a task. */
- __u32 ac_exitcode; /* Exit status */
-
- /* The accounting flags of a task as defined in <linux/acct.h>
- * Defined values are AFORK, ASU, ACOMPAT, ACORE, and AXSIG.
- */
- __u8 ac_flag; /* Record flags */
-
- /* The value of task_nice() of a task. */
- __u8 ac_nice; /* task_nice */
-
- /* The name of the command that started this task. */
- char ac_comm[TS_COMM_LEN]; /* Command name */
-
- /* The scheduling discipline as set in task->policy field. */
- __u8 ac_sched; /* Scheduling discipline */
-
- __u8 ac_pad[3];
- __u32 ac_uid; /* User ID */
- __u32 ac_gid; /* Group ID */
- __u32 ac_pid; /* Process ID */
- __u32 ac_ppid; /* Parent process ID */
-
- /* The time when a task begins, in [secs] since 1970. */
- __u32 ac_btime; /* Begin time [sec since 1970] */
-
- /* The elapsed time of a task, in [usec]. */
- __u64 ac_etime; /* Elapsed time [usec] */
-
- /* The user CPU time of a task, in [usec]. */
- __u64 ac_utime; /* User CPU time [usec] */
-
- /* The system CPU time of a task, in [usec]. */
- __u64 ac_stime; /* System CPU time [usec] */
-
- /* The minor page fault count of a task, as set in task->min_flt. */
- __u64 ac_minflt; /* Minor Page Fault Count */
-
- /* The major page fault count of a task, as set in task->maj_flt. */
- __u64 ac_majflt; /* Major Page Fault Count */
-
-
-2) Delay accounting fields:
- /* Delay accounting fields start
- *
- * All values, until the comment "Delay accounting fields end" are
- * available only if delay accounting is enabled, even though the last
- * few fields are not delays
- *
- * xxx_count is the number of delay values recorded
- * xxx_delay_total is the corresponding cumulative delay in nanoseconds
- *
- * xxx_delay_total wraps around to zero on overflow
- * xxx_count incremented regardless of overflow
- */
-
- /* Delay waiting for cpu, while runnable
- * count, delay_total NOT updated atomically
- */
- __u64 cpu_count;
- __u64 cpu_delay_total;
-
- /* Following four fields atomically updated using task->delays->lock */
-
- /* Delay waiting for synchronous block I/O to complete
- * does not account for delays in I/O submission
- */
- __u64 blkio_count;
- __u64 blkio_delay_total;
-
- /* Delay waiting for page fault I/O (swap in only) */
- __u64 swapin_count;
- __u64 swapin_delay_total;
-
- /* cpu "wall-clock" running time
- * On some architectures, value will adjust for cpu time stolen
- * from the kernel in involuntary waits due to virtualization.
- * Value is cumulative, in nanoseconds, without a corresponding count
- * and wraps around to zero silently on overflow
- */
- __u64 cpu_run_real_total;
-
- /* cpu "virtual" running time
- * Uses time intervals seen by the kernel i.e. no adjustment
- * for kernel's involuntary waits due to virtualization.
- * Value is cumulative, in nanoseconds, without a corresponding count
- * and wraps around to zero silently on overflow
- */
- __u64 cpu_run_virtual_total;
- /* Delay accounting fields end */
- /* version 1 ends here */
-
-
-3) Extended accounting fields
- /* Extended accounting fields start */
-
- /* Accumulated RSS usage in duration of a task, in MBytes-usecs.
- * The current rss usage is added to this counter every time
- * a tick is charged to a task's system time. So, at the end we
- * will have memory usage multiplied by system time. Thus an
- * average usage per system time unit can be calculated.
- */
- __u64 coremem; /* accumulated RSS usage in MB-usec */
-
- /* Accumulated virtual memory usage in duration of a task.
- * Same as acct_rss_mem1 above except that we keep track of VM usage.
- */
- __u64 virtmem; /* accumulated VM usage in MB-usec */
-
- /* High watermark of RSS usage in duration of a task, in KBytes. */
- __u64 hiwater_rss; /* High-watermark of RSS usage */
-
- /* High watermark of VM usage in duration of a task, in KBytes. */
- __u64 hiwater_vm; /* High-water virtual memory usage */
-
- /* The following four fields are I/O statistics of a task. */
- __u64 read_char; /* bytes read */
- __u64 write_char; /* bytes written */
- __u64 read_syscalls; /* read syscalls */
- __u64 write_syscalls; /* write syscalls */
-
- /* Extended accounting fields end */
-
-4) Per-task and per-thread statistics
- __u64 nvcsw; /* Context voluntary switch counter */
- __u64 nivcsw; /* Context involuntary switch counter */
-
-5) Time accounting for SMT machines
- __u64 ac_utimescaled; /* utime scaled on frequency etc */
- __u64 ac_stimescaled; /* stime scaled on frequency etc */
- __u64 cpu_scaled_run_real_total; /* scaled cpu_run_real_total */
-
-6) Extended delay accounting fields for memory reclaim
- /* Delay waiting for memory reclaim */
- __u64 freepages_count;
- __u64 freepages_delay_total;
-}
diff --git a/Documentation/accounting/taskstats.rst b/Documentation/accounting/taskstats.rst
new file mode 100644
index 000000000000..2a28b7f55c10
--- /dev/null
+++ b/Documentation/accounting/taskstats.rst
@@ -0,0 +1,180 @@
+=============================
+Per-task statistics interface
+=============================
+
+
+Taskstats is a netlink-based interface for sending per-task and
+per-process statistics from the kernel to userspace.
+
+Taskstats was designed for the following benefits:
+
+- efficiently provide statistics during lifetime of a task and on its exit
+- unified interface for multiple accounting subsystems
+- extensibility for use by future accounting patches
+
+Terminology
+-----------
+
+"pid", "tid" and "task" are used interchangeably and refer to the standard
+Linux task defined by struct task_struct. per-pid stats are the same as
+per-task stats.
+
+"tgid", "process" and "thread group" are used interchangeably and refer to the
+tasks that share an mm_struct i.e. the traditional Unix process. Despite the
+use of tgid, there is no special treatment for the task that is thread group
+leader - a process is deemed alive as long as it has any task belonging to it.
+
+Usage
+-----
+
+To get statistics during a task's lifetime, userspace opens a unicast netlink
+socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
+The response contains statistics for a task (if pid is specified) or the sum of
+statistics for all tasks of the process (if tgid is specified).
+
+To obtain statistics for tasks which are exiting, the userspace listener
+sends a register command and specifies a cpumask. Whenever a task exits on
+one of the cpus in the cpumask, its per-pid statistics are sent to the
+registered listener. Using cpumasks allows the data received by one listener
+to be limited and assists in flow control over the netlink interface and is
+explained in more detail below.
+
+If the exiting task is the last thread exiting its thread group,
+an additional record containing the per-tgid stats is also sent to userspace.
+The latter contains the sum of per-pid stats for all threads in the thread
+group, both past and present.
+
+getdelays.c is a simple utility demonstrating usage of the taskstats interface
+for reporting delay accounting statistics. Users can register cpumasks,
+send commands and process responses, listen for per-tid/tgid exit data,
+write the data received to a file and do basic flow control by increasing
+receive buffer sizes.
+
+Interface
+---------
+
+The user-kernel interface is encapsulated in include/linux/taskstats.h
+
+To avoid this documentation becoming obsolete as the interface evolves, only
+an outline of the current version is given. taskstats.h always overrides the
+description here.
+
+struct taskstats is the common accounting structure for both per-pid and
+per-tgid data. It is versioned and can be extended by each accounting subsystem
+that is added to the kernel. The fields and their semantics are defined in the
+taskstats.h file.
+
+The data exchanged between user and kernel space is a netlink message belonging
+to the NETLINK_GENERIC family and using the netlink attributes interface.
+The messages are in the format::
+
+ +----------+- - -+-------------+-------------------+
+ | nlmsghdr | Pad | genlmsghdr | taskstats payload |
+ +----------+- - -+-------------+-------------------+
+
+
+The taskstats payload is one of the following three kinds:
+
+1. Commands: Sent from user to kernel. Commands to get data on
+a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
+containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
+the task/process for which userspace wants statistics.
+
+Commands to register/deregister interest in exit data from a set of cpus
+consist of one attribute, of type
+TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
+attribute payload. The cpumask is specified as an ascii string of
+comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
+the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
+in cpus before closing the listening socket, the kernel cleans up its interest
+set over time. However, for the sake of efficiency, an explicit deregistration
+is advisable.
+
+2. Response for a command: sent from the kernel in response to a userspace
+command. The payload is a series of three attributes of type:
+
+a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates
+a pid/tgid will be followed by some stats.
+
+b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats
+are being returned.
+
+c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The
+same structure is used for both per-pid and per-tgid stats.
+
+3. New message sent by kernel whenever a task exits. The payload consists of a
+ series of attributes of the following type:
+
+a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats
+b) TASKSTATS_TYPE_PID: contains exiting task's pid
+c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats
+d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats
+e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs
+f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process
+
+
+per-tgid stats
+--------------
+
+Taskstats provides per-process stats, in addition to per-task stats, since
+resource management is often done at a process granularity and aggregating task
+stats in userspace alone is inefficient and potentially inaccurate (due to lack
+of atomicity).
+
+However, maintaining per-process, in addition to per-task stats, within the
+kernel has space and time overheads. To address this, the taskstats code
+accumulates each exiting task's statistics into a process-wide data structure.
+When the last task of a process exits, the process level data accumulated also
+gets sent to userspace (along with the per-task data).
+
+When a user queries to get per-tgid data, the sum of all other live threads in
+the group is added up and added to the accumulated total for previously exited
+threads of the same thread group.
+
+Extending taskstats
+-------------------
+
+There are two ways to extend the taskstats interface to export more
+per-task/process stats as patches to collect them get added to the kernel
+in future:
+
+1. Adding more fields to the end of the existing struct taskstats. Backward
+ compatibility is ensured by the version number within the
+ structure. Userspace will use only the fields of the struct that correspond
+ to the version its using.
+
+2. Defining separate statistic structs and using the netlink attributes
+ interface to return them. Since userspace processes each netlink attribute
+ independently, it can always ignore attributes whose type it does not
+ understand (because it is using an older version of the interface).
+
+
+Choosing between 1. and 2. is a matter of trading off flexibility and
+overhead. If only a few fields need to be added, then 1. is the preferable
+path since the kernel and userspace don't need to incur the overhead of
+processing new netlink attributes. But if the new fields expand the existing
+struct too much, requiring disparate userspace accounting utilities to
+unnecessarily receive large structures whose fields are of no interest, then
+extending the attributes structure would be worthwhile.
+
+Flow control for taskstats
+--------------------------
+
+When the rate of task exits becomes large, a listener may not be able to keep
+up with the kernel's rate of sending per-tid/tgid exit data leading to data
+loss. This possibility gets compounded when the taskstats structure gets
+extended and the number of cpus grows large.
+
+To avoid losing statistics, userspace should do one or more of the following:
+
+- increase the receive buffer sizes for the netlink sockets opened by
+ listeners to receive exit data.
+
+- create more listeners and reduce the number of cpus being listened to by
+ each listener. In the extreme case, there could be one listener for each cpu.
+ Users may also consider setting the cpu affinity of the listener to the subset
+ of cpus to which it listens, especially if they are listening to just one cpu.
+
+Despite these measures, if the userspace receives ENOBUFS error messages
+indicated overflow of receive buffers, it should take measures to handle the
+loss of data.
diff --git a/Documentation/accounting/taskstats.txt b/Documentation/accounting/taskstats.txt
deleted file mode 100644
index ff06b738bb88..000000000000
--- a/Documentation/accounting/taskstats.txt
+++ /dev/null
@@ -1,181 +0,0 @@
-Per-task statistics interface
------------------------------
-
-
-Taskstats is a netlink-based interface for sending per-task and
-per-process statistics from the kernel to userspace.
-
-Taskstats was designed for the following benefits:
-
-- efficiently provide statistics during lifetime of a task and on its exit
-- unified interface for multiple accounting subsystems
-- extensibility for use by future accounting patches
-
-Terminology
------------
-
-"pid", "tid" and "task" are used interchangeably and refer to the standard
-Linux task defined by struct task_struct. per-pid stats are the same as
-per-task stats.
-
-"tgid", "process" and "thread group" are used interchangeably and refer to the
-tasks that share an mm_struct i.e. the traditional Unix process. Despite the
-use of tgid, there is no special treatment for the task that is thread group
-leader - a process is deemed alive as long as it has any task belonging to it.
-
-Usage
------
-
-To get statistics during a task's lifetime, userspace opens a unicast netlink
-socket (NETLINK_GENERIC family) and sends commands specifying a pid or a tgid.
-The response contains statistics for a task (if pid is specified) or the sum of
-statistics for all tasks of the process (if tgid is specified).
-
-To obtain statistics for tasks which are exiting, the userspace listener
-sends a register command and specifies a cpumask. Whenever a task exits on
-one of the cpus in the cpumask, its per-pid statistics are sent to the
-registered listener. Using cpumasks allows the data received by one listener
-to be limited and assists in flow control over the netlink interface and is
-explained in more detail below.
-
-If the exiting task is the last thread exiting its thread group,
-an additional record containing the per-tgid stats is also sent to userspace.
-The latter contains the sum of per-pid stats for all threads in the thread
-group, both past and present.
-
-getdelays.c is a simple utility demonstrating usage of the taskstats interface
-for reporting delay accounting statistics. Users can register cpumasks,
-send commands and process responses, listen for per-tid/tgid exit data,
-write the data received to a file and do basic flow control by increasing
-receive buffer sizes.
-
-Interface
----------
-
-The user-kernel interface is encapsulated in include/linux/taskstats.h
-
-To avoid this documentation becoming obsolete as the interface evolves, only
-an outline of the current version is given. taskstats.h always overrides the
-description here.
-
-struct taskstats is the common accounting structure for both per-pid and
-per-tgid data. It is versioned and can be extended by each accounting subsystem
-that is added to the kernel. The fields and their semantics are defined in the
-taskstats.h file.
-
-The data exchanged between user and kernel space is a netlink message belonging
-to the NETLINK_GENERIC family and using the netlink attributes interface.
-The messages are in the format
-
- +----------+- - -+-------------+-------------------+
- | nlmsghdr | Pad | genlmsghdr | taskstats payload |
- +----------+- - -+-------------+-------------------+
-
-
-The taskstats payload is one of the following three kinds:
-
-1. Commands: Sent from user to kernel. Commands to get data on
-a pid/tgid consist of one attribute, of type TASKSTATS_CMD_ATTR_PID/TGID,
-containing a u32 pid or tgid in the attribute payload. The pid/tgid denotes
-the task/process for which userspace wants statistics.
-
-Commands to register/deregister interest in exit data from a set of cpus
-consist of one attribute, of type
-TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK and contain a cpumask in the
-attribute payload. The cpumask is specified as an ascii string of
-comma-separated cpu ranges e.g. to listen to exit data from cpus 1,2,3,5,7,8
-the cpumask would be "1-3,5,7-8". If userspace forgets to deregister interest
-in cpus before closing the listening socket, the kernel cleans up its interest
-set over time. However, for the sake of efficiency, an explicit deregistration
-is advisable.
-
-2. Response for a command: sent from the kernel in response to a userspace
-command. The payload is a series of three attributes of type:
-
-a) TASKSTATS_TYPE_AGGR_PID/TGID : attribute containing no payload but indicates
-a pid/tgid will be followed by some stats.
-
-b) TASKSTATS_TYPE_PID/TGID: attribute whose payload is the pid/tgid whose stats
-are being returned.
-
-c) TASKSTATS_TYPE_STATS: attribute with a struct taskstats as payload. The
-same structure is used for both per-pid and per-tgid stats.
-
-3. New message sent by kernel whenever a task exits. The payload consists of a
- series of attributes of the following type:
-
-a) TASKSTATS_TYPE_AGGR_PID: indicates next two attributes will be pid+stats
-b) TASKSTATS_TYPE_PID: contains exiting task's pid
-c) TASKSTATS_TYPE_STATS: contains the exiting task's per-pid stats
-d) TASKSTATS_TYPE_AGGR_TGID: indicates next two attributes will be tgid+stats
-e) TASKSTATS_TYPE_TGID: contains tgid of process to which task belongs
-f) TASKSTATS_TYPE_STATS: contains the per-tgid stats for exiting task's process
-
-
-per-tgid stats
---------------
-
-Taskstats provides per-process stats, in addition to per-task stats, since
-resource management is often done at a process granularity and aggregating task
-stats in userspace alone is inefficient and potentially inaccurate (due to lack
-of atomicity).
-
-However, maintaining per-process, in addition to per-task stats, within the
-kernel has space and time overheads. To address this, the taskstats code
-accumulates each exiting task's statistics into a process-wide data structure.
-When the last task of a process exits, the process level data accumulated also
-gets sent to userspace (along with the per-task data).
-
-When a user queries to get per-tgid data, the sum of all other live threads in
-the group is added up and added to the accumulated total for previously exited
-threads of the same thread group.
-
-Extending taskstats
--------------------
-
-There are two ways to extend the taskstats interface to export more
-per-task/process stats as patches to collect them get added to the kernel
-in future:
-
-1. Adding more fields to the end of the existing struct taskstats. Backward
- compatibility is ensured by the version number within the
- structure. Userspace will use only the fields of the struct that correspond
- to the version its using.
-
-2. Defining separate statistic structs and using the netlink attributes
- interface to return them. Since userspace processes each netlink attribute
- independently, it can always ignore attributes whose type it does not
- understand (because it is using an older version of the interface).
-
-
-Choosing between 1. and 2. is a matter of trading off flexibility and
-overhead. If only a few fields need to be added, then 1. is the preferable
-path since the kernel and userspace don't need to incur the overhead of
-processing new netlink attributes. But if the new fields expand the existing
-struct too much, requiring disparate userspace accounting utilities to
-unnecessarily receive large structures whose fields are of no interest, then
-extending the attributes structure would be worthwhile.
-
-Flow control for taskstats
---------------------------
-
-When the rate of task exits becomes large, a listener may not be able to keep
-up with the kernel's rate of sending per-tid/tgid exit data leading to data
-loss. This possibility gets compounded when the taskstats structure gets
-extended and the number of cpus grows large.
-
-To avoid losing statistics, userspace should do one or more of the following:
-
-- increase the receive buffer sizes for the netlink sockets opened by
-listeners to receive exit data.
-
-- create more listeners and reduce the number of cpus being listened to by
-each listener. In the extreme case, there could be one listener for each cpu.
-Users may also consider setting the cpu affinity of the listener to the subset
-of cpus to which it listens, especially if they are listening to just one cpu.
-
-Despite these measures, if the userspace receives ENOBUFS error messages
-indicated overflow of receive buffers, it should take measures to handle the
-loss of data.
-
-----
diff --git a/Documentation/admin-guide/LSM/LoadPin.rst b/Documentation/admin-guide/LSM/LoadPin.rst
index 32070762d24c..716ad9b23c9a 100644
--- a/Documentation/admin-guide/LSM/LoadPin.rst
+++ b/Documentation/admin-guide/LSM/LoadPin.rst
@@ -19,3 +19,13 @@ block device backing the filesystem is not read-only, a sysctl is
created to toggle pinning: ``/proc/sys/kernel/loadpin/enabled``. (Having
a mutable filesystem means pinning is mutable too, but having the
sysctl allows for easy testing on systems with a mutable filesystem.)
+
+It's also possible to exclude specific file types from LoadPin using kernel
+command line option "``loadpin.exclude``". By default, all files are
+included, but they can be excluded using kernel command line option such
+as "``loadpin.exclude=kernel-module,kexec-image``". This allows to use
+different mechanisms such as ``CONFIG_MODULE_SIG`` and
+``CONFIG_KEXEC_VERIFY_SIG`` to verify kernel module and kernel image while
+still use LoadPin to protect the integrity of other files kernel loads. The
+full list of valid file types can be found in ``kernel_read_file_str``
+defined in ``include/linux/fs.h``.
diff --git a/Documentation/admin-guide/aoe/aoe.rst b/Documentation/admin-guide/aoe/aoe.rst
new file mode 100644
index 000000000000..a05e751363a0
--- /dev/null
+++ b/Documentation/admin-guide/aoe/aoe.rst
@@ -0,0 +1,150 @@
+Introduction
+============
+
+ATA over Ethernet is a network protocol that provides simple access to
+block storage on the LAN.
+
+ http://support.coraid.com/documents/AoEr11.txt
+
+The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ...
+
+ http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html
+
+It has many tips and hints! Please see, especially, recommended
+tunings for virtual memory:
+
+ http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19
+
+The aoetools are userland programs that are designed to work with this
+driver. The aoetools are on sourceforge.
+
+ http://aoetools.sourceforge.net/
+
+The scripts in this Documentation/admin-guide/aoe directory are intended to
+document the use of the driver and are not necessary if you install
+the aoetools.
+
+
+Creating Device Nodes
+=====================
+
+ Users of udev should find the block device nodes created
+ automatically, but to create all the necessary device nodes, use the
+ udev configuration rules provided in udev.txt (in this directory).
+
+ There is a udev-install.sh script that shows how to install these
+ rules on your system.
+
+ There is also an autoload script that shows how to edit
+ /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when
+ necessary. Preloading the aoe module is preferable to autoloading,
+ however, because AoE discovery takes a few seconds. It can be
+ confusing when an AoE device is not present the first time the a
+ command is run but appears a second later.
+
+Using Device Nodes
+==================
+
+ "cat /dev/etherd/err" blocks, waiting for error diagnostic output,
+ like any retransmitted packets.
+
+ "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to
+ limit ATA over Ethernet traffic to eth2 and eth4. AoE traffic from
+ untrusted networks should be ignored as a matter of security. See
+ also the aoe_iflist driver option described below.
+
+ "echo > /dev/etherd/discover" tells the driver to find out what AoE
+ devices are available.
+
+ In the future these character devices may disappear and be replaced
+ by sysfs counterparts. Using the commands in aoetools insulates
+ users from these implementation details.
+
+ The block devices are named like this::
+
+ e{shelf}.{slot}
+ e{shelf}.{slot}p{part}
+
+ ... so that "e0.2" is the third blade from the left (slot 2) in the
+ first shelf (shelf address zero). That's the whole disk. The first
+ partition on that disk would be "e0.2p1".
+
+Using sysfs
+===========
+
+ Each aoe block device in /sys/block has the extra attributes of
+ state, mac, and netif. The state attribute is "up" when the device
+ is ready for I/O and "down" if detected but unusable. The
+ "down,closewait" state shows that the device is still open and
+ cannot come up again until it has been closed.
+
+ The mac attribute is the ethernet address of the remote AoE device.
+ The netif attribute is the network interface on the localhost
+ through which we are communicating with the remote AoE device.
+
+ There is a script in this directory that formats this information in
+ a convenient way. Users with aoetools should use the aoe-stat
+ command::
+
+ root@makki root# sh Documentation/admin-guide/aoe/status.sh
+ e10.0 eth3 up
+ e10.1 eth3 up
+ e10.2 eth3 up
+ e10.3 eth3 up
+ e10.4 eth3 up
+ e10.5 eth3 up
+ e10.6 eth3 up
+ e10.7 eth3 up
+ e10.8 eth3 up
+ e10.9 eth3 up
+ e4.0 eth1 up
+ e4.1 eth1 up
+ e4.2 eth1 up
+ e4.3 eth1 up
+ e4.4 eth1 up
+ e4.5 eth1 up
+ e4.6 eth1 up
+ e4.7 eth1 up
+ e4.8 eth1 up
+ e4.9 eth1 up
+
+ Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver
+ option discussed below) instead of /dev/etherd/interfaces to limit
+ AoE traffic to the network interfaces in the given
+ whitespace-separated list. Unlike the old character device, the
+ sysfs entry can be read from as well as written to.
+
+ It's helpful to trigger discovery after setting the list of allowed
+ interfaces. The aoetools package provides an aoe-discover script
+ for this purpose. You can also directly use the
+ /dev/etherd/discover special file described above.
+
+Driver Options
+==============
+
+ There is a boot option for the built-in aoe driver and a
+ corresponding module parameter, aoe_iflist. Without this option,
+ all network interfaces may be used for ATA over Ethernet. Here is a
+ usage example for the module parameter::
+
+ modprobe aoe_iflist="eth1 eth3"
+
+ The aoe_deadsecs module parameter determines the maximum number of
+ seconds that the driver will wait for an AoE device to provide a
+ response to an AoE command. After aoe_deadsecs seconds have
+ elapsed, the AoE device will be marked as "down". A value of zero
+ is supported for testing purposes and makes the aoe driver keep
+ trying AoE commands forever.
+
+ The aoe_maxout module parameter has a default of 128. This is the
+ maximum number of unresponded packets that will be sent to an AoE
+ target at one time.
+
+ The aoe_dyndevs module parameter defaults to 1, meaning that the
+ driver will assign a block device minor number to a discovered AoE
+ target based on the order of its discovery. With dynamic minor
+ device numbers in use, a greater range of AoE shelf and slot
+ addresses can be supported. Users with udev will never have to
+ think about minor numbers. Using aoe_dyndevs=0 allows device nodes
+ to be pre-created using a static minor-number scheme with the
+ aoe-mkshelf script in the aoetools.
diff --git a/Documentation/aoe/autoload.sh b/Documentation/admin-guide/aoe/autoload.sh
index 815dff4691c9..815dff4691c9 100644
--- a/Documentation/aoe/autoload.sh
+++ b/Documentation/admin-guide/aoe/autoload.sh
diff --git a/Documentation/aoe/examples.rst b/Documentation/admin-guide/aoe/examples.rst
index 91f3198e52c1..91f3198e52c1 100644
--- a/Documentation/aoe/examples.rst
+++ b/Documentation/admin-guide/aoe/examples.rst
diff --git a/Documentation/admin-guide/aoe/index.rst b/Documentation/admin-guide/aoe/index.rst
new file mode 100644
index 000000000000..d71c5df15922
--- /dev/null
+++ b/Documentation/admin-guide/aoe/index.rst
@@ -0,0 +1,17 @@
+=======================
+ATA over Ethernet (AoE)
+=======================
+
+.. toctree::
+ :maxdepth: 1
+
+ aoe
+ todo
+ examples
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/aoe/status.sh b/Documentation/admin-guide/aoe/status.sh
index eeec7baae57a..eeec7baae57a 100644
--- a/Documentation/aoe/status.sh
+++ b/Documentation/admin-guide/aoe/status.sh
diff --git a/Documentation/aoe/todo.rst b/Documentation/admin-guide/aoe/todo.rst
index dea8db5a33e1..dea8db5a33e1 100644
--- a/Documentation/aoe/todo.rst
+++ b/Documentation/admin-guide/aoe/todo.rst
diff --git a/Documentation/aoe/udev-install.sh b/Documentation/admin-guide/aoe/udev-install.sh
index 15e86f58c036..15e86f58c036 100644
--- a/Documentation/aoe/udev-install.sh
+++ b/Documentation/admin-guide/aoe/udev-install.sh
diff --git a/Documentation/admin-guide/aoe/udev.txt b/Documentation/admin-guide/aoe/udev.txt
new file mode 100644
index 000000000000..5fb756466bc7
--- /dev/null
+++ b/Documentation/admin-guide/aoe/udev.txt
@@ -0,0 +1,26 @@
+# These rules tell udev what device nodes to create for aoe support.
+# They may be installed along the following lines. Check the section
+# 8 udev manpage to see whether your udev supports SUBSYSTEM, and
+# whether it uses one or two equal signs for SUBSYSTEM and KERNEL.
+#
+# ecashin@makki ~$ su
+# Password:
+# bash# find /etc -type f -name udev.conf
+# /etc/udev/udev.conf
+# bash# grep udev_rules= /etc/udev/udev.conf
+# udev_rules="/etc/udev/rules.d/"
+# bash# ls /etc/udev/rules.d/
+# 10-wacom.rules 50-udev.rules
+# bash# cp /path/to/linux/Documentation/admin-guide/aoe/udev.txt \
+# /etc/udev/rules.d/60-aoe.rules
+#
+
+# aoe char devices
+SUBSYSTEM=="aoe", KERNEL=="discover", NAME="etherd/%k", GROUP="disk", MODE="0220"
+SUBSYSTEM=="aoe", KERNEL=="err", NAME="etherd/%k", GROUP="disk", MODE="0440"
+SUBSYSTEM=="aoe", KERNEL=="interfaces", NAME="etherd/%k", GROUP="disk", MODE="0220"
+SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220"
+SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220"
+
+# aoe block devices
+KERNEL=="etherd*", GROUP="disk"
diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg
index f87cfa0dc2fb..f87cfa0dc2fb 100644
--- a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg
+++ b/Documentation/admin-guide/blockdev/drbd/DRBD-8.3-data-packets.svg
diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg
index 48a1e2165fec..48a1e2165fec 100644
--- a/Documentation/blockdev/drbd/DRBD-data-packets.svg
+++ b/Documentation/admin-guide/blockdev/drbd/DRBD-data-packets.svg
diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot
index 025e8cf5e64a..025e8cf5e64a 100644
--- a/Documentation/blockdev/drbd/conn-states-8.dot
+++ b/Documentation/admin-guide/blockdev/drbd/conn-states-8.dot
diff --git a/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst
new file mode 100644
index 000000000000..66036b901644
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/data-structure-v9.rst
@@ -0,0 +1,42 @@
+================================
+kernel data structure for DRBD-9
+================================
+
+This describes the in kernel data structure for DRBD-9. Starting with
+Linux v3.14 we are reorganizing DRBD to use this data structure.
+
+Basic Data Structure
+====================
+
+A node has a number of DRBD resources. Each such resource has a number of
+devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD
+device is represented by a block device locally.
+
+The DRBD objects are interconnected to form a matrix as depicted below; a
+drbd_peer_device object sits at each intersection between a drbd_device and a
+drbd_connection::
+
+ /--------------+---------------+.....+---------------\
+ | resource | device | | device |
+ +--------------+---------------+.....+---------------+
+ | connection | peer_device | | peer_device |
+ +--------------+---------------+.....+---------------+
+ : : : : :
+ : : : : :
+ +--------------+---------------+.....+---------------+
+ | connection | peer_device | | peer_device |
+ \--------------+---------------+.....+---------------/
+
+In this table, horizontally, devices can be accessed from resources by their
+volume number. Likewise, peer_devices can be accessed from connections by
+their volume number. Objects in the vertical direction are connected by double
+linked lists. There are back pointers from peer_devices to their connections a
+devices, and from connections and devices to their resource.
+
+All resources are in the drbd_resources double-linked list. In addition, all
+devices can be accessed by their minor device number via the drbd_devices idr.
+
+The drbd_resource, drbd_connection, and drbd_device objects are reference
+counted. The peer_device objects only serve to establish the links between
+devices and connections; their lifetime is determined by the lifetime of the
+device and connection which they reference.
diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot
index d06cfb46fb98..d06cfb46fb98 100644
--- a/Documentation/blockdev/drbd/disk-states-8.dot
+++ b/Documentation/admin-guide/blockdev/drbd/disk-states-8.dot
diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot
index 6d9cf0a7b11d..6d9cf0a7b11d 100644
--- a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot
+++ b/Documentation/admin-guide/blockdev/drbd/drbd-connection-state-overview.dot
diff --git a/Documentation/admin-guide/blockdev/drbd/figures.rst b/Documentation/admin-guide/blockdev/drbd/figures.rst
new file mode 100644
index 000000000000..bd9a4901fe46
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/figures.rst
@@ -0,0 +1,30 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. The here included files are intended to help understand the implementation
+
+Data flows that Relate some functions, and write packets
+========================================================
+
+.. kernel-figure:: DRBD-8.3-data-packets.svg
+ :alt: DRBD-8.3-data-packets.svg
+ :align: center
+
+.. kernel-figure:: DRBD-data-packets.svg
+ :alt: DRBD-data-packets.svg
+ :align: center
+
+
+Sub graphs of DRBD's state transitions
+======================================
+
+.. kernel-figure:: conn-states-8.dot
+ :alt: conn-states-8.dot
+ :align: center
+
+.. kernel-figure:: disk-states-8.dot
+ :alt: disk-states-8.dot
+ :align: center
+
+.. kernel-figure:: node-states-8.dot
+ :alt: node-states-8.dot
+ :align: center
diff --git a/Documentation/admin-guide/blockdev/drbd/index.rst b/Documentation/admin-guide/blockdev/drbd/index.rst
new file mode 100644
index 000000000000..68ecd5c113e9
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/index.rst
@@ -0,0 +1,19 @@
+==========================================
+Distributed Replicated Block Device - DRBD
+==========================================
+
+Description
+===========
+
+ DRBD is a shared-nothing, synchronously replicated block device. It
+ is designed to serve as a building block for high availability
+ clusters and in this context, is a "drop-in" replacement for shared
+ storage. Simplistically, you could see it as a network RAID 1.
+
+ Please visit http://www.drbd.org to find out more.
+
+.. toctree::
+ :maxdepth: 1
+
+ data-structure-v9
+ figures
diff --git a/Documentation/admin-guide/blockdev/drbd/node-states-8.dot b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot
new file mode 100644
index 000000000000..bfa54e1f8016
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/drbd/node-states-8.dot
@@ -0,0 +1,13 @@
+digraph node_states {
+ Secondary -> Primary [ label = "ioctl_set_state()" ]
+ Primary -> Secondary [ label = "ioctl_set_state()" ]
+}
+
+digraph peer_states {
+ Secondary -> Primary [ label = "recv state packet" ]
+ Primary -> Secondary [ label = "recv state packet" ]
+ Primary -> Unknown [ label = "connection lost" ]
+ Secondary -> Unknown [ label = "connection lost" ]
+ Unknown -> Primary [ label = "connected" ]
+ Unknown -> Secondary [ label = "connected" ]
+}
diff --git a/Documentation/admin-guide/blockdev/floppy.rst b/Documentation/admin-guide/blockdev/floppy.rst
new file mode 100644
index 000000000000..4a8f31cf4139
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/floppy.rst
@@ -0,0 +1,255 @@
+=============
+Floppy Driver
+=============
+
+FAQ list:
+=========
+
+A FAQ list may be found in the fdutils package (see below), and also
+at <http://fdutils.linux.lu/faq.html>.
+
+
+LILO configuration options (Thinkpad users, read this)
+======================================================
+
+The floppy driver is configured using the 'floppy=' option in
+lilo. This option can be typed at the boot prompt, or entered in the
+lilo configuration file.
+
+Example: If your kernel is called linux-2.6.9, type the following line
+at the lilo boot prompt (if you have a thinkpad)::
+
+ linux-2.6.9 floppy=thinkpad
+
+You may also enter the following line in /etc/lilo.conf, in the description
+of linux-2.6.9::
+
+ append = "floppy=thinkpad"
+
+Several floppy related options may be given, example::
+
+ linux-2.6.9 floppy=daring floppy=two_fdc
+ append = "floppy=daring floppy=two_fdc"
+
+If you give options both in the lilo config file and on the boot
+prompt, the option strings of both places are concatenated, the boot
+prompt options coming last. That's why there are also options to
+restore the default behavior.
+
+
+Module configuration options
+============================
+
+If you use the floppy driver as a module, use the following syntax::
+
+ modprobe floppy floppy="<options>"
+
+Example::
+
+ modprobe floppy floppy="omnibook messages"
+
+If you need certain options enabled every time you load the floppy driver,
+you can put::
+
+ options floppy floppy="omnibook messages"
+
+in a configuration file in /etc/modprobe.d/.
+
+
+The floppy driver related options are:
+
+ floppy=asus_pci
+ Sets the bit mask to allow only units 0 and 1. (default)
+
+ floppy=daring
+ Tells the floppy driver that you have a well behaved floppy controller.
+ This allows more efficient and smoother operation, but may fail on
+ certain controllers. This may speed up certain operations.
+
+ floppy=0,daring
+ Tells the floppy driver that your floppy controller should be used
+ with caution.
+
+ floppy=one_fdc
+ Tells the floppy driver that you have only one floppy controller.
+ (default)
+
+ floppy=two_fdc / floppy=<address>,two_fdc
+ Tells the floppy driver that you have two floppy controllers.
+ The second floppy controller is assumed to be at <address>.
+ This option is not needed if the second controller is at address
+ 0x370, and if you use the 'cmos' option.
+
+ floppy=thinkpad
+ Tells the floppy driver that you have a Thinkpad. Thinkpads use an
+ inverted convention for the disk change line.
+
+ floppy=0,thinkpad
+ Tells the floppy driver that you don't have a Thinkpad.
+
+ floppy=omnibook / floppy=nodma
+ Tells the floppy driver not to use Dma for data transfers.
+ This is needed on HP Omnibooks, which don't have a workable
+ DMA channel for the floppy driver. This option is also useful
+ if you frequently get "Unable to allocate DMA memory" messages.
+ Indeed, dma memory needs to be continuous in physical memory,
+ and is thus harder to find, whereas non-dma buffers may be
+ allocated in virtual memory. However, I advise against this if
+ you have an FDC without a FIFO (8272A or 82072). 82072A and
+ later are OK. You also need at least a 486 to use nodma.
+ If you use nodma mode, I suggest you also set the FIFO
+ threshold to 10 or lower, in order to limit the number of data
+ transfer interrupts.
+
+ If you have a FIFO-able FDC, the floppy driver automatically
+ falls back on non DMA mode if no DMA-able memory can be found.
+ If you want to avoid this, explicitly ask for 'yesdma'.
+
+ floppy=yesdma
+ Tells the floppy driver that a workable DMA channel is available.
+ (default)
+
+ floppy=nofifo
+ Disables the FIFO entirely. This is needed if you get "Bus
+ master arbitration error" messages from your Ethernet card (or
+ from other devices) while accessing the floppy.
+
+ floppy=usefifo
+ Enables the FIFO. (default)
+
+ floppy=<threshold>,fifo_depth
+ Sets the FIFO threshold. This is mostly relevant in DMA
+ mode. If this is higher, the floppy driver tolerates more
+ interrupt latency, but it triggers more interrupts (i.e. it
+ imposes more load on the rest of the system). If this is
+ lower, the interrupt latency should be lower too (faster
+ processor). The benefit of a lower threshold is less
+ interrupts.
+
+ To tune the fifo threshold, switch on over/underrun messages
+ using 'floppycontrol --messages'. Then access a floppy
+ disk. If you get a huge amount of "Over/Underrun - retrying"
+ messages, then the fifo threshold is too low. Try with a
+ higher value, until you only get an occasional Over/Underrun.
+ It is a good idea to compile the floppy driver as a module
+ when doing this tuning. Indeed, it allows to try different
+ fifo values without rebooting the machine for each test. Note
+ that you need to do 'floppycontrol --messages' every time you
+ re-insert the module.
+
+ Usually, tuning the fifo threshold should not be needed, as
+ the default (0xa) is reasonable.
+
+ floppy=<drive>,<type>,cmos
+ Sets the CMOS type of <drive> to <type>. This is mandatory if
+ you have more than two floppy drives (only two can be
+ described in the physical CMOS), or if your BIOS uses
+ non-standard CMOS types. The CMOS types are:
+
+ == ==================================
+ 0 Use the value of the physical CMOS
+ 1 5 1/4 DD
+ 2 5 1/4 HD
+ 3 3 1/2 DD
+ 4 3 1/2 HD
+ 5 3 1/2 ED
+ 6 3 1/2 ED
+ 16 unknown or not installed
+ == ==================================
+
+ (Note: there are two valid types for ED drives. This is because 5 was
+ initially chosen to represent floppy *tapes*, and 6 for ED drives.
+ AMI ignored this, and used 5 for ED drives. That's why the floppy
+ driver handles both.)
+
+ floppy=unexpected_interrupts
+ Print a warning message when an unexpected interrupt is received.
+ (default)
+
+ floppy=no_unexpected_interrupts / floppy=L40SX
+ Don't print a message when an unexpected interrupt is received. This
+ is needed on IBM L40SX laptops in certain video modes. (There seems
+ to be an interaction between video and floppy. The unexpected
+ interrupts affect only performance, and can be safely ignored.)
+
+ floppy=broken_dcl
+ Don't use the disk change line, but assume that the disk was
+ changed whenever the device node is reopened. Needed on some
+ boxes where the disk change line is broken or unsupported.
+ This should be regarded as a stopgap measure, indeed it makes
+ floppy operation less efficient due to unneeded cache
+ flushings, and slightly more unreliable. Please verify your
+ cable, connection and jumper settings if you have any DCL
+ problems. However, some older drives, and also some laptops
+ are known not to have a DCL.
+
+ floppy=debug
+ Print debugging messages.
+
+ floppy=messages
+ Print informational messages for some operations (disk change
+ notifications, warnings about over and underruns, and about
+ autodetection).
+
+ floppy=silent_dcl_clear
+ Uses a less noisy way to clear the disk change line (which
+ doesn't involve seeks). Implied by 'daring' option.
+
+ floppy=<nr>,irq
+ Sets the floppy IRQ to <nr> instead of 6.
+
+ floppy=<nr>,dma
+ Sets the floppy DMA channel to <nr> instead of 2.
+
+ floppy=slow
+ Use PS/2 stepping rate::
+
+ PS/2 floppies have much slower step rates than regular floppies.
+ It's been recommended that take about 1/4 of the default speed
+ in some more extreme cases.
+
+
+Supporting utilities and additional documentation:
+==================================================
+
+Additional parameters of the floppy driver can be configured at
+runtime. Utilities which do this can be found in the fdutils package.
+This package also contains a new version of mtools which allows to
+access high capacity disks (up to 1992K on a high density 3 1/2 disk!).
+It also contains additional documentation about the floppy driver.
+
+The latest version can be found at fdutils homepage:
+
+ http://fdutils.linux.lu
+
+The fdutils releases can be found at:
+
+ http://fdutils.linux.lu/download.html
+
+ http://www.tux.org/pub/knaff/fdutils/
+
+ ftp://metalab.unc.edu/pub/Linux/utils/disk-management/
+
+Reporting problems about the floppy driver
+==========================================
+
+If you have a question or a bug report about the floppy driver, mail
+me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use
+comp.os.linux.hardware. As the volume in these groups is rather high,
+be sure to include the word "floppy" (or "FLOPPY") in the subject
+line. If the reported problem happens when mounting floppy disks, be
+sure to mention also the type of the filesystem in the subject line.
+
+Be sure to read the FAQ before mailing/posting any bug reports!
+
+Alain
+
+Changelog
+=========
+
+10-30-2004 :
+ Cleanup, updating, add reference to module configuration.
+ James Nelson <james4765@gmail.com>
+
+6-3-2000 :
+ Original Document
diff --git a/Documentation/admin-guide/blockdev/index.rst b/Documentation/admin-guide/blockdev/index.rst
new file mode 100644
index 000000000000..b903cf152091
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/index.rst
@@ -0,0 +1,16 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+The Linux RapidIO Subsystem
+===========================
+
+.. toctree::
+ :maxdepth: 1
+
+ floppy
+ nbd
+ paride
+ ramdisk
+ zram
+
+ drbd/index
diff --git a/Documentation/admin-guide/blockdev/nbd.rst b/Documentation/admin-guide/blockdev/nbd.rst
new file mode 100644
index 000000000000..d78dfe559dcf
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/nbd.rst
@@ -0,0 +1,31 @@
+==================================
+Network Block Device (TCP version)
+==================================
+
+1) Overview
+-----------
+
+What is it: With this compiled in the kernel (or as a module), Linux
+can use a remote server as one of its block devices. So every time
+the client computer wants to read, e.g., /dev/nb0, it sends a
+request over TCP to the server, which will reply with the data read.
+This can be used for stations with low disk space (or even diskless)
+to borrow disk space from another computer.
+Unlike NFS, it is possible to put any filesystem on it, etc.
+
+For more information, or to download the nbd-client and nbd-server
+tools, go to http://nbd.sf.net/.
+
+The nbd kernel module need only be installed on the client
+system, as the nbd-server is completely in userspace. In fact,
+the nbd-server has been successfully ported to other operating
+systems, including Windows.
+
+A) NBD parameters
+-----------------
+
+max_part
+ Number of partitions per device (default: 0).
+
+nbds_max
+ Number of block devices that should be initialized (default: 16).
diff --git a/Documentation/admin-guide/blockdev/paride.rst b/Documentation/admin-guide/blockdev/paride.rst
new file mode 100644
index 000000000000..87b4278bf314
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/paride.rst
@@ -0,0 +1,439 @@
+===================================
+Linux and parallel port IDE devices
+===================================
+
+PARIDE v1.03 (c) 1997-8 Grant Guenther <grant@torque.net>
+
+1. Introduction
+===============
+
+Owing to the simplicity and near universality of the parallel port interface
+to personal computers, many external devices such as portable hard-disk,
+CD-ROM, LS-120 and tape drives use the parallel port to connect to their
+host computer. While some devices (notably scanners) use ad-hoc methods
+to pass commands and data through the parallel port interface, most
+external devices are actually identical to an internal model, but with
+a parallel-port adapter chip added in. Some of the original parallel port
+adapters were little more than mechanisms for multiplexing a SCSI bus.
+(The Iomega PPA-3 adapter used in the ZIP drives is an example of this
+approach). Most current designs, however, take a different approach.
+The adapter chip reproduces a small ISA or IDE bus in the external device
+and the communication protocol provides operations for reading and writing
+device registers, as well as data block transfer functions. Sometimes,
+the device being addressed via the parallel cable is a standard SCSI
+controller like an NCR 5380. The "ditto" family of external tape
+drives use the ISA replicator to interface a floppy disk controller,
+which is then connected to a floppy-tape mechanism. The vast majority
+of external parallel port devices, however, are now based on standard
+IDE type devices, which require no intermediate controller. If one
+were to open up a parallel port CD-ROM drive, for instance, one would
+find a standard ATAPI CD-ROM drive, a power supply, and a single adapter
+that interconnected a standard PC parallel port cable and a standard
+IDE cable. It is usually possible to exchange the CD-ROM device with
+any other device using the IDE interface.
+
+The document describes the support in Linux for parallel port IDE
+devices. It does not cover parallel port SCSI devices, "ditto" tape
+drives or scanners. Many different devices are supported by the
+parallel port IDE subsystem, including:
+
+ - MicroSolutions backpack CD-ROM
+ - MicroSolutions backpack PD/CD
+ - MicroSolutions backpack hard-drives
+ - MicroSolutions backpack 8000t tape drive
+ - SyQuest EZ-135, EZ-230 & SparQ drives
+ - Avatar Shark
+ - Imation Superdisk LS-120
+ - Maxell Superdisk LS-120
+ - FreeCom Power CD
+ - Hewlett-Packard 5GB and 8GB tape drives
+ - Hewlett-Packard 7100 and 7200 CD-RW drives
+
+as well as most of the clone and no-name products on the market.
+
+To support such a wide range of devices, PARIDE, the parallel port IDE
+subsystem, is actually structured in three parts. There is a base
+paride module which provides a registry and some common methods for
+accessing the parallel ports. The second component is a set of
+high-level drivers for each of the different types of supported devices:
+
+ === =============
+ pd IDE disk
+ pcd ATAPI CD-ROM
+ pf ATAPI disk
+ pt ATAPI tape
+ pg ATAPI generic
+ === =============
+
+(Currently, the pg driver is only used with CD-R drives).
+
+The high-level drivers function according to the relevant standards.
+The third component of PARIDE is a set of low-level protocol drivers
+for each of the parallel port IDE adapter chips. Thanks to the interest
+and encouragement of Linux users from many parts of the world,
+support is available for almost all known adapter protocols:
+
+ ==== ====================================== ====
+ aten ATEN EH-100 (HK)
+ bpck Microsolutions backpack (US)
+ comm DataStor (old-type) "commuter" adapter (TW)
+ dstr DataStor EP-2000 (TW)
+ epat Shuttle EPAT (UK)
+ epia Shuttle EPIA (UK)
+ fit2 FIT TD-2000 (US)
+ fit3 FIT TD-3000 (US)
+ friq Freecom IQ cable (DE)
+ frpw Freecom Power (DE)
+ kbic KingByte KBIC-951A and KBIC-971A (TW)
+ ktti KT Technology PHd adapter (SG)
+ on20 OnSpec 90c20 (US)
+ on26 OnSpec 90c26 (US)
+ ==== ====================================== ====
+
+
+2. Using the PARIDE subsystem
+=============================
+
+While configuring the Linux kernel, you may choose either to build
+the PARIDE drivers into your kernel, or to build them as modules.
+
+In either case, you will need to select "Parallel port IDE device support"
+as well as at least one of the high-level drivers and at least one
+of the parallel port communication protocols. If you do not know
+what kind of parallel port adapter is used in your drive, you could
+begin by checking the file names and any text files on your DOS
+installation floppy. Alternatively, you can look at the markings on
+the adapter chip itself. That's usually sufficient to identify the
+correct device.
+
+You can actually select all the protocol modules, and allow the PARIDE
+subsystem to try them all for you.
+
+For the "brand-name" products listed above, here are the protocol
+and high-level drivers that you would use:
+
+ ================ ============ ====== ========
+ Manufacturer Model Driver Protocol
+ ================ ============ ====== ========
+ MicroSolutions CD-ROM pcd bpck
+ MicroSolutions PD drive pf bpck
+ MicroSolutions hard-drive pd bpck
+ MicroSolutions 8000t tape pt bpck
+ SyQuest EZ, SparQ pd epat
+ Imation Superdisk pf epat
+ Maxell Superdisk pf friq
+ Avatar Shark pd epat
+ FreeCom CD-ROM pcd frpw
+ Hewlett-Packard 5GB Tape pt epat
+ Hewlett-Packard 7200e (CD) pcd epat
+ Hewlett-Packard 7200e (CD-R) pg epat
+ ================ ============ ====== ========
+
+2.1 Configuring built-in drivers
+---------------------------------
+
+We recommend that you get to know how the drivers work and how to
+configure them as loadable modules, before attempting to compile a
+kernel with the drivers built-in.
+
+If you built all of your PARIDE support directly into your kernel,
+and you have just a single parallel port IDE device, your kernel should
+locate it automatically for you. If you have more than one device,
+you may need to give some command line options to your bootloader
+(eg: LILO), how to do that is beyond the scope of this document.
+
+The high-level drivers accept a number of command line parameters, all
+of which are documented in the source files in linux/drivers/block/paride.
+By default, each driver will automatically try all parallel ports it
+can find, and all protocol types that have been installed, until it finds
+a parallel port IDE adapter. Once it finds one, the probe stops. So,
+if you have more than one device, you will need to tell the drivers
+how to identify them. This requires specifying the port address, the
+protocol identification number and, for some devices, the drive's
+chain ID. While your system is booting, a number of messages are
+displayed on the console. Like all such messages, they can be
+reviewed with the 'dmesg' command. Among those messages will be
+some lines like::
+
+ paride: bpck registered as protocol 0
+ paride: epat registered as protocol 1
+
+The numbers will always be the same until you build a new kernel with
+different protocol selections. You should note these numbers as you
+will need them to identify the devices.
+
+If you happen to be using a MicroSolutions backpack device, you will
+also need to know the unit ID number for each drive. This is usually
+the last two digits of the drive's serial number (but read MicroSolutions'
+documentation about this).
+
+As an example, let's assume that you have a MicroSolutions PD/CD drive
+with unit ID number 36 connected to the parallel port at 0x378, a SyQuest
+EZ-135 connected to the chained port on the PD/CD drive and also an
+Imation Superdisk connected to port 0x278. You could give the following
+options on your boot command::
+
+ pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36
+
+In the last option, pf.drive1 configures device /dev/pf1, the 0x378
+is the parallel port base address, the 0 is the protocol registration
+number and 36 is the chain ID.
+
+Please note: while PARIDE will work both with and without the
+PARPORT parallel port sharing system that is included by the
+"Parallel port support" option, PARPORT must be included and enabled
+if you want to use chains of devices on the same parallel port.
+
+2.2 Loading and configuring PARIDE as modules
+----------------------------------------------
+
+It is much faster and simpler to get to understand the PARIDE drivers
+if you use them as loadable kernel modules.
+
+Note 1:
+ using these drivers with the "kerneld" automatic module loading
+ system is not recommended for beginners, and is not documented here.
+
+Note 2:
+ if you build PARPORT support as a loadable module, PARIDE must
+ also be built as loadable modules, and PARPORT must be loaded before
+ the PARIDE modules.
+
+To use PARIDE, you must begin by::
+
+ insmod paride
+
+this loads a base module which provides a registry for the protocols,
+among other tasks.
+
+Then, load as many of the protocol modules as you think you might need.
+As you load each module, it will register the protocols that it supports,
+and print a log message to your kernel log file and your console. For
+example::
+
+ # insmod epat
+ paride: epat registered as protocol 0
+ # insmod kbic
+ paride: k951 registered as protocol 1
+ paride: k971 registered as protocol 2
+
+Finally, you can load high-level drivers for each kind of device that
+you have connected. By default, each driver will autoprobe for a single
+device, but you can support up to four similar devices by giving their
+individual co-ordinates when you load the driver.
+
+For example, if you had two no-name CD-ROM drives both using the
+KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc
+you could give the following command::
+
+ # insmod pcd drive0=0x378,1 drive1=0x3bc,1
+
+For most adapters, giving a port address and protocol number is sufficient,
+but check the source files in linux/drivers/block/paride for more
+information. (Hopefully someone will write some man pages one day !).
+
+As another example, here's what happens when PARPORT is installed, and
+a SyQuest EZ-135 is attached to port 0x378::
+
+ # insmod paride
+ paride: version 1.0 installed
+ # insmod epat
+ paride: epat registered as protocol 0
+ # insmod pd
+ pd: pd version 1.0, major 45, cluster 64, nice 0
+ pda: Sharing parport1 at 0x378
+ pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1
+ pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media
+ pda: pda1
+
+Note that the last line is the output from the generic partition table
+scanner - in this case it reports that it has found a disk with one partition.
+
+2.3 Using a PARIDE device
+--------------------------
+
+Once the drivers have been loaded, you can access PARIDE devices in the
+same way as their traditional counterparts. You will probably need to
+create the device "special files". Here is a simple script that you can
+cut to a file and execute::
+
+ #!/bin/bash
+ #
+ # mkd -- a script to create the device special files for the PARIDE subsystem
+ #
+ function mkdev {
+ mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
+ }
+ #
+ function pd {
+ D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
+ mkdev pd$D b 45 $[ $1 * 16 ]
+ for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
+ done
+ }
+ #
+ cd /dev
+ #
+ for u in 0 1 2 3 ; do pd $u ; done
+ for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done
+ for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done
+ for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done
+ for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done
+ for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done
+ #
+ # end of mkd
+
+With the device files and drivers in place, you can access PARIDE devices
+like any other Linux device. For example, to mount a CD-ROM in pcd0, use::
+
+ mount /dev/pcd0 /cdrom
+
+If you have a fresh Avatar Shark cartridge, and the drive is pda, you
+might do something like::
+
+ fdisk /dev/pda -- make a new partition table with
+ partition 1 of type 83
+
+ mke2fs /dev/pda1 -- to build the file system
+
+ mkdir /shark -- make a place to mount the disk
+
+ mount /dev/pda1 /shark
+
+Devices like the Imation superdisk work in the same way, except that
+they do not have a partition table. For example to make a 120MB
+floppy that you could share with a DOS system::
+
+ mkdosfs /dev/pf0
+ mount /dev/pf0 /mnt
+
+
+2.4 The pf driver
+------------------
+
+The pf driver is intended for use with parallel port ATAPI disk
+devices. The most common devices in this category are PD drives
+and LS-120 drives. Traditionally, media for these devices are not
+partitioned. Consequently, the pf driver does not support partitioned
+media. This may be changed in a future version of the driver.
+
+2.5 Using the pt driver
+------------------------
+
+The pt driver for parallel port ATAPI tape drives is a minimal driver.
+It does not yet support many of the standard tape ioctl operations.
+For best performance, a block size of 32KB should be used. You will
+probably want to set the parallel port delay to 0, if you can.
+
+2.6 Using the pg driver
+------------------------
+
+The pg driver can be used in conjunction with the cdrecord program
+to create CD-ROMs. Please get cdrecord version 1.6.1 or later
+from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media
+your parallel port should ideally be set to EPP mode, and the "port delay"
+should be set to 0. With those settings it is possible to record at 2x
+speed without any buffer underruns. If you cannot get the driver to work
+in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
+
+
+3. Troubleshooting
+==================
+
+3.1 Use EPP mode if you can
+----------------------------
+
+The most common problems that people report with the PARIDE drivers
+concern the parallel port CMOS settings. At this time, none of the
+PARIDE protocol modules support ECP mode, or any ECP combination modes.
+If you are able to do so, please set your parallel port into EPP mode
+using your CMOS setup procedure.
+
+3.2 Check the port delay
+-------------------------
+
+Some parallel ports cannot reliably transfer data at full speed. To
+offset the errors, the PARIDE protocol modules introduce a "port
+delay" between each access to the i/o ports. Each protocol sets
+a default value for this delay. In most cases, the user can override
+the default and set it to 0 - resulting in somewhat higher transfer
+rates. In some rare cases (especially with older 486 systems) the
+default delays are not long enough. if you experience corrupt data
+transfers, or unexpected failures, you may wish to increase the
+port delay. The delay can be programmed using the "driveN" parameters
+to each of the high-level drivers. Please see the notes above, or
+read the comments at the beginning of the driver source files in
+linux/drivers/block/paride.
+
+3.3 Some drives need a printer reset
+-------------------------------------
+
+There appear to be a number of "noname" external drives on the market
+that do not always power up correctly. We have noticed this with some
+drives based on OnSpec and older Freecom adapters. In these rare cases,
+the adapter can often be reinitialised by issuing a "printer reset" on
+the parallel port. As the reset operation is potentially disruptive in
+multiple device environments, the PARIDE drivers will not do it
+automatically. You can however, force a printer reset by doing::
+
+ insmod lp reset=1
+ rmmod lp
+
+If you have one of these marginal cases, you should probably build
+your paride drivers as modules, and arrange to do the printer reset
+before loading the PARIDE drivers.
+
+3.4 Use the verbose option and dmesg if you need help
+------------------------------------------------------
+
+While a lot of testing has gone into these drivers to make them work
+as smoothly as possible, problems will arise. If you do have problems,
+please check all the obvious things first: does the drive work in
+DOS with the manufacturer's drivers ? If that doesn't yield any useful
+clues, then please make sure that only one drive is hooked to your system,
+and that either (a) PARPORT is enabled or (b) no other device driver
+is using your parallel port (check in /proc/ioports). Then, load the
+appropriate drivers (you can load several protocol modules if you want)
+as in::
+
+ # insmod paride
+ # insmod epat
+ # insmod bpck
+ # insmod kbic
+ ...
+ # insmod pd verbose=1
+
+(using the correct driver for the type of device you have, of course).
+The verbose=1 parameter will cause the drivers to log a trace of their
+activity as they attempt to locate your drive.
+
+Use 'dmesg' to capture a log of all the PARIDE messages (any messages
+beginning with paride:, a protocol module's name or a driver's name) and
+include that with your bug report. You can submit a bug report in one
+of two ways. Either send it directly to the author of the PARIDE suite,
+by e-mail to grant@torque.net, or join the linux-parport mailing list
+and post your report there.
+
+3.5 For more information or help
+---------------------------------
+
+You can join the linux-parport mailing list by sending a mail message
+to:
+
+ linux-parport-request@torque.net
+
+with the single word::
+
+ subscribe
+
+in the body of the mail message (not in the subject line). Please be
+sure that your mail program is correctly set up when you do this, as
+the list manager is a robot that will subscribe you using the reply
+address in your mail headers. REMOVE any anti-spam gimmicks you may
+have in your mail headers, when sending mail to the list server.
+
+You might also find some useful information on the linux-parport
+web pages (although they are not always up to date) at
+
+ http://web.archive.org/web/%2E/http://www.torque.net/parport/
diff --git a/Documentation/admin-guide/blockdev/ramdisk.rst b/Documentation/admin-guide/blockdev/ramdisk.rst
new file mode 100644
index 000000000000..b7c2268f8dec
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/ramdisk.rst
@@ -0,0 +1,177 @@
+==========================================
+Using the RAM disk block device with Linux
+==========================================
+
+.. Contents:
+
+ 1) Overview
+ 2) Kernel Command Line Parameters
+ 3) Using "rdev -r"
+ 4) An Example of Creating a Compressed RAM Disk
+
+
+1) Overview
+-----------
+
+The RAM disk driver is a way to use main system memory as a block device. It
+is required for initrd, an initial filesystem used if you need to load modules
+in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can
+also be used for a temporary filesystem for crypto work, since the contents
+are erased on reboot.
+
+The RAM disk dynamically grows as more space is required. It does this by using
+RAM from the buffer cache. The driver marks the buffers it is using as dirty
+so that the VM subsystem does not try to reclaim them later.
+
+The RAM disk supports up to 16 RAM disks by default, and can be reconfigured
+to support an unlimited number of RAM disks (at your own risk). Just change
+the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu
+and (re)build the kernel.
+
+To use RAM disk support with your system, run './MAKEDEV ram' from the /dev
+directory. RAM disks are all major number 1, and start with minor number 0
+for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd.
+
+The new RAM disk also has the ability to load compressed RAM disk images,
+allowing one to squeeze more programs onto an average installation or
+rescue floppy disk.
+
+
+2) Parameters
+---------------------------------
+
+2a) Kernel Command Line Parameters
+
+ ramdisk_size=N
+ Size of the ramdisk.
+
+This parameter tells the RAM disk driver to set up RAM disks of N k size. The
+default is 4096 (4 MB).
+
+2b) Module parameters
+
+ rd_nr
+ /dev/ramX devices created.
+
+ max_part
+ Maximum partition number.
+
+ rd_size
+ See ramdisk_size.
+
+3) Using "rdev -r"
+------------------
+
+The usage of the word (two bytes) that "rdev -r" sets in the kernel image is
+as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up
+to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit
+14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a
+prompt/wait sequence is to be given before trying to read the RAM disk. Since
+the RAM disk dynamically grows as data is being written into it, a size field
+is not required. Bits 11 to 13 are not currently used and may as well be zero.
+These numbers are no magical secrets, as seen below::
+
+ ./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF
+ ./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000
+ ./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000
+
+Consider a typical two floppy disk setup, where you will have the
+kernel on disk one, and have already put a RAM disk image onto disk #2.
+
+Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk
+starts at an offset of 0 kB from the beginning of the floppy.
+The command line equivalent is: "ramdisk_start=0"
+
+You want bit 14 as one, indicating that a RAM disk is to be loaded.
+The command line equivalent is: "load_ramdisk=1"
+
+You want bit 15 as one, indicating that you want a prompt/keypress
+sequence so that you have a chance to switch floppy disks.
+The command line equivalent is: "prompt_ramdisk=1"
+
+Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word.
+So to create disk one of the set, you would do::
+
+ /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0
+ /usr/src/linux# rdev /dev/fd0 /dev/fd0
+ /usr/src/linux# rdev -r /dev/fd0 49152
+
+If you make a boot disk that has LILO, then for the above, you would use::
+
+ append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1"
+
+Since the default start = 0 and the default prompt = 1, you could use::
+
+ append = "load_ramdisk=1"
+
+
+4) An Example of Creating a Compressed RAM Disk
+-----------------------------------------------
+
+To create a RAM disk image, you will need a spare block device to
+construct it on. This can be the RAM disk device itself, or an
+unused disk partition (such as an unmounted swap partition). For this
+example, we will use the RAM disk device, "/dev/ram0".
+
+Note: This technique should not be done on a machine with less than 8 MB
+of RAM. If using a spare disk partition instead of /dev/ram0, then this
+restriction does not apply.
+
+a) Decide on the RAM disk size that you want. Say 2 MB for this example.
+ Create it by writing to the RAM disk device. (This step is not currently
+ required, but may be in the future.) It is wise to zero out the
+ area (esp. for disks) so that maximal compression is achieved for
+ the unused blocks of the image that you are about to create::
+
+ dd if=/dev/zero of=/dev/ram0 bs=1k count=2048
+
+b) Make a filesystem on it. Say ext2fs for this example::
+
+ mke2fs -vm0 /dev/ram0 2048
+
+c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...)
+ and unmount it again.
+
+d) Compress the contents of the RAM disk. The level of compression
+ will be approximately 50% of the space used by the files. Unused
+ space on the RAM disk will compress to almost nothing::
+
+ dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz
+
+e) Put the kernel onto the floppy::
+
+ dd if=zImage of=/dev/fd0 bs=1k
+
+f) Put the RAM disk image onto the floppy, after the kernel. Use an offset
+ that is slightly larger than the kernel, so that you can put another
+ (possibly larger) kernel onto the same floppy later without overlapping
+ the RAM disk image. An offset of 400 kB for kernels about 350 kB in
+ size would be reasonable. Make sure offset+size of ram_image.gz is
+ not larger than the total space on your floppy (usually 1440 kB)::
+
+ dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400
+
+g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc.
+ For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would
+ have 2^15 + 2^14 + 400 = 49552::
+
+ rdev /dev/fd0 /dev/fd0
+ rdev -r /dev/fd0 49552
+
+That is it. You now have your boot/root compressed RAM disk floppy. Some
+users may wish to combine steps (d) and (f) by using a pipe.
+
+
+ Paul Gortmaker 12/95
+
+Changelog:
+----------
+
+10-22-04 :
+ Updated to reflect changes in command line options, remove
+ obsolete references, general cleanup.
+ James Nelson (james4765@gmail.com)
+
+
+12-95 :
+ Original Document
diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst
new file mode 100644
index 000000000000..6eccf13219ff
--- /dev/null
+++ b/Documentation/admin-guide/blockdev/zram.rst
@@ -0,0 +1,422 @@
+========================================
+zram: Compressed RAM based block devices
+========================================
+
+Introduction
+============
+
+The zram module creates RAM based block devices named /dev/zram<id>
+(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
+in memory itself. These disks allow very fast I/O and compression provides
+good amounts of memory savings. Some of the usecases include /tmp storage,
+use as swap disks, various caches under /var and maybe many more :)
+
+Statistics for individual zram devices are exported through sysfs nodes at
+/sys/block/zram<id>/
+
+Usage
+=====
+
+There are several ways to configure and manage zram device(-s):
+
+a) using zram and zram_control sysfs attributes
+b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org).
+
+In this document we will describe only 'manual' zram configuration steps,
+IOW, zram and zram_control sysfs attributes.
+
+In order to get a better idea about zramctl please consult util-linux
+documentation, zramctl man-page or `zramctl --help`. Please be informed
+that zram maintainers do not develop/maintain util-linux or zramctl, should
+you have any questions please contact util-linux@vger.kernel.org
+
+Following shows a typical sequence of steps for using zram.
+
+WARNING
+=======
+
+For the sake of simplicity we skip error checking parts in most of the
+examples below. However, it is your sole responsibility to handle errors.
+
+zram sysfs attributes always return negative values in case of errors.
+The list of possible return codes:
+
+======== =============================================================
+-EBUSY an attempt to modify an attribute that cannot be changed once
+ the device has been initialised. Please reset device first;
+-ENOMEM zram was not able to allocate enough memory to fulfil your
+ needs;
+-EINVAL invalid input has been provided.
+======== =============================================================
+
+If you use 'echo', the returned value that is changed by 'echo' utility,
+and, in general case, something like::
+
+ echo 3 > /sys/block/zram0/max_comp_streams
+ if [ $? -ne 0 ];
+ handle_error
+ fi
+
+should suffice.
+
+1) Load Module
+==============
+
+::
+
+ modprobe zram num_devices=4
+ This creates 4 devices: /dev/zram{0,1,2,3}
+
+num_devices parameter is optional and tells zram how many devices should be
+pre-created. Default: 1.
+
+2) Set max number of compression streams
+========================================
+
+Regardless the value passed to this attribute, ZRAM will always
+allocate multiple compression streams - one per online CPUs - thus
+allowing several concurrent compression operations. The number of
+allocated compression streams goes down when some of the CPUs
+become offline. There is no single-compression-stream mode anymore,
+unless you are running a UP system or has only 1 CPU online.
+
+To find out how many streams are currently available::
+
+ cat /sys/block/zram0/max_comp_streams
+
+3) Select compression algorithm
+===============================
+
+Using comp_algorithm device attribute one can see available and
+currently selected (shown in square brackets) compression algorithms,
+change selected compression algorithm (once the device is initialised
+there is no way to change compression algorithm).
+
+Examples::
+
+ #show supported compression algorithms
+ cat /sys/block/zram0/comp_algorithm
+ lzo [lz4]
+
+ #select lzo compression algorithm
+ echo lzo > /sys/block/zram0/comp_algorithm
+
+For the time being, the `comp_algorithm` content does not necessarily
+show every compression algorithm supported by the kernel. We keep this
+list primarily to simplify device configuration and one can configure
+a new device with a compression algorithm that is not listed in
+`comp_algorithm`. The thing is that, internally, ZRAM uses Crypto API
+and, if some of the algorithms were built as modules, it's impossible
+to list all of them using, for instance, /proc/crypto or any other
+method. This, however, has an advantage of permitting the usage of
+custom crypto compression modules (implementing S/W or H/W compression).
+
+4) Set Disksize
+===============
+
+Set disk size by writing the value to sysfs node 'disksize'.
+The value can be either in bytes or you can use mem suffixes.
+Examples::
+
+ # Initialize /dev/zram0 with 50MB disksize
+ echo $((50*1024*1024)) > /sys/block/zram0/disksize
+
+ # Using mem suffixes
+ echo 256K > /sys/block/zram0/disksize
+ echo 512M > /sys/block/zram0/disksize
+ echo 1G > /sys/block/zram0/disksize
+
+Note:
+There is little point creating a zram of greater than twice the size of memory
+since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
+size of the disk when not in use so a huge zram is wasteful.
+
+5) Set memory limit: Optional
+=============================
+
+Set memory limit by writing the value to sysfs node 'mem_limit'.
+The value can be either in bytes or you can use mem suffixes.
+In addition, you could change the value in runtime.
+Examples::
+
+ # limit /dev/zram0 with 50MB memory
+ echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
+
+ # Using mem suffixes
+ echo 256K > /sys/block/zram0/mem_limit
+ echo 512M > /sys/block/zram0/mem_limit
+ echo 1G > /sys/block/zram0/mem_limit
+
+ # To disable memory limit
+ echo 0 > /sys/block/zram0/mem_limit
+
+6) Activate
+===========
+
+::
+
+ mkswap /dev/zram0
+ swapon /dev/zram0
+
+ mkfs.ext4 /dev/zram1
+ mount /dev/zram1 /tmp
+
+7) Add/remove zram devices
+==========================
+
+zram provides a control interface, which enables dynamic (on-demand) device
+addition and removal.
+
+In order to add a new /dev/zramX device, perform read operation on hot_add
+attribute. This will return either new device's device id (meaning that you
+can use /dev/zram<id>) or error code.
+
+Example::
+
+ cat /sys/class/zram-control/hot_add
+ 1
+
+To remove the existing /dev/zramX device (where X is a device id)
+execute::
+
+ echo X > /sys/class/zram-control/hot_remove
+
+8) Stats
+========
+
+Per-device statistics are exported as various nodes under /sys/block/zram<id>/
+
+A brief description of exported device attributes. For more details please
+read Documentation/ABI/testing/sysfs-block-zram.
+
+====================== ====== ===============================================
+Name access description
+====================== ====== ===============================================
+disksize RW show and set the device's disk size
+initstate RO shows the initialization state of the device
+reset WO trigger device reset
+mem_used_max WO reset the `mem_used_max` counter (see later)
+mem_limit WO specifies the maximum amount of memory ZRAM can
+ use to store the compressed data
+writeback_limit WO specifies the maximum amount of write IO zram
+ can write out to backing device as 4KB unit
+writeback_limit_enable RW show and set writeback_limit feature
+max_comp_streams RW the number of possible concurrent compress
+ operations
+comp_algorithm RW show and change the compression algorithm
+compact WO trigger memory compaction
+debug_stat RO this file is used for zram debugging purposes
+backing_dev RW set up backend storage for zram to write out
+idle WO mark allocated slot as idle
+====================== ====== ===============================================
+
+
+User space is advised to use the following files to read the device statistics.
+
+File /sys/block/zram<id>/stat
+
+Represents block layer statistics. Read Documentation/block/stat.rst for
+details.
+
+File /sys/block/zram<id>/io_stat
+
+The stat file represents device's I/O statistics not accounted by block
+layer and, thus, not available in zram<id>/stat file. It consists of a
+single line of text and contains the following stats separated by
+whitespace:
+
+ ============= =============================================================
+ failed_reads The number of failed reads
+ failed_writes The number of failed writes
+ invalid_io The number of non-page-size-aligned I/O requests
+ notify_free Depending on device usage scenario it may account
+
+ a) the number of pages freed because of swap slot free
+ notifications
+ b) the number of pages freed because of
+ REQ_OP_DISCARD requests sent by bio. The former ones are
+ sent to a swap block device when a swap slot is freed,
+ which implies that this disk is being used as a swap disk.
+
+ The latter ones are sent by filesystem mounted with
+ discard option, whenever some data blocks are getting
+ discarded.
+ ============= =============================================================
+
+File /sys/block/zram<id>/mm_stat
+
+The stat file represents device's mm statistics. It consists of a single
+line of text and contains the following stats separated by whitespace:
+
+ ================ =============================================================
+ orig_data_size uncompressed size of data stored in this disk.
+ This excludes same-element-filled pages (same_pages) since
+ no memory is allocated for them.
+ Unit: bytes
+ compr_data_size compressed size of data stored in this disk
+ mem_used_total the amount of memory allocated for this disk. This
+ includes allocator fragmentation and metadata overhead,
+ allocated for this disk. So, allocator space efficiency
+ can be calculated using compr_data_size and this statistic.
+ Unit: bytes
+ mem_limit the maximum amount of memory ZRAM can use to store
+ the compressed data
+ mem_used_max the maximum amount of memory zram have consumed to
+ store the data
+ same_pages the number of same element filled pages written to this disk.
+ No memory is allocated for such pages.
+ pages_compacted the number of pages freed during compaction
+ huge_pages the number of incompressible pages
+ ================ =============================================================
+
+File /sys/block/zram<id>/bd_stat
+
+The stat file represents device's backing device statistics. It consists of
+a single line of text and contains the following stats separated by whitespace:
+
+ ============== =============================================================
+ bd_count size of data written in backing device.
+ Unit: 4K bytes
+ bd_reads the number of reads from backing device
+ Unit: 4K bytes
+ bd_writes the number of writes to backing device
+ Unit: 4K bytes
+ ============== =============================================================
+
+9) Deactivate
+=============
+
+::
+
+ swapoff /dev/zram0
+ umount /dev/zram1
+
+10) Reset
+=========
+
+ Write any positive value to 'reset' sysfs node::
+
+ echo 1 > /sys/block/zram0/reset
+ echo 1 > /sys/block/zram1/reset
+
+ This frees all the memory allocated for the given device and
+ resets the disksize to zero. You must set the disksize again
+ before reusing the device.
+
+Optional Feature
+================
+
+writeback
+---------
+
+With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page
+to backing storage rather than keeping it in memory.
+To use the feature, admin should set up backing device via::
+
+ echo /dev/sda5 > /sys/block/zramX/backing_dev
+
+before disksize setting. It supports only partition at this moment.
+If admin want to use incompressible page writeback, they could do via::
+
+ echo huge > /sys/block/zramX/write
+
+To use idle page writeback, first, user need to declare zram pages
+as idle::
+
+ echo all > /sys/block/zramX/idle
+
+From now on, any pages on zram are idle pages. The idle mark
+will be removed until someone request access of the block.
+IOW, unless there is access request, those pages are still idle pages.
+
+Admin can request writeback of those idle pages at right timing via::
+
+ echo idle > /sys/block/zramX/writeback
+
+With the command, zram writeback idle pages from memory to the storage.
+
+If there are lots of write IO with flash device, potentially, it has
+flash wearout problem so that admin needs to design write limitation
+to guarantee storage health for entire product life.
+
+To overcome the concern, zram supports "writeback_limit" feature.
+The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
+any writeback. IOW, if admin want to apply writeback budget, he should
+enable writeback_limit_enable via::
+
+ $ echo 1 > /sys/block/zramX/writeback_limit_enable
+
+Once writeback_limit_enable is set, zram doesn't allow any writeback
+until admin set the budget via /sys/block/zramX/writeback_limit.
+
+(If admin doesn't enable writeback_limit_enable, writeback_limit's value
+assigned via /sys/block/zramX/writeback_limit is meaninless.)
+
+If admin want to limit writeback as per-day 400M, he could do it
+like below::
+
+ $ MB_SHIFT=20
+ $ 4K_SHIFT=12
+ $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
+ /sys/block/zram0/writeback_limit.
+ $ echo 1 > /sys/block/zram0/writeback_limit_enable
+
+If admin want to allow further write again once the bugdet is exausted,
+he could do it like below::
+
+ $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
+ /sys/block/zram0/writeback_limit
+
+If admin want to see remaining writeback budget since he set::
+
+ $ cat /sys/block/zramX/writeback_limit
+
+If admin want to disable writeback limit, he could do::
+
+ $ echo 0 > /sys/block/zramX/writeback_limit_enable
+
+The writeback_limit count will reset whenever you reset zram(e.g.,
+system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
+writeback happened until you reset the zram to allocate extra writeback
+budget in next setting is user's job.
+
+If admin want to measure writeback count in a certain period, he could
+know it via /sys/block/zram0/bd_stat's 3rd column.
+
+memory tracking
+===============
+
+With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
+zram block. It could be useful to catch cold or incompressible
+pages of the process with*pagemap.
+
+If you enable the feature, you could see block state via
+/sys/kernel/debug/zram/zram0/block_state". The output is as follows::
+
+ 300 75.033841 .wh.
+ 301 63.806904 s...
+ 302 63.806919 ..hi
+
+First column
+ zram's block index.
+Second column
+ access time since the system was booted
+Third column
+ state of the block:
+
+ s:
+ same page
+ w:
+ written page to backing store
+ h:
+ huge page
+ i:
+ idle page
+
+First line of above example says 300th block is accessed at 75.033841sec
+and the block's state is huge so it is written back to the backing
+storage. It's a debugging feature so anyone shouldn't rely on it to work
+properly.
+
+Nitin Gupta
+ngupta@vflare.org
diff --git a/Documentation/btmrvl.txt b/Documentation/admin-guide/btmrvl.rst
index ec57740ead0c..ec57740ead0c 100644
--- a/Documentation/btmrvl.txt
+++ b/Documentation/admin-guide/btmrvl.rst
diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst
index b761aa2a51d2..44b8a4edd348 100644
--- a/Documentation/admin-guide/bug-hunting.rst
+++ b/Documentation/admin-guide/bug-hunting.rst
@@ -90,9 +90,9 @@ the disk is not available then you have three options:
run a null modem to a second machine and capture the output there
using your favourite communication program. Minicom works well.
-(3) Use Kdump (see Documentation/kdump/kdump.rst),
+(3) Use Kdump (see Documentation/admin-guide/kdump/kdump.rst),
extract the kernel ring buffer from old memory with using dmesg
- gdbmacro in Documentation/kdump/gdbmacros.txt.
+ gdbmacro in Documentation/admin-guide/kdump/gdbmacros.txt.
Finding the bug's location
--------------------------
diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
new file mode 100644
index 000000000000..1d7d962933be
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst
@@ -0,0 +1,302 @@
+===================
+Block IO Controller
+===================
+
+Overview
+========
+cgroup subsys "blkio" implements the block io controller. There seems to be
+a need of various kinds of IO control policies (like proportional BW, max BW)
+both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
+Plan is to use the same cgroup based management interface for blkio controller
+and based on user options switch IO policies in the background.
+
+One IO control policy is throttling policy which can be used to
+specify upper IO rate limits on devices. This policy is implemented in
+generic block layer and can be used on leaf nodes as well as higher
+level logical devices like device mapper.
+
+HOWTO
+=====
+Throttling/Upper Limit policy
+-----------------------------
+- Enable Block IO controller::
+
+ CONFIG_BLK_CGROUP=y
+
+- Enable throttling in block layer::
+
+ CONFIG_BLK_DEV_THROTTLING=y
+
+- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)::
+
+ mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
+
+- Specify a bandwidth rate on particular device for root group. The format
+ for policy is "<major>:<minor> <bytes_per_second>"::
+
+ echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
+
+ Above will put a limit of 1MB/second on reads happening for root group
+ on device having major/minor number 8:16.
+
+- Run dd to read a file and see if rate is throttled to 1MB/s or not::
+
+ # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
+ 1024+0 records in
+ 1024+0 records out
+ 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
+
+ Limits for writes can be put using blkio.throttle.write_bps_device file.
+
+Hierarchical Cgroups
+====================
+
+Throttling implements hierarchy support; however,
+throttling's hierarchy support is enabled iff "sane_behavior" is
+enabled from cgroup side, which currently is a development option and
+not publicly available.
+
+If somebody created a hierarchy like as follows::
+
+ root
+ / \
+ test1 test2
+ |
+ test3
+
+Throttling with "sane_behavior" will handle the
+hierarchy correctly. For throttling, all limits apply
+to the whole subtree while all statistics are local to the IOs
+directly generated by tasks in that cgroup.
+
+Throttling without "sane_behavior" enabled from cgroup side will
+practically treat all groups at same level as if it looks like the
+following::
+
+ pivot
+ / / \ \
+ root test1 test2 test3
+
+Various user visible config options
+===================================
+CONFIG_BLK_CGROUP
+ - Block IO controller.
+
+CONFIG_BFQ_CGROUP_DEBUG
+ - Debug help. Right now some additional stats file show up in cgroup
+ if this option is enabled.
+
+CONFIG_BLK_DEV_THROTTLING
+ - Enable block device throttling support in block layer.
+
+Details of cgroup files
+=======================
+Proportional weight policy files
+--------------------------------
+- blkio.weight
+ - Specifies per cgroup weight. This is default weight of the group
+ on all the devices until and unless overridden by per device rule.
+ (See blkio.weight_device).
+ Currently allowed range of weights is from 10 to 1000.
+
+- blkio.weight_device
+ - One can specify per cgroup per device rules using this interface.
+ These rules override the default value of group weight as specified
+ by blkio.weight.
+
+ Following is the format::
+
+ # echo dev_maj:dev_minor weight > blkio.weight_device
+
+ Configure weight=300 on /dev/sdb (8:16) in this cgroup::
+
+ # echo 8:16 300 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
+ Configure weight=500 on /dev/sda (8:0) in this cgroup::
+
+ # echo 8:0 500 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:0 500
+ 8:16 300
+
+ Remove specific weight for /dev/sda in this cgroup::
+
+ # echo 8:0 0 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
+- blkio.leaf_weight[_device]
+ - Equivalents of blkio.weight[_device] for the purpose of
+ deciding how much weight tasks in the given cgroup has while
+ competing with the cgroup's child cgroups. For details,
+ please refer to Documentation/block/cfq-iosched.txt.
+
+- blkio.time
+ - disk time allocated to cgroup per device in milliseconds. First
+ two fields specify the major and minor number of the device and
+ third field specifies the disk time allocated to group in
+ milliseconds.
+
+- blkio.sectors
+ - number of sectors transferred to/from disk by the group. First
+ two fields specify the major and minor number of the device and
+ third field specifies the number of sectors transferred by the
+ group to/from the device.
+
+- blkio.io_service_bytes
+ - Number of bytes transferred to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of bytes.
+
+- blkio.io_serviced
+ - Number of IOs (bio) issued to the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of IOs.
+
+- blkio.io_service_time
+ - Total amount of time between request dispatch and request completion
+ for the IOs done by this cgroup. This is in nanoseconds to make it
+ meaningful for flash devices too. For devices with queue depth of 1,
+ this time represents the actual service time. When queue_depth > 1,
+ that is no longer true as requests may be served out of order. This
+ may cause the service time for a given IO to include the service time
+ of multiple IOs when served out of order which may result in total
+ io_service_time > actual time elapsed. This time is further divided by
+ the type of operation - read or write, sync or async. First two fields
+ specify the major and minor number of the device, third field
+ specifies the operation type and the fourth field specifies the
+ io_service_time in ns.
+
+- blkio.io_wait_time
+ - Total amount of time the IOs for this cgroup spent waiting in the
+ scheduler queues for service. This can be greater than the total time
+ elapsed since it is cumulative io_wait_time for all IOs. It is not a
+ measure of total time the cgroup spent waiting but rather a measure of
+ the wait_time for its individual IOs. For devices with queue_depth > 1
+ this metric does not include the time spent waiting for service once
+ the IO is dispatched to the device but till it actually gets serviced
+ (there might be a time lag here due to re-ordering of requests by the
+ device). This is in nanoseconds to make it meaningful for flash
+ devices too. This time is further divided by the type of operation -
+ read or write, sync or async. First two fields specify the major and
+ minor number of the device, third field specifies the operation type
+ and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+ - Total number of bios/requests merged into requests belonging to this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.io_queued
+ - Total number of requests queued up at any given instant for this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.avg_queue_size
+ - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+ The average queue size for this cgroup over the entire time of this
+ cgroup's existence. Queue size samples are taken each time one of the
+ queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+ - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+ This is the amount of time the cgroup had to wait since it became busy
+ (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+ its queues. This is different from the io_wait_time which is the
+ cumulative total of the amount of time spent by each IO in that cgroup
+ waiting in the scheduler queue. This is in nanoseconds. If this is
+ read when the cgroup is in a waiting (for timeslice) state, the stat
+ will only report the group_wait_time accumulated till the last time it
+ got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+ - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+ This is the amount of time a cgroup spends without any pending
+ requests when not being served, i.e., it does not include any time
+ spent idling for one of the queues of the cgroup. This is in
+ nanoseconds. If this is read when the cgroup is in an empty state,
+ the stat will only report the empty_time accumulated till the last
+ time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+ - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y.
+ This is the amount of time spent by the IO scheduler idling for a
+ given cgroup in anticipation of a better request than the existing ones
+ from other queues/cgroups. This is in nanoseconds. If this is read
+ when the cgroup is in an idling state, the stat will only report the
+ idle_time accumulated till the last idle period and will not include
+ the current delta.
+
+- blkio.dequeue
+ - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This
+ gives the statistics about how many a times a group was dequeued
+ from service tree of the device. First two fields specify the major
+ and minor number of the device and third field specifies the number
+ of times a group was dequeued from a particular device.
+
+- blkio.*_recursive
+ - Recursive version of various stats. These files show the
+ same information as their non-recursive counterparts but
+ include stats from all the descendant cgroups.
+
+Throttling/Upper limit policy files
+-----------------------------------
+- blkio.throttle.read_bps_device
+ - Specifies upper limit on READ rate from the device. IO rate is
+ specified in bytes per second. Rules are per device. Following is
+ the format::
+
+ echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
+
+- blkio.throttle.write_bps_device
+ - Specifies upper limit on WRITE rate to the device. IO rate is
+ specified in bytes per second. Rules are per device. Following is
+ the format::
+
+ echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
+
+- blkio.throttle.read_iops_device
+ - Specifies upper limit on READ rate from the device. IO rate is
+ specified in IO per second. Rules are per device. Following is
+ the format::
+
+ echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
+
+- blkio.throttle.write_iops_device
+ - Specifies upper limit on WRITE rate to the device. IO rate is
+ specified in io per second. Rules are per device. Following is
+ the format::
+
+ echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
+
+Note: If both BW and IOPS rules are specified for a device, then IO is
+ subjected to both the constraints.
+
+- blkio.throttle.io_serviced
+ - Number of IOs (bio) issued to the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of IOs.
+
+- blkio.throttle.io_service_bytes
+ - Number of bytes transferred to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of bytes.
+
+Common files among various policies
+-----------------------------------
+- blkio.reset_stats
+ - Writing an int to this file will result in resetting all the stats
+ for that cgroup.
diff --git a/Documentation/admin-guide/cgroup-v1/cgroups.rst b/Documentation/admin-guide/cgroup-v1/cgroups.rst
new file mode 100644
index 000000000000..b0688011ed06
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cgroups.rst
@@ -0,0 +1,695 @@
+==============
+Control Groups
+==============
+
+Written by Paul Menage <menage@google.com> based on
+Documentation/admin-guide/cgroup-v1/cpusets.rst
+
+Original copyright statements from cpusets.txt:
+
+Portions Copyright (C) 2004 BULL SA.
+
+Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+
+Modified by Paul Jackson <pj@sgi.com>
+
+Modified by Christoph Lameter <cl@linux.com>
+
+.. CONTENTS:
+
+ 1. Control Groups
+ 1.1 What are cgroups ?
+ 1.2 Why are cgroups needed ?
+ 1.3 How are cgroups implemented ?
+ 1.4 What does notify_on_release do ?
+ 1.5 What does clone_children do ?
+ 1.6 How do I use cgroups ?
+ 2. Usage Examples and Syntax
+ 2.1 Basic Usage
+ 2.2 Attaching processes
+ 2.3 Mounting hierarchies by name
+ 3. Kernel API
+ 3.1 Overview
+ 3.2 Synchronization
+ 3.3 Subsystem API
+ 4. Extended attributes usage
+ 5. Questions
+
+1. Control Groups
+=================
+
+1.1 What are cgroups ?
+----------------------
+
+Control Groups provide a mechanism for aggregating/partitioning sets of
+tasks, and all their future children, into hierarchical groups with
+specialized behaviour.
+
+Definitions:
+
+A *cgroup* associates a set of tasks with a set of parameters for one
+or more subsystems.
+
+A *subsystem* is a module that makes use of the task grouping
+facilities provided by cgroups to treat groups of tasks in
+particular ways. A subsystem is typically a "resource controller" that
+schedules a resource or applies per-cgroup limits, but it may be
+anything that wants to act on a group of processes, e.g. a
+virtualization subsystem.
+
+A *hierarchy* is a set of cgroups arranged in a tree, such that
+every task in the system is in exactly one of the cgroups in the
+hierarchy, and a set of subsystems; each subsystem has system-specific
+state attached to each cgroup in the hierarchy. Each hierarchy has
+an instance of the cgroup virtual filesystem associated with it.
+
+At any one time there may be multiple active hierarchies of task
+cgroups. Each hierarchy is a partition of all tasks in the system.
+
+User-level code may create and destroy cgroups by name in an
+instance of the cgroup virtual file system, specify and query to
+which cgroup a task is assigned, and list the task PIDs assigned to
+a cgroup. Those creations and assignments only affect the hierarchy
+associated with that instance of the cgroup file system.
+
+On their own, the only use for cgroups is for simple job
+tracking. The intention is that other subsystems hook into the generic
+cgroup support to provide new attributes for cgroups, such as
+accounting/limiting the resources which processes in a cgroup can
+access. For example, cpusets (see Documentation/admin-guide/cgroup-v1/cpusets.rst) allow
+you to associate a set of CPUs and a set of memory nodes with the
+tasks in each cgroup.
+
+1.2 Why are cgroups needed ?
+----------------------------
+
+There are multiple efforts to provide process aggregations in the
+Linux kernel, mainly for resource-tracking purposes. Such efforts
+include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
+namespaces. These all require the basic notion of a
+grouping/partitioning of processes, with newly forked processes ending
+up in the same group (cgroup) as their parent process.
+
+The kernel cgroup patch provides the minimum essential kernel
+mechanisms required to efficiently implement such groups. It has
+minimal impact on the system fast paths, and provides hooks for
+specific subsystems such as cpusets to provide additional behaviour as
+desired.
+
+Multiple hierarchy support is provided to allow for situations where
+the division of tasks into cgroups is distinctly different for
+different subsystems - having parallel hierarchies allows each
+hierarchy to be a natural division of tasks, without having to handle
+complex combinations of tasks that would be present if several
+unrelated subsystems needed to be forced into the same tree of
+cgroups.
+
+At one extreme, each resource controller or subsystem could be in a
+separate hierarchy; at the other extreme, all subsystems
+would be attached to the same hierarchy.
+
+As an example of a scenario (originally proposed by vatsa@in.ibm.com)
+that can benefit from multiple hierarchies, consider a large
+university server with various users - students, professors, system
+tasks etc. The resource planning for this server could be along the
+following lines::
+
+ CPU : "Top cpuset"
+ / \
+ CPUSet1 CPUSet2
+ | |
+ (Professors) (Students)
+
+ In addition (system tasks) are attached to topcpuset (so
+ that they can run anywhere) with a limit of 20%
+
+ Memory : Professors (50%), Students (30%), system (20%)
+
+ Disk : Professors (50%), Students (30%), system (20%)
+
+ Network : WWW browsing (20%), Network File System (60%), others (20%)
+ / \
+ Professors (15%) students (5%)
+
+Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
+into the NFS network class.
+
+At the same time Firefox/Lynx will share an appropriate CPU/Memory class
+depending on who launched it (prof/student).
+
+With the ability to classify tasks differently for different resources
+(by putting those resource subsystems in different hierarchies),
+the admin can easily set up a script which receives exec notifications
+and depending on who is launching the browser he can::
+
+ # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
+
+With only a single hierarchy, he now would potentially have to create
+a separate cgroup for every browser launched and associate it with
+appropriate network and other resource class. This may lead to
+proliferation of such cgroups.
+
+Also let's say that the administrator would like to give enhanced network
+access temporarily to a student's browser (since it is night and the user
+wants to do online gaming :)) OR give one of the student's simulation
+apps enhanced CPU power.
+
+With ability to write PIDs directly to resource classes, it's just a
+matter of::
+
+ # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
+ (after some time)
+ # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
+
+Without this ability, the administrator would have to split the cgroup into
+multiple separate ones and then associate the new cgroups with the
+new resource classes.
+
+
+
+1.3 How are cgroups implemented ?
+---------------------------------
+
+Control Groups extends the kernel as follows:
+
+ - Each task in the system has a reference-counted pointer to a
+ css_set.
+
+ - A css_set contains a set of reference-counted pointers to
+ cgroup_subsys_state objects, one for each cgroup subsystem
+ registered in the system. There is no direct link from a task to
+ the cgroup of which it's a member in each hierarchy, but this
+ can be determined by following pointers through the
+ cgroup_subsys_state objects. This is because accessing the
+ subsystem state is something that's expected to happen frequently
+ and in performance-critical code, whereas operations that require a
+ task's actual cgroup assignments (in particular, moving between
+ cgroups) are less common. A linked list runs through the cg_list
+ field of each task_struct using the css_set, anchored at
+ css_set->tasks.
+
+ - A cgroup hierarchy filesystem can be mounted for browsing and
+ manipulation from user space.
+
+ - You can list all the tasks (by PID) attached to any cgroup.
+
+The implementation of cgroups requires a few, simple hooks
+into the rest of the kernel, none in performance-critical paths:
+
+ - in init/main.c, to initialize the root cgroups and initial
+ css_set at system boot.
+
+ - in fork and exit, to attach and detach a task from its css_set.
+
+In addition, a new file system of type "cgroup" may be mounted, to
+enable browsing and modifying the cgroups presently known to the
+kernel. When mounting a cgroup hierarchy, you may specify a
+comma-separated list of subsystems to mount as the filesystem mount
+options. By default, mounting the cgroup filesystem attempts to
+mount a hierarchy containing all registered subsystems.
+
+If an active hierarchy with exactly the same set of subsystems already
+exists, it will be reused for the new mount. If no existing hierarchy
+matches, and any of the requested subsystems are in use in an existing
+hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
+is activated, associated with the requested subsystems.
+
+It's not currently possible to bind a new subsystem to an active
+cgroup hierarchy, or to unbind a subsystem from an active cgroup
+hierarchy. This may be possible in future, but is fraught with nasty
+error-recovery issues.
+
+When a cgroup filesystem is unmounted, if there are any
+child cgroups created below the top-level cgroup, that hierarchy
+will remain active even though unmounted; if there are no
+child cgroups then the hierarchy will be deactivated.
+
+No new system calls are added for cgroups - all support for
+querying and modifying cgroups is via this cgroup file system.
+
+Each task under /proc has an added file named 'cgroup' displaying,
+for each active hierarchy, the subsystem names and the cgroup name
+as the path relative to the root of the cgroup file system.
+
+Each cgroup is represented by a directory in the cgroup file system
+containing the following files describing that cgroup:
+
+ - tasks: list of tasks (by PID) attached to that cgroup. This list
+ is not guaranteed to be sorted. Writing a thread ID into this file
+ moves the thread into this cgroup.
+ - cgroup.procs: list of thread group IDs in the cgroup. This list is
+ not guaranteed to be sorted or free of duplicate TGIDs, and userspace
+ should sort/uniquify the list if this property is required.
+ Writing a thread group ID into this file moves all threads in that
+ group into this cgroup.
+ - notify_on_release flag: run the release agent on exit?
+ - release_agent: the path to use for release notifications (this file
+ exists in the top cgroup only)
+
+Other subsystems such as cpusets may add additional files in each
+cgroup dir.
+
+New cgroups are created using the mkdir system call or shell
+command. The properties of a cgroup, such as its flags, are
+modified by writing to the appropriate file in that cgroups
+directory, as listed above.
+
+The named hierarchical structure of nested cgroups allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cgroup allows organizing the work load
+on a system into related sets of tasks. A task may be re-attached to
+any other cgroup, if allowed by the permissions on the necessary
+cgroup file system directories.
+
+When a task is moved from one cgroup to another, it gets a new
+css_set pointer - if there's an already existing css_set with the
+desired collection of cgroups then that group is reused, otherwise a new
+css_set is allocated. The appropriate existing css_set is located by
+looking into a hash table.
+
+To allow access from a cgroup to the css_sets (and hence tasks)
+that comprise it, a set of cg_cgroup_link objects form a lattice;
+each cg_cgroup_link is linked into a list of cg_cgroup_links for
+a single cgroup on its cgrp_link_list field, and a list of
+cg_cgroup_links for a single css_set on its cg_link_list.
+
+Thus the set of tasks in a cgroup can be listed by iterating over
+each css_set that references the cgroup, and sub-iterating over
+each css_set's task set.
+
+The use of a Linux virtual file system (vfs) to represent the
+cgroup hierarchy provides for a familiar permission and name space
+for cgroups, with a minimum of additional kernel code.
+
+1.4 What does notify_on_release do ?
+------------------------------------
+
+If the notify_on_release flag is enabled (1) in a cgroup, then
+whenever the last task in the cgroup leaves (exits or attaches to
+some other cgroup) and the last child cgroup of that cgroup
+is removed, then the kernel runs the command specified by the contents
+of the "release_agent" file in that hierarchy's root directory,
+supplying the pathname (relative to the mount point of the cgroup
+file system) of the abandoned cgroup. This enables automatic
+removal of abandoned cgroups. The default value of
+notify_on_release in the root cgroup at system boot is disabled
+(0). The default value of other cgroups at creation is the current
+value of their parents' notify_on_release settings. The default value of
+a cgroup hierarchy's release_agent path is empty.
+
+1.5 What does clone_children do ?
+---------------------------------
+
+This flag only affects the cpuset controller. If the clone_children
+flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
+configuration from the parent during initialization.
+
+1.6 How do I use cgroups ?
+--------------------------
+
+To start a new job that is to be contained within a cgroup, using
+the "cpuset" cgroup subsystem, the steps are something like::
+
+ 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
+ 2) mkdir /sys/fs/cgroup/cpuset
+ 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
+ the /sys/fs/cgroup/cpuset virtual file system.
+ 5) Start a task that will be the "founding father" of the new job.
+ 6) Attach that task to the new cgroup by writing its PID to the
+ /sys/fs/cgroup/cpuset tasks file for that cgroup.
+ 7) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cgroup
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cgroup::
+
+ mount -t tmpfs cgroup_root /sys/fs/cgroup
+ mkdir /sys/fs/cgroup/cpuset
+ mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
+ cd /sys/fs/cgroup/cpuset
+ mkdir Charlie
+ cd Charlie
+ /bin/echo 2-3 > cpuset.cpus
+ /bin/echo 1 > cpuset.mems
+ /bin/echo $$ > tasks
+ sh
+ # The subshell 'sh' is now running in cgroup Charlie
+ # The next line should display '/Charlie'
+ cat /proc/self/cgroup
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using cgroups can be done through the cgroup
+virtual filesystem.
+
+To mount a cgroup hierarchy with all available subsystems, type::
+
+ # mount -t cgroup xxx /sys/fs/cgroup
+
+The "xxx" is not interpreted by the cgroup code, but will appear in
+/proc/mounts so may be any useful identifying string that you like.
+
+Note: Some subsystems do not work without some user input first. For instance,
+if cpusets are enabled the user will have to populate the cpus and mems files
+for each new cgroup created before that group can be used.
+
+As explained in section `1.2 Why are cgroups needed?` you should create
+different hierarchies of cgroups for each single resource or group of
+resources you want to control. Therefore, you should mount a tmpfs on
+/sys/fs/cgroup and create directories for each cgroup resource or resource
+group::
+
+ # mount -t tmpfs cgroup_root /sys/fs/cgroup
+ # mkdir /sys/fs/cgroup/rg1
+
+To mount a cgroup hierarchy with just the cpuset and memory
+subsystems, type::
+
+ # mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
+
+While remounting cgroups is currently supported, it is not recommend
+to use it. Remounting allows changing bound subsystems and
+release_agent. Rebinding is hardly useful as it only works when the
+hierarchy is empty and release_agent itself should be replaced with
+conventional fsnotify. The support for remounting will be removed in
+the future.
+
+To Specify a hierarchy's release_agent::
+
+ # mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
+ xxx /sys/fs/cgroup/rg1
+
+Note that specifying 'release_agent' more than once will return failure.
+
+Note that changing the set of subsystems is currently only supported
+when the hierarchy consists of a single (root) cgroup. Supporting
+the ability to arbitrarily bind/unbind subsystems from an existing
+cgroup hierarchy is intended to be implemented in the future.
+
+Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
+tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
+is the cgroup that holds the whole system.
+
+If you want to change the value of release_agent::
+
+ # echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
+
+It can also be changed via remount.
+
+If you want to create a new cgroup under /sys/fs/cgroup/rg1::
+
+ # cd /sys/fs/cgroup/rg1
+ # mkdir my_cgroup
+
+Now you want to do something with this cgroup:
+
+ # cd my_cgroup
+
+In this directory you can find several files::
+
+ # ls
+ cgroup.procs notify_on_release tasks
+ (plus whatever files added by the attached subsystems)
+
+Now attach your shell to this cgroup::
+
+ # /bin/echo $$ > tasks
+
+You can also create cgroups inside your cgroup by using mkdir in this
+directory::
+
+ # mkdir my_sub_cs
+
+To remove a cgroup, just use rmdir::
+
+ # rmdir my_sub_cs
+
+This will fail if the cgroup is in use (has cgroups inside, or
+has processes attached, or is held alive by other subsystem-specific
+reference).
+
+2.2 Attaching processes
+-----------------------
+
+::
+
+ # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+ # /bin/echo PID1 > tasks
+ # /bin/echo PID2 > tasks
+ ...
+ # /bin/echo PIDn > tasks
+
+You can attach the current shell task by echoing 0::
+
+ # echo 0 > tasks
+
+You can use the cgroup.procs file instead of the tasks file to move all
+threads in a threadgroup at once. Echoing the PID of any task in a
+threadgroup to cgroup.procs causes all tasks in that threadgroup to be
+attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
+in the writing task's threadgroup.
+
+Note: Since every task is always a member of exactly one cgroup in each
+mounted hierarchy, to remove a task from its current cgroup you must
+move it into a new cgroup (possibly the root cgroup) by writing to the
+new cgroup's tasks file.
+
+Note: Due to some restrictions enforced by some cgroup subsystems, moving
+a process to another cgroup can fail.
+
+2.3 Mounting hierarchies by name
+--------------------------------
+
+Passing the name=<x> option when mounting a cgroups hierarchy
+associates the given name with the hierarchy. This can be used when
+mounting a pre-existing hierarchy, in order to refer to it by name
+rather than by its set of active subsystems. Each hierarchy is either
+nameless, or has a unique name.
+
+The name should match [\w.-]+
+
+When passing a name=<x> option for a new hierarchy, you need to
+specify subsystems manually; the legacy behaviour of mounting all
+subsystems when none are explicitly specified is not supported when
+you give a subsystem a name.
+
+The name of the subsystem appears as part of the hierarchy description
+in /proc/mounts and /proc/<pid>/cgroups.
+
+
+3. Kernel API
+=============
+
+3.1 Overview
+------------
+
+Each kernel subsystem that wants to hook into the generic cgroup
+system needs to create a cgroup_subsys object. This contains
+various methods, which are callbacks from the cgroup system, along
+with a subsystem ID which will be assigned by the cgroup system.
+
+Other fields in the cgroup_subsys object include:
+
+- subsys_id: a unique array index for the subsystem, indicating which
+ entry in cgroup->subsys[] this subsystem should be managing.
+
+- name: should be initialized to a unique subsystem name. Should be
+ no longer than MAX_CGROUP_TYPE_NAMELEN.
+
+- early_init: indicate if the subsystem needs early initialization
+ at system boot.
+
+Each cgroup object created by the system has an array of pointers,
+indexed by subsystem ID; this pointer is entirely managed by the
+subsystem; the generic cgroup code will never touch this pointer.
+
+3.2 Synchronization
+-------------------
+
+There is a global mutex, cgroup_mutex, used by the cgroup
+system. This should be taken by anything that wants to modify a
+cgroup. It may also be taken to prevent cgroups from being
+modified, but more specific locks may be more appropriate in that
+situation.
+
+See kernel/cgroup.c for more details.
+
+Subsystems can take/release the cgroup_mutex via the functions
+cgroup_lock()/cgroup_unlock().
+
+Accessing a task's cgroup pointer may be done in the following ways:
+- while holding cgroup_mutex
+- while holding the task's alloc_lock (via task_lock())
+- inside an rcu_read_lock() section via rcu_dereference()
+
+3.3 Subsystem API
+-----------------
+
+Each subsystem should:
+
+- add an entry in linux/cgroup_subsys.h
+- define a cgroup_subsys object called <name>_cgrp_subsys
+
+Each subsystem may export the following methods. The only mandatory
+methods are css_alloc/free. Any others that are null are presumed to
+be successful no-ops.
+
+``struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called to allocate a subsystem state object for a cgroup. The
+subsystem should allocate its subsystem state object for the passed
+cgroup, returning a pointer to the new object on success or a
+ERR_PTR() value. On success, the subsystem pointer should point to
+a structure of type cgroup_subsys_state (typically embedded in a
+larger subsystem-specific object), which will be initialized by the
+cgroup system. Note that this will be called at initialization to
+create the root subsystem state for this subsystem; this case can be
+identified by the passed cgroup object having a NULL parent (since
+it's the root of the hierarchy) and may be an appropriate place for
+initialization code.
+
+``int css_online(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+Called after @cgrp successfully completed all allocations and made
+visible to cgroup_for_each_child/descendant_*() iterators. The
+subsystem may choose to fail creation by returning -errno. This
+callback can be used to implement reliable state sharing and
+propagation along the hierarchy. See the comment on
+cgroup_for_each_descendant_pre() for details.
+
+``void css_offline(struct cgroup *cgrp);``
+(cgroup_mutex held by caller)
+
+This is the counterpart of css_online() and called iff css_online()
+has succeeded on @cgrp. This signifies the beginning of the end of
+@cgrp. @cgrp is being removed and the subsystem should start dropping
+all references it's holding on @cgrp. When all references are dropped,
+cgroup removal will proceed to the next step - css_free(). After this
+callback, @cgrp should be considered dead to the subsystem.
+
+``void css_free(struct cgroup *cgrp)``
+(cgroup_mutex held by caller)
+
+The cgroup system is about to free @cgrp; the subsystem should free
+its subsystem state object. By the time this method is called, @cgrp
+is completely unused; @cgrp->parent is still valid. (Note - can also
+be called for a newly-created cgroup if an error occurs after this
+subsystem's create() method has been called for the new cgroup).
+
+``int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called prior to moving one or more tasks into a cgroup; if the
+subsystem returns an error, this will abort the attach operation.
+@tset contains the tasks to be attached and is guaranteed to have at
+least one task in it.
+
+If there are multiple tasks in the taskset, then:
+ - it's guaranteed that all are from the same thread group
+ - @tset contains all tasks from the thread group whether or not
+ they're switching cgroups
+ - the first task is the leader
+
+Each @tset entry also contains the task's old cgroup and tasks which
+aren't switching cgroup can be skipped easily using the
+cgroup_taskset_for_each() iterator. Note that this isn't called on a
+fork. If this method returns 0 (success) then this should remain valid
+while the caller holds cgroup_mutex and it is ensured that either
+attach() or cancel_attach() will be called in future.
+
+``void css_reset(struct cgroup_subsys_state *css)``
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state. This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it. cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
+``void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called when a task attach operation has failed after can_attach() has succeeded.
+A subsystem whose can_attach() has some side-effects should provide this
+function, so that the subsystem can implement a rollback. If not, not necessary.
+This will be called only about subsystems whose can_attach() operation have
+succeeded. The parameters are identical to can_attach().
+
+``void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)``
+(cgroup_mutex held by caller)
+
+Called after the task has been attached to the cgroup, to allow any
+post-attachment activity that requires memory allocations or blocking.
+The parameters are identical to can_attach().
+
+``void fork(struct task_struct *task)``
+
+Called when a task is forked into a cgroup.
+
+``void exit(struct task_struct *task)``
+
+Called during task exit.
+
+``void free(struct task_struct *task)``
+
+Called when the task_struct is freed.
+
+``void bind(struct cgroup *root)``
+(cgroup_mutex held by caller)
+
+Called when a cgroup subsystem is rebound to a different hierarchy
+and root cgroup. Currently this will only involve movement between
+the default hierarchy (which never has sub-cgroups) and a hierarchy
+that is being created/destroyed (and hence has no sub-cgroups).
+
+4. Extended attribute usage
+===========================
+
+cgroup filesystem supports certain types of extended attributes in its
+directories and files. The current supported types are:
+
+ - Trusted (XATTR_TRUSTED)
+ - Security (XATTR_SECURITY)
+
+Both require CAP_SYS_ADMIN capability to set.
+
+Like in tmpfs, the extended attributes in cgroup filesystem are stored
+using kernel memory and it's advised to keep the usage at minimum. This
+is the reason why user defined extended attributes are not supported, since
+any user can do it and there's no limit in the value size.
+
+The current known users for this feature are SELinux to limit cgroup usage
+in containers and systemd for assorted meta data like main PID in a cgroup
+(systemd creates a cgroup per service).
+
+5. Questions
+============
+
+::
+
+ Q: what's up with this '/bin/echo' ?
+ A: bash's builtin 'echo' command does not check calls to write() against
+ errors. If you use it in the cgroup file system, you won't be
+ able to tell whether a command succeeded or failed.
+
+ Q: When I attach processes, only the first of the line gets really attached !
+ A: We can only return one error code per call to write(). So you should also
+ put only ONE PID.
diff --git a/Documentation/admin-guide/cgroup-v1/cpuacct.rst b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
new file mode 100644
index 000000000000..d30ed81d2ad7
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpuacct.rst
@@ -0,0 +1,50 @@
+=========================
+CPU Accounting Controller
+=========================
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem::
+
+ # mount -t cgroup -ocpuacct none /sys/fs/cgroup
+
+With the above step, the initial or the parent accounting group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
+by this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /sys/fs/cgroup::
+
+ # cd /sys/fs/cgroup
+ # mkdir g1
+ # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/sys/fs/cgroup/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ unit.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+ This is because percpu_counter_read() on 32bit systems isn't safe
+ against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+ due to the batch processing nature of percpu_counter.
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst
new file mode 100644
index 000000000000..86a6ae995d54
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -0,0 +1,866 @@
+=======
+CPUSETS
+=======
+
+Copyright (C) 2004 BULL SA.
+
+Written by Simon.Derr@bull.net
+
+- Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
+- Modified by Paul Jackson <pj@sgi.com>
+- Modified by Christoph Lameter <cl@linux.com>
+- Modified by Paul Menage <menage@google.com>
+- Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+
+.. CONTENTS:
+
+ 1. Cpusets
+ 1.1 What are cpusets ?
+ 1.2 Why are cpusets needed ?
+ 1.3 How are cpusets implemented ?
+ 1.4 What are exclusive cpusets ?
+ 1.5 What is memory_pressure ?
+ 1.6 What is memory spread ?
+ 1.7 What is sched_load_balance ?
+ 1.8 What is sched_relax_domain_level ?
+ 1.9 How do I use cpusets ?
+ 2. Usage Examples and Syntax
+ 2.1 Basic Usage
+ 2.2 Adding/removing cpus
+ 2.3 Setting flags
+ 2.4 Attaching processes
+ 3. Questions
+ 4. Contact
+
+1. Cpusets
+==========
+
+1.1 What are cpusets ?
+----------------------
+
+Cpusets provide a mechanism for assigning a set of CPUs and Memory
+Nodes to a set of tasks. In this document "Memory Node" refers to
+an on-line node that contains memory.
+
+Cpusets constrain the CPU and Memory placement of tasks to only
+the resources within a task's current cpuset. They form a nested
+hierarchy visible in a virtual file system. These are the essential
+hooks, beyond what is already present, required to manage dynamic
+job placement on large systems.
+
+Cpusets use the generic cgroup subsystem described in
+Documentation/admin-guide/cgroup-v1/cgroups.rst.
+
+Requests by a task, using the sched_setaffinity(2) system call to
+include CPUs in its CPU affinity mask, and using the mbind(2) and
+set_mempolicy(2) system calls to include Memory Nodes in its memory
+policy, are both filtered through that task's cpuset, filtering out any
+CPUs or Memory Nodes not in that cpuset. The scheduler will not
+schedule a task on a CPU that is not allowed in its cpus_allowed
+vector, and the kernel page allocator will not allocate a page on a
+node that is not allowed in the requesting task's mems_allowed vector.
+
+User level code may create and destroy cpusets by name in the cgroup
+virtual file system, manage the attributes and permissions of these
+cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
+specify and query to which cpuset a task is assigned, and list the
+task pids assigned to a cpuset.
+
+
+1.2 Why are cpusets needed ?
+----------------------------
+
+The management of large computer systems, with many processors (CPUs),
+complex memory cache hierarchies and multiple Memory Nodes having
+non-uniform access times (NUMA) presents additional challenges for
+the efficient scheduling and memory placement of processes.
+
+Frequently more modest sized systems can be operated with adequate
+efficiency just by letting the operating system automatically share
+the available CPU and Memory resources amongst the requesting tasks.
+
+But larger systems, which benefit more from careful processor and
+memory placement to reduce memory access times and contention,
+and which typically represent a larger investment for the customer,
+can benefit from explicitly placing jobs on properly sized subsets of
+the system.
+
+This can be especially valuable on:
+
+ * Web Servers running multiple instances of the same web application,
+ * Servers running different applications (for instance, a web server
+ and a database), or
+ * NUMA systems running large HPC applications with demanding
+ performance characteristics.
+
+These subsets, or "soft partitions" must be able to be dynamically
+adjusted, as the job mix changes, without impacting other concurrently
+executing jobs. The location of the running jobs pages may also be moved
+when the memory locations are changed.
+
+The kernel cpuset patch provides the minimum essential kernel
+mechanisms required to efficiently implement such subsets. It
+leverages existing CPU and Memory Placement facilities in the Linux
+kernel to avoid any additional impact on the critical scheduler or
+memory allocator code.
+
+
+1.3 How are cpusets implemented ?
+---------------------------------
+
+Cpusets provide a Linux kernel mechanism to constrain which CPUs and
+Memory Nodes are used by a process or set of processes.
+
+The Linux kernel already has a pair of mechanisms to specify on which
+CPUs a task may be scheduled (sched_setaffinity) and on which Memory
+Nodes it may obtain memory (mbind, set_mempolicy).
+
+Cpusets extends these two mechanisms as follows:
+
+ - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
+ kernel.
+ - Each task in the system is attached to a cpuset, via a pointer
+ in the task structure to a reference counted cgroup structure.
+ - Calls to sched_setaffinity are filtered to just those CPUs
+ allowed in that task's cpuset.
+ - Calls to mbind and set_mempolicy are filtered to just
+ those Memory Nodes allowed in that task's cpuset.
+ - The root cpuset contains all the systems CPUs and Memory
+ Nodes.
+ - For any cpuset, one can define child cpusets containing a subset
+ of the parents CPU and Memory Node resources.
+ - The hierarchy of cpusets can be mounted at /dev/cpuset, for
+ browsing and manipulation from user space.
+ - A cpuset may be marked exclusive, which ensures that no other
+ cpuset (except direct ancestors and descendants) may contain
+ any overlapping CPUs or Memory Nodes.
+ - You can list all the tasks (by pid) attached to any cpuset.
+
+The implementation of cpusets requires a few, simple hooks
+into the rest of the kernel, none in performance critical paths:
+
+ - in init/main.c, to initialize the root cpuset at system boot.
+ - in fork and exit, to attach and detach a task from its cpuset.
+ - in sched_setaffinity, to mask the requested CPUs by what's
+ allowed in that task's cpuset.
+ - in sched.c migrate_live_tasks(), to keep migrating tasks within
+ the CPUs allowed by their cpuset, if possible.
+ - in the mbind and set_mempolicy system calls, to mask the requested
+ Memory Nodes by what's allowed in that task's cpuset.
+ - in page_alloc.c, to restrict memory to allowed nodes.
+ - in vmscan.c, to restrict page recovery to the current cpuset.
+
+You should mount the "cgroup" filesystem type in order to enable
+browsing and modifying the cpusets presently known to the kernel. No
+new system calls are added for cpusets - all support for querying and
+modifying cpusets is via this cpuset file system.
+
+The /proc/<pid>/status file for each task has four added lines,
+displaying the task's cpus_allowed (on which CPUs it may be scheduled)
+and mems_allowed (on which Memory Nodes it may obtain memory),
+in the two formats seen in the following example::
+
+ Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
+ Cpus_allowed_list: 0-127
+ Mems_allowed: ffffffff,ffffffff
+ Mems_allowed_list: 0-63
+
+Each cpuset is represented by a directory in the cgroup file system
+containing (on top of the standard cgroup files) the following
+files describing that cpuset:
+
+ - cpuset.cpus: list of CPUs in that cpuset
+ - cpuset.mems: list of Memory Nodes in that cpuset
+ - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
+ - cpuset.cpu_exclusive flag: is cpu placement exclusive?
+ - cpuset.mem_exclusive flag: is memory placement exclusive?
+ - cpuset.mem_hardwall flag: is memory allocation hardwalled
+ - cpuset.memory_pressure: measure of how much paging pressure in cpuset
+ - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
+ - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
+ - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
+ - cpuset.sched_relax_domain_level: the searching range when migrating tasks
+
+In addition, only the root cpuset has the following file:
+
+ - cpuset.memory_pressure_enabled flag: compute memory_pressure?
+
+New cpusets are created using the mkdir system call or shell
+command. The properties of a cpuset, such as its flags, allowed
+CPUs and Memory Nodes, and attached tasks, are modified by writing
+to the appropriate file in that cpusets directory, as listed above.
+
+The named hierarchical structure of nested cpusets allows partitioning
+a large system into nested, dynamically changeable, "soft-partitions".
+
+The attachment of each task, automatically inherited at fork by any
+children of that task, to a cpuset allows organizing the work load
+on a system into related sets of tasks such that each set is constrained
+to using the CPUs and Memory Nodes of a particular cpuset. A task
+may be re-attached to any other cpuset, if allowed by the permissions
+on the necessary cpuset file system directories.
+
+Such management of a system "in the large" integrates smoothly with
+the detailed placement done on individual tasks and memory regions
+using the sched_setaffinity, mbind and set_mempolicy system calls.
+
+The following rules apply to each cpuset:
+
+ - Its CPUs and Memory Nodes must be a subset of its parents.
+ - It can't be marked exclusive unless its parent is.
+ - If its cpu or memory is exclusive, they may not overlap any sibling.
+
+These rules, and the natural hierarchy of cpusets, enable efficient
+enforcement of the exclusive guarantee, without having to scan all
+cpusets every time any of them change to ensure nothing overlaps a
+exclusive cpuset. Also, the use of a Linux virtual file system (vfs)
+to represent the cpuset hierarchy provides for a familiar permission
+and name space for cpusets, with a minimum of additional kernel code.
+
+The cpus and mems files in the root (top_cpuset) cpuset are
+read-only. The cpus file automatically tracks the value of
+cpu_online_mask using a CPU hotplug notifier, and the mems file
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
+nodes with memory--using the cpuset_track_online_nodes() hook.
+
+
+1.4 What are exclusive cpusets ?
+--------------------------------
+
+If a cpuset is cpu or mem exclusive, no other cpuset, other than
+a direct ancestor or descendant, may share any of the same CPUs or
+Memory Nodes.
+
+A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users. All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space. This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset. To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
+
+
+1.5 What is memory_pressure ?
+-----------------------------
+The memory_pressure of a cpuset provides a simple per-cpuset metric
+of the rate that the tasks in a cpuset are attempting to free up in
+use memory on the nodes of the cpuset to satisfy additional memory
+requests.
+
+This enables batch managers monitoring jobs running in dedicated
+cpusets to efficiently detect what level of memory pressure that job
+is causing.
+
+This is useful both on tightly managed systems running a wide mix of
+submitted jobs, which may choose to terminate or re-prioritize jobs that
+are trying to use more memory than allowed on the nodes assigned to them,
+and with tightly coupled, long running, massively parallel scientific
+computing jobs that will dramatically fail to meet required performance
+goals if they start to use more memory than allowed to them.
+
+This mechanism provides a very economical way for the batch manager
+to monitor a cpuset for signs of memory pressure. It's up to the
+batch manager or other user code to decide what to do about it and
+take action.
+
+==>
+ Unless this feature is enabled by writing "1" to the special file
+ /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
+ code of __alloc_pages() for this metric reduces to simply noticing
+ that the cpuset_memory_pressure_enabled flag is zero. So only
+ systems that enable this feature will compute the metric.
+
+Why a per-cpuset, running average:
+
+ Because this meter is per-cpuset, rather than per-task or mm,
+ the system load imposed by a batch scheduler monitoring this
+ metric is sharply reduced on large systems, because a scan of
+ the tasklist can be avoided on each set of queries.
+
+ Because this meter is a running average, instead of an accumulating
+ counter, a batch scheduler can detect memory pressure with a
+ single read, instead of having to read and accumulate results
+ for a period of time.
+
+ Because this meter is per-cpuset rather than per-task or mm,
+ the batch scheduler can obtain the key information, memory
+ pressure in a cpuset, with a single read, rather than having to
+ query and accumulate results over all the (dynamically changing)
+ set of tasks in the cpuset.
+
+A per-cpuset simple digital filter (requires a spinlock and 3 words
+of data per-cpuset) is kept, and updated by any task attached to that
+cpuset, if it enters the synchronous (direct) page reclaim code.
+
+A per-cpuset file provides an integer number representing the recent
+(half-life of 10 seconds) rate of direct page reclaims caused by
+the tasks in the cpuset, in units of reclaims attempted per second,
+times 1000.
+
+
+1.6 What is memory spread ?
+---------------------------
+There are two boolean flag files per cpuset that control where the
+kernel allocates pages for the file system buffers and related in
+kernel data structures. They are called 'cpuset.memory_spread_page' and
+'cpuset.memory_spread_slab'.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
+the kernel will spread the file system buffers (page cache) evenly
+over all the nodes that the faulting task is allowed to use, instead
+of preferring to put those pages on the node where the task is running.
+
+If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
+then the kernel will spread some file system related slab caches,
+such as for inodes and dentries evenly over all the nodes that the
+faulting task is allowed to use, instead of preferring to put those
+pages on the node where the task is running.
+
+The setting of these flags does not affect anonymous data segment or
+stack segment pages of a task.
+
+By default, both kinds of memory spreading are off, and memory
+pages are allocated on the node local to where the task is running,
+except perhaps as modified by the task's NUMA mempolicy or cpuset
+configuration, so long as sufficient free memory pages are available.
+
+When new cpusets are created, they inherit the memory spread settings
+of their parent.
+
+Setting memory spreading causes allocations for the affected page
+or slab caches to ignore the task's NUMA mempolicy and be spread
+instead. Tasks using mbind() or set_mempolicy() calls to set NUMA
+mempolicies will not notice any change in these calls as a result of
+their containing task's memory spread settings. If memory spreading
+is turned off, then the currently specified NUMA mempolicy once again
+applies to memory page allocations.
+
+Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
+files. By default they contain "0", meaning that the feature is off
+for that cpuset. If a "1" is written to that file, then that turns
+the named feature on.
+
+The implementation is simple.
+
+Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
+PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
+joins that cpuset. The page allocation calls for the page cache
+is modified to perform an inline check for this PFA_SPREAD_PAGE task
+flag, and if set, a call to a new routine cpuset_mem_spread_node()
+returns the node to prefer for the allocation.
+
+Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
+PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
+pages from the node returned by cpuset_mem_spread_node().
+
+The cpuset_mem_spread_node() routine is also simple. It uses the
+value of a per-task rotor cpuset_mem_spread_rotor to select the next
+node in the current task's mems_allowed to prefer for the allocation.
+
+This memory placement policy is also known (in other contexts) as
+round-robin or interleave.
+
+This policy can provide substantial improvements for jobs that need
+to place thread local data on the corresponding node, but that need
+to access large file system data sets that need to be spread across
+the several nodes in the jobs cpuset in order to fit. Without this
+policy, especially for jobs that might have one thread reading in the
+data set, the memory allocation across the nodes in the jobs cpuset
+can become very uneven.
+
+1.7 What is sched_load_balance ?
+--------------------------------
+
+The kernel scheduler (kernel/sched/core.c) automatically load balances
+tasks. If one CPU is underutilized, kernel code running on that
+CPU will look for tasks on other more overloaded CPUs and move those
+tasks to itself, within the constraints of such placement mechanisms
+as cpusets and sched_setaffinity.
+
+The algorithmic cost of load balancing and its impact on key shared
+kernel data structures such as the task list increases more than
+linearly with the number of CPUs being balanced. So the scheduler
+has support to partition the systems CPUs into a number of sched
+domains such that it only load balances within each sched domain.
+Each sched domain covers some subset of the CPUs in the system;
+no two sched domains overlap; some CPUs might not be in any sched
+domain and hence won't be load balanced.
+
+Put simply, it costs less to balance between two smaller sched domains
+than one big one, but doing so means that overloads in one of the
+two domains won't be load balanced to the other one.
+
+By default, there is one sched domain covering all CPUs, including those
+marked isolated using the kernel boot time "isolcpus=" argument. However,
+the isolated CPUs will not participate in load balancing, and will not
+have tasks running on them unless explicitly assigned.
+
+This default load balancing across all CPUs is not well suited for
+the following two situations:
+
+ 1) On large systems, load balancing across many CPUs is expensive.
+ If the system is managed using cpusets to place independent jobs
+ on separate sets of CPUs, full load balancing is unnecessary.
+ 2) Systems supporting realtime on some CPUs need to minimize
+ system overhead on those CPUs, including avoiding task load
+ balancing if that is not needed.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
+setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
+be contained in a single sched domain, ensuring that load balancing
+can move a task (not otherwised pinned, as by sched_setaffinity)
+from any CPU in that cpuset to any other.
+
+When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
+scheduler will avoid load balancing across the CPUs in that cpuset,
+--except-- in so far as is necessary because some overlapping cpuset
+has "sched_load_balance" enabled.
+
+So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
+enabled, then the scheduler will have one sched domain covering all
+CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
+cpusets won't matter, as we're already fully load balancing.
+
+Therefore in the above two situations, the top cpuset flag
+"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
+child cpusets have this flag enabled.
+
+When doing this, you don't usually want to leave any unpinned tasks in
+the top cpuset that might use non-trivial amounts of CPU, as such tasks
+may be artificially constrained to some subset of CPUs, depending on
+the particulars of this flag setting in descendant cpusets. Even if
+such a task could use spare CPU cycles in some other CPUs, the kernel
+scheduler might not consider the possibility of load balancing that
+task to that underused CPU.
+
+Of course, tasks pinned to a particular CPU can be left in a cpuset
+that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
+else anyway.
+
+There is an impedance mismatch here, between cpusets and sched domains.
+Cpusets are hierarchical and nest. Sched domains are flat; they don't
+overlap and each CPU is in at most one sched domain.
+
+It is necessary for sched domains to be flat because load balancing
+across partially overlapping sets of CPUs would risk unstable dynamics
+that would be beyond our understanding. So if each of two partially
+overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
+form a single sched domain that is a superset of both. We won't move
+a task to a CPU outside its cpuset, but the scheduler load balancing
+code might waste some compute cycles considering that possibility.
+
+This mismatch is why there is not a simple one-to-one relation
+between which cpusets have the flag "cpuset.sched_load_balance" enabled,
+and the sched domain configuration. If a cpuset enables the flag, it
+will get balancing across all its CPUs, but if it disables the flag,
+it will only be assured of no load balancing if no other overlapping
+cpuset enables the flag.
+
+If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
+one of them has this flag enabled, then the other may find its
+tasks only partially load balanced, just on the overlapping CPUs.
+This is just the general case of the top_cpuset example given a few
+paragraphs above. In the general case, as in the top cpuset case,
+don't leave tasks that might use non-trivial amounts of CPU in
+such partially load balanced cpusets, as they may be artificially
+constrained to some subset of the CPUs allowed to them, for lack of
+load balancing to the other CPUs.
+
+CPUs in "cpuset.isolcpus" were excluded from load balancing by the
+isolcpus= kernel boot option, and will never be load balanced regardless
+of the value of "cpuset.sched_load_balance" in any cpuset.
+
+1.7.1 sched_load_balance implementation details.
+------------------------------------------------
+
+The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
+to most cpuset flags.) When enabled for a cpuset, the kernel will
+ensure that it can load balance across all the CPUs in that cpuset
+(makes sure that all the CPUs in the cpus_allowed of that cpuset are
+in the same sched domain.)
+
+If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
+then they will be (must be) both in the same sched domain.
+
+If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
+then by the above that means there is a single sched domain covering
+the whole system, regardless of any other cpuset settings.
+
+The kernel commits to user space that it will avoid load balancing
+where it can. It will pick as fine a granularity partition of sched
+domains as it can while still providing load balancing for any set
+of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
+
+The internal kernel cpuset to scheduler interface passes from the
+cpuset code to the scheduler code a partition of the load balanced
+CPUs in the system. This partition is a set of subsets (represented
+as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
+all the CPUs that must be load balanced.
+
+The cpuset code builds a new such partition and passes it to the
+scheduler sched domain setup code, to have the sched domains rebuilt
+as necessary, whenever:
+
+ - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
+ - or CPUs come or go from a cpuset with this flag enabled,
+ - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
+ and with this flag enabled changes,
+ - or a cpuset with non-empty CPUs and with this flag enabled is removed,
+ - or a cpu is offlined/onlined.
+
+This partition exactly defines what sched domains the scheduler should
+setup - one sched domain for each element (struct cpumask) in the
+partition.
+
+The scheduler remembers the currently active sched domain partitions.
+When the scheduler routine partition_sched_domains() is invoked from
+the cpuset code to update these sched domains, it compares the new
+partition requested with the current, and updates its sched domains,
+removing the old and adding the new, for each change.
+
+
+1.8 What is sched_relax_domain_level ?
+--------------------------------------
+
+In sched domain, the scheduler migrates tasks in 2 ways; periodic load
+balance on tick, and at time of some schedule events.
+
+When a task is woken up, scheduler try to move the task on idle CPU.
+For example, if a task A running on CPU X activates another task B
+on the same CPU X, and if CPU Y is X's sibling and performing idle,
+then scheduler migrate task B to CPU Y so that task B can start on
+CPU Y without waiting task A on CPU X.
+
+And if a CPU run out of tasks in its runqueue, the CPU try to pull
+extra tasks from other busy CPUs to help them before it is going to
+be idle.
+
+Of course it takes some searching cost to find movable tasks and/or
+idle CPUs, the scheduler might not search all CPUs in the domain
+every time. In fact, in some architectures, the searching ranges on
+events are limited in the same socket or node where the CPU locates,
+while the load balance on tick searches all.
+
+For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
+is idle while CPU X and the siblings are busy, scheduler can't migrate
+woken task B from X to Z since it is out of its searching range.
+As the result, task B on CPU X need to wait task A or wait load balance
+on the next tick. For some applications in special situation, waiting
+1 tick may be too long.
+
+The 'cpuset.sched_relax_domain_level' file allows you to request changing
+this searching range as you like. This file takes int value which
+indicates size of searching range in levels ideally as follows,
+otherwise initial value -1 that indicates the cpuset has no request.
+
+====== ===========================================================
+ -1 no request. use system default or follow request of others.
+ 0 no search.
+ 1 search siblings (hyperthreads in a core).
+ 2 search cores in a package.
+ 3 search cpus in a node [= system wide on non-NUMA system]
+ 4 search nodes in a chunk of node [on NUMA system]
+ 5 search system wide [on NUMA system]
+====== ===========================================================
+
+The system default is architecture dependent. The system default
+can be changed using the relax_domain_level= boot parameter.
+
+This file is per-cpuset and affect the sched domain where the cpuset
+belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
+is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
+there is no sched domain belonging the cpuset.
+
+If multiple cpusets are overlapping and hence they form a single sched
+domain, the largest value among those is used. Be careful, if one
+requests 0 and others are -1 then 0 is used.
+
+Note that modifying this file will have both good and bad effects,
+and whether it is acceptable or not depends on your situation.
+Don't modify this file if you are not sure.
+
+If your situation is:
+
+ - The migration costs between each cpu can be assumed considerably
+ small(for you) due to your special application's behavior or
+ special hardware support for CPU cache etc.
+ - The searching cost doesn't have impact(for you) or you can make
+ the searching cost enough small by managing cpuset to compact etc.
+ - The latency is required even it sacrifices cache hit rate etc.
+ then increasing 'sched_relax_domain_level' would benefit you.
+
+
+1.9 How do I use cpusets ?
+--------------------------
+
+In order to minimize the impact of cpusets on critical kernel
+code, such as the scheduler, and due to the fact that the kernel
+does not support one task updating the memory placement of another
+task directly, the impact on a task of changing its cpuset CPU
+or Memory Node placement, or of changing to which cpuset a task
+is attached, is subtle.
+
+If a cpuset has its Memory Nodes modified, then for each task attached
+to that cpuset, the next time that the kernel attempts to allocate
+a page of memory for that task, the kernel will notice the change
+in the task's cpuset, and update its per-task memory placement to
+remain within the new cpusets memory placement. If the task was using
+mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
+its new cpuset, then the task will continue to use whatever subset
+of MPOL_BIND nodes are still allowed in the new cpuset. If the task
+was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
+in the new cpuset, then the task will be essentially treated as if it
+was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
+as queried by get_mempolicy(), doesn't change). If a task is moved
+from one cpuset to another, then the kernel will adjust the task's
+memory placement, as above, the next time that the kernel attempts
+to allocate a page of memory for that task.
+
+If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
+will have its allowed CPU placement changed immediately. Similarly,
+if a task's pid is written to another cpuset's 'tasks' file, then its
+allowed CPU placement is changed immediately. If such a task had been
+bound to some subset of its cpuset using the sched_setaffinity() call,
+the task will be allowed to run on any CPU allowed in its new cpuset,
+negating the effect of the prior sched_setaffinity() call.
+
+In summary, the memory placement of a task whose cpuset is changed is
+updated by the kernel, on the next allocation of a page for that task,
+and the processor placement is updated immediately.
+
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'cpuset.mems' subsequently changes.
+If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the task's new cpuset. The relative placement of the page within
+the cpuset is preserved during these migration operations if possible.
+For example if the page was on the second valid node of the prior cpuset
+then the page will be placed on the second valid node of the new cpuset.
+
+Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
+'cpuset.mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'cpuset.mems',
+will be moved to nodes in the new setting of 'mems.'
+Pages that were not in the task's prior cpuset, or in the cpuset's
+prior 'cpuset.mems' setting, will not be moved.
+
+There is an exception to the above. If hotplug functionality is used
+to remove all the CPUs that are currently assigned to a cpuset,
+then all the tasks in that cpuset will be moved to the nearest ancestor
+with non-empty cpus. But the moving of some (or all) tasks might fail if
+cpuset is bound with another cgroup subsystem which has some restrictions
+on task attaching. In this failing case, those tasks will stay
+in the original cpuset, and the kernel will automatically update
+their cpus_allowed to allow all online CPUs. When memory hotplug
+functionality for removing Memory Nodes is available, a similar exception
+is expected to apply there as well. In general, the kernel prefers to
+violate cpuset placement, over starving a task that has had all
+its allowed CPUs or Memory Nodes taken offline.
+
+There is a second exception to the above. GFP_ATOMIC requests are
+kernel internal allocations that must be satisfied, immediately.
+The kernel may drop some request, in rare cases even panic, if a
+GFP_ATOMIC alloc fails. If the request cannot be satisfied within
+the current task's cpuset, then we relax the cpuset, and look for
+memory anywhere we can find it. It's better to violate the cpuset
+than stress the kernel.
+
+To start a new job that is to be contained within a cpuset, the steps are:
+
+ 1) mkdir /sys/fs/cgroup/cpuset
+ 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
+ the /sys/fs/cgroup/cpuset virtual file system.
+ 4) Start a task that will be the "founding father" of the new job.
+ 5) Attach that task to the new cpuset by writing its pid to the
+ /sys/fs/cgroup/cpuset tasks file for that cpuset.
+ 6) fork, exec or clone the job tasks from this founding father task.
+
+For example, the following sequence of commands will setup a cpuset
+named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
+and then start a subshell 'sh' in that cpuset::
+
+ mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
+ cd /sys/fs/cgroup/cpuset
+ mkdir Charlie
+ cd Charlie
+ /bin/echo 2-3 > cpuset.cpus
+ /bin/echo 1 > cpuset.mems
+ /bin/echo $$ > tasks
+ sh
+ # The subshell 'sh' is now running in cpuset Charlie
+ # The next line should display '/Charlie'
+ cat /proc/self/cpuset
+
+There are ways to query or modify cpusets:
+
+ - via the cpuset file system directly, using the various cd, mkdir, echo,
+ cat, rmdir commands from the shell, or their equivalent from C.
+ - via the C library libcpuset.
+ - via the C library libcgroup.
+ (http://sourceforge.net/projects/libcg/)
+ - via the python application cset.
+ (http://code.google.com/p/cpuset/)
+
+The sched_setaffinity calls can also be done at the shell prompt using
+SGI's runon or Robert Love's taskset. The mbind and set_mempolicy
+calls can be done at the shell prompt using the numactl command
+(part of Andi Kleen's numa package).
+
+2. Usage Examples and Syntax
+============================
+
+2.1 Basic Usage
+---------------
+
+Creating, modifying, using the cpusets can be done through the cpuset
+virtual filesystem.
+
+To mount it, type:
+# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
+
+Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
+tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
+is the cpuset that holds the whole system.
+
+If you want to create a new cpuset under /sys/fs/cgroup/cpuset::
+
+ # cd /sys/fs/cgroup/cpuset
+ # mkdir my_cpuset
+
+Now you want to do something with this cpuset::
+
+ # cd my_cpuset
+
+In this directory you can find several files::
+
+ # ls
+ cgroup.clone_children cpuset.memory_pressure
+ cgroup.event_control cpuset.memory_spread_page
+ cgroup.procs cpuset.memory_spread_slab
+ cpuset.cpu_exclusive cpuset.mems
+ cpuset.cpus cpuset.sched_load_balance
+ cpuset.mem_exclusive cpuset.sched_relax_domain_level
+ cpuset.mem_hardwall notify_on_release
+ cpuset.memory_migrate tasks
+
+Reading them will give you information about the state of this cpuset:
+the CPUs and Memory Nodes it can use, the processes that are using
+it, its properties. By writing to these files you can manipulate
+the cpuset.
+
+Set some flags::
+
+ # /bin/echo 1 > cpuset.cpu_exclusive
+
+Add some cpus::
+
+ # /bin/echo 0-7 > cpuset.cpus
+
+Add some mems::
+
+ # /bin/echo 0-7 > cpuset.mems
+
+Now attach your shell to this cpuset::
+
+ # /bin/echo $$ > tasks
+
+You can also create cpusets inside your cpuset by using mkdir in this
+directory::
+
+ # mkdir my_sub_cs
+
+To remove a cpuset, just use rmdir::
+
+ # rmdir my_sub_cs
+
+This will fail if the cpuset is in use (has cpusets inside, or has
+processes attached).
+
+Note that for legacy reasons, the "cpuset" filesystem exists as a
+wrapper around the cgroup filesystem.
+
+The command::
+
+ mount -t cpuset X /sys/fs/cgroup/cpuset
+
+is equivalent to::
+
+ mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
+ echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
+
+2.2 Adding/removing cpus
+------------------------
+
+This is the syntax to use when writing in the cpus or mems files
+in cpuset directories::
+
+ # /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
+ # /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
+
+To add a CPU to a cpuset, write the new list of CPUs including the
+CPU to be added. To add 6 to the above cpuset::
+
+ # /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6
+
+Similarly to remove a CPU from a cpuset, write the new list of CPUs
+without the CPU to be removed.
+
+To remove all the CPUs::
+
+ # /bin/echo "" > cpuset.cpus -> clear cpus list
+
+2.3 Setting flags
+-----------------
+
+The syntax is very simple::
+
+ # /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive'
+ # /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive'
+
+2.4 Attaching processes
+-----------------------
+
+::
+
+ # /bin/echo PID > tasks
+
+Note that it is PID, not PIDs. You can only attach ONE task at a time.
+If you have several tasks to attach, you have to do it one after another::
+
+ # /bin/echo PID1 > tasks
+ # /bin/echo PID2 > tasks
+ ...
+ # /bin/echo PIDn > tasks
+
+
+3. Questions
+============
+
+Q:
+ what's up with this '/bin/echo' ?
+
+A:
+ bash's builtin 'echo' command does not check calls to write() against
+ errors. If you use it in the cpuset file system, you won't be
+ able to tell whether a command succeeded or failed.
+
+Q:
+ When I attach processes, only the first of the line gets really attached !
+
+A:
+ We can only return one error code per call to write(). So you should also
+ put only ONE pid.
+
+4. Contact
+==========
+
+Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/admin-guide/cgroup-v1/devices.rst b/Documentation/admin-guide/cgroup-v1/devices.rst
new file mode 100644
index 000000000000..e1886783961e
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/devices.rst
@@ -0,0 +1,132 @@
+===========================
+Device Whitelist Controller
+===========================
+
+1. Description
+==============
+
+Implement a cgroup to track and enforce open and mknod restrictions
+on device files. A device cgroup associates a device access
+whitelist with each cgroup. A whitelist entry has 4 fields.
+'type' is a (all), c (char), or b (block). 'all' means it applies
+to all types and all major and minor numbers. Major and minor are
+either an integer or * for all. Access is a composition of r
+(read), w (write), and m (mknod).
+
+The root device cgroup starts with rwm to 'all'. A child device
+cgroup gets a copy of the parent. Administrators can then remove
+devices from the whitelist or add new entries. A child cgroup can
+never receive a device access which is denied by its parent.
+
+2. User Interface
+=================
+
+An entry is added using devices.allow, and removed using
+devices.deny. For instance::
+
+ echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
+
+allows cgroup 1 to read and mknod the device usually known as
+/dev/null. Doing::
+
+ echo a > /sys/fs/cgroup/1/devices.deny
+
+will remove the default 'a *:* rwm' entry. Doing::
+
+ echo a > /sys/fs/cgroup/1/devices.allow
+
+will add the 'a *:* rwm' entry to the whitelist.
+
+3. Security
+===========
+
+Any task can move itself between cgroups. This clearly won't
+suffice, but we can decide the best way to adequately restrict
+movement as people get some experience with this. We may just want
+to require CAP_SYS_ADMIN, which at least is a separate bit from
+CAP_MKNOD. We may want to just refuse moving to a cgroup which
+isn't a descendant of the current one. Or we may want to use
+CAP_MAC_ADMIN, since we really are trying to lock down root.
+
+CAP_SYS_ADMIN is needed to modify the whitelist or move another
+task to a new cgroup. (Again we'll probably want to change that).
+
+A cgroup may not be granted more permissions than the cgroup's
+parent has.
+
+4. Hierarchy
+============
+
+device cgroups maintain hierarchy by making sure a cgroup never has more
+access permissions than its parent. Every time an entry is written to
+a cgroup's devices.deny file, all its children will have that entry removed
+from their whitelist and all the locally set whitelist entries will be
+re-evaluated. In case one of the locally set whitelist entries would provide
+more access than the cgroup's parent, it'll be removed from the whitelist.
+
+Example::
+
+ A
+ / \
+ B
+
+ group behavior exceptions
+ A allow "b 8:* rwm", "c 116:1 rw"
+ B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
+
+If a device is denied in group A::
+
+ # echo "c 116:* r" > A/devices.deny
+
+it'll propagate down and after revalidating B's entries, the whitelist entry
+"c 116:2 rwm" will be removed::
+
+ group whitelist entries denied devices
+ A all "b 8:* rwm", "c 116:* rw"
+ B "c 1:3 rwm", "b 3:* rwm" all the rest
+
+In case parent's exceptions change and local exceptions are not allowed
+anymore, they'll be deleted.
+
+Notice that new whitelist entries will not be propagated::
+
+ A
+ / \
+ B
+
+ group whitelist entries denied devices
+ A "c 1:3 rwm", "c 1:5 r" all the rest
+ B "c 1:3 rwm", "c 1:5 r" all the rest
+
+when adding ``c *:3 rwm``::
+
+ # echo "c *:3 rwm" >A/devices.allow
+
+the result::
+
+ group whitelist entries denied devices
+ A "c *:3 rwm", "c 1:5 r" all the rest
+ B "c 1:3 rwm", "c 1:5 r" all the rest
+
+but now it'll be possible to add new entries to B::
+
+ # echo "c 2:3 rwm" >B/devices.allow
+ # echo "c 50:3 r" >B/devices.allow
+
+or even::
+
+ # echo "c *:3 rwm" >B/devices.allow
+
+Allowing or denying all by writing 'a' to devices.allow or devices.deny will
+not be possible once the device cgroups has children.
+
+4.1 Hierarchy (internal implementation)
+---------------------------------------
+
+device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
+list of exceptions. The internal state is controlled using the same user
+interface to preserve compatibility with the previous whitelist-only
+implementation. Removal or addition of exceptions that will reduce the access
+to devices will be propagated down the hierarchy.
+For every propagated exception, the effective rules will be re-evaluated based
+on current parent's access rules.
diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
new file mode 100644
index 000000000000..582d3427de3f
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst
@@ -0,0 +1,127 @@
+==============
+Cgroup Freezer
+==============
+
+The cgroup freezer is useful to batch job management system which start
+and stop sets of tasks in order to schedule the resources of a machine
+according to the desires of a system administrator. This sort of program
+is often used on HPC clusters to schedule access to the cluster as a
+whole. The cgroup freezer uses cgroups to describe the set of tasks to
+be started/stopped by the batch job management system. It also provides
+a means to start and stop the tasks composing the job.
+
+The cgroup freezer will also be useful for checkpointing running groups
+of tasks. The freezer allows the checkpoint code to obtain a consistent
+image of the tasks by attempting to force the tasks in a cgroup into a
+quiescent state. Once the tasks are quiescent another task can
+walk /proc or invoke a kernel interface to gather information about the
+quiesced tasks. Checkpointed tasks can be restarted later should a
+recoverable error occur. This also allows the checkpointed tasks to be
+migrated between nodes in a cluster by copying the gathered information
+to another node and restarting the tasks there.
+
+Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
+and resuming tasks in userspace. Both of these signals are observable
+from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
+blocked, or ignored it can be seen by waiting or ptracing parent tasks.
+SIGCONT is especially unsuitable since it can be caught by the task. Any
+programs designed to watch for SIGSTOP and SIGCONT could be broken by
+attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
+demonstrate this problem using nested bash shells::
+
+ $ echo $$
+ 16644
+ $ bash
+ $ echo $$
+ 16690
+
+ From a second, unrelated bash shell:
+ $ kill -SIGSTOP 16690
+ $ kill -SIGCONT 16690
+
+ <at this point 16690 exits and causes 16644 to exit too>
+
+This happens because bash can observe both signals and choose how it
+responds to them.
+
+Another example of a program which catches and responds to these
+signals is gdb. In fact any program designed to use ptrace is likely to
+have a problem with this method of stopping and resuming tasks.
+
+In contrast, the cgroup freezer uses the kernel freezer code to
+prevent the freeze/unfreeze cycle from becoming visible to the tasks
+being frozen. This allows the bash example above and gdb to run as
+expected.
+
+The cgroup freezer is hierarchical. Freezing a cgroup freezes all
+tasks belonging to the cgroup and all its descendant cgroups. Each
+cgroup has its own state (self-state) and the state inherited from the
+parent (parent-state). Iff both states are THAWED, the cgroup is
+THAWED.
+
+The following cgroupfs files are created by cgroup freezer.
+
+* freezer.state: Read-write.
+
+ When read, returns the effective state of the cgroup - "THAWED",
+ "FREEZING" or "FROZEN". This is the combined self and parent-states.
+ If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
+
+ FREEZING cgroup transitions into FROZEN state when all tasks
+ belonging to the cgroup and its descendants become frozen. Note that
+ a cgroup reverts to FREEZING from FROZEN after a new task is added
+ to the cgroup or one of its descendant cgroups until the new task is
+ frozen.
+
+ When written, sets the self-state of the cgroup. Two values are
+ allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
+ if not already freezing, enters FREEZING state along with all its
+ descendant cgroups.
+
+ If THAWED is written, the self-state of the cgroup is changed to
+ THAWED. Note that the effective state may not change to THAWED if
+ the parent-state is still freezing. If a cgroup's effective state
+ becomes THAWED, all its descendants which are freezing because of
+ the cgroup also leave the freezing state.
+
+* freezer.self_freezing: Read only.
+
+ Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
+ This value is 1 iff the last write to freezer.state was "FROZEN".
+
+* freezer.parent_freezing: Read only.
+
+ Shows the parent-state. 0 if none of the cgroup's ancestors is
+ frozen; otherwise, 1.
+
+The root cgroup is non-freezable and the above interface files don't
+exist.
+
+* Examples of usage::
+
+ # mkdir /sys/fs/cgroup/freezer
+ # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
+ # mkdir /sys/fs/cgroup/freezer/0
+ # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
+
+to get status of the freezer subsystem::
+
+ # cat /sys/fs/cgroup/freezer/0/freezer.state
+ THAWED
+
+to freeze all tasks in the container::
+
+ # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
+ # cat /sys/fs/cgroup/freezer/0/freezer.state
+ FREEZING
+ # cat /sys/fs/cgroup/freezer/0/freezer.state
+ FROZEN
+
+to unfreeze all tasks in the container::
+
+ # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
+ # cat /sys/fs/cgroup/freezer/0/freezer.state
+ THAWED
+
+This is the basic mechanism which should do the right thing for user space task
+in a simple scenario.
diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
new file mode 100644
index 000000000000..a3902aa253a9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst
@@ -0,0 +1,50 @@
+==================
+HugeTLB Controller
+==================
+
+The HugeTLB controller allows to limit the HugeTLB usage per control group and
+enforces the controller limit during page fault. Since HugeTLB doesn't
+support page reclaim, enforcing the limit at page fault time implies that,
+the application will get SIGBUS signal if it tries to access HugeTLB pages
+beyond its limit. This requires the application to know beforehand how much
+HugeTLB pages it would require for its use.
+
+HugeTLB controller can be created by first mounting the cgroup filesystem.
+
+# mount -t cgroup -o hugetlb none /sys/fs/cgroup
+
+With the above step, the initial or the parent HugeTLB group becomes
+visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
+the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
+
+New groups can be created under the parent group /sys/fs/cgroup::
+
+ # cd /sys/fs/cgroup
+ # mkdir g1
+ # echo $$ > g1/tasks
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it.
+
+Brief summary of control files::
+
+ hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
+ hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
+ hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
+ hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
+
+For a system supporting three hugepage sizes (64k, 32M and 1G), the control
+files include::
+
+ hugetlb.1GB.limit_in_bytes
+ hugetlb.1GB.max_usage_in_bytes
+ hugetlb.1GB.usage_in_bytes
+ hugetlb.1GB.failcnt
+ hugetlb.64KB.limit_in_bytes
+ hugetlb.64KB.max_usage_in_bytes
+ hugetlb.64KB.usage_in_bytes
+ hugetlb.64KB.failcnt
+ hugetlb.32MB.limit_in_bytes
+ hugetlb.32MB.max_usage_in_bytes
+ hugetlb.32MB.usage_in_bytes
+ hugetlb.32MB.failcnt
diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst
new file mode 100644
index 000000000000..10bf48bae0b0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -0,0 +1,28 @@
+========================
+Control Groups version 1
+========================
+
+.. toctree::
+ :maxdepth: 1
+
+ cgroups
+
+ blkio-controller
+ cpuacct
+ cpusets
+ devices
+ freezer-subsystem
+ hugetlb
+ memcg_test
+ memory
+ net_cls
+ net_prio
+ pids
+ rdma
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/admin-guide/cgroup-v1/memcg_test.rst b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
new file mode 100644
index 000000000000..3f7115e07b5d
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memcg_test.rst
@@ -0,0 +1,355 @@
+=====================================================
+Memory Resource Controller(Memcg) Implementation Memo
+=====================================================
+
+Last Updated: 2010/2
+
+Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
+
+Because VM is getting complex (one of reasons is memcg...), memcg's behavior
+is complex. This is a document for memcg's internal behavior.
+Please note that implementation details can be changed.
+
+(*) Topics on API should be in Documentation/admin-guide/cgroup-v1/memory.rst)
+
+0. How to record usage ?
+========================
+
+ 2 objects are used.
+
+ page_cgroup ....an object per page.
+
+ Allocated at boot or memory hotplug. Freed at memory hot removal.
+
+ swap_cgroup ... an entry per swp_entry.
+
+ Allocated at swapon(). Freed at swapoff().
+
+ The page_cgroup has USED bit and double count against a page_cgroup never
+ occurs. swap_cgroup is used only when a charged page is swapped-out.
+
+1. Charge
+=========
+
+ a page/swp_entry may be charged (usage += PAGE_SIZE) at
+
+ mem_cgroup_try_charge()
+
+2. Uncharge
+===========
+
+ a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
+
+ mem_cgroup_uncharge()
+ Called when a page's refcount goes down to 0.
+
+ mem_cgroup_uncharge_swap()
+ Called when swp_entry's refcnt goes down to 0. A charge against swap
+ disappears.
+
+3. charge-commit-cancel
+=======================
+
+ Memcg pages are charged in two steps:
+
+ - mem_cgroup_try_charge()
+ - mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+
+ At try_charge(), there are no flags to say "this page is charged".
+ at this point, usage += PAGE_SIZE.
+
+ At commit(), the page is associated with the memcg.
+
+ At cancel(), simply usage -= PAGE_SIZE.
+
+Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
+
+4. Anonymous
+============
+
+ Anonymous page is newly allocated at
+ - page fault into MAP_ANONYMOUS mapping.
+ - Copy-On-Write.
+
+ 4.1 Swap-in.
+ At swap-in, the page is taken from swap-cache. There are 2 cases.
+
+ (a) If the SwapCache is newly allocated and read, it has no charges.
+ (b) If the SwapCache has been mapped by processes, it has been
+ charged already.
+
+ 4.2 Swap-out.
+ At swap-out, typical state transition is below.
+
+ (a) add to swap cache. (marked as SwapCache)
+ swp_entry's refcnt += 1.
+ (b) fully unmapped.
+ swp_entry's refcnt += # of ptes.
+ (c) write back to swap.
+ (d) delete from swap cache. (remove from SwapCache)
+ swp_entry's refcnt -= 1.
+
+
+ Finally, at task exit,
+ (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
+
+5. Page Cache
+=============
+
+ Page Cache is charged at
+ - add_to_page_cache_locked().
+
+ The logic is very clear. (About migration, see below)
+
+ Note:
+ __remove_from_page_cache() is called by remove_from_page_cache()
+ and __remove_mapping().
+
+6. Shmem(tmpfs) Page Cache
+===========================
+
+ The best way to understand shmem's page state transition is to read
+ mm/shmem.c.
+
+ But brief explanation of the behavior of memcg around shmem will be
+ helpful to understand the logic.
+
+ Shmem's page (just leaf page, not direct/indirect block) can be on
+
+ - radix-tree of shmem's inode.
+ - SwapCache.
+ - Both on radix-tree and SwapCache. This happens at swap-in
+ and swap-out,
+
+ It's charged when...
+
+ - A new page is added to shmem's radix-tree.
+ - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
+
+7. Page Migration
+=================
+
+ mem_cgroup_migrate()
+
+8. LRU
+======
+ Each memcg has its own private LRU. Now, its handling is under global
+ VM's control (means that it's handled under global pgdat->lru_lock).
+ Almost all routines around memcg's LRU is called by global LRU's
+ list management functions under pgdat->lru_lock.
+
+ A special function is mem_cgroup_isolate_pages(). This scans
+ memcg's private LRU and call __isolate_lru_page() to extract a page
+ from LRU.
+
+ (By __isolate_lru_page(), the page is removed from both of global and
+ private LRU.)
+
+
+9. Typical Tests.
+=================
+
+ Tests for racy cases.
+
+9.1 Small limit to memcg.
+-------------------------
+
+ When you do test to do racy case, it's good test to set memcg's limit
+ to be very small rather than GB. Many races found in the test under
+ xKB or xxMB limits.
+
+ (Memory behavior under GB and Memory behavior under MB shows very
+ different situation.)
+
+9.2 Shmem
+---------
+
+ Historically, memcg's shmem handling was poor and we saw some amount
+ of troubles here. This is because shmem is page-cache but can be
+ SwapCache. Test with shmem/tmpfs is always good test.
+
+9.3 Migration
+-------------
+
+ For NUMA, migration is an another special case. To do easy test, cpuset
+ is useful. Following is a sample script to do migration::
+
+ mount -t cgroup -o cpuset none /opt/cpuset
+
+ mkdir /opt/cpuset/01
+ echo 1 > /opt/cpuset/01/cpuset.cpus
+ echo 0 > /opt/cpuset/01/cpuset.mems
+ echo 1 > /opt/cpuset/01/cpuset.memory_migrate
+ mkdir /opt/cpuset/02
+ echo 1 > /opt/cpuset/02/cpuset.cpus
+ echo 1 > /opt/cpuset/02/cpuset.mems
+ echo 1 > /opt/cpuset/02/cpuset.memory_migrate
+
+ In above set, when you moves a task from 01 to 02, page migration to
+ node 0 to node 1 will occur. Following is a script to migrate all
+ under cpuset.::
+
+ --
+ move_task()
+ {
+ for pid in $1
+ do
+ /bin/echo $pid >$2/tasks 2>/dev/null
+ echo -n $pid
+ echo -n " "
+ done
+ echo END
+ }
+
+ G1_TASK=`cat ${G1}/tasks`
+ G2_TASK=`cat ${G2}/tasks`
+ move_task "${G1_TASK}" ${G2} &
+ --
+
+9.4 Memory hotplug
+------------------
+
+ memory hotplug test is one of good test.
+
+ to offline memory, do following::
+
+ # echo offline > /sys/devices/system/memory/memoryXXX/state
+
+ (XXX is the place of memory)
+
+ This is an easy way to test page migration, too.
+
+9.5 mkdir/rmdir
+---------------
+
+ When using hierarchy, mkdir/rmdir test should be done.
+ Use tests like the following::
+
+ echo 1 >/opt/cgroup/01/memory/use_hierarchy
+ mkdir /opt/cgroup/01/child_a
+ mkdir /opt/cgroup/01/child_b
+
+ set limit to 01.
+ add limit to 01/child_b
+ run jobs under child_a and child_b
+
+ create/delete following groups at random while jobs are running::
+
+ /opt/cgroup/01/child_a/child_aa
+ /opt/cgroup/01/child_b/child_bb
+ /opt/cgroup/01/child_c
+
+ running new jobs in new group is also good.
+
+9.6 Mount with other subsystems
+-------------------------------
+
+ Mounting with other subsystems is a good test because there is a
+ race and lock dependency with other cgroup subsystems.
+
+ example::
+
+ # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
+
+ and do task move, mkdir, rmdir etc...under this.
+
+9.7 swapoff
+-----------
+
+ Besides management of swap is one of complicated parts of memcg,
+ call path of swap-in at swapoff is not same as usual swap-in path..
+ It's worth to be tested explicitly.
+
+ For example, test like following is good:
+
+ (Shell-A)::
+
+ # mount -t cgroup none /cgroup -o memory
+ # mkdir /cgroup/test
+ # echo 40M > /cgroup/test/memory.limit_in_bytes
+ # echo 0 > /cgroup/test/tasks
+
+ Run malloc(100M) program under this. You'll see 60M of swaps.
+
+ (Shell-B)::
+
+ # move all tasks in /cgroup/test to /cgroup
+ # /sbin/swapoff -a
+ # rmdir /cgroup/test
+ # kill malloc task.
+
+ Of course, tmpfs v.s. swapoff test should be tested, too.
+
+9.8 OOM-Killer
+--------------
+
+ Out-of-memory caused by memcg's limit will kill tasks under
+ the memcg. When hierarchy is used, a task under hierarchy
+ will be killed by the kernel.
+
+ In this case, panic_on_oom shouldn't be invoked and tasks
+ in other groups shouldn't be killed.
+
+ It's not difficult to cause OOM under memcg as following.
+
+ Case A) when you can swapoff::
+
+ #swapoff -a
+ #echo 50M > /memory.limit_in_bytes
+
+ run 51M of malloc
+
+ Case B) when you use mem+swap limitation::
+
+ #echo 50M > memory.limit_in_bytes
+ #echo 50M > memory.memsw.limit_in_bytes
+
+ run 51M of malloc
+
+9.9 Move charges at task migration
+----------------------------------
+
+ Charges associated with a task can be moved along with task migration.
+
+ (Shell-A)::
+
+ #mkdir /cgroup/A
+ #echo $$ >/cgroup/A/tasks
+
+ run some programs which uses some amount of memory in /cgroup/A.
+
+ (Shell-B)::
+
+ #mkdir /cgroup/B
+ #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
+ #echo "pid of the program running in group A" >/cgroup/B/tasks
+
+ You can see charges have been moved by reading ``*.usage_in_bytes`` or
+ memory.stat of both A and B.
+
+ See 8.2 of Documentation/admin-guide/cgroup-v1/memory.rst to see what value should
+ be written to move_charge_at_immigrate.
+
+9.10 Memory thresholds
+----------------------
+
+ Memory controller implements memory thresholds using cgroups notification
+ API. You can use tools/cgroup/cgroup_event_listener.c to test it.
+
+ (Shell-A) Create cgroup and run event listener::
+
+ # mkdir /cgroup/A
+ # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
+
+ (Shell-B) Add task to cgroup and try to allocate and free memory::
+
+ # echo $$ >/cgroup/A/tasks
+ # a="$(dd if=/dev/zero bs=1M count=10)"
+ # a=
+
+ You will see message from cgroup_event_listener every time you cross
+ the thresholds.
+
+ Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
+
+ It's good idea to test root cgroup as well.
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
new file mode 100644
index 000000000000..41bdc038dad9
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -0,0 +1,1003 @@
+==========================
+Memory Resource Controller
+==========================
+
+NOTE:
+ This document is hopelessly outdated and it asks for a complete
+ rewrite. It still contains a useful information so we are keeping it
+ here but make sure to check the current code if you need a deeper
+ understanding.
+
+NOTE:
+ The Memory Resource Controller has generically been referred to as the
+ memory controller in this document. Do not confuse memory controller
+ used here with the memory controller that is used in hardware.
+
+(For editors) In this document:
+ When we mention a cgroup (cgroupfs's directory) with memory controller,
+ we call it "memory cgroup". When you see git-log and source code, you'll
+ see patch's title and function names tend to use "memcg".
+ In this document, we avoid using it.
+
+Benefits and Purpose of the memory controller
+=============================================
+
+The memory controller isolates the memory behaviour of a group of tasks
+from the rest of the system. The article on LWN [12] mentions some probable
+uses of the memory controller. The memory controller can be used to
+
+a. Isolate an application or a group of applications
+ Memory-hungry applications can be isolated and limited to a smaller
+ amount of memory.
+b. Create a cgroup with a limited amount of memory; this can be used
+ as a good alternative to booting with mem=XXXX.
+c. Virtualization solutions can control the amount of memory they want
+ to assign to a virtual machine instance.
+d. A CD/DVD burner could control the amount of memory used by the
+ rest of the system to ensure that burning does not fail due to lack
+ of available memory.
+e. There are several other use cases; find one or use the controller just
+ for fun (to learn and hack on the VM subsystem).
+
+Current Status: linux-2.6.34-mmotm(development version of 2010/April)
+
+Features:
+
+ - accounting anonymous pages, file caches, swap caches usage and limiting them.
+ - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
+ - optionally, memory+swap usage can be accounted and limited.
+ - hierarchical accounting
+ - soft limit
+ - moving (recharging) account at moving a task is selectable.
+ - usage threshold notifier
+ - memory pressure notifier
+ - oom-killer disable knob and oom-notifier
+ - Root cgroup has no limit controls.
+
+ Kernel memory support is a work in progress, and the current version provides
+ basically functionality. (See Section 2.7)
+
+Brief summary of control files.
+
+==================================== ==========================================
+ tasks attach a task(thread) and show list of
+ threads
+ cgroup.procs show list of processes
+ cgroup.event_control an interface for event_fd()
+ memory.usage_in_bytes show current usage for memory
+ (See 5.5 for details)
+ memory.memsw.usage_in_bytes show current usage for memory+Swap
+ (See 5.5 for details)
+ memory.limit_in_bytes set/show limit of memory usage
+ memory.memsw.limit_in_bytes set/show limit of memory+Swap usage
+ memory.failcnt show the number of memory usage hits limits
+ memory.memsw.failcnt show the number of memory+Swap hits limits
+ memory.max_usage_in_bytes show max memory usage recorded
+ memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded
+ memory.soft_limit_in_bytes set/show soft limit of memory usage
+ memory.stat show various statistics
+ memory.use_hierarchy set/show hierarchical account enabled
+ memory.force_empty trigger forced page reclaim
+ memory.pressure_level set memory pressure notifications
+ memory.swappiness set/show swappiness parameter of vmscan
+ (See sysctl's vm.swappiness)
+ memory.move_charge_at_immigrate set/show controls of moving charges
+ memory.oom_control set/show oom controls.
+ memory.numa_stat show the number of memory usage per numa
+ node
+
+ memory.kmem.limit_in_bytes set/show hard limit for kernel memory
+ memory.kmem.usage_in_bytes show current kernel memory allocation
+ memory.kmem.failcnt show the number of kernel memory usage
+ hits limits
+ memory.kmem.max_usage_in_bytes show max kernel memory usage recorded
+
+ memory.kmem.tcp.limit_in_bytes set/show hard limit for tcp buf memory
+ memory.kmem.tcp.usage_in_bytes show current tcp buf memory allocation
+ memory.kmem.tcp.failcnt show the number of tcp buf memory usage
+ hits limits
+ memory.kmem.tcp.max_usage_in_bytes show max tcp buf memory usage recorded
+==================================== ==========================================
+
+1. History
+==========
+
+The memory controller has a long history. A request for comments for the memory
+controller was posted by Balbir Singh [1]. At the time the RFC was posted
+there were several implementations for memory control. The goal of the
+RFC was to build consensus and agreement for the minimal features required
+for memory control. The first RSS controller was posted by Balbir Singh[2]
+in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
+RSS controller. At OLS, at the resource management BoF, everyone suggested
+that we handle both page cache and RSS together. Another request was raised
+to allow user space handling of OOM. The current memory controller is
+at version 6; it combines both mapped (RSS) and unmapped Page
+Cache Control [11].
+
+2. Memory Control
+=================
+
+Memory is a unique resource in the sense that it is present in a limited
+amount. If a task requires a lot of CPU processing, the task can spread
+its processing over a period of hours, days, months or years, but with
+memory, the same physical memory needs to be reused to accomplish the task.
+
+The memory controller implementation has been divided into phases. These
+are:
+
+1. Memory controller
+2. mlock(2) controller
+3. Kernel user memory accounting and slab control
+4. user mappings length controller
+
+The memory controller is the first controller developed.
+
+2.1. Design
+-----------
+
+The core of the design is a counter called the page_counter. The
+page_counter tracks the current memory usage and limit of the group of
+processes associated with the controller. Each cgroup has a memory controller
+specific data structure (mem_cgroup) associated with it.
+
+2.2. Accounting
+---------------
+
+::
+
+ +--------------------+
+ | mem_cgroup |
+ | (page_counter) |
+ +--------------------+
+ / ^ \
+ / | \
+ +---------------+ | +---------------+
+ | mm_struct | |.... | mm_struct |
+ | | | | |
+ +---------------+ | +---------------+
+ |
+ + --------------+
+ |
+ +---------------+ +------+--------+
+ | page +----------> page_cgroup|
+ | | | |
+ +---------------+ +---------------+
+
+ (Figure 1: Hierarchy of Accounting)
+
+
+Figure 1 shows the important aspects of the controller
+
+1. Accounting happens per cgroup
+2. Each mm_struct knows about which cgroup it belongs to
+3. Each page has a pointer to the page_cgroup, which in turn knows the
+ cgroup it belongs to
+
+The accounting is done as follows: mem_cgroup_charge_common() is invoked to
+set up the necessary data structures and check if the cgroup that is being
+charged is over its limit. If it is, then reclaim is invoked on the cgroup.
+More details can be found in the reclaim section of this document.
+If everything goes well, a page meta-data-structure called page_cgroup is
+updated. page_cgroup has its own LRU on cgroup.
+(*) page_cgroup structure is allocated at boot/memory-hotplug time.
+
+2.2.1 Accounting details
+------------------------
+
+All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
+Some pages which are never reclaimable and will not be on the LRU
+are not accounted. We just account pages under usual VM management.
+
+RSS pages are accounted at page_fault unless they've already been accounted
+for earlier. A file page will be accounted for as Page Cache when it's
+inserted into inode (radix-tree). While it's mapped into the page tables of
+processes, duplicate accounting is carefully avoided.
+
+An RSS page is unaccounted when it's fully unmapped. A PageCache page is
+unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unmapped (by kswapd), they may exist as SwapCache in the system until they
+are really freed. Such SwapCaches are also accounted.
+A swapped-in page is not accounted until it's mapped.
+
+Note: The kernel does swapin-readahead and reads multiple swaps at once.
+This means swapped-in pages may contain pages for other tasks than a task
+causing page fault. So, we avoid accounting at swap-in I/O.
+
+At page migration, accounting information is kept.
+
+Note: we just account pages-on-LRU because our purpose is to control amount
+of used pages; not-on-LRU pages tend to be out-of-control from VM view.
+
+2.3 Shared Page Accounting
+--------------------------
+
+Shared pages are accounted on the basis of the first touch approach. The
+cgroup that first touches a page is accounted for the page. The principle
+behind this approach is that a cgroup that aggressively uses a shared
+page will eventually get charged for it (once it is uncharged from
+the cgroup that brought it in -- this will happen on memory pressure).
+
+But see section 8.2: when moving a task to another cgroup, its pages may
+be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
+
+Exception: If CONFIG_MEMCG_SWAP is not used.
+When you do swapoff and make swapped-out pages of shmem(tmpfs) to
+be backed into memory in force, charges for pages are accounted against the
+caller of swapoff rather than the users of shmem.
+
+2.4 Swap Extension (CONFIG_MEMCG_SWAP)
+--------------------------------------
+
+Swap Extension allows you to record charge for swap. A swapped-in page is
+charged back to original page allocator if possible.
+
+When swap is accounted, following files are added.
+
+ - memory.memsw.usage_in_bytes.
+ - memory.memsw.limit_in_bytes.
+
+memsw means memory+swap. Usage of memory+swap is limited by
+memsw.limit_in_bytes.
+
+Example: Assume a system with 4G of swap. A task which allocates 6G of memory
+(by mistake) under 2G memory limitation will use all swap.
+In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
+By using the memsw limit, you can avoid system OOM which can be caused by swap
+shortage.
+
+**why 'memory+swap' rather than swap**
+
+The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
+to move account from memory to swap...there is no change in usage of
+memory+swap. In other words, when we want to limit the usage of swap without
+affecting global LRU, memory+swap limit is better than just limiting swap from
+an OS point of view.
+
+**What happens when a cgroup hits memory.memsw.limit_in_bytes**
+
+When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
+in this cgroup. Then, swap-out will not be done by cgroup routine and file
+caches are dropped. But as mentioned above, global LRU can do swapout memory
+from it for sanity of the system's memory management state. You can't forbid
+it by cgroup.
+
+2.5 Reclaim
+-----------
+
+Each cgroup maintains a per cgroup LRU which has the same structure as
+global VM. When a cgroup goes over its limit, we first try
+to reclaim memory from the cgroup so as to make space for the new
+pages that the cgroup has touched. If the reclaim is unsuccessful,
+an OOM routine is invoked to select and kill the bulkiest task in the
+cgroup. (See 10. OOM Control below.)
+
+The reclaim algorithm has not been modified for cgroups, except that
+pages that are selected for reclaiming come from the per-cgroup LRU
+list.
+
+NOTE:
+ Reclaim does not work for the root cgroup, since we cannot set any
+ limits on the root cgroup.
+
+Note2:
+ When panic_on_oom is set to "2", the whole system will panic.
+
+When oom event notifier is registered, event will be delivered.
+(See oom_control section)
+
+2.6 Locking
+-----------
+
+ lock_page_cgroup()/unlock_page_cgroup() should not be called under
+ the i_pages lock.
+
+ Other lock order is following:
+
+ PG_locked.
+ mm->page_table_lock
+ pgdat->lru_lock
+ lock_page_cgroup.
+
+ In many cases, just lock_page_cgroup() is called.
+
+ per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
+ pgdat->lru_lock, it has no lock of its own.
+
+2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
+-----------------------------------------------
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+
+Kernel memory accounting is enabled for all memory cgroups by default. But
+it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
+at boot time. In this case, kernel memory will not be accounted at all.
+
+Kernel memory limits are not imposed for the root cgroup. Usage for the root
+cgroup may or may not be accounted. The memory used is accumulated into
+memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
+(currently only for tcp).
+
+The main "kmem" counter is fed into the main counter, so kmem charges will
+also be visible from the user counter.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+2.7.1 Current Kernel Memory resources accounted
+-----------------------------------------------
+
+stack pages:
+ every process consumes some stack pages. By accounting into
+ kernel memory, we prevent new processes from being created when the kernel
+ memory usage is too high.
+
+slab pages:
+ pages allocated by the SLAB or SLUB allocator are tracked. A copy
+ of each kmem_cache is created every time the cache is touched by the first time
+ from inside the memcg. The creation is done lazily, so some objects can still be
+ skipped while the cache is being created. All objects in a slab page should
+ belong to the same memcg. This only fails to hold when a task is migrated to a
+ different memcg during the page allocation by the cache.
+
+sockets memory pressure:
+ some sockets protocols have memory pressure
+ thresholds. The Memory Controller allows them to be controlled individually
+ per cgroup, instead of globally.
+
+tcp memory pressure:
+ sockets memory pressure for the tcp protocol.
+
+2.7.2 Common use cases
+----------------------
+
+Because the "kmem" counter is fed to the main user counter, kernel memory can
+never be limited completely independently of user memory. Say "U" is the user
+limit, and "K" the kernel limit. There are three possible ways limits can be
+set:
+
+U != 0, K = unlimited:
+ This is the standard memcg limitation mechanism already present before kmem
+ accounting. Kernel memory is completely ignored.
+
+U != 0, K < U:
+ Kernel memory is a subset of the user memory. This setup is useful in
+ deployments where the total amount of memory per-cgroup is overcommited.
+ Overcommiting kernel memory limits is definitely not recommended, since the
+ box can still run out of non-reclaimable memory.
+ In this case, the admin could set up K so that the sum of all groups is
+ never greater than the total memory, and freely set U at the cost of his
+ QoS.
+
+WARNING:
+ In the current implementation, memory reclaim will NOT be
+ triggered for a cgroup when it hits K while staying below U, which makes
+ this setup impractical.
+
+U != 0, K >= U:
+ Since kmem charges will also be fed to the user counter and reclaim will be
+ triggered for the cgroup for both kinds of memory. This setup gives the
+ admin a unified view of memory, and it is also useful for people who just
+ want to track kernel memory usage.
+
+3. User Interface
+=================
+
+3.0. Configuration
+------------------
+
+a. Enable CONFIG_CGROUPS
+b. Enable CONFIG_MEMCG
+c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
+d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
+
+3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
+-------------------------------------------------------------------
+
+::
+
+ # mount -t tmpfs none /sys/fs/cgroup
+ # mkdir /sys/fs/cgroup/memory
+ # mount -t cgroup none /sys/fs/cgroup/memory -o memory
+
+3.2. Make the new group and move bash into it::
+
+ # mkdir /sys/fs/cgroup/memory/0
+ # echo $$ > /sys/fs/cgroup/memory/0/tasks
+
+Since now we're in the 0 cgroup, we can alter the memory limit::
+
+ # echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+
+NOTE:
+ We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
+ mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes,
+ Gibibytes.)
+
+NOTE:
+ We can write "-1" to reset the ``*.limit_in_bytes(unlimited)``.
+
+NOTE:
+ We cannot set limits on the root cgroup any more.
+
+::
+
+ # cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
+ 4194304
+
+We can check the usage::
+
+ # cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
+ 1216512
+
+A successful write to this file does not guarantee a successful setting of
+this limit to the value written into the file. This can be due to a
+number of factors, such as rounding up to page boundaries or the total
+availability of memory on the system. The user is required to re-read
+this file after a write to guarantee the value committed by the kernel::
+
+ # echo 1 > memory.limit_in_bytes
+ # cat memory.limit_in_bytes
+ 4096
+
+The memory.failcnt field gives the number of times that the cgroup limit was
+exceeded.
+
+The memory.stat file gives accounting information. Now, the number of
+caches, RSS and Active pages/Inactive pages are shown.
+
+4. Testing
+==========
+
+For testing features and implementation, see memcg_test.txt.
+
+Performance test is also important. To see pure memory controller's overhead,
+testing on tmpfs will give you good numbers of small overheads.
+Example: do kernel make on tmpfs.
+
+Page-fault scalability is also important. At measuring parallel
+page fault test, multi-process test may be better than multi-thread
+test because it has noise of shared objects/status.
+
+But the above two are testing extreme situations.
+Trying usual test under memory controller is always helpful.
+
+4.1 Troubleshooting
+-------------------
+
+Sometimes a user might find that the application under a cgroup is
+terminated by the OOM killer. There are several causes for this:
+
+1. The cgroup limit is too low (just too low to do anything useful)
+2. The user is using anonymous memory and swap is turned off or too low
+
+A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
+some of the pages cached in the cgroup (page cache pages).
+
+To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
+seeing what happens will be helpful.
+
+4.2 Task migration
+------------------
+
+When a task migrates from one cgroup to another, its charge is not
+carried forward by default. The pages allocated from the original cgroup still
+remain charged to it, the charge is dropped when the page is freed or
+reclaimed.
+
+You can move charges of a task along with task migration.
+See 8. "Move charges at task migration"
+
+4.3 Removing a cgroup
+---------------------
+
+A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
+cgroup might have some charge associated with it, even though all
+tasks have migrated away from it. (because we charge against pages, not
+against tasks.)
+
+We move the stats to root (if use_hierarchy==0) or parent (if
+use_hierarchy==1), and no change on the charge except uncharging
+from the child.
+
+Charges recorded in swap information is not updated at removal of cgroup.
+Recorded information is discarded and a cgroup which uses swap (swapcache)
+will be charged as a new owner of it.
+
+About use_hierarchy, see Section 6.
+
+5. Misc. interfaces
+===================
+
+5.1 force_empty
+---------------
+ memory.force_empty interface is provided to make cgroup's memory usage empty.
+ When writing anything to this::
+
+ # echo 0 > memory.force_empty
+
+ the cgroup will be reclaimed and as many pages reclaimed as possible.
+
+ The typical use case for this interface is before calling rmdir().
+ Though rmdir() offlines memcg, but the memcg may still stay there due to
+ charged file caches. Some out-of-use page caches may keep charged until
+ memory pressure happens. If you want to avoid that, force_empty will be useful.
+
+ Also, note that when memory.kmem.limit_in_bytes is set the charges due to
+ kernel pages will still be seen. This is not considered a failure and the
+ write will still return success. In this case, it is expected that
+ memory.kmem.usage_in_bytes == memory.usage_in_bytes.
+
+ About use_hierarchy, see Section 6.
+
+5.2 stat file
+-------------
+
+memory.stat file includes following statistics
+
+per-memory cgroup local status
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+=============== ===============================================================
+cache # of bytes of page cache memory.
+rss # of bytes of anonymous and swap cache memory (includes
+ transparent hugepages).
+rss_huge # of bytes of anonymous transparent hugepages.
+mapped_file # of bytes of mapped file (includes tmpfs/shmem)
+pgpgin # of charging events to the memory cgroup. The charging
+ event happens each time a page is accounted as either mapped
+ anon page(RSS) or cache page(Page Cache) to the cgroup.
+pgpgout # of uncharging events to the memory cgroup. The uncharging
+ event happens each time a page is unaccounted from the cgroup.
+swap # of bytes of swap usage
+dirty # of bytes that are waiting to get written back to the disk.
+writeback # of bytes of file/anon cache that are queued for syncing to
+ disk.
+inactive_anon # of bytes of anonymous and swap cache memory on inactive
+ LRU list.
+active_anon # of bytes of anonymous and swap cache memory on active
+ LRU list.
+inactive_file # of bytes of file-backed memory on inactive LRU list.
+active_file # of bytes of file-backed memory on active LRU list.
+unevictable # of bytes of memory that cannot be reclaimed (mlocked etc).
+=============== ===============================================================
+
+status considering hierarchy (see memory.use_hierarchy settings)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ===================================================
+hierarchical_memory_limit # of bytes of memory limit with regard to hierarchy
+ under which the memory cgroup is
+hierarchical_memsw_limit # of bytes of memory+swap limit with regard to
+ hierarchy under which memory cgroup is.
+
+total_<counter> # hierarchical version of <counter>, which in
+ addition to the cgroup's own value includes the
+ sum of all hierarchical children's values of
+ <counter>, i.e. total_cache
+========================= ===================================================
+
+The following additional stats are dependent on CONFIG_DEBUG_VM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========================= ========================================
+recent_rotated_anon VM internal parameter. (see mm/vmscan.c)
+recent_rotated_file VM internal parameter. (see mm/vmscan.c)
+recent_scanned_anon VM internal parameter. (see mm/vmscan.c)
+recent_scanned_file VM internal parameter. (see mm/vmscan.c)
+========================= ========================================
+
+Memo:
+ recent_rotated means recent frequency of LRU rotation.
+ recent_scanned means recent # of scans to LRU.
+ showing for better debug please see the code for meanings.
+
+Note:
+ Only anonymous and swap cache memory is listed as part of 'rss' stat.
+ This should not be confused with the true 'resident set size' or the
+ amount of physical memory used by the cgroup.
+
+ 'rss + mapped_file" will give you resident set size of cgroup.
+
+ (Note: file and shmem may be shared among other cgroups. In that case,
+ mapped_file is accounted only when the memory cgroup is owner of page
+ cache.)
+
+5.3 swappiness
+--------------
+
+Overrides /proc/sys/vm/swappiness for the particular group. The tunable
+in the root cgroup corresponds to the global swappiness setting.
+
+Please note that unlike during the global reclaim, limit reclaim
+enforces that 0 swappiness really prevents from any swapping even if
+there is a swap storage available. This might lead to memcg OOM killer
+if there are no file pages to reclaim.
+
+5.4 failcnt
+-----------
+
+A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
+This failcnt(== failure count) shows the number of times that a usage counter
+hit its limit. When a memory cgroup hits a limit, failcnt increases and
+memory under it will be reclaimed.
+
+You can reset failcnt by writing 0 to failcnt file::
+
+ # echo 0 > .../memory.failcnt
+
+5.5 usage_in_bytes
+------------------
+
+For efficiency, as other kernel components, memory cgroup uses some optimization
+to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
+method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
+value for efficient access. (Of course, when necessary, it's synchronized.)
+If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
+value in memory.stat(see 5.2).
+
+5.6 numa_stat
+-------------
+
+This is similar to numa_maps but operates on a per-memcg basis. This is
+useful for providing visibility into the numa locality information within
+an memcg since the pages are allowed to be allocated from any physical
+node. One of the use cases is evaluating application performance by
+combining this information with the application's CPU allocation.
+
+Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
+per-node page counts including "hierarchical_<counter>" which sums up all
+hierarchical children's values in addition to the memcg's own value.
+
+The output format of memory.numa_stat is::
+
+ total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+ file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+ anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+ unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+ hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+The "total" count is sum of file + anon + unevictable.
+
+6. Hierarchy support
+====================
+
+The memory controller supports a deep hierarchy and hierarchical accounting.
+The hierarchy is created by creating the appropriate cgroups in the
+cgroup filesystem. Consider for example, the following cgroup filesystem
+hierarchy::
+
+ root
+ / | \
+ / | \
+ a b c
+ | \
+ | \
+ d e
+
+In the diagram above, with hierarchical accounting enabled, all memory
+usage of e, is accounted to its ancestors up until the root (i.e, c and root),
+that has memory.use_hierarchy enabled. If one of the ancestors goes over its
+limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
+children of the ancestor.
+
+6.1 Enabling hierarchical accounting and reclaim
+------------------------------------------------
+
+A memory cgroup by default disables the hierarchy feature. Support
+can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup::
+
+ # echo 1 > memory.use_hierarchy
+
+The feature can be disabled by::
+
+ # echo 0 > memory.use_hierarchy
+
+NOTE1:
+ Enabling/disabling will fail if either the cgroup already has other
+ cgroups created below it, or if the parent cgroup has use_hierarchy
+ enabled.
+
+NOTE2:
+ When panic_on_oom is set to "2", the whole system will panic in
+ case of an OOM event in any cgroup.
+
+7. Soft limits
+==============
+
+Soft limits allow for greater sharing of memory. The idea behind soft limits
+is to allow control groups to use as much of the memory as needed, provided
+
+a. There is no memory contention
+b. They do not exceed their hard limit
+
+When the system detects memory contention or low memory, control groups
+are pushed back to their soft limits. If the soft limit of each control
+group is very high, they are pushed back as much as possible to make
+sure that one control group does not starve the others of memory.
+
+Please note that soft limits is a best-effort feature; it comes with
+no guarantees, but it does its best to make sure that when memory is
+heavily contended for, memory is allocated based on the soft limit
+hints/setup. Currently soft limit based reclaim is set up such that
+it gets invoked from balance_pgdat (kswapd).
+
+7.1 Interface
+-------------
+
+Soft limits can be setup by using the following commands (in this example we
+assume a soft limit of 256 MiB)::
+
+ # echo 256M > memory.soft_limit_in_bytes
+
+If we want to change this to 1G, we can at any time use::
+
+ # echo 1G > memory.soft_limit_in_bytes
+
+NOTE1:
+ Soft limits take effect over a long period of time, since they involve
+ reclaiming memory for balancing between memory cgroups
+NOTE2:
+ It is recommended to set the soft limit always below the hard limit,
+ otherwise the hard limit will take precedence.
+
+8. Move charges at task migration
+=================================
+
+Users can move charges associated with a task along with task migration, that
+is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
+This feature is not supported in !CONFIG_MMU environments because of lack of
+page tables.
+
+8.1 Interface
+-------------
+
+This feature is disabled by default. It can be enabled (and disabled again) by
+writing to memory.move_charge_at_immigrate of the destination cgroup.
+
+If you want to enable it::
+
+ # echo (some positive value) > memory.move_charge_at_immigrate
+
+Note:
+ Each bits of move_charge_at_immigrate has its own meaning about what type
+ of charges should be moved. See 8.2 for details.
+Note:
+ Charges are moved only when you move mm->owner, in other words,
+ a leader of a thread group.
+Note:
+ If we cannot find enough space for the task in the destination cgroup, we
+ try to make space by reclaiming memory. Task migration may fail if we
+ cannot make enough space.
+Note:
+ It can take several seconds if you move charges much.
+
+And if you want disable it again::
+
+ # echo 0 > memory.move_charge_at_immigrate
+
+8.2 Type of charges which can be moved
+--------------------------------------
+
+Each bit in move_charge_at_immigrate has its own meaning about what type of
+charges should be moved. But in any case, it must be noted that an account of
+a page or a swap can be moved only when it is charged to the task's current
+(old) memory cgroup.
+
++---+--------------------------------------------------------------------------+
+|bit| what type of charges would be moved ? |
++===+==========================================================================+
+| 0 | A charge of an anonymous page (or swap of it) used by the target task. |
+| | You must enable Swap Extension (see 2.4) to enable move of swap charges. |
++---+--------------------------------------------------------------------------+
+| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) |
+| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of |
+| | anonymous pages, file pages (and swaps) in the range mmapped by the task |
+| | will be moved even if the task hasn't done page fault, i.e. they might |
+| | not be the task's "RSS", but other task's "RSS" that maps the same file. |
+| | And mapcount of the page is ignored (the page can be moved even if |
+| | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to |
+| | enable move of swap charges. |
++---+--------------------------------------------------------------------------+
+
+8.3 TODO
+--------
+
+- All of moving charge operations are done under cgroup_mutex. It's not good
+ behavior to hold the mutex too long, so we may need some trick.
+
+9. Memory thresholds
+====================
+
+Memory cgroup implements memory thresholds using the cgroups notification
+API (see cgroups.txt). It allows to register multiple memory and memsw
+thresholds and gets notifications when it crosses.
+
+To register a threshold, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
+- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
+ cgroup.event_control.
+
+Application will be notified through eventfd when memory usage crosses
+threshold in any direction.
+
+It's applicable for root and non-root cgroup.
+
+10. OOM Control
+===============
+
+memory.oom_control file is for OOM notification and other controls.
+
+Memory cgroup implements OOM notifier using the cgroup notification
+API (See cgroups.txt). It allows to register multiple OOM notification
+delivery and gets notification when OOM happens.
+
+To register a notifier, an application must:
+
+ - create an eventfd using eventfd(2)
+ - open memory.oom_control file
+ - write string like "<event_fd> <fd of memory.oom_control>" to
+ cgroup.event_control
+
+The application will be notified through eventfd when OOM happens.
+OOM notification doesn't work for the root cgroup.
+
+You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
+
+ #echo 1 > memory.oom_control
+
+If OOM-killer is disabled, tasks under cgroup will hang/sleep
+in memory cgroup's OOM-waitqueue when they request accountable memory.
+
+For running them, you have to relax the memory cgroup's OOM status by
+
+ * enlarge limit or reduce usage.
+
+To reduce usage,
+
+ * kill some tasks.
+ * move some tasks to other group with account migration.
+ * remove some files (on tmpfs?)
+
+Then, stopped tasks will work again.
+
+At reading, current status of OOM is shown.
+
+ - oom_kill_disable 0 or 1
+ (if 1, oom-killer is disabled)
+ - under_oom 0 or 1
+ (if 1, the memory cgroup is under OOM, tasks may be stopped.)
+
+11. Memory Pressure
+===================
+
+The pressure level notifications can be used to monitor the memory
+allocation cost; based on the pressure, applications can implement
+different strategies of managing their memory resources. The pressure
+levels are defined as following:
+
+The "low" level means that the system is reclaiming memory for new
+allocations. Monitoring this reclaiming activity might be useful for
+maintaining cache level. Upon notification, the program (typically
+"Activity Manager") might analyze vmstat and act in advance (i.e.
+prematurely shutdown unimportant services).
+
+The "medium" level means that the system is experiencing medium memory
+pressure, the system might be making swap, paging out active file caches,
+etc. Upon this event applications may decide to further analyze
+vmstat/zoneinfo/memcg or internal memory usage statistics and free any
+resources that can be easily reconstructed or re-read from a disk.
+
+The "critical" level means that the system is actively thrashing, it is
+about to out of memory (OOM) or even the in-kernel OOM killer is on its
+way to trigger. Applications should do whatever they can to help the
+system. It might be too late to consult with vmstat or any other
+statistics, so it's advisable to take an immediate action.
+
+By default, events are propagated upward until the event is handled, i.e. the
+events are not pass-through. For example, you have three cgroups: A->B->C. Now
+you set up an event listener on cgroups A, B and C, and suppose group C
+experiences some pressure. In this situation, only group C will receive the
+notification, i.e. groups A and B will not receive it. This is done to avoid
+excessive "broadcasting" of messages, which disturbs the system and which is
+especially bad if we are low on memory or thrashing. Group B, will receive
+notification only if there are no event listers for group C.
+
+There are three optional modes that specify different propagation behavior:
+
+ - "default": this is the default behavior specified above. This mode is the
+ same as omitting the optional mode parameter, preserved by backwards
+ compatibility.
+
+ - "hierarchy": events always propagate up to the root, similar to the default
+ behavior, except that propagation continues regardless of whether there are
+ event listeners at each level, with the "hierarchy" mode. In the above
+ example, groups A, B, and C will receive notification of memory pressure.
+
+ - "local": events are pass-through, i.e. they only receive notifications when
+ memory pressure is experienced in the memcg for which the notification is
+ registered. In the above example, group C will receive notification if
+ registered for "local" notification and the group experiences memory
+ pressure. However, group B will never receive notification, regardless if
+ there is an event listener for group C or not, if group B is registered for
+ local notification.
+
+The level and event notification mode ("hierarchy" or "local", if necessary) are
+specified by a comma-delimited string, i.e. "low,hierarchy" specifies
+hierarchical, pass-through, notification for all ancestor memcgs. Notification
+that is the default, non pass-through behavior, does not specify a mode.
+"medium,local" specifies pass-through notification for the medium level.
+
+The file memory.pressure_level is only used to setup an eventfd. To
+register a notification, an application must:
+
+- create an eventfd using eventfd(2);
+- open memory.pressure_level;
+- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
+ to cgroup.event_control.
+
+Application will be notified through eventfd when memory pressure is at
+the specific level (or higher). Read/write operations to
+memory.pressure_level are no implemented.
+
+Test:
+
+ Here is a small script example that makes a new cgroup, sets up a
+ memory limit, sets up a notification in the cgroup and then makes child
+ cgroup experience a critical pressure::
+
+ # cd /sys/fs/cgroup/memory/
+ # mkdir foo
+ # cd foo
+ # cgroup_event_listener memory.pressure_level low,hierarchy &
+ # echo 8000000 > memory.limit_in_bytes
+ # echo 8000000 > memory.memsw.limit_in_bytes
+ # echo $$ > tasks
+ # dd if=/dev/zero | read x
+
+ (Expect a bunch of notifications, and eventually, the oom-killer will
+ trigger.)
+
+12. TODO
+========
+
+1. Make per-cgroup scanner reclaim not-shared pages first
+2. Teach controller to account for shared-pages
+3. Start reclamation in the background when the limit is
+ not yet hit but the usage is getting closer
+
+Summary
+=======
+
+Overall, the memory controller has been a stable controller and has been
+commented and discussed quite extensively in the community.
+
+References
+==========
+
+1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
+2. Singh, Balbir. Memory Controller (RSS Control),
+ http://lwn.net/Articles/222762/
+3. Emelianov, Pavel. Resource controllers based on process cgroups
+ http://lkml.org/lkml/2007/3/6/198
+4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
+ http://lkml.org/lkml/2007/4/9/78
+5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
+ http://lkml.org/lkml/2007/5/30/244
+6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
+7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
+ subsystem (v3), http://lwn.net/Articles/235534/
+8. Singh, Balbir. RSS controller v2 test results (lmbench),
+ http://lkml.org/lkml/2007/5/17/232
+9. Singh, Balbir. RSS controller v2 AIM9 results
+ http://lkml.org/lkml/2007/5/18/1
+10. Singh, Balbir. Memory controller v6 test results,
+ http://lkml.org/lkml/2007/8/19/36
+11. Singh, Balbir. Memory controller introduction (v6),
+ http://lkml.org/lkml/2007/8/17/69
+12. Corbet, Jonathan, Controlling memory use in cgroups,
+ http://lwn.net/Articles/243795/
diff --git a/Documentation/admin-guide/cgroup-v1/net_cls.rst b/Documentation/admin-guide/cgroup-v1/net_cls.rst
new file mode 100644
index 000000000000..a2cf272af7a0
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_cls.rst
@@ -0,0 +1,44 @@
+=========================
+Network classifier cgroup
+=========================
+
+The Network classifier cgroup provides an interface to
+tag network packets with a class identifier (classid).
+
+The Traffic Controller (tc) can be used to assign
+different priorities to packets from different cgroups.
+Also, Netfilter (iptables) can use this tag to perform
+actions on such packets.
+
+Creating a net_cls cgroups instance creates a net_cls.classid file.
+This net_cls.classid value is initialized to 0.
+
+You can write hexadecimal values to net_cls.classid; the format for these
+values is 0xAAAABBBB; AAAA is the major handle number and BBBB
+is the minor handle number.
+Reading net_cls.classid yields a decimal result.
+
+Example::
+
+ mkdir /sys/fs/cgroup/net_cls
+ mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
+ mkdir /sys/fs/cgroup/net_cls/0
+ echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid
+
+- setting a 10:1 handle::
+
+ cat /sys/fs/cgroup/net_cls/0/net_cls.classid
+ 1048577
+
+- configuring tc::
+
+ tc qdisc add dev eth0 root handle 10: htb
+ tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
+
+- creating traffic class 10:1::
+
+ tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
+
+configuring iptables, basic example::
+
+ iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/admin-guide/cgroup-v1/net_prio.rst b/Documentation/admin-guide/cgroup-v1/net_prio.rst
new file mode 100644
index 000000000000..b40905871c64
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/net_prio.rst
@@ -0,0 +1,57 @@
+=======================
+Network priority cgroup
+=======================
+
+The Network priority cgroup provides an interface to allow an administrator to
+dynamically set the priority of network traffic generated by various
+applications
+
+Nominally, an application would set the priority of its traffic via the
+SO_PRIORITY socket option. This however, is not always possible because:
+
+1) The application may not have been coded to set this value
+2) The priority of application traffic is often a site-specific administrative
+ decision rather than an application defined one.
+
+This cgroup allows an administrator to assign a process to a group which defines
+the priority of egress traffic on a given interface. Network priority groups can
+be created by first mounting the cgroup filesystem::
+
+ # mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
+
+With the above step, the initial group acting as the parent accounting group
+becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in
+the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
+
+Each net_prio cgroup contains two files that are subsystem specific
+
+net_prio.prioidx
+ This file is read-only, and is simply informative. It contains a unique
+ integer value that the kernel uses as an internal representation of this
+ cgroup.
+
+net_prio.ifpriomap
+ This file contains a map of the priorities assigned to traffic originating
+ from processes in this group and egressing the system on various interfaces.
+ It contains a list of tuples in the form <ifname priority>. Contents of this
+ file can be modified by echoing a string into the file using the same tuple
+ format. For example::
+
+ echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
+
+This command would force any traffic originating from processes belonging to the
+iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
+said traffic set to the value 5. The parent accounting group also has a
+writeable 'net_prio.ifpriomap' file that can be used to set a system default
+priority.
+
+Priorities are set immediately prior to queueing a frame to the device
+queueing discipline (qdisc) so priorities will be assigned prior to the hardware
+queue selection being made.
+
+One usage for the net_prio cgroup is with mqprio qdisc allowing application
+traffic to be steered to hardware/driver based traffic classes. These mappings
+can then be managed by administrators or other networking protocols such as
+DCBX.
+
+A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/admin-guide/cgroup-v1/pids.rst b/Documentation/admin-guide/cgroup-v1/pids.rst
new file mode 100644
index 000000000000..6acebd9e72c8
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/pids.rst
@@ -0,0 +1,92 @@
+=========================
+Process Number Controller
+=========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+The pids.events file contains event counters:
+
+ - max: Number of times fork failed because limit was hit.
+
+Example
+-------
+
+First, we mount the pids controller::
+
+ # mkdir -p /sys/fs/cgroup/pids
+ # mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it::
+
+ # mkdir -p /sys/fs/cgroup/pids/parent/child
+ # echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+ # echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+ # cat /sys/fs/cgroup/pids/parent/pids.current
+ 2
+ #
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail::
+
+ # cat /sys/fs/cgroup/pids/parent/pids.current
+ 2
+ # ( /bin/echo "Here's some processes for you." | cat )
+ sh: fork: Resource temporary unavailable
+ #
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's)::
+
+ # echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+ # cat /sys/fs/cgroup/pids/parent/pids.current
+ 2
+ # cat /sys/fs/cgroup/pids/parent/child/pids.current
+ 2
+ # cat /sys/fs/cgroup/pids/parent/child/pids.max
+ max
+ # ( /bin/echo "Here's some processes for you." | cat )
+ sh: fork: Resource temporary unavailable
+ #
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current)::
+
+ # echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+ # /bin/echo "We can't even spawn a single process now."
+ sh: fork: Resource temporary unavailable
+ # echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+ # /bin/echo "We can't even spawn a single process now."
+ sh: fork: Resource temporary unavailable
+ #
diff --git a/Documentation/admin-guide/cgroup-v1/rdma.rst b/Documentation/admin-guide/cgroup-v1/rdma.rst
new file mode 100644
index 000000000000..2fcb0a9bf790
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/rdma.rst
@@ -0,0 +1,117 @@
+===============
+RDMA Controller
+===============
+
+.. Contents
+
+ 1. Overview
+ 1-1. What is RDMA controller?
+ 1-2. Why RDMA controller needed?
+ 1-3. How is RDMA controller implemented?
+ 2. Usage Examples
+
+1. Overview
+===========
+
+1-1. What is RDMA controller?
+-----------------------------
+
+RDMA controller allows user to limit RDMA/IB specific resources that a given
+set of processes can use. These processes are grouped using RDMA controller.
+
+RDMA controller defines two resources which can be limited for processes of a
+cgroup.
+
+1-2. Why RDMA controller needed?
+--------------------------------
+
+Currently user space applications can easily take away all the rdma verb
+specific resources such as AH, CQ, QP, MR etc. Due to which other applications
+in other cgroup or kernel space ULPs may not even get chance to allocate any
+rdma resources. This can lead to service unavailability.
+
+Therefore RDMA controller is needed through which resource consumption
+of processes can be limited. Through this controller different rdma
+resources can be accounted.
+
+1-3. How is RDMA controller implemented?
+----------------------------------------
+
+RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
+resource accounting per cgroup, per device using resource pool structure.
+Each such resource pool is limited up to 64 resources in given resource pool
+by rdma cgroup, which can be extended later if required.
+
+This resource pool object is linked to the cgroup css. Typically there
+are 0 to 4 resource pool instances per cgroup, per device in most use cases.
+But nothing limits to have it more. At present hundreds of RDMA devices per
+single cgroup may not be handled optimally, however there is no
+known use case or requirement for such configuration either.
+
+Since RDMA resources can be allocated from any process and can be freed by any
+of the child processes which shares the address space, rdma resources are
+always owned by the creator cgroup css. This allows process migration from one
+to other cgroup without major complexity of transferring resource ownership;
+because such ownership is not really present due to shared nature of
+rdma resources. Linking resources around css also ensures that cgroups can be
+deleted after processes migrated. This allow progress migration as well with
+active resources, even though that is not a primary use case.
+
+Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
+the caller. Same rdma cgroup should be passed while uncharging the resource.
+This also allows process migrated with active RDMA resource to charge
+to new owner cgroup for new resource. It also allows to uncharge resource of
+a process from previously charged cgroup which is migrated to new cgroup,
+even though that is not a primary use case.
+
+Resource pool object is created in following situations.
+(a) User sets the limit and no previous resource pool exist for the device
+of interest for the cgroup.
+(b) No resource limits were configured, but IB/RDMA stack tries to
+charge the resource. So that it correctly uncharge them when applications are
+running without limits and later on when limits are enforced during uncharging,
+otherwise usage count will drop to negative.
+
+Resource pool is destroyed if all the resource limits are set to max and
+it is the last resource getting deallocated.
+
+User should set all the limit to max value if it intents to remove/unconfigure
+the resource pool for a particular device.
+
+IB stack honors limits enforced by the rdma controller. When application
+query about maximum resource limits of IB device, it returns minimum of
+what is configured by user for a given cgroup and what is supported by
+IB device.
+
+Following resources can be accounted by rdma controller.
+
+ ========== =============================
+ hca_handle Maximum number of HCA Handles
+ hca_object Maximum number of HCA Objects
+ ========== =============================
+
+2. Usage Examples
+=================
+
+(a) Configure resource limit::
+
+ echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
+ echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
+
+(b) Query resource limit::
+
+ cat /sys/fs/cgroup/rdma/2/rdma.max
+ #Output:
+ mlx4_0 hca_handle=2 hca_object=2000
+ ocrdma1 hca_handle=3 hca_object=max
+
+(c) Query current usage::
+
+ cat /sys/fs/cgroup/rdma/2/rdma.current
+ #Output:
+ mlx4_0 hca_handle=1 hca_object=20
+ ocrdma1 hca_handle=1 hca_object=23
+
+(d) Delete resource limit::
+
+ echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index cf88c1f98270..3b29005aa981 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and
conventions of cgroup v2. It describes all userland-visible aspects
of cgroup including core and specific controller behaviors. All
future changes must be reflected in this document. Documentation for
-v1 is available under Documentation/cgroup-v1/.
+v1 is available under Documentation/admin-guide/cgroup-v1/.
.. CONTENTS
@@ -705,6 +705,12 @@ Conventions
informational files on the root cgroup which end up showing global
information available elsewhere shouldn't exist.
+- The default time unit is microseconds. If a different unit is ever
+ used, an explicit unit suffix must be present.
+
+- A parts-per quantity should use a percentage decimal with at least
+ two digit fractional part - e.g. 13.40.
+
- If a controller implements weight based resource distribution, its
interface file should be named "weight" and have the range [1,
10000] with 100 as the default. The values are chosen to allow
@@ -1008,7 +1014,7 @@ All time durations are in microseconds.
A read-only nested-key file which exists on non-root cgroups.
Shows pressure stall information for CPU. See
- Documentation/accounting/psi.txt for details.
+ Documentation/accounting/psi.rst for details.
Memory
@@ -1140,6 +1146,11 @@ PAGE_SIZE multiple when read back.
otherwise, a value change in this file generates a file
modified event.
+ Note that all fields in this file are hierarchical and the
+ file modified event can be generated due to an event down the
+ hierarchy. For for the local events at the cgroup level see
+ memory.events.local.
+
low
The number of times the cgroup is reclaimed due to
high memory pressure even though its usage is under
@@ -1179,6 +1190,11 @@ PAGE_SIZE multiple when read back.
The number of processes belonging to this cgroup
killed by any kind of OOM killer.
+ memory.events.local
+ Similar to memory.events but the fields in the file are local
+ to the cgroup i.e. not hierarchical. The file modified event
+ generated on this file reflects only the local events.
+
memory.stat
A read-only flat-keyed file which exists on non-root cgroups.
@@ -1339,7 +1355,7 @@ PAGE_SIZE multiple when read back.
A read-only nested-key file which exists on non-root cgroups.
Shows pressure stall information for memory. See
- Documentation/accounting/psi.txt for details.
+ Documentation/accounting/psi.rst for details.
Usage Guidelines
@@ -1482,7 +1498,7 @@ IO Interface Files
A read-only nested-key file which exists on non-root cgroups.
Shows pressure stall information for IO. See
- Documentation/accounting/psi.txt for details.
+ Documentation/accounting/psi.rst for details.
Writeback
@@ -2108,7 +2124,7 @@ following two functions.
a queue (device) has been associated with the bio and
before submission.
- wbc_account_io(@wbc, @page, @bytes)
+ wbc_account_cgroup_owner(@wbc, @page, @bytes)
Should be called for each data segment being written out.
While this function doesn't care exactly when it's called
during the writeback session, it's the easiest and most
diff --git a/Documentation/clearing-warn-once.txt b/Documentation/admin-guide/clearing-warn-once.rst
index 211fd926cf00..211fd926cf00 100644
--- a/Documentation/clearing-warn-once.txt
+++ b/Documentation/admin-guide/clearing-warn-once.rst
diff --git a/Documentation/cpu-load.txt b/Documentation/admin-guide/cpu-load.rst
index 2d01ce43d2a2..2d01ce43d2a2 100644
--- a/Documentation/cpu-load.txt
+++ b/Documentation/admin-guide/cpu-load.rst
diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst
new file mode 100644
index 000000000000..b90dafcc8237
--- /dev/null
+++ b/Documentation/admin-guide/cputopology.rst
@@ -0,0 +1,177 @@
+===========================================
+How CPU topology info is exported via sysfs
+===========================================
+
+Export CPU topology info via sysfs. Items (attributes) are similar
+to /proc/cpuinfo output of some architectures. They reside in
+/sys/devices/system/cpu/cpuX/topology/:
+
+physical_package_id:
+
+ physical package id of cpuX. Typically corresponds to a physical
+ socket number, but the actual value is architecture and platform
+ dependent.
+
+die_id:
+
+ the CPU die ID of cpuX. Typically it is the hardware platform's
+ identifier (rather than the kernel's). The actual value is
+ architecture and platform dependent.
+
+core_id:
+
+ the CPU core ID of cpuX. Typically it is the hardware platform's
+ identifier (rather than the kernel's). The actual value is
+ architecture and platform dependent.
+
+book_id:
+
+ the book ID of cpuX. Typically it is the hardware platform's
+ identifier (rather than the kernel's). The actual value is
+ architecture and platform dependent.
+
+drawer_id:
+
+ the drawer ID of cpuX. Typically it is the hardware platform's
+ identifier (rather than the kernel's). The actual value is
+ architecture and platform dependent.
+
+core_cpus:
+
+ internal kernel map of CPUs within the same core.
+ (deprecated name: "thread_siblings")
+
+core_cpus_list:
+
+ human-readable list of CPUs within the same core.
+ (deprecated name: "thread_siblings_list");
+
+package_cpus:
+
+ internal kernel map of the CPUs sharing the same physical_package_id.
+ (deprecated name: "core_siblings")
+
+package_cpus_list:
+
+ human-readable list of CPUs sharing the same physical_package_id.
+ (deprecated name: "core_siblings_list")
+
+die_cpus:
+
+ internal kernel map of CPUs within the same die.
+
+die_cpus_list:
+
+ human-readable list of CPUs within the same die.
+
+book_siblings:
+
+ internal kernel map of cpuX's hardware threads within the same
+ book_id.
+
+book_siblings_list:
+
+ human-readable list of cpuX's hardware threads within the same
+ book_id.
+
+drawer_siblings:
+
+ internal kernel map of cpuX's hardware threads within the same
+ drawer_id.
+
+drawer_siblings_list:
+
+ human-readable list of cpuX's hardware threads within the same
+ drawer_id.
+
+Architecture-neutral, drivers/base/topology.c, exports these attributes.
+However, the book and drawer related sysfs files will only be created if
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
+
+CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
+where they reflect the cpu and cache hierarchy.
+
+For an architecture to support this feature, it must define some of
+these macros in include/asm-XXX/topology.h::
+
+ #define topology_physical_package_id(cpu)
+ #define topology_die_id(cpu)
+ #define topology_core_id(cpu)
+ #define topology_book_id(cpu)
+ #define topology_drawer_id(cpu)
+ #define topology_sibling_cpumask(cpu)
+ #define topology_core_cpumask(cpu)
+ #define topology_die_cpumask(cpu)
+ #define topology_book_cpumask(cpu)
+ #define topology_drawer_cpumask(cpu)
+
+The type of ``**_id macros`` is int.
+The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
+correspond with appropriate ``**_siblings`` sysfs attributes (except for
+topology_sibling_cpumask() which corresponds with thread_siblings).
+
+To be consistent on all architectures, include/linux/topology.h
+provides default definitions for any of the above macros that are
+not defined by include/asm-XXX/topology.h:
+
+1) topology_physical_package_id: -1
+2) topology_die_id: -1
+3) topology_core_id: 0
+4) topology_sibling_cpumask: just the given CPU
+5) topology_core_cpumask: just the given CPU
+6) topology_die_cpumask: just the given CPU
+
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
+no default definitions for topology_drawer_id() and topology_drawer_cpumask().
+
+Additionally, CPU topology information is provided under
+/sys/devices/system/cpu and includes these files. The internal
+source for the output is in brackets ("[]").
+
+ =========== ==========================================================
+ kernel_max: the maximum CPU index allowed by the kernel configuration.
+ [NR_CPUS-1]
+
+ offline: CPUs that are not online because they have been
+ HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
+ of CPUs allowed by the kernel configuration (kernel_max
+ above). [~cpu_online_mask + cpus >= NR_CPUS]
+
+ online: CPUs that are online and being scheduled [cpu_online_mask]
+
+ possible: CPUs that have been allocated resources and can be
+ brought online if they are present. [cpu_possible_mask]
+
+ present: CPUs that have been identified as being present in the
+ system. [cpu_present_mask]
+ =========== ==========================================================
+
+The format for the above output is compatible with cpulist_parse()
+[see <linux/cpumask.h>]. Some examples follow.
+
+In this example, there are 64 CPUs in the system but cpus 32-63 exceed
+the kernel max which is limited to 0..31 by the NR_CPUS config option
+being 32. Note also that CPUs 2 and 4-31 are not online but could be
+brought online as they are both present and possible::
+
+ kernel_max: 31
+ offline: 2,4-31,32-63
+ online: 0-1,3
+ possible: 0-31
+ present: 0-31
+
+In this example, the NR_CPUS config option is 128, but the kernel was
+started with possible_cpus=144. There are 4 CPUs in the system and cpu2
+was manually taken offline (and is the only CPU that can be brought
+online.)::
+
+ kernel_max: 127
+ offline: 2,4-127,128-143
+ online: 0-1,3
+ possible: 0-127
+ present: 0-3
+
+See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
+as well as more information on the various cpumasks.
diff --git a/Documentation/device-mapper/cache-policies.rst b/Documentation/admin-guide/device-mapper/cache-policies.rst
index b17fe352fc41..b17fe352fc41 100644
--- a/Documentation/device-mapper/cache-policies.rst
+++ b/Documentation/admin-guide/device-mapper/cache-policies.rst
diff --git a/Documentation/device-mapper/cache.rst b/Documentation/admin-guide/device-mapper/cache.rst
index f15e5254d05b..f15e5254d05b 100644
--- a/Documentation/device-mapper/cache.rst
+++ b/Documentation/admin-guide/device-mapper/cache.rst
diff --git a/Documentation/device-mapper/delay.rst b/Documentation/admin-guide/device-mapper/delay.rst
index 917ba8c33359..917ba8c33359 100644
--- a/Documentation/device-mapper/delay.rst
+++ b/Documentation/admin-guide/device-mapper/delay.rst
diff --git a/Documentation/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst
index 8f4a3f889d43..8f4a3f889d43 100644
--- a/Documentation/device-mapper/dm-crypt.rst
+++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst
diff --git a/Documentation/device-mapper/dm-dust.txt b/Documentation/admin-guide/device-mapper/dm-dust.txt
index 954d402a1f6a..954d402a1f6a 100644
--- a/Documentation/device-mapper/dm-dust.txt
+++ b/Documentation/admin-guide/device-mapper/dm-dust.txt
diff --git a/Documentation/device-mapper/dm-flakey.rst b/Documentation/admin-guide/device-mapper/dm-flakey.rst
index 86138735879d..86138735879d 100644
--- a/Documentation/device-mapper/dm-flakey.rst
+++ b/Documentation/admin-guide/device-mapper/dm-flakey.rst
diff --git a/Documentation/device-mapper/dm-init.rst b/Documentation/admin-guide/device-mapper/dm-init.rst
index e5242ff17e9b..e5242ff17e9b 100644
--- a/Documentation/device-mapper/dm-init.rst
+++ b/Documentation/admin-guide/device-mapper/dm-init.rst
diff --git a/Documentation/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst
index a30aa91b5fbe..a30aa91b5fbe 100644
--- a/Documentation/device-mapper/dm-integrity.rst
+++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst
diff --git a/Documentation/device-mapper/dm-io.rst b/Documentation/admin-guide/device-mapper/dm-io.rst
index d2492917a1f5..d2492917a1f5 100644
--- a/Documentation/device-mapper/dm-io.rst
+++ b/Documentation/admin-guide/device-mapper/dm-io.rst
diff --git a/Documentation/device-mapper/dm-log.rst b/Documentation/admin-guide/device-mapper/dm-log.rst
index ba4fce39bc27..ba4fce39bc27 100644
--- a/Documentation/device-mapper/dm-log.rst
+++ b/Documentation/admin-guide/device-mapper/dm-log.rst
diff --git a/Documentation/device-mapper/dm-queue-length.rst b/Documentation/admin-guide/device-mapper/dm-queue-length.rst
index d8e381c1cb02..d8e381c1cb02 100644
--- a/Documentation/device-mapper/dm-queue-length.rst
+++ b/Documentation/admin-guide/device-mapper/dm-queue-length.rst
diff --git a/Documentation/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst
index 2fe255b130fb..2fe255b130fb 100644
--- a/Documentation/device-mapper/dm-raid.rst
+++ b/Documentation/admin-guide/device-mapper/dm-raid.rst
diff --git a/Documentation/device-mapper/dm-service-time.rst b/Documentation/admin-guide/device-mapper/dm-service-time.rst
index facf277fc13c..facf277fc13c 100644
--- a/Documentation/device-mapper/dm-service-time.rst
+++ b/Documentation/admin-guide/device-mapper/dm-service-time.rst
diff --git a/Documentation/device-mapper/dm-uevent.rst b/Documentation/admin-guide/device-mapper/dm-uevent.rst
index 4a8ee8d069c9..4a8ee8d069c9 100644
--- a/Documentation/device-mapper/dm-uevent.rst
+++ b/Documentation/admin-guide/device-mapper/dm-uevent.rst
diff --git a/Documentation/device-mapper/dm-zoned.rst b/Documentation/admin-guide/device-mapper/dm-zoned.rst
index 07f56ebc1730..07f56ebc1730 100644
--- a/Documentation/device-mapper/dm-zoned.rst
+++ b/Documentation/admin-guide/device-mapper/dm-zoned.rst
diff --git a/Documentation/device-mapper/era.rst b/Documentation/admin-guide/device-mapper/era.rst
index 90dd5c670b9f..90dd5c670b9f 100644
--- a/Documentation/device-mapper/era.rst
+++ b/Documentation/admin-guide/device-mapper/era.rst
diff --git a/Documentation/admin-guide/device-mapper/index.rst b/Documentation/admin-guide/device-mapper/index.rst
new file mode 100644
index 000000000000..c77c58b8f67b
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/index.rst
@@ -0,0 +1,42 @@
+=============
+Device Mapper
+=============
+
+.. toctree::
+ :maxdepth: 1
+
+ cache-policies
+ cache
+ delay
+ dm-crypt
+ dm-flakey
+ dm-init
+ dm-integrity
+ dm-io
+ dm-log
+ dm-queue-length
+ dm-raid
+ dm-service-time
+ dm-uevent
+ dm-zoned
+ era
+ kcopyd
+ linear
+ log-writes
+ persistent-data
+ snapshot
+ statistics
+ striped
+ switch
+ thin-provisioning
+ unstriped
+ verity
+ writecache
+ zero
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/device-mapper/kcopyd.rst b/Documentation/admin-guide/device-mapper/kcopyd.rst
index 7651d395127f..7651d395127f 100644
--- a/Documentation/device-mapper/kcopyd.rst
+++ b/Documentation/admin-guide/device-mapper/kcopyd.rst
diff --git a/Documentation/device-mapper/linear.rst b/Documentation/admin-guide/device-mapper/linear.rst
index 9d17fc6e64a9..9d17fc6e64a9 100644
--- a/Documentation/device-mapper/linear.rst
+++ b/Documentation/admin-guide/device-mapper/linear.rst
diff --git a/Documentation/device-mapper/log-writes.rst b/Documentation/admin-guide/device-mapper/log-writes.rst
index 23141f2ffb7c..23141f2ffb7c 100644
--- a/Documentation/device-mapper/log-writes.rst
+++ b/Documentation/admin-guide/device-mapper/log-writes.rst
diff --git a/Documentation/device-mapper/persistent-data.rst b/Documentation/admin-guide/device-mapper/persistent-data.rst
index 2065c3c5a091..2065c3c5a091 100644
--- a/Documentation/device-mapper/persistent-data.rst
+++ b/Documentation/admin-guide/device-mapper/persistent-data.rst
diff --git a/Documentation/admin-guide/device-mapper/snapshot.rst b/Documentation/admin-guide/device-mapper/snapshot.rst
new file mode 100644
index 000000000000..ccdd8b587a74
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/snapshot.rst
@@ -0,0 +1,196 @@
+==============================
+Device-mapper snapshot support
+==============================
+
+Device-mapper allows you, without massive data copying:
+
+- To create snapshots of any block device i.e. mountable, saved states of
+ the block device which are also writable without interfering with the
+ original content;
+- To create device "forks", i.e. multiple different versions of the
+ same data stream.
+- To merge a snapshot of a block device back into the snapshot's origin
+ device.
+
+In the first two cases, dm copies only the chunks of data that get
+changed and uses a separate copy-on-write (COW) block device for
+storage.
+
+For snapshot merge the contents of the COW storage are merged back into
+the origin device.
+
+
+There are three dm targets available:
+snapshot, snapshot-origin, and snapshot-merge.
+
+- snapshot-origin <origin>
+
+which will normally have one or more snapshots based on it.
+Reads will be mapped directly to the backing device. For each write, the
+original data will be saved in the <COW device> of each snapshot to keep
+its visible content unchanged, at least until the <COW device> fills up.
+
+
+- snapshot <origin> <COW device> <persistent?> <chunksize>
+ [<# feature args> [<arg>]*]
+
+A snapshot of the <origin> block device is created. Changed chunks of
+<chunksize> sectors will be stored on the <COW device>. Writes will
+only go to the <COW device>. Reads will come from the <COW device> or
+from <origin> for unchanged data. <COW device> will often be
+smaller than the origin and if it fills up the snapshot will become
+useless and be disabled, returning errors. So it is important to monitor
+the amount of free space and expand the <COW device> before it fills up.
+
+<persistent?> is P (Persistent) or N (Not persistent - will not survive
+after reboot). O (Overflow) can be added as a persistent store option
+to allow userspace to advertise its support for seeing "Overflow" in the
+snapshot status. So supported store types are "P", "PO" and "N".
+
+The difference between persistent and transient is with transient
+snapshots less metadata must be saved on disk - they can be kept in
+memory by the kernel.
+
+When loading or unloading the snapshot target, the corresponding
+snapshot-origin or snapshot-merge target must be suspended. A failure to
+suspend the origin target could result in data corruption.
+
+Optional features:
+
+ discard_zeroes_cow - a discard issued to the snapshot device that
+ maps to entire chunks to will zero the corresponding exception(s) in
+ the snapshot's exception store.
+
+ discard_passdown_origin - a discard to the snapshot device is passed
+ down to the snapshot-origin's underlying device. This doesn't cause
+ copy-out to the snapshot exception store because the snapshot-origin
+ target is bypassed.
+
+ The discard_passdown_origin feature depends on the discard_zeroes_cow
+ feature being enabled.
+
+
+- snapshot-merge <origin> <COW device> <persistent> <chunksize>
+ [<# feature args> [<arg>]*]
+
+takes the same table arguments as the snapshot target except it only
+works with persistent snapshots. This target assumes the role of the
+"snapshot-origin" target and must not be loaded if the "snapshot-origin"
+is still present for <origin>.
+
+Creates a merging snapshot that takes control of the changed chunks
+stored in the <COW device> of an existing snapshot, through a handover
+procedure, and merges these chunks back into the <origin>. Once merging
+has started (in the background) the <origin> may be opened and the merge
+will continue while I/O is flowing to it. Changes to the <origin> are
+deferred until the merging snapshot's corresponding chunk(s) have been
+merged. Once merging has started the snapshot device, associated with
+the "snapshot" target, will return -EIO when accessed.
+
+
+How snapshot is used by LVM2
+============================
+When you create the first LVM2 snapshot of a volume, four dm devices are used:
+
+1) a device containing the original mapping table of the source volume;
+2) a device used as the <COW device>;
+3) a "snapshot" device, combining #1 and #2, which is the visible snapshot
+ volume;
+4) the "original" volume (which uses the device number used by the original
+ source volume), whose table is replaced by a "snapshot-origin" mapping
+ from device #1.
+
+A fixed naming scheme is used, so with the following commands::
+
+ lvcreate -L 1G -n base volumeGroup
+ lvcreate -L 100M --snapshot -n snap volumeGroup/base
+
+we'll have this situation (with volumes in above order)::
+
+ # dmsetup table|grep volumeGroup
+
+ volumeGroup-base-real: 0 2097152 linear 8:19 384
+ volumeGroup-snap-cow: 0 204800 linear 8:19 2097536
+ volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16
+ volumeGroup-base: 0 2097152 snapshot-origin 254:11
+
+ # ls -lL /dev/mapper/volumeGroup-*
+ brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
+ brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
+ brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
+
+
+How snapshot-merge is used by LVM2
+==================================
+A merging snapshot assumes the role of the "snapshot-origin" while
+merging. As such the "snapshot-origin" is replaced with
+"snapshot-merge". The "-real" device is not changed and the "-cow"
+device is renamed to <origin name>-cow to aid LVM2's cleanup of the
+merging snapshot after it completes. The "snapshot" that hands over its
+COW device to the "snapshot-merge" is deactivated (unless using lvchange
+--refresh); but if it is left active it will simply return I/O errors.
+
+A snapshot will merge into its origin with the following command::
+
+ lvconvert --merge volumeGroup/snap
+
+we'll now have this situation::
+
+ # dmsetup table|grep volumeGroup
+
+ volumeGroup-base-real: 0 2097152 linear 8:19 384
+ volumeGroup-base-cow: 0 204800 linear 8:19 2097536
+ volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
+
+ # ls -lL /dev/mapper/volumeGroup-*
+ brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
+ brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
+ brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
+
+
+How to determine when a merging is complete
+===========================================
+The snapshot-merge and snapshot status lines end with:
+
+ <sectors_allocated>/<total_sectors> <metadata_sectors>
+
+Both <sectors_allocated> and <total_sectors> include both data and metadata.
+During merging, the number of sectors allocated gets smaller and
+smaller. Merging has finished when the number of sectors holding data
+is zero, in other words <sectors_allocated> == <metadata_sectors>.
+
+Here is a practical example (using a hybrid of lvm and dmsetup commands)::
+
+ # lvs
+ LV VG Attr LSize Origin Snap% Move Log Copy% Convert
+ base volumeGroup owi-a- 4.00g
+ snap volumeGroup swi-a- 1.00g base 18.97
+
+ # dmsetup status volumeGroup-snap
+ 0 8388608 snapshot 397896/2097152 1560
+ ^^^^ metadata sectors
+
+ # lvconvert --merge -b volumeGroup/snap
+ Merging of volume snap started.
+
+ # lvs volumeGroup/snap
+ LV VG Attr LSize Origin Snap% Move Log Copy% Convert
+ base volumeGroup Owi-a- 4.00g 17.23
+
+ # dmsetup status volumeGroup-base
+ 0 8388608 snapshot-merge 281688/2097152 1104
+
+ # dmsetup status volumeGroup-base
+ 0 8388608 snapshot-merge 180480/2097152 712
+
+ # dmsetup status volumeGroup-base
+ 0 8388608 snapshot-merge 16/2097152 16
+
+Merging has finished.
+
+::
+
+ # lvs
+ LV VG Attr LSize Origin Snap% Move Log Copy% Convert
+ base volumeGroup owi-a- 4.00g
diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst
new file mode 100644
index 000000000000..41ded0bc5933
--- /dev/null
+++ b/Documentation/admin-guide/device-mapper/statistics.rst
@@ -0,0 +1,225 @@
+=============
+DM statistics
+=============
+
+Device Mapper supports the collection of I/O statistics on user-defined
+regions of a DM device. If no regions are defined no statistics are
+collected so there isn't any performance impact. Only bio-based DM
+devices are currently supported.
+
+Each user-defined region specifies a starting sector, length and step.
+Individual statistics will be collected for each step-sized area within
+the range specified.
+
+The I/O statistics counters for each step-sized area of a region are
+in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see:
+Documentation/admin-guide/iostats.rst). But two extra counters (12 and 13) are
+provided: total time spent reading and writing. When the histogram
+argument is used, the 14th parameter is reported that represents the
+histogram of latencies. All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
+
+The reported times are in milliseconds and the granularity depends on
+the kernel ticks. When the option precise_timestamps is used, the
+reported times are in nanoseconds.
+
+Each region has a corresponding unique identifier, which we call a
+region_id, that is assigned when the region is created. The region_id
+must be supplied when querying statistics about the region, deleting the
+region, etc. Unique region_ids enable multiple userspace programs to
+request and process statistics for the same DM device without stepping
+on each other's data.
+
+The creation of DM statistics will allocate memory via kmalloc or
+fallback to using vmalloc space. At most, 1/4 of the overall system
+memory may be allocated by DM statistics. The admin can see how much
+memory is used by reading:
+
+ /sys/module/dm_mod/parameters/stats_current_allocated_bytes
+
+Messages
+========
+
+ @stats_create <range> <step> [<number_of_optional_arguments> <optional_arguments>...] [<program_id> [<aux_data>]]
+ Create a new region and return the region_id.
+
+ <range>
+ "-"
+ whole device
+ "<start_sector>+<length>"
+ a range of <length> 512-byte sectors
+ starting with <start_sector>.
+
+ <step>
+ "<area_size>"
+ the range is subdivided into areas each containing
+ <area_size> sectors.
+ "/<number_of_areas>"
+ the range is subdivided into the specified
+ number of areas.
+
+ <number_of_optional_arguments>
+ The number of optional arguments
+
+ <optional_arguments>
+ The following optional arguments are supported:
+
+ precise_timestamps
+ use precise timer with nanosecond resolution
+ instead of the "jiffies" variable. When this argument is
+ used, the resulting times are in nanoseconds instead of
+ milliseconds. Precise timestamps are a little bit slower
+ to obtain than jiffies-based timestamps.
+ histogram:n1,n2,n3,n4,...
+ collect histogram of latencies. The
+ numbers n1, n2, etc are times that represent the boundaries
+ of the histogram. If precise_timestamps is not used, the
+ times are in milliseconds, otherwise they are in
+ nanoseconds. For each range, the kernel will report the
+ number of requests that completed within this range. For
+ example, if we use "histogram:10,20,30", the kernel will
+ report four numbers a:b:c:d. a is the number of requests
+ that took 0-10 ms to complete, b is the number of requests
+ that took 10-20 ms to complete, c is the number of requests
+ that took 20-30 ms to complete and d is the number of
+ requests that took more than 30 ms to complete.
+
+ <program_id>
+ An optional parameter. A name that uniquely identifies
+ the userspace owner of the range. This groups ranges together
+ so that userspace programs can identify the ranges they
+ created and ignore those created by others.
+ The kernel returns this string back in the output of
+ @stats_list message, but it doesn't use it for anything else.
+ If we omit the number of optional arguments, program id must not
+ be a number, otherwise it would be interpreted as the number of
+ optional arguments.
+
+ <aux_data>
+ An optional parameter. A word that provides auxiliary data
+ that is useful to the client program that created the range.
+ The kernel returns this string back in the output of
+ @stats_list message, but it doesn't use this value for anything.
+
+ @stats_delete <region_id>
+ Delete the region with the specified id.
+
+ <region_id>
+ region_id returned from @stats_create
+
+ @stats_clear <region_id>
+ Clear all the counters except the in-flight i/o counters.
+
+ <region_id>
+ region_id returned from @stats_create
+
+ @stats_list [<program_id>]
+ List all regions registered with @stats_create.
+
+ <program_id>
+ An optional parameter.
+ If this parameter is specified, only matching regions
+ are returned.
+ If it is not specified, all regions are returned.
+
+ Output format:
+ <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
+ precise_timestamps histogram:n1,n2,n3,...
+
+ The strings "precise_timestamps" and "histogram" are printed only
+ if they were specified when creating the region.
+
+ @stats_print <region_id> [<starting_line> <number_of_lines>]
+ Print counters for each step-sized area of a region.
+
+ <region_id>
+ region_id returned from @stats_create
+
+ <starting_line>
+ The index of the starting line in the output.
+ If omitted, all lines are returned.
+
+ <number_of_lines>
+ The number of lines to include in the output.
+ If omitted, all lines are returned.
+
+ Output format for each step-sized area of a region:
+
+ <start_sector>+<length>
+ counters
+
+ The first 11 counters have the same meaning as
+ `/sys/block/*/stat or /proc/diskstats`.
+
+ Please refer to Documentation/admin-guide/iostats.rst for details.
+
+ 1. the number of reads completed
+ 2. the number of reads merged
+ 3. the number of sectors read
+ 4. the number of milliseconds spent reading
+ 5. the number of writes completed
+ 6. the number of writes merged
+ 7. the number of sectors written
+ 8. the number of milliseconds spent writing
+ 9. the number of I/Os currently in progress
+ 10. the number of milliseconds spent doing I/Os
+ 11. the weighted number of milliseconds spent doing I/Os
+
+ Additional counters:
+
+ 12. the total time spent reading in milliseconds
+ 13. the total time spent writing in milliseconds
+
+ @stats_print_clear <region_id> [<starting_line> <number_of_lines>]
+ Atomically print and then clear all the counters except the
+ in-flight i/o counters. Useful when the client consuming the
+ statistics does not want to lose any statistics (those updated
+ between printing and clearing).
+
+ <region_id>
+ region_id returned from @stats_create
+
+ <starting_line>
+ The index of the starting line in the output.
+ If omitted, all lines are printed and then cleared.
+
+ <number_of_lines>
+ The number of lines to process.
+ If omitted, all lines are printed and then cleared.
+
+ @stats_set_aux <region_id> <aux_data>
+ Store auxiliary data aux_data for the specified region.
+
+ <region_id>
+ region_id returned from @stats_create
+
+ <aux_data>
+ The string that identifies data which is useful to the client
+ program that created the range. The kernel returns this
+ string back in the output of @stats_list message, but it
+ doesn't use this value for anything.
+
+Examples
+========
+
+Subdivide the DM device 'vol' into 100 pieces and start collecting
+statistics on them::
+
+ dmsetup message vol 0 @stats_create - /100
+
+Set the auxiliary data string to "foo bar baz" (the escape for each
+space must also be escaped, otherwise the shell will consume them)::
+
+ dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz
+
+List the statistics::
+
+ dmsetup message vol 0 @stats_list
+
+Print the statistics::
+
+ dmsetup message vol 0 @stats_print 0
+
+Delete the statistics::
+
+ dmsetup message vol 0 @stats_delete 0
diff --git a/Documentation/device-mapper/striped.rst b/Documentation/admin-guide/device-mapper/striped.rst
index e9a8da192ae1..e9a8da192ae1 100644
--- a/Documentation/device-mapper/striped.rst
+++ b/Documentation/admin-guide/device-mapper/striped.rst
diff --git a/Documentation/device-mapper/switch.rst b/Documentation/admin-guide/device-mapper/switch.rst
index 7dde06be1a4f..7dde06be1a4f 100644
--- a/Documentation/device-mapper/switch.rst
+++ b/Documentation/admin-guide/device-mapper/switch.rst
diff --git a/Documentation/device-mapper/thin-provisioning.rst b/Documentation/admin-guide/device-mapper/thin-provisioning.rst
index bafebf79da4b..bafebf79da4b 100644
--- a/Documentation/device-mapper/thin-provisioning.rst
+++ b/Documentation/admin-guide/device-mapper/thin-provisioning.rst
diff --git a/Documentation/device-mapper/unstriped.rst b/Documentation/admin-guide/device-mapper/unstriped.rst
index 0a8d3eb3f072..0a8d3eb3f072 100644
--- a/Documentation/device-mapper/unstriped.rst
+++ b/Documentation/admin-guide/device-mapper/unstriped.rst
diff --git a/Documentation/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst
index a4d1c1476d72..a4d1c1476d72 100644
--- a/Documentation/device-mapper/verity.rst
+++ b/Documentation/admin-guide/device-mapper/verity.rst
diff --git a/Documentation/device-mapper/writecache.rst b/Documentation/admin-guide/device-mapper/writecache.rst
index d3d7690f5e8d..d3d7690f5e8d 100644
--- a/Documentation/device-mapper/writecache.rst
+++ b/Documentation/admin-guide/device-mapper/writecache.rst
diff --git a/Documentation/device-mapper/zero.rst b/Documentation/admin-guide/device-mapper/zero.rst
index 11fb5cf4597c..11fb5cf4597c 100644
--- a/Documentation/device-mapper/zero.rst
+++ b/Documentation/admin-guide/device-mapper/zero.rst
diff --git a/Documentation/admin-guide/devices.txt b/Documentation/admin-guide/devices.txt
index 1649117e6087..e56e00655153 100644
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -2693,8 +2693,8 @@
41 = /dev/ttySMX0 Motorola i.MX - port 0
42 = /dev/ttySMX1 Motorola i.MX - port 1
43 = /dev/ttySMX2 Motorola i.MX - port 2
- 44 = /dev/ttyMM0 Marvell MPSC - port 0
- 45 = /dev/ttyMM1 Marvell MPSC - port 1
+ 44 = /dev/ttyMM0 Marvell MPSC - port 0 (obsolete unused)
+ 45 = /dev/ttyMM1 Marvell MPSC - port 1 (obsolete unused)
46 = /dev/ttyCPM0 PPC CPM (SCC or SMC) - port 0
...
47 = /dev/ttyCPM5 PPC CPM (SCC or SMC) - port 5
diff --git a/Documentation/efi-stub.txt b/Documentation/admin-guide/efi-stub.rst
index 833edb0d0bc4..833edb0d0bc4 100644
--- a/Documentation/efi-stub.txt
+++ b/Documentation/admin-guide/efi-stub.rst
diff --git a/Documentation/admin-guide/gpio/index.rst b/Documentation/admin-guide/gpio/index.rst
new file mode 100644
index 000000000000..a244ba4e87d5
--- /dev/null
+++ b/Documentation/admin-guide/gpio/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+gpio
+====
+
+.. toctree::
+ :maxdepth: 1
+
+ sysfs
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/gpio/sysfs.rst b/Documentation/admin-guide/gpio/sysfs.rst
index ec09ffd983e7..ec09ffd983e7 100644
--- a/Documentation/gpio/sysfs.rst
+++ b/Documentation/admin-guide/gpio/sysfs.rst
diff --git a/Documentation/highuid.txt b/Documentation/admin-guide/highuid.rst
index 6ee70465c0ea..6ee70465c0ea 100644
--- a/Documentation/highuid.txt
+++ b/Documentation/admin-guide/highuid.rst
diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst
index 31653a9f0e1b..f83212fae4d5 100644
--- a/Documentation/admin-guide/hw-vuln/l1tf.rst
+++ b/Documentation/admin-guide/hw-vuln/l1tf.rst
@@ -241,7 +241,7 @@ Guest mitigation mechanisms
For further information about confining guests to a single or to a group
of cores consult the cpusets documentation:
- https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
+ https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v1/cpusets.rst
.. _interrupt_isolation:
diff --git a/Documentation/hw_random.txt b/Documentation/admin-guide/hw_random.rst
index 121de96e395e..121de96e395e 100644
--- a/Documentation/hw_random.txt
+++ b/Documentation/admin-guide/hw_random.rst
diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst
index 24fbe0568eff..33feab2f4084 100644
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -16,6 +16,7 @@ etc.
README
kernel-parameters
devices
+ sysctl/index
This section describes CPU vulnerabilities and their mitigations.
@@ -38,6 +39,8 @@ problems and bugs in particular.
ramoops
dynamic-debug-howto
init
+ kdump/index
+ perf/index
This is the beginning of a section with information of interest to
application developers. Documents covering various aspects of the kernel
@@ -56,11 +59,13 @@ configure specific aspects of kernel behavior to your liking.
initrd
cgroup-v2
+ cgroup-v1/index
serial-console
braille-console
parport
md
module-signing
+ rapidio
sysrq
unicode
vga-softcursor
@@ -69,14 +74,38 @@ configure specific aspects of kernel behavior to your liking.
java
ras
bcache
+ blockdev/index
ext4
binderfs
+ xfs
pm/index
thunderbolt
LSM/index
mm/index
+ namespaces/index
perf-security
acpi/index
+ aoe/index
+ btmrvl
+ clearing-warn-once
+ cpu-load
+ cputopology
+ device-mapper/index
+ efi-stub
+ gpio/index
+ highuid
+ hw_random
+ iostats
+ kernel-per-CPU-kthreads
+ laptops/index
+ lcd-panel-cgram
+ ldm
+ lockup-watchdogs
+ numastat
+ pnp
+ rtc
+ svga
+ video-output
.. only:: subproject and html
diff --git a/Documentation/iostats.txt b/Documentation/admin-guide/iostats.rst
index 5d63b18bd6d1..5d63b18bd6d1 100644
--- a/Documentation/iostats.txt
+++ b/Documentation/admin-guide/iostats.rst
diff --git a/Documentation/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt
index 220d0a80ca2c..220d0a80ca2c 100644
--- a/Documentation/kdump/gdbmacros.txt
+++ b/Documentation/admin-guide/kdump/gdbmacros.txt
diff --git a/Documentation/admin-guide/kdump/index.rst b/Documentation/admin-guide/kdump/index.rst
new file mode 100644
index 000000000000..8e2ebd0383cd
--- /dev/null
+++ b/Documentation/admin-guide/kdump/index.rst
@@ -0,0 +1,20 @@
+
+================================================================
+Documentation for Kdump - The kexec-based Crash Dumping Solution
+================================================================
+
+This document includes overview, setup and installation, and analysis
+information.
+
+.. toctree::
+ :maxdepth: 1
+
+ kdump
+ vmcoreinfo
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst
index ac7e131d2935..ac7e131d2935 100644
--- a/Documentation/kdump/kdump.rst
+++ b/Documentation/admin-guide/kdump/kdump.rst
diff --git a/Documentation/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 007a6b86e0ee..007a6b86e0ee 100644
--- a/Documentation/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst
index 5d29ba5ad88c..d05d531b4ec9 100644
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -118,7 +118,7 @@ parameter is applicable::
LOOP Loopback device support is enabled.
M68k M68k architecture is enabled.
These options have more detailed description inside of
- Documentation/m68k/kernel-options.txt.
+ Documentation/m68k/kernel-options.rst.
MDA MDA console support is enabled.
MIPS MIPS architecture is enabled.
MOUSE Appropriate mouse support is enabled.
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 4edf67801420..46b826fcb5ad 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -13,7 +13,7 @@
For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force"
are available
- See also Documentation/power/runtime_pm.txt, pci=noacpi
+ See also Documentation/power/runtime_pm.rst, pci=noacpi
acpi_apic_instance= [ACPI, IOAPIC]
Format: <int>
@@ -223,7 +223,7 @@
acpi_sleep= [HW,ACPI] Sleep options
Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
old_ordering, nonvs, sci_force_enable, nobl }
- See Documentation/power/video.txt for information on
+ See Documentation/power/video.rst for information on
s3_bios and s3_mode.
s3_beep is for debugging; it makes the PC's speaker beep
as soon as the kernel's real-mode entry point is called.
@@ -430,7 +430,7 @@
blkdevparts= Manual partition parsing of block device(s) for
embedded devices based on command line input.
- See Documentation/block/cmdline-partition.txt
+ See Documentation/block/cmdline-partition.rst
boot_delay= Milliseconds to delay each printk during boot.
Values larger than 10 seconds (10000) are changed to
@@ -478,7 +478,7 @@
others).
ccw_timeout_log [S390]
- See Documentation/s390/CommonIO for details.
+ See Documentation/s390/common_io.rst for details.
cgroup_disable= [KNL] Disable a particular controller
Format: {name of the controller(s) to disable}
@@ -516,7 +516,7 @@
/selinux/checkreqprot.
cio_ignore= [S390]
- See Documentation/s390/CommonIO for details.
+ See Documentation/s390/common_io.rst for details.
clk_ignore_unused
[CLK]
Prevents the clock framework from automatically gating
@@ -708,14 +708,14 @@
[KNL, x86_64] select a region under 4G first, and
fall back to reserve region above 4G when '@offset'
hasn't been specified.
- See Documentation/kdump/kdump.rst for further details.
+ See Documentation/admin-guide/kdump/kdump.rst for further details.
crashkernel=range1:size1[,range2:size2,...][@offset]
[KNL] Same as above, but depends on the memory
in the running system. The syntax of range is
start-[end] where start and end are both
a memory unit (amount[KMG]). See also
- Documentation/kdump/kdump.rst for an example.
+ Documentation/admin-guide/kdump/kdump.rst for an example.
crashkernel=size[KMG],high
[KNL, x86_64] range could be above 4G. Allow kernel
@@ -805,12 +805,10 @@
tracking down these problems.
debug_pagealloc=
- [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this
- parameter enables the feature at boot time. In
- default, it is disabled. We can avoid allocating huge
- chunk of memory for debug pagealloc if we don't enable
- it at boot time and the system will work mostly same
- with the kernel built without CONFIG_DEBUG_PAGEALLOC.
+ [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter
+ enables the feature at boot time. By default, it is
+ disabled and the system will work mostly the same as a
+ kernel built without CONFIG_DEBUG_PAGEALLOC.
on: enable the feature
debugpat [X86] Enable PAT debugging
@@ -932,7 +930,7 @@
edid/1680x1050.bin, or edid/1920x1080.bin is given
and no file with the same name exists. Details and
instructions how to build your own EDID data are
- available in Documentation/EDID/howto.rst. An EDID
+ available in Documentation/driver-api/edid.rst. An EDID
data set will only be used for a particular connector,
if its name and a colon are prepended to the EDID
name. Each connector may use a unique EDID data
@@ -1201,15 +1199,15 @@
elevator= [IOSCHED]
Format: { "mq-deadline" | "kyber" | "bfq" }
- See Documentation/block/deadline-iosched.txt,
- Documentation/block/kyber-iosched.txt and
- Documentation/block/bfq-iosched.txt for details.
+ See Documentation/block/deadline-iosched.rst,
+ Documentation/block/kyber-iosched.rst and
+ Documentation/block/bfq-iosched.rst for details.
elfcorehdr=[size[KMG]@]offset[KMG] [IA64,PPC,SH,X86,S390]
Specifies physical address of start of kernel core
image elf header and optionally the size. Generally
kexec loader will pass this option to capture kernel.
- See Documentation/kdump/kdump.rst for details.
+ See Documentation/admin-guide/kdump/kdump.rst for details.
enable_mtrr_cleanup [X86]
The kernel tries to adjust MTRR layout from continuous
@@ -1251,7 +1249,7 @@
See also Documentation/fault-injection/.
floppy= [HW]
- See Documentation/blockdev/floppy.txt.
+ See Documentation/admin-guide/blockdev/floppy.rst.
force_pal_cache_flush
[IA-64] Avoid check_sal_cache_flush which may hang on
@@ -1670,6 +1668,15 @@
initrd= [BOOT] Specify the location of the initial ramdisk
+ init_on_alloc= [MM] Fill newly allocated pages and heap objects with
+ zeroes.
+ Format: 0 | 1
+ Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON.
+
+ init_on_free= [MM] Fill freed pages and heap objects with zeroes.
+ Format: 0 | 1
+ Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
+
init_pkru= [x86] Specify the default memory protection keys rights
register contents for all processes. 0x55555554 by
default (disallow access to all but pkey 0). Can
@@ -2004,6 +2011,19 @@
Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
the default is off.
+ kprobe_event=[probe-list]
+ [FTRACE] Add kprobe events and enable at boot time.
+ The probe-list is a semicolon delimited list of probe
+ definitions. Each definition is same as kprobe_events
+ interface, but the parameters are comma delimited.
+ For example, to add a kprobe event on vfs_read with
+ arg1 and arg2, add to the command line;
+
+ kprobe_event=p,vfs_read,$arg1,$arg2
+
+ See also Documentation/trace/kprobetrace.rst "Kernel
+ Boot Parameter" section.
+
kpti= [ARM64] Control page table isolation of user
and kernel address spaces.
Default: enabled on cores which need mitigation.
@@ -2227,7 +2247,7 @@
memblock=debug [KNL] Enable memblock debug messages.
load_ramdisk= [RAM] List of ramdisks to load from floppy
- See Documentation/blockdev/ramdisk.txt.
+ See Documentation/admin-guide/blockdev/ramdisk.rst.
lockd.nlm_grace_period=P [NFS] Assign grace period.
Format: <integer>
@@ -2870,6 +2890,17 @@
/sys/module/printk/parameters/console_suspend) to
turn on/off it dynamically.
+ novmcoredd [KNL,KDUMP]
+ Disable device dump. Device dump allows drivers to
+ append dump data to vmcore so you can collect driver
+ specified debug info. Drivers can append the data
+ without any limit and this data is stored in memory,
+ so this may cause significant memory stress. Disabling
+ device dump can help save memory but the driver debug
+ data will be no longer available. This parameter
+ is only available when CONFIG_PROC_VMCORE_DEVICE_DUMP
+ is set.
+
noaliencache [MM, NUMA, SLAB] Disables the allocation of alien
caches in the slab allocator. Saves per-node memory,
but will impact performance.
@@ -2925,7 +2956,7 @@
register save and restore. The kernel will only save
legacy floating-point registers on task switch.
- nohugeiomap [KNL,x86] Disable kernel huge I/O mappings.
+ nohugeiomap [KNL,x86,PPC] Disable kernel huge I/O mappings.
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
Equivalent to smt=1.
@@ -3137,7 +3168,7 @@
numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
'node', 'default' can be specified
This can be set from sysctl after boot.
- See Documentation/sysctl/vm.txt for details.
+ See Documentation/admin-guide/sysctl/vm.rst for details.
ohci1394_dma=early [HW] enable debugging via the ohci1394 driver.
See Documentation/debugging-via-ohci1394.txt for more
@@ -3261,7 +3292,7 @@
pcd. [PARIDE]
See header of drivers/block/paride/pcd.c.
- See also Documentation/blockdev/paride.txt.
+ See also Documentation/admin-guide/blockdev/paride.rst.
pci=option[,option...] [PCI] various PCI subsystem options.
@@ -3505,7 +3536,7 @@
needed on a platform with proper driver support.
pd. [PARIDE]
- See Documentation/blockdev/paride.txt.
+ See Documentation/admin-guide/blockdev/paride.rst.
pdcchassis= [PARISC,HW] Disable/Enable PDC Chassis Status codes at
boot time.
@@ -3520,10 +3551,10 @@
and performance comparison.
pf. [PARIDE]
- See Documentation/blockdev/paride.txt.
+ See Documentation/admin-guide/blockdev/paride.rst.
pg. [PARIDE]
- See Documentation/blockdev/paride.txt.
+ See Documentation/admin-guide/blockdev/paride.rst.
pirq= [SMP,APIC] Manual mp-table setup
See Documentation/x86/i386/IO-APIC.rst.
@@ -3635,7 +3666,7 @@
prompt_ramdisk= [RAM] List of RAM disks to prompt for floppy disk
before loading.
- See Documentation/blockdev/ramdisk.txt.
+ See Documentation/admin-guide/blockdev/ramdisk.rst.
psi= [KNL] Enable or disable pressure stall information
tracking.
@@ -3657,7 +3688,7 @@
pstore.backend= Specify the name of the pstore backend to use
pt. [PARIDE]
- See Documentation/blockdev/paride.txt.
+ See Documentation/admin-guide/blockdev/paride.rst.
pti= [X86_64] Control Page Table Isolation of user and
kernel address spaces. Disabling this feature
@@ -3686,7 +3717,7 @@
See Documentation/admin-guide/md.rst.
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
- See Documentation/blockdev/ramdisk.txt.
+ See Documentation/admin-guide/blockdev/ramdisk.rst.
random.trust_cpu={on,off}
[KNL] Enable or disable trusting the use of the
@@ -3750,6 +3781,12 @@
the propagation of recent CPU-hotplug changes up
the rcu_node combining tree.
+ rcutree.use_softirq= [KNL]
+ If set to zero, move all RCU_SOFTIRQ processing to
+ per-CPU rcuc kthreads. Defaults to a non-zero
+ value, meaning that RCU_SOFTIRQ is used by default.
+ Specify rcutree.use_softirq=0 to use rcuc kthreads.
+
rcutree.rcu_fanout_exact= [KNL]
Disable autobalancing of the rcu_node combining
tree. This is used by rcutorture, and might
@@ -4076,7 +4113,7 @@
relax_domain_level=
[KNL, SMP] Set scheduler's default relax_domain_level.
- See Documentation/cgroup-v1/cpusets.txt.
+ See Documentation/admin-guide/cgroup-v1/cpusets.rst.
reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory
Format: <base1>,<size1>[,<base2>,<size2>,...]
@@ -4106,7 +4143,7 @@
Specify the offset from the beginning of the partition
given by "resume=" at which the swap header is located,
in <PAGE_SIZE> units (needed only for swap files).
- See Documentation/power/swsusp-and-swap-files.txt
+ See Documentation/power/swsusp-and-swap-files.rst
resumedelay= [HIBERNATION] Delay (in seconds) to pause before attempting to
read the resume files
@@ -4334,7 +4371,7 @@
Format: <integer>
sonypi.*= [HW] Sony Programmable I/O Control Device driver
- See Documentation/laptops/sonypi.txt
+ See Documentation/admin-guide/laptops/sonypi.rst
spectre_v2= [X86] Control mitigation of Spectre variant 2
(indirect branch speculation) vulnerability.
@@ -4586,7 +4623,7 @@
swapaccount=[0|1]
[KNL] Enable accounting of swap in memory resource
controller if no parameter or 1 is given or disable
- it if 0 is given (See Documentation/cgroup-v1/memory.txt)
+ it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
swiotlb= [ARM,IA-64,PPC,MIPS,X86]
Format: { <int> | force | noforce }
@@ -4661,27 +4698,6 @@
Force threading of all interrupt handlers except those
marked explicitly IRQF_NO_THREAD.
- tmem [KNL,XEN]
- Enable the Transcendent memory driver if built-in.
-
- tmem.cleancache=0|1 [KNL, XEN]
- Default is on (1). Disable the usage of the cleancache
- API to send anonymous pages to the hypervisor.
-
- tmem.frontswap=0|1 [KNL, XEN]
- Default is on (1). Disable the usage of the frontswap
- API to send swap pages to the hypervisor. If disabled
- the selfballooning and selfshrinking are force disabled.
-
- tmem.selfballooning=0|1 [KNL, XEN]
- Default is on (1). Disable the driving of swap pages
- to the hypervisor.
-
- tmem.selfshrinking=0|1 [KNL, XEN]
- Default is on (1). Partial swapoff that immediately
- transfers pages from Xen hypervisor back to the
- kernel based on different criteria.
-
topology= [S390]
Format: {off | on}
Specify if the kernel should make use of the cpu
@@ -5053,7 +5069,7 @@
vga= [BOOT,X86-32] Select a particular video mode
See Documentation/x86/boot.rst and
- Documentation/svga.txt.
+ Documentation/admin-guide/svga.rst.
Use vga=ask for menu.
This is actually a boot loader parameter; the value is
passed to the kernel using a special protocol.
@@ -5098,13 +5114,12 @@
targets for exploits that can control RIP.
emulate [default] Vsyscalls turn into traps and are
- emulated reasonably safely.
+ emulated reasonably safely. The vsyscall
+ page is readable.
- native Vsyscalls are native syscall instructions.
- This is a little bit faster than trapping
- and makes a few dynamic recompilers work
- better than they would in emulation mode.
- It also makes exploits much easier to write.
+ xonly Vsyscalls turn into traps and are
+ emulated reasonably safely. The vsyscall
+ page is not readable.
none Vsyscalls don't work at all. This makes
them quite hard to use for exploits but
@@ -5252,6 +5267,8 @@
xen_nopv [X86]
Disables the PV optimizations forcing the HVM guest to
run as generic HVM guest with no PV drivers.
+ This option is obsoleted by the "nopv" option, which
+ has equivalent effect for XEN platform.
xen_scrub_pages= [XEN]
Boolean option to control scrubbing pages before giving them back
@@ -5266,10 +5283,24 @@
improve timer resolution at the expense of processing
more timer interrupts.
+ nopv= [X86,XEN,KVM,HYPER_V,VMWARE]
+ Disables the PV optimizations forcing the guest to run
+ as generic guest with no PV drivers. Currently support
+ XEN HVM, KVM, HYPER_V and VMWARE guest.
+
xirc2ps_cs= [NET,PCMCIA]
Format:
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
+ xive= [PPC]
+ By default on POWER9 and above, the kernel will
+ natively use the XIVE interrupt controller. This option
+ allows the fallback firmware mode to be used:
+
+ off Fallback to firmware control of XIVE interrupt
+ controller on both pseries and powernv
+ platforms. Only useful on POWER9 and above.
+
xhci-hcd.quirks [USB,KNL]
A hex value specifying bitmask with supplemental xhci
host controller quirks. Meaning of each bit can be
diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
new file mode 100644
index 000000000000..4f18456dd3b1
--- /dev/null
+++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst
@@ -0,0 +1,356 @@
+==========================================
+Reducing OS jitter due to per-cpu kthreads
+==========================================
+
+This document lists per-CPU kthreads in the Linux kernel and presents
+options to control their OS jitter. Note that non-per-CPU kthreads are
+not listed here. To reduce OS jitter from non-per-CPU kthreads, bind
+them to a "housekeeping" CPU dedicated to such work.
+
+References
+==========
+
+- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
+
+- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs.
+
+- man taskset: Using the taskset command to bind tasks to sets
+ of CPUs.
+
+- man sched_setaffinity: Using the sched_setaffinity() system
+ call to bind tasks to sets of CPUs.
+
+- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state,
+ writing "0" to offline and "1" to online.
+
+- In order to locate kernel-generated OS jitter on CPU N:
+
+ cd /sys/kernel/debug/tracing
+ echo 1 > max_graph_depth # Increase the "1" for more detail
+ echo function_graph > current_tracer
+ # run workload
+ cat per_cpu/cpuN/trace
+
+kthreads
+========
+
+Name:
+ ehca_comp/%u
+
+Purpose:
+ Periodically process Infiniband-related work.
+
+To reduce its OS jitter, do any of the following:
+
+1. Don't use eHCA Infiniband hardware, instead choosing hardware
+ that does not require per-CPU kthreads. This will prevent these
+ kthreads from being created in the first place. (This will
+ work for most people, as this hardware, though important, is
+ relatively old and is produced in relatively low unit volumes.)
+2. Do all eHCA-Infiniband-related work on other CPUs, including
+ interrupts.
+3. Rework the eHCA driver so that its per-CPU kthreads are
+ provisioned only on selected CPUs.
+
+
+Name:
+ irq/%d-%s
+
+Purpose:
+ Handle threaded interrupts.
+
+To reduce its OS jitter, do the following:
+
+1. Use irq affinity to force the irq threads to execute on
+ some other CPU.
+
+Name:
+ kcmtpd_ctr_%d
+
+Purpose:
+ Handle Bluetooth work.
+
+To reduce its OS jitter, do one of the following:
+
+1. Don't use Bluetooth, in which case these kthreads won't be
+ created in the first place.
+2. Use irq affinity to force Bluetooth-related interrupts to
+ occur on some other CPU and furthermore initiate all
+ Bluetooth activity on some other CPU.
+
+Name:
+ ksoftirqd/%u
+
+Purpose:
+ Execute softirq handlers when threaded or when under heavy load.
+
+To reduce its OS jitter, each softirq vector must be handled
+separately as follows:
+
+TIMER_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1. To the extent possible, keep the CPU out of the kernel when it
+ is non-idle, for example, by avoiding system calls and by forcing
+ both kernel threads and interrupts to execute elsewhere.
+2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force
+ the CPU offline, then bring it back online. This forces
+ recurring timers to migrate elsewhere. If you are concerned
+ with multiple CPUs, force them all offline before bringing the
+ first one back online. Once you have onlined the CPUs in question,
+ do not offline any other CPUs, because doing so could force the
+ timer back onto one of the CPUs in question.
+
+NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
+---------------------------------
+
+Do all of the following:
+
+1. Force networking interrupts onto other CPUs.
+2. Initiate any network I/O on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+ from being initiated from tasks that might run on the CPU to
+ be de-jittered. (It is OK to force this CPU offline and then
+ bring it back online before you start your application.)
+
+BLOCK_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1. Force block-device interrupts onto some other CPU.
+2. Initiate any block I/O on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+ from being initiated from tasks that might run on the CPU to
+ be de-jittered. (It is OK to force this CPU offline and then
+ bring it back online before you start your application.)
+
+IRQ_POLL_SOFTIRQ
+----------------
+
+Do all of the following:
+
+1. Force block-device interrupts onto some other CPU.
+2. Initiate any block I/O and block-I/O polling on other CPUs.
+3. Once your application has started, prevent CPU-hotplug operations
+ from being initiated from tasks that might run on the CPU to
+ be de-jittered. (It is OK to force this CPU offline and then
+ bring it back online before you start your application.)
+
+TASKLET_SOFTIRQ
+---------------
+
+Do one or more of the following:
+
+1. Avoid use of drivers that use tasklets. (Such drivers will contain
+ calls to things like tasklet_schedule().)
+2. Convert all drivers that you must use from tasklets to workqueues.
+3. Force interrupts for drivers using tasklets onto other CPUs,
+ and also do I/O involving these drivers on other CPUs.
+
+SCHED_SOFTIRQ
+-------------
+
+Do all of the following:
+
+1. Avoid sending scheduler IPIs to the CPU to be de-jittered,
+ for example, ensure that at most one runnable kthread is present
+ on that CPU. If a thread that expects to run on the de-jittered
+ CPU awakens, the scheduler will send an IPI that can result in
+ a subsequent SCHED_SOFTIRQ.
+2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
+ is marked as an adaptive-ticks CPU using the "nohz_full="
+ boot parameter. This reduces the number of scheduler-clock
+ interrupts that the de-jittered CPU receives, minimizing its
+ chances of being selected to do the load balancing work that
+ runs in SCHED_SOFTIRQ context.
+3. To the extent possible, keep the CPU out of the kernel when it
+ is non-idle, for example, by avoiding system calls and by
+ forcing both kernel threads and interrupts to execute elsewhere.
+ This further reduces the number of scheduler-clock interrupts
+ received by the de-jittered CPU.
+
+HRTIMER_SOFTIRQ
+---------------
+
+Do all of the following:
+
+1. To the extent possible, keep the CPU out of the kernel when it
+ is non-idle. For example, avoid system calls and force both
+ kernel threads and interrupts to execute elsewhere.
+2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the
+ CPU offline, then bring it back online. This forces recurring
+ timers to migrate elsewhere. If you are concerned with multiple
+ CPUs, force them all offline before bringing the first one
+ back online. Once you have onlined the CPUs in question, do not
+ offline any other CPUs, because doing so could force the timer
+ back onto one of the CPUs in question.
+
+RCU_SOFTIRQ
+-----------
+
+Do at least one of the following:
+
+1. Offload callbacks and keep the CPU in either dyntick-idle or
+ adaptive-ticks state by doing all of the following:
+
+ a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
+ de-jittered is marked as an adaptive-ticks CPU using the
+ "nohz_full=" boot parameter. Bind the rcuo kthreads to
+ housekeeping CPUs, which can tolerate OS jitter.
+ b. To the extent possible, keep the CPU out of the kernel
+ when it is non-idle, for example, by avoiding system
+ calls and by forcing both kernel threads and interrupts
+ to execute elsewhere.
+
+2. Enable RCU to do its processing remotely via dyntick-idle by
+ doing all of the following:
+
+ a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
+ b. Ensure that the CPU goes idle frequently, allowing other
+ CPUs to detect that it has passed through an RCU quiescent
+ state. If the kernel is built with CONFIG_NO_HZ_FULL=y,
+ userspace execution also allows other CPUs to detect that
+ the CPU in question has passed through a quiescent state.
+ c. To the extent possible, keep the CPU out of the kernel
+ when it is non-idle, for example, by avoiding system
+ calls and by forcing both kernel threads and interrupts
+ to execute elsewhere.
+
+Name:
+ kworker/%u:%d%s (cpu, id, priority)
+
+Purpose:
+ Execute workqueue requests
+
+To reduce its OS jitter, do any of the following:
+
+1. Run your workload at a real-time priority, which will allow
+ preempting the kworker daemons.
+2. A given workqueue can be made visible in the sysfs filesystem
+ by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
+ Such a workqueue can be confined to a given subset of the
+ CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
+ files. The set of WQ_SYSFS workqueues can be displayed using
+ "ls sys/devices/virtual/workqueue". That said, the workqueues
+ maintainer would like to caution people against indiscriminately
+ sprinkling WQ_SYSFS across all the workqueues. The reason for
+ caution is that it is easy to add WQ_SYSFS, but because sysfs is
+ part of the formal user/kernel API, it can be nearly impossible
+ to remove it, even if its addition was a mistake.
+3. Do any of the following needed to avoid jitter that your
+ application cannot tolerate:
+
+ a. Build your kernel with CONFIG_SLUB=y rather than
+ CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
+ use of each CPU's workqueues to run its cache_reap()
+ function.
+ b. Avoid using oprofile, thus avoiding OS jitter from
+ wq_sync_buffer().
+ c. Limit your CPU frequency so that a CPU-frequency
+ governor is not required, possibly enlisting the aid of
+ special heatsinks or other cooling technologies. If done
+ correctly, and if you CPU architecture permits, you should
+ be able to build your kernel with CONFIG_CPU_FREQ=n to
+ avoid the CPU-frequency governor periodically running
+ on each CPU, including cs_dbs_timer() and od_dbs_timer().
+
+ WARNING: Please check your CPU specifications to
+ make sure that this is safe on your particular system.
+ d. As of v3.18, Christoph Lameter's on-demand vmstat workers
+ commit prevents OS jitter due to vmstat_update() on
+ CONFIG_SMP=y systems. Before v3.18, is not possible
+ to entirely get rid of the OS jitter, but you can
+ decrease its frequency by writing a large value to
+ /proc/sys/vm/stat_interval. The default value is HZ,
+ for an interval of one second. Of course, larger values
+ will make your virtual-memory statistics update more
+ slowly. Of course, you can also run your workload at
+ a real-time priority, thus preempting vmstat_update(),
+ but if your workload is CPU-bound, this is a bad idea.
+ However, there is an RFC patch from Christoph Lameter
+ (based on an earlier one from Gilad Ben-Yossef) that
+ reduces or even eliminates vmstat overhead for some
+ workloads at https://lkml.org/lkml/2013/9/4/379.
+ e. Boot with "elevator=noop" to avoid workqueue use by
+ the block layer.
+ f. If running on high-end powerpc servers, build with
+ CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS
+ daemon from running on each CPU every second or so.
+ (This will require editing Kconfig files and will defeat
+ this platform's RAS functionality.) This avoids jitter
+ due to the rtas_event_scan() function.
+ WARNING: Please check your CPU specifications to
+ make sure that this is safe on your particular system.
+ g. If running on Cell Processor, build your kernel with
+ CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
+ spu_gov_work().
+ WARNING: Please check your CPU specifications to
+ make sure that this is safe on your particular system.
+ h. If running on PowerMAC, build your kernel with
+ CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
+ avoiding OS jitter from rackmeter_do_timer().
+
+Name:
+ rcuc/%u
+
+Purpose:
+ Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
+
+To reduce its OS jitter, do at least one of the following:
+
+1. Build the kernel with CONFIG_PREEMPT=n. This prevents these
+ kthreads from being created in the first place, and also obviates
+ the need for RCU priority boosting. This approach is feasible
+ for workloads that do not require high degrees of responsiveness.
+2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these
+ kthreads from being created in the first place. This approach
+ is feasible only if your workload never requires RCU priority
+ boosting, for example, if you ensure frequent idle time on all
+ CPUs that might execute within the kernel.
+3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
+ boot parameter offloading RCU callbacks from all CPUs susceptible
+ to OS jitter. This approach prevents the rcuc/%u kthreads from
+ having any work to do, so that they are never awakened.
+4. Ensure that the CPU never enters the kernel, and, in particular,
+ avoid initiating any CPU hotplug operations on this CPU. This is
+ another way of preventing any callbacks from being queued on the
+ CPU, again preventing the rcuc/%u kthreads from having any work
+ to do.
+
+Name:
+ rcuop/%d and rcuos/%d
+
+Purpose:
+ Offload RCU callbacks from the corresponding CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1. Use affinity, cgroups, or other mechanism to force these kthreads
+ to execute on some other CPU.
+2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
+ kthreads from being created in the first place. However, please
+ note that this will not eliminate OS jitter, but will instead
+ shift it to RCU_SOFTIRQ.
+
+Name:
+ watchdog/%u
+
+Purpose:
+ Detect software lockups on each CPU.
+
+To reduce its OS jitter, do at least one of the following:
+
+1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
+ kthreads from being created in the first place.
+2. Boot with "nosoftlockup=0", which will also prevent these kthreads
+ from being created. Other related watchdog and softlockup boot
+ parameters may be found in Documentation/admin-guide/kernel-parameters.rst
+ and Documentation/watchdog/watchdog-parameters.rst.
+3. Echo a zero to /proc/sys/kernel/watchdog to disable the
+ watchdog timer.
+4. Echo a large number of /proc/sys/kernel/watchdog_thresh in
+ order to reduce the frequency of OS jitter due to the watchdog
+ timer down to a level that is acceptable for your workload.
diff --git a/Documentation/admin-guide/laptops/asus-laptop.rst b/Documentation/admin-guide/laptops/asus-laptop.rst
new file mode 100644
index 000000000000..95176321a25a
--- /dev/null
+++ b/Documentation/admin-guide/laptops/asus-laptop.rst
@@ -0,0 +1,271 @@
+==================
+Asus Laptop Extras
+==================
+
+Version 0.1
+
+August 6, 2009
+
+Corentin Chary <corentincj@iksaif.net>
+http://acpi4asus.sf.net/
+
+ This driver provides support for extra features of ACPI-compatible ASUS laptops.
+ It may also support some MEDION, JVC or VICTOR laptops (such as MEDION 9675 or
+ VICTOR XP7210 for example). It makes all the extra buttons generate input
+ events (like keyboards).
+
+ On some models adds support for changing the display brightness and output,
+ switching the LCD backlight on and off, and most importantly, allows you to
+ blink those fancy LEDs intended for reporting mail and wireless status.
+
+This driver supersedes the old asus_acpi driver.
+
+Requirements
+------------
+
+ Kernel 2.6.X sources, configured for your computer, with ACPI support.
+ You also need CONFIG_INPUT and CONFIG_ACPI.
+
+Status
+------
+
+ The features currently supported are the following (see below for
+ detailed description):
+
+ - Fn key combinations
+ - Bluetooth enable and disable
+ - Wlan enable and disable
+ - GPS enable and disable
+ - Video output switching
+ - Ambient Light Sensor on and off
+ - LED control
+ - LED Display control
+ - LCD brightness control
+ - LCD on and off
+
+ A compatibility table by model and feature is maintained on the web
+ site, http://acpi4asus.sf.net/.
+
+Usage
+-----
+
+ Try "modprobe asus-laptop". Check your dmesg (simply type dmesg). You should
+ see some lines like this :
+
+ Asus Laptop Extras version 0.42
+ - L2D model detected.
+
+ If it is not the output you have on your laptop, send it (and the laptop's
+ DSDT) to me.
+
+ That's all, now, all the events generated by the hotkeys of your laptop
+ should be reported via netlink events. You can check with
+ "acpi_genl monitor" (part of the acpica project).
+
+ Hotkeys are also reported as input keys (like keyboards) you can check
+ which key are supported using "xev" under X11.
+
+ You can get information on the version of your DSDT table by reading the
+ /sys/devices/platform/asus-laptop/infos entry. If you have a question or a
+ bug report to do, please include the output of this entry.
+
+LEDs
+----
+
+ You can modify LEDs be echoing values to `/sys/class/leds/asus/*/brightness`::
+
+ echo 1 > /sys/class/leds/asus::mail/brightness
+
+ will switch the mail LED on.
+
+ You can also know if they are on/off by reading their content and use
+ kernel triggers like disk-activity or heartbeat.
+
+Backlight
+---------
+
+ You can control lcd backlight power and brightness with
+ /sys/class/backlight/asus-laptop/. Brightness Values are between 0 and 15.
+
+Wireless devices
+----------------
+
+ You can turn the internal Bluetooth adapter on/off with the bluetooth entry
+ (only on models with Bluetooth). This usually controls the associated LED.
+ Same for Wlan adapter.
+
+Display switching
+-----------------
+
+ Note: the display switching code is currently considered EXPERIMENTAL.
+
+ Switching works for the following models:
+
+ - L3800C
+ - A2500H
+ - L5800C
+ - M5200N
+ - W1000N (albeit with some glitches)
+ - M6700R
+ - A6JC
+ - F3J
+
+ Switching doesn't work for the following:
+
+ - M3700N
+ - L2X00D (locks the laptop under certain conditions)
+
+ To switch the displays, echo values from 0 to 15 to
+ /sys/devices/platform/asus-laptop/display. The significance of those values
+ is as follows:
+
+ +-------+-----+-----+-----+-----+-----+
+ | Bin | Val | DVI | TV | CRT | LCD |
+ +-------+-----+-----+-----+-----+-----+
+ | 0000 | 0 | | | | |
+ +-------+-----+-----+-----+-----+-----+
+ | 0001 | 1 | | | | X |
+ +-------+-----+-----+-----+-----+-----+
+ | 0010 | 2 | | | X | |
+ +-------+-----+-----+-----+-----+-----+
+ | 0011 | 3 | | | X | X |
+ +-------+-----+-----+-----+-----+-----+
+ | 0100 | 4 | | X | | |
+ +-------+-----+-----+-----+-----+-----+
+ | 0101 | 5 | | X | | X |
+ +-------+-----+-----+-----+-----+-----+
+ | 0110 | 6 | | X | X | |
+ +-------+-----+-----+-----+-----+-----+
+ | 0111 | 7 | | X | X | X |
+ +-------+-----+-----+-----+-----+-----+
+ | 1000 | 8 | X | | | |
+ +-------+-----+-----+-----+-----+-----+
+ | 1001 | 9 | X | | | X |
+ +-------+-----+-----+-----+-----+-----+
+ | 1010 | 10 | X | | X | |
+ +-------+-----+-----+-----+-----+-----+
+ | 1011 | 11 | X | | X | X |
+ +-------+-----+-----+-----+-----+-----+
+ | 1100 | 12 | X | X | | |
+ +-------+-----+-----+-----+-----+-----+
+ | 1101 | 13 | X | X | | X |
+ +-------+-----+-----+-----+-----+-----+
+ | 1110 | 14 | X | X | X | |
+ +-------+-----+-----+-----+-----+-----+
+ | 1111 | 15 | X | X | X | X |
+ +-------+-----+-----+-----+-----+-----+
+
+ In most cases, the appropriate displays must be plugged in for the above
+ combinations to work. TV-Out may need to be initialized at boot time.
+
+ Debugging:
+
+ 1) Check whether the Fn+F8 key:
+
+ a) does not lock the laptop (try a boot with noapic / nolapic if it does)
+ b) generates events (0x6n, where n is the value corresponding to the
+ configuration above)
+ c) actually works
+
+ Record the disp value at every configuration.
+ 2) Echo values from 0 to 15 to /sys/devices/platform/asus-laptop/display.
+ Record its value, note any change. If nothing changes, try a broader range,
+ up to 65535.
+ 3) Send ANY output (both positive and negative reports are needed, unless your
+ machine is already listed above) to the acpi4asus-user mailing list.
+
+ Note: on some machines (e.g. L3C), after the module has been loaded, only 0x6n
+ events are generated and no actual switching occurs. In such a case, a line
+ like::
+
+ echo $((10#$arg-60)) > /sys/devices/platform/asus-laptop/display
+
+ will usually do the trick ($arg is the 0000006n-like event passed to acpid).
+
+ Note: there is currently no reliable way to read display status on xxN
+ (Centrino) models.
+
+LED display
+-----------
+
+ Some models like the W1N have a LED display that can be used to display
+ several items of information.
+
+ LED display works for the following models:
+
+ - W1000N
+ - W1J
+
+ To control the LED display, use the following::
+
+ echo 0x0T000DDD > /sys/devices/platform/asus-laptop/
+
+ where T control the 3 letters display, and DDD the 3 digits display,
+ according to the tables below::
+
+ DDD (digits)
+ 000 to 999 = display digits
+ AAA = ---
+ BBB to FFF = turn-off
+
+ T (type)
+ 0 = off
+ 1 = dvd
+ 2 = vcd
+ 3 = mp3
+ 4 = cd
+ 5 = tv
+ 6 = cpu
+ 7 = vol
+
+ For example "echo 0x01000001 >/sys/devices/platform/asus-laptop/ledd"
+ would display "DVD001".
+
+Driver options
+--------------
+
+ Options can be passed to the asus-laptop driver using the standard
+ module argument syntax (<param>=<value> when passing the option to the
+ module or asus-laptop.<param>=<value> on the kernel boot line when
+ asus-laptop is statically linked into the kernel).
+
+ wapf: WAPF defines the behavior of the Fn+Fx wlan key
+ The significance of values is yet to be found, but
+ most of the time:
+
+ - 0x0 should do nothing
+ - 0x1 should allow to control the device with Fn+Fx key.
+ - 0x4 should send an ACPI event (0x88) while pressing the Fn+Fx key
+ - 0x5 like 0x1 or 0x4
+
+ The default value is 0x1.
+
+Unsupported models
+------------------
+
+ These models will never be supported by this module, as they use a completely
+ different mechanism to handle LEDs and extra stuff (meaning we have no clue
+ how it works):
+
+ - ASUS A1300 (A1B), A1370D
+ - ASUS L7300G
+ - ASUS L8400
+
+Patches, Errors, Questions
+--------------------------
+
+ I appreciate any success or failure
+ reports, especially if they add to or correct the compatibility table.
+ Please include the following information in your report:
+
+ - Asus model name
+ - a copy of your ACPI tables, using the "acpidump" utility
+ - a copy of /sys/devices/platform/asus-laptop/infos
+ - which driver features work and which don't
+ - the observed behavior of non-working features
+
+ Any other comments or patches are also more than welcome.
+
+ acpi4asus-user@lists.sourceforge.net
+
+ http://sourceforge.net/projects/acpi4asus
diff --git a/Documentation/admin-guide/laptops/disk-shock-protection.rst b/Documentation/admin-guide/laptops/disk-shock-protection.rst
new file mode 100644
index 000000000000..e97c5f78d8c3
--- /dev/null
+++ b/Documentation/admin-guide/laptops/disk-shock-protection.rst
@@ -0,0 +1,151 @@
+==========================
+Hard disk shock protection
+==========================
+
+Author: Elias Oltmanns <eo@nebensachen.de>
+
+Last modified: 2008-10-03
+
+
+.. 0. Contents
+
+ 1. Intro
+ 2. The interface
+ 3. References
+ 4. CREDITS
+
+
+1. Intro
+--------
+
+ATA/ATAPI-7 specifies the IDLE IMMEDIATE command with unload feature.
+Issuing this command should cause the drive to switch to idle mode and
+unload disk heads. This feature is being used in modern laptops in
+conjunction with accelerometers and appropriate software to implement
+a shock protection facility. The idea is to stop all I/O operations on
+the internal hard drive and park its heads on the ramp when critical
+situations are anticipated. The desire to have such a feature
+available on GNU/Linux systems has been the original motivation to
+implement a generic disk head parking interface in the Linux kernel.
+Please note, however, that other components have to be set up on your
+system in order to get disk shock protection working (see
+section 3. References below for pointers to more information about
+that).
+
+
+2. The interface
+----------------
+
+For each ATA device, the kernel exports the file
+`block/*/device/unload_heads` in sysfs (here assumed to be mounted under
+/sys). Access to `/sys/block/*/device/unload_heads` is denied with
+-EOPNOTSUPP if the device does not support the unload feature.
+Otherwise, writing an integer value to this file will take the heads
+of the respective drive off the platter and block all I/O operations
+for the specified number of milliseconds. When the timeout expires and
+no further disk head park request has been issued in the meantime,
+normal operation will be resumed. The maximal value accepted for a
+timeout is 30000 milliseconds. Exceeding this limit will return
+-EOVERFLOW, but heads will be parked anyway and the timeout will be
+set to 30 seconds. However, you can always change a timeout to any
+value between 0 and 30000 by issuing a subsequent head park request
+before the timeout of the previous one has expired. In particular, the
+total timeout can exceed 30 seconds and, more importantly, you can
+cancel a previously set timeout and resume normal operation
+immediately by specifying a timeout of 0. Values below -2 are rejected
+with -EINVAL (see below for the special meaning of -1 and -2). If the
+timeout specified for a recent head park request has not yet expired,
+reading from `/sys/block/*/device/unload_heads` will report the number
+of milliseconds remaining until normal operation will be resumed;
+otherwise, reading the unload_heads attribute will return 0.
+
+For example, do the following in order to park the heads of drive
+/dev/sda and stop all I/O operations for five seconds::
+
+ # echo 5000 > /sys/block/sda/device/unload_heads
+
+A simple::
+
+ # cat /sys/block/sda/device/unload_heads
+
+will show you how many milliseconds are left before normal operation
+will be resumed.
+
+A word of caution: The fact that the interface operates on a basis of
+milliseconds may raise expectations that cannot be satisfied in
+reality. In fact, the ATA specs clearly state that the time for an
+unload operation to complete is vendor specific. The hint in ATA-7
+that this will typically be within 500 milliseconds apparently has
+been dropped in ATA-8.
+
+There is a technical detail of this implementation that may cause some
+confusion and should be discussed here. When a head park request has
+been issued to a device successfully, all I/O operations on the
+controller port this device is attached to will be deferred. That is
+to say, any other device that may be connected to the same port will
+be affected too. The only exception is that a subsequent head unload
+request to that other device will be executed immediately. Further
+operations on that port will be deferred until the timeout specified
+for either device on the port has expired. As far as PATA (old style
+IDE) configurations are concerned, there can only be two devices
+attached to any single port. In SATA world we have port multipliers
+which means that a user-issued head parking request to one device may
+actually result in stopping I/O to a whole bunch of devices. However,
+since this feature is supposed to be used on laptops and does not seem
+to be very useful in any other environment, there will be mostly one
+device per port. Even if the CD/DVD writer happens to be connected to
+the same port as the hard drive, it generally *should* recover just
+fine from the occasional buffer under-run incurred by a head park
+request to the HD. Actually, when you are using an ide driver rather
+than its libata counterpart (i.e. your disk is called /dev/hda
+instead of /dev/sda), then parking the heads of one drive (drive X)
+will generally not affect the mode of operation of another drive
+(drive Y) on the same port as described above. It is only when a port
+reset is required to recover from an exception on drive Y that further
+I/O operations on that drive (and the reset itself) will be delayed
+until drive X is no longer in the parked state.
+
+Finally, there are some hard drives that only comply with an earlier
+version of the ATA standard than ATA-7, but do support the unload
+feature nonetheless. Unfortunately, there is no safe way Linux can
+detect these devices, so you won't be able to write to the
+unload_heads attribute. If you know that your device really does
+support the unload feature (for instance, because the vendor of your
+laptop or the hard drive itself told you so), then you can tell the
+kernel to enable the usage of this feature for that drive by writing
+the special value -1 to the unload_heads attribute::
+
+ # echo -1 > /sys/block/sda/device/unload_heads
+
+will enable the feature for /dev/sda, and giving -2 instead of -1 will
+disable it again.
+
+
+3. References
+-------------
+
+There are several laptops from different vendors featuring shock
+protection capabilities. As manufacturers have refused to support open
+source development of the required software components so far, Linux
+support for shock protection varies considerably between different
+hardware implementations. Ideally, this section should contain a list
+of pointers at different projects aiming at an implementation of shock
+protection on different systems. Unfortunately, I only know of a
+single project which, although still considered experimental, is fit
+for use. Please feel free to add projects that have been the victims
+of my ignorance.
+
+- http://www.thinkwiki.org/wiki/HDAPS
+
+ See this page for information about Linux support of the hard disk
+ active protection system as implemented in IBM/Lenovo Thinkpads.
+
+
+4. CREDITS
+----------
+
+This implementation of disk head parking has been inspired by a patch
+originally published by Jon Escombe <lists@dresco.co.uk>. My efforts
+to develop an implementation of this feature that is fit to be merged
+into mainline have been aided by various kernel developers, in
+particular by Tejun Heo and Bartlomiej Zolnierkiewicz.
diff --git a/Documentation/admin-guide/laptops/index.rst b/Documentation/admin-guide/laptops/index.rst
new file mode 100644
index 000000000000..cd9a1c2695fd
--- /dev/null
+++ b/Documentation/admin-guide/laptops/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+Laptop Drivers
+==============
+
+.. toctree::
+ :maxdepth: 1
+
+ asus-laptop
+ disk-shock-protection
+ laptop-mode
+ lg-laptop
+ sony-laptop
+ sonypi
+ thinkpad-acpi
+ toshiba_haps
diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst
new file mode 100644
index 000000000000..c984c4262f2e
--- /dev/null
+++ b/Documentation/admin-guide/laptops/laptop-mode.rst
@@ -0,0 +1,781 @@
+===============================================
+How to conserve battery power using laptop-mode
+===============================================
+
+Document Author: Bart Samwel (bart@samwel.tk)
+
+Date created: January 2, 2004
+
+Last modified: December 06, 2004
+
+Introduction
+------------
+
+Laptop mode is used to minimize the time that the hard disk needs to be spun up,
+to conserve battery power on laptops. It has been reported to cause significant
+power savings.
+
+.. Contents
+
+ * Introduction
+ * Installation
+ * Caveats
+ * The Details
+ * Tips & Tricks
+ * Control script
+ * ACPI integration
+ * Monitoring tool
+
+
+Installation
+------------
+
+To use laptop mode, you don't need to set any kernel configuration options
+or anything. Simply install all the files included in this document, and
+laptop mode will automatically be started when you're on battery. For
+your convenience, a tarball containing an installer can be downloaded at:
+
+ http://www.samwel.tk/laptop_mode/laptop_mode/
+
+To configure laptop mode, you need to edit the configuration file, which is
+located in /etc/default/laptop-mode on Debian-based systems, or in
+/etc/sysconfig/laptop-mode on other systems.
+
+Unfortunately, automatic enabling of laptop mode does not work for
+laptops that don't have ACPI. On those laptops, you need to start laptop
+mode manually. To start laptop mode, run "laptop_mode start", and to
+stop it, run "laptop_mode stop". (Note: The laptop mode tools package now
+has experimental support for APM, you might want to try that first.)
+
+
+Caveats
+-------
+
+* The downside of laptop mode is that you have a chance of losing up to 10
+ minutes of work. If you cannot afford this, don't use it! The supplied ACPI
+ scripts automatically turn off laptop mode when the battery almost runs out,
+ so that you won't lose any data at the end of your battery life.
+
+* Most desktop hard drives have a very limited lifetime measured in spindown
+ cycles, typically about 50.000 times (it's usually listed on the spec sheet).
+ Check your drive's rating, and don't wear down your drive's lifetime if you
+ don't need to.
+
+* If you mount some of your ext3/reiserfs filesystems with the -n option, then
+ the control script will not be able to remount them correctly. You must set
+ DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
+ wrong options -- or it will fail because it cannot write to /etc/mtab.
+
+* If you have your filesystems listed as type "auto" in fstab, like I did, then
+ the control script will not recognize them as filesystems that need remounting.
+ You must list the filesystems with their true type instead.
+
+* It has been reported that some versions of the mutt mail client use file access
+ times to determine whether a folder contains new mail. If you use mutt and
+ experience this, you must disable the noatime remounting by setting the option
+ DO_REMOUNT_NOATIME to 0 in the configuration file.
+
+
+The Details
+-----------
+
+Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is
+present for all kernels that have the laptop mode patch, regardless of any
+configuration options. When the knob is set, any physical disk I/O (that might
+have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The
+result of this is that after a disk has spun down, it will not be spun up
+anymore to write dirty blocks, because those blocks had already been written
+immediately after the most recent read operation. The value of the laptop_mode
+knob determines the time between the occurrence of disk I/O and when the flush
+is triggered. A sensible value for the knob is 5 seconds. Setting the knob to
+0 disables laptop mode.
+
+To increase the effectiveness of the laptop_mode strategy, the laptop_mode
+control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
+/proc/sys/vm to about 10 minutes (by default), which means that pages that are
+dirtied are not forced to be written to disk as often. The control script also
+changes the dirty background ratio, so that background writeback of dirty pages
+is not done anymore. Combined with a higher commit value (also 10 minutes) for
+ext3 or ReiserFS filesystems (also done automatically by the control script),
+this results in concentration of disk activity in a small time interval which
+occurs only once every 10 minutes, or whenever the disk is forced to spin up by
+a cache miss. The disk can then be spun down in the periods of inactivity.
+
+If you want to find out which process caused the disk to spin up, you can
+gather information by setting the flag /proc/sys/vm/block_dump. When this flag
+is set, Linux reports all disk read and write operations that take place, and
+all block dirtyings done to files. This makes it possible to debug why a disk
+needs to spin up, and to increase battery life even more. The output of
+block_dump is written to the kernel output, and it can be retrieved using
+"dmesg". When you use block_dump and your kernel logging level also includes
+kernel debugging messages, you probably want to turn off klogd, otherwise
+the output of block_dump will be logged, causing disk activity that is not
+normally there.
+
+
+Configuration
+-------------
+
+The laptop mode configuration file is located in /etc/default/laptop-mode on
+Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It
+contains the following options:
+
+MAX_AGE:
+
+Maximum time, in seconds, of hard drive spindown time that you are
+comfortable with. Worst case, it's possible that you could lose this
+amount of work if your battery fails while you're in laptop mode.
+
+MINIMUM_BATTERY_MINUTES:
+
+Automatically disable laptop mode if the remaining number of minutes of
+battery power is less than this value. Default is 10 minutes.
+
+AC_HD/BATT_HD:
+
+The idle timeout that should be set on your hard drive when laptop mode
+is active (BATT_HD) and when it is not active (AC_HD). The defaults are
+20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The
+possible values are those listed in the manual page for "hdparm" for the
+"-S" option.
+
+HD:
+
+The devices for which the spindown timeout should be adjusted by laptop mode.
+Default is /dev/hda. If you specify multiple devices, separate them by a space.
+
+READAHEAD:
+
+Disk readahead, in 512-byte sectors, while laptop mode is active. A large
+readahead can prevent disk accesses for things like executable pages (which are
+loaded on demand while the application executes) and sequentially accessed data
+(MP3s).
+
+DO_REMOUNTS:
+
+The control script automatically remounts any mounted journaled filesystems
+with appropriate commit interval options. When this option is set to 0, this
+feature is disabled.
+
+DO_REMOUNT_NOATIME:
+
+When remounting, should the filesystems be remounted with the noatime option?
+Normally, this is set to "1" (enabled), but there may be programs that require
+access time recording.
+
+DIRTY_RATIO:
+
+The percentage of memory that is allowed to contain "dirty" or unsaved data
+before a writeback is forced, while laptop mode is active. Corresponds to
+the /proc/sys/vm/dirty_ratio sysctl.
+
+DIRTY_BACKGROUND_RATIO:
+
+The percentage of memory that is allowed to contain "dirty" or unsaved data
+after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set
+this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio
+sysctl.
+
+Note that the behaviour of dirty_background_ratio is quite different
+when laptop mode is active and when it isn't. When laptop mode is inactive,
+dirty_background_ratio is the threshold percentage at which background writeouts
+start taking place. When laptop mode is active, however, background writeouts
+are disabled, and the dirty_background_ratio only determines how much writeback
+is done when dirty_ratio is reached.
+
+DO_CPU:
+
+Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup.
+See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.)
+
+CPU_MAXFREQ:
+
+When on battery, what is the maximum CPU speed that the system should use? Legal
+values are "slowest" for the slowest speed that your CPU is able to operate at,
+or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies.
+
+
+Tips & Tricks
+-------------
+
+* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
+ of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1).
+
+* You can spin down the disk while playing MP3, by setting disk readahead
+ to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at
+ once, and will then spin down while the MP3 is playing. (Thanks to Bartek
+ Kania.)
+
+* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
+ of colours that my display uses it consumes less battery power. I've seen
+ this on powerbooks too. I hope that this is a piece of information that
+ might be useful to the Laptop Mode patch or its users."
+
+* In syslog.conf, you can prefix entries with a dash `-` to omit syncing the
+ file after every logging. When you're using laptop-mode and your disk doesn't
+ spin down, this is a likely culprit.
+
+* Richard Atterer observed that laptop mode does not work well with noflushd
+ (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
+ from doing its thing.
+
+* If you're worried about your data, you might want to consider using a USB
+ memory stick or something like that as a "working area". (Be aware though
+ that flash memory can only handle a limited number of writes, and overuse
+ may wear out your memory stick pretty quickly. Do _not_ use journalling
+ filesystems on flash memory sticks.)
+
+
+Configuration file for control and ACPI battery scripts
+-------------------------------------------------------
+
+This allows the tunables to be changed for the scripts via an external
+configuration file
+
+It should be installed as /etc/default/laptop-mode on Debian, and as
+/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes.
+
+Config file::
+
+ # Maximum time, in seconds, of hard drive spindown time that you are
+ # comfortable with. Worst case, it's possible that you could lose this
+ # amount of work if your battery fails you while in laptop mode.
+ #MAX_AGE=600
+
+ # Automatically disable laptop mode when the number of minutes of battery
+ # that you have left goes below this threshold.
+ MINIMUM_BATTERY_MINUTES=10
+
+ # Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG
+ # by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk
+ # will read a complete MP3 at once, and will then spin down while the MP3/OGG is
+ # playing.
+ #READAHEAD=4096
+
+ # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+ #DO_REMOUNTS=1
+
+ # And shall we add the "noatime" option to that as well? (1=yes)
+ #DO_REMOUNT_NOATIME=1
+
+ # Dirty synchronous ratio. At this percentage of dirty pages the process
+ # which
+ # calls write() does its own writeback
+ #DIRTY_RATIO=40
+
+ #
+ # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been
+ # exceeded, the kernel will wake flusher threads which will then reduce the
+ # amount of dirty memory to dirty_background_ratio. Set this nice and low,
+ # so once some writeout has commenced, we do a lot of it.
+ #
+ #DIRTY_BACKGROUND_RATIO=5
+
+ # kernel default dirty buffer age
+ #DEF_AGE=30
+ #DEF_UPDATE=5
+ #DEF_DIRTY_BACKGROUND_RATIO=10
+ #DEF_DIRTY_RATIO=40
+ #DEF_XFS_AGE_BUFFER=15
+ #DEF_XFS_SYNC_INTERVAL=30
+ #DEF_XFS_BUFD_INTERVAL=1
+
+ # This must be adjusted manually to the value of HZ in the running kernel
+ # on 2.4, until the XFS people change their 2.4 external interfaces to work in
+ # centisecs. This can be automated, but it's a work in progress that still
+ # needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for
+ # external interfaces, and that is currently always set to 100. So you don't
+ # need to change this on 2.6.
+ #XFS_HZ=100
+
+ # Should the maximum CPU frequency be adjusted down while on battery?
+ # Requires CPUFreq to be setup.
+ # See Documentation/admin-guide/pm/cpufreq.rst for more info
+ #DO_CPU=0
+
+ # When on battery what is the maximum CPU speed that the system should
+ # use? Legal values are "slowest" for the slowest speed that your
+ # CPU is able to operate at, or a value listed in:
+ # /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
+ # Only applicable if DO_CPU=1.
+ #CPU_MAXFREQ=slowest
+
+ # Idle timeout for your hard drive (man hdparm for valid values, -S option)
+ # Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4).
+ #AC_HD=244
+ #BATT_HD=4
+
+ # The drives for which to adjust the idle timeout. Separate them by a space,
+ # e.g. HD="/dev/hda /dev/hdb".
+ #HD="/dev/hda"
+
+ # Set the spindown timeout on a hard drive?
+ #DO_HD=1
+
+
+Control script
+--------------
+
+Please note that this control script works for the Linux 2.4 and 2.6 series (thanks
+to Kiko Piris).
+
+Control script::
+
+ #!/bin/bash
+
+ # start or stop laptop_mode, best run by a power management daemon when
+ # ac gets connected/disconnected from a laptop
+ #
+ # install as /sbin/laptop_mode
+ #
+ # Contributors to this script: Kiko Piris
+ # Bart Samwel
+ # Micha Feigin
+ # Andrew Morton
+ # Herve Eychenne
+ # Dax Kelson
+ #
+ # Original Linux 2.4 version by: Jens Axboe
+
+ #############################################################################
+
+ # Source config
+ if [ -f /etc/default/laptop-mode ] ; then
+ # Debian
+ . /etc/default/laptop-mode
+ elif [ -f /etc/sysconfig/laptop-mode ] ; then
+ # Others
+ . /etc/sysconfig/laptop-mode
+ fi
+
+ # Don't raise an error if the config file is incomplete
+ # set defaults instead:
+
+ # Maximum time, in seconds, of hard drive spindown time that you are
+ # comfortable with. Worst case, it's possible that you could lose this
+ # amount of work if your battery fails you while in laptop mode.
+ MAX_AGE=${MAX_AGE:-'600'}
+
+ # Read-ahead, in kilobytes
+ READAHEAD=${READAHEAD:-'4096'}
+
+ # Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+ DO_REMOUNTS=${DO_REMOUNTS:-'1'}
+
+ # And shall we add the "noatime" option to that as well? (1=yes)
+ DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'}
+
+ # Shall we adjust the idle timeout on a hard drive?
+ DO_HD=${DO_HD:-'1'}
+
+ # Adjust idle timeout on which hard drive?
+ HD="${HD:-'/dev/hda'}"
+
+ # spindown time for HD (hdparm -S values)
+ AC_HD=${AC_HD:-'244'}
+ BATT_HD=${BATT_HD:-'4'}
+
+ # Dirty synchronous ratio. At this percentage of dirty pages the process which
+ # calls write() does its own writeback
+ DIRTY_RATIO=${DIRTY_RATIO:-'40'}
+
+ # cpu frequency scaling
+ # See Documentation/admin-guide/pm/cpufreq.rst for more info
+ DO_CPU=${CPU_MANAGE:-'0'}
+ CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'}
+
+ #
+ # Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been
+ # exceeded, the kernel will wake flusher threads which will then reduce the
+ # amount of dirty memory to dirty_background_ratio. Set this nice and low,
+ # so once some writeout has commenced, we do a lot of it.
+ #
+ DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'}
+
+ # kernel default dirty buffer age
+ DEF_AGE=${DEF_AGE:-'30'}
+ DEF_UPDATE=${DEF_UPDATE:-'5'}
+ DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'}
+ DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'}
+ DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'}
+ DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'}
+ DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'}
+
+ # This must be adjusted manually to the value of HZ in the running kernel
+ # on 2.4, until the XFS people change their 2.4 external interfaces to work in
+ # centisecs. This can be automated, but it's a work in progress that still needs
+ # some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external
+ # interfaces, and that is currently always set to 100. So you don't need to
+ # change this on 2.6.
+ XFS_HZ=${XFS_HZ:-'100'}
+
+ #############################################################################
+
+ KLEVEL="$(uname -r |
+ {
+ IFS='.' read a b c
+ echo $a.$b
+ }
+ )"
+ case "$KLEVEL" in
+ "2.4"|"2.6")
+ ;;
+ *)
+ echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2
+ exit 1
+ ;;
+ esac
+
+ if [ ! -e /proc/sys/vm/laptop_mode ] ; then
+ echo "Kernel is not patched with laptop_mode patch." >&2
+ exit 1
+ fi
+
+ if [ ! -w /proc/sys/vm/laptop_mode ] ; then
+ echo "You do not have enough privileges to enable laptop_mode." >&2
+ exit 1
+ fi
+
+ # Remove an option (the first parameter) of the form option=<number> from
+ # a mount options string (the rest of the parameters).
+ parse_mount_opts () {
+ OPT="$1"
+ shift
+ echo ",$*," | sed \
+ -e 's/,'"$OPT"'=[0-9]*,/,/g' \
+ -e 's/,,*/,/g' \
+ -e 's/^,//' \
+ -e 's/,$//'
+ }
+
+ # Remove an option (the first parameter) without any arguments from
+ # a mount option string (the rest of the parameters).
+ parse_nonumber_mount_opts () {
+ OPT="$1"
+ shift
+ echo ",$*," | sed \
+ -e 's/,'"$OPT"',/,/g' \
+ -e 's/,,*/,/g' \
+ -e 's/^,//' \
+ -e 's/,$//'
+ }
+
+ # Find out the state of a yes/no option (e.g. "atime"/"noatime") in
+ # fstab for a given filesystem, and use this state to replace the
+ # value of the option in another mount options string. The device
+ # is the first argument, the option name the second, and the default
+ # value the third. The remainder is the mount options string.
+ #
+ # Example:
+ # parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
+ #
+ # If fstab contains, say, "rw" for this filesystem, then the result
+ # will be "defaults,atime".
+ parse_yesno_opts_wfstab () {
+ L_DEV="$1"
+ OPT="$2"
+ DEF_OPT="$3"
+ shift 3
+ L_OPTS="$*"
+ PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
+ PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
+ # Watch for a default atime in fstab
+ FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
+ if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then
+ # option specified in fstab: extract the value and use it
+ if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then
+ echo "$PARSEDOPTS1,no$OPT"
+ else
+ # no$OPT not found -- so we must have $OPT.
+ echo "$PARSEDOPTS1,$OPT"
+ fi
+ else
+ # option not specified in fstab -- choose the default.
+ echo "$PARSEDOPTS1,$DEF_OPT"
+ fi
+ }
+
+ # Find out the state of a numbered option (e.g. "commit=NNN") in
+ # fstab for a given filesystem, and use this state to replace the
+ # value of the option in another mount options string. The device
+ # is the first argument, and the option name the second. The
+ # remainder is the mount options string in which the replacement
+ # must be done.
+ #
+ # Example:
+ # parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
+ #
+ # If fstab contains, say, "commit=3,rw" for this filesystem, then the
+ # result will be "rw,commit=3".
+ parse_mount_opts_wfstab () {
+ L_DEV="$1"
+ OPT="$2"
+ shift 2
+ L_OPTS="$*"
+ PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
+ # Watch for a default commit in fstab
+ FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
+ if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then
+ # option specified in fstab: extract the value, and use it
+ echo -n "$PARSEDOPTS1,$OPT="
+ echo ",$FSTAB_OPTS," | sed \
+ -e 's/.*,'"$OPT"'=//' \
+ -e 's/,.*//'
+ else
+ # option not specified in fstab: set it to 0
+ echo "$PARSEDOPTS1,$OPT=0"
+ fi
+ }
+
+ deduce_fstype () {
+ MP="$1"
+ # My root filesystem unfortunately has
+ # type "unknown" in /etc/mtab. If we encounter
+ # "unknown", we try to get the type from fstab.
+ cat /etc/fstab |
+ grep -v '^#' |
+ while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do
+ if [ "$FSTAB_MP" = "$MP" ]; then
+ echo $FSTAB_FST
+ exit 0
+ fi
+ done
+ }
+
+ if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then
+ NOATIME_OPT=",noatime"
+ fi
+
+ case "$1" in
+ start)
+ AGE=$((100*$MAX_AGE))
+ XFS_AGE=$(($XFS_HZ*$MAX_AGE))
+ echo -n "Starting laptop_mode"
+
+ if [ -d /proc/sys/vm/pagebuf ] ; then
+ # (For 2.4 and early 2.6.)
+ # This only needs to be set, not reset -- it is only used when
+ # laptop mode is enabled.
+ echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
+ echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+ elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+ # (A couple of early 2.6 laptop mode patches had these.)
+ # The same goes for these.
+ echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
+ echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+ elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
+ # (2.6.6)
+ # But not for these -- they are also used in normal
+ # operation.
+ echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
+ echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
+ elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
+ # (2.6.7 upwards)
+ # And not for these either. These are in centisecs,
+ # not USER_HZ, so we have to use $AGE, not $XFS_AGE.
+ echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs
+ echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs
+ echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs
+ fi
+
+ case "$KLEVEL" in
+ "2.4")
+ echo 1 > /proc/sys/vm/laptop_mode
+ echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush
+ ;;
+ "2.6")
+ echo 5 > /proc/sys/vm/laptop_mode
+ echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs
+ echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs
+ echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio
+ echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio
+ ;;
+ esac
+ if [ $DO_REMOUNTS -eq 1 ]; then
+ cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+ PARSEDOPTS="$(parse_mount_opts "$OPTS")"
+ if [ "$FST" = 'unknown' ]; then
+ FST=$(deduce_fstype $MP)
+ fi
+ case "$FST" in
+ "ext3"|"reiserfs")
+ PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
+ mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT
+ ;;
+ "xfs")
+ mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT
+ ;;
+ esac
+ if [ -b $DEV ] ; then
+ blockdev --setra $(($READAHEAD * 2)) $DEV
+ fi
+ done
+ fi
+ if [ $DO_HD -eq 1 ] ; then
+ for THISHD in $HD ; do
+ /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1
+ /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1
+ done
+ fi
+ if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
+ if [ $CPU_MAXFREQ = 'slowest' ]; then
+ CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq`
+ fi
+ echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
+ fi
+ echo "."
+ ;;
+ stop)
+ U_AGE=$((100*$DEF_UPDATE))
+ B_AGE=$((100*$DEF_AGE))
+ echo -n "Stopping laptop_mode"
+ echo 0 > /proc/sys/vm/laptop_mode
+ if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+ # These need to be restored, if there are no lm_*.
+ echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer
+ echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval
+ elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
+ # These need to be restored as well.
+ echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs
+ echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs
+ echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs
+ fi
+ case "$KLEVEL" in
+ "2.4")
+ echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush
+ ;;
+ "2.6")
+ echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs
+ echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs
+ echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio
+ echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio
+ ;;
+ esac
+ if [ $DO_REMOUNTS -eq 1 ] ; then
+ cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+ # Reset commit and atime options to defaults.
+ if [ "$FST" = 'unknown' ]; then
+ FST=$(deduce_fstype $MP)
+ fi
+ case "$FST" in
+ "ext3"|"reiserfs")
+ PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
+ PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
+ mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+ ;;
+ "xfs")
+ PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
+ mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+ ;;
+ esac
+ if [ -b $DEV ] ; then
+ blockdev --setra 256 $DEV
+ fi
+ done
+ fi
+ if [ $DO_HD -eq 1 ] ; then
+ for THISHD in $HD ; do
+ /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1
+ /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1
+ done
+ fi
+ if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
+ echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
+ fi
+ echo "."
+ ;;
+ *)
+ echo "Usage: $0 {start|stop}" 2>&1
+ exit 1
+ ;;
+
+ esac
+
+ exit 0
+
+
+ACPI integration
+----------------
+
+Dax Kelson submitted this so that the ACPI acpid daemon will
+kick off the laptop_mode script and run hdparm. The part that
+automatically disables laptop mode when the battery is low was
+written by Jan Topinski.
+
+/etc/acpi/events/ac_adapter::
+
+ event=ac_adapter
+ action=/etc/acpi/actions/ac.sh %e
+
+/etc/acpi/events/battery::
+
+ event=battery.*
+ action=/etc/acpi/actions/battery.sh %e
+
+/etc/acpi/actions/ac.sh::
+
+ #!/bin/bash
+
+ # ac on/offline event handler
+
+ status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state`
+
+ case $status in
+ "on-line")
+ /sbin/laptop_mode stop
+ exit 0
+ ;;
+ "off-line")
+ /sbin/laptop_mode start
+ exit 0
+ ;;
+ esac
+
+
+/etc/acpi/actions/battery.sh::
+
+ #! /bin/bash
+
+ # Automatically disable laptop mode when the battery almost runs out.
+
+ BATT_INFO=/proc/acpi/battery/$2/state
+
+ if [[ -f /proc/sys/vm/laptop_mode ]]
+ then
+ LM=`cat /proc/sys/vm/laptop_mode`
+ if [[ $LM -gt 0 ]]
+ then
+ if [[ -f $BATT_INFO ]]
+ then
+ # Source the config file only now that we know we need
+ if [ -f /etc/default/laptop-mode ] ; then
+ # Debian
+ . /etc/default/laptop-mode
+ elif [ -f /etc/sysconfig/laptop-mode ] ; then
+ # Others
+ . /etc/sysconfig/laptop-mode
+ fi
+ MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'}
+
+ ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`"
+ if [[ ACTION -eq "discharging" ]]
+ then
+ PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" `
+ REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" `
+ fi
+ if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES))
+ then
+ /sbin/laptop_mode stop
+ fi
+ else
+ logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path."
+ fi
+ fi
+ fi
+
+
+Monitoring tool
+---------------
+
+Bartek Kania submitted this, it can be used to measure how much time your disk
+spends spun up/down. See tools/laptop/dslm/dslm.c
diff --git a/Documentation/admin-guide/laptops/lg-laptop.rst b/Documentation/admin-guide/laptops/lg-laptop.rst
new file mode 100644
index 000000000000..ce9b14671cb9
--- /dev/null
+++ b/Documentation/admin-guide/laptops/lg-laptop.rst
@@ -0,0 +1,84 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+
+LG Gram laptop extra features
+=============================
+
+By Matan Ziv-Av <matan@svgalib.org>
+
+
+Hotkeys
+-------
+
+The following FN keys are ignored by the kernel without this driver:
+
+- FN-F1 (LG control panel) - Generates F15
+- FN-F5 (Touchpad toggle) - Generates F13
+- FN-F6 (Airplane mode) - Generates RFKILL
+- FN-F8 (Keyboard backlight) - Generates F16.
+ This key also changes keyboard backlight mode.
+- FN-F9 (Reader mode) - Generates F14
+
+The rest of the FN keys work without a need for a special driver.
+
+
+Reader mode
+-----------
+
+Writing 0/1 to /sys/devices/platform/lg-laptop/reader_mode disables/enables
+reader mode. In this mode the screen colors change (blue color reduced),
+and the reader mode indicator LED (on F9 key) turns on.
+
+
+FN Lock
+-------
+
+Writing 0/1 to /sys/devices/platform/lg-laptop/fn_lock disables/enables
+FN lock.
+
+
+Battery care limit
+------------------
+
+Writing 80/100 to /sys/devices/platform/lg-laptop/battery_care_limit
+sets the maximum capacity to charge the battery. Limiting the charge
+reduces battery capacity loss over time.
+
+This value is reset to 100 when the kernel boots.
+
+
+Fan mode
+--------
+
+Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables
+the fan silent mode.
+
+
+USB charge
+----------
+
+Writing 0/1 to /sys/devices/platform/lg-laptop/usb_charge disables/enables
+charging another device from the USB port while the device is turned off.
+
+This value is reset to 0 when the kernel boots.
+
+
+LEDs
+~~~~
+
+The are two LED devices supported by the driver:
+
+Keyboard backlight
+------------------
+
+A led device named kbd_led controls the keyboard backlight. There are three
+lighting level: off (0), low (127) and high (255).
+
+The keyboard backlight is also controlled by the key combination FN-F8
+which cycles through those levels.
+
+
+Touchpad indicator LED
+----------------------
+
+On the F5 key. Controlled by led device names tpad_led.
diff --git a/Documentation/admin-guide/laptops/sony-laptop.rst b/Documentation/admin-guide/laptops/sony-laptop.rst
new file mode 100644
index 000000000000..9edcc7f6612f
--- /dev/null
+++ b/Documentation/admin-guide/laptops/sony-laptop.rst
@@ -0,0 +1,174 @@
+=========================================
+Sony Notebook Control Driver (SNC) Readme
+=========================================
+
+ - Copyright (C) 2004- 2005 Stelian Pop <stelian@popies.net>
+ - Copyright (C) 2007 Mattia Dongili <malattia@linux.it>
+
+This mini-driver drives the SNC and SPIC device present in the ACPI BIOS of the
+Sony Vaio laptops. This driver mixes both devices functions under the same
+(hopefully consistent) interface. This also means that the sonypi driver is
+obsoleted by sony-laptop now.
+
+Fn keys (hotkeys):
+------------------
+
+Some models report hotkeys through the SNC or SPIC devices, such events are
+reported both through the ACPI subsystem as acpi events and through the INPUT
+subsystem. See the logs of /proc/bus/input/devices to find out what those
+events are and which input devices are created by the driver.
+Additionally, loading the driver with the debug option will report all events
+in the kernel log.
+
+The "scancodes" passed to the input system (that can be remapped with udev)
+are indexes to the table "sony_laptop_input_keycode_map" in the sony-laptop.c
+module. For example the "FN/E" key combination (EJECTCD on some models)
+generates the scancode 20 (0x14).
+
+Backlight control:
+------------------
+If your laptop model supports it, you will find sysfs files in the
+/sys/class/backlight/sony/
+directory. You will be able to query and set the current screen
+brightness:
+
+ ====================== =========================================
+ brightness get/set screen brightness (an integer
+ between 0 and 7)
+ actual_brightness reading from this file will query the HW
+ to get real brightness value
+ max_brightness the maximum brightness value
+ ====================== =========================================
+
+
+Platform specific:
+------------------
+Loading the sony-laptop module will create a
+/sys/devices/platform/sony-laptop/
+directory populated with some files.
+
+You then read/write integer values from/to those files by using
+standard UNIX tools.
+
+The files are:
+
+ ====================== ==========================================
+ brightness_default screen brightness which will be set
+ when the laptop will be rebooted
+ cdpower power on/off the internal CD drive
+ audiopower power on/off the internal sound card
+ lanpower power on/off the internal ethernet card
+ (only in debug mode)
+ bluetoothpower power on/off the internal bluetooth device
+ fanspeed get/set the fan speed
+ ====================== ==========================================
+
+Note that some files may be missing if they are not supported
+by your particular laptop model.
+
+Example usage::
+
+ # echo "1" > /sys/devices/platform/sony-laptop/brightness_default
+
+sets the lowest screen brightness for the next and later reboots
+
+::
+
+ # echo "8" > /sys/devices/platform/sony-laptop/brightness_default
+
+sets the highest screen brightness for the next and later reboots
+
+::
+
+ # cat /sys/devices/platform/sony-laptop/brightness_default
+
+retrieves the value
+
+::
+
+ # echo "0" > /sys/devices/platform/sony-laptop/audiopower
+
+powers off the sound card
+
+::
+
+ # echo "1" > /sys/devices/platform/sony-laptop/audiopower
+
+powers on the sound card.
+
+
+RFkill control:
+---------------
+More recent Vaio models expose a consistent set of ACPI methods to
+control radio frequency emitting devices. If you are a lucky owner of
+such a laptop you will find the necessary rfkill devices under
+/sys/class/rfkill. Check those starting with sony-* in::
+
+ # grep . /sys/class/rfkill/*/{state,name}
+
+
+Development:
+------------
+
+If you want to help with the development of this driver (and
+you are not afraid of any side effects doing strange things with
+your ACPI BIOS could have on your laptop), load the driver and
+pass the option 'debug=1'.
+
+REPEAT:
+ **DON'T DO THIS IF YOU DON'T LIKE RISKY BUSINESS.**
+
+In your kernel logs you will find the list of all ACPI methods
+the SNC device has on your laptop.
+
+* For new models you will see a long list of meaningless method names,
+ reading the DSDT table source should reveal that:
+
+(1) the SNC device uses an internal capability lookup table
+(2) SN00 is used to find values in the lookup table
+(3) SN06 and SN07 are used to call into the real methods based on
+ offsets you can obtain iterating the table using SN00
+(4) SN02 used to enable events.
+
+Some values in the capability lookup table are more or less known, see
+the code for all sony_call_snc_handle calls, others are more obscure.
+
+* For old models you can see the GCDP/GCDP methods used to pwer on/off
+ the CD drive, but there are others and they are usually different from
+ model to model.
+
+**I HAVE NO IDEA WHAT THOSE METHODS DO.**
+
+The sony-laptop driver creates, for some of those methods (the most
+current ones found on several Vaio models), an entry under
+/sys/devices/platform/sony-laptop, just like the 'cdpower' one.
+You can create other entries corresponding to your own laptop methods by
+further editing the source (see the 'sony_nc_values' table, and add a new
+entry to this table with your get/set method names using the
+SNC_HANDLE_NAMES macro).
+
+Your mission, should you accept it, is to try finding out what
+those entries are for, by reading/writing random values from/to those
+files and find out what is the impact on your laptop.
+
+Should you find anything interesting, please report it back to me,
+I will not disavow all knowledge of your actions :)
+
+See also http://www.linux.it/~malattia/wiki/index.php/Sony_drivers for other
+useful info.
+
+Bugs/Limitations:
+-----------------
+
+* This driver is not based on official documentation from Sony
+ (because there is none), so there is no guarantee this driver
+ will work at all, or do the right thing. Although this hasn't
+ happened to me, this driver could do very bad things to your
+ laptop, including permanent damage.
+
+* The sony-laptop and sonypi drivers do not interact at all. In the
+ future, sonypi will be removed and replaced by sony-laptop.
+
+* spicctrl, which is the userspace tool used to communicate with the
+ sonypi driver (through /dev/sonypi) is deprecated as well since all
+ its features are now available under the sysfs tree via sony-laptop.
diff --git a/Documentation/admin-guide/laptops/sonypi.rst b/Documentation/admin-guide/laptops/sonypi.rst
new file mode 100644
index 000000000000..c6eaaf48f7c1
--- /dev/null
+++ b/Documentation/admin-guide/laptops/sonypi.rst
@@ -0,0 +1,158 @@
+==================================================
+Sony Programmable I/O Control Device Driver Readme
+==================================================
+
+ - Copyright (C) 2001-2004 Stelian Pop <stelian@popies.net>
+ - Copyright (C) 2001-2002 Alcôve <www.alcove.com>
+ - Copyright (C) 2001 Michael Ashley <m.ashley@unsw.edu.au>
+ - Copyright (C) 2001 Junichi Morita <jun1m@mars.dti.ne.jp>
+ - Copyright (C) 2000 Takaya Kinjo <t-kinjo@tc4.so-net.ne.jp>
+ - Copyright (C) 2000 Andrew Tridgell <tridge@samba.org>
+
+This driver enables access to the Sony Programmable I/O Control Device which
+can be found in many Sony Vaio laptops. Some newer Sony laptops (seems to be
+limited to new FX series laptops, at least the FX501 and the FX702) lack a
+sonypi device and are not supported at all by this driver.
+
+It will give access (through a user space utility) to some events those laptops
+generate, like:
+
+ - jogdial events (the small wheel on the side of Vaios)
+ - capture button events (only on Vaio Picturebook series)
+ - Fn keys
+ - bluetooth button (only on C1VR model)
+ - programmable keys, back, help, zoom, thumbphrase buttons, etc.
+ (when available)
+
+Those events (see linux/sonypi.h) can be polled using the character device node
+/dev/sonypi (major 10, minor auto allocated or specified as a option).
+A simple daemon which translates the jogdial movements into mouse wheel events
+can be downloaded at: <http://popies.net/sonypi/>
+
+Another option to intercept the events is to get them directly through the
+input layer.
+
+This driver supports also some ioctl commands for setting the LCD screen
+brightness and querying the batteries charge information (some more
+commands may be added in the future).
+
+This driver can also be used to set the camera controls on Picturebook series
+(brightness, contrast etc), and is used by the video4linux driver for the
+Motion Eye camera.
+
+Please note that this driver was created by reverse engineering the Windows
+driver and the ACPI BIOS, because Sony doesn't agree to release any programming
+specs for its laptops. If someone convinces them to do so, drop me a note.
+
+Driver options:
+---------------
+
+Several options can be passed to the sonypi driver using the standard
+module argument syntax (<param>=<value> when passing the option to the
+module or sonypi.<param>=<value> on the kernel boot line when sonypi is
+statically linked into the kernel). Those options are:
+
+ =============== =======================================================
+ minor: minor number of the misc device /dev/sonypi,
+ default is -1 (automatic allocation, see /proc/misc
+ or kernel logs)
+
+ camera: if you have a PictureBook series Vaio (with the
+ integrated MotionEye camera), set this parameter to 1
+ in order to let the driver access to the camera
+
+ fnkeyinit: on some Vaios (C1VE, C1VR etc), the Fn key events don't
+ get enabled unless you set this parameter to 1.
+ Do not use this option unless it's actually necessary,
+ some Vaio models don't deal well with this option.
+ This option is available only if the kernel is
+ compiled without ACPI support (since it conflicts
+ with it and it shouldn't be required anyway if
+ ACPI is already enabled).
+
+ verbose: set to 1 to print unknown events received from the
+ sonypi device.
+ set to 2 to print all events received from the
+ sonypi device.
+
+ compat: uses some compatibility code for enabling the sonypi
+ events. If the driver worked for you in the past
+ (prior to version 1.5) and does not work anymore,
+ add this option and report to the author.
+
+ mask: event mask telling the driver what events will be
+ reported to the user. This parameter is required for
+ some Vaio models where the hardware reuses values
+ used in other Vaio models (like the FX series who does
+ not have a jogdial but reuses the jogdial events for
+ programmable keys events). The default event mask is
+ set to 0xffffffff, meaning that all possible events
+ will be tried. You can use the following bits to
+ construct your own event mask (from
+ drivers/char/sonypi.h)::
+
+ SONYPI_JOGGER_MASK 0x0001
+ SONYPI_CAPTURE_MASK 0x0002
+ SONYPI_FNKEY_MASK 0x0004
+ SONYPI_BLUETOOTH_MASK 0x0008
+ SONYPI_PKEY_MASK 0x0010
+ SONYPI_BACK_MASK 0x0020
+ SONYPI_HELP_MASK 0x0040
+ SONYPI_LID_MASK 0x0080
+ SONYPI_ZOOM_MASK 0x0100
+ SONYPI_THUMBPHRASE_MASK 0x0200
+ SONYPI_MEYE_MASK 0x0400
+ SONYPI_MEMORYSTICK_MASK 0x0800
+ SONYPI_BATTERY_MASK 0x1000
+ SONYPI_WIRELESS_MASK 0x2000
+
+ useinput: if set (which is the default) two input devices are
+ created, one which interprets the jogdial events as
+ mouse events, the other one which acts like a
+ keyboard reporting the pressing of the special keys.
+ =============== =======================================================
+
+Module use:
+-----------
+
+In order to automatically load the sonypi module on use, you can put those
+lines a configuration file in /etc/modprobe.d/::
+
+ alias char-major-10-250 sonypi
+ options sonypi minor=250
+
+This supposes the use of minor 250 for the sonypi device::
+
+ # mknod /dev/sonypi c 10 250
+
+Bugs:
+-----
+
+ - several users reported that this driver disables the BIOS-managed
+ Fn-keys which put the laptop in sleeping state, or switch the
+ external monitor on/off. There is no workaround yet, since this
+ driver disables all APM management for those keys, by enabling the
+ ACPI management (and the ACPI core stuff is not complete yet). If
+ you have one of those laptops with working Fn keys and want to
+ continue to use them, don't use this driver.
+
+ - some users reported that the laptop speed is lower (dhrystone
+ tested) when using the driver with the fnkeyinit parameter. I cannot
+ reproduce it on my laptop and not all users have this problem.
+ This happens because the fnkeyinit parameter enables the ACPI
+ mode (but without additional ACPI control, like processor
+ speed handling etc). Use ACPI instead of APM if it works on your
+ laptop.
+
+ - sonypi lacks the ability to distinguish between certain key
+ events on some models.
+
+ - some models with the nvidia card (geforce go 6200 tc) uses a
+ different way to adjust the backlighting of the screen. There
+ is a userspace utility to adjust the brightness on those models,
+ which can be downloaded from
+ http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2
+
+ - since all development was done by reverse engineering, there is
+ *absolutely no guarantee* that this driver will not crash your
+ laptop. Permanently.
diff --git a/Documentation/admin-guide/laptops/thinkpad-acpi.rst b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
new file mode 100644
index 000000000000..adea0bf2acc5
--- /dev/null
+++ b/Documentation/admin-guide/laptops/thinkpad-acpi.rst
@@ -0,0 +1,1562 @@
+===========================
+ThinkPad ACPI Extras Driver
+===========================
+
+Version 0.25
+
+October 16th, 2013
+
+- Borislav Deianov <borislav@users.sf.net>
+- Henrique de Moraes Holschuh <hmh@hmh.eng.br>
+
+http://ibm-acpi.sf.net/
+
+This is a Linux driver for the IBM and Lenovo ThinkPad laptops. It
+supports various features of these laptops which are accessible
+through the ACPI and ACPI EC framework, but not otherwise fully
+supported by the generic Linux ACPI drivers.
+
+This driver used to be named ibm-acpi until kernel 2.6.21 and release
+0.13-20070314. It used to be in the drivers/acpi tree, but it was
+moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel
+2.6.22, and release 0.14. It was moved to drivers/platform/x86 for
+kernel 2.6.29 and release 0.22.
+
+The driver is named "thinkpad-acpi". In some places, like module
+names and log messages, "thinkpad_acpi" is used because of userspace
+issues.
+
+"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too
+long due to length limitations on some Linux kernel versions.
+
+Status
+------
+
+The features currently supported are the following (see below for
+detailed description):
+
+ - Fn key combinations
+ - Bluetooth enable and disable
+ - video output switching, expansion control
+ - ThinkLight on and off
+ - CMOS/UCMS control
+ - LED control
+ - ACPI sounds
+ - temperature sensors
+ - Experimental: embedded controller register dump
+ - LCD brightness control
+ - Volume control
+ - Fan control and monitoring: fan speed, fan enable/disable
+ - WAN enable and disable
+ - UWB enable and disable
+
+A compatibility table by model and feature is maintained on the web
+site, http://ibm-acpi.sf.net/. I appreciate any success or failure
+reports, especially if they add to or correct the compatibility table.
+Please include the following information in your report:
+
+ - ThinkPad model name
+ - a copy of your ACPI tables, using the "acpidump" utility
+ - a copy of the output of dmidecode, with serial numbers
+ and UUIDs masked off
+ - which driver features work and which don't
+ - the observed behavior of non-working features
+
+Any other comments or patches are also more than welcome.
+
+
+Installation
+------------
+
+If you are compiling this driver as included in the Linux kernel
+sources, look for the CONFIG_THINKPAD_ACPI Kconfig option.
+It is located on the menu path: "Device Drivers" -> "X86 Platform
+Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras".
+
+
+Features
+--------
+
+The driver exports two different interfaces to userspace, which can be
+used to access the features it provides. One is a legacy procfs-based
+interface, which will be removed at some time in the future. The other
+is a new sysfs-based interface which is not complete yet.
+
+The procfs interface creates the /proc/acpi/ibm directory. There is a
+file under that directory for each feature it supports. The procfs
+interface is mostly frozen, and will change very little if at all: it
+will not be extended to add any new functionality in the driver, instead
+all new functionality will be implemented on the sysfs interface.
+
+The sysfs interface tries to blend in the generic Linux sysfs subsystems
+and classes as much as possible. Since some of these subsystems are not
+yet ready or stabilized, it is expected that this interface will change,
+and any and all userspace programs must deal with it.
+
+
+Notes about the sysfs interface
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Unlike what was done with the procfs interface, correctness when talking
+to the sysfs interfaces will be enforced, as will correctness in the
+thinkpad-acpi's implementation of sysfs interfaces.
+
+Also, any bugs in the thinkpad-acpi sysfs driver code or in the
+thinkpad-acpi's implementation of the sysfs interfaces will be fixed for
+maximum correctness, even if that means changing an interface in
+non-compatible ways. As these interfaces mature both in the kernel and
+in thinkpad-acpi, such changes should become quite rare.
+
+Applications interfacing to the thinkpad-acpi sysfs interfaces must
+follow all sysfs guidelines and correctly process all errors (the sysfs
+interface makes extensive use of errors). File descriptors and open /
+close operations to the sysfs inodes must also be properly implemented.
+
+The version of thinkpad-acpi's sysfs interface is exported by the driver
+as a driver attribute (see below).
+
+Sysfs driver attributes are on the driver's sysfs attribute space,
+for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and
+/sys/bus/platform/drivers/thinkpad_hwmon/
+
+Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
+space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/.
+
+Sysfs device attributes for the sensors and fan are on the
+thinkpad_hwmon device's sysfs attribute space, but you should locate it
+looking for a hwmon device with the name attribute of "thinkpad", or
+better yet, through libsensors. For 4.14+ sysfs attributes were moved to the
+hwmon device (/sys/bus/platform/devices/thinkpad_hwmon/hwmon/hwmon? or
+/sys/class/hwmon/hwmon?).
+
+Driver version
+--------------
+
+procfs: /proc/acpi/ibm/driver
+
+sysfs driver attribute: version
+
+The driver name and version. No commands can be written to this file.
+
+
+Sysfs interface version
+-----------------------
+
+sysfs driver attribute: interface_version
+
+Version of the thinkpad-acpi sysfs interface, as an unsigned long
+(output in hex format: 0xAAAABBCC), where:
+
+ AAAA
+ - major revision
+ BB
+ - minor revision
+ CC
+ - bugfix revision
+
+The sysfs interface version changelog for the driver can be found at the
+end of this document. Changes to the sysfs interface done by the kernel
+subsystems are not documented here, nor are they tracked by this
+attribute.
+
+Changes to the thinkpad-acpi sysfs interface are only considered
+non-experimental when they are submitted to Linux mainline, at which
+point the changes in this interface are documented and interface_version
+may be updated. If you are using any thinkpad-acpi features not yet
+sent to mainline for merging, you do so on your own risk: these features
+may disappear, or be implemented in a different and incompatible way by
+the time they are merged in Linux mainline.
+
+Changes that are backwards-compatible by nature (e.g. the addition of
+attributes that do not change the way the other attributes work) do not
+always warrant an update of interface_version. Therefore, one must
+expect that an attribute might not be there, and deal with it properly
+(an attribute not being there *is* a valid way to make it clear that a
+feature is not available in sysfs).
+
+
+Hot keys
+--------
+
+procfs: /proc/acpi/ibm/hotkey
+
+sysfs device attribute: hotkey_*
+
+In a ThinkPad, the ACPI HKEY handler is responsible for communicating
+some important events and also keyboard hot key presses to the operating
+system. Enabling the hotkey functionality of thinkpad-acpi signals the
+firmware that such a driver is present, and modifies how the ThinkPad
+firmware will behave in many situations.
+
+The driver enables the HKEY ("hot key") event reporting automatically
+when loaded, and disables it when it is removed.
+
+The driver will report HKEY events in the following format::
+
+ ibm/hotkey HKEY 00000080 0000xxxx
+
+Some of these events refer to hot key presses, but not all of them.
+
+The driver will generate events over the input layer for hot keys and
+radio switches, and over the ACPI netlink layer for other events. The
+input layer support accepts the standard IOCTLs to remap the keycodes
+assigned to each hot key.
+
+The hot key bit mask allows some control over which hot keys generate
+events. If a key is "masked" (bit set to 0 in the mask), the firmware
+will handle it. If it is "unmasked", it signals the firmware that
+thinkpad-acpi would prefer to handle it, if the firmware would be so
+kind to allow it (and it often doesn't!).
+
+Not all bits in the mask can be modified. Not all bits that can be
+modified do anything. Not all hot keys can be individually controlled
+by the mask. Some models do not support the mask at all. The behaviour
+of the mask is, therefore, highly dependent on the ThinkPad model.
+
+The driver will filter out any unmasked hotkeys, so even if the firmware
+doesn't allow disabling an specific hotkey, the driver will not report
+events for unmasked hotkeys.
+
+Note that unmasking some keys prevents their default behavior. For
+example, if Fn+F5 is unmasked, that key will no longer enable/disable
+Bluetooth by itself in firmware.
+
+Note also that not all Fn key combinations are supported through ACPI
+depending on the ThinkPad model and firmware version. On those
+ThinkPads, it is still possible to support some extra hotkeys by
+polling the "CMOS NVRAM" at least 10 times per second. The driver
+attempts to enables this functionality automatically when required.
+
+procfs notes
+^^^^^^^^^^^^
+
+The following commands can be written to the /proc/acpi/ibm/hotkey file::
+
+ echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys
+ echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys
+ ... any other 8-hex-digit mask ...
+ echo reset > /proc/acpi/ibm/hotkey -- restore the recommended mask
+
+The following commands have been deprecated and will cause the kernel
+to log a warning::
+
+ echo enable > /proc/acpi/ibm/hotkey -- does nothing
+ echo disable > /proc/acpi/ibm/hotkey -- returns an error
+
+The procfs interface does not support NVRAM polling control. So as to
+maintain maximum bug-to-bug compatibility, it does not report any masks,
+nor does it allow one to manipulate the hot key mask when the firmware
+does not support masks at all, even if NVRAM polling is in use.
+
+sysfs notes
+^^^^^^^^^^^
+
+ hotkey_bios_enabled:
+ DEPRECATED, WILL BE REMOVED SOON.
+
+ Returns 0.
+
+ hotkey_bios_mask:
+ DEPRECATED, DON'T USE, WILL BE REMOVED IN THE FUTURE.
+
+ Returns the hot keys mask when thinkpad-acpi was loaded.
+ Upon module unload, the hot keys mask will be restored
+ to this value. This is always 0x80c, because those are
+ the hotkeys that were supported by ancient firmware
+ without mask support.
+
+ hotkey_enable:
+ DEPRECATED, WILL BE REMOVED SOON.
+
+ 0: returns -EPERM
+ 1: does nothing
+
+ hotkey_mask:
+ bit mask to enable reporting (and depending on
+ the firmware, ACPI event generation) for each hot key
+ (see above). Returns the current status of the hot keys
+ mask, and allows one to modify it.
+
+ hotkey_all_mask:
+ bit mask that should enable event reporting for all
+ supported hot keys, when echoed to hotkey_mask above.
+ Unless you know which events need to be handled
+ passively (because the firmware *will* handle them
+ anyway), do *not* use hotkey_all_mask. Use
+ hotkey_recommended_mask, instead. You have been warned.
+
+ hotkey_recommended_mask:
+ bit mask that should enable event reporting for all
+ supported hot keys, except those which are always
+ handled by the firmware anyway. Echo it to
+ hotkey_mask above, to use. This is the default mask
+ used by the driver.
+
+ hotkey_source_mask:
+ bit mask that selects which hot keys will the driver
+ poll the NVRAM for. This is auto-detected by the driver
+ based on the capabilities reported by the ACPI firmware,
+ but it can be overridden at runtime.
+
+ Hot keys whose bits are set in hotkey_source_mask are
+ polled for in NVRAM, and reported as hotkey events if
+ enabled in hotkey_mask. Only a few hot keys are
+ available through CMOS NVRAM polling.
+
+ Warning: when in NVRAM mode, the volume up/down/mute
+ keys are synthesized according to changes in the mixer,
+ which uses a single volume up or volume down hotkey
+ press to unmute, as per the ThinkPad volume mixer user
+ interface. When in ACPI event mode, volume up/down/mute
+ events are reported by the firmware and can behave
+ differently (and that behaviour changes with firmware
+ version -- not just with firmware models -- as well as
+ OSI(Linux) state).
+
+ hotkey_poll_freq:
+ frequency in Hz for hot key polling. It must be between
+ 0 and 25 Hz. Polling is only carried out when strictly
+ needed.
+
+ Setting hotkey_poll_freq to zero disables polling, and
+ will cause hot key presses that require NVRAM polling
+ to never be reported.
+
+ Setting hotkey_poll_freq too low may cause repeated
+ pressings of the same hot key to be misreported as a
+ single key press, or to not even be detected at all.
+ The recommended polling frequency is 10Hz.
+
+ hotkey_radio_sw:
+ If the ThinkPad has a hardware radio switch, this
+ attribute will read 0 if the switch is in the "radios
+ disabled" position, and 1 if the switch is in the
+ "radios enabled" position.
+
+ This attribute has poll()/select() support.
+
+ hotkey_tablet_mode:
+ If the ThinkPad has tablet capabilities, this attribute
+ will read 0 if the ThinkPad is in normal mode, and
+ 1 if the ThinkPad is in tablet mode.
+
+ This attribute has poll()/select() support.
+
+ wakeup_reason:
+ Set to 1 if the system is waking up because the user
+ requested a bay ejection. Set to 2 if the system is
+ waking up because the user requested the system to
+ undock. Set to zero for normal wake-ups or wake-ups
+ due to unknown reasons.
+
+ This attribute has poll()/select() support.
+
+ wakeup_hotunplug_complete:
+ Set to 1 if the system was waken up because of an
+ undock or bay ejection request, and that request
+ was successfully completed. At this point, it might
+ be useful to send the system back to sleep, at the
+ user's choice. Refer to HKEY events 0x4003 and
+ 0x3003, below.
+
+ This attribute has poll()/select() support.
+
+input layer notes
+^^^^^^^^^^^^^^^^^
+
+A Hot key is mapped to a single input layer EV_KEY event, possibly
+followed by an EV_MSC MSC_SCAN event that shall contain that key's scan
+code. An EV_SYN event will always be generated to mark the end of the
+event block.
+
+Do not use the EV_MSC MSC_SCAN events to process keys. They are to be
+used as a helper to remap keys, only. They are particularly useful when
+remapping KEY_UNKNOWN keys.
+
+The events are available in an input device, with the following id:
+
+ ============== ==============================
+ Bus BUS_HOST
+ vendor 0x1014 (PCI_VENDOR_ID_IBM) or
+ 0x17aa (PCI_VENDOR_ID_LENOVO)
+ product 0x5054 ("TP")
+ version 0x4101
+ ============== ==============================
+
+The version will have its LSB incremented if the keymap changes in a
+backwards-compatible way. The MSB shall always be 0x41 for this input
+device. If the MSB is not 0x41, do not use the device as described in
+this section, as it is either something else (e.g. another input device
+exported by a thinkpad driver, such as HDAPS) or its functionality has
+been changed in a non-backwards compatible way.
+
+Adding other event types for other functionalities shall be considered a
+backwards-compatible change for this input device.
+
+Thinkpad-acpi Hot Key event map (version 0x4101):
+
+======= ======= ============== ==============================================
+ACPI Scan
+event code Key Notes
+======= ======= ============== ==============================================
+0x1001 0x00 FN+F1 -
+
+0x1002 0x01 FN+F2 IBM: battery (rare)
+ Lenovo: Screen lock
+
+0x1003 0x02 FN+F3 Many IBM models always report
+ this hot key, even with hot keys
+ disabled or with Fn+F3 masked
+ off
+ IBM: screen lock, often turns
+ off the ThinkLight as side-effect
+ Lenovo: battery
+
+0x1004 0x03 FN+F4 Sleep button (ACPI sleep button
+ semantics, i.e. sleep-to-RAM).
+ It always generates some kind
+ of event, either the hot key
+ event or an ACPI sleep button
+ event. The firmware may
+ refuse to generate further FN+F4
+ key presses until a S3 or S4 ACPI
+ sleep cycle is performed or some
+ time passes.
+
+0x1005 0x04 FN+F5 Radio. Enables/disables
+ the internal Bluetooth hardware
+ and W-WAN card if left in control
+ of the firmware. Does not affect
+ the WLAN card.
+ Should be used to turn on/off all
+ radios (Bluetooth+W-WAN+WLAN),
+ really.
+
+0x1006 0x05 FN+F6 -
+
+0x1007 0x06 FN+F7 Video output cycle.
+ Do you feel lucky today?
+
+0x1008 0x07 FN+F8 IBM: toggle screen expand
+ Lenovo: configure UltraNav,
+ or toggle screen expand
+
+0x1009 0x08 FN+F9 -
+
+... ... ... ...
+
+0x100B 0x0A FN+F11 -
+
+0x100C 0x0B FN+F12 Sleep to disk. You are always
+ supposed to handle it yourself,
+ either through the ACPI event,
+ or through a hotkey event.
+ The firmware may refuse to
+ generate further FN+F12 key
+ press events until a S3 or S4
+ ACPI sleep cycle is performed,
+ or some time passes.
+
+0x100D 0x0C FN+BACKSPACE -
+0x100E 0x0D FN+INSERT -
+0x100F 0x0E FN+DELETE -
+
+0x1010 0x0F FN+HOME Brightness up. This key is
+ always handled by the firmware
+ in IBM ThinkPads, even when
+ unmasked. Just leave it alone.
+ For Lenovo ThinkPads with a new
+ BIOS, it has to be handled either
+ by the ACPI OSI, or by userspace.
+ The driver does the right thing,
+ never mess with this.
+0x1011 0x10 FN+END Brightness down. See brightness
+ up for details.
+
+0x1012 0x11 FN+PGUP ThinkLight toggle. This key is
+ always handled by the firmware,
+ even when unmasked.
+
+0x1013 0x12 FN+PGDOWN -
+
+0x1014 0x13 FN+SPACE Zoom key
+
+0x1015 0x14 VOLUME UP Internal mixer volume up. This
+ key is always handled by the
+ firmware, even when unmasked.
+ NOTE: Lenovo seems to be changing
+ this.
+0x1016 0x15 VOLUME DOWN Internal mixer volume up. This
+ key is always handled by the
+ firmware, even when unmasked.
+ NOTE: Lenovo seems to be changing
+ this.
+0x1017 0x16 MUTE Mute internal mixer. This
+ key is always handled by the
+ firmware, even when unmasked.
+
+0x1018 0x17 THINKPAD ThinkPad/Access IBM/Lenovo key
+
+0x1019 0x18 unknown
+
+... ... ...
+
+0x1020 0x1F unknown
+======= ======= ============== ==============================================
+
+The ThinkPad firmware does not allow one to differentiate when most hot
+keys are pressed or released (either that, or we don't know how to, yet).
+For these keys, the driver generates a set of events for a key press and
+immediately issues the same set of events for a key release. It is
+unknown by the driver if the ThinkPad firmware triggered these events on
+hot key press or release, but the firmware will do it for either one, not
+both.
+
+If a key is mapped to KEY_RESERVED, it generates no input events at all.
+If a key is mapped to KEY_UNKNOWN, it generates an input event that
+includes an scan code. If a key is mapped to anything else, it will
+generate input device EV_KEY events.
+
+In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW
+events for switches:
+
+============== ==============================================
+SW_RFKILL_ALL T60 and later hardware rfkill rocker switch
+SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A
+============== ==============================================
+
+Non hotkey ACPI HKEY event map
+------------------------------
+
+Events that are never propagated by the driver:
+
+====== ==================================================
+0x2304 System is waking up from suspend to undock
+0x2305 System is waking up from suspend to eject bay
+0x2404 System is waking up from hibernation to undock
+0x2405 System is waking up from hibernation to eject bay
+0x5001 Lid closed
+0x5002 Lid opened
+0x5009 Tablet swivel: switched to tablet mode
+0x500A Tablet swivel: switched to normal mode
+0x5010 Brightness level changed/control event
+0x6000 KEYBOARD: Numlock key pressed
+0x6005 KEYBOARD: Fn key pressed (TO BE VERIFIED)
+0x7000 Radio Switch may have changed state
+====== ==================================================
+
+
+Events that are propagated by the driver to userspace:
+
+====== =====================================================
+0x2313 ALARM: System is waking up from suspend because
+ the battery is nearly empty
+0x2413 ALARM: System is waking up from hibernation because
+ the battery is nearly empty
+0x3003 Bay ejection (see 0x2x05) complete, can sleep again
+0x3006 Bay hotplug request (hint to power up SATA link when
+ the optical drive tray is ejected)
+0x4003 Undocked (see 0x2x04), can sleep again
+0x4010 Docked into hotplug port replicator (non-ACPI dock)
+0x4011 Undocked from hotplug port replicator (non-ACPI dock)
+0x500B Tablet pen inserted into its storage bay
+0x500C Tablet pen removed from its storage bay
+0x6011 ALARM: battery is too hot
+0x6012 ALARM: battery is extremely hot
+0x6021 ALARM: a sensor is too hot
+0x6022 ALARM: a sensor is extremely hot
+0x6030 System thermal table changed
+0x6032 Thermal Control command set completion (DYTC, Windows)
+0x6040 Nvidia Optimus/AC adapter related (TO BE VERIFIED)
+0x60C0 X1 Yoga 2016, Tablet mode status changed
+0x60F0 Thermal Transformation changed (GMTS, Windows)
+====== =====================================================
+
+Battery nearly empty alarms are a last resort attempt to get the
+operating system to hibernate or shutdown cleanly (0x2313), or shutdown
+cleanly (0x2413) before power is lost. They must be acted upon, as the
+wake up caused by the firmware will have negated most safety nets...
+
+When any of the "too hot" alarms happen, according to Lenovo the user
+should suspend or hibernate the laptop (and in the case of battery
+alarms, unplug the AC adapter) to let it cool down. These alarms do
+signal that something is wrong, they should never happen on normal
+operating conditions.
+
+The "extremely hot" alarms are emergencies. According to Lenovo, the
+operating system is to force either an immediate suspend or hibernate
+cycle, or a system shutdown. Obviously, something is very wrong if this
+happens.
+
+
+Brightness hotkey notes
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Don't mess with the brightness hotkeys in a Thinkpad. If you want
+notifications for OSD, use the sysfs backlight class event support.
+
+The driver will issue KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN events
+automatically for the cases were userspace has to do something to
+implement brightness changes. When you override these events, you will
+either fail to handle properly the ThinkPads that require explicit
+action to change backlight brightness, or the ThinkPads that require
+that no action be taken to work properly.
+
+
+Bluetooth
+---------
+
+procfs: /proc/acpi/ibm/bluetooth
+
+sysfs device attribute: bluetooth_enable (deprecated)
+
+sysfs rfkill class: switch "tpacpi_bluetooth_sw"
+
+This feature shows the presence and current state of a ThinkPad
+Bluetooth device in the internal ThinkPad CDC slot.
+
+If the ThinkPad supports it, the Bluetooth state is stored in NVRAM,
+so it is kept across reboots and power-off.
+
+Procfs notes
+^^^^^^^^^^^^
+
+If Bluetooth is installed, the following commands can be used::
+
+ echo enable > /proc/acpi/ibm/bluetooth
+ echo disable > /proc/acpi/ibm/bluetooth
+
+Sysfs notes
+^^^^^^^^^^^
+
+ If the Bluetooth CDC card is installed, it can be enabled /
+ disabled through the "bluetooth_enable" thinkpad-acpi device
+ attribute, and its current status can also be queried.
+
+ enable:
+
+ - 0: disables Bluetooth / Bluetooth is disabled
+ - 1: enables Bluetooth / Bluetooth is enabled.
+
+ Note: this interface has been superseded by the generic rfkill
+ class. It has been deprecated, and it will be removed in year
+ 2010.
+
+ rfkill controller switch "tpacpi_bluetooth_sw": refer to
+ Documentation/driver-api/rfkill.rst for details.
+
+
+Video output control -- /proc/acpi/ibm/video
+--------------------------------------------
+
+This feature allows control over the devices used for video output -
+LCD, CRT or DVI (if available). The following commands are available::
+
+ echo lcd_enable > /proc/acpi/ibm/video
+ echo lcd_disable > /proc/acpi/ibm/video
+ echo crt_enable > /proc/acpi/ibm/video
+ echo crt_disable > /proc/acpi/ibm/video
+ echo dvi_enable > /proc/acpi/ibm/video
+ echo dvi_disable > /proc/acpi/ibm/video
+ echo auto_enable > /proc/acpi/ibm/video
+ echo auto_disable > /proc/acpi/ibm/video
+ echo expand_toggle > /proc/acpi/ibm/video
+ echo video_switch > /proc/acpi/ibm/video
+
+NOTE:
+ Access to this feature is restricted to processes owning the
+ CAP_SYS_ADMIN capability for safety reasons, as it can interact badly
+ enough with some versions of X.org to crash it.
+
+Each video output device can be enabled or disabled individually.
+Reading /proc/acpi/ibm/video shows the status of each device.
+
+Automatic video switching can be enabled or disabled. When automatic
+video switching is enabled, certain events (e.g. opening the lid,
+docking or undocking) cause the video output device to change
+automatically. While this can be useful, it also causes flickering
+and, on the X40, video corruption. By disabling automatic switching,
+the flickering or video corruption can be avoided.
+
+The video_switch command cycles through the available video outputs
+(it simulates the behavior of Fn-F7).
+
+Video expansion can be toggled through this feature. This controls
+whether the display is expanded to fill the entire LCD screen when a
+mode with less than full resolution is used. Note that the current
+video expansion status cannot be determined through this feature.
+
+Note that on many models (particularly those using Radeon graphics
+chips) the X driver configures the video card in a way which prevents
+Fn-F7 from working. This also disables the video output switching
+features of this driver, as it uses the same ACPI methods as
+Fn-F7. Video switching on the console should still work.
+
+UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000
+
+
+ThinkLight control
+------------------
+
+procfs: /proc/acpi/ibm/light
+
+sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED
+
+procfs notes
+^^^^^^^^^^^^
+
+The ThinkLight status can be read and set through the procfs interface. A
+few models which do not make the status available will show the ThinkLight
+status as "unknown". The available commands are::
+
+ echo on > /proc/acpi/ibm/light
+ echo off > /proc/acpi/ibm/light
+
+sysfs notes
+^^^^^^^^^^^
+
+The ThinkLight sysfs interface is documented by the LED class
+documentation, in Documentation/leds/leds-class.rst. The ThinkLight LED name
+is "tpacpi::thinklight".
+
+Due to limitations in the sysfs LED class, if the status of the ThinkLight
+cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
+It is impossible to know if the status returned through sysfs is valid.
+
+
+CMOS/UCMS control
+-----------------
+
+procfs: /proc/acpi/ibm/cmos
+
+sysfs device attribute: cmos_command
+
+This feature is mostly used internally by the ACPI firmware to keep the legacy
+CMOS NVRAM bits in sync with the current machine state, and to record this
+state so that the ThinkPad will retain such settings across reboots.
+
+Some of these commands actually perform actions in some ThinkPad models, but
+this is expected to disappear more and more in newer models. As an example, in
+a T43 and in a X40, commands 12 and 13 still control the ThinkLight state for
+real, but commands 0 to 2 don't control the mixer anymore (they have been
+phased out) and just update the NVRAM.
+
+The range of valid cmos command numbers is 0 to 21, but not all have an
+effect and the behavior varies from model to model. Here is the behavior
+on the X40 (tpb is the ThinkPad Buttons utility):
+
+ - 0 - Related to "Volume down" key press
+ - 1 - Related to "Volume up" key press
+ - 2 - Related to "Mute on" key press
+ - 3 - Related to "Access IBM" key press
+ - 4 - Related to "LCD brightness up" key press
+ - 5 - Related to "LCD brightness down" key press
+ - 11 - Related to "toggle screen expansion" key press/function
+ - 12 - Related to "ThinkLight on"
+ - 13 - Related to "ThinkLight off"
+ - 14 - Related to "ThinkLight" key press (toggle ThinkLight)
+
+The cmos command interface is prone to firmware split-brain problems, as
+in newer ThinkPads it is just a compatibility layer. Do not use it, it is
+exported just as a debug tool.
+
+
+LED control
+-----------
+
+procfs: /proc/acpi/ibm/led
+sysfs attributes: as per LED class, see below for names
+
+Some of the LED indicators can be controlled through this feature. On
+some older ThinkPad models, it is possible to query the status of the
+LED indicators as well. Newer ThinkPads cannot query the real status
+of the LED indicators.
+
+Because misuse of the LEDs could induce an unaware user to perform
+dangerous actions (like undocking or ejecting a bay device while the
+buses are still active), or mask an important alarm (such as a nearly
+empty battery, or a broken battery), access to most LEDs is
+restricted.
+
+Unrestricted access to all LEDs requires that thinkpad-acpi be
+compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled.
+Distributions must never enable this option. Individual users that
+are aware of the consequences are welcome to enabling it.
+
+Audio mute and microphone mute LEDs are supported, but currently not
+visible to userspace. They are used by the snd-hda-intel audio driver.
+
+procfs notes
+^^^^^^^^^^^^
+
+The available commands are::
+
+ echo '<LED number> on' >/proc/acpi/ibm/led
+ echo '<LED number> off' >/proc/acpi/ibm/led
+ echo '<LED number> blink' >/proc/acpi/ibm/led
+
+The <LED number> range is 0 to 15. The set of LEDs that can be
+controlled varies from model to model. Here is the common ThinkPad
+mapping:
+
+ - 0 - power
+ - 1 - battery (orange)
+ - 2 - battery (green)
+ - 3 - UltraBase/dock
+ - 4 - UltraBay
+ - 5 - UltraBase battery slot
+ - 6 - (unknown)
+ - 7 - standby
+ - 8 - dock status 1
+ - 9 - dock status 2
+ - 10, 11 - (unknown)
+ - 12 - thinkvantage
+ - 13, 14, 15 - (unknown)
+
+All of the above can be turned on and off and can be made to blink.
+
+sysfs notes
+^^^^^^^^^^^
+
+The ThinkPad LED sysfs interface is described in detail by the LED class
+documentation, in Documentation/leds/leds-class.rst.
+
+The LEDs are named (in LED ID order, from 0 to 12):
+"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt",
+"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt",
+"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1",
+"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3",
+"tpacpi::thinkvantage".
+
+Due to limitations in the sysfs LED class, if the status of the LED
+indicators cannot be read due to an error, thinkpad-acpi will report it as
+a brightness of zero (same as LED off).
+
+If the thinkpad firmware doesn't support reading the current status,
+trying to read the current LED brightness will just return whatever
+brightness was last written to that attribute.
+
+These LEDs can blink using hardware acceleration. To request that a
+ThinkPad indicator LED should blink in hardware accelerated mode, use the
+"timer" trigger, and leave the delay_on and delay_off parameters set to
+zero (to request hardware acceleration autodetection).
+
+LEDs that are known not to exist in a given ThinkPad model are not
+made available through the sysfs interface. If you have a dock and you
+notice there are LEDs listed for your ThinkPad that do not exist (and
+are not in the dock), or if you notice that there are missing LEDs,
+a report to ibm-acpi-devel@lists.sourceforge.net is appreciated.
+
+
+ACPI sounds -- /proc/acpi/ibm/beep
+----------------------------------
+
+The BEEP method is used internally by the ACPI firmware to provide
+audible alerts in various situations. This feature allows the same
+sounds to be triggered manually.
+
+The commands are non-negative integer numbers::
+
+ echo <number> >/proc/acpi/ibm/beep
+
+The valid <number> range is 0 to 17. Not all numbers trigger sounds
+and the sounds vary from model to model. Here is the behavior on the
+X40:
+
+ - 0 - stop a sound in progress (but use 17 to stop 16)
+ - 2 - two beeps, pause, third beep ("low battery")
+ - 3 - single beep
+ - 4 - high, followed by low-pitched beep ("unable")
+ - 5 - single beep
+ - 6 - very high, followed by high-pitched beep ("AC/DC")
+ - 7 - high-pitched beep
+ - 9 - three short beeps
+ - 10 - very long beep
+ - 12 - low-pitched beep
+ - 15 - three high-pitched beeps repeating constantly, stop with 0
+ - 16 - one medium-pitched beep repeating constantly, stop with 17
+ - 17 - stop 16
+
+
+Temperature sensors
+-------------------
+
+procfs: /proc/acpi/ibm/thermal
+
+sysfs device attributes: (hwmon "thinkpad") temp*_input
+
+Most ThinkPads include six or more separate temperature sensors but only
+expose the CPU temperature through the standard ACPI methods. This
+feature shows readings from up to eight different sensors on older
+ThinkPads, and up to sixteen different sensors on newer ThinkPads.
+
+For example, on the X40, a typical output may be:
+
+temperatures:
+ 42 42 45 41 36 -128 33 -128
+
+On the T43/p, a typical output may be:
+
+temperatures:
+ 48 48 36 52 38 -128 31 -128 48 52 48 -128 -128 -128 -128 -128
+
+The mapping of thermal sensors to physical locations varies depending on
+system-board model (and thus, on ThinkPad model).
+
+http://thinkwiki.org/wiki/Thermal_Sensors is a public wiki page that
+tries to track down these locations for various models.
+
+Most (newer?) models seem to follow this pattern:
+
+- 1: CPU
+- 2: (depends on model)
+- 3: (depends on model)
+- 4: GPU
+- 5: Main battery: main sensor
+- 6: Bay battery: main sensor
+- 7: Main battery: secondary sensor
+- 8: Bay battery: secondary sensor
+- 9-15: (depends on model)
+
+For the R51 (source: Thomas Gruber):
+
+- 2: Mini-PCI
+- 3: Internal HDD
+
+For the T43, T43/p (source: Shmidoax/Thinkwiki.org)
+http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_T43.2C_T43p
+
+- 2: System board, left side (near PCMCIA slot), reported as HDAPS temp
+- 3: PCMCIA slot
+- 9: MCH (northbridge) to DRAM Bus
+- 10: Clock-generator, mini-pci card and ICH (southbridge), under Mini-PCI
+ card, under touchpad
+- 11: Power regulator, underside of system board, below F2 key
+
+The A31 has a very atypical layout for the thermal sensors
+(source: Milos Popovic, http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_A31)
+
+- 1: CPU
+- 2: Main Battery: main sensor
+- 3: Power Converter
+- 4: Bay Battery: main sensor
+- 5: MCH (northbridge)
+- 6: PCMCIA/ambient
+- 7: Main Battery: secondary sensor
+- 8: Bay Battery: secondary sensor
+
+
+Procfs notes
+^^^^^^^^^^^^
+
+ Readings from sensors that are not available return -128.
+ No commands can be written to this file.
+
+Sysfs notes
+^^^^^^^^^^^
+
+ Sensors that are not available return the ENXIO error. This
+ status may change at runtime, as there are hotplug thermal
+ sensors, like those inside the batteries and docks.
+
+ thinkpad-acpi thermal sensors are reported through the hwmon
+ subsystem, and follow all of the hwmon guidelines at
+ Documentation/hwmon.
+
+EXPERIMENTAL: Embedded controller register dump
+-----------------------------------------------
+
+This feature is not included in the thinkpad driver anymore.
+Instead the EC can be accessed through /sys/kernel/debug/ec with
+a userspace tool which can be found here:
+ftp://ftp.suse.com/pub/people/trenn/sources/ec
+
+Use it to determine the register holding the fan
+speed on some models. To do that, do the following:
+
+ - make sure the battery is fully charged
+ - make sure the fan is running
+ - use above mentioned tool to read out the EC
+
+Often fan and temperature values vary between
+readings. Since temperatures don't change vary fast, you can take
+several quick dumps to eliminate them.
+
+You can use a similar method to figure out the meaning of other
+embedded controller registers - e.g. make sure nothing else changes
+except the charging or discharging battery to determine which
+registers contain the current battery capacity, etc. If you experiment
+with this, do send me your results (including some complete dumps with
+a description of the conditions when they were taken.)
+
+
+LCD brightness control
+----------------------
+
+procfs: /proc/acpi/ibm/brightness
+
+sysfs backlight device "thinkpad_screen"
+
+This feature allows software control of the LCD brightness on ThinkPad
+models which don't have a hardware brightness slider.
+
+It has some limitations: the LCD backlight cannot be actually turned
+on or off by this interface, it just controls the backlight brightness
+level.
+
+On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control
+has eight brightness levels, ranging from 0 to 7. Some of the levels
+may not be distinct. Later Lenovo models that implement the ACPI
+display backlight brightness control methods have 16 levels, ranging
+from 0 to 15.
+
+For IBM ThinkPads, there are two interfaces to the firmware for direct
+brightness control, EC and UCMS (or CMOS). To select which one should be
+used, use the brightness_mode module parameter: brightness_mode=1 selects
+EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
+mode with NVRAM backing (so that brightness changes are remembered across
+shutdown/reboot).
+
+The driver tries to select which interface to use from a table of
+defaults for each ThinkPad model. If it makes a wrong choice, please
+report this as a bug, so that we can fix it.
+
+Lenovo ThinkPads only support brightness_mode=2 (UCMS).
+
+When display backlight brightness controls are available through the
+standard ACPI interface, it is best to use it instead of this direct
+ThinkPad-specific interface. The driver will disable its native
+backlight brightness control interface if it detects that the standard
+ACPI interface is available in the ThinkPad.
+
+If you want to use the thinkpad-acpi backlight brightness control
+instead of the generic ACPI video backlight brightness control for some
+reason, you should use the acpi_backlight=vendor kernel parameter.
+
+The brightness_enable module parameter can be used to control whether
+the LCD brightness control feature will be enabled when available.
+brightness_enable=0 forces it to be disabled. brightness_enable=1
+forces it to be enabled when available, even if the standard ACPI
+interface is also available.
+
+Procfs notes
+^^^^^^^^^^^^
+
+The available commands are::
+
+ echo up >/proc/acpi/ibm/brightness
+ echo down >/proc/acpi/ibm/brightness
+ echo 'level <level>' >/proc/acpi/ibm/brightness
+
+Sysfs notes
+^^^^^^^^^^^
+
+The interface is implemented through the backlight sysfs class, which is
+poorly documented at this time.
+
+Locate the thinkpad_screen device under /sys/class/backlight, and inside
+it there will be the following attributes:
+
+ max_brightness:
+ Reads the maximum brightness the hardware can be set to.
+ The minimum is always zero.
+
+ actual_brightness:
+ Reads what brightness the screen is set to at this instant.
+
+ brightness:
+ Writes request the driver to change brightness to the
+ given value. Reads will tell you what brightness the
+ driver is trying to set the display to when "power" is set
+ to zero and the display has not been dimmed by a kernel
+ power management event.
+
+ power:
+ power management mode, where 0 is "display on", and 1 to 3
+ will dim the display backlight to brightness level 0
+ because thinkpad-acpi cannot really turn the backlight
+ off. Kernel power management events can temporarily
+ increase the current power management level, i.e. they can
+ dim the display.
+
+
+WARNING:
+
+ Whatever you do, do NOT ever call thinkpad-acpi backlight-level change
+ interface and the ACPI-based backlight level change interface
+ (available on newer BIOSes, and driven by the Linux ACPI video driver)
+ at the same time. The two will interact in bad ways, do funny things,
+ and maybe reduce the life of the backlight lamps by needlessly kicking
+ its level up and down at every change.
+
+
+Volume control (Console Audio control)
+--------------------------------------
+
+procfs: /proc/acpi/ibm/volume
+
+ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC"
+
+NOTE: by default, the volume control interface operates in read-only
+mode, as it is supposed to be used for on-screen-display purposes.
+The read/write mode can be enabled through the use of the
+"volume_control=1" module parameter.
+
+NOTE: distros are urged to not enable volume_control by default, this
+should be done by the local admin only. The ThinkPad UI is for the
+console audio control to be done through the volume keys only, and for
+the desktop environment to just provide on-screen-display feedback.
+Software volume control should be done only in the main AC97/HDA
+mixer.
+
+
+About the ThinkPad Console Audio control
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ThinkPads have a built-in amplifier and muting circuit that drives the
+console headphone and speakers. This circuit is after the main AC97
+or HDA mixer in the audio path, and under exclusive control of the
+firmware.
+
+ThinkPads have three special hotkeys to interact with the console
+audio control: volume up, volume down and mute.
+
+It is worth noting that the normal way the mute function works (on
+ThinkPads that do not have a "mute LED") is:
+
+1. Press mute to mute. It will *always* mute, you can press it as
+ many times as you want, and the sound will remain mute.
+
+2. Press either volume key to unmute the ThinkPad (it will _not_
+ change the volume, it will just unmute).
+
+This is a very superior design when compared to the cheap software-only
+mute-toggle solution found on normal consumer laptops: you can be
+absolutely sure the ThinkPad will not make noise if you press the mute
+button, no matter the previous state.
+
+The IBM ThinkPads, and the earlier Lenovo ThinkPads have variable-gain
+amplifiers driving the speakers and headphone output, and the firmware
+also handles volume control for the headphone and speakers on these
+ThinkPads without any help from the operating system (this volume
+control stage exists after the main AC97 or HDA mixer in the audio
+path).
+
+The newer Lenovo models only have firmware mute control, and depend on
+the main HDA mixer to do volume control (which is done by the operating
+system). In this case, the volume keys are filtered out for unmute
+key press (there are some firmware bugs in this area) and delivered as
+normal key presses to the operating system (thinkpad-acpi is not
+involved).
+
+
+The ThinkPad-ACPI volume control
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The preferred way to interact with the Console Audio control is the
+ALSA interface.
+
+The legacy procfs interface allows one to read the current state,
+and if volume control is enabled, accepts the following commands::
+
+ echo up >/proc/acpi/ibm/volume
+ echo down >/proc/acpi/ibm/volume
+ echo mute >/proc/acpi/ibm/volume
+ echo unmute >/proc/acpi/ibm/volume
+ echo 'level <level>' >/proc/acpi/ibm/volume
+
+The <level> number range is 0 to 14 although not all of them may be
+distinct. To unmute the volume after the mute command, use either the
+up or down command (the level command will not unmute the volume), or
+the unmute command.
+
+You can use the volume_capabilities parameter to tell the driver
+whether your thinkpad has volume control or mute-only control:
+volume_capabilities=1 for mixers with mute and volume control,
+volume_capabilities=2 for mixers with only mute control.
+
+If the driver misdetects the capabilities for your ThinkPad model,
+please report this to ibm-acpi-devel@lists.sourceforge.net, so that we
+can update the driver.
+
+There are two strategies for volume control. To select which one
+should be used, use the volume_mode module parameter: volume_mode=1
+selects EC mode, and volume_mode=3 selects EC mode with NVRAM backing
+(so that volume/mute changes are remembered across shutdown/reboot).
+
+The driver will operate in volume_mode=3 by default. If that does not
+work well on your ThinkPad model, please report this to
+ibm-acpi-devel@lists.sourceforge.net.
+
+The driver supports the standard ALSA module parameters. If the ALSA
+mixer is disabled, the driver will disable all volume functionality.
+
+
+Fan control and monitoring: fan speed, fan enable/disable
+---------------------------------------------------------
+
+procfs: /proc/acpi/ibm/fan
+
+sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1, pwm1_enable, fan2_input
+
+sysfs hwmon driver attributes: fan_watchdog
+
+NOTE NOTE NOTE:
+ fan control operations are disabled by default for
+ safety reasons. To enable them, the module parameter "fan_control=1"
+ must be given to thinkpad-acpi.
+
+This feature attempts to show the current fan speed, control mode and
+other fan data that might be available. The speed is read directly
+from the hardware registers of the embedded controller. This is known
+to work on later R, T, X and Z series ThinkPads but may show a bogus
+value on other models.
+
+Some Lenovo ThinkPads support a secondary fan. This fan cannot be
+controlled separately, it shares the main fan control.
+
+Fan levels
+^^^^^^^^^^
+
+Most ThinkPad fans work in "levels" at the firmware interface. Level 0
+stops the fan. The higher the level, the higher the fan speed, although
+adjacent levels often map to the same fan speed. 7 is the highest
+level, where the fan reaches the maximum recommended speed.
+
+Level "auto" means the EC changes the fan level according to some
+internal algorithm, usually based on readings from the thermal sensors.
+
+There is also a "full-speed" level, also known as "disengaged" level.
+In this level, the EC disables the speed-locked closed-loop fan control,
+and drives the fan as fast as it can go, which might exceed hardware
+limits, so use this level with caution.
+
+The fan usually ramps up or down slowly from one speed to another, and
+it is normal for the EC to take several seconds to react to fan
+commands. The full-speed level may take up to two minutes to ramp up to
+maximum speed, and in some ThinkPads, the tachometer readings go stale
+while the EC is transitioning to the full-speed level.
+
+WARNING WARNING WARNING: do not leave the fan disabled unless you are
+monitoring all of the temperature sensor readings and you are ready to
+enable it if necessary to avoid overheating.
+
+An enabled fan in level "auto" may stop spinning if the EC decides the
+ThinkPad is cool enough and doesn't need the extra airflow. This is
+normal, and the EC will spin the fan up if the various thermal readings
+rise too much.
+
+On the X40, this seems to depend on the CPU and HDD temperatures.
+Specifically, the fan is turned on when either the CPU temperature
+climbs to 56 degrees or the HDD temperature climbs to 46 degrees. The
+fan is turned off when the CPU temperature drops to 49 degrees and the
+HDD temperature drops to 41 degrees. These thresholds cannot
+currently be controlled.
+
+The ThinkPad's ACPI DSDT code will reprogram the fan on its own when
+certain conditions are met. It will override any fan programming done
+through thinkpad-acpi.
+
+The thinkpad-acpi kernel driver can be programmed to revert the fan
+level to a safe setting if userspace does not issue one of the procfs
+fan commands: "enable", "disable", "level" or "watchdog", or if there
+are no writes to pwm1_enable (or to pwm1 *if and only if* pwm1_enable is
+set to 1, manual mode) within a configurable amount of time of up to
+120 seconds. This functionality is called fan safety watchdog.
+
+Note that the watchdog timer stops after it enables the fan. It will be
+rearmed again automatically (using the same interval) when one of the
+above mentioned fan commands is received. The fan watchdog is,
+therefore, not suitable to protect against fan mode changes made through
+means other than the "enable", "disable", and "level" procfs fan
+commands, or the hwmon fan control sysfs interface.
+
+Procfs notes
+^^^^^^^^^^^^
+
+The fan may be enabled or disabled with the following commands::
+
+ echo enable >/proc/acpi/ibm/fan
+ echo disable >/proc/acpi/ibm/fan
+
+Placing a fan on level 0 is the same as disabling it. Enabling a fan
+will try to place it in a safe level if it is too slow or disabled.
+
+The fan level can be controlled with the command::
+
+ echo 'level <level>' > /proc/acpi/ibm/fan
+
+Where <level> is an integer from 0 to 7, or one of the words "auto" or
+"full-speed" (without the quotes). Not all ThinkPads support the "auto"
+and "full-speed" levels. The driver accepts "disengaged" as an alias for
+"full-speed", and reports it as "disengaged" for backwards
+compatibility.
+
+On the X31 and X40 (and ONLY on those models), the fan speed can be
+controlled to a certain degree. Once the fan is running, it can be
+forced to run faster or slower with the following command::
+
+ echo 'speed <speed>' > /proc/acpi/ibm/fan
+
+The sustainable range of fan speeds on the X40 appears to be from about
+3700 to about 7350. Values outside this range either do not have any
+effect or the fan speed eventually settles somewhere in that range. The
+fan cannot be stopped or started with this command. This functionality
+is incomplete, and not available through the sysfs interface.
+
+To program the safety watchdog, use the "watchdog" command::
+
+ echo 'watchdog <interval in seconds>' > /proc/acpi/ibm/fan
+
+If you want to disable the watchdog, use 0 as the interval.
+
+Sysfs notes
+^^^^^^^^^^^
+
+The sysfs interface follows the hwmon subsystem guidelines for the most
+part, and the exception is the fan safety watchdog.
+
+Writes to any of the sysfs attributes may return the EINVAL error if
+that operation is not supported in a given ThinkPad or if the parameter
+is out-of-bounds, and EPERM if it is forbidden. They may also return
+EINTR (interrupted system call), and EIO (I/O error while trying to talk
+to the firmware).
+
+Features not yet implemented by the driver return ENOSYS.
+
+hwmon device attribute pwm1_enable:
+ - 0: PWM offline (fan is set to full-speed mode)
+ - 1: Manual PWM control (use pwm1 to set fan level)
+ - 2: Hardware PWM control (EC "auto" mode)
+ - 3: reserved (Software PWM control, not implemented yet)
+
+ Modes 0 and 2 are not supported by all ThinkPads, and the
+ driver is not always able to detect this. If it does know a
+ mode is unsupported, it will return -EINVAL.
+
+hwmon device attribute pwm1:
+ Fan level, scaled from the firmware values of 0-7 to the hwmon
+ scale of 0-255. 0 means fan stopped, 255 means highest normal
+ speed (level 7).
+
+ This attribute only commands the fan if pmw1_enable is set to 1
+ (manual PWM control).
+
+hwmon device attribute fan1_input:
+ Fan tachometer reading, in RPM. May go stale on certain
+ ThinkPads while the EC transitions the PWM to offline mode,
+ which can take up to two minutes. May return rubbish on older
+ ThinkPads.
+
+hwmon device attribute fan2_input:
+ Fan tachometer reading, in RPM, for the secondary fan.
+ Available only on some ThinkPads. If the secondary fan is
+ not installed, will always read 0.
+
+hwmon driver attribute fan_watchdog:
+ Fan safety watchdog timer interval, in seconds. Minimum is
+ 1 second, maximum is 120 seconds. 0 disables the watchdog.
+
+To stop the fan: set pwm1 to zero, and pwm1_enable to 1.
+
+To start the fan in a safe mode: set pwm1_enable to 2. If that fails
+with EINVAL, try to set pwm1_enable to 1 and pwm1 to at least 128 (255
+would be the safest choice, though).
+
+
+WAN
+---
+
+procfs: /proc/acpi/ibm/wan
+
+sysfs device attribute: wwan_enable (deprecated)
+
+sysfs rfkill class: switch "tpacpi_wwan_sw"
+
+This feature shows the presence and current state of the built-in
+Wireless WAN device.
+
+If the ThinkPad supports it, the WWAN state is stored in NVRAM,
+so it is kept across reboots and power-off.
+
+It was tested on a Lenovo ThinkPad X60. It should probably work on other
+ThinkPad models which come with this module installed.
+
+Procfs notes
+^^^^^^^^^^^^
+
+If the W-WAN card is installed, the following commands can be used::
+
+ echo enable > /proc/acpi/ibm/wan
+ echo disable > /proc/acpi/ibm/wan
+
+Sysfs notes
+^^^^^^^^^^^
+
+ If the W-WAN card is installed, it can be enabled /
+ disabled through the "wwan_enable" thinkpad-acpi device
+ attribute, and its current status can also be queried.
+
+ enable:
+ - 0: disables WWAN card / WWAN card is disabled
+ - 1: enables WWAN card / WWAN card is enabled.
+
+ Note: this interface has been superseded by the generic rfkill
+ class. It has been deprecated, and it will be removed in year
+ 2010.
+
+ rfkill controller switch "tpacpi_wwan_sw": refer to
+ Documentation/driver-api/rfkill.rst for details.
+
+
+EXPERIMENTAL: UWB
+-----------------
+
+This feature is considered EXPERIMENTAL because it has not been extensively
+tested and validated in various ThinkPad models yet. The feature may not
+work as expected. USE WITH CAUTION! To use this feature, you need to supply
+the experimental=1 parameter when loading the module.
+
+sysfs rfkill class: switch "tpacpi_uwb_sw"
+
+This feature exports an rfkill controller for the UWB device, if one is
+present and enabled in the BIOS.
+
+Sysfs notes
+^^^^^^^^^^^
+
+ rfkill controller switch "tpacpi_uwb_sw": refer to
+ Documentation/driver-api/rfkill.rst for details.
+
+Adaptive keyboard
+-----------------
+
+sysfs device attribute: adaptive_kbd_mode
+
+This sysfs attribute controls the keyboard "face" that will be shown on the
+Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read
+and set.
+
+- 1 = Home mode
+- 2 = Web-browser mode
+- 3 = Web-conference mode
+- 4 = Function mode
+- 5 = Layflat mode
+
+For more details about which buttons will appear depending on the mode, please
+review the laptop's user guide:
+http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf
+
+Multiple Commands, Module Parameters
+------------------------------------
+
+Multiple commands can be written to the proc files in one shot by
+separating them with commas, for example::
+
+ echo enable,0xffff > /proc/acpi/ibm/hotkey
+ echo lcd_disable,crt_enable > /proc/acpi/ibm/video
+
+Commands can also be specified when loading the thinkpad-acpi module,
+for example::
+
+ modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable
+
+
+Enabling debugging output
+-------------------------
+
+The module takes a debug parameter which can be used to selectively
+enable various classes of debugging output, for example::
+
+ modprobe thinkpad_acpi debug=0xffff
+
+will enable all debugging output classes. It takes a bitmask, so
+to enable more than one output class, just add their values.
+
+ ============= ======================================
+ Debug bitmask Description
+ ============= ======================================
+ 0x8000 Disclose PID of userspace programs
+ accessing some functions of the driver
+ 0x0001 Initialization and probing
+ 0x0002 Removal
+ 0x0004 RF Transmitter control (RFKILL)
+ (bluetooth, WWAN, UWB...)
+ 0x0008 HKEY event interface, hotkeys
+ 0x0010 Fan control
+ 0x0020 Backlight brightness
+ 0x0040 Audio mixer/volume control
+ ============= ======================================
+
+There is also a kernel build option to enable more debugging
+information, which may be necessary to debug driver problems.
+
+The level of debugging information output by the driver can be changed
+at runtime through sysfs, using the driver attribute debug_level. The
+attribute takes the same bitmask as the debug module parameter above.
+
+
+Force loading of module
+-----------------------
+
+If thinkpad-acpi refuses to detect your ThinkPad, you can try to specify
+the module parameter force_load=1. Regardless of whether this works or
+not, please contact ibm-acpi-devel@lists.sourceforge.net with a report.
+
+
+Sysfs interface changelog
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+========= ===============================================================
+0x000100: Initial sysfs support, as a single platform driver and
+ device.
+0x000200: Hot key support for 32 hot keys, and radio slider switch
+ support.
+0x010000: Hot keys are now handled by default over the input
+ layer, the radio switch generates input event EV_RADIO,
+ and the driver enables hot key handling by default in
+ the firmware.
+
+0x020000: ABI fix: added a separate hwmon platform device and
+ driver, which must be located by name (thinkpad)
+ and the hwmon class for libsensors4 (lm-sensors 3)
+ compatibility. Moved all hwmon attributes to this
+ new platform device.
+
+0x020100: Marker for thinkpad-acpi with hot key NVRAM polling
+ support. If you must, use it to know you should not
+ start a userspace NVRAM poller (allows to detect when
+ NVRAM is compiled out by the user because it is
+ unneeded/undesired in the first place).
+0x020101: Marker for thinkpad-acpi with hot key NVRAM polling
+ and proper hotkey_mask semantics (version 8 of the
+ NVRAM polling patch). Some development snapshots of
+ 0.18 had an earlier version that did strange things
+ to hotkey_mask.
+
+0x020200: Add poll()/select() support to the following attributes:
+ hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
+
+0x020300: hotkey enable/disable support removed, attributes
+ hotkey_bios_enabled and hotkey_enable deprecated and
+ marked for removal.
+
+0x020400: Marker for 16 LEDs support. Also, LEDs that are known
+ to not exist in a given model are not registered with
+ the LED sysfs class anymore.
+
+0x020500: Updated hotkey driver, hotkey_mask is always available
+ and it is always able to disable hot keys. Very old
+ thinkpads are properly supported. hotkey_bios_mask
+ is deprecated and marked for removal.
+
+0x020600: Marker for backlight change event support.
+
+0x020700: Support for mute-only mixers.
+ Volume control in read-only mode by default.
+ Marker for ALSA mixer support.
+
+0x030000: Thermal and fan sysfs attributes were moved to the hwmon
+ device instead of being attached to the backing platform
+ device.
+========= ===============================================================
diff --git a/Documentation/admin-guide/laptops/toshiba_haps.rst b/Documentation/admin-guide/laptops/toshiba_haps.rst
new file mode 100644
index 000000000000..d28b6c3f2849
--- /dev/null
+++ b/Documentation/admin-guide/laptops/toshiba_haps.rst
@@ -0,0 +1,87 @@
+====================================
+Toshiba HDD Active Protection Sensor
+====================================
+
+Kernel driver: toshiba_haps
+
+Author: Azael Avalos <coproscefalo@gmail.com>
+
+
+.. 0. Contents
+
+ 1. Description
+ 2. Interface
+ 3. Accelerometer axes
+ 4. Supported devices
+ 5. Usage
+
+
+1. Description
+--------------
+
+This driver provides support for the accelerometer found in various Toshiba
+laptops, being called "Toshiba HDD Protection - Shock Sensor" officially,
+and detects laptops automatically with this device.
+On Windows, Toshiba provided software monitors this device and provides
+automatic HDD protection (head unload) on sudden moves or harsh vibrations,
+however, this driver only provides a notification via a sysfs file to let
+userspace tools or daemons act accordingly, as well as providing a sysfs
+file to set the desired protection level or sensor sensibility.
+
+
+2. Interface
+------------
+
+This device comes with 3 methods:
+
+==== =====================================================================
+_STA Checks existence of the device, returning Zero if the device does not
+ exists or is not supported.
+PTLV Sets the desired protection level.
+RSSS Shuts down the HDD protection interface for a few seconds,
+ then restores normal operation.
+==== =====================================================================
+
+Note:
+ The presence of Solid State Drives (SSD) can make this driver to fail loading,
+ given the fact that such drives have no movable parts, and thus, not requiring
+ any "protection" as well as failing during the evaluation of the _STA method
+ found under this device.
+
+
+3. Accelerometer axes
+---------------------
+
+This device does not report any axes, however, to query the sensor position
+a couple HCI (Hardware Configuration Interface) calls (0x6D and 0xA6) are
+provided to query such information, handled by the kernel module toshiba_acpi
+since kernel version 3.15.
+
+
+4. Supported devices
+--------------------
+
+This driver binds itself to the ACPI device TOS620A, and any Toshiba laptop
+with this device is supported, given the fact that they have the presence of
+conventional HDD and not only SSD, or a combination of both HDD and SSD.
+
+
+5. Usage
+--------
+
+The sysfs files under /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS620A:00/ are:
+
+================ ============================================================
+protection_level The protection_level is readable and writeable, and
+ provides a way to let userspace query the current protection
+ level, as well as set the desired protection level, the
+ available protection levels are::
+
+ ============ ======= ========== ========
+ 0 - Disabled 1 - Low 2 - Medium 3 - High
+ ============ ======= ========== ========
+
+reset_protection The reset_protection entry is writeable only, being "1"
+ the only parameter it accepts, it is used to trigger
+ a reset of the protection interface.
+================ ============================================================
diff --git a/Documentation/admin-guide/lcd-panel-cgram.rst b/Documentation/admin-guide/lcd-panel-cgram.rst
new file mode 100644
index 000000000000..a3eb00c62f53
--- /dev/null
+++ b/Documentation/admin-guide/lcd-panel-cgram.rst
@@ -0,0 +1,27 @@
+======================================
+Parallel port LCD/Keypad Panel support
+======================================
+
+Some LCDs allow you to define up to 8 characters, mapped to ASCII
+characters 0 to 7. The escape code to define a new character is
+'\e[LG' followed by one digit from 0 to 7, representing the character
+number, and up to 8 couples of hex digits terminated by a semi-colon
+(';'). Each couple of digits represents a line, with 1-bits for each
+illuminated pixel with LSB on the right. Lines are numbered from the
+top of the character to the bottom. On a 5x7 matrix, only the 5 lower
+bits of the 7 first bytes are used for each character. If the string
+is incomplete, only complete lines will be redefined. Here are some
+examples::
+
+ printf "\e[LG0010101050D1F0C04;" => 0 = [enter]
+ printf "\e[LG1040E1F0000000000;" => 1 = [up]
+ printf "\e[LG2000000001F0E0400;" => 2 = [down]
+ printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down]
+ printf "\e[LG40002060E1E0E0602;" => 4 = [left]
+ printf "\e[LG500080C0E0F0E0C08;" => 5 = [right]
+ printf "\e[LG60016051516141400;" => 6 = "IP"
+
+ printf "\e[LG00103071F1F070301;" => big speaker
+ printf "\e[LG00002061E1E060200;" => small speaker
+
+Willy
diff --git a/Documentation/ldm.txt b/Documentation/admin-guide/ldm.rst
index 12c571368e73..12c571368e73 100644
--- a/Documentation/ldm.txt
+++ b/Documentation/admin-guide/ldm.rst
diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/admin-guide/lockup-watchdogs.rst
index 290840c160af..290840c160af 100644
--- a/Documentation/lockup-watchdogs.txt
+++ b/Documentation/admin-guide/lockup-watchdogs.rst
diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst
new file mode 100644
index 000000000000..4e06ffabd78a
--- /dev/null
+++ b/Documentation/admin-guide/mm/cma_debugfs.rst
@@ -0,0 +1,25 @@
+=====================
+CMA Debugfs Interface
+=====================
+
+The CMA debugfs interface is useful to retrieve basic information out of the
+different CMA areas and to test allocation/release in each of the areas.
+
+Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
+kernel's CMA index. So the first CMA zone would be:
+
+ <debugfs>/cma/cma-0
+
+The structure of the files created under that directory is as follows:
+
+ - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
+ - [RO] count: Amount of memory in the CMA area.
+ - [RO] order_per_bit: Order of pages represented by one bit.
+ - [RO] bitmap: The bitmap of page states in the zone.
+ - [WO] alloc: Allocate N pages from that CMA area. For example::
+
+ echo 5 > <debugfs>/cma/cma-2/alloc
+
+would try to allocate 5 pages from the cma-2 area.
+
+ - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index ddf8d8d33377..11db46448354 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -11,7 +11,7 @@ processes address space and many other cool things.
Linux memory management is a complex system with many configurable
settings. Most of these settings are available via ``/proc``
filesystem and can be quired and adjusted using ``sysctl``. These APIs
-are described in Documentation/sysctl/vm.txt and in `man 5 proc`_.
+are described in Documentation/admin-guide/sysctl/vm.rst and in `man 5 proc`_.
.. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html
@@ -26,6 +26,7 @@ the Linux memory management.
:maxdepth: 1
concepts
+ cma_debugfs
hugetlbpage
idle_page_tracking
ksm
diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst
index 9303786632d1..874eb0c77d34 100644
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -59,7 +59,7 @@ MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
If a region of memory must be split into at least one new MADV_MERGEABLE
or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
-will exceed ``vm.max_map_count`` (see Documentation/sysctl/vm.txt).
+will exceed ``vm.max_map_count`` (see Documentation/admin-guide/sysctl/vm.rst).
Like other madvise calls, they are intended for use on mapped areas of
the user address space: they will report ENOMEM if the specified range
diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst
index d78c5b315f72..8463f5538fda 100644
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -15,7 +15,7 @@ document attempts to describe the concepts and APIs of the 2.6 memory policy
support.
Memory policies should not be confused with cpusets
-(``Documentation/cgroup-v1/cpusets.txt``)
+(``Documentation/admin-guide/cgroup-v1/cpusets.rst``)
which is an administrative mechanism for restricting the nodes from which
memory may be allocated by a set of processes. Memory policies are a
programming interface that a NUMA-aware application can take advantage of. When
diff --git a/Documentation/admin-guide/namespaces/compatibility-list.rst b/Documentation/admin-guide/namespaces/compatibility-list.rst
new file mode 100644
index 000000000000..318800b2a943
--- /dev/null
+++ b/Documentation/admin-guide/namespaces/compatibility-list.rst
@@ -0,0 +1,43 @@
+=============================
+Namespaces compatibility list
+=============================
+
+This document contains the information about the problems user
+may have when creating tasks living in different namespaces.
+
+Here's the summary. This matrix shows the known problems, that
+occur when tasks share some namespace (the columns) while living
+in different other namespaces (the rows):
+
+==== === === === === ==== ===
+- UTS IPC VFS PID User Net
+==== === === === === ==== ===
+UTS X
+IPC X 1
+VFS X
+PID 1 1 X
+User 2 2 X
+Net X
+==== === === === === ==== ===
+
+1. Both the IPC and the PID namespaces provide IDs to address
+ object inside the kernel. E.g. semaphore with IPCID or
+ process group with pid.
+
+ In both cases, tasks shouldn't try exposing this ID to some
+ other task living in a different namespace via a shared filesystem
+ or IPC shmem/message. The fact is that this ID is only valid
+ within the namespace it was obtained in and may refer to some
+ other object in another namespace.
+
+2. Intentionally, two equal user IDs in different user namespaces
+ should not be equal from the VFS point of view. In other
+ words, user 10 in one user namespace shouldn't have the same
+ access permissions to files, belonging to user 10 in another
+ namespace.
+
+ The same is true for the IPC namespaces being shared - two users
+ from different user namespaces should not access the same IPC objects
+ even having equal UIDs.
+
+ But currently this is not so.
diff --git a/Documentation/admin-guide/namespaces/index.rst b/Documentation/admin-guide/namespaces/index.rst
new file mode 100644
index 000000000000..384f2e0f33d2
--- /dev/null
+++ b/Documentation/admin-guide/namespaces/index.rst
@@ -0,0 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+Namespaces
+==========
+
+.. toctree::
+ :maxdepth: 1
+
+ compatibility-list
+ resource-control
diff --git a/Documentation/admin-guide/namespaces/resource-control.rst b/Documentation/admin-guide/namespaces/resource-control.rst
new file mode 100644
index 000000000000..369556e00f0c
--- /dev/null
+++ b/Documentation/admin-guide/namespaces/resource-control.rst
@@ -0,0 +1,18 @@
+===========================
+Namespaces research control
+===========================
+
+There are a lot of kinds of objects in the kernel that don't have
+individual limits or that have limits that are ineffective when a set
+of processes is allowed to switch user ids. With user namespaces
+enabled in a kernel for people who don't trust their users or their
+users programs to play nice this problems becomes more acute.
+
+Therefore it is recommended that memory control groups be enabled in
+kernels that enable user namespaces, and it is further recommended
+that userspace configure memory control groups to limit how much
+memory user's they don't trust to play nice can use.
+
+Memory control groups can be configured by installing the libcgroup
+package present on most distros editing /etc/cgrules.conf,
+/etc/cgconfig.conf and setting up libpam-cgroup.
diff --git a/Documentation/numastat.txt b/Documentation/admin-guide/numastat.rst
index aaf1667489f8..aaf1667489f8 100644
--- a/Documentation/numastat.txt
+++ b/Documentation/admin-guide/numastat.rst
diff --git a/Documentation/admin-guide/perf/arm-ccn.rst b/Documentation/admin-guide/perf/arm-ccn.rst
new file mode 100644
index 000000000000..832b0c64023a
--- /dev/null
+++ b/Documentation/admin-guide/perf/arm-ccn.rst
@@ -0,0 +1,61 @@
+==========================
+ARM Cache Coherent Network
+==========================
+
+CCN-504 is a ring-bus interconnect consisting of 11 crosspoints
+(XPs), with each crosspoint supporting up to two device ports,
+so nodes (devices) 0 and 1 are connected to crosspoint 0,
+nodes 2 and 3 to crosspoint 1 etc.
+
+PMU (perf) driver
+-----------------
+
+The CCN driver registers a perf PMU driver, which provides
+description of available events and configuration options
+in sysfs, see /sys/bus/event_source/devices/ccn*.
+
+The "format" directory describes format of the config, config1
+and config2 fields of the perf_event_attr structure. The "events"
+directory provides configuration templates for all documented
+events, that can be used with perf tool. For example "xp_valid_flit"
+is an equivalent of "type=0x8,event=0x4". Other parameters must be
+explicitly specified.
+
+For events originating from device, "node" defines its index.
+
+Crosspoint PMU events require "xp" (index), "bus" (bus number)
+and "vc" (virtual channel ID).
+
+Crosspoint watchpoint-based events (special "event" value 0xfe)
+require "xp" and "vc" as as above plus "port" (device port index),
+"dir" (transmit/receive direction), comparator values ("cmp_l"
+and "cmp_h") and "mask", being index of the comparator mask.
+
+Masks are defined separately from the event description
+(due to limited number of the config values) in the "cmp_mask"
+directory, with first 8 configurable by user and additional
+4 hardcoded for the most frequent use cases.
+
+Cycle counter is described by a "type" value 0xff and does
+not require any other settings.
+
+The driver also provides a "cpumask" sysfs attribute, which contains
+a single CPU ID, of the processor which will be used to handle all
+the CCN PMU events. It is recommended that the user space tools
+request the events on this processor (if not, the perf_event->cpu value
+will be overwritten anyway). In case of this processor being offlined,
+the events are migrated to another one and the attribute is updated.
+
+Example of perf tool use::
+
+ / # perf list | grep ccn
+ ccn/cycles/ [Kernel PMU event]
+ <...>
+ ccn/xp_valid_flit,xp=?,port=?,vc=?,dir=?/ [Kernel PMU event]
+ <...>
+
+ / # perf stat -a -e ccn/cycles/,ccn/xp_valid_flit,xp=1,port=0,vc=1,dir=1/ \
+ sleep 1
+
+The driver does not support sampling, therefore "perf record" will
+not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/admin-guide/perf/arm_dsu_pmu.rst b/Documentation/admin-guide/perf/arm_dsu_pmu.rst
new file mode 100644
index 000000000000..7fd34db75d13
--- /dev/null
+++ b/Documentation/admin-guide/perf/arm_dsu_pmu.rst
@@ -0,0 +1,29 @@
+==================================
+ARM DynamIQ Shared Unit (DSU) PMU
+==================================
+
+ARM DynamIQ Shared Unit integrates one or more cores with an L3 memory system,
+control logic and external interfaces to form a multicore cluster. The PMU
+allows counting the various events related to the L3 cache, Snoop Control Unit
+etc, using 32bit independent counters. It also provides a 64bit cycle counter.
+
+The PMU can only be accessed via CPU system registers and are common to the
+cores connected to the same DSU. Like most of the other uncore PMUs, DSU
+PMU doesn't support process specific events and cannot be used in sampling mode.
+
+The DSU provides a bitmap for a subset of implemented events via hardware
+registers. There is no way for the driver to determine if the other events
+are available or not. Hence the driver exposes only those events advertised
+by the DSU, in "events" directory under::
+
+ /sys/bus/event_sources/devices/arm_dsu_<N>/
+
+The user should refer to the TRM of the product to figure out the supported events
+and use the raw event code for the unlisted events.
+
+The driver also exposes the CPUs connected to the DSU instance in "associated_cpus".
+
+
+e.g usage::
+
+ perf stat -a -e arm_dsu_0/cycles/
diff --git a/Documentation/admin-guide/perf/hisi-pmu.rst b/Documentation/admin-guide/perf/hisi-pmu.rst
new file mode 100644
index 000000000000..404a5c3d9d00
--- /dev/null
+++ b/Documentation/admin-guide/perf/hisi-pmu.rst
@@ -0,0 +1,60 @@
+======================================================
+HiSilicon SoC uncore Performance Monitoring Unit (PMU)
+======================================================
+
+The HiSilicon SoC chip includes various independent system device PMUs
+such as L3 cache (L3C), Hydra Home Agent (HHA) and DDRC. These PMUs are
+independent and have hardware logic to gather statistics and performance
+information.
+
+The HiSilicon SoC encapsulates multiple CPU and IO dies. Each CPU cluster
+(CCL) is made up of 4 cpu cores sharing one L3 cache; each CPU die is
+called Super CPU cluster (SCCL) and is made up of 6 CCLs. Each SCCL has
+two HHAs (0 - 1) and four DDRCs (0 - 3), respectively.
+
+HiSilicon SoC uncore PMU driver
+-------------------------------
+
+Each device PMU has separate registers for event counting, control and
+interrupt, and the PMU driver shall register perf PMU drivers like L3C,
+HHA and DDRC etc. The available events and configuration options shall
+be described in the sysfs, see:
+
+/sys/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>/, or
+/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>.
+The "perf list" command shall list the available events from sysfs.
+
+Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU
+name will appear in event listing as hisi_sccl<sccl-id>_module<index-id>.
+where "sccl-id" is the identifier of the SCCL and "index-id" is the index of
+module.
+
+e.g. hisi_sccl3_l3c0/rd_hit_cpipe is READ_HIT_CPIPE event of L3C index #0 in
+SCCL ID #3.
+
+e.g. hisi_sccl1_hha0/rx_operations is RX_OPERATIONS event of HHA index #0 in
+SCCL ID #1.
+
+The driver also provides a "cpumask" sysfs attribute, which shows the CPU core
+ID used to count the uncore PMU event.
+
+Example usage of perf::
+
+ $# perf list
+ hisi_sccl3_l3c0/rd_hit_cpipe/ [kernel PMU event]
+ ------------------------------------------
+ hisi_sccl3_l3c0/wr_hit_cpipe/ [kernel PMU event]
+ ------------------------------------------
+ hisi_sccl1_l3c0/rd_hit_cpipe/ [kernel PMU event]
+ ------------------------------------------
+ hisi_sccl1_l3c0/wr_hit_cpipe/ [kernel PMU event]
+ ------------------------------------------
+
+ $# perf stat -a -e hisi_sccl3_l3c0/rd_hit_cpipe/ sleep 5
+ $# perf stat -a -e hisi_sccl3_l3c0/config=0x02/ sleep 5
+
+The current driver does not support sampling. So "perf record" is unsupported.
+Also attach to a task is unsupported as the events are all uncore.
+
+Note: Please contact the maintainer for a complete list of events supported for
+the PMU devices in the SoC and its information if needed.
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
new file mode 100644
index 000000000000..ee4bfd2a740f
--- /dev/null
+++ b/Documentation/admin-guide/perf/index.rst
@@ -0,0 +1,16 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+Performance monitor support
+===========================
+
+.. toctree::
+ :maxdepth: 1
+
+ hisi-pmu
+ qcom_l2_pmu
+ qcom_l3_pmu
+ arm-ccn
+ xgene-pmu
+ arm_dsu_pmu
+ thunderx2-pmu
diff --git a/Documentation/admin-guide/perf/qcom_l2_pmu.rst b/Documentation/admin-guide/perf/qcom_l2_pmu.rst
new file mode 100644
index 000000000000..c130178a4a55
--- /dev/null
+++ b/Documentation/admin-guide/perf/qcom_l2_pmu.rst
@@ -0,0 +1,39 @@
+=====================================================================
+Qualcomm Technologies Level-2 Cache Performance Monitoring Unit (PMU)
+=====================================================================
+
+This driver supports the L2 cache clusters found in Qualcomm Technologies
+Centriq SoCs. There are multiple physical L2 cache clusters, each with their
+own PMU. Each cluster has one or more CPUs associated with it.
+
+There is one logical L2 PMU exposed, which aggregates the results from
+the physical PMUs.
+
+The driver provides a description of its available events and configuration
+options in sysfs, see /sys/devices/l2cache_0.
+
+The "format" directory describes the format of the events.
+
+Events can be envisioned as a 2-dimensional array. Each column represents
+a group of events. There are 8 groups. Only one entry from each
+group can be in use at a time. If multiple events from the same group
+are specified, the conflicting events cannot be counted at the same time.
+
+Events are specified as 0xCCG, where CC is 2 hex digits specifying
+the code (array row) and G specifies the group (column) 0-7.
+
+In addition there is a cycle counter event specified by the value 0xFE
+which is outside the above scheme.
+
+The driver provides a "cpumask" sysfs attribute which contains a mask
+consisting of one CPU per cluster which will be used to handle all the PMU
+events on that cluster.
+
+Examples for use with perf::
+
+ perf stat -e l2cache_0/config=0x001/,l2cache_0/config=0x042/ -a sleep 1
+
+ perf stat -e l2cache_0/config=0xfe/ -C 2 sleep 1
+
+The driver does not support sampling, therefore "perf record" will
+not work. Per-task perf sessions are not supported.
diff --git a/Documentation/admin-guide/perf/qcom_l3_pmu.rst b/Documentation/admin-guide/perf/qcom_l3_pmu.rst
new file mode 100644
index 000000000000..a3d014a46bfd
--- /dev/null
+++ b/Documentation/admin-guide/perf/qcom_l3_pmu.rst
@@ -0,0 +1,26 @@
+===========================================================================
+Qualcomm Datacenter Technologies L3 Cache Performance Monitoring Unit (PMU)
+===========================================================================
+
+This driver supports the L3 cache PMUs found in Qualcomm Datacenter Technologies
+Centriq SoCs. The L3 cache on these SOCs is composed of multiple slices, shared
+by all cores within a socket. Each slice is exposed as a separate uncore perf
+PMU with device name l3cache_<socket>_<instance>. User space is responsible
+for aggregating across slices.
+
+The driver provides a description of its available events and configuration
+options in sysfs, see /sys/devices/l3cache*. Given that these are uncore PMUs
+the driver also exposes a "cpumask" sysfs attribute which contains a mask
+consisting of one CPU per socket which will be used to handle all the PMU
+events on that socket.
+
+The hardware implements 32bit event counters and has a flat 8bit event space
+exposed via the "event" format attribute. In addition to the 32bit physical
+counters the driver supports virtual 64bit hardware counters by using hardware
+counter chaining. This feature is exposed via the "lc" (long counter) format
+flag. E.g.::
+
+ perf stat -e l3cache_0_0/read-miss,lc/
+
+Given that these are uncore PMUs the driver does not support sampling, therefore
+"perf record" will not work. Per-task perf sessions are not supported.
diff --git a/Documentation/admin-guide/perf/thunderx2-pmu.rst b/Documentation/admin-guide/perf/thunderx2-pmu.rst
new file mode 100644
index 000000000000..08e33675853a
--- /dev/null
+++ b/Documentation/admin-guide/perf/thunderx2-pmu.rst
@@ -0,0 +1,42 @@
+=============================================================
+Cavium ThunderX2 SoC Performance Monitoring Unit (PMU UNCORE)
+=============================================================
+
+The ThunderX2 SoC PMU consists of independent, system-wide, per-socket
+PMUs such as the Level 3 Cache (L3C) and DDR4 Memory Controller (DMC).
+
+The DMC has 8 interleaved channels and the L3C has 16 interleaved tiles.
+Events are counted for the default channel (i.e. channel 0) and prorated
+to the total number of channels/tiles.
+
+The DMC and L3C support up to 4 counters. Counters are independently
+programmable and can be started and stopped individually. Each counter
+can be set to a different event. Counters are 32-bit and do not support
+an overflow interrupt; they are read every 2 seconds.
+
+PMU UNCORE (perf) driver:
+
+The thunderx2_pmu driver registers per-socket perf PMUs for the DMC and
+L3C devices. Each PMU can be used to count up to 4 events
+simultaneously. The PMUs provide a description of their available events
+and configuration options under sysfs, see
+/sys/devices/uncore_<l3c_S/dmc_S/>; S is the socket id.
+
+The driver does not support sampling, therefore "perf record" will not
+work. Per-task perf sessions are also not supported.
+
+Examples::
+
+ # perf stat -a -e uncore_dmc_0/cnt_cycles/ sleep 1
+
+ # perf stat -a -e \
+ uncore_dmc_0/cnt_cycles/,\
+ uncore_dmc_0/data_transfers/,\
+ uncore_dmc_0/read_txns/,\
+ uncore_dmc_0/write_txns/ sleep 1
+
+ # perf stat -a -e \
+ uncore_l3c_0/read_request/,\
+ uncore_l3c_0/read_hit/,\
+ uncore_l3c_0/inv_request/,\
+ uncore_l3c_0/inv_hit/ sleep 1
diff --git a/Documentation/admin-guide/perf/xgene-pmu.rst b/Documentation/admin-guide/perf/xgene-pmu.rst
new file mode 100644
index 000000000000..644f8ed89152
--- /dev/null
+++ b/Documentation/admin-guide/perf/xgene-pmu.rst
@@ -0,0 +1,49 @@
+================================================
+APM X-Gene SoC Performance Monitoring Unit (PMU)
+================================================
+
+X-Gene SoC PMU consists of various independent system device PMUs such as
+L3 cache(s), I/O bridge(s), memory controller bridge(s) and memory
+controller(s). These PMU devices are loosely architected to follow the
+same model as the PMU for ARM cores. The PMUs share the same top level
+interrupt and status CSR region.
+
+PMU (perf) driver
+-----------------
+
+The xgene-pmu driver registers several perf PMU drivers. Each of the perf
+driver provides description of its available events and configuration options
+in sysfs, see /sys/devices/<l3cX/iobX/mcbX/mcX>/.
+
+The "format" directory describes format of the config (event ID),
+config1 (agent ID) fields of the perf_event_attr structure. The "events"
+directory provides configuration templates for all supported event types that
+can be used with perf tool. For example, "l3c0/bank-fifo-full/" is an
+equivalent of "l3c0/config=0x0b/".
+
+Most of the SoC PMU has a specific list of agent ID used for monitoring
+performance of a specific datapath. For example, agents of a L3 cache can be
+a specific CPU or an I/O bridge. Each PMU has a set of 2 registers capable of
+masking the agents from which the request come from. If the bit with
+the bit number corresponding to the agent is set, the event is counted only if
+it is caused by a request from that agent. Each agent ID bit is inversely mapped
+to a corresponding bit in "config1" field. By default, the event will be
+counted for all agent requests (config1 = 0x0). For all the supported agents of
+each PMU, please refer to APM X-Gene User Manual.
+
+Each perf driver also provides a "cpumask" sysfs attribute, which contains a
+single CPU ID of the processor which will be used to handle all the PMU events.
+
+Example for perf tool use::
+
+ / # perf list | grep -e l3c -e iob -e mcb -e mc
+ l3c0/ackq-full/ [Kernel PMU event]
+ <...>
+ mcb1/mcb-csw-stall/ [Kernel PMU event]
+
+ / # perf stat -a -e l3c0/read-miss/,mcb1/csw-write-request/ sleep 1
+
+ / # perf stat -a -e l3c0/read-miss,config1=0xfffffffffffffffe/ sleep 1
+
+The driver does not support sampling, therefore "perf record" will
+not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/pnp.txt b/Documentation/admin-guide/pnp.rst
index bab2d10631f0..bab2d10631f0 100644
--- a/Documentation/pnp.txt
+++ b/Documentation/admin-guide/pnp.rst
diff --git a/Documentation/driver-api/rapidio.rst b/Documentation/admin-guide/rapidio.rst
index 71ff658ab78e..71ff658ab78e 100644
--- a/Documentation/driver-api/rapidio.rst
+++ b/Documentation/admin-guide/rapidio.rst
diff --git a/Documentation/rtc.txt b/Documentation/admin-guide/rtc.rst
index 688c95b11919..688c95b11919 100644
--- a/Documentation/rtc.txt
+++ b/Documentation/admin-guide/rtc.rst
diff --git a/Documentation/svga.txt b/Documentation/admin-guide/svga.rst
index b6c2f9acca92..b6c2f9acca92 100644
--- a/Documentation/svga.txt
+++ b/Documentation/admin-guide/svga.rst
diff --git a/Documentation/admin-guide/sysctl/abi.rst b/Documentation/admin-guide/sysctl/abi.rst
new file mode 100644
index 000000000000..599bcde7f0b7
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/abi.rst
@@ -0,0 +1,67 @@
+================================
+Documentation for /proc/sys/abi/
+================================
+
+kernel version 2.6.0.test2
+
+Copyright (c) 2003, Fabian Frederick <ffrederick@users.sourceforge.net>
+
+For general info: index.rst.
+
+------------------------------------------------------------------------------
+
+This path is binary emulation relevant aka personality types aka abi.
+When a process is executed, it's linked to an exec_domain whose
+personality is defined using values available from /proc/sys/abi.
+You can find further details about abi in include/linux/personality.h.
+
+Here are the files featuring in 2.6 kernel:
+
+- defhandler_coff
+- defhandler_elf
+- defhandler_lcall7
+- defhandler_libcso
+- fake_utsname
+- trace
+
+defhandler_coff
+---------------
+
+defined value:
+ PER_SCOSVR3::
+
+ 0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE
+
+defhandler_elf
+--------------
+
+defined value:
+ PER_LINUX::
+
+ 0
+
+defhandler_lcall7
+-----------------
+
+defined value :
+ PER_SVR4::
+
+ 0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
+
+defhandler_libsco
+-----------------
+
+defined value:
+ PER_SVR4::
+
+ 0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
+
+fake_utsname
+------------
+
+Unused
+
+trace
+-----
+
+Unused
diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst
new file mode 100644
index 000000000000..2a45119e3331
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -0,0 +1,384 @@
+===============================
+Documentation for /proc/sys/fs/
+===============================
+
+kernel version 2.2.10
+
+Copyright (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+Copyright (c) 2009, Shen Feng<shen@cn.fujitsu.com>
+
+For general info and legal blurb, please look in intro.rst.
+
+------------------------------------------------------------------------------
+
+This file contains documentation for the sysctl files in
+/proc/sys/fs/ and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to tune and monitor
+miscellaneous and general things in the operation of the Linux
+kernel. Since some of the files _can_ be used to screw up your
+system, it is advisable to read both documentation and source
+before actually making adjustments.
+
+1. /proc/sys/fs
+===============
+
+Currently, these files are in /proc/sys/fs:
+
+- aio-max-nr
+- aio-nr
+- dentry-state
+- dquot-max
+- dquot-nr
+- file-max
+- file-nr
+- inode-max
+- inode-nr
+- inode-state
+- nr_open
+- overflowuid
+- overflowgid
+- pipe-user-pages-hard
+- pipe-user-pages-soft
+- protected_fifos
+- protected_hardlinks
+- protected_regular
+- protected_symlinks
+- suid_dumpable
+- super-max
+- super-nr
+
+
+aio-nr & aio-max-nr
+-------------------
+
+aio-nr is the running total of the number of events specified on the
+io_setup system call for all currently active aio contexts. If aio-nr
+reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
+raising aio-max-nr does not result in the pre-allocation or re-sizing
+of any kernel data structures.
+
+
+dentry-state
+------------
+
+From linux/include/linux/dcache.h::
+
+ struct dentry_stat_t dentry_stat {
+ int nr_dentry;
+ int nr_unused;
+ int age_limit; /* age in seconds */
+ int want_pages; /* pages requested by system */
+ int nr_negative; /* # of unused negative dentries */
+ int dummy; /* Reserved for future use */
+ };
+
+Dentries are dynamically allocated and deallocated.
+
+nr_dentry shows the total number of dentries allocated (active
++ unused). nr_unused shows the number of dentries that are not
+actively used, but are saved in the LRU list for future reuse.
+
+Age_limit is the age in seconds after which dcache entries
+can be reclaimed when memory is short and want_pages is
+nonzero when shrink_dcache_pages() has been called and the
+dcache isn't pruned yet.
+
+nr_negative shows the number of unused dentries that are also
+negative dentries which do not map to any files. Instead,
+they help speeding up rejection of non-existing files provided
+by the users.
+
+
+dquot-max & dquot-nr
+--------------------
+
+The file dquot-max shows the maximum number of cached disk
+quota entries.
+
+The file dquot-nr shows the number of allocated disk quota
+entries and the number of free disk quota entries.
+
+If the number of free cached disk quotas is very low and
+you have some awesome number of simultaneous system users,
+you might want to raise the limit.
+
+
+file-max & file-nr
+------------------
+
+The value in file-max denotes the maximum number of file-
+handles that the Linux kernel will allocate. When you get lots
+of error messages about running out of file handles, you might
+want to increase this limit.
+
+Historically,the kernel was able to allocate file handles
+dynamically, but not to free them again. The three values in
+file-nr denote the number of allocated file handles, the number
+of allocated but unused file handles, and the maximum number of
+file handles. Linux 2.6 always reports 0 as the number of free
+file handles -- this is not an error, it just means that the
+number of allocated file handles exactly matches the number of
+used file handles.
+
+Attempts to allocate more file descriptors than file-max are
+reported with printk, look for "VFS: file-max limit <number>
+reached".
+
+
+nr_open
+-------
+
+This denotes the maximum number of file-handles a process can
+allocate. Default value is 1024*1024 (1048576) which should be
+enough for most machines. Actual limit depends on RLIMIT_NOFILE
+resource limit.
+
+
+inode-max, inode-nr & inode-state
+---------------------------------
+
+As with file handles, the kernel allocates the inode structures
+dynamically, but can't free them yet.
+
+The value in inode-max denotes the maximum number of inode
+handlers. This value should be 3-4 times larger than the value
+in file-max, since stdin, stdout and network sockets also
+need an inode struct to handle them. When you regularly run
+out of inodes, you need to increase this value.
+
+The file inode-nr contains the first two items from
+inode-state, so we'll skip to that file...
+
+Inode-state contains three actual numbers and four dummies.
+The actual numbers are, in order of appearance, nr_inodes,
+nr_free_inodes and preshrink.
+
+Nr_inodes stands for the number of inodes the system has
+allocated, this can be slightly more than inode-max because
+Linux allocates them one pageful at a time.
+
+Nr_free_inodes represents the number of free inodes (?) and
+preshrink is nonzero when the nr_inodes > inode-max and the
+system needs to prune the inode list instead of allocating
+more.
+
+
+overflowgid & overflowuid
+-------------------------
+
+Some filesystems only support 16-bit UIDs and GIDs, although in Linux
+UIDs and GIDs are 32 bits. When one of these filesystems is mounted
+with writes enabled, any UID or GID that would exceed 65535 is translated
+to a fixed value before being written to disk.
+
+These sysctls allow you to change the value of the fixed UID and GID.
+The default is 65534.
+
+
+pipe-user-pages-hard
+--------------------
+
+Maximum total number of pages a non-privileged user may allocate for pipes.
+Once this limit is reached, no new pipes may be allocated until usage goes
+below the limit again. When set to 0, no limit is applied, which is the default
+setting.
+
+
+pipe-user-pages-soft
+--------------------
+
+Maximum total number of pages a non-privileged user may allocate for pipes
+before the pipe size gets limited to a single page. Once this limit is reached,
+new pipes will be limited to a single page in size for this user in order to
+limit total memory usage, and trying to increase them using fcntl() will be
+denied until usage goes below the limit again. The default value allows to
+allocate up to 1024 pipes at their default size. When set to 0, no limit is
+applied.
+
+
+protected_fifos
+---------------
+
+The intent of this protection is to avoid unintentional writes to
+an attacker-controlled FIFO, where a program expected to create a regular
+file.
+
+When set to "0", writing to FIFOs is unrestricted.
+
+When set to "1" don't allow O_CREAT open on FIFOs that we don't own
+in world writable sticky directories, unless they are owned by the
+owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+This protection is based on the restrictions in Openwall.
+
+
+protected_hardlinks
+--------------------
+
+A long-standing class of security issues is the hardlink-based
+time-of-check-time-of-use race, most commonly seen in world-writable
+directories like /tmp. The common method of exploitation of this flaw
+is to cross privilege boundaries when following a given hardlink (i.e. a
+root process follows a hardlink created by another user). Additionally,
+on systems without separated partitions, this stops unauthorized users
+from "pinning" vulnerable setuid/setgid files against being upgraded by
+the administrator, or linking to special files.
+
+When set to "0", hardlink creation behavior is unrestricted.
+
+When set to "1" hardlinks cannot be created by users if they do not
+already own the source file, or do not have read/write access to it.
+
+This protection is based on the restrictions in Openwall and grsecurity.
+
+
+protected_regular
+-----------------
+
+This protection is similar to protected_fifos, but it
+avoids writes to an attacker-controlled regular file, where a program
+expected to create one.
+
+When set to "0", writing to regular files is unrestricted.
+
+When set to "1" don't allow O_CREAT open on regular files that we
+don't own in world writable sticky directories, unless they are
+owned by the owner of the directory.
+
+When set to "2" it also applies to group writable sticky directories.
+
+
+protected_symlinks
+------------------
+
+A long-standing class of security issues is the symlink-based
+time-of-check-time-of-use race, most commonly seen in world-writable
+directories like /tmp. The common method of exploitation of this flaw
+is to cross privilege boundaries when following a given symlink (i.e. a
+root process follows a symlink belonging to another user). For a likely
+incomplete list of hundreds of examples across the years, please see:
+http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp
+
+When set to "0", symlink following behavior is unrestricted.
+
+When set to "1" symlinks are permitted to be followed only when outside
+a sticky world-writable directory, or when the uid of the symlink and
+follower match, or when the directory owner matches the symlink's owner.
+
+This protection is based on the restrictions in Openwall and grsecurity.
+
+
+suid_dumpable:
+--------------
+
+This value can be used to query and set the core dump mode for setuid
+or otherwise protected/tainted binaries. The modes are
+
+= ========== ===============================================================
+0 (default) traditional behaviour. Any process which has changed
+ privilege levels or is execute only will not be dumped.
+1 (debug) all processes dump core when possible. The core dump is
+ owned by the current user and no security is applied. This is
+ intended for system debugging situations only.
+ Ptrace is unchecked.
+ This is insecure as it allows regular users to examine the
+ memory contents of privileged processes.
+2 (suidsafe) any binary which normally would not be dumped is dumped
+ anyway, but only if the "core_pattern" kernel sysctl is set to
+ either a pipe handler or a fully qualified path. (For more
+ details on this limitation, see CVE-2006-2451.) This mode is
+ appropriate when administrators are attempting to debug
+ problems in a normal environment, and either have a core dump
+ pipe handler that knows to treat privileged core dumps with
+ care, or specific directory defined for catching core dumps.
+ If a core dump happens without a pipe handler or fully
+ qualified path, a message will be emitted to syslog warning
+ about the lack of a correct setting.
+= ========== ===============================================================
+
+
+super-max & super-nr
+--------------------
+
+These numbers control the maximum number of superblocks, and
+thus the maximum number of mounted filesystems the kernel
+can have. You only need to increase super-max if you need to
+mount more filesystems than the current value in super-max
+allows you to.
+
+
+aio-nr & aio-max-nr
+-------------------
+
+aio-nr shows the current system-wide number of asynchronous io
+requests. aio-max-nr allows you to change the maximum value
+aio-nr can grow to.
+
+
+mount-max
+---------
+
+This denotes the maximum number of mounts that may exist
+in a mount namespace.
+
+
+
+2. /proc/sys/fs/binfmt_misc
+===========================
+
+Documentation for the files in /proc/sys/fs/binfmt_misc is
+in Documentation/admin-guide/binfmt-misc.rst.
+
+
+3. /proc/sys/fs/mqueue - POSIX message queues filesystem
+========================================================
+
+
+The "mqueue" filesystem provides the necessary kernel features to enable the
+creation of a user space library that implements the POSIX message queues
+API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
+Interfaces specification.)
+
+The "mqueue" filesystem contains values for determining/setting the amount of
+resources used by the file system.
+
+/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
+maximum number of message queues allowed on the system.
+
+/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
+maximum number of messages in a queue value. In fact it is the limiting value
+for another (user) limit which is set in mq_open invocation. This attribute of
+a queue must be less or equal then msg_max.
+
+/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
+maximum message size value (it is every message queue's attribute set during
+its creation).
+
+/proc/sys/fs/mqueue/msg_default is a read/write file for setting/getting the
+default number of messages in a queue value if attr parameter of mq_open(2) is
+NULL. If it exceed msg_max, the default value is initialized msg_max.
+
+/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting
+the default message size value if attr parameter of mq_open(2) is NULL. If it
+exceed msgsize_max, the default value is initialized msgsize_max.
+
+4. /proc/sys/fs/epoll - Configuration options for the epoll interface
+=====================================================================
+
+This directory contains configuration options for the epoll(7) interface.
+
+max_user_watches
+----------------
+
+Every epoll file descriptor can store a number of files to be monitored
+for event readiness. Each one of these monitored files constitutes a "watch".
+This configuration option sets the maximum number of "watches" that are
+allowed for each user.
+Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
+on a 64bit one.
+The current default value for max_user_watches is the 1/32 of the available
+low memory, divided for the "watch" cost in bytes.
diff --git a/Documentation/admin-guide/sysctl/index.rst b/Documentation/admin-guide/sysctl/index.rst
new file mode 100644
index 000000000000..03346f98c7b9
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/index.rst
@@ -0,0 +1,98 @@
+===========================
+Documentation for /proc/sys
+===========================
+
+Copyright (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+------------------------------------------------------------------------------
+
+'Why', I hear you ask, 'would anyone even _want_ documentation
+for them sysctl files? If anybody really needs it, it's all in
+the source...'
+
+Well, this documentation is written because some people either
+don't know they need to tweak something, or because they don't
+have the time or knowledge to read the source code.
+
+Furthermore, the programmers who built sysctl have built it to
+be actually used, not just for the fun of programming it :-)
+
+------------------------------------------------------------------------------
+
+Legal blurb:
+
+As usual, there are two main things to consider:
+
+1. you get what you pay for
+2. it's free
+
+The consequences are that I won't guarantee the correctness of
+this document, and if you come to me complaining about how you
+screwed up your system because of wrong documentation, I won't
+feel sorry for you. I might even laugh at you...
+
+But of course, if you _do_ manage to screw up your system using
+only the sysctl options used in this file, I'd like to hear of
+it. Not only to have a great laugh, but also to make sure that
+you're the last RTFMing person to screw up.
+
+In short, e-mail your suggestions, corrections and / or horror
+stories to: <riel@nl.linux.org>
+
+Rik van Riel.
+
+--------------------------------------------------------------
+
+Introduction
+============
+
+Sysctl is a means of configuring certain aspects of the kernel
+at run-time, and the /proc/sys/ directory is there so that you
+don't even need special tools to do it!
+In fact, there are only four things needed to use these config
+facilities:
+
+- a running Linux system
+- root access
+- common sense (this is especially hard to come by these days)
+- knowledge of what all those values mean
+
+As a quick 'ls /proc/sys' will show, the directory consists of
+several (arch-dependent?) subdirs. Each subdir is mainly about
+one part of the kernel, so you can do configuration on a piece
+by piece basis, or just some 'thematic frobbing'.
+
+This documentation is about:
+
+=============== ===============================================================
+abi/ execution domains & personalities
+debug/ <empty>
+dev/ device specific information (eg dev/cdrom/info)
+fs/ specific filesystems
+ filehandle, inode, dentry and quota tuning
+ binfmt_misc <Documentation/admin-guide/binfmt-misc.rst>
+kernel/ global kernel info / tuning
+ miscellaneous stuff
+net/ networking stuff, for documentation look in:
+ <Documentation/networking/>
+proc/ <empty>
+sunrpc/ SUN Remote Procedure Call (NFS)
+vm/ memory management tuning
+ buffer and cache management
+user/ Per user per user namespace limits
+=============== ===============================================================
+
+These are the subdirs I have on my system. There might be more
+or other subdirs in another setup. If you see another dir, I'd
+really like to hear about it :-)
+
+.. toctree::
+ :maxdepth: 1
+
+ abi
+ fs
+ kernel
+ net
+ sunrpc
+ user
+ vm
diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
new file mode 100644
index 000000000000..032c7cd3cede
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -0,0 +1,1177 @@
+===================================
+Documentation for /proc/sys/kernel/
+===================================
+
+kernel version 2.2.10
+
+Copyright (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+Copyright (c) 2009, Shen Feng<shen@cn.fujitsu.com>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains documentation for the sysctl files in
+/proc/sys/kernel/ and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to tune and monitor
+miscellaneous and general things in the operation of the Linux
+kernel. Since some of the files _can_ be used to screw up your
+system, it is advisable to read both documentation and source
+before actually making adjustments.
+
+Currently, these files might (depending on your configuration)
+show up in /proc/sys/kernel:
+
+- acct
+- acpi_video_flags
+- auto_msgmni
+- bootloader_type [ X86 only ]
+- bootloader_version [ X86 only ]
+- cap_last_cap
+- core_pattern
+- core_pipe_limit
+- core_uses_pid
+- ctrl-alt-del
+- dmesg_restrict
+- domainname
+- hostname
+- hotplug
+- hardlockup_all_cpu_backtrace
+- hardlockup_panic
+- hung_task_panic
+- hung_task_check_count
+- hung_task_timeout_secs
+- hung_task_check_interval_secs
+- hung_task_warnings
+- hyperv_record_panic_msg
+- kexec_load_disabled
+- kptr_restrict
+- l2cr [ PPC only ]
+- modprobe ==> Documentation/debugging-modules.txt
+- modules_disabled
+- msg_next_id [ sysv ipc ]
+- msgmax
+- msgmnb
+- msgmni
+- nmi_watchdog
+- osrelease
+- ostype
+- overflowgid
+- overflowuid
+- panic
+- panic_on_oops
+- panic_on_stackoverflow
+- panic_on_unrecovered_nmi
+- panic_on_warn
+- panic_print
+- panic_on_rcu_stall
+- perf_cpu_time_max_percent
+- perf_event_paranoid
+- perf_event_max_stack
+- perf_event_mlock_kb
+- perf_event_max_contexts_per_stack
+- pid_max
+- powersave-nap [ PPC only ]
+- printk
+- printk_delay
+- printk_ratelimit
+- printk_ratelimit_burst
+- pty ==> Documentation/filesystems/devpts.txt
+- randomize_va_space
+- real-root-dev ==> Documentation/admin-guide/initrd.rst
+- reboot-cmd [ SPARC only ]
+- rtsig-max
+- rtsig-nr
+- sched_energy_aware
+- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst
+- sem
+- sem_next_id [ sysv ipc ]
+- sg-big-buff [ generic SCSI device (sg) ]
+- shm_next_id [ sysv ipc ]
+- shm_rmid_forced
+- shmall
+- shmmax [ sysv ipc ]
+- shmmni
+- softlockup_all_cpu_backtrace
+- soft_watchdog
+- stack_erasing
+- stop-a [ SPARC only ]
+- sysrq ==> Documentation/admin-guide/sysrq.rst
+- sysctl_writes_strict
+- tainted ==> Documentation/admin-guide/tainted-kernels.rst
+- threads-max
+- unknown_nmi_panic
+- watchdog
+- watchdog_thresh
+- version
+
+
+acct:
+=====
+
+highwater lowwater frequency
+
+If BSD-style process accounting is enabled these values control
+its behaviour. If free space on filesystem where the log lives
+goes below <lowwater>% accounting suspends. If free space gets
+above <highwater>% accounting resumes. <Frequency> determines
+how often do we check the amount of free space (value is in
+seconds). Default:
+4 2 30
+That is, suspend accounting if there left <= 2% free; resume it
+if we got >=4%; consider information about amount of free space
+valid for 30 seconds.
+
+
+acpi_video_flags:
+=================
+
+flags
+
+See Doc*/kernel/power/video.txt, it allows mode of video boot to be
+set during run time.
+
+
+auto_msgmni:
+============
+
+This variable has no effect and may be removed in future kernel
+releases. Reading it always returns 0.
+Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni
+upon memory add/remove or upon ipc namespace creation/removal.
+Echoing "1" into this file enabled msgmni automatic recomputing.
+Echoing "0" turned it off. auto_msgmni default value was 1.
+
+
+bootloader_type:
+================
+
+x86 bootloader identification
+
+This gives the bootloader type number as indicated by the bootloader,
+shifted left by 4, and OR'd with the low four bits of the bootloader
+version. The reason for this encoding is that this used to match the
+type_of_loader field in the kernel header; the encoding is kept for
+backwards compatibility. That is, if the full bootloader type number
+is 0x15 and the full version number is 0x234, this file will contain
+the value 340 = 0x154.
+
+See the type_of_loader and ext_loader_type fields in
+Documentation/x86/boot.rst for additional information.
+
+
+bootloader_version:
+===================
+
+x86 bootloader version
+
+The complete bootloader version number. In the example above, this
+file will contain the value 564 = 0x234.
+
+See the type_of_loader and ext_loader_ver fields in
+Documentation/x86/boot.rst for additional information.
+
+
+cap_last_cap:
+=============
+
+Highest valid capability of the running kernel. Exports
+CAP_LAST_CAP from the kernel.
+
+
+core_pattern:
+=============
+
+core_pattern is used to specify a core dumpfile pattern name.
+
+* max length 127 characters; default value is "core"
+* core_pattern is used as a pattern template for the output filename;
+ certain string patterns (beginning with '%') are substituted with
+ their actual values.
+* backward compatibility with core_uses_pid:
+
+ If core_pattern does not include "%p" (default does not)
+ and core_uses_pid is set, then .PID will be appended to
+ the filename.
+
+* corename format specifiers::
+
+ %<NUL> '%' is dropped
+ %% output one '%'
+ %p pid
+ %P global pid (init PID namespace)
+ %i tid
+ %I global tid (init PID namespace)
+ %u uid (in initial user namespace)
+ %g gid (in initial user namespace)
+ %d dump mode, matches PR_SET_DUMPABLE and
+ /proc/sys/fs/suid_dumpable
+ %s signal number
+ %t UNIX time of dump
+ %h hostname
+ %e executable filename (may be shortened)
+ %E executable path
+ %<OTHER> both are dropped
+
+* If the first character of the pattern is a '|', the kernel will treat
+ the rest of the pattern as a command to run. The core dump will be
+ written to the standard input of that program instead of to a file.
+
+
+core_pipe_limit:
+================
+
+This sysctl is only applicable when core_pattern is configured to pipe
+core files to a user space helper (when the first character of
+core_pattern is a '|', see above). When collecting cores via a pipe
+to an application, it is occasionally useful for the collecting
+application to gather data about the crashing process from its
+/proc/pid directory. In order to do this safely, the kernel must wait
+for the collecting process to exit, so as not to remove the crashing
+processes proc files prematurely. This in turn creates the
+possibility that a misbehaving userspace collecting process can block
+the reaping of a crashed process simply by never exiting. This sysctl
+defends against that. It defines how many concurrent crashing
+processes may be piped to user space applications in parallel. If
+this value is exceeded, then those crashing processes above that value
+are noted via the kernel log and their cores are skipped. 0 is a
+special value, indicating that unlimited processes may be captured in
+parallel, but that no waiting will take place (i.e. the collecting
+process is not guaranteed access to /proc/<crashing pid>/). This
+value defaults to 0.
+
+
+core_uses_pid:
+==============
+
+The default coredump filename is "core". By setting
+core_uses_pid to 1, the coredump filename becomes core.PID.
+If core_pattern does not include "%p" (default does not)
+and core_uses_pid is set, then .PID will be appended to
+the filename.
+
+
+ctrl-alt-del:
+=============
+
+When the value in this file is 0, ctrl-alt-del is trapped and
+sent to the init(1) program to handle a graceful restart.
+When, however, the value is > 0, Linux's reaction to a Vulcan
+Nerve Pinch (tm) will be an immediate reboot, without even
+syncing its dirty buffers.
+
+Note:
+ when a program (like dosemu) has the keyboard in 'raw'
+ mode, the ctrl-alt-del is intercepted by the program before it
+ ever reaches the kernel tty layer, and it's up to the program
+ to decide what to do with it.
+
+
+dmesg_restrict:
+===============
+
+This toggle indicates whether unprivileged users are prevented
+from using dmesg(8) to view messages from the kernel's log buffer.
+When dmesg_restrict is set to (0) there are no restrictions. When
+dmesg_restrict is set set to (1), users must have CAP_SYSLOG to use
+dmesg(8).
+
+The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the
+default value of dmesg_restrict.
+
+
+domainname & hostname:
+======================
+
+These files can be used to set the NIS/YP domainname and the
+hostname of your box in exactly the same way as the commands
+domainname and hostname, i.e.::
+
+ # echo "darkstar" > /proc/sys/kernel/hostname
+ # echo "mydomain" > /proc/sys/kernel/domainname
+
+has the same effect as::
+
+ # hostname "darkstar"
+ # domainname "mydomain"
+
+Note, however, that the classic darkstar.frop.org has the
+hostname "darkstar" and DNS (Internet Domain Name Server)
+domainname "frop.org", not to be confused with the NIS (Network
+Information Service) or YP (Yellow Pages) domainname. These two
+domain names are in general different. For a detailed discussion
+see the hostname(1) man page.
+
+
+hardlockup_all_cpu_backtrace:
+=============================
+
+This value controls the hard lockup detector behavior when a hard
+lockup condition is detected as to whether or not to gather further
+debug information. If enabled, arch-specific all-CPU stack dumping
+will be initiated.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
+
+
+hardlockup_panic:
+=================
+
+This parameter can be used to control whether the kernel panics
+when a hard lockup is detected.
+
+ 0 - don't panic on hard lockup
+ 1 - panic on hard lockup
+
+See Documentation/admin-guide/lockup-watchdogs.rst for more information. This can
+also be set using the nmi_watchdog kernel parameter.
+
+
+hotplug:
+========
+
+Path for the hotplug policy agent.
+Default value is "/sbin/hotplug".
+
+
+hung_task_panic:
+================
+
+Controls the kernel's behavior when a hung task is detected.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0: continue operation. This is the default behavior.
+
+1: panic immediately.
+
+
+hung_task_check_count:
+======================
+
+The upper bound on the number of tasks that are checked.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+
+hung_task_timeout_secs:
+=======================
+
+When a task in D state did not get scheduled
+for more than this value report a warning.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0: means infinite timeout - no checking done.
+
+Possible values to set are in range {0..LONG_MAX/HZ}.
+
+
+hung_task_check_interval_secs:
+==============================
+
+Hung task check interval. If hung task checking is enabled
+(see hung_task_timeout_secs), the check is done every
+hung_task_check_interval_secs seconds.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+0 (default): means use hung_task_timeout_secs as checking interval.
+Possible values to set are in range {0..LONG_MAX/HZ}.
+
+
+hung_task_warnings:
+===================
+
+The maximum number of warnings to report. During a check interval
+if a hung task is detected, this value is decreased by 1.
+When this value reaches 0, no more warnings will be reported.
+This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
+
+-1: report an infinite number of warnings.
+
+
+hyperv_record_panic_msg:
+========================
+
+Controls whether the panic kmsg data should be reported to Hyper-V.
+
+0: do not report panic kmsg data.
+
+1: report the panic kmsg data. This is the default behavior.
+
+
+kexec_load_disabled:
+====================
+
+A toggle indicating if the kexec_load syscall has been disabled. This
+value defaults to 0 (false: kexec_load enabled), but can be set to 1
+(true: kexec_load disabled). Once true, kexec can no longer be used, and
+the toggle cannot be set back to false. This allows a kexec image to be
+loaded before disabling the syscall, allowing a system to set up (and
+later use) an image without it being altered. Generally used together
+with the "modules_disabled" sysctl.
+
+
+kptr_restrict:
+==============
+
+This toggle indicates whether restrictions are placed on
+exposing kernel addresses via /proc and other interfaces.
+
+When kptr_restrict is set to 0 (the default) the address is hashed before
+printing. (This is the equivalent to %p.)
+
+When kptr_restrict is set to (1), kernel pointers printed using the %pK
+format specifier will be replaced with 0's unless the user has CAP_SYSLOG
+and effective user and group ids are equal to the real ids. This is
+because %pK checks are done at read() time rather than open() time, so
+if permissions are elevated between the open() and the read() (e.g via
+a setuid binary) then %pK will not leak kernel pointers to unprivileged
+users. Note, this is a temporary solution only. The correct long-term
+solution is to do the permission checks at open() time. Consider removing
+world read permissions from files that use %pK, and using dmesg_restrict
+to protect against uses of %pK in dmesg(8) if leaking kernel pointer
+values to unprivileged users is a concern.
+
+When kptr_restrict is set to (2), kernel pointers printed using
+%pK will be replaced with 0's regardless of privileges.
+
+
+l2cr: (PPC only)
+================
+
+This flag controls the L2 cache of G3 processor boards. If
+0, the cache is disabled. Enabled if nonzero.
+
+
+modules_disabled:
+=================
+
+A toggle value indicating if modules are allowed to be loaded
+in an otherwise modular kernel. This toggle defaults to off
+(0), but can be set true (1). Once true, modules can be
+neither loaded nor unloaded, and the toggle cannot be set back
+to false. Generally used with the "kexec_load_disabled" toggle.
+
+
+msg_next_id, sem_next_id, and shm_next_id:
+==========================================
+
+These three toggles allows to specify desired id for next allocated IPC
+object: message, semaphore or shared memory respectively.
+
+By default they are equal to -1, which means generic allocation logic.
+Possible values to set are in range {0..INT_MAX}.
+
+Notes:
+ 1) kernel doesn't guarantee, that new object will have desired id. So,
+ it's up to userspace, how to handle an object with "wrong" id.
+ 2) Toggle with non-default value will be set back to -1 by kernel after
+ successful IPC object allocation. If an IPC object allocation syscall
+ fails, it is undefined if the value remains unmodified or is reset to -1.
+
+
+nmi_watchdog:
+=============
+
+This parameter can be used to control the NMI watchdog
+(i.e. the hard lockup detector) on x86 systems.
+
+0 - disable the hard lockup detector
+
+1 - enable the hard lockup detector
+
+The hard lockup detector monitors each CPU for its ability to respond to
+timer interrupts. The mechanism utilizes CPU performance counter registers
+that are programmed to generate Non-Maskable Interrupts (NMIs) periodically
+while a CPU is busy. Hence, the alternative name 'NMI watchdog'.
+
+The NMI watchdog is disabled by default if the kernel is running as a guest
+in a KVM virtual machine. This default can be overridden by adding::
+
+ nmi_watchdog=1
+
+to the guest kernel command line (see Documentation/admin-guide/kernel-parameters.rst).
+
+
+numa_balancing:
+===============
+
+Enables/disables automatic page fault based NUMA memory
+balancing. Memory is moved automatically to nodes
+that access it often.
+
+Enables/disables automatic NUMA memory balancing. On NUMA machines, there
+is a performance penalty if remote memory is accessed by a CPU. When this
+feature is enabled the kernel samples what task thread is accessing memory
+by periodically unmapping pages and later trapping a page fault. At the
+time of the page fault, it is determined if the data being accessed should
+be migrated to a local memory node.
+
+The unmapping of pages and trapping faults incur additional overhead that
+ideally is offset by improved memory locality but there is no universal
+guarantee. If the target workload is already bound to NUMA nodes then this
+feature should be disabled. Otherwise, if the system overhead from the
+feature is too high then the rate the kernel samples for NUMA hinting
+faults may be controlled by the numa_balancing_scan_period_min_ms,
+numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
+numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
+
+numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
+===============================================================================================================================
+
+
+Automatic NUMA balancing scans tasks address space and unmaps pages to
+detect if pages are properly placed or if the data should be migrated to a
+memory node local to where the task is running. Every "scan delay" the task
+scans the next "scan size" number of pages in its address space. When the
+end of the address space is reached the scanner restarts from the beginning.
+
+In combination, the "scan delay" and "scan size" determine the scan rate.
+When "scan delay" decreases, the scan rate increases. The scan delay and
+hence the scan rate of every task is adaptive and depends on historical
+behaviour. If pages are properly placed then the scan delay increases,
+otherwise the scan delay decreases. The "scan size" is not adaptive but
+the higher the "scan size", the higher the scan rate.
+
+Higher scan rates incur higher system overhead as page faults must be
+trapped and potentially data must be migrated. However, the higher the scan
+rate, the more quickly a tasks memory is migrated to a local node if the
+workload pattern changes and minimises performance impact due to remote
+memory accesses. These sysctls control the thresholds for scan delays and
+the number of pages scanned.
+
+numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
+scan a tasks virtual memory. It effectively controls the maximum scanning
+rate for each task.
+
+numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
+when it initially forks.
+
+numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
+scan a tasks virtual memory. It effectively controls the minimum scanning
+rate for each task.
+
+numa_balancing_scan_size_mb is how many megabytes worth of pages are
+scanned for a given scan.
+
+
+osrelease, ostype & version:
+============================
+
+::
+
+ # cat osrelease
+ 2.1.88
+ # cat ostype
+ Linux
+ # cat version
+ #5 Wed Feb 25 21:49:24 MET 1998
+
+The files osrelease and ostype should be clear enough. Version
+needs a little more clarification however. The '#5' means that
+this is the fifth kernel built from this source base and the
+date behind it indicates the time the kernel was built.
+The only way to tune these values is to rebuild the kernel :-)
+
+
+overflowgid & overflowuid:
+==========================
+
+if your architecture did not always support 32-bit UIDs (i.e. arm,
+i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to
+applications that use the old 16-bit UID/GID system calls, if the
+actual UID or GID would exceed 65535.
+
+These sysctls allow you to change the value of the fixed UID and GID.
+The default is 65534.
+
+
+panic:
+======
+
+The value in this file represents the number of seconds the kernel
+waits before rebooting on a panic. When you use the software watchdog,
+the recommended setting is 60.
+
+
+panic_on_io_nmi:
+================
+
+Controls the kernel's behavior when a CPU receives an NMI caused by
+an IO error.
+
+0: try to continue operation (default)
+
+1: panic immediately. The IO error triggered an NMI. This indicates a
+ serious system condition which could result in IO data corruption.
+ Rather than continuing, panicking might be a better choice. Some
+ servers issue this sort of NMI when the dump button is pushed,
+ and you can use this option to take a crash dump.
+
+
+panic_on_oops:
+==============
+
+Controls the kernel's behaviour when an oops or BUG is encountered.
+
+0: try to continue operation
+
+1: panic immediately. If the `panic` sysctl is also non-zero then the
+ machine will be rebooted.
+
+
+panic_on_stackoverflow:
+=======================
+
+Controls the kernel's behavior when detecting the overflows of
+kernel, IRQ and exception stacks except a user stack.
+This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
+
+0: try to continue operation.
+
+1: panic immediately.
+
+
+panic_on_unrecovered_nmi:
+=========================
+
+The default Linux behaviour on an NMI of either memory or unknown is
+to continue operation. For many environments such as scientific
+computing it is preferable that the box is taken out and the error
+dealt with than an uncorrected parity/ECC error get propagated.
+
+A small number of systems do generate NMI's for bizarre random reasons
+such as power management so the default is off. That sysctl works like
+the existing panic controls already in that directory.
+
+
+panic_on_warn:
+==============
+
+Calls panic() in the WARN() path when set to 1. This is useful to avoid
+a kernel rebuild when attempting to kdump at the location of a WARN().
+
+0: only WARN(), default behaviour.
+
+1: call panic() after printing out WARN() location.
+
+
+panic_print:
+============
+
+Bitmask for printing system info when panic happens. User can chose
+combination of the following bits:
+
+===== ========================================
+bit 0 print all tasks info
+bit 1 print system memory info
+bit 2 print timer info
+bit 3 print locks info if CONFIG_LOCKDEP is on
+bit 4 print ftrace buffer
+===== ========================================
+
+So for example to print tasks and memory info on panic, user can::
+
+ echo 3 > /proc/sys/kernel/panic_print
+
+
+panic_on_rcu_stall:
+===================
+
+When set to 1, calls panic() after RCU stall detection messages. This
+is useful to define the root cause of RCU stalls using a vmcore.
+
+0: do not panic() when RCU stall takes place, default behavior.
+
+1: panic() after printing RCU stall messages.
+
+
+perf_cpu_time_max_percent:
+==========================
+
+Hints to the kernel how much CPU time it should be allowed to
+use to handle perf sampling events. If the perf subsystem
+is informed that its samples are exceeding this limit, it
+will drop its sampling frequency to attempt to reduce its CPU
+usage.
+
+Some perf sampling happens in NMIs. If these samples
+unexpectedly take too long to execute, the NMIs can become
+stacked up next to each other so much that nothing else is
+allowed to execute.
+
+0:
+ disable the mechanism. Do not monitor or correct perf's
+ sampling rate no matter how CPU time it takes.
+
+1-100:
+ attempt to throttle perf's sample rate to this
+ percentage of CPU. Note: the kernel calculates an
+ "expected" length of each sample event. 100 here means
+ 100% of that expected length. Even if this is set to
+ 100, you may still see sample throttling if this
+ length is exceeded. Set to 0 if you truly do not care
+ how much CPU is consumed.
+
+
+perf_event_paranoid:
+====================
+
+Controls use of the performance events system by unprivileged
+users (without CAP_SYS_ADMIN). The default value is 2.
+
+=== ==================================================================
+ -1 Allow use of (almost) all events by all users
+
+ Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
+
+>=0 Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
+
+ Disallow raw tracepoint access by users without CAP_SYS_ADMIN
+
+>=1 Disallow CPU event access by users without CAP_SYS_ADMIN
+
+>=2 Disallow kernel profiling by users without CAP_SYS_ADMIN
+=== ==================================================================
+
+
+perf_event_max_stack:
+=====================
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+
+perf_event_mlock_kb:
+====================
+
+Control size of per-cpu ring buffer not counted agains mlock limit.
+
+The default value is 512 + 1 page
+
+
+perf_event_max_contexts_per_stack:
+==================================
+
+Controls maximum number of stack frame context entries for
+(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
+instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 8.
+
+
+pid_max:
+========
+
+PID allocation wrap value. When the kernel's next PID value
+reaches this value, it wraps back to a minimum PID value.
+PIDs of value pid_max or larger are not allocated.
+
+
+ns_last_pid:
+============
+
+The last pid allocated in the current (the one task using this sysctl
+lives in) pid namespace. When selecting a pid for a next task on fork
+kernel tries to allocate a number starting from this one.
+
+
+powersave-nap: (PPC only)
+=========================
+
+If set, Linux-PPC will use the 'nap' mode of powersaving,
+otherwise the 'doze' mode will be used.
+
+==============================================================
+
+printk:
+=======
+
+The four values in printk denote: console_loglevel,
+default_message_loglevel, minimum_console_loglevel and
+default_console_loglevel respectively.
+
+These values influence printk() behavior when printing or
+logging error messages. See 'man 2 syslog' for more info on
+the different loglevels.
+
+- console_loglevel:
+ messages with a higher priority than
+ this will be printed to the console
+- default_message_loglevel:
+ messages without an explicit priority
+ will be printed with this priority
+- minimum_console_loglevel:
+ minimum (highest) value to which
+ console_loglevel can be set
+- default_console_loglevel:
+ default value for console_loglevel
+
+
+printk_delay:
+=============
+
+Delay each printk message in printk_delay milliseconds
+
+Value from 0 - 10000 is allowed.
+
+
+printk_ratelimit:
+=================
+
+Some warning messages are rate limited. printk_ratelimit specifies
+the minimum length of time between these messages (in jiffies), by
+default we allow one every 5 seconds.
+
+A value of 0 will disable rate limiting.
+
+
+printk_ratelimit_burst:
+=======================
+
+While long term we enforce one message per printk_ratelimit
+seconds, we do allow a burst of messages to pass through.
+printk_ratelimit_burst specifies the number of messages we can
+send before ratelimiting kicks in.
+
+
+printk_devkmsg:
+===============
+
+Control the logging to /dev/kmsg from userspace:
+
+ratelimit:
+ default, ratelimited
+
+on: unlimited logging to /dev/kmsg from userspace
+
+off: logging to /dev/kmsg disabled
+
+The kernel command line parameter printk.devkmsg= overrides this and is
+a one-time setting until next reboot: once set, it cannot be changed by
+this sysctl interface anymore.
+
+
+randomize_va_space:
+===================
+
+This option can be used to select the type of process address
+space randomization that is used in the system, for architectures
+that support this feature.
+
+== ===========================================================================
+0 Turn the process address space randomization off. This is the
+ default for architectures that do not support this feature anyways,
+ and kernels that are booted with the "norandmaps" parameter.
+
+1 Make the addresses of mmap base, stack and VDSO page randomized.
+ This, among other things, implies that shared libraries will be
+ loaded to random addresses. Also for PIE-linked binaries, the
+ location of code start is randomized. This is the default if the
+ CONFIG_COMPAT_BRK option is enabled.
+
+2 Additionally enable heap randomization. This is the default if
+ CONFIG_COMPAT_BRK is disabled.
+
+ There are a few legacy applications out there (such as some ancient
+ versions of libc.so.5 from 1996) that assume that brk area starts
+ just after the end of the code+bss. These applications break when
+ start of the brk area is randomized. There are however no known
+ non-legacy applications that would be broken this way, so for most
+ systems it is safe to choose full randomization.
+
+ Systems with ancient and/or broken binaries should be configured
+ with CONFIG_COMPAT_BRK enabled, which excludes the heap from process
+ address space randomization.
+== ===========================================================================
+
+
+reboot-cmd: (Sparc only)
+========================
+
+??? This seems to be a way to give an argument to the Sparc
+ROM/Flash boot loader. Maybe to tell it what to do after
+rebooting. ???
+
+
+rtsig-max & rtsig-nr:
+=====================
+
+The file rtsig-max can be used to tune the maximum number
+of POSIX realtime (queued) signals that can be outstanding
+in the system.
+
+rtsig-nr shows the number of RT signals currently queued.
+
+
+sched_energy_aware:
+===================
+
+Enables/disables Energy Aware Scheduling (EAS). EAS starts
+automatically on platforms where it can run (that is,
+platforms with asymmetric CPU topologies and having an Energy
+Model available). If your platform happens to meet the
+requirements for EAS but you do not want to use it, change
+this value to 0.
+
+
+sched_schedstats:
+=================
+
+Enables/disables scheduler statistics. Enabling this feature
+incurs a small amount of overhead in the scheduler but is
+useful for debugging and performance tuning.
+
+
+sg-big-buff:
+============
+
+This file shows the size of the generic SCSI (sg) buffer.
+You can't tune it just yet, but you could change it on
+compile time by editing include/scsi/sg.h and changing
+the value of SG_BIG_BUFF.
+
+There shouldn't be any reason to change this value. If
+you can come up with one, you probably know what you
+are doing anyway :)
+
+
+shmall:
+=======
+
+This parameter sets the total amount of shared memory pages that
+can be used system wide. Hence, SHMALL should always be at least
+ceil(shmmax/PAGE_SIZE).
+
+If you are not sure what the default PAGE_SIZE is on your Linux
+system, you can run the following command:
+
+ # getconf PAGE_SIZE
+
+
+shmmax:
+=======
+
+This value can be used to query and set the run time limit
+on the maximum shared memory segment size that can be created.
+Shared memory segments up to 1Gb are now supported in the
+kernel. This value defaults to SHMMAX.
+
+
+shm_rmid_forced:
+================
+
+Linux lets you set resource limits, including how much memory one
+process can consume, via setrlimit(2). Unfortunately, shared memory
+segments are allowed to exist without association with any process, and
+thus might not be counted against any resource limits. If enabled,
+shared memory segments are automatically destroyed when their attach
+count becomes zero after a detach or a process termination. It will
+also destroy segments that were created, but never attached to, on exit
+from the process. The only use left for IPC_RMID is to immediately
+destroy an unattached segment. Of course, this breaks the way things are
+defined, so some applications might stop working. Note that this
+feature will do you no good unless you also configure your resource
+limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't
+need this.
+
+Note that if you change this from 0 to 1, already created segments
+without users and with a dead originative process will be destroyed.
+
+
+sysctl_writes_strict:
+=====================
+
+Control how file position affects the behavior of updating sysctl values
+via the /proc/sys interface:
+
+ == ======================================================================
+ -1 Legacy per-write sysctl value handling, with no printk warnings.
+ Each write syscall must fully contain the sysctl value to be
+ written, and multiple writes on the same sysctl file descriptor
+ will rewrite the sysctl value, regardless of file position.
+ 0 Same behavior as above, but warn about processes that perform writes
+ to a sysctl file descriptor when the file position is not 0.
+ 1 (default) Respect file position when writing sysctl strings. Multiple
+ writes will append to the sysctl value buffer. Anything past the max
+ length of the sysctl value buffer will be ignored. Writes to numeric
+ sysctl entries must always be at file position 0 and the value must
+ be fully contained in the buffer sent in the write syscall.
+ == ======================================================================
+
+
+softlockup_all_cpu_backtrace:
+=============================
+
+This value controls the soft lockup detector thread's behavior
+when a soft lockup condition is detected as to whether or not
+to gather further debug information. If enabled, each cpu will
+be issued an NMI and instructed to capture stack trace.
+
+This feature is only applicable for architectures which support
+NMI.
+
+0: do nothing. This is the default behavior.
+
+1: on detection capture more debug information.
+
+
+soft_watchdog:
+==============
+
+This parameter can be used to control the soft lockup detector.
+
+ 0 - disable the soft lockup detector
+
+ 1 - enable the soft lockup detector
+
+The soft lockup detector monitors CPUs for threads that are hogging the CPUs
+without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
+from running. The mechanism depends on the CPUs ability to respond to timer
+interrupts which are needed for the 'watchdog/N' threads to be woken up by
+the watchdog timer function, otherwise the NMI watchdog - if enabled - can
+detect a hard lockup condition.
+
+
+stack_erasing:
+==============
+
+This parameter can be used to control kernel stack erasing at the end
+of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK.
+
+That erasing reduces the information which kernel stack leak bugs
+can reveal and blocks some uninitialized stack variable attacks.
+The tradeoff is the performance impact: on a single CPU system kernel
+compilation sees a 1% slowdown, other systems and workloads may vary.
+
+ 0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated.
+
+ 1: kernel stack erasing is enabled (default), it is performed before
+ returning to the userspace at the end of syscalls.
+
+
+tainted
+=======
+
+Non-zero if the kernel has been tainted. Numeric values, which can be
+ORed together. The letters are seen in "Tainted" line of Oops reports.
+
+====== ===== ==============================================================
+ 1 `(P)` proprietary module was loaded
+ 2 `(F)` module was force loaded
+ 4 `(S)` SMP kernel oops on an officially SMP incapable processor
+ 8 `(R)` module was force unloaded
+ 16 `(M)` processor reported a Machine Check Exception (MCE)
+ 32 `(B)` bad page referenced or some unexpected page flags
+ 64 `(U)` taint requested by userspace application
+ 128 `(D)` kernel died recently, i.e. there was an OOPS or BUG
+ 256 `(A)` an ACPI table was overridden by user
+ 512 `(W)` kernel issued warning
+ 1024 `(C)` staging driver was loaded
+ 2048 `(I)` workaround for bug in platform firmware applied
+ 4096 `(O)` externally-built ("out-of-tree") module was loaded
+ 8192 `(E)` unsigned module was loaded
+ 16384 `(L)` soft lockup occurred
+ 32768 `(K)` kernel has been live patched
+ 65536 `(X)` Auxiliary taint, defined and used by for distros
+131072 `(T)` The kernel was built with the struct randomization plugin
+====== ===== ==============================================================
+
+See Documentation/admin-guide/tainted-kernels.rst for more information.
+
+
+threads-max:
+============
+
+This value controls the maximum number of threads that can be created
+using fork().
+
+During initialization the kernel sets this value such that even if the
+maximum number of threads is created, the thread structures occupy only
+a part (1/8th) of the available RAM pages.
+
+The minimum value that can be written to threads-max is 20.
+
+The maximum value that can be written to threads-max is given by the
+constant FUTEX_TID_MASK (0x3fffffff).
+
+If a value outside of this range is written to threads-max an error
+EINVAL occurs.
+
+The value written is checked against the available RAM pages. If the
+thread structures would occupy too much (more than 1/8th) of the
+available RAM pages threads-max is reduced accordingly.
+
+
+unknown_nmi_panic:
+==================
+
+The value in this file affects behavior of handling NMI. When the
+value is non-zero, unknown NMI is trapped and then panic occurs. At
+that time, kernel debugging information is displayed on console.
+
+NMI switch that most IA32 servers have fires unknown NMI up, for
+example. If a system hangs up, try pressing the NMI switch.
+
+
+watchdog:
+=========
+
+This parameter can be used to disable or enable the soft lockup detector
+_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time.
+
+ 0 - disable both lockup detectors
+
+ 1 - enable both lockup detectors
+
+The soft lockup detector and the NMI watchdog can also be disabled or
+enabled individually, using the soft_watchdog and nmi_watchdog parameters.
+If the watchdog parameter is read, for example by executing::
+
+ cat /proc/sys/kernel/watchdog
+
+the output of this command (0 or 1) shows the logical OR of soft_watchdog
+and nmi_watchdog.
+
+
+watchdog_cpumask:
+=================
+
+This value can be used to control on which cpus the watchdog may run.
+The default cpumask is all possible cores, but if NO_HZ_FULL is
+enabled in the kernel config, and cores are specified with the
+nohz_full= boot argument, those cores are excluded by default.
+Offline cores can be included in this mask, and if the core is later
+brought online, the watchdog will be started based on the mask value.
+
+Typically this value would only be touched in the nohz_full case
+to re-enable cores that by default were not running the watchdog,
+if a kernel lockup was suspected on those cores.
+
+The argument value is the standard cpulist format for cpumasks,
+so for example to enable the watchdog on cores 0, 2, 3, and 4 you
+might say::
+
+ echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
+
+
+watchdog_thresh:
+================
+
+This value can be used to control the frequency of hrtimer and NMI
+events and the soft and hard lockup thresholds. The default threshold
+is 10 seconds.
+
+The softlockup threshold is (2 * watchdog_thresh). Setting this
+tunable to zero will disable lockup detection altogether.
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
new file mode 100644
index 000000000000..a7d44e71019d
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -0,0 +1,461 @@
+================================
+Documentation for /proc/sys/net/
+================================
+
+Copyright
+
+Copyright (c) 1999
+
+ - Terrehon Bowden <terrehon@pacbell.net>
+ - Bodo Bauer <bb@ricochet.net>
+
+Copyright (c) 2000
+
+ - Jorge Nerin <comandante@zaralinux.com>
+
+Copyright (c) 2009
+
+ - Shen Feng <shen@cn.fujitsu.com>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/net
+
+The interface to the networking parts of the kernel is located in
+/proc/sys/net. The following table shows all possible subdirectories. You may
+see only some of them, depending on your kernel's configuration.
+
+
+Table : Subdirectories in /proc/sys/net
+
+ ========= =================== = ========== ==================
+ Directory Content Directory Content
+ ========= =================== = ========== ==================
+ core General parameter appletalk Appletalk protocol
+ unix Unix domain sockets netrom NET/ROM
+ 802 E802 protocol ax25 AX25
+ ethernet Ethernet protocol rose X.25 PLP layer
+ ipv4 IP version 4 x25 X.25 protocol
+ ipx IPX token-ring IBM token ring
+ bridge Bridging decnet DEC net
+ ipv6 IP version 6 tipc TIPC
+ ========= =================== = ========== ==================
+
+1. /proc/sys/net/core - Network core options
+============================================
+
+bpf_jit_enable
+--------------
+
+This enables the BPF Just in Time (JIT) compiler. BPF is a flexible
+and efficient infrastructure allowing to execute bytecode at various
+hook points. It is used in a number of Linux kernel subsystems such
+as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints)
+and security (e.g. seccomp). LLVM has a BPF back end that can compile
+restricted C into a sequence of BPF instructions. After program load
+through bpf(2) and passing a verifier in the kernel, a JIT will then
+translate these BPF proglets into native CPU instructions. There are
+two flavors of JITs, the newer eBPF JIT currently supported on:
+
+ - x86_64
+ - x86_32
+ - arm64
+ - arm32
+ - ppc64
+ - sparc64
+ - mips64
+ - s390x
+ - riscv
+
+And the older cBPF JIT supported on the following archs:
+
+ - mips
+ - ppc
+ - sparc
+
+eBPF JITs are a superset of cBPF JITs, meaning the kernel will
+migrate cBPF instructions into eBPF instructions and then JIT
+compile them transparently. Older cBPF JITs can only translate
+tcpdump filters, seccomp rules, etc, but not mentioned eBPF
+programs loaded through bpf(2).
+
+Values:
+
+ - 0 - disable the JIT (default value)
+ - 1 - enable the JIT
+ - 2 - enable the JIT and ask the compiler to emit traces on kernel log.
+
+bpf_jit_harden
+--------------
+
+This enables hardening for the BPF JIT compiler. Supported are eBPF
+JIT backends. Enabling hardening trades off performance, but can
+mitigate JIT spraying.
+
+Values:
+
+ - 0 - disable JIT hardening (default value)
+ - 1 - enable JIT hardening for unprivileged users only
+ - 2 - enable JIT hardening for all users
+
+bpf_jit_kallsyms
+----------------
+
+When BPF JIT compiler is enabled, then compiled images are unknown
+addresses to the kernel, meaning they neither show up in traces nor
+in /proc/kallsyms. This enables export of these addresses, which can
+be used for debugging/tracing. If bpf_jit_harden is enabled, this
+feature is disabled.
+
+Values :
+
+ - 0 - disable JIT kallsyms export (default value)
+ - 1 - enable JIT kallsyms export for privileged users only
+
+bpf_jit_limit
+-------------
+
+This enforces a global limit for memory allocations to the BPF JIT
+compiler in order to reject unprivileged JIT requests once it has
+been surpassed. bpf_jit_limit contains the value of the global limit
+in bytes.
+
+dev_weight
+----------
+
+The maximum number of packets that kernel can handle on a NAPI interrupt,
+it's a Per-CPU variable. For drivers that support LRO or GRO_HW, a hardware
+aggregated packet is counted as one packet in this context.
+
+Default: 64
+
+dev_weight_rx_bias
+------------------
+
+RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
+of the driver for the per softirq cycle netdev_budget. This parameter influences
+the proportion of the configured netdev_budget that is spent on RPS based packet
+processing during RX softirq cycles. It is further meant for making current
+dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
+(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
+on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
+
+Default: 1
+
+dev_weight_tx_bias
+------------------
+
+Scales the maximum number of packets that can be processed during a TX softirq cycle.
+Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
+net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
+
+Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
+
+Default: 1
+
+default_qdisc
+-------------
+
+The default queuing discipline to use for network devices. This allows
+overriding the default of pfifo_fast with an alternative. Since the default
+queuing discipline is created without additional parameters so is best suited
+to queuing disciplines that work well without configuration like stochastic
+fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use
+queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin
+which require setting up classes and bandwidths. Note that physical multiqueue
+interfaces still use mq as root qdisc, which in turn uses this default for its
+leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead
+default to noqueue.
+
+Default: pfifo_fast
+
+busy_read
+---------
+
+Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL)
+Approximate time in us to busy loop waiting for packets on the device queue.
+This sets the default value of the SO_BUSY_POLL socket option.
+Can be set or overridden per socket by setting socket option SO_BUSY_POLL,
+which is the preferred method of enabling. If you need to enable the feature
+globally via sysctl, a value of 50 is recommended.
+
+Will increase power usage.
+
+Default: 0 (off)
+
+busy_poll
+----------------
+Low latency busy poll timeout for poll and select. (needs CONFIG_NET_RX_BUSY_POLL)
+Approximate time in us to busy loop waiting for events.
+Recommended value depends on the number of sockets you poll on.
+For several sockets 50, for several hundreds 100.
+For more than that you probably want to use epoll.
+Note that only sockets with SO_BUSY_POLL set will be busy polled,
+so you want to either selectively set SO_BUSY_POLL on those sockets or set
+sysctl.net.busy_read globally.
+
+Will increase power usage.
+
+Default: 0 (off)
+
+rmem_default
+------------
+
+The default setting of the socket receive buffer in bytes.
+
+rmem_max
+--------
+
+The maximum receive socket buffer size in bytes.
+
+tstamp_allow_data
+-----------------
+Allow processes to receive tx timestamps looped together with the original
+packet contents. If disabled, transmit timestamp requests from unprivileged
+processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set.
+
+Default: 1 (on)
+
+
+wmem_default
+------------
+
+The default setting (in bytes) of the socket send buffer.
+
+wmem_max
+--------
+
+The maximum send socket buffer size in bytes.
+
+message_burst and message_cost
+------------------------------
+
+These parameters are used to limit the warning messages written to the kernel
+log from the networking code. They enforce a rate limit to make a
+denial-of-service attack impossible. A higher message_cost factor, results in
+fewer messages that will be written. Message_burst controls when messages will
+be dropped. The default settings limit warning messages to one every five
+seconds.
+
+warnings
+--------
+
+This sysctl is now unused.
+
+This was used to control console messages from the networking stack that
+occur because of problems on the network like duplicate address or bad
+checksums.
+
+These messages are now emitted at KERN_DEBUG and can generally be enabled
+and controlled by the dynamic_debug facility.
+
+netdev_budget
+-------------
+
+Maximum number of packets taken from all interfaces in one polling cycle (NAPI
+poll). In one polling cycle interfaces which are registered to polling are
+probed in a round-robin manner. Also, a polling cycle may not exceed
+netdev_budget_usecs microseconds, even if netdev_budget has not been
+exhausted.
+
+netdev_budget_usecs
+---------------------
+
+Maximum number of microseconds in one NAPI polling cycle. Polling
+will exit when either netdev_budget_usecs have elapsed during the
+poll cycle or the number of packets processed reaches netdev_budget.
+
+netdev_max_backlog
+------------------
+
+Maximum number of packets, queued on the INPUT side, when the interface
+receives packets faster than kernel can process them.
+
+netdev_rss_key
+--------------
+
+RSS (Receive Side Scaling) enabled drivers use a 40 bytes host key that is
+randomly generated.
+Some user space might need to gather its content even if drivers do not
+provide ethtool -x support yet.
+
+::
+
+ myhost:~# cat /proc/sys/net/core/netdev_rss_key
+ 84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8: ... (52 bytes total)
+
+File contains nul bytes if no driver ever called netdev_rss_key_fill() function.
+
+Note:
+ /proc/sys/net/core/netdev_rss_key contains 52 bytes of key,
+ but most drivers only use 40 bytes of it.
+
+::
+
+ myhost:~# ethtool -x eth0
+ RX flow hash indirection table for eth0 with 8 RX ring(s):
+ 0: 0 1 2 3 4 5 6 7
+ RSS hash key:
+ 84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8:43:e3:c9:0c:fd:17:55:c2:3a:4d:69:ed:f1:42:89
+
+netdev_tstamp_prequeue
+----------------------
+
+If set to 0, RX packet timestamps can be sampled after RPS processing, when
+the target CPU processes packets. It might give some delay on timestamps, but
+permit to distribute the load on several cpus.
+
+If set to 1 (default), timestamps are sampled as soon as possible, before
+queueing.
+
+optmem_max
+----------
+
+Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
+of struct cmsghdr structures with appended data.
+
+fb_tunnels_only_for_init_net
+----------------------------
+
+Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0,
+sit0, ip6tnl0, ip6gre0) are automatically created when a new
+network namespace is created, if corresponding tunnel is present
+in initial network namespace.
+If set to 1, these devices are not automatically created, and
+user space is responsible for creating them if needed.
+
+Default : 0 (for compatibility reasons)
+
+devconf_inherit_init_net
+------------------------
+
+Controls if a new network namespace should inherit all current
+settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By
+default, we keep the current behavior: for IPv4 we inherit all current
+settings from init_net and for IPv6 we reset all settings to default.
+
+If set to 1, both IPv4 and IPv6 settings are forced to inherit from
+current ones in init_net. If set to 2, both IPv4 and IPv6 settings are
+forced to reset to their default values.
+
+Default : 0 (for compatibility reasons)
+
+2. /proc/sys/net/unix - Parameters for Unix domain sockets
+----------------------------------------------------------
+
+There is only one file in this directory.
+unix_dgram_qlen limits the max number of datagrams queued in Unix domain
+socket's buffer. It will not take effect unless PF_UNIX flag is specified.
+
+
+3. /proc/sys/net/ipv4 - IPV4 settings
+-------------------------------------
+Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for
+descriptions of these entries.
+
+
+4. Appletalk
+------------
+
+The /proc/sys/net/appletalk directory holds the Appletalk configuration data
+when Appletalk is loaded. The configurable parameters are:
+
+aarp-expiry-time
+----------------
+
+The amount of time we keep an ARP entry before expiring it. Used to age out
+old hosts.
+
+aarp-resolve-time
+-----------------
+
+The amount of time we will spend trying to resolve an Appletalk address.
+
+aarp-retransmit-limit
+---------------------
+
+The number of times we will retransmit a query before giving up.
+
+aarp-tick-time
+--------------
+
+Controls the rate at which expires are checked.
+
+The directory /proc/net/appletalk holds the list of active Appletalk sockets
+on a machine.
+
+The fields indicate the DDP type, the local address (in network:node format)
+the remote address, the size of the transmit pending queue, the size of the
+received queue (bytes waiting for applications to read) the state and the uid
+owning the socket.
+
+/proc/net/atalk_iface lists all the interfaces configured for appletalk.It
+shows the name of the interface, its Appletalk address, the network range on
+that address (or network number for phase 1 networks), and the status of the
+interface.
+
+/proc/net/atalk_route lists each known network route. It lists the target
+(network) that the route leads to, the router (may be directly connected), the
+route flags, and the device the route is using.
+
+
+5. IPX
+------
+
+The IPX protocol has no tunable values in proc/sys/net.
+
+The IPX protocol does, however, provide proc/net/ipx. This lists each IPX
+socket giving the local and remote addresses in Novell format (that is
+network:node:port). In accordance with the strange Novell tradition,
+everything but the port is in hex. Not_Connected is displayed for sockets that
+are not tied to a specific remote address. The Tx and Rx queue sizes indicate
+the number of bytes pending for transmission and reception. The state
+indicates the state the socket is in and the uid is the owning uid of the
+socket.
+
+The /proc/net/ipx_interface file lists all IPX interfaces. For each interface
+it gives the network number, the node number, and indicates if the network is
+the primary network. It also indicates which device it is bound to (or
+Internal for internal networks) and the Frame Type if appropriate. Linux
+supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for
+IPX.
+
+The /proc/net/ipx_route table holds a list of IPX routes. For each route it
+gives the destination network, the router node (or Directly) and the network
+address of the router (or Connected) for internal networks.
+
+6. TIPC
+-------
+
+tipc_rmem
+---------
+
+The TIPC protocol now has a tunable for the receive memory, similar to the
+tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max)
+
+::
+
+ # cat /proc/sys/net/tipc/tipc_rmem
+ 4252725 34021800 68043600
+ #
+
+The max value is set to CONN_OVERLOAD_LIMIT, and the default and min values
+are scaled (shifted) versions of that same value. Note that the min value
+is not at this point in time used in any meaningful way, but the triplet is
+preserved in order to be consistent with things like tcp_rmem.
+
+named_timeout
+-------------
+
+TIPC name table updates are distributed asynchronously in a cluster, without
+any form of transaction handling. This means that different race scenarios are
+possible. One such is that a name withdrawal sent out by one node and received
+by another node may arrive after a second, overlapping name publication already
+has been accepted from a third node, although the conflicting updates
+originally may have been issued in the correct sequential order.
+If named_timeout is nonzero, failed topology updates will be placed on a defer
+queue until another event arrives that clears the error, or until the timeout
+expires. Value is in milliseconds.
diff --git a/Documentation/admin-guide/sysctl/sunrpc.rst b/Documentation/admin-guide/sysctl/sunrpc.rst
new file mode 100644
index 000000000000..09780a682afd
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/sunrpc.rst
@@ -0,0 +1,25 @@
+===================================
+Documentation for /proc/sys/sunrpc/
+===================================
+
+kernel version 2.2.10
+
+Copyright (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/sunrpc and is valid for Linux kernel version 2.2.
+
+The files in this directory can be used to (re)set the debug
+flags of the SUN Remote Procedure Call (RPC) subsystem in
+the Linux kernel. This stuff is used for NFS, KNFSD and
+maybe a few other things as well.
+
+The files in there are used to control the debugging flags:
+rpc_debug, nfs_debug, nfsd_debug and nlm_debug.
+
+These flags are for kernel hackers only. You should read the
+source code in net/sunrpc/ for more information.
diff --git a/Documentation/admin-guide/sysctl/user.rst b/Documentation/admin-guide/sysctl/user.rst
new file mode 100644
index 000000000000..650eaa03f15e
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/user.rst
@@ -0,0 +1,78 @@
+=================================
+Documentation for /proc/sys/user/
+=================================
+
+kernel version 4.9.0
+
+Copyright (c) 2016 Eric Biederman <ebiederm@xmission.com>
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/user.
+
+The files in this directory can be used to override the default
+limits on the number of namespaces and other objects that have
+per user per user namespace limits.
+
+The primary purpose of these limits is to stop programs that
+malfunction and attempt to create a ridiculous number of objects,
+before the malfunction becomes a system wide problem. It is the
+intention that the defaults of these limits are set high enough that
+no program in normal operation should run into these limits.
+
+The creation of per user per user namespace objects are charged to
+the user in the user namespace who created the object and
+verified to be below the per user limit in that user namespace.
+
+The creation of objects is also charged to all of the users
+who created user namespaces the creation of the object happens
+in (user namespaces can be nested) and verified to be below the per user
+limits in the user namespaces of those users.
+
+This recursive counting of created objects ensures that creating a
+user namespace does not allow a user to escape their current limits.
+
+Currently, these files are in /proc/sys/user:
+
+max_cgroup_namespaces
+=====================
+
+ The maximum number of cgroup namespaces that any user in the current
+ user namespace may create.
+
+max_ipc_namespaces
+==================
+
+ The maximum number of ipc namespaces that any user in the current
+ user namespace may create.
+
+max_mnt_namespaces
+==================
+
+ The maximum number of mount namespaces that any user in the current
+ user namespace may create.
+
+max_net_namespaces
+==================
+
+ The maximum number of network namespaces that any user in the
+ current user namespace may create.
+
+max_pid_namespaces
+==================
+
+ The maximum number of pid namespaces that any user in the current
+ user namespace may create.
+
+max_user_namespaces
+===================
+
+ The maximum number of user namespaces that any user in the current
+ user namespace may create.
+
+max_uts_namespaces
+==================
+
+ The maximum number of user namespaces that any user in the current
+ user namespace may create.
diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
new file mode 100644
index 000000000000..64aeee1009ca
--- /dev/null
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -0,0 +1,964 @@
+===============================
+Documentation for /proc/sys/vm/
+===============================
+
+kernel version 2.6.29
+
+Copyright (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
+
+Copyright (c) 2008 Peter W. Morreale <pmorreale@novell.com>
+
+For general info and legal blurb, please look in index.rst.
+
+------------------------------------------------------------------------------
+
+This file contains the documentation for the sysctl files in
+/proc/sys/vm and is valid for Linux kernel version 2.6.29.
+
+The files in this directory can be used to tune the operation
+of the virtual memory (VM) subsystem of the Linux kernel and
+the writeout of dirty data to disk.
+
+Default values and initialization routines for most of these
+files can be found in mm/swap.c.
+
+Currently, these files are in /proc/sys/vm:
+
+- admin_reserve_kbytes
+- block_dump
+- compact_memory
+- compact_unevictable_allowed
+- dirty_background_bytes
+- dirty_background_ratio
+- dirty_bytes
+- dirty_expire_centisecs
+- dirty_ratio
+- dirtytime_expire_seconds
+- dirty_writeback_centisecs
+- drop_caches
+- extfrag_threshold
+- hugetlb_shm_group
+- laptop_mode
+- legacy_va_layout
+- lowmem_reserve_ratio
+- max_map_count
+- memory_failure_early_kill
+- memory_failure_recovery
+- min_free_kbytes
+- min_slab_ratio
+- min_unmapped_ratio
+- mmap_min_addr
+- mmap_rnd_bits
+- mmap_rnd_compat_bits
+- nr_hugepages
+- nr_hugepages_mempolicy
+- nr_overcommit_hugepages
+- nr_trim_pages (only if CONFIG_MMU=n)
+- numa_zonelist_order
+- oom_dump_tasks
+- oom_kill_allocating_task
+- overcommit_kbytes
+- overcommit_memory
+- overcommit_ratio
+- page-cluster
+- panic_on_oom
+- percpu_pagelist_fraction
+- stat_interval
+- stat_refresh
+- numa_stat
+- swappiness
+- unprivileged_userfaultfd
+- user_reserve_kbytes
+- vfs_cache_pressure
+- watermark_boost_factor
+- watermark_scale_factor
+- zone_reclaim_mode
+
+
+admin_reserve_kbytes
+====================
+
+The amount of free memory in the system that should be reserved for users
+with the capability cap_sys_admin.
+
+admin_reserve_kbytes defaults to min(3% of free pages, 8MB)
+
+That should provide enough for the admin to log in and kill a process,
+if necessary, under the default overcommit 'guess' mode.
+
+Systems running under overcommit 'never' should increase this to account
+for the full Virtual Memory Size of programs used to recover. Otherwise,
+root may not be able to log in to recover the system.
+
+How do you calculate a minimum useful reserve?
+
+sshd or login + bash (or some other shell) + top (or ps, kill, etc.)
+
+For overcommit 'guess', we can sum resident set sizes (RSS).
+On x86_64 this is about 8MB.
+
+For overcommit 'never', we can take the max of their virtual sizes (VSZ)
+and add the sum of their RSS.
+On x86_64 this is about 128MB.
+
+Changing this takes effect whenever an application requests memory.
+
+
+block_dump
+==========
+
+block_dump enables block I/O debugging when set to a nonzero value. More
+information on block I/O debugging is in Documentation/admin-guide/laptops/laptop-mode.rst.
+
+
+compact_memory
+==============
+
+Available only when CONFIG_COMPACTION is set. When 1 is written to the file,
+all zones are compacted such that free memory is available in contiguous
+blocks where possible. This can be important for example in the allocation of
+huge pages although processes will also directly compact memory as required.
+
+
+compact_unevictable_allowed
+===========================
+
+Available only when CONFIG_COMPACTION is set. When set to 1, compaction is
+allowed to examine the unevictable lru (mlocked pages) for pages to compact.
+This should be used on systems where stalls for minor page faults are an
+acceptable trade for large contiguous free memory. Set to 0 to prevent
+compaction from moving pages that are unevictable. Default value is 1.
+
+
+dirty_background_bytes
+======================
+
+Contains the amount of dirty memory at which the background kernel
+flusher threads will start writeback.
+
+Note:
+ dirty_background_bytes is the counterpart of dirty_background_ratio. Only
+ one of them may be specified at a time. When one sysctl is written it is
+ immediately taken into account to evaluate the dirty memory limits and the
+ other appears as 0 when read.
+
+
+dirty_background_ratio
+======================
+
+Contains, as a percentage of total available memory that contains free pages
+and reclaimable pages, the number of pages at which the background kernel
+flusher threads will start writing out dirty data.
+
+The total available memory is not equal to total system memory.
+
+
+dirty_bytes
+===========
+
+Contains the amount of dirty memory at which a process generating disk writes
+will itself start writeback.
+
+Note: dirty_bytes is the counterpart of dirty_ratio. Only one of them may be
+specified at a time. When one sysctl is written it is immediately taken into
+account to evaluate the dirty memory limits and the other appears as 0 when
+read.
+
+Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
+value lower than this limit will be ignored and the old configuration will be
+retained.
+
+
+dirty_expire_centisecs
+======================
+
+This tunable is used to define when dirty data is old enough to be eligible
+for writeout by the kernel flusher threads. It is expressed in 100'ths
+of a second. Data which has been dirty in-memory for longer than this
+interval will be written out next time a flusher thread wakes up.
+
+
+dirty_ratio
+===========
+
+Contains, as a percentage of total available memory that contains free pages
+and reclaimable pages, the number of pages at which a process which is
+generating disk writes will itself start writing out dirty data.
+
+The total available memory is not equal to total system memory.
+
+
+dirtytime_expire_seconds
+========================
+
+When a lazytime inode is constantly having its pages dirtied, the inode with
+an updated timestamp will never get chance to be written out. And, if the
+only thing that has happened on the file system is a dirtytime inode caused
+by an atime update, a worker will be scheduled to make sure that inode
+eventually gets pushed out to disk. This tunable is used to define when dirty
+inode is old enough to be eligible for writeback by the kernel flusher threads.
+And, it is also used as the interval to wakeup dirtytime_writeback thread.
+
+
+dirty_writeback_centisecs
+=========================
+
+The kernel flusher threads will periodically wake up and write `old` data
+out to disk. This tunable expresses the interval between those wakeups, in
+100'ths of a second.
+
+Setting this to zero disables periodic writeback altogether.
+
+
+drop_caches
+===========
+
+Writing to this will cause the kernel to drop clean caches, as well as
+reclaimable slab objects like dentries and inodes. Once dropped, their
+memory becomes free.
+
+To free pagecache::
+
+ echo 1 > /proc/sys/vm/drop_caches
+
+To free reclaimable slab objects (includes dentries and inodes)::
+
+ echo 2 > /proc/sys/vm/drop_caches
+
+To free slab objects and pagecache::
+
+ echo 3 > /proc/sys/vm/drop_caches
+
+This is a non-destructive operation and will not free any dirty objects.
+To increase the number of objects freed by this operation, the user may run
+`sync` prior to writing to /proc/sys/vm/drop_caches. This will minimize the
+number of dirty objects on the system and create more candidates to be
+dropped.
+
+This file is not a means to control the growth of the various kernel caches
+(inodes, dentries, pagecache, etc...) These objects are automatically
+reclaimed by the kernel when memory is needed elsewhere on the system.
+
+Use of this file can cause performance problems. Since it discards cached
+objects, it may cost a significant amount of I/O and CPU to recreate the
+dropped objects, especially if they were under heavy use. Because of this,
+use outside of a testing or debugging environment is not recommended.
+
+You may see informational messages in your kernel log when this file is
+used::
+
+ cat (1234): drop_caches: 3
+
+These are informational only. They do not mean that anything is wrong
+with your system. To disable them, echo 4 (bit 2) into drop_caches.
+
+
+extfrag_threshold
+=================
+
+This parameter affects whether the kernel will compact memory or direct
+reclaim to satisfy a high-order allocation. The extfrag/extfrag_index file in
+debugfs shows what the fragmentation index for each order is in each zone in
+the system. Values tending towards 0 imply allocations would fail due to lack
+of memory, values towards 1000 imply failures are due to fragmentation and -1
+implies that the allocation will succeed as long as watermarks are met.
+
+The kernel will not compact memory in a zone if the
+fragmentation index is <= extfrag_threshold. The default value is 500.
+
+
+highmem_is_dirtyable
+====================
+
+Available only for systems with CONFIG_HIGHMEM enabled (32b systems).
+
+This parameter controls whether the high memory is considered for dirty
+writers throttling. This is not the case by default which means that
+only the amount of memory directly visible/usable by the kernel can
+be dirtied. As a result, on systems with a large amount of memory and
+lowmem basically depleted writers might be throttled too early and
+streaming writes can get very slow.
+
+Changing the value to non zero would allow more memory to be dirtied
+and thus allow writers to write more data which can be flushed to the
+storage more effectively. Note this also comes with a risk of pre-mature
+OOM killer because some writers (e.g. direct block device writes) can
+only use the low memory and they can fill it up with dirty data without
+any throttling.
+
+
+hugetlb_shm_group
+=================
+
+hugetlb_shm_group contains group id that is allowed to create SysV
+shared memory segment using hugetlb page.
+
+
+laptop_mode
+===========
+
+laptop_mode is a knob that controls "laptop mode". All the things that are
+controlled by this knob are discussed in Documentation/admin-guide/laptops/laptop-mode.rst.
+
+
+legacy_va_layout
+================
+
+If non-zero, this sysctl disables the new 32-bit mmap layout - the kernel
+will use the legacy (2.4) layout for all processes.
+
+
+lowmem_reserve_ratio
+====================
+
+For some specialised workloads on highmem machines it is dangerous for
+the kernel to allow process memory to be allocated from the "lowmem"
+zone. This is because that memory could then be pinned via the mlock()
+system call, or by unavailability of swapspace.
+
+And on large highmem machines this lack of reclaimable lowmem memory
+can be fatal.
+
+So the Linux page allocator has a mechanism which prevents allocations
+which *could* use highmem from using too much lowmem. This means that
+a certain amount of lowmem is defended from the possibility of being
+captured into pinned user memory.
+
+(The same argument applies to the old 16 megabyte ISA DMA region. This
+mechanism will also defend that region from allocations which could use
+highmem or lowmem).
+
+The `lowmem_reserve_ratio` tunable determines how aggressive the kernel is
+in defending these lower zones.
+
+If you have a machine which uses highmem or ISA DMA and your
+applications are using mlock(), or if you are running with no swap then
+you probably should change the lowmem_reserve_ratio setting.
+
+The lowmem_reserve_ratio is an array. You can see them by reading this file::
+
+ % cat /proc/sys/vm/lowmem_reserve_ratio
+ 256 256 32
+
+But, these values are not used directly. The kernel calculates # of protection
+pages for each zones from them. These are shown as array of protection pages
+in /proc/zoneinfo like followings. (This is an example of x86-64 box).
+Each zone has an array of protection pages like this::
+
+ Node 0, zone DMA
+ pages free 1355
+ min 3
+ low 3
+ high 4
+ :
+ :
+ numa_other 0
+ protection: (0, 2004, 2004, 2004)
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ pagesets
+ cpu: 0 pcp: 0
+ :
+
+These protections are added to score to judge whether this zone should be used
+for page allocation or should be reclaimed.
+
+In this example, if normal pages (index=2) are required to this DMA zone and
+watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
+not be used because pages_free(1355) is smaller than watermark + protection[2]
+(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
+normal page requirement. If requirement is DMA zone(index=0), protection[0]
+(=0) is used.
+
+zone[i]'s protection[j] is calculated by following expression::
+
+ (i < j):
+ zone[i]->protection[j]
+ = (total sums of managed_pages from zone[i+1] to zone[j] on the node)
+ / lowmem_reserve_ratio[i];
+ (i = j):
+ (should not be protected. = 0;
+ (i > j):
+ (not necessary, but looks 0)
+
+The default values of lowmem_reserve_ratio[i] are
+
+ === ====================================
+ 256 (if zone[i] means DMA or DMA32 zone)
+ 32 (others)
+ === ====================================
+
+As above expression, they are reciprocal number of ratio.
+256 means 1/256. # of protection pages becomes about "0.39%" of total managed
+pages of higher zones on the node.
+
+If you would like to protect more pages, smaller values are effective.
+The minimum value is 1 (1/1 -> 100%). The value less than 1 completely
+disables protection of the pages.
+
+
+max_map_count:
+==============
+
+This file contains the maximum number of memory map areas a process
+may have. Memory map areas are used as a side-effect of calling
+malloc, directly by mmap, mprotect, and madvise, and also when loading
+shared libraries.
+
+While most applications need less than a thousand maps, certain
+programs, particularly malloc debuggers, may consume lots of them,
+e.g., up to one or two maps per allocation.
+
+The default value is 65536.
+
+
+memory_failure_early_kill:
+==========================
+
+Control how to kill processes when uncorrected memory error (typically
+a 2bit error in a memory module) is detected in the background by hardware
+that cannot be handled by the kernel. In some cases (like the page
+still having a valid copy on disk) the kernel will handle the failure
+transparently without affecting any applications. But if there is
+no other uptodate copy of the data it will kill to prevent any data
+corruptions from propagating.
+
+1: Kill all processes that have the corrupted and not reloadable page mapped
+as soon as the corruption is detected. Note this is not supported
+for a few types of pages, like kernel internally allocated data or
+the swap cache, but works for the majority of user pages.
+
+0: Only unmap the corrupted page from all processes and only kill a process
+who tries to access it.
+
+The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+handle this if they want to.
+
+This is only active on architectures/platforms with advanced machine
+check handling and depends on the hardware capabilities.
+
+Applications can override this setting individually with the PR_MCE_KILL prctl
+
+
+memory_failure_recovery
+=======================
+
+Enable memory failure recovery (when supported by the platform)
+
+1: Attempt recovery.
+
+0: Always panic on a memory failure.
+
+
+min_free_kbytes
+===============
+
+This is used to force the Linux VM to keep a minimum number
+of kilobytes free. The VM uses this number to compute a
+watermark[WMARK_MIN] value for each lowmem zone in the system.
+Each lowmem zone gets a number of reserved free pages based
+proportionally on its size.
+
+Some minimal amount of memory is needed to satisfy PF_MEMALLOC
+allocations; if you set this to lower than 1024KB, your system will
+become subtly broken, and prone to deadlock under high loads.
+
+Setting this too high will OOM your machine instantly.
+
+
+min_slab_ratio
+==============
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone. On Zone reclaim
+(fallback from the local zone occurs) slabs will be reclaimed if more
+than this percentage of pages in a zone are reclaimable slab pages.
+This insures that the slab growth stays under control even in NUMA
+systems that rarely perform global reclaim.
+
+The default is 5 percent.
+
+Note that slab reclaim is triggered in a per zone / node fashion.
+The process of reclaiming slab memory is currently not node specific
+and may not be fast.
+
+
+min_unmapped_ratio
+==================
+
+This is available only on NUMA kernels.
+
+This is a percentage of the total pages in each zone. Zone reclaim will
+only occur if more than this percentage of pages are in a state that
+zone_reclaim_mode allows to be reclaimed.
+
+If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
+against all file-backed unmapped pages including swapcache pages and tmpfs
+files. Otherwise, only unmapped pages backed by normal files but not tmpfs
+files and similar are considered.
+
+The default is 1 percent.
+
+
+mmap_min_addr
+=============
+
+This file indicates the amount of address space which a user process will
+be restricted from mmapping. Since kernel null dereference bugs could
+accidentally operate based on the information in the first couple of pages
+of memory userspace processes should not be allowed to write to them. By
+default this value is set to 0 and no protections will be enforced by the
+security module. Setting this value to something like 64k will allow the
+vast majority of applications to work correctly and provide defense in depth
+against future potential kernel bugs.
+
+
+mmap_rnd_bits
+=============
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations on architectures which support
+tuning address space randomization. This value will be bounded
+by the architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_bits tunable
+
+
+mmap_rnd_compat_bits
+====================
+
+This value can be used to select the number of bits to use to
+determine the random offset to the base address of vma regions
+resulting from mmap allocations for applications run in
+compatibility mode on architectures which support tuning address
+space randomization. This value will be bounded by the
+architecture's minimum and maximum supported values.
+
+This value can be changed after boot using the
+/proc/sys/vm/mmap_rnd_compat_bits tunable
+
+
+nr_hugepages
+============
+
+Change the minimum size of the hugepage pool.
+
+See Documentation/admin-guide/mm/hugetlbpage.rst
+
+
+nr_hugepages_mempolicy
+======================
+
+Change the size of the hugepage pool at run-time on a specific
+set of NUMA nodes.
+
+See Documentation/admin-guide/mm/hugetlbpage.rst
+
+
+nr_overcommit_hugepages
+=======================
+
+Change the maximum size of the hugepage pool. The maximum is
+nr_hugepages + nr_overcommit_hugepages.
+
+See Documentation/admin-guide/mm/hugetlbpage.rst
+
+
+nr_trim_pages
+=============
+
+This is available only on NOMMU kernels.
+
+This value adjusts the excess page trimming behaviour of power-of-2 aligned
+NOMMU mmap allocations.
+
+A value of 0 disables trimming of allocations entirely, while a value of 1
+trims excess pages aggressively. Any value >= 1 acts as the watermark where
+trimming of allocations is initiated.
+
+The default value is 1.
+
+See Documentation/nommu-mmap.txt for more information.
+
+
+numa_zonelist_order
+===================
+
+This sysctl is only for NUMA and it is deprecated. Anything but
+Node order will fail!
+
+'where the memory is allocated from' is controlled by zonelists.
+
+(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
+you may be able to read ZONE_DMA as ZONE_DMA32...)
+
+In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following.
+ZONE_NORMAL -> ZONE_DMA
+This means that a memory allocation request for GFP_KERNEL will
+get memory from ZONE_DMA only when ZONE_NORMAL is not available.
+
+In NUMA case, you can think of following 2 types of order.
+Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL::
+
+ (A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL
+ (B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA.
+
+Type(A) offers the best locality for processes on Node(0), but ZONE_DMA
+will be used before ZONE_NORMAL exhaustion. This increases possibility of
+out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small.
+
+Type(B) cannot offer the best locality but is more robust against OOM of
+the DMA zone.
+
+Type(A) is called as "Node" order. Type (B) is "Zone" order.
+
+"Node order" orders the zonelists by node, then by zone within each node.
+Specify "[Nn]ode" for node order
+
+"Zone Order" orders the zonelists by zone type, then by node within each
+zone. Specify "[Zz]one" for zone order.
+
+Specify "[Dd]efault" to request automatic configuration.
+
+On 32-bit, the Normal zone needs to be preserved for allocations accessible
+by the kernel, so "zone" order will be selected.
+
+On 64-bit, devices that require DMA32/DMA are relatively rare, so "node"
+order will be selected.
+
+Default order is recommended unless this is causing problems for your
+system/application.
+
+
+oom_dump_tasks
+==============
+
+Enables a system-wide task dump (excluding kernel threads) to be produced
+when the kernel performs an OOM-killing and includes such information as
+pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
+score, and name. This is helpful to determine why the OOM killer was
+invoked, to identify the rogue task that caused it, and to determine why
+the OOM killer chose the task it did to kill.
+
+If this is set to zero, this information is suppressed. On very
+large systems with thousands of tasks it may not be feasible to dump
+the memory state information for each one. Such systems should not
+be forced to incur a performance penalty in OOM conditions when the
+information may not be desired.
+
+If this is set to non-zero, this information is shown whenever the
+OOM killer actually kills a memory-hogging task.
+
+The default value is 1 (enabled).
+
+
+oom_kill_allocating_task
+========================
+
+This enables or disables killing the OOM-triggering task in
+out-of-memory situations.
+
+If this is set to zero, the OOM killer will scan through the entire
+tasklist and select a task based on heuristics to kill. This normally
+selects a rogue memory-hogging task that frees up a large amount of
+memory when killed.
+
+If this is set to non-zero, the OOM killer simply kills the task that
+triggered the out-of-memory condition. This avoids the expensive
+tasklist scan.
+
+If panic_on_oom is selected, it takes precedence over whatever value
+is used in oom_kill_allocating_task.
+
+The default value is 0.
+
+
+overcommit_kbytes
+=================
+
+When overcommit_memory is set to 2, the committed address space is not
+permitted to exceed swap plus this amount of physical RAM. See below.
+
+Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
+of them may be specified at a time. Setting one disables the other (which
+then appears as 0 when read).
+
+
+overcommit_memory
+=================
+
+This value contains a flag that enables memory overcommitment.
+
+When this flag is 0, the kernel attempts to estimate the amount
+of free memory left when userspace requests more memory.
+
+When this flag is 1, the kernel pretends there is always enough
+memory until it actually runs out.
+
+When this flag is 2, the kernel uses a "never overcommit"
+policy that attempts to prevent any overcommit of memory.
+Note that user_reserve_kbytes affects this policy.
+
+This feature can be very useful because there are a lot of
+programs that malloc() huge amounts of memory "just-in-case"
+and don't use much of it.
+
+The default value is 0.
+
+See Documentation/vm/overcommit-accounting.rst and
+mm/util.c::__vm_enough_memory() for more information.
+
+
+overcommit_ratio
+================
+
+When overcommit_memory is set to 2, the committed address
+space is not permitted to exceed swap plus this percentage
+of physical RAM. See above.
+
+
+page-cluster
+============
+
+page-cluster controls the number of pages up to which consecutive pages
+are read in from swap in a single attempt. This is the swap counterpart
+to page cache readahead.
+The mentioned consecutivity is not in terms of virtual/physical addresses,
+but consecutive on swap space - that means they were swapped out together.
+
+It is a logarithmic value - setting it to zero means "1 page", setting
+it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
+Zero disables swap readahead completely.
+
+The default value is three (eight pages at a time). There may be some
+small benefits in tuning this to a different value if your workload is
+swap-intensive.
+
+Lower values mean lower latencies for initial faults, but at the same time
+extra faults and I/O delays for following faults if they would have been part of
+that consecutive pages readahead would have brought in.
+
+
+panic_on_oom
+============
+
+This enables or disables panic on out-of-memory feature.
+
+If this is set to 0, the kernel will kill some rogue process,
+called oom_killer. Usually, oom_killer can kill rogue processes and
+system will survive.
+
+If this is set to 1, the kernel panics when out-of-memory happens.
+However, if a process limits using nodes by mempolicy/cpusets,
+and those nodes become memory exhaustion status, one process
+may be killed by oom-killer. No panic occurs in this case.
+Because other nodes' memory may be free. This means system total status
+may be not fatal yet.
+
+If this is set to 2, the kernel panics compulsorily even on the
+above-mentioned. Even oom happens under memory cgroup, the whole
+system panics.
+
+The default value is 0.
+
+1 and 2 are for failover of clustering. Please select either
+according to your policy of failover.
+
+panic_on_oom=2+kdump gives you very strong tool to investigate
+why oom happens. You can get snapshot.
+
+
+percpu_pagelist_fraction
+========================
+
+This is the fraction of pages at most (high mark pcp->high) in each zone that
+are allocated for each per cpu page list. The min value for this is 8. It
+means that we don't allow more than 1/8th of pages in each zone to be
+allocated in any single per_cpu_pagelist. This entry only changes the value
+of hot per cpu pagelists. User can specify a number like 100 to allocate
+1/100th of each zone to each per cpu page list.
+
+The batch value of each per cpu pagelist is also updated as a result. It is
+set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
+
+The initial value is zero. Kernel does not use this value at boot time to set
+the high water marks for each per cpu page list. If the user writes '0' to this
+sysctl, it will revert to this default behavior.
+
+
+stat_interval
+=============
+
+The time interval between which vm statistics are updated. The default
+is 1 second.
+
+
+stat_refresh
+============
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+
+numa_stat
+=========
+
+This interface allows runtime configuration of numa statistics.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do::
+
+ echo 0 > /proc/sys/vm/numa_stat
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do::
+
+ echo 1 > /proc/sys/vm/numa_stat
+
+
+swappiness
+==========
+
+This control is used to define how aggressive the kernel will swap
+memory pages. Higher values will increase aggressiveness, lower values
+decrease the amount of swap. A value of 0 instructs the kernel not to
+initiate swap until the amount of free and file-backed pages is less
+than the high water mark in a zone.
+
+The default value is 60.
+
+
+unprivileged_userfaultfd
+========================
+
+This flag controls whether unprivileged users can use the userfaultfd
+system calls. Set this to 1 to allow unprivileged users to use the
+userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
+privileged users (with SYS_CAP_PTRACE capability).
+
+The default value is 1.
+
+
+user_reserve_kbytes
+===================
+
+When overcommit_memory is set to 2, "never overcommit" mode, reserve
+min(3% of current process size, user_reserve_kbytes) of free memory.
+This is intended to prevent a user from starting a single memory hogging
+process, such that they cannot recover (kill the hog).
+
+user_reserve_kbytes defaults to min(3% of the current process size, 128MB).
+
+If this is reduced to zero, then the user will be allowed to allocate
+all free memory with a single process, minus admin_reserve_kbytes.
+Any subsequent attempts to execute a command will result in
+"fork: Cannot allocate memory".
+
+Changing this takes effect whenever an application requests memory.
+
+
+vfs_cache_pressure
+==================
+
+This percentage value controls the tendency of the kernel to reclaim
+the memory which is used for caching of directory and inode objects.
+
+At the default value of vfs_cache_pressure=100 the kernel will attempt to
+reclaim dentries and inodes at a "fair" rate with respect to pagecache and
+swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
+to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
+never reclaim dentries and inodes due to memory pressure and this can easily
+lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
+causes the kernel to prefer to reclaim dentries and inodes.
+
+Increasing vfs_cache_pressure significantly beyond 100 may have negative
+performance impact. Reclaim code needs to take various locks to find freeable
+directory and inode objects. With vfs_cache_pressure=1000, it will look for
+ten times more freeable objects than there are.
+
+
+watermark_boost_factor
+======================
+
+This factor controls the level of reclaim when memory is being fragmented.
+It defines the percentage of the high watermark of a zone that will be
+reclaimed if pages of different mobility are being mixed within pageblocks.
+The intent is that compaction has less work to do in the future and to
+increase the success rate of future high-order allocations such as SLUB
+allocations, THP and hugetlbfs pages.
+
+To make it sensible with respect to the watermark_scale_factor
+parameter, the unit is in fractions of 10,000. The default value of
+15,000 on !DISCONTIGMEM configurations means that up to 150% of the high
+watermark will be reclaimed in the event of a pageblock being mixed due
+to fragmentation. The level of reclaim is determined by the number of
+fragmentation events that occurred in the recent past. If this value is
+smaller than a pageblock then a pageblocks worth of pages will be reclaimed
+(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature.
+
+
+watermark_scale_factor
+======================
+
+This factor controls the aggressiveness of kswapd. It defines the
+amount of memory left in a node/system before kswapd is woken up and
+how much memory needs to be free before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 10 means the
+distances between watermarks are 0.1% of the available memory in the
+node/system. The maximum value is 1000, or 10% of memory.
+
+A high rate of threads entering direct reclaim (allocstall) or kswapd
+going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
+that the number of free pages kswapd maintains for latency reasons is
+too small for the allocation bursts occurring in the system. This knob
+can then be used to tune kswapd aggressiveness accordingly.
+
+
+zone_reclaim_mode
+=================
+
+Zone_reclaim_mode allows someone to set more or less aggressive approaches to
+reclaim memory when a zone runs out of memory. If it is set to zero then no
+zone reclaim occurs. Allocations will be satisfied from other zones / nodes
+in the system.
+
+This is value OR'ed together of
+
+= ===================================
+1 Zone reclaim on
+2 Zone reclaim writes dirty pages out
+4 Zone reclaim swaps pages
+= ===================================
+
+zone_reclaim_mode is disabled by default. For file servers or workloads
+that benefit from having their data cached, zone_reclaim_mode should be
+left disabled as the caching effect is likely to be more important than
+data locality.
+
+zone_reclaim may be enabled if it's known that the workload is partitioned
+such that each partition fits within a NUMA node and that accessing remote
+memory would cause a measurable performance reduction. The page allocator
+will then reclaim easily reusable pages (those page cache pages that are
+currently not used) before allocating off node pages.
+
+Allowing zone reclaim to write out pages stops processes that are
+writing large amounts of data from dirtying pages on other nodes. Zone
+reclaim will write out dirty pages if a zone fills up and so effectively
+throttle the process. This may decrease the performance of a single process
+since it cannot use all of system memory to buffer the outgoing writes
+anymore but it preserve the memory on other nodes so that the performance
+of other processes running on other nodes will not be affected.
+
+Allowing regular swap effectively restricts allocations to the local
+node unless explicitly overridden by memory policies or cpuset
+configurations.
diff --git a/Documentation/video-output.txt b/Documentation/admin-guide/video-output.rst
index 56d6fa2e2368..56d6fa2e2368 100644
--- a/Documentation/video-output.txt
+++ b/Documentation/admin-guide/video-output.rst
diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst
new file mode 100644
index 000000000000..e76665a8f2f2
--- /dev/null
+++ b/Documentation/admin-guide/xfs.rst
@@ -0,0 +1,466 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======================
+The SGI XFS Filesystem
+======================
+
+XFS is a high performance journaling filesystem which originated
+on the SGI IRIX platform. It is completely multi-threaded, can
+support large files and large filesystems, extended attributes,
+variable block sizes, is extent based, and makes extensive use of
+Btrees (directories, extents, free space) to aid both performance
+and scalability.
+
+Refer to the documentation at https://xfs.wiki.kernel.org/
+for further details. This implementation is on-disk compatible
+with the IRIX version of XFS.
+
+
+Mount Options
+=============
+
+When mounting an XFS filesystem, the following options are accepted.
+
+ allocsize=size
+ Sets the buffered I/O end-of-file preallocation size when
+ doing delayed allocation writeout (default size is 64KiB).
+ Valid values for this option are page size (typically 4KiB)
+ through to 1GiB, inclusive, in power-of-2 increments.
+
+ The default behaviour is for dynamic end-of-file
+ preallocation size, which uses a set of heuristics to
+ optimise the preallocation size based on the current
+ allocation patterns within the file and the access patterns
+ to the file. Specifying a fixed ``allocsize`` value turns off
+ the dynamic behaviour.
+
+ attr2 or noattr2
+ The options enable/disable an "opportunistic" improvement to
+ be made in the way inline extended attributes are stored
+ on-disk. When the new form is used for the first time when
+ ``attr2`` is selected (either when setting or removing extended
+ attributes) the on-disk superblock feature bit field will be
+ updated to reflect this format being in use.
+
+ The default behaviour is determined by the on-disk feature
+ bit indicating that ``attr2`` behaviour is active. If either
+ mount option is set, then that becomes the new default used
+ by the filesystem.
+
+ CRC enabled filesystems always use the ``attr2`` format, and so
+ will reject the ``noattr2`` mount option if it is set.
+
+ discard or nodiscard (default)
+ Enable/disable the issuing of commands to let the block
+ device reclaim space freed by the filesystem. This is
+ useful for SSD devices, thinly provisioned LUNs and virtual
+ machine images, but may have a performance impact.
+
+ Note: It is currently recommended that you use the ``fstrim``
+ application to ``discard`` unused blocks rather than the ``discard``
+ mount option because the performance impact of this option
+ is quite severe.
+
+ grpid/bsdgroups or nogrpid/sysvgroups (default)
+ These options define what group ID a newly created file
+ gets. When ``grpid`` is set, it takes the group ID of the
+ directory in which it is created; otherwise it takes the
+ ``fsgid`` of the current process, unless the directory has the
+ ``setgid`` bit set, in which case it takes the ``gid`` from the
+ parent directory, and also gets the ``setgid`` bit set if it is
+ a directory itself.
+
+ filestreams
+ Make the data allocator use the filestreams allocation mode
+ across the entire filesystem rather than just on directories
+ configured to use it.
+
+ ikeep or noikeep (default)
+ When ``ikeep`` is specified, XFS does not delete empty inode
+ clusters and keeps them around on disk. When ``noikeep`` is
+ specified, empty inode clusters are returned to the free
+ space pool.
+
+ inode32 or inode64 (default)
+ When ``inode32`` is specified, it indicates that XFS limits
+ inode creation to locations which will not result in inode
+ numbers with more than 32 bits of significance.
+
+ When ``inode64`` is specified, it indicates that XFS is allowed
+ to create inodes at any location in the filesystem,
+ including those which will result in inode numbers occupying
+ more than 32 bits of significance.
+
+ ``inode32`` is provided for backwards compatibility with older
+ systems and applications, since 64 bits inode numbers might
+ cause problems for some applications that cannot handle
+ large inode numbers. If applications are in use which do
+ not handle inode numbers bigger than 32 bits, the ``inode32``
+ option should be specified.
+
+ largeio or nolargeio (default)
+ If ``nolargeio`` is specified, the optimal I/O reported in
+ ``st_blksize`` by **stat(2)** will be as small as possible to allow
+ user applications to avoid inefficient read/modify/write
+ I/O. This is typically the page size of the machine, as
+ this is the granularity of the page cache.
+
+ If ``largeio`` is specified, a filesystem that was created with a
+ ``swidth`` specified will return the ``swidth`` value (in bytes)
+ in ``st_blksize``. If the filesystem does not have a ``swidth``
+ specified but does specify an ``allocsize`` then ``allocsize``
+ (in bytes) will be returned instead. Otherwise the behaviour
+ is the same as if ``nolargeio`` was specified.
+
+ logbufs=value
+ Set the number of in-memory log buffers. Valid numbers
+ range from 2-8 inclusive.
+
+ The default value is 8 buffers.
+
+ If the memory cost of 8 log buffers is too high on small
+ systems, then it may be reduced at some cost to performance
+ on metadata intensive workloads. The ``logbsize`` option below
+ controls the size of each buffer and so is also relevant to
+ this case.
+
+ logbsize=value
+ Set the size of each in-memory log buffer. The size may be
+ specified in bytes, or in kilobytes with a "k" suffix.
+ Valid sizes for version 1 and version 2 logs are 16384 (16k)
+ and 32768 (32k). Valid sizes for version 2 logs also
+ include 65536 (64k), 131072 (128k) and 262144 (256k). The
+ logbsize must be an integer multiple of the log
+ stripe unit configured at **mkfs(8)** time.
+
+ The default value for for version 1 logs is 32768, while the
+ default value for version 2 logs is MAX(32768, log_sunit).
+
+ logdev=device and rtdev=device
+ Use an external log (metadata journal) and/or real-time device.
+ An XFS filesystem has up to three parts: a data section, a log
+ section, and a real-time section. The real-time section is
+ optional, and the log section can be separate from the data
+ section or contained within it.
+
+ noalign
+ Data allocations will not be aligned at stripe unit
+ boundaries. This is only relevant to filesystems created
+ with non-zero data alignment parameters (``sunit``, ``swidth``) by
+ **mkfs(8)**.
+
+ norecovery
+ The filesystem will be mounted without running log recovery.
+ If the filesystem was not cleanly unmounted, it is likely to
+ be inconsistent when mounted in ``norecovery`` mode.
+ Some files or directories may not be accessible because of this.
+ Filesystems mounted ``norecovery`` must be mounted read-only or
+ the mount will fail.
+
+ nouuid
+ Don't check for double mounted file systems using the file
+ system ``uuid``. This is useful to mount LVM snapshot volumes,
+ and often used in combination with ``norecovery`` for mounting
+ read-only snapshots.
+
+ noquota
+ Forcibly turns off all quota accounting and enforcement
+ within the filesystem.
+
+ uquota/usrquota/uqnoenforce/quota
+ User disk quota accounting enabled, and limits (optionally)
+ enforced. Refer to **xfs_quota(8)** for further details.
+
+ gquota/grpquota/gqnoenforce
+ Group disk quota accounting enabled and limits (optionally)
+ enforced. Refer to **xfs_quota(8)** for further details.
+
+ pquota/prjquota/pqnoenforce
+ Project disk quota accounting enabled and limits (optionally)
+ enforced. Refer to **xfs_quota(8)** for further details.
+
+ sunit=value and swidth=value
+ Used to specify the stripe unit and width for a RAID device
+ or a stripe volume. "value" must be specified in 512-byte
+ block units. These options are only relevant to filesystems
+ that were created with non-zero data alignment parameters.
+
+ The ``sunit`` and ``swidth`` parameters specified must be compatible
+ with the existing filesystem alignment characteristics. In
+ general, that means the only valid changes to ``sunit`` are
+ increasing it by a power-of-2 multiple. Valid ``swidth`` values
+ are any integer multiple of a valid ``sunit`` value.
+
+ Typically the only time these mount options are necessary if
+ after an underlying RAID device has had it's geometry
+ modified, such as adding a new disk to a RAID5 lun and
+ reshaping it.
+
+ swalloc
+ Data allocations will be rounded up to stripe width boundaries
+ when the current end of file is being extended and the file
+ size is larger than the stripe width size.
+
+ wsync
+ When specified, all filesystem namespace operations are
+ executed synchronously. This ensures that when the namespace
+ operation (create, unlink, etc) completes, the change to the
+ namespace is on stable storage. This is useful in HA setups
+ where failover must not result in clients seeing
+ inconsistent namespace presentation during or after a
+ failover event.
+
+
+Deprecated Mount Options
+========================
+
+=========================== ================
+ Name Removal Schedule
+=========================== ================
+=========================== ================
+
+
+Removed Mount Options
+=====================
+
+=========================== =======
+ Name Removed
+=========================== =======
+ delaylog/nodelaylog v4.0
+ ihashsize v4.0
+ irixsgid v4.0
+ osyncisdsync/osyncisosync v4.0
+ barrier v4.19
+ nobarrier v4.19
+=========================== =======
+
+sysctls
+=======
+
+The following sysctls are available for the XFS filesystem:
+
+ fs.xfs.stats_clear (Min: 0 Default: 0 Max: 1)
+ Setting this to "1" clears accumulated XFS statistics
+ in /proc/fs/xfs/stat. It then immediately resets to "0".
+
+ fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000)
+ The interval at which the filesystem flushes metadata
+ out to disk and runs internal cache cleanup routines.
+
+ fs.xfs.filestream_centisecs (Min: 1 Default: 3000 Max: 360000)
+ The interval at which the filesystem ages filestreams cache
+ references and returns timed-out AGs back to the free stream
+ pool.
+
+ fs.xfs.speculative_prealloc_lifetime
+ (Units: seconds Min: 1 Default: 300 Max: 86400)
+ The interval at which the background scanning for inodes
+ with unused speculative preallocation runs. The scan
+ removes unused preallocation from clean inodes and releases
+ the unused space back to the free pool.
+
+ fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
+ A volume knob for error reporting when internal errors occur.
+ This will generate detailed messages & backtraces for filesystem
+ shutdowns, for example. Current threshold values are:
+
+ XFS_ERRLEVEL_OFF: 0
+ XFS_ERRLEVEL_LOW: 1
+ XFS_ERRLEVEL_HIGH: 5
+
+ fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256)
+ Causes certain error conditions to call BUG(). Value is a bitmask;
+ OR together the tags which represent errors which should cause panics:
+
+ XFS_NO_PTAG 0
+ XFS_PTAG_IFLUSH 0x00000001
+ XFS_PTAG_LOGRES 0x00000002
+ XFS_PTAG_AILDELETE 0x00000004
+ XFS_PTAG_ERROR_REPORT 0x00000008
+ XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
+ XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
+ XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
+ XFS_PTAG_FSBLOCK_ZERO 0x00000080
+ XFS_PTAG_VERIFIER_ERROR 0x00000100
+
+ This option is intended for debugging only.
+
+ fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1)
+ Controls whether symlinks are created with mode 0777 (default)
+ or whether their mode is affected by the umask (irix mode).
+
+ fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1)
+ Controls files created in SGID directories.
+ If the group ID of the new file does not match the effective group
+ ID or one of the supplementary group IDs of the parent dir, the
+ ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
+ is set.
+
+ fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "sync" flag set
+ by the **xfs_io(8)** chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.inherit_nodump (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "nodump" flag set
+ by the **xfs_io(8)** chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.inherit_noatime (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "noatime" flag set
+ by the **xfs_io(8)** chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.inherit_nosymlinks (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "nosymlinks" flag set
+ by the **xfs_io(8)** chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1)
+ Setting this to "1" will cause the "nodefrag" flag set
+ by the **xfs_io(8)** chattr command on a directory to be
+ inherited by files in that directory.
+
+ fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256)
+ In "inode32" allocation mode, this option determines how many
+ files the allocator attempts to allocate in the same allocation
+ group before moving to the next allocation group. The intent
+ is to control the rate at which the allocator moves between
+ allocation groups when allocating extents for new files.
+
+Deprecated Sysctls
+==================
+
+None at present.
+
+
+Removed Sysctls
+===============
+
+ Name Removed
+ ---- -------
+ fs.xfs.xfsbufd_centisec v4.0
+ fs.xfs.age_buffer_centisecs v4.0
+
+
+Error handling
+==============
+
+XFS can act differently according to the type of error found during its
+operation. The implementation introduces the following concepts to the error
+handler:
+
+ -failure speed:
+ Defines how fast XFS should propagate an error upwards when a specific
+ error is found during the filesystem operation. It can propagate
+ immediately, after a defined number of retries, after a set time period,
+ or simply retry forever.
+
+ -error classes:
+ Specifies the subsystem the error configuration will apply to, such as
+ metadata IO or memory allocation. Different subsystems will have
+ different error handlers for which behaviour can be configured.
+
+ -error handlers:
+ Defines the behavior for a specific error.
+
+The filesystem behavior during an error can be set via ``sysfs`` files. Each
+error handler works independently - the first condition met by an error handler
+for a specific class will cause the error to be propagated rather than reset and
+retried.
+
+The action taken by the filesystem when the error is propagated is context
+dependent - it may cause a shut down in the case of an unrecoverable error,
+it may be reported back to userspace, or it may even be ignored because
+there's nothing useful we can with the error or anyone we can report it to (e.g.
+during unmount).
+
+The configuration files are organized into the following hierarchy for each
+mounted filesystem:
+
+ /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+Where:
+ <dev>
+ The short device name of the mounted filesystem. This is the same device
+ name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
+
+ <class>
+ The subsystem the error configuration belongs to. As of 4.9, the defined
+ classes are:
+
+ - "metadata": applies metadata buffer write IO
+
+ <error>
+ The individual error handler configurations.
+
+
+Each filesystem has "global" error configuration options defined in their top
+level directory:
+
+ /sys/fs/xfs/<dev>/error/
+
+ fail_at_unmount (Min: 0 Default: 1 Max: 1)
+ Defines the filesystem error behavior at unmount time.
+
+ If set to a value of 1, XFS will override all other error configurations
+ during unmount and replace them with "immediate fail" characteristics.
+ i.e. no retries, no retry timeout. This will always allow unmount to
+ succeed when there are persistent errors present.
+
+ If set to 0, the configured retry behaviour will continue until all
+ retries and/or timeouts have been exhausted. This will delay unmount
+ completion when there are persistent errors, and it may prevent the
+ filesystem from ever unmounting fully in the case of "retry forever"
+ handler configurations.
+
+ Note: there is no guarantee that fail_at_unmount can be set while an
+ unmount is in progress. It is possible that the ``sysfs`` entries are
+ removed by the unmounting filesystem before a "retry forever" error
+ handler configuration causes unmount to hang, and hence the filesystem
+ must be configured appropriately before unmount begins to prevent
+ unmount hangs.
+
+Each filesystem has specific error class handlers that define the error
+propagation behaviour for specific errors. There is also a "default" error
+handler defined, which defines the behaviour for all errors that don't have
+specific handlers defined. Where multiple retry constraints are configured for
+a single error, the first retry configuration that expires will cause the error
+to be propagated. The handler configurations are found in the directory:
+
+ /sys/fs/xfs/<dev>/error/<class>/<error>/
+
+ max_retries (Min: -1 Default: Varies Max: INTMAX)
+ Defines the allowed number of retries of a specific error before
+ the filesystem will propagate the error. The retry count for a given
+ error context (e.g. a specific metadata buffer) is reset every time
+ there is a successful completion of the operation.
+
+ Setting the value to "-1" will cause XFS to retry forever for this
+ specific error.
+
+ Setting the value to "0" will cause XFS to fail immediately when the
+ specific error is reported.
+
+ Setting the value to "N" (where 0 < N < Max) will make XFS retry the
+ operation "N" times before propagating the error.
+
+ retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day)
+ Define the amount of time (in seconds) that the filesystem is
+ allowed to retry its operations when the specific error is
+ found.
+
+ Setting the value to "-1" will allow XFS to retry forever for this
+ specific error.
+
+ Setting the value to "0" will cause XFS to fail immediately when the
+ specific error is reported.
+
+ Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
+ operation for up to "N" seconds before propagating the error.
+
+**Note:** The default behaviour for a specific error handler is dependent on both
+the class and error context. For example, the default values for
+"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
+to "fail immediately" behaviour. This is done because ENODEV is a fatal,
+unrecoverable error no matter how many times the metadata IO is retried.
diff --git a/Documentation/aoe/aoe.rst b/Documentation/aoe/aoe.rst
deleted file mode 100644
index 58747ecec71d..000000000000
--- a/Documentation/aoe/aoe.rst
+++ /dev/null
@@ -1,150 +0,0 @@
-Introduction
-============
-
-ATA over Ethernet is a network protocol that provides simple access to
-block storage on the LAN.
-
- http://support.coraid.com/documents/AoEr11.txt
-
-The EtherDrive (R) HOWTO for 2.6 and 3.x kernels is found at ...
-
- http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO.html
-
-It has many tips and hints! Please see, especially, recommended
-tunings for virtual memory:
-
- http://support.coraid.com/support/linux/EtherDrive-2.6-HOWTO-5.html#ss5.19
-
-The aoetools are userland programs that are designed to work with this
-driver. The aoetools are on sourceforge.
-
- http://aoetools.sourceforge.net/
-
-The scripts in this Documentation/aoe directory are intended to
-document the use of the driver and are not necessary if you install
-the aoetools.
-
-
-Creating Device Nodes
-=====================
-
- Users of udev should find the block device nodes created
- automatically, but to create all the necessary device nodes, use the
- udev configuration rules provided in udev.txt (in this directory).
-
- There is a udev-install.sh script that shows how to install these
- rules on your system.
-
- There is also an autoload script that shows how to edit
- /etc/modprobe.d/aoe.conf to ensure that the aoe module is loaded when
- necessary. Preloading the aoe module is preferable to autoloading,
- however, because AoE discovery takes a few seconds. It can be
- confusing when an AoE device is not present the first time the a
- command is run but appears a second later.
-
-Using Device Nodes
-==================
-
- "cat /dev/etherd/err" blocks, waiting for error diagnostic output,
- like any retransmitted packets.
-
- "echo eth2 eth4 > /dev/etherd/interfaces" tells the aoe driver to
- limit ATA over Ethernet traffic to eth2 and eth4. AoE traffic from
- untrusted networks should be ignored as a matter of security. See
- also the aoe_iflist driver option described below.
-
- "echo > /dev/etherd/discover" tells the driver to find out what AoE
- devices are available.
-
- In the future these character devices may disappear and be replaced
- by sysfs counterparts. Using the commands in aoetools insulates
- users from these implementation details.
-
- The block devices are named like this::
-
- e{shelf}.{slot}
- e{shelf}.{slot}p{part}
-
- ... so that "e0.2" is the third blade from the left (slot 2) in the
- first shelf (shelf address zero). That's the whole disk. The first
- partition on that disk would be "e0.2p1".
-
-Using sysfs
-===========
-
- Each aoe block device in /sys/block has the extra attributes of
- state, mac, and netif. The state attribute is "up" when the device
- is ready for I/O and "down" if detected but unusable. The
- "down,closewait" state shows that the device is still open and
- cannot come up again until it has been closed.
-
- The mac attribute is the ethernet address of the remote AoE device.
- The netif attribute is the network interface on the localhost
- through which we are communicating with the remote AoE device.
-
- There is a script in this directory that formats this information in
- a convenient way. Users with aoetools should use the aoe-stat
- command::
-
- root@makki root# sh Documentation/aoe/status.sh
- e10.0 eth3 up
- e10.1 eth3 up
- e10.2 eth3 up
- e10.3 eth3 up
- e10.4 eth3 up
- e10.5 eth3 up
- e10.6 eth3 up
- e10.7 eth3 up
- e10.8 eth3 up
- e10.9 eth3 up
- e4.0 eth1 up
- e4.1 eth1 up
- e4.2 eth1 up
- e4.3 eth1 up
- e4.4 eth1 up
- e4.5 eth1 up
- e4.6 eth1 up
- e4.7 eth1 up
- e4.8 eth1 up
- e4.9 eth1 up
-
- Use /sys/module/aoe/parameters/aoe_iflist (or better, the driver
- option discussed below) instead of /dev/etherd/interfaces to limit
- AoE traffic to the network interfaces in the given
- whitespace-separated list. Unlike the old character device, the
- sysfs entry can be read from as well as written to.
-
- It's helpful to trigger discovery after setting the list of allowed
- interfaces. The aoetools package provides an aoe-discover script
- for this purpose. You can also directly use the
- /dev/etherd/discover special file described above.
-
-Driver Options
-==============
-
- There is a boot option for the built-in aoe driver and a
- corresponding module parameter, aoe_iflist. Without this option,
- all network interfaces may be used for ATA over Ethernet. Here is a
- usage example for the module parameter::
-
- modprobe aoe_iflist="eth1 eth3"
-
- The aoe_deadsecs module parameter determines the maximum number of
- seconds that the driver will wait for an AoE device to provide a
- response to an AoE command. After aoe_deadsecs seconds have
- elapsed, the AoE device will be marked as "down". A value of zero
- is supported for testing purposes and makes the aoe driver keep
- trying AoE commands forever.
-
- The aoe_maxout module parameter has a default of 128. This is the
- maximum number of unresponded packets that will be sent to an AoE
- target at one time.
-
- The aoe_dyndevs module parameter defaults to 1, meaning that the
- driver will assign a block device minor number to a discovered AoE
- target based on the order of its discovery. With dynamic minor
- device numbers in use, a greater range of AoE shelf and slot
- addresses can be supported. Users with udev will never have to
- think about minor numbers. Using aoe_dyndevs=0 allows device nodes
- to be pre-created using a static minor-number scheme with the
- aoe-mkshelf script in the aoetools.
diff --git a/Documentation/aoe/index.rst b/Documentation/aoe/index.rst
deleted file mode 100644
index 4394b9b7913c..000000000000
--- a/Documentation/aoe/index.rst
+++ /dev/null
@@ -1,19 +0,0 @@
-:orphan:
-
-=======================
-ATA over Ethernet (AoE)
-=======================
-
-.. toctree::
- :maxdepth: 1
-
- aoe
- todo
- examples
-
-.. only:: subproject and html
-
- Indices
- =======
-
- * :ref:`genindex`
diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt
deleted file mode 100644
index 54feda5a0772..000000000000
--- a/Documentation/aoe/udev.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-# These rules tell udev what device nodes to create for aoe support.
-# They may be installed along the following lines. Check the section
-# 8 udev manpage to see whether your udev supports SUBSYSTEM, and
-# whether it uses one or two equal signs for SUBSYSTEM and KERNEL.
-#
-# ecashin@makki ~$ su
-# Password:
-# bash# find /etc -type f -name udev.conf
-# /etc/udev/udev.conf
-# bash# grep udev_rules= /etc/udev/udev.conf
-# udev_rules="/etc/udev/rules.d/"
-# bash# ls /etc/udev/rules.d/
-# 10-wacom.rules 50-udev.rules
-# bash# cp /path/to/linux/Documentation/aoe/udev.txt \
-# /etc/udev/rules.d/60-aoe.rules
-#
-
-# aoe char devices
-SUBSYSTEM=="aoe", KERNEL=="discover", NAME="etherd/%k", GROUP="disk", MODE="0220"
-SUBSYSTEM=="aoe", KERNEL=="err", NAME="etherd/%k", GROUP="disk", MODE="0440"
-SUBSYSTEM=="aoe", KERNEL=="interfaces", NAME="etherd/%k", GROUP="disk", MODE="0220"
-SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="0220"
-SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220"
-
-# aoe block devices
-KERNEL=="etherd*", GROUP="disk"
diff --git a/Documentation/arm/Booting b/Documentation/arm/Booting
deleted file mode 100644
index f1f965ce93d6..000000000000
--- a/Documentation/arm/Booting
+++ /dev/null
@@ -1,218 +0,0 @@
- Booting ARM Linux
- =================
-
-Author: Russell King
-Date : 18 May 2002
-
-The following documentation is relevant to 2.4.18-rmk6 and beyond.
-
-In order to boot ARM Linux, you require a boot loader, which is a small
-program that runs before the main kernel. The boot loader is expected
-to initialise various devices, and eventually call the Linux kernel,
-passing information to the kernel.
-
-Essentially, the boot loader should provide (as a minimum) the
-following:
-
-1. Setup and initialise the RAM.
-2. Initialise one serial port.
-3. Detect the machine type.
-4. Setup the kernel tagged list.
-5. Load initramfs.
-6. Call the kernel image.
-
-
-1. Setup and initialise RAM
----------------------------
-
-Existing boot loaders: MANDATORY
-New boot loaders: MANDATORY
-
-The boot loader is expected to find and initialise all RAM that the
-kernel will use for volatile data storage in the system. It performs
-this in a machine dependent manner. (It may use internal algorithms
-to automatically locate and size all RAM, or it may use knowledge of
-the RAM in the machine, or any other method the boot loader designer
-sees fit.)
-
-
-2. Initialise one serial port
------------------------------
-
-Existing boot loaders: OPTIONAL, RECOMMENDED
-New boot loaders: OPTIONAL, RECOMMENDED
-
-The boot loader should initialise and enable one serial port on the
-target. This allows the kernel serial driver to automatically detect
-which serial port it should use for the kernel console (generally
-used for debugging purposes, or communication with the target.)
-
-As an alternative, the boot loader can pass the relevant 'console='
-option to the kernel via the tagged lists specifying the port, and
-serial format options as described in
-
- Documentation/admin-guide/kernel-parameters.rst.
-
-
-3. Detect the machine type
---------------------------
-
-Existing boot loaders: OPTIONAL
-New boot loaders: MANDATORY except for DT-only platforms
-
-The boot loader should detect the machine type its running on by some
-method. Whether this is a hard coded value or some algorithm that
-looks at the connected hardware is beyond the scope of this document.
-The boot loader must ultimately be able to provide a MACH_TYPE_xxx
-value to the kernel. (see linux/arch/arm/tools/mach-types). This
-should be passed to the kernel in register r1.
-
-For DT-only platforms, the machine type will be determined by device
-tree. set the machine type to all ones (~0). This is not strictly
-necessary, but assures that it will not match any existing types.
-
-4. Setup boot data
-------------------
-
-Existing boot loaders: OPTIONAL, HIGHLY RECOMMENDED
-New boot loaders: MANDATORY
-
-The boot loader must provide either a tagged list or a dtb image for
-passing configuration data to the kernel. The physical address of the
-boot data is passed to the kernel in register r2.
-
-4a. Setup the kernel tagged list
---------------------------------
-
-The boot loader must create and initialise the kernel tagged list.
-A valid tagged list starts with ATAG_CORE and ends with ATAG_NONE.
-The ATAG_CORE tag may or may not be empty. An empty ATAG_CORE tag
-has the size field set to '2' (0x00000002). The ATAG_NONE must set
-the size field to zero.
-
-Any number of tags can be placed in the list. It is undefined
-whether a repeated tag appends to the information carried by the
-previous tag, or whether it replaces the information in its
-entirety; some tags behave as the former, others the latter.
-
-The boot loader must pass at a minimum the size and location of
-the system memory, and root filesystem location. Therefore, the
-minimum tagged list should look:
-
- +-----------+
-base -> | ATAG_CORE | |
- +-----------+ |
- | ATAG_MEM | | increasing address
- +-----------+ |
- | ATAG_NONE | |
- +-----------+ v
-
-The tagged list should be stored in system RAM.
-
-The tagged list must be placed in a region of memory where neither
-the kernel decompressor nor initrd 'bootp' program will overwrite
-it. The recommended placement is in the first 16KiB of RAM.
-
-4b. Setup the device tree
--------------------------
-
-The boot loader must load a device tree image (dtb) into system ram
-at a 64bit aligned address and initialize it with the boot data. The
-dtb format is documented in Documentation/devicetree/booting-without-of.txt.
-The kernel will look for the dtb magic value of 0xd00dfeed at the dtb
-physical address to determine if a dtb has been passed instead of a
-tagged list.
-
-The boot loader must pass at a minimum the size and location of the
-system memory, and the root filesystem location. The dtb must be
-placed in a region of memory where the kernel decompressor will not
-overwrite it, while remaining within the region which will be covered
-by the kernel's low-memory mapping.
-
-A safe location is just above the 128MiB boundary from start of RAM.
-
-5. Load initramfs.
-------------------
-
-Existing boot loaders: OPTIONAL
-New boot loaders: OPTIONAL
-
-If an initramfs is in use then, as with the dtb, it must be placed in
-a region of memory where the kernel decompressor will not overwrite it
-while also with the region which will be covered by the kernel's
-low-memory mapping.
-
-A safe location is just above the device tree blob which itself will
-be loaded just above the 128MiB boundary from the start of RAM as
-recommended above.
-
-6. Calling the kernel image
----------------------------
-
-Existing boot loaders: MANDATORY
-New boot loaders: MANDATORY
-
-There are two options for calling the kernel zImage. If the zImage
-is stored in flash, and is linked correctly to be run from flash,
-then it is legal for the boot loader to call the zImage in flash
-directly.
-
-The zImage may also be placed in system RAM and called there. The
-kernel should be placed in the first 128MiB of RAM. It is recommended
-that it is loaded above 32MiB in order to avoid the need to relocate
-prior to decompression, which will make the boot process slightly
-faster.
-
-When booting a raw (non-zImage) kernel the constraints are tighter.
-In this case the kernel must be loaded at an offset into system equal
-to TEXT_OFFSET - PAGE_OFFSET.
-
-In any case, the following conditions must be met:
-
-- Quiesce all DMA capable devices so that memory does not get
- corrupted by bogus network packets or disk data. This will save
- you many hours of debug.
-
-- CPU register settings
- r0 = 0,
- r1 = machine type number discovered in (3) above.
- r2 = physical address of tagged list in system RAM, or
- physical address of device tree block (dtb) in system RAM
-
-- CPU mode
- All forms of interrupts must be disabled (IRQs and FIQs)
-
- For CPUs which do not include the ARM virtualization extensions, the
- CPU must be in SVC mode. (A special exception exists for Angel)
-
- CPUs which include support for the virtualization extensions can be
- entered in HYP mode in order to enable the kernel to make full use of
- these extensions. This is the recommended boot method for such CPUs,
- unless the virtualisations are already in use by a pre-installed
- hypervisor.
-
- If the kernel is not entered in HYP mode for any reason, it must be
- entered in SVC mode.
-
-- Caches, MMUs
- The MMU must be off.
- Instruction cache may be on or off.
- Data cache must be off.
-
- If the kernel is entered in HYP mode, the above requirements apply to
- the HYP mode configuration in addition to the ordinary PL1 (privileged
- kernel modes) configuration. In addition, all traps into the
- hypervisor must be disabled, and PL1 access must be granted for all
- peripherals and CPU resources for which this is architecturally
- possible. Except for entering in HYP mode, the system configuration
- should be such that a kernel which does not include support for the
- virtualization extensions can boot correctly without extra help.
-
-- The boot loader is expected to call the kernel image by jumping
- directly to the first instruction of the kernel image.
-
- On CPUs supporting the ARM instruction set, the entry must be
- made in ARM state, even for a Thumb-2 kernel.
-
- On CPUs supporting only the Thumb instruction set such as
- Cortex-M class CPUs, the entry must be made in Thumb state.
diff --git a/Documentation/arm/IXP4xx b/Documentation/arm/IXP4xx
deleted file mode 100644
index e48b74de6ac0..000000000000
--- a/Documentation/arm/IXP4xx
+++ /dev/null
@@ -1,172 +0,0 @@
-
--------------------------------------------------------------------------
-Release Notes for Linux on Intel's IXP4xx Network Processor
-
-Maintained by Deepak Saxena <dsaxena@plexity.net>
--------------------------------------------------------------------------
-
-1. Overview
-
-Intel's IXP4xx network processor is a highly integrated SOC that
-is targeted for network applications, though it has become popular
-in industrial control and other areas due to low cost and power
-consumption. The IXP4xx family currently consists of several processors
-that support different network offload functions such as encryption,
-routing, firewalling, etc. The IXP46x family is an updated version which
-supports faster speeds, new memory and flash configurations, and more
-integration such as an on-chip I2C controller.
-
-For more information on the various versions of the CPU, see:
-
- http://developer.intel.com/design/network/products/npfamily/ixp4xx.htm
-
-Intel also made the IXCP1100 CPU for sometime which is an IXP4xx
-stripped of much of the network intelligence.
-
-2. Linux Support
-
-Linux currently supports the following features on the IXP4xx chips:
-
-- Dual serial ports
-- PCI interface
-- Flash access (MTD/JFFS)
-- I2C through GPIO on IXP42x
-- GPIO for input/output/interrupts
- See arch/arm/mach-ixp4xx/include/mach/platform.h for access functions.
-- Timers (watchdog, OS)
-
-The following components of the chips are not supported by Linux and
-require the use of Intel's proprietary CSR software:
-
-- USB device interface
-- Network interfaces (HSS, Utopia, NPEs, etc)
-- Network offload functionality
-
-If you need to use any of the above, you need to download Intel's
-software from:
-
- http://developer.intel.com/design/network/products/npfamily/ixp425.htm
-
-DO NOT POST QUESTIONS TO THE LINUX MAILING LISTS REGARDING THE PROPRIETARY
-SOFTWARE.
-
-There are several websites that provide directions/pointers on using
-Intel's software:
-
- http://sourceforge.net/projects/ixp4xx-osdg/
- Open Source Developer's Guide for using uClinux and the Intel libraries
-
-http://gatewaymaker.sourceforge.net/
- Simple one page summary of building a gateway using an IXP425 and Linux
-
-http://ixp425.sourceforge.net/
- ATM device driver for IXP425 that relies on Intel's libraries
-
-3. Known Issues/Limitations
-
-3a. Limited inbound PCI window
-
-The IXP4xx family allows for up to 256MB of memory but the PCI interface
-can only expose 64MB of that memory to the PCI bus. This means that if
-you are running with > 64MB, all PCI buffers outside of the accessible
-range will be bounced using the routines in arch/arm/common/dmabounce.c.
-
-3b. Limited outbound PCI window
-
-IXP4xx provides two methods of accessing PCI memory space:
-
-1) A direct mapped window from 0x48000000 to 0x4bffffff (64MB).
- To access PCI via this space, we simply ioremap() the BAR
- into the kernel and we can use the standard read[bwl]/write[bwl]
- macros. This is the preffered method due to speed but it
- limits the system to just 64MB of PCI memory. This can be
- problamatic if using video cards and other memory-heavy devices.
-
-2) If > 64MB of memory space is required, the IXP4xx can be
- configured to use indirect registers to access PCI This allows
- for up to 128MB (0x48000000 to 0x4fffffff) of memory on the bus.
- The disadvantage of this is that every PCI access requires
- three local register accesses plus a spinlock, but in some
- cases the performance hit is acceptable. In addition, you cannot
- mmap() PCI devices in this case due to the indirect nature
- of the PCI window.
-
-By default, the direct method is used for performance reasons. If
-you need more PCI memory, enable the IXP4XX_INDIRECT_PCI config option.
-
-3c. GPIO as Interrupts
-
-Currently the code only handles level-sensitive GPIO interrupts
-
-4. Supported platforms
-
-ADI Engineering Coyote Gateway Reference Platform
-http://www.adiengineering.com/productsCoyote.html
-
- The ADI Coyote platform is reference design for those building
- small residential/office gateways. One NPE is connected to a 10/100
- interface, one to 4-port 10/100 switch, and the third to and ADSL
- interface. In addition, it also supports to POTs interfaces connected
- via SLICs. Note that those are not supported by Linux ATM. Finally,
- the platform has two mini-PCI slots used for 802.11[bga] cards.
- Finally, there is an IDE port hanging off the expansion bus.
-
-Gateworks Avila Network Platform
-http://www.gateworks.com/support/overview.php
-
- The Avila platform is basically and IXDP425 with the 4 PCI slots
- replaced with mini-PCI slots and a CF IDE interface hanging off
- the expansion bus.
-
-Intel IXDP425 Development Platform
-http://www.intel.com/design/network/products/npfamily/ixdpg425.htm
-
- This is Intel's standard reference platform for the IXDP425 and is
- also known as the Richfield board. It contains 4 PCI slots, 16MB
- of flash, two 10/100 ports and one ADSL port.
-
-Intel IXDP465 Development Platform
-http://www.intel.com/design/network/products/npfamily/ixdp465.htm
-
- This is basically an IXDP425 with an IXP465 and 32M of flash instead
- of just 16.
-
-Intel IXDPG425 Development Platform
-
- This is basically and ADI Coyote board with a NEC EHCI controller
- added. One issue with this board is that the mini-PCI slots only
- have the 3.3v line connected, so you can't use a PCI to mini-PCI
- adapter with an E100 card. So to NFS root you need to use either
- the CSR or a WiFi card and a ramdisk that BOOTPs and then does
- a pivot_root to NFS.
-
-Motorola PrPMC1100 Processor Mezanine Card
-http://www.fountainsys.com
-
- The PrPMC1100 is based on the IXCP1100 and is meant to plug into
- and IXP2400/2800 system to act as the system controller. It simply
- contains a CPU and 16MB of flash on the board and needs to be
- plugged into a carrier board to function. Currently Linux only
- supports the Motorola PrPMC carrier board for this platform.
-
-5. TODO LIST
-
-- Add support for Coyote IDE
-- Add support for edge-based GPIO interrupts
-- Add support for CF IDE on expansion bus
-
-6. Thanks
-
-The IXP4xx work has been funded by Intel Corp. and MontaVista Software, Inc.
-
-The following people have contributed patches/comments/etc:
-
-Lennerty Buytenhek
-Lutz Jaenicke
-Justin Mayfield
-Robert E. Ranslam
-[I know I've forgotten others, please email me to be added]
-
--------------------------------------------------------------------------
-
-Last Update: 01/04/2005
diff --git a/Documentation/arm/Interrupts b/Documentation/arm/Interrupts
deleted file mode 100644
index f09ab1b90ef1..000000000000
--- a/Documentation/arm/Interrupts
+++ /dev/null
@@ -1,167 +0,0 @@
-2.5.2-rmk5
-----------
-
-This is the first kernel that contains a major shake up of some of the
-major architecture-specific subsystems.
-
-Firstly, it contains some pretty major changes to the way we handle the
-MMU TLB. Each MMU TLB variant is now handled completely separately -
-we have TLB v3, TLB v4 (without write buffer), TLB v4 (with write buffer),
-and finally TLB v4 (with write buffer, with I TLB invalidate entry).
-There is more assembly code inside each of these functions, mainly to
-allow more flexible TLB handling for the future.
-
-Secondly, the IRQ subsystem.
-
-The 2.5 kernels will be having major changes to the way IRQs are handled.
-Unfortunately, this means that machine types that touch the irq_desc[]
-array (basically all machine types) will break, and this means every
-machine type that we currently have.
-
-Lets take an example. On the Assabet with Neponset, we have:
-
- GPIO25 IRR:2
- SA1100 ------------> Neponset -----------> SA1111
- IIR:1
- -----------> USAR
- IIR:0
- -----------> SMC9196
-
-The way stuff currently works, all SA1111 interrupts are mutually
-exclusive of each other - if you're processing one interrupt from the
-SA1111 and another comes in, you have to wait for that interrupt to
-finish processing before you can service the new interrupt. Eg, an
-IDE PIO-based interrupt on the SA1111 excludes all other SA1111 and
-SMC9196 interrupts until it has finished transferring its multi-sector
-data, which can be a long time. Note also that since we loop in the
-SA1111 IRQ handler, SA1111 IRQs can hold off SMC9196 IRQs indefinitely.
-
-
-The new approach brings several new ideas...
-
-We introduce the concept of a "parent" and a "child". For example,
-to the Neponset handler, the "parent" is GPIO25, and the "children"d
-are SA1111, SMC9196 and USAR.
-
-We also bring the idea of an IRQ "chip" (mainly to reduce the size of
-the irqdesc array). This doesn't have to be a real "IC"; indeed the
-SA11x0 IRQs are handled by two separate "chip" structures, one for
-GPIO0-10, and another for all the rest. It is just a container for
-the various operations (maybe this'll change to a better name).
-This structure has the following operations:
-
-struct irqchip {
- /*
- * Acknowledge the IRQ.
- * If this is a level-based IRQ, then it is expected to mask the IRQ
- * as well.
- */
- void (*ack)(unsigned int irq);
- /*
- * Mask the IRQ in hardware.
- */
- void (*mask)(unsigned int irq);
- /*
- * Unmask the IRQ in hardware.
- */
- void (*unmask)(unsigned int irq);
- /*
- * Re-run the IRQ
- */
- void (*rerun)(unsigned int irq);
- /*
- * Set the type of the IRQ.
- */
- int (*type)(unsigned int irq, unsigned int, type);
-};
-
-ack - required. May be the same function as mask for IRQs
- handled by do_level_IRQ.
-mask - required.
-unmask - required.
-rerun - optional. Not required if you're using do_level_IRQ for all
- IRQs that use this 'irqchip'. Generally expected to re-trigger
- the hardware IRQ if possible. If not, may call the handler
- directly.
-type - optional. If you don't support changing the type of an IRQ,
- it should be null so people can detect if they are unable to
- set the IRQ type.
-
-For each IRQ, we keep the following information:
-
- - "disable" depth (number of disable_irq()s without enable_irq()s)
- - flags indicating what we can do with this IRQ (valid, probe,
- noautounmask) as before
- - status of the IRQ (probing, enable, etc)
- - chip
- - per-IRQ handler
- - irqaction structure list
-
-The handler can be one of the 3 standard handlers - "level", "edge" and
-"simple", or your own specific handler if you need to do something special.
-
-The "level" handler is what we currently have - its pretty simple.
-"edge" knows about the brokenness of such IRQ implementations - that you
-need to leave the hardware IRQ enabled while processing it, and queueing
-further IRQ events should the IRQ happen again while processing. The
-"simple" handler is very basic, and does not perform any hardware
-manipulation, nor state tracking. This is useful for things like the
-SMC9196 and USAR above.
-
-So, what's changed?
-
-1. Machine implementations must not write to the irqdesc array.
-
-2. New functions to manipulate the irqdesc array. The first 4 are expected
- to be useful only to machine specific code. The last is recommended to
- only be used by machine specific code, but may be used in drivers if
- absolutely necessary.
-
- set_irq_chip(irq,chip)
-
- Set the mask/unmask methods for handling this IRQ
-
- set_irq_handler(irq,handler)
-
- Set the handler for this IRQ (level, edge, simple)
-
- set_irq_chained_handler(irq,handler)
-
- Set a "chained" handler for this IRQ - automatically
- enables this IRQ (eg, Neponset and SA1111 handlers).
-
- set_irq_flags(irq,flags)
-
- Set the valid/probe/noautoenable flags.
-
- set_irq_type(irq,type)
-
- Set active the IRQ edge(s)/level. This replaces the
- SA1111 INTPOL manipulation, and the set_GPIO_IRQ_edge()
- function. Type should be one of IRQ_TYPE_xxx defined in
- <linux/irq.h>
-
-3. set_GPIO_IRQ_edge() is obsolete, and should be replaced by set_irq_type.
-
-4. Direct access to SA1111 INTPOL is deprecated. Use set_irq_type instead.
-
-5. A handler is expected to perform any necessary acknowledgement of the
- parent IRQ via the correct chip specific function. For instance, if
- the SA1111 is directly connected to a SA1110 GPIO, then you should
- acknowledge the SA1110 IRQ each time you re-read the SA1111 IRQ status.
-
-6. For any child which doesn't have its own IRQ enable/disable controls
- (eg, SMC9196), the handler must mask or acknowledge the parent IRQ
- while the child handler is called, and the child handler should be the
- "simple" handler (not "edge" nor "level"). After the handler completes,
- the parent IRQ should be unmasked, and the status of all children must
- be re-checked for pending events. (see the Neponset IRQ handler for
- details).
-
-7. fixup_irq() is gone, as is arch/arm/mach-*/include/mach/irq.h
-
-Please note that this will not solve all problems - some of them are
-hardware based. Mixing level-based and edge-based IRQs on the same
-parent signal (eg neponset) is one such area where a software based
-solution can't provide the full answer to low IRQ latency.
-
diff --git a/Documentation/arm/Marvell/README b/Documentation/arm/Marvell/README
deleted file mode 100644
index 56ada27c53be..000000000000
--- a/Documentation/arm/Marvell/README
+++ /dev/null
@@ -1,395 +0,0 @@
-ARM Marvell SoCs
-================
-
-This document lists all the ARM Marvell SoCs that are currently
-supported in mainline by the Linux kernel. As the Marvell families of
-SoCs are large and complex, it is hard to understand where the support
-for a particular SoC is available in the Linux kernel. This document
-tries to help in understanding where those SoCs are supported, and to
-match them with their corresponding public datasheet, when available.
-
-Orion family
-------------
-
- Flavors:
- 88F5082
- 88F5181
- 88F5181L
- 88F5182
- Datasheet : http://www.embeddedarm.com/documentation/third-party/MV88F5182-datasheet.pdf
- Programmer's User Guide : http://www.embeddedarm.com/documentation/third-party/MV88F5182-opensource-manual.pdf
- User Manual : http://www.embeddedarm.com/documentation/third-party/MV88F5182-usermanual.pdf
- 88F5281
- Datasheet : http://www.ocmodshop.com/images/reviews/networking/qnap_ts409u/marvel_88f5281_data_sheet.pdf
- 88F6183
- Core: Feroceon 88fr331 (88f51xx) or 88fr531-vd (88f52xx) ARMv5 compatible
- Linux kernel mach directory: arch/arm/mach-orion5x
- Linux kernel plat directory: arch/arm/plat-orion
-
-Kirkwood family
----------------
-
- Flavors:
- 88F6282 a.k.a Armada 300
- Product Brief : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
- 88F6283 a.k.a Armada 310
- Product Brief : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
- 88F6190
- Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6190-003_WEB.pdf
- Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
- 88F6192
- Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6192-003_ver1.pdf
- Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
- 88F6182
- 88F6180
- Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6180-003_ver1.pdf
- Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6180_OpenSource.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
- 88F6281
- Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6281-004_ver1.pdf
- Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6281_OpenSource.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
- Homepage: http://www.marvell.com/embedded-processors/kirkwood/
- Core: Feroceon 88fr131 ARMv5 compatible
- Linux kernel mach directory: arch/arm/mach-mvebu
- Linux kernel plat directory: none
-
-Discovery family
-----------------
-
- Flavors:
- MV78100
- Product Brief : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78100-003_WEB.pdf
- Hardware Spec : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78100_OpenSource.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
- MV78200
- Product Brief : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78200-002_WEB.pdf
- Hardware Spec : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78200_OpenSource.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
- MV76100
- Not supported by the Linux kernel.
-
- Core: Feroceon 88fr571-vd ARMv5 compatible
-
- Linux kernel mach directory: arch/arm/mach-mv78xx0
- Linux kernel plat directory: arch/arm/plat-orion
-
-EBU Armada family
------------------
-
- Armada 370 Flavors:
- 88F6710
- 88F6707
- 88F6W11
- Product Brief: http://www.marvell.com/embedded-processors/armada-300/assets/Marvell_ARMADA_370_SoC.pdf
- Hardware Spec: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-datasheet.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-FunctionalSpec-datasheet.pdf
- Core: Sheeva ARMv7 compatible PJ4B
-
- Armada 375 Flavors:
- 88F6720
- Product Brief: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA_375_SoC-01_product_brief.pdf
- Core: ARM Cortex-A9
-
- Armada 38x Flavors:
- 88F6810 Armada 380
- 88F6820 Armada 385
- 88F6828 Armada 388
- Product infos: http://www.marvell.com/embedded-processors/armada-38x/
- Functional Spec: https://marvellcorp.wufoo.com/forms/marvell-armada-38x-functional-specifications/
- Core: ARM Cortex-A9
-
- Armada 39x Flavors:
- 88F6920 Armada 390
- 88F6928 Armada 398
- Product infos: http://www.marvell.com/embedded-processors/armada-39x/
- Core: ARM Cortex-A9
-
- Armada XP Flavors:
- MV78230
- MV78260
- MV78460
- NOTE: not to be confused with the non-SMP 78xx0 SoCs
- Product Brief: http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf
- Functional Spec: http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf
- Hardware Specs:
- http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF
- http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF
- http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF
- Core: Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP
-
- Linux kernel mach directory: arch/arm/mach-mvebu
- Linux kernel plat directory: none
-
-EBU Armada family ARMv8
------------------------
-
- Armada 3710/3720 Flavors:
- 88F3710
- 88F3720
- Core: ARM Cortex A53 (ARMv8)
-
- Homepage: http://www.marvell.com/embedded-processors/armada-3700/
- Product Brief: http://www.marvell.com/embedded-processors/assets/PB-88F3700-FNL.pdf
- Device tree files: arch/arm64/boot/dts/marvell/armada-37*
-
- Armada 7K Flavors:
- 88F7020 (AP806 Dual + one CP110)
- 88F7040 (AP806 Quad + one CP110)
- Core: ARM Cortex A72
-
- Homepage: http://www.marvell.com/embedded-processors/armada-70xx/
- Product Brief: http://www.marvell.com/embedded-processors/assets/Armada7020PB-Jan2016.pdf
- http://www.marvell.com/embedded-processors/assets/Armada7040PB-Jan2016.pdf
- Device tree files: arch/arm64/boot/dts/marvell/armada-70*
-
- Armada 8K Flavors:
- 88F8020 (AP806 Dual + two CP110)
- 88F8040 (AP806 Quad + two CP110)
- Core: ARM Cortex A72
-
- Homepage: http://www.marvell.com/embedded-processors/armada-80xx/
- Product Brief: http://www.marvell.com/embedded-processors/assets/Armada8020PB-Jan2016.pdf
- http://www.marvell.com/embedded-processors/assets/Armada8040PB-Jan2016.pdf
- Device tree files: arch/arm64/boot/dts/marvell/armada-80*
-
-Avanta family
--------------
-
- Flavors:
- 88F6510
- 88F6530P
- 88F6550
- 88F6560
- Homepage : http://www.marvell.com/broadband/
- Product Brief: http://www.marvell.com/broadband/assets/Marvell_Avanta_88F6510_305_060-001_product_brief.pdf
- No public datasheet available.
-
- Core: ARMv5 compatible
-
- Linux kernel mach directory: no code in mainline yet, planned for the future
- Linux kernel plat directory: no code in mainline yet, planned for the future
-
-Storage family
---------------
-
- Armada SP:
- 88RC1580
- Product infos: http://www.marvell.com/storage/armada-sp/
- Core: Sheeva ARMv7 comatible Quad-core PJ4C
- (not supported in upstream Linux kernel)
-
-Dove family (application processor)
------------------------------------
-
- Flavors:
- 88AP510 a.k.a Armada 510
- Product Brief : http://www.marvell.com/application-processors/armada-500/assets/Marvell_Armada510_SoC.pdf
- Hardware Spec : http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Hardware-Spec.pdf
- Functional Spec : http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Functional-Spec.pdf
- Homepage: http://www.marvell.com/application-processors/armada-500/
- Core: ARMv7 compatible
-
- Directory: arch/arm/mach-mvebu (DT enabled platforms)
- arch/arm/mach-dove (non-DT enabled platforms)
-
-PXA 2xx/3xx/93x/95x family
---------------------------
-
- Flavors:
- PXA21x, PXA25x, PXA26x
- Application processor only
- Core: ARMv5 XScale1 core
- PXA270, PXA271, PXA272
- Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_pb.pdf
- Design guide : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_design_guide.pdf
- Developers manual : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_dev_man.pdf
- Specification : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_emts.pdf
- Specification update : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_spec_update.pdf
- Application processor only
- Core: ARMv5 XScale2 core
- PXA300, PXA310, PXA320
- PXA 300 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA300_PB_R4.pdf
- PXA 310 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA310_PB_R4.pdf
- PXA 320 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA320_PB_R4.pdf
- Design guide : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Design_Guide.pdf
- Developers manual : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Developers_Manual.zip
- Specifications : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_EMTS.pdf
- Specification Update : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Spec_Update.zip
- Reference Manual : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_TavorP_BootROM_Ref_Manual.pdf
- Application processor only
- Core: ARMv5 XScale3 core
- PXA930, PXA935
- Application processor with Communication processor
- Core: ARMv5 XScale3 core
- PXA955
- Application processor with Communication processor
- Core: ARMv7 compatible Sheeva PJ4 core
-
- Comments:
-
- * This line of SoCs originates from the XScale family developed by
- Intel and acquired by Marvell in ~2006. The PXA21x, PXA25x,
- PXA26x, PXA27x, PXA3xx and PXA93x were developed by Intel, while
- the later PXA95x were developed by Marvell.
-
- * Due to their XScale origin, these SoCs have virtually nothing in
- common with the other (Kirkwood, Dove, etc.) families of Marvell
- SoCs, except with the MMP/MMP2 family of SoCs.
-
- Linux kernel mach directory: arch/arm/mach-pxa
- Linux kernel plat directory: arch/arm/plat-pxa
-
-MMP/MMP2/MMP3 family (communication processor)
------------------------------------------
-
- Flavors:
- PXA168, a.k.a Armada 168
- Homepage : http://www.marvell.com/application-processors/armada-100/armada-168.jsp
- Product brief : http://www.marvell.com/application-processors/armada-100/assets/pxa_168_pb.pdf
- Hardware manual : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_datasheet.pdf
- Software manual : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_software_manual.pdf
- Specification update : http://www.marvell.com/application-processors/armada-100/assets/ARMADA16x_Spec_update.pdf
- Boot ROM manual : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_ref_manual.pdf
- App node package : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_app_note_package.pdf
- Application processor only
- Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
- PXA910/PXA920
- Homepage : http://www.marvell.com/communication-processors/pxa910/
- Product Brief : http://www.marvell.com/communication-processors/pxa910/assets/Marvell_PXA910_Platform-001_PB_final.pdf
- Application processor with Communication processor
- Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
- PXA688, a.k.a. MMP2, a.k.a Armada 610
- Product Brief : http://www.marvell.com/application-processors/armada-600/assets/armada610_pb.pdf
- Application processor only
- Core: ARMv7 compatible Sheeva PJ4 88sv581x core
- PXA2128, a.k.a. MMP3 (OLPC XO4, Linux support not upstream)
- Product Brief : http://www.marvell.com/application-processors/armada/pxa2128/assets/Marvell-ARMADA-PXA2128-SoC-PB.pdf
- Application processor only
- Core: Dual-core ARMv7 compatible Sheeva PJ4C core
- PXA960/PXA968/PXA978 (Linux support not upstream)
- Application processor with Communication Processor
- Core: ARMv7 compatible Sheeva PJ4 core
- PXA986/PXA988 (Linux support not upstream)
- Application processor with Communication Processor
- Core: Dual-core ARMv7 compatible Sheeva PJ4B-MP core
- PXA1088/PXA1920 (Linux support not upstream)
- Application processor with Communication Processor
- Core: quad-core ARMv7 Cortex-A7
- PXA1908/PXA1928/PXA1936
- Application processor with Communication Processor
- Core: multi-core ARMv8 Cortex-A53
-
- Comments:
-
- * This line of SoCs originates from the XScale family developed by
- Intel and acquired by Marvell in ~2006. All the processors of
- this MMP/MMP2 family were developed by Marvell.
-
- * Due to their XScale origin, these SoCs have virtually nothing in
- common with the other (Kirkwood, Dove, etc.) families of Marvell
- SoCs, except with the PXA family of SoCs listed above.
-
- Linux kernel mach directory: arch/arm/mach-mmp
- Linux kernel plat directory: arch/arm/plat-pxa
-
-Berlin family (Multimedia Solutions)
--------------------------------------
-
- Flavors:
- 88DE3010, Armada 1000 (no Linux support)
- Core: Marvell PJ1 (ARMv5TE), Dual-core
- Product Brief: http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
- 88DE3005, Armada 1500 Mini
- Design name: BG2CD
- Core: ARM Cortex-A9, PL310 L2CC
- 88DE3006, Armada 1500 Mini Plus
- Design name: BG2CDP
- Core: Dual Core ARM Cortex-A7
- 88DE3100, Armada 1500
- Design name: BG2
- Core: Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
- 88DE3114, Armada 1500 Pro
- Design name: BG2Q
- Core: Quad Core ARM Cortex-A9, PL310 L2CC
- 88DE3214, Armada 1500 Pro 4K
- Design name: BG3
- Core: ARM Cortex-A15, CA15 integrated L2CC
- 88DE3218, ARMADA 1500 Ultra
- Core: ARM Cortex-A53
-
- Homepage: https://www.synaptics.com/products/multimedia-solutions
- Directory: arch/arm/mach-berlin
-
- Comments:
-
- * This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
- with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
-
- * The Berlin family was acquired by Synaptics from Marvell in 2017.
-
-CPU Cores
----------
-
-The XScale cores were designed by Intel, and shipped by Marvell in the older
-PXA processors. Feroceon is a Marvell designed core that developed in-house,
-and that evolved into Sheeva. The XScale and Feroceon cores were phased out
-over time and replaced with Sheeva cores in later products, which subsequently
-got replaced with licensed ARM Cortex-A cores.
-
- XScale 1
- CPUID 0x69052xxx
- ARMv5, iWMMXt
- XScale 2
- CPUID 0x69054xxx
- ARMv5, iWMMXt
- XScale 3
- CPUID 0x69056xxx or 0x69056xxx
- ARMv5, iWMMXt
- Feroceon-1850 88fr331 "Mohawk"
- CPUID 0x5615331x or 0x41xx926x
- ARMv5TE, single issue
- Feroceon-2850 88fr531-vd "Jolteon"
- CPUID 0x5605531x or 0x41xx926x
- ARMv5TE, VFP, dual-issue
- Feroceon 88fr571-vd "Jolteon"
- CPUID 0x5615571x
- ARMv5TE, VFP, dual-issue
- Feroceon 88fr131 "Mohawk-D"
- CPUID 0x5625131x
- ARMv5TE, single-issue in-order
- Sheeva PJ1 88sv331 "Mohawk"
- CPUID 0x561584xx
- ARMv5, single-issue iWMMXt v2
- Sheeva PJ4 88sv581x "Flareon"
- CPUID 0x560f581x
- ARMv7, idivt, optional iWMMXt v2
- Sheeva PJ4B 88sv581x
- CPUID 0x561f581x
- ARMv7, idivt, optional iWMMXt v2
- Sheeva PJ4B-MP / PJ4C
- CPUID 0x562f584x
- ARMv7, idivt/idiva, LPAE, optional iWMMXt v2 and/or NEON
-
-Long-term plans
----------------
-
- * Unify the mach-dove/, mach-mv78xx0/, mach-orion5x/ into the
- mach-mvebu/ to support all SoCs from the Marvell EBU (Engineering
- Business Unit) in a single mach-<foo> directory. The plat-orion/
- would therefore disappear.
-
- * Unify the mach-mmp/ and mach-pxa/ into the same mach-pxa
- directory. The plat-pxa/ would therefore disappear.
-
-Credits
--------
-
- Maen Suleiman <maen@marvell.com>
- Lior Amsalem <alior@marvell.com>
- Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
- Andrew Lunn <andrew@lunn.ch>
- Nicolas Pitre <nico@fluxnic.net>
- Eric Miao <eric.y.miao@gmail.com>
diff --git a/Documentation/arm/Microchip/README b/Documentation/arm/Microchip/README
deleted file mode 100644
index a366f37d38f1..000000000000
--- a/Documentation/arm/Microchip/README
+++ /dev/null
@@ -1,169 +0,0 @@
-ARM Microchip SoCs (aka AT91)
-=============================
-
-
-Introduction
-------------
-This document gives useful information about the ARM Microchip SoCs that are
-currently supported in Linux Mainline (you know, the one on kernel.org).
-
-It is important to note that the Microchip (previously Atmel) ARM-based MPU
-product line is historically named "AT91" or "at91" throughout the Linux kernel
-development process even if this product prefix has completely disappeared from
-the official Microchip product name. Anyway, files, directories, git trees,
-git branches/tags and email subject always contain this "at91" sub-string.
-
-
-AT91 SoCs
----------
-Documentation and detailed datasheet for each product are available on
-the Microchip website: http://www.microchip.com.
-
- Flavors:
- * ARM 920 based SoC
- - at91rm9200
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-1768-32-bit-ARM920T-Embedded-Microprocessor-AT91RM9200_Datasheet.pdf
-
- * ARM 926 based SoCs
- - at91sam9260
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6221-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9260_Datasheet.pdf
-
- - at91sam9xe
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6254-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9XE_Datasheet.pdf
-
- - at91sam9261
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6062-ARM926EJ-S-Microprocessor-SAM9261_Datasheet.pdf
-
- - at91sam9263
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6249-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9263_Datasheet.pdf
-
- - at91sam9rl
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/doc6289.pdf
-
- - at91sam9g20
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001516A.pdf
-
- - at91sam9g45 family
- - at91sam9g45
- - at91sam9g46
- - at91sam9m10
- - at91sam9m11 (device superset)
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6437-32-bit-ARM926-Embedded-Microprocessor-SAM9M11_Datasheet.pdf
-
- - at91sam9x5 family (aka "The 5 series")
- - at91sam9g15
- - at91sam9g25
- - at91sam9g35
- - at91sam9x25
- - at91sam9x35
- + Datasheet (can be considered as covering the whole family)
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11055-32-bit-ARM926EJ-S-Microcontroller-SAM9X35_Datasheet.pdf
-
- - at91sam9n12
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
-
- * ARM Cortex-A5 based SoCs
- - sama5d3 family
- - sama5d31
- - sama5d33
- - sama5d34
- - sama5d35
- - sama5d36 (device superset)
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11121-32-bit-Cortex-A5-Microcontroller-SAMA5D3_Datasheet.pdf
-
- * ARM Cortex-A5 + NEON based SoCs
- - sama5d4 family
- - sama5d41
- - sama5d42
- - sama5d43
- - sama5d44 (device superset)
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/60001525A.pdf
-
- - sama5d2 family
- - sama5d21
- - sama5d22
- - sama5d23
- - sama5d24
- - sama5d26
- - sama5d27 (device superset)
- - sama5d28 (device superset + environmental monitors)
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001476B.pdf
-
- * ARM Cortex-M7 MCUs
- - sams70 family
- - sams70j19
- - sams70j20
- - sams70j21
- - sams70n19
- - sams70n20
- - sams70n21
- - sams70q19
- - sams70q20
- - sams70q21
-
- - samv70 family
- - samv70j19
- - samv70j20
- - samv70n19
- - samv70n20
- - samv70q19
- - samv70q20
-
- - samv71 family
- - samv71j19
- - samv71j20
- - samv71j21
- - samv71n19
- - samv71n20
- - samv71n21
- - samv71q19
- - samv71q20
- - samv71q21
-
- + Datasheet
- http://ww1.microchip.com/downloads/en/DeviceDoc/60001527A.pdf
-
-
-Linux kernel information
-------------------------
-Linux kernel mach directory: arch/arm/mach-at91
-MAINTAINERS entry is: "ARM/Microchip (AT91) SoC support"
-
-
-Device Tree for AT91 SoCs and boards
-------------------------------------
-All AT91 SoCs are converted to Device Tree. Since Linux 3.19, these products
-must use this method to boot the Linux kernel.
-
-Work In Progress statement:
-Device Tree files and Device Tree bindings that apply to AT91 SoCs and boards are
-considered as "Unstable". To be completely clear, any at91 binding can change at
-any time. So, be sure to use a Device Tree Binary and a Kernel Image generated from
-the same source tree.
-Please refer to the Documentation/devicetree/bindings/ABI.txt file for a
-definition of a "Stable" binding/ABI.
-This statement will be removed by AT91 MAINTAINERS when appropriate.
-
-Naming conventions and best practice:
-- SoCs Device Tree Source Include files are named after the official name of
- the product (at91sam9g20.dtsi or sama5d33.dtsi for instance).
-- Device Tree Source Include files (.dtsi) are used to collect common nodes that can be
- shared across SoCs or boards (sama5d3.dtsi or at91sam9x5cm.dtsi for instance).
- When collecting nodes for a particular peripheral or topic, the identifier have to
- be placed at the end of the file name, separated with a "_" (at91sam9x5_can.dtsi
- or sama5d3_gmac.dtsi for example).
-- board Device Tree Source files (.dts) are prefixed by the string "at91-" so
- that they can be identified easily. Note that some files are historical exceptions
- to this rule (sama5d3[13456]ek.dts, usb_a9g20.dts or animeo_ip.dts for example).
diff --git a/Documentation/arm/Netwinder b/Documentation/arm/Netwinder
deleted file mode 100644
index f1b457fbd3de..000000000000
--- a/Documentation/arm/Netwinder
+++ /dev/null
@@ -1,78 +0,0 @@
-NetWinder specific documentation
-================================
-
-The NetWinder is a small low-power computer, primarily designed
-to run Linux. It is based around the StrongARM RISC processor,
-DC21285 PCI bridge, with PC-type hardware glued around it.
-
-Port usage
-==========
-
-Min - Max Description
----------------------------
-0x0000 - 0x000f DMA1
-0x0020 - 0x0021 PIC1
-0x0060 - 0x006f Keyboard
-0x0070 - 0x007f RTC
-0x0080 - 0x0087 DMA1
-0x0088 - 0x008f DMA2
-0x00a0 - 0x00a3 PIC2
-0x00c0 - 0x00df DMA2
-0x0180 - 0x0187 IRDA
-0x01f0 - 0x01f6 ide0
-0x0201 Game port
-0x0203 RWA010 configuration read
-0x0220 - ? SoundBlaster
-0x0250 - ? WaveArtist
-0x0279 RWA010 configuration index
-0x02f8 - 0x02ff Serial ttyS1
-0x0300 - 0x031f Ether10
-0x0338 GPIO1
-0x033a GPIO2
-0x0370 - 0x0371 W83977F configuration registers
-0x0388 - ? AdLib
-0x03c0 - 0x03df VGA
-0x03f6 ide0
-0x03f8 - 0x03ff Serial ttyS0
-0x0400 - 0x0408 DC21143
-0x0480 - 0x0487 DMA1
-0x0488 - 0x048f DMA2
-0x0a79 RWA010 configuration write
-0xe800 - 0xe80f ide0/ide1 BM DMA
-
-
-Interrupt usage
-===============
-
-IRQ type Description
----------------------------
- 0 ISA 100Hz timer
- 1 ISA Keyboard
- 2 ISA cascade
- 3 ISA Serial ttyS1
- 4 ISA Serial ttyS0
- 5 ISA PS/2 mouse
- 6 ISA IRDA
- 7 ISA Printer
- 8 ISA RTC alarm
- 9 ISA
-10 ISA GP10 (Orange reset button)
-11 ISA
-12 ISA WaveArtist
-13 ISA
-14 ISA hda1
-15 ISA
-
-DMA usage
-=========
-
-DMA type Description
----------------------------
- 0 ISA IRDA
- 1 ISA
- 2 ISA cascade
- 3 ISA WaveArtist
- 4 ISA
- 5 ISA
- 6 ISA
- 7 ISA WaveArtist
diff --git a/Documentation/arm/OMAP/DSS b/Documentation/arm/OMAP/DSS
deleted file mode 100644
index 4484e021290e..000000000000
--- a/Documentation/arm/OMAP/DSS
+++ /dev/null
@@ -1,362 +0,0 @@
-OMAP2/3 Display Subsystem
--------------------------
-
-This is an almost total rewrite of the OMAP FB driver in drivers/video/omap
-(let's call it DSS1). The main differences between DSS1 and DSS2 are DSI,
-TV-out and multiple display support, but there are lots of small improvements
-also.
-
-The DSS2 driver (omapdss module) is in arch/arm/plat-omap/dss/, and the FB,
-panel and controller drivers are in drivers/video/omap2/. DSS1 and DSS2 live
-currently side by side, you can choose which one to use.
-
-Features
---------
-
-Working and tested features include:
-
-- MIPI DPI (parallel) output
-- MIPI DSI output in command mode
-- MIPI DBI (RFBI) output
-- SDI output
-- TV output
-- All pieces can be compiled as a module or inside kernel
-- Use DISPC to update any of the outputs
-- Use CPU to update RFBI or DSI output
-- OMAP DISPC planes
-- RGB16, RGB24 packed, RGB24 unpacked
-- YUV2, UYVY
-- Scaling
-- Adjusting DSS FCK to find a good pixel clock
-- Use DSI DPLL to create DSS FCK
-
-Tested boards include:
-- OMAP3 SDP board
-- Beagle board
-- N810
-
-omapdss driver
---------------
-
-The DSS driver does not itself have any support for Linux framebuffer, V4L or
-such like the current ones, but it has an internal kernel API that upper level
-drivers can use.
-
-The DSS driver models OMAP's overlays, overlay managers and displays in a
-flexible way to enable non-common multi-display configuration. In addition to
-modelling the hardware overlays, omapdss supports virtual overlays and overlay
-managers. These can be used when updating a display with CPU or system DMA.
-
-omapdss driver support for audio
---------------------------------
-There exist several display technologies and standards that support audio as
-well. Hence, it is relevant to update the DSS device driver to provide an audio
-interface that may be used by an audio driver or any other driver interested in
-the functionality.
-
-The audio_enable function is intended to prepare the relevant
-IP for playback (e.g., enabling an audio FIFO, taking in/out of reset
-some IP, enabling companion chips, etc). It is intended to be called before
-audio_start. The audio_disable function performs the reverse operation and is
-intended to be called after audio_stop.
-
-While a given DSS device driver may support audio, it is possible that for
-certain configurations audio is not supported (e.g., an HDMI display using a
-VESA video timing). The audio_supported function is intended to query whether
-the current configuration of the display supports audio.
-
-The audio_config function is intended to configure all the relevant audio
-parameters of the display. In order to make the function independent of any
-specific DSS device driver, a struct omap_dss_audio is defined. Its purpose
-is to contain all the required parameters for audio configuration. At the
-moment, such structure contains pointers to IEC-60958 channel status word
-and CEA-861 audio infoframe structures. This should be enough to support
-HDMI and DisplayPort, as both are based on CEA-861 and IEC-60958.
-
-The audio_enable/disable, audio_config and audio_supported functions could be
-implemented as functions that may sleep. Hence, they should not be called
-while holding a spinlock or a readlock.
-
-The audio_start/audio_stop function is intended to effectively start/stop audio
-playback after the configuration has taken place. These functions are designed
-to be used in an atomic context. Hence, audio_start should return quickly and be
-called only after all the needed resources for audio playback (audio FIFOs,
-DMA channels, companion chips, etc) have been enabled to begin data transfers.
-audio_stop is designed to only stop the audio transfers. The resources used
-for playback are released using audio_disable.
-
-The enum omap_dss_audio_state may be used to help the implementations of
-the interface to keep track of the audio state. The initial state is _DISABLED;
-then, the state transitions to _CONFIGURED, and then, when it is ready to
-play audio, to _ENABLED. The state _PLAYING is used when the audio is being
-rendered.
-
-
-Panel and controller drivers
-----------------------------
-
-The drivers implement panel or controller specific functionality and are not
-usually visible to users except through omapfb driver. They register
-themselves to the DSS driver.
-
-omapfb driver
--------------
-
-The omapfb driver implements arbitrary number of standard linux framebuffers.
-These framebuffers can be routed flexibly to any overlays, thus allowing very
-dynamic display architecture.
-
-The driver exports some omapfb specific ioctls, which are compatible with the
-ioctls in the old driver.
-
-The rest of the non standard features are exported via sysfs. Whether the final
-implementation will use sysfs, or ioctls, is still open.
-
-V4L2 drivers
-------------
-
-V4L2 is being implemented in TI.
-
-From omapdss point of view the V4L2 drivers should be similar to framebuffer
-driver.
-
-Architecture
---------------------
-
-Some clarification what the different components do:
-
- - Framebuffer is a memory area inside OMAP's SRAM/SDRAM that contains the
- pixel data for the image. Framebuffer has width and height and color
- depth.
- - Overlay defines where the pixels are read from and where they go on the
- screen. The overlay may be smaller than framebuffer, thus displaying only
- part of the framebuffer. The position of the overlay may be changed if
- the overlay is smaller than the display.
- - Overlay manager combines the overlays in to one image and feeds them to
- display.
- - Display is the actual physical display device.
-
-A framebuffer can be connected to multiple overlays to show the same pixel data
-on all of the overlays. Note that in this case the overlay input sizes must be
-the same, but, in case of video overlays, the output size can be different. Any
-framebuffer can be connected to any overlay.
-
-An overlay can be connected to one overlay manager. Also DISPC overlays can be
-connected only to DISPC overlay managers, and virtual overlays can be only
-connected to virtual overlays.
-
-An overlay manager can be connected to one display. There are certain
-restrictions which kinds of displays an overlay manager can be connected:
-
- - DISPC TV overlay manager can be only connected to TV display.
- - Virtual overlay managers can only be connected to DBI or DSI displays.
- - DISPC LCD overlay manager can be connected to all displays, except TV
- display.
-
-Sysfs
------
-The sysfs interface is mainly used for testing. I don't think sysfs
-interface is the best for this in the final version, but I don't quite know
-what would be the best interfaces for these things.
-
-The sysfs interface is divided to two parts: DSS and FB.
-
-/sys/class/graphics/fb? directory:
-mirror 0=off, 1=on
-rotate Rotation 0-3 for 0, 90, 180, 270 degrees
-rotate_type 0 = DMA rotation, 1 = VRFB rotation
-overlays List of overlay numbers to which framebuffer pixels go
-phys_addr Physical address of the framebuffer
-virt_addr Virtual address of the framebuffer
-size Size of the framebuffer
-
-/sys/devices/platform/omapdss/overlay? directory:
-enabled 0=off, 1=on
-input_size width,height (ie. the framebuffer size)
-manager Destination overlay manager name
-name
-output_size width,height
-position x,y
-screen_width width
-global_alpha global alpha 0-255 0=transparent 255=opaque
-
-/sys/devices/platform/omapdss/manager? directory:
-display Destination display
-name
-alpha_blending_enabled 0=off, 1=on
-trans_key_enabled 0=off, 1=on
-trans_key_type gfx-destination, video-source
-trans_key_value transparency color key (RGB24)
-default_color default background color (RGB24)
-
-/sys/devices/platform/omapdss/display? directory:
-ctrl_name Controller name
-mirror 0=off, 1=on
-update_mode 0=off, 1=auto, 2=manual
-enabled 0=off, 1=on
-name
-rotate Rotation 0-3 for 0, 90, 180, 270 degrees
-timings Display timings (pixclock,xres/hfp/hbp/hsw,yres/vfp/vbp/vsw)
- When writing, two special timings are accepted for tv-out:
- "pal" and "ntsc"
-panel_name
-tear_elim Tearing elimination 0=off, 1=on
-output_type Output type (video encoder only): "composite" or "svideo"
-
-There are also some debugfs files at <debugfs>/omapdss/ which show information
-about clocks and registers.
-
-Examples
---------
-
-The following definitions have been made for the examples below:
-
-ovl0=/sys/devices/platform/omapdss/overlay0
-ovl1=/sys/devices/platform/omapdss/overlay1
-ovl2=/sys/devices/platform/omapdss/overlay2
-
-mgr0=/sys/devices/platform/omapdss/manager0
-mgr1=/sys/devices/platform/omapdss/manager1
-
-lcd=/sys/devices/platform/omapdss/display0
-dvi=/sys/devices/platform/omapdss/display1
-tv=/sys/devices/platform/omapdss/display2
-
-fb0=/sys/class/graphics/fb0
-fb1=/sys/class/graphics/fb1
-fb2=/sys/class/graphics/fb2
-
-Default setup on OMAP3 SDP
---------------------------
-
-Here's the default setup on OMAP3 SDP board. All planes go to LCD. DVI
-and TV-out are not in use. The columns from left to right are:
-framebuffers, overlays, overlay managers, displays. Framebuffers are
-handled by omapfb, and the rest by the DSS.
-
-FB0 --- GFX -\ DVI
-FB1 --- VID1 --+- LCD ---- LCD
-FB2 --- VID2 -/ TV ----- TV
-
-Example: Switch from LCD to DVI
-----------------------
-
-w=`cat $dvi/timings | cut -d "," -f 2 | cut -d "/" -f 1`
-h=`cat $dvi/timings | cut -d "," -f 3 | cut -d "/" -f 1`
-
-echo "0" > $lcd/enabled
-echo "" > $mgr0/display
-fbset -fb /dev/fb0 -xres $w -yres $h -vxres $w -vyres $h
-# at this point you have to switch the dvi/lcd dip-switch from the omap board
-echo "dvi" > $mgr0/display
-echo "1" > $dvi/enabled
-
-After this the configuration looks like:
-
-FB0 --- GFX -\ -- DVI
-FB1 --- VID1 --+- LCD -/ LCD
-FB2 --- VID2 -/ TV ----- TV
-
-Example: Clone GFX overlay to LCD and TV
--------------------------------
-
-w=`cat $tv/timings | cut -d "," -f 2 | cut -d "/" -f 1`
-h=`cat $tv/timings | cut -d "," -f 3 | cut -d "/" -f 1`
-
-echo "0" > $ovl0/enabled
-echo "0" > $ovl1/enabled
-
-echo "" > $fb1/overlays
-echo "0,1" > $fb0/overlays
-
-echo "$w,$h" > $ovl1/output_size
-echo "tv" > $ovl1/manager
-
-echo "1" > $ovl0/enabled
-echo "1" > $ovl1/enabled
-
-echo "1" > $tv/enabled
-
-After this the configuration looks like (only relevant parts shown):
-
-FB0 +-- GFX ---- LCD ---- LCD
- \- VID1 ---- TV ---- TV
-
-Misc notes
-----------
-
-OMAP FB allocates the framebuffer memory using the standard dma allocator. You
-can enable Contiguous Memory Allocator (CONFIG_CMA) to improve the dma
-allocator, and if CMA is enabled, you use "cma=" kernel parameter to increase
-the global memory area for CMA.
-
-Using DSI DPLL to generate pixel clock it is possible produce the pixel clock
-of 86.5MHz (max possible), and with that you get 1280x1024@57 output from DVI.
-
-Rotation and mirroring currently only supports RGB565 and RGB8888 modes. VRFB
-does not support mirroring.
-
-VRFB rotation requires much more memory than non-rotated framebuffer, so you
-probably need to increase your vram setting before using VRFB rotation. Also,
-many applications may not work with VRFB if they do not pay attention to all
-framebuffer parameters.
-
-Kernel boot arguments
----------------------
-
-omapfb.mode=<display>:<mode>[,...]
- - Default video mode for specified displays. For example,
- "dvi:800x400MR-24@60". See drivers/video/modedb.c.
- There are also two special modes: "pal" and "ntsc" that
- can be used to tv out.
-
-omapfb.vram=<fbnum>:<size>[@<physaddr>][,...]
- - VRAM allocated for a framebuffer. Normally omapfb allocates vram
- depending on the display size. With this you can manually allocate
- more or define the physical address of each framebuffer. For example,
- "1:4M" to allocate 4M for fb1.
-
-omapfb.debug=<y|n>
- - Enable debug printing. You have to have OMAPFB debug support enabled
- in kernel config.
-
-omapfb.test=<y|n>
- - Draw test pattern to framebuffer whenever framebuffer settings change.
- You need to have OMAPFB debug support enabled in kernel config.
-
-omapfb.vrfb=<y|n>
- - Use VRFB rotation for all framebuffers.
-
-omapfb.rotate=<angle>
- - Default rotation applied to all framebuffers.
- 0 - 0 degree rotation
- 1 - 90 degree rotation
- 2 - 180 degree rotation
- 3 - 270 degree rotation
-
-omapfb.mirror=<y|n>
- - Default mirror for all framebuffers. Only works with DMA rotation.
-
-omapdss.def_disp=<display>
- - Name of default display, to which all overlays will be connected.
- Common examples are "lcd" or "tv".
-
-omapdss.debug=<y|n>
- - Enable debug printing. You have to have DSS debug support enabled in
- kernel config.
-
-TODO
-----
-
-DSS locking
-
-Error checking
-- Lots of checks are missing or implemented just as BUG()
-
-System DMA update for DSI
-- Can be used for RGB16 and RGB24P modes. Probably not for RGB24U (how
- to skip the empty byte?)
-
-OMAP1 support
-- Not sure if needed
-
diff --git a/Documentation/arm/OMAP/README b/Documentation/arm/OMAP/README
deleted file mode 100644
index 90c6c57d61e8..000000000000
--- a/Documentation/arm/OMAP/README
+++ /dev/null
@@ -1,11 +0,0 @@
-This file contains documentation for running mainline
-kernel on omaps.
-
-KERNEL NEW DEPENDENCIES
-v4.3+ Update is needed for custom .config files to make sure
- CONFIG_REGULATOR_PBIAS is enabled for MMC1 to work
- properly.
-
-v4.18+ Update is needed for custom .config files to make sure
- CONFIG_MMC_SDHCI_OMAP is enabled for all MMC instances
- to work in DRA7 and K2G based boards.
diff --git a/Documentation/arm/OMAP/omap_pm b/Documentation/arm/OMAP/omap_pm
deleted file mode 100644
index 4ae915a9f899..000000000000
--- a/Documentation/arm/OMAP/omap_pm
+++ /dev/null
@@ -1,154 +0,0 @@
-
-The OMAP PM interface
-=====================
-
-This document describes the temporary OMAP PM interface. Driver
-authors use these functions to communicate minimum latency or
-throughput constraints to the kernel power management code.
-Over time, the intention is to merge features from the OMAP PM
-interface into the Linux PM QoS code.
-
-Drivers need to express PM parameters which:
-
-- support the range of power management parameters present in the TI SRF;
-
-- separate the drivers from the underlying PM parameter
- implementation, whether it is the TI SRF or Linux PM QoS or Linux
- latency framework or something else;
-
-- specify PM parameters in terms of fundamental units, such as
- latency and throughput, rather than units which are specific to OMAP
- or to particular OMAP variants;
-
-- allow drivers which are shared with other architectures (e.g.,
- DaVinci) to add these constraints in a way which won't affect non-OMAP
- systems,
-
-- can be implemented immediately with minimal disruption of other
- architectures.
-
-
-This document proposes the OMAP PM interface, including the following
-five power management functions for driver code:
-
-1. Set the maximum MPU wakeup latency:
- (*pdata->set_max_mpu_wakeup_lat)(struct device *dev, unsigned long t)
-
-2. Set the maximum device wakeup latency:
- (*pdata->set_max_dev_wakeup_lat)(struct device *dev, unsigned long t)
-
-3. Set the maximum system DMA transfer start latency (CORE pwrdm):
- (*pdata->set_max_sdma_lat)(struct device *dev, long t)
-
-4. Set the minimum bus throughput needed by a device:
- (*pdata->set_min_bus_tput)(struct device *dev, u8 agent_id, unsigned long r)
-
-5. Return the number of times the device has lost context
- (*pdata->get_dev_context_loss_count)(struct device *dev)
-
-
-Further documentation for all OMAP PM interface functions can be
-found in arch/arm/plat-omap/include/mach/omap-pm.h.
-
-
-The OMAP PM layer is intended to be temporary
----------------------------------------------
-
-The intention is that eventually the Linux PM QoS layer should support
-the range of power management features present in OMAP3. As this
-happens, existing drivers using the OMAP PM interface can be modified
-to use the Linux PM QoS code; and the OMAP PM interface can disappear.
-
-
-Driver usage of the OMAP PM functions
--------------------------------------
-
-As the 'pdata' in the above examples indicates, these functions are
-exposed to drivers through function pointers in driver .platform_data
-structures. The function pointers are initialized by the board-*.c
-files to point to the corresponding OMAP PM functions:
-.set_max_dev_wakeup_lat will point to
-omap_pm_set_max_dev_wakeup_lat(), etc. Other architectures which do
-not support these functions should leave these function pointers set
-to NULL. Drivers should use the following idiom:
-
- if (pdata->set_max_dev_wakeup_lat)
- (*pdata->set_max_dev_wakeup_lat)(dev, t);
-
-The most common usage of these functions will probably be to specify
-the maximum time from when an interrupt occurs, to when the device
-becomes accessible. To accomplish this, driver writers should use the
-set_max_mpu_wakeup_lat() function to constrain the MPU wakeup
-latency, and the set_max_dev_wakeup_lat() function to constrain the
-device wakeup latency (from clk_enable() to accessibility). For
-example,
-
- /* Limit MPU wakeup latency */
- if (pdata->set_max_mpu_wakeup_lat)
- (*pdata->set_max_mpu_wakeup_lat)(dev, tc);
-
- /* Limit device powerdomain wakeup latency */
- if (pdata->set_max_dev_wakeup_lat)
- (*pdata->set_max_dev_wakeup_lat)(dev, td);
-
- /* total wakeup latency in this example: (tc + td) */
-
-The PM parameters can be overwritten by calling the function again
-with the new value. The settings can be removed by calling the
-function with a t argument of -1 (except in the case of
-set_max_bus_tput(), which should be called with an r argument of 0).
-
-The fifth function above, omap_pm_get_dev_context_loss_count(),
-is intended as an optimization to allow drivers to determine whether the
-device has lost its internal context. If context has been lost, the
-driver must restore its internal context before proceeding.
-
-
-Other specialized interface functions
--------------------------------------
-
-The five functions listed above are intended to be usable by any
-device driver. DSPBridge and CPUFreq have a few special requirements.
-DSPBridge expresses target DSP performance levels in terms of OPP IDs.
-CPUFreq expresses target MPU performance levels in terms of MPU
-frequency. The OMAP PM interface contains functions for these
-specialized cases to convert that input information (OPPs/MPU
-frequency) into the form that the underlying power management
-implementation needs:
-
-6. (*pdata->dsp_get_opp_table)(void)
-
-7. (*pdata->dsp_set_min_opp)(u8 opp_id)
-
-8. (*pdata->dsp_get_opp)(void)
-
-9. (*pdata->cpu_get_freq_table)(void)
-
-10. (*pdata->cpu_set_freq)(unsigned long f)
-
-11. (*pdata->cpu_get_freq)(void)
-
-Customizing OPP for platform
-============================
-Defining CONFIG_PM should enable OPP layer for the silicon
-and the registration of OPP table should take place automatically.
-However, in special cases, the default OPP table may need to be
-tweaked, for e.g.:
- * enable default OPPs which are disabled by default, but which
- could be enabled on a platform
- * Disable an unsupported OPP on the platform
- * Define and add a custom opp table entry
-in these cases, the board file needs to do additional steps as follows:
-arch/arm/mach-omapx/board-xyz.c
- #include "pm.h"
- ....
- static void __init omap_xyz_init_irq(void)
- {
- ....
- /* Initialize the default table */
- omapx_opp_init();
- /* Do customization to the defaults */
- ....
- }
-NOTE: omapx_opp_init will be omap3_opp_init or as required
-based on the omap family.
diff --git a/Documentation/arm/Porting b/Documentation/arm/Porting
deleted file mode 100644
index a492233931b9..000000000000
--- a/Documentation/arm/Porting
+++ /dev/null
@@ -1,135 +0,0 @@
-Taken from list archive at http://lists.arm.linux.org.uk/pipermail/linux-arm-kernel/2001-July/004064.html
-
-Initial definitions
--------------------
-
-The following symbol definitions rely on you knowing the translation that
-__virt_to_phys() does for your machine. This macro converts the passed
-virtual address to a physical address. Normally, it is simply:
-
- phys = virt - PAGE_OFFSET + PHYS_OFFSET
-
-
-Decompressor Symbols
---------------------
-
-ZTEXTADDR
- Start address of decompressor. There's no point in talking about
- virtual or physical addresses here, since the MMU will be off at
- the time when you call the decompressor code. You normally call
- the kernel at this address to start it booting. This doesn't have
- to be located in RAM, it can be in flash or other read-only or
- read-write addressable medium.
-
-ZBSSADDR
- Start address of zero-initialised work area for the decompressor.
- This must be pointing at RAM. The decompressor will zero initialise
- this for you. Again, the MMU will be off.
-
-ZRELADDR
- This is the address where the decompressed kernel will be written,
- and eventually executed. The following constraint must be valid:
-
- __virt_to_phys(TEXTADDR) == ZRELADDR
-
- The initial part of the kernel is carefully coded to be position
- independent.
-
-INITRD_PHYS
- Physical address to place the initial RAM disk. Only relevant if
- you are using the bootpImage stuff (which only works on the old
- struct param_struct).
-
-INITRD_VIRT
- Virtual address of the initial RAM disk. The following constraint
- must be valid:
-
- __virt_to_phys(INITRD_VIRT) == INITRD_PHYS
-
-PARAMS_PHYS
- Physical address of the struct param_struct or tag list, giving the
- kernel various parameters about its execution environment.
-
-
-Kernel Symbols
---------------
-
-PHYS_OFFSET
- Physical start address of the first bank of RAM.
-
-PAGE_OFFSET
- Virtual start address of the first bank of RAM. During the kernel
- boot phase, virtual address PAGE_OFFSET will be mapped to physical
- address PHYS_OFFSET, along with any other mappings you supply.
- This should be the same value as TASK_SIZE.
-
-TASK_SIZE
- The maximum size of a user process in bytes. Since user space
- always starts at zero, this is the maximum address that a user
- process can access+1. The user space stack grows down from this
- address.
-
- Any virtual address below TASK_SIZE is deemed to be user process
- area, and therefore managed dynamically on a process by process
- basis by the kernel. I'll call this the user segment.
-
- Anything above TASK_SIZE is common to all processes. I'll call
- this the kernel segment.
-
- (In other words, you can't put IO mappings below TASK_SIZE, and
- hence PAGE_OFFSET).
-
-TEXTADDR
- Virtual start address of kernel, normally PAGE_OFFSET + 0x8000.
- This is where the kernel image ends up. With the latest kernels,
- it must be located at 32768 bytes into a 128MB region. Previous
- kernels placed a restriction of 256MB here.
-
-DATAADDR
- Virtual address for the kernel data segment. Must not be defined
- when using the decompressor.
-
-VMALLOC_START
-VMALLOC_END
- Virtual addresses bounding the vmalloc() area. There must not be
- any static mappings in this area; vmalloc will overwrite them.
- The addresses must also be in the kernel segment (see above).
- Normally, the vmalloc() area starts VMALLOC_OFFSET bytes above the
- last virtual RAM address (found using variable high_memory).
-
-VMALLOC_OFFSET
- Offset normally incorporated into VMALLOC_START to provide a hole
- between virtual RAM and the vmalloc area. We do this to allow
- out of bounds memory accesses (eg, something writing off the end
- of the mapped memory map) to be caught. Normally set to 8MB.
-
-Architecture Specific Macros
-----------------------------
-
-BOOT_MEM(pram,pio,vio)
- `pram' specifies the physical start address of RAM. Must always
- be present, and should be the same as PHYS_OFFSET.
-
- `pio' is the physical address of an 8MB region containing IO for
- use with the debugging macros in arch/arm/kernel/debug-armv.S.
-
- `vio' is the virtual address of the 8MB debugging region.
-
- It is expected that the debugging region will be re-initialised
- by the architecture specific code later in the code (via the
- MAPIO function).
-
-BOOT_PARAMS
- Same as, and see PARAMS_PHYS.
-
-FIXUP(func)
- Machine specific fixups, run before memory subsystems have been
- initialised.
-
-MAPIO(func)
- Machine specific function to map IO areas (including the debug
- region above).
-
-INITIRQ(func)
- Machine specific function to initialise interrupts.
-
diff --git a/Documentation/arm/README b/Documentation/arm/README
deleted file mode 100644
index 9d1e5b2c92e6..000000000000
--- a/Documentation/arm/README
+++ /dev/null
@@ -1,204 +0,0 @@
- ARM Linux 2.6
- =============
-
- Please check <ftp://ftp.arm.linux.org.uk/pub/armlinux> for
- updates.
-
-Compilation of kernel
----------------------
-
- In order to compile ARM Linux, you will need a compiler capable of
- generating ARM ELF code with GNU extensions. GCC 3.3 is known to be
- a good compiler. Fortunately, you needn't guess. The kernel will report
- an error if your compiler is a recognized offender.
-
- To build ARM Linux natively, you shouldn't have to alter the ARCH = line
- in the top level Makefile. However, if you don't have the ARM Linux ELF
- tools installed as default, then you should change the CROSS_COMPILE
- line as detailed below.
-
- If you wish to cross-compile, then alter the following lines in the top
- level make file:
-
- ARCH = <whatever>
- with
- ARCH = arm
-
- and
-
- CROSS_COMPILE=
- to
- CROSS_COMPILE=<your-path-to-your-compiler-without-gcc>
- eg.
- CROSS_COMPILE=arm-linux-
-
- Do a 'make config', followed by 'make Image' to build the kernel
- (arch/arm/boot/Image). A compressed image can be built by doing a
- 'make zImage' instead of 'make Image'.
-
-
-Bug reports etc
----------------
-
- Please send patches to the patch system. For more information, see
- http://www.arm.linux.org.uk/developer/patches/info.php Always include some
- explanation as to what the patch does and why it is needed.
-
- Bug reports should be sent to linux-arm-kernel@lists.arm.linux.org.uk,
- or submitted through the web form at
- http://www.arm.linux.org.uk/developer/
-
- When sending bug reports, please ensure that they contain all relevant
- information, eg. the kernel messages that were printed before/during
- the problem, what you were doing, etc.
-
-
-Include files
--------------
-
- Several new include directories have been created under include/asm-arm,
- which are there to reduce the clutter in the top-level directory. These
- directories, and their purpose is listed below:
-
- arch-* machine/platform specific header files
- hardware driver-internal ARM specific data structures/definitions
- mach descriptions of generic ARM to specific machine interfaces
- proc-* processor dependent header files (currently only two
- categories)
-
-
-Machine/Platform support
-------------------------
-
- The ARM tree contains support for a lot of different machine types. To
- continue supporting these differences, it has become necessary to split
- machine-specific parts by directory. For this, the machine category is
- used to select which directories and files get included (we will use
- $(MACHINE) to refer to the category)
-
- To this end, we now have arch/arm/mach-$(MACHINE) directories which are
- designed to house the non-driver files for a particular machine (eg, PCI,
- memory management, architecture definitions etc). For all future
- machines, there should be a corresponding arch/arm/mach-$(MACHINE)/include/mach
- directory.
-
-
-Modules
--------
-
- Although modularisation is supported (and required for the FP emulator),
- each module on an ARM2/ARM250/ARM3 machine when is loaded will take
- memory up to the next 32k boundary due to the size of the pages.
- Therefore, is modularisation on these machines really worth it?
-
- However, ARM6 and up machines allow modules to take multiples of 4k, and
- as such Acorn RiscPCs and other architectures using these processors can
- make good use of modularisation.
-
-
-ADFS Image files
-----------------
-
- You can access image files on your ADFS partitions by mounting the ADFS
- partition, and then using the loopback device driver. You must have
- losetup installed.
-
- Please note that the PCEmulator DOS partitions have a partition table at
- the start, and as such, you will have to give '-o offset' to losetup.
-
-
-Request to developers
----------------------
-
- When writing device drivers which include a separate assembler file, please
- include it in with the C file, and not the arch/arm/lib directory. This
- allows the driver to be compiled as a loadable module without requiring
- half the code to be compiled into the kernel image.
-
- In general, try to avoid using assembler unless it is really necessary. It
- makes drivers far less easy to port to other hardware.
-
-
-ST506 hard drives
------------------
-
- The ST506 hard drive controllers seem to be working fine (if a little
- slowly). At the moment they will only work off the controllers on an
- A4x0's motherboard, but for it to work off a Podule just requires
- someone with a podule to add the addresses for the IRQ mask and the
- HDC base to the source.
-
- As of 31/3/96 it works with two drives (you should get the ADFS
- *configure harddrive set to 2). I've got an internal 20MB and a great
- big external 5.25" FH 64MB drive (who could ever want more :-) ).
-
- I've just got 240K/s off it (a dd with bs=128k); thats about half of what
- RiscOS gets; but it's a heck of a lot better than the 50K/s I was getting
- last week :-)
-
- Known bug: Drive data errors can cause a hang; including cases where
- the controller has fixed the error using ECC. (Possibly ONLY
- in that case...hmm).
-
-
-1772 Floppy
------------
- This also seems to work OK, but hasn't been stressed much lately. It
- hasn't got any code for disc change detection in there at the moment which
- could be a bit of a problem! Suggestions on the correct way to do this
- are welcome.
-
-
-CONFIG_MACH_ and CONFIG_ARCH_
------------------------------
- A change was made in 2003 to the macro names for new machines.
- Historically, CONFIG_ARCH_ was used for the bonafide architecture,
- e.g. SA1100, as well as implementations of the architecture,
- e.g. Assabet. It was decided to change the implementation macros
- to read CONFIG_MACH_ for clarity. Moreover, a retroactive fixup has
- not been made because it would complicate patching.
-
- Previous registrations may be found online.
-
- <http://www.arm.linux.org.uk/developer/machines/>
-
-Kernel entry (head.S)
---------------------------
- The initial entry into the kernel is via head.S, which uses machine
- independent code. The machine is selected by the value of 'r1' on
- entry, which must be kept unique.
-
- Due to the large number of machines which the ARM port of Linux provides
- for, we have a method to manage this which ensures that we don't end up
- duplicating large amounts of code.
-
- We group machine (or platform) support code into machine classes. A
- class typically based around one or more system on a chip devices, and
- acts as a natural container around the actual implementations. These
- classes are given directories - arch/arm/mach-<class> and
- arch/arm/mach-<class> - which contain the source files to/include/mach
- support the machine class. This directories also contain any machine
- specific supporting code.
-
- For example, the SA1100 class is based upon the SA1100 and SA1110 SoC
- devices, and contains the code to support the way the on-board and off-
- board devices are used, or the device is setup, and provides that
- machine specific "personality."
-
- For platforms that support device tree (DT), the machine selection is
- controlled at runtime by passing the device tree blob to the kernel. At
- compile-time, support for the machine type must be selected. This allows for
- a single multiplatform kernel build to be used for several machine types.
-
- For platforms that do not use device tree, this machine selection is
- controlled by the machine type ID, which acts both as a run-time and a
- compile-time code selection method. You can register a new machine via the
- web site at:
-
- <http://www.arm.linux.org.uk/developer/machines/>
-
- Note: Please do not register a machine type for DT-only platforms. If your
- platform is DT-only, you do not need a registered machine type.
-
----
-Russell King (15/03/2004)
diff --git a/Documentation/arm/SA1100/ADSBitsy b/Documentation/arm/SA1100/ADSBitsy
deleted file mode 100644
index f9f62e8c0719..000000000000
--- a/Documentation/arm/SA1100/ADSBitsy
+++ /dev/null
@@ -1,43 +0,0 @@
-ADS Bitsy Single Board Computer
-(It is different from Bitsy(iPAQ) of Compaq)
-
-For more details, contact Applied Data Systems or see
-http://www.applieddata.net/products.html
-
-The Linux support for this product has been provided by
-Woojung Huh <whuh@applieddata.net>
-
-Use 'make adsbitsy_config' before any 'make config'.
-This will set up defaults for ADS Bitsy support.
-
-The kernel zImage is linked to be loaded and executed at 0xc0400000.
-
-Linux can be used with the ADS BootLoader that ships with the
-newer rev boards. See their documentation on how to load Linux.
-
-Supported peripherals:
-- SA1100 LCD frame buffer (8/16bpp...sort of)
-- SA1111 USB Master
-- SA1100 serial port
-- pcmcia, compact flash
-- touchscreen(ucb1200)
-- console on LCD screen
-- serial ports (ttyS[0-2])
- - ttyS0 is default for serial console
-
-To do:
-- everything else! :-)
-
-Notes:
-
-- The flash on board is divided into 3 partitions.
- You should be careful to use flash on board.
- Its partition is different from GraphicsClient Plus and GraphicsMaster
-
-- 16bpp mode requires a different cable than what ships with the board.
- Contact ADS or look through the manual to wire your own. Currently,
- if you compile with 16bit mode support and switch into a lower bpp
- mode, the timing is off so the image is corrupted. This will be
- fixed soon.
-
-Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/Assabet b/Documentation/arm/SA1100/Assabet
deleted file mode 100644
index e08a6739e72c..000000000000
--- a/Documentation/arm/SA1100/Assabet
+++ /dev/null
@@ -1,300 +0,0 @@
-The Intel Assabet (SA-1110 evaluation) board
-============================================
-
-Please see:
-http://developer.intel.com
-
-Also some notes from John G Dorsey <jd5q@andrew.cmu.edu>:
-http://www.cs.cmu.edu/~wearable/software/assabet.html
-
-
-Building the kernel
--------------------
-
-To build the kernel with current defaults:
-
- make assabet_config
- make oldconfig
- make zImage
-
-The resulting kernel image should be available in linux/arch/arm/boot/zImage.
-
-
-Installing a bootloader
------------------------
-
-A couple of bootloaders able to boot Linux on Assabet are available:
-
-BLOB (http://www.lartmaker.nl/lartware/blob/)
-
- BLOB is a bootloader used within the LART project. Some contributed
- patches were merged into BLOB to add support for Assabet.
-
-Compaq's Bootldr + John Dorsey's patch for Assabet support
-(http://www.handhelds.org/Compaq/bootldr.html)
-(http://www.wearablegroup.org/software/bootldr/)
-
- Bootldr is the bootloader developed by Compaq for the iPAQ Pocket PC.
- John Dorsey has produced add-on patches to add support for Assabet and
- the JFFS filesystem.
-
-RedBoot (http://sources.redhat.com/redboot/)
-
- RedBoot is a bootloader developed by Red Hat based on the eCos RTOS
- hardware abstraction layer. It supports Assabet amongst many other
- hardware platforms.
-
-RedBoot is currently the recommended choice since it's the only one to have
-networking support, and is the most actively maintained.
-
-Brief examples on how to boot Linux with RedBoot are shown below. But first
-you need to have RedBoot installed in your flash memory. A known to work
-precompiled RedBoot binary is available from the following location:
-
-ftp://ftp.netwinder.org/users/n/nico/
-ftp://ftp.arm.linux.org.uk/pub/linux/arm/people/nico/
-ftp://ftp.handhelds.org/pub/linux/arm/sa-1100-patches/
-
-Look for redboot-assabet*.tgz. Some installation infos are provided in
-redboot-assabet*.txt.
-
-
-Initial RedBoot configuration
------------------------------
-
-The commands used here are explained in The RedBoot User's Guide available
-on-line at http://sources.redhat.com/ecos/docs.html.
-Please refer to it for explanations.
-
-If you have a CF network card (my Assabet kit contained a CF+ LP-E from
-Socket Communications Inc.), you should strongly consider using it for TFTP
-file transfers. You must insert it before RedBoot runs since it can't detect
-it dynamically.
-
-To initialize the flash directory:
-
- fis init -f
-
-To initialize the non-volatile settings, like whether you want to use BOOTP or
-a static IP address, etc, use this command:
-
- fconfig -i
-
-
-Writing a kernel image into flash
----------------------------------
-
-First, the kernel image must be loaded into RAM. If you have the zImage file
-available on a TFTP server:
-
- load zImage -r -b 0x100000
-
-If you rather want to use Y-Modem upload over the serial port:
-
- load -m ymodem -r -b 0x100000
-
-To write it to flash:
-
- fis create "Linux kernel" -b 0x100000 -l 0xc0000
-
-
-Booting the kernel
-------------------
-
-The kernel still requires a filesystem to boot. A ramdisk image can be loaded
-as follows:
-
- load ramdisk_image.gz -r -b 0x800000
-
-Again, Y-Modem upload can be used instead of TFTP by replacing the file name
-by '-y ymodem'.
-
-Now the kernel can be retrieved from flash like this:
-
- fis load "Linux kernel"
-
-or loaded as described previously. To boot the kernel:
-
- exec -b 0x100000 -l 0xc0000
-
-The ramdisk image could be stored into flash as well, but there are better
-solutions for on-flash filesystems as mentioned below.
-
-
-Using JFFS2
------------
-
-Using JFFS2 (the Second Journalling Flash File System) is probably the most
-convenient way to store a writable filesystem into flash. JFFS2 is used in
-conjunction with the MTD layer which is responsible for low-level flash
-management. More information on the Linux MTD can be found on-line at:
-http://www.linux-mtd.infradead.org/. A JFFS howto with some infos about
-creating JFFS/JFFS2 images is available from the same site.
-
-For instance, a sample JFFS2 image can be retrieved from the same FTP sites
-mentioned below for the precompiled RedBoot image.
-
-To load this file:
-
- load sample_img.jffs2 -r -b 0x100000
-
-The result should look like:
-
-RedBoot> load sample_img.jffs2 -r -b 0x100000
-Raw file loaded 0x00100000-0x00377424
-
-Now we must know the size of the unallocated flash:
-
- fis free
-
-Result:
-
-RedBoot> fis free
- 0x500E0000 .. 0x503C0000
-
-The values above may be different depending on the size of the filesystem and
-the type of flash. See their usage below as an example and take care of
-substituting yours appropriately.
-
-We must determine some values:
-
-size of unallocated flash: 0x503c0000 - 0x500e0000 = 0x2e0000
-size of the filesystem image: 0x00377424 - 0x00100000 = 0x277424
-
-We want to fit the filesystem image of course, but we also want to give it all
-the remaining flash space as well. To write it:
-
- fis unlock -f 0x500E0000 -l 0x2e0000
- fis erase -f 0x500E0000 -l 0x2e0000
- fis write -b 0x100000 -l 0x277424 -f 0x500E0000
- fis create "JFFS2" -n -f 0x500E0000 -l 0x2e0000
-
-Now the filesystem is associated to a MTD "partition" once Linux has discovered
-what they are in the boot process. From Redboot, the 'fis list' command
-displays them:
-
-RedBoot> fis list
-Name FLASH addr Mem addr Length Entry point
-RedBoot 0x50000000 0x50000000 0x00020000 0x00000000
-RedBoot config 0x503C0000 0x503C0000 0x00020000 0x00000000
-FIS directory 0x503E0000 0x503E0000 0x00020000 0x00000000
-Linux kernel 0x50020000 0x00100000 0x000C0000 0x00000000
-JFFS2 0x500E0000 0x500E0000 0x002E0000 0x00000000
-
-However Linux should display something like:
-
-SA1100 flash: probing 32-bit flash bus
-SA1100 flash: Found 2 x16 devices at 0x0 in 32-bit mode
-Using RedBoot partition definition
-Creating 5 MTD partitions on "SA1100 flash":
-0x00000000-0x00020000 : "RedBoot"
-0x00020000-0x000e0000 : "Linux kernel"
-0x000e0000-0x003c0000 : "JFFS2"
-0x003c0000-0x003e0000 : "RedBoot config"
-0x003e0000-0x00400000 : "FIS directory"
-
-What's important here is the position of the partition we are interested in,
-which is the third one. Within Linux, this correspond to /dev/mtdblock2.
-Therefore to boot Linux with the kernel and its root filesystem in flash, we
-need this RedBoot command:
-
- fis load "Linux kernel"
- exec -b 0x100000 -l 0xc0000 -c "root=/dev/mtdblock2"
-
-Of course other filesystems than JFFS might be used, like cramfs for example.
-You might want to boot with a root filesystem over NFS, etc. It is also
-possible, and sometimes more convenient, to flash a filesystem directly from
-within Linux while booted from a ramdisk or NFS. The Linux MTD repository has
-many tools to deal with flash memory as well, to erase it for example. JFFS2
-can then be mounted directly on a freshly erased partition and files can be
-copied over directly. Etc...
-
-
-RedBoot scripting
------------------
-
-All the commands above aren't so useful if they have to be typed in every
-time the Assabet is rebooted. Therefore it's possible to automate the boot
-process using RedBoot's scripting capability.
-
-For example, I use this to boot Linux with both the kernel and the ramdisk
-images retrieved from a TFTP server on the network:
-
-RedBoot> fconfig
-Run script at boot: false true
-Boot script:
-Enter script, terminate with empty line
->> load zImage -r -b 0x100000
->> load ramdisk_ks.gz -r -b 0x800000
->> exec -b 0x100000 -l 0xc0000
->>
-Boot script timeout (1000ms resolution): 3
-Use BOOTP for network configuration: true
-GDB connection port: 9000
-Network debug at boot time: false
-Update RedBoot non-volatile configuration - are you sure (y/n)? y
-
-Then, rebooting the Assabet is just a matter of waiting for the login prompt.
-
-
-
-Nicolas Pitre
-nico@fluxnic.net
-June 12, 2001
-
-
-Status of peripherals in -rmk tree (updated 14/10/2001)
--------------------------------------------------------
-
-Assabet:
- Serial ports:
- Radio: TX, RX, CTS, DSR, DCD, RI
- PM: Not tested.
- COM: TX, RX, CTS, DSR, DCD, RTS, DTR, PM
- PM: Not tested.
- I2C: Implemented, not fully tested.
- L3: Fully tested, pass.
- PM: Not tested.
-
- Video:
- LCD: Fully tested. PM
- (LCD doesn't like being blanked with
- neponset connected)
- Video out: Not fully
-
- Audio:
- UDA1341:
- Playback: Fully tested, pass.
- Record: Implemented, not tested.
- PM: Not tested.
-
- UCB1200:
- Audio play: Implemented, not heavily tested.
- Audio rec: Implemented, not heavily tested.
- Telco audio play: Implemented, not heavily tested.
- Telco audio rec: Implemented, not heavily tested.
- POTS control: No
- Touchscreen: Yes
- PM: Not tested.
-
- Other:
- PCMCIA:
- LPE: Fully tested, pass.
- USB: No
- IRDA:
- SIR: Fully tested, pass.
- FIR: Fully tested, pass.
- PM: Not tested.
-
-Neponset:
- Serial ports:
- COM1,2: TX, RX, CTS, DSR, DCD, RTS, DTR
- PM: Not tested.
- USB: Implemented, not heavily tested.
- PCMCIA: Implemented, not heavily tested.
- PM: Not tested.
- CF: Implemented, not heavily tested.
- PM: Not tested.
-
-More stuff can be found in the -np (Nicolas Pitre's) tree.
-
diff --git a/Documentation/arm/SA1100/Brutus b/Documentation/arm/SA1100/Brutus
deleted file mode 100644
index 6a3aa95e9bfd..000000000000
--- a/Documentation/arm/SA1100/Brutus
+++ /dev/null
@@ -1,66 +0,0 @@
-Brutus is an evaluation platform for the SA1100 manufactured by Intel.
-For more details, see:
-
-http://developer.intel.com
-
-To compile for Brutus, you must issue the following commands:
-
- make brutus_config
- make config
- [accept all the defaults]
- make zImage
-
-The resulting kernel will end up in linux/arch/arm/boot/zImage. This file
-must be loaded at 0xc0008000 in Brutus's memory and execution started at
-0xc0008000 as well with the value of registers r0 = 0 and r1 = 16 upon
-entry.
-
-But prior to execute the kernel, a ramdisk image must also be loaded in
-memory. Use memory address 0xd8000000 for this. Note that the file
-containing the (compressed) ramdisk image must not exceed 4 MB.
-
-Typically, you'll need angelboot to load the kernel.
-The following angelboot.opt file should be used:
-
------ begin angelboot.opt -----
-base 0xc0008000
-entry 0xc0008000
-r0 0x00000000
-r1 0x00000010
-device /dev/ttyS0
-options "9600 8N1"
-baud 115200
-otherfile ramdisk_img.gz
-otherbase 0xd8000000
------ end angelboot.opt -----
-
-Then load the kernel and ramdisk with:
-
- angelboot -f angelboot.opt zImage
-
-The first Brutus serial port (assumed to be linked to /dev/ttyS0 on your
-host PC) is used by angel to load the kernel and ramdisk image. The serial
-console is provided through the second Brutus serial port. To access it,
-you may use minicom configured with /dev/ttyS1, 9600 baud, 8N1, no flow
-control.
-
-Currently supported:
- - RS232 serial ports
- - audio output
- - LCD screen
- - keyboard
-
-The actual Brutus support may not be complete without extra patches.
-If such patches exist, they should be found from
-ftp.netwinder.org/users/n/nico.
-
-A full PCMCIA support is still missing, although it's possible to hack
-some drivers in order to drive already inserted cards at boot time with
-little modifications.
-
-Any contribution is welcome.
-
-Please send patches to nico@fluxnic.net
-
-Have Fun !
-
diff --git a/Documentation/arm/SA1100/CERF b/Documentation/arm/SA1100/CERF
deleted file mode 100644
index b3d845301ef1..000000000000
--- a/Documentation/arm/SA1100/CERF
+++ /dev/null
@@ -1,29 +0,0 @@
-*** The StrongARM version of the CerfBoard/Cube has been discontinued ***
-
-The Intrinsyc CerfBoard is a StrongARM 1110-based computer on a board
-that measures approximately 2" square. It includes an Ethernet
-controller, an RS232-compatible serial port, a USB function port, and
-one CompactFlash+ slot on the back. Pictures can be found at the
-Intrinsyc website, http://www.intrinsyc.com.
-
-This document describes the support in the Linux kernel for the
-Intrinsyc CerfBoard.
-
-Supported in this version:
- - CompactFlash+ slot (select PCMCIA in General Setup and any options
- that may be required)
- - Onboard Crystal CS8900 Ethernet controller (Cerf CS8900A support in
- Network Devices)
- - Serial ports with a serial console (hardcoded to 38400 8N1)
-
-In order to get this kernel onto your Cerf, you need a server that runs
-both BOOTP and TFTP. Detailed instructions should have come with your
-evaluation kit on how to use the bootloader. This series of commands
-will suffice:
-
- make ARCH=arm CROSS_COMPILE=arm-linux- cerfcube_defconfig
- make ARCH=arm CROSS_COMPILE=arm-linux- zImage
- make ARCH=arm CROSS_COMPILE=arm-linux- modules
- cp arch/arm/boot/zImage <TFTP directory>
-
-support@intrinsyc.com
diff --git a/Documentation/arm/SA1100/FreeBird b/Documentation/arm/SA1100/FreeBird
deleted file mode 100644
index ab9193663b2b..000000000000
--- a/Documentation/arm/SA1100/FreeBird
+++ /dev/null
@@ -1,21 +0,0 @@
-Freebird-1.1 is produced by Legend(C), Inc.
-http://web.archive.org/web/*/http://www.legend.com.cn
-and software/linux maintained by Coventive(C), Inc.
-(http://www.coventive.com)
-
-Based on the Nicolas's strongarm kernel tree.
-
-===============================================================
-Maintainer:
-
-Chester Kuo <chester@coventive.com>
- <chester@linux.org.tw>
-
-Author :
-Tim wu <timwu@coventive.com>
-CIH <cih@coventive.com>
-Eric Peng <ericpeng@coventive.com>
-Jeff Lee <jeff_lee@coventive.com>
-Allen Cheng
-Tony Liu <tonyliu@coventive.com>
-
diff --git a/Documentation/arm/SA1100/GraphicsClient b/Documentation/arm/SA1100/GraphicsClient
deleted file mode 100644
index 867bb35943af..000000000000
--- a/Documentation/arm/SA1100/GraphicsClient
+++ /dev/null
@@ -1,98 +0,0 @@
-ADS GraphicsClient Plus Single Board Computer
-
-For more details, contact Applied Data Systems or see
-http://www.applieddata.net/products.html
-
-The original Linux support for this product has been provided by
-Nicolas Pitre <nico@fluxnic.net>. Continued development work by
-Woojung Huh <whuh@applieddata.net>
-
-It's currently possible to mount a root filesystem via NFS providing a
-complete Linux environment. Otherwise a ramdisk image may be used. The
-board supports MTD/JFFS, so you could also mount something on there.
-
-Use 'make graphicsclient_config' before any 'make config'. This will set up
-defaults for GraphicsClient Plus support.
-
-The kernel zImage is linked to be loaded and executed at 0xc0200000.
-Also the following registers should have the specified values upon entry:
-
- r0 = 0
- r1 = 29 (this is the GraphicsClient architecture number)
-
-Linux can be used with the ADS BootLoader that ships with the
-newer rev boards. See their documentation on how to load Linux.
-Angel is not available for the GraphicsClient Plus AFAIK.
-
-There is a board known as just the GraphicsClient that ADS used to
-produce but has end of lifed. This code will not work on the older
-board with the ADS bootloader, but should still work with Angel,
-as outlined below. In any case, if you're planning on deploying
-something en masse, you should probably get the newer board.
-
-If using Angel on the older boards, here is a typical angel.opt option file
-if the kernel is loaded through the Angel Debug Monitor:
-
------ begin angelboot.opt -----
-base 0xc0200000
-entry 0xc0200000
-r0 0x00000000
-r1 0x0000001d
-device /dev/ttyS1
-options "38400 8N1"
-baud 115200
-#otherfile ramdisk.gz
-#otherbase 0xc0800000
-exec minicom
------ end angelboot.opt -----
-
-Then the kernel (and ramdisk if otherfile/otherbase lines above are
-uncommented) would be loaded with:
-
- angelboot -f angelboot.opt zImage
-
-Here it is assumed that the board is connected to ttyS1 on your PC
-and that minicom is preconfigured with /dev/ttyS1, 38400 baud, 8N1, no flow
-control by default.
-
-If any other bootloader is used, ensure it accomplish the same, especially
-for r0/r1 register values before jumping into the kernel.
-
-
-Supported peripherals:
-- SA1100 LCD frame buffer (8/16bpp...sort of)
-- on-board SMC 92C96 ethernet NIC
-- SA1100 serial port
-- flash memory access (MTD/JFFS)
-- pcmcia
-- touchscreen(ucb1200)
-- ps/2 keyboard
-- console on LCD screen
-- serial ports (ttyS[0-2])
- - ttyS0 is default for serial console
-- Smart I/O (ADC, keypad, digital inputs, etc)
- See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
- and example user space code. ps/2 keybd is multiplexed through this driver
-
-To do:
-- UCB1200 audio with new ucb_generic layer
-- everything else! :-)
-
-Notes:
-
-- The flash on board is divided into 3 partitions. mtd0 is where
- the ADS boot ROM and zImage is stored. It's been marked as
- read-only to keep you from blasting over the bootloader. :) mtd1 is
- for the ramdisk.gz image. mtd2 is user flash space and can be
- utilized for either JFFS or if you're feeling crazy, running ext2
- on top of it. If you're not using the ADS bootloader, you're
- welcome to blast over the mtd1 partition also.
-
-- 16bpp mode requires a different cable than what ships with the board.
- Contact ADS or look through the manual to wire your own. Currently,
- if you compile with 16bit mode support and switch into a lower bpp
- mode, the timing is off so the image is corrupted. This will be
- fixed soon.
-
-Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
-
diff --git a/Documentation/arm/SA1100/GraphicsMaster b/Documentation/arm/SA1100/GraphicsMaster
deleted file mode 100644
index 9145088a0ba2..000000000000
--- a/Documentation/arm/SA1100/GraphicsMaster
+++ /dev/null
@@ -1,53 +0,0 @@
-ADS GraphicsMaster Single Board Computer
-
-For more details, contact Applied Data Systems or see
-http://www.applieddata.net/products.html
-
-The original Linux support for this product has been provided by
-Nicolas Pitre <nico@fluxnic.net>. Continued development work by
-Woojung Huh <whuh@applieddata.net>
-
-Use 'make graphicsmaster_config' before any 'make config'.
-This will set up defaults for GraphicsMaster support.
-
-The kernel zImage is linked to be loaded and executed at 0xc0400000.
-
-Linux can be used with the ADS BootLoader that ships with the
-newer rev boards. See their documentation on how to load Linux.
-
-Supported peripherals:
-- SA1100 LCD frame buffer (8/16bpp...sort of)
-- SA1111 USB Master
-- on-board SMC 92C96 ethernet NIC
-- SA1100 serial port
-- flash memory access (MTD/JFFS)
-- pcmcia, compact flash
-- touchscreen(ucb1200)
-- ps/2 keyboard
-- console on LCD screen
-- serial ports (ttyS[0-2])
- - ttyS0 is default for serial console
-- Smart I/O (ADC, keypad, digital inputs, etc)
- See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
- and example user space code. ps/2 keybd is multiplexed through this driver
-
-To do:
-- everything else! :-)
-
-Notes:
-
-- The flash on board is divided into 3 partitions. mtd0 is where
- the zImage is stored. It's been marked as read-only to keep you
- from blasting over the bootloader. :) mtd1 is
- for the ramdisk.gz image. mtd2 is user flash space and can be
- utilized for either JFFS or if you're feeling crazy, running ext2
- on top of it. If you're not using the ADS bootloader, you're
- welcome to blast over the mtd1 partition also.
-
-- 16bpp mode requires a different cable than what ships with the board.
- Contact ADS or look through the manual to wire your own. Currently,
- if you compile with 16bit mode support and switch into a lower bpp
- mode, the timing is off so the image is corrupted. This will be
- fixed soon.
-
-Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/SA1100/HUW_WEBPANEL b/Documentation/arm/SA1100/HUW_WEBPANEL
deleted file mode 100644
index fd56b48d4833..000000000000
--- a/Documentation/arm/SA1100/HUW_WEBPANEL
+++ /dev/null
@@ -1,17 +0,0 @@
-The HUW_WEBPANEL is a product of the german company Hoeft & Wessel AG
-
-If you want more information, please visit
-http://www.hoeft-wessel.de
-
-To build the kernel:
- make huw_webpanel_config
- make oldconfig
- [accept all defaults]
- make zImage
-
-Mostly of the work is done by:
-Roman Jordan jor@hoeft-wessel.de
-Christoph Schulz schu@hoeft-wessel.de
-
-2000/12/18/
-
diff --git a/Documentation/arm/SA1100/Itsy b/Documentation/arm/SA1100/Itsy
deleted file mode 100644
index 44b94997fa0d..000000000000
--- a/Documentation/arm/SA1100/Itsy
+++ /dev/null
@@ -1,39 +0,0 @@
-Itsy is a research project done by the Western Research Lab, and Systems
-Research Center in Palo Alto, CA. The Itsy project is one of several
-research projects at Compaq that are related to pocket computing.
-
-For more information, see:
-
- http://www.hpl.hp.com/downloads/crl/itsy/
-
-Notes on initial 2.4 Itsy support (8/27/2000) :
-The port was done on an Itsy version 1.5 machine with a daughtercard with
-64 Meg of DRAM and 32 Meg of Flash. The initial work includes support for
-serial console (to see what you're doing). No other devices have been
-enabled.
-
-To build, do a "make menuconfig" (or xmenuconfig) and select Itsy support.
-Disable Flash and LCD support. and then do a make zImage.
-Finally, you will need to cd to arch/arm/boot/tools and execute a make there
-to build the params-itsy program used to boot the kernel.
-
-In order to install the port of 2.4 to the itsy, You will need to set the
-configuration parameters in the monitor as follows:
-Arg 1:0x08340000, Arg2: 0xC0000000, Arg3:18 (0x12), Arg4:0
-Make sure the start-routine address is set to 0x00060000.
-
-Next, flash the params-itsy program to 0x00060000 ("p 1 0x00060000" in the
-flash menu) Flash the kernel in arch/arm/boot/zImage into 0x08340000
-("p 1 0x00340000"). Finally flash an initial ramdisk into 0xC8000000
-("p 2 0x0") We used ramdisk-2-30.gz from the 0.11 version directory on
-handhelds.org.
-
-The serial connection we established was at:
- 8-bit data, no parity, 1 stop bit(s), 115200.00 b/s. in the monitor, in the
-params-itsy program, and in the kernel itself. This can be changed, but
-not easily. The monitor parameters are easily changed, the params program
-setup is assembly outl's, and the kernel is a configuration item specific to
-the itsy. (i.e. grep for CONFIG_SA1100_ITSY and you'll find where it is.)
-
-
-This should get you a properly booting 2.4 kernel on the itsy.
diff --git a/Documentation/arm/SA1100/LART b/Documentation/arm/SA1100/LART
deleted file mode 100644
index 6d412b685598..000000000000
--- a/Documentation/arm/SA1100/LART
+++ /dev/null
@@ -1,14 +0,0 @@
-Linux Advanced Radio Terminal (LART)
-------------------------------------
-
-The LART is a small (7.5 x 10cm) SA-1100 board, designed for embedded
-applications. It has 32 MB DRAM, 4MB Flash ROM, double RS232 and all
-other StrongARM-gadgets. Almost all SA signals are directly accessible
-through a number of connectors. The powersupply accepts voltages
-between 3.5V and 16V and is overdimensioned to support a range of
-daughterboards. A quad Ethernet / IDE / PS2 / sound daughterboard
-is under development, with plenty of others in different stages of
-planning.
-
-The hardware designs for this board have been released under an open license;
-see the LART page at http://www.lartmaker.nl/ for more information.
diff --git a/Documentation/arm/SA1100/PLEB b/Documentation/arm/SA1100/PLEB
deleted file mode 100644
index b9c8a631a351..000000000000
--- a/Documentation/arm/SA1100/PLEB
+++ /dev/null
@@ -1,11 +0,0 @@
-The PLEB project was started as a student initiative at the School of
-Computer Science and Engineering, University of New South Wales to make a
-pocket computer capable of running the Linux Kernel.
-
-PLEB support has yet to be fully integrated.
-
-For more information, see:
-
- http://www.cse.unsw.edu.au
-
-
diff --git a/Documentation/arm/SA1100/Pangolin b/Documentation/arm/SA1100/Pangolin
deleted file mode 100644
index 077a6120e129..000000000000
--- a/Documentation/arm/SA1100/Pangolin
+++ /dev/null
@@ -1,23 +0,0 @@
-Pangolin is a StrongARM 1110-based evaluation platform produced
-by Dialogue Technology (http://www.dialogue.com.tw/).
-It has EISA slots for ease of configuration with SDRAM/Flash
-memory card, USB/Serial/Audio card, Compact Flash card,
-PCMCIA/IDE card and TFT-LCD card.
-
-To compile for Pangolin, you must issue the following commands:
-
- make pangolin_config
- make oldconfig
- make zImage
-
-Supported peripherals:
-- SA1110 serial port (UART1/UART2/UART3)
-- flash memory access
-- compact flash driver
-- UDA1341 sound driver
-- SA1100 LCD controller for 800x600 16bpp TFT-LCD
-- MQ-200 driver for 800x600 16bpp TFT-LCD
-- Penmount(touch panel) driver
-- PCMCIA driver
-- SMC91C94 LAN driver
-- IDE driver (experimental)
diff --git a/Documentation/arm/SA1100/Tifon b/Documentation/arm/SA1100/Tifon
deleted file mode 100644
index dd1934d9c851..000000000000
--- a/Documentation/arm/SA1100/Tifon
+++ /dev/null
@@ -1,7 +0,0 @@
-Tifon
------
-
-More info has to come...
-
-Contact: Peter Danielsson <peter.danielsson@era-t.ericsson.se>
-
diff --git a/Documentation/arm/SA1100/Yopy b/Documentation/arm/SA1100/Yopy
deleted file mode 100644
index e14f16d836ac..000000000000
--- a/Documentation/arm/SA1100/Yopy
+++ /dev/null
@@ -1,2 +0,0 @@
-See http://www.yopydeveloper.org for more.
-
diff --git a/Documentation/arm/SA1100/empeg b/Documentation/arm/SA1100/empeg
deleted file mode 100644
index 4ece4849a42c..000000000000
--- a/Documentation/arm/SA1100/empeg
+++ /dev/null
@@ -1,2 +0,0 @@
-See ../empeg/README
-
diff --git a/Documentation/arm/SA1100/nanoEngine b/Documentation/arm/SA1100/nanoEngine
deleted file mode 100644
index 48a7934f95f6..000000000000
--- a/Documentation/arm/SA1100/nanoEngine
+++ /dev/null
@@ -1,11 +0,0 @@
-nanoEngine
-----------
-
-"nanoEngine" is a SA1110 based single board computer from
-Bright Star Engineering Inc. See www.brightstareng.com/arm
-for more info.
-(Ref: Stuart Adams <sja@brightstareng.com>)
-
-Also visit Larry Doolittle's "Linux for the nanoEngine" site:
-http://www.brightstareng.com/arm/nanoeng.htm
-
diff --git a/Documentation/arm/SA1100/serial_UART b/Documentation/arm/SA1100/serial_UART
deleted file mode 100644
index a63966f1d083..000000000000
--- a/Documentation/arm/SA1100/serial_UART
+++ /dev/null
@@ -1,47 +0,0 @@
-The SA1100 serial port had its major/minor numbers officially assigned:
-
-> Date: Sun, 24 Sep 2000 21:40:27 -0700
-> From: H. Peter Anvin <hpa@transmeta.com>
-> To: Nicolas Pitre <nico@CAM.ORG>
-> Cc: Device List Maintainer <device@lanana.org>
-> Subject: Re: device
->
-> Okay. Note that device numbers 204 and 205 are used for "low density
-> serial devices", so you will have a range of minors on those majors (the
-> tty device layer handles this just fine, so you don't have to worry about
-> doing anything special.)
->
-> So your assignments are:
->
-> 204 char Low-density serial ports
-> 5 = /dev/ttySA0 SA1100 builtin serial port 0
-> 6 = /dev/ttySA1 SA1100 builtin serial port 1
-> 7 = /dev/ttySA2 SA1100 builtin serial port 2
->
-> 205 char Low-density serial ports (alternate device)
-> 5 = /dev/cusa0 Callout device for ttySA0
-> 6 = /dev/cusa1 Callout device for ttySA1
-> 7 = /dev/cusa2 Callout device for ttySA2
->
-
-You must create those inodes in /dev on the root filesystem used
-by your SA1100-based device:
-
- mknod ttySA0 c 204 5
- mknod ttySA1 c 204 6
- mknod ttySA2 c 204 7
- mknod cusa0 c 205 5
- mknod cusa1 c 205 6
- mknod cusa2 c 205 7
-
-In addition to the creation of the appropriate device nodes above, you
-must ensure your user space applications make use of the correct device
-name. The classic example is the content of the /etc/inittab file where
-you might have a getty process started on ttyS0. In this case:
-
-- replace occurrences of ttyS0 with ttySA0, ttyS1 with ttySA1, etc.
-
-- don't forget to add 'ttySA0', 'console', or the appropriate tty name
- in /etc/securetty for root to be allowed to login as well.
-
-
diff --git a/Documentation/arm/SPEAr/overview.txt b/Documentation/arm/SPEAr/overview.txt
deleted file mode 100644
index 1b049be6c84f..000000000000
--- a/Documentation/arm/SPEAr/overview.txt
+++ /dev/null
@@ -1,63 +0,0 @@
- SPEAr ARM Linux Overview
- ==========================
-
-Introduction
-------------
-
- SPEAr (Structured Processor Enhanced Architecture).
- weblink : http://www.st.com/spear
-
- The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
- supported by the 'spear' platform of ARM Linux. Currently SPEAr1310,
- SPEAr1340, SPEAr300, SPEAr310, SPEAr320 and SPEAr600 SOCs are supported.
-
- Hierarchy in SPEAr is as follows:
-
- SPEAr (Platform)
- - SPEAr3XX (3XX SOC series, based on ARM9)
- - SPEAr300 (SOC)
- - SPEAr300 Evaluation Board
- - SPEAr310 (SOC)
- - SPEAr310 Evaluation Board
- - SPEAr320 (SOC)
- - SPEAr320 Evaluation Board
- - SPEAr6XX (6XX SOC series, based on ARM9)
- - SPEAr600 (SOC)
- - SPEAr600 Evaluation Board
- - SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
- - SPEAr1310 (SOC)
- - SPEAr1310 Evaluation Board
- - SPEAr1340 (SOC)
- - SPEAr1340 Evaluation Board
-
- Configuration
- -------------
-
- A generic configuration is provided for each machine, and can be used as the
- default by
- make spear13xx_defconfig
- make spear3xx_defconfig
- make spear6xx_defconfig
-
- Layout
- ------
-
- The common files for multiple machine families (SPEAr3xx, SPEAr6xx and
- SPEAr13xx) are located in the platform code contained in arch/arm/plat-spear
- with headers in plat/.
-
- Each machine series have a directory with name arch/arm/mach-spear followed by
- series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
-
- Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c, for
- spear6xx is mach-spear6xx/spear6xx.c and for spear13xx family is
- mach-spear13xx/spear13xx.c. mach-spear* also contain soc/machine specific
- files, like spear1310.c, spear1340.c spear300.c, spear310.c, spear320.c and
- spear600.c. mach-spear* doesn't contains board specific files as they fully
- support Flattened Device Tree.
-
-
- Document Author
- ---------------
-
- Viresh Kumar <vireshk@kernel.org>, (c) 2010-2012 ST Microelectronics
diff --git a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt b/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
deleted file mode 100644
index fa968aa99d67..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/CPUfreq.txt
+++ /dev/null
@@ -1,75 +0,0 @@
- S3C24XX CPUfreq support
- =======================
-
-Introduction
-------------
-
- The S3C24XX series support a number of power saving systems, such as
- the ability to change the core, memory and peripheral operating
- frequencies. The core control is exported via the CPUFreq driver
- which has a number of different manual or automatic controls over the
- rate the core is running at.
-
- There are two forms of the driver depending on the specific CPU and
- how the clocks are arranged. The first implementation used as single
- PLL to feed the ARM, memory and peripherals via a series of dividers
- and muxes and this is the implementation that is documented here. A
- newer version where there is a separate PLL and clock divider for the
- ARM core is available as a separate driver.
-
-
-Layout
-------
-
- The code core manages the CPU specific drivers, any data that they
- need to register and the interface to the generic drivers/cpufreq
- system. Each CPU registers a driver to control the PLL, clock dividers
- and anything else associated with it. Any board that wants to use this
- framework needs to supply at least basic details of what is required.
-
- The core registers with drivers/cpufreq at init time if all the data
- necessary has been supplied.
-
-
-CPU support
------------
-
- The support for each CPU depends on the facilities provided by the
- SoC and the driver as each device has different PLL and clock chains
- associated with it.
-
-
-Slow Mode
----------
-
- The SLOW mode where the PLL is turned off altogether and the
- system is fed by the external crystal input is currently not
- supported.
-
-
-sysfs
------
-
- The core code exports extra information via sysfs in the directory
- devices/system/cpu/cpu0/arch-freq.
-
-
-Board Support
--------------
-
- Each board that wants to use the cpufreq code must register some basic
- information with the core driver to provide information about what the
- board requires and any restrictions being placed on it.
-
- The board needs to supply information about whether it needs the IO bank
- timings changing, any maximum frequency limits and information about the
- SDRAM refresh rate.
-
-
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2009 Simtec Electronics
-Licensed under GPLv2
diff --git a/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt b/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt
deleted file mode 100644
index b87292e05f2f..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/EB2410ITX.txt
+++ /dev/null
@@ -1,58 +0,0 @@
- Simtec Electronics EB2410ITX (BAST)
- ===================================
-
- http://www.simtec.co.uk/products/EB2410ITX/
-
-Introduction
-------------
-
- The EB2410ITX is a S3C2410 based development board with a variety of
- peripherals and expansion connectors. This board is also known by
- the shortened name of Bast.
-
-
-Configuration
--------------
-
- To set the default configuration, use `make bast_defconfig` which
- supports the commonly used features of this board.
-
-
-Support
--------
-
- Official support information can be found on the Simtec Electronics
- website, at the product page http://www.simtec.co.uk/products/EB2410ITX/
-
- Useful links:
-
- - Resources Page http://www.simtec.co.uk/products/EB2410ITX/resources.html
-
- - Board FAQ at http://www.simtec.co.uk/products/EB2410ITX/faq.html
-
- - Bootloader info http://www.simtec.co.uk/products/SWABLE/resources.html
- and FAQ http://www.simtec.co.uk/products/SWABLE/faq.html
-
-
-MTD
----
-
- The NAND and NOR support has been merged from the linux-mtd project.
- Any problems, see http://www.linux-mtd.infradead.org/ for more
- information or up-to-date versions of linux-mtd.
-
-
-IDE
----
-
- Both onboard IDE ports are supported, however there is no support for
- changing speed of devices, PIO Mode 4 capable drives should be used.
-
-
-Maintainers
------------
-
- This board is maintained by Simtec Electronics.
-
-
-Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/GPIO.txt b/Documentation/arm/Samsung-S3C24XX/GPIO.txt
deleted file mode 100644
index e8f918b96123..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/GPIO.txt
+++ /dev/null
@@ -1,171 +0,0 @@
- S3C24XX GPIO Control
- ====================
-
-Introduction
-------------
-
- The s3c2410 kernel provides an interface to configure and
- manipulate the state of the GPIO pins, and find out other
- information about them.
-
- There are a number of conditions attached to the configuration
- of the s3c2410 GPIO system, please read the Samsung provided
- data-sheet/users manual to find out the complete list.
-
- See Documentation/arm/Samsung/GPIO.txt for the core implementation.
-
-
-GPIOLIB
--------
-
- With the event of the GPIOLIB in drivers/gpio, support for some
- of the GPIO functions such as reading and writing a pin will
- be removed in favour of this common access method.
-
- Once all the extant drivers have been converted, the functions
- listed below will be removed (they may be marked as __deprecated
- in the near future).
-
- The following functions now either have a s3c_ specific variant
- or are merged into gpiolib. See the definitions in
- arch/arm/plat-samsung/include/plat/gpio-cfg.h:
-
- s3c2410_gpio_setpin() gpio_set_value() or gpio_direction_output()
- s3c2410_gpio_getpin() gpio_get_value() or gpio_direction_input()
- s3c2410_gpio_getirq() gpio_to_irq()
- s3c2410_gpio_cfgpin() s3c_gpio_cfgpin()
- s3c2410_gpio_getcfg() s3c_gpio_getcfg()
- s3c2410_gpio_pullup() s3c_gpio_setpull()
-
-
-GPIOLIB conversion
-------------------
-
-If you need to convert your board or driver to use gpiolib from the phased
-out s3c2410 API, then here are some notes on the process.
-
-1) If your board is exclusively using an GPIO, say to control peripheral
- power, then it will require to claim the gpio with gpio_request() before
- it can use it.
-
- It is recommended to check the return value, with at least WARN_ON()
- during initialisation.
-
-2) The s3c2410_gpio_cfgpin() can be directly replaced with s3c_gpio_cfgpin()
- as they have the same arguments, and can either take the pin specific
- values, or the more generic special-function-number arguments.
-
-3) s3c2410_gpio_pullup() changes have the problem that while the
- s3c2410_gpio_pullup(x, 1) can be easily translated to the
- s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
- are not so easy.
-
- The s3c2410_gpio_pullup(x, 0) case enables the pull-up (or in the case
- of some of the devices, a pull-down) and as such the new API distinguishes
- between the UP and DOWN case. There is currently no 'just turn on' setting
- which may be required if this becomes a problem.
-
-4) s3c2410_gpio_setpin() can be replaced by gpio_set_value(), the old call
- does not implicitly configure the relevant gpio to output. The gpio
- direction should be changed before using gpio_set_value().
-
-5) s3c2410_gpio_getpin() is replaceable by gpio_get_value() if the pin
- has been set to input. It is currently unknown what the behaviour is
- when using gpio_get_value() on an output pin (s3c2410_gpio_getpin
- would return the value the pin is supposed to be outputting).
-
-6) s3c2410_gpio_getirq() should be directly replaceable with the
- gpio_to_irq() call.
-
-The s3c2410_gpio and gpio_ calls have always operated on the same gpio
-numberspace, so there is no problem with converting the gpio numbering
-between the calls.
-
-
-Headers
--------
-
- See arch/arm/mach-s3c24xx/include/mach/regs-gpio.h for the list
- of GPIO pins, and the configuration values for them. This
- is included by using #include <mach/regs-gpio.h>
-
-
-PIN Numbers
------------
-
- Each pin has an unique number associated with it in regs-gpio.h,
- e.g. S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
- the GPIO functions which pin is to be used.
-
- With the conversion to gpiolib, there is no longer a direct conversion
- from gpio pin number to register base address as in earlier kernels. This
- is due to the number space required for newer SoCs where the later
- GPIOs are not contiguous.
-
-
-Configuring a pin
------------------
-
- The following function allows the configuration of a given pin to
- be changed.
-
- void s3c_gpio_cfgpin(unsigned int pin, unsigned int function);
-
- e.g.:
-
- s3c_gpio_cfgpin(S3C2410_GPA(0), S3C_GPIO_SFN(1));
- s3c_gpio_cfgpin(S3C2410_GPE(8), S3C_GPIO_SFN(2));
-
- which would turn GPA(0) into the lowest Address line A0, and set
- GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
-
-
-Reading the current configuration
----------------------------------
-
- The current configuration of a pin can be read by using standard
- gpiolib function:
-
- s3c_gpio_getcfg(unsigned int pin);
-
- The return value will be from the same set of values which can be
- passed to s3c_gpio_cfgpin().
-
-
-Configuring a pull-up resistor
-------------------------------
-
- A large proportion of the GPIO pins on the S3C2410 can have weak
- pull-up resistors enabled. This can be configured by the following
- function:
-
- void s3c_gpio_setpull(unsigned int pin, unsigned int to);
-
- Where the to value is S3C_GPIO_PULL_NONE to set the pull-up off,
- and S3C_GPIO_PULL_UP to enable the specified pull-up. Any other
- values are currently undefined.
-
-
-Getting and setting the state of a PIN
---------------------------------------
-
- These calls are now implemented by the relevant gpiolib calls, convert
- your board or driver to use gpiolib.
-
-
-Getting the IRQ number associated with a PIN
---------------------------------------------
-
- A standard gpiolib function can map the given pin number to an IRQ
- number to pass to the IRQ system.
-
- int gpio_to_irq(unsigned int pin);
-
- Note, not all pins have an IRQ.
-
-
-Author
--------
-
-Ben Dooks, 03 October 2004
-Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/H1940.txt b/Documentation/arm/Samsung-S3C24XX/H1940.txt
deleted file mode 100644
index b738859b1fc0..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/H1940.txt
+++ /dev/null
@@ -1,40 +0,0 @@
- HP IPAQ H1940
- =============
-
-http://www.handhelds.org/projects/h1940.html
-
-Introduction
-------------
-
- The HP H1940 is a S3C2410 based handheld device, with
- bluetooth connectivity.
-
-
-Support
--------
-
- A variety of information is available
-
- handhelds.org project page:
-
- http://www.handhelds.org/projects/h1940.html
-
- handhelds.org wiki page:
-
- http://handhelds.org/moin/moin.cgi/HpIpaqH1940
-
- Herbert Pötzl pages:
-
- http://vserver.13thfloor.at/H1940/
-
-
-Maintainers
------------
-
- This project is being maintained and developed by a variety
- of people, including Ben Dooks, Arnaud Patard, and Herbert Pötzl.
-
- Thanks to the many others who have also provided support.
-
-
-(c) 2005 Ben Dooks
diff --git a/Documentation/arm/Samsung-S3C24XX/NAND.txt b/Documentation/arm/Samsung-S3C24XX/NAND.txt
deleted file mode 100644
index bc478a3409b8..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/NAND.txt
+++ /dev/null
@@ -1,30 +0,0 @@
- S3C24XX NAND Support
- ====================
-
-Introduction
-------------
-
-Small Page NAND
----------------
-
-The driver uses a 512 byte (1 page) ECC code for this setup. The
-ECC code is not directly compatible with the default kernel ECC
-code, so the driver enforces its own OOB layout and ECC parameters
-
-Large Page NAND
----------------
-
-The driver is capable of handling NAND flash with a 2KiB page
-size, with support for hardware ECC generation and correction.
-
-Unlike the 512byte page mode, the driver generates ECC data for
-each 256 byte block in an 2KiB page. This means that more than
-one error in a page can be rectified. It also means that the
-OOB layout remains the default kernel layout for these flashes.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2007 Simtec Electronics
-
diff --git a/Documentation/arm/Samsung-S3C24XX/Overview.txt b/Documentation/arm/Samsung-S3C24XX/Overview.txt
deleted file mode 100644
index 00d3c3141e21..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/Overview.txt
+++ /dev/null
@@ -1,318 +0,0 @@
- S3C24XX ARM Linux Overview
- ==========================
-
-
-
-Introduction
-------------
-
- The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported
- by the 's3c2410' architecture of ARM Linux. Currently the S3C2410,
- S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443 and S3C2450 devices
- are supported.
-
- Support for the S3C2400 and S3C24A0 series was never completed and the
- corresponding code has been removed after a while. If someone wishes to
- revive this effort, partial support can be retrieved from earlier Linux
- versions.
-
- The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
- included under the arch/arm/mach-s3c2416 directory. Note, while core
- support for these SoCs is in, work on some of the extra peripherals
- and extra interrupts is still ongoing.
-
-
-Configuration
--------------
-
- A generic S3C2410 configuration is provided, and can be used as the
- default by `make s3c2410_defconfig`. This configuration has support
- for all the machines, and the commonly used features on them.
-
- Certain machines may have their own default configurations as well,
- please check the machine specific documentation.
-
-
-Layout
-------
-
- The core support files are located in the platform code contained in
- arch/arm/plat-s3c24xx with headers in include/asm-arm/plat-s3c24xx.
- This directory should be kept to items shared between the platform
- code (arch/arm/plat-s3c24xx) and the arch/arm/mach-s3c24* code.
-
- Each cpu has a directory with the support files for it, and the
- machines that carry the device. For example S3C2410 is contained
- in arch/arm/mach-s3c2410 and S3C2440 in arch/arm/mach-s3c2440
-
- Register, kernel and platform data definitions are held in the
- arch/arm/mach-s3c2410 directory./include/mach
-
-arch/arm/plat-s3c24xx:
-
- Files in here are either common to all the s3c24xx family,
- or are common to only some of them with names to indicate this
- status. The files that are not common to all are generally named
- with the initial cpu they support in the series to ensure a short
- name without any possibility of confusion with newer devices.
-
- As an example, initially s3c244x would cover s3c2440 and s3c2442, but
- with the s3c2443 which does not share many of the same drivers in
- this directory, the name becomes invalid. We stick to s3c2440-<x>
- to indicate a driver that is s3c2440 and s3c2442 compatible.
-
- This does mean that to find the status of any given SoC, a number
- of directories may need to be searched.
-
-
-Machines
---------
-
- The currently supported machines are as follows:
-
- Simtec Electronics EB2410ITX (BAST)
-
- A general purpose development board, see EB2410ITX.txt for further
- details
-
- Simtec Electronics IM2440D20 (Osiris)
-
- CPU Module from Simtec Electronics, with a S3C2440A CPU, nand flash
- and a PCMCIA controller.
-
- Samsung SMDK2410
-
- Samsung's own development board, geared for PDA work.
-
- Samsung/Aiji SMDK2412
-
- The S3C2412 version of the SMDK2440.
-
- Samsung/Aiji SMDK2413
-
- The S3C2412 version of the SMDK2440.
-
- Samsung/Meritech SMDK2440
-
- The S3C2440 compatible version of the SMDK2440, which has the
- option of an S3C2440 or S3C2442 CPU module.
-
- Thorcom VR1000
-
- Custom embedded board
-
- HP IPAQ 1940
-
- Handheld (IPAQ), available in several varieties
-
- HP iPAQ rx3715
-
- S3C2440 based IPAQ, with a number of variations depending on
- features shipped.
-
- Acer N30
-
- A S3C2410 based PDA from Acer. There is a Wiki page at
- http://handhelds.org/moin/moin.cgi/AcerN30Documentation .
-
- AML M5900
-
- American Microsystems' M5900
-
- Nex Vision Nexcoder
- Nex Vision Otom
-
- Two machines by Nex Vision
-
-
-Adding New Machines
--------------------
-
- The architecture has been designed to support as many machines as can
- be configured for it in one kernel build, and any future additions
- should keep this in mind before altering items outside of their own
- machine files.
-
- Machine definitions should be kept in linux/arch/arm/mach-s3c2410,
- and there are a number of examples that can be looked at.
-
- Read the kernel patch submission policies as well as the
- Documentation/arm directory before submitting patches. The
- ARM kernel series is managed by Russell King, and has a patch system
- located at http://www.arm.linux.org.uk/developer/patches/
- as well as mailing lists that can be found from the same site.
-
- As a courtesy, please notify <ben-linux@fluff.org> of any new
- machines or other modifications.
-
- Any large scale modifications, or new drivers should be discussed
- on the ARM kernel mailing list (linux-arm-kernel) before being
- attempted. See http://www.arm.linux.org.uk/mailinglists/ for the
- mailing list information.
-
-
-I2C
----
-
- The hardware I2C core in the CPU is supported in single master
- mode, and can be configured via platform data.
-
-
-RTC
----
-
- Support for the onboard RTC unit, including alarm function.
-
- This has recently been upgraded to use the new RTC core,
- and the module has been renamed to rtc-s3c to fit in with
- the new rtc naming scheme.
-
-
-Watchdog
---------
-
- The onchip watchdog is available via the standard watchdog
- interface.
-
-
-NAND
-----
-
- The current kernels now have support for the s3c2410 NAND
- controller. If there are any problems the latest linux-mtd
- code can be found from http://www.linux-mtd.infradead.org/
-
- For more information see Documentation/arm/Samsung-S3C24XX/NAND.txt
-
-
-SD/MMC
-------
-
- The SD/MMC hardware pre S3C2443 is supported in the current
- kernel, the driver is drivers/mmc/host/s3cmci.c and supports
- 1 and 4 bit SD or MMC cards.
-
- The SDIO behaviour of this driver has not been fully tested. There is no
- current support for hardware SDIO interrupts.
-
-
-Serial
-------
-
- The s3c2410 serial driver provides support for the internal
- serial ports. These devices appear as /dev/ttySAC0 through 3.
-
- To create device nodes for these, use the following commands
-
- mknod ttySAC0 c 204 64
- mknod ttySAC1 c 204 65
- mknod ttySAC2 c 204 66
-
-
-GPIO
-----
-
- The core contains support for manipulating the GPIO, see the
- documentation in GPIO.txt in the same directory as this file.
-
- Newer kernels carry GPIOLIB, and support is being moved towards
- this with some of the older support in line to be removed.
-
- As of v2.6.34, the move towards using gpiolib support is almost
- complete, and very little of the old calls are left.
-
- See Documentation/arm/Samsung-S3C24XX/GPIO.txt for the S3C24XX specific
- support and Documentation/arm/Samsung/GPIO.txt for the core Samsung
- implementation.
-
-
-Clock Management
-----------------
-
- The core provides the interface defined in the header file
- include/asm-arm/hardware/clock.h, to allow control over the
- various clock units
-
-
-Suspend to RAM
---------------
-
- For boards that provide support for suspend to RAM, the
- system can be placed into low power suspend.
-
- See Suspend.txt for more information.
-
-
-SPI
----
-
- SPI drivers are available for both the in-built hardware
- (although there is no DMA support yet) and a generic
- GPIO based solution.
-
-
-LEDs
-----
-
- There is support for GPIO based LEDs via a platform driver
- in the LED subsystem.
-
-
-Platform Data
--------------
-
- Whenever a device has platform specific data that is specified
- on a per-machine basis, care should be taken to ensure the
- following:
-
- 1) that default data is not left in the device to confuse the
- driver if a machine does not set it at startup
-
- 2) the data should (if possible) be marked as __initdata,
- to ensure that the data is thrown away if the machine is
- not the one currently in use.
-
- The best way of doing this is to make a function that
- kmalloc()s an area of memory, and copies the __initdata
- and then sets the relevant device's platform data. Making
- the function `__init` takes care of ensuring it is discarded
- with the rest of the initialisation code
-
- static __init void s3c24xx_xxx_set_platdata(struct xxx_data *pd)
- {
- struct s3c2410_xxx_mach_info *npd;
-
- npd = kmalloc(sizeof(struct s3c2410_xxx_mach_info), GFP_KERNEL);
- if (npd) {
- memcpy(npd, pd, sizeof(struct s3c2410_xxx_mach_info));
- s3c_device_xxx.dev.platform_data = npd;
- } else {
- printk(KERN_ERR "no memory for xxx platform data\n");
- }
- }
-
- Note, since the code is marked as __init, it should not be
- exported outside arch/arm/mach-s3c2410/, or exported to
- modules via EXPORT_SYMBOL() and related functions.
-
-
-Port Contributors
------------------
-
- Ben Dooks (BJD)
- Vincent Sanders
- Herbert Potzl
- Arnaud Patard (RTP)
- Roc Wu
- Klaus Fetscher
- Dimitry Andric
- Shannon Holland
- Guillaume Gourat (NexVision)
- Christer Weinigel (wingel) (Acer N30)
- Lucas Correia Villa Real (S3C2400 port)
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2004-2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/S3C2412.txt b/Documentation/arm/Samsung-S3C24XX/S3C2412.txt
deleted file mode 100644
index dc1fd362d3c1..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/S3C2412.txt
+++ /dev/null
@@ -1,120 +0,0 @@
- S3C2412 ARM Linux Overview
- ==========================
-
-Introduction
-------------
-
- The S3C2412 is part of the S3C24XX range of ARM9 System-on-Chip CPUs
- from Samsung. This part has an ARM926-EJS core, capable of running up
- to 266MHz (see data-sheet for more information)
-
-
-Clock
------
-
- The core clock code provides a set of clocks to the drivers, and allows
- for source selection and a number of other features.
-
-
-Power
------
-
- No support for suspend/resume to RAM in the current system.
-
-
-DMA
----
-
- No current support for DMA.
-
-
-GPIO
-----
-
- There is support for setting the GPIO to input/output/special function
- and reading or writing to them.
-
-
-UART
-----
-
- The UART hardware is similar to the S3C2440, and is supported by the
- s3c2410 driver in the drivers/serial directory.
-
-
-NAND
-----
-
- The NAND hardware is similar to the S3C2440, and is supported by the
- s3c2410 driver in the drivers/mtd/nand/raw directory.
-
-
-USB Host
---------
-
- The USB hardware is similar to the S3C2410, with extended clock source
- control. The OHCI portion is supported by the ohci-s3c2410 driver, and
- the clock control selection is supported by the core clock code.
-
-
-USB Device
-----------
-
- No current support in the kernel
-
-
-IRQs
-----
-
- All the standard, and external interrupt sources are supported. The
- extra sub-sources are not yet supported.
-
-
-RTC
----
-
- The RTC hardware is similar to the S3C2410, and is supported by the
- s3c2410-rtc driver.
-
-
-Watchdog
---------
-
- The watchdog hardware is the same as the S3C2410, and is supported by
- the s3c2410_wdt driver.
-
-
-MMC/SD/SDIO
------------
-
- No current support for the MMC/SD/SDIO block.
-
-IIC
----
-
- The IIC hardware is the same as the S3C2410, and is supported by the
- i2c-s3c24xx driver.
-
-
-IIS
----
-
- No current support for the IIS interface.
-
-
-SPI
----
-
- No current support for the SPI interfaces.
-
-
-ATA
----
-
- No current support for the on-board ATA block.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/S3C2413.txt b/Documentation/arm/Samsung-S3C24XX/S3C2413.txt
deleted file mode 100644
index 909bdc7dd7b5..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/S3C2413.txt
+++ /dev/null
@@ -1,21 +0,0 @@
- S3C2413 ARM Linux Overview
- ==========================
-
-Introduction
-------------
-
- The S3C2413 is an extended version of the S3C2412, with an camera
- interface and mobile DDR memory support. See the S3C2412 support
- documentation for more information.
-
-
-Camera Interface
----------------
-
- This block is currently not supported.
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt b/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt
deleted file mode 100644
index 429390bd4684..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/SMDK2440.txt
+++ /dev/null
@@ -1,56 +0,0 @@
- Samsung/Meritech SMDK2440
- =========================
-
-Introduction
-------------
-
- The SMDK2440 is a two part evaluation board for the Samsung S3C2440
- processor. It includes support for LCD, SmartMedia, Audio, SD and
- 10MBit Ethernet, and expansion headers for various signals, including
- the camera and unused GPIO.
-
-
-Configuration
--------------
-
- To set the default configuration, use `make smdk2440_defconfig` which
- will configure the common features of this board, or use
- `make s3c2410_config` to include support for all s3c2410/s3c2440 machines
-
-
-Support
--------
-
- Ben Dooks' SMDK2440 site at http://www.fluff.org/ben/smdk2440/ which
- includes linux based USB download tools.
-
- Some of the h1940 patches that can be found from the H1940 project
- site at http://www.handhelds.org/projects/h1940.html can also be
- applied to this board.
-
-
-Peripherals
------------
-
- There is no current support for any of the extra peripherals on the
- base-board itself.
-
-
-MTD
----
-
- The NAND flash should be supported by the in kernel MTD NAND support,
- NOR flash will be added later.
-
-
-Maintainers
------------
-
- This board is being maintained by Ben Dooks, for more info, see
- http://www.fluff.org/ben/smdk2440/
-
- Many thanks to Dimitry Andric of TomTom for the loan of the SMDK2440,
- and to Simtec Electronics for allowing me time to work on this.
-
-
-(c) 2004 Ben Dooks
diff --git a/Documentation/arm/Samsung-S3C24XX/Suspend.txt b/Documentation/arm/Samsung-S3C24XX/Suspend.txt
deleted file mode 100644
index cb4f0c0cdf9d..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/Suspend.txt
+++ /dev/null
@@ -1,137 +0,0 @@
- S3C24XX Suspend Support
- =======================
-
-
-Introduction
-------------
-
- The S3C24XX supports a low-power suspend mode, where the SDRAM is kept
- in Self-Refresh mode, and all but the essential peripheral blocks are
- powered down. For more information on how this works, please look
- at the relevant CPU datasheet from Samsung.
-
-
-Requirements
-------------
-
- 1) A bootloader that can support the necessary resume operation
-
- 2) Support for at least 1 source for resume
-
- 3) CONFIG_PM enabled in the kernel
-
- 4) Any peripherals that are going to be powered down at the same
- time require suspend/resume support.
-
-
-Resuming
---------
-
- The S3C2410 user manual defines the process of sending the CPU to
- sleep and how it resumes. The default behaviour of the Linux code
- is to set the GSTATUS3 register to the physical address of the
- code to resume Linux operation.
-
- GSTATUS4 is currently left alone by the sleep code, and is free to
- use for any other purposes (for example, the EB2410ITX uses this to
- save memory configuration in).
-
-
-Machine Support
----------------
-
- The machine specific functions must call the s3c_pm_init() function
- to say that its bootloader is capable of resuming. This can be as
- simple as adding the following to the machine's definition:
-
- INITMACHINE(s3c_pm_init)
-
- A board can do its own setup before calling s3c_pm_init, if it
- needs to setup anything else for power management support.
-
- There is currently no support for over-riding the default method of
- saving the resume address, if your board requires it, then contact
- the maintainer and discuss what is required.
-
- Note, the original method of adding an late_initcall() is wrong,
- and will end up initialising all compiled machines' pm init!
-
- The following is an example of code used for testing wakeup from
- an falling edge on IRQ_EINT0:
-
-
-static irqreturn_t button_irq(int irq, void *pw)
-{
- return IRQ_HANDLED;
-}
-
-statuc void __init machine_init(void)
-{
- ...
-
- request_irq(IRQ_EINT0, button_irq, IRQF_TRIGGER_FALLING,
- "button-irq-eint0", NULL);
-
- enable_irq_wake(IRQ_EINT0);
-
- s3c_pm_init();
-}
-
-
-Debugging
----------
-
- There are several important things to remember when using PM suspend:
-
- 1) The uart drivers will disable the clocks to the UART blocks when
- suspending, which means that use of printascii() or similar direct
- access to the UARTs will cause the debug to stop.
-
- 2) While the pm code itself will attempt to re-enable the UART clocks,
- care should be taken that any external clock sources that the UARTs
- rely on are still enabled at that point.
-
- 3) If any debugging is placed in the resume path, then it must have the
- relevant clocks and peripherals setup before use (ie, bootloader).
-
- For example, if you transmit a character from the UART, the baud
- rate and uart controls must be setup beforehand.
-
-
-Configuration
--------------
-
- The S3C2410 specific configuration in `System Type` defines various
- aspects of how the S3C2410 suspend and resume support is configured
-
- `S3C2410 PM Suspend debug`
-
- This option prints messages to the serial console before and after
- the actual suspend, giving detailed information on what is
- happening
-
-
- `S3C2410 PM Suspend Memory CRC`
-
- Allows the entire memory to be checksummed before and after the
- suspend to see if there has been any corruption of the contents.
-
- Note, the time to calculate the CRC is dependent on the CPU speed
- and the size of memory. For an 64Mbyte RAM area on an 200MHz
- S3C2410, this can take approximately 4 seconds to complete.
-
- This support requires the CRC32 function to be enabled.
-
-
- `S3C2410 PM Suspend CRC Chunksize (KiB)`
-
- Defines the size of memory each CRC chunk covers. A smaller value
- will mean that the CRC data block will take more memory, but will
- identify any faults with better precision
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2004 Simtec Electronics
-
diff --git a/Documentation/arm/Samsung-S3C24XX/USB-Host.txt b/Documentation/arm/Samsung-S3C24XX/USB-Host.txt
deleted file mode 100644
index f82b1faefad5..000000000000
--- a/Documentation/arm/Samsung-S3C24XX/USB-Host.txt
+++ /dev/null
@@ -1,93 +0,0 @@
- S3C24XX USB Host support
- ========================
-
-
-
-Introduction
-------------
-
- This document details the S3C2410/S3C2440 in-built OHCI USB host support.
-
-Configuration
--------------
-
- Enable at least the following kernel options:
-
- menuconfig:
-
- Device Drivers --->
- USB support --->
- <*> Support for Host-side USB
- <*> OHCI HCD support
-
-
- .config:
- CONFIG_USB
- CONFIG_USB_OHCI_HCD
-
-
- Once these options are configured, the standard set of USB device
- drivers can be configured and used.
-
-
-Board Support
--------------
-
- The driver attaches to a platform device, which will need to be
- added by the board specific support file in linux/arch/arm/mach-s3c2410,
- such as mach-bast.c or mach-smdk2410.c
-
- The platform device's platform_data field is only needed if the
- board implements extra power control or over-current monitoring.
-
- The OHCI driver does not ensure the state of the S3C2410's MISCCTRL
- register, so if both ports are to be used for the host, then it is
- the board support file's responsibility to ensure that the second
- port is configured to be connected to the OHCI core.
-
-
-Platform Data
--------------
-
- See arch/arm/mach-s3c2410/include/mach/usb-control.h for the
- descriptions of the platform device data. An implementation
- can be found in linux/arch/arm/mach-s3c2410/usb-simtec.c .
-
- The `struct s3c2410_hcd_info` contains a pair of functions
- that get called to enable over-current detection, and to
- control the port power status.
-
- The ports are numbered 0 and 1.
-
- power_control:
-
- Called to enable or disable the power on the port.
-
- enable_oc:
-
- Called to enable or disable the over-current monitoring.
- This should claim or release the resources being used to
- check the power condition on the port, such as an IRQ.
-
- report_oc:
-
- The OHCI driver fills this field in for the over-current code
- to call when there is a change to the over-current state on
- an port. The ports argument is a bitmask of 1 bit per port,
- with bit X being 1 for an over-current on port X.
-
- The function s3c2410_usb_report_oc() has been provided to
- ensure this is called correctly.
-
- port[x]:
-
- This is struct describes each port, 0 or 1. The platform driver
- should set the flags field of each port to S3C_HCDFLG_USED if
- the port is enabled.
-
-
-
-Document Author
----------------
-
-Ben Dooks, Copyright 2005 Simtec Electronics
diff --git a/Documentation/arm/Samsung/Bootloader-interface.txt b/Documentation/arm/Samsung/Bootloader-interface.txt
deleted file mode 100644
index d17ed518a7ea..000000000000
--- a/Documentation/arm/Samsung/Bootloader-interface.txt
+++ /dev/null
@@ -1,68 +0,0 @@
- Interface between kernel and boot loaders on Exynos boards
- ==========================================================
-
-Author: Krzysztof Kozlowski
-Date : 6 June 2015
-
-The document tries to describe currently used interface between Linux kernel
-and boot loaders on Samsung Exynos based boards. This is not a definition
-of interface but rather a description of existing state, a reference
-for information purpose only.
-
-In the document "boot loader" means any of following: U-boot, proprietary
-SBOOT or any other firmware for ARMv7 and ARMv8 initializing the board before
-executing kernel.
-
-
-1. Non-Secure mode
-
-Address: sysram_ns_base_addr
-Offset Value Purpose
-=============================================================================
-0x08 exynos_cpu_resume_ns, mcpm_entry_point System suspend
-0x0c 0x00000bad (Magic cookie) System suspend
-0x1c exynos4_secondary_startup Secondary CPU boot
-0x1c + 4*cpu exynos4_secondary_startup (Exynos4412) Secondary CPU boot
-0x20 0xfcba0d10 (Magic cookie) AFTR
-0x24 exynos_cpu_resume_ns AFTR
-0x28 + 4*cpu 0x8 (Magic cookie, Exynos3250) AFTR
-0x28 0x0 or last value during resume (Exynos542x) System suspend
-
-
-2. Secure mode
-
-Address: sysram_base_addr
-Offset Value Purpose
-=============================================================================
-0x00 exynos4_secondary_startup Secondary CPU boot
-0x04 exynos4_secondary_startup (Exynos542x) Secondary CPU boot
-4*cpu exynos4_secondary_startup (Exynos4412) Secondary CPU boot
-0x20 exynos_cpu_resume (Exynos4210 r1.0) AFTR
-0x24 0xfcba0d10 (Magic cookie, Exynos4210 r1.0) AFTR
-
-Address: pmu_base_addr
-Offset Value Purpose
-=============================================================================
-0x0800 exynos_cpu_resume AFTR, suspend
-0x0800 mcpm_entry_point (Exynos542x with MCPM) AFTR, suspend
-0x0804 0xfcba0d10 (Magic cookie) AFTR
-0x0804 0x00000bad (Magic cookie) System suspend
-0x0814 exynos4_secondary_startup (Exynos4210 r1.1) Secondary CPU boot
-0x0818 0xfcba0d10 (Magic cookie, Exynos4210 r1.1) AFTR
-0x081C exynos_cpu_resume (Exynos4210 r1.1) AFTR
-
-
-3. Other (regardless of secure/non-secure mode)
-
-Address: pmu_base_addr
-Offset Value Purpose
-=============================================================================
-0x0908 Non-zero Secondary CPU boot up indicator
- on Exynos3250 and Exynos542x
-
-
-4. Glossary
-
-AFTR - ARM Off Top Running, a low power mode, Cortex cores and many other
-modules are power gated, except the TOP modules
-MCPM - Multi-Cluster Power Management
diff --git a/Documentation/arm/Samsung/GPIO.txt b/Documentation/arm/Samsung/GPIO.txt
deleted file mode 100644
index 795adfd88081..000000000000
--- a/Documentation/arm/Samsung/GPIO.txt
+++ /dev/null
@@ -1,40 +0,0 @@
- Samsung GPIO implementation
- ===========================
-
-Introduction
-------------
-
-This outlines the Samsung GPIO implementation and the architecture
-specific calls provided alongside the drivers/gpio core.
-
-
-S3C24XX (Legacy)
-----------------
-
-See Documentation/arm/Samsung-S3C24XX/GPIO.txt for more information
-about these devices. Their implementation has been brought into line
-with the core samsung implementation described in this document.
-
-
-GPIOLIB integration
--------------------
-
-The gpio implementation uses gpiolib as much as possible, only providing
-specific calls for the items that require Samsung specific handling, such
-as pin special-function or pull resistor control.
-
-GPIO numbering is synchronised between the Samsung and gpiolib system.
-
-
-PIN configuration
------------------
-
-Pin configuration is specific to the Samsung architecture, with each SoC
-registering the necessary information for the core gpio configuration
-implementation to configure pins as necessary.
-
-The s3c_gpio_cfgpin() and s3c_gpio_setpull() provide the means for a
-driver or machine to change gpio configuration.
-
-See arch/arm/plat-samsung/include/plat/gpio-cfg.h for more information
-on these functions.
diff --git a/Documentation/arm/Samsung/Overview.txt b/Documentation/arm/Samsung/Overview.txt
deleted file mode 100644
index 8f7309bad460..000000000000
--- a/Documentation/arm/Samsung/Overview.txt
+++ /dev/null
@@ -1,86 +0,0 @@
- Samsung ARM Linux Overview
- ==========================
-
-Introduction
-------------
-
- The Samsung range of ARM SoCs spans many similar devices, from the initial
- ARM9 through to the newest ARM cores. This document shows an overview of
- the current kernel support, how to use it and where to find the code
- that supports this.
-
- The currently supported SoCs are:
-
- - S3C24XX: See Documentation/arm/Samsung-S3C24XX/Overview.txt for full list
- - S3C64XX: S3C6400 and S3C6410
- - S5PC110 / S5PV210
-
-
-S3C24XX Systems
----------------
-
- There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which
- deals with the architecture and drivers specific to these devices.
-
- See Documentation/arm/Samsung-S3C24XX/Overview.txt for more information
- on the implementation details and specific support.
-
-
-Configuration
--------------
-
- A number of configurations are supplied, as there is no current way of
- unifying all the SoCs into one kernel.
-
- s5pc110_defconfig - S5PC110 specific default configuration
- s5pv210_defconfig - S5PV210 specific default configuration
-
-
-Layout
-------
-
- The directory layout is currently being restructured, and consists of
- several platform directories and then the machine specific directories
- of the CPUs being built for.
-
- plat-samsung provides the base for all the implementations, and is the
- last in the line of include directories that are processed for the build
- specific information. It contains the base clock, GPIO and device definitions
- to get the system running.
-
- plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
-
- plat-s5p is for s5p specific builds, and contains common support for the
- S5P specific systems. Not all S5Ps use all the features in this directory
- due to differences in the hardware.
-
-
-Layout changes
---------------
-
- The old plat-s3c and plat-s5pc1xx directories have been removed, with
- support moved to either plat-samsung or plat-s5p as necessary. These moves
- where to simplify the include and dependency issues involved with having
- so many different platform directories.
-
-
-Port Contributors
------------------
-
- Ben Dooks (BJD)
- Vincent Sanders
- Herbert Potzl
- Arnaud Patard (RTP)
- Roc Wu
- Klaus Fetscher
- Dimitry Andric
- Shannon Holland
- Guillaume Gourat (NexVision)
- Christer Weinigel (wingel) (Acer N30)
- Lucas Correia Villa Real (S3C2400 port)
-
-
-Document Author
----------------
-
-Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>
diff --git a/Documentation/arm/Setup b/Documentation/arm/Setup
deleted file mode 100644
index 0cb1e64bde80..000000000000
--- a/Documentation/arm/Setup
+++ /dev/null
@@ -1,129 +0,0 @@
-Kernel initialisation parameters on ARM Linux
----------------------------------------------
-
-The following document describes the kernel initialisation parameter
-structure, otherwise known as 'struct param_struct' which is used
-for most ARM Linux architectures.
-
-This structure is used to pass initialisation parameters from the
-kernel loader to the Linux kernel proper, and may be short lived
-through the kernel initialisation process. As a general rule, it
-should not be referenced outside of arch/arm/kernel/setup.c:setup_arch().
-
-There are a lot of parameters listed in there, and they are described
-below:
-
- page_size
-
- This parameter must be set to the page size of the machine, and
- will be checked by the kernel.
-
- nr_pages
-
- This is the total number of pages of memory in the system. If
- the memory is banked, then this should contain the total number
- of pages in the system.
-
- If the system contains separate VRAM, this value should not
- include this information.
-
- ramdisk_size
-
- This is now obsolete, and should not be used.
-
- flags
-
- Various kernel flags, including:
- bit 0 - 1 = mount root read only
- bit 1 - unused
- bit 2 - 0 = load ramdisk
- bit 3 - 0 = prompt for ramdisk
-
- rootdev
-
- major/minor number pair of device to mount as the root filesystem.
-
- video_num_cols
- video_num_rows
-
- These two together describe the character size of the dummy console,
- or VGA console character size. They should not be used for any other
- purpose.
-
- It's generally a good idea to set these to be either standard VGA, or
- the equivalent character size of your fbcon display. This then allows
- all the bootup messages to be displayed correctly.
-
- video_x
- video_y
-
- This describes the character position of cursor on VGA console, and
- is otherwise unused. (should not be used for other console types, and
- should not be used for other purposes).
-
- memc_control_reg
-
- MEMC chip control register for Acorn Archimedes and Acorn A5000
- based machines. May be used differently by different architectures.
-
- sounddefault
-
- Default sound setting on Acorn machines. May be used differently by
- different architectures.
-
- adfsdrives
-
- Number of ADFS/MFM disks. May be used differently by different
- architectures.
-
- bytes_per_char_h
- bytes_per_char_v
-
- These are now obsolete, and should not be used.
-
- pages_in_bank[4]
-
- Number of pages in each bank of the systems memory (used for RiscPC).
- This is intended to be used on systems where the physical memory
- is non-contiguous from the processors point of view.
-
- pages_in_vram
-
- Number of pages in VRAM (used on Acorn RiscPC). This value may also
- be used by loaders if the size of the video RAM can't be obtained
- from the hardware.
-
- initrd_start
- initrd_size
-
- This describes the kernel virtual start address and size of the
- initial ramdisk.
-
- rd_start
-
- Start address in sectors of the ramdisk image on a floppy disk.
-
- system_rev
-
- system revision number.
-
- system_serial_low
- system_serial_high
-
- system 64-bit serial number
-
- mem_fclk_21285
-
- The speed of the external oscillator to the 21285 (footbridge),
- which control's the speed of the memory bus, timer & serial port.
- Depending upon the speed of the cpu its value can be between
- 0-66 MHz. If no params are passed or a value of zero is passed,
- then a value of 50 Mhz is the default on 21285 architectures.
-
- paths[8][128]
-
- These are now obsolete, and should not be used.
-
- commandline
-
- Kernel command line parameters. Details can be found elsewhere.
diff --git a/Documentation/arm/VFP/release-notes.txt b/Documentation/arm/VFP/release-notes.txt
deleted file mode 100644
index 28a2795705ca..000000000000
--- a/Documentation/arm/VFP/release-notes.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Release notes for Linux Kernel VFP support code
------------------------------------------------
-
-Date: 20 May 2004
-Author: Russell King
-
-This is the first release of the Linux Kernel VFP support code. It
-provides support for the exceptions bounced from VFP hardware found
-on ARM926EJ-S.
-
-This release has been validated against the SoftFloat-2b library by
-John R. Hauser using the TestFloat-2a test suite. Details of this
-library and test suite can be found at:
-
- http://www.jhauser.us/arithmetic/SoftFloat.html
-
-The operations which have been tested with this package are:
-
- - fdiv
- - fsub
- - fadd
- - fmul
- - fcmp
- - fcmpe
- - fcvtd
- - fcvts
- - fsito
- - ftosi
- - fsqrt
-
-All the above pass softfloat tests with the following exceptions:
-
-- fadd/fsub shows some differences in the handling of +0 / -0 results
- when input operands differ in signs.
-- the handling of underflow exceptions is slightly different. If a
- result underflows before rounding, but becomes a normalised number
- after rounding, we do not signal an underflow exception.
-
-Other operations which have been tested by basic assembly-only tests
-are:
-
- - fcpy
- - fabs
- - fneg
- - ftoui
- - ftosiz
- - ftouiz
-
-The combination operations have not been tested:
-
- - fmac
- - fnmac
- - fmsc
- - fnmsc
- - fnmul
diff --git a/Documentation/arm/arm.rst b/Documentation/arm/arm.rst
new file mode 100644
index 000000000000..2edc509df92a
--- /dev/null
+++ b/Documentation/arm/arm.rst
@@ -0,0 +1,214 @@
+=======================
+ARM Linux 2.6 and upper
+=======================
+
+ Please check <ftp://ftp.arm.linux.org.uk/pub/armlinux> for
+ updates.
+
+Compilation of kernel
+---------------------
+
+ In order to compile ARM Linux, you will need a compiler capable of
+ generating ARM ELF code with GNU extensions. GCC 3.3 is known to be
+ a good compiler. Fortunately, you needn't guess. The kernel will report
+ an error if your compiler is a recognized offender.
+
+ To build ARM Linux natively, you shouldn't have to alter the ARCH = line
+ in the top level Makefile. However, if you don't have the ARM Linux ELF
+ tools installed as default, then you should change the CROSS_COMPILE
+ line as detailed below.
+
+ If you wish to cross-compile, then alter the following lines in the top
+ level make file::
+
+ ARCH = <whatever>
+
+ with::
+
+ ARCH = arm
+
+ and::
+
+ CROSS_COMPILE=
+
+ to::
+
+ CROSS_COMPILE=<your-path-to-your-compiler-without-gcc>
+
+ eg.::
+
+ CROSS_COMPILE=arm-linux-
+
+ Do a 'make config', followed by 'make Image' to build the kernel
+ (arch/arm/boot/Image). A compressed image can be built by doing a
+ 'make zImage' instead of 'make Image'.
+
+
+Bug reports etc
+---------------
+
+ Please send patches to the patch system. For more information, see
+ http://www.arm.linux.org.uk/developer/patches/info.php Always include some
+ explanation as to what the patch does and why it is needed.
+
+ Bug reports should be sent to linux-arm-kernel@lists.arm.linux.org.uk,
+ or submitted through the web form at
+ http://www.arm.linux.org.uk/developer/
+
+ When sending bug reports, please ensure that they contain all relevant
+ information, eg. the kernel messages that were printed before/during
+ the problem, what you were doing, etc.
+
+
+Include files
+-------------
+
+ Several new include directories have been created under include/asm-arm,
+ which are there to reduce the clutter in the top-level directory. These
+ directories, and their purpose is listed below:
+
+ ============= ==========================================================
+ `arch-*` machine/platform specific header files
+ `hardware` driver-internal ARM specific data structures/definitions
+ `mach` descriptions of generic ARM to specific machine interfaces
+ `proc-*` processor dependent header files (currently only two
+ categories)
+ ============= ==========================================================
+
+
+Machine/Platform support
+------------------------
+
+ The ARM tree contains support for a lot of different machine types. To
+ continue supporting these differences, it has become necessary to split
+ machine-specific parts by directory. For this, the machine category is
+ used to select which directories and files get included (we will use
+ $(MACHINE) to refer to the category)
+
+ To this end, we now have arch/arm/mach-$(MACHINE) directories which are
+ designed to house the non-driver files for a particular machine (eg, PCI,
+ memory management, architecture definitions etc). For all future
+ machines, there should be a corresponding arch/arm/mach-$(MACHINE)/include/mach
+ directory.
+
+
+Modules
+-------
+
+ Although modularisation is supported (and required for the FP emulator),
+ each module on an ARM2/ARM250/ARM3 machine when is loaded will take
+ memory up to the next 32k boundary due to the size of the pages.
+ Therefore, is modularisation on these machines really worth it?
+
+ However, ARM6 and up machines allow modules to take multiples of 4k, and
+ as such Acorn RiscPCs and other architectures using these processors can
+ make good use of modularisation.
+
+
+ADFS Image files
+----------------
+
+ You can access image files on your ADFS partitions by mounting the ADFS
+ partition, and then using the loopback device driver. You must have
+ losetup installed.
+
+ Please note that the PCEmulator DOS partitions have a partition table at
+ the start, and as such, you will have to give '-o offset' to losetup.
+
+
+Request to developers
+---------------------
+
+ When writing device drivers which include a separate assembler file, please
+ include it in with the C file, and not the arch/arm/lib directory. This
+ allows the driver to be compiled as a loadable module without requiring
+ half the code to be compiled into the kernel image.
+
+ In general, try to avoid using assembler unless it is really necessary. It
+ makes drivers far less easy to port to other hardware.
+
+
+ST506 hard drives
+-----------------
+
+ The ST506 hard drive controllers seem to be working fine (if a little
+ slowly). At the moment they will only work off the controllers on an
+ A4x0's motherboard, but for it to work off a Podule just requires
+ someone with a podule to add the addresses for the IRQ mask and the
+ HDC base to the source.
+
+ As of 31/3/96 it works with two drives (you should get the ADFS
+ `*configure` harddrive set to 2). I've got an internal 20MB and a great
+ big external 5.25" FH 64MB drive (who could ever want more :-) ).
+
+ I've just got 240K/s off it (a dd with bs=128k); thats about half of what
+ RiscOS gets; but it's a heck of a lot better than the 50K/s I was getting
+ last week :-)
+
+ Known bug: Drive data errors can cause a hang; including cases where
+ the controller has fixed the error using ECC. (Possibly ONLY
+ in that case...hmm).
+
+
+1772 Floppy
+-----------
+ This also seems to work OK, but hasn't been stressed much lately. It
+ hasn't got any code for disc change detection in there at the moment which
+ could be a bit of a problem! Suggestions on the correct way to do this
+ are welcome.
+
+
+`CONFIG_MACH_` and `CONFIG_ARCH_`
+---------------------------------
+ A change was made in 2003 to the macro names for new machines.
+ Historically, `CONFIG_ARCH_` was used for the bonafide architecture,
+ e.g. SA1100, as well as implementations of the architecture,
+ e.g. Assabet. It was decided to change the implementation macros
+ to read `CONFIG_MACH_` for clarity. Moreover, a retroactive fixup has
+ not been made because it would complicate patching.
+
+ Previous registrations may be found online.
+
+ <http://www.arm.linux.org.uk/developer/machines/>
+
+Kernel entry (head.S)
+---------------------
+ The initial entry into the kernel is via head.S, which uses machine
+ independent code. The machine is selected by the value of 'r1' on
+ entry, which must be kept unique.
+
+ Due to the large number of machines which the ARM port of Linux provides
+ for, we have a method to manage this which ensures that we don't end up
+ duplicating large amounts of code.
+
+ We group machine (or platform) support code into machine classes. A
+ class typically based around one or more system on a chip devices, and
+ acts as a natural container around the actual implementations. These
+ classes are given directories - arch/arm/mach-<class> and
+ arch/arm/mach-<class> - which contain the source files to/include/mach
+ support the machine class. This directories also contain any machine
+ specific supporting code.
+
+ For example, the SA1100 class is based upon the SA1100 and SA1110 SoC
+ devices, and contains the code to support the way the on-board and off-
+ board devices are used, or the device is setup, and provides that
+ machine specific "personality."
+
+ For platforms that support device tree (DT), the machine selection is
+ controlled at runtime by passing the device tree blob to the kernel. At
+ compile-time, support for the machine type must be selected. This allows for
+ a single multiplatform kernel build to be used for several machine types.
+
+ For platforms that do not use device tree, this machine selection is
+ controlled by the machine type ID, which acts both as a run-time and a
+ compile-time code selection method. You can register a new machine via the
+ web site at:
+
+ <http://www.arm.linux.org.uk/developer/machines/>
+
+ Note: Please do not register a machine type for DT-only platforms. If your
+ platform is DT-only, you do not need a registered machine type.
+
+---
+
+Russell King (15/03/2004)
diff --git a/Documentation/arm/booting.rst b/Documentation/arm/booting.rst
new file mode 100644
index 000000000000..4babb6c6ae1e
--- /dev/null
+++ b/Documentation/arm/booting.rst
@@ -0,0 +1,237 @@
+=================
+Booting ARM Linux
+=================
+
+Author: Russell King
+
+Date : 18 May 2002
+
+The following documentation is relevant to 2.4.18-rmk6 and beyond.
+
+In order to boot ARM Linux, you require a boot loader, which is a small
+program that runs before the main kernel. The boot loader is expected
+to initialise various devices, and eventually call the Linux kernel,
+passing information to the kernel.
+
+Essentially, the boot loader should provide (as a minimum) the
+following:
+
+1. Setup and initialise the RAM.
+2. Initialise one serial port.
+3. Detect the machine type.
+4. Setup the kernel tagged list.
+5. Load initramfs.
+6. Call the kernel image.
+
+
+1. Setup and initialise RAM
+---------------------------
+
+Existing boot loaders:
+ MANDATORY
+New boot loaders:
+ MANDATORY
+
+The boot loader is expected to find and initialise all RAM that the
+kernel will use for volatile data storage in the system. It performs
+this in a machine dependent manner. (It may use internal algorithms
+to automatically locate and size all RAM, or it may use knowledge of
+the RAM in the machine, or any other method the boot loader designer
+sees fit.)
+
+
+2. Initialise one serial port
+-----------------------------
+
+Existing boot loaders:
+ OPTIONAL, RECOMMENDED
+New boot loaders:
+ OPTIONAL, RECOMMENDED
+
+The boot loader should initialise and enable one serial port on the
+target. This allows the kernel serial driver to automatically detect
+which serial port it should use for the kernel console (generally
+used for debugging purposes, or communication with the target.)
+
+As an alternative, the boot loader can pass the relevant 'console='
+option to the kernel via the tagged lists specifying the port, and
+serial format options as described in
+
+ Documentation/admin-guide/kernel-parameters.rst.
+
+
+3. Detect the machine type
+--------------------------
+
+Existing boot loaders:
+ OPTIONAL
+New boot loaders:
+ MANDATORY except for DT-only platforms
+
+The boot loader should detect the machine type its running on by some
+method. Whether this is a hard coded value or some algorithm that
+looks at the connected hardware is beyond the scope of this document.
+The boot loader must ultimately be able to provide a MACH_TYPE_xxx
+value to the kernel. (see linux/arch/arm/tools/mach-types). This
+should be passed to the kernel in register r1.
+
+For DT-only platforms, the machine type will be determined by device
+tree. set the machine type to all ones (~0). This is not strictly
+necessary, but assures that it will not match any existing types.
+
+4. Setup boot data
+------------------
+
+Existing boot loaders:
+ OPTIONAL, HIGHLY RECOMMENDED
+New boot loaders:
+ MANDATORY
+
+The boot loader must provide either a tagged list or a dtb image for
+passing configuration data to the kernel. The physical address of the
+boot data is passed to the kernel in register r2.
+
+4a. Setup the kernel tagged list
+--------------------------------
+
+The boot loader must create and initialise the kernel tagged list.
+A valid tagged list starts with ATAG_CORE and ends with ATAG_NONE.
+The ATAG_CORE tag may or may not be empty. An empty ATAG_CORE tag
+has the size field set to '2' (0x00000002). The ATAG_NONE must set
+the size field to zero.
+
+Any number of tags can be placed in the list. It is undefined
+whether a repeated tag appends to the information carried by the
+previous tag, or whether it replaces the information in its
+entirety; some tags behave as the former, others the latter.
+
+The boot loader must pass at a minimum the size and location of
+the system memory, and root filesystem location. Therefore, the
+minimum tagged list should look::
+
+ +-----------+
+ base -> | ATAG_CORE | |
+ +-----------+ |
+ | ATAG_MEM | | increasing address
+ +-----------+ |
+ | ATAG_NONE | |
+ +-----------+ v
+
+The tagged list should be stored in system RAM.
+
+The tagged list must be placed in a region of memory where neither
+the kernel decompressor nor initrd 'bootp' program will overwrite
+it. The recommended placement is in the first 16KiB of RAM.
+
+4b. Setup the device tree
+-------------------------
+
+The boot loader must load a device tree image (dtb) into system ram
+at a 64bit aligned address and initialize it with the boot data. The
+dtb format is documented in Documentation/devicetree/booting-without-of.txt.
+The kernel will look for the dtb magic value of 0xd00dfeed at the dtb
+physical address to determine if a dtb has been passed instead of a
+tagged list.
+
+The boot loader must pass at a minimum the size and location of the
+system memory, and the root filesystem location. The dtb must be
+placed in a region of memory where the kernel decompressor will not
+overwrite it, while remaining within the region which will be covered
+by the kernel's low-memory mapping.
+
+A safe location is just above the 128MiB boundary from start of RAM.
+
+5. Load initramfs.
+------------------
+
+Existing boot loaders:
+ OPTIONAL
+New boot loaders:
+ OPTIONAL
+
+If an initramfs is in use then, as with the dtb, it must be placed in
+a region of memory where the kernel decompressor will not overwrite it
+while also with the region which will be covered by the kernel's
+low-memory mapping.
+
+A safe location is just above the device tree blob which itself will
+be loaded just above the 128MiB boundary from the start of RAM as
+recommended above.
+
+6. Calling the kernel image
+---------------------------
+
+Existing boot loaders:
+ MANDATORY
+New boot loaders:
+ MANDATORY
+
+There are two options for calling the kernel zImage. If the zImage
+is stored in flash, and is linked correctly to be run from flash,
+then it is legal for the boot loader to call the zImage in flash
+directly.
+
+The zImage may also be placed in system RAM and called there. The
+kernel should be placed in the first 128MiB of RAM. It is recommended
+that it is loaded above 32MiB in order to avoid the need to relocate
+prior to decompression, which will make the boot process slightly
+faster.
+
+When booting a raw (non-zImage) kernel the constraints are tighter.
+In this case the kernel must be loaded at an offset into system equal
+to TEXT_OFFSET - PAGE_OFFSET.
+
+In any case, the following conditions must be met:
+
+- Quiesce all DMA capable devices so that memory does not get
+ corrupted by bogus network packets or disk data. This will save
+ you many hours of debug.
+
+- CPU register settings
+
+ - r0 = 0,
+ - r1 = machine type number discovered in (3) above.
+ - r2 = physical address of tagged list in system RAM, or
+ physical address of device tree block (dtb) in system RAM
+
+- CPU mode
+
+ All forms of interrupts must be disabled (IRQs and FIQs)
+
+ For CPUs which do not include the ARM virtualization extensions, the
+ CPU must be in SVC mode. (A special exception exists for Angel)
+
+ CPUs which include support for the virtualization extensions can be
+ entered in HYP mode in order to enable the kernel to make full use of
+ these extensions. This is the recommended boot method for such CPUs,
+ unless the virtualisations are already in use by a pre-installed
+ hypervisor.
+
+ If the kernel is not entered in HYP mode for any reason, it must be
+ entered in SVC mode.
+
+- Caches, MMUs
+
+ The MMU must be off.
+
+ Instruction cache may be on or off.
+
+ Data cache must be off.
+
+ If the kernel is entered in HYP mode, the above requirements apply to
+ the HYP mode configuration in addition to the ordinary PL1 (privileged
+ kernel modes) configuration. In addition, all traps into the
+ hypervisor must be disabled, and PL1 access must be granted for all
+ peripherals and CPU resources for which this is architecturally
+ possible. Except for entering in HYP mode, the system configuration
+ should be such that a kernel which does not include support for the
+ virtualization extensions can boot correctly without extra help.
+
+- The boot loader is expected to call the kernel image by jumping
+ directly to the first instruction of the kernel image.
+
+ On CPUs supporting the ARM instruction set, the entry must be
+ made in ARM state, even for a Thumb-2 kernel.
+
+ On CPUs supporting only the Thumb instruction set such as
+ Cortex-M class CPUs, the entry must be made in Thumb state.
diff --git a/Documentation/arm/cluster-pm-race-avoidance.rst b/Documentation/arm/cluster-pm-race-avoidance.rst
new file mode 100644
index 000000000000..aa58603d3f28
--- /dev/null
+++ b/Documentation/arm/cluster-pm-race-avoidance.rst
@@ -0,0 +1,533 @@
+=========================================================
+Cluster-wide Power-up/power-down race avoidance algorithm
+=========================================================
+
+This file documents the algorithm which is used to coordinate CPU and
+cluster setup and teardown operations and to manage hardware coherency
+controls safely.
+
+The section "Rationale" explains what the algorithm is for and why it is
+needed. "Basic model" explains general concepts using a simplified view
+of the system. The other sections explain the actual details of the
+algorithm in use.
+
+
+Rationale
+---------
+
+In a system containing multiple CPUs, it is desirable to have the
+ability to turn off individual CPUs when the system is idle, reducing
+power consumption and thermal dissipation.
+
+In a system containing multiple clusters of CPUs, it is also desirable
+to have the ability to turn off entire clusters.
+
+Turning entire clusters off and on is a risky business, because it
+involves performing potentially destructive operations affecting a group
+of independently running CPUs, while the OS continues to run. This
+means that we need some coordination in order to ensure that critical
+cluster-level operations are only performed when it is truly safe to do
+so.
+
+Simple locking may not be sufficient to solve this problem, because
+mechanisms like Linux spinlocks may rely on coherency mechanisms which
+are not immediately enabled when a cluster powers up. Since enabling or
+disabling those mechanisms may itself be a non-atomic operation (such as
+writing some hardware registers and invalidating large caches), other
+methods of coordination are required in order to guarantee safe
+power-down and power-up at the cluster level.
+
+The mechanism presented in this document describes a coherent memory
+based protocol for performing the needed coordination. It aims to be as
+lightweight as possible, while providing the required safety properties.
+
+
+Basic model
+-----------
+
+Each cluster and CPU is assigned a state, as follows:
+
+ - DOWN
+ - COMING_UP
+ - UP
+ - GOING_DOWN
+
+::
+
+ +---------> UP ----------+
+ | v
+
+ COMING_UP GOING_DOWN
+
+ ^ |
+ +--------- DOWN <--------+
+
+
+DOWN:
+ The CPU or cluster is not coherent, and is either powered off or
+ suspended, or is ready to be powered off or suspended.
+
+COMING_UP:
+ The CPU or cluster has committed to moving to the UP state.
+ It may be part way through the process of initialisation and
+ enabling coherency.
+
+UP:
+ The CPU or cluster is active and coherent at the hardware
+ level. A CPU in this state is not necessarily being used
+ actively by the kernel.
+
+GOING_DOWN:
+ The CPU or cluster has committed to moving to the DOWN
+ state. It may be part way through the process of teardown and
+ coherency exit.
+
+
+Each CPU has one of these states assigned to it at any point in time.
+The CPU states are described in the "CPU state" section, below.
+
+Each cluster is also assigned a state, but it is necessary to split the
+state value into two parts (the "cluster" state and "inbound" state) and
+to introduce additional states in order to avoid races between different
+CPUs in the cluster simultaneously modifying the state. The cluster-
+level states are described in the "Cluster state" section.
+
+To help distinguish the CPU states from cluster states in this
+discussion, the state names are given a `CPU_` prefix for the CPU states,
+and a `CLUSTER_` or `INBOUND_` prefix for the cluster states.
+
+
+CPU state
+---------
+
+In this algorithm, each individual core in a multi-core processor is
+referred to as a "CPU". CPUs are assumed to be single-threaded:
+therefore, a CPU can only be doing one thing at a single point in time.
+
+This means that CPUs fit the basic model closely.
+
+The algorithm defines the following states for each CPU in the system:
+
+ - CPU_DOWN
+ - CPU_COMING_UP
+ - CPU_UP
+ - CPU_GOING_DOWN
+
+::
+
+ cluster setup and
+ CPU setup complete policy decision
+ +-----------> CPU_UP ------------+
+ | v
+
+ CPU_COMING_UP CPU_GOING_DOWN
+
+ ^ |
+ +----------- CPU_DOWN <----------+
+ policy decision CPU teardown complete
+ or hardware event
+
+
+The definitions of the four states correspond closely to the states of
+the basic model.
+
+Transitions between states occur as follows.
+
+A trigger event (spontaneous) means that the CPU can transition to the
+next state as a result of making local progress only, with no
+requirement for any external event to happen.
+
+
+CPU_DOWN:
+ A CPU reaches the CPU_DOWN state when it is ready for
+ power-down. On reaching this state, the CPU will typically
+ power itself down or suspend itself, via a WFI instruction or a
+ firmware call.
+
+ Next state:
+ CPU_COMING_UP
+ Conditions:
+ none
+
+ Trigger events:
+ a) an explicit hardware power-up operation, resulting
+ from a policy decision on another CPU;
+
+ b) a hardware event, such as an interrupt.
+
+
+CPU_COMING_UP:
+ A CPU cannot start participating in hardware coherency until the
+ cluster is set up and coherent. If the cluster is not ready,
+ then the CPU will wait in the CPU_COMING_UP state until the
+ cluster has been set up.
+
+ Next state:
+ CPU_UP
+ Conditions:
+ The CPU's parent cluster must be in CLUSTER_UP.
+ Trigger events:
+ Transition of the parent cluster to CLUSTER_UP.
+
+ Refer to the "Cluster state" section for a description of the
+ CLUSTER_UP state.
+
+
+CPU_UP:
+ When a CPU reaches the CPU_UP state, it is safe for the CPU to
+ start participating in local coherency.
+
+ This is done by jumping to the kernel's CPU resume code.
+
+ Note that the definition of this state is slightly different
+ from the basic model definition: CPU_UP does not mean that the
+ CPU is coherent yet, but it does mean that it is safe to resume
+ the kernel. The kernel handles the rest of the resume
+ procedure, so the remaining steps are not visible as part of the
+ race avoidance algorithm.
+
+ The CPU remains in this state until an explicit policy decision
+ is made to shut down or suspend the CPU.
+
+ Next state:
+ CPU_GOING_DOWN
+ Conditions:
+ none
+ Trigger events:
+ explicit policy decision
+
+
+CPU_GOING_DOWN:
+ While in this state, the CPU exits coherency, including any
+ operations required to achieve this (such as cleaning data
+ caches).
+
+ Next state:
+ CPU_DOWN
+ Conditions:
+ local CPU teardown complete
+ Trigger events:
+ (spontaneous)
+
+
+Cluster state
+-------------
+
+A cluster is a group of connected CPUs with some common resources.
+Because a cluster contains multiple CPUs, it can be doing multiple
+things at the same time. This has some implications. In particular, a
+CPU can start up while another CPU is tearing the cluster down.
+
+In this discussion, the "outbound side" is the view of the cluster state
+as seen by a CPU tearing the cluster down. The "inbound side" is the
+view of the cluster state as seen by a CPU setting the CPU up.
+
+In order to enable safe coordination in such situations, it is important
+that a CPU which is setting up the cluster can advertise its state
+independently of the CPU which is tearing down the cluster. For this
+reason, the cluster state is split into two parts:
+
+ "cluster" state: The global state of the cluster; or the state
+ on the outbound side:
+
+ - CLUSTER_DOWN
+ - CLUSTER_UP
+ - CLUSTER_GOING_DOWN
+
+ "inbound" state: The state of the cluster on the inbound side.
+
+ - INBOUND_NOT_COMING_UP
+ - INBOUND_COMING_UP
+
+
+ The different pairings of these states results in six possible
+ states for the cluster as a whole::
+
+ CLUSTER_UP
+ +==========> INBOUND_NOT_COMING_UP -------------+
+ # |
+ |
+ CLUSTER_UP <----+ |
+ INBOUND_COMING_UP | v
+
+ ^ CLUSTER_GOING_DOWN CLUSTER_GOING_DOWN
+ # INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP
+
+ CLUSTER_DOWN | |
+ INBOUND_COMING_UP <----+ |
+ |
+ ^ |
+ +=========== CLUSTER_DOWN <------------+
+ INBOUND_NOT_COMING_UP
+
+ Transitions -----> can only be made by the outbound CPU, and
+ only involve changes to the "cluster" state.
+
+ Transitions ===##> can only be made by the inbound CPU, and only
+ involve changes to the "inbound" state, except where there is no
+ further transition possible on the outbound side (i.e., the
+ outbound CPU has put the cluster into the CLUSTER_DOWN state).
+
+ The race avoidance algorithm does not provide a way to determine
+ which exact CPUs within the cluster play these roles. This must
+ be decided in advance by some other means. Refer to the section
+ "Last man and first man selection" for more explanation.
+
+
+ CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the
+ cluster can actually be powered down.
+
+ The parallelism of the inbound and outbound CPUs is observed by
+ the existence of two different paths from CLUSTER_GOING_DOWN/
+ INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic
+ model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to
+ COMING_UP in the basic model). The second path avoids cluster
+ teardown completely.
+
+ CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic
+ model. The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP
+ is trivial and merely resets the state machine ready for the
+ next cycle.
+
+ Details of the allowable transitions follow.
+
+ The next state in each case is notated
+
+ <cluster state>/<inbound state> (<transitioner>)
+
+ where the <transitioner> is the side on which the transition
+ can occur; either the inbound or the outbound side.
+
+
+CLUSTER_DOWN/INBOUND_NOT_COMING_UP:
+ Next state:
+ CLUSTER_DOWN/INBOUND_COMING_UP (inbound)
+ Conditions:
+ none
+
+ Trigger events:
+ a) an explicit hardware power-up operation, resulting
+ from a policy decision on another CPU;
+
+ b) a hardware event, such as an interrupt.
+
+
+CLUSTER_DOWN/INBOUND_COMING_UP:
+
+ In this state, an inbound CPU sets up the cluster, including
+ enabling of hardware coherency at the cluster level and any
+ other operations (such as cache invalidation) which are required
+ in order to achieve this.
+
+ The purpose of this state is to do sufficient cluster-level
+ setup to enable other CPUs in the cluster to enter coherency
+ safely.
+
+ Next state:
+ CLUSTER_UP/INBOUND_COMING_UP (inbound)
+ Conditions:
+ cluster-level setup and hardware coherency complete
+ Trigger events:
+ (spontaneous)
+
+
+CLUSTER_UP/INBOUND_COMING_UP:
+
+ Cluster-level setup is complete and hardware coherency is
+ enabled for the cluster. Other CPUs in the cluster can safely
+ enter coherency.
+
+ This is a transient state, leading immediately to
+ CLUSTER_UP/INBOUND_NOT_COMING_UP. All other CPUs on the cluster
+ should consider treat these two states as equivalent.
+
+ Next state:
+ CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound)
+ Conditions:
+ none
+ Trigger events:
+ (spontaneous)
+
+
+CLUSTER_UP/INBOUND_NOT_COMING_UP:
+
+ Cluster-level setup is complete and hardware coherency is
+ enabled for the cluster. Other CPUs in the cluster can safely
+ enter coherency.
+
+ The cluster will remain in this state until a policy decision is
+ made to power the cluster down.
+
+ Next state:
+ CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound)
+ Conditions:
+ none
+ Trigger events:
+ policy decision to power down the cluster
+
+
+CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP:
+
+ An outbound CPU is tearing the cluster down. The selected CPU
+ must wait in this state until all CPUs in the cluster are in the
+ CPU_DOWN state.
+
+ When all CPUs are in the CPU_DOWN state, the cluster can be torn
+ down, for example by cleaning data caches and exiting
+ cluster-level coherency.
+
+ To avoid wasteful unnecessary teardown operations, the outbound
+ should check the inbound cluster state for asynchronous
+ transitions to INBOUND_COMING_UP. Alternatively, individual
+ CPUs can be checked for entry into CPU_COMING_UP or CPU_UP.
+
+
+ Next states:
+
+ CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound)
+ Conditions:
+ cluster torn down and ready to power off
+ Trigger events:
+ (spontaneous)
+
+ CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound)
+ Conditions:
+ none
+
+ Trigger events:
+ a) an explicit hardware power-up operation,
+ resulting from a policy decision on another
+ CPU;
+
+ b) a hardware event, such as an interrupt.
+
+
+CLUSTER_GOING_DOWN/INBOUND_COMING_UP:
+
+ The cluster is (or was) being torn down, but another CPU has
+ come online in the meantime and is trying to set up the cluster
+ again.
+
+ If the outbound CPU observes this state, it has two choices:
+
+ a) back out of teardown, restoring the cluster to the
+ CLUSTER_UP state;
+
+ b) finish tearing the cluster down and put the cluster
+ in the CLUSTER_DOWN state; the inbound CPU will
+ set up the cluster again from there.
+
+ Choice (a) permits the removal of some latency by avoiding
+ unnecessary teardown and setup operations in situations where
+ the cluster is not really going to be powered down.
+
+
+ Next states:
+
+ CLUSTER_UP/INBOUND_COMING_UP (outbound)
+ Conditions:
+ cluster-level setup and hardware
+ coherency complete
+
+ Trigger events:
+ (spontaneous)
+
+ CLUSTER_DOWN/INBOUND_COMING_UP (outbound)
+ Conditions:
+ cluster torn down and ready to power off
+
+ Trigger events:
+ (spontaneous)
+
+
+Last man and First man selection
+--------------------------------
+
+The CPU which performs cluster tear-down operations on the outbound side
+is commonly referred to as the "last man".
+
+The CPU which performs cluster setup on the inbound side is commonly
+referred to as the "first man".
+
+The race avoidance algorithm documented above does not provide a
+mechanism to choose which CPUs should play these roles.
+
+
+Last man:
+
+When shutting down the cluster, all the CPUs involved are initially
+executing Linux and hence coherent. Therefore, ordinary spinlocks can
+be used to select a last man safely, before the CPUs become
+non-coherent.
+
+
+First man:
+
+Because CPUs may power up asynchronously in response to external wake-up
+events, a dynamic mechanism is needed to make sure that only one CPU
+attempts to play the first man role and do the cluster-level
+initialisation: any other CPUs must wait for this to complete before
+proceeding.
+
+Cluster-level initialisation may involve actions such as configuring
+coherency controls in the bus fabric.
+
+The current implementation in mcpm_head.S uses a separate mutual exclusion
+mechanism to do this arbitration. This mechanism is documented in
+detail in vlocks.txt.
+
+
+Features and Limitations
+------------------------
+
+Implementation:
+
+ The current ARM-based implementation is split between
+ arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and
+ arch/arm/common/mcpm_entry.c (everything else):
+
+ __mcpm_cpu_going_down() signals the transition of a CPU to the
+ CPU_GOING_DOWN state.
+
+ __mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN
+ state.
+
+ A CPU transitions to CPU_COMING_UP and then to CPU_UP via the
+ low-level power-up code in mcpm_head.S. This could
+ involve CPU-specific setup code, but in the current
+ implementation it does not.
+
+ __mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical()
+ handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN
+ and from there to CLUSTER_DOWN or back to CLUSTER_UP (in
+ the case of an aborted cluster power-down).
+
+ These functions are more complex than the __mcpm_cpu_*()
+ functions due to the extra inter-CPU coordination which
+ is needed for safe transitions at the cluster level.
+
+ A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via
+ the low-level power-up code in mcpm_head.S. This
+ typically involves platform-specific setup code,
+ provided by the platform-specific power_up_setup
+ function registered via mcpm_sync_init.
+
+Deep topologies:
+
+ As currently described and implemented, the algorithm does not
+ support CPU topologies involving more than two levels (i.e.,
+ clusters of clusters are not supported). The algorithm could be
+ extended by replicating the cluster-level states for the
+ additional topological levels, and modifying the transition
+ rules for the intermediate (non-outermost) cluster levels.
+
+
+Colophon
+--------
+
+Originally created and documented by Dave Martin for Linaro Limited, in
+collaboration with Nicolas Pitre and Achin Gupta.
+
+Copyright (C) 2012-2013 Linaro Limited
+Distributed under the terms of Version 2 of the GNU General Public
+License, as defined in linux/COPYING.
diff --git a/Documentation/arm/cluster-pm-race-avoidance.txt b/Documentation/arm/cluster-pm-race-avoidance.txt
deleted file mode 100644
index 750b6fc24af9..000000000000
--- a/Documentation/arm/cluster-pm-race-avoidance.txt
+++ /dev/null
@@ -1,498 +0,0 @@
-Cluster-wide Power-up/power-down race avoidance algorithm
-=========================================================
-
-This file documents the algorithm which is used to coordinate CPU and
-cluster setup and teardown operations and to manage hardware coherency
-controls safely.
-
-The section "Rationale" explains what the algorithm is for and why it is
-needed. "Basic model" explains general concepts using a simplified view
-of the system. The other sections explain the actual details of the
-algorithm in use.
-
-
-Rationale
----------
-
-In a system containing multiple CPUs, it is desirable to have the
-ability to turn off individual CPUs when the system is idle, reducing
-power consumption and thermal dissipation.
-
-In a system containing multiple clusters of CPUs, it is also desirable
-to have the ability to turn off entire clusters.
-
-Turning entire clusters off and on is a risky business, because it
-involves performing potentially destructive operations affecting a group
-of independently running CPUs, while the OS continues to run. This
-means that we need some coordination in order to ensure that critical
-cluster-level operations are only performed when it is truly safe to do
-so.
-
-Simple locking may not be sufficient to solve this problem, because
-mechanisms like Linux spinlocks may rely on coherency mechanisms which
-are not immediately enabled when a cluster powers up. Since enabling or
-disabling those mechanisms may itself be a non-atomic operation (such as
-writing some hardware registers and invalidating large caches), other
-methods of coordination are required in order to guarantee safe
-power-down and power-up at the cluster level.
-
-The mechanism presented in this document describes a coherent memory
-based protocol for performing the needed coordination. It aims to be as
-lightweight as possible, while providing the required safety properties.
-
-
-Basic model
------------
-
-Each cluster and CPU is assigned a state, as follows:
-
- DOWN
- COMING_UP
- UP
- GOING_DOWN
-
- +---------> UP ----------+
- | v
-
- COMING_UP GOING_DOWN
-
- ^ |
- +--------- DOWN <--------+
-
-
-DOWN: The CPU or cluster is not coherent, and is either powered off or
- suspended, or is ready to be powered off or suspended.
-
-COMING_UP: The CPU or cluster has committed to moving to the UP state.
- It may be part way through the process of initialisation and
- enabling coherency.
-
-UP: The CPU or cluster is active and coherent at the hardware
- level. A CPU in this state is not necessarily being used
- actively by the kernel.
-
-GOING_DOWN: The CPU or cluster has committed to moving to the DOWN
- state. It may be part way through the process of teardown and
- coherency exit.
-
-
-Each CPU has one of these states assigned to it at any point in time.
-The CPU states are described in the "CPU state" section, below.
-
-Each cluster is also assigned a state, but it is necessary to split the
-state value into two parts (the "cluster" state and "inbound" state) and
-to introduce additional states in order to avoid races between different
-CPUs in the cluster simultaneously modifying the state. The cluster-
-level states are described in the "Cluster state" section.
-
-To help distinguish the CPU states from cluster states in this
-discussion, the state names are given a CPU_ prefix for the CPU states,
-and a CLUSTER_ or INBOUND_ prefix for the cluster states.
-
-
-CPU state
----------
-
-In this algorithm, each individual core in a multi-core processor is
-referred to as a "CPU". CPUs are assumed to be single-threaded:
-therefore, a CPU can only be doing one thing at a single point in time.
-
-This means that CPUs fit the basic model closely.
-
-The algorithm defines the following states for each CPU in the system:
-
- CPU_DOWN
- CPU_COMING_UP
- CPU_UP
- CPU_GOING_DOWN
-
- cluster setup and
- CPU setup complete policy decision
- +-----------> CPU_UP ------------+
- | v
-
- CPU_COMING_UP CPU_GOING_DOWN
-
- ^ |
- +----------- CPU_DOWN <----------+
- policy decision CPU teardown complete
- or hardware event
-
-
-The definitions of the four states correspond closely to the states of
-the basic model.
-
-Transitions between states occur as follows.
-
-A trigger event (spontaneous) means that the CPU can transition to the
-next state as a result of making local progress only, with no
-requirement for any external event to happen.
-
-
-CPU_DOWN:
-
- A CPU reaches the CPU_DOWN state when it is ready for
- power-down. On reaching this state, the CPU will typically
- power itself down or suspend itself, via a WFI instruction or a
- firmware call.
-
- Next state: CPU_COMING_UP
- Conditions: none
-
- Trigger events:
-
- a) an explicit hardware power-up operation, resulting
- from a policy decision on another CPU;
-
- b) a hardware event, such as an interrupt.
-
-
-CPU_COMING_UP:
-
- A CPU cannot start participating in hardware coherency until the
- cluster is set up and coherent. If the cluster is not ready,
- then the CPU will wait in the CPU_COMING_UP state until the
- cluster has been set up.
-
- Next state: CPU_UP
- Conditions: The CPU's parent cluster must be in CLUSTER_UP.
- Trigger events: Transition of the parent cluster to CLUSTER_UP.
-
- Refer to the "Cluster state" section for a description of the
- CLUSTER_UP state.
-
-
-CPU_UP:
- When a CPU reaches the CPU_UP state, it is safe for the CPU to
- start participating in local coherency.
-
- This is done by jumping to the kernel's CPU resume code.
-
- Note that the definition of this state is slightly different
- from the basic model definition: CPU_UP does not mean that the
- CPU is coherent yet, but it does mean that it is safe to resume
- the kernel. The kernel handles the rest of the resume
- procedure, so the remaining steps are not visible as part of the
- race avoidance algorithm.
-
- The CPU remains in this state until an explicit policy decision
- is made to shut down or suspend the CPU.
-
- Next state: CPU_GOING_DOWN
- Conditions: none
- Trigger events: explicit policy decision
-
-
-CPU_GOING_DOWN:
-
- While in this state, the CPU exits coherency, including any
- operations required to achieve this (such as cleaning data
- caches).
-
- Next state: CPU_DOWN
- Conditions: local CPU teardown complete
- Trigger events: (spontaneous)
-
-
-Cluster state
--------------
-
-A cluster is a group of connected CPUs with some common resources.
-Because a cluster contains multiple CPUs, it can be doing multiple
-things at the same time. This has some implications. In particular, a
-CPU can start up while another CPU is tearing the cluster down.
-
-In this discussion, the "outbound side" is the view of the cluster state
-as seen by a CPU tearing the cluster down. The "inbound side" is the
-view of the cluster state as seen by a CPU setting the CPU up.
-
-In order to enable safe coordination in such situations, it is important
-that a CPU which is setting up the cluster can advertise its state
-independently of the CPU which is tearing down the cluster. For this
-reason, the cluster state is split into two parts:
-
- "cluster" state: The global state of the cluster; or the state
- on the outbound side:
-
- CLUSTER_DOWN
- CLUSTER_UP
- CLUSTER_GOING_DOWN
-
- "inbound" state: The state of the cluster on the inbound side.
-
- INBOUND_NOT_COMING_UP
- INBOUND_COMING_UP
-
-
- The different pairings of these states results in six possible
- states for the cluster as a whole:
-
- CLUSTER_UP
- +==========> INBOUND_NOT_COMING_UP -------------+
- # |
- |
- CLUSTER_UP <----+ |
- INBOUND_COMING_UP | v
-
- ^ CLUSTER_GOING_DOWN CLUSTER_GOING_DOWN
- # INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP
-
- CLUSTER_DOWN | |
- INBOUND_COMING_UP <----+ |
- |
- ^ |
- +=========== CLUSTER_DOWN <------------+
- INBOUND_NOT_COMING_UP
-
- Transitions -----> can only be made by the outbound CPU, and
- only involve changes to the "cluster" state.
-
- Transitions ===##> can only be made by the inbound CPU, and only
- involve changes to the "inbound" state, except where there is no
- further transition possible on the outbound side (i.e., the
- outbound CPU has put the cluster into the CLUSTER_DOWN state).
-
- The race avoidance algorithm does not provide a way to determine
- which exact CPUs within the cluster play these roles. This must
- be decided in advance by some other means. Refer to the section
- "Last man and first man selection" for more explanation.
-
-
- CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the
- cluster can actually be powered down.
-
- The parallelism of the inbound and outbound CPUs is observed by
- the existence of two different paths from CLUSTER_GOING_DOWN/
- INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic
- model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to
- COMING_UP in the basic model). The second path avoids cluster
- teardown completely.
-
- CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic
- model. The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP
- is trivial and merely resets the state machine ready for the
- next cycle.
-
- Details of the allowable transitions follow.
-
- The next state in each case is notated
-
- <cluster state>/<inbound state> (<transitioner>)
-
- where the <transitioner> is the side on which the transition
- can occur; either the inbound or the outbound side.
-
-
-CLUSTER_DOWN/INBOUND_NOT_COMING_UP:
-
- Next state: CLUSTER_DOWN/INBOUND_COMING_UP (inbound)
- Conditions: none
- Trigger events:
-
- a) an explicit hardware power-up operation, resulting
- from a policy decision on another CPU;
-
- b) a hardware event, such as an interrupt.
-
-
-CLUSTER_DOWN/INBOUND_COMING_UP:
-
- In this state, an inbound CPU sets up the cluster, including
- enabling of hardware coherency at the cluster level and any
- other operations (such as cache invalidation) which are required
- in order to achieve this.
-
- The purpose of this state is to do sufficient cluster-level
- setup to enable other CPUs in the cluster to enter coherency
- safely.
-
- Next state: CLUSTER_UP/INBOUND_COMING_UP (inbound)
- Conditions: cluster-level setup and hardware coherency complete
- Trigger events: (spontaneous)
-
-
-CLUSTER_UP/INBOUND_COMING_UP:
-
- Cluster-level setup is complete and hardware coherency is
- enabled for the cluster. Other CPUs in the cluster can safely
- enter coherency.
-
- This is a transient state, leading immediately to
- CLUSTER_UP/INBOUND_NOT_COMING_UP. All other CPUs on the cluster
- should consider treat these two states as equivalent.
-
- Next state: CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound)
- Conditions: none
- Trigger events: (spontaneous)
-
-
-CLUSTER_UP/INBOUND_NOT_COMING_UP:
-
- Cluster-level setup is complete and hardware coherency is
- enabled for the cluster. Other CPUs in the cluster can safely
- enter coherency.
-
- The cluster will remain in this state until a policy decision is
- made to power the cluster down.
-
- Next state: CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound)
- Conditions: none
- Trigger events: policy decision to power down the cluster
-
-
-CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP:
-
- An outbound CPU is tearing the cluster down. The selected CPU
- must wait in this state until all CPUs in the cluster are in the
- CPU_DOWN state.
-
- When all CPUs are in the CPU_DOWN state, the cluster can be torn
- down, for example by cleaning data caches and exiting
- cluster-level coherency.
-
- To avoid wasteful unnecessary teardown operations, the outbound
- should check the inbound cluster state for asynchronous
- transitions to INBOUND_COMING_UP. Alternatively, individual
- CPUs can be checked for entry into CPU_COMING_UP or CPU_UP.
-
-
- Next states:
-
- CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound)
- Conditions: cluster torn down and ready to power off
- Trigger events: (spontaneous)
-
- CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound)
- Conditions: none
- Trigger events:
-
- a) an explicit hardware power-up operation,
- resulting from a policy decision on another
- CPU;
-
- b) a hardware event, such as an interrupt.
-
-
-CLUSTER_GOING_DOWN/INBOUND_COMING_UP:
-
- The cluster is (or was) being torn down, but another CPU has
- come online in the meantime and is trying to set up the cluster
- again.
-
- If the outbound CPU observes this state, it has two choices:
-
- a) back out of teardown, restoring the cluster to the
- CLUSTER_UP state;
-
- b) finish tearing the cluster down and put the cluster
- in the CLUSTER_DOWN state; the inbound CPU will
- set up the cluster again from there.
-
- Choice (a) permits the removal of some latency by avoiding
- unnecessary teardown and setup operations in situations where
- the cluster is not really going to be powered down.
-
-
- Next states:
-
- CLUSTER_UP/INBOUND_COMING_UP (outbound)
- Conditions: cluster-level setup and hardware
- coherency complete
- Trigger events: (spontaneous)
-
- CLUSTER_DOWN/INBOUND_COMING_UP (outbound)
- Conditions: cluster torn down and ready to power off
- Trigger events: (spontaneous)
-
-
-Last man and First man selection
---------------------------------
-
-The CPU which performs cluster tear-down operations on the outbound side
-is commonly referred to as the "last man".
-
-The CPU which performs cluster setup on the inbound side is commonly
-referred to as the "first man".
-
-The race avoidance algorithm documented above does not provide a
-mechanism to choose which CPUs should play these roles.
-
-
-Last man:
-
-When shutting down the cluster, all the CPUs involved are initially
-executing Linux and hence coherent. Therefore, ordinary spinlocks can
-be used to select a last man safely, before the CPUs become
-non-coherent.
-
-
-First man:
-
-Because CPUs may power up asynchronously in response to external wake-up
-events, a dynamic mechanism is needed to make sure that only one CPU
-attempts to play the first man role and do the cluster-level
-initialisation: any other CPUs must wait for this to complete before
-proceeding.
-
-Cluster-level initialisation may involve actions such as configuring
-coherency controls in the bus fabric.
-
-The current implementation in mcpm_head.S uses a separate mutual exclusion
-mechanism to do this arbitration. This mechanism is documented in
-detail in vlocks.txt.
-
-
-Features and Limitations
-------------------------
-
-Implementation:
-
- The current ARM-based implementation is split between
- arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and
- arch/arm/common/mcpm_entry.c (everything else):
-
- __mcpm_cpu_going_down() signals the transition of a CPU to the
- CPU_GOING_DOWN state.
-
- __mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN
- state.
-
- A CPU transitions to CPU_COMING_UP and then to CPU_UP via the
- low-level power-up code in mcpm_head.S. This could
- involve CPU-specific setup code, but in the current
- implementation it does not.
-
- __mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical()
- handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN
- and from there to CLUSTER_DOWN or back to CLUSTER_UP (in
- the case of an aborted cluster power-down).
-
- These functions are more complex than the __mcpm_cpu_*()
- functions due to the extra inter-CPU coordination which
- is needed for safe transitions at the cluster level.
-
- A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via
- the low-level power-up code in mcpm_head.S. This
- typically involves platform-specific setup code,
- provided by the platform-specific power_up_setup
- function registered via mcpm_sync_init.
-
-Deep topologies:
-
- As currently described and implemented, the algorithm does not
- support CPU topologies involving more than two levels (i.e.,
- clusters of clusters are not supported). The algorithm could be
- extended by replicating the cluster-level states for the
- additional topological levels, and modifying the transition
- rules for the intermediate (non-outermost) cluster levels.
-
-
-Colophon
---------
-
-Originally created and documented by Dave Martin for Linaro Limited, in
-collaboration with Nicolas Pitre and Achin Gupta.
-
-Copyright (C) 2012-2013 Linaro Limited
-Distributed under the terms of Version 2 of the GNU General Public
-License, as defined in linux/COPYING.
diff --git a/Documentation/arm/firmware.rst b/Documentation/arm/firmware.rst
new file mode 100644
index 000000000000..efd844baec1d
--- /dev/null
+++ b/Documentation/arm/firmware.rst
@@ -0,0 +1,72 @@
+==========================================================================
+Interface for registering and calling firmware-specific operations for ARM
+==========================================================================
+
+Written by Tomasz Figa <t.figa@samsung.com>
+
+Some boards are running with secure firmware running in TrustZone secure
+world, which changes the way some things have to be initialized. This makes
+a need to provide an interface for such platforms to specify available firmware
+operations and call them when needed.
+
+Firmware operations can be specified by filling in a struct firmware_ops
+with appropriate callbacks and then registering it with register_firmware_ops()
+function::
+
+ void register_firmware_ops(const struct firmware_ops *ops)
+
+The ops pointer must be non-NULL. More information about struct firmware_ops
+and its members can be found in arch/arm/include/asm/firmware.h header.
+
+There is a default, empty set of operations provided, so there is no need to
+set anything if platform does not require firmware operations.
+
+To call a firmware operation, a helper macro is provided::
+
+ #define call_firmware_op(op, ...) \
+ ((firmware_ops->op) ? firmware_ops->op(__VA_ARGS__) : (-ENOSYS))
+
+the macro checks if the operation is provided and calls it or otherwise returns
+-ENOSYS to signal that given operation is not available (for example, to allow
+fallback to legacy operation).
+
+Example of registering firmware operations::
+
+ /* board file */
+
+ static int platformX_do_idle(void)
+ {
+ /* tell platformX firmware to enter idle */
+ return 0;
+ }
+
+ static int platformX_cpu_boot(int i)
+ {
+ /* tell platformX firmware to boot CPU i */
+ return 0;
+ }
+
+ static const struct firmware_ops platformX_firmware_ops = {
+ .do_idle = exynos_do_idle,
+ .cpu_boot = exynos_cpu_boot,
+ /* other operations not available on platformX */
+ };
+
+ /* init_early callback of machine descriptor */
+ static void __init board_init_early(void)
+ {
+ register_firmware_ops(&platformX_firmware_ops);
+ }
+
+Example of using a firmware operation::
+
+ /* some platform code, e.g. SMP initialization */
+
+ __raw_writel(__pa_symbol(exynos4_secondary_startup),
+ CPU1_BOOT_REG);
+
+ /* Call Exynos specific smc call */
+ if (call_firmware_op(cpu_boot, cpu) == -ENOSYS)
+ cpu_boot_legacy(...); /* Try legacy way */
+
+ gic_raise_softirq(cpumask_of(cpu), 1);
diff --git a/Documentation/arm/firmware.txt b/Documentation/arm/firmware.txt
deleted file mode 100644
index 7f175dbb427e..000000000000
--- a/Documentation/arm/firmware.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-Interface for registering and calling firmware-specific operations for ARM.
-----
-Written by Tomasz Figa <t.figa@samsung.com>
-
-Some boards are running with secure firmware running in TrustZone secure
-world, which changes the way some things have to be initialized. This makes
-a need to provide an interface for such platforms to specify available firmware
-operations and call them when needed.
-
-Firmware operations can be specified by filling in a struct firmware_ops
-with appropriate callbacks and then registering it with register_firmware_ops()
-function.
-
- void register_firmware_ops(const struct firmware_ops *ops)
-
-The ops pointer must be non-NULL. More information about struct firmware_ops
-and its members can be found in arch/arm/include/asm/firmware.h header.
-
-There is a default, empty set of operations provided, so there is no need to
-set anything if platform does not require firmware operations.
-
-To call a firmware operation, a helper macro is provided
-
- #define call_firmware_op(op, ...) \
- ((firmware_ops->op) ? firmware_ops->op(__VA_ARGS__) : (-ENOSYS))
-
-the macro checks if the operation is provided and calls it or otherwise returns
--ENOSYS to signal that given operation is not available (for example, to allow
-fallback to legacy operation).
-
-Example of registering firmware operations:
-
- /* board file */
-
- static int platformX_do_idle(void)
- {
- /* tell platformX firmware to enter idle */
- return 0;
- }
-
- static int platformX_cpu_boot(int i)
- {
- /* tell platformX firmware to boot CPU i */
- return 0;
- }
-
- static const struct firmware_ops platformX_firmware_ops = {
- .do_idle = exynos_do_idle,
- .cpu_boot = exynos_cpu_boot,
- /* other operations not available on platformX */
- };
-
- /* init_early callback of machine descriptor */
- static void __init board_init_early(void)
- {
- register_firmware_ops(&platformX_firmware_ops);
- }
-
-Example of using a firmware operation:
-
- /* some platform code, e.g. SMP initialization */
-
- __raw_writel(__pa_symbol(exynos4_secondary_startup),
- CPU1_BOOT_REG);
-
- /* Call Exynos specific smc call */
- if (call_firmware_op(cpu_boot, cpu) == -ENOSYS)
- cpu_boot_legacy(...); /* Try legacy way */
-
- gic_raise_softirq(cpumask_of(cpu), 1);
diff --git a/Documentation/arm/index.rst b/Documentation/arm/index.rst
new file mode 100644
index 000000000000..5fc072dd0c5e
--- /dev/null
+++ b/Documentation/arm/index.rst
@@ -0,0 +1,80 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================
+ARM Architecture
+================
+
+.. toctree::
+ :maxdepth: 1
+
+ arm
+ booting
+ cluster-pm-race-avoidance
+ firmware
+ interrupts
+ kernel_mode_neon
+ kernel_user_helpers
+ memory
+ mem_alignment
+ tcm
+ setup
+ swp_emulation
+ uefi
+ vlocks
+ porting
+
+SoC-specific documents
+======================
+
+.. toctree::
+ :maxdepth: 1
+
+ ixp4xx
+
+ marvel
+ microchip
+
+ netwinder
+ nwfpe/index
+
+ keystone/overview
+ keystone/knav-qmss
+
+ omap/index
+
+ pxa/mfp
+
+
+ sa1100/index
+
+ stm32/stm32f746-overview
+ stm32/overview
+ stm32/stm32h743-overview
+ stm32/stm32f769-overview
+ stm32/stm32f429-overview
+ stm32/stm32mp157-overview
+
+ sunxi
+
+ samsung/index
+ samsung-s3c24xx/index
+
+ sunxi/clocks
+
+ spear/overview
+
+ sti/stih416-overview
+ sti/stih407-overview
+ sti/stih418-overview
+ sti/overview
+ sti/stih415-overview
+
+ vfp/release-notes
+
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/arm/interrupts.rst b/Documentation/arm/interrupts.rst
new file mode 100644
index 000000000000..2ae70e0e9732
--- /dev/null
+++ b/Documentation/arm/interrupts.rst
@@ -0,0 +1,169 @@
+==========
+Interrupts
+==========
+
+2.5.2-rmk5:
+ This is the first kernel that contains a major shake up of some of the
+ major architecture-specific subsystems.
+
+Firstly, it contains some pretty major changes to the way we handle the
+MMU TLB. Each MMU TLB variant is now handled completely separately -
+we have TLB v3, TLB v4 (without write buffer), TLB v4 (with write buffer),
+and finally TLB v4 (with write buffer, with I TLB invalidate entry).
+There is more assembly code inside each of these functions, mainly to
+allow more flexible TLB handling for the future.
+
+Secondly, the IRQ subsystem.
+
+The 2.5 kernels will be having major changes to the way IRQs are handled.
+Unfortunately, this means that machine types that touch the irq_desc[]
+array (basically all machine types) will break, and this means every
+machine type that we currently have.
+
+Lets take an example. On the Assabet with Neponset, we have::
+
+ GPIO25 IRR:2
+ SA1100 ------------> Neponset -----------> SA1111
+ IIR:1
+ -----------> USAR
+ IIR:0
+ -----------> SMC9196
+
+The way stuff currently works, all SA1111 interrupts are mutually
+exclusive of each other - if you're processing one interrupt from the
+SA1111 and another comes in, you have to wait for that interrupt to
+finish processing before you can service the new interrupt. Eg, an
+IDE PIO-based interrupt on the SA1111 excludes all other SA1111 and
+SMC9196 interrupts until it has finished transferring its multi-sector
+data, which can be a long time. Note also that since we loop in the
+SA1111 IRQ handler, SA1111 IRQs can hold off SMC9196 IRQs indefinitely.
+
+
+The new approach brings several new ideas...
+
+We introduce the concept of a "parent" and a "child". For example,
+to the Neponset handler, the "parent" is GPIO25, and the "children"d
+are SA1111, SMC9196 and USAR.
+
+We also bring the idea of an IRQ "chip" (mainly to reduce the size of
+the irqdesc array). This doesn't have to be a real "IC"; indeed the
+SA11x0 IRQs are handled by two separate "chip" structures, one for
+GPIO0-10, and another for all the rest. It is just a container for
+the various operations (maybe this'll change to a better name).
+This structure has the following operations::
+
+ struct irqchip {
+ /*
+ * Acknowledge the IRQ.
+ * If this is a level-based IRQ, then it is expected to mask the IRQ
+ * as well.
+ */
+ void (*ack)(unsigned int irq);
+ /*
+ * Mask the IRQ in hardware.
+ */
+ void (*mask)(unsigned int irq);
+ /*
+ * Unmask the IRQ in hardware.
+ */
+ void (*unmask)(unsigned int irq);
+ /*
+ * Re-run the IRQ
+ */
+ void (*rerun)(unsigned int irq);
+ /*
+ * Set the type of the IRQ.
+ */
+ int (*type)(unsigned int irq, unsigned int, type);
+ };
+
+ack
+ - required. May be the same function as mask for IRQs
+ handled by do_level_IRQ.
+mask
+ - required.
+unmask
+ - required.
+rerun
+ - optional. Not required if you're using do_level_IRQ for all
+ IRQs that use this 'irqchip'. Generally expected to re-trigger
+ the hardware IRQ if possible. If not, may call the handler
+ directly.
+type
+ - optional. If you don't support changing the type of an IRQ,
+ it should be null so people can detect if they are unable to
+ set the IRQ type.
+
+For each IRQ, we keep the following information:
+
+ - "disable" depth (number of disable_irq()s without enable_irq()s)
+ - flags indicating what we can do with this IRQ (valid, probe,
+ noautounmask) as before
+ - status of the IRQ (probing, enable, etc)
+ - chip
+ - per-IRQ handler
+ - irqaction structure list
+
+The handler can be one of the 3 standard handlers - "level", "edge" and
+"simple", or your own specific handler if you need to do something special.
+
+The "level" handler is what we currently have - its pretty simple.
+"edge" knows about the brokenness of such IRQ implementations - that you
+need to leave the hardware IRQ enabled while processing it, and queueing
+further IRQ events should the IRQ happen again while processing. The
+"simple" handler is very basic, and does not perform any hardware
+manipulation, nor state tracking. This is useful for things like the
+SMC9196 and USAR above.
+
+So, what's changed?
+===================
+
+1. Machine implementations must not write to the irqdesc array.
+
+2. New functions to manipulate the irqdesc array. The first 4 are expected
+ to be useful only to machine specific code. The last is recommended to
+ only be used by machine specific code, but may be used in drivers if
+ absolutely necessary.
+
+ set_irq_chip(irq,chip)
+ Set the mask/unmask methods for handling this IRQ
+
+ set_irq_handler(irq,handler)
+ Set the handler for this IRQ (level, edge, simple)
+
+ set_irq_chained_handler(irq,handler)
+ Set a "chained" handler for this IRQ - automatically
+ enables this IRQ (eg, Neponset and SA1111 handlers).
+
+ set_irq_flags(irq,flags)
+ Set the valid/probe/noautoenable flags.
+
+ set_irq_type(irq,type)
+ Set active the IRQ edge(s)/level. This replaces the
+ SA1111 INTPOL manipulation, and the set_GPIO_IRQ_edge()
+ function. Type should be one of IRQ_TYPE_xxx defined in
+ <linux/irq.h>
+
+3. set_GPIO_IRQ_edge() is obsolete, and should be replaced by set_irq_type.
+
+4. Direct access to SA1111 INTPOL is deprecated. Use set_irq_type instead.
+
+5. A handler is expected to perform any necessary acknowledgement of the
+ parent IRQ via the correct chip specific function. For instance, if
+ the SA1111 is directly connected to a SA1110 GPIO, then you should
+ acknowledge the SA1110 IRQ each time you re-read the SA1111 IRQ status.
+
+6. For any child which doesn't have its own IRQ enable/disable controls
+ (eg, SMC9196), the handler must mask or acknowledge the parent IRQ
+ while the child handler is called, and the child handler should be the
+ "simple" handler (not "edge" nor "level"). After the handler completes,
+ the parent IRQ should be unmasked, and the status of all children must
+ be re-checked for pending events. (see the Neponset IRQ handler for
+ details).
+
+7. fixup_irq() is gone, as is `arch/arm/mach-*/include/mach/irq.h`
+
+Please note that this will not solve all problems - some of them are
+hardware based. Mixing level-based and edge-based IRQs on the same
+parent signal (eg neponset) is one such area where a software based
+solution can't provide the full answer to low IRQ latency.
diff --git a/Documentation/arm/ixp4xx.rst b/Documentation/arm/ixp4xx.rst
new file mode 100644
index 000000000000..a57235616294
--- /dev/null
+++ b/Documentation/arm/ixp4xx.rst
@@ -0,0 +1,173 @@
+===========================================================
+Release Notes for Linux on Intel's IXP4xx Network Processor
+===========================================================
+
+Maintained by Deepak Saxena <dsaxena@plexity.net>
+-------------------------------------------------------------------------
+
+1. Overview
+
+Intel's IXP4xx network processor is a highly integrated SOC that
+is targeted for network applications, though it has become popular
+in industrial control and other areas due to low cost and power
+consumption. The IXP4xx family currently consists of several processors
+that support different network offload functions such as encryption,
+routing, firewalling, etc. The IXP46x family is an updated version which
+supports faster speeds, new memory and flash configurations, and more
+integration such as an on-chip I2C controller.
+
+For more information on the various versions of the CPU, see:
+
+ http://developer.intel.com/design/network/products/npfamily/ixp4xx.htm
+
+Intel also made the IXCP1100 CPU for sometime which is an IXP4xx
+stripped of much of the network intelligence.
+
+2. Linux Support
+
+Linux currently supports the following features on the IXP4xx chips:
+
+- Dual serial ports
+- PCI interface
+- Flash access (MTD/JFFS)
+- I2C through GPIO on IXP42x
+- GPIO for input/output/interrupts
+ See arch/arm/mach-ixp4xx/include/mach/platform.h for access functions.
+- Timers (watchdog, OS)
+
+The following components of the chips are not supported by Linux and
+require the use of Intel's proprietary CSR software:
+
+- USB device interface
+- Network interfaces (HSS, Utopia, NPEs, etc)
+- Network offload functionality
+
+If you need to use any of the above, you need to download Intel's
+software from:
+
+ http://developer.intel.com/design/network/products/npfamily/ixp425.htm
+
+DO NOT POST QUESTIONS TO THE LINUX MAILING LISTS REGARDING THE PROPRIETARY
+SOFTWARE.
+
+There are several websites that provide directions/pointers on using
+Intel's software:
+
+ - http://sourceforge.net/projects/ixp4xx-osdg/
+ Open Source Developer's Guide for using uClinux and the Intel libraries
+
+ - http://gatewaymaker.sourceforge.net/
+ Simple one page summary of building a gateway using an IXP425 and Linux
+
+ - http://ixp425.sourceforge.net/
+ ATM device driver for IXP425 that relies on Intel's libraries
+
+3. Known Issues/Limitations
+
+3a. Limited inbound PCI window
+
+The IXP4xx family allows for up to 256MB of memory but the PCI interface
+can only expose 64MB of that memory to the PCI bus. This means that if
+you are running with > 64MB, all PCI buffers outside of the accessible
+range will be bounced using the routines in arch/arm/common/dmabounce.c.
+
+3b. Limited outbound PCI window
+
+IXP4xx provides two methods of accessing PCI memory space:
+
+1) A direct mapped window from 0x48000000 to 0x4bffffff (64MB).
+ To access PCI via this space, we simply ioremap() the BAR
+ into the kernel and we can use the standard read[bwl]/write[bwl]
+ macros. This is the preffered method due to speed but it
+ limits the system to just 64MB of PCI memory. This can be
+ problamatic if using video cards and other memory-heavy devices.
+
+2) If > 64MB of memory space is required, the IXP4xx can be
+ configured to use indirect registers to access PCI This allows
+ for up to 128MB (0x48000000 to 0x4fffffff) of memory on the bus.
+ The disadvantage of this is that every PCI access requires
+ three local register accesses plus a spinlock, but in some
+ cases the performance hit is acceptable. In addition, you cannot
+ mmap() PCI devices in this case due to the indirect nature
+ of the PCI window.
+
+By default, the direct method is used for performance reasons. If
+you need more PCI memory, enable the IXP4XX_INDIRECT_PCI config option.
+
+3c. GPIO as Interrupts
+
+Currently the code only handles level-sensitive GPIO interrupts
+
+4. Supported platforms
+
+ADI Engineering Coyote Gateway Reference Platform
+http://www.adiengineering.com/productsCoyote.html
+
+ The ADI Coyote platform is reference design for those building
+ small residential/office gateways. One NPE is connected to a 10/100
+ interface, one to 4-port 10/100 switch, and the third to and ADSL
+ interface. In addition, it also supports to POTs interfaces connected
+ via SLICs. Note that those are not supported by Linux ATM. Finally,
+ the platform has two mini-PCI slots used for 802.11[bga] cards.
+ Finally, there is an IDE port hanging off the expansion bus.
+
+Gateworks Avila Network Platform
+http://www.gateworks.com/support/overview.php
+
+ The Avila platform is basically and IXDP425 with the 4 PCI slots
+ replaced with mini-PCI slots and a CF IDE interface hanging off
+ the expansion bus.
+
+Intel IXDP425 Development Platform
+http://www.intel.com/design/network/products/npfamily/ixdpg425.htm
+
+ This is Intel's standard reference platform for the IXDP425 and is
+ also known as the Richfield board. It contains 4 PCI slots, 16MB
+ of flash, two 10/100 ports and one ADSL port.
+
+Intel IXDP465 Development Platform
+http://www.intel.com/design/network/products/npfamily/ixdp465.htm
+
+ This is basically an IXDP425 with an IXP465 and 32M of flash instead
+ of just 16.
+
+Intel IXDPG425 Development Platform
+
+ This is basically and ADI Coyote board with a NEC EHCI controller
+ added. One issue with this board is that the mini-PCI slots only
+ have the 3.3v line connected, so you can't use a PCI to mini-PCI
+ adapter with an E100 card. So to NFS root you need to use either
+ the CSR or a WiFi card and a ramdisk that BOOTPs and then does
+ a pivot_root to NFS.
+
+Motorola PrPMC1100 Processor Mezanine Card
+http://www.fountainsys.com
+
+ The PrPMC1100 is based on the IXCP1100 and is meant to plug into
+ and IXP2400/2800 system to act as the system controller. It simply
+ contains a CPU and 16MB of flash on the board and needs to be
+ plugged into a carrier board to function. Currently Linux only
+ supports the Motorola PrPMC carrier board for this platform.
+
+5. TODO LIST
+
+- Add support for Coyote IDE
+- Add support for edge-based GPIO interrupts
+- Add support for CF IDE on expansion bus
+
+6. Thanks
+
+The IXP4xx work has been funded by Intel Corp. and MontaVista Software, Inc.
+
+The following people have contributed patches/comments/etc:
+
+- Lennerty Buytenhek
+- Lutz Jaenicke
+- Justin Mayfield
+- Robert E. Ranslam
+
+[I know I've forgotten others, please email me to be added]
+
+-------------------------------------------------------------------------
+
+Last Update: 01/04/2005
diff --git a/Documentation/arm/kernel_mode_neon.rst b/Documentation/arm/kernel_mode_neon.rst
new file mode 100644
index 000000000000..9bfb71a2a9b9
--- /dev/null
+++ b/Documentation/arm/kernel_mode_neon.rst
@@ -0,0 +1,124 @@
+================
+Kernel mode NEON
+================
+
+TL;DR summary
+-------------
+* Use only NEON instructions, or VFP instructions that don't rely on support
+ code
+* Isolate your NEON code in a separate compilation unit, and compile it with
+ '-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
+* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
+ NEON code
+* Don't sleep in your NEON code, and be aware that it will be executed with
+ preemption disabled
+
+
+Introduction
+------------
+It is possible to use NEON instructions (and in some cases, VFP instructions) in
+code that runs in kernel mode. However, for performance reasons, the NEON/VFP
+register file is not preserved and restored at every context switch or taken
+exception like the normal register file is, so some manual intervention is
+required. Furthermore, special care is required for code that may sleep [i.e.,
+may call schedule()], as NEON or VFP instructions will be executed in a
+non-preemptible section for reasons outlined below.
+
+
+Lazy preserve and restore
+-------------------------
+The NEON/VFP register file is managed using lazy preserve (on UP systems) and
+lazy restore (on both SMP and UP systems). This means that the register file is
+kept 'live', and is only preserved and restored when multiple tasks are
+contending for the NEON/VFP unit (or, in the SMP case, when a task migrates to
+another core). Lazy restore is implemented by disabling the NEON/VFP unit after
+every context switch, resulting in a trap when subsequently a NEON/VFP
+instruction is issued, allowing the kernel to step in and perform the restore if
+necessary.
+
+Any use of the NEON/VFP unit in kernel mode should not interfere with this, so
+it is required to do an 'eager' preserve of the NEON/VFP register file, and
+enable the NEON/VFP unit explicitly so no exceptions are generated on first
+subsequent use. This is handled by the function kernel_neon_begin(), which
+should be called before any kernel mode NEON or VFP instructions are issued.
+Likewise, the NEON/VFP unit should be disabled again after use to make sure user
+mode will hit the lazy restore trap upon next use. This is handled by the
+function kernel_neon_end().
+
+
+Interruptions in kernel mode
+----------------------------
+For reasons of performance and simplicity, it was decided that there shall be no
+preserve/restore mechanism for the kernel mode NEON/VFP register contents. This
+implies that interruptions of a kernel mode NEON section can only be allowed if
+they are guaranteed not to touch the NEON/VFP registers. For this reason, the
+following rules and restrictions apply in the kernel:
+* NEON/VFP code is not allowed in interrupt context;
+* NEON/VFP code is not allowed to sleep;
+* NEON/VFP code is executed with preemption disabled.
+
+If latency is a concern, it is possible to put back to back calls to
+kernel_neon_end() and kernel_neon_begin() in places in your code where none of
+the NEON registers are live. (Additional calls to kernel_neon_begin() should be
+reasonably cheap if no context switch occurred in the meantime)
+
+
+VFP and support code
+--------------------
+Earlier versions of VFP (prior to version 3) rely on software support for things
+like IEEE-754 compliant underflow handling etc. When the VFP unit needs such
+software assistance, it signals the kernel by raising an undefined instruction
+exception. The kernel responds by inspecting the VFP control registers and the
+current instruction and arguments, and emulates the instruction in software.
+
+Such software assistance is currently not implemented for VFP instructions
+executed in kernel mode. If such a condition is encountered, the kernel will
+fail and generate an OOPS.
+
+
+Separating NEON code from ordinary code
+---------------------------------------
+The compiler is not aware of the special significance of kernel_neon_begin() and
+kernel_neon_end(), i.e., that it is only allowed to issue NEON/VFP instructions
+between calls to these respective functions. Furthermore, GCC may generate NEON
+instructions of its own at -O3 level if -mfpu=neon is selected, and even if the
+kernel is currently compiled at -O2, future changes may result in NEON/VFP
+instructions appearing in unexpected places if no special care is taken.
+
+Therefore, the recommended and only supported way of using NEON/VFP in the
+kernel is by adhering to the following rules:
+
+* isolate the NEON code in a separate compilation unit and compile it with
+ '-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
+* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
+ into the unit containing the NEON code from a compilation unit which is *not*
+ built with the GCC flag '-mfpu=neon' set.
+
+As the kernel is compiled with '-msoft-float', the above will guarantee that
+both NEON and VFP instructions will only ever appear in designated compilation
+units at any optimization level.
+
+
+NEON assembler
+--------------
+NEON assembler is supported with no additional caveats as long as the rules
+above are followed.
+
+
+NEON code generated by GCC
+--------------------------
+The GCC option -ftree-vectorize (implied by -O3) tries to exploit implicit
+parallelism, and generates NEON code from ordinary C source code. This is fully
+supported as long as the rules above are followed.
+
+
+NEON intrinsics
+---------------
+NEON intrinsics are also supported. However, as code using NEON intrinsics
+relies on the GCC header <arm_neon.h>, (which #includes <stdint.h>), you should
+observe the following in addition to the rules above:
+
+* Compile the unit containing the NEON intrinsics with '-ffreestanding' so GCC
+ uses its builtin version of <stdint.h> (this is a C99 header which the kernel
+ does not supply);
+* Include <arm_neon.h> last, or at least after <linux/types.h>
diff --git a/Documentation/arm/kernel_mode_neon.txt b/Documentation/arm/kernel_mode_neon.txt
deleted file mode 100644
index b9e060c5b61e..000000000000
--- a/Documentation/arm/kernel_mode_neon.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-Kernel mode NEON
-================
-
-TL;DR summary
--------------
-* Use only NEON instructions, or VFP instructions that don't rely on support
- code
-* Isolate your NEON code in a separate compilation unit, and compile it with
- '-march=armv7-a -mfpu=neon -mfloat-abi=softfp'
-* Put kernel_neon_begin() and kernel_neon_end() calls around the calls into your
- NEON code
-* Don't sleep in your NEON code, and be aware that it will be executed with
- preemption disabled
-
-
-Introduction
-------------
-It is possible to use NEON instructions (and in some cases, VFP instructions) in
-code that runs in kernel mode. However, for performance reasons, the NEON/VFP
-register file is not preserved and restored at every context switch or taken
-exception like the normal register file is, so some manual intervention is
-required. Furthermore, special care is required for code that may sleep [i.e.,
-may call schedule()], as NEON or VFP instructions will be executed in a
-non-preemptible section for reasons outlined below.
-
-
-Lazy preserve and restore
--------------------------
-The NEON/VFP register file is managed using lazy preserve (on UP systems) and
-lazy restore (on both SMP and UP systems). This means that the register file is
-kept 'live', and is only preserved and restored when multiple tasks are
-contending for the NEON/VFP unit (or, in the SMP case, when a task migrates to
-another core). Lazy restore is implemented by disabling the NEON/VFP unit after
-every context switch, resulting in a trap when subsequently a NEON/VFP
-instruction is issued, allowing the kernel to step in and perform the restore if
-necessary.
-
-Any use of the NEON/VFP unit in kernel mode should not interfere with this, so
-it is required to do an 'eager' preserve of the NEON/VFP register file, and
-enable the NEON/VFP unit explicitly so no exceptions are generated on first
-subsequent use. This is handled by the function kernel_neon_begin(), which
-should be called before any kernel mode NEON or VFP instructions are issued.
-Likewise, the NEON/VFP unit should be disabled again after use to make sure user
-mode will hit the lazy restore trap upon next use. This is handled by the
-function kernel_neon_end().
-
-
-Interruptions in kernel mode
-----------------------------
-For reasons of performance and simplicity, it was decided that there shall be no
-preserve/restore mechanism for the kernel mode NEON/VFP register contents. This
-implies that interruptions of a kernel mode NEON section can only be allowed if
-they are guaranteed not to touch the NEON/VFP registers. For this reason, the
-following rules and restrictions apply in the kernel:
-* NEON/VFP code is not allowed in interrupt context;
-* NEON/VFP code is not allowed to sleep;
-* NEON/VFP code is executed with preemption disabled.
-
-If latency is a concern, it is possible to put back to back calls to
-kernel_neon_end() and kernel_neon_begin() in places in your code where none of
-the NEON registers are live. (Additional calls to kernel_neon_begin() should be
-reasonably cheap if no context switch occurred in the meantime)
-
-
-VFP and support code
---------------------
-Earlier versions of VFP (prior to version 3) rely on software support for things
-like IEEE-754 compliant underflow handling etc. When the VFP unit needs such
-software assistance, it signals the kernel by raising an undefined instruction
-exception. The kernel responds by inspecting the VFP control registers and the
-current instruction and arguments, and emulates the instruction in software.
-
-Such software assistance is currently not implemented for VFP instructions
-executed in kernel mode. If such a condition is encountered, the kernel will
-fail and generate an OOPS.
-
-
-Separating NEON code from ordinary code
----------------------------------------
-The compiler is not aware of the special significance of kernel_neon_begin() and
-kernel_neon_end(), i.e., that it is only allowed to issue NEON/VFP instructions
-between calls to these respective functions. Furthermore, GCC may generate NEON
-instructions of its own at -O3 level if -mfpu=neon is selected, and even if the
-kernel is currently compiled at -O2, future changes may result in NEON/VFP
-instructions appearing in unexpected places if no special care is taken.
-
-Therefore, the recommended and only supported way of using NEON/VFP in the
-kernel is by adhering to the following rules:
-* isolate the NEON code in a separate compilation unit and compile it with
- '-march=armv7-a -mfpu=neon -mfloat-abi=softfp';
-* issue the calls to kernel_neon_begin(), kernel_neon_end() as well as the calls
- into the unit containing the NEON code from a compilation unit which is *not*
- built with the GCC flag '-mfpu=neon' set.
-
-As the kernel is compiled with '-msoft-float', the above will guarantee that
-both NEON and VFP instructions will only ever appear in designated compilation
-units at any optimization level.
-
-
-NEON assembler
---------------
-NEON assembler is supported with no additional caveats as long as the rules
-above are followed.
-
-
-NEON code generated by GCC
---------------------------
-The GCC option -ftree-vectorize (implied by -O3) tries to exploit implicit
-parallelism, and generates NEON code from ordinary C source code. This is fully
-supported as long as the rules above are followed.
-
-
-NEON intrinsics
----------------
-NEON intrinsics are also supported. However, as code using NEON intrinsics
-relies on the GCC header <arm_neon.h>, (which #includes <stdint.h>), you should
-observe the following in addition to the rules above:
-* Compile the unit containing the NEON intrinsics with '-ffreestanding' so GCC
- uses its builtin version of <stdint.h> (this is a C99 header which the kernel
- does not supply);
-* Include <arm_neon.h> last, or at least after <linux/types.h>
diff --git a/Documentation/arm/kernel_user_helpers.rst b/Documentation/arm/kernel_user_helpers.rst
new file mode 100644
index 000000000000..eb6f3d916622
--- /dev/null
+++ b/Documentation/arm/kernel_user_helpers.rst
@@ -0,0 +1,268 @@
+============================
+Kernel-provided User Helpers
+============================
+
+These are segment of kernel provided user code reachable from user space
+at a fixed address in kernel memory. This is used to provide user space
+with some operations which require kernel help because of unimplemented
+native feature and/or instructions in many ARM CPUs. The idea is for this
+code to be executed directly in user mode for best efficiency but which is
+too intimate with the kernel counter part to be left to user libraries.
+In fact this code might even differ from one CPU to another depending on
+the available instruction set, or whether it is a SMP systems. In other
+words, the kernel reserves the right to change this code as needed without
+warning. Only the entry points and their results as documented here are
+guaranteed to be stable.
+
+This is different from (but doesn't preclude) a full blown VDSO
+implementation, however a VDSO would prevent some assembly tricks with
+constants that allows for efficient branching to those code segments. And
+since those code segments only use a few cycles before returning to user
+code, the overhead of a VDSO indirect far call would add a measurable
+overhead to such minimalistic operations.
+
+User space is expected to bypass those helpers and implement those things
+inline (either in the code emitted directly by the compiler, or part of
+the implementation of a library call) when optimizing for a recent enough
+processor that has the necessary native support, but only if resulting
+binaries are already to be incompatible with earlier ARM processors due to
+usage of similar native instructions for other things. In other words
+don't make binaries unable to run on earlier processors just for the sake
+of not using these kernel helpers if your compiled code is not going to
+use new instructions for other purpose.
+
+New helpers may be added over time, so an older kernel may be missing some
+helpers present in a newer kernel. For this reason, programs must check
+the value of __kuser_helper_version (see below) before assuming that it is
+safe to call any particular helper. This check should ideally be
+performed only once at process startup time, and execution aborted early
+if the required helpers are not provided by the kernel version that
+process is running on.
+
+kuser_helper_version
+--------------------
+
+Location: 0xffff0ffc
+
+Reference declaration::
+
+ extern int32_t __kuser_helper_version;
+
+Definition:
+
+ This field contains the number of helpers being implemented by the
+ running kernel. User space may read this to determine the availability
+ of a particular helper.
+
+Usage example::
+
+ #define __kuser_helper_version (*(int32_t *)0xffff0ffc)
+
+ void check_kuser_version(void)
+ {
+ if (__kuser_helper_version < 2) {
+ fprintf(stderr, "can't do atomic operations, kernel too old\n");
+ abort();
+ }
+ }
+
+Notes:
+
+ User space may assume that the value of this field never changes
+ during the lifetime of any single process. This means that this
+ field can be read once during the initialisation of a library or
+ startup phase of a program.
+
+kuser_get_tls
+-------------
+
+Location: 0xffff0fe0
+
+Reference prototype::
+
+ void * __kuser_get_tls(void);
+
+Input:
+
+ lr = return address
+
+Output:
+
+ r0 = TLS value
+
+Clobbered registers:
+
+ none
+
+Definition:
+
+ Get the TLS value as previously set via the __ARM_NR_set_tls syscall.
+
+Usage example::
+
+ typedef void * (__kuser_get_tls_t)(void);
+ #define __kuser_get_tls (*(__kuser_get_tls_t *)0xffff0fe0)
+
+ void foo()
+ {
+ void *tls = __kuser_get_tls();
+ printf("TLS = %p\n", tls);
+ }
+
+Notes:
+
+ - Valid only if __kuser_helper_version >= 1 (from kernel version 2.6.12).
+
+kuser_cmpxchg
+-------------
+
+Location: 0xffff0fc0
+
+Reference prototype::
+
+ int __kuser_cmpxchg(int32_t oldval, int32_t newval, volatile int32_t *ptr);
+
+Input:
+
+ r0 = oldval
+ r1 = newval
+ r2 = ptr
+ lr = return address
+
+Output:
+
+ r0 = success code (zero or non-zero)
+ C flag = set if r0 == 0, clear if r0 != 0
+
+Clobbered registers:
+
+ r3, ip, flags
+
+Definition:
+
+ Atomically store newval in `*ptr` only if `*ptr` is equal to oldval.
+ Return zero if `*ptr` was changed or non-zero if no exchange happened.
+ The C flag is also set if `*ptr` was changed to allow for assembly
+ optimization in the calling code.
+
+Usage example::
+
+ typedef int (__kuser_cmpxchg_t)(int oldval, int newval, volatile int *ptr);
+ #define __kuser_cmpxchg (*(__kuser_cmpxchg_t *)0xffff0fc0)
+
+ int atomic_add(volatile int *ptr, int val)
+ {
+ int old, new;
+
+ do {
+ old = *ptr;
+ new = old + val;
+ } while(__kuser_cmpxchg(old, new, ptr));
+
+ return new;
+ }
+
+Notes:
+
+ - This routine already includes memory barriers as needed.
+
+ - Valid only if __kuser_helper_version >= 2 (from kernel version 2.6.12).
+
+kuser_memory_barrier
+--------------------
+
+Location: 0xffff0fa0
+
+Reference prototype::
+
+ void __kuser_memory_barrier(void);
+
+Input:
+
+ lr = return address
+
+Output:
+
+ none
+
+Clobbered registers:
+
+ none
+
+Definition:
+
+ Apply any needed memory barrier to preserve consistency with data modified
+ manually and __kuser_cmpxchg usage.
+
+Usage example::
+
+ typedef void (__kuser_dmb_t)(void);
+ #define __kuser_dmb (*(__kuser_dmb_t *)0xffff0fa0)
+
+Notes:
+
+ - Valid only if __kuser_helper_version >= 3 (from kernel version 2.6.15).
+
+kuser_cmpxchg64
+---------------
+
+Location: 0xffff0f60
+
+Reference prototype::
+
+ int __kuser_cmpxchg64(const int64_t *oldval,
+ const int64_t *newval,
+ volatile int64_t *ptr);
+
+Input:
+
+ r0 = pointer to oldval
+ r1 = pointer to newval
+ r2 = pointer to target value
+ lr = return address
+
+Output:
+
+ r0 = success code (zero or non-zero)
+ C flag = set if r0 == 0, clear if r0 != 0
+
+Clobbered registers:
+
+ r3, lr, flags
+
+Definition:
+
+ Atomically store the 64-bit value pointed by `*newval` in `*ptr` only if `*ptr`
+ is equal to the 64-bit value pointed by `*oldval`. Return zero if `*ptr` was
+ changed or non-zero if no exchange happened.
+
+ The C flag is also set if `*ptr` was changed to allow for assembly
+ optimization in the calling code.
+
+Usage example::
+
+ typedef int (__kuser_cmpxchg64_t)(const int64_t *oldval,
+ const int64_t *newval,
+ volatile int64_t *ptr);
+ #define __kuser_cmpxchg64 (*(__kuser_cmpxchg64_t *)0xffff0f60)
+
+ int64_t atomic_add64(volatile int64_t *ptr, int64_t val)
+ {
+ int64_t old, new;
+
+ do {
+ old = *ptr;
+ new = old + val;
+ } while(__kuser_cmpxchg64(&old, &new, ptr));
+
+ return new;
+ }
+
+Notes:
+
+ - This routine already includes memory barriers as needed.
+
+ - Due to the length of this sequence, this spans 2 conventional kuser
+ "slots", therefore 0xffff0f80 is not used as a valid entry point.
+
+ - Valid only if __kuser_helper_version >= 5 (from kernel version 3.1).
diff --git a/Documentation/arm/kernel_user_helpers.txt b/Documentation/arm/kernel_user_helpers.txt
deleted file mode 100644
index 5673594717cf..000000000000
--- a/Documentation/arm/kernel_user_helpers.txt
+++ /dev/null
@@ -1,267 +0,0 @@
-Kernel-provided User Helpers
-============================
-
-These are segment of kernel provided user code reachable from user space
-at a fixed address in kernel memory. This is used to provide user space
-with some operations which require kernel help because of unimplemented
-native feature and/or instructions in many ARM CPUs. The idea is for this
-code to be executed directly in user mode for best efficiency but which is
-too intimate with the kernel counter part to be left to user libraries.
-In fact this code might even differ from one CPU to another depending on
-the available instruction set, or whether it is a SMP systems. In other
-words, the kernel reserves the right to change this code as needed without
-warning. Only the entry points and their results as documented here are
-guaranteed to be stable.
-
-This is different from (but doesn't preclude) a full blown VDSO
-implementation, however a VDSO would prevent some assembly tricks with
-constants that allows for efficient branching to those code segments. And
-since those code segments only use a few cycles before returning to user
-code, the overhead of a VDSO indirect far call would add a measurable
-overhead to such minimalistic operations.
-
-User space is expected to bypass those helpers and implement those things
-inline (either in the code emitted directly by the compiler, or part of
-the implementation of a library call) when optimizing for a recent enough
-processor that has the necessary native support, but only if resulting
-binaries are already to be incompatible with earlier ARM processors due to
-usage of similar native instructions for other things. In other words
-don't make binaries unable to run on earlier processors just for the sake
-of not using these kernel helpers if your compiled code is not going to
-use new instructions for other purpose.
-
-New helpers may be added over time, so an older kernel may be missing some
-helpers present in a newer kernel. For this reason, programs must check
-the value of __kuser_helper_version (see below) before assuming that it is
-safe to call any particular helper. This check should ideally be
-performed only once at process startup time, and execution aborted early
-if the required helpers are not provided by the kernel version that
-process is running on.
-
-kuser_helper_version
---------------------
-
-Location: 0xffff0ffc
-
-Reference declaration:
-
- extern int32_t __kuser_helper_version;
-
-Definition:
-
- This field contains the number of helpers being implemented by the
- running kernel. User space may read this to determine the availability
- of a particular helper.
-
-Usage example:
-
-#define __kuser_helper_version (*(int32_t *)0xffff0ffc)
-
-void check_kuser_version(void)
-{
- if (__kuser_helper_version < 2) {
- fprintf(stderr, "can't do atomic operations, kernel too old\n");
- abort();
- }
-}
-
-Notes:
-
- User space may assume that the value of this field never changes
- during the lifetime of any single process. This means that this
- field can be read once during the initialisation of a library or
- startup phase of a program.
-
-kuser_get_tls
--------------
-
-Location: 0xffff0fe0
-
-Reference prototype:
-
- void * __kuser_get_tls(void);
-
-Input:
-
- lr = return address
-
-Output:
-
- r0 = TLS value
-
-Clobbered registers:
-
- none
-
-Definition:
-
- Get the TLS value as previously set via the __ARM_NR_set_tls syscall.
-
-Usage example:
-
-typedef void * (__kuser_get_tls_t)(void);
-#define __kuser_get_tls (*(__kuser_get_tls_t *)0xffff0fe0)
-
-void foo()
-{
- void *tls = __kuser_get_tls();
- printf("TLS = %p\n", tls);
-}
-
-Notes:
-
- - Valid only if __kuser_helper_version >= 1 (from kernel version 2.6.12).
-
-kuser_cmpxchg
--------------
-
-Location: 0xffff0fc0
-
-Reference prototype:
-
- int __kuser_cmpxchg(int32_t oldval, int32_t newval, volatile int32_t *ptr);
-
-Input:
-
- r0 = oldval
- r1 = newval
- r2 = ptr
- lr = return address
-
-Output:
-
- r0 = success code (zero or non-zero)
- C flag = set if r0 == 0, clear if r0 != 0
-
-Clobbered registers:
-
- r3, ip, flags
-
-Definition:
-
- Atomically store newval in *ptr only if *ptr is equal to oldval.
- Return zero if *ptr was changed or non-zero if no exchange happened.
- The C flag is also set if *ptr was changed to allow for assembly
- optimization in the calling code.
-
-Usage example:
-
-typedef int (__kuser_cmpxchg_t)(int oldval, int newval, volatile int *ptr);
-#define __kuser_cmpxchg (*(__kuser_cmpxchg_t *)0xffff0fc0)
-
-int atomic_add(volatile int *ptr, int val)
-{
- int old, new;
-
- do {
- old = *ptr;
- new = old + val;
- } while(__kuser_cmpxchg(old, new, ptr));
-
- return new;
-}
-
-Notes:
-
- - This routine already includes memory barriers as needed.
-
- - Valid only if __kuser_helper_version >= 2 (from kernel version 2.6.12).
-
-kuser_memory_barrier
---------------------
-
-Location: 0xffff0fa0
-
-Reference prototype:
-
- void __kuser_memory_barrier(void);
-
-Input:
-
- lr = return address
-
-Output:
-
- none
-
-Clobbered registers:
-
- none
-
-Definition:
-
- Apply any needed memory barrier to preserve consistency with data modified
- manually and __kuser_cmpxchg usage.
-
-Usage example:
-
-typedef void (__kuser_dmb_t)(void);
-#define __kuser_dmb (*(__kuser_dmb_t *)0xffff0fa0)
-
-Notes:
-
- - Valid only if __kuser_helper_version >= 3 (from kernel version 2.6.15).
-
-kuser_cmpxchg64
----------------
-
-Location: 0xffff0f60
-
-Reference prototype:
-
- int __kuser_cmpxchg64(const int64_t *oldval,
- const int64_t *newval,
- volatile int64_t *ptr);
-
-Input:
-
- r0 = pointer to oldval
- r1 = pointer to newval
- r2 = pointer to target value
- lr = return address
-
-Output:
-
- r0 = success code (zero or non-zero)
- C flag = set if r0 == 0, clear if r0 != 0
-
-Clobbered registers:
-
- r3, lr, flags
-
-Definition:
-
- Atomically store the 64-bit value pointed by *newval in *ptr only if *ptr
- is equal to the 64-bit value pointed by *oldval. Return zero if *ptr was
- changed or non-zero if no exchange happened.
-
- The C flag is also set if *ptr was changed to allow for assembly
- optimization in the calling code.
-
-Usage example:
-
-typedef int (__kuser_cmpxchg64_t)(const int64_t *oldval,
- const int64_t *newval,
- volatile int64_t *ptr);
-#define __kuser_cmpxchg64 (*(__kuser_cmpxchg64_t *)0xffff0f60)
-
-int64_t atomic_add64(volatile int64_t *ptr, int64_t val)
-{
- int64_t old, new;
-
- do {
- old = *ptr;
- new = old + val;
- } while(__kuser_cmpxchg64(&old, &new, ptr));
-
- return new;
-}
-
-Notes:
-
- - This routine already includes memory barriers as needed.
-
- - Due to the length of this sequence, this spans 2 conventional kuser
- "slots", therefore 0xffff0f80 is not used as a valid entry point.
-
- - Valid only if __kuser_helper_version >= 5 (from kernel version 3.1).
diff --git a/Documentation/arm/keystone/Overview.txt b/Documentation/arm/keystone/Overview.txt
deleted file mode 100644
index 400c0c270d2e..000000000000
--- a/Documentation/arm/keystone/Overview.txt
+++ /dev/null
@@ -1,55 +0,0 @@
- TI Keystone Linux Overview
- --------------------------
-
-Introduction
-------------
-Keystone range of SoCs are based on ARM Cortex-A15 MPCore Processors
-and c66x DSP cores. This document describes essential information required
-for users to run Linux on Keystone based EVMs from Texas Instruments.
-
-Following SoCs & EVMs are currently supported:-
-
------------- K2HK SoC and EVM --------------------------------------------------
-
-a.k.a Keystone 2 Hawking/Kepler SoC
-TCI6636K2H & TCI6636K2K: See documentation at
- http://www.ti.com/product/tci6638k2k
- http://www.ti.com/product/tci6638k2h
-
-EVM:
-http://www.advantech.com/Support/TI-EVM/EVMK2HX_sd.aspx
-
------------- K2E SoC and EVM ---------------------------------------------------
-
-a.k.a Keystone 2 Edison SoC
-K2E - 66AK2E05: See documentation at
- http://www.ti.com/product/66AK2E05/technicaldocuments
-
-EVM:
-https://www.einfochips.com/index.php/partnerships/texas-instruments/k2e-evm.html
-
------------- K2L SoC and EVM ---------------------------------------------------
-
-a.k.a Keystone 2 Lamarr SoC
-K2L - TCI6630K2L: See documentation at
- http://www.ti.com/product/TCI6630K2L/technicaldocuments
-EVM:
-https://www.einfochips.com/index.php/partnerships/texas-instruments/k2l-evm.html
-
-Configuration
--------------
-
-All of the K2 SoCs/EVMs share a common defconfig, keystone_defconfig and same
-image is used to boot on individual EVMs. The platform configuration is
-specified through DTS. Following are the DTS used:-
- K2HK EVM : k2hk-evm.dts
- K2E EVM : k2e-evm.dts
- K2L EVM : k2l-evm.dts
-
-The device tree documentation for the keystone machines are located at
- Documentation/devicetree/bindings/arm/keystone/keystone.txt
-
-Document Author
----------------
-Murali Karicheri <m-karicheri2@ti.com>
-Copyright 2015 Texas Instruments
diff --git a/Documentation/arm/keystone/knav-qmss.rst b/Documentation/arm/keystone/knav-qmss.rst
new file mode 100644
index 000000000000..7f7638d80b42
--- /dev/null
+++ b/Documentation/arm/keystone/knav-qmss.rst
@@ -0,0 +1,60 @@
+======================================================================
+Texas Instruments Keystone Navigator Queue Management SubSystem driver
+======================================================================
+
+Driver source code path
+ drivers/soc/ti/knav_qmss.c
+ drivers/soc/ti/knav_qmss_acc.c
+
+The QMSS (Queue Manager Sub System) found on Keystone SOCs is one of
+the main hardware sub system which forms the backbone of the Keystone
+multi-core Navigator. QMSS consist of queue managers, packed-data structure
+processors(PDSP), linking RAM, descriptor pools and infrastructure
+Packet DMA.
+The Queue Manager is a hardware module that is responsible for accelerating
+management of the packet queues. Packets are queued/de-queued by writing or
+reading descriptor address to a particular memory mapped location. The PDSPs
+perform QMSS related functions like accumulation, QoS, or event management.
+Linking RAM registers are used to link the descriptors which are stored in
+descriptor RAM. Descriptor RAM is configurable as internal or external memory.
+The QMSS driver manages the PDSP setups, linking RAM regions,
+queue pool management (allocation, push, pop and notify) and descriptor
+pool management.
+
+knav qmss driver provides a set of APIs to drivers to open/close qmss queues,
+allocate descriptor pools, map the descriptors, push/pop to queues etc. For
+details of the available APIs, please refers to include/linux/soc/ti/knav_qmss.h
+
+DT documentation is available at
+Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
+
+Accumulator QMSS queues using PDSP firmware
+============================================
+The QMSS PDSP firmware support accumulator channel that can monitor a single
+queue or multiple contiguous queues. drivers/soc/ti/knav_qmss_acc.c is the
+driver that interface with the accumulator PDSP. This configures
+accumulator channels defined in DTS (example in DT documentation) to monitor
+1 or 32 queues per channel. More description on the firmware is available in
+CPPI/QMSS Low Level Driver document (docs/CPPI_QMSS_LLD_SDS.pdf) at
+
+ git://git.ti.com/keystone-rtos/qmss-lld.git
+
+k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin firmware supports upto 48 accumulator
+channels. This firmware is available under ti-keystone folder of
+firmware.git at
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git
+
+To use copy the firmware image to lib/firmware folder of the initramfs or
+ubifs file system and provide a sym link to k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin
+in the file system and boot up the kernel. User would see
+
+ "firmware file ks2_qmss_pdsp_acc48.bin downloaded for PDSP"
+
+in the boot up log if loading of firmware to PDSP is successful.
+
+Use of accumulated queues requires the firmware image to be present in the
+file system. The driver doesn't acc queues to the supported queue range if
+PDSP is not running in the SoC. The API call fails if there is a queue open
+request to an acc queue and PDSP is not running. So make sure to copy firmware
+to file system before using these queue types.
diff --git a/Documentation/arm/keystone/knav-qmss.txt b/Documentation/arm/keystone/knav-qmss.txt
deleted file mode 100644
index fcdb9fd5f53a..000000000000
--- a/Documentation/arm/keystone/knav-qmss.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-* Texas Instruments Keystone Navigator Queue Management SubSystem driver
-
-Driver source code path
- drivers/soc/ti/knav_qmss.c
- drivers/soc/ti/knav_qmss_acc.c
-
-The QMSS (Queue Manager Sub System) found on Keystone SOCs is one of
-the main hardware sub system which forms the backbone of the Keystone
-multi-core Navigator. QMSS consist of queue managers, packed-data structure
-processors(PDSP), linking RAM, descriptor pools and infrastructure
-Packet DMA.
-The Queue Manager is a hardware module that is responsible for accelerating
-management of the packet queues. Packets are queued/de-queued by writing or
-reading descriptor address to a particular memory mapped location. The PDSPs
-perform QMSS related functions like accumulation, QoS, or event management.
-Linking RAM registers are used to link the descriptors which are stored in
-descriptor RAM. Descriptor RAM is configurable as internal or external memory.
-The QMSS driver manages the PDSP setups, linking RAM regions,
-queue pool management (allocation, push, pop and notify) and descriptor
-pool management.
-
-knav qmss driver provides a set of APIs to drivers to open/close qmss queues,
-allocate descriptor pools, map the descriptors, push/pop to queues etc. For
-details of the available APIs, please refers to include/linux/soc/ti/knav_qmss.h
-
-DT documentation is available at
-Documentation/devicetree/bindings/soc/ti/keystone-navigator-qmss.txt
-
-Accumulator QMSS queues using PDSP firmware
-============================================
-The QMSS PDSP firmware support accumulator channel that can monitor a single
-queue or multiple contiguous queues. drivers/soc/ti/knav_qmss_acc.c is the
-driver that interface with the accumulator PDSP. This configures
-accumulator channels defined in DTS (example in DT documentation) to monitor
-1 or 32 queues per channel. More description on the firmware is available in
-CPPI/QMSS Low Level Driver document (docs/CPPI_QMSS_LLD_SDS.pdf) at
- git://git.ti.com/keystone-rtos/qmss-lld.git
-
-k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin firmware supports upto 48 accumulator
-channels. This firmware is available under ti-keystone folder of
-firmware.git at
- git://git.kernel.org/pub/scm/linux/kernel/git/firmware/linux-firmware.git
-
-To use copy the firmware image to lib/firmware folder of the initramfs or
-ubifs file system and provide a sym link to k2_qmss_pdsp_acc48_k2_le_1_0_0_9.bin
-in the file system and boot up the kernel. User would see
-
- "firmware file ks2_qmss_pdsp_acc48.bin downloaded for PDSP"
-
-in the boot up log if loading of firmware to PDSP is successful.
-
-Use of accumulated queues requires the firmware image to be present in the
-file system. The driver doesn't acc queues to the supported queue range if
-PDSP is not running in the SoC. The API call fails if there is a queue open
-request to an acc queue and PDSP is not running. So make sure to copy firmware
-to file system before using these queue types.
diff --git a/Documentation/arm/keystone/overview.rst b/Documentation/arm/keystone/overview.rst
new file mode 100644
index 000000000000..cd90298c493c
--- /dev/null
+++ b/Documentation/arm/keystone/overview.rst
@@ -0,0 +1,74 @@
+==========================
+TI Keystone Linux Overview
+==========================
+
+Introduction
+------------
+Keystone range of SoCs are based on ARM Cortex-A15 MPCore Processors
+and c66x DSP cores. This document describes essential information required
+for users to run Linux on Keystone based EVMs from Texas Instruments.
+
+Following SoCs & EVMs are currently supported:-
+
+K2HK SoC and EVM
+=================
+
+a.k.a Keystone 2 Hawking/Kepler SoC
+TCI6636K2H & TCI6636K2K: See documentation at
+
+ http://www.ti.com/product/tci6638k2k
+ http://www.ti.com/product/tci6638k2h
+
+EVM:
+ http://www.advantech.com/Support/TI-EVM/EVMK2HX_sd.aspx
+
+K2E SoC and EVM
+===============
+
+a.k.a Keystone 2 Edison SoC
+
+K2E - 66AK2E05:
+
+See documentation at
+
+ http://www.ti.com/product/66AK2E05/technicaldocuments
+
+EVM:
+ https://www.einfochips.com/index.php/partnerships/texas-instruments/k2e-evm.html
+
+K2L SoC and EVM
+===============
+
+a.k.a Keystone 2 Lamarr SoC
+
+K2L - TCI6630K2L:
+
+See documentation at
+ http://www.ti.com/product/TCI6630K2L/technicaldocuments
+
+EVM:
+ https://www.einfochips.com/index.php/partnerships/texas-instruments/k2l-evm.html
+
+Configuration
+-------------
+
+All of the K2 SoCs/EVMs share a common defconfig, keystone_defconfig and same
+image is used to boot on individual EVMs. The platform configuration is
+specified through DTS. Following are the DTS used:
+
+ K2HK EVM:
+ k2hk-evm.dts
+ K2E EVM:
+ k2e-evm.dts
+ K2L EVM:
+ k2l-evm.dts
+
+The device tree documentation for the keystone machines are located at
+
+ Documentation/devicetree/bindings/arm/keystone/keystone.txt
+
+Document Author
+---------------
+Murali Karicheri <m-karicheri2@ti.com>
+
+Copyright 2015 Texas Instruments
diff --git a/Documentation/arm/marvel.rst b/Documentation/arm/marvel.rst
new file mode 100644
index 000000000000..16ab2eb085b8
--- /dev/null
+++ b/Documentation/arm/marvel.rst
@@ -0,0 +1,488 @@
+================
+ARM Marvell SoCs
+================
+
+This document lists all the ARM Marvell SoCs that are currently
+supported in mainline by the Linux kernel. As the Marvell families of
+SoCs are large and complex, it is hard to understand where the support
+for a particular SoC is available in the Linux kernel. This document
+tries to help in understanding where those SoCs are supported, and to
+match them with their corresponding public datasheet, when available.
+
+Orion family
+------------
+
+ Flavors:
+ - 88F5082
+ - 88F5181
+ - 88F5181L
+ - 88F5182
+
+ - Datasheet: http://www.embeddedarm.com/documentation/third-party/MV88F5182-datasheet.pdf
+ - Programmer's User Guide: http://www.embeddedarm.com/documentation/third-party/MV88F5182-opensource-manual.pdf
+ - User Manual: http://www.embeddedarm.com/documentation/third-party/MV88F5182-usermanual.pdf
+ - 88F5281
+
+ - Datasheet: http://www.ocmodshop.com/images/reviews/networking/qnap_ts409u/marvel_88f5281_data_sheet.pdf
+ - 88F6183
+ Core:
+ Feroceon 88fr331 (88f51xx) or 88fr531-vd (88f52xx) ARMv5 compatible
+ Linux kernel mach directory:
+ arch/arm/mach-orion5x
+ Linux kernel plat directory:
+ arch/arm/plat-orion
+
+Kirkwood family
+---------------
+
+ Flavors:
+ - 88F6282 a.k.a Armada 300
+
+ - Product Brief : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
+ - 88F6283 a.k.a Armada 310
+
+ - Product Brief : http://www.marvell.com/embedded-processors/armada-300/assets/armada_310.pdf
+ - 88F6190
+
+ - Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6190-003_WEB.pdf
+ - Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
+ - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+ - 88F6192
+
+ - Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6192-003_ver1.pdf
+ - Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F619x_OpenSource.pdf
+ - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+ - 88F6182
+ - 88F6180
+
+ - Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6180-003_ver1.pdf
+ - Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6180_OpenSource.pdf
+ - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+ - 88F6281
+
+ - Product Brief : http://www.marvell.com/embedded-processors/kirkwood/assets/88F6281-004_ver1.pdf
+ - Hardware Spec : http://www.marvell.com/embedded-processors/kirkwood/assets/HW_88F6281_OpenSource.pdf
+ - Functional Spec: http://www.marvell.com/embedded-processors/kirkwood/assets/FS_88F6180_9x_6281_OpenSource.pdf
+ Homepage:
+ http://www.marvell.com/embedded-processors/kirkwood/
+ Core:
+ Feroceon 88fr131 ARMv5 compatible
+ Linux kernel mach directory:
+ arch/arm/mach-mvebu
+ Linux kernel plat directory:
+ none
+
+Discovery family
+----------------
+
+ Flavors:
+ - MV78100
+
+ - Product Brief : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78100-003_WEB.pdf
+ - Hardware Spec : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78100_OpenSource.pdf
+ - Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
+ - MV78200
+
+ - Product Brief : http://www.marvell.com/embedded-processors/discovery-innovation/assets/MV78200-002_WEB.pdf
+ - Hardware Spec : http://www.marvell.com/embedded-processors/discovery-innovation/assets/HW_MV78200_OpenSource.pdf
+ - Functional Spec: http://www.marvell.com/embedded-processors/discovery-innovation/assets/FS_MV76100_78100_78200_OpenSource.pdf
+ - MV76100
+
+ Not supported by the Linux kernel.
+
+ Core:
+ Feroceon 88fr571-vd ARMv5 compatible
+
+ Linux kernel mach directory:
+ arch/arm/mach-mv78xx0
+ Linux kernel plat directory:
+ arch/arm/plat-orion
+
+EBU Armada family
+-----------------
+
+ Armada 370 Flavors:
+ - 88F6710
+ - 88F6707
+ - 88F6W11
+
+ - Product Brief: http://www.marvell.com/embedded-processors/armada-300/assets/Marvell_ARMADA_370_SoC.pdf
+ - Hardware Spec: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-datasheet.pdf
+ - Functional Spec: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA370-FunctionalSpec-datasheet.pdf
+
+ Core:
+ Sheeva ARMv7 compatible PJ4B
+
+ Armada 375 Flavors:
+ - 88F6720
+
+ - Product Brief: http://www.marvell.com/embedded-processors/armada-300/assets/ARMADA_375_SoC-01_product_brief.pdf
+
+ Core:
+ ARM Cortex-A9
+
+ Armada 38x Flavors:
+ - 88F6810 Armada 380
+ - 88F6820 Armada 385
+ - 88F6828 Armada 388
+
+ - Product infos: http://www.marvell.com/embedded-processors/armada-38x/
+ - Functional Spec: https://marvellcorp.wufoo.com/forms/marvell-armada-38x-functional-specifications/
+
+ Core:
+ ARM Cortex-A9
+
+ Armada 39x Flavors:
+ - 88F6920 Armada 390
+ - 88F6928 Armada 398
+
+ - Product infos: http://www.marvell.com/embedded-processors/armada-39x/
+
+ Core:
+ ARM Cortex-A9
+
+ Armada XP Flavors:
+ - MV78230
+ - MV78260
+ - MV78460
+
+ NOTE:
+ not to be confused with the non-SMP 78xx0 SoCs
+
+ Product Brief:
+ http://www.marvell.com/embedded-processors/armada-xp/assets/Marvell-ArmadaXP-SoC-product%20brief.pdf
+
+ Functional Spec:
+ http://www.marvell.com/embedded-processors/armada-xp/assets/ARMADA-XP-Functional-SpecDatasheet.pdf
+
+ - Hardware Specs:
+
+ - http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78230_OS.PDF
+ - http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78260_OS.PDF
+ - http://www.marvell.com/embedded-processors/armada-xp/assets/HW_MV78460_OS.PDF
+
+ Core:
+ Sheeva ARMv7 compatible Dual-core or Quad-core PJ4B-MP
+
+ Linux kernel mach directory:
+ arch/arm/mach-mvebu
+ Linux kernel plat directory:
+ none
+
+EBU Armada family ARMv8
+-----------------------
+
+ Armada 3710/3720 Flavors:
+ - 88F3710
+ - 88F3720
+
+ Core:
+ ARM Cortex A53 (ARMv8)
+
+ Homepage:
+ http://www.marvell.com/embedded-processors/armada-3700/
+
+ Product Brief:
+ http://www.marvell.com/embedded-processors/assets/PB-88F3700-FNL.pdf
+
+ Device tree files:
+ arch/arm64/boot/dts/marvell/armada-37*
+
+ Armada 7K Flavors:
+ - 88F7020 (AP806 Dual + one CP110)
+ - 88F7040 (AP806 Quad + one CP110)
+
+ Core: ARM Cortex A72
+
+ Homepage:
+ http://www.marvell.com/embedded-processors/armada-70xx/
+
+ Product Brief:
+ - http://www.marvell.com/embedded-processors/assets/Armada7020PB-Jan2016.pdf
+ - http://www.marvell.com/embedded-processors/assets/Armada7040PB-Jan2016.pdf
+
+ Device tree files:
+ arch/arm64/boot/dts/marvell/armada-70*
+
+ Armada 8K Flavors:
+ - 88F8020 (AP806 Dual + two CP110)
+ - 88F8040 (AP806 Quad + two CP110)
+ Core:
+ ARM Cortex A72
+
+ Homepage:
+ http://www.marvell.com/embedded-processors/armada-80xx/
+
+ Product Brief:
+ - http://www.marvell.com/embedded-processors/assets/Armada8020PB-Jan2016.pdf
+ - http://www.marvell.com/embedded-processors/assets/Armada8040PB-Jan2016.pdf
+
+ Device tree files:
+ arch/arm64/boot/dts/marvell/armada-80*
+
+Avanta family
+-------------
+
+ Flavors:
+ - 88F6510
+ - 88F6530P
+ - 88F6550
+ - 88F6560
+
+ Homepage:
+ http://www.marvell.com/broadband/
+
+ Product Brief:
+ http://www.marvell.com/broadband/assets/Marvell_Avanta_88F6510_305_060-001_product_brief.pdf
+
+ No public datasheet available.
+
+ Core:
+ ARMv5 compatible
+
+ Linux kernel mach directory:
+ no code in mainline yet, planned for the future
+ Linux kernel plat directory:
+ no code in mainline yet, planned for the future
+
+Storage family
+--------------
+
+ Armada SP:
+ - 88RC1580
+
+ Product infos:
+ http://www.marvell.com/storage/armada-sp/
+
+ Core:
+ Sheeva ARMv7 comatible Quad-core PJ4C
+
+ (not supported in upstream Linux kernel)
+
+Dove family (application processor)
+-----------------------------------
+
+ Flavors:
+ - 88AP510 a.k.a Armada 510
+
+ Product Brief:
+ http://www.marvell.com/application-processors/armada-500/assets/Marvell_Armada510_SoC.pdf
+
+ Hardware Spec:
+ http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Hardware-Spec.pdf
+
+ Functional Spec:
+ http://www.marvell.com/application-processors/armada-500/assets/Armada-510-Functional-Spec.pdf
+
+ Homepage:
+ http://www.marvell.com/application-processors/armada-500/
+
+ Core:
+ ARMv7 compatible
+
+ Directory:
+ - arch/arm/mach-mvebu (DT enabled platforms)
+ - arch/arm/mach-dove (non-DT enabled platforms)
+
+PXA 2xx/3xx/93x/95x family
+--------------------------
+
+ Flavors:
+ - PXA21x, PXA25x, PXA26x
+ - Application processor only
+ - Core: ARMv5 XScale1 core
+ - PXA270, PXA271, PXA272
+ - Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_pb.pdf
+ - Design guide : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_design_guide.pdf
+ - Developers manual : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_dev_man.pdf
+ - Specification : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_emts.pdf
+ - Specification update : http://www.marvell.com/application-processors/pxa-family/assets/pxa_27x_spec_update.pdf
+ - Application processor only
+ - Core: ARMv5 XScale2 core
+ - PXA300, PXA310, PXA320
+ - PXA 300 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA300_PB_R4.pdf
+ - PXA 310 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA310_PB_R4.pdf
+ - PXA 320 Product Brief : http://www.marvell.com/application-processors/pxa-family/assets/PXA320_PB_R4.pdf
+ - Design guide : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Design_Guide.pdf
+ - Developers manual : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Developers_Manual.zip
+ - Specifications : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_EMTS.pdf
+ - Specification Update : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_Spec_Update.zip
+ - Reference Manual : http://www.marvell.com/application-processors/pxa-family/assets/PXA3xx_TavorP_BootROM_Ref_Manual.pdf
+ - Application processor only
+ - Core: ARMv5 XScale3 core
+ - PXA930, PXA935
+ - Application processor with Communication processor
+ - Core: ARMv5 XScale3 core
+ - PXA955
+ - Application processor with Communication processor
+ - Core: ARMv7 compatible Sheeva PJ4 core
+
+ Comments:
+
+ * This line of SoCs originates from the XScale family developed by
+ Intel and acquired by Marvell in ~2006. The PXA21x, PXA25x,
+ PXA26x, PXA27x, PXA3xx and PXA93x were developed by Intel, while
+ the later PXA95x were developed by Marvell.
+
+ * Due to their XScale origin, these SoCs have virtually nothing in
+ common with the other (Kirkwood, Dove, etc.) families of Marvell
+ SoCs, except with the MMP/MMP2 family of SoCs.
+
+ Linux kernel mach directory:
+ arch/arm/mach-pxa
+ Linux kernel plat directory:
+ arch/arm/plat-pxa
+
+MMP/MMP2/MMP3 family (communication processor)
+----------------------------------------------
+
+ Flavors:
+ - PXA168, a.k.a Armada 168
+ - Homepage : http://www.marvell.com/application-processors/armada-100/armada-168.jsp
+ - Product brief : http://www.marvell.com/application-processors/armada-100/assets/pxa_168_pb.pdf
+ - Hardware manual : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_datasheet.pdf
+ - Software manual : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_software_manual.pdf
+ - Specification update : http://www.marvell.com/application-processors/armada-100/assets/ARMADA16x_Spec_update.pdf
+ - Boot ROM manual : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_ref_manual.pdf
+ - App node package : http://www.marvell.com/application-processors/armada-100/assets/armada_16x_app_note_package.pdf
+ - Application processor only
+ - Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
+ - PXA910/PXA920
+ - Homepage : http://www.marvell.com/communication-processors/pxa910/
+ - Product Brief : http://www.marvell.com/communication-processors/pxa910/assets/Marvell_PXA910_Platform-001_PB_final.pdf
+ - Application processor with Communication processor
+ - Core: ARMv5 compatible Marvell PJ1 88sv331 (Mohawk)
+ - PXA688, a.k.a. MMP2, a.k.a Armada 610
+ - Product Brief : http://www.marvell.com/application-processors/armada-600/assets/armada610_pb.pdf
+ - Application processor only
+ - Core: ARMv7 compatible Sheeva PJ4 88sv581x core
+ - PXA2128, a.k.a. MMP3 (OLPC XO4, Linux support not upstream)
+ - Product Brief : http://www.marvell.com/application-processors/armada/pxa2128/assets/Marvell-ARMADA-PXA2128-SoC-PB.pdf
+ - Application processor only
+ - Core: Dual-core ARMv7 compatible Sheeva PJ4C core
+ - PXA960/PXA968/PXA978 (Linux support not upstream)
+ - Application processor with Communication Processor
+ - Core: ARMv7 compatible Sheeva PJ4 core
+ - PXA986/PXA988 (Linux support not upstream)
+ - Application processor with Communication Processor
+ - Core: Dual-core ARMv7 compatible Sheeva PJ4B-MP core
+ - PXA1088/PXA1920 (Linux support not upstream)
+ - Application processor with Communication Processor
+ - Core: quad-core ARMv7 Cortex-A7
+ - PXA1908/PXA1928/PXA1936
+ - Application processor with Communication Processor
+ - Core: multi-core ARMv8 Cortex-A53
+
+ Comments:
+
+ * This line of SoCs originates from the XScale family developed by
+ Intel and acquired by Marvell in ~2006. All the processors of
+ this MMP/MMP2 family were developed by Marvell.
+
+ * Due to their XScale origin, these SoCs have virtually nothing in
+ common with the other (Kirkwood, Dove, etc.) families of Marvell
+ SoCs, except with the PXA family of SoCs listed above.
+
+ Linux kernel mach directory:
+ arch/arm/mach-mmp
+ Linux kernel plat directory:
+ arch/arm/plat-pxa
+
+Berlin family (Multimedia Solutions)
+-------------------------------------
+
+ - Flavors:
+ - 88DE3010, Armada 1000 (no Linux support)
+ - Core: Marvell PJ1 (ARMv5TE), Dual-core
+ - Product Brief: http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
+ - 88DE3005, Armada 1500 Mini
+ - Design name: BG2CD
+ - Core: ARM Cortex-A9, PL310 L2CC
+ - 88DE3006, Armada 1500 Mini Plus
+ - Design name: BG2CDP
+ - Core: Dual Core ARM Cortex-A7
+ - 88DE3100, Armada 1500
+ - Design name: BG2
+ - Core: Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
+ - 88DE3114, Armada 1500 Pro
+ - Design name: BG2Q
+ - Core: Quad Core ARM Cortex-A9, PL310 L2CC
+ - 88DE3214, Armada 1500 Pro 4K
+ - Design name: BG3
+ - Core: ARM Cortex-A15, CA15 integrated L2CC
+ - 88DE3218, ARMADA 1500 Ultra
+ - Core: ARM Cortex-A53
+
+ Homepage: https://www.synaptics.com/products/multimedia-solutions
+ Directory: arch/arm/mach-berlin
+
+ Comments:
+
+ * This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
+ with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
+
+ * The Berlin family was acquired by Synaptics from Marvell in 2017.
+
+CPU Cores
+---------
+
+The XScale cores were designed by Intel, and shipped by Marvell in the older
+PXA processors. Feroceon is a Marvell designed core that developed in-house,
+and that evolved into Sheeva. The XScale and Feroceon cores were phased out
+over time and replaced with Sheeva cores in later products, which subsequently
+got replaced with licensed ARM Cortex-A cores.
+
+ XScale 1
+ CPUID 0x69052xxx
+ ARMv5, iWMMXt
+ XScale 2
+ CPUID 0x69054xxx
+ ARMv5, iWMMXt
+ XScale 3
+ CPUID 0x69056xxx or 0x69056xxx
+ ARMv5, iWMMXt
+ Feroceon-1850 88fr331 "Mohawk"
+ CPUID 0x5615331x or 0x41xx926x
+ ARMv5TE, single issue
+ Feroceon-2850 88fr531-vd "Jolteon"
+ CPUID 0x5605531x or 0x41xx926x
+ ARMv5TE, VFP, dual-issue
+ Feroceon 88fr571-vd "Jolteon"
+ CPUID 0x5615571x
+ ARMv5TE, VFP, dual-issue
+ Feroceon 88fr131 "Mohawk-D"
+ CPUID 0x5625131x
+ ARMv5TE, single-issue in-order
+ Sheeva PJ1 88sv331 "Mohawk"
+ CPUID 0x561584xx
+ ARMv5, single-issue iWMMXt v2
+ Sheeva PJ4 88sv581x "Flareon"
+ CPUID 0x560f581x
+ ARMv7, idivt, optional iWMMXt v2
+ Sheeva PJ4B 88sv581x
+ CPUID 0x561f581x
+ ARMv7, idivt, optional iWMMXt v2
+ Sheeva PJ4B-MP / PJ4C
+ CPUID 0x562f584x
+ ARMv7, idivt/idiva, LPAE, optional iWMMXt v2 and/or NEON
+
+Long-term plans
+---------------
+
+ * Unify the mach-dove/, mach-mv78xx0/, mach-orion5x/ into the
+ mach-mvebu/ to support all SoCs from the Marvell EBU (Engineering
+ Business Unit) in a single mach-<foo> directory. The plat-orion/
+ would therefore disappear.
+
+ * Unify the mach-mmp/ and mach-pxa/ into the same mach-pxa
+ directory. The plat-pxa/ would therefore disappear.
+
+Credits
+-------
+
+- Maen Suleiman <maen@marvell.com>
+- Lior Amsalem <alior@marvell.com>
+- Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+- Andrew Lunn <andrew@lunn.ch>
+- Nicolas Pitre <nico@fluxnic.net>
+- Eric Miao <eric.y.miao@gmail.com>
diff --git a/Documentation/arm/mem_alignment b/Documentation/arm/mem_alignment
deleted file mode 100644
index e110e2781039..000000000000
--- a/Documentation/arm/mem_alignment
+++ /dev/null
@@ -1,58 +0,0 @@
-Too many problems popped up because of unnoticed misaligned memory access in
-kernel code lately. Therefore the alignment fixup is now unconditionally
-configured in for SA11x0 based targets. According to Alan Cox, this is a
-bad idea to configure it out, but Russell King has some good reasons for
-doing so on some f***ed up ARM architectures like the EBSA110. However
-this is not the case on many design I'm aware of, like all SA11x0 based
-ones.
-
-Of course this is a bad idea to rely on the alignment trap to perform
-unaligned memory access in general. If those access are predictable, you
-are better to use the macros provided by include/asm/unaligned.h. The
-alignment trap can fixup misaligned access for the exception cases, but at
-a high performance cost. It better be rare.
-
-Now for user space applications, it is possible to configure the alignment
-trap to SIGBUS any code performing unaligned access (good for debugging bad
-code), or even fixup the access by software like for kernel code. The later
-mode isn't recommended for performance reasons (just think about the
-floating point emulation that works about the same way). Fix your code
-instead!
-
-Please note that randomly changing the behaviour without good thought is
-real bad - it changes the behaviour of all unaligned instructions in user
-space, and might cause programs to fail unexpectedly.
-
-To change the alignment trap behavior, simply echo a number into
-/proc/cpu/alignment. The number is made up from various bits:
-
-bit behavior when set
---- -----------------
-
-0 A user process performing an unaligned memory access
- will cause the kernel to print a message indicating
- process name, pid, pc, instruction, address, and the
- fault code.
-
-1 The kernel will attempt to fix up the user process
- performing the unaligned access. This is of course
- slow (think about the floating point emulator) and
- not recommended for production use.
-
-2 The kernel will send a SIGBUS signal to the user process
- performing the unaligned access.
-
-Note that not all combinations are supported - only values 0 through 5.
-(6 and 7 don't make sense).
-
-For example, the following will turn on the warnings, but without
-fixing up or sending SIGBUS signals:
-
- echo 1 > /proc/cpu/alignment
-
-You can also read the content of the same file to get statistical
-information on unaligned access occurrences plus the current mode of
-operation for user space code.
-
-
-Nicolas Pitre, Mar 13, 2001. Modified Russell King, Nov 30, 2001.
diff --git a/Documentation/arm/mem_alignment.rst b/Documentation/arm/mem_alignment.rst
new file mode 100644
index 000000000000..aa22893b62bc
--- /dev/null
+++ b/Documentation/arm/mem_alignment.rst
@@ -0,0 +1,63 @@
+================
+Memory alignment
+================
+
+Too many problems popped up because of unnoticed misaligned memory access in
+kernel code lately. Therefore the alignment fixup is now unconditionally
+configured in for SA11x0 based targets. According to Alan Cox, this is a
+bad idea to configure it out, but Russell King has some good reasons for
+doing so on some f***ed up ARM architectures like the EBSA110. However
+this is not the case on many design I'm aware of, like all SA11x0 based
+ones.
+
+Of course this is a bad idea to rely on the alignment trap to perform
+unaligned memory access in general. If those access are predictable, you
+are better to use the macros provided by include/asm/unaligned.h. The
+alignment trap can fixup misaligned access for the exception cases, but at
+a high performance cost. It better be rare.
+
+Now for user space applications, it is possible to configure the alignment
+trap to SIGBUS any code performing unaligned access (good for debugging bad
+code), or even fixup the access by software like for kernel code. The later
+mode isn't recommended for performance reasons (just think about the
+floating point emulation that works about the same way). Fix your code
+instead!
+
+Please note that randomly changing the behaviour without good thought is
+real bad - it changes the behaviour of all unaligned instructions in user
+space, and might cause programs to fail unexpectedly.
+
+To change the alignment trap behavior, simply echo a number into
+/proc/cpu/alignment. The number is made up from various bits:
+
+=== ========================================================
+bit behavior when set
+=== ========================================================
+0 A user process performing an unaligned memory access
+ will cause the kernel to print a message indicating
+ process name, pid, pc, instruction, address, and the
+ fault code.
+
+1 The kernel will attempt to fix up the user process
+ performing the unaligned access. This is of course
+ slow (think about the floating point emulator) and
+ not recommended for production use.
+
+2 The kernel will send a SIGBUS signal to the user process
+ performing the unaligned access.
+=== ========================================================
+
+Note that not all combinations are supported - only values 0 through 5.
+(6 and 7 don't make sense).
+
+For example, the following will turn on the warnings, but without
+fixing up or sending SIGBUS signals::
+
+ echo 1 > /proc/cpu/alignment
+
+You can also read the content of the same file to get statistical
+information on unaligned access occurrences plus the current mode of
+operation for user space code.
+
+
+Nicolas Pitre, Mar 13, 2001. Modified Russell King, Nov 30, 2001.
diff --git a/Documentation/arm/memory.rst b/Documentation/arm/memory.rst
new file mode 100644
index 000000000000..0521b4ce5c96
--- /dev/null
+++ b/Documentation/arm/memory.rst
@@ -0,0 +1,93 @@
+=================================
+Kernel Memory Layout on ARM Linux
+=================================
+
+ Russell King <rmk@arm.linux.org.uk>
+
+ November 17, 2005 (2.6.15)
+
+This document describes the virtual memory layout which the Linux
+kernel uses for ARM processors. It indicates which regions are
+free for platforms to use, and which are used by generic code.
+
+The ARM CPU is capable of addressing a maximum of 4GB virtual memory
+space, and this must be shared between user space processes, the
+kernel, and hardware devices.
+
+As the ARM architecture matures, it becomes necessary to reserve
+certain regions of VM space for use for new facilities; therefore
+this document may reserve more VM space over time.
+
+=============== =============== ===============================================
+Start End Use
+=============== =============== ===============================================
+ffff8000 ffffffff copy_user_page / clear_user_page use.
+ For SA11xx and Xscale, this is used to
+ setup a minicache mapping.
+
+ffff4000 ffffffff cache aliasing on ARMv6 and later CPUs.
+
+ffff1000 ffff7fff Reserved.
+ Platforms must not use this address range.
+
+ffff0000 ffff0fff CPU vector page.
+ The CPU vectors are mapped here if the
+ CPU supports vector relocation (control
+ register V bit.)
+
+fffe0000 fffeffff XScale cache flush area. This is used
+ in proc-xscale.S to flush the whole data
+ cache. (XScale does not have TCM.)
+
+fffe8000 fffeffff DTCM mapping area for platforms with
+ DTCM mounted inside the CPU.
+
+fffe0000 fffe7fff ITCM mapping area for platforms with
+ ITCM mounted inside the CPU.
+
+ffc00000 ffefffff Fixmap mapping region. Addresses provided
+ by fix_to_virt() will be located here.
+
+fee00000 feffffff Mapping of PCI I/O space. This is a static
+ mapping within the vmalloc space.
+
+VMALLOC_START VMALLOC_END-1 vmalloc() / ioremap() space.
+ Memory returned by vmalloc/ioremap will
+ be dynamically placed in this region.
+ Machine specific static mappings are also
+ located here through iotable_init().
+ VMALLOC_START is based upon the value
+ of the high_memory variable, and VMALLOC_END
+ is equal to 0xff800000.
+
+PAGE_OFFSET high_memory-1 Kernel direct-mapped RAM region.
+ This maps the platforms RAM, and typically
+ maps all platform RAM in a 1:1 relationship.
+
+PKMAP_BASE PAGE_OFFSET-1 Permanent kernel mappings
+ One way of mapping HIGHMEM pages into kernel
+ space.
+
+MODULES_VADDR MODULES_END-1 Kernel module space
+ Kernel modules inserted via insmod are
+ placed here using dynamic mappings.
+
+00001000 TASK_SIZE-1 User space mappings
+ Per-thread mappings are placed here via
+ the mmap() system call.
+
+00000000 00000fff CPU vector page / null pointer trap
+ CPUs which do not support vector remapping
+ place their vector page here. NULL pointer
+ dereferences by both the kernel and user
+ space are also caught via this mapping.
+=============== =============== ===============================================
+
+Please note that mappings which collide with the above areas may result
+in a non-bootable kernel, or may cause the kernel to (eventually) panic
+at run time.
+
+Since future CPUs may impact the kernel mapping layout, user programs
+must not access any memory which is not mapped inside their 0x0001000
+to TASK_SIZE address range. If they wish to access these areas, they
+must set up their own mappings using open() and mmap().
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
deleted file mode 100644
index 546a39048eb0..000000000000
--- a/Documentation/arm/memory.txt
+++ /dev/null
@@ -1,88 +0,0 @@
- Kernel Memory Layout on ARM Linux
-
- Russell King <rmk@arm.linux.org.uk>
- November 17, 2005 (2.6.15)
-
-This document describes the virtual memory layout which the Linux
-kernel uses for ARM processors. It indicates which regions are
-free for platforms to use, and which are used by generic code.
-
-The ARM CPU is capable of addressing a maximum of 4GB virtual memory
-space, and this must be shared between user space processes, the
-kernel, and hardware devices.
-
-As the ARM architecture matures, it becomes necessary to reserve
-certain regions of VM space for use for new facilities; therefore
-this document may reserve more VM space over time.
-
-Start End Use
---------------------------------------------------------------------------
-ffff8000 ffffffff copy_user_page / clear_user_page use.
- For SA11xx and Xscale, this is used to
- setup a minicache mapping.
-
-ffff4000 ffffffff cache aliasing on ARMv6 and later CPUs.
-
-ffff1000 ffff7fff Reserved.
- Platforms must not use this address range.
-
-ffff0000 ffff0fff CPU vector page.
- The CPU vectors are mapped here if the
- CPU supports vector relocation (control
- register V bit.)
-
-fffe0000 fffeffff XScale cache flush area. This is used
- in proc-xscale.S to flush the whole data
- cache. (XScale does not have TCM.)
-
-fffe8000 fffeffff DTCM mapping area for platforms with
- DTCM mounted inside the CPU.
-
-fffe0000 fffe7fff ITCM mapping area for platforms with
- ITCM mounted inside the CPU.
-
-ffc00000 ffefffff Fixmap mapping region. Addresses provided
- by fix_to_virt() will be located here.
-
-fee00000 feffffff Mapping of PCI I/O space. This is a static
- mapping within the vmalloc space.
-
-VMALLOC_START VMALLOC_END-1 vmalloc() / ioremap() space.
- Memory returned by vmalloc/ioremap will
- be dynamically placed in this region.
- Machine specific static mappings are also
- located here through iotable_init().
- VMALLOC_START is based upon the value
- of the high_memory variable, and VMALLOC_END
- is equal to 0xff800000.
-
-PAGE_OFFSET high_memory-1 Kernel direct-mapped RAM region.
- This maps the platforms RAM, and typically
- maps all platform RAM in a 1:1 relationship.
-
-PKMAP_BASE PAGE_OFFSET-1 Permanent kernel mappings
- One way of mapping HIGHMEM pages into kernel
- space.
-
-MODULES_VADDR MODULES_END-1 Kernel module space
- Kernel modules inserted via insmod are
- placed here using dynamic mappings.
-
-00001000 TASK_SIZE-1 User space mappings
- Per-thread mappings are placed here via
- the mmap() system call.
-
-00000000 00000fff CPU vector page / null pointer trap
- CPUs which do not support vector remapping
- place their vector page here. NULL pointer
- dereferences by both the kernel and user
- space are also caught via this mapping.
-
-Please note that mappings which collide with the above areas may result
-in a non-bootable kernel, or may cause the kernel to (eventually) panic
-at run time.
-
-Since future CPUs may impact the kernel mapping layout, user programs
-must not access any memory which is not mapped inside their 0x0001000
-to TASK_SIZE address range. If they wish to access these areas, they
-must set up their own mappings using open() and mmap().
diff --git a/Documentation/arm/microchip.rst b/Documentation/arm/microchip.rst
new file mode 100644
index 000000000000..c9a44c98e868
--- /dev/null
+++ b/Documentation/arm/microchip.rst
@@ -0,0 +1,204 @@
+=============================
+ARM Microchip SoCs (aka AT91)
+=============================
+
+
+Introduction
+------------
+This document gives useful information about the ARM Microchip SoCs that are
+currently supported in Linux Mainline (you know, the one on kernel.org).
+
+It is important to note that the Microchip (previously Atmel) ARM-based MPU
+product line is historically named "AT91" or "at91" throughout the Linux kernel
+development process even if this product prefix has completely disappeared from
+the official Microchip product name. Anyway, files, directories, git trees,
+git branches/tags and email subject always contain this "at91" sub-string.
+
+
+AT91 SoCs
+---------
+Documentation and detailed datasheet for each product are available on
+the Microchip website: http://www.microchip.com.
+
+ Flavors:
+ * ARM 920 based SoC
+ - at91rm9200
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-1768-32-bit-ARM920T-Embedded-Microprocessor-AT91RM9200_Datasheet.pdf
+
+ * ARM 926 based SoCs
+ - at91sam9260
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6221-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9260_Datasheet.pdf
+
+ - at91sam9xe
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6254-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9XE_Datasheet.pdf
+
+ - at91sam9261
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6062-ARM926EJ-S-Microprocessor-SAM9261_Datasheet.pdf
+
+ - at91sam9263
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6249-32-bit-ARM926EJ-S-Embedded-Microprocessor-SAM9263_Datasheet.pdf
+
+ - at91sam9rl
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/doc6289.pdf
+
+ - at91sam9g20
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001516A.pdf
+
+ - at91sam9g45 family
+ - at91sam9g45
+ - at91sam9g46
+ - at91sam9m10
+ - at91sam9m11 (device superset)
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-6437-32-bit-ARM926-Embedded-Microprocessor-SAM9M11_Datasheet.pdf
+
+ - at91sam9x5 family (aka "The 5 series")
+ - at91sam9g15
+ - at91sam9g25
+ - at91sam9g35
+ - at91sam9x25
+ - at91sam9x35
+
+ * Datasheet (can be considered as covering the whole family)
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11055-32-bit-ARM926EJ-S-Microcontroller-SAM9X35_Datasheet.pdf
+
+ - at91sam9n12
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf
+
+ * ARM Cortex-A5 based SoCs
+ - sama5d3 family
+
+ - sama5d31
+ - sama5d33
+ - sama5d34
+ - sama5d35
+ - sama5d36 (device superset)
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/Atmel-11121-32-bit-Cortex-A5-Microcontroller-SAMA5D3_Datasheet.pdf
+
+ * ARM Cortex-A5 + NEON based SoCs
+ - sama5d4 family
+
+ - sama5d41
+ - sama5d42
+ - sama5d43
+ - sama5d44 (device superset)
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/60001525A.pdf
+
+ - sama5d2 family
+
+ - sama5d21
+ - sama5d22
+ - sama5d23
+ - sama5d24
+ - sama5d26
+ - sama5d27 (device superset)
+ - sama5d28 (device superset + environmental monitors)
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001476B.pdf
+
+ * ARM Cortex-M7 MCUs
+ - sams70 family
+
+ - sams70j19
+ - sams70j20
+ - sams70j21
+ - sams70n19
+ - sams70n20
+ - sams70n21
+ - sams70q19
+ - sams70q20
+ - sams70q21
+
+ - samv70 family
+
+ - samv70j19
+ - samv70j20
+ - samv70n19
+ - samv70n20
+ - samv70q19
+ - samv70q20
+
+ - samv71 family
+
+ - samv71j19
+ - samv71j20
+ - samv71j21
+ - samv71n19
+ - samv71n20
+ - samv71n21
+ - samv71q19
+ - samv71q20
+ - samv71q21
+
+ * Datasheet
+
+ http://ww1.microchip.com/downloads/en/DeviceDoc/60001527A.pdf
+
+
+Linux kernel information
+------------------------
+Linux kernel mach directory: arch/arm/mach-at91
+MAINTAINERS entry is: "ARM/Microchip (AT91) SoC support"
+
+
+Device Tree for AT91 SoCs and boards
+------------------------------------
+All AT91 SoCs are converted to Device Tree. Since Linux 3.19, these products
+must use this method to boot the Linux kernel.
+
+Work In Progress statement:
+Device Tree files and Device Tree bindings that apply to AT91 SoCs and boards are
+considered as "Unstable". To be completely clear, any at91 binding can change at
+any time. So, be sure to use a Device Tree Binary and a Kernel Image generated from
+the same source tree.
+Please refer to the Documentation/devicetree/bindings/ABI.txt file for a
+definition of a "Stable" binding/ABI.
+This statement will be removed by AT91 MAINTAINERS when appropriate.
+
+Naming conventions and best practice:
+
+- SoCs Device Tree Source Include files are named after the official name of
+ the product (at91sam9g20.dtsi or sama5d33.dtsi for instance).
+- Device Tree Source Include files (.dtsi) are used to collect common nodes that can be
+ shared across SoCs or boards (sama5d3.dtsi or at91sam9x5cm.dtsi for instance).
+ When collecting nodes for a particular peripheral or topic, the identifier have to
+ be placed at the end of the file name, separated with a "_" (at91sam9x5_can.dtsi
+ or sama5d3_gmac.dtsi for example).
+- board Device Tree Source files (.dts) are prefixed by the string "at91-" so
+ that they can be identified easily. Note that some files are historical exceptions
+ to this rule (sama5d3[13456]ek.dts, usb_a9g20.dts or animeo_ip.dts for example).
diff --git a/Documentation/arm/netwinder.rst b/Documentation/arm/netwinder.rst
new file mode 100644
index 000000000000..8eab66caa2ac
--- /dev/null
+++ b/Documentation/arm/netwinder.rst
@@ -0,0 +1,85 @@
+================================
+NetWinder specific documentation
+================================
+
+The NetWinder is a small low-power computer, primarily designed
+to run Linux. It is based around the StrongARM RISC processor,
+DC21285 PCI bridge, with PC-type hardware glued around it.
+
+Port usage
+==========
+
+======= ====== ===============================
+Min Max Description
+======= ====== ===============================
+0x0000 0x000f DMA1
+0x0020 0x0021 PIC1
+0x0060 0x006f Keyboard
+0x0070 0x007f RTC
+0x0080 0x0087 DMA1
+0x0088 0x008f DMA2
+0x00a0 0x00a3 PIC2
+0x00c0 0x00df DMA2
+0x0180 0x0187 IRDA
+0x01f0 0x01f6 ide0
+0x0201 Game port
+0x0203 RWA010 configuration read
+0x0220 ? SoundBlaster
+0x0250 ? WaveArtist
+0x0279 RWA010 configuration index
+0x02f8 0x02ff Serial ttyS1
+0x0300 0x031f Ether10
+0x0338 GPIO1
+0x033a GPIO2
+0x0370 0x0371 W83977F configuration registers
+0x0388 ? AdLib
+0x03c0 0x03df VGA
+0x03f6 ide0
+0x03f8 0x03ff Serial ttyS0
+0x0400 0x0408 DC21143
+0x0480 0x0487 DMA1
+0x0488 0x048f DMA2
+0x0a79 RWA010 configuration write
+0xe800 0xe80f ide0/ide1 BM DMA
+======= ====== ===============================
+
+
+Interrupt usage
+===============
+
+======= ======= ========================
+IRQ type Description
+======= ======= ========================
+ 0 ISA 100Hz timer
+ 1 ISA Keyboard
+ 2 ISA cascade
+ 3 ISA Serial ttyS1
+ 4 ISA Serial ttyS0
+ 5 ISA PS/2 mouse
+ 6 ISA IRDA
+ 7 ISA Printer
+ 8 ISA RTC alarm
+ 9 ISA
+10 ISA GP10 (Orange reset button)
+11 ISA
+12 ISA WaveArtist
+13 ISA
+14 ISA hda1
+15 ISA
+======= ======= ========================
+
+DMA usage
+=========
+
+======= ======= ===========
+DMA type Description
+======= ======= ===========
+ 0 ISA IRDA
+ 1 ISA
+ 2 ISA cascade
+ 3 ISA WaveArtist
+ 4 ISA
+ 5 ISA
+ 6 ISA
+ 7 ISA WaveArtist
+======= ======= ===========
diff --git a/Documentation/arm/nwfpe/NOTES b/Documentation/arm/nwfpe/NOTES
deleted file mode 100644
index 40577b5a49d3..000000000000
--- a/Documentation/arm/nwfpe/NOTES
+++ /dev/null
@@ -1,29 +0,0 @@
-There seems to be a problem with exp(double) and our emulator. I haven't
-been able to track it down yet. This does not occur with the emulator
-supplied by Russell King.
-
-I also found one oddity in the emulator. I don't think it is serious but
-will point it out. The ARM calling conventions require floating point
-registers f4-f7 to be preserved over a function call. The compiler quite
-often uses an stfe instruction to save f4 on the stack upon entry to a
-function, and an ldfe instruction to restore it before returning.
-
-I was looking at some code, that calculated a double result, stored it in f4
-then made a function call. Upon return from the function call the number in
-f4 had been converted to an extended value in the emulator.
-
-This is a side effect of the stfe instruction. The double in f4 had to be
-converted to extended, then stored. If an lfm/sfm combination had been used,
-then no conversion would occur. This has performance considerations. The
-result from the function call and f4 were used in a multiplication. If the
-emulator sees a multiply of a double and extended, it promotes the double to
-extended, then does the multiply in extended precision.
-
-This code will cause this problem:
-
-double x, y, z;
-z = log(x)/log(y);
-
-The result of log(x) (a double) will be calculated, returned in f0, then
-moved to f4 to preserve it over the log(y) call. The division will be done
-in extended precision, due to the stfe instruction used to save f4 in log(y).
diff --git a/Documentation/arm/nwfpe/README b/Documentation/arm/nwfpe/README
deleted file mode 100644
index 771871de0c8b..000000000000
--- a/Documentation/arm/nwfpe/README
+++ /dev/null
@@ -1,70 +0,0 @@
-This directory contains the version 0.92 test release of the NetWinder
-Floating Point Emulator.
-
-The majority of the code was written by me, Scott Bambrough It is
-written in C, with a small number of routines in inline assembler
-where required. It was written quickly, with a goal of implementing a
-working version of all the floating point instructions the compiler
-emits as the first target. I have attempted to be as optimal as
-possible, but there remains much room for improvement.
-
-I have attempted to make the emulator as portable as possible. One of
-the problems is with leading underscores on kernel symbols. Elf
-kernels have no leading underscores, a.out compiled kernels do. I
-have attempted to use the C_SYMBOL_NAME macro wherever this may be
-important.
-
-Another choice I made was in the file structure. I have attempted to
-contain all operating system specific code in one module (fpmodule.*).
-All the other files contain emulator specific code. This should allow
-others to port the emulator to NetBSD for instance relatively easily.
-
-The floating point operations are based on SoftFloat Release 2, by
-John Hauser. SoftFloat is a software implementation of floating-point
-that conforms to the IEC/IEEE Standard for Binary Floating-point
-Arithmetic. As many as four formats are supported: single precision,
-double precision, extended double precision, and quadruple precision.
-All operations required by the standard are implemented, except for
-conversions to and from decimal. We use only the single precision,
-double precision and extended double precision formats. The port of
-SoftFloat to the ARM was done by Phil Blundell, based on an earlier
-port of SoftFloat version 1 by Neil Carson for NetBSD/arm32.
-
-The file README.FPE contains a description of what has been implemented
-so far in the emulator. The file TODO contains a information on what
-remains to be done, and other ideas for the emulator.
-
-Bug reports, comments, suggestions should be directed to me at
-<scottb@netwinder.org>. General reports of "this program doesn't
-work correctly when your emulator is installed" are useful for
-determining that bugs still exist; but are virtually useless when
-attempting to isolate the problem. Please report them, but don't
-expect quick action. Bugs still exist. The problem remains in isolating
-which instruction contains the bug. Small programs illustrating a specific
-problem are a godsend.
-
-Legal Notices
--------------
-
-The NetWinder Floating Point Emulator is free software. Everything Rebel.com
-has written is provided under the GNU GPL. See the file COPYING for copying
-conditions. Excluded from the above is the SoftFloat code. John Hauser's
-legal notice for SoftFloat is included below.
-
--------------------------------------------------------------------------------
-SoftFloat Legal Notice
-
-SoftFloat was written by John R. Hauser. This work was made possible in
-part by the International Computer Science Institute, located at Suite 600,
-1947 Center Street, Berkeley, California 94704. Funding was partially
-provided by the National Science Foundation under grant MIP-9311980. The
-original version of this code was written as part of a project to build
-a fixed-point vector processor in collaboration with the University of
-California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek.
-
-THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
-has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
-TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
-PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
-AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
--------------------------------------------------------------------------------
diff --git a/Documentation/arm/nwfpe/README.FPE b/Documentation/arm/nwfpe/README.FPE
deleted file mode 100644
index 26f5d7bb9a41..000000000000
--- a/Documentation/arm/nwfpe/README.FPE
+++ /dev/null
@@ -1,156 +0,0 @@
-The following describes the current state of the NetWinder's floating point
-emulator.
-
-In the following nomenclature is used to describe the floating point
-instructions. It follows the conventions in the ARM manual.
-
-<S|D|E> = <single|double|extended>, no default
-{P|M|Z} = {round to +infinity,round to -infinity,round to zero},
- default = round to nearest
-
-Note: items enclosed in {} are optional.
-
-Floating Point Coprocessor Data Transfer Instructions (CPDT)
-------------------------------------------------------------
-
-LDF/STF - load and store floating
-
-<LDF|STF>{cond}<S|D|E> Fd, Rn
-<LDF|STF>{cond}<S|D|E> Fd, [Rn, #<expression>]{!}
-<LDF|STF>{cond}<S|D|E> Fd, [Rn], #<expression>
-
-These instructions are fully implemented.
-
-LFM/SFM - load and store multiple floating
-
-Form 1 syntax:
-<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn]
-<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn, #<expression>]{!}
-<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn], #<expression>
-
-Form 2 syntax:
-<LFM|SFM>{cond}<FD,EA> Fd, <count>, [Rn]{!}
-
-These instructions are fully implemented. They store/load three words
-for each floating point register into the memory location given in the
-instruction. The format in memory is unlikely to be compatible with
-other implementations, in particular the actual hardware. Specific
-mention of this is made in the ARM manuals.
-
-Floating Point Coprocessor Register Transfer Instructions (CPRT)
-----------------------------------------------------------------
-
-Conversions, read/write status/control register instructions
-
-FLT{cond}<S,D,E>{P,M,Z} Fn, Rd Convert integer to floating point
-FIX{cond}{P,M,Z} Rd, Fn Convert floating point to integer
-WFS{cond} Rd Write floating point status register
-RFS{cond} Rd Read floating point status register
-WFC{cond} Rd Write floating point control register
-RFC{cond} Rd Read floating point control register
-
-FLT/FIX are fully implemented.
-
-RFS/WFS are fully implemented.
-
-RFC/WFC are fully implemented. RFC/WFC are supervisor only instructions, and
-presently check the CPU mode, and do an invalid instruction trap if not called
-from supervisor mode.
-
-Compare instructions
-
-CMF{cond} Fn, Fm Compare floating
-CMFE{cond} Fn, Fm Compare floating with exception
-CNF{cond} Fn, Fm Compare negated floating
-CNFE{cond} Fn, Fm Compare negated floating with exception
-
-These are fully implemented.
-
-Floating Point Coprocessor Data Instructions (CPDT)
----------------------------------------------------
-
-Dyadic operations:
-
-ADF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - add
-SUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - subtract
-RSF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse subtract
-MUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - multiply
-DVF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - divide
-RDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse divide
-
-These are fully implemented.
-
-FML{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast multiply
-FDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast divide
-FRD{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast reverse divide
-
-These are fully implemented as well. They use the same algorithm as the
-non-fast versions. Hence, in this implementation their performance is
-equivalent to the MUF/DVF/RDV instructions. This is acceptable according
-to the ARM manual. The manual notes these are defined only for single
-operands, on the actual FPA11 hardware they do not work for double or
-extended precision operands. The emulator currently does not check
-the requested permissions conditions, and performs the requested operation.
-
-RMF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - IEEE remainder
-
-This is fully implemented.
-
-Monadic operations:
-
-MVF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move
-MNF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move negated
-
-These are fully implemented.
-
-ABS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - absolute value
-SQT{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - square root
-RND{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - round
-
-These are fully implemented.
-
-URD{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - unnormalized round
-NRM{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - normalize
-
-These are implemented. URD is implemented using the same code as the RND
-instruction. Since URD cannot return a unnormalized number, NRM becomes
-a NOP.
-
-Library calls:
-
-POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
-RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
-POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
-
-LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
-LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
-EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
-SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
-COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
-TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
-ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
-ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
-ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
-
-These are not implemented. They are not currently issued by the compiler,
-and are handled by routines in libc. These are not implemented by the FPA11
-hardware, but are handled by the floating point support code. They should
-be implemented in future versions.
-
-Signalling:
-
-Signals are implemented. However current ELF kernels produced by Rebel.com
-have a bug in them that prevents the module from generating a SIGFPE. This
-is caused by a failure to alias fp_current to the kernel variable
-current_set[0] correctly.
-
-The kernel provided with this distribution (vmlinux-nwfpe-0.93) contains
-a fix for this problem and also incorporates the current version of the
-emulator directly. It is possible to run with no floating point module
-loaded with this kernel. It is provided as a demonstration of the
-technology and for those who want to do floating point work that depends
-on signals. It is not strictly necessary to use the module.
-
-A module (either the one provided by Russell King, or the one in this
-distribution) can be loaded to replace the functionality of the emulator
-built into the kernel.
diff --git a/Documentation/arm/nwfpe/TODO b/Documentation/arm/nwfpe/TODO
deleted file mode 100644
index 8027061b60eb..000000000000
--- a/Documentation/arm/nwfpe/TODO
+++ /dev/null
@@ -1,67 +0,0 @@
-TODO LIST
----------
-
-POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
-RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
-POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
-
-LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
-LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
-EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
-SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
-COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
-TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
-ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
-ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
-ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
-
-These are not implemented. They are not currently issued by the compiler,
-and are handled by routines in libc. These are not implemented by the FPA11
-hardware, but are handled by the floating point support code. They should
-be implemented in future versions.
-
-There are a couple of ways to approach the implementation of these. One
-method would be to use accurate table methods for these routines. I have
-a couple of papers by S. Gal from IBM's research labs in Haifa, Israel that
-seem to promise extreme accuracy (in the order of 99.8%) and reasonable speed.
-These methods are used in GLIBC for some of the transcendental functions.
-
-Another approach, which I know little about is CORDIC. This stands for
-Coordinate Rotation Digital Computer, and is a method of computing
-transcendental functions using mostly shifts and adds and a few
-multiplications and divisions. The ARM excels at shifts and adds,
-so such a method could be promising, but requires more research to
-determine if it is feasible.
-
-Rounding Methods
-
-The IEEE standard defines 4 rounding modes. Round to nearest is the
-default, but rounding to + or - infinity or round to zero are also allowed.
-Many architectures allow the rounding mode to be specified by modifying bits
-in a control register. Not so with the ARM FPA11 architecture. To change
-the rounding mode one must specify it with each instruction.
-
-This has made porting some benchmarks difficult. It is possible to
-introduce such a capability into the emulator. The FPCR contains
-bits describing the rounding mode. The emulator could be altered to
-examine a flag, which if set forced it to ignore the rounding mode in
-the instruction, and use the mode specified in the bits in the FPCR.
-
-This would require a method of getting/setting the flag, and the bits
-in the FPCR. This requires a kernel call in ArmLinux, as WFC/RFC are
-supervisor only instructions. If anyone has any ideas or comments I
-would like to hear them.
-
-[NOTE: pulled out from some docs on ARM floating point, specifically
- for the Acorn FPE, but not limited to it:
-
- The floating point control register (FPCR) may only be present in some
- implementations: it is there to control the hardware in an implementation-
- specific manner, for example to disable the floating point system. The user
- mode of the ARM is not permitted to use this register (since the right is
- reserved to alter it between implementations) and the WFC and RFC
- instructions will trap if tried in user mode.
-
- Hence, the answer is yes, you could do this, but then you will run a high
- risk of becoming isolated if and when hardware FP emulation comes out
- -- Russell].
diff --git a/Documentation/arm/nwfpe/index.rst b/Documentation/arm/nwfpe/index.rst
new file mode 100644
index 000000000000..3c4d2f9aa10e
--- /dev/null
+++ b/Documentation/arm/nwfpe/index.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================
+NetWinder's floating point emulator
+===================================
+
+.. toctree::
+ :maxdepth: 1
+
+ nwfpe
+ netwinder-fpe
+ notes
+ todo
diff --git a/Documentation/arm/nwfpe/netwinder-fpe.rst b/Documentation/arm/nwfpe/netwinder-fpe.rst
new file mode 100644
index 000000000000..cbb320960fc4
--- /dev/null
+++ b/Documentation/arm/nwfpe/netwinder-fpe.rst
@@ -0,0 +1,162 @@
+=============
+Current State
+=============
+
+The following describes the current state of the NetWinder's floating point
+emulator.
+
+In the following nomenclature is used to describe the floating point
+instructions. It follows the conventions in the ARM manual.
+
+::
+
+ <S|D|E> = <single|double|extended>, no default
+ {P|M|Z} = {round to +infinity,round to -infinity,round to zero},
+ default = round to nearest
+
+Note: items enclosed in {} are optional.
+
+Floating Point Coprocessor Data Transfer Instructions (CPDT)
+------------------------------------------------------------
+
+LDF/STF - load and store floating
+
+<LDF|STF>{cond}<S|D|E> Fd, Rn
+<LDF|STF>{cond}<S|D|E> Fd, [Rn, #<expression>]{!}
+<LDF|STF>{cond}<S|D|E> Fd, [Rn], #<expression>
+
+These instructions are fully implemented.
+
+LFM/SFM - load and store multiple floating
+
+Form 1 syntax:
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn]
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn, #<expression>]{!}
+<LFM|SFM>{cond}<S|D|E> Fd, <count>, [Rn], #<expression>
+
+Form 2 syntax:
+<LFM|SFM>{cond}<FD,EA> Fd, <count>, [Rn]{!}
+
+These instructions are fully implemented. They store/load three words
+for each floating point register into the memory location given in the
+instruction. The format in memory is unlikely to be compatible with
+other implementations, in particular the actual hardware. Specific
+mention of this is made in the ARM manuals.
+
+Floating Point Coprocessor Register Transfer Instructions (CPRT)
+----------------------------------------------------------------
+
+Conversions, read/write status/control register instructions
+
+FLT{cond}<S,D,E>{P,M,Z} Fn, Rd Convert integer to floating point
+FIX{cond}{P,M,Z} Rd, Fn Convert floating point to integer
+WFS{cond} Rd Write floating point status register
+RFS{cond} Rd Read floating point status register
+WFC{cond} Rd Write floating point control register
+RFC{cond} Rd Read floating point control register
+
+FLT/FIX are fully implemented.
+
+RFS/WFS are fully implemented.
+
+RFC/WFC are fully implemented. RFC/WFC are supervisor only instructions, and
+presently check the CPU mode, and do an invalid instruction trap if not called
+from supervisor mode.
+
+Compare instructions
+
+CMF{cond} Fn, Fm Compare floating
+CMFE{cond} Fn, Fm Compare floating with exception
+CNF{cond} Fn, Fm Compare negated floating
+CNFE{cond} Fn, Fm Compare negated floating with exception
+
+These are fully implemented.
+
+Floating Point Coprocessor Data Instructions (CPDT)
+---------------------------------------------------
+
+Dyadic operations:
+
+ADF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - add
+SUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - subtract
+RSF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse subtract
+MUF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - multiply
+DVF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - divide
+RDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse divide
+
+These are fully implemented.
+
+FML{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast multiply
+FDV{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast divide
+FRD{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - fast reverse divide
+
+These are fully implemented as well. They use the same algorithm as the
+non-fast versions. Hence, in this implementation their performance is
+equivalent to the MUF/DVF/RDV instructions. This is acceptable according
+to the ARM manual. The manual notes these are defined only for single
+operands, on the actual FPA11 hardware they do not work for double or
+extended precision operands. The emulator currently does not check
+the requested permissions conditions, and performs the requested operation.
+
+RMF{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - IEEE remainder
+
+This is fully implemented.
+
+Monadic operations:
+
+MVF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move
+MNF{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - move negated
+
+These are fully implemented.
+
+ABS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - absolute value
+SQT{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - square root
+RND{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - round
+
+These are fully implemented.
+
+URD{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - unnormalized round
+NRM{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - normalize
+
+These are implemented. URD is implemented using the same code as the RND
+instruction. Since URD cannot return a unnormalized number, NRM becomes
+a NOP.
+
+Library calls:
+
+POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
+RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
+POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
+
+LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
+LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
+EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
+SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
+COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
+TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
+ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
+ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
+ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
+
+These are not implemented. They are not currently issued by the compiler,
+and are handled by routines in libc. These are not implemented by the FPA11
+hardware, but are handled by the floating point support code. They should
+be implemented in future versions.
+
+Signalling:
+
+Signals are implemented. However current ELF kernels produced by Rebel.com
+have a bug in them that prevents the module from generating a SIGFPE. This
+is caused by a failure to alias fp_current to the kernel variable
+current_set[0] correctly.
+
+The kernel provided with this distribution (vmlinux-nwfpe-0.93) contains
+a fix for this problem and also incorporates the current version of the
+emulator directly. It is possible to run with no floating point module
+loaded with this kernel. It is provided as a demonstration of the
+technology and for those who want to do floating point work that depends
+on signals. It is not strictly necessary to use the module.
+
+A module (either the one provided by Russell King, or the one in this
+distribution) can be loaded to replace the functionality of the emulator
+built into the kernel.
diff --git a/Documentation/arm/nwfpe/notes.rst b/Documentation/arm/nwfpe/notes.rst
new file mode 100644
index 000000000000..102e55af8439
--- /dev/null
+++ b/Documentation/arm/nwfpe/notes.rst
@@ -0,0 +1,32 @@
+Notes
+=====
+
+There seems to be a problem with exp(double) and our emulator. I haven't
+been able to track it down yet. This does not occur with the emulator
+supplied by Russell King.
+
+I also found one oddity in the emulator. I don't think it is serious but
+will point it out. The ARM calling conventions require floating point
+registers f4-f7 to be preserved over a function call. The compiler quite
+often uses an stfe instruction to save f4 on the stack upon entry to a
+function, and an ldfe instruction to restore it before returning.
+
+I was looking at some code, that calculated a double result, stored it in f4
+then made a function call. Upon return from the function call the number in
+f4 had been converted to an extended value in the emulator.
+
+This is a side effect of the stfe instruction. The double in f4 had to be
+converted to extended, then stored. If an lfm/sfm combination had been used,
+then no conversion would occur. This has performance considerations. The
+result from the function call and f4 were used in a multiplication. If the
+emulator sees a multiply of a double and extended, it promotes the double to
+extended, then does the multiply in extended precision.
+
+This code will cause this problem:
+
+double x, y, z;
+z = log(x)/log(y);
+
+The result of log(x) (a double) will be calculated, returned in f0, then
+moved to f4 to preserve it over the log(y) call. The division will be done
+in extended precision, due to the stfe instruction used to save f4 in log(y).
diff --git a/Documentation/arm/nwfpe/nwfpe.rst b/Documentation/arm/nwfpe/nwfpe.rst
new file mode 100644
index 000000000000..35cd90dacbff
--- /dev/null
+++ b/Documentation/arm/nwfpe/nwfpe.rst
@@ -0,0 +1,74 @@
+Introduction
+============
+
+This directory contains the version 0.92 test release of the NetWinder
+Floating Point Emulator.
+
+The majority of the code was written by me, Scott Bambrough It is
+written in C, with a small number of routines in inline assembler
+where required. It was written quickly, with a goal of implementing a
+working version of all the floating point instructions the compiler
+emits as the first target. I have attempted to be as optimal as
+possible, but there remains much room for improvement.
+
+I have attempted to make the emulator as portable as possible. One of
+the problems is with leading underscores on kernel symbols. Elf
+kernels have no leading underscores, a.out compiled kernels do. I
+have attempted to use the C_SYMBOL_NAME macro wherever this may be
+important.
+
+Another choice I made was in the file structure. I have attempted to
+contain all operating system specific code in one module (fpmodule.*).
+All the other files contain emulator specific code. This should allow
+others to port the emulator to NetBSD for instance relatively easily.
+
+The floating point operations are based on SoftFloat Release 2, by
+John Hauser. SoftFloat is a software implementation of floating-point
+that conforms to the IEC/IEEE Standard for Binary Floating-point
+Arithmetic. As many as four formats are supported: single precision,
+double precision, extended double precision, and quadruple precision.
+All operations required by the standard are implemented, except for
+conversions to and from decimal. We use only the single precision,
+double precision and extended double precision formats. The port of
+SoftFloat to the ARM was done by Phil Blundell, based on an earlier
+port of SoftFloat version 1 by Neil Carson for NetBSD/arm32.
+
+The file README.FPE contains a description of what has been implemented
+so far in the emulator. The file TODO contains a information on what
+remains to be done, and other ideas for the emulator.
+
+Bug reports, comments, suggestions should be directed to me at
+<scottb@netwinder.org>. General reports of "this program doesn't
+work correctly when your emulator is installed" are useful for
+determining that bugs still exist; but are virtually useless when
+attempting to isolate the problem. Please report them, but don't
+expect quick action. Bugs still exist. The problem remains in isolating
+which instruction contains the bug. Small programs illustrating a specific
+problem are a godsend.
+
+Legal Notices
+-------------
+
+The NetWinder Floating Point Emulator is free software. Everything Rebel.com
+has written is provided under the GNU GPL. See the file COPYING for copying
+conditions. Excluded from the above is the SoftFloat code. John Hauser's
+legal notice for SoftFloat is included below.
+
+-------------------------------------------------------------------------------
+
+SoftFloat Legal Notice
+
+SoftFloat was written by John R. Hauser. This work was made possible in
+part by the International Computer Science Institute, located at Suite 600,
+1947 Center Street, Berkeley, California 94704. Funding was partially
+provided by the National Science Foundation under grant MIP-9311980. The
+original version of this code was written as part of a project to build
+a fixed-point vector processor in collaboration with the University of
+California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek.
+
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
+-------------------------------------------------------------------------------
diff --git a/Documentation/arm/nwfpe/todo.rst b/Documentation/arm/nwfpe/todo.rst
new file mode 100644
index 000000000000..393f11b14540
--- /dev/null
+++ b/Documentation/arm/nwfpe/todo.rst
@@ -0,0 +1,72 @@
+TODO LIST
+=========
+
+::
+
+ POW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - power
+ RPW{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - reverse power
+ POL{cond}<S|D|E>{P,M,Z} Fd, Fn, <Fm,#value> - polar angle (arctan2)
+
+ LOG{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base 10
+ LGN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - logarithm to base e
+ EXP{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - exponent
+ SIN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - sine
+ COS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - cosine
+ TAN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - tangent
+ ASN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arcsine
+ ACS{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arccosine
+ ATN{cond}<S|D|E>{P,M,Z} Fd, <Fm,#value> - arctangent
+
+These are not implemented. They are not currently issued by the compiler,
+and are handled by routines in libc. These are not implemented by the FPA11
+hardware, but are handled by the floating point support code. They should
+be implemented in future versions.
+
+There are a couple of ways to approach the implementation of these. One
+method would be to use accurate table methods for these routines. I have
+a couple of papers by S. Gal from IBM's research labs in Haifa, Israel that
+seem to promise extreme accuracy (in the order of 99.8%) and reasonable speed.
+These methods are used in GLIBC for some of the transcendental functions.
+
+Another approach, which I know little about is CORDIC. This stands for
+Coordinate Rotation Digital Computer, and is a method of computing
+transcendental functions using mostly shifts and adds and a few
+multiplications and divisions. The ARM excels at shifts and adds,
+so such a method could be promising, but requires more research to
+determine if it is feasible.
+
+Rounding Methods
+----------------
+
+The IEEE standard defines 4 rounding modes. Round to nearest is the
+default, but rounding to + or - infinity or round to zero are also allowed.
+Many architectures allow the rounding mode to be specified by modifying bits
+in a control register. Not so with the ARM FPA11 architecture. To change
+the rounding mode one must specify it with each instruction.
+
+This has made porting some benchmarks difficult. It is possible to
+introduce such a capability into the emulator. The FPCR contains
+bits describing the rounding mode. The emulator could be altered to
+examine a flag, which if set forced it to ignore the rounding mode in
+the instruction, and use the mode specified in the bits in the FPCR.
+
+This would require a method of getting/setting the flag, and the bits
+in the FPCR. This requires a kernel call in ArmLinux, as WFC/RFC are
+supervisor only instructions. If anyone has any ideas or comments I
+would like to hear them.
+
+NOTE:
+ pulled out from some docs on ARM floating point, specifically
+ for the Acorn FPE, but not limited to it:
+
+ The floating point control register (FPCR) may only be present in some
+ implementations: it is there to control the hardware in an implementation-
+ specific manner, for example to disable the floating point system. The user
+ mode of the ARM is not permitted to use this register (since the right is
+ reserved to alter it between implementations) and the WFC and RFC
+ instructions will trap if tried in user mode.
+
+ Hence, the answer is yes, you could do this, but then you will run a high
+ risk of becoming isolated if and when hardware FP emulation comes out
+
+ -- Russell.
diff --git a/Documentation/arm/omap/dss.rst b/Documentation/arm/omap/dss.rst
new file mode 100644
index 000000000000..a40c4d9c717a
--- /dev/null
+++ b/Documentation/arm/omap/dss.rst
@@ -0,0 +1,372 @@
+=========================
+OMAP2/3 Display Subsystem
+=========================
+
+This is an almost total rewrite of the OMAP FB driver in drivers/video/omap
+(let's call it DSS1). The main differences between DSS1 and DSS2 are DSI,
+TV-out and multiple display support, but there are lots of small improvements
+also.
+
+The DSS2 driver (omapdss module) is in arch/arm/plat-omap/dss/, and the FB,
+panel and controller drivers are in drivers/video/omap2/. DSS1 and DSS2 live
+currently side by side, you can choose which one to use.
+
+Features
+--------
+
+Working and tested features include:
+
+- MIPI DPI (parallel) output
+- MIPI DSI output in command mode
+- MIPI DBI (RFBI) output
+- SDI output
+- TV output
+- All pieces can be compiled as a module or inside kernel
+- Use DISPC to update any of the outputs
+- Use CPU to update RFBI or DSI output
+- OMAP DISPC planes
+- RGB16, RGB24 packed, RGB24 unpacked
+- YUV2, UYVY
+- Scaling
+- Adjusting DSS FCK to find a good pixel clock
+- Use DSI DPLL to create DSS FCK
+
+Tested boards include:
+- OMAP3 SDP board
+- Beagle board
+- N810
+
+omapdss driver
+--------------
+
+The DSS driver does not itself have any support for Linux framebuffer, V4L or
+such like the current ones, but it has an internal kernel API that upper level
+drivers can use.
+
+The DSS driver models OMAP's overlays, overlay managers and displays in a
+flexible way to enable non-common multi-display configuration. In addition to
+modelling the hardware overlays, omapdss supports virtual overlays and overlay
+managers. These can be used when updating a display with CPU or system DMA.
+
+omapdss driver support for audio
+--------------------------------
+There exist several display technologies and standards that support audio as
+well. Hence, it is relevant to update the DSS device driver to provide an audio
+interface that may be used by an audio driver or any other driver interested in
+the functionality.
+
+The audio_enable function is intended to prepare the relevant
+IP for playback (e.g., enabling an audio FIFO, taking in/out of reset
+some IP, enabling companion chips, etc). It is intended to be called before
+audio_start. The audio_disable function performs the reverse operation and is
+intended to be called after audio_stop.
+
+While a given DSS device driver may support audio, it is possible that for
+certain configurations audio is not supported (e.g., an HDMI display using a
+VESA video timing). The audio_supported function is intended to query whether
+the current configuration of the display supports audio.
+
+The audio_config function is intended to configure all the relevant audio
+parameters of the display. In order to make the function independent of any
+specific DSS device driver, a struct omap_dss_audio is defined. Its purpose
+is to contain all the required parameters for audio configuration. At the
+moment, such structure contains pointers to IEC-60958 channel status word
+and CEA-861 audio infoframe structures. This should be enough to support
+HDMI and DisplayPort, as both are based on CEA-861 and IEC-60958.
+
+The audio_enable/disable, audio_config and audio_supported functions could be
+implemented as functions that may sleep. Hence, they should not be called
+while holding a spinlock or a readlock.
+
+The audio_start/audio_stop function is intended to effectively start/stop audio
+playback after the configuration has taken place. These functions are designed
+to be used in an atomic context. Hence, audio_start should return quickly and be
+called only after all the needed resources for audio playback (audio FIFOs,
+DMA channels, companion chips, etc) have been enabled to begin data transfers.
+audio_stop is designed to only stop the audio transfers. The resources used
+for playback are released using audio_disable.
+
+The enum omap_dss_audio_state may be used to help the implementations of
+the interface to keep track of the audio state. The initial state is _DISABLED;
+then, the state transitions to _CONFIGURED, and then, when it is ready to
+play audio, to _ENABLED. The state _PLAYING is used when the audio is being
+rendered.
+
+
+Panel and controller drivers
+----------------------------
+
+The drivers implement panel or controller specific functionality and are not
+usually visible to users except through omapfb driver. They register
+themselves to the DSS driver.
+
+omapfb driver
+-------------
+
+The omapfb driver implements arbitrary number of standard linux framebuffers.
+These framebuffers can be routed flexibly to any overlays, thus allowing very
+dynamic display architecture.
+
+The driver exports some omapfb specific ioctls, which are compatible with the
+ioctls in the old driver.
+
+The rest of the non standard features are exported via sysfs. Whether the final
+implementation will use sysfs, or ioctls, is still open.
+
+V4L2 drivers
+------------
+
+V4L2 is being implemented in TI.
+
+From omapdss point of view the V4L2 drivers should be similar to framebuffer
+driver.
+
+Architecture
+--------------------
+
+Some clarification what the different components do:
+
+ - Framebuffer is a memory area inside OMAP's SRAM/SDRAM that contains the
+ pixel data for the image. Framebuffer has width and height and color
+ depth.
+ - Overlay defines where the pixels are read from and where they go on the
+ screen. The overlay may be smaller than framebuffer, thus displaying only
+ part of the framebuffer. The position of the overlay may be changed if
+ the overlay is smaller than the display.
+ - Overlay manager combines the overlays in to one image and feeds them to
+ display.
+ - Display is the actual physical display device.
+
+A framebuffer can be connected to multiple overlays to show the same pixel data
+on all of the overlays. Note that in this case the overlay input sizes must be
+the same, but, in case of video overlays, the output size can be different. Any
+framebuffer can be connected to any overlay.
+
+An overlay can be connected to one overlay manager. Also DISPC overlays can be
+connected only to DISPC overlay managers, and virtual overlays can be only
+connected to virtual overlays.
+
+An overlay manager can be connected to one display. There are certain
+restrictions which kinds of displays an overlay manager can be connected:
+
+ - DISPC TV overlay manager can be only connected to TV display.
+ - Virtual overlay managers can only be connected to DBI or DSI displays.
+ - DISPC LCD overlay manager can be connected to all displays, except TV
+ display.
+
+Sysfs
+-----
+The sysfs interface is mainly used for testing. I don't think sysfs
+interface is the best for this in the final version, but I don't quite know
+what would be the best interfaces for these things.
+
+The sysfs interface is divided to two parts: DSS and FB.
+
+/sys/class/graphics/fb? directory:
+mirror 0=off, 1=on
+rotate Rotation 0-3 for 0, 90, 180, 270 degrees
+rotate_type 0 = DMA rotation, 1 = VRFB rotation
+overlays List of overlay numbers to which framebuffer pixels go
+phys_addr Physical address of the framebuffer
+virt_addr Virtual address of the framebuffer
+size Size of the framebuffer
+
+/sys/devices/platform/omapdss/overlay? directory:
+enabled 0=off, 1=on
+input_size width,height (ie. the framebuffer size)
+manager Destination overlay manager name
+name
+output_size width,height
+position x,y
+screen_width width
+global_alpha global alpha 0-255 0=transparent 255=opaque
+
+/sys/devices/platform/omapdss/manager? directory:
+display Destination display
+name
+alpha_blending_enabled 0=off, 1=on
+trans_key_enabled 0=off, 1=on
+trans_key_type gfx-destination, video-source
+trans_key_value transparency color key (RGB24)
+default_color default background color (RGB24)
+
+/sys/devices/platform/omapdss/display? directory:
+
+=============== =============================================================
+ctrl_name Controller name
+mirror 0=off, 1=on
+update_mode 0=off, 1=auto, 2=manual
+enabled 0=off, 1=on
+name
+rotate Rotation 0-3 for 0, 90, 180, 270 degrees
+timings Display timings (pixclock,xres/hfp/hbp/hsw,yres/vfp/vbp/vsw)
+ When writing, two special timings are accepted for tv-out:
+ "pal" and "ntsc"
+panel_name
+tear_elim Tearing elimination 0=off, 1=on
+output_type Output type (video encoder only): "composite" or "svideo"
+=============== =============================================================
+
+There are also some debugfs files at <debugfs>/omapdss/ which show information
+about clocks and registers.
+
+Examples
+--------
+
+The following definitions have been made for the examples below::
+
+ ovl0=/sys/devices/platform/omapdss/overlay0
+ ovl1=/sys/devices/platform/omapdss/overlay1
+ ovl2=/sys/devices/platform/omapdss/overlay2
+
+ mgr0=/sys/devices/platform/omapdss/manager0
+ mgr1=/sys/devices/platform/omapdss/manager1
+
+ lcd=/sys/devices/platform/omapdss/display0
+ dvi=/sys/devices/platform/omapdss/display1
+ tv=/sys/devices/platform/omapdss/display2
+
+ fb0=/sys/class/graphics/fb0
+ fb1=/sys/class/graphics/fb1
+ fb2=/sys/class/graphics/fb2
+
+Default setup on OMAP3 SDP
+--------------------------
+
+Here's the default setup on OMAP3 SDP board. All planes go to LCD. DVI
+and TV-out are not in use. The columns from left to right are:
+framebuffers, overlays, overlay managers, displays. Framebuffers are
+handled by omapfb, and the rest by the DSS::
+
+ FB0 --- GFX -\ DVI
+ FB1 --- VID1 --+- LCD ---- LCD
+ FB2 --- VID2 -/ TV ----- TV
+
+Example: Switch from LCD to DVI
+-------------------------------
+
+::
+
+ w=`cat $dvi/timings | cut -d "," -f 2 | cut -d "/" -f 1`
+ h=`cat $dvi/timings | cut -d "," -f 3 | cut -d "/" -f 1`
+
+ echo "0" > $lcd/enabled
+ echo "" > $mgr0/display
+ fbset -fb /dev/fb0 -xres $w -yres $h -vxres $w -vyres $h
+ # at this point you have to switch the dvi/lcd dip-switch from the omap board
+ echo "dvi" > $mgr0/display
+ echo "1" > $dvi/enabled
+
+After this the configuration looks like:::
+
+ FB0 --- GFX -\ -- DVI
+ FB1 --- VID1 --+- LCD -/ LCD
+ FB2 --- VID2 -/ TV ----- TV
+
+Example: Clone GFX overlay to LCD and TV
+----------------------------------------
+
+::
+
+ w=`cat $tv/timings | cut -d "," -f 2 | cut -d "/" -f 1`
+ h=`cat $tv/timings | cut -d "," -f 3 | cut -d "/" -f 1`
+
+ echo "0" > $ovl0/enabled
+ echo "0" > $ovl1/enabled
+
+ echo "" > $fb1/overlays
+ echo "0,1" > $fb0/overlays
+
+ echo "$w,$h" > $ovl1/output_size
+ echo "tv" > $ovl1/manager
+
+ echo "1" > $ovl0/enabled
+ echo "1" > $ovl1/enabled
+
+ echo "1" > $tv/enabled
+
+After this the configuration looks like (only relevant parts shown)::
+
+ FB0 +-- GFX ---- LCD ---- LCD
+ \- VID1 ---- TV ---- TV
+
+Misc notes
+----------
+
+OMAP FB allocates the framebuffer memory using the standard dma allocator. You
+can enable Contiguous Memory Allocator (CONFIG_CMA) to improve the dma
+allocator, and if CMA is enabled, you use "cma=" kernel parameter to increase
+the global memory area for CMA.
+
+Using DSI DPLL to generate pixel clock it is possible produce the pixel clock
+of 86.5MHz (max possible), and with that you get 1280x1024@57 output from DVI.
+
+Rotation and mirroring currently only supports RGB565 and RGB8888 modes. VRFB
+does not support mirroring.
+
+VRFB rotation requires much more memory than non-rotated framebuffer, so you
+probably need to increase your vram setting before using VRFB rotation. Also,
+many applications may not work with VRFB if they do not pay attention to all
+framebuffer parameters.
+
+Kernel boot arguments
+---------------------
+
+omapfb.mode=<display>:<mode>[,...]
+ - Default video mode for specified displays. For example,
+ "dvi:800x400MR-24@60". See drivers/video/modedb.c.
+ There are also two special modes: "pal" and "ntsc" that
+ can be used to tv out.
+
+omapfb.vram=<fbnum>:<size>[@<physaddr>][,...]
+ - VRAM allocated for a framebuffer. Normally omapfb allocates vram
+ depending on the display size. With this you can manually allocate
+ more or define the physical address of each framebuffer. For example,
+ "1:4M" to allocate 4M for fb1.
+
+omapfb.debug=<y|n>
+ - Enable debug printing. You have to have OMAPFB debug support enabled
+ in kernel config.
+
+omapfb.test=<y|n>
+ - Draw test pattern to framebuffer whenever framebuffer settings change.
+ You need to have OMAPFB debug support enabled in kernel config.
+
+omapfb.vrfb=<y|n>
+ - Use VRFB rotation for all framebuffers.
+
+omapfb.rotate=<angle>
+ - Default rotation applied to all framebuffers.
+ 0 - 0 degree rotation
+ 1 - 90 degree rotation
+ 2 - 180 degree rotation
+ 3 - 270 degree rotation
+
+omapfb.mirror=<y|n>
+ - Default mirror for all framebuffers. Only works with DMA rotation.
+
+omapdss.def_disp=<display>
+ - Name of default display, to which all overlays will be connected.
+ Common examples are "lcd" or "tv".
+
+omapdss.debug=<y|n>
+ - Enable debug printing. You have to have DSS debug support enabled in
+ kernel config.
+
+TODO
+----
+
+DSS locking
+
+Error checking
+
+- Lots of checks are missing or implemented just as BUG()
+
+System DMA update for DSI
+
+- Can be used for RGB16 and RGB24P modes. Probably not for RGB24U (how
+ to skip the empty byte?)
+
+OMAP1 support
+
+- Not sure if needed
diff --git a/Documentation/arm/omap/index.rst b/Documentation/arm/omap/index.rst
new file mode 100644
index 000000000000..8b365b212e49
--- /dev/null
+++ b/Documentation/arm/omap/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======
+TI OMAP
+=======
+
+.. toctree::
+ :maxdepth: 1
+
+ omap
+ omap_pm
+ dss
diff --git a/Documentation/arm/omap/omap.rst b/Documentation/arm/omap/omap.rst
new file mode 100644
index 000000000000..f440c0f4613f
--- /dev/null
+++ b/Documentation/arm/omap/omap.rst
@@ -0,0 +1,18 @@
+============
+OMAP history
+============
+
+This file contains documentation for running mainline
+kernel on omaps.
+
+====== ======================================================
+KERNEL NEW DEPENDENCIES
+====== ======================================================
+v4.3+ Update is needed for custom .config files to make sure
+ CONFIG_REGULATOR_PBIAS is enabled for MMC1 to work
+ properly.
+
+v4.18+ Update is needed for custom .config files to make sure
+ CONFIG_MMC_SDHCI_OMAP is enabled for all MMC instances
+ to work in DRA7 and K2G based boards.
+====== ======================================================
diff --git a/Documentation/arm/omap/omap_pm.rst b/Documentation/arm/omap/omap_pm.rst
new file mode 100644
index 000000000000..a335e4c8ce2c
--- /dev/null
+++ b/Documentation/arm/omap/omap_pm.rst
@@ -0,0 +1,165 @@
+=====================
+The OMAP PM interface
+=====================
+
+This document describes the temporary OMAP PM interface. Driver
+authors use these functions to communicate minimum latency or
+throughput constraints to the kernel power management code.
+Over time, the intention is to merge features from the OMAP PM
+interface into the Linux PM QoS code.
+
+Drivers need to express PM parameters which:
+
+- support the range of power management parameters present in the TI SRF;
+
+- separate the drivers from the underlying PM parameter
+ implementation, whether it is the TI SRF or Linux PM QoS or Linux
+ latency framework or something else;
+
+- specify PM parameters in terms of fundamental units, such as
+ latency and throughput, rather than units which are specific to OMAP
+ or to particular OMAP variants;
+
+- allow drivers which are shared with other architectures (e.g.,
+ DaVinci) to add these constraints in a way which won't affect non-OMAP
+ systems,
+
+- can be implemented immediately with minimal disruption of other
+ architectures.
+
+
+This document proposes the OMAP PM interface, including the following
+five power management functions for driver code:
+
+1. Set the maximum MPU wakeup latency::
+
+ (*pdata->set_max_mpu_wakeup_lat)(struct device *dev, unsigned long t)
+
+2. Set the maximum device wakeup latency::
+
+ (*pdata->set_max_dev_wakeup_lat)(struct device *dev, unsigned long t)
+
+3. Set the maximum system DMA transfer start latency (CORE pwrdm)::
+
+ (*pdata->set_max_sdma_lat)(struct device *dev, long t)
+
+4. Set the minimum bus throughput needed by a device::
+
+ (*pdata->set_min_bus_tput)(struct device *dev, u8 agent_id, unsigned long r)
+
+5. Return the number of times the device has lost context::
+
+ (*pdata->get_dev_context_loss_count)(struct device *dev)
+
+
+Further documentation for all OMAP PM interface functions can be
+found in arch/arm/plat-omap/include/mach/omap-pm.h.
+
+
+The OMAP PM layer is intended to be temporary
+---------------------------------------------
+
+The intention is that eventually the Linux PM QoS layer should support
+the range of power management features present in OMAP3. As this
+happens, existing drivers using the OMAP PM interface can be modified
+to use the Linux PM QoS code; and the OMAP PM interface can disappear.
+
+
+Driver usage of the OMAP PM functions
+-------------------------------------
+
+As the 'pdata' in the above examples indicates, these functions are
+exposed to drivers through function pointers in driver .platform_data
+structures. The function pointers are initialized by the `board-*.c`
+files to point to the corresponding OMAP PM functions:
+
+- set_max_dev_wakeup_lat will point to
+ omap_pm_set_max_dev_wakeup_lat(), etc. Other architectures which do
+ not support these functions should leave these function pointers set
+ to NULL. Drivers should use the following idiom::
+
+ if (pdata->set_max_dev_wakeup_lat)
+ (*pdata->set_max_dev_wakeup_lat)(dev, t);
+
+The most common usage of these functions will probably be to specify
+the maximum time from when an interrupt occurs, to when the device
+becomes accessible. To accomplish this, driver writers should use the
+set_max_mpu_wakeup_lat() function to constrain the MPU wakeup
+latency, and the set_max_dev_wakeup_lat() function to constrain the
+device wakeup latency (from clk_enable() to accessibility). For
+example::
+
+ /* Limit MPU wakeup latency */
+ if (pdata->set_max_mpu_wakeup_lat)
+ (*pdata->set_max_mpu_wakeup_lat)(dev, tc);
+
+ /* Limit device powerdomain wakeup latency */
+ if (pdata->set_max_dev_wakeup_lat)
+ (*pdata->set_max_dev_wakeup_lat)(dev, td);
+
+ /* total wakeup latency in this example: (tc + td) */
+
+The PM parameters can be overwritten by calling the function again
+with the new value. The settings can be removed by calling the
+function with a t argument of -1 (except in the case of
+set_max_bus_tput(), which should be called with an r argument of 0).
+
+The fifth function above, omap_pm_get_dev_context_loss_count(),
+is intended as an optimization to allow drivers to determine whether the
+device has lost its internal context. If context has been lost, the
+driver must restore its internal context before proceeding.
+
+
+Other specialized interface functions
+-------------------------------------
+
+The five functions listed above are intended to be usable by any
+device driver. DSPBridge and CPUFreq have a few special requirements.
+DSPBridge expresses target DSP performance levels in terms of OPP IDs.
+CPUFreq expresses target MPU performance levels in terms of MPU
+frequency. The OMAP PM interface contains functions for these
+specialized cases to convert that input information (OPPs/MPU
+frequency) into the form that the underlying power management
+implementation needs:
+
+6. `(*pdata->dsp_get_opp_table)(void)`
+
+7. `(*pdata->dsp_set_min_opp)(u8 opp_id)`
+
+8. `(*pdata->dsp_get_opp)(void)`
+
+9. `(*pdata->cpu_get_freq_table)(void)`
+
+10. `(*pdata->cpu_set_freq)(unsigned long f)`
+
+11. `(*pdata->cpu_get_freq)(void)`
+
+Customizing OPP for platform
+============================
+Defining CONFIG_PM should enable OPP layer for the silicon
+and the registration of OPP table should take place automatically.
+However, in special cases, the default OPP table may need to be
+tweaked, for e.g.:
+
+ * enable default OPPs which are disabled by default, but which
+ could be enabled on a platform
+ * Disable an unsupported OPP on the platform
+ * Define and add a custom opp table entry
+ in these cases, the board file needs to do additional steps as follows:
+
+arch/arm/mach-omapx/board-xyz.c::
+
+ #include "pm.h"
+ ....
+ static void __init omap_xyz_init_irq(void)
+ {
+ ....
+ /* Initialize the default table */
+ omapx_opp_init();
+ /* Do customization to the defaults */
+ ....
+ }
+
+NOTE:
+ omapx_opp_init will be omap3_opp_init or as required
+ based on the omap family.
diff --git a/Documentation/arm/porting.rst b/Documentation/arm/porting.rst
new file mode 100644
index 000000000000..bd21958bdb2d
--- /dev/null
+++ b/Documentation/arm/porting.rst
@@ -0,0 +1,137 @@
+=======
+Porting
+=======
+
+Taken from list archive at http://lists.arm.linux.org.uk/pipermail/linux-arm-kernel/2001-July/004064.html
+
+Initial definitions
+-------------------
+
+The following symbol definitions rely on you knowing the translation that
+__virt_to_phys() does for your machine. This macro converts the passed
+virtual address to a physical address. Normally, it is simply:
+
+ phys = virt - PAGE_OFFSET + PHYS_OFFSET
+
+
+Decompressor Symbols
+--------------------
+
+ZTEXTADDR
+ Start address of decompressor. There's no point in talking about
+ virtual or physical addresses here, since the MMU will be off at
+ the time when you call the decompressor code. You normally call
+ the kernel at this address to start it booting. This doesn't have
+ to be located in RAM, it can be in flash or other read-only or
+ read-write addressable medium.
+
+ZBSSADDR
+ Start address of zero-initialised work area for the decompressor.
+ This must be pointing at RAM. The decompressor will zero initialise
+ this for you. Again, the MMU will be off.
+
+ZRELADDR
+ This is the address where the decompressed kernel will be written,
+ and eventually executed. The following constraint must be valid:
+
+ __virt_to_phys(TEXTADDR) == ZRELADDR
+
+ The initial part of the kernel is carefully coded to be position
+ independent.
+
+INITRD_PHYS
+ Physical address to place the initial RAM disk. Only relevant if
+ you are using the bootpImage stuff (which only works on the old
+ struct param_struct).
+
+INITRD_VIRT
+ Virtual address of the initial RAM disk. The following constraint
+ must be valid:
+
+ __virt_to_phys(INITRD_VIRT) == INITRD_PHYS
+
+PARAMS_PHYS
+ Physical address of the struct param_struct or tag list, giving the
+ kernel various parameters about its execution environment.
+
+
+Kernel Symbols
+--------------
+
+PHYS_OFFSET
+ Physical start address of the first bank of RAM.
+
+PAGE_OFFSET
+ Virtual start address of the first bank of RAM. During the kernel
+ boot phase, virtual address PAGE_OFFSET will be mapped to physical
+ address PHYS_OFFSET, along with any other mappings you supply.
+ This should be the same value as TASK_SIZE.
+
+TASK_SIZE
+ The maximum size of a user process in bytes. Since user space
+ always starts at zero, this is the maximum address that a user
+ process can access+1. The user space stack grows down from this
+ address.
+
+ Any virtual address below TASK_SIZE is deemed to be user process
+ area, and therefore managed dynamically on a process by process
+ basis by the kernel. I'll call this the user segment.
+
+ Anything above TASK_SIZE is common to all processes. I'll call
+ this the kernel segment.
+
+ (In other words, you can't put IO mappings below TASK_SIZE, and
+ hence PAGE_OFFSET).
+
+TEXTADDR
+ Virtual start address of kernel, normally PAGE_OFFSET + 0x8000.
+ This is where the kernel image ends up. With the latest kernels,
+ it must be located at 32768 bytes into a 128MB region. Previous
+ kernels placed a restriction of 256MB here.
+
+DATAADDR
+ Virtual address for the kernel data segment. Must not be defined
+ when using the decompressor.
+
+VMALLOC_START / VMALLOC_END
+ Virtual addresses bounding the vmalloc() area. There must not be
+ any static mappings in this area; vmalloc will overwrite them.
+ The addresses must also be in the kernel segment (see above).
+ Normally, the vmalloc() area starts VMALLOC_OFFSET bytes above the
+ last virtual RAM address (found using variable high_memory).
+
+VMALLOC_OFFSET
+ Offset normally incorporated into VMALLOC_START to provide a hole
+ between virtual RAM and the vmalloc area. We do this to allow
+ out of bounds memory accesses (eg, something writing off the end
+ of the mapped memory map) to be caught. Normally set to 8MB.
+
+Architecture Specific Macros
+----------------------------
+
+BOOT_MEM(pram,pio,vio)
+ `pram` specifies the physical start address of RAM. Must always
+ be present, and should be the same as PHYS_OFFSET.
+
+ `pio` is the physical address of an 8MB region containing IO for
+ use with the debugging macros in arch/arm/kernel/debug-armv.S.
+
+ `vio` is the virtual address of the 8MB debugging region.
+
+ It is expected that the debugging region will be re-initialised
+ by the architecture specific code later in the code (via the
+ MAPIO function).
+
+BOOT_PARAMS
+ Same as, and see PARAMS_PHYS.
+
+FIXUP(func)
+ Machine specific fixups, run before memory subsystems have been
+ initialised.
+
+MAPIO(func)
+ Machine specific function to map IO areas (including the debug
+ region above).
+
+INITIRQ(func)
+ Machine specific function to initialise interrupts.
diff --git a/Documentation/arm/pxa/mfp.rst b/Documentation/arm/pxa/mfp.rst
new file mode 100644
index 000000000000..ac34e5d7ee44
--- /dev/null
+++ b/Documentation/arm/pxa/mfp.rst
@@ -0,0 +1,288 @@
+==============================================
+MFP Configuration for PXA2xx/PXA3xx Processors
+==============================================
+
+ Eric Miao <eric.miao@marvell.com>
+
+MFP stands for Multi-Function Pin, which is the pin-mux logic on PXA3xx and
+later PXA series processors. This document describes the existing MFP API,
+and how board/platform driver authors could make use of it.
+
+Basic Concept
+=============
+
+Unlike the GPIO alternate function settings on PXA25x and PXA27x, a new MFP
+mechanism is introduced from PXA3xx to completely move the pin-mux functions
+out of the GPIO controller. In addition to pin-mux configurations, the MFP
+also controls the low power state, driving strength, pull-up/down and event
+detection of each pin. Below is a diagram of internal connections between
+the MFP logic and the remaining SoC peripherals::
+
+ +--------+
+ | |--(GPIO19)--+
+ | GPIO | |
+ | |--(GPIO...) |
+ +--------+ |
+ | +---------+
+ +--------+ +------>| |
+ | PWM2 |--(PWM_OUT)-------->| MFP |
+ +--------+ +------>| |-------> to external PAD
+ | +---->| |
+ +--------+ | | +-->| |
+ | SSP2 |---(TXD)----+ | | +---------+
+ +--------+ | |
+ | |
+ +--------+ | |
+ | Keypad |--(MKOUT4)----+ |
+ +--------+ |
+ |
+ +--------+ |
+ | UART2 |---(TXD)--------+
+ +--------+
+
+NOTE: the external pad is named as MFP_PIN_GPIO19, it doesn't necessarily
+mean it's dedicated for GPIO19, only as a hint that internally this pin
+can be routed from GPIO19 of the GPIO controller.
+
+To better understand the change from PXA25x/PXA27x GPIO alternate function
+to this new MFP mechanism, here are several key points:
+
+ 1. GPIO controller on PXA3xx is now a dedicated controller, same as other
+ internal controllers like PWM, SSP and UART, with 128 internal signals
+ which can be routed to external through one or more MFPs (e.g. GPIO<0>
+ can be routed through either MFP_PIN_GPIO0 as well as MFP_PIN_GPIO0_2,
+ see arch/arm/mach-pxa/mfp-pxa300.h)
+
+ 2. Alternate function configuration is removed from this GPIO controller,
+ the remaining functions are pure GPIO-specific, i.e.
+
+ - GPIO signal level control
+ - GPIO direction control
+ - GPIO level change detection
+
+ 3. Low power state for each pin is now controlled by MFP, this means the
+ PGSRx registers on PXA2xx are now useless on PXA3xx
+
+ 4. Wakeup detection is now controlled by MFP, PWER does not control the
+ wakeup from GPIO(s) any more, depending on the sleeping state, ADxER
+ (as defined in pxa3xx-regs.h) controls the wakeup from MFP
+
+NOTE: with such a clear separation of MFP and GPIO, by GPIO<xx> we normally
+mean it is a GPIO signal, and by MFP<xxx> or pin xxx, we mean a physical
+pad (or ball).
+
+MFP API Usage
+=============
+
+For board code writers, here are some guidelines:
+
+1. include ONE of the following header files in your <board>.c:
+
+ - #include "mfp-pxa25x.h"
+ - #include "mfp-pxa27x.h"
+ - #include "mfp-pxa300.h"
+ - #include "mfp-pxa320.h"
+ - #include "mfp-pxa930.h"
+
+ NOTE: only one file in your <board>.c, depending on the processors used,
+ because pin configuration definitions may conflict in these file (i.e.
+ same name, different meaning and settings on different processors). E.g.
+ for zylonite platform, which support both PXA300/PXA310 and PXA320, two
+ separate files are introduced: zylonite_pxa300.c and zylonite_pxa320.c
+ (in addition to handle MFP configuration differences, they also handle
+ the other differences between the two combinations).
+
+ NOTE: PXA300 and PXA310 are almost identical in pin configurations (with
+ PXA310 supporting some additional ones), thus the difference is actually
+ covered in a single mfp-pxa300.h.
+
+2. prepare an array for the initial pin configurations, e.g.::
+
+ static unsigned long mainstone_pin_config[] __initdata = {
+ /* Chip Select */
+ GPIO15_nCS_1,
+
+ /* LCD - 16bpp Active TFT */
+ GPIOxx_TFT_LCD_16BPP,
+ GPIO16_PWM0_OUT, /* Backlight */
+
+ /* MMC */
+ GPIO32_MMC_CLK,
+ GPIO112_MMC_CMD,
+ GPIO92_MMC_DAT_0,
+ GPIO109_MMC_DAT_1,
+ GPIO110_MMC_DAT_2,
+ GPIO111_MMC_DAT_3,
+
+ ...
+
+ /* GPIO */
+ GPIO1_GPIO | WAKEUP_ON_EDGE_BOTH,
+ };
+
+ a) once the pin configurations are passed to pxa{2xx,3xx}_mfp_config(),
+ and written to the actual registers, they are useless and may discard,
+ adding '__initdata' will help save some additional bytes here.
+
+ b) when there is only one possible pin configurations for a component,
+ some simplified definitions can be used, e.g. GPIOxx_TFT_LCD_16BPP on
+ PXA25x and PXA27x processors
+
+ c) if by board design, a pin can be configured to wake up the system
+ from low power state, it can be 'OR'ed with any of:
+
+ WAKEUP_ON_EDGE_BOTH
+ WAKEUP_ON_EDGE_RISE
+ WAKEUP_ON_EDGE_FALL
+ WAKEUP_ON_LEVEL_HIGH - specifically for enabling of keypad GPIOs,
+
+ to indicate that this pin has the capability of wake-up the system,
+ and on which edge(s). This, however, doesn't necessarily mean the
+ pin _will_ wakeup the system, it will only when set_irq_wake() is
+ invoked with the corresponding GPIO IRQ (GPIO_IRQ(xx) or gpio_to_irq())
+ and eventually calls gpio_set_wake() for the actual register setting.
+
+ d) although PXA3xx MFP supports edge detection on each pin, the
+ internal logic will only wakeup the system when those specific bits
+ in ADxER registers are set, which can be well mapped to the
+ corresponding peripheral, thus set_irq_wake() can be called with
+ the peripheral IRQ to enable the wakeup.
+
+
+MFP on PXA3xx
+=============
+
+Every external I/O pad on PXA3xx (excluding those for special purpose) has
+one MFP logic associated, and is controlled by one MFP register (MFPR).
+
+The MFPR has the following bit definitions (for PXA300/PXA310/PXA320)::
+
+ 31 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ | RESERVED |PS|PU|PD| DRIVE |SS|SD|SO|EC|EF|ER|--| AF_SEL |
+ +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+
+ Bit 3: RESERVED
+ Bit 4: EDGE_RISE_EN - enable detection of rising edge on this pin
+ Bit 5: EDGE_FALL_EN - enable detection of falling edge on this pin
+ Bit 6: EDGE_CLEAR - disable edge detection on this pin
+ Bit 7: SLEEP_OE_N - enable outputs during low power modes
+ Bit 8: SLEEP_DATA - output data on the pin during low power modes
+ Bit 9: SLEEP_SEL - selection control for low power modes signals
+ Bit 13: PULLDOWN_EN - enable the internal pull-down resistor on this pin
+ Bit 14: PULLUP_EN - enable the internal pull-up resistor on this pin
+ Bit 15: PULL_SEL - pull state controlled by selected alternate function
+ (0) or by PULL{UP,DOWN}_EN bits (1)
+
+ Bit 0 - 2: AF_SEL - alternate function selection, 8 possibilities, from 0-7
+ Bit 10-12: DRIVE - drive strength and slew rate
+ 0b000 - fast 1mA
+ 0b001 - fast 2mA
+ 0b002 - fast 3mA
+ 0b003 - fast 4mA
+ 0b004 - slow 6mA
+ 0b005 - fast 6mA
+ 0b006 - slow 10mA
+ 0b007 - fast 10mA
+
+MFP Design for PXA2xx/PXA3xx
+============================
+
+Due to the difference of pin-mux handling between PXA2xx and PXA3xx, a unified
+MFP API is introduced to cover both series of processors.
+
+The basic idea of this design is to introduce definitions for all possible pin
+configurations, these definitions are processor and platform independent, and
+the actual API invoked to convert these definitions into register settings and
+make them effective there-after.
+
+Files Involved
+--------------
+
+ - arch/arm/mach-pxa/include/mach/mfp.h
+
+ for
+ 1. Unified pin definitions - enum constants for all configurable pins
+ 2. processor-neutral bit definitions for a possible MFP configuration
+
+ - arch/arm/mach-pxa/mfp-pxa3xx.h
+
+ for PXA3xx specific MFPR register bit definitions and PXA3xx common pin
+ configurations
+
+ - arch/arm/mach-pxa/mfp-pxa2xx.h
+
+ for PXA2xx specific definitions and PXA25x/PXA27x common pin configurations
+
+ - arch/arm/mach-pxa/mfp-pxa25x.h
+ arch/arm/mach-pxa/mfp-pxa27x.h
+ arch/arm/mach-pxa/mfp-pxa300.h
+ arch/arm/mach-pxa/mfp-pxa320.h
+ arch/arm/mach-pxa/mfp-pxa930.h
+
+ for processor specific definitions
+
+ - arch/arm/mach-pxa/mfp-pxa3xx.c
+ - arch/arm/mach-pxa/mfp-pxa2xx.c
+
+ for implementation of the pin configuration to take effect for the actual
+ processor.
+
+Pin Configuration
+-----------------
+
+ The following comments are copied from mfp.h (see the actual source code
+ for most updated info)::
+
+ /*
+ * a possible MFP configuration is represented by a 32-bit integer
+ *
+ * bit 0.. 9 - MFP Pin Number (1024 Pins Maximum)
+ * bit 10..12 - Alternate Function Selection
+ * bit 13..15 - Drive Strength
+ * bit 16..18 - Low Power Mode State
+ * bit 19..20 - Low Power Mode Edge Detection
+ * bit 21..22 - Run Mode Pull State
+ *
+ * to facilitate the definition, the following macros are provided
+ *
+ * MFP_CFG_DEFAULT - default MFP configuration value, with
+ * alternate function = 0,
+ * drive strength = fast 3mA (MFP_DS03X)
+ * low power mode = default
+ * edge detection = none
+ *
+ * MFP_CFG - default MFPR value with alternate function
+ * MFP_CFG_DRV - default MFPR value with alternate function and
+ * pin drive strength
+ * MFP_CFG_LPM - default MFPR value with alternate function and
+ * low power mode
+ * MFP_CFG_X - default MFPR value with alternate function,
+ * pin drive strength and low power mode
+ */
+
+ Examples of pin configurations are::
+
+ #define GPIO94_SSP3_RXD MFP_CFG_X(GPIO94, AF1, DS08X, FLOAT)
+
+ which reads GPIO94 can be configured as SSP3_RXD, with alternate function
+ selection of 1, driving strength of 0b101, and a float state in low power
+ modes.
+
+ NOTE: this is the default setting of this pin being configured as SSP3_RXD
+ which can be modified a bit in board code, though it is not recommended to
+ do so, simply because this default setting is usually carefully encoded,
+ and is supposed to work in most cases.
+
+Register Settings
+-----------------
+
+ Register settings on PXA3xx for a pin configuration is actually very
+ straight-forward, most bits can be converted directly into MFPR value
+ in a easier way. Two sets of MFPR values are calculated: the run-time
+ ones and the low power mode ones, to allow different settings.
+
+ The conversion from a generic pin configuration to the actual register
+ settings on PXA2xx is a bit complicated: many registers are involved,
+ including GAFRx, GPDRx, PGSRx, PWER, PKWR, PFER and PRER. Please see
+ mfp-pxa2xx.c for how the conversion is made.
diff --git a/Documentation/arm/pxa/mfp.txt b/Documentation/arm/pxa/mfp.txt
deleted file mode 100644
index 0b7cab978c02..000000000000
--- a/Documentation/arm/pxa/mfp.txt
+++ /dev/null
@@ -1,286 +0,0 @@
- MFP Configuration for PXA2xx/PXA3xx Processors
-
- Eric Miao <eric.miao@marvell.com>
-
-MFP stands for Multi-Function Pin, which is the pin-mux logic on PXA3xx and
-later PXA series processors. This document describes the existing MFP API,
-and how board/platform driver authors could make use of it.
-
- Basic Concept
-===============
-
-Unlike the GPIO alternate function settings on PXA25x and PXA27x, a new MFP
-mechanism is introduced from PXA3xx to completely move the pin-mux functions
-out of the GPIO controller. In addition to pin-mux configurations, the MFP
-also controls the low power state, driving strength, pull-up/down and event
-detection of each pin. Below is a diagram of internal connections between
-the MFP logic and the remaining SoC peripherals:
-
- +--------+
- | |--(GPIO19)--+
- | GPIO | |
- | |--(GPIO...) |
- +--------+ |
- | +---------+
- +--------+ +------>| |
- | PWM2 |--(PWM_OUT)-------->| MFP |
- +--------+ +------>| |-------> to external PAD
- | +---->| |
- +--------+ | | +-->| |
- | SSP2 |---(TXD)----+ | | +---------+
- +--------+ | |
- | |
- +--------+ | |
- | Keypad |--(MKOUT4)----+ |
- +--------+ |
- |
- +--------+ |
- | UART2 |---(TXD)--------+
- +--------+
-
-NOTE: the external pad is named as MFP_PIN_GPIO19, it doesn't necessarily
-mean it's dedicated for GPIO19, only as a hint that internally this pin
-can be routed from GPIO19 of the GPIO controller.
-
-To better understand the change from PXA25x/PXA27x GPIO alternate function
-to this new MFP mechanism, here are several key points:
-
- 1. GPIO controller on PXA3xx is now a dedicated controller, same as other
- internal controllers like PWM, SSP and UART, with 128 internal signals
- which can be routed to external through one or more MFPs (e.g. GPIO<0>
- can be routed through either MFP_PIN_GPIO0 as well as MFP_PIN_GPIO0_2,
- see arch/arm/mach-pxa/mfp-pxa300.h)
-
- 2. Alternate function configuration is removed from this GPIO controller,
- the remaining functions are pure GPIO-specific, i.e.
-
- - GPIO signal level control
- - GPIO direction control
- - GPIO level change detection
-
- 3. Low power state for each pin is now controlled by MFP, this means the
- PGSRx registers on PXA2xx are now useless on PXA3xx
-
- 4. Wakeup detection is now controlled by MFP, PWER does not control the
- wakeup from GPIO(s) any more, depending on the sleeping state, ADxER
- (as defined in pxa3xx-regs.h) controls the wakeup from MFP
-
-NOTE: with such a clear separation of MFP and GPIO, by GPIO<xx> we normally
-mean it is a GPIO signal, and by MFP<xxx> or pin xxx, we mean a physical
-pad (or ball).
-
- MFP API Usage
-===============
-
-For board code writers, here are some guidelines:
-
-1. include ONE of the following header files in your <board>.c:
-
- - #include "mfp-pxa25x.h"
- - #include "mfp-pxa27x.h"
- - #include "mfp-pxa300.h"
- - #include "mfp-pxa320.h"
- - #include "mfp-pxa930.h"
-
- NOTE: only one file in your <board>.c, depending on the processors used,
- because pin configuration definitions may conflict in these file (i.e.
- same name, different meaning and settings on different processors). E.g.
- for zylonite platform, which support both PXA300/PXA310 and PXA320, two
- separate files are introduced: zylonite_pxa300.c and zylonite_pxa320.c
- (in addition to handle MFP configuration differences, they also handle
- the other differences between the two combinations).
-
- NOTE: PXA300 and PXA310 are almost identical in pin configurations (with
- PXA310 supporting some additional ones), thus the difference is actually
- covered in a single mfp-pxa300.h.
-
-2. prepare an array for the initial pin configurations, e.g.:
-
- static unsigned long mainstone_pin_config[] __initdata = {
- /* Chip Select */
- GPIO15_nCS_1,
-
- /* LCD - 16bpp Active TFT */
- GPIOxx_TFT_LCD_16BPP,
- GPIO16_PWM0_OUT, /* Backlight */
-
- /* MMC */
- GPIO32_MMC_CLK,
- GPIO112_MMC_CMD,
- GPIO92_MMC_DAT_0,
- GPIO109_MMC_DAT_1,
- GPIO110_MMC_DAT_2,
- GPIO111_MMC_DAT_3,
-
- ...
-
- /* GPIO */
- GPIO1_GPIO | WAKEUP_ON_EDGE_BOTH,
- };
-
- a) once the pin configurations are passed to pxa{2xx,3xx}_mfp_config(),
- and written to the actual registers, they are useless and may discard,
- adding '__initdata' will help save some additional bytes here.
-
- b) when there is only one possible pin configurations for a component,
- some simplified definitions can be used, e.g. GPIOxx_TFT_LCD_16BPP on
- PXA25x and PXA27x processors
-
- c) if by board design, a pin can be configured to wake up the system
- from low power state, it can be 'OR'ed with any of:
-
- WAKEUP_ON_EDGE_BOTH
- WAKEUP_ON_EDGE_RISE
- WAKEUP_ON_EDGE_FALL
- WAKEUP_ON_LEVEL_HIGH - specifically for enabling of keypad GPIOs,
-
- to indicate that this pin has the capability of wake-up the system,
- and on which edge(s). This, however, doesn't necessarily mean the
- pin _will_ wakeup the system, it will only when set_irq_wake() is
- invoked with the corresponding GPIO IRQ (GPIO_IRQ(xx) or gpio_to_irq())
- and eventually calls gpio_set_wake() for the actual register setting.
-
- d) although PXA3xx MFP supports edge detection on each pin, the
- internal logic will only wakeup the system when those specific bits
- in ADxER registers are set, which can be well mapped to the
- corresponding peripheral, thus set_irq_wake() can be called with
- the peripheral IRQ to enable the wakeup.
-
-
- MFP on PXA3xx
-===============
-
-Every external I/O pad on PXA3xx (excluding those for special purpose) has
-one MFP logic associated, and is controlled by one MFP register (MFPR).
-
-The MFPR has the following bit definitions (for PXA300/PXA310/PXA320):
-
- 31 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
- +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
- | RESERVED |PS|PU|PD| DRIVE |SS|SD|SO|EC|EF|ER|--| AF_SEL |
- +-------------------------+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-
- Bit 3: RESERVED
- Bit 4: EDGE_RISE_EN - enable detection of rising edge on this pin
- Bit 5: EDGE_FALL_EN - enable detection of falling edge on this pin
- Bit 6: EDGE_CLEAR - disable edge detection on this pin
- Bit 7: SLEEP_OE_N - enable outputs during low power modes
- Bit 8: SLEEP_DATA - output data on the pin during low power modes
- Bit 9: SLEEP_SEL - selection control for low power modes signals
- Bit 13: PULLDOWN_EN - enable the internal pull-down resistor on this pin
- Bit 14: PULLUP_EN - enable the internal pull-up resistor on this pin
- Bit 15: PULL_SEL - pull state controlled by selected alternate function
- (0) or by PULL{UP,DOWN}_EN bits (1)
-
- Bit 0 - 2: AF_SEL - alternate function selection, 8 possibilities, from 0-7
- Bit 10-12: DRIVE - drive strength and slew rate
- 0b000 - fast 1mA
- 0b001 - fast 2mA
- 0b002 - fast 3mA
- 0b003 - fast 4mA
- 0b004 - slow 6mA
- 0b005 - fast 6mA
- 0b006 - slow 10mA
- 0b007 - fast 10mA
-
- MFP Design for PXA2xx/PXA3xx
-==============================
-
-Due to the difference of pin-mux handling between PXA2xx and PXA3xx, a unified
-MFP API is introduced to cover both series of processors.
-
-The basic idea of this design is to introduce definitions for all possible pin
-configurations, these definitions are processor and platform independent, and
-the actual API invoked to convert these definitions into register settings and
-make them effective there-after.
-
- Files Involved
- --------------
-
- - arch/arm/mach-pxa/include/mach/mfp.h
-
- for
- 1. Unified pin definitions - enum constants for all configurable pins
- 2. processor-neutral bit definitions for a possible MFP configuration
-
- - arch/arm/mach-pxa/mfp-pxa3xx.h
-
- for PXA3xx specific MFPR register bit definitions and PXA3xx common pin
- configurations
-
- - arch/arm/mach-pxa/mfp-pxa2xx.h
-
- for PXA2xx specific definitions and PXA25x/PXA27x common pin configurations
-
- - arch/arm/mach-pxa/mfp-pxa25x.h
- arch/arm/mach-pxa/mfp-pxa27x.h
- arch/arm/mach-pxa/mfp-pxa300.h
- arch/arm/mach-pxa/mfp-pxa320.h
- arch/arm/mach-pxa/mfp-pxa930.h
-
- for processor specific definitions
-
- - arch/arm/mach-pxa/mfp-pxa3xx.c
- - arch/arm/mach-pxa/mfp-pxa2xx.c
-
- for implementation of the pin configuration to take effect for the actual
- processor.
-
- Pin Configuration
- -----------------
-
- The following comments are copied from mfp.h (see the actual source code
- for most updated info)
-
- /*
- * a possible MFP configuration is represented by a 32-bit integer
- *
- * bit 0.. 9 - MFP Pin Number (1024 Pins Maximum)
- * bit 10..12 - Alternate Function Selection
- * bit 13..15 - Drive Strength
- * bit 16..18 - Low Power Mode State
- * bit 19..20 - Low Power Mode Edge Detection
- * bit 21..22 - Run Mode Pull State
- *
- * to facilitate the definition, the following macros are provided
- *
- * MFP_CFG_DEFAULT - default MFP configuration value, with
- * alternate function = 0,
- * drive strength = fast 3mA (MFP_DS03X)
- * low power mode = default
- * edge detection = none
- *
- * MFP_CFG - default MFPR value with alternate function
- * MFP_CFG_DRV - default MFPR value with alternate function and
- * pin drive strength
- * MFP_CFG_LPM - default MFPR value with alternate function and
- * low power mode
- * MFP_CFG_X - default MFPR value with alternate function,
- * pin drive strength and low power mode
- */
-
- Examples of pin configurations are:
-
- #define GPIO94_SSP3_RXD MFP_CFG_X(GPIO94, AF1, DS08X, FLOAT)
-
- which reads GPIO94 can be configured as SSP3_RXD, with alternate function
- selection of 1, driving strength of 0b101, and a float state in low power
- modes.
-
- NOTE: this is the default setting of this pin being configured as SSP3_RXD
- which can be modified a bit in board code, though it is not recommended to
- do so, simply because this default setting is usually carefully encoded,
- and is supposed to work in most cases.
-
- Register Settings
- -----------------
-
- Register settings on PXA3xx for a pin configuration is actually very
- straight-forward, most bits can be converted directly into MFPR value
- in a easier way. Two sets of MFPR values are calculated: the run-time
- ones and the low power mode ones, to allow different settings.
-
- The conversion from a generic pin configuration to the actual register
- settings on PXA2xx is a bit complicated: many registers are involved,
- including GAFRx, GPDRx, PGSRx, PWER, PKWR, PFER and PRER. Please see
- mfp-pxa2xx.c for how the conversion is made.
diff --git a/Documentation/arm/sa1100/adsbitsy.rst b/Documentation/arm/sa1100/adsbitsy.rst
new file mode 100644
index 000000000000..c179cb26b682
--- /dev/null
+++ b/Documentation/arm/sa1100/adsbitsy.rst
@@ -0,0 +1,51 @@
+===============================
+ADS Bitsy Single Board Computer
+===============================
+
+(It is different from Bitsy(iPAQ) of Compaq)
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The Linux support for this product has been provided by
+Woojung Huh <whuh@applieddata.net>
+
+Use 'make adsbitsy_config' before any 'make config'.
+This will set up defaults for ADS Bitsy support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0400000.
+
+Linux can be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+
+Supported peripherals
+=====================
+
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- SA1111 USB Master
+- SA1100 serial port
+- pcmcia, compact flash
+- touchscreen(ucb1200)
+- console on LCD screen
+- serial ports (ttyS[0-2])
+ - ttyS0 is default for serial console
+
+To do
+=====
+
+- everything else! :-)
+
+Notes
+=====
+
+- The flash on board is divided into 3 partitions.
+ You should be careful to use flash on board.
+ Its partition is different from GraphicsClient Plus and GraphicsMaster
+
+- 16bpp mode requires a different cable than what ships with the board.
+ Contact ADS or look through the manual to wire your own. Currently,
+ if you compile with 16bit mode support and switch into a lower bpp
+ mode, the timing is off so the image is corrupted. This will be
+ fixed soon.
+
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/sa1100/assabet.rst b/Documentation/arm/sa1100/assabet.rst
new file mode 100644
index 000000000000..3e704831c311
--- /dev/null
+++ b/Documentation/arm/sa1100/assabet.rst
@@ -0,0 +1,301 @@
+============================================
+The Intel Assabet (SA-1110 evaluation) board
+============================================
+
+Please see:
+http://developer.intel.com
+
+Also some notes from John G Dorsey <jd5q@andrew.cmu.edu>:
+http://www.cs.cmu.edu/~wearable/software/assabet.html
+
+
+Building the kernel
+-------------------
+
+To build the kernel with current defaults::
+
+ make assabet_config
+ make oldconfig
+ make zImage
+
+The resulting kernel image should be available in linux/arch/arm/boot/zImage.
+
+
+Installing a bootloader
+-----------------------
+
+A couple of bootloaders able to boot Linux on Assabet are available:
+
+BLOB (http://www.lartmaker.nl/lartware/blob/)
+
+ BLOB is a bootloader used within the LART project. Some contributed
+ patches were merged into BLOB to add support for Assabet.
+
+Compaq's Bootldr + John Dorsey's patch for Assabet support
+(http://www.handhelds.org/Compaq/bootldr.html)
+(http://www.wearablegroup.org/software/bootldr/)
+
+ Bootldr is the bootloader developed by Compaq for the iPAQ Pocket PC.
+ John Dorsey has produced add-on patches to add support for Assabet and
+ the JFFS filesystem.
+
+RedBoot (http://sources.redhat.com/redboot/)
+
+ RedBoot is a bootloader developed by Red Hat based on the eCos RTOS
+ hardware abstraction layer. It supports Assabet amongst many other
+ hardware platforms.
+
+RedBoot is currently the recommended choice since it's the only one to have
+networking support, and is the most actively maintained.
+
+Brief examples on how to boot Linux with RedBoot are shown below. But first
+you need to have RedBoot installed in your flash memory. A known to work
+precompiled RedBoot binary is available from the following location:
+
+- ftp://ftp.netwinder.org/users/n/nico/
+- ftp://ftp.arm.linux.org.uk/pub/linux/arm/people/nico/
+- ftp://ftp.handhelds.org/pub/linux/arm/sa-1100-patches/
+
+Look for redboot-assabet*.tgz. Some installation infos are provided in
+redboot-assabet*.txt.
+
+
+Initial RedBoot configuration
+-----------------------------
+
+The commands used here are explained in The RedBoot User's Guide available
+on-line at http://sources.redhat.com/ecos/docs.html.
+Please refer to it for explanations.
+
+If you have a CF network card (my Assabet kit contained a CF+ LP-E from
+Socket Communications Inc.), you should strongly consider using it for TFTP
+file transfers. You must insert it before RedBoot runs since it can't detect
+it dynamically.
+
+To initialize the flash directory::
+
+ fis init -f
+
+To initialize the non-volatile settings, like whether you want to use BOOTP or
+a static IP address, etc, use this command::
+
+ fconfig -i
+
+
+Writing a kernel image into flash
+---------------------------------
+
+First, the kernel image must be loaded into RAM. If you have the zImage file
+available on a TFTP server::
+
+ load zImage -r -b 0x100000
+
+If you rather want to use Y-Modem upload over the serial port::
+
+ load -m ymodem -r -b 0x100000
+
+To write it to flash::
+
+ fis create "Linux kernel" -b 0x100000 -l 0xc0000
+
+
+Booting the kernel
+------------------
+
+The kernel still requires a filesystem to boot. A ramdisk image can be loaded
+as follows::
+
+ load ramdisk_image.gz -r -b 0x800000
+
+Again, Y-Modem upload can be used instead of TFTP by replacing the file name
+by '-y ymodem'.
+
+Now the kernel can be retrieved from flash like this::
+
+ fis load "Linux kernel"
+
+or loaded as described previously. To boot the kernel::
+
+ exec -b 0x100000 -l 0xc0000
+
+The ramdisk image could be stored into flash as well, but there are better
+solutions for on-flash filesystems as mentioned below.
+
+
+Using JFFS2
+-----------
+
+Using JFFS2 (the Second Journalling Flash File System) is probably the most
+convenient way to store a writable filesystem into flash. JFFS2 is used in
+conjunction with the MTD layer which is responsible for low-level flash
+management. More information on the Linux MTD can be found on-line at:
+http://www.linux-mtd.infradead.org/. A JFFS howto with some infos about
+creating JFFS/JFFS2 images is available from the same site.
+
+For instance, a sample JFFS2 image can be retrieved from the same FTP sites
+mentioned below for the precompiled RedBoot image.
+
+To load this file::
+
+ load sample_img.jffs2 -r -b 0x100000
+
+The result should look like::
+
+ RedBoot> load sample_img.jffs2 -r -b 0x100000
+ Raw file loaded 0x00100000-0x00377424
+
+Now we must know the size of the unallocated flash::
+
+ fis free
+
+Result::
+
+ RedBoot> fis free
+ 0x500E0000 .. 0x503C0000
+
+The values above may be different depending on the size of the filesystem and
+the type of flash. See their usage below as an example and take care of
+substituting yours appropriately.
+
+We must determine some values::
+
+ size of unallocated flash: 0x503c0000 - 0x500e0000 = 0x2e0000
+ size of the filesystem image: 0x00377424 - 0x00100000 = 0x277424
+
+We want to fit the filesystem image of course, but we also want to give it all
+the remaining flash space as well. To write it::
+
+ fis unlock -f 0x500E0000 -l 0x2e0000
+ fis erase -f 0x500E0000 -l 0x2e0000
+ fis write -b 0x100000 -l 0x277424 -f 0x500E0000
+ fis create "JFFS2" -n -f 0x500E0000 -l 0x2e0000
+
+Now the filesystem is associated to a MTD "partition" once Linux has discovered
+what they are in the boot process. From Redboot, the 'fis list' command
+displays them::
+
+ RedBoot> fis list
+ Name FLASH addr Mem addr Length Entry point
+ RedBoot 0x50000000 0x50000000 0x00020000 0x00000000
+ RedBoot config 0x503C0000 0x503C0000 0x00020000 0x00000000
+ FIS directory 0x503E0000 0x503E0000 0x00020000 0x00000000
+ Linux kernel 0x50020000 0x00100000 0x000C0000 0x00000000
+ JFFS2 0x500E0000 0x500E0000 0x002E0000 0x00000000
+
+However Linux should display something like::
+
+ SA1100 flash: probing 32-bit flash bus
+ SA1100 flash: Found 2 x16 devices at 0x0 in 32-bit mode
+ Using RedBoot partition definition
+ Creating 5 MTD partitions on "SA1100 flash":
+ 0x00000000-0x00020000 : "RedBoot"
+ 0x00020000-0x000e0000 : "Linux kernel"
+ 0x000e0000-0x003c0000 : "JFFS2"
+ 0x003c0000-0x003e0000 : "RedBoot config"
+ 0x003e0000-0x00400000 : "FIS directory"
+
+What's important here is the position of the partition we are interested in,
+which is the third one. Within Linux, this correspond to /dev/mtdblock2.
+Therefore to boot Linux with the kernel and its root filesystem in flash, we
+need this RedBoot command::
+
+ fis load "Linux kernel"
+ exec -b 0x100000 -l 0xc0000 -c "root=/dev/mtdblock2"
+
+Of course other filesystems than JFFS might be used, like cramfs for example.
+You might want to boot with a root filesystem over NFS, etc. It is also
+possible, and sometimes more convenient, to flash a filesystem directly from
+within Linux while booted from a ramdisk or NFS. The Linux MTD repository has
+many tools to deal with flash memory as well, to erase it for example. JFFS2
+can then be mounted directly on a freshly erased partition and files can be
+copied over directly. Etc...
+
+
+RedBoot scripting
+-----------------
+
+All the commands above aren't so useful if they have to be typed in every
+time the Assabet is rebooted. Therefore it's possible to automate the boot
+process using RedBoot's scripting capability.
+
+For example, I use this to boot Linux with both the kernel and the ramdisk
+images retrieved from a TFTP server on the network::
+
+ RedBoot> fconfig
+ Run script at boot: false true
+ Boot script:
+ Enter script, terminate with empty line
+ >> load zImage -r -b 0x100000
+ >> load ramdisk_ks.gz -r -b 0x800000
+ >> exec -b 0x100000 -l 0xc0000
+ >>
+ Boot script timeout (1000ms resolution): 3
+ Use BOOTP for network configuration: true
+ GDB connection port: 9000
+ Network debug at boot time: false
+ Update RedBoot non-volatile configuration - are you sure (y/n)? y
+
+Then, rebooting the Assabet is just a matter of waiting for the login prompt.
+
+
+
+Nicolas Pitre
+nico@fluxnic.net
+
+June 12, 2001
+
+
+Status of peripherals in -rmk tree (updated 14/10/2001)
+-------------------------------------------------------
+
+Assabet:
+ Serial ports:
+ Radio: TX, RX, CTS, DSR, DCD, RI
+ - PM: Not tested.
+ - COM: TX, RX, CTS, DSR, DCD, RTS, DTR, PM
+ - PM: Not tested.
+ - I2C: Implemented, not fully tested.
+ - L3: Fully tested, pass.
+ - PM: Not tested.
+
+ Video:
+ - LCD: Fully tested. PM
+
+ (LCD doesn't like being blanked with neponset connected)
+
+ - Video out: Not fully
+
+ Audio:
+ UDA1341:
+ - Playback: Fully tested, pass.
+ - Record: Implemented, not tested.
+ - PM: Not tested.
+
+ UCB1200:
+ - Audio play: Implemented, not heavily tested.
+ - Audio rec: Implemented, not heavily tested.
+ - Telco audio play: Implemented, not heavily tested.
+ - Telco audio rec: Implemented, not heavily tested.
+ - POTS control: No
+ - Touchscreen: Yes
+ - PM: Not tested.
+
+ Other:
+ - PCMCIA:
+ - LPE: Fully tested, pass.
+ - USB: No
+ - IRDA:
+ - SIR: Fully tested, pass.
+ - FIR: Fully tested, pass.
+ - PM: Not tested.
+
+Neponset:
+ Serial ports:
+ - COM1,2: TX, RX, CTS, DSR, DCD, RTS, DTR
+ - PM: Not tested.
+ - USB: Implemented, not heavily tested.
+ - PCMCIA: Implemented, not heavily tested.
+ - CF: Implemented, not heavily tested.
+ - PM: Not tested.
+
+More stuff can be found in the -np (Nicolas Pitre's) tree.
diff --git a/Documentation/arm/sa1100/brutus.rst b/Documentation/arm/sa1100/brutus.rst
new file mode 100644
index 000000000000..e1a23bee6d44
--- /dev/null
+++ b/Documentation/arm/sa1100/brutus.rst
@@ -0,0 +1,69 @@
+======
+Brutus
+======
+
+Brutus is an evaluation platform for the SA1100 manufactured by Intel.
+For more details, see:
+
+http://developer.intel.com
+
+To compile for Brutus, you must issue the following commands::
+
+ make brutus_config
+ make config
+ [accept all the defaults]
+ make zImage
+
+The resulting kernel will end up in linux/arch/arm/boot/zImage. This file
+must be loaded at 0xc0008000 in Brutus's memory and execution started at
+0xc0008000 as well with the value of registers r0 = 0 and r1 = 16 upon
+entry.
+
+But prior to execute the kernel, a ramdisk image must also be loaded in
+memory. Use memory address 0xd8000000 for this. Note that the file
+containing the (compressed) ramdisk image must not exceed 4 MB.
+
+Typically, you'll need angelboot to load the kernel.
+The following angelboot.opt file should be used::
+
+ base 0xc0008000
+ entry 0xc0008000
+ r0 0x00000000
+ r1 0x00000010
+ device /dev/ttyS0
+ options "9600 8N1"
+ baud 115200
+ otherfile ramdisk_img.gz
+ otherbase 0xd8000000
+
+Then load the kernel and ramdisk with::
+
+ angelboot -f angelboot.opt zImage
+
+The first Brutus serial port (assumed to be linked to /dev/ttyS0 on your
+host PC) is used by angel to load the kernel and ramdisk image. The serial
+console is provided through the second Brutus serial port. To access it,
+you may use minicom configured with /dev/ttyS1, 9600 baud, 8N1, no flow
+control.
+
+Currently supported
+===================
+
+ - RS232 serial ports
+ - audio output
+ - LCD screen
+ - keyboard
+
+The actual Brutus support may not be complete without extra patches.
+If such patches exist, they should be found from
+ftp.netwinder.org/users/n/nico.
+
+A full PCMCIA support is still missing, although it's possible to hack
+some drivers in order to drive already inserted cards at boot time with
+little modifications.
+
+Any contribution is welcome.
+
+Please send patches to nico@fluxnic.net
+
+Have Fun !
diff --git a/Documentation/arm/sa1100/cerf.rst b/Documentation/arm/sa1100/cerf.rst
new file mode 100644
index 000000000000..7fa71b609bf9
--- /dev/null
+++ b/Documentation/arm/sa1100/cerf.rst
@@ -0,0 +1,35 @@
+==============
+CerfBoard/Cube
+==============
+
+*** The StrongARM version of the CerfBoard/Cube has been discontinued ***
+
+The Intrinsyc CerfBoard is a StrongARM 1110-based computer on a board
+that measures approximately 2" square. It includes an Ethernet
+controller, an RS232-compatible serial port, a USB function port, and
+one CompactFlash+ slot on the back. Pictures can be found at the
+Intrinsyc website, http://www.intrinsyc.com.
+
+This document describes the support in the Linux kernel for the
+Intrinsyc CerfBoard.
+
+Supported in this version
+=========================
+
+ - CompactFlash+ slot (select PCMCIA in General Setup and any options
+ that may be required)
+ - Onboard Crystal CS8900 Ethernet controller (Cerf CS8900A support in
+ Network Devices)
+ - Serial ports with a serial console (hardcoded to 38400 8N1)
+
+In order to get this kernel onto your Cerf, you need a server that runs
+both BOOTP and TFTP. Detailed instructions should have come with your
+evaluation kit on how to use the bootloader. This series of commands
+will suffice::
+
+ make ARCH=arm CROSS_COMPILE=arm-linux- cerfcube_defconfig
+ make ARCH=arm CROSS_COMPILE=arm-linux- zImage
+ make ARCH=arm CROSS_COMPILE=arm-linux- modules
+ cp arch/arm/boot/zImage <TFTP directory>
+
+support@intrinsyc.com
diff --git a/Documentation/arm/sa1100/freebird.rst b/Documentation/arm/sa1100/freebird.rst
new file mode 100644
index 000000000000..81043d0c6d64
--- /dev/null
+++ b/Documentation/arm/sa1100/freebird.rst
@@ -0,0 +1,25 @@
+========
+Freebird
+========
+
+Freebird-1.1 is produced by Legend(C), Inc.
+`http://web.archive.org/web/*/http://www.legend.com.cn`
+and software/linux maintained by Coventive(C), Inc.
+(http://www.coventive.com)
+
+Based on the Nicolas's strongarm kernel tree.
+
+Maintainer:
+
+Chester Kuo
+ - <chester@coventive.com>
+ - <chester@linux.org.tw>
+
+Author:
+
+- Tim wu <timwu@coventive.com>
+- CIH <cih@coventive.com>
+- Eric Peng <ericpeng@coventive.com>
+- Jeff Lee <jeff_lee@coventive.com>
+- Allen Cheng
+- Tony Liu <tonyliu@coventive.com>
diff --git a/Documentation/arm/sa1100/graphicsclient.rst b/Documentation/arm/sa1100/graphicsclient.rst
new file mode 100644
index 000000000000..a73d61c3ce91
--- /dev/null
+++ b/Documentation/arm/sa1100/graphicsclient.rst
@@ -0,0 +1,102 @@
+=============================================
+ADS GraphicsClient Plus Single Board Computer
+=============================================
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The original Linux support for this product has been provided by
+Nicolas Pitre <nico@fluxnic.net>. Continued development work by
+Woojung Huh <whuh@applieddata.net>
+
+It's currently possible to mount a root filesystem via NFS providing a
+complete Linux environment. Otherwise a ramdisk image may be used. The
+board supports MTD/JFFS, so you could also mount something on there.
+
+Use 'make graphicsclient_config' before any 'make config'. This will set up
+defaults for GraphicsClient Plus support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0200000.
+Also the following registers should have the specified values upon entry::
+
+ r0 = 0
+ r1 = 29 (this is the GraphicsClient architecture number)
+
+Linux can be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+Angel is not available for the GraphicsClient Plus AFAIK.
+
+There is a board known as just the GraphicsClient that ADS used to
+produce but has end of lifed. This code will not work on the older
+board with the ADS bootloader, but should still work with Angel,
+as outlined below. In any case, if you're planning on deploying
+something en masse, you should probably get the newer board.
+
+If using Angel on the older boards, here is a typical angel.opt option file
+if the kernel is loaded through the Angel Debug Monitor::
+
+ base 0xc0200000
+ entry 0xc0200000
+ r0 0x00000000
+ r1 0x0000001d
+ device /dev/ttyS1
+ options "38400 8N1"
+ baud 115200
+ #otherfile ramdisk.gz
+ #otherbase 0xc0800000
+ exec minicom
+
+Then the kernel (and ramdisk if otherfile/otherbase lines above are
+uncommented) would be loaded with::
+
+ angelboot -f angelboot.opt zImage
+
+Here it is assumed that the board is connected to ttyS1 on your PC
+and that minicom is preconfigured with /dev/ttyS1, 38400 baud, 8N1, no flow
+control by default.
+
+If any other bootloader is used, ensure it accomplish the same, especially
+for r0/r1 register values before jumping into the kernel.
+
+
+Supported peripherals
+=====================
+
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- on-board SMC 92C96 ethernet NIC
+- SA1100 serial port
+- flash memory access (MTD/JFFS)
+- pcmcia
+- touchscreen(ucb1200)
+- ps/2 keyboard
+- console on LCD screen
+- serial ports (ttyS[0-2])
+ - ttyS0 is default for serial console
+- Smart I/O (ADC, keypad, digital inputs, etc)
+ See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
+ and example user space code. ps/2 keybd is multiplexed through this driver
+
+To do
+=====
+
+- UCB1200 audio with new ucb_generic layer
+- everything else! :-)
+
+Notes
+=====
+
+- The flash on board is divided into 3 partitions. mtd0 is where
+ the ADS boot ROM and zImage is stored. It's been marked as
+ read-only to keep you from blasting over the bootloader. :) mtd1 is
+ for the ramdisk.gz image. mtd2 is user flash space and can be
+ utilized for either JFFS or if you're feeling crazy, running ext2
+ on top of it. If you're not using the ADS bootloader, you're
+ welcome to blast over the mtd1 partition also.
+
+- 16bpp mode requires a different cable than what ships with the board.
+ Contact ADS or look through the manual to wire your own. Currently,
+ if you compile with 16bit mode support and switch into a lower bpp
+ mode, the timing is off so the image is corrupted. This will be
+ fixed soon.
+
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/sa1100/graphicsmaster.rst b/Documentation/arm/sa1100/graphicsmaster.rst
new file mode 100644
index 000000000000..e39892514f0c
--- /dev/null
+++ b/Documentation/arm/sa1100/graphicsmaster.rst
@@ -0,0 +1,60 @@
+========================================
+ADS GraphicsMaster Single Board Computer
+========================================
+
+For more details, contact Applied Data Systems or see
+http://www.applieddata.net/products.html
+
+The original Linux support for this product has been provided by
+Nicolas Pitre <nico@fluxnic.net>. Continued development work by
+Woojung Huh <whuh@applieddata.net>
+
+Use 'make graphicsmaster_config' before any 'make config'.
+This will set up defaults for GraphicsMaster support.
+
+The kernel zImage is linked to be loaded and executed at 0xc0400000.
+
+Linux can be used with the ADS BootLoader that ships with the
+newer rev boards. See their documentation on how to load Linux.
+
+Supported peripherals
+=====================
+
+- SA1100 LCD frame buffer (8/16bpp...sort of)
+- SA1111 USB Master
+- on-board SMC 92C96 ethernet NIC
+- SA1100 serial port
+- flash memory access (MTD/JFFS)
+- pcmcia, compact flash
+- touchscreen(ucb1200)
+- ps/2 keyboard
+- console on LCD screen
+- serial ports (ttyS[0-2])
+ - ttyS0 is default for serial console
+- Smart I/O (ADC, keypad, digital inputs, etc)
+ See http://www.eurotech-inc.com/linux-sbc.asp for IOCTL documentation
+ and example user space code. ps/2 keybd is multiplexed through this driver
+
+To do
+=====
+
+- everything else! :-)
+
+Notes
+=====
+
+- The flash on board is divided into 3 partitions. mtd0 is where
+ the zImage is stored. It's been marked as read-only to keep you
+ from blasting over the bootloader. :) mtd1 is
+ for the ramdisk.gz image. mtd2 is user flash space and can be
+ utilized for either JFFS or if you're feeling crazy, running ext2
+ on top of it. If you're not using the ADS bootloader, you're
+ welcome to blast over the mtd1 partition also.
+
+- 16bpp mode requires a different cable than what ships with the board.
+ Contact ADS or look through the manual to wire your own. Currently,
+ if you compile with 16bit mode support and switch into a lower bpp
+ mode, the timing is off so the image is corrupted. This will be
+ fixed soon.
+
+Any contribution can be sent to nico@fluxnic.net and will be greatly welcome!
diff --git a/Documentation/arm/sa1100/huw_webpanel.rst b/Documentation/arm/sa1100/huw_webpanel.rst
new file mode 100644
index 000000000000..1dc7ccb165f0
--- /dev/null
+++ b/Documentation/arm/sa1100/huw_webpanel.rst
@@ -0,0 +1,21 @@
+=======================
+Hoeft & Wessel Webpanel
+=======================
+
+The HUW_WEBPANEL is a product of the german company Hoeft & Wessel AG
+
+If you want more information, please visit
+http://www.hoeft-wessel.de
+
+To build the kernel::
+
+ make huw_webpanel_config
+ make oldconfig
+ [accept all defaults]
+ make zImage
+
+Mostly of the work is done by:
+Roman Jordan jor@hoeft-wessel.de
+Christoph Schulz schu@hoeft-wessel.de
+
+2000/12/18/
diff --git a/Documentation/arm/sa1100/index.rst b/Documentation/arm/sa1100/index.rst
new file mode 100644
index 000000000000..68c2a280a745
--- /dev/null
+++ b/Documentation/arm/sa1100/index.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================
+Intel StrongARM 1100
+====================
+
+.. toctree::
+ :maxdepth: 1
+
+ adsbitsy
+ assabet
+ brutus
+ cerf
+ freebird
+ graphicsclient
+ graphicsmaster
+ huw_webpanel
+ itsy
+ lart
+ nanoengine
+ pangolin
+ pleb
+ serial_uart
+ tifon
+ yopy
diff --git a/Documentation/arm/sa1100/itsy.rst b/Documentation/arm/sa1100/itsy.rst
new file mode 100644
index 000000000000..f49896ba3ef1
--- /dev/null
+++ b/Documentation/arm/sa1100/itsy.rst
@@ -0,0 +1,47 @@
+====
+Itsy
+====
+
+Itsy is a research project done by the Western Research Lab, and Systems
+Research Center in Palo Alto, CA. The Itsy project is one of several
+research projects at Compaq that are related to pocket computing.
+
+For more information, see:
+
+ http://www.hpl.hp.com/downloads/crl/itsy/
+
+Notes on initial 2.4 Itsy support (8/27/2000) :
+
+The port was done on an Itsy version 1.5 machine with a daughtercard with
+64 Meg of DRAM and 32 Meg of Flash. The initial work includes support for
+serial console (to see what you're doing). No other devices have been
+enabled.
+
+To build, do a "make menuconfig" (or xmenuconfig) and select Itsy support.
+Disable Flash and LCD support. and then do a make zImage.
+Finally, you will need to cd to arch/arm/boot/tools and execute a make there
+to build the params-itsy program used to boot the kernel.
+
+In order to install the port of 2.4 to the itsy, You will need to set the
+configuration parameters in the monitor as follows::
+
+ Arg 1:0x08340000, Arg2: 0xC0000000, Arg3:18 (0x12), Arg4:0
+
+Make sure the start-routine address is set to 0x00060000.
+
+Next, flash the params-itsy program to 0x00060000 ("p 1 0x00060000" in the
+flash menu) Flash the kernel in arch/arm/boot/zImage into 0x08340000
+("p 1 0x00340000"). Finally flash an initial ramdisk into 0xC8000000
+("p 2 0x0") We used ramdisk-2-30.gz from the 0.11 version directory on
+handhelds.org.
+
+The serial connection we established was at:
+
+8-bit data, no parity, 1 stop bit(s), 115200.00 b/s. in the monitor, in the
+params-itsy program, and in the kernel itself. This can be changed, but
+not easily. The monitor parameters are easily changed, the params program
+setup is assembly outl's, and the kernel is a configuration item specific to
+the itsy. (i.e. grep for CONFIG_SA1100_ITSY and you'll find where it is.)
+
+
+This should get you a properly booting 2.4 kernel on the itsy.
diff --git a/Documentation/arm/sa1100/lart.rst b/Documentation/arm/sa1100/lart.rst
new file mode 100644
index 000000000000..94c0568d1095
--- /dev/null
+++ b/Documentation/arm/sa1100/lart.rst
@@ -0,0 +1,15 @@
+====================================
+Linux Advanced Radio Terminal (LART)
+====================================
+
+The LART is a small (7.5 x 10cm) SA-1100 board, designed for embedded
+applications. It has 32 MB DRAM, 4MB Flash ROM, double RS232 and all
+other StrongARM-gadgets. Almost all SA signals are directly accessible
+through a number of connectors. The powersupply accepts voltages
+between 3.5V and 16V and is overdimensioned to support a range of
+daughterboards. A quad Ethernet / IDE / PS2 / sound daughterboard
+is under development, with plenty of others in different stages of
+planning.
+
+The hardware designs for this board have been released under an open license;
+see the LART page at http://www.lartmaker.nl/ for more information.
diff --git a/Documentation/arm/sa1100/nanoengine.rst b/Documentation/arm/sa1100/nanoengine.rst
new file mode 100644
index 000000000000..47f1a14cf98a
--- /dev/null
+++ b/Documentation/arm/sa1100/nanoengine.rst
@@ -0,0 +1,11 @@
+==========
+nanoEngine
+==========
+
+"nanoEngine" is a SA1110 based single board computer from
+Bright Star Engineering Inc. See www.brightstareng.com/arm
+for more info.
+(Ref: Stuart Adams <sja@brightstareng.com>)
+
+Also visit Larry Doolittle's "Linux for the nanoEngine" site:
+http://www.brightstareng.com/arm/nanoeng.htm
diff --git a/Documentation/arm/sa1100/pangolin.rst b/Documentation/arm/sa1100/pangolin.rst
new file mode 100644
index 000000000000..f0c5c1618553
--- /dev/null
+++ b/Documentation/arm/sa1100/pangolin.rst
@@ -0,0 +1,29 @@
+========
+Pangolin
+========
+
+Pangolin is a StrongARM 1110-based evaluation platform produced
+by Dialogue Technology (http://www.dialogue.com.tw/).
+It has EISA slots for ease of configuration with SDRAM/Flash
+memory card, USB/Serial/Audio card, Compact Flash card,
+PCMCIA/IDE card and TFT-LCD card.
+
+To compile for Pangolin, you must issue the following commands::
+
+ make pangolin_config
+ make oldconfig
+ make zImage
+
+Supported peripherals
+=====================
+
+- SA1110 serial port (UART1/UART2/UART3)
+- flash memory access
+- compact flash driver
+- UDA1341 sound driver
+- SA1100 LCD controller for 800x600 16bpp TFT-LCD
+- MQ-200 driver for 800x600 16bpp TFT-LCD
+- Penmount(touch panel) driver
+- PCMCIA driver
+- SMC91C94 LAN driver
+- IDE driver (experimental)
diff --git a/Documentation/arm/sa1100/pleb.rst b/Documentation/arm/sa1100/pleb.rst
new file mode 100644
index 000000000000..d5b732967aa3
--- /dev/null
+++ b/Documentation/arm/sa1100/pleb.rst
@@ -0,0 +1,13 @@
+====
+PLEB
+====
+
+The PLEB project was started as a student initiative at the School of
+Computer Science and Engineering, University of New South Wales to make a
+pocket computer capable of running the Linux Kernel.
+
+PLEB support has yet to be fully integrated.
+
+For more information, see:
+
+ http://www.cse.unsw.edu.au
diff --git a/Documentation/arm/sa1100/serial_uart.rst b/Documentation/arm/sa1100/serial_uart.rst
new file mode 100644
index 000000000000..ea983642b9be
--- /dev/null
+++ b/Documentation/arm/sa1100/serial_uart.rst
@@ -0,0 +1,51 @@
+==================
+SA1100 serial port
+==================
+
+The SA1100 serial port had its major/minor numbers officially assigned::
+
+ > Date: Sun, 24 Sep 2000 21:40:27 -0700
+ > From: H. Peter Anvin <hpa@transmeta.com>
+ > To: Nicolas Pitre <nico@CAM.ORG>
+ > Cc: Device List Maintainer <device@lanana.org>
+ > Subject: Re: device
+ >
+ > Okay. Note that device numbers 204 and 205 are used for "low density
+ > serial devices", so you will have a range of minors on those majors (the
+ > tty device layer handles this just fine, so you don't have to worry about
+ > doing anything special.)
+ >
+ > So your assignments are:
+ >
+ > 204 char Low-density serial ports
+ > 5 = /dev/ttySA0 SA1100 builtin serial port 0
+ > 6 = /dev/ttySA1 SA1100 builtin serial port 1
+ > 7 = /dev/ttySA2 SA1100 builtin serial port 2
+ >
+ > 205 char Low-density serial ports (alternate device)
+ > 5 = /dev/cusa0 Callout device for ttySA0
+ > 6 = /dev/cusa1 Callout device for ttySA1
+ > 7 = /dev/cusa2 Callout device for ttySA2
+ >
+
+You must create those inodes in /dev on the root filesystem used
+by your SA1100-based device::
+
+ mknod ttySA0 c 204 5
+ mknod ttySA1 c 204 6
+ mknod ttySA2 c 204 7
+ mknod cusa0 c 205 5
+ mknod cusa1 c 205 6
+ mknod cusa2 c 205 7
+
+In addition to the creation of the appropriate device nodes above, you
+must ensure your user space applications make use of the correct device
+name. The classic example is the content of the /etc/inittab file where
+you might have a getty process started on ttyS0.
+
+In this case:
+
+- replace occurrences of ttyS0 with ttySA0, ttyS1 with ttySA1, etc.
+
+- don't forget to add 'ttySA0', 'console', or the appropriate tty name
+ in /etc/securetty for root to be allowed to login as well.
diff --git a/Documentation/arm/sa1100/tifon.rst b/Documentation/arm/sa1100/tifon.rst
new file mode 100644
index 000000000000..c26e910b9ea7
--- /dev/null
+++ b/Documentation/arm/sa1100/tifon.rst
@@ -0,0 +1,7 @@
+=====
+Tifon
+=====
+
+More info has to come...
+
+Contact: Peter Danielsson <peter.danielsson@era-t.ericsson.se>
diff --git a/Documentation/arm/sa1100/yopy.rst b/Documentation/arm/sa1100/yopy.rst
new file mode 100644
index 000000000000..5b35a5f61a44
--- /dev/null
+++ b/Documentation/arm/sa1100/yopy.rst
@@ -0,0 +1,5 @@
+====
+Yopy
+====
+
+See http://www.yopydeveloper.org for more.
diff --git a/Documentation/arm/samsung-s3c24xx/cpufreq.rst b/Documentation/arm/samsung-s3c24xx/cpufreq.rst
new file mode 100644
index 000000000000..2ddc26c03b1f
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/cpufreq.rst
@@ -0,0 +1,76 @@
+=======================
+S3C24XX CPUfreq support
+=======================
+
+Introduction
+------------
+
+ The S3C24XX series support a number of power saving systems, such as
+ the ability to change the core, memory and peripheral operating
+ frequencies. The core control is exported via the CPUFreq driver
+ which has a number of different manual or automatic controls over the
+ rate the core is running at.
+
+ There are two forms of the driver depending on the specific CPU and
+ how the clocks are arranged. The first implementation used as single
+ PLL to feed the ARM, memory and peripherals via a series of dividers
+ and muxes and this is the implementation that is documented here. A
+ newer version where there is a separate PLL and clock divider for the
+ ARM core is available as a separate driver.
+
+
+Layout
+------
+
+ The code core manages the CPU specific drivers, any data that they
+ need to register and the interface to the generic drivers/cpufreq
+ system. Each CPU registers a driver to control the PLL, clock dividers
+ and anything else associated with it. Any board that wants to use this
+ framework needs to supply at least basic details of what is required.
+
+ The core registers with drivers/cpufreq at init time if all the data
+ necessary has been supplied.
+
+
+CPU support
+-----------
+
+ The support for each CPU depends on the facilities provided by the
+ SoC and the driver as each device has different PLL and clock chains
+ associated with it.
+
+
+Slow Mode
+---------
+
+ The SLOW mode where the PLL is turned off altogether and the
+ system is fed by the external crystal input is currently not
+ supported.
+
+
+sysfs
+-----
+
+ The core code exports extra information via sysfs in the directory
+ devices/system/cpu/cpu0/arch-freq.
+
+
+Board Support
+-------------
+
+ Each board that wants to use the cpufreq code must register some basic
+ information with the core driver to provide information about what the
+ board requires and any restrictions being placed on it.
+
+ The board needs to supply information about whether it needs the IO bank
+ timings changing, any maximum frequency limits and information about the
+ SDRAM refresh rate.
+
+
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2009 Simtec Electronics
+Licensed under GPLv2
diff --git a/Documentation/arm/samsung-s3c24xx/eb2410itx.rst b/Documentation/arm/samsung-s3c24xx/eb2410itx.rst
new file mode 100644
index 000000000000..7863c93652f8
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/eb2410itx.rst
@@ -0,0 +1,59 @@
+===================================
+Simtec Electronics EB2410ITX (BAST)
+===================================
+
+ http://www.simtec.co.uk/products/EB2410ITX/
+
+Introduction
+------------
+
+ The EB2410ITX is a S3C2410 based development board with a variety of
+ peripherals and expansion connectors. This board is also known by
+ the shortened name of Bast.
+
+
+Configuration
+-------------
+
+ To set the default configuration, use `make bast_defconfig` which
+ supports the commonly used features of this board.
+
+
+Support
+-------
+
+ Official support information can be found on the Simtec Electronics
+ website, at the product page http://www.simtec.co.uk/products/EB2410ITX/
+
+ Useful links:
+
+ - Resources Page http://www.simtec.co.uk/products/EB2410ITX/resources.html
+
+ - Board FAQ at http://www.simtec.co.uk/products/EB2410ITX/faq.html
+
+ - Bootloader info http://www.simtec.co.uk/products/SWABLE/resources.html
+ and FAQ http://www.simtec.co.uk/products/SWABLE/faq.html
+
+
+MTD
+---
+
+ The NAND and NOR support has been merged from the linux-mtd project.
+ Any problems, see http://www.linux-mtd.infradead.org/ for more
+ information or up-to-date versions of linux-mtd.
+
+
+IDE
+---
+
+ Both onboard IDE ports are supported, however there is no support for
+ changing speed of devices, PIO Mode 4 capable drives should be used.
+
+
+Maintainers
+-----------
+
+ This board is maintained by Simtec Electronics.
+
+
+Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/gpio.rst b/Documentation/arm/samsung-s3c24xx/gpio.rst
new file mode 100644
index 000000000000..f7c3d7d011a2
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/gpio.rst
@@ -0,0 +1,172 @@
+====================
+S3C24XX GPIO Control
+====================
+
+Introduction
+------------
+
+ The s3c2410 kernel provides an interface to configure and
+ manipulate the state of the GPIO pins, and find out other
+ information about them.
+
+ There are a number of conditions attached to the configuration
+ of the s3c2410 GPIO system, please read the Samsung provided
+ data-sheet/users manual to find out the complete list.
+
+ See Documentation/arm/samsung/gpio.rst for the core implementation.
+
+
+GPIOLIB
+-------
+
+ With the event of the GPIOLIB in drivers/gpio, support for some
+ of the GPIO functions such as reading and writing a pin will
+ be removed in favour of this common access method.
+
+ Once all the extant drivers have been converted, the functions
+ listed below will be removed (they may be marked as __deprecated
+ in the near future).
+
+ The following functions now either have a `s3c_` specific variant
+ or are merged into gpiolib. See the definitions in
+ arch/arm/plat-samsung/include/plat/gpio-cfg.h:
+
+ - s3c2410_gpio_setpin() gpio_set_value() or gpio_direction_output()
+ - s3c2410_gpio_getpin() gpio_get_value() or gpio_direction_input()
+ - s3c2410_gpio_getirq() gpio_to_irq()
+ - s3c2410_gpio_cfgpin() s3c_gpio_cfgpin()
+ - s3c2410_gpio_getcfg() s3c_gpio_getcfg()
+ - s3c2410_gpio_pullup() s3c_gpio_setpull()
+
+
+GPIOLIB conversion
+------------------
+
+If you need to convert your board or driver to use gpiolib from the phased
+out s3c2410 API, then here are some notes on the process.
+
+1) If your board is exclusively using an GPIO, say to control peripheral
+ power, then it will require to claim the gpio with gpio_request() before
+ it can use it.
+
+ It is recommended to check the return value, with at least WARN_ON()
+ during initialisation.
+
+2) The s3c2410_gpio_cfgpin() can be directly replaced with s3c_gpio_cfgpin()
+ as they have the same arguments, and can either take the pin specific
+ values, or the more generic special-function-number arguments.
+
+3) s3c2410_gpio_pullup() changes have the problem that while the
+ s3c2410_gpio_pullup(x, 1) can be easily translated to the
+ s3c_gpio_setpull(x, S3C_GPIO_PULL_NONE), the s3c2410_gpio_pullup(x, 0)
+ are not so easy.
+
+ The s3c2410_gpio_pullup(x, 0) case enables the pull-up (or in the case
+ of some of the devices, a pull-down) and as such the new API distinguishes
+ between the UP and DOWN case. There is currently no 'just turn on' setting
+ which may be required if this becomes a problem.
+
+4) s3c2410_gpio_setpin() can be replaced by gpio_set_value(), the old call
+ does not implicitly configure the relevant gpio to output. The gpio
+ direction should be changed before using gpio_set_value().
+
+5) s3c2410_gpio_getpin() is replaceable by gpio_get_value() if the pin
+ has been set to input. It is currently unknown what the behaviour is
+ when using gpio_get_value() on an output pin (s3c2410_gpio_getpin
+ would return the value the pin is supposed to be outputting).
+
+6) s3c2410_gpio_getirq() should be directly replaceable with the
+ gpio_to_irq() call.
+
+The s3c2410_gpio and `gpio_` calls have always operated on the same gpio
+numberspace, so there is no problem with converting the gpio numbering
+between the calls.
+
+
+Headers
+-------
+
+ See arch/arm/mach-s3c24xx/include/mach/regs-gpio.h for the list
+ of GPIO pins, and the configuration values for them. This
+ is included by using #include <mach/regs-gpio.h>
+
+
+PIN Numbers
+-----------
+
+ Each pin has an unique number associated with it in regs-gpio.h,
+ e.g. S3C2410_GPA(0) or S3C2410_GPF(1). These defines are used to tell
+ the GPIO functions which pin is to be used.
+
+ With the conversion to gpiolib, there is no longer a direct conversion
+ from gpio pin number to register base address as in earlier kernels. This
+ is due to the number space required for newer SoCs where the later
+ GPIOs are not contiguous.
+
+
+Configuring a pin
+-----------------
+
+ The following function allows the configuration of a given pin to
+ be changed.
+
+ void s3c_gpio_cfgpin(unsigned int pin, unsigned int function);
+
+ e.g.:
+
+ s3c_gpio_cfgpin(S3C2410_GPA(0), S3C_GPIO_SFN(1));
+ s3c_gpio_cfgpin(S3C2410_GPE(8), S3C_GPIO_SFN(2));
+
+ which would turn GPA(0) into the lowest Address line A0, and set
+ GPE(8) to be connected to the SDIO/MMC controller's SDDAT1 line.
+
+
+Reading the current configuration
+---------------------------------
+
+ The current configuration of a pin can be read by using standard
+ gpiolib function:
+
+ s3c_gpio_getcfg(unsigned int pin);
+
+ The return value will be from the same set of values which can be
+ passed to s3c_gpio_cfgpin().
+
+
+Configuring a pull-up resistor
+------------------------------
+
+ A large proportion of the GPIO pins on the S3C2410 can have weak
+ pull-up resistors enabled. This can be configured by the following
+ function:
+
+ void s3c_gpio_setpull(unsigned int pin, unsigned int to);
+
+ Where the to value is S3C_GPIO_PULL_NONE to set the pull-up off,
+ and S3C_GPIO_PULL_UP to enable the specified pull-up. Any other
+ values are currently undefined.
+
+
+Getting and setting the state of a PIN
+--------------------------------------
+
+ These calls are now implemented by the relevant gpiolib calls, convert
+ your board or driver to use gpiolib.
+
+
+Getting the IRQ number associated with a PIN
+--------------------------------------------
+
+ A standard gpiolib function can map the given pin number to an IRQ
+ number to pass to the IRQ system.
+
+ int gpio_to_irq(unsigned int pin);
+
+ Note, not all pins have an IRQ.
+
+
+Author
+-------
+
+Ben Dooks, 03 October 2004
+Copyright 2004 Ben Dooks, Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/h1940.rst b/Documentation/arm/samsung-s3c24xx/h1940.rst
new file mode 100644
index 000000000000..62a562c178e3
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/h1940.rst
@@ -0,0 +1,41 @@
+=============
+HP IPAQ H1940
+=============
+
+http://www.handhelds.org/projects/h1940.html
+
+Introduction
+------------
+
+ The HP H1940 is a S3C2410 based handheld device, with
+ bluetooth connectivity.
+
+
+Support
+-------
+
+ A variety of information is available
+
+ handhelds.org project page:
+
+ http://www.handhelds.org/projects/h1940.html
+
+ handhelds.org wiki page:
+
+ http://handhelds.org/moin/moin.cgi/HpIpaqH1940
+
+ Herbert Pötzl pages:
+
+ http://vserver.13thfloor.at/H1940/
+
+
+Maintainers
+-----------
+
+ This project is being maintained and developed by a variety
+ of people, including Ben Dooks, Arnaud Patard, and Herbert Pötzl.
+
+ Thanks to the many others who have also provided support.
+
+
+(c) 2005 Ben Dooks
diff --git a/Documentation/arm/samsung-s3c24xx/index.rst b/Documentation/arm/samsung-s3c24xx/index.rst
new file mode 100644
index 000000000000..5b8a7f9398d8
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/index.rst
@@ -0,0 +1,20 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Samsung S3C24XX SoC Family
+==========================
+
+.. toctree::
+ :maxdepth: 1
+
+ h1940
+ gpio
+ cpufreq
+ suspend
+ usb-host
+ s3c2412
+ eb2410itx
+ nand
+ smdk2440
+ s3c2413
+ overview
diff --git a/Documentation/arm/samsung-s3c24xx/nand.rst b/Documentation/arm/samsung-s3c24xx/nand.rst
new file mode 100644
index 000000000000..938995694ee7
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/nand.rst
@@ -0,0 +1,30 @@
+====================
+S3C24XX NAND Support
+====================
+
+Introduction
+------------
+
+Small Page NAND
+---------------
+
+The driver uses a 512 byte (1 page) ECC code for this setup. The
+ECC code is not directly compatible with the default kernel ECC
+code, so the driver enforces its own OOB layout and ECC parameters
+
+Large Page NAND
+---------------
+
+The driver is capable of handling NAND flash with a 2KiB page
+size, with support for hardware ECC generation and correction.
+
+Unlike the 512byte page mode, the driver generates ECC data for
+each 256 byte block in an 2KiB page. This means that more than
+one error in a page can be rectified. It also means that the
+OOB layout remains the default kernel layout for these flashes.
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2007 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/overview.rst b/Documentation/arm/samsung-s3c24xx/overview.rst
new file mode 100644
index 000000000000..e9a1dc7276b5
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/overview.rst
@@ -0,0 +1,319 @@
+==========================
+S3C24XX ARM Linux Overview
+==========================
+
+
+
+Introduction
+------------
+
+ The Samsung S3C24XX range of ARM9 System-on-Chip CPUs are supported
+ by the 's3c2410' architecture of ARM Linux. Currently the S3C2410,
+ S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443 and S3C2450 devices
+ are supported.
+
+ Support for the S3C2400 and S3C24A0 series was never completed and the
+ corresponding code has been removed after a while. If someone wishes to
+ revive this effort, partial support can be retrieved from earlier Linux
+ versions.
+
+ The S3C2416 and S3C2450 devices are very similar and S3C2450 support is
+ included under the arch/arm/mach-s3c2416 directory. Note, while core
+ support for these SoCs is in, work on some of the extra peripherals
+ and extra interrupts is still ongoing.
+
+
+Configuration
+-------------
+
+ A generic S3C2410 configuration is provided, and can be used as the
+ default by `make s3c2410_defconfig`. This configuration has support
+ for all the machines, and the commonly used features on them.
+
+ Certain machines may have their own default configurations as well,
+ please check the machine specific documentation.
+
+
+Layout
+------
+
+ The core support files are located in the platform code contained in
+ arch/arm/plat-s3c24xx with headers in include/asm-arm/plat-s3c24xx.
+ This directory should be kept to items shared between the platform
+ code (arch/arm/plat-s3c24xx) and the arch/arm/mach-s3c24* code.
+
+ Each cpu has a directory with the support files for it, and the
+ machines that carry the device. For example S3C2410 is contained
+ in arch/arm/mach-s3c2410 and S3C2440 in arch/arm/mach-s3c2440
+
+ Register, kernel and platform data definitions are held in the
+ arch/arm/mach-s3c2410 directory./include/mach
+
+arch/arm/plat-s3c24xx:
+
+ Files in here are either common to all the s3c24xx family,
+ or are common to only some of them with names to indicate this
+ status. The files that are not common to all are generally named
+ with the initial cpu they support in the series to ensure a short
+ name without any possibility of confusion with newer devices.
+
+ As an example, initially s3c244x would cover s3c2440 and s3c2442, but
+ with the s3c2443 which does not share many of the same drivers in
+ this directory, the name becomes invalid. We stick to s3c2440-<x>
+ to indicate a driver that is s3c2440 and s3c2442 compatible.
+
+ This does mean that to find the status of any given SoC, a number
+ of directories may need to be searched.
+
+
+Machines
+--------
+
+ The currently supported machines are as follows:
+
+ Simtec Electronics EB2410ITX (BAST)
+
+ A general purpose development board, see EB2410ITX.txt for further
+ details
+
+ Simtec Electronics IM2440D20 (Osiris)
+
+ CPU Module from Simtec Electronics, with a S3C2440A CPU, nand flash
+ and a PCMCIA controller.
+
+ Samsung SMDK2410
+
+ Samsung's own development board, geared for PDA work.
+
+ Samsung/Aiji SMDK2412
+
+ The S3C2412 version of the SMDK2440.
+
+ Samsung/Aiji SMDK2413
+
+ The S3C2412 version of the SMDK2440.
+
+ Samsung/Meritech SMDK2440
+
+ The S3C2440 compatible version of the SMDK2440, which has the
+ option of an S3C2440 or S3C2442 CPU module.
+
+ Thorcom VR1000
+
+ Custom embedded board
+
+ HP IPAQ 1940
+
+ Handheld (IPAQ), available in several varieties
+
+ HP iPAQ rx3715
+
+ S3C2440 based IPAQ, with a number of variations depending on
+ features shipped.
+
+ Acer N30
+
+ A S3C2410 based PDA from Acer. There is a Wiki page at
+ http://handhelds.org/moin/moin.cgi/AcerN30Documentation .
+
+ AML M5900
+
+ American Microsystems' M5900
+
+ Nex Vision Nexcoder
+ Nex Vision Otom
+
+ Two machines by Nex Vision
+
+
+Adding New Machines
+-------------------
+
+ The architecture has been designed to support as many machines as can
+ be configured for it in one kernel build, and any future additions
+ should keep this in mind before altering items outside of their own
+ machine files.
+
+ Machine definitions should be kept in linux/arch/arm/mach-s3c2410,
+ and there are a number of examples that can be looked at.
+
+ Read the kernel patch submission policies as well as the
+ Documentation/arm directory before submitting patches. The
+ ARM kernel series is managed by Russell King, and has a patch system
+ located at http://www.arm.linux.org.uk/developer/patches/
+ as well as mailing lists that can be found from the same site.
+
+ As a courtesy, please notify <ben-linux@fluff.org> of any new
+ machines or other modifications.
+
+ Any large scale modifications, or new drivers should be discussed
+ on the ARM kernel mailing list (linux-arm-kernel) before being
+ attempted. See http://www.arm.linux.org.uk/mailinglists/ for the
+ mailing list information.
+
+
+I2C
+---
+
+ The hardware I2C core in the CPU is supported in single master
+ mode, and can be configured via platform data.
+
+
+RTC
+---
+
+ Support for the onboard RTC unit, including alarm function.
+
+ This has recently been upgraded to use the new RTC core,
+ and the module has been renamed to rtc-s3c to fit in with
+ the new rtc naming scheme.
+
+
+Watchdog
+--------
+
+ The onchip watchdog is available via the standard watchdog
+ interface.
+
+
+NAND
+----
+
+ The current kernels now have support for the s3c2410 NAND
+ controller. If there are any problems the latest linux-mtd
+ code can be found from http://www.linux-mtd.infradead.org/
+
+ For more information see Documentation/arm/samsung-s3c24xx/nand.rst
+
+
+SD/MMC
+------
+
+ The SD/MMC hardware pre S3C2443 is supported in the current
+ kernel, the driver is drivers/mmc/host/s3cmci.c and supports
+ 1 and 4 bit SD or MMC cards.
+
+ The SDIO behaviour of this driver has not been fully tested. There is no
+ current support for hardware SDIO interrupts.
+
+
+Serial
+------
+
+ The s3c2410 serial driver provides support for the internal
+ serial ports. These devices appear as /dev/ttySAC0 through 3.
+
+ To create device nodes for these, use the following commands
+
+ mknod ttySAC0 c 204 64
+ mknod ttySAC1 c 204 65
+ mknod ttySAC2 c 204 66
+
+
+GPIO
+----
+
+ The core contains support for manipulating the GPIO, see the
+ documentation in GPIO.txt in the same directory as this file.
+
+ Newer kernels carry GPIOLIB, and support is being moved towards
+ this with some of the older support in line to be removed.
+
+ As of v2.6.34, the move towards using gpiolib support is almost
+ complete, and very little of the old calls are left.
+
+ See Documentation/arm/samsung-s3c24xx/gpio.rst for the S3C24XX specific
+ support and Documentation/arm/samsung/gpio.rst for the core Samsung
+ implementation.
+
+
+Clock Management
+----------------
+
+ The core provides the interface defined in the header file
+ include/asm-arm/hardware/clock.h, to allow control over the
+ various clock units
+
+
+Suspend to RAM
+--------------
+
+ For boards that provide support for suspend to RAM, the
+ system can be placed into low power suspend.
+
+ See Suspend.txt for more information.
+
+
+SPI
+---
+
+ SPI drivers are available for both the in-built hardware
+ (although there is no DMA support yet) and a generic
+ GPIO based solution.
+
+
+LEDs
+----
+
+ There is support for GPIO based LEDs via a platform driver
+ in the LED subsystem.
+
+
+Platform Data
+-------------
+
+ Whenever a device has platform specific data that is specified
+ on a per-machine basis, care should be taken to ensure the
+ following:
+
+ 1) that default data is not left in the device to confuse the
+ driver if a machine does not set it at startup
+
+ 2) the data should (if possible) be marked as __initdata,
+ to ensure that the data is thrown away if the machine is
+ not the one currently in use.
+
+ The best way of doing this is to make a function that
+ kmalloc()s an area of memory, and copies the __initdata
+ and then sets the relevant device's platform data. Making
+ the function `__init` takes care of ensuring it is discarded
+ with the rest of the initialisation code::
+
+ static __init void s3c24xx_xxx_set_platdata(struct xxx_data *pd)
+ {
+ struct s3c2410_xxx_mach_info *npd;
+
+ npd = kmalloc(sizeof(struct s3c2410_xxx_mach_info), GFP_KERNEL);
+ if (npd) {
+ memcpy(npd, pd, sizeof(struct s3c2410_xxx_mach_info));
+ s3c_device_xxx.dev.platform_data = npd;
+ } else {
+ printk(KERN_ERR "no memory for xxx platform data\n");
+ }
+ }
+
+ Note, since the code is marked as __init, it should not be
+ exported outside arch/arm/mach-s3c2410/, or exported to
+ modules via EXPORT_SYMBOL() and related functions.
+
+
+Port Contributors
+-----------------
+
+ Ben Dooks (BJD)
+ Vincent Sanders
+ Herbert Potzl
+ Arnaud Patard (RTP)
+ Roc Wu
+ Klaus Fetscher
+ Dimitry Andric
+ Shannon Holland
+ Guillaume Gourat (NexVision)
+ Christer Weinigel (wingel) (Acer N30)
+ Lucas Correia Villa Real (S3C2400 port)
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2004-2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/s3c2412.rst b/Documentation/arm/samsung-s3c24xx/s3c2412.rst
new file mode 100644
index 000000000000..68b985fc6bf4
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/s3c2412.rst
@@ -0,0 +1,121 @@
+==========================
+S3C2412 ARM Linux Overview
+==========================
+
+Introduction
+------------
+
+ The S3C2412 is part of the S3C24XX range of ARM9 System-on-Chip CPUs
+ from Samsung. This part has an ARM926-EJS core, capable of running up
+ to 266MHz (see data-sheet for more information)
+
+
+Clock
+-----
+
+ The core clock code provides a set of clocks to the drivers, and allows
+ for source selection and a number of other features.
+
+
+Power
+-----
+
+ No support for suspend/resume to RAM in the current system.
+
+
+DMA
+---
+
+ No current support for DMA.
+
+
+GPIO
+----
+
+ There is support for setting the GPIO to input/output/special function
+ and reading or writing to them.
+
+
+UART
+----
+
+ The UART hardware is similar to the S3C2440, and is supported by the
+ s3c2410 driver in the drivers/serial directory.
+
+
+NAND
+----
+
+ The NAND hardware is similar to the S3C2440, and is supported by the
+ s3c2410 driver in the drivers/mtd/nand/raw directory.
+
+
+USB Host
+--------
+
+ The USB hardware is similar to the S3C2410, with extended clock source
+ control. The OHCI portion is supported by the ohci-s3c2410 driver, and
+ the clock control selection is supported by the core clock code.
+
+
+USB Device
+----------
+
+ No current support in the kernel
+
+
+IRQs
+----
+
+ All the standard, and external interrupt sources are supported. The
+ extra sub-sources are not yet supported.
+
+
+RTC
+---
+
+ The RTC hardware is similar to the S3C2410, and is supported by the
+ s3c2410-rtc driver.
+
+
+Watchdog
+--------
+
+ The watchdog hardware is the same as the S3C2410, and is supported by
+ the s3c2410_wdt driver.
+
+
+MMC/SD/SDIO
+-----------
+
+ No current support for the MMC/SD/SDIO block.
+
+IIC
+---
+
+ The IIC hardware is the same as the S3C2410, and is supported by the
+ i2c-s3c24xx driver.
+
+
+IIS
+---
+
+ No current support for the IIS interface.
+
+
+SPI
+---
+
+ No current support for the SPI interfaces.
+
+
+ATA
+---
+
+ No current support for the on-board ATA block.
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/s3c2413.rst b/Documentation/arm/samsung-s3c24xx/s3c2413.rst
new file mode 100644
index 000000000000..1f51e207fc46
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/s3c2413.rst
@@ -0,0 +1,22 @@
+==========================
+S3C2413 ARM Linux Overview
+==========================
+
+Introduction
+------------
+
+ The S3C2413 is an extended version of the S3C2412, with an camera
+ interface and mobile DDR memory support. See the S3C2412 support
+ documentation for more information.
+
+
+Camera Interface
+----------------
+
+ This block is currently not supported.
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2006 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/smdk2440.rst b/Documentation/arm/samsung-s3c24xx/smdk2440.rst
new file mode 100644
index 000000000000..524fd0b4afaf
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/smdk2440.rst
@@ -0,0 +1,57 @@
+=========================
+Samsung/Meritech SMDK2440
+=========================
+
+Introduction
+------------
+
+ The SMDK2440 is a two part evaluation board for the Samsung S3C2440
+ processor. It includes support for LCD, SmartMedia, Audio, SD and
+ 10MBit Ethernet, and expansion headers for various signals, including
+ the camera and unused GPIO.
+
+
+Configuration
+-------------
+
+ To set the default configuration, use `make smdk2440_defconfig` which
+ will configure the common features of this board, or use
+ `make s3c2410_config` to include support for all s3c2410/s3c2440 machines
+
+
+Support
+-------
+
+ Ben Dooks' SMDK2440 site at http://www.fluff.org/ben/smdk2440/ which
+ includes linux based USB download tools.
+
+ Some of the h1940 patches that can be found from the H1940 project
+ site at http://www.handhelds.org/projects/h1940.html can also be
+ applied to this board.
+
+
+Peripherals
+-----------
+
+ There is no current support for any of the extra peripherals on the
+ base-board itself.
+
+
+MTD
+---
+
+ The NAND flash should be supported by the in kernel MTD NAND support,
+ NOR flash will be added later.
+
+
+Maintainers
+-----------
+
+ This board is being maintained by Ben Dooks, for more info, see
+ http://www.fluff.org/ben/smdk2440/
+
+ Many thanks to Dimitry Andric of TomTom for the loan of the SMDK2440,
+ and to Simtec Electronics for allowing me time to work on this.
+
+
+(c) 2004 Ben Dooks
diff --git a/Documentation/arm/samsung-s3c24xx/suspend.rst b/Documentation/arm/samsung-s3c24xx/suspend.rst
new file mode 100644
index 000000000000..b4f3ae9fe76e
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/suspend.rst
@@ -0,0 +1,137 @@
+=======================
+S3C24XX Suspend Support
+=======================
+
+
+Introduction
+------------
+
+ The S3C24XX supports a low-power suspend mode, where the SDRAM is kept
+ in Self-Refresh mode, and all but the essential peripheral blocks are
+ powered down. For more information on how this works, please look
+ at the relevant CPU datasheet from Samsung.
+
+
+Requirements
+------------
+
+ 1) A bootloader that can support the necessary resume operation
+
+ 2) Support for at least 1 source for resume
+
+ 3) CONFIG_PM enabled in the kernel
+
+ 4) Any peripherals that are going to be powered down at the same
+ time require suspend/resume support.
+
+
+Resuming
+--------
+
+ The S3C2410 user manual defines the process of sending the CPU to
+ sleep and how it resumes. The default behaviour of the Linux code
+ is to set the GSTATUS3 register to the physical address of the
+ code to resume Linux operation.
+
+ GSTATUS4 is currently left alone by the sleep code, and is free to
+ use for any other purposes (for example, the EB2410ITX uses this to
+ save memory configuration in).
+
+
+Machine Support
+---------------
+
+ The machine specific functions must call the s3c_pm_init() function
+ to say that its bootloader is capable of resuming. This can be as
+ simple as adding the following to the machine's definition:
+
+ INITMACHINE(s3c_pm_init)
+
+ A board can do its own setup before calling s3c_pm_init, if it
+ needs to setup anything else for power management support.
+
+ There is currently no support for over-riding the default method of
+ saving the resume address, if your board requires it, then contact
+ the maintainer and discuss what is required.
+
+ Note, the original method of adding an late_initcall() is wrong,
+ and will end up initialising all compiled machines' pm init!
+
+ The following is an example of code used for testing wakeup from
+ an falling edge on IRQ_EINT0::
+
+
+ static irqreturn_t button_irq(int irq, void *pw)
+ {
+ return IRQ_HANDLED;
+ }
+
+ statuc void __init machine_init(void)
+ {
+ ...
+
+ request_irq(IRQ_EINT0, button_irq, IRQF_TRIGGER_FALLING,
+ "button-irq-eint0", NULL);
+
+ enable_irq_wake(IRQ_EINT0);
+
+ s3c_pm_init();
+ }
+
+
+Debugging
+---------
+
+ There are several important things to remember when using PM suspend:
+
+ 1) The uart drivers will disable the clocks to the UART blocks when
+ suspending, which means that use of printascii() or similar direct
+ access to the UARTs will cause the debug to stop.
+
+ 2) While the pm code itself will attempt to re-enable the UART clocks,
+ care should be taken that any external clock sources that the UARTs
+ rely on are still enabled at that point.
+
+ 3) If any debugging is placed in the resume path, then it must have the
+ relevant clocks and peripherals setup before use (ie, bootloader).
+
+ For example, if you transmit a character from the UART, the baud
+ rate and uart controls must be setup beforehand.
+
+
+Configuration
+-------------
+
+ The S3C2410 specific configuration in `System Type` defines various
+ aspects of how the S3C2410 suspend and resume support is configured
+
+ `S3C2410 PM Suspend debug`
+
+ This option prints messages to the serial console before and after
+ the actual suspend, giving detailed information on what is
+ happening
+
+
+ `S3C2410 PM Suspend Memory CRC`
+
+ Allows the entire memory to be checksummed before and after the
+ suspend to see if there has been any corruption of the contents.
+
+ Note, the time to calculate the CRC is dependent on the CPU speed
+ and the size of memory. For an 64Mbyte RAM area on an 200MHz
+ S3C2410, this can take approximately 4 seconds to complete.
+
+ This support requires the CRC32 function to be enabled.
+
+
+ `S3C2410 PM Suspend CRC Chunksize (KiB)`
+
+ Defines the size of memory each CRC chunk covers. A smaller value
+ will mean that the CRC data block will take more memory, but will
+ identify any faults with better precision
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2004 Simtec Electronics
diff --git a/Documentation/arm/samsung-s3c24xx/usb-host.rst b/Documentation/arm/samsung-s3c24xx/usb-host.rst
new file mode 100644
index 000000000000..c84268bd1884
--- /dev/null
+++ b/Documentation/arm/samsung-s3c24xx/usb-host.rst
@@ -0,0 +1,91 @@
+========================
+S3C24XX USB Host support
+========================
+
+
+
+Introduction
+------------
+
+ This document details the S3C2410/S3C2440 in-built OHCI USB host support.
+
+Configuration
+-------------
+
+ Enable at least the following kernel options:
+
+ menuconfig::
+
+ Device Drivers --->
+ USB support --->
+ <*> Support for Host-side USB
+ <*> OHCI HCD support
+
+
+ .config:
+
+ - CONFIG_USB
+ - CONFIG_USB_OHCI_HCD
+
+
+ Once these options are configured, the standard set of USB device
+ drivers can be configured and used.
+
+
+Board Support
+-------------
+
+ The driver attaches to a platform device, which will need to be
+ added by the board specific support file in linux/arch/arm/mach-s3c2410,
+ such as mach-bast.c or mach-smdk2410.c
+
+ The platform device's platform_data field is only needed if the
+ board implements extra power control or over-current monitoring.
+
+ The OHCI driver does not ensure the state of the S3C2410's MISCCTRL
+ register, so if both ports are to be used for the host, then it is
+ the board support file's responsibility to ensure that the second
+ port is configured to be connected to the OHCI core.
+
+
+Platform Data
+-------------
+
+ See arch/arm/mach-s3c2410/include/mach/usb-control.h for the
+ descriptions of the platform device data. An implementation
+ can be found in linux/arch/arm/mach-s3c2410/usb-simtec.c .
+
+ The `struct s3c2410_hcd_info` contains a pair of functions
+ that get called to enable over-current detection, and to
+ control the port power status.
+
+ The ports are numbered 0 and 1.
+
+ power_control:
+ Called to enable or disable the power on the port.
+
+ enable_oc:
+ Called to enable or disable the over-current monitoring.
+ This should claim or release the resources being used to
+ check the power condition on the port, such as an IRQ.
+
+ report_oc:
+ The OHCI driver fills this field in for the over-current code
+ to call when there is a change to the over-current state on
+ an port. The ports argument is a bitmask of 1 bit per port,
+ with bit X being 1 for an over-current on port X.
+
+ The function s3c2410_usb_report_oc() has been provided to
+ ensure this is called correctly.
+
+ port[x]:
+ This is struct describes each port, 0 or 1. The platform driver
+ should set the flags field of each port to S3C_HCDFLG_USED if
+ the port is enabled.
+
+
+
+Document Author
+---------------
+
+Ben Dooks, Copyright 2005 Simtec Electronics
diff --git a/Documentation/arm/samsung/bootloader-interface.rst b/Documentation/arm/samsung/bootloader-interface.rst
new file mode 100644
index 000000000000..a56f325dae78
--- /dev/null
+++ b/Documentation/arm/samsung/bootloader-interface.rst
@@ -0,0 +1,81 @@
+==========================================================
+Interface between kernel and boot loaders on Exynos boards
+==========================================================
+
+Author: Krzysztof Kozlowski
+
+Date : 6 June 2015
+
+The document tries to describe currently used interface between Linux kernel
+and boot loaders on Samsung Exynos based boards. This is not a definition
+of interface but rather a description of existing state, a reference
+for information purpose only.
+
+In the document "boot loader" means any of following: U-boot, proprietary
+SBOOT or any other firmware for ARMv7 and ARMv8 initializing the board before
+executing kernel.
+
+
+1. Non-Secure mode
+
+Address: sysram_ns_base_addr
+
+============= ============================================ ==================
+Offset Value Purpose
+============= ============================================ ==================
+0x08 exynos_cpu_resume_ns, mcpm_entry_point System suspend
+0x0c 0x00000bad (Magic cookie) System suspend
+0x1c exynos4_secondary_startup Secondary CPU boot
+0x1c + 4*cpu exynos4_secondary_startup (Exynos4412) Secondary CPU boot
+0x20 0xfcba0d10 (Magic cookie) AFTR
+0x24 exynos_cpu_resume_ns AFTR
+0x28 + 4*cpu 0x8 (Magic cookie, Exynos3250) AFTR
+0x28 0x0 or last value during resume (Exynos542x) System suspend
+============= ============================================ ==================
+
+
+2. Secure mode
+
+Address: sysram_base_addr
+
+============= ============================================ ==================
+Offset Value Purpose
+============= ============================================ ==================
+0x00 exynos4_secondary_startup Secondary CPU boot
+0x04 exynos4_secondary_startup (Exynos542x) Secondary CPU boot
+4*cpu exynos4_secondary_startup (Exynos4412) Secondary CPU boot
+0x20 exynos_cpu_resume (Exynos4210 r1.0) AFTR
+0x24 0xfcba0d10 (Magic cookie, Exynos4210 r1.0) AFTR
+============= ============================================ ==================
+
+Address: pmu_base_addr
+
+============= ============================================ ==================
+Offset Value Purpose
+============= ============================================ ==================
+0x0800 exynos_cpu_resume AFTR, suspend
+0x0800 mcpm_entry_point (Exynos542x with MCPM) AFTR, suspend
+0x0804 0xfcba0d10 (Magic cookie) AFTR
+0x0804 0x00000bad (Magic cookie) System suspend
+0x0814 exynos4_secondary_startup (Exynos4210 r1.1) Secondary CPU boot
+0x0818 0xfcba0d10 (Magic cookie, Exynos4210 r1.1) AFTR
+0x081C exynos_cpu_resume (Exynos4210 r1.1) AFTR
+============= ============================================ ==================
+
+3. Other (regardless of secure/non-secure mode)
+
+Address: pmu_base_addr
+
+============= =============================== ===============================
+Offset Value Purpose
+============= =============================== ===============================
+0x0908 Non-zero Secondary CPU boot up indicator
+ on Exynos3250 and Exynos542x
+============= =============================== ===============================
+
+
+4. Glossary
+
+AFTR - ARM Off Top Running, a low power mode, Cortex cores and many other
+modules are power gated, except the TOP modules
+MCPM - Multi-Cluster Power Management
diff --git a/Documentation/arm/Samsung/clksrc-change-registers.awk b/Documentation/arm/samsung/clksrc-change-registers.awk
index 7be1b8aa7cd9..7be1b8aa7cd9 100755
--- a/Documentation/arm/Samsung/clksrc-change-registers.awk
+++ b/Documentation/arm/samsung/clksrc-change-registers.awk
diff --git a/Documentation/arm/samsung/gpio.rst b/Documentation/arm/samsung/gpio.rst
new file mode 100644
index 000000000000..5f7cadd7159e
--- /dev/null
+++ b/Documentation/arm/samsung/gpio.rst
@@ -0,0 +1,41 @@
+===========================
+Samsung GPIO implementation
+===========================
+
+Introduction
+------------
+
+This outlines the Samsung GPIO implementation and the architecture
+specific calls provided alongside the drivers/gpio core.
+
+
+S3C24XX (Legacy)
+----------------
+
+See Documentation/arm/samsung-s3c24xx/gpio.rst for more information
+about these devices. Their implementation has been brought into line
+with the core samsung implementation described in this document.
+
+
+GPIOLIB integration
+-------------------
+
+The gpio implementation uses gpiolib as much as possible, only providing
+specific calls for the items that require Samsung specific handling, such
+as pin special-function or pull resistor control.
+
+GPIO numbering is synchronised between the Samsung and gpiolib system.
+
+
+PIN configuration
+-----------------
+
+Pin configuration is specific to the Samsung architecture, with each SoC
+registering the necessary information for the core gpio configuration
+implementation to configure pins as necessary.
+
+The s3c_gpio_cfgpin() and s3c_gpio_setpull() provide the means for a
+driver or machine to change gpio configuration.
+
+See arch/arm/plat-samsung/include/plat/gpio-cfg.h for more information
+on these functions.
diff --git a/Documentation/arm/samsung/index.rst b/Documentation/arm/samsung/index.rst
new file mode 100644
index 000000000000..8142cce3d23e
--- /dev/null
+++ b/Documentation/arm/samsung/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========
+Samsung SoC
+===========
+
+.. toctree::
+ :maxdepth: 1
+
+ gpio
+ bootloader-interface
+ overview
diff --git a/Documentation/arm/samsung/overview.rst b/Documentation/arm/samsung/overview.rst
new file mode 100644
index 000000000000..e74307897416
--- /dev/null
+++ b/Documentation/arm/samsung/overview.rst
@@ -0,0 +1,89 @@
+==========================
+Samsung ARM Linux Overview
+==========================
+
+Introduction
+------------
+
+ The Samsung range of ARM SoCs spans many similar devices, from the initial
+ ARM9 through to the newest ARM cores. This document shows an overview of
+ the current kernel support, how to use it and where to find the code
+ that supports this.
+
+ The currently supported SoCs are:
+
+ - S3C24XX: See Documentation/arm/samsung-s3c24xx/overview.rst for full list
+ - S3C64XX: S3C6400 and S3C6410
+ - S5PC110 / S5PV210
+
+
+S3C24XX Systems
+---------------
+
+ There is still documentation in Documnetation/arm/Samsung-S3C24XX/ which
+ deals with the architecture and drivers specific to these devices.
+
+ See Documentation/arm/samsung-s3c24xx/overview.rst for more information
+ on the implementation details and specific support.
+
+
+Configuration
+-------------
+
+ A number of configurations are supplied, as there is no current way of
+ unifying all the SoCs into one kernel.
+
+ s5pc110_defconfig
+ - S5PC110 specific default configuration
+ s5pv210_defconfig
+ - S5PV210 specific default configuration
+
+
+Layout
+------
+
+ The directory layout is currently being restructured, and consists of
+ several platform directories and then the machine specific directories
+ of the CPUs being built for.
+
+ plat-samsung provides the base for all the implementations, and is the
+ last in the line of include directories that are processed for the build
+ specific information. It contains the base clock, GPIO and device definitions
+ to get the system running.
+
+ plat-s3c24xx is for s3c24xx specific builds, see the S3C24XX docs.
+
+ plat-s5p is for s5p specific builds, and contains common support for the
+ S5P specific systems. Not all S5Ps use all the features in this directory
+ due to differences in the hardware.
+
+
+Layout changes
+--------------
+
+ The old plat-s3c and plat-s5pc1xx directories have been removed, with
+ support moved to either plat-samsung or plat-s5p as necessary. These moves
+ where to simplify the include and dependency issues involved with having
+ so many different platform directories.
+
+
+Port Contributors
+-----------------
+
+ Ben Dooks (BJD)
+ Vincent Sanders
+ Herbert Potzl
+ Arnaud Patard (RTP)
+ Roc Wu
+ Klaus Fetscher
+ Dimitry Andric
+ Shannon Holland
+ Guillaume Gourat (NexVision)
+ Christer Weinigel (wingel) (Acer N30)
+ Lucas Correia Villa Real (S3C2400 port)
+
+
+Document Author
+---------------
+
+Copyright 2009-2010 Ben Dooks <ben-linux@fluff.org>
diff --git a/Documentation/arm/setup.rst b/Documentation/arm/setup.rst
new file mode 100644
index 000000000000..8e12ef3fb9a7
--- /dev/null
+++ b/Documentation/arm/setup.rst
@@ -0,0 +1,108 @@
+=============================================
+Kernel initialisation parameters on ARM Linux
+=============================================
+
+The following document describes the kernel initialisation parameter
+structure, otherwise known as 'struct param_struct' which is used
+for most ARM Linux architectures.
+
+This structure is used to pass initialisation parameters from the
+kernel loader to the Linux kernel proper, and may be short lived
+through the kernel initialisation process. As a general rule, it
+should not be referenced outside of arch/arm/kernel/setup.c:setup_arch().
+
+There are a lot of parameters listed in there, and they are described
+below:
+
+ page_size
+ This parameter must be set to the page size of the machine, and
+ will be checked by the kernel.
+
+ nr_pages
+ This is the total number of pages of memory in the system. If
+ the memory is banked, then this should contain the total number
+ of pages in the system.
+
+ If the system contains separate VRAM, this value should not
+ include this information.
+
+ ramdisk_size
+ This is now obsolete, and should not be used.
+
+ flags
+ Various kernel flags, including:
+
+ ===== ========================
+ bit 0 1 = mount root read only
+ bit 1 unused
+ bit 2 0 = load ramdisk
+ bit 3 0 = prompt for ramdisk
+ ===== ========================
+
+ rootdev
+ major/minor number pair of device to mount as the root filesystem.
+
+ video_num_cols / video_num_rows
+ These two together describe the character size of the dummy console,
+ or VGA console character size. They should not be used for any other
+ purpose.
+
+ It's generally a good idea to set these to be either standard VGA, or
+ the equivalent character size of your fbcon display. This then allows
+ all the bootup messages to be displayed correctly.
+
+ video_x / video_y
+ This describes the character position of cursor on VGA console, and
+ is otherwise unused. (should not be used for other console types, and
+ should not be used for other purposes).
+
+ memc_control_reg
+ MEMC chip control register for Acorn Archimedes and Acorn A5000
+ based machines. May be used differently by different architectures.
+
+ sounddefault
+ Default sound setting on Acorn machines. May be used differently by
+ different architectures.
+
+ adfsdrives
+ Number of ADFS/MFM disks. May be used differently by different
+ architectures.
+
+ bytes_per_char_h / bytes_per_char_v
+ These are now obsolete, and should not be used.
+
+ pages_in_bank[4]
+ Number of pages in each bank of the systems memory (used for RiscPC).
+ This is intended to be used on systems where the physical memory
+ is non-contiguous from the processors point of view.
+
+ pages_in_vram
+ Number of pages in VRAM (used on Acorn RiscPC). This value may also
+ be used by loaders if the size of the video RAM can't be obtained
+ from the hardware.
+
+ initrd_start / initrd_size
+ This describes the kernel virtual start address and size of the
+ initial ramdisk.
+
+ rd_start
+ Start address in sectors of the ramdisk image on a floppy disk.
+
+ system_rev
+ system revision number.
+
+ system_serial_low / system_serial_high
+ system 64-bit serial number
+
+ mem_fclk_21285
+ The speed of the external oscillator to the 21285 (footbridge),
+ which control's the speed of the memory bus, timer & serial port.
+ Depending upon the speed of the cpu its value can be between
+ 0-66 MHz. If no params are passed or a value of zero is passed,
+ then a value of 50 Mhz is the default on 21285 architectures.
+
+ paths[8][128]
+ These are now obsolete, and should not be used.
+
+ commandline
+ Kernel command line parameters. Details can be found elsewhere.
diff --git a/Documentation/arm/SH-Mobile/.gitignore b/Documentation/arm/sh-mobile/.gitignore
index c928dbf3cc88..c928dbf3cc88 100644
--- a/Documentation/arm/SH-Mobile/.gitignore
+++ b/Documentation/arm/sh-mobile/.gitignore
diff --git a/Documentation/arm/spear/overview.rst b/Documentation/arm/spear/overview.rst
new file mode 100644
index 000000000000..1a77f6b213b6
--- /dev/null
+++ b/Documentation/arm/spear/overview.rst
@@ -0,0 +1,66 @@
+========================
+SPEAr ARM Linux Overview
+========================
+
+Introduction
+------------
+
+ SPEAr (Structured Processor Enhanced Architecture).
+ weblink : http://www.st.com/spear
+
+ The ST Microelectronics SPEAr range of ARM9/CortexA9 System-on-Chip CPUs are
+ supported by the 'spear' platform of ARM Linux. Currently SPEAr1310,
+ SPEAr1340, SPEAr300, SPEAr310, SPEAr320 and SPEAr600 SOCs are supported.
+
+ Hierarchy in SPEAr is as follows:
+
+ SPEAr (Platform)
+
+ - SPEAr3XX (3XX SOC series, based on ARM9)
+ - SPEAr300 (SOC)
+ - SPEAr300 Evaluation Board
+ - SPEAr310 (SOC)
+ - SPEAr310 Evaluation Board
+ - SPEAr320 (SOC)
+ - SPEAr320 Evaluation Board
+ - SPEAr6XX (6XX SOC series, based on ARM9)
+ - SPEAr600 (SOC)
+ - SPEAr600 Evaluation Board
+ - SPEAr13XX (13XX SOC series, based on ARM CORTEXA9)
+ - SPEAr1310 (SOC)
+ - SPEAr1310 Evaluation Board
+ - SPEAr1340 (SOC)
+ - SPEAr1340 Evaluation Board
+
+Configuration
+-------------
+
+ A generic configuration is provided for each machine, and can be used as the
+ default by::
+
+ make spear13xx_defconfig
+ make spear3xx_defconfig
+ make spear6xx_defconfig
+
+Layout
+------
+
+ The common files for multiple machine families (SPEAr3xx, SPEAr6xx and
+ SPEAr13xx) are located in the platform code contained in arch/arm/plat-spear
+ with headers in plat/.
+
+ Each machine series have a directory with name arch/arm/mach-spear followed by
+ series name. Like mach-spear3xx, mach-spear6xx and mach-spear13xx.
+
+ Common file for machines of spear3xx family is mach-spear3xx/spear3xx.c, for
+ spear6xx is mach-spear6xx/spear6xx.c and for spear13xx family is
+ mach-spear13xx/spear13xx.c. mach-spear* also contain soc/machine specific
+ files, like spear1310.c, spear1340.c spear300.c, spear310.c, spear320.c and
+ spear600.c. mach-spear* doesn't contains board specific files as they fully
+ support Flattened Device Tree.
+
+
+Document Author
+---------------
+
+ Viresh Kumar <vireshk@kernel.org>, (c) 2010-2012 ST Microelectronics
diff --git a/Documentation/arm/sti/overview.rst b/Documentation/arm/sti/overview.rst
new file mode 100644
index 000000000000..70743617a74f
--- /dev/null
+++ b/Documentation/arm/sti/overview.rst
@@ -0,0 +1,36 @@
+======================
+STi ARM Linux Overview
+======================
+
+Introduction
+------------
+
+ The ST Microelectronics Multimedia and Application Processors range of
+ CortexA9 System-on-Chip are supported by the 'STi' platform of
+ ARM Linux. Currently STiH415, STiH416 SOCs are supported with both
+ B2000 and B2020 Reference boards.
+
+
+configuration
+-------------
+
+ A generic configuration is provided for both STiH415/416, and can be used as the
+ default by::
+
+ make stih41x_defconfig
+
+Layout
+------
+
+ All the files for multiple machine families (STiH415, STiH416, and STiG125)
+ are located in the platform code contained in arch/arm/mach-sti
+
+ There is a generic board board-dt.c in the mach folder which support
+ Flattened Device Tree, which means, It works with any compatible board with
+ Device Trees.
+
+
+Document Author
+---------------
+
+ Srinivas Kandagatla <srinivas.kandagatla@st.com>, (c) 2013 ST Microelectronics
diff --git a/Documentation/arm/sti/overview.txt b/Documentation/arm/sti/overview.txt
deleted file mode 100644
index 1a4e93d6027f..000000000000
--- a/Documentation/arm/sti/overview.txt
+++ /dev/null
@@ -1,33 +0,0 @@
- STi ARM Linux Overview
- ==========================
-
-Introduction
-------------
-
- The ST Microelectronics Multimedia and Application Processors range of
- CortexA9 System-on-Chip are supported by the 'STi' platform of
- ARM Linux. Currently STiH415, STiH416 SOCs are supported with both
- B2000 and B2020 Reference boards.
-
-
- configuration
- -------------
-
- A generic configuration is provided for both STiH415/416, and can be used as the
- default by
- make stih41x_defconfig
-
- Layout
- ------
- All the files for multiple machine families (STiH415, STiH416, and STiG125)
- are located in the platform code contained in arch/arm/mach-sti
-
- There is a generic board board-dt.c in the mach folder which support
- Flattened Device Tree, which means, It works with any compatible board with
- Device Trees.
-
-
- Document Author
- ---------------
-
- Srinivas Kandagatla <srinivas.kandagatla@st.com>, (c) 2013 ST Microelectronics
diff --git a/Documentation/arm/sti/stih407-overview.rst b/Documentation/arm/sti/stih407-overview.rst
new file mode 100644
index 000000000000..027e75bc7b7c
--- /dev/null
+++ b/Documentation/arm/sti/stih407-overview.rst
@@ -0,0 +1,19 @@
+================
+STiH407 Overview
+================
+
+Introduction
+------------
+
+ The STiH407 is the new generation of SoC for Multi-HD, AVC set-top boxes
+ and server/connected client application for satellite, cable, terrestrial
+ and IP-STB markets.
+
+ Features
+ - ARM Cortex-A9 1.5 GHz dual core CPU (28nm)
+ - SATA2, USB 3.0, PCIe, Gbit Ethernet
+
+Document Author
+---------------
+
+ Maxime Coquelin <maxime.coquelin@st.com>, (c) 2014 ST Microelectronics
diff --git a/Documentation/arm/sti/stih407-overview.txt b/Documentation/arm/sti/stih407-overview.txt
deleted file mode 100644
index 3343f32f58bc..000000000000
--- a/Documentation/arm/sti/stih407-overview.txt
+++ /dev/null
@@ -1,18 +0,0 @@
- STiH407 Overview
- ================
-
-Introduction
-------------
-
- The STiH407 is the new generation of SoC for Multi-HD, AVC set-top boxes
- and server/connected client application for satellite, cable, terrestrial
- and IP-STB markets.
-
- Features
- - ARM Cortex-A9 1.5 GHz dual core CPU (28nm)
- - SATA2, USB 3.0, PCIe, Gbit Ethernet
-
- Document Author
- ---------------
-
- Maxime Coquelin <maxime.coquelin@st.com>, (c) 2014 ST Microelectronics
diff --git a/Documentation/arm/sti/stih415-overview.rst b/Documentation/arm/sti/stih415-overview.rst
new file mode 100644
index 000000000000..b67452d610c4
--- /dev/null
+++ b/Documentation/arm/sti/stih415-overview.rst
@@ -0,0 +1,14 @@
+================
+STiH415 Overview
+================
+
+Introduction
+------------
+
+ The STiH415 is the next generation of HD, AVC set-top box processors
+ for satellite, cable, terrestrial and IP-STB markets.
+
+ Features:
+
+ - ARM Cortex-A9 1.0 GHz, dual-core CPU
+ - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih415-overview.txt b/Documentation/arm/sti/stih415-overview.txt
deleted file mode 100644
index 1383e33f265d..000000000000
--- a/Documentation/arm/sti/stih415-overview.txt
+++ /dev/null
@@ -1,12 +0,0 @@
- STiH415 Overview
- ================
-
-Introduction
-------------
-
- The STiH415 is the next generation of HD, AVC set-top box processors
- for satellite, cable, terrestrial and IP-STB markets.
-
- Features
- - ARM Cortex-A9 1.0 GHz, dual-core CPU
- - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih416-overview.rst b/Documentation/arm/sti/stih416-overview.rst
new file mode 100644
index 000000000000..93f17d74d8db
--- /dev/null
+++ b/Documentation/arm/sti/stih416-overview.rst
@@ -0,0 +1,13 @@
+================
+STiH416 Overview
+================
+
+Introduction
+------------
+
+ The STiH416 is the next generation of HD, AVC set-top box processors
+ for satellite, cable, terrestrial and IP-STB markets.
+
+ Features
+ - ARM Cortex-A9 1.2 GHz dual core CPU
+ - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih416-overview.txt b/Documentation/arm/sti/stih416-overview.txt
deleted file mode 100644
index 558444c201c6..000000000000
--- a/Documentation/arm/sti/stih416-overview.txt
+++ /dev/null
@@ -1,12 +0,0 @@
- STiH416 Overview
- ================
-
-Introduction
-------------
-
- The STiH416 is the next generation of HD, AVC set-top box processors
- for satellite, cable, terrestrial and IP-STB markets.
-
- Features
- - ARM Cortex-A9 1.2 GHz dual core CPU
- - SATA2x2,USB 2.0x3, PCIe, Gbit Ethernet MACx2
diff --git a/Documentation/arm/sti/stih418-overview.rst b/Documentation/arm/sti/stih418-overview.rst
new file mode 100644
index 000000000000..b563c1f4fe5a
--- /dev/null
+++ b/Documentation/arm/sti/stih418-overview.rst
@@ -0,0 +1,21 @@
+================
+STiH418 Overview
+================
+
+Introduction
+------------
+
+ The STiH418 is the new generation of SoC for UHDp60 set-top boxes
+ and server/connected client application for satellite, cable, terrestrial
+ and IP-STB markets.
+
+ Features
+ - ARM Cortex-A9 1.5 GHz quad core CPU (28nm)
+ - SATA2, USB 3.0, PCIe, Gbit Ethernet
+ - HEVC L5.1 Main 10
+ - VP9
+
+Document Author
+---------------
+
+ Maxime Coquelin <maxime.coquelin@st.com>, (c) 2015 ST Microelectronics
diff --git a/Documentation/arm/sti/stih418-overview.txt b/Documentation/arm/sti/stih418-overview.txt
deleted file mode 100644
index 1cd8fc80646d..000000000000
--- a/Documentation/arm/sti/stih418-overview.txt
+++ /dev/null
@@ -1,20 +0,0 @@
- STiH418 Overview
- ================
-
-Introduction
-------------
-
- The STiH418 is the new generation of SoC for UHDp60 set-top boxes
- and server/connected client application for satellite, cable, terrestrial
- and IP-STB markets.
-
- Features
- - ARM Cortex-A9 1.5 GHz quad core CPU (28nm)
- - SATA2, USB 3.0, PCIe, Gbit Ethernet
- - HEVC L5.1 Main 10
- - VP9
-
- Document Author
- ---------------
-
- Maxime Coquelin <maxime.coquelin@st.com>, (c) 2015 ST Microelectronics
diff --git a/Documentation/arm/stm32/overview.rst b/Documentation/arm/stm32/overview.rst
index f7e734153860..85cfc8410798 100644
--- a/Documentation/arm/stm32/overview.rst
+++ b/Documentation/arm/stm32/overview.rst
@@ -1,5 +1,3 @@
-:orphan:
-
========================
STM32 ARM Linux Overview
========================
diff --git a/Documentation/arm/stm32/stm32f429-overview.rst b/Documentation/arm/stm32/stm32f429-overview.rst
index 65bbb1c3b423..a7ebe8ea6697 100644
--- a/Documentation/arm/stm32/stm32f429-overview.rst
+++ b/Documentation/arm/stm32/stm32f429-overview.rst
@@ -1,5 +1,4 @@
-:orphan:
-
+==================
STM32F429 Overview
==================
@@ -23,6 +22,4 @@ Datasheet and reference manual are publicly available on ST website (STM32F429_)
.. _STM32F429: http://www.st.com/web/en/catalog/mmc/FM141/SC1169/SS1577/LN1806?ecmp=stm32f429-439_pron_pr-ces2014_nov2013
-:Authors:
-
-Maxime Coquelin <mcoquelin.stm32@gmail.com>
+:Authors: Maxime Coquelin <mcoquelin.stm32@gmail.com>
diff --git a/Documentation/arm/stm32/stm32f746-overview.rst b/Documentation/arm/stm32/stm32f746-overview.rst
index 42d593085015..78befddc7740 100644
--- a/Documentation/arm/stm32/stm32f746-overview.rst
+++ b/Documentation/arm/stm32/stm32f746-overview.rst
@@ -1,5 +1,4 @@
-:orphan:
-
+==================
STM32F746 Overview
==================
@@ -30,6 +29,4 @@ Datasheet and reference manual are publicly available on ST website (STM32F746_)
.. _STM32F746: http://www.st.com/content/st_com/en/products/microcontrollers/stm32-32-bit-arm-cortex-mcus/stm32f7-series/stm32f7x6/stm32f746ng.html
-:Authors:
-
-Alexandre Torgue <alexandre.torgue@st.com>
+:Authors: Alexandre Torgue <alexandre.torgue@st.com>
diff --git a/Documentation/arm/stm32/stm32f769-overview.rst b/Documentation/arm/stm32/stm32f769-overview.rst
index f6adac862b17..e482980ddf21 100644
--- a/Documentation/arm/stm32/stm32f769-overview.rst
+++ b/Documentation/arm/stm32/stm32f769-overview.rst
@@ -1,5 +1,4 @@
-:orphan:
-
+==================
STM32F769 Overview
==================
@@ -32,6 +31,4 @@ Datasheet and reference manual are publicly available on ST website (STM32F769_)
.. _STM32F769: http://www.st.com/content/st_com/en/products/microcontrollers/stm32-32-bit-arm-cortex-mcus/stm32-high-performance-mcus/stm32f7-series/stm32f7x9/stm32f769ni.html
-:Authors:
-
-Alexandre Torgue <alexandre.torgue@st.com>
+:Authors: Alexandre Torgue <alexandre.torgue@st.com>
diff --git a/Documentation/arm/stm32/stm32h743-overview.rst b/Documentation/arm/stm32/stm32h743-overview.rst
index c525835e7473..4e15f1a42730 100644
--- a/Documentation/arm/stm32/stm32h743-overview.rst
+++ b/Documentation/arm/stm32/stm32h743-overview.rst
@@ -1,5 +1,4 @@
-:orphan:
-
+==================
STM32H743 Overview
==================
@@ -31,6 +30,4 @@ Datasheet and reference manual are publicly available on ST website (STM32H743_)
.. _STM32H743: http://www.st.com/en/microcontrollers/stm32h7x3.html?querycriteria=productId=LN2033
-:Authors:
-
-Alexandre Torgue <alexandre.torgue@st.com>
+:Authors: Alexandre Torgue <alexandre.torgue@st.com>
diff --git a/Documentation/arm/stm32/stm32mp157-overview.rst b/Documentation/arm/stm32/stm32mp157-overview.rst
index 2c52cd020601..f62fdc8e7d8d 100644
--- a/Documentation/arm/stm32/stm32mp157-overview.rst
+++ b/Documentation/arm/stm32/stm32mp157-overview.rst
@@ -1,5 +1,4 @@
-:orphan:
-
+===================
STM32MP157 Overview
===================
diff --git a/Documentation/arm/sunxi.rst b/Documentation/arm/sunxi.rst
new file mode 100644
index 000000000000..b037428aee98
--- /dev/null
+++ b/Documentation/arm/sunxi.rst
@@ -0,0 +1,150 @@
+==================
+ARM Allwinner SoCs
+==================
+
+This document lists all the ARM Allwinner SoCs that are currently
+supported in mainline by the Linux kernel. This document will also
+provide links to documentation and/or datasheet for these SoCs.
+
+SunXi family
+------------
+ Linux kernel mach directory: arch/arm/mach-sunxi
+
+ Flavors:
+
+ * ARM926 based SoCs
+ - Allwinner F20 (sun3i)
+
+ * Not Supported
+
+ * ARM Cortex-A8 based SoCs
+ - Allwinner A10 (sun4i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A10/A10%20Datasheet%20-%20v1.21%20%282012-04-06%29.pdf
+ * User Manual
+
+ http://dl.linux-sunxi.org/A10/A10%20User%20Manual%20-%20v1.20%20%282012-04-09%2c%20DECRYPTED%29.pdf
+
+ - Allwinner A10s (sun5i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A10s/A10s%20Datasheet%20-%20v1.20%20%282012-03-27%29.pdf
+
+ - Allwinner A13 / R8 (sun5i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A13/A13%20Datasheet%20-%20v1.12%20%282012-03-29%29.pdf
+ * User Manual
+
+ http://dl.linux-sunxi.org/A13/A13%20User%20Manual%20-%20v1.2%20%282013-01-08%29.pdf
+
+ - Next Thing Co GR8 (sun5i)
+
+ * Single ARM Cortex-A7 based SoCs
+ - Allwinner V3s (sun8i)
+
+ * Datasheet
+
+ http://linux-sunxi.org/File:Allwinner_V3s_Datasheet_V1.0.pdf
+
+ * Dual ARM Cortex-A7 based SoCs
+ - Allwinner A20 (sun7i)
+
+ * User Manual
+
+ http://dl.linux-sunxi.org/A20/A20%20User%20Manual%202013-03-22.pdf
+
+ - Allwinner A23 (sun8i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A23/A23%20Datasheet%20V1.0%2020130830.pdf
+
+ * User Manual
+
+ http://dl.linux-sunxi.org/A23/A23%20User%20Manual%20V1.0%2020130830.pdf
+
+ * Quad ARM Cortex-A7 based SoCs
+ - Allwinner A31 (sun6i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20datasheet%20V1.3%2020131106.pdf
+
+ * User Manual
+
+ http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20user%20manual%20V1.1%2020130630.pdf
+
+ - Allwinner A31s (sun6i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20datasheet%20V1.3%2020131106.pdf
+
+ * User Manual
+
+ http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20User%20Manual%20%20V1.0%2020130322.pdf
+
+ - Allwinner A33 (sun8i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A33/A33%20Datasheet%20release%201.1.pdf
+
+ * User Manual
+
+ http://dl.linux-sunxi.org/A33/A33%20user%20manual%20release%201.1.pdf
+
+ - Allwinner H2+ (sun8i)
+
+ * No document available now, but is known to be working properly with
+ H3 drivers and memory map.
+
+ - Allwinner H3 (sun8i)
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/H3/Allwinner_H3_Datasheet_V1.0.pdf
+
+ - Allwinner R40 (sun8i)
+
+ * Datasheet
+
+ https://github.com/tinalinux/docs/raw/r40-v1.y/R40_Datasheet_V1.0.pdf
+
+ * User Manual
+
+ https://github.com/tinalinux/docs/raw/r40-v1.y/Allwinner_R40_User_Manual_V1.0.pdf
+
+ * Quad ARM Cortex-A15, Quad ARM Cortex-A7 based SoCs
+ - Allwinner A80
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A80/A80_Datasheet_Revision_1.0_0404.pdf
+
+ * Octa ARM Cortex-A7 based SoCs
+ - Allwinner A83T
+
+ * Datasheet
+
+ https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_Datasheet_v1.3_20150510.pdf
+
+ * User Manual
+
+ https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_User_Manual_v1.5.1_20150513.pdf
+
+ * Quad ARM Cortex-A53 based SoCs
+ - Allwinner A64
+
+ * Datasheet
+
+ http://dl.linux-sunxi.org/A64/A64_Datasheet_V1.1.pdf
+
+ * User Manual
+
+ http://dl.linux-sunxi.org/A64/Allwinner%20A64%20User%20Manual%20v1.0.pdf
diff --git a/Documentation/arm/sunxi/README b/Documentation/arm/sunxi/README
deleted file mode 100644
index f8efc21998bf..000000000000
--- a/Documentation/arm/sunxi/README
+++ /dev/null
@@ -1,102 +0,0 @@
-ARM Allwinner SoCs
-==================
-
-This document lists all the ARM Allwinner SoCs that are currently
-supported in mainline by the Linux kernel. This document will also
-provide links to documentation and/or datasheet for these SoCs.
-
-SunXi family
-------------
- Linux kernel mach directory: arch/arm/mach-sunxi
-
- Flavors:
- * ARM926 based SoCs
- - Allwinner F20 (sun3i)
- + Not Supported
-
- * ARM Cortex-A8 based SoCs
- - Allwinner A10 (sun4i)
- + Datasheet
- http://dl.linux-sunxi.org/A10/A10%20Datasheet%20-%20v1.21%20%282012-04-06%29.pdf
- + User Manual
- http://dl.linux-sunxi.org/A10/A10%20User%20Manual%20-%20v1.20%20%282012-04-09%2c%20DECRYPTED%29.pdf
-
- - Allwinner A10s (sun5i)
- + Datasheet
- http://dl.linux-sunxi.org/A10s/A10s%20Datasheet%20-%20v1.20%20%282012-03-27%29.pdf
-
- - Allwinner A13 / R8 (sun5i)
- + Datasheet
- http://dl.linux-sunxi.org/A13/A13%20Datasheet%20-%20v1.12%20%282012-03-29%29.pdf
- + User Manual
- http://dl.linux-sunxi.org/A13/A13%20User%20Manual%20-%20v1.2%20%282013-01-08%29.pdf
-
- - Next Thing Co GR8 (sun5i)
-
- * Single ARM Cortex-A7 based SoCs
- - Allwinner V3s (sun8i)
- + Datasheet
- http://linux-sunxi.org/File:Allwinner_V3s_Datasheet_V1.0.pdf
-
- * Dual ARM Cortex-A7 based SoCs
- - Allwinner A20 (sun7i)
- + User Manual
- http://dl.linux-sunxi.org/A20/A20%20User%20Manual%202013-03-22.pdf
-
- - Allwinner A23 (sun8i)
- + Datasheet
- http://dl.linux-sunxi.org/A23/A23%20Datasheet%20V1.0%2020130830.pdf
- + User Manual
- http://dl.linux-sunxi.org/A23/A23%20User%20Manual%20V1.0%2020130830.pdf
-
- * Quad ARM Cortex-A7 based SoCs
- - Allwinner A31 (sun6i)
- + Datasheet
- http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20datasheet%20V1.3%2020131106.pdf
- + User Manual
- http://dl.linux-sunxi.org/A31/A3x_release_document/A31/IC/A31%20user%20manual%20V1.1%2020130630.pdf
-
- - Allwinner A31s (sun6i)
- + Datasheet
- http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20datasheet%20V1.3%2020131106.pdf
- + User Manual
- http://dl.linux-sunxi.org/A31/A3x_release_document/A31s/IC/A31s%20User%20Manual%20%20V1.0%2020130322.pdf
-
- - Allwinner A33 (sun8i)
- + Datasheet
- http://dl.linux-sunxi.org/A33/A33%20Datasheet%20release%201.1.pdf
- + User Manual
- http://dl.linux-sunxi.org/A33/A33%20user%20manual%20release%201.1.pdf
-
- - Allwinner H2+ (sun8i)
- + No document available now, but is known to be working properly with
- H3 drivers and memory map.
-
- - Allwinner H3 (sun8i)
- + Datasheet
- http://dl.linux-sunxi.org/H3/Allwinner_H3_Datasheet_V1.0.pdf
-
- - Allwinner R40 (sun8i)
- + Datasheet
- https://github.com/tinalinux/docs/raw/r40-v1.y/R40_Datasheet_V1.0.pdf
- + User Manual
- https://github.com/tinalinux/docs/raw/r40-v1.y/Allwinner_R40_User_Manual_V1.0.pdf
-
- * Quad ARM Cortex-A15, Quad ARM Cortex-A7 based SoCs
- - Allwinner A80
- + Datasheet
- http://dl.linux-sunxi.org/A80/A80_Datasheet_Revision_1.0_0404.pdf
-
- * Octa ARM Cortex-A7 based SoCs
- - Allwinner A83T
- + Datasheet
- https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_Datasheet_v1.3_20150510.pdf
- + User Manual
- https://github.com/allwinner-zh/documents/raw/master/A83T/A83T_User_Manual_v1.5.1_20150513.pdf
-
- * Quad ARM Cortex-A53 based SoCs
- - Allwinner A64
- + Datasheet
- http://dl.linux-sunxi.org/A64/A64_Datasheet_V1.1.pdf
- + User Manual
- http://dl.linux-sunxi.org/A64/Allwinner%20A64%20User%20Manual%20v1.0.pdf
diff --git a/Documentation/arm/sunxi/clocks.rst b/Documentation/arm/sunxi/clocks.rst
new file mode 100644
index 000000000000..23bd03f3e21f
--- /dev/null
+++ b/Documentation/arm/sunxi/clocks.rst
@@ -0,0 +1,57 @@
+=======================================================
+Frequently asked questions about the sunxi clock system
+=======================================================
+
+This document contains useful bits of information that people tend to ask
+about the sunxi clock system, as well as accompanying ASCII art when adequate.
+
+Q: Why is the main 24MHz oscillator gatable? Wouldn't that break the
+ system?
+
+A: The 24MHz oscillator allows gating to save power. Indeed, if gated
+ carelessly the system would stop functioning, but with the right
+ steps, one can gate it and keep the system running. Consider this
+ simplified suspend example:
+
+ While the system is operational, you would see something like::
+
+ 24MHz 32kHz
+ |
+ PLL1
+ \
+ \_ CPU Mux
+ |
+ [CPU]
+
+ When you are about to suspend, you switch the CPU Mux to the 32kHz
+ oscillator::
+
+ 24Mhz 32kHz
+ | |
+ PLL1 |
+ /
+ CPU Mux _/
+ |
+ [CPU]
+
+ Finally you can gate the main oscillator::
+
+ 32kHz
+ |
+ |
+ /
+ CPU Mux _/
+ |
+ [CPU]
+
+Q: Were can I learn more about the sunxi clocks?
+
+A: The linux-sunxi wiki contains a page documenting the clock registers,
+ you can find it at
+
+ http://linux-sunxi.org/A10/CCM
+
+ The authoritative source for information at this time is the ccmu driver
+ released by Allwinner, you can find it at
+
+ https://github.com/linux-sunxi/linux-sunxi/tree/sunxi-3.0/arch/arm/mach-sun4i/clock/ccmu
diff --git a/Documentation/arm/sunxi/clocks.txt b/Documentation/arm/sunxi/clocks.txt
deleted file mode 100644
index e09a88aa3136..000000000000
--- a/Documentation/arm/sunxi/clocks.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Frequently asked questions about the sunxi clock system
-=======================================================
-
-This document contains useful bits of information that people tend to ask
-about the sunxi clock system, as well as accompanying ASCII art when adequate.
-
-Q: Why is the main 24MHz oscillator gatable? Wouldn't that break the
- system?
-
-A: The 24MHz oscillator allows gating to save power. Indeed, if gated
- carelessly the system would stop functioning, but with the right
- steps, one can gate it and keep the system running. Consider this
- simplified suspend example:
-
- While the system is operational, you would see something like
-
- 24MHz 32kHz
- |
- PLL1
- \
- \_ CPU Mux
- |
- [CPU]
-
- When you are about to suspend, you switch the CPU Mux to the 32kHz
- oscillator:
-
- 24Mhz 32kHz
- | |
- PLL1 |
- /
- CPU Mux _/
- |
- [CPU]
-
- Finally you can gate the main oscillator
-
- 32kHz
- |
- |
- /
- CPU Mux _/
- |
- [CPU]
-
-Q: Were can I learn more about the sunxi clocks?
-
-A: The linux-sunxi wiki contains a page documenting the clock registers,
- you can find it at
-
- http://linux-sunxi.org/A10/CCM
-
- The authoritative source for information at this time is the ccmu driver
- released by Allwinner, you can find it at
-
- https://github.com/linux-sunxi/linux-sunxi/tree/sunxi-3.0/arch/arm/mach-sun4i/clock/ccmu
diff --git a/Documentation/arm/swp_emulation b/Documentation/arm/swp_emulation
deleted file mode 100644
index af903d22fd93..000000000000
--- a/Documentation/arm/swp_emulation
+++ /dev/null
@@ -1,27 +0,0 @@
-Software emulation of deprecated SWP instruction (CONFIG_SWP_EMULATE)
----------------------------------------------------------------------
-
-ARMv6 architecture deprecates use of the SWP/SWPB instructions, and recommeds
-moving to the load-locked/store-conditional instructions LDREX and STREX.
-
-ARMv7 multiprocessing extensions introduce the ability to disable these
-instructions, triggering an undefined instruction exception when executed.
-Trapped instructions are emulated using an LDREX/STREX or LDREXB/STREXB
-sequence. If a memory access fault (an abort) occurs, a segmentation fault is
-signalled to the triggering process.
-
-/proc/cpu/swp_emulation holds some statistics/information, including the PID of
-the last process to trigger the emulation to be invocated. For example:
----
-Emulated SWP: 12
-Emulated SWPB: 0
-Aborted SWP{B}: 1
-Last process: 314
----
-
-NOTE: when accessing uncached shared regions, LDREX/STREX rely on an external
-transaction monitoring block called a global monitor to maintain update
-atomicity. If your system does not implement a global monitor, this option can
-cause programs that perform SWP operations to uncached memory to deadlock, as
-the STREX operation will always fail.
-
diff --git a/Documentation/arm/swp_emulation.rst b/Documentation/arm/swp_emulation.rst
new file mode 100644
index 000000000000..6a608a9c3715
--- /dev/null
+++ b/Documentation/arm/swp_emulation.rst
@@ -0,0 +1,27 @@
+Software emulation of deprecated SWP instruction (CONFIG_SWP_EMULATE)
+---------------------------------------------------------------------
+
+ARMv6 architecture deprecates use of the SWP/SWPB instructions, and recommeds
+moving to the load-locked/store-conditional instructions LDREX and STREX.
+
+ARMv7 multiprocessing extensions introduce the ability to disable these
+instructions, triggering an undefined instruction exception when executed.
+Trapped instructions are emulated using an LDREX/STREX or LDREXB/STREXB
+sequence. If a memory access fault (an abort) occurs, a segmentation fault is
+signalled to the triggering process.
+
+/proc/cpu/swp_emulation holds some statistics/information, including the PID of
+the last process to trigger the emulation to be invocated. For example::
+
+ Emulated SWP: 12
+ Emulated SWPB: 0
+ Aborted SWP{B}: 1
+ Last process: 314
+
+
+NOTE:
+ when accessing uncached shared regions, LDREX/STREX rely on an external
+ transaction monitoring block called a global monitor to maintain update
+ atomicity. If your system does not implement a global monitor, this option can
+ cause programs that perform SWP operations to uncached memory to deadlock, as
+ the STREX operation will always fail.
diff --git a/Documentation/arm/tcm.rst b/Documentation/arm/tcm.rst
new file mode 100644
index 000000000000..effd9c7bc968
--- /dev/null
+++ b/Documentation/arm/tcm.rst
@@ -0,0 +1,161 @@
+==================================================
+ARM TCM (Tightly-Coupled Memory) handling in Linux
+==================================================
+
+Written by Linus Walleij <linus.walleij@stericsson.com>
+
+Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory).
+This is usually just a few (4-64) KiB of RAM inside the ARM
+processor.
+
+Due to being embedded inside the CPU The TCM has a
+Harvard-architecture, so there is an ITCM (instruction TCM)
+and a DTCM (data TCM). The DTCM can not contain any
+instructions, but the ITCM can actually contain data.
+The size of DTCM or ITCM is minimum 4KiB so the typical
+minimum configuration is 4KiB ITCM and 4KiB DTCM.
+
+ARM CPU:s have special registers to read out status, physical
+location and size of TCM memories. arch/arm/include/asm/cputype.h
+defines a CPUID_TCM register that you can read out from the
+system control coprocessor. Documentation from ARM can be found
+at http://infocenter.arm.com, search for "TCM Status Register"
+to see documents for all CPUs. Reading this register you can
+determine if ITCM (bits 1-0) and/or DTCM (bit 17-16) is present
+in the machine.
+
+There is further a TCM region register (search for "TCM Region
+Registers" at the ARM site) that can report and modify the location
+size of TCM memories at runtime. This is used to read out and modify
+TCM location and size. Notice that this is not a MMU table: you
+actually move the physical location of the TCM around. At the
+place you put it, it will mask any underlying RAM from the
+CPU so it is usually wise not to overlap any physical RAM with
+the TCM.
+
+The TCM memory can then be remapped to another address again using
+the MMU, but notice that the TCM if often used in situations where
+the MMU is turned off. To avoid confusion the current Linux
+implementation will map the TCM 1 to 1 from physical to virtual
+memory in the location specified by the kernel. Currently Linux
+will map ITCM to 0xfffe0000 and on, and DTCM to 0xfffe8000 and
+on, supporting a maximum of 32KiB of ITCM and 32KiB of DTCM.
+
+Newer versions of the region registers also support dividing these
+TCMs in two separate banks, so for example an 8KiB ITCM is divided
+into two 4KiB banks with its own control registers. The idea is to
+be able to lock and hide one of the banks for use by the secure
+world (TrustZone).
+
+TCM is used for a few things:
+
+- FIQ and other interrupt handlers that need deterministic
+ timing and cannot wait for cache misses.
+
+- Idle loops where all external RAM is set to self-refresh
+ retention mode, so only on-chip RAM is accessible by
+ the CPU and then we hang inside ITCM waiting for an
+ interrupt.
+
+- Other operations which implies shutting off or reconfiguring
+ the external RAM controller.
+
+There is an interface for using TCM on the ARM architecture
+in <asm/tcm.h>. Using this interface it is possible to:
+
+- Define the physical address and size of ITCM and DTCM.
+
+- Tag functions to be compiled into ITCM.
+
+- Tag data and constants to be allocated to DTCM and ITCM.
+
+- Have the remaining TCM RAM added to a special
+ allocation pool with gen_pool_create() and gen_pool_add()
+ and provice tcm_alloc() and tcm_free() for this
+ memory. Such a heap is great for things like saving
+ device state when shutting off device power domains.
+
+A machine that has TCM memory shall select HAVE_TCM from
+arch/arm/Kconfig for itself. Code that needs to use TCM shall
+#include <asm/tcm.h>
+
+Functions to go into itcm can be tagged like this:
+int __tcmfunc foo(int bar);
+
+Since these are marked to become long_calls and you may want
+to have functions called locally inside the TCM without
+wasting space, there is also the __tcmlocalfunc prefix that
+will make the call relative.
+
+Variables to go into dtcm can be tagged like this::
+
+ int __tcmdata foo;
+
+Constants can be tagged like this::
+
+ int __tcmconst foo;
+
+To put assembler into TCM just use::
+
+ .section ".tcm.text" or .section ".tcm.data"
+
+respectively.
+
+Example code::
+
+ #include <asm/tcm.h>
+
+ /* Uninitialized data */
+ static u32 __tcmdata tcmvar;
+ /* Initialized data */
+ static u32 __tcmdata tcmassigned = 0x2BADBABEU;
+ /* Constant */
+ static const u32 __tcmconst tcmconst = 0xCAFEBABEU;
+
+ static void __tcmlocalfunc tcm_to_tcm(void)
+ {
+ int i;
+ for (i = 0; i < 100; i++)
+ tcmvar ++;
+ }
+
+ static void __tcmfunc hello_tcm(void)
+ {
+ /* Some abstract code that runs in ITCM */
+ int i;
+ for (i = 0; i < 100; i++) {
+ tcmvar ++;
+ }
+ tcm_to_tcm();
+ }
+
+ static void __init test_tcm(void)
+ {
+ u32 *tcmem;
+ int i;
+
+ hello_tcm();
+ printk("Hello TCM executed from ITCM RAM\n");
+
+ printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar);
+ tcmvar = 0xDEADBEEFU;
+ printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar);
+
+ printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned);
+
+ printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst);
+
+ /* Allocate some TCM memory from the pool */
+ tcmem = tcm_alloc(20);
+ if (tcmem) {
+ printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem);
+ tcmem[0] = 0xDEADBEEFU;
+ tcmem[1] = 0x2BADBABEU;
+ tcmem[2] = 0xCAFEBABEU;
+ tcmem[3] = 0xDEADBEEFU;
+ tcmem[4] = 0x2BADBABEU;
+ for (i = 0; i < 5; i++)
+ printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]);
+ tcm_free(tcmem, 20);
+ }
+ }
diff --git a/Documentation/arm/tcm.txt b/Documentation/arm/tcm.txt
deleted file mode 100644
index 7c15871c1885..000000000000
--- a/Documentation/arm/tcm.txt
+++ /dev/null
@@ -1,155 +0,0 @@
-ARM TCM (Tightly-Coupled Memory) handling in Linux
-----
-Written by Linus Walleij <linus.walleij@stericsson.com>
-
-Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory).
-This is usually just a few (4-64) KiB of RAM inside the ARM
-processor.
-
-Due to being embedded inside the CPU The TCM has a
-Harvard-architecture, so there is an ITCM (instruction TCM)
-and a DTCM (data TCM). The DTCM can not contain any
-instructions, but the ITCM can actually contain data.
-The size of DTCM or ITCM is minimum 4KiB so the typical
-minimum configuration is 4KiB ITCM and 4KiB DTCM.
-
-ARM CPU:s have special registers to read out status, physical
-location and size of TCM memories. arch/arm/include/asm/cputype.h
-defines a CPUID_TCM register that you can read out from the
-system control coprocessor. Documentation from ARM can be found
-at http://infocenter.arm.com, search for "TCM Status Register"
-to see documents for all CPUs. Reading this register you can
-determine if ITCM (bits 1-0) and/or DTCM (bit 17-16) is present
-in the machine.
-
-There is further a TCM region register (search for "TCM Region
-Registers" at the ARM site) that can report and modify the location
-size of TCM memories at runtime. This is used to read out and modify
-TCM location and size. Notice that this is not a MMU table: you
-actually move the physical location of the TCM around. At the
-place you put it, it will mask any underlying RAM from the
-CPU so it is usually wise not to overlap any physical RAM with
-the TCM.
-
-The TCM memory can then be remapped to another address again using
-the MMU, but notice that the TCM if often used in situations where
-the MMU is turned off. To avoid confusion the current Linux
-implementation will map the TCM 1 to 1 from physical to virtual
-memory in the location specified by the kernel. Currently Linux
-will map ITCM to 0xfffe0000 and on, and DTCM to 0xfffe8000 and
-on, supporting a maximum of 32KiB of ITCM and 32KiB of DTCM.
-
-Newer versions of the region registers also support dividing these
-TCMs in two separate banks, so for example an 8KiB ITCM is divided
-into two 4KiB banks with its own control registers. The idea is to
-be able to lock and hide one of the banks for use by the secure
-world (TrustZone).
-
-TCM is used for a few things:
-
-- FIQ and other interrupt handlers that need deterministic
- timing and cannot wait for cache misses.
-
-- Idle loops where all external RAM is set to self-refresh
- retention mode, so only on-chip RAM is accessible by
- the CPU and then we hang inside ITCM waiting for an
- interrupt.
-
-- Other operations which implies shutting off or reconfiguring
- the external RAM controller.
-
-There is an interface for using TCM on the ARM architecture
-in <asm/tcm.h>. Using this interface it is possible to:
-
-- Define the physical address and size of ITCM and DTCM.
-
-- Tag functions to be compiled into ITCM.
-
-- Tag data and constants to be allocated to DTCM and ITCM.
-
-- Have the remaining TCM RAM added to a special
- allocation pool with gen_pool_create() and gen_pool_add()
- and provice tcm_alloc() and tcm_free() for this
- memory. Such a heap is great for things like saving
- device state when shutting off device power domains.
-
-A machine that has TCM memory shall select HAVE_TCM from
-arch/arm/Kconfig for itself. Code that needs to use TCM shall
-#include <asm/tcm.h>
-
-Functions to go into itcm can be tagged like this:
-int __tcmfunc foo(int bar);
-
-Since these are marked to become long_calls and you may want
-to have functions called locally inside the TCM without
-wasting space, there is also the __tcmlocalfunc prefix that
-will make the call relative.
-
-Variables to go into dtcm can be tagged like this:
-int __tcmdata foo;
-
-Constants can be tagged like this:
-int __tcmconst foo;
-
-To put assembler into TCM just use
-.section ".tcm.text" or .section ".tcm.data"
-respectively.
-
-Example code:
-
-#include <asm/tcm.h>
-
-/* Uninitialized data */
-static u32 __tcmdata tcmvar;
-/* Initialized data */
-static u32 __tcmdata tcmassigned = 0x2BADBABEU;
-/* Constant */
-static const u32 __tcmconst tcmconst = 0xCAFEBABEU;
-
-static void __tcmlocalfunc tcm_to_tcm(void)
-{
- int i;
- for (i = 0; i < 100; i++)
- tcmvar ++;
-}
-
-static void __tcmfunc hello_tcm(void)
-{
- /* Some abstract code that runs in ITCM */
- int i;
- for (i = 0; i < 100; i++) {
- tcmvar ++;
- }
- tcm_to_tcm();
-}
-
-static void __init test_tcm(void)
-{
- u32 *tcmem;
- int i;
-
- hello_tcm();
- printk("Hello TCM executed from ITCM RAM\n");
-
- printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar);
- tcmvar = 0xDEADBEEFU;
- printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar);
-
- printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned);
-
- printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst);
-
- /* Allocate some TCM memory from the pool */
- tcmem = tcm_alloc(20);
- if (tcmem) {
- printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem);
- tcmem[0] = 0xDEADBEEFU;
- tcmem[1] = 0x2BADBABEU;
- tcmem[2] = 0xCAFEBABEU;
- tcmem[3] = 0xDEADBEEFU;
- tcmem[4] = 0x2BADBABEU;
- for (i = 0; i < 5; i++)
- printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]);
- tcm_free(tcmem, 20);
- }
-}
diff --git a/Documentation/arm/uefi.rst b/Documentation/arm/uefi.rst
new file mode 100644
index 000000000000..f868330df6be
--- /dev/null
+++ b/Documentation/arm/uefi.rst
@@ -0,0 +1,67 @@
+================================================
+The Unified Extensible Firmware Interface (UEFI)
+================================================
+
+UEFI, the Unified Extensible Firmware Interface, is a specification
+governing the behaviours of compatible firmware interfaces. It is
+maintained by the UEFI Forum - http://www.uefi.org/.
+
+UEFI is an evolution of its predecessor 'EFI', so the terms EFI and
+UEFI are used somewhat interchangeably in this document and associated
+source code. As a rule, anything new uses 'UEFI', whereas 'EFI' refers
+to legacy code or specifications.
+
+UEFI support in Linux
+=====================
+Booting on a platform with firmware compliant with the UEFI specification
+makes it possible for the kernel to support additional features:
+
+- UEFI Runtime Services
+- Retrieving various configuration information through the standardised
+ interface of UEFI configuration tables. (ACPI, SMBIOS, ...)
+
+For actually enabling [U]EFI support, enable:
+
+- CONFIG_EFI=y
+- CONFIG_EFI_VARS=y or m
+
+The implementation depends on receiving information about the UEFI environment
+in a Flattened Device Tree (FDT) - so is only available with CONFIG_OF.
+
+UEFI stub
+=========
+The "stub" is a feature that extends the Image/zImage into a valid UEFI
+PE/COFF executable, including a loader application that makes it possible to
+load the kernel directly from the UEFI shell, boot menu, or one of the
+lightweight bootloaders like Gummiboot or rEFInd.
+
+The kernel image built with stub support remains a valid kernel image for
+booting in non-UEFI environments.
+
+UEFI kernel support on ARM
+==========================
+UEFI kernel support on the ARM architectures (arm and arm64) is only available
+when boot is performed through the stub.
+
+When booting in UEFI mode, the stub deletes any memory nodes from a provided DT.
+Instead, the kernel reads the UEFI memory map.
+
+The stub populates the FDT /chosen node with (and the kernel scans for) the
+following parameters:
+
+========================== ====== ===========================================
+Name Size Description
+========================== ====== ===========================================
+linux,uefi-system-table 64-bit Physical address of the UEFI System Table.
+
+linux,uefi-mmap-start 64-bit Physical address of the UEFI memory map,
+ populated by the UEFI GetMemoryMap() call.
+
+linux,uefi-mmap-size 32-bit Size in bytes of the UEFI memory map
+ pointed to in previous entry.
+
+linux,uefi-mmap-desc-size 32-bit Size in bytes of each entry in the UEFI
+ memory map.
+
+linux,uefi-mmap-desc-ver 32-bit Version of the mmap descriptor format.
+========================== ====== ===========================================
diff --git a/Documentation/arm/uefi.txt b/Documentation/arm/uefi.txt
deleted file mode 100644
index 6543a0adea8a..000000000000
--- a/Documentation/arm/uefi.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-UEFI, the Unified Extensible Firmware Interface, is a specification
-governing the behaviours of compatible firmware interfaces. It is
-maintained by the UEFI Forum - http://www.uefi.org/.
-
-UEFI is an evolution of its predecessor 'EFI', so the terms EFI and
-UEFI are used somewhat interchangeably in this document and associated
-source code. As a rule, anything new uses 'UEFI', whereas 'EFI' refers
-to legacy code or specifications.
-
-UEFI support in Linux
-=====================
-Booting on a platform with firmware compliant with the UEFI specification
-makes it possible for the kernel to support additional features:
-- UEFI Runtime Services
-- Retrieving various configuration information through the standardised
- interface of UEFI configuration tables. (ACPI, SMBIOS, ...)
-
-For actually enabling [U]EFI support, enable:
-- CONFIG_EFI=y
-- CONFIG_EFI_VARS=y or m
-
-The implementation depends on receiving information about the UEFI environment
-in a Flattened Device Tree (FDT) - so is only available with CONFIG_OF.
-
-UEFI stub
-=========
-The "stub" is a feature that extends the Image/zImage into a valid UEFI
-PE/COFF executable, including a loader application that makes it possible to
-load the kernel directly from the UEFI shell, boot menu, or one of the
-lightweight bootloaders like Gummiboot or rEFInd.
-
-The kernel image built with stub support remains a valid kernel image for
-booting in non-UEFI environments.
-
-UEFI kernel support on ARM
-==========================
-UEFI kernel support on the ARM architectures (arm and arm64) is only available
-when boot is performed through the stub.
-
-When booting in UEFI mode, the stub deletes any memory nodes from a provided DT.
-Instead, the kernel reads the UEFI memory map.
-
-The stub populates the FDT /chosen node with (and the kernel scans for) the
-following parameters:
-________________________________________________________________________________
-Name | Size | Description
-================================================================================
-linux,uefi-system-table | 64-bit | Physical address of the UEFI System Table.
---------------------------------------------------------------------------------
-linux,uefi-mmap-start | 64-bit | Physical address of the UEFI memory map,
- | | populated by the UEFI GetMemoryMap() call.
---------------------------------------------------------------------------------
-linux,uefi-mmap-size | 32-bit | Size in bytes of the UEFI memory map
- | | pointed to in previous entry.
---------------------------------------------------------------------------------
-linux,uefi-mmap-desc-size | 32-bit | Size in bytes of each entry in the UEFI
- | | memory map.
---------------------------------------------------------------------------------
-linux,uefi-mmap-desc-ver | 32-bit | Version of the mmap descriptor format.
---------------------------------------------------------------------------------
diff --git a/Documentation/arm/vfp/release-notes.rst b/Documentation/arm/vfp/release-notes.rst
new file mode 100644
index 000000000000..c6b04937cee3
--- /dev/null
+++ b/Documentation/arm/vfp/release-notes.rst
@@ -0,0 +1,57 @@
+===============================================
+Release notes for Linux Kernel VFP support code
+===============================================
+
+Date: 20 May 2004
+
+Author: Russell King
+
+This is the first release of the Linux Kernel VFP support code. It
+provides support for the exceptions bounced from VFP hardware found
+on ARM926EJ-S.
+
+This release has been validated against the SoftFloat-2b library by
+John R. Hauser using the TestFloat-2a test suite. Details of this
+library and test suite can be found at:
+
+ http://www.jhauser.us/arithmetic/SoftFloat.html
+
+The operations which have been tested with this package are:
+
+ - fdiv
+ - fsub
+ - fadd
+ - fmul
+ - fcmp
+ - fcmpe
+ - fcvtd
+ - fcvts
+ - fsito
+ - ftosi
+ - fsqrt
+
+All the above pass softfloat tests with the following exceptions:
+
+- fadd/fsub shows some differences in the handling of +0 / -0 results
+ when input operands differ in signs.
+- the handling of underflow exceptions is slightly different. If a
+ result underflows before rounding, but becomes a normalised number
+ after rounding, we do not signal an underflow exception.
+
+Other operations which have been tested by basic assembly-only tests
+are:
+
+ - fcpy
+ - fabs
+ - fneg
+ - ftoui
+ - ftosiz
+ - ftouiz
+
+The combination operations have not been tested:
+
+ - fmac
+ - fnmac
+ - fmsc
+ - fnmsc
+ - fnmul
diff --git a/Documentation/arm/vlocks.rst b/Documentation/arm/vlocks.rst
new file mode 100644
index 000000000000..a40a1742110b
--- /dev/null
+++ b/Documentation/arm/vlocks.rst
@@ -0,0 +1,212 @@
+======================================
+vlocks for Bare-Metal Mutual Exclusion
+======================================
+
+Voting Locks, or "vlocks" provide a simple low-level mutual exclusion
+mechanism, with reasonable but minimal requirements on the memory
+system.
+
+These are intended to be used to coordinate critical activity among CPUs
+which are otherwise non-coherent, in situations where the hardware
+provides no other mechanism to support this and ordinary spinlocks
+cannot be used.
+
+
+vlocks make use of the atomicity provided by the memory system for
+writes to a single memory location. To arbitrate, every CPU "votes for
+itself", by storing a unique number to a common memory location. The
+final value seen in that memory location when all the votes have been
+cast identifies the winner.
+
+In order to make sure that the election produces an unambiguous result
+in finite time, a CPU will only enter the election in the first place if
+no winner has been chosen and the election does not appear to have
+started yet.
+
+
+Algorithm
+---------
+
+The easiest way to explain the vlocks algorithm is with some pseudo-code::
+
+
+ int currently_voting[NR_CPUS] = { 0, };
+ int last_vote = -1; /* no votes yet */
+
+ bool vlock_trylock(int this_cpu)
+ {
+ /* signal our desire to vote */
+ currently_voting[this_cpu] = 1;
+ if (last_vote != -1) {
+ /* someone already volunteered himself */
+ currently_voting[this_cpu] = 0;
+ return false; /* not ourself */
+ }
+
+ /* let's suggest ourself */
+ last_vote = this_cpu;
+ currently_voting[this_cpu] = 0;
+
+ /* then wait until everyone else is done voting */
+ for_each_cpu(i) {
+ while (currently_voting[i] != 0)
+ /* wait */;
+ }
+
+ /* result */
+ if (last_vote == this_cpu)
+ return true; /* we won */
+ return false;
+ }
+
+ bool vlock_unlock(void)
+ {
+ last_vote = -1;
+ }
+
+
+The currently_voting[] array provides a way for the CPUs to determine
+whether an election is in progress, and plays a role analogous to the
+"entering" array in Lamport's bakery algorithm [1].
+
+However, once the election has started, the underlying memory system
+atomicity is used to pick the winner. This avoids the need for a static
+priority rule to act as a tie-breaker, or any counters which could
+overflow.
+
+As long as the last_vote variable is globally visible to all CPUs, it
+will contain only one value that won't change once every CPU has cleared
+its currently_voting flag.
+
+
+Features and limitations
+------------------------
+
+ * vlocks are not intended to be fair. In the contended case, it is the
+ _last_ CPU which attempts to get the lock which will be most likely
+ to win.
+
+ vlocks are therefore best suited to situations where it is necessary
+ to pick a unique winner, but it does not matter which CPU actually
+ wins.
+
+ * Like other similar mechanisms, vlocks will not scale well to a large
+ number of CPUs.
+
+ vlocks can be cascaded in a voting hierarchy to permit better scaling
+ if necessary, as in the following hypothetical example for 4096 CPUs::
+
+ /* first level: local election */
+ my_town = towns[(this_cpu >> 4) & 0xf];
+ I_won = vlock_trylock(my_town, this_cpu & 0xf);
+ if (I_won) {
+ /* we won the town election, let's go for the state */
+ my_state = states[(this_cpu >> 8) & 0xf];
+ I_won = vlock_lock(my_state, this_cpu & 0xf));
+ if (I_won) {
+ /* and so on */
+ I_won = vlock_lock(the_whole_country, this_cpu & 0xf];
+ if (I_won) {
+ /* ... */
+ }
+ vlock_unlock(the_whole_country);
+ }
+ vlock_unlock(my_state);
+ }
+ vlock_unlock(my_town);
+
+
+ARM implementation
+------------------
+
+The current ARM implementation [2] contains some optimisations beyond
+the basic algorithm:
+
+ * By packing the members of the currently_voting array close together,
+ we can read the whole array in one transaction (providing the number
+ of CPUs potentially contending the lock is small enough). This
+ reduces the number of round-trips required to external memory.
+
+ In the ARM implementation, this means that we can use a single load
+ and comparison::
+
+ LDR Rt, [Rn]
+ CMP Rt, #0
+
+ ...in place of code equivalent to::
+
+ LDRB Rt, [Rn]
+ CMP Rt, #0
+ LDRBEQ Rt, [Rn, #1]
+ CMPEQ Rt, #0
+ LDRBEQ Rt, [Rn, #2]
+ CMPEQ Rt, #0
+ LDRBEQ Rt, [Rn, #3]
+ CMPEQ Rt, #0
+
+ This cuts down on the fast-path latency, as well as potentially
+ reducing bus contention in contended cases.
+
+ The optimisation relies on the fact that the ARM memory system
+ guarantees coherency between overlapping memory accesses of
+ different sizes, similarly to many other architectures. Note that
+ we do not care which element of currently_voting appears in which
+ bits of Rt, so there is no need to worry about endianness in this
+ optimisation.
+
+ If there are too many CPUs to read the currently_voting array in
+ one transaction then multiple transations are still required. The
+ implementation uses a simple loop of word-sized loads for this
+ case. The number of transactions is still fewer than would be
+ required if bytes were loaded individually.
+
+
+ In principle, we could aggregate further by using LDRD or LDM, but
+ to keep the code simple this was not attempted in the initial
+ implementation.
+
+
+ * vlocks are currently only used to coordinate between CPUs which are
+ unable to enable their caches yet. This means that the
+ implementation removes many of the barriers which would be required
+ when executing the algorithm in cached memory.
+
+ packing of the currently_voting array does not work with cached
+ memory unless all CPUs contending the lock are cache-coherent, due
+ to cache writebacks from one CPU clobbering values written by other
+ CPUs. (Though if all the CPUs are cache-coherent, you should be
+ probably be using proper spinlocks instead anyway).
+
+
+ * The "no votes yet" value used for the last_vote variable is 0 (not
+ -1 as in the pseudocode). This allows statically-allocated vlocks
+ to be implicitly initialised to an unlocked state simply by putting
+ them in .bss.
+
+ An offset is added to each CPU's ID for the purpose of setting this
+ variable, so that no CPU uses the value 0 for its ID.
+
+
+Colophon
+--------
+
+Originally created and documented by Dave Martin for Linaro Limited, for
+use in ARM-based big.LITTLE platforms, with review and input gratefully
+received from Nicolas Pitre and Achin Gupta. Thanks to Nicolas for
+grabbing most of this text out of the relevant mail thread and writing
+up the pseudocode.
+
+Copyright (C) 2012-2013 Linaro Limited
+Distributed under the terms of Version 2 of the GNU General Public
+License, as defined in linux/COPYING.
+
+
+References
+----------
+
+[1] Lamport, L. "A New Solution of Dijkstra's Concurrent Programming
+ Problem", Communications of the ACM 17, 8 (August 1974), 453-455.
+
+ https://en.wikipedia.org/wiki/Lamport%27s_bakery_algorithm
+
+[2] linux/arch/arm/common/vlock.S, www.kernel.org.
diff --git a/Documentation/arm/vlocks.txt b/Documentation/arm/vlocks.txt
deleted file mode 100644
index 45731672c564..000000000000
--- a/Documentation/arm/vlocks.txt
+++ /dev/null
@@ -1,211 +0,0 @@
-vlocks for Bare-Metal Mutual Exclusion
-======================================
-
-Voting Locks, or "vlocks" provide a simple low-level mutual exclusion
-mechanism, with reasonable but minimal requirements on the memory
-system.
-
-These are intended to be used to coordinate critical activity among CPUs
-which are otherwise non-coherent, in situations where the hardware
-provides no other mechanism to support this and ordinary spinlocks
-cannot be used.
-
-
-vlocks make use of the atomicity provided by the memory system for
-writes to a single memory location. To arbitrate, every CPU "votes for
-itself", by storing a unique number to a common memory location. The
-final value seen in that memory location when all the votes have been
-cast identifies the winner.
-
-In order to make sure that the election produces an unambiguous result
-in finite time, a CPU will only enter the election in the first place if
-no winner has been chosen and the election does not appear to have
-started yet.
-
-
-Algorithm
----------
-
-The easiest way to explain the vlocks algorithm is with some pseudo-code:
-
-
- int currently_voting[NR_CPUS] = { 0, };
- int last_vote = -1; /* no votes yet */
-
- bool vlock_trylock(int this_cpu)
- {
- /* signal our desire to vote */
- currently_voting[this_cpu] = 1;
- if (last_vote != -1) {
- /* someone already volunteered himself */
- currently_voting[this_cpu] = 0;
- return false; /* not ourself */
- }
-
- /* let's suggest ourself */
- last_vote = this_cpu;
- currently_voting[this_cpu] = 0;
-
- /* then wait until everyone else is done voting */
- for_each_cpu(i) {
- while (currently_voting[i] != 0)
- /* wait */;
- }
-
- /* result */
- if (last_vote == this_cpu)
- return true; /* we won */
- return false;
- }
-
- bool vlock_unlock(void)
- {
- last_vote = -1;
- }
-
-
-The currently_voting[] array provides a way for the CPUs to determine
-whether an election is in progress, and plays a role analogous to the
-"entering" array in Lamport's bakery algorithm [1].
-
-However, once the election has started, the underlying memory system
-atomicity is used to pick the winner. This avoids the need for a static
-priority rule to act as a tie-breaker, or any counters which could
-overflow.
-
-As long as the last_vote variable is globally visible to all CPUs, it
-will contain only one value that won't change once every CPU has cleared
-its currently_voting flag.
-
-
-Features and limitations
-------------------------
-
- * vlocks are not intended to be fair. In the contended case, it is the
- _last_ CPU which attempts to get the lock which will be most likely
- to win.
-
- vlocks are therefore best suited to situations where it is necessary
- to pick a unique winner, but it does not matter which CPU actually
- wins.
-
- * Like other similar mechanisms, vlocks will not scale well to a large
- number of CPUs.
-
- vlocks can be cascaded in a voting hierarchy to permit better scaling
- if necessary, as in the following hypothetical example for 4096 CPUs:
-
- /* first level: local election */
- my_town = towns[(this_cpu >> 4) & 0xf];
- I_won = vlock_trylock(my_town, this_cpu & 0xf);
- if (I_won) {
- /* we won the town election, let's go for the state */
- my_state = states[(this_cpu >> 8) & 0xf];
- I_won = vlock_lock(my_state, this_cpu & 0xf));
- if (I_won) {
- /* and so on */
- I_won = vlock_lock(the_whole_country, this_cpu & 0xf];
- if (I_won) {
- /* ... */
- }
- vlock_unlock(the_whole_country);
- }
- vlock_unlock(my_state);
- }
- vlock_unlock(my_town);
-
-
-ARM implementation
-------------------
-
-The current ARM implementation [2] contains some optimisations beyond
-the basic algorithm:
-
- * By packing the members of the currently_voting array close together,
- we can read the whole array in one transaction (providing the number
- of CPUs potentially contending the lock is small enough). This
- reduces the number of round-trips required to external memory.
-
- In the ARM implementation, this means that we can use a single load
- and comparison:
-
- LDR Rt, [Rn]
- CMP Rt, #0
-
- ...in place of code equivalent to:
-
- LDRB Rt, [Rn]
- CMP Rt, #0
- LDRBEQ Rt, [Rn, #1]
- CMPEQ Rt, #0
- LDRBEQ Rt, [Rn, #2]
- CMPEQ Rt, #0
- LDRBEQ Rt, [Rn, #3]
- CMPEQ Rt, #0
-
- This cuts down on the fast-path latency, as well as potentially
- reducing bus contention in contended cases.
-
- The optimisation relies on the fact that the ARM memory system
- guarantees coherency between overlapping memory accesses of
- different sizes, similarly to many other architectures. Note that
- we do not care which element of currently_voting appears in which
- bits of Rt, so there is no need to worry about endianness in this
- optimisation.
-
- If there are too many CPUs to read the currently_voting array in
- one transaction then multiple transations are still required. The
- implementation uses a simple loop of word-sized loads for this
- case. The number of transactions is still fewer than would be
- required if bytes were loaded individually.
-
-
- In principle, we could aggregate further by using LDRD or LDM, but
- to keep the code simple this was not attempted in the initial
- implementation.
-
-
- * vlocks are currently only used to coordinate between CPUs which are
- unable to enable their caches yet. This means that the
- implementation removes many of the barriers which would be required
- when executing the algorithm in cached memory.
-
- packing of the currently_voting array does not work with cached
- memory unless all CPUs contending the lock are cache-coherent, due
- to cache writebacks from one CPU clobbering values written by other
- CPUs. (Though if all the CPUs are cache-coherent, you should be
- probably be using proper spinlocks instead anyway).
-
-
- * The "no votes yet" value used for the last_vote variable is 0 (not
- -1 as in the pseudocode). This allows statically-allocated vlocks
- to be implicitly initialised to an unlocked state simply by putting
- them in .bss.
-
- An offset is added to each CPU's ID for the purpose of setting this
- variable, so that no CPU uses the value 0 for its ID.
-
-
-Colophon
---------
-
-Originally created and documented by Dave Martin for Linaro Limited, for
-use in ARM-based big.LITTLE platforms, with review and input gratefully
-received from Nicolas Pitre and Achin Gupta. Thanks to Nicolas for
-grabbing most of this text out of the relevant mail thread and writing
-up the pseudocode.
-
-Copyright (C) 2012-2013 Linaro Limited
-Distributed under the terms of Version 2 of the GNU General Public
-License, as defined in linux/COPYING.
-
-
-References
-----------
-
-[1] Lamport, L. "A New Solution of Dijkstra's Concurrent Programming
- Problem", Communications of the ACM 17, 8 (August 1974), 453-455.
-
- https://en.wikipedia.org/wiki/Lamport%27s_bakery_algorithm
-
-[2] linux/arch/arm/common/vlock.S, www.kernel.org.
diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst
index 3d041d0d16e8..d3f3a60fbf25 100644
--- a/Documentation/arm64/booting.rst
+++ b/Documentation/arm64/booting.rst
@@ -284,7 +284,7 @@ following manner:
processors") to bring CPUs into the kernel.
The device tree should contain a 'psci' node, as described in
- Documentation/devicetree/bindings/arm/psci.txt.
+ Documentation/devicetree/bindings/arm/psci.yaml.
- Secondary CPU general-purpose register settings
x0 = 0 (reserved for future use)
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index c7cbf4b571c0..91f79529c58c 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -180,6 +180,10 @@ HWCAP_ILRCPC
HWCAP_FLAGM
Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0001.
+HWCAP2_FLAGM2
+
+ Functionality implied by ID_AA64ISAR0_EL1.TS == 0b0010.
+
HWCAP_SSBS
Functionality implied by ID_AA64PFR1_EL1.SSBS == 0b0010.
@@ -193,6 +197,10 @@ HWCAP_PACG
ID_AA64ISAR1_EL1.GPI == 0b0001, as described by
Documentation/arm64/pointer-authentication.rst.
+HWCAP2_FRINT
+
+ Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001.
+
4. Unused AT_HWCAP bits
-----------------------
diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst
index 018b7836ecb7..96b696ba4e6c 100644
--- a/Documentation/arm64/index.rst
+++ b/Documentation/arm64/index.rst
@@ -1,5 +1,3 @@
-:orphan:
-
==================
ARM64 Architecture
==================
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index c792774be59e..3e57d09246e6 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -86,6 +86,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Neoverse-N1 | #1349291 | N/A |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | MMU-500 | #841119,826419 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
+----------------+-----------------+-----------------+-----------------------------+
diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst
index 38422ab249dd..5689c74c8082 100644
--- a/Documentation/arm64/sve.rst
+++ b/Documentation/arm64/sve.rst
@@ -58,6 +58,18 @@ model features for SVE is included in Appendix A.
is to connect to a target process first and then attempt a
ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov).
+* Whenever SVE scalable register values (Zn, Pn, FFR) are exchanged in memory
+ between userspace and the kernel, the register value is encoded in memory in
+ an endianness-invariant layout, with bits [(8 * i + 7) : (8 * i)] encoded at
+ byte offset i from the start of the memory representation. This affects for
+ example the signal frame (struct sve_context) and ptrace interface
+ (struct user_sve_header) and associated data.
+
+ Beware that on big-endian systems this results in a different byte order than
+ for the FPSIMD V-registers, which are stored as single host-endian 128-bit
+ values, with bits [(127 - 8 * i) : (120 - 8 * i)] of the register encoded at
+ byte offset i. (struct fpsimd_context, struct user_fpsimd_state).
+
2. Vector length terminology
-----------------------------
@@ -126,6 +138,10 @@ the SVE instruction set architecture.
size and layout. Macros SVE_SIG_* are defined [1] to facilitate access to
the members.
+* Each scalable register (Zn, Pn, FFR) is stored in an endianness-invariant
+ layout, with bits [(8 * i + 7) : (8 * i)] stored at byte offset i from the
+ start of the register's representation in memory.
+
* If the SVE context is too big to fit in sigcontext.__reserved[], then extra
space is allocated on the stack, an extra_context record is written in
__reserved[] referencing this space. sve_context is then written in the
diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
index dca3fb0554db..0ab747e0d5ac 100644
--- a/Documentation/atomic_t.txt
+++ b/Documentation/atomic_t.txt
@@ -81,9 +81,11 @@ Non-RMW ops:
The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
-smp_store_release() respectively.
+smp_store_release() respectively. Therefore, if you find yourself only using
+the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all
+and are doing it wrong.
-The one detail to this is that atomic_set{}() should be observable to the RMW
+A subtle detail of atomic_set{}() is that it should be observable to the RMW
ops. That is:
C atomic-set
@@ -187,13 +189,22 @@ The barriers:
smp_mb__{before,after}_atomic()
-only apply to the RMW ops and can be used to augment/upgrade the ordering
-inherent to the used atomic op. These barriers provide a full smp_mb().
+only apply to the RMW atomic ops and can be used to augment/upgrade the
+ordering inherent to the op. These barriers act almost like a full smp_mb():
+smp_mb__before_atomic() orders all earlier accesses against the RMW op
+itself and all accesses following it, and smp_mb__after_atomic() orders all
+later accesses against the RMW op and all accesses preceding it. However,
+accesses between the smp_mb__{before,after}_atomic() and the RMW op are not
+ordered, so it is advisable to place the barrier right next to the RMW atomic
+op whenever possible.
These helper barriers exist because architectures have varying implicit
ordering on their SMP atomic primitives. For example our TSO architectures
provide full ordered atomics and these barriers are no-ops.
+NOTE: when the atomic RmW ops are fully ordered, they should also imply a
+compiler barrier.
+
Thus:
atomic_fetch_add();
@@ -212,7 +223,9 @@ Further, while something like:
atomic_dec(&X);
is a 'typical' RELEASE pattern, the barrier is strictly stronger than
-a RELEASE. Similarly for something like:
+a RELEASE because it orders preceding instructions against both the read
+and write parts of the atomic_dec(), and against all following instructions
+as well. Similarly, something like:
atomic_inc(&X);
smp_mb__after_atomic();
@@ -244,7 +257,8 @@ strictly stronger than ACQUIRE. As illustrated:
This should not happen; but a hypothetical atomic_inc_acquire() --
(void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
-since then:
+because it would not order the W part of the RMW against the following
+WRITE_ONCE. Thus:
P1 P2
diff --git a/Documentation/auxdisplay/lcd-panel-cgram.txt b/Documentation/auxdisplay/lcd-panel-cgram.txt
deleted file mode 100644
index 7f82c905763d..000000000000
--- a/Documentation/auxdisplay/lcd-panel-cgram.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Some LCDs allow you to define up to 8 characters, mapped to ASCII
-characters 0 to 7. The escape code to define a new character is
-'\e[LG' followed by one digit from 0 to 7, representing the character
-number, and up to 8 couples of hex digits terminated by a semi-colon
-(';'). Each couple of digits represents a line, with 1-bits for each
-illuminated pixel with LSB on the right. Lines are numbered from the
-top of the character to the bottom. On a 5x7 matrix, only the 5 lower
-bits of the 7 first bytes are used for each character. If the string
-is incomplete, only complete lines will be redefined. Here are some
-examples :
-
- printf "\e[LG0010101050D1F0C04;" => 0 = [enter]
- printf "\e[LG1040E1F0000000000;" => 1 = [up]
- printf "\e[LG2000000001F0E0400;" => 2 = [down]
- printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down]
- printf "\e[LG40002060E1E0E0602;" => 4 = [left]
- printf "\e[LG500080C0E0F0E0C08;" => 5 = [right]
- printf "\e[LG60016051516141400;" => 6 = "IP"
-
- printf "\e[LG00103071F1F070301;" => big speaker
- printf "\e[LG00002061E1E060200;" => small speaker
-
-Willy
-
diff --git a/Documentation/backlight/lp855x-driver.txt b/Documentation/backlight/lp855x-driver.txt
deleted file mode 100644
index 01bce243d3d7..000000000000
--- a/Documentation/backlight/lp855x-driver.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-Kernel driver lp855x
-====================
-
-Backlight driver for LP855x ICs
-
-Supported chips:
- Texas Instruments LP8550, LP8551, LP8552, LP8553, LP8555, LP8556 and
- LP8557
-
-Author: Milo(Woogyom) Kim <milo.kim@ti.com>
-
-Description
------------
-
-* Brightness control
-
-Brightness can be controlled by the pwm input or the i2c command.
-The lp855x driver supports both cases.
-
-* Device attributes
-
-1) bl_ctl_mode
-Backlight control mode.
-Value : pwm based or register based
-
-2) chip_id
-The lp855x chip id.
-Value : lp8550/lp8551/lp8552/lp8553/lp8555/lp8556/lp8557
-
-Platform data for lp855x
-------------------------
-
-For supporting platform specific data, the lp855x platform data can be used.
-
-* name : Backlight driver name. If it is not defined, default name is set.
-* device_control : Value of DEVICE CONTROL register.
-* initial_brightness : Initial value of backlight brightness.
-* period_ns : Platform specific PWM period value. unit is nano.
- Only valid when brightness is pwm input mode.
-* size_program : Total size of lp855x_rom_data.
-* rom_data : List of new eeprom/eprom registers.
-
-example 1) lp8552 platform data : i2c register mode with new eeprom data
-
-#define EEPROM_A5_ADDR 0xA5
-#define EEPROM_A5_VAL 0x4f /* EN_VSYNC=0 */
-
-static struct lp855x_rom_data lp8552_eeprom_arr[] = {
- {EEPROM_A5_ADDR, EEPROM_A5_VAL},
-};
-
-static struct lp855x_platform_data lp8552_pdata = {
- .name = "lcd-bl",
- .device_control = I2C_CONFIG(LP8552),
- .initial_brightness = INITIAL_BRT,
- .size_program = ARRAY_SIZE(lp8552_eeprom_arr),
- .rom_data = lp8552_eeprom_arr,
-};
-
-example 2) lp8556 platform data : pwm input mode with default rom data
-
-static struct lp855x_platform_data lp8556_pdata = {
- .device_control = PWM_CONFIG(LP8556),
- .initial_brightness = INITIAL_BRT,
- .period_ns = 1000000,
-};
diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst
new file mode 100644
index 000000000000..0d237d402860
--- /dev/null
+++ b/Documentation/block/bfq-iosched.rst
@@ -0,0 +1,597 @@
+==========================
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+
+- BFQ guarantees a high system and application responsiveness, and a
+ low latency for time-sensitive applications, such as audio or video
+ players;
+- BFQ distributes bandwidth, and not just time, among processes or
+ groups (switching back to time distribution when needed to keep
+ throughput high).
+
+In its default configuration, BFQ privileges latency over
+throughput. So, when needed for achieving a lower latency, BFQ builds
+schedules that may lead to a lower throughput. If your main or only
+goal, for a given device, is to achieve the maximum-possible
+throughput at all times, then do switch off all low-latency heuristics
+for that device, by setting low_latency to 0. See Section 3 for
+details on how to configure BFQ for the desired tradeoff between
+latency and throughput, or on how to maximize throughput.
+
+As every I/O scheduler, BFQ adds some overhead to per-I/O-request
+processing. To give an idea of this overhead, the total,
+single-lock-protected, per-request processing time of BFQ---i.e., the
+sum of the execution times of the request insertion, dispatch and
+completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
+(dated CPU for notebooks; time measured with simple code
+instrumentation, and using the throughput-sync.sh script of the S
+suite [1], in performance-profiling mode). To put this result into
+context, the total, single-lock-protected, per-request execution time
+of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
+us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
+
+Scheduling overhead further limits the maximum IOPS that a CPU can
+process (already limited by the execution of the rest of the I/O
+stack). To give an idea of the limits with BFQ, on slow or average
+CPUs, here are, first, the limits of BFQ for three different CPUs, on,
+respectively, an average laptop, an old desktop, and a cheap embedded
+system, in case full hierarchical support is enabled (i.e.,
+CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_BFQ_CGROUP_DEBUG is not
+set (Section 4-2):
+- Intel i7-4850HQ: 400 KIOPS
+- AMD A8-3850: 250 KIOPS
+- ARM CortexTM-A53 Octa-core: 80 KIOPS
+
+If CONFIG_BFQ_CGROUP_DEBUG is set (and of course full hierarchical
+support is enabled), then the sustainable throughput with BFQ
+decreases, because all blkio.bfq* statistics are created and updated
+(Section 4-2). For BFQ, this leads to the following maximum
+sustainable throughputs, on the same systems as above:
+- Intel i7-4850HQ: 310 KIOPS
+- AMD A8-3850: 200 KIOPS
+- ARM CortexTM-A53 Octa-core: 56 KIOPS
+
+BFQ works for multi-queue devices too.
+
+.. The table of contents follow. Impatients can just jump to Section 3.
+
+.. CONTENTS
+
+ 1. When may BFQ be useful?
+ 1-1 Personal systems
+ 1-2 Server systems
+ 2. How does BFQ work?
+ 3. What are BFQ's tunables and how to properly configure BFQ?
+ 4. BFQ group scheduling
+ 4-1 Service guarantees provided
+ 4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+ databases,
+
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Also soft real-time applications, such as audio and video
+players/streamers, enjoy a low latency and a low drop rate, regardless
+of the background I/O workload. As a consequence, these applications
+do not suffer from almost any glitch due to the background workload.
+
+Higher speed for code-development tasks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+^^^^^^^^^^^^^^^
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+* audio and video-streaming with zero or very low jitter and drop
+ rate;
+
+* fast retrieval of WEB pages and embedded objects;
+
+* real-time recording of data in live-dumping applications (e.g.,
+ packet logging);
+
+* responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+ `(bfq_)queue`.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+ (process) at a time, and implements this service model by
+ associating every queue with a budget, measured in number of
+ sectors.
+
+ - After a queue is granted access to the device, the budget of the
+ queue is decremented, on each request dispatch, by the size of the
+ request.
+
+ - The in-service queue is expired, i.e., its service is suspended,
+ only if one of the following events occurs: 1) the queue finishes
+ its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+ - The budget timeout prevents processes doing random I/O from
+ holding the device for too long and dramatically reducing
+ throughput.
+
+ - Actually, as in CFQ, a queue associated with a process issuing
+ sync requests may not be expired immediately when it empties. In
+ contrast, BFQ may idle the device for a short time interval,
+ giving the process the chance to go on being served if it issues
+ a new request in time. Device idling typically boosts the
+ throughput on rotational devices and on non-queueing flash-based
+ devices, if processes do synchronous and sequential I/O. In
+ addition, under BFQ, device idling is also instrumental in
+ guaranteeing the desired throughput fraction to processes
+ issuing sync requests (see the description of the slice_idle
+ tunable in this document, or [1, 2], for more details).
+
+ - With respect to idling for service guarantees, if several
+ processes are competing for the device at the same time, but
+ all processes and groups have the same weight, then BFQ
+ guarantees the expected throughput distribution without ever
+ idling the device. Throughput is thus as high as possible in
+ this common scenario.
+
+ - On flash-based storage with internal queueing of commands
+ (typically NCQ), device idling happens to be always detrimental
+ for throughput. So, with these devices, BFQ performs idling
+ only when strictly needed for service guarantees, i.e., for
+ guaranteeing low latency or fairness. In these cases, overall
+ throughput may be sub-optimal. No solution currently exists to
+ provide both strong service guarantees and optimal throughput
+ on devices with internal queueing.
+
+ - If low-latency mode is enabled (default configuration), BFQ
+ executes some special heuristics to detect interactive and soft
+ real-time applications (e.g., video or audio players/streamers),
+ and to reduce their latency. The most important action taken to
+ achieve this goal is to give to the queues associated with these
+ applications more than their fair share of the device
+ throughput. For brevity, we call just "weight-raising" the whole
+ sets of actions taken by BFQ to privilege these queues. In
+ particular, BFQ provides a milder form of weight-raising for
+ interactive applications, and a stronger form for soft real-time
+ applications.
+
+ - BFQ automatically deactivates idling for queues born in a burst of
+ queue creations. In fact, these queues are usually associated with
+ the processes of applications and services that benefit mostly
+ from a high throughput. Examples are systemd during boot, or git
+ grep.
+
+ - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+ performing random I/O that becomes mostly sequential if
+ merged. Differently from CFQ, BFQ achieves this goal with a more
+ reactive mechanism, called Early Queue Merge (EQM). EQM is so
+ responsive in detecting interleaved I/O (cooperating processes),
+ that it enables BFQ to achieve a high throughput, by queue
+ merging, even for queues for which CFQ needs a different
+ mechanism, preemption, to get a high throughput. As such EQM is a
+ unified mechanism to achieve a high throughput with interleaved
+ I/O.
+
+ - Queues are scheduled according to a variant of WF2Q+, named
+ B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+ O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
+ also ready for hierarchical scheduling, details in Section 4.
+
+ - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+ perfectly fair, and smooth service. In particular, B-WF2Q+
+ guarantees that each queue receives a fraction of the device
+ throughput proportional to its weight, even if the throughput
+ fluctuates, and regardless of: the device parameters, the current
+ workload and the budgets assigned to the queue.
+
+ - The last, budget-independence, property (although probably
+ counterintuitive in the first place) is definitely beneficial, for
+ the following reasons:
+
+ - First, with any proportional-share scheduler, the maximum
+ deviation with respect to an ideal service is proportional to
+ the maximum budget (slice) assigned to queues. As a consequence,
+ BFQ can keep this deviation tight not only because of the
+ accurate service of B-WF2Q+, but also because BFQ *does not*
+ need to assign a larger budget to a queue to let the queue
+ receive a higher fraction of the device throughput.
+
+ - Second, BFQ is free to choose, for every process (queue), the
+ budget that best fits the needs of the process, or best
+ leverages the I/O pattern of the process. In particular, BFQ
+ updates queue budgets with a simple feedback-loop algorithm that
+ allows a high throughput to be achieved, while still providing
+ tight latency guarantees to time-sensitive applications. When
+ the in-service queue expires, this algorithm computes the next
+ budget of the queue so as to:
+
+ - Let large budgets be eventually assigned to the queues
+ associated with I/O-bound applications performing sequential
+ I/O: in fact, the longer these applications are served once
+ got access to the device, the higher the throughput is.
+
+ - Let small budgets be eventually assigned to the queues
+ associated with time-sensitive applications (which typically
+ perform sporadic and short I/O), because, the smaller the
+ budget assigned to a queue waiting for service is, the sooner
+ B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+ but all processes and groups have the same weight, then BFQ
+ guarantees the expected throughput distribution without ever idling
+ the device. It uses preemption instead. Throughput is then much
+ higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+ lower-priority queues are not served as long as there are
+ higher-priority queues. Among queues in the same class, the
+ bandwidth is distributed in proportion to the weight of each
+ queue. A very thin extra bandwidth is however guaranteed to
+ the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunables and how to properly configure BFQ?
+=============================================================
+
+Most BFQ tunables affect service guarantees (basically latency and
+fairness) and throughput. For full details on how to choose the
+desired tradeoff between service guarantees and throughput, see the
+parameters slice_idle, strict_guarantees and low_latency. For details
+on how to maximise throughput, see slice_idle, timeout_sync and
+max_budget. The other performance-related parameters have been
+inherited from, and have been preserved mostly for compatibility with
+CFQ. So far, no performance improvement has been reported after
+changing the latter parameters in BFQ.
+
+In particular, the tunables back_seek-max, back_seek_penalty,
+fifo_expire_async and fifo_expire_sync below are the same as in
+CFQ. Their description is just copied from that for CFQ. Some
+considerations in the description of slice_idle are copied from CFQ
+too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for next I/O
+request, when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single spindle SATA/SAS disks where we can cut down on overall
+number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues and one
+should see an overall improved throughput on faster storage devices
+like multiple SATA/SAS disks in hardware RAID configuration, as well
+as flash-based storage with internal command queueing (and
+parallelism).
+
+So depending on storage and workload, it might be useful to set
+slice_idle=0. In general for SATA/SAS disks and software RAID of
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
+configurations where there are multiple spindles behind single LUN
+(Host based hardware RAID controller or for storage arrays), or with
+flash-based fast storage, setting slice_idle=0 might end up in better
+throughput and acceptable latencies.
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside for idling: apart from the above cases
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput (as detailed in Section 2). As a
+consequence of this behavior, and of further issues described for the
+strict_guarantees tunable, short-term service guarantees may be
+occasionally violated. And, in some cases, these guarantees may be
+more important than guaranteeing maximum throughput. For example, in
+video playing/streaming, a very low drop rate may be more important
+than maximum throughput. In these cases, consider setting the
+strict_guarantees parameter.
+
+slice_idle_us
+-------------
+
+Controls the same tuning parameter as slice_idle, but in microseconds.
+Either tunable can be used to set idling behavior. Afterwards, the
+other tunable will reflect the newly set value in sysfs.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+ new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable. The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
+
+back_seek_max
+-------------
+
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So scheduler will not bias toward one or the other request (otherwise scheduler
+will bias toward front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. In case to favor synchronous requests over asynchronous
+one, this value should be decreased relative to fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+DISABLE this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
+In addition, as already highlighted at the beginning of this document,
+DISABLE this mode if your only goal is to achieve a high throughput.
+In fact, privileging the I/O of some application over the rest may
+entail a lower throughput. To achieve the highest-possible throughput
+on a non-rotational device, setting slice_idle to 0 may be needed too
+(at the cost of giving up any strong guarantee on fairness and low
+latency).
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). According to what said in the description of
+the algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
+
+For specific devices, some users have occasionally reported to have
+reached a higher throughput by setting max_budget explicitly, i.e., by
+setting max_budget to a higher value than 0. In particular, they have
+set max_budget to higher values than those to which BFQ would have set
+it with auto-tuning. An alternative way to achieve this goal is to
+just increase the value of timeout_sync, leaving max_budget equal to 0.
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+4. Group scheduling with BFQ
+============================
+
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
+
+As for cgroups-v1 (blkio controller), the exact set of stat files
+created, and kept up-to-date by bfq, depends on whether
+CONFIG_BFQ_CGROUP_DEBUG is set. If it is set, then bfq creates all
+the stat files documented in
+Documentation/admin-guide/cgroup-v1/blkio-controller.rst. If, instead,
+CONFIG_BFQ_CGROUP_DEBUG is not set, then bfq creates only the files::
+
+ blkio.bfq.io_service_bytes
+ blkio.bfq.io_service_bytes_recursive
+ blkio.bfq.io_serviced
+ blkio.bfq.io_serviced_recursive
+
+The value of CONFIG_BFQ_CGROUP_DEBUG greatly influences the maximum
+throughput sustainable with bfq, because updating the blkio.bfq.*
+stats is rather costly, especially for some of the stats enabled by
+CONFIG_BFQ_CGROUP_DEBUG.
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunable section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+Recall that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+
+[1]
+ P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ Scheduler", Proceedings of the First Workshop on Mobile System
+ Technologies (MST-2015), May 2015.
+
+ http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2]
+ P. Valente and M. Andreolini, "Improving Application
+ Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+ the 5th Annual International Systems and Storage Conference
+ (SYSTOR '12), June 2012.
+
+ Slightly extended version:
+
+ http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-results.pdf
+
+[3]
+ https://github.com/Algodev-github/S
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
deleted file mode 100644
index 1a0f2ac02eb6..000000000000
--- a/Documentation/block/bfq-iosched.txt
+++ /dev/null
@@ -1,583 +0,0 @@
-BFQ (Budget Fair Queueing)
-==========================
-
-BFQ is a proportional-share I/O scheduler, with some extra
-low-latency capabilities. In addition to cgroups support (blkio or io
-controllers), BFQ's main features are:
-- BFQ guarantees a high system and application responsiveness, and a
- low latency for time-sensitive applications, such as audio or video
- players;
-- BFQ distributes bandwidth, and not just time, among processes or
- groups (switching back to time distribution when needed to keep
- throughput high).
-
-In its default configuration, BFQ privileges latency over
-throughput. So, when needed for achieving a lower latency, BFQ builds
-schedules that may lead to a lower throughput. If your main or only
-goal, for a given device, is to achieve the maximum-possible
-throughput at all times, then do switch off all low-latency heuristics
-for that device, by setting low_latency to 0. See Section 3 for
-details on how to configure BFQ for the desired tradeoff between
-latency and throughput, or on how to maximize throughput.
-
-As every I/O scheduler, BFQ adds some overhead to per-I/O-request
-processing. To give an idea of this overhead, the total,
-single-lock-protected, per-request processing time of BFQ---i.e., the
-sum of the execution times of the request insertion, dispatch and
-completion hooks---is, e.g., 1.9 us on an Intel Core i7-2760QM@2.40GHz
-(dated CPU for notebooks; time measured with simple code
-instrumentation, and using the throughput-sync.sh script of the S
-suite [1], in performance-profiling mode). To put this result into
-context, the total, single-lock-protected, per-request execution time
-of the lightest I/O scheduler available in blk-mq, mq-deadline, is 0.7
-us (mq-deadline is ~800 LOC, against ~10500 LOC for BFQ).
-
-Scheduling overhead further limits the maximum IOPS that a CPU can
-process (already limited by the execution of the rest of the I/O
-stack). To give an idea of the limits with BFQ, on slow or average
-CPUs, here are, first, the limits of BFQ for three different CPUs, on,
-respectively, an average laptop, an old desktop, and a cheap embedded
-system, in case full hierarchical support is enabled (i.e.,
-CONFIG_BFQ_GROUP_IOSCHED is set), but CONFIG_DEBUG_BLK_CGROUP is not
-set (Section 4-2):
-- Intel i7-4850HQ: 400 KIOPS
-- AMD A8-3850: 250 KIOPS
-- ARM CortexTM-A53 Octa-core: 80 KIOPS
-
-If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical
-support is enabled), then the sustainable throughput with BFQ
-decreases, because all blkio.bfq* statistics are created and updated
-(Section 4-2). For BFQ, this leads to the following maximum
-sustainable throughputs, on the same systems as above:
-- Intel i7-4850HQ: 310 KIOPS
-- AMD A8-3850: 200 KIOPS
-- ARM CortexTM-A53 Octa-core: 56 KIOPS
-
-BFQ works for multi-queue devices too.
-
-The table of contents follow. Impatients can just jump to Section 3.
-
-CONTENTS
-
-1. When may BFQ be useful?
- 1-1 Personal systems
- 1-2 Server systems
-2. How does BFQ work?
-3. What are BFQ's tunables and how to properly configure BFQ?
-4. BFQ group scheduling
- 4-1 Service guarantees provided
- 4-2 Interface
-
-1. When may BFQ be useful?
-==========================
-
-BFQ provides the following benefits on personal and server systems.
-
-1-1 Personal systems
---------------------
-
-Low latency for interactive applications
-
-Regardless of the actual background workload, BFQ guarantees that, for
-interactive tasks, the storage device is virtually as responsive as if
-it was idle. For example, even if one or more of the following
-background workloads are being executed:
-- one or more large files are being read, written or copied,
-- a tree of source files is being compiled,
-- one or more virtual machines are performing I/O,
-- a software update is in progress,
-- indexing daemons are scanning filesystems and updating their
- databases,
-starting an application or loading a file from within an application
-takes about the same time as if the storage device was idle. As a
-comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
-applications experience high latencies, or even become unresponsive
-until the background workload terminates (also on SSDs).
-
-Low latency for soft real-time applications
-
-Also soft real-time applications, such as audio and video
-players/streamers, enjoy a low latency and a low drop rate, regardless
-of the background I/O workload. As a consequence, these applications
-do not suffer from almost any glitch due to the background workload.
-
-Higher speed for code-development tasks
-
-If some additional workload happens to be executed in parallel, then
-BFQ executes the I/O-related components of typical code-development
-tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
-NOOP or DEADLINE.
-
-High throughput
-
-On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
-up to 150% higher throughput than DEADLINE and NOOP, with all the
-sequential workloads considered in our tests. With random workloads,
-and with all the workloads on flash-based devices, BFQ achieves,
-instead, about the same throughput as the other schedulers.
-
-Strong fairness, bandwidth and delay guarantees
-
-BFQ distributes the device throughput, and not just the device time,
-among I/O-bound applications in proportion their weights, with any
-workload and regardless of the device parameters. From these bandwidth
-guarantees, it is possible to compute tight per-I/O-request delay
-guarantees by a simple formula. If not configured for strict service
-guarantees, BFQ switches to time-based resource sharing (only) for
-applications that would otherwise cause a throughput loss.
-
-1-2 Server systems
-------------------
-
-Most benefits for server systems follow from the same service
-properties as above. In particular, regardless of whether additional,
-possibly heavy workloads are being served, BFQ guarantees:
-
-. audio and video-streaming with zero or very low jitter and drop
- rate;
-
-. fast retrieval of WEB pages and embedded objects;
-
-. real-time recording of data in live-dumping applications (e.g.,
- packet logging);
-
-. responsiveness in local and remote access to a server.
-
-
-2. How does BFQ work?
-=====================
-
-BFQ is a proportional-share I/O scheduler, whose general structure,
-plus a lot of code, are borrowed from CFQ.
-
-- Each process doing I/O on a device is associated with a weight and a
- (bfq_)queue.
-
-- BFQ grants exclusive access to the device, for a while, to one queue
- (process) at a time, and implements this service model by
- associating every queue with a budget, measured in number of
- sectors.
-
- - After a queue is granted access to the device, the budget of the
- queue is decremented, on each request dispatch, by the size of the
- request.
-
- - The in-service queue is expired, i.e., its service is suspended,
- only if one of the following events occurs: 1) the queue finishes
- its budget, 2) the queue empties, 3) a "budget timeout" fires.
-
- - The budget timeout prevents processes doing random I/O from
- holding the device for too long and dramatically reducing
- throughput.
-
- - Actually, as in CFQ, a queue associated with a process issuing
- sync requests may not be expired immediately when it empties. In
- contrast, BFQ may idle the device for a short time interval,
- giving the process the chance to go on being served if it issues
- a new request in time. Device idling typically boosts the
- throughput on rotational devices and on non-queueing flash-based
- devices, if processes do synchronous and sequential I/O. In
- addition, under BFQ, device idling is also instrumental in
- guaranteeing the desired throughput fraction to processes
- issuing sync requests (see the description of the slice_idle
- tunable in this document, or [1, 2], for more details).
-
- - With respect to idling for service guarantees, if several
- processes are competing for the device at the same time, but
- all processes and groups have the same weight, then BFQ
- guarantees the expected throughput distribution without ever
- idling the device. Throughput is thus as high as possible in
- this common scenario.
-
- - On flash-based storage with internal queueing of commands
- (typically NCQ), device idling happens to be always detrimental
- for throughput. So, with these devices, BFQ performs idling
- only when strictly needed for service guarantees, i.e., for
- guaranteeing low latency or fairness. In these cases, overall
- throughput may be sub-optimal. No solution currently exists to
- provide both strong service guarantees and optimal throughput
- on devices with internal queueing.
-
- - If low-latency mode is enabled (default configuration), BFQ
- executes some special heuristics to detect interactive and soft
- real-time applications (e.g., video or audio players/streamers),
- and to reduce their latency. The most important action taken to
- achieve this goal is to give to the queues associated with these
- applications more than their fair share of the device
- throughput. For brevity, we call just "weight-raising" the whole
- sets of actions taken by BFQ to privilege these queues. In
- particular, BFQ provides a milder form of weight-raising for
- interactive applications, and a stronger form for soft real-time
- applications.
-
- - BFQ automatically deactivates idling for queues born in a burst of
- queue creations. In fact, these queues are usually associated with
- the processes of applications and services that benefit mostly
- from a high throughput. Examples are systemd during boot, or git
- grep.
-
- - As CFQ, BFQ merges queues performing interleaved I/O, i.e.,
- performing random I/O that becomes mostly sequential if
- merged. Differently from CFQ, BFQ achieves this goal with a more
- reactive mechanism, called Early Queue Merge (EQM). EQM is so
- responsive in detecting interleaved I/O (cooperating processes),
- that it enables BFQ to achieve a high throughput, by queue
- merging, even for queues for which CFQ needs a different
- mechanism, preemption, to get a high throughput. As such EQM is a
- unified mechanism to achieve a high throughput with interleaved
- I/O.
-
- - Queues are scheduled according to a variant of WF2Q+, named
- B-WF2Q+, and implemented using an augmented rb-tree to preserve an
- O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
- also ready for hierarchical scheduling, details in Section 4.
-
- - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
- perfectly fair, and smooth service. In particular, B-WF2Q+
- guarantees that each queue receives a fraction of the device
- throughput proportional to its weight, even if the throughput
- fluctuates, and regardless of: the device parameters, the current
- workload and the budgets assigned to the queue.
-
- - The last, budget-independence, property (although probably
- counterintuitive in the first place) is definitely beneficial, for
- the following reasons:
-
- - First, with any proportional-share scheduler, the maximum
- deviation with respect to an ideal service is proportional to
- the maximum budget (slice) assigned to queues. As a consequence,
- BFQ can keep this deviation tight not only because of the
- accurate service of B-WF2Q+, but also because BFQ *does not*
- need to assign a larger budget to a queue to let the queue
- receive a higher fraction of the device throughput.
-
- - Second, BFQ is free to choose, for every process (queue), the
- budget that best fits the needs of the process, or best
- leverages the I/O pattern of the process. In particular, BFQ
- updates queue budgets with a simple feedback-loop algorithm that
- allows a high throughput to be achieved, while still providing
- tight latency guarantees to time-sensitive applications. When
- the in-service queue expires, this algorithm computes the next
- budget of the queue so as to:
-
- - Let large budgets be eventually assigned to the queues
- associated with I/O-bound applications performing sequential
- I/O: in fact, the longer these applications are served once
- got access to the device, the higher the throughput is.
-
- - Let small budgets be eventually assigned to the queues
- associated with time-sensitive applications (which typically
- perform sporadic and short I/O), because, the smaller the
- budget assigned to a queue waiting for service is, the sooner
- B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
-
-- If several processes are competing for the device at the same time,
- but all processes and groups have the same weight, then BFQ
- guarantees the expected throughput distribution without ever idling
- the device. It uses preemption instead. Throughput is then much
- higher in this common scenario.
-
-- ioprio classes are served in strict priority order, i.e.,
- lower-priority queues are not served as long as there are
- higher-priority queues. Among queues in the same class, the
- bandwidth is distributed in proportion to the weight of each
- queue. A very thin extra bandwidth is however guaranteed to
- the Idle class, to prevent it from starving.
-
-
-3. What are BFQ's tunables and how to properly configure BFQ?
-=============================================================
-
-Most BFQ tunables affect service guarantees (basically latency and
-fairness) and throughput. For full details on how to choose the
-desired tradeoff between service guarantees and throughput, see the
-parameters slice_idle, strict_guarantees and low_latency. For details
-on how to maximise throughput, see slice_idle, timeout_sync and
-max_budget. The other performance-related parameters have been
-inherited from, and have been preserved mostly for compatibility with
-CFQ. So far, no performance improvement has been reported after
-changing the latter parameters in BFQ.
-
-In particular, the tunables back_seek-max, back_seek_penalty,
-fifo_expire_async and fifo_expire_sync below are the same as in
-CFQ. Their description is just copied from that for CFQ. Some
-considerations in the description of slice_idle are copied from CFQ
-too.
-
-per-process ioprio and weight
------------------------------
-
-Unless the cgroups interface is used (see "4. BFQ group scheduling"),
-weights can be assigned to processes only indirectly, through I/O
-priorities, and according to the relation:
-weight = (IOPRIO_BE_NR - ioprio) * 10.
-
-Beware that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-slice_idle
-----------
-
-This parameter specifies how long BFQ should idle for next I/O
-request, when certain sync BFQ queues become empty. By default
-slice_idle is a non-zero value. Idling has a double purpose: boosting
-throughput and making sure that the desired throughput distribution is
-respected (see the description of how BFQ works, and, if needed, the
-papers referred there).
-
-As for throughput, idling can be very helpful on highly seeky media
-like single spindle SATA/SAS disks where we can cut down on overall
-number of seeks and see improved throughput.
-
-Setting slice_idle to 0 will remove all the idling on queues and one
-should see an overall improved throughput on faster storage devices
-like multiple SATA/SAS disks in hardware RAID configuration, as well
-as flash-based storage with internal command queueing (and
-parallelism).
-
-So depending on storage and workload, it might be useful to set
-slice_idle=0. In general for SATA/SAS disks and software RAID of
-SATA/SAS disks keeping slice_idle enabled should be useful. For any
-configurations where there are multiple spindles behind single LUN
-(Host based hardware RAID controller or for storage arrays), or with
-flash-based fast storage, setting slice_idle=0 might end up in better
-throughput and acceptable latencies.
-
-Idling is however necessary to have service guarantees enforced in
-case of differentiated weights or differentiated I/O-request lengths.
-To see why, suppose that a given BFQ queue A must get several I/O
-requests served for each request served for another queue B. Idling
-ensures that, if A makes a new I/O request slightly after becoming
-empty, then no request of B is dispatched in the middle, and thus A
-does not lose the possibility to get more than one request dispatched
-before the next request of B is dispatched. Note that idling
-guarantees the desired differentiated treatment of queues only in
-terms of I/O-request dispatches. To guarantee that the actual service
-order then corresponds to the dispatch order, the strict_guarantees
-tunable must be set too.
-
-There is an important flipside for idling: apart from the above cases
-where it is beneficial also for throughput, idling can severely impact
-throughput. One important case is random workload. Because of this
-issue, BFQ tends to avoid idling as much as possible, when it is not
-beneficial also for throughput (as detailed in Section 2). As a
-consequence of this behavior, and of further issues described for the
-strict_guarantees tunable, short-term service guarantees may be
-occasionally violated. And, in some cases, these guarantees may be
-more important than guaranteeing maximum throughput. For example, in
-video playing/streaming, a very low drop rate may be more important
-than maximum throughput. In these cases, consider setting the
-strict_guarantees parameter.
-
-slice_idle_us
--------------
-
-Controls the same tuning parameter as slice_idle, but in microseconds.
-Either tunable can be used to set idling behavior. Afterwards, the
-other tunable will reflect the newly set value in sysfs.
-
-strict_guarantees
------------------
-
-If this parameter is set (default: unset), then BFQ
-
-- always performs idling when the in-service queue becomes empty;
-
-- forces the device to serve one I/O request at a time, by dispatching a
- new request only if there is no outstanding request.
-
-In the presence of differentiated weights or I/O-request sizes, both
-the above conditions are needed to guarantee that every BFQ queue
-receives its allotted share of the bandwidth. The first condition is
-needed for the reasons explained in the description of the slice_idle
-tunable. The second condition is needed because all modern storage
-devices reorder internally-queued requests, which may trivially break
-the service guarantees enforced by the I/O scheduler.
-
-Setting strict_guarantees may evidently affect throughput.
-
-back_seek_max
--------------
-
-This specifies, given in Kbytes, the maximum "distance" for backward seeking.
-The distance is the amount of space from the current head location to the
-sectors that are backward in terms of distance.
-
-This parameter allows the scheduler to anticipate requests in the "backward"
-direction and consider them as being the "next" if they are within this
-distance from the current head location.
-
-back_seek_penalty
------------------
-
-This parameter is used to compute the cost of backward seeking. If the
-backward distance of request is just 1/back_seek_penalty from a "front"
-request, then the seeking cost of two requests is considered equivalent.
-
-So scheduler will not bias toward one or the other request (otherwise scheduler
-will bias toward front request). Default value of back_seek_penalty is 2.
-
-fifo_expire_async
------------------
-
-This parameter is used to set the timeout of asynchronous requests. Default
-value of this is 248ms.
-
-fifo_expire_sync
-----------------
-
-This parameter is used to set the timeout of synchronous requests. Default
-value of this is 124ms. In case to favor synchronous requests over asynchronous
-one, this value should be decreased relative to fifo_expire_async.
-
-low_latency
------------
-
-This parameter is used to enable/disable BFQ's low latency mode. By
-default, low latency mode is enabled. If enabled, interactive and soft
-real-time applications are privileged and experience a lower latency,
-as explained in more detail in the description of how BFQ works.
-
-DISABLE this mode if you need full control on bandwidth
-distribution. In fact, if it is enabled, then BFQ automatically
-increases the bandwidth share of privileged applications, as the main
-means to guarantee a lower latency to them.
-
-In addition, as already highlighted at the beginning of this document,
-DISABLE this mode if your only goal is to achieve a high throughput.
-In fact, privileging the I/O of some application over the rest may
-entail a lower throughput. To achieve the highest-possible throughput
-on a non-rotational device, setting slice_idle to 0 may be needed too
-(at the cost of giving up any strong guarantee on fairness and low
-latency).
-
-timeout_sync
-------------
-
-Maximum amount of device time that can be given to a task (queue) once
-it has been selected for service. On devices with costly seeks,
-increasing this time usually increases maximum throughput. On the
-opposite end, increasing this time coarsens the granularity of the
-short-term bandwidth and latency guarantees, especially if the
-following parameter is set to zero.
-
-max_budget
-----------
-
-Maximum amount of service, measured in sectors, that can be provided
-to a BFQ queue once it is set in service (of course within the limits
-of the above timeout). According to what said in the description of
-the algorithm, larger values increase the throughput in proportion to
-the percentage of sequential I/O requests issued. The price of larger
-values is that they coarsen the granularity of short-term bandwidth
-and latency guarantees.
-
-The default value is 0, which enables auto-tuning: BFQ sets max_budget
-to the maximum number of sectors that can be served during
-timeout_sync, according to the estimated peak rate.
-
-For specific devices, some users have occasionally reported to have
-reached a higher throughput by setting max_budget explicitly, i.e., by
-setting max_budget to a higher value than 0. In particular, they have
-set max_budget to higher values than those to which BFQ would have set
-it with auto-tuning. An alternative way to achieve this goal is to
-just increase the value of timeout_sync, leaving max_budget equal to 0.
-
-weights
--------
-
-Read-only parameter, used to show the weights of the currently active
-BFQ queues.
-
-
-4. Group scheduling with BFQ
-============================
-
-BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
-blkio and io. In particular, BFQ supports weight-based proportional
-share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
-
-4-1 Service guarantees provided
--------------------------------
-
-With BFQ, proportional share means true proportional share of the
-device bandwidth, according to group weights. For example, a group
-with weight 200 gets twice the bandwidth, and not just twice the time,
-of a group with weight 100.
-
-BFQ supports hierarchies (group trees) of any depth. Bandwidth is
-distributed among groups and processes in the expected way: for each
-group, the children of the group share the whole bandwidth of the
-group in proportion to their weights. In particular, this implies
-that, for each leaf group, every process of the group receives the
-same share of the whole group bandwidth, unless the ioprio of the
-process is modified.
-
-The resource-sharing guarantee for a group may partially or totally
-switch from bandwidth to time, if providing bandwidth guarantees to
-the group lowers the throughput too much. This switch occurs on a
-per-process basis: if a process of a leaf group causes throughput loss
-if served in such a way to receive its share of the bandwidth, then
-BFQ switches back to just time-based proportional share for that
-process.
-
-4-2 Interface
--------------
-
-To get proportional sharing of bandwidth with BFQ for a given device,
-BFQ must of course be the active scheduler for that device.
-
-Within each group directory, the names of the files associated with
-BFQ-specific cgroup parameters and stats begin with the "bfq."
-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
-parameter to set the weight of a group with BFQ is blkio.bfq.weight
-or io.bfq.weight.
-
-As for cgroups-v1 (blkio controller), the exact set of stat files
-created, and kept up-to-date by bfq, depends on whether
-CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all
-the stat files documented in
-Documentation/cgroup-v1/blkio-controller.txt. If, instead,
-CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files
-blkio.bfq.io_service_bytes
-blkio.bfq.io_service_bytes_recursive
-blkio.bfq.io_serviced
-blkio.bfq.io_serviced_recursive
-
-The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum
-throughput sustainable with bfq, because updating the blkio.bfq.*
-stats is rather costly, especially for some of the stats enabled by
-CONFIG_DEBUG_BLK_CGROUP.
-
-Parameters to set
------------------
-
-For each group, there is only the following parameter to set.
-
-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the
-group inside its parent. Available values: 1..10000 (default 100). The
-linear mapping between ioprio and weights, described at the beginning
-of the tunable section, is still valid, but all weights higher than
-IOPRIO_BE_NR*10 are mapped to ioprio 0.
-
-Recall that, if low-latency is set, then BFQ automatically raises the
-weight of the queues associated with interactive and soft real-time
-applications. Unset this tunable if you need/want to control weights.
-
-
-[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
- Scheduler", Proceedings of the First Workshop on Mobile System
- Technologies (MST-2015), May 2015.
- http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
-
-[2] P. Valente and M. Andreolini, "Improving Application
- Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
- the 5th Annual International Systems and Storage Conference
- (SYSTOR '12), June 2012.
- Slightly extended version:
- http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
- results.pdf
-
-[3] https://github.com/Algodev-github/S
diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst
new file mode 100644
index 000000000000..b964796ec9c7
--- /dev/null
+++ b/Documentation/block/biodoc.rst
@@ -0,0 +1,1164 @@
+=====================================================
+Notes on the Generic Block Layer Rewrite in Linux 2.5
+=====================================================
+
+.. note::
+
+ It seems that there are lot of outdated stuff here. This seems
+ to be written somewhat as a task list. Yet, eventually, something
+ here might still be useful.
+
+Notes Written on Jan 15, 2002:
+
+ - Jens Axboe <jens.axboe@oracle.com>
+ - Suparna Bhattacharya <suparna@in.ibm.com>
+
+Last Updated May 2, 2002
+
+September 2003: Updated I/O Scheduler portions
+ - Nick Piggin <npiggin@kernel.dk>
+
+Introduction
+============
+
+These are some notes describing some aspects of the 2.5 block layer in the
+context of the bio rewrite. The idea is to bring out some of the key
+changes and a glimpse of the rationale behind those changes.
+
+Please mail corrections & suggestions to suparna@in.ibm.com.
+
+Credits
+=======
+
+2.5 bio rewrite:
+ - Jens Axboe <jens.axboe@oracle.com>
+
+Many aspects of the generic block layer redesign were driven by and evolved
+over discussions, prior patches and the collective experience of several
+people. See sections 8 and 9 for a list of some related references.
+
+The following people helped with review comments and inputs for this
+document:
+
+ - Christoph Hellwig <hch@infradead.org>
+ - Arjan van de Ven <arjanv@redhat.com>
+ - Randy Dunlap <rdunlap@xenotime.net>
+ - Andre Hedrick <andre@linux-ide.org>
+
+The following people helped with fixes/contributions to the bio patches
+while it was still work-in-progress:
+
+ - David S. Miller <davem@redhat.com>
+
+
+.. Description of Contents:
+
+ 1. Scope for tuning of logic to various needs
+ 1.1 Tuning based on device or low level driver capabilities
+ - Per-queue parameters
+ - Highmem I/O support
+ - I/O scheduler modularization
+ 1.2 Tuning based on high level requirements/capabilities
+ 1.2.1 Request Priority/Latency
+ 1.3 Direct access/bypass to lower layers for diagnostics and special
+ device operations
+ 1.3.1 Pre-built commands
+ 2. New flexible and generic but minimalist i/o structure or descriptor
+ (instead of using buffer heads at the i/o layer)
+ 2.1 Requirements/Goals addressed
+ 2.2 The bio struct in detail (multi-page io unit)
+ 2.3 Changes in the request structure
+ 3. Using bios
+ 3.1 Setup/teardown (allocation, splitting)
+ 3.2 Generic bio helper routines
+ 3.2.1 Traversing segments and completion units in a request
+ 3.2.2 Setting up DMA scatterlists
+ 3.2.3 I/O completion
+ 3.2.4 Implications for drivers that do not interpret bios (don't handle
+ multiple segments)
+ 3.3 I/O submission
+ 4. The I/O scheduler
+ 5. Scalability related changes
+ 5.1 Granular locking: Removal of io_request_lock
+ 5.2 Prepare for transition to 64 bit sector_t
+ 6. Other Changes/Implications
+ 6.1 Partition re-mapping handled by the generic block layer
+ 7. A few tips on migration of older drivers
+ 8. A list of prior/related/impacted patches/ideas
+ 9. Other References/Discussion Threads
+
+
+Bio Notes
+=========
+
+Let us discuss the changes in the context of how some overall goals for the
+block layer are addressed.
+
+1. Scope for tuning the generic logic to satisfy various requirements
+=====================================================================
+
+The block layer design supports adaptable abstractions to handle common
+processing with the ability to tune the logic to an appropriate extent
+depending on the nature of the device and the requirements of the caller.
+One of the objectives of the rewrite was to increase the degree of tunability
+and to enable higher level code to utilize underlying device/driver
+capabilities to the maximum extent for better i/o performance. This is
+important especially in the light of ever improving hardware capabilities
+and application/middleware software designed to take advantage of these
+capabilities.
+
+1.1 Tuning based on low level device / driver capabilities
+----------------------------------------------------------
+
+Sophisticated devices with large built-in caches, intelligent i/o scheduling
+optimizations, high memory DMA support, etc may find some of the
+generic processing an overhead, while for less capable devices the
+generic functionality is essential for performance or correctness reasons.
+Knowledge of some of the capabilities or parameters of the device should be
+used at the generic block layer to take the right decisions on
+behalf of the driver.
+
+How is this achieved ?
+
+Tuning at a per-queue level:
+
+i. Per-queue limits/values exported to the generic layer by the driver
+
+Various parameters that the generic i/o scheduler logic uses are set at
+a per-queue level (e.g maximum request size, maximum number of segments in
+a scatter-gather list, logical block size)
+
+Some parameters that were earlier available as global arrays indexed by
+major/minor are now directly associated with the queue. Some of these may
+move into the block device structure in the future. Some characteristics
+have been incorporated into a queue flags field rather than separate fields
+in themselves. There are blk_queue_xxx functions to set the parameters,
+rather than update the fields directly
+
+Some new queue property settings:
+
+ blk_queue_bounce_limit(q, u64 dma_address)
+ Enable I/O to highmem pages, dma_address being the
+ limit. No highmem default.
+
+ blk_queue_max_sectors(q, max_sectors)
+ Sets two variables that limit the size of the request.
+
+ - The request queue's max_sectors, which is a soft size in
+ units of 512 byte sectors, and could be dynamically varied
+ by the core kernel.
+
+ - The request queue's max_hw_sectors, which is a hard limit
+ and reflects the maximum size request a driver can handle
+ in units of 512 byte sectors.
+
+ The default for both max_sectors and max_hw_sectors is
+ 255. The upper limit of max_sectors is 1024.
+
+ blk_queue_max_phys_segments(q, max_segments)
+ Maximum physical segments you can handle in a request. 128
+ default (driver limit). (See 3.2.2)
+
+ blk_queue_max_hw_segments(q, max_segments)
+ Maximum dma segments the hardware can handle in a request. 128
+ default (host adapter limit, after dma remapping).
+ (See 3.2.2)
+
+ blk_queue_max_segment_size(q, max_seg_size)
+ Maximum size of a clustered segment, 64kB default.
+
+ blk_queue_logical_block_size(q, logical_block_size)
+ Lowest possible sector size that the hardware can operate
+ on, 512 bytes default.
+
+New queue flags:
+
+ - QUEUE_FLAG_CLUSTER (see 3.2.2)
+ - QUEUE_FLAG_QUEUED (see 3.2.4)
+
+
+ii. High-mem i/o capabilities are now considered the default
+
+The generic bounce buffer logic, present in 2.4, where the block layer would
+by default copyin/out i/o requests on high-memory buffers to low-memory buffers
+assuming that the driver wouldn't be able to handle it directly, has been
+changed in 2.5. The bounce logic is now applied only for memory ranges
+for which the device cannot handle i/o. A driver can specify this by
+setting the queue bounce limit for the request queue for the device
+(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out
+where a device is capable of handling high memory i/o.
+
+In order to enable high-memory i/o where the device is capable of supporting
+it, the pci dma mapping routines and associated data structures have now been
+modified to accomplish a direct page -> bus translation, without requiring
+a virtual address mapping (unlike the earlier scheme of virtual address
+-> bus translation). So this works uniformly for high-memory pages (which
+do not have a corresponding kernel virtual address space mapping) and
+low-memory pages.
+
+Note: Please refer to Documentation/DMA-API-HOWTO.txt for a discussion
+on PCI high mem DMA aspects and mapping of scatter gather lists, and support
+for 64 bit PCI.
+
+Special handling is required only for cases where i/o needs to happen on
+pages at physical memory addresses beyond what the device can support. In these
+cases, a bounce bio representing a buffer from the supported memory range
+is used for performing the i/o with copyin/copyout as needed depending on
+the type of the operation. For example, in case of a read operation, the
+data read has to be copied to the original buffer on i/o completion, so a
+callback routine is set up to do this, while for write, the data is copied
+from the original buffer to the bounce buffer prior to issuing the
+operation. Since an original buffer may be in a high memory area that's not
+mapped in kernel virtual addr, a kmap operation may be required for
+performing the copy, and special care may be needed in the completion path
+as it may not be in irq context. Special care is also required (by way of
+GFP flags) when allocating bounce buffers, to avoid certain highmem
+deadlock possibilities.
+
+It is also possible that a bounce buffer may be allocated from high-memory
+area that's not mapped in kernel virtual addr, but within the range that the
+device can use directly; so the bounce page may need to be kmapped during
+copy operations. [Note: This does not hold in the current implementation,
+though]
+
+There are some situations when pages from high memory may need to
+be kmapped, even if bounce buffers are not necessary. For example a device
+may need to abort DMA operations and revert to PIO for the transfer, in
+which case a virtual mapping of the page is required. For SCSI it is also
+done in some scenarios where the low level driver cannot be trusted to
+handle a single sg entry correctly. The driver is expected to perform the
+kmaps as needed on such occasions as appropriate. A driver could also use
+the blk_queue_bounce() routine on its own to bounce highmem i/o to low
+memory for specific requests if so desired.
+
+iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
+
+As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular
+queue or pick from (copy) existing generic schedulers and replace/override
+certain portions of it. The 2.5 rewrite provides improved modularization
+of the i/o scheduler. There are more pluggable callbacks, e.g for init,
+add request, extract request, which makes it possible to abstract specific
+i/o scheduling algorithm aspects and details outside of the generic loop.
+It also makes it possible to completely hide the implementation details of
+the i/o scheduler from block drivers.
+
+I/O scheduler wrappers are to be used instead of accessing the queue directly.
+See section 4. The I/O scheduler for details.
+
+1.2 Tuning Based on High level code capabilities
+------------------------------------------------
+
+i. Application capabilities for raw i/o
+
+This comes from some of the high-performance database/middleware
+requirements where an application prefers to make its own i/o scheduling
+decisions based on an understanding of the access patterns and i/o
+characteristics
+
+ii. High performance filesystems or other higher level kernel code's
+capabilities
+
+Kernel components like filesystems could also take their own i/o scheduling
+decisions for optimizing performance. Journalling filesystems may need
+some control over i/o ordering.
+
+What kind of support exists at the generic block layer for this ?
+
+The flags and rw fields in the bio structure can be used for some tuning
+from above e.g indicating that an i/o is just a readahead request, or priority
+settings (currently unused). As far as user applications are concerned they
+would need an additional mechanism either via open flags or ioctls, or some
+other upper level mechanism to communicate such settings to block.
+
+1.2.1 Request Priority/Latency
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Todo/Under discussion::
+
+ Arjan's proposed request priority scheme allows higher levels some broad
+ control (high/med/low) over the priority of an i/o request vs other pending
+ requests in the queue. For example it allows reads for bringing in an
+ executable page on demand to be given a higher priority over pending write
+ requests which haven't aged too much on the queue. Potentially this priority
+ could even be exposed to applications in some manner, providing higher level
+ tunability. Time based aging avoids starvation of lower priority
+ requests. Some bits in the bi_opf flags field in the bio structure are
+ intended to be used for this priority information.
+
+
+1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode)
+-----------------------------------------------------------------------
+
+(e.g Diagnostics, Systems Management)
+
+There are situations where high-level code needs to have direct access to
+the low level device capabilities or requires the ability to issue commands
+to the device bypassing some of the intermediate i/o layers.
+These could, for example, be special control commands issued through ioctl
+interfaces, or could be raw read/write commands that stress the drive's
+capabilities for certain kinds of fitness tests. Having direct interfaces at
+multiple levels without having to pass through upper layers makes
+it possible to perform bottom up validation of the i/o path, layer by
+layer, starting from the media.
+
+The normal i/o submission interfaces, e.g submit_bio, could be bypassed
+for specially crafted requests which such ioctl or diagnostics
+interfaces would typically use, and the elevator add_request routine
+can instead be used to directly insert such requests in the queue or preferably
+the blk_do_rq routine can be used to place the request on the queue and
+wait for completion. Alternatively, sometimes the caller might just
+invoke a lower level driver specific interface with the request as a
+parameter.
+
+If the request is a means for passing on special information associated with
+the command, then such information is associated with the request->special
+field (rather than misuse the request->buffer field which is meant for the
+request data buffer's virtual mapping).
+
+For passing request data, the caller must build up a bio descriptor
+representing the concerned memory buffer if the underlying driver interprets
+bio segments or uses the block layer end*request* functions for i/o
+completion. Alternatively one could directly use the request->buffer field to
+specify the virtual address of the buffer, if the driver expects buffer
+addresses passed in this way and ignores bio entries for the request type
+involved. In the latter case, the driver would modify and manage the
+request->buffer, request->sector and request->nr_sectors or
+request->current_nr_sectors fields itself rather than using the block layer
+end_request or end_that_request_first completion interfaces.
+(See 2.3 or Documentation/block/request.rst for a brief explanation of
+the request structure fields)
+
+::
+
+ [TBD: end_that_request_last should be usable even in this case;
+ Perhaps an end_that_direct_request_first routine could be implemented to make
+ handling direct requests easier for such drivers; Also for drivers that
+ expect bios, a helper function could be provided for setting up a bio
+ corresponding to a data buffer]
+
+ <JENS: I dont understand the above, why is end_that_request_first() not
+ usable? Or _last for that matter. I must be missing something>
+
+ <SUP: What I meant here was that if the request doesn't have a bio, then
+ end_that_request_first doesn't modify nr_sectors or current_nr_sectors,
+ and hence can't be used for advancing request state settings on the
+ completion of partial transfers. The driver has to modify these fields
+ directly by hand.
+ This is because end_that_request_first only iterates over the bio list,
+ and always returns 0 if there are none associated with the request.
+ _last works OK in this case, and is not a problem, as I mentioned earlier
+ >
+
+1.3.1 Pre-built Commands
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+A request can be created with a pre-built custom command to be sent directly
+to the device. The cmd block in the request structure has room for filling
+in the command bytes. (i.e rq->cmd is now 16 bytes in size, and meant for
+command pre-building, and the type of the request is now indicated
+through rq->flags instead of via rq->cmd)
+
+The request structure flags can be set up to indicate the type of request
+in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC:
+packet command issued via blk_do_rq, REQ_SPECIAL: special request).
+
+It can help to pre-build device commands for requests in advance.
+Drivers can now specify a request prepare function (q->prep_rq_fn) that the
+block layer would invoke to pre-build device commands for a given request,
+or perform other preparatory processing for the request. This is routine is
+called by elv_next_request(), i.e. typically just before servicing a request.
+(The prepare function would not be called for requests that have RQF_DONTPREP
+enabled)
+
+Aside:
+ Pre-building could possibly even be done early, i.e before placing the
+ request on the queue, rather than construct the command on the fly in the
+ driver while servicing the request queue when it may affect latencies in
+ interrupt context or responsiveness in general. One way to add early
+ pre-building would be to do it whenever we fail to merge on a request.
+ Now REQ_NOMERGE is set in the request flags to skip this one in the future,
+ which means that it will not change before we feed it to the device. So
+ the pre-builder hook can be invoked there.
+
+
+2. Flexible and generic but minimalist i/o structure/descriptor
+===============================================================
+
+2.1 Reason for a new structure and requirements addressed
+---------------------------------------------------------
+
+Prior to 2.5, buffer heads were used as the unit of i/o at the generic block
+layer, and the low level request structure was associated with a chain of
+buffer heads for a contiguous i/o request. This led to certain inefficiencies
+when it came to large i/o requests and readv/writev style operations, as it
+forced such requests to be broken up into small chunks before being passed
+on to the generic block layer, only to be merged by the i/o scheduler
+when the underlying device was capable of handling the i/o in one shot.
+Also, using the buffer head as an i/o structure for i/os that didn't originate
+from the buffer cache unnecessarily added to the weight of the descriptors
+which were generated for each such chunk.
+
+The following were some of the goals and expectations considered in the
+redesign of the block i/o data structure in 2.5.
+
+1. Should be appropriate as a descriptor for both raw and buffered i/o -
+ avoid cache related fields which are irrelevant in the direct/page i/o path,
+ or filesystem block size alignment restrictions which may not be relevant
+ for raw i/o.
+2. Ability to represent high-memory buffers (which do not have a virtual
+ address mapping in kernel address space).
+3. Ability to represent large i/os w/o unnecessarily breaking them up (i.e
+ greater than PAGE_SIZE chunks in one shot)
+4. At the same time, ability to retain independent identity of i/os from
+ different sources or i/o units requiring individual completion (e.g. for
+ latency reasons)
+5. Ability to represent an i/o involving multiple physical memory segments
+ (including non-page aligned page fragments, as specified via readv/writev)
+ without unnecessarily breaking it up, if the underlying device is capable of
+ handling it.
+6. Preferably should be based on a memory descriptor structure that can be
+ passed around different types of subsystems or layers, maybe even
+ networking, without duplication or extra copies of data/descriptor fields
+ themselves in the process
+7. Ability to handle the possibility of splits/merges as the structure passes
+ through layered drivers (lvm, md, evms), with minimal overhead.
+
+The solution was to define a new structure (bio) for the block layer,
+instead of using the buffer head structure (bh) directly, the idea being
+avoidance of some associated baggage and limitations. The bio structure
+is uniformly used for all i/o at the block layer ; it forms a part of the
+bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are
+mapped to bio structures.
+
+2.2 The bio struct
+------------------
+
+The bio structure uses a vector representation pointing to an array of tuples
+of <page, offset, len> to describe the i/o buffer, and has various other
+fields describing i/o parameters and state that needs to be maintained for
+performing the i/o.
+
+Notice that this representation means that a bio has no virtual address
+mapping at all (unlike buffer heads).
+
+::
+
+ struct bio_vec {
+ struct page *bv_page;
+ unsigned short bv_len;
+ unsigned short bv_offset;
+ };
+
+ /*
+ * main unit of I/O for the block layer and lower layers (ie drivers)
+ */
+ struct bio {
+ struct bio *bi_next; /* request queue link */
+ struct block_device *bi_bdev; /* target device */
+ unsigned long bi_flags; /* status, command, etc */
+ unsigned long bi_opf; /* low bits: r/w, high: priority */
+
+ unsigned int bi_vcnt; /* how may bio_vec's */
+ struct bvec_iter bi_iter; /* current index into bio_vec array */
+
+ unsigned int bi_size; /* total size in bytes */
+ unsigned short bi_hw_segments; /* segments after DMA remapping */
+ unsigned int bi_max; /* max bio_vecs we can hold
+ used as index into pool */
+ struct bio_vec *bi_io_vec; /* the actual vec list */
+ bio_end_io_t *bi_end_io; /* bi_end_io (bio) */
+ atomic_t bi_cnt; /* pin count: free when it hits zero */
+ void *bi_private;
+ };
+
+With this multipage bio design:
+
+- Large i/os can be sent down in one go using a bio_vec list consisting
+ of an array of <page, offset, len> fragments (similar to the way fragments
+ are represented in the zero-copy network code)
+- Splitting of an i/o request across multiple devices (as in the case of
+ lvm or raid) is achieved by cloning the bio (where the clone points to
+ the same bi_io_vec array, but with the index and size accordingly modified)
+- A linked list of bios is used as before for unrelated merges [#]_ - this
+ avoids reallocs and makes independent completions easier to handle.
+- Code that traverses the req list can find all the segments of a bio
+ by using rq_for_each_segment. This handles the fact that a request
+ has multiple bios, each of which can have multiple segments.
+- Drivers which can't process a large bio in one shot can use the bi_iter
+ field to keep track of the next bio_vec entry to process.
+ (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
+ [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying
+ bi_offset an len fields]
+
+.. [#]
+
+ unrelated merges -- a request ends up containing two or more bios that
+ didn't originate from the same place.
+
+bi_end_io() i/o callback gets called on i/o completion of the entire bio.
+
+At a lower level, drivers build a scatter gather list from the merged bios.
+The scatter gather list is in the form of an array of <page, offset, len>
+entries with their corresponding dma address mappings filled in at the
+appropriate time. As an optimization, contiguous physical pages can be
+covered by a single entry where <page> refers to the first page and <len>
+covers the range of pages (up to 16 contiguous pages could be covered this
+way). There is a helper routine (blk_rq_map_sg) which drivers can use to build
+the sg list.
+
+Note: Right now the only user of bios with more than one page is ll_rw_kio,
+which in turn means that only raw I/O uses it (direct i/o may not work
+right now). The intent however is to enable clustering of pages etc to
+become possible. The pagebuf abstraction layer from SGI also uses multi-page
+bios, but that is currently not included in the stock development kernels.
+The same is true of Andrew Morton's work-in-progress multipage bio writeout
+and readahead patches.
+
+2.3 Changes in the Request Structure
+------------------------------------
+
+The request structure is the structure that gets passed down to low level
+drivers. The block layer make_request function builds up a request structure,
+places it on the queue and invokes the drivers request_fn. The driver makes
+use of block layer helper routine elv_next_request to pull the next request
+off the queue. Control or diagnostic functions might bypass block and directly
+invoke underlying driver entry points passing in a specially constructed
+request structure.
+
+Only some relevant fields (mainly those which changed or may be referred
+to in some of the discussion here) are listed below, not necessarily in
+the order in which they occur in the structure (see include/linux/blkdev.h)
+Refer to Documentation/block/request.rst for details about all the request
+structure fields and a quick reference about the layers which are
+supposed to use or modify those fields::
+
+ struct request {
+ struct list_head queuelist; /* Not meant to be directly accessed by
+ the driver.
+ Used by q->elv_next_request_fn
+ rq->queue is gone
+ */
+ .
+ .
+ unsigned char cmd[16]; /* prebuilt command data block */
+ unsigned long flags; /* also includes earlier rq->cmd settings */
+ .
+ .
+ sector_t sector; /* this field is now of type sector_t instead of int
+ preparation for 64 bit sectors */
+ .
+ .
+
+ /* Number of scatter-gather DMA addr+len pairs after
+ * physical address coalescing is performed.
+ */
+ unsigned short nr_phys_segments;
+
+ /* Number of scatter-gather addr+len pairs after
+ * physical and DMA remapping hardware coalescing is performed.
+ * This is the number of scatter-gather entries the driver
+ * will actually have to deal with after DMA mapping is done.
+ */
+ unsigned short nr_hw_segments;
+
+ /* Various sector counts */
+ unsigned long nr_sectors; /* no. of sectors left: driver modifiable */
+ unsigned long hard_nr_sectors; /* block internal copy of above */
+ unsigned int current_nr_sectors; /* no. of sectors left in the
+ current segment:driver modifiable */
+ unsigned long hard_cur_sectors; /* block internal copy of the above */
+ .
+ .
+ int tag; /* command tag associated with request */
+ void *special; /* same as before */
+ char *buffer; /* valid only for low memory buffers up to
+ current_nr_sectors */
+ .
+ .
+ struct bio *bio, *biotail; /* bio list instead of bh */
+ struct request_list *rl;
+ }
+
+See the req_ops and req_flag_bits definitions for an explanation of the various
+flags available. Some bits are used by the block layer or i/o scheduler.
+
+The behaviour of the various sector counts are almost the same as before,
+except that since we have multi-segment bios, current_nr_sectors refers
+to the numbers of sectors in the current segment being processed which could
+be one of the many segments in the current bio (i.e i/o completion unit).
+The nr_sectors value refers to the total number of sectors in the whole
+request that remain to be transferred (no change). The purpose of the
+hard_xxx values is for block to remember these counts every time it hands
+over the request to the driver. These values are updated by block on
+end_that_request_first, i.e. every time the driver completes a part of the
+transfer and invokes block end*request helpers to mark this. The
+driver should not modify these values. The block layer sets up the
+nr_sectors and current_nr_sectors fields (based on the corresponding
+hard_xxx values and the number of bytes transferred) and updates it on
+every transfer that invokes end_that_request_first. It does the same for the
+buffer, bio, bio->bi_iter fields too.
+
+The buffer field is just a virtual address mapping of the current segment
+of the i/o buffer in cases where the buffer resides in low-memory. For high
+memory i/o, this field is not valid and must not be used by drivers.
+
+Code that sets up its own request structures and passes them down to
+a driver needs to be careful about interoperation with the block layer helper
+functions which the driver uses. (Section 1.3)
+
+3. Using bios
+=============
+
+3.1 Setup/Teardown
+------------------
+
+There are routines for managing the allocation, and reference counting, and
+freeing of bios (bio_alloc, bio_get, bio_put).
+
+This makes use of Ingo Molnar's mempool implementation, which enables
+subsystems like bio to maintain their own reserve memory pools for guaranteed
+deadlock-free allocations during extreme VM load. For example, the VM
+subsystem makes use of the block layer to writeout dirty pages in order to be
+able to free up memory space, a case which needs careful handling. The
+allocation logic draws from the preallocated emergency reserve in situations
+where it cannot allocate through normal means. If the pool is empty and it
+can wait, then it would trigger action that would help free up memory or
+replenish the pool (without deadlocking) and wait for availability in the pool.
+If it is in IRQ context, and hence not in a position to do this, allocation
+could fail if the pool is empty. In general mempool always first tries to
+perform allocation without having to wait, even if it means digging into the
+pool as long it is not less that 50% full.
+
+On a free, memory is released to the pool or directly freed depending on
+the current availability in the pool. The mempool interface lets the
+subsystem specify the routines to be used for normal alloc and free. In the
+case of bio, these routines make use of the standard slab allocator.
+
+The caller of bio_alloc is expected to taken certain steps to avoid
+deadlocks, e.g. avoid trying to allocate more memory from the pool while
+already holding memory obtained from the pool.
+
+::
+
+ [TBD: This is a potential issue, though a rare possibility
+ in the bounce bio allocation that happens in the current code, since
+ it ends up allocating a second bio from the same pool while
+ holding the original bio ]
+
+Memory allocated from the pool should be released back within a limited
+amount of time (in the case of bio, that would be after the i/o is completed).
+This ensures that if part of the pool has been used up, some work (in this
+case i/o) must already be in progress and memory would be available when it
+is over. If allocating from multiple pools in the same code path, the order
+or hierarchy of allocation needs to be consistent, just the way one deals
+with multiple locks.
+
+The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc())
+for a non-clone bio. There are the 6 pools setup for different size biovecs,
+so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
+given size from these slabs.
+
+The bio_get() routine may be used to hold an extra reference on a bio prior
+to i/o submission, if the bio fields are likely to be accessed after the
+i/o is issued (since the bio may otherwise get freed in case i/o completion
+happens in the meantime).
+
+The bio_clone_fast() routine may be used to duplicate a bio, where the clone
+shares the bio_vec_list with the original bio (i.e. both point to the
+same bio_vec_list). This would typically be used for splitting i/o requests
+in lvm or md.
+
+3.2 Generic bio helper Routines
+-------------------------------
+
+3.2.1 Traversing segments and completion units in a request
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The macro rq_for_each_segment() should be used for traversing the bios
+in the request list (drivers should avoid directly trying to do it
+themselves). Using these helpers should also make it easier to cope
+with block changes in the future.
+
+::
+
+ struct req_iterator iter;
+ rq_for_each_segment(bio_vec, rq, iter)
+ /* bio_vec is now current segment */
+
+I/O completion callbacks are per-bio rather than per-segment, so drivers
+that traverse bio chains on completion need to keep that in mind. Drivers
+which don't make a distinction between segments and completion units would
+need to be reorganized to support multi-segment bios.
+
+3.2.2 Setting up DMA scatterlists
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The blk_rq_map_sg() helper routine would be used for setting up scatter
+gather lists from a request, so a driver need not do it on its own.
+
+ nr_segments = blk_rq_map_sg(q, rq, scatterlist);
+
+The helper routine provides a level of abstraction which makes it easier
+to modify the internals of request to scatterlist conversion down the line
+without breaking drivers. The blk_rq_map_sg routine takes care of several
+things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER
+is set) and correct segment accounting to avoid exceeding the limits which
+the i/o hardware can handle, based on various queue properties.
+
+- Prevents a clustered segment from crossing a 4GB mem boundary
+- Avoids building segments that would exceed the number of physical
+ memory segments that the driver can handle (phys_segments) and the
+ number that the underlying hardware can handle at once, accounting for
+ DMA remapping (hw_segments) (i.e. IOMMU aware limits).
+
+Routines which the low level driver can use to set up the segment limits:
+
+blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of
+hw data segments in a request (i.e. the maximum number of address/length
+pairs the host adapter can actually hand to the device at once)
+
+blk_queue_max_phys_segments() : Sets an upper limit on the maximum number
+of physical data segments in a request (i.e. the largest sized scatter list
+a driver could handle)
+
+3.2.3 I/O completion
+^^^^^^^^^^^^^^^^^^^^
+
+The existing generic block layer helper routines end_request,
+end_that_request_first and end_that_request_last can be used for i/o
+completion (and setting things up so the rest of the i/o or the next
+request can be kicked of) as before. With the introduction of multi-page
+bio support, end_that_request_first requires an additional argument indicating
+the number of sectors completed.
+
+3.2.4 Implications for drivers that do not interpret bios
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+(don't handle multiple segments)
+
+Drivers that do not interpret bios e.g those which do not handle multiple
+segments and do not support i/o into high memory addresses (require bounce
+buffers) and expect only virtually mapped buffers, can access the rq->buffer
+field. As before the driver should use current_nr_sectors to determine the
+size of remaining data in the current segment (that is the maximum it can
+transfer in one go unless it interprets segments), and rely on the block layer
+end_request, or end_that_request_first/last to take care of all accounting
+and transparent mapping of the next bio segment when a segment boundary
+is crossed on completion of a transfer. (The end*request* functions should
+be used if only if the request has come down from block/bio path, not for
+direct access requests which only specify rq->buffer without a valid rq->bio)
+
+3.3 I/O Submission
+------------------
+
+The routine submit_bio() is used to submit a single io. Higher level i/o
+routines make use of this:
+
+(a) Buffered i/o:
+
+The routine submit_bh() invokes submit_bio() on a bio corresponding to the
+bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before.
+
+(b) Kiobuf i/o (for raw/direct i/o):
+
+The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and
+maps the array to one or more multi-page bios, issuing submit_bio() to
+perform the i/o on each of these.
+
+The embedded bh array in the kiobuf structure has been removed and no
+preallocation of bios is done for kiobufs. [The intent is to remove the
+blocks array as well, but it's currently in there to kludge around direct i/o.]
+Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc.
+
+Todo/Observation:
+
+ A single kiobuf structure is assumed to correspond to a contiguous range
+ of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec.
+ So right now it wouldn't work for direct i/o on non-contiguous blocks.
+ This is to be resolved. The eventual direction is to replace kiobuf
+ by kvec's.
+
+ Badari Pulavarty has a patch to implement direct i/o correctly using
+ bio and kvec.
+
+
+(c) Page i/o:
+
+Todo/Under discussion:
+
+ Andrew Morton's multi-page bio patches attempt to issue multi-page
+ writeouts (and reads) from the page cache, by directly building up
+ large bios for submission completely bypassing the usage of buffer
+ heads. This work is still in progress.
+
+ Christoph Hellwig had some code that uses bios for page-io (rather than
+ bh). This isn't included in bio as yet. Christoph was also working on a
+ design for representing virtual/real extents as an entity and modifying
+ some of the address space ops interfaces to utilize this abstraction rather
+ than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf
+ abstraction, but intended to be as lightweight as possible).
+
+(d) Direct access i/o:
+
+Direct access requests that do not contain bios would be submitted differently
+as discussed earlier in section 1.3.
+
+Aside:
+
+ Kvec i/o:
+
+ Ben LaHaise's aio code uses a slightly different structure instead
+ of kiobufs, called a kvec_cb. This contains an array of <page, offset, len>
+ tuples (very much like the networking code), together with a callback function
+ and data pointer. This is embedded into a brw_cb structure when passed
+ to brw_kvec_async().
+
+ Now it should be possible to directly map these kvecs to a bio. Just as while
+ cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec
+ array pointer to point to the veclet array in kvecs.
+
+ TBD: In order for this to work, some changes are needed in the way multi-page
+ bios are handled today. The values of the tuples in such a vector passed in
+ from higher level code should not be modified by the block layer in the course
+ of its request processing, since that would make it hard for the higher layer
+ to continue to use the vector descriptor (kvec) after i/o completes. Instead,
+ all such transient state should either be maintained in the request structure,
+ and passed on in some way to the endio completion routine.
+
+
+4. The I/O scheduler
+====================
+
+I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch
+queue and specific I/O schedulers. Unless stated otherwise, elevator is used
+to refer to both parts and I/O scheduler to specific I/O schedulers.
+
+Block layer implements generic dispatch queue in `block/*.c`.
+The generic dispatch queue is responsible for requeueing, handling non-fs
+requests and all other subtleties.
+
+Specific I/O schedulers are responsible for ordering normal filesystem
+requests. They can also choose to delay certain requests to improve
+throughput or whatever purpose. As the plural form indicates, there are
+multiple I/O schedulers. They can be built as modules but at least one should
+be built inside the kernel. Each queue can choose different one and can also
+change to another one dynamically.
+
+A block layer call to the i/o scheduler follows the convention elv_xxx(). This
+calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx
+and xxx might not match exactly, but use your imagination. If an elevator
+doesn't implement a function, the switch does nothing or some minimal house
+keeping work.
+
+4.1. I/O scheduler API
+----------------------
+
+The functions an elevator may implement are: (* are mandatory)
+
+=============================== ================================================
+elevator_merge_fn called to query requests for merge with a bio
+
+elevator_merge_req_fn called when two requests get merged. the one
+ which gets merged into the other one will be
+ never seen by I/O scheduler again. IOW, after
+ being merged, the request is gone.
+
+elevator_merged_fn called when a request in the scheduler has been
+ involved in a merge. It is used in the deadline
+ scheduler for example, to reposition the request
+ if its sorting order has changed.
+
+elevator_allow_merge_fn called whenever the block layer determines
+ that a bio can be merged into an existing
+ request safely. The io scheduler may still
+ want to stop a merge at this point if it
+ results in some sort of conflict internally,
+ this hook allows it to do that. Note however
+ that two *requests* can still be merged at later
+ time. Currently the io scheduler has no way to
+ prevent that. It can only learn about the fact
+ from elevator_merge_req_fn callback.
+
+elevator_dispatch_fn* fills the dispatch queue with ready requests.
+ I/O schedulers are free to postpone requests by
+ not filling the dispatch queue unless @force
+ is non-zero. Once dispatched, I/O schedulers
+ are not allowed to manipulate the requests -
+ they belong to generic dispatch queue.
+
+elevator_add_req_fn* called to add a new request into the scheduler
+
+elevator_former_req_fn
+elevator_latter_req_fn These return the request before or after the
+ one specified in disk sort order. Used by the
+ block layer to find merge possibilities.
+
+elevator_completed_req_fn called when a request is completed.
+
+elevator_set_req_fn
+elevator_put_req_fn Must be used to allocate and free any elevator
+ specific storage for a request.
+
+elevator_activate_req_fn Called when device driver first sees a request.
+ I/O schedulers can use this callback to
+ determine when actual execution of a request
+ starts.
+elevator_deactivate_req_fn Called when device driver decides to delay
+ a request by requeueing it.
+
+elevator_init_fn*
+elevator_exit_fn Allocate and free any elevator specific storage
+ for a queue.
+=============================== ================================================
+
+4.2 Request flows seen by I/O schedulers
+----------------------------------------
+
+All requests seen by I/O schedulers strictly follow one of the following three
+flows.
+
+ set_req_fn ->
+
+ i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
+ (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
+ ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
+ iii. [none]
+
+ -> put_req_fn
+
+4.3 I/O scheduler implementation
+--------------------------------
+
+The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
+optimal disk scan and request servicing performance (based on generic
+principles and device capabilities), optimized for:
+
+i. improved throughput
+ii. improved latency
+iii. better utilization of h/w & CPU time
+
+Characteristics:
+
+i. Binary tree
+AS and deadline i/o schedulers use red black binary trees for disk position
+sorting and searching, and a fifo linked list for time-based searching. This
+gives good scalability and good availability of information. Requests are
+almost always dispatched in disk sort order, so a cache is kept of the next
+request in sort order to prevent binary tree lookups.
+
+This arrangement is not a generic block layer characteristic however, so
+elevators may implement queues as they please.
+
+ii. Merge hash
+AS and deadline use a hash table indexed by the last sector of a request. This
+enables merging code to quickly look up "back merge" candidates, even when
+multiple I/O streams are being performed at once on one disk.
+
+"Front merges", a new request being merged at the front of an existing request,
+are far less common than "back merges" due to the nature of most I/O patterns.
+Front merges are handled by the binary trees in AS and deadline schedulers.
+
+iii. Plugging the queue to batch requests in anticipation of opportunities for
+ merge/sort optimizations
+
+Plugging is an approach that the current i/o scheduling algorithm resorts to so
+that it collects up enough requests in the queue to be able to take
+advantage of the sorting/merging logic in the elevator. If the
+queue is empty when a request comes in, then it plugs the request queue
+(sort of like plugging the bath tub of a vessel to get fluid to build up)
+till it fills up with a few more requests, before starting to service
+the requests. This provides an opportunity to merge/sort the requests before
+passing them down to the device. There are various conditions when the queue is
+unplugged (to open up the flow again), either through a scheduled task or
+could be on demand. For example wait_on_buffer sets the unplugging going
+through sync_buffer() running blk_run_address_space(mapping). Or the caller
+can do it explicity through blk_unplug(bdev). So in the read case,
+the queue gets explicitly unplugged as part of waiting for completion on that
+buffer.
+
+Aside:
+ This is kind of controversial territory, as it's not clear if plugging is
+ always the right thing to do. Devices typically have their own queues,
+ and allowing a big queue to build up in software, while letting the device be
+ idle for a while may not always make sense. The trick is to handle the fine
+ balance between when to plug and when to open up. Also now that we have
+ multi-page bios being queued in one shot, we may not need to wait to merge
+ a big request from the broken up pieces coming by.
+
+4.4 I/O contexts
+----------------
+
+I/O contexts provide a dynamically allocated per process data area. They may
+be used in I/O schedulers, and in the block layer (could be used for IO statis,
+priorities for example). See `*io_context` in block/ll_rw_blk.c, and as-iosched.c
+for an example of usage in an i/o scheduler.
+
+
+5. Scalability related changes
+==============================
+
+5.1 Granular Locking: io_request_lock replaced by a per-queue lock
+------------------------------------------------------------------
+
+The global io_request_lock has been removed as of 2.5, to avoid
+the scalability bottleneck it was causing, and has been replaced by more
+granular locking. The request queue structure has a pointer to the
+lock to be used for that queue. As a result, locking can now be
+per-queue, with a provision for sharing a lock across queues if
+necessary (e.g the scsi layer sets the queue lock pointers to the
+corresponding adapter lock, which results in a per host locking
+granularity). The locking semantics are the same, i.e. locking is
+still imposed by the block layer, grabbing the lock before
+request_fn execution which it means that lots of older drivers
+should still be SMP safe. Drivers are free to drop the queue
+lock themselves, if required. Drivers that explicitly used the
+io_request_lock for serialization need to be modified accordingly.
+Usually it's as easy as adding a global lock::
+
+ static DEFINE_SPINLOCK(my_driver_lock);
+
+and passing the address to that lock to blk_init_queue().
+
+5.2 64 bit sector numbers (sector_t prepares for 64 bit support)
+----------------------------------------------------------------
+
+The sector number used in the bio structure has been changed to sector_t,
+which could be defined as 64 bit in preparation for 64 bit sector support.
+
+6. Other Changes/Implications
+=============================
+
+6.1 Partition re-mapping handled by the generic block layer
+-----------------------------------------------------------
+
+In 2.5 some of the gendisk/partition related code has been reorganized.
+Now the generic block layer performs partition-remapping early and thus
+provides drivers with a sector number relative to whole device, rather than
+having to take partition number into account in order to arrive at the true
+sector number. The routine blk_partition_remap() is invoked by
+generic_make_request even before invoking the queue specific make_request_fn,
+so the i/o scheduler also gets to operate on whole disk sector numbers. This
+should typically not require changes to block drivers, it just never gets
+to invoke its own partition sector offset calculations since all bios
+sent are offset from the beginning of the device.
+
+
+7. A Few Tips on Migration of older drivers
+===========================================
+
+Old-style drivers that just use CURRENT and ignores clustered requests,
+may not need much change. The generic layer will automatically handle
+clustered requests, multi-page bios, etc for the driver.
+
+For a low performance driver or hardware that is PIO driven or just doesn't
+support scatter-gather changes should be minimal too.
+
+The following are some points to keep in mind when converting old drivers
+to bio.
+
+Drivers should use elv_next_request to pick up requests and are no longer
+supposed to handle looping directly over the request list.
+(struct request->queue has been removed)
+
+Now end_that_request_first takes an additional number_of_sectors argument.
+It used to handle always just the first buffer_head in a request, now
+it will loop and handle as many sectors (on a bio-segment granularity)
+as specified.
+
+Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
+right thing to use is bio_endio(bio) instead.
+
+If the driver is dropping the io_request_lock from its request_fn strategy,
+then it just needs to replace that with q->queue_lock instead.
+
+As described in Sec 1.1, drivers can set max sector size, max segment size
+etc per queue now. Drivers that used to define their own merge functions i
+to handle things like this can now just use the blk_queue_* functions at
+blk_init_queue time.
+
+Drivers no longer have to map a {partition, sector offset} into the
+correct absolute location anymore, this is done by the block layer, so
+where a driver received a request ala this before::
+
+ rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */
+ rq->sector = 0; /* first sector on hda5 */
+
+it will now see::
+
+ rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */
+ rq->sector = 123128; /* offset from start of disk */
+
+As mentioned, there is no virtual mapping of a bio. For DMA, this is
+not a problem as the driver probably never will need a virtual mapping.
+Instead it needs a bus mapping (dma_map_page for a single segment or
+use dma_map_sg for scatter gather) to be able to ship it to the driver. For
+PIO drivers (or drivers that need to revert to PIO transfer once in a
+while (IDE for example)), where the CPU is doing the actual data
+transfer a virtual mapping is needed. If the driver supports highmem I/O,
+(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map
+a bio into the virtual address space.
+
+
+8. Prior/Related/Impacted patches
+=================================
+
+8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp)
+-----------------------------------------------------
+
+- orig kiobuf & raw i/o patches (now in 2.4 tree)
+- direct kiobuf based i/o to devices (no intermediate bh's)
+- page i/o using kiobuf
+- kiobuf splitting for lvm (mkp)
+- elevator support for kiobuf request merging (axboe)
+
+8.2. Zero-copy networking (Dave Miller)
+---------------------------------------
+
+8.3. SGI XFS - pagebuf patches - use of kiobufs
+-----------------------------------------------
+8.4. Multi-page pioent patch for bio (Christoph Hellwig)
+--------------------------------------------------------
+8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11
+--------------------------------------------------------------------
+8.6. Async i/o implementation patch (Ben LaHaise)
+-------------------------------------------------
+8.7. EVMS layering design (IBM EVMS team)
+-----------------------------------------
+8.8. Larger page cache size patch (Ben LaHaise) and Large page size (Daniel Phillips)
+-------------------------------------------------------------------------------------
+
+ => larger contiguous physical memory buffers
+
+8.9. VM reservations patch (Ben LaHaise)
+----------------------------------------
+8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?)
+----------------------------------------------------------
+8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+
+---------------------------------------------------------------------------
+8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar, Badari)
+-------------------------------------------------------------------------------
+8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven)
+------------------------------------------------------------------
+8.14 IDE Taskfile i/o patch (Andre Hedrick)
+--------------------------------------------
+8.15 Multi-page writeout and readahead patches (Andrew Morton)
+---------------------------------------------------------------
+8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy)
+-----------------------------------------------------------------------
+
+9. Other References
+===================
+
+9.1 The Splice I/O Model
+------------------------
+
+Larry McVoy (and subsequent discussions on lkml, and Linus' comments - Jan 2001
+
+9.2 Discussions about kiobuf and bh design
+------------------------------------------
+
+On lkml between sct, linus, alan et al - Feb-March 2001 (many of the
+initial thoughts that led to bio were brought up in this discussion thread)
+
+9.3 Discussions on mempool on lkml - Dec 2001.
+----------------------------------------------
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
deleted file mode 100644
index ac18b488cb5e..000000000000
--- a/Documentation/block/biodoc.txt
+++ /dev/null
@@ -1,1077 +0,0 @@
- Notes on the Generic Block Layer Rewrite in Linux 2.5
- =====================================================
-
-Notes Written on Jan 15, 2002:
- Jens Axboe <jens.axboe@oracle.com>
- Suparna Bhattacharya <suparna@in.ibm.com>
-
-Last Updated May 2, 2002
-September 2003: Updated I/O Scheduler portions
- Nick Piggin <npiggin@kernel.dk>
-
-Introduction:
-
-These are some notes describing some aspects of the 2.5 block layer in the
-context of the bio rewrite. The idea is to bring out some of the key
-changes and a glimpse of the rationale behind those changes.
-
-Please mail corrections & suggestions to suparna@in.ibm.com.
-
-Credits:
----------
-
-2.5 bio rewrite:
- Jens Axboe <jens.axboe@oracle.com>
-
-Many aspects of the generic block layer redesign were driven by and evolved
-over discussions, prior patches and the collective experience of several
-people. See sections 8 and 9 for a list of some related references.
-
-The following people helped with review comments and inputs for this
-document:
- Christoph Hellwig <hch@infradead.org>
- Arjan van de Ven <arjanv@redhat.com>
- Randy Dunlap <rdunlap@xenotime.net>
- Andre Hedrick <andre@linux-ide.org>
-
-The following people helped with fixes/contributions to the bio patches
-while it was still work-in-progress:
- David S. Miller <davem@redhat.com>
-
-
-Description of Contents:
-------------------------
-
-1. Scope for tuning of logic to various needs
- 1.1 Tuning based on device or low level driver capabilities
- - Per-queue parameters
- - Highmem I/O support
- - I/O scheduler modularization
- 1.2 Tuning based on high level requirements/capabilities
- 1.2.1 Request Priority/Latency
- 1.3 Direct access/bypass to lower layers for diagnostics and special
- device operations
- 1.3.1 Pre-built commands
-2. New flexible and generic but minimalist i/o structure or descriptor
- (instead of using buffer heads at the i/o layer)
- 2.1 Requirements/Goals addressed
- 2.2 The bio struct in detail (multi-page io unit)
- 2.3 Changes in the request structure
-3. Using bios
- 3.1 Setup/teardown (allocation, splitting)
- 3.2 Generic bio helper routines
- 3.2.1 Traversing segments and completion units in a request
- 3.2.2 Setting up DMA scatterlists
- 3.2.3 I/O completion
- 3.2.4 Implications for drivers that do not interpret bios (don't handle
- multiple segments)
- 3.3 I/O submission
-4. The I/O scheduler
-5. Scalability related changes
- 5.1 Granular locking: Removal of io_request_lock
- 5.2 Prepare for transition to 64 bit sector_t
-6. Other Changes/Implications
- 6.1 Partition re-mapping handled by the generic block layer
-7. A few tips on migration of older drivers
-8. A list of prior/related/impacted patches/ideas
-9. Other References/Discussion Threads
-
----------------------------------------------------------------------------
-
-Bio Notes
---------
-
-Let us discuss the changes in the context of how some overall goals for the
-block layer are addressed.
-
-1. Scope for tuning the generic logic to satisfy various requirements
-
-The block layer design supports adaptable abstractions to handle common
-processing with the ability to tune the logic to an appropriate extent
-depending on the nature of the device and the requirements of the caller.
-One of the objectives of the rewrite was to increase the degree of tunability
-and to enable higher level code to utilize underlying device/driver
-capabilities to the maximum extent for better i/o performance. This is
-important especially in the light of ever improving hardware capabilities
-and application/middleware software designed to take advantage of these
-capabilities.
-
-1.1 Tuning based on low level device / driver capabilities
-
-Sophisticated devices with large built-in caches, intelligent i/o scheduling
-optimizations, high memory DMA support, etc may find some of the
-generic processing an overhead, while for less capable devices the
-generic functionality is essential for performance or correctness reasons.
-Knowledge of some of the capabilities or parameters of the device should be
-used at the generic block layer to take the right decisions on
-behalf of the driver.
-
-How is this achieved ?
-
-Tuning at a per-queue level:
-
-i. Per-queue limits/values exported to the generic layer by the driver
-
-Various parameters that the generic i/o scheduler logic uses are set at
-a per-queue level (e.g maximum request size, maximum number of segments in
-a scatter-gather list, logical block size)
-
-Some parameters that were earlier available as global arrays indexed by
-major/minor are now directly associated with the queue. Some of these may
-move into the block device structure in the future. Some characteristics
-have been incorporated into a queue flags field rather than separate fields
-in themselves. There are blk_queue_xxx functions to set the parameters,
-rather than update the fields directly
-
-Some new queue property settings:
-
- blk_queue_bounce_limit(q, u64 dma_address)
- Enable I/O to highmem pages, dma_address being the
- limit. No highmem default.
-
- blk_queue_max_sectors(q, max_sectors)
- Sets two variables that limit the size of the request.
-
- - The request queue's max_sectors, which is a soft size in
- units of 512 byte sectors, and could be dynamically varied
- by the core kernel.
-
- - The request queue's max_hw_sectors, which is a hard limit
- and reflects the maximum size request a driver can handle
- in units of 512 byte sectors.
-
- The default for both max_sectors and max_hw_sectors is
- 255. The upper limit of max_sectors is 1024.
-
- blk_queue_max_phys_segments(q, max_segments)
- Maximum physical segments you can handle in a request. 128
- default (driver limit). (See 3.2.2)
-
- blk_queue_max_hw_segments(q, max_segments)
- Maximum dma segments the hardware can handle in a request. 128
- default (host adapter limit, after dma remapping).
- (See 3.2.2)
-
- blk_queue_max_segment_size(q, max_seg_size)
- Maximum size of a clustered segment, 64kB default.
-
- blk_queue_logical_block_size(q, logical_block_size)
- Lowest possible sector size that the hardware can operate
- on, 512 bytes default.
-
-New queue flags:
-
- QUEUE_FLAG_CLUSTER (see 3.2.2)
- QUEUE_FLAG_QUEUED (see 3.2.4)
-
-
-ii. High-mem i/o capabilities are now considered the default
-
-The generic bounce buffer logic, present in 2.4, where the block layer would
-by default copyin/out i/o requests on high-memory buffers to low-memory buffers
-assuming that the driver wouldn't be able to handle it directly, has been
-changed in 2.5. The bounce logic is now applied only for memory ranges
-for which the device cannot handle i/o. A driver can specify this by
-setting the queue bounce limit for the request queue for the device
-(blk_queue_bounce_limit()). This avoids the inefficiencies of the copyin/out
-where a device is capable of handling high memory i/o.
-
-In order to enable high-memory i/o where the device is capable of supporting
-it, the pci dma mapping routines and associated data structures have now been
-modified to accomplish a direct page -> bus translation, without requiring
-a virtual address mapping (unlike the earlier scheme of virtual address
--> bus translation). So this works uniformly for high-memory pages (which
-do not have a corresponding kernel virtual address space mapping) and
-low-memory pages.
-
-Note: Please refer to Documentation/DMA-API-HOWTO.txt for a discussion
-on PCI high mem DMA aspects and mapping of scatter gather lists, and support
-for 64 bit PCI.
-
-Special handling is required only for cases where i/o needs to happen on
-pages at physical memory addresses beyond what the device can support. In these
-cases, a bounce bio representing a buffer from the supported memory range
-is used for performing the i/o with copyin/copyout as needed depending on
-the type of the operation. For example, in case of a read operation, the
-data read has to be copied to the original buffer on i/o completion, so a
-callback routine is set up to do this, while for write, the data is copied
-from the original buffer to the bounce buffer prior to issuing the
-operation. Since an original buffer may be in a high memory area that's not
-mapped in kernel virtual addr, a kmap operation may be required for
-performing the copy, and special care may be needed in the completion path
-as it may not be in irq context. Special care is also required (by way of
-GFP flags) when allocating bounce buffers, to avoid certain highmem
-deadlock possibilities.
-
-It is also possible that a bounce buffer may be allocated from high-memory
-area that's not mapped in kernel virtual addr, but within the range that the
-device can use directly; so the bounce page may need to be kmapped during
-copy operations. [Note: This does not hold in the current implementation,
-though]
-
-There are some situations when pages from high memory may need to
-be kmapped, even if bounce buffers are not necessary. For example a device
-may need to abort DMA operations and revert to PIO for the transfer, in
-which case a virtual mapping of the page is required. For SCSI it is also
-done in some scenarios where the low level driver cannot be trusted to
-handle a single sg entry correctly. The driver is expected to perform the
-kmaps as needed on such occasions as appropriate. A driver could also use
-the blk_queue_bounce() routine on its own to bounce highmem i/o to low
-memory for specific requests if so desired.
-
-iii. The i/o scheduler algorithm itself can be replaced/set as appropriate
-
-As in 2.4, it is possible to plugin a brand new i/o scheduler for a particular
-queue or pick from (copy) existing generic schedulers and replace/override
-certain portions of it. The 2.5 rewrite provides improved modularization
-of the i/o scheduler. There are more pluggable callbacks, e.g for init,
-add request, extract request, which makes it possible to abstract specific
-i/o scheduling algorithm aspects and details outside of the generic loop.
-It also makes it possible to completely hide the implementation details of
-the i/o scheduler from block drivers.
-
-I/O scheduler wrappers are to be used instead of accessing the queue directly.
-See section 4. The I/O scheduler for details.
-
-1.2 Tuning Based on High level code capabilities
-
-i. Application capabilities for raw i/o
-
-This comes from some of the high-performance database/middleware
-requirements where an application prefers to make its own i/o scheduling
-decisions based on an understanding of the access patterns and i/o
-characteristics
-
-ii. High performance filesystems or other higher level kernel code's
-capabilities
-
-Kernel components like filesystems could also take their own i/o scheduling
-decisions for optimizing performance. Journalling filesystems may need
-some control over i/o ordering.
-
-What kind of support exists at the generic block layer for this ?
-
-The flags and rw fields in the bio structure can be used for some tuning
-from above e.g indicating that an i/o is just a readahead request, or priority
-settings (currently unused). As far as user applications are concerned they
-would need an additional mechanism either via open flags or ioctls, or some
-other upper level mechanism to communicate such settings to block.
-
-1.2.1 Request Priority/Latency
-
-Todo/Under discussion:
-Arjan's proposed request priority scheme allows higher levels some broad
- control (high/med/low) over the priority of an i/o request vs other pending
- requests in the queue. For example it allows reads for bringing in an
- executable page on demand to be given a higher priority over pending write
- requests which haven't aged too much on the queue. Potentially this priority
- could even be exposed to applications in some manner, providing higher level
- tunability. Time based aging avoids starvation of lower priority
- requests. Some bits in the bi_opf flags field in the bio structure are
- intended to be used for this priority information.
-
-
-1.3 Direct Access to Low level Device/Driver Capabilities (Bypass mode)
- (e.g Diagnostics, Systems Management)
-
-There are situations where high-level code needs to have direct access to
-the low level device capabilities or requires the ability to issue commands
-to the device bypassing some of the intermediate i/o layers.
-These could, for example, be special control commands issued through ioctl
-interfaces, or could be raw read/write commands that stress the drive's
-capabilities for certain kinds of fitness tests. Having direct interfaces at
-multiple levels without having to pass through upper layers makes
-it possible to perform bottom up validation of the i/o path, layer by
-layer, starting from the media.
-
-The normal i/o submission interfaces, e.g submit_bio, could be bypassed
-for specially crafted requests which such ioctl or diagnostics
-interfaces would typically use, and the elevator add_request routine
-can instead be used to directly insert such requests in the queue or preferably
-the blk_do_rq routine can be used to place the request on the queue and
-wait for completion. Alternatively, sometimes the caller might just
-invoke a lower level driver specific interface with the request as a
-parameter.
-
-If the request is a means for passing on special information associated with
-the command, then such information is associated with the request->special
-field (rather than misuse the request->buffer field which is meant for the
-request data buffer's virtual mapping).
-
-For passing request data, the caller must build up a bio descriptor
-representing the concerned memory buffer if the underlying driver interprets
-bio segments or uses the block layer end*request* functions for i/o
-completion. Alternatively one could directly use the request->buffer field to
-specify the virtual address of the buffer, if the driver expects buffer
-addresses passed in this way and ignores bio entries for the request type
-involved. In the latter case, the driver would modify and manage the
-request->buffer, request->sector and request->nr_sectors or
-request->current_nr_sectors fields itself rather than using the block layer
-end_request or end_that_request_first completion interfaces.
-(See 2.3 or Documentation/block/request.txt for a brief explanation of
-the request structure fields)
-
-[TBD: end_that_request_last should be usable even in this case;
-Perhaps an end_that_direct_request_first routine could be implemented to make
-handling direct requests easier for such drivers; Also for drivers that
-expect bios, a helper function could be provided for setting up a bio
-corresponding to a data buffer]
-
-<JENS: I dont understand the above, why is end_that_request_first() not
-usable? Or _last for that matter. I must be missing something>
-<SUP: What I meant here was that if the request doesn't have a bio, then
- end_that_request_first doesn't modify nr_sectors or current_nr_sectors,
- and hence can't be used for advancing request state settings on the
- completion of partial transfers. The driver has to modify these fields
- directly by hand.
- This is because end_that_request_first only iterates over the bio list,
- and always returns 0 if there are none associated with the request.
- _last works OK in this case, and is not a problem, as I mentioned earlier
->
-
-1.3.1 Pre-built Commands
-
-A request can be created with a pre-built custom command to be sent directly
-to the device. The cmd block in the request structure has room for filling
-in the command bytes. (i.e rq->cmd is now 16 bytes in size, and meant for
-command pre-building, and the type of the request is now indicated
-through rq->flags instead of via rq->cmd)
-
-The request structure flags can be set up to indicate the type of request
-in such cases (REQ_PC: direct packet command passed to driver, REQ_BLOCK_PC:
-packet command issued via blk_do_rq, REQ_SPECIAL: special request).
-
-It can help to pre-build device commands for requests in advance.
-Drivers can now specify a request prepare function (q->prep_rq_fn) that the
-block layer would invoke to pre-build device commands for a given request,
-or perform other preparatory processing for the request. This is routine is
-called by elv_next_request(), i.e. typically just before servicing a request.
-(The prepare function would not be called for requests that have RQF_DONTPREP
-enabled)
-
-Aside:
- Pre-building could possibly even be done early, i.e before placing the
- request on the queue, rather than construct the command on the fly in the
- driver while servicing the request queue when it may affect latencies in
- interrupt context or responsiveness in general. One way to add early
- pre-building would be to do it whenever we fail to merge on a request.
- Now REQ_NOMERGE is set in the request flags to skip this one in the future,
- which means that it will not change before we feed it to the device. So
- the pre-builder hook can be invoked there.
-
-
-2. Flexible and generic but minimalist i/o structure/descriptor.
-
-2.1 Reason for a new structure and requirements addressed
-
-Prior to 2.5, buffer heads were used as the unit of i/o at the generic block
-layer, and the low level request structure was associated with a chain of
-buffer heads for a contiguous i/o request. This led to certain inefficiencies
-when it came to large i/o requests and readv/writev style operations, as it
-forced such requests to be broken up into small chunks before being passed
-on to the generic block layer, only to be merged by the i/o scheduler
-when the underlying device was capable of handling the i/o in one shot.
-Also, using the buffer head as an i/o structure for i/os that didn't originate
-from the buffer cache unnecessarily added to the weight of the descriptors
-which were generated for each such chunk.
-
-The following were some of the goals and expectations considered in the
-redesign of the block i/o data structure in 2.5.
-
-i. Should be appropriate as a descriptor for both raw and buffered i/o -
- avoid cache related fields which are irrelevant in the direct/page i/o path,
- or filesystem block size alignment restrictions which may not be relevant
- for raw i/o.
-ii. Ability to represent high-memory buffers (which do not have a virtual
- address mapping in kernel address space).
-iii.Ability to represent large i/os w/o unnecessarily breaking them up (i.e
- greater than PAGE_SIZE chunks in one shot)
-iv. At the same time, ability to retain independent identity of i/os from
- different sources or i/o units requiring individual completion (e.g. for
- latency reasons)
-v. Ability to represent an i/o involving multiple physical memory segments
- (including non-page aligned page fragments, as specified via readv/writev)
- without unnecessarily breaking it up, if the underlying device is capable of
- handling it.
-vi. Preferably should be based on a memory descriptor structure that can be
- passed around different types of subsystems or layers, maybe even
- networking, without duplication or extra copies of data/descriptor fields
- themselves in the process
-vii.Ability to handle the possibility of splits/merges as the structure passes
- through layered drivers (lvm, md, evms), with minimal overhead.
-
-The solution was to define a new structure (bio) for the block layer,
-instead of using the buffer head structure (bh) directly, the idea being
-avoidance of some associated baggage and limitations. The bio structure
-is uniformly used for all i/o at the block layer ; it forms a part of the
-bh structure for buffered i/o, and in the case of raw/direct i/o kiobufs are
-mapped to bio structures.
-
-2.2 The bio struct
-
-The bio structure uses a vector representation pointing to an array of tuples
-of <page, offset, len> to describe the i/o buffer, and has various other
-fields describing i/o parameters and state that needs to be maintained for
-performing the i/o.
-
-Notice that this representation means that a bio has no virtual address
-mapping at all (unlike buffer heads).
-
-struct bio_vec {
- struct page *bv_page;
- unsigned short bv_len;
- unsigned short bv_offset;
-};
-
-/*
- * main unit of I/O for the block layer and lower layers (ie drivers)
- */
-struct bio {
- struct bio *bi_next; /* request queue link */
- struct block_device *bi_bdev; /* target device */
- unsigned long bi_flags; /* status, command, etc */
- unsigned long bi_opf; /* low bits: r/w, high: priority */
-
- unsigned int bi_vcnt; /* how may bio_vec's */
- struct bvec_iter bi_iter; /* current index into bio_vec array */
-
- unsigned int bi_size; /* total size in bytes */
- unsigned short bi_phys_segments; /* segments after physaddr coalesce*/
- unsigned short bi_hw_segments; /* segments after DMA remapping */
- unsigned int bi_max; /* max bio_vecs we can hold
- used as index into pool */
- struct bio_vec *bi_io_vec; /* the actual vec list */
- bio_end_io_t *bi_end_io; /* bi_end_io (bio) */
- atomic_t bi_cnt; /* pin count: free when it hits zero */
- void *bi_private;
-};
-
-With this multipage bio design:
-
-- Large i/os can be sent down in one go using a bio_vec list consisting
- of an array of <page, offset, len> fragments (similar to the way fragments
- are represented in the zero-copy network code)
-- Splitting of an i/o request across multiple devices (as in the case of
- lvm or raid) is achieved by cloning the bio (where the clone points to
- the same bi_io_vec array, but with the index and size accordingly modified)
-- A linked list of bios is used as before for unrelated merges (*) - this
- avoids reallocs and makes independent completions easier to handle.
-- Code that traverses the req list can find all the segments of a bio
- by using rq_for_each_segment. This handles the fact that a request
- has multiple bios, each of which can have multiple segments.
-- Drivers which can't process a large bio in one shot can use the bi_iter
- field to keep track of the next bio_vec entry to process.
- (e.g a 1MB bio_vec needs to be handled in max 128kB chunks for IDE)
- [TBD: Should preferably also have a bi_voffset and bi_vlen to avoid modifying
- bi_offset an len fields]
-
-(*) unrelated merges -- a request ends up containing two or more bios that
- didn't originate from the same place.
-
-bi_end_io() i/o callback gets called on i/o completion of the entire bio.
-
-At a lower level, drivers build a scatter gather list from the merged bios.
-The scatter gather list is in the form of an array of <page, offset, len>
-entries with their corresponding dma address mappings filled in at the
-appropriate time. As an optimization, contiguous physical pages can be
-covered by a single entry where <page> refers to the first page and <len>
-covers the range of pages (up to 16 contiguous pages could be covered this
-way). There is a helper routine (blk_rq_map_sg) which drivers can use to build
-the sg list.
-
-Note: Right now the only user of bios with more than one page is ll_rw_kio,
-which in turn means that only raw I/O uses it (direct i/o may not work
-right now). The intent however is to enable clustering of pages etc to
-become possible. The pagebuf abstraction layer from SGI also uses multi-page
-bios, but that is currently not included in the stock development kernels.
-The same is true of Andrew Morton's work-in-progress multipage bio writeout
-and readahead patches.
-
-2.3 Changes in the Request Structure
-
-The request structure is the structure that gets passed down to low level
-drivers. The block layer make_request function builds up a request structure,
-places it on the queue and invokes the drivers request_fn. The driver makes
-use of block layer helper routine elv_next_request to pull the next request
-off the queue. Control or diagnostic functions might bypass block and directly
-invoke underlying driver entry points passing in a specially constructed
-request structure.
-
-Only some relevant fields (mainly those which changed or may be referred
-to in some of the discussion here) are listed below, not necessarily in
-the order in which they occur in the structure (see include/linux/blkdev.h)
-Refer to Documentation/block/request.txt for details about all the request
-structure fields and a quick reference about the layers which are
-supposed to use or modify those fields.
-
-struct request {
- struct list_head queuelist; /* Not meant to be directly accessed by
- the driver.
- Used by q->elv_next_request_fn
- rq->queue is gone
- */
- .
- .
- unsigned char cmd[16]; /* prebuilt command data block */
- unsigned long flags; /* also includes earlier rq->cmd settings */
- .
- .
- sector_t sector; /* this field is now of type sector_t instead of int
- preparation for 64 bit sectors */
- .
- .
-
- /* Number of scatter-gather DMA addr+len pairs after
- * physical address coalescing is performed.
- */
- unsigned short nr_phys_segments;
-
- /* Number of scatter-gather addr+len pairs after
- * physical and DMA remapping hardware coalescing is performed.
- * This is the number of scatter-gather entries the driver
- * will actually have to deal with after DMA mapping is done.
- */
- unsigned short nr_hw_segments;
-
- /* Various sector counts */
- unsigned long nr_sectors; /* no. of sectors left: driver modifiable */
- unsigned long hard_nr_sectors; /* block internal copy of above */
- unsigned int current_nr_sectors; /* no. of sectors left in the
- current segment:driver modifiable */
- unsigned long hard_cur_sectors; /* block internal copy of the above */
- .
- .
- int tag; /* command tag associated with request */
- void *special; /* same as before */
- char *buffer; /* valid only for low memory buffers up to
- current_nr_sectors */
- .
- .
- struct bio *bio, *biotail; /* bio list instead of bh */
- struct request_list *rl;
-}
-
-See the req_ops and req_flag_bits definitions for an explanation of the various
-flags available. Some bits are used by the block layer or i/o scheduler.
-
-The behaviour of the various sector counts are almost the same as before,
-except that since we have multi-segment bios, current_nr_sectors refers
-to the numbers of sectors in the current segment being processed which could
-be one of the many segments in the current bio (i.e i/o completion unit).
-The nr_sectors value refers to the total number of sectors in the whole
-request that remain to be transferred (no change). The purpose of the
-hard_xxx values is for block to remember these counts every time it hands
-over the request to the driver. These values are updated by block on
-end_that_request_first, i.e. every time the driver completes a part of the
-transfer and invokes block end*request helpers to mark this. The
-driver should not modify these values. The block layer sets up the
-nr_sectors and current_nr_sectors fields (based on the corresponding
-hard_xxx values and the number of bytes transferred) and updates it on
-every transfer that invokes end_that_request_first. It does the same for the
-buffer, bio, bio->bi_iter fields too.
-
-The buffer field is just a virtual address mapping of the current segment
-of the i/o buffer in cases where the buffer resides in low-memory. For high
-memory i/o, this field is not valid and must not be used by drivers.
-
-Code that sets up its own request structures and passes them down to
-a driver needs to be careful about interoperation with the block layer helper
-functions which the driver uses. (Section 1.3)
-
-3. Using bios
-
-3.1 Setup/Teardown
-
-There are routines for managing the allocation, and reference counting, and
-freeing of bios (bio_alloc, bio_get, bio_put).
-
-This makes use of Ingo Molnar's mempool implementation, which enables
-subsystems like bio to maintain their own reserve memory pools for guaranteed
-deadlock-free allocations during extreme VM load. For example, the VM
-subsystem makes use of the block layer to writeout dirty pages in order to be
-able to free up memory space, a case which needs careful handling. The
-allocation logic draws from the preallocated emergency reserve in situations
-where it cannot allocate through normal means. If the pool is empty and it
-can wait, then it would trigger action that would help free up memory or
-replenish the pool (without deadlocking) and wait for availability in the pool.
-If it is in IRQ context, and hence not in a position to do this, allocation
-could fail if the pool is empty. In general mempool always first tries to
-perform allocation without having to wait, even if it means digging into the
-pool as long it is not less that 50% full.
-
-On a free, memory is released to the pool or directly freed depending on
-the current availability in the pool. The mempool interface lets the
-subsystem specify the routines to be used for normal alloc and free. In the
-case of bio, these routines make use of the standard slab allocator.
-
-The caller of bio_alloc is expected to taken certain steps to avoid
-deadlocks, e.g. avoid trying to allocate more memory from the pool while
-already holding memory obtained from the pool.
-[TBD: This is a potential issue, though a rare possibility
- in the bounce bio allocation that happens in the current code, since
- it ends up allocating a second bio from the same pool while
- holding the original bio ]
-
-Memory allocated from the pool should be released back within a limited
-amount of time (in the case of bio, that would be after the i/o is completed).
-This ensures that if part of the pool has been used up, some work (in this
-case i/o) must already be in progress and memory would be available when it
-is over. If allocating from multiple pools in the same code path, the order
-or hierarchy of allocation needs to be consistent, just the way one deals
-with multiple locks.
-
-The bio_alloc routine also needs to allocate the bio_vec_list (bvec_alloc())
-for a non-clone bio. There are the 6 pools setup for different size biovecs,
-so bio_alloc(gfp_mask, nr_iovecs) will allocate a vec_list of the
-given size from these slabs.
-
-The bio_get() routine may be used to hold an extra reference on a bio prior
-to i/o submission, if the bio fields are likely to be accessed after the
-i/o is issued (since the bio may otherwise get freed in case i/o completion
-happens in the meantime).
-
-The bio_clone_fast() routine may be used to duplicate a bio, where the clone
-shares the bio_vec_list with the original bio (i.e. both point to the
-same bio_vec_list). This would typically be used for splitting i/o requests
-in lvm or md.
-
-3.2 Generic bio helper Routines
-
-3.2.1 Traversing segments and completion units in a request
-
-The macro rq_for_each_segment() should be used for traversing the bios
-in the request list (drivers should avoid directly trying to do it
-themselves). Using these helpers should also make it easier to cope
-with block changes in the future.
-
- struct req_iterator iter;
- rq_for_each_segment(bio_vec, rq, iter)
- /* bio_vec is now current segment */
-
-I/O completion callbacks are per-bio rather than per-segment, so drivers
-that traverse bio chains on completion need to keep that in mind. Drivers
-which don't make a distinction between segments and completion units would
-need to be reorganized to support multi-segment bios.
-
-3.2.2 Setting up DMA scatterlists
-
-The blk_rq_map_sg() helper routine would be used for setting up scatter
-gather lists from a request, so a driver need not do it on its own.
-
- nr_segments = blk_rq_map_sg(q, rq, scatterlist);
-
-The helper routine provides a level of abstraction which makes it easier
-to modify the internals of request to scatterlist conversion down the line
-without breaking drivers. The blk_rq_map_sg routine takes care of several
-things like collapsing physically contiguous segments (if QUEUE_FLAG_CLUSTER
-is set) and correct segment accounting to avoid exceeding the limits which
-the i/o hardware can handle, based on various queue properties.
-
-- Prevents a clustered segment from crossing a 4GB mem boundary
-- Avoids building segments that would exceed the number of physical
- memory segments that the driver can handle (phys_segments) and the
- number that the underlying hardware can handle at once, accounting for
- DMA remapping (hw_segments) (i.e. IOMMU aware limits).
-
-Routines which the low level driver can use to set up the segment limits:
-
-blk_queue_max_hw_segments() : Sets an upper limit of the maximum number of
-hw data segments in a request (i.e. the maximum number of address/length
-pairs the host adapter can actually hand to the device at once)
-
-blk_queue_max_phys_segments() : Sets an upper limit on the maximum number
-of physical data segments in a request (i.e. the largest sized scatter list
-a driver could handle)
-
-3.2.3 I/O completion
-
-The existing generic block layer helper routines end_request,
-end_that_request_first and end_that_request_last can be used for i/o
-completion (and setting things up so the rest of the i/o or the next
-request can be kicked of) as before. With the introduction of multi-page
-bio support, end_that_request_first requires an additional argument indicating
-the number of sectors completed.
-
-3.2.4 Implications for drivers that do not interpret bios (don't handle
- multiple segments)
-
-Drivers that do not interpret bios e.g those which do not handle multiple
-segments and do not support i/o into high memory addresses (require bounce
-buffers) and expect only virtually mapped buffers, can access the rq->buffer
-field. As before the driver should use current_nr_sectors to determine the
-size of remaining data in the current segment (that is the maximum it can
-transfer in one go unless it interprets segments), and rely on the block layer
-end_request, or end_that_request_first/last to take care of all accounting
-and transparent mapping of the next bio segment when a segment boundary
-is crossed on completion of a transfer. (The end*request* functions should
-be used if only if the request has come down from block/bio path, not for
-direct access requests which only specify rq->buffer without a valid rq->bio)
-
-3.3 I/O Submission
-
-The routine submit_bio() is used to submit a single io. Higher level i/o
-routines make use of this:
-
-(a) Buffered i/o:
-The routine submit_bh() invokes submit_bio() on a bio corresponding to the
-bh, allocating the bio if required. ll_rw_block() uses submit_bh() as before.
-
-(b) Kiobuf i/o (for raw/direct i/o):
-The ll_rw_kio() routine breaks up the kiobuf into page sized chunks and
-maps the array to one or more multi-page bios, issuing submit_bio() to
-perform the i/o on each of these.
-
-The embedded bh array in the kiobuf structure has been removed and no
-preallocation of bios is done for kiobufs. [The intent is to remove the
-blocks array as well, but it's currently in there to kludge around direct i/o.]
-Thus kiobuf allocation has switched back to using kmalloc rather than vmalloc.
-
-Todo/Observation:
-
- A single kiobuf structure is assumed to correspond to a contiguous range
- of data, so brw_kiovec() invokes ll_rw_kio for each kiobuf in a kiovec.
- So right now it wouldn't work for direct i/o on non-contiguous blocks.
- This is to be resolved. The eventual direction is to replace kiobuf
- by kvec's.
-
- Badari Pulavarty has a patch to implement direct i/o correctly using
- bio and kvec.
-
-
-(c) Page i/o:
-Todo/Under discussion:
-
- Andrew Morton's multi-page bio patches attempt to issue multi-page
- writeouts (and reads) from the page cache, by directly building up
- large bios for submission completely bypassing the usage of buffer
- heads. This work is still in progress.
-
- Christoph Hellwig had some code that uses bios for page-io (rather than
- bh). This isn't included in bio as yet. Christoph was also working on a
- design for representing virtual/real extents as an entity and modifying
- some of the address space ops interfaces to utilize this abstraction rather
- than buffer_heads. (This is somewhat along the lines of the SGI XFS pagebuf
- abstraction, but intended to be as lightweight as possible).
-
-(d) Direct access i/o:
-Direct access requests that do not contain bios would be submitted differently
-as discussed earlier in section 1.3.
-
-Aside:
-
- Kvec i/o:
-
- Ben LaHaise's aio code uses a slightly different structure instead
- of kiobufs, called a kvec_cb. This contains an array of <page, offset, len>
- tuples (very much like the networking code), together with a callback function
- and data pointer. This is embedded into a brw_cb structure when passed
- to brw_kvec_async().
-
- Now it should be possible to directly map these kvecs to a bio. Just as while
- cloning, in this case rather than PRE_BUILT bio_vecs, we set the bi_io_vec
- array pointer to point to the veclet array in kvecs.
-
- TBD: In order for this to work, some changes are needed in the way multi-page
- bios are handled today. The values of the tuples in such a vector passed in
- from higher level code should not be modified by the block layer in the course
- of its request processing, since that would make it hard for the higher layer
- to continue to use the vector descriptor (kvec) after i/o completes. Instead,
- all such transient state should either be maintained in the request structure,
- and passed on in some way to the endio completion routine.
-
-
-4. The I/O scheduler
-I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch
-queue and specific I/O schedulers. Unless stated otherwise, elevator is used
-to refer to both parts and I/O scheduler to specific I/O schedulers.
-
-Block layer implements generic dispatch queue in block/*.c.
-The generic dispatch queue is responsible for requeueing, handling non-fs
-requests and all other subtleties.
-
-Specific I/O schedulers are responsible for ordering normal filesystem
-requests. They can also choose to delay certain requests to improve
-throughput or whatever purpose. As the plural form indicates, there are
-multiple I/O schedulers. They can be built as modules but at least one should
-be built inside the kernel. Each queue can choose different one and can also
-change to another one dynamically.
-
-A block layer call to the i/o scheduler follows the convention elv_xxx(). This
-calls elevator_xxx_fn in the elevator switch (block/elevator.c). Oh, xxx
-and xxx might not match exactly, but use your imagination. If an elevator
-doesn't implement a function, the switch does nothing or some minimal house
-keeping work.
-
-4.1. I/O scheduler API
-
-The functions an elevator may implement are: (* are mandatory)
-elevator_merge_fn called to query requests for merge with a bio
-
-elevator_merge_req_fn called when two requests get merged. the one
- which gets merged into the other one will be
- never seen by I/O scheduler again. IOW, after
- being merged, the request is gone.
-
-elevator_merged_fn called when a request in the scheduler has been
- involved in a merge. It is used in the deadline
- scheduler for example, to reposition the request
- if its sorting order has changed.
-
-elevator_allow_merge_fn called whenever the block layer determines
- that a bio can be merged into an existing
- request safely. The io scheduler may still
- want to stop a merge at this point if it
- results in some sort of conflict internally,
- this hook allows it to do that. Note however
- that two *requests* can still be merged at later
- time. Currently the io scheduler has no way to
- prevent that. It can only learn about the fact
- from elevator_merge_req_fn callback.
-
-elevator_dispatch_fn* fills the dispatch queue with ready requests.
- I/O schedulers are free to postpone requests by
- not filling the dispatch queue unless @force
- is non-zero. Once dispatched, I/O schedulers
- are not allowed to manipulate the requests -
- they belong to generic dispatch queue.
-
-elevator_add_req_fn* called to add a new request into the scheduler
-
-elevator_former_req_fn
-elevator_latter_req_fn These return the request before or after the
- one specified in disk sort order. Used by the
- block layer to find merge possibilities.
-
-elevator_completed_req_fn called when a request is completed.
-
-elevator_may_queue_fn returns true if the scheduler wants to allow the
- current context to queue a new request even if
- it is over the queue limit. This must be used
- very carefully!!
-
-elevator_set_req_fn
-elevator_put_req_fn Must be used to allocate and free any elevator
- specific storage for a request.
-
-elevator_activate_req_fn Called when device driver first sees a request.
- I/O schedulers can use this callback to
- determine when actual execution of a request
- starts.
-elevator_deactivate_req_fn Called when device driver decides to delay
- a request by requeueing it.
-
-elevator_init_fn*
-elevator_exit_fn Allocate and free any elevator specific storage
- for a queue.
-
-4.2 Request flows seen by I/O schedulers
-All requests seen by I/O schedulers strictly follow one of the following three
-flows.
-
- set_req_fn ->
-
- i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
- (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
- ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
- iii. [none]
-
- -> put_req_fn
-
-4.3 I/O scheduler implementation
-The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
-optimal disk scan and request servicing performance (based on generic
-principles and device capabilities), optimized for:
-i. improved throughput
-ii. improved latency
-iii. better utilization of h/w & CPU time
-
-Characteristics:
-
-i. Binary tree
-AS and deadline i/o schedulers use red black binary trees for disk position
-sorting and searching, and a fifo linked list for time-based searching. This
-gives good scalability and good availability of information. Requests are
-almost always dispatched in disk sort order, so a cache is kept of the next
-request in sort order to prevent binary tree lookups.
-
-This arrangement is not a generic block layer characteristic however, so
-elevators may implement queues as they please.
-
-ii. Merge hash
-AS and deadline use a hash table indexed by the last sector of a request. This
-enables merging code to quickly look up "back merge" candidates, even when
-multiple I/O streams are being performed at once on one disk.
-
-"Front merges", a new request being merged at the front of an existing request,
-are far less common than "back merges" due to the nature of most I/O patterns.
-Front merges are handled by the binary trees in AS and deadline schedulers.
-
-iii. Plugging the queue to batch requests in anticipation of opportunities for
- merge/sort optimizations
-
-Plugging is an approach that the current i/o scheduling algorithm resorts to so
-that it collects up enough requests in the queue to be able to take
-advantage of the sorting/merging logic in the elevator. If the
-queue is empty when a request comes in, then it plugs the request queue
-(sort of like plugging the bath tub of a vessel to get fluid to build up)
-till it fills up with a few more requests, before starting to service
-the requests. This provides an opportunity to merge/sort the requests before
-passing them down to the device. There are various conditions when the queue is
-unplugged (to open up the flow again), either through a scheduled task or
-could be on demand. For example wait_on_buffer sets the unplugging going
-through sync_buffer() running blk_run_address_space(mapping). Or the caller
-can do it explicity through blk_unplug(bdev). So in the read case,
-the queue gets explicitly unplugged as part of waiting for completion on that
-buffer.
-
-Aside:
- This is kind of controversial territory, as it's not clear if plugging is
- always the right thing to do. Devices typically have their own queues,
- and allowing a big queue to build up in software, while letting the device be
- idle for a while may not always make sense. The trick is to handle the fine
- balance between when to plug and when to open up. Also now that we have
- multi-page bios being queued in one shot, we may not need to wait to merge
- a big request from the broken up pieces coming by.
-
-4.4 I/O contexts
-I/O contexts provide a dynamically allocated per process data area. They may
-be used in I/O schedulers, and in the block layer (could be used for IO statis,
-priorities for example). See *io_context in block/ll_rw_blk.c, and as-iosched.c
-for an example of usage in an i/o scheduler.
-
-
-5. Scalability related changes
-
-5.1 Granular Locking: io_request_lock replaced by a per-queue lock
-
-The global io_request_lock has been removed as of 2.5, to avoid
-the scalability bottleneck it was causing, and has been replaced by more
-granular locking. The request queue structure has a pointer to the
-lock to be used for that queue. As a result, locking can now be
-per-queue, with a provision for sharing a lock across queues if
-necessary (e.g the scsi layer sets the queue lock pointers to the
-corresponding adapter lock, which results in a per host locking
-granularity). The locking semantics are the same, i.e. locking is
-still imposed by the block layer, grabbing the lock before
-request_fn execution which it means that lots of older drivers
-should still be SMP safe. Drivers are free to drop the queue
-lock themselves, if required. Drivers that explicitly used the
-io_request_lock for serialization need to be modified accordingly.
-Usually it's as easy as adding a global lock:
-
- static DEFINE_SPINLOCK(my_driver_lock);
-
-and passing the address to that lock to blk_init_queue().
-
-5.2 64 bit sector numbers (sector_t prepares for 64 bit support)
-
-The sector number used in the bio structure has been changed to sector_t,
-which could be defined as 64 bit in preparation for 64 bit sector support.
-
-6. Other Changes/Implications
-
-6.1 Partition re-mapping handled by the generic block layer
-
-In 2.5 some of the gendisk/partition related code has been reorganized.
-Now the generic block layer performs partition-remapping early and thus
-provides drivers with a sector number relative to whole device, rather than
-having to take partition number into account in order to arrive at the true
-sector number. The routine blk_partition_remap() is invoked by
-generic_make_request even before invoking the queue specific make_request_fn,
-so the i/o scheduler also gets to operate on whole disk sector numbers. This
-should typically not require changes to block drivers, it just never gets
-to invoke its own partition sector offset calculations since all bios
-sent are offset from the beginning of the device.
-
-
-7. A Few Tips on Migration of older drivers
-
-Old-style drivers that just use CURRENT and ignores clustered requests,
-may not need much change. The generic layer will automatically handle
-clustered requests, multi-page bios, etc for the driver.
-
-For a low performance driver or hardware that is PIO driven or just doesn't
-support scatter-gather changes should be minimal too.
-
-The following are some points to keep in mind when converting old drivers
-to bio.
-
-Drivers should use elv_next_request to pick up requests and are no longer
-supposed to handle looping directly over the request list.
-(struct request->queue has been removed)
-
-Now end_that_request_first takes an additional number_of_sectors argument.
-It used to handle always just the first buffer_head in a request, now
-it will loop and handle as many sectors (on a bio-segment granularity)
-as specified.
-
-Now bh->b_end_io is replaced by bio->bi_end_io, but most of the time the
-right thing to use is bio_endio(bio) instead.
-
-If the driver is dropping the io_request_lock from its request_fn strategy,
-then it just needs to replace that with q->queue_lock instead.
-
-As described in Sec 1.1, drivers can set max sector size, max segment size
-etc per queue now. Drivers that used to define their own merge functions i
-to handle things like this can now just use the blk_queue_* functions at
-blk_init_queue time.
-
-Drivers no longer have to map a {partition, sector offset} into the
-correct absolute location anymore, this is done by the block layer, so
-where a driver received a request ala this before:
-
- rq->rq_dev = mk_kdev(3, 5); /* /dev/hda5 */
- rq->sector = 0; /* first sector on hda5 */
-
- it will now see
-
- rq->rq_dev = mk_kdev(3, 0); /* /dev/hda */
- rq->sector = 123128; /* offset from start of disk */
-
-As mentioned, there is no virtual mapping of a bio. For DMA, this is
-not a problem as the driver probably never will need a virtual mapping.
-Instead it needs a bus mapping (dma_map_page for a single segment or
-use dma_map_sg for scatter gather) to be able to ship it to the driver. For
-PIO drivers (or drivers that need to revert to PIO transfer once in a
-while (IDE for example)), where the CPU is doing the actual data
-transfer a virtual mapping is needed. If the driver supports highmem I/O,
-(Sec 1.1, (ii) ) it needs to use kmap_atomic or similar to temporarily map
-a bio into the virtual address space.
-
-
-8. Prior/Related/Impacted patches
-
-8.1. Earlier kiobuf patches (sct/axboe/chait/hch/mkp)
-- orig kiobuf & raw i/o patches (now in 2.4 tree)
-- direct kiobuf based i/o to devices (no intermediate bh's)
-- page i/o using kiobuf
-- kiobuf splitting for lvm (mkp)
-- elevator support for kiobuf request merging (axboe)
-8.2. Zero-copy networking (Dave Miller)
-8.3. SGI XFS - pagebuf patches - use of kiobufs
-8.4. Multi-page pioent patch for bio (Christoph Hellwig)
-8.5. Direct i/o implementation (Andrea Arcangeli) since 2.4.10-pre11
-8.6. Async i/o implementation patch (Ben LaHaise)
-8.7. EVMS layering design (IBM EVMS team)
-8.8. Larger page cache size patch (Ben LaHaise) and
- Large page size (Daniel Phillips)
- => larger contiguous physical memory buffers
-8.9. VM reservations patch (Ben LaHaise)
-8.10. Write clustering patches ? (Marcelo/Quintela/Riel ?)
-8.11. Block device in page cache patch (Andrea Archangeli) - now in 2.4.10+
-8.12. Multiple block-size transfers for faster raw i/o (Shailabh Nagar,
- Badari)
-8.13 Priority based i/o scheduler - prepatches (Arjan van de Ven)
-8.14 IDE Taskfile i/o patch (Andre Hedrick)
-8.15 Multi-page writeout and readahead patches (Andrew Morton)
-8.16 Direct i/o patches for 2.5 using kvec and bio (Badari Pulavarthy)
-
-9. Other References:
-
-9.1 The Splice I/O Model - Larry McVoy (and subsequent discussions on lkml,
-and Linus' comments - Jan 2001)
-9.2 Discussions about kiobuf and bh design on lkml between sct, linus, alan
-et al - Feb-March 2001 (many of the initial thoughts that led to bio were
-brought up in this discussion thread)
-9.3 Discussions on mempool on lkml - Dec 2001.
-
diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst
new file mode 100644
index 000000000000..86fa66c87172
--- /dev/null
+++ b/Documentation/block/biovecs.rst
@@ -0,0 +1,146 @@
+======================================
+Immutable biovecs and biovec iterators
+======================================
+
+Kent Overstreet <kmo@daterainc.com>
+
+As of 3.13, biovecs should never be modified after a bio has been submitted.
+Instead, we have a new struct bvec_iter which represents a range of a biovec -
+the iterator will be modified as the bio is completed, not the biovec.
+
+More specifically, old code that needed to partially complete a bio would
+update bi_sector and bi_size, and advance bi_idx to the next biovec. If it
+ended up partway through a biovec, it would increment bv_offset and decrement
+bv_len by the number of bytes completed in that biovec.
+
+In the new scheme of things, everything that must be mutated in order to
+partially complete a bio is segregated into struct bvec_iter: bi_sector,
+bi_size and bi_idx have been moved there; and instead of modifying bv_offset
+and bv_len, struct bvec_iter has bi_bvec_done, which represents the number of
+bytes completed in the current bvec.
+
+There are a bunch of new helper macros for hiding the gory details - in
+particular, presenting the illusion of partially completed biovecs so that
+normal code doesn't have to deal with bi_bvec_done.
+
+ * Driver code should no longer refer to biovecs directly; we now have
+ bio_iovec() and bio_iter_iovec() macros that return literal struct biovecs,
+ constructed from the raw biovecs but taking into account bi_bvec_done and
+ bi_size.
+
+ bio_for_each_segment() has been updated to take a bvec_iter argument
+ instead of an integer (that corresponded to bi_idx); for a lot of code the
+ conversion just required changing the types of the arguments to
+ bio_for_each_segment().
+
+ * Advancing a bvec_iter is done with bio_advance_iter(); bio_advance() is a
+ wrapper around bio_advance_iter() that operates on bio->bi_iter, and also
+ advances the bio integrity's iter if present.
+
+ There is a lower level advance function - bvec_iter_advance() - which takes
+ a pointer to a biovec, not a bio; this is used by the bio integrity code.
+
+What's all this get us?
+=======================
+
+Having a real iterator, and making biovecs immutable, has a number of
+advantages:
+
+ * Before, iterating over bios was very awkward when you weren't processing
+ exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c,
+ which copies the contents of one bio into another. Because the biovecs
+ wouldn't necessarily be the same size, the old code was tricky convoluted -
+ it had to walk two different bios at the same time, keeping both bi_idx and
+ and offset into the current biovec for each.
+
+ The new code is much more straightforward - have a look. This sort of
+ pattern comes up in a lot of places; a lot of drivers were essentially open
+ coding bvec iterators before, and having common implementation considerably
+ simplifies a lot of code.
+
+ * Before, any code that might need to use the biovec after the bio had been
+ completed (perhaps to copy the data somewhere else, or perhaps to resubmit
+ it somewhere else if there was an error) had to save the entire bvec array
+ - again, this was being done in a fair number of places.
+
+ * Biovecs can be shared between multiple bios - a bvec iter can represent an
+ arbitrary range of an existing biovec, both starting and ending midway
+ through biovecs. This is what enables efficient splitting of arbitrary
+ bios. Note that this means we _only_ use bi_size to determine when we've
+ reached the end of a bio, not bi_vcnt - and the bio_iovec() macro takes
+ bi_size into account when constructing biovecs.
+
+ * Splitting bios is now much simpler. The old bio_split() didn't even work on
+ bios with more than a single bvec! Now, we can efficiently split arbitrary
+ size bios - because the new bio can share the old bio's biovec.
+
+ Care must be taken to ensure the biovec isn't freed while the split bio is
+ still using it, in case the original bio completes first, though. Using
+ bio_chain() when splitting bios helps with this.
+
+ * Submitting partially completed bios is now perfectly fine - this comes up
+ occasionally in stacking block drivers and various code (e.g. md and
+ bcache) had some ugly workarounds for this.
+
+ It used to be the case that submitting a partially completed bio would work
+ fine to _most_ devices, but since accessing the raw bvec array was the
+ norm, not all drivers would respect bi_idx and those would break. Now,
+ since all drivers _must_ go through the bvec iterator - and have been
+ audited to make sure they are - submitting partially completed bios is
+ perfectly fine.
+
+Other implications:
+===================
+
+ * Almost all usage of bi_idx is now incorrect and has been removed; instead,
+ where previously you would have used bi_idx you'd now use a bvec_iter,
+ probably passing it to one of the helper macros.
+
+ I.e. instead of using bio_iovec_idx() (or bio->bi_iovec[bio->bi_idx]), you
+ now use bio_iter_iovec(), which takes a bvec_iter and returns a
+ literal struct bio_vec - constructed on the fly from the raw biovec but
+ taking into account bi_bvec_done (and bi_size).
+
+ * bi_vcnt can't be trusted or relied upon by driver code - i.e. anything that
+ doesn't actually own the bio. The reason is twofold: firstly, it's not
+ actually needed for iterating over the bio anymore - we only use bi_size.
+ Secondly, when cloning a bio and reusing (a portion of) the original bio's
+ biovec, in order to calculate bi_vcnt for the new bio we'd have to iterate
+ over all the biovecs in the new bio - which is silly as it's not needed.
+
+ So, don't use bi_vcnt anymore.
+
+ * The current interface allows the block layer to split bios as needed, so we
+ could eliminate a lot of complexity particularly in stacked drivers. Code
+ that creates bios can then create whatever size bios are convenient, and
+ more importantly stacked drivers don't have to deal with both their own bio
+ size limitations and the limitations of the underlying devices. Thus
+ there's no need to define ->merge_bvec_fn() callbacks for individual block
+ drivers.
+
+Usage of helpers:
+=================
+
+* The following helpers whose names have the suffix of `_all` can only be used
+ on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
+ shouldn't use them because the bio may have been split before it reached the
+ driver.
+
+::
+
+ bio_for_each_segment_all()
+ bio_first_bvec_all()
+ bio_first_page_all()
+ bio_last_bvec_all()
+
+* The following helpers iterate over single-page segment. The passed 'struct
+ bio_vec' will contain a single-page IO vector during the iteration::
+
+ bio_for_each_segment()
+ bio_for_each_segment_all()
+
+* The following helpers iterate over multi-page bvec. The passed 'struct
+ bio_vec' will contain a multi-page IO vector during the iteration::
+
+ bio_for_each_bvec()
+ rq_for_each_bvec()
diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
deleted file mode 100644
index ce6eccaf5df7..000000000000
--- a/Documentation/block/biovecs.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-
-Immutable biovecs and biovec iterators:
-=======================================
-
-Kent Overstreet <kmo@daterainc.com>
-
-As of 3.13, biovecs should never be modified after a bio has been submitted.
-Instead, we have a new struct bvec_iter which represents a range of a biovec -
-the iterator will be modified as the bio is completed, not the biovec.
-
-More specifically, old code that needed to partially complete a bio would
-update bi_sector and bi_size, and advance bi_idx to the next biovec. If it
-ended up partway through a biovec, it would increment bv_offset and decrement
-bv_len by the number of bytes completed in that biovec.
-
-In the new scheme of things, everything that must be mutated in order to
-partially complete a bio is segregated into struct bvec_iter: bi_sector,
-bi_size and bi_idx have been moved there; and instead of modifying bv_offset
-and bv_len, struct bvec_iter has bi_bvec_done, which represents the number of
-bytes completed in the current bvec.
-
-There are a bunch of new helper macros for hiding the gory details - in
-particular, presenting the illusion of partially completed biovecs so that
-normal code doesn't have to deal with bi_bvec_done.
-
- * Driver code should no longer refer to biovecs directly; we now have
- bio_iovec() and bio_iter_iovec() macros that return literal struct biovecs,
- constructed from the raw biovecs but taking into account bi_bvec_done and
- bi_size.
-
- bio_for_each_segment() has been updated to take a bvec_iter argument
- instead of an integer (that corresponded to bi_idx); for a lot of code the
- conversion just required changing the types of the arguments to
- bio_for_each_segment().
-
- * Advancing a bvec_iter is done with bio_advance_iter(); bio_advance() is a
- wrapper around bio_advance_iter() that operates on bio->bi_iter, and also
- advances the bio integrity's iter if present.
-
- There is a lower level advance function - bvec_iter_advance() - which takes
- a pointer to a biovec, not a bio; this is used by the bio integrity code.
-
-What's all this get us?
-=======================
-
-Having a real iterator, and making biovecs immutable, has a number of
-advantages:
-
- * Before, iterating over bios was very awkward when you weren't processing
- exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c,
- which copies the contents of one bio into another. Because the biovecs
- wouldn't necessarily be the same size, the old code was tricky convoluted -
- it had to walk two different bios at the same time, keeping both bi_idx and
- and offset into the current biovec for each.
-
- The new code is much more straightforward - have a look. This sort of
- pattern comes up in a lot of places; a lot of drivers were essentially open
- coding bvec iterators before, and having common implementation considerably
- simplifies a lot of code.
-
- * Before, any code that might need to use the biovec after the bio had been
- completed (perhaps to copy the data somewhere else, or perhaps to resubmit
- it somewhere else if there was an error) had to save the entire bvec array
- - again, this was being done in a fair number of places.
-
- * Biovecs can be shared between multiple bios - a bvec iter can represent an
- arbitrary range of an existing biovec, both starting and ending midway
- through biovecs. This is what enables efficient splitting of arbitrary
- bios. Note that this means we _only_ use bi_size to determine when we've
- reached the end of a bio, not bi_vcnt - and the bio_iovec() macro takes
- bi_size into account when constructing biovecs.
-
- * Splitting bios is now much simpler. The old bio_split() didn't even work on
- bios with more than a single bvec! Now, we can efficiently split arbitrary
- size bios - because the new bio can share the old bio's biovec.
-
- Care must be taken to ensure the biovec isn't freed while the split bio is
- still using it, in case the original bio completes first, though. Using
- bio_chain() when splitting bios helps with this.
-
- * Submitting partially completed bios is now perfectly fine - this comes up
- occasionally in stacking block drivers and various code (e.g. md and
- bcache) had some ugly workarounds for this.
-
- It used to be the case that submitting a partially completed bio would work
- fine to _most_ devices, but since accessing the raw bvec array was the
- norm, not all drivers would respect bi_idx and those would break. Now,
- since all drivers _must_ go through the bvec iterator - and have been
- audited to make sure they are - submitting partially completed bios is
- perfectly fine.
-
-Other implications:
-===================
-
- * Almost all usage of bi_idx is now incorrect and has been removed; instead,
- where previously you would have used bi_idx you'd now use a bvec_iter,
- probably passing it to one of the helper macros.
-
- I.e. instead of using bio_iovec_idx() (or bio->bi_iovec[bio->bi_idx]), you
- now use bio_iter_iovec(), which takes a bvec_iter and returns a
- literal struct bio_vec - constructed on the fly from the raw biovec but
- taking into account bi_bvec_done (and bi_size).
-
- * bi_vcnt can't be trusted or relied upon by driver code - i.e. anything that
- doesn't actually own the bio. The reason is twofold: firstly, it's not
- actually needed for iterating over the bio anymore - we only use bi_size.
- Secondly, when cloning a bio and reusing (a portion of) the original bio's
- biovec, in order to calculate bi_vcnt for the new bio we'd have to iterate
- over all the biovecs in the new bio - which is silly as it's not needed.
-
- So, don't use bi_vcnt anymore.
-
- * The current interface allows the block layer to split bios as needed, so we
- could eliminate a lot of complexity particularly in stacked drivers. Code
- that creates bios can then create whatever size bios are convenient, and
- more importantly stacked drivers don't have to deal with both their own bio
- size limitations and the limitations of the underlying devices. Thus
- there's no need to define ->merge_bvec_fn() callbacks for individual block
- drivers.
-
-Usage of helpers:
-=================
-
-* The following helpers whose names have the suffix of "_all" can only be used
-on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
-shouldn't use them because the bio may have been split before it reached the
-driver.
-
- bio_for_each_segment_all()
- bio_first_bvec_all()
- bio_first_page_all()
- bio_last_bvec_all()
-
-* The following helpers iterate over single-page segment. The passed 'struct
-bio_vec' will contain a single-page IO vector during the iteration
-
- bio_for_each_segment()
- bio_for_each_segment_all()
-
-* The following helpers iterate over multi-page bvec. The passed 'struct
-bio_vec' will contain a multi-page IO vector during the iteration
-
- bio_for_each_bvec()
- rq_for_each_bvec()
diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst
new file mode 100644
index 000000000000..2cf258d64bbe
--- /dev/null
+++ b/Documentation/block/capability.rst
@@ -0,0 +1,18 @@
+===============================
+Generic Block Device Capability
+===============================
+
+This file documents the sysfs file block/<disk>/capability
+
+capability is a hex word indicating which capabilities a specific disk
+supports. For more information on bits not listed here, see
+include/linux/genhd.h
+
+GENHD_FL_MEDIA_CHANGE_NOTIFY
+----------------------------
+
+Value: 4
+
+When this bit is set, the disk supports Asynchronous Notification
+of media change events. These events will be broadcast to user
+space via kernel uevent.
diff --git a/Documentation/block/capability.txt b/Documentation/block/capability.txt
deleted file mode 100644
index 2f1729424ef4..000000000000
--- a/Documentation/block/capability.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-Generic Block Device Capability
-===============================================================================
-This file documents the sysfs file block/<disk>/capability
-
-capability is a hex word indicating which capabilities a specific disk
-supports. For more information on bits not listed here, see
-include/linux/genhd.h
-
-Capability Value
--------------------------------------------------------------------------------
-GENHD_FL_MEDIA_CHANGE_NOTIFY 4
- When this bit is set, the disk supports Asynchronous Notification
- of media change events. These events will be broadcast to user
- space via kernel uevent.
-
diff --git a/Documentation/block/cmdline-partition.rst b/Documentation/block/cmdline-partition.rst
new file mode 100644
index 000000000000..530bedff548a
--- /dev/null
+++ b/Documentation/block/cmdline-partition.rst
@@ -0,0 +1,53 @@
+==============================================
+Embedded device command line partition parsing
+==============================================
+
+The "blkdevparts" command line option adds support for reading the
+block device partition table from the kernel command line.
+
+It is typically used for fixed block (eMMC) embedded devices.
+It has no MBR, so saves storage space. Bootloader can be easily accessed
+by absolute address of data on the block device.
+Users can easily change the partition.
+
+The format for the command line is just like mtdparts:
+
+blkdevparts=<blkdev-def>[;<blkdev-def>]
+ <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>]
+ <partdef> := <size>[@<offset>](part-name)
+
+<blkdev-id>
+ block device disk name. Embedded device uses fixed block device.
+ Its disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
+
+<size>
+ partition size, in bytes, such as: 512, 1m, 1G.
+ size may contain an optional suffix of (upper or lower case):
+
+ K, M, G, T, P, E.
+
+ "-" is used to denote all remaining space.
+
+<offset>
+ partition start address, in bytes.
+ offset may contain an optional suffix of (upper or lower case):
+
+ K, M, G, T, P, E.
+
+(part-name)
+ partition name. Kernel sends uevent with "PARTNAME". Application can
+ create a link to block device partition with the name "PARTNAME".
+ User space application can access partition by partition name.
+
+Example:
+
+ eMMC disk names are "mmcblk0" and "mmcblk0boot0".
+
+ bootargs::
+
+ 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
+
+ dmesg::
+
+ mmcblk0: p1(data0) p2(data1) p3()
+ mmcblk0boot0: p1(boot) p2(kernel)
diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt
deleted file mode 100644
index 760a3f7c3ed4..000000000000
--- a/Documentation/block/cmdline-partition.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-Embedded device command line partition parsing
-=====================================================================
-
-The "blkdevparts" command line option adds support for reading the
-block device partition table from the kernel command line.
-
-It is typically used for fixed block (eMMC) embedded devices.
-It has no MBR, so saves storage space. Bootloader can be easily accessed
-by absolute address of data on the block device.
-Users can easily change the partition.
-
-The format for the command line is just like mtdparts:
-
-blkdevparts=<blkdev-def>[;<blkdev-def>]
- <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>]
- <partdef> := <size>[@<offset>](part-name)
-
-<blkdev-id>
- block device disk name. Embedded device uses fixed block device.
- Its disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
-
-<size>
- partition size, in bytes, such as: 512, 1m, 1G.
- size may contain an optional suffix of (upper or lower case):
- K, M, G, T, P, E.
- "-" is used to denote all remaining space.
-
-<offset>
- partition start address, in bytes.
- offset may contain an optional suffix of (upper or lower case):
- K, M, G, T, P, E.
-
-(part-name)
- partition name. Kernel sends uevent with "PARTNAME". Application can
- create a link to block device partition with the name "PARTNAME".
- User space application can access partition by partition name.
-
-Example:
- eMMC disk names are "mmcblk0" and "mmcblk0boot0".
-
- bootargs:
- 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
-
- dmesg:
- mmcblk0: p1(data0) p2(data1) p3()
- mmcblk0boot0: p1(boot) p2(kernel)
diff --git a/Documentation/block/data-integrity.rst b/Documentation/block/data-integrity.rst
new file mode 100644
index 000000000000..4f2452a95c43
--- /dev/null
+++ b/Documentation/block/data-integrity.rst
@@ -0,0 +1,291 @@
+==============
+Data Integrity
+==============
+
+1. Introduction
+===============
+
+Modern filesystems feature checksumming of data and metadata to
+protect against data corruption. However, the detection of the
+corruption is done at read time which could potentially be months
+after the data was written. At that point the original data that the
+application tried to write is most likely lost.
+
+The solution is to ensure that the disk is actually storing what the
+application meant it to. Recent additions to both the SCSI family
+protocols (SBC Data Integrity Field, SCC protection proposal) as well
+as SATA/T13 (External Path Protection) try to remedy this by adding
+support for appending integrity metadata to an I/O. The integrity
+metadata (or protection information in SCSI terminology) includes a
+checksum for each sector as well as an incrementing counter that
+ensures the individual sectors are written in the right order. And
+for some protection schemes also that the I/O is written to the right
+place on disk.
+
+Current storage controllers and devices implement various protective
+measures, for instance checksumming and scrubbing. But these
+technologies are working in their own isolated domains or at best
+between adjacent nodes in the I/O path. The interesting thing about
+DIF and the other integrity extensions is that the protection format
+is well defined and every node in the I/O path can verify the
+integrity of the I/O and reject it if corruption is detected. This
+allows not only corruption prevention but also isolation of the point
+of failure.
+
+2. The Data Integrity Extensions
+================================
+
+As written, the protocol extensions only protect the path between
+controller and storage device. However, many controllers actually
+allow the operating system to interact with the integrity metadata
+(IMD). We have been working with several FC/SAS HBA vendors to enable
+the protection information to be transferred to and from their
+controllers.
+
+The SCSI Data Integrity Field works by appending 8 bytes of protection
+information to each sector. The data + integrity metadata is stored
+in 520 byte sectors on disk. Data + IMD are interleaved when
+transferred between the controller and target. The T13 proposal is
+similar.
+
+Because it is highly inconvenient for operating systems to deal with
+520 (and 4104) byte sectors, we approached several HBA vendors and
+encouraged them to allow separation of the data and integrity metadata
+scatter-gather lists.
+
+The controller will interleave the buffers on write and split them on
+read. This means that Linux can DMA the data buffers to and from
+host memory without changes to the page cache.
+
+Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs
+is somewhat heavy to compute in software. Benchmarks found that
+calculating this checksum had a significant impact on system
+performance for a number of workloads. Some controllers allow a
+lighter-weight checksum to be used when interfacing with the operating
+system. Emulex, for instance, supports the TCP/IP checksum instead.
+The IP checksum received from the OS is converted to the 16-bit CRC
+when writing and vice versa. This allows the integrity metadata to be
+generated by Linux or the application at very low cost (comparable to
+software RAID5).
+
+The IP checksum is weaker than the CRC in terms of detecting bit
+errors. However, the strength is really in the separation of the data
+buffers and the integrity metadata. These two distinct buffers must
+match up for an I/O to complete.
+
+The separation of the data and integrity metadata buffers as well as
+the choice in checksums is referred to as the Data Integrity
+Extensions. As these extensions are outside the scope of the protocol
+bodies (T10, T13), Oracle and its partners are trying to standardize
+them within the Storage Networking Industry Association.
+
+3. Kernel Changes
+=================
+
+The data integrity framework in Linux enables protection information
+to be pinned to I/Os and sent to/received from controllers that
+support it.
+
+The advantage to the integrity extensions in SCSI and SATA is that
+they enable us to protect the entire path from application to storage
+device. However, at the same time this is also the biggest
+disadvantage. It means that the protection information must be in a
+format that can be understood by the disk.
+
+Generally Linux/POSIX applications are agnostic to the intricacies of
+the storage devices they are accessing. The virtual filesystem switch
+and the block layer make things like hardware sector size and
+transport protocols completely transparent to the application.
+
+However, this level of detail is required when preparing the
+protection information to send to a disk. Consequently, the very
+concept of an end-to-end protection scheme is a layering violation.
+It is completely unreasonable for an application to be aware whether
+it is accessing a SCSI or SATA disk.
+
+The data integrity support implemented in Linux attempts to hide this
+from the application. As far as the application (and to some extent
+the kernel) is concerned, the integrity metadata is opaque information
+that's attached to the I/O.
+
+The current implementation allows the block layer to automatically
+generate the protection information for any I/O. Eventually the
+intent is to move the integrity metadata calculation to userspace for
+user data. Metadata and other I/O that originates within the kernel
+will still use the automatic generation interface.
+
+Some storage devices allow each hardware sector to be tagged with a
+16-bit value. The owner of this tag space is the owner of the block
+device. I.e. the filesystem in most cases. The filesystem can use
+this extra space to tag sectors as they see fit. Because the tag
+space is limited, the block interface allows tagging bigger chunks by
+way of interleaving. This way, 8*16 bits of information can be
+attached to a typical 4KB filesystem block.
+
+This also means that applications such as fsck and mkfs will need
+access to manipulate the tags from user space. A passthrough
+interface for this is being worked on.
+
+
+4. Block Layer Implementation Details
+=====================================
+
+4.1 Bio
+-------
+
+The data integrity patches add a new field to struct bio when
+CONFIG_BLK_DEV_INTEGRITY is enabled. bio_integrity(bio) returns a
+pointer to a struct bip which contains the bio integrity payload.
+Essentially a bip is a trimmed down struct bio which holds a bio_vec
+containing the integrity metadata and the required housekeeping
+information (bvec pool, vector count, etc.)
+
+A kernel subsystem can enable data integrity protection on a bio by
+calling bio_integrity_alloc(bio). This will allocate and attach the
+bip to the bio.
+
+Individual pages containing integrity metadata can subsequently be
+attached using bio_integrity_add_page().
+
+bio_free() will automatically free the bip.
+
+
+4.2 Block Device
+----------------
+
+Because the format of the protection data is tied to the physical
+disk, each block device has been extended with a block integrity
+profile (struct blk_integrity). This optional profile is registered
+with the block layer using blk_integrity_register().
+
+The profile contains callback functions for generating and verifying
+the protection data, as well as getting and setting application tags.
+The profile also contains a few constants to aid in completing,
+merging and splitting the integrity metadata.
+
+Layered block devices will need to pick a profile that's appropriate
+for all subdevices. blk_integrity_compare() can help with that. DM
+and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6
+will require extra work due to the application tag.
+
+
+5.0 Block Layer Integrity API
+=============================
+
+5.1 Normal Filesystem
+---------------------
+
+ The normal filesystem is unaware that the underlying block device
+ is capable of sending/receiving integrity metadata. The IMD will
+ be automatically generated by the block layer at submit_bio() time
+ in case of a WRITE. A READ request will cause the I/O integrity
+ to be verified upon completion.
+
+ IMD generation and verification can be toggled using the::
+
+ /sys/block/<bdev>/integrity/write_generate
+
+ and::
+
+ /sys/block/<bdev>/integrity/read_verify
+
+ flags.
+
+
+5.2 Integrity-Aware Filesystem
+------------------------------
+
+ A filesystem that is integrity-aware can prepare I/Os with IMD
+ attached. It can also use the application tag space if this is
+ supported by the block device.
+
+
+ `bool bio_integrity_prep(bio);`
+
+ To generate IMD for WRITE and to set up buffers for READ, the
+ filesystem must call bio_integrity_prep(bio).
+
+ Prior to calling this function, the bio data direction and start
+ sector must be set, and the bio should have all data pages
+ added. It is up to the caller to ensure that the bio does not
+ change while I/O is in progress.
+ Complete bio with error if prepare failed for some reson.
+
+
+5.3 Passing Existing Integrity Metadata
+---------------------------------------
+
+ Filesystems that either generate their own integrity metadata or
+ are capable of transferring IMD from user space can use the
+ following calls:
+
+
+ `struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages);`
+
+ Allocates the bio integrity payload and hangs it off of the bio.
+ nr_pages indicate how many pages of protection data need to be
+ stored in the integrity bio_vec list (similar to bio_alloc()).
+
+ The integrity payload will be freed at bio_free() time.
+
+
+ `int bio_integrity_add_page(bio, page, len, offset);`
+
+ Attaches a page containing integrity metadata to an existing
+ bio. The bio must have an existing bip,
+ i.e. bio_integrity_alloc() must have been called. For a WRITE,
+ the integrity metadata in the pages must be in a format
+ understood by the target device with the notable exception that
+ the sector numbers will be remapped as the request traverses the
+ I/O stack. This implies that the pages added using this call
+ will be modified during I/O! The first reference tag in the
+ integrity metadata must have a value of bip->bip_sector.
+
+ Pages can be added using bio_integrity_add_page() as long as
+ there is room in the bip bio_vec array (nr_pages).
+
+ Upon completion of a READ operation, the attached pages will
+ contain the integrity metadata received from the storage device.
+ It is up to the receiver to process them and verify data
+ integrity upon completion.
+
+
+5.4 Registering A Block Device As Capable Of Exchanging Integrity Metadata
+--------------------------------------------------------------------------
+
+ To enable integrity exchange on a block device the gendisk must be
+ registered as capable:
+
+ `int blk_integrity_register(gendisk, blk_integrity);`
+
+ The blk_integrity struct is a template and should contain the
+ following::
+
+ static struct blk_integrity my_profile = {
+ .name = "STANDARDSBODY-TYPE-VARIANT-CSUM",
+ .generate_fn = my_generate_fn,
+ .verify_fn = my_verify_fn,
+ .tuple_size = sizeof(struct my_tuple_size),
+ .tag_size = <tag bytes per hw sector>,
+ };
+
+ 'name' is a text string which will be visible in sysfs. This is
+ part of the userland API so chose it carefully and never change
+ it. The format is standards body-type-variant.
+ E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC.
+
+ 'generate_fn' generates appropriate integrity metadata (for WRITE).
+
+ 'verify_fn' verifies that the data buffer matches the integrity
+ metadata.
+
+ 'tuple_size' must be set to match the size of the integrity
+ metadata per sector. I.e. 8 for DIF and EPP.
+
+ 'tag_size' must be set to identify how many bytes of tag space
+ are available per hardware sector. For DIF this is either 2 or
+ 0 depending on the value of the Control Mode Page ATO bit.
+
+----------------------------------------------------------------------
+
+2007-12-24 Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/block/data-integrity.txt b/Documentation/block/data-integrity.txt
deleted file mode 100644
index 934c44ea0c57..000000000000
--- a/Documentation/block/data-integrity.txt
+++ /dev/null
@@ -1,281 +0,0 @@
-----------------------------------------------------------------------
-1. INTRODUCTION
-
-Modern filesystems feature checksumming of data and metadata to
-protect against data corruption. However, the detection of the
-corruption is done at read time which could potentially be months
-after the data was written. At that point the original data that the
-application tried to write is most likely lost.
-
-The solution is to ensure that the disk is actually storing what the
-application meant it to. Recent additions to both the SCSI family
-protocols (SBC Data Integrity Field, SCC protection proposal) as well
-as SATA/T13 (External Path Protection) try to remedy this by adding
-support for appending integrity metadata to an I/O. The integrity
-metadata (or protection information in SCSI terminology) includes a
-checksum for each sector as well as an incrementing counter that
-ensures the individual sectors are written in the right order. And
-for some protection schemes also that the I/O is written to the right
-place on disk.
-
-Current storage controllers and devices implement various protective
-measures, for instance checksumming and scrubbing. But these
-technologies are working in their own isolated domains or at best
-between adjacent nodes in the I/O path. The interesting thing about
-DIF and the other integrity extensions is that the protection format
-is well defined and every node in the I/O path can verify the
-integrity of the I/O and reject it if corruption is detected. This
-allows not only corruption prevention but also isolation of the point
-of failure.
-
-----------------------------------------------------------------------
-2. THE DATA INTEGRITY EXTENSIONS
-
-As written, the protocol extensions only protect the path between
-controller and storage device. However, many controllers actually
-allow the operating system to interact with the integrity metadata
-(IMD). We have been working with several FC/SAS HBA vendors to enable
-the protection information to be transferred to and from their
-controllers.
-
-The SCSI Data Integrity Field works by appending 8 bytes of protection
-information to each sector. The data + integrity metadata is stored
-in 520 byte sectors on disk. Data + IMD are interleaved when
-transferred between the controller and target. The T13 proposal is
-similar.
-
-Because it is highly inconvenient for operating systems to deal with
-520 (and 4104) byte sectors, we approached several HBA vendors and
-encouraged them to allow separation of the data and integrity metadata
-scatter-gather lists.
-
-The controller will interleave the buffers on write and split them on
-read. This means that Linux can DMA the data buffers to and from
-host memory without changes to the page cache.
-
-Also, the 16-bit CRC checksum mandated by both the SCSI and SATA specs
-is somewhat heavy to compute in software. Benchmarks found that
-calculating this checksum had a significant impact on system
-performance for a number of workloads. Some controllers allow a
-lighter-weight checksum to be used when interfacing with the operating
-system. Emulex, for instance, supports the TCP/IP checksum instead.
-The IP checksum received from the OS is converted to the 16-bit CRC
-when writing and vice versa. This allows the integrity metadata to be
-generated by Linux or the application at very low cost (comparable to
-software RAID5).
-
-The IP checksum is weaker than the CRC in terms of detecting bit
-errors. However, the strength is really in the separation of the data
-buffers and the integrity metadata. These two distinct buffers must
-match up for an I/O to complete.
-
-The separation of the data and integrity metadata buffers as well as
-the choice in checksums is referred to as the Data Integrity
-Extensions. As these extensions are outside the scope of the protocol
-bodies (T10, T13), Oracle and its partners are trying to standardize
-them within the Storage Networking Industry Association.
-
-----------------------------------------------------------------------
-3. KERNEL CHANGES
-
-The data integrity framework in Linux enables protection information
-to be pinned to I/Os and sent to/received from controllers that
-support it.
-
-The advantage to the integrity extensions in SCSI and SATA is that
-they enable us to protect the entire path from application to storage
-device. However, at the same time this is also the biggest
-disadvantage. It means that the protection information must be in a
-format that can be understood by the disk.
-
-Generally Linux/POSIX applications are agnostic to the intricacies of
-the storage devices they are accessing. The virtual filesystem switch
-and the block layer make things like hardware sector size and
-transport protocols completely transparent to the application.
-
-However, this level of detail is required when preparing the
-protection information to send to a disk. Consequently, the very
-concept of an end-to-end protection scheme is a layering violation.
-It is completely unreasonable for an application to be aware whether
-it is accessing a SCSI or SATA disk.
-
-The data integrity support implemented in Linux attempts to hide this
-from the application. As far as the application (and to some extent
-the kernel) is concerned, the integrity metadata is opaque information
-that's attached to the I/O.
-
-The current implementation allows the block layer to automatically
-generate the protection information for any I/O. Eventually the
-intent is to move the integrity metadata calculation to userspace for
-user data. Metadata and other I/O that originates within the kernel
-will still use the automatic generation interface.
-
-Some storage devices allow each hardware sector to be tagged with a
-16-bit value. The owner of this tag space is the owner of the block
-device. I.e. the filesystem in most cases. The filesystem can use
-this extra space to tag sectors as they see fit. Because the tag
-space is limited, the block interface allows tagging bigger chunks by
-way of interleaving. This way, 8*16 bits of information can be
-attached to a typical 4KB filesystem block.
-
-This also means that applications such as fsck and mkfs will need
-access to manipulate the tags from user space. A passthrough
-interface for this is being worked on.
-
-
-----------------------------------------------------------------------
-4. BLOCK LAYER IMPLEMENTATION DETAILS
-
-4.1 BIO
-
-The data integrity patches add a new field to struct bio when
-CONFIG_BLK_DEV_INTEGRITY is enabled. bio_integrity(bio) returns a
-pointer to a struct bip which contains the bio integrity payload.
-Essentially a bip is a trimmed down struct bio which holds a bio_vec
-containing the integrity metadata and the required housekeeping
-information (bvec pool, vector count, etc.)
-
-A kernel subsystem can enable data integrity protection on a bio by
-calling bio_integrity_alloc(bio). This will allocate and attach the
-bip to the bio.
-
-Individual pages containing integrity metadata can subsequently be
-attached using bio_integrity_add_page().
-
-bio_free() will automatically free the bip.
-
-
-4.2 BLOCK DEVICE
-
-Because the format of the protection data is tied to the physical
-disk, each block device has been extended with a block integrity
-profile (struct blk_integrity). This optional profile is registered
-with the block layer using blk_integrity_register().
-
-The profile contains callback functions for generating and verifying
-the protection data, as well as getting and setting application tags.
-The profile also contains a few constants to aid in completing,
-merging and splitting the integrity metadata.
-
-Layered block devices will need to pick a profile that's appropriate
-for all subdevices. blk_integrity_compare() can help with that. DM
-and MD linear, RAID0 and RAID1 are currently supported. RAID4/5/6
-will require extra work due to the application tag.
-
-
-----------------------------------------------------------------------
-5.0 BLOCK LAYER INTEGRITY API
-
-5.1 NORMAL FILESYSTEM
-
- The normal filesystem is unaware that the underlying block device
- is capable of sending/receiving integrity metadata. The IMD will
- be automatically generated by the block layer at submit_bio() time
- in case of a WRITE. A READ request will cause the I/O integrity
- to be verified upon completion.
-
- IMD generation and verification can be toggled using the
-
- /sys/block/<bdev>/integrity/write_generate
-
- and
-
- /sys/block/<bdev>/integrity/read_verify
-
- flags.
-
-
-5.2 INTEGRITY-AWARE FILESYSTEM
-
- A filesystem that is integrity-aware can prepare I/Os with IMD
- attached. It can also use the application tag space if this is
- supported by the block device.
-
-
- bool bio_integrity_prep(bio);
-
- To generate IMD for WRITE and to set up buffers for READ, the
- filesystem must call bio_integrity_prep(bio).
-
- Prior to calling this function, the bio data direction and start
- sector must be set, and the bio should have all data pages
- added. It is up to the caller to ensure that the bio does not
- change while I/O is in progress.
- Complete bio with error if prepare failed for some reson.
-
-
-5.3 PASSING EXISTING INTEGRITY METADATA
-
- Filesystems that either generate their own integrity metadata or
- are capable of transferring IMD from user space can use the
- following calls:
-
-
- struct bip * bio_integrity_alloc(bio, gfp_mask, nr_pages);
-
- Allocates the bio integrity payload and hangs it off of the bio.
- nr_pages indicate how many pages of protection data need to be
- stored in the integrity bio_vec list (similar to bio_alloc()).
-
- The integrity payload will be freed at bio_free() time.
-
-
- int bio_integrity_add_page(bio, page, len, offset);
-
- Attaches a page containing integrity metadata to an existing
- bio. The bio must have an existing bip,
- i.e. bio_integrity_alloc() must have been called. For a WRITE,
- the integrity metadata in the pages must be in a format
- understood by the target device with the notable exception that
- the sector numbers will be remapped as the request traverses the
- I/O stack. This implies that the pages added using this call
- will be modified during I/O! The first reference tag in the
- integrity metadata must have a value of bip->bip_sector.
-
- Pages can be added using bio_integrity_add_page() as long as
- there is room in the bip bio_vec array (nr_pages).
-
- Upon completion of a READ operation, the attached pages will
- contain the integrity metadata received from the storage device.
- It is up to the receiver to process them and verify data
- integrity upon completion.
-
-
-5.4 REGISTERING A BLOCK DEVICE AS CAPABLE OF EXCHANGING INTEGRITY
- METADATA
-
- To enable integrity exchange on a block device the gendisk must be
- registered as capable:
-
- int blk_integrity_register(gendisk, blk_integrity);
-
- The blk_integrity struct is a template and should contain the
- following:
-
- static struct blk_integrity my_profile = {
- .name = "STANDARDSBODY-TYPE-VARIANT-CSUM",
- .generate_fn = my_generate_fn,
- .verify_fn = my_verify_fn,
- .tuple_size = sizeof(struct my_tuple_size),
- .tag_size = <tag bytes per hw sector>,
- };
-
- 'name' is a text string which will be visible in sysfs. This is
- part of the userland API so chose it carefully and never change
- it. The format is standards body-type-variant.
- E.g. T10-DIF-TYPE1-IP or T13-EPP-0-CRC.
-
- 'generate_fn' generates appropriate integrity metadata (for WRITE).
-
- 'verify_fn' verifies that the data buffer matches the integrity
- metadata.
-
- 'tuple_size' must be set to match the size of the integrity
- metadata per sector. I.e. 8 for DIF and EPP.
-
- 'tag_size' must be set to identify how many bytes of tag space
- are available per hardware sector. For DIF this is either 2 or
- 0 depending on the value of the Control Mode Page ATO bit.
-
-----------------------------------------------------------------------
-2007-12-24 Martin K. Petersen <martin.petersen@oracle.com>
diff --git a/Documentation/block/deadline-iosched.rst b/Documentation/block/deadline-iosched.rst
new file mode 100644
index 000000000000..9f5c5a4c370e
--- /dev/null
+++ b/Documentation/block/deadline-iosched.rst
@@ -0,0 +1,72 @@
+==============================
+Deadline IO scheduler tunables
+==============================
+
+This little file attempts to document how the deadline io scheduler works.
+In particular, it will clarify the meaning of the exposed tunables that may be
+of interest to power users.
+
+Selecting IO schedulers
+-----------------------
+Refer to Documentation/block/switching-sched.rst for information on
+selecting an io scheduler on a per-device basis.
+
+------------------------------------------------------------------------------
+
+read_expire (in ms)
+-----------------------
+
+The goal of the deadline io scheduler is to attempt to guarantee a start
+service time for a request. As we focus mainly on read latencies, this is
+tunable. When a read request first enters the io scheduler, it is assigned
+a deadline that is the current time + the read_expire value in units of
+milliseconds.
+
+
+write_expire (in ms)
+-----------------------
+
+Similar to read_expire mentioned above, but for writes.
+
+
+fifo_batch (number of requests)
+------------------------------------
+
+Requests are grouped into ``batches`` of a particular data direction (read or
+write) which are serviced in increasing sector order. To limit extra seeking,
+deadline expiries are only checked between batches. fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput. When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour). Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
+
+
+writes_starved (number of dispatches)
+--------------------------------------
+
+When we have to move requests from the io scheduler queue to the block
+device dispatch queue, we always give a preference to reads. However, we
+don't want to starve writes indefinitely either. So writes_starved controls
+how many times we give preference to reads over writes. When that has been
+done writes_starved number of times, we dispatch some writes based on the
+same criteria as reads.
+
+
+front_merges (bool)
+----------------------
+
+Sometimes it happens that a request enters the io scheduler that is contiguous
+with a request that is already on the queue. Either it fits in the back of that
+request, or it fits at the front. That is called either a back merge candidate
+or a front merge candidate. Due to the way files are typically laid out,
+back merges are much more common than front merges. For some work loads, you
+may even know that it is a waste of time to spend any time attempting to
+front merge requests. Setting front_merges to 0 disables this functionality.
+Front merges may still occur due to the cached last_merge hint, but since
+that comes at basically 0 cost we leave that on. We simply disable the
+rbtree front sector lookup when the io scheduler merge function is called.
+
+
+Nov 11 2002, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/deadline-iosched.txt b/Documentation/block/deadline-iosched.txt
deleted file mode 100644
index 2d82c80322cb..000000000000
--- a/Documentation/block/deadline-iosched.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-Deadline IO scheduler tunables
-==============================
-
-This little file attempts to document how the deadline io scheduler works.
-In particular, it will clarify the meaning of the exposed tunables that may be
-of interest to power users.
-
-Selecting IO schedulers
------------------------
-Refer to Documentation/block/switching-sched.txt for information on
-selecting an io scheduler on a per-device basis.
-
-
-********************************************************************************
-
-
-read_expire (in ms)
------------
-
-The goal of the deadline io scheduler is to attempt to guarantee a start
-service time for a request. As we focus mainly on read latencies, this is
-tunable. When a read request first enters the io scheduler, it is assigned
-a deadline that is the current time + the read_expire value in units of
-milliseconds.
-
-
-write_expire (in ms)
------------
-
-Similar to read_expire mentioned above, but for writes.
-
-
-fifo_batch (number of requests)
-----------
-
-Requests are grouped into ``batches'' of a particular data direction (read or
-write) which are serviced in increasing sector order. To limit extra seeking,
-deadline expiries are only checked between batches. fifo_batch controls the
-maximum number of requests per batch.
-
-This parameter tunes the balance between per-request latency and aggregate
-throughput. When low latency is the primary concern, smaller is better (where
-a value of 1 yields first-come first-served behaviour). Increasing fifo_batch
-generally improves throughput, at the cost of latency variation.
-
-
-writes_starved (number of dispatches)
---------------
-
-When we have to move requests from the io scheduler queue to the block
-device dispatch queue, we always give a preference to reads. However, we
-don't want to starve writes indefinitely either. So writes_starved controls
-how many times we give preference to reads over writes. When that has been
-done writes_starved number of times, we dispatch some writes based on the
-same criteria as reads.
-
-
-front_merges (bool)
-------------
-
-Sometimes it happens that a request enters the io scheduler that is contiguous
-with a request that is already on the queue. Either it fits in the back of that
-request, or it fits at the front. That is called either a back merge candidate
-or a front merge candidate. Due to the way files are typically laid out,
-back merges are much more common than front merges. For some work loads, you
-may even know that it is a waste of time to spend any time attempting to
-front merge requests. Setting front_merges to 0 disables this functionality.
-Front merges may still occur due to the cached last_merge hint, but since
-that comes at basically 0 cost we leave that on. We simply disable the
-rbtree front sector lookup when the io scheduler merge function is called.
-
-
-Nov 11 2002, Jens Axboe <jens.axboe@oracle.com>
-
-
diff --git a/Documentation/block/index.rst b/Documentation/block/index.rst
new file mode 100644
index 000000000000..3fa7a52fafa4
--- /dev/null
+++ b/Documentation/block/index.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====
+Block
+=====
+
+.. toctree::
+ :maxdepth: 1
+
+ bfq-iosched
+ biodoc
+ biovecs
+ capability
+ cmdline-partition
+ data-integrity
+ deadline-iosched
+ ioprio
+ kyber-iosched
+ null_blk
+ pr
+ queue-sysfs
+ request
+ stat
+ switching-sched
+ writeback_cache_control
diff --git a/Documentation/block/ioprio.rst b/Documentation/block/ioprio.rst
new file mode 100644
index 000000000000..f72b0de65af7
--- /dev/null
+++ b/Documentation/block/ioprio.rst
@@ -0,0 +1,182 @@
+===================
+Block io priorities
+===================
+
+
+Intro
+-----
+
+With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
+priorities are supported for reads on files. This enables users to io nice
+processes or process groups, similar to what has been possible with cpu
+scheduling for ages. This document mainly details the current possibilities
+with cfq; other io schedulers do not support io priorities thus far.
+
+Scheduling classes
+------------------
+
+CFQ implements three generic scheduling classes that determine how io is
+served for a process.
+
+IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
+higher priority than any other in the system, processes from this class are
+given first access to the disk every time. Thus it needs to be used with some
+care, one io RT process can starve the entire system. Within the RT class,
+there are 8 levels of class data that determine exactly how much time this
+process needs the disk for on each service. In the future this might change
+to be more directly mappable to performance, by passing in a wanted data
+rate instead.
+
+IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
+for any process that hasn't set a specific io priority. The class data
+determines how much io bandwidth the process will get, it's directly mappable
+to the cpu nice levels just more coarsely implemented. 0 is the highest
+BE prio level, 7 is the lowest. The mapping between cpu nice level and io
+nice level is determined as: io_nice = (cpu_nice + 20) / 5.
+
+IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
+level only get io time when no one else needs the disk. The idle class has no
+class data, since it doesn't really apply here.
+
+Tools
+-----
+
+See below for a sample ionice tool. Usage::
+
+ # ionice -c<class> -n<level> -p<pid>
+
+If pid isn't given, the current process is assumed. IO priority settings
+are inherited on fork, so you can use ionice to start the process at a given
+level::
+
+ # ionice -c2 -n0 /bin/ls
+
+will run ls at the best-effort scheduling class at the highest priority.
+For a running process, you can give the pid instead::
+
+ # ionice -c1 -n2 -p100
+
+will change pid 100 to run at the realtime scheduling class, at priority 2.
+
+ionice.c tool::
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <errno.h>
+ #include <getopt.h>
+ #include <unistd.h>
+ #include <sys/ptrace.h>
+ #include <asm/unistd.h>
+
+ extern int sys_ioprio_set(int, int, int);
+ extern int sys_ioprio_get(int, int);
+
+ #if defined(__i386__)
+ #define __NR_ioprio_set 289
+ #define __NR_ioprio_get 290
+ #elif defined(__ppc__)
+ #define __NR_ioprio_set 273
+ #define __NR_ioprio_get 274
+ #elif defined(__x86_64__)
+ #define __NR_ioprio_set 251
+ #define __NR_ioprio_get 252
+ #elif defined(__ia64__)
+ #define __NR_ioprio_set 1274
+ #define __NR_ioprio_get 1275
+ #else
+ #error "Unsupported arch"
+ #endif
+
+ static inline int ioprio_set(int which, int who, int ioprio)
+ {
+ return syscall(__NR_ioprio_set, which, who, ioprio);
+ }
+
+ static inline int ioprio_get(int which, int who)
+ {
+ return syscall(__NR_ioprio_get, which, who);
+ }
+
+ enum {
+ IOPRIO_CLASS_NONE,
+ IOPRIO_CLASS_RT,
+ IOPRIO_CLASS_BE,
+ IOPRIO_CLASS_IDLE,
+ };
+
+ enum {
+ IOPRIO_WHO_PROCESS = 1,
+ IOPRIO_WHO_PGRP,
+ IOPRIO_WHO_USER,
+ };
+
+ #define IOPRIO_CLASS_SHIFT 13
+
+ const char *to_prio[] = { "none", "realtime", "best-effort", "idle", };
+
+ int main(int argc, char *argv[])
+ {
+ int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
+ int c, pid = 0;
+
+ while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) {
+ switch (c) {
+ case 'n':
+ ioprio = strtol(optarg, NULL, 10);
+ set = 1;
+ break;
+ case 'c':
+ ioprio_class = strtol(optarg, NULL, 10);
+ set = 1;
+ break;
+ case 'p':
+ pid = strtol(optarg, NULL, 10);
+ break;
+ }
+ }
+
+ switch (ioprio_class) {
+ case IOPRIO_CLASS_NONE:
+ ioprio_class = IOPRIO_CLASS_BE;
+ break;
+ case IOPRIO_CLASS_RT:
+ case IOPRIO_CLASS_BE:
+ break;
+ case IOPRIO_CLASS_IDLE:
+ ioprio = 7;
+ break;
+ default:
+ printf("bad prio class %d\n", ioprio_class);
+ return 1;
+ }
+
+ if (!set) {
+ if (!pid && argv[optind])
+ pid = strtol(argv[optind], NULL, 10);
+
+ ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);
+
+ printf("pid=%d, %d\n", pid, ioprio);
+
+ if (ioprio == -1)
+ perror("ioprio_get");
+ else {
+ ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
+ ioprio = ioprio & 0xff;
+ printf("%s: prio %d\n", to_prio[ioprio_class], ioprio);
+ }
+ } else {
+ if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
+ perror("ioprio_set");
+ return 1;
+ }
+
+ if (argv[optind])
+ execvp(argv[optind], &argv[optind]);
+ }
+
+ return 0;
+ }
+
+
+March 11 2005, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/ioprio.txt b/Documentation/block/ioprio.txt
deleted file mode 100644
index 8ed8c59380b4..000000000000
--- a/Documentation/block/ioprio.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-Block io priorities
-===================
-
-
-Intro
------
-
-With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io
-priorities are supported for reads on files. This enables users to io nice
-processes or process groups, similar to what has been possible with cpu
-scheduling for ages. This document mainly details the current possibilities
-with cfq; other io schedulers do not support io priorities thus far.
-
-Scheduling classes
-------------------
-
-CFQ implements three generic scheduling classes that determine how io is
-served for a process.
-
-IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
-higher priority than any other in the system, processes from this class are
-given first access to the disk every time. Thus it needs to be used with some
-care, one io RT process can starve the entire system. Within the RT class,
-there are 8 levels of class data that determine exactly how much time this
-process needs the disk for on each service. In the future this might change
-to be more directly mappable to performance, by passing in a wanted data
-rate instead.
-
-IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
-for any process that hasn't set a specific io priority. The class data
-determines how much io bandwidth the process will get, it's directly mappable
-to the cpu nice levels just more coarsely implemented. 0 is the highest
-BE prio level, 7 is the lowest. The mapping between cpu nice level and io
-nice level is determined as: io_nice = (cpu_nice + 20) / 5.
-
-IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
-level only get io time when no one else needs the disk. The idle class has no
-class data, since it doesn't really apply here.
-
-Tools
------
-
-See below for a sample ionice tool. Usage:
-
-# ionice -c<class> -n<level> -p<pid>
-
-If pid isn't given, the current process is assumed. IO priority settings
-are inherited on fork, so you can use ionice to start the process at a given
-level:
-
-# ionice -c2 -n0 /bin/ls
-
-will run ls at the best-effort scheduling class at the highest priority.
-For a running process, you can give the pid instead:
-
-# ionice -c1 -n2 -p100
-
-will change pid 100 to run at the realtime scheduling class, at priority 2.
-
----> snip ionice.c tool <---
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <getopt.h>
-#include <unistd.h>
-#include <sys/ptrace.h>
-#include <asm/unistd.h>
-
-extern int sys_ioprio_set(int, int, int);
-extern int sys_ioprio_get(int, int);
-
-#if defined(__i386__)
-#define __NR_ioprio_set 289
-#define __NR_ioprio_get 290
-#elif defined(__ppc__)
-#define __NR_ioprio_set 273
-#define __NR_ioprio_get 274
-#elif defined(__x86_64__)
-#define __NR_ioprio_set 251
-#define __NR_ioprio_get 252
-#elif defined(__ia64__)
-#define __NR_ioprio_set 1274
-#define __NR_ioprio_get 1275
-#else
-#error "Unsupported arch"
-#endif
-
-static inline int ioprio_set(int which, int who, int ioprio)
-{
- return syscall(__NR_ioprio_set, which, who, ioprio);
-}
-
-static inline int ioprio_get(int which, int who)
-{
- return syscall(__NR_ioprio_get, which, who);
-}
-
-enum {
- IOPRIO_CLASS_NONE,
- IOPRIO_CLASS_RT,
- IOPRIO_CLASS_BE,
- IOPRIO_CLASS_IDLE,
-};
-
-enum {
- IOPRIO_WHO_PROCESS = 1,
- IOPRIO_WHO_PGRP,
- IOPRIO_WHO_USER,
-};
-
-#define IOPRIO_CLASS_SHIFT 13
-
-const char *to_prio[] = { "none", "realtime", "best-effort", "idle", };
-
-int main(int argc, char *argv[])
-{
- int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
- int c, pid = 0;
-
- while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) {
- switch (c) {
- case 'n':
- ioprio = strtol(optarg, NULL, 10);
- set = 1;
- break;
- case 'c':
- ioprio_class = strtol(optarg, NULL, 10);
- set = 1;
- break;
- case 'p':
- pid = strtol(optarg, NULL, 10);
- break;
- }
- }
-
- switch (ioprio_class) {
- case IOPRIO_CLASS_NONE:
- ioprio_class = IOPRIO_CLASS_BE;
- break;
- case IOPRIO_CLASS_RT:
- case IOPRIO_CLASS_BE:
- break;
- case IOPRIO_CLASS_IDLE:
- ioprio = 7;
- break;
- default:
- printf("bad prio class %d\n", ioprio_class);
- return 1;
- }
-
- if (!set) {
- if (!pid && argv[optind])
- pid = strtol(argv[optind], NULL, 10);
-
- ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);
-
- printf("pid=%d, %d\n", pid, ioprio);
-
- if (ioprio == -1)
- perror("ioprio_get");
- else {
- ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
- ioprio = ioprio & 0xff;
- printf("%s: prio %d\n", to_prio[ioprio_class], ioprio);
- }
- } else {
- if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
- perror("ioprio_set");
- return 1;
- }
-
- if (argv[optind])
- execvp(argv[optind], &argv[optind]);
- }
-
- return 0;
-}
-
----> snip ionice.c tool <---
-
-
-March 11 2005, Jens Axboe <jens.axboe@oracle.com>
diff --git a/Documentation/block/kyber-iosched.rst b/Documentation/block/kyber-iosched.rst
new file mode 100644
index 000000000000..3e164dd0617c
--- /dev/null
+++ b/Documentation/block/kyber-iosched.rst
@@ -0,0 +1,15 @@
+============================
+Kyber I/O scheduler tunables
+============================
+
+The only two tunables for the Kyber scheduler are the target latencies for
+reads and synchronous writes. Kyber will throttle requests in order to meet
+these target latencies.
+
+read_lat_nsec
+-------------
+Target latency for reads (in nanoseconds).
+
+write_lat_nsec
+--------------
+Target latency for synchronous writes (in nanoseconds).
diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt
deleted file mode 100644
index e94feacd7edc..000000000000
--- a/Documentation/block/kyber-iosched.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-Kyber I/O scheduler tunables
-===========================
-
-The only two tunables for the Kyber scheduler are the target latencies for
-reads and synchronous writes. Kyber will throttle requests in order to meet
-these target latencies.
-
-read_lat_nsec
--------------
-Target latency for reads (in nanoseconds).
-
-write_lat_nsec
---------------
-Target latency for synchronous writes (in nanoseconds).
diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst
new file mode 100644
index 000000000000..31451d80783c
--- /dev/null
+++ b/Documentation/block/null_blk.rst
@@ -0,0 +1,126 @@
+========================
+Null block device driver
+========================
+
+1. Overview
+===========
+
+The null block device (/dev/nullb*) is used for benchmarking the various
+block-layer implementations. It emulates a block device of X gigabytes in size.
+The following instances are possible:
+
+ Single-queue block-layer
+
+ - Request-based.
+ - Single submission queue per device.
+ - Implements IO scheduling algorithms (CFQ, Deadline, noop).
+
+ Multi-queue block-layer
+
+ - Request-based.
+ - Configurable submission queues per device.
+
+ No block-layer (Known as bio-based)
+
+ - Bio-based. IO requests are submitted directly to the device driver.
+ - Directly accepts bio data structure and returns them.
+
+All of them have a completion queue for each core in the system.
+
+2. Module parameters applicable for all instances
+=================================================
+
+queue_mode=[0-2]: Default: 2-Multi-queue
+ Selects which block-layer the module should instantiate with.
+
+ = ============
+ 0 Bio-based
+ 1 Single-queue
+ 2 Multi-queue
+ = ============
+
+home_node=[0--nr_nodes]: Default: NUMA_NO_NODE
+ Selects what CPU node the data structures are allocated from.
+
+gb=[Size in GB]: Default: 250GB
+ The size of the device reported to the system.
+
+bs=[Block size (in bytes)]: Default: 512 bytes
+ The block size reported to the system.
+
+nr_devices=[Number of devices]: Default: 1
+ Number of block devices instantiated. They are instantiated as /dev/nullb0,
+ etc.
+
+irqmode=[0-2]: Default: 1-Soft-irq
+ The completion mode used for completing IOs to the block-layer.
+
+ = ===========================================================================
+ 0 None.
+ 1 Soft-irq. Uses IPI to complete IOs across CPU nodes. Simulates the overhead
+ when IOs are issued from another CPU node than the home the device is
+ connected to.
+ 2 Timer: Waits a specific period (completion_nsec) for each IO before
+ completion.
+ = ===========================================================================
+
+completion_nsec=[ns]: Default: 10,000ns
+ Combined with irqmode=2 (timer). The time each completion event must wait.
+
+submit_queues=[1..nr_cpus]:
+ The number of submission queues attached to the device driver. If unset, it
+ defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
+ parameter is 1.
+
+hw_queue_depth=[0..qdepth]: Default: 64
+ The hardware queue depth of the device.
+
+III: Multi-queue specific parameters
+
+use_per_node_hctx=[0/1]: Default: 0
+
+ = =====================================================================
+ 0 The number of submit queues are set to the value of the submit_queues
+ parameter.
+ 1 The multi-queue block layer is instantiated with a hardware dispatch
+ queue for each CPU node in the system.
+ = =====================================================================
+
+no_sched=[0/1]: Default: 0
+
+ = ======================================
+ 0 nullb* use default blk-mq io scheduler
+ 1 nullb* doesn't use io scheduler
+ = ======================================
+
+blocking=[0/1]: Default: 0
+
+ = ===============================================================
+ 0 Register as a non-blocking blk-mq driver device.
+ 1 Register as a blocking blk-mq driver device, null_blk will set
+ the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
+ needs to block in its ->queue_rq() function.
+ = ===============================================================
+
+shared_tags=[0/1]: Default: 0
+
+ = ================================================================
+ 0 Tag set is not shared.
+ 1 Tag set shared between devices for blk-mq. Only makes sense with
+ nr_devices > 1, otherwise there's no tag set to share.
+ = ================================================================
+
+zoned=[0/1]: Default: 0
+
+ = ======================================================================
+ 0 Block device is exposed as a random-access block device.
+ 1 Block device is exposed as a host-managed zoned block device. Requires
+ CONFIG_BLK_DEV_ZONED.
+ = ======================================================================
+
+zone_size=[MB]: Default: 256
+ Per zone size when exposed as a zoned block device. Must be a power of two.
+
+zone_nr_conv=[nr_conv]: Default: 0
+ The number of conventional zones to create when block device is zoned. If
+ zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
deleted file mode 100644
index 41f0a3d33bbd..000000000000
--- a/Documentation/block/null_blk.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-Null block device driver
-================================================================================
-
-I. Overview
-
-The null block device (/dev/nullb*) is used for benchmarking the various
-block-layer implementations. It emulates a block device of X gigabytes in size.
-The following instances are possible:
-
- Single-queue block-layer
- - Request-based.
- - Single submission queue per device.
- - Implements IO scheduling algorithms (CFQ, Deadline, noop).
- Multi-queue block-layer
- - Request-based.
- - Configurable submission queues per device.
- No block-layer (Known as bio-based)
- - Bio-based. IO requests are submitted directly to the device driver.
- - Directly accepts bio data structure and returns them.
-
-All of them have a completion queue for each core in the system.
-
-II. Module parameters applicable for all instances:
-
-queue_mode=[0-2]: Default: 2-Multi-queue
- Selects which block-layer the module should instantiate with.
-
- 0: Bio-based.
- 1: Single-queue.
- 2: Multi-queue.
-
-home_node=[0--nr_nodes]: Default: NUMA_NO_NODE
- Selects what CPU node the data structures are allocated from.
-
-gb=[Size in GB]: Default: 250GB
- The size of the device reported to the system.
-
-bs=[Block size (in bytes)]: Default: 512 bytes
- The block size reported to the system.
-
-nr_devices=[Number of devices]: Default: 1
- Number of block devices instantiated. They are instantiated as /dev/nullb0,
- etc.
-
-irqmode=[0-2]: Default: 1-Soft-irq
- The completion mode used for completing IOs to the block-layer.
-
- 0: None.
- 1: Soft-irq. Uses IPI to complete IOs across CPU nodes. Simulates the overhead
- when IOs are issued from another CPU node than the home the device is
- connected to.
- 2: Timer: Waits a specific period (completion_nsec) for each IO before
- completion.
-
-completion_nsec=[ns]: Default: 10,000ns
- Combined with irqmode=2 (timer). The time each completion event must wait.
-
-submit_queues=[1..nr_cpus]:
- The number of submission queues attached to the device driver. If unset, it
- defaults to 1. For multi-queue, it is ignored when use_per_node_hctx module
- parameter is 1.
-
-hw_queue_depth=[0..qdepth]: Default: 64
- The hardware queue depth of the device.
-
-III: Multi-queue specific parameters
-
-use_per_node_hctx=[0/1]: Default: 0
- 0: The number of submit queues are set to the value of the submit_queues
- parameter.
- 1: The multi-queue block layer is instantiated with a hardware dispatch
- queue for each CPU node in the system.
-
-no_sched=[0/1]: Default: 0
- 0: nullb* use default blk-mq io scheduler.
- 1: nullb* doesn't use io scheduler.
-
-blocking=[0/1]: Default: 0
- 0: Register as a non-blocking blk-mq driver device.
- 1: Register as a blocking blk-mq driver device, null_blk will set
- the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
- needs to block in its ->queue_rq() function.
-
-shared_tags=[0/1]: Default: 0
- 0: Tag set is not shared.
- 1: Tag set shared between devices for blk-mq. Only makes sense with
- nr_devices > 1, otherwise there's no tag set to share.
-
-zoned=[0/1]: Default: 0
- 0: Block device is exposed as a random-access block device.
- 1: Block device is exposed as a host-managed zoned block device. Requires
- CONFIG_BLK_DEV_ZONED.
-
-zone_size=[MB]: Default: 256
- Per zone size when exposed as a zoned block device. Must be a power of two.
-
-zone_nr_conv=[nr_conv]: Default: 0
- The number of conventional zones to create when block device is zoned. If
- zone_nr_conv >= nr_zones, it will be reduced to nr_zones - 1.
diff --git a/Documentation/block/pr.rst b/Documentation/block/pr.rst
new file mode 100644
index 000000000000..30ea1c2e39eb
--- /dev/null
+++ b/Documentation/block/pr.rst
@@ -0,0 +1,119 @@
+===============================================
+Block layer support for Persistent Reservations
+===============================================
+
+The Linux kernel supports a user space interface for simplified
+Persistent Reservations which map to block devices that support
+these (like SCSI). Persistent Reservations allow restricting
+access to block devices to specific initiators in a shared storage
+setup.
+
+This document gives a general overview of the support ioctl commands.
+For a more detailed reference please refer the the SCSI Primary
+Commands standard, specifically the section on Reservations and the
+"PERSISTENT RESERVE IN" and "PERSISTENT RESERVE OUT" commands.
+
+All implementations are expected to ensure the reservations survive
+a power loss and cover all connections in a multi path environment.
+These behaviors are optional in SPC but will be automatically applied
+by Linux.
+
+
+The following types of reservations are supported:
+--------------------------------------------------
+
+ - PR_WRITE_EXCLUSIVE
+ Only the initiator that owns the reservation can write to the
+ device. Any initiator can read from the device.
+
+ - PR_EXCLUSIVE_ACCESS
+ Only the initiator that owns the reservation can access the
+ device.
+
+ - PR_WRITE_EXCLUSIVE_REG_ONLY
+ Only initiators with a registered key can write to the device,
+ Any initiator can read from the device.
+
+ - PR_EXCLUSIVE_ACCESS_REG_ONLY
+ Only initiators with a registered key can access the device.
+
+ - PR_WRITE_EXCLUSIVE_ALL_REGS
+
+ Only initiators with a registered key can write to the device,
+ Any initiator can read from the device.
+ All initiators with a registered key are considered reservation
+ holders.
+ Please reference the SPC spec on the meaning of a reservation
+ holder if you want to use this type.
+
+ - PR_EXCLUSIVE_ACCESS_ALL_REGS
+ Only initiators with a registered key can access the device.
+ All initiators with a registered key are considered reservation
+ holders.
+ Please reference the SPC spec on the meaning of a reservation
+ holder if you want to use this type.
+
+
+The following ioctl are supported:
+----------------------------------
+
+1. IOC_PR_REGISTER
+^^^^^^^^^^^^^^^^^^
+
+This ioctl command registers a new reservation if the new_key argument
+is non-null. If no existing reservation exists old_key must be zero,
+if an existing reservation should be replaced old_key must contain
+the old reservation key.
+
+If the new_key argument is 0 it unregisters the existing reservation passed
+in old_key.
+
+
+2. IOC_PR_RESERVE
+^^^^^^^^^^^^^^^^^
+
+This ioctl command reserves the device and thus restricts access for other
+devices based on the type argument. The key argument must be the existing
+reservation key for the device as acquired by the IOC_PR_REGISTER,
+IOC_PR_REGISTER_IGNORE, IOC_PR_PREEMPT or IOC_PR_PREEMPT_ABORT commands.
+
+
+3. IOC_PR_RELEASE
+^^^^^^^^^^^^^^^^^
+
+This ioctl command releases the reservation specified by key and flags
+and thus removes any access restriction implied by it.
+
+
+4. IOC_PR_PREEMPT
+^^^^^^^^^^^^^^^^^
+
+This ioctl command releases the existing reservation referred to by
+old_key and replaces it with a new reservation of type for the
+reservation key new_key.
+
+
+5. IOC_PR_PREEMPT_ABORT
+^^^^^^^^^^^^^^^^^^^^^^^
+
+This ioctl command works like IOC_PR_PREEMPT except that it also aborts
+any outstanding command sent over a connection identified by old_key.
+
+6. IOC_PR_CLEAR
+^^^^^^^^^^^^^^^
+
+This ioctl command unregisters both key and any other reservation key
+registered with the device and drops any existing reservation.
+
+
+Flags
+-----
+
+All the ioctls have a flag field. Currently only one flag is supported:
+
+ - PR_FL_IGNORE_KEY
+ Ignore the existing reservation key. This is commonly supported for
+ IOC_PR_REGISTER, and some implementation may support the flag for
+ IOC_PR_RESERVE.
+
+For all unknown flags the kernel will return -EOPNOTSUPP.
diff --git a/Documentation/block/pr.txt b/Documentation/block/pr.txt
deleted file mode 100644
index ac9b8e70e64b..000000000000
--- a/Documentation/block/pr.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-
-Block layer support for Persistent Reservations
-===============================================
-
-The Linux kernel supports a user space interface for simplified
-Persistent Reservations which map to block devices that support
-these (like SCSI). Persistent Reservations allow restricting
-access to block devices to specific initiators in a shared storage
-setup.
-
-This document gives a general overview of the support ioctl commands.
-For a more detailed reference please refer the the SCSI Primary
-Commands standard, specifically the section on Reservations and the
-"PERSISTENT RESERVE IN" and "PERSISTENT RESERVE OUT" commands.
-
-All implementations are expected to ensure the reservations survive
-a power loss and cover all connections in a multi path environment.
-These behaviors are optional in SPC but will be automatically applied
-by Linux.
-
-
-The following types of reservations are supported:
---------------------------------------------------
-
- - PR_WRITE_EXCLUSIVE
-
- Only the initiator that owns the reservation can write to the
- device. Any initiator can read from the device.
-
- - PR_EXCLUSIVE_ACCESS
-
- Only the initiator that owns the reservation can access the
- device.
-
- - PR_WRITE_EXCLUSIVE_REG_ONLY
-
- Only initiators with a registered key can write to the device,
- Any initiator can read from the device.
-
- - PR_EXCLUSIVE_ACCESS_REG_ONLY
-
- Only initiators with a registered key can access the device.
-
- - PR_WRITE_EXCLUSIVE_ALL_REGS
-
- Only initiators with a registered key can write to the device,
- Any initiator can read from the device.
- All initiators with a registered key are considered reservation
- holders.
- Please reference the SPC spec on the meaning of a reservation
- holder if you want to use this type.
-
- - PR_EXCLUSIVE_ACCESS_ALL_REGS
-
- Only initiators with a registered key can access the device.
- All initiators with a registered key are considered reservation
- holders.
- Please reference the SPC spec on the meaning of a reservation
- holder if you want to use this type.
-
-
-The following ioctl are supported:
-----------------------------------
-
-1. IOC_PR_REGISTER
-
-This ioctl command registers a new reservation if the new_key argument
-is non-null. If no existing reservation exists old_key must be zero,
-if an existing reservation should be replaced old_key must contain
-the old reservation key.
-
-If the new_key argument is 0 it unregisters the existing reservation passed
-in old_key.
-
-
-2. IOC_PR_RESERVE
-
-This ioctl command reserves the device and thus restricts access for other
-devices based on the type argument. The key argument must be the existing
-reservation key for the device as acquired by the IOC_PR_REGISTER,
-IOC_PR_REGISTER_IGNORE, IOC_PR_PREEMPT or IOC_PR_PREEMPT_ABORT commands.
-
-
-3. IOC_PR_RELEASE
-
-This ioctl command releases the reservation specified by key and flags
-and thus removes any access restriction implied by it.
-
-
-4. IOC_PR_PREEMPT
-
-This ioctl command releases the existing reservation referred to by
-old_key and replaces it with a new reservation of type for the
-reservation key new_key.
-
-
-5. IOC_PR_PREEMPT_ABORT
-
-This ioctl command works like IOC_PR_PREEMPT except that it also aborts
-any outstanding command sent over a connection identified by old_key.
-
-6. IOC_PR_CLEAR
-
-This ioctl command unregisters both key and any other reservation key
-registered with the device and drops any existing reservation.
-
-
-Flags
------
-
-All the ioctls have a flag field. Currently only one flag is supported:
-
- - PR_FL_IGNORE_KEY
-
- Ignore the existing reservation key. This is commonly supported for
- IOC_PR_REGISTER, and some implementation may support the flag for
- IOC_PR_RESERVE.
-
-For all unknown flags the kernel will return -EOPNOTSUPP.
diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst
new file mode 100644
index 000000000000..6a8513af9201
--- /dev/null
+++ b/Documentation/block/queue-sysfs.rst
@@ -0,0 +1,254 @@
+=================
+Queue sysfs files
+=================
+
+This text file will detail the queue files that are located in the sysfs tree
+for each block device. Note that stacked devices typically do not export
+any settings, since their queue merely functions are a remapping target.
+These files are the ones found in the /sys/block/xxx/queue/ directory.
+
+Files denoted with a RO postfix are readonly and the RW postfix means
+read-write.
+
+add_random (RW)
+---------------
+This file allows to turn off the disk entropy contribution. Default
+value of this file is '1'(on).
+
+chunk_sectors (RO)
+------------------
+This has different meaning depending on the type of the block device.
+For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
+of the RAID volume stripe segment. For a zoned block device, either host-aware
+or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
+of the device, with the eventual exception of the last zone of the device which
+may be smaller.
+
+dax (RO)
+--------
+This file indicates whether the device supports Direct Access (DAX),
+used by CPU-addressable storage to bypass the pagecache. It shows '1'
+if true, '0' if not.
+
+discard_granularity (RO)
+------------------------
+This shows the size of internal allocation of the device in bytes, if
+reported by the device. A value of '0' means device does not support
+the discard functionality.
+
+discard_max_hw_bytes (RO)
+-------------------------
+Devices that support discard functionality may have internal limits on
+the number of bytes that can be trimmed or unmapped in a single operation.
+The discard_max_bytes parameter is set by the device driver to the maximum
+number of bytes that can be discarded in a single operation. Discard
+requests issued to the device must not exceed this limit. A discard_max_bytes
+value of 0 means that the device does not support discard functionality.
+
+discard_max_bytes (RW)
+----------------------
+While discard_max_hw_bytes is the hardware limit for the device, this
+setting is the software limit. Some devices exhibit large latencies when
+large discards are issued, setting this value lower will make Linux issue
+smaller discards and potentially help reduce latencies induced by large
+discard operations.
+
+discard_zeroes_data (RO)
+------------------------
+Obsolete. Always zero.
+
+fua (RO)
+--------
+Whether or not the block driver supports the FUA flag for write requests.
+FUA stands for Force Unit Access. If the FUA flag is set that means that
+write requests must bypass the volatile cache of the storage device.
+
+hw_sector_size (RO)
+-------------------
+This is the hardware sector size of the device, in bytes.
+
+io_poll (RW)
+------------
+When read, this file shows whether polling is enabled (1) or disabled
+(0). Writing '0' to this file will disable polling for this device.
+Writing any non-zero value will enable this feature.
+
+io_poll_delay (RW)
+------------------
+If polling is enabled, this controls what kind of polling will be
+performed. It defaults to -1, which is classic polling. In this mode,
+the CPU will repeatedly ask for completions without giving up any time.
+If set to 0, a hybrid polling mode is used, where the kernel will attempt
+to make an educated guess at when the IO will complete. Based on this
+guess, the kernel will put the process issuing IO to sleep for an amount
+of time, before entering a classic poll loop. This mode might be a
+little slower than pure classic polling, but it will be more efficient.
+If set to a value larger than 0, the kernel will put the process issuing
+IO to sleep for this amount of microseconds before entering classic
+polling.
+
+io_timeout (RW)
+---------------
+io_timeout is the request timeout in milliseconds. If a request does not
+complete in this time then the block driver timeout handler is invoked.
+That timeout handler can decide to retry the request, to fail it or to start
+a device recovery strategy.
+
+iostats (RW)
+-------------
+This file is used to control (on/off) the iostats accounting of the
+disk.
+
+logical_block_size (RO)
+-----------------------
+This is the logical block size of the device, in bytes.
+
+max_discard_segments (RO)
+-------------------------
+The maximum number of DMA scatter/gather entries in a discard request.
+
+max_hw_sectors_kb (RO)
+----------------------
+This is the maximum number of kilobytes supported in a single data transfer.
+
+max_integrity_segments (RO)
+---------------------------
+Maximum number of elements in a DMA scatter/gather list with integrity
+data that will be submitted by the block layer core to the associated
+block driver.
+
+max_sectors_kb (RW)
+-------------------
+This is the maximum number of kilobytes that the block layer will allow
+for a filesystem request. Must be smaller than or equal to the maximum
+size allowed by the hardware.
+
+max_segments (RO)
+-----------------
+Maximum number of elements in a DMA scatter/gather list that is submitted
+to the associated block driver.
+
+max_segment_size (RO)
+---------------------
+Maximum size in bytes of a single element in a DMA scatter/gather list.
+
+minimum_io_size (RO)
+--------------------
+This is the smallest preferred IO size reported by the device.
+
+nomerges (RW)
+-------------
+This enables the user to disable the lookup logic involved with IO
+merging requests in the block layer. By default (0) all merges are
+enabled. When set to 1 only simple one-hit merges will be tried. When
+set to 2 no merge algorithms will be tried (including one-hit or more
+complex tree/hash lookups).
+
+nr_requests (RW)
+----------------
+This controls how many requests may be allocated in the block layer for
+read or write requests. Note that the total allocated number may be twice
+this amount, since it applies only to reads or writes (not the accumulated
+sum).
+
+To avoid priority inversion through request starvation, a request
+queue maintains a separate request pool per each cgroup when
+CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
+per-block-cgroup request pool. IOW, if there are N block cgroups,
+each request queue may have up to N request pools, each independently
+regulated by nr_requests.
+
+nr_zones (RO)
+-------------
+For zoned block devices (zoned attribute indicating "host-managed" or
+"host-aware"), this indicates the total number of zones of the device.
+This is always 0 for regular block devices.
+
+optimal_io_size (RO)
+--------------------
+This is the optimal IO size reported by the device.
+
+physical_block_size (RO)
+------------------------
+This is the physical block size of device, in bytes.
+
+read_ahead_kb (RW)
+------------------
+Maximum number of kilobytes to read-ahead for filesystems on this block
+device.
+
+rotational (RW)
+---------------
+This file is used to stat if the device is of rotational type or
+non-rotational type.
+
+rq_affinity (RW)
+----------------
+If this option is '1', the block layer will migrate request completions to the
+cpu "group" that originally submitted the request. For some workloads this
+provides a significant reduction in CPU cycles due to caching effects.
+
+For storage configurations that need to maximize distribution of completion
+processing setting this option to '2' forces the completion to run on the
+requesting cpu (bypassing the "group" aggregation logic).
+
+scheduler (RW)
+--------------
+When read, this file will display the current and available IO schedulers
+for this block device. The currently active IO scheduler will be enclosed
+in [] brackets. Writing an IO scheduler name to this file will switch
+control of this block device to that new IO scheduler. Note that writing
+an IO scheduler name to this file will attempt to load that IO scheduler
+module, if it isn't already present in the system.
+
+write_cache (RW)
+----------------
+When read, this file will display whether the device has write back
+caching enabled or not. It will return "write back" for the former
+case, and "write through" for the latter. Writing to this file can
+change the kernels view of the device, but it doesn't alter the
+device state. This means that it might not be safe to toggle the
+setting from "write back" to "write through", since that will also
+eliminate cache flushes issued by the kernel.
+
+write_same_max_bytes (RO)
+-------------------------
+This is the number of bytes the device can write in a single write-same
+command. A value of '0' means write-same is not supported by this
+device.
+
+wbt_lat_usec (RW)
+-----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes. Writing a value of '0' to this file disables the
+feature. Writing a value of '-1' to this file resets the value to the
+default setting.
+
+throttle_sample_time (RW)
+-------------------------
+This is the time window that blk-throttle samples data, in millisecond.
+blk-throttle makes decision based on the samplings. Lower time means cgroups
+have more smooth throughput, but higher CPU overhead. This exists only when
+CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
+
+write_zeroes_max_bytes (RO)
+---------------------------
+For block drivers that support REQ_OP_WRITE_ZEROES, the maximum number of
+bytes that can be zeroed at once. The value 0 means that REQ_OP_WRITE_ZEROES
+is not supported.
+
+zoned (RO)
+----------
+This indicates if the device is a zoned block device and the zone model of the
+device if it is indeed zoned. The possible values indicated by zoned are
+"none" for regular block devices and "host-aware" or "host-managed" for zoned
+block devices. The characteristics of host-aware and host-managed zoned block
+devices are described in the ZBC (Zoned Block Commands) and ZAC
+(Zoned Device ATA Command Set) standards. These standards also define the
+"drive-managed" zone model. However, since drive-managed zoned block devices
+do not support zone commands, they will be treated as regular block devices
+and zoned will report "none".
+
+Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
deleted file mode 100644
index 83b457e24bba..000000000000
--- a/Documentation/block/queue-sysfs.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Queue sysfs files
-=================
-
-This text file will detail the queue files that are located in the sysfs tree
-for each block device. Note that stacked devices typically do not export
-any settings, since their queue merely functions are a remapping target.
-These files are the ones found in the /sys/block/xxx/queue/ directory.
-
-Files denoted with a RO postfix are readonly and the RW postfix means
-read-write.
-
-add_random (RW)
-----------------
-This file allows to turn off the disk entropy contribution. Default
-value of this file is '1'(on).
-
-dax (RO)
---------
-This file indicates whether the device supports Direct Access (DAX),
-used by CPU-addressable storage to bypass the pagecache. It shows '1'
-if true, '0' if not.
-
-discard_granularity (RO)
------------------------
-This shows the size of internal allocation of the device in bytes, if
-reported by the device. A value of '0' means device does not support
-the discard functionality.
-
-discard_max_hw_bytes (RO)
-----------------------
-Devices that support discard functionality may have internal limits on
-the number of bytes that can be trimmed or unmapped in a single operation.
-The discard_max_bytes parameter is set by the device driver to the maximum
-number of bytes that can be discarded in a single operation. Discard
-requests issued to the device must not exceed this limit. A discard_max_bytes
-value of 0 means that the device does not support discard functionality.
-
-discard_max_bytes (RW)
-----------------------
-While discard_max_hw_bytes is the hardware limit for the device, this
-setting is the software limit. Some devices exhibit large latencies when
-large discards are issued, setting this value lower will make Linux issue
-smaller discards and potentially help reduce latencies induced by large
-discard operations.
-
-hw_sector_size (RO)
--------------------
-This is the hardware sector size of the device, in bytes.
-
-io_poll (RW)
-------------
-When read, this file shows whether polling is enabled (1) or disabled
-(0). Writing '0' to this file will disable polling for this device.
-Writing any non-zero value will enable this feature.
-
-io_poll_delay (RW)
-------------------
-If polling is enabled, this controls what kind of polling will be
-performed. It defaults to -1, which is classic polling. In this mode,
-the CPU will repeatedly ask for completions without giving up any time.
-If set to 0, a hybrid polling mode is used, where the kernel will attempt
-to make an educated guess at when the IO will complete. Based on this
-guess, the kernel will put the process issuing IO to sleep for an amount
-of time, before entering a classic poll loop. This mode might be a
-little slower than pure classic polling, but it will be more efficient.
-If set to a value larger than 0, the kernel will put the process issuing
-IO to sleep for this amount of microseconds before entering classic
-polling.
-
-io_timeout (RW)
----------------
-io_timeout is the request timeout in milliseconds. If a request does not
-complete in this time then the block driver timeout handler is invoked.
-That timeout handler can decide to retry the request, to fail it or to start
-a device recovery strategy.
-
-iostats (RW)
--------------
-This file is used to control (on/off) the iostats accounting of the
-disk.
-
-logical_block_size (RO)
------------------------
-This is the logical block size of the device, in bytes.
-
-max_hw_sectors_kb (RO)
-----------------------
-This is the maximum number of kilobytes supported in a single data transfer.
-
-max_integrity_segments (RO)
----------------------------
-When read, this file shows the max limit of integrity segments as
-set by block layer which a hardware controller can handle.
-
-max_sectors_kb (RW)
--------------------
-This is the maximum number of kilobytes that the block layer will allow
-for a filesystem request. Must be smaller than or equal to the maximum
-size allowed by the hardware.
-
-max_segments (RO)
------------------
-Maximum number of segments of the device.
-
-max_segment_size (RO)
----------------------
-Maximum segment size of the device.
-
-minimum_io_size (RO)
---------------------
-This is the smallest preferred IO size reported by the device.
-
-nomerges (RW)
--------------
-This enables the user to disable the lookup logic involved with IO
-merging requests in the block layer. By default (0) all merges are
-enabled. When set to 1 only simple one-hit merges will be tried. When
-set to 2 no merge algorithms will be tried (including one-hit or more
-complex tree/hash lookups).
-
-nr_requests (RW)
-----------------
-This controls how many requests may be allocated in the block layer for
-read or write requests. Note that the total allocated number may be twice
-this amount, since it applies only to reads or writes (not the accumulated
-sum).
-
-To avoid priority inversion through request starvation, a request
-queue maintains a separate request pool per each cgroup when
-CONFIG_BLK_CGROUP is enabled, and this parameter applies to each such
-per-block-cgroup request pool. IOW, if there are N block cgroups,
-each request queue may have up to N request pools, each independently
-regulated by nr_requests.
-
-optimal_io_size (RO)
---------------------
-This is the optimal IO size reported by the device.
-
-physical_block_size (RO)
-------------------------
-This is the physical block size of device, in bytes.
-
-read_ahead_kb (RW)
-------------------
-Maximum number of kilobytes to read-ahead for filesystems on this block
-device.
-
-rotational (RW)
----------------
-This file is used to stat if the device is of rotational type or
-non-rotational type.
-
-rq_affinity (RW)
-----------------
-If this option is '1', the block layer will migrate request completions to the
-cpu "group" that originally submitted the request. For some workloads this
-provides a significant reduction in CPU cycles due to caching effects.
-
-For storage configurations that need to maximize distribution of completion
-processing setting this option to '2' forces the completion to run on the
-requesting cpu (bypassing the "group" aggregation logic).
-
-scheduler (RW)
---------------
-When read, this file will display the current and available IO schedulers
-for this block device. The currently active IO scheduler will be enclosed
-in [] brackets. Writing an IO scheduler name to this file will switch
-control of this block device to that new IO scheduler. Note that writing
-an IO scheduler name to this file will attempt to load that IO scheduler
-module, if it isn't already present in the system.
-
-write_cache (RW)
-----------------
-When read, this file will display whether the device has write back
-caching enabled or not. It will return "write back" for the former
-case, and "write through" for the latter. Writing to this file can
-change the kernels view of the device, but it doesn't alter the
-device state. This means that it might not be safe to toggle the
-setting from "write back" to "write through", since that will also
-eliminate cache flushes issued by the kernel.
-
-write_same_max_bytes (RO)
--------------------------
-This is the number of bytes the device can write in a single write-same
-command. A value of '0' means write-same is not supported by this
-device.
-
-wb_lat_usec (RW)
-----------------
-If the device is registered for writeback throttling, then this file shows
-the target minimum read latency. If this latency is exceeded in a given
-window of time (see wb_window_usec), then the writeback throttling will start
-scaling back writes. Writing a value of '0' to this file disables the
-feature. Writing a value of '-1' to this file resets the value to the
-default setting.
-
-throttle_sample_time (RW)
--------------------------
-This is the time window that blk-throttle samples data, in millisecond.
-blk-throttle makes decision based on the samplings. Lower time means cgroups
-have more smooth throughput, but higher CPU overhead. This exists only when
-CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
-
-zoned (RO)
-----------
-This indicates if the device is a zoned block device and the zone model of the
-device if it is indeed zoned. The possible values indicated by zoned are
-"none" for regular block devices and "host-aware" or "host-managed" for zoned
-block devices. The characteristics of host-aware and host-managed zoned block
-devices are described in the ZBC (Zoned Block Commands) and ZAC
-(Zoned Device ATA Command Set) standards. These standards also define the
-"drive-managed" zone model. However, since drive-managed zoned block devices
-do not support zone commands, they will be treated as regular block devices
-and zoned will report "none".
-
-nr_zones (RO)
--------------
-For zoned block devices (zoned attribute indicating "host-managed" or
-"host-aware"), this indicates the total number of zones of the device.
-This is always 0 for regular block devices.
-
-chunk_sectors (RO)
-------------------
-This has different meaning depending on the type of the block device.
-For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
-of the RAID volume stripe segment. For a zoned block device, either host-aware
-or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
-of the device, with the eventual exception of the last zone of the device which
-may be smaller.
-
-Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/block/request.rst b/Documentation/block/request.rst
new file mode 100644
index 000000000000..747021e1ffdb
--- /dev/null
+++ b/Documentation/block/request.rst
@@ -0,0 +1,99 @@
+============================
+struct request documentation
+============================
+
+Jens Axboe <jens.axboe@oracle.com> 27/05/02
+
+
+.. FIXME:
+ No idea about what does mean - seems just some noise, so comment it
+
+ 1.0
+ Index
+
+ 2.0 Struct request members classification
+
+ 2.1 struct request members explanation
+
+ 3.0
+
+
+ 2.0
+
+
+
+Short explanation of request members
+====================================
+
+Classification flags:
+
+ = ====================
+ D driver member
+ B block layer member
+ I I/O scheduler member
+ = ====================
+
+Unless an entry contains a D classification, a device driver must not access
+this member. Some members may contain D classifications, but should only be
+access through certain macros or functions (eg ->flags).
+
+<linux/blkdev.h>
+
+=============================== ======= =======================================
+Member Flag Comment
+=============================== ======= =======================================
+struct list_head queuelist BI Organization on various internal
+ queues
+
+``void *elevator_private`` I I/O scheduler private data
+
+unsigned char cmd[16] D Driver can use this for setting up
+ a cdb before execution, see
+ blk_queue_prep_rq
+
+unsigned long flags DBI Contains info about data direction,
+ request type, etc.
+
+int rq_status D Request status bits
+
+kdev_t rq_dev DBI Target device
+
+int errors DB Error counts
+
+sector_t sector DBI Target location
+
+unsigned long hard_nr_sectors B Used to keep sector sane
+
+unsigned long nr_sectors DBI Total number of sectors in request
+
+unsigned long hard_nr_sectors B Used to keep nr_sectors sane
+
+unsigned short nr_phys_segments DB Number of physical scatter gather
+ segments in a request
+
+unsigned short nr_hw_segments DB Number of hardware scatter gather
+ segments in a request
+
+unsigned int current_nr_sectors DB Number of sectors in first segment
+ of request
+
+unsigned int hard_cur_sectors B Used to keep current_nr_sectors sane
+
+int tag DB TCQ tag, if assigned
+
+``void *special`` D Free to be used by driver
+
+``char *buffer`` D Map of first segment, also see
+ section on bouncing SECTION
+
+``struct completion *waiting`` D Can be used by driver to get signalled
+ on request completion
+
+``struct bio *bio`` DBI First bio in request
+
+``struct bio *biotail`` DBI Last bio in request
+
+``struct request_queue *q`` DB Request queue this request belongs to
+
+``struct request_list *rl`` B Request list this request came from
+=============================== ======= =======================================
diff --git a/Documentation/block/request.txt b/Documentation/block/request.txt
deleted file mode 100644
index 754e104ed369..000000000000
--- a/Documentation/block/request.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-
-struct request documentation
-
-Jens Axboe <jens.axboe@oracle.com> 27/05/02
-
-1.0
-Index
-
-2.0 Struct request members classification
-
- 2.1 struct request members explanation
-
-3.0
-
-
-2.0
-Short explanation of request members
-
-Classification flags:
-
- D driver member
- B block layer member
- I I/O scheduler member
-
-Unless an entry contains a D classification, a device driver must not access
-this member. Some members may contain D classifications, but should only be
-access through certain macros or functions (eg ->flags).
-
-<linux/blkdev.h>
-
-2.1
-Member Flag Comment
------- ---- -------
-
-struct list_head queuelist BI Organization on various internal
- queues
-
-void *elevator_private I I/O scheduler private data
-
-unsigned char cmd[16] D Driver can use this for setting up
- a cdb before execution, see
- blk_queue_prep_rq
-
-unsigned long flags DBI Contains info about data direction,
- request type, etc.
-
-int rq_status D Request status bits
-
-kdev_t rq_dev DBI Target device
-
-int errors DB Error counts
-
-sector_t sector DBI Target location
-
-unsigned long hard_nr_sectors B Used to keep sector sane
-
-unsigned long nr_sectors DBI Total number of sectors in request
-
-unsigned long hard_nr_sectors B Used to keep nr_sectors sane
-
-unsigned short nr_phys_segments DB Number of physical scatter gather
- segments in a request
-
-unsigned short nr_hw_segments DB Number of hardware scatter gather
- segments in a request
-
-unsigned int current_nr_sectors DB Number of sectors in first segment
- of request
-
-unsigned int hard_cur_sectors B Used to keep current_nr_sectors sane
-
-int tag DB TCQ tag, if assigned
-
-void *special D Free to be used by driver
-
-char *buffer D Map of first segment, also see
- section on bouncing SECTION
-
-struct completion *waiting D Can be used by driver to get signalled
- on request completion
-
-struct bio *bio DBI First bio in request
-
-struct bio *biotail DBI Last bio in request
-
-struct request_queue *q DB Request queue this request belongs to
-
-struct request_list *rl B Request list this request came from
diff --git a/Documentation/block/stat.rst b/Documentation/block/stat.rst
new file mode 100644
index 000000000000..9c07bc22b0bc
--- /dev/null
+++ b/Documentation/block/stat.rst
@@ -0,0 +1,93 @@
+===============================================
+Block layer statistics in /sys/block/<dev>/stat
+===============================================
+
+This file documents the contents of the /sys/block/<dev>/stat file.
+
+The stat file provides several statistics about the state of block
+device <dev>.
+
+Q.
+ Why are there multiple statistics in a single file? Doesn't sysfs
+ normally contain a single value per file?
+
+A.
+ By having a single file, the kernel can guarantee that the statistics
+ represent a consistent snapshot of the state of the device. If the
+ statistics were exported as multiple files containing one statistic
+ each, it would be impossible to guarantee that a set of readings
+ represent a single point in time.
+
+The stat file consists of a single line of text containing 11 decimal
+values separated by whitespace. The fields are summarized in the
+following table, and described in more detail below.
+
+
+=============== ============= =================================================
+Name units description
+=============== ============= =================================================
+read I/Os requests number of read I/Os processed
+read merges requests number of read I/Os merged with in-queue I/O
+read sectors sectors number of sectors read
+read ticks milliseconds total wait time for read requests
+write I/Os requests number of write I/Os processed
+write merges requests number of write I/Os merged with in-queue I/O
+write sectors sectors number of sectors written
+write ticks milliseconds total wait time for write requests
+in_flight requests number of I/Os currently in flight
+io_ticks milliseconds total time this block device has been active
+time_in_queue milliseconds total wait time for all requests
+discard I/Os requests number of discard I/Os processed
+discard merges requests number of discard I/Os merged with in-queue I/O
+discard sectors sectors number of sectors discarded
+discard ticks milliseconds total wait time for discard requests
+=============== ============= =================================================
+
+read I/Os, write I/Os, discard I/0s
+===================================
+
+These values increment when an I/O request completes.
+
+read merges, write merges, discard merges
+=========================================
+
+These values increment when an I/O request is merged with an
+already-queued I/O request.
+
+read sectors, write sectors, discard_sectors
+============================================
+
+These values count the number of sectors read from, written to, or
+discarded from this block device. The "sectors" in question are the
+standard UNIX 512-byte sectors, not any device- or filesystem-specific
+block size. The counters are incremented when the I/O completes.
+
+read ticks, write ticks, discard ticks
+======================================
+
+These values count the number of milliseconds that I/O requests have
+waited on this block device. If there are multiple I/O requests waiting,
+these values will increase at a rate greater than 1000/second; for
+example, if 60 read requests wait for an average of 30 ms, the read_ticks
+field will increase by 60*30 = 1800.
+
+in_flight
+=========
+
+This value counts the number of I/O requests that have been issued to
+the device driver but have not yet completed. It does not include I/O
+requests that are in the queue but not yet issued to the device driver.
+
+io_ticks
+========
+
+This value counts the number of milliseconds during which the device has
+had I/O requests queued.
+
+time_in_queue
+=============
+
+This value counts the number of milliseconds that I/O requests have waited
+on this block device. If there are multiple I/O requests waiting, this
+value will increase as the product of the number of milliseconds times the
+number of requests waiting (see "read ticks" above for an example).
diff --git a/Documentation/block/stat.txt b/Documentation/block/stat.txt
deleted file mode 100644
index 0aace9cc536c..000000000000
--- a/Documentation/block/stat.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-Block layer statistics in /sys/block/<dev>/stat
-===============================================
-
-This file documents the contents of the /sys/block/<dev>/stat file.
-
-The stat file provides several statistics about the state of block
-device <dev>.
-
-Q. Why are there multiple statistics in a single file? Doesn't sysfs
- normally contain a single value per file?
-A. By having a single file, the kernel can guarantee that the statistics
- represent a consistent snapshot of the state of the device. If the
- statistics were exported as multiple files containing one statistic
- each, it would be impossible to guarantee that a set of readings
- represent a single point in time.
-
-The stat file consists of a single line of text containing 11 decimal
-values separated by whitespace. The fields are summarized in the
-following table, and described in more detail below.
-
-Name units description
----- ----- -----------
-read I/Os requests number of read I/Os processed
-read merges requests number of read I/Os merged with in-queue I/O
-read sectors sectors number of sectors read
-read ticks milliseconds total wait time for read requests
-write I/Os requests number of write I/Os processed
-write merges requests number of write I/Os merged with in-queue I/O
-write sectors sectors number of sectors written
-write ticks milliseconds total wait time for write requests
-in_flight requests number of I/Os currently in flight
-io_ticks milliseconds total time this block device has been active
-time_in_queue milliseconds total wait time for all requests
-discard I/Os requests number of discard I/Os processed
-discard merges requests number of discard I/Os merged with in-queue I/O
-discard sectors sectors number of sectors discarded
-discard ticks milliseconds total wait time for discard requests
-
-read I/Os, write I/Os, discard I/0s
-===================================
-
-These values increment when an I/O request completes.
-
-read merges, write merges, discard merges
-=========================================
-
-These values increment when an I/O request is merged with an
-already-queued I/O request.
-
-read sectors, write sectors, discard_sectors
-============================================
-
-These values count the number of sectors read from, written to, or
-discarded from this block device. The "sectors" in question are the
-standard UNIX 512-byte sectors, not any device- or filesystem-specific
-block size. The counters are incremented when the I/O completes.
-
-read ticks, write ticks, discard ticks
-======================================
-
-These values count the number of milliseconds that I/O requests have
-waited on this block device. If there are multiple I/O requests waiting,
-these values will increase at a rate greater than 1000/second; for
-example, if 60 read requests wait for an average of 30 ms, the read_ticks
-field will increase by 60*30 = 1800.
-
-in_flight
-=========
-
-This value counts the number of I/O requests that have been issued to
-the device driver but have not yet completed. It does not include I/O
-requests that are in the queue but not yet issued to the device driver.
-
-io_ticks
-========
-
-This value counts the number of milliseconds during which the device has
-had I/O requests queued.
-
-time_in_queue
-=============
-
-This value counts the number of milliseconds that I/O requests have waited
-on this block device. If there are multiple I/O requests waiting, this
-value will increase as the product of the number of milliseconds times the
-number of requests waiting (see "read ticks" above for an example).
diff --git a/Documentation/block/switching-sched.rst b/Documentation/block/switching-sched.rst
new file mode 100644
index 000000000000..42042417380e
--- /dev/null
+++ b/Documentation/block/switching-sched.rst
@@ -0,0 +1,39 @@
+===================
+Switching Scheduler
+===================
+
+To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
+'noop' and 'cfq' (the default) are also available. IO schedulers are assigned
+globally at boot time only presently.
+
+Each io queue has a set of io scheduler tunables associated with it. These
+tunables control how the io scheduler works. You can find these entries
+in::
+
+ /sys/block/<device>/queue/iosched
+
+assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
+you can do so by typing::
+
+ # mount none /sys -t sysfs
+
+It is possible to change the IO scheduler for a given block device on
+the fly to select one of mq-deadline, none, bfq, or kyber schedulers -
+which can improve that device's throughput.
+
+To set a specific scheduler, simply do this::
+
+ echo SCHEDNAME > /sys/block/DEV/queue/scheduler
+
+where SCHEDNAME is the name of a defined IO scheduler, and DEV is the
+device name (hda, hdb, sga, or whatever you happen to have).
+
+The list of defined schedulers can be found by simply doing
+a "cat /sys/block/DEV/queue/scheduler" - the list of valid names
+will be displayed, with the currently selected scheduler in brackets::
+
+ # cat /sys/block/sda/queue/scheduler
+ [mq-deadline] kyber bfq none
+ # echo none >/sys/block/sda/queue/scheduler
+ # cat /sys/block/sda/queue/scheduler
+ [none] mq-deadline kyber bfq
diff --git a/Documentation/block/switching-sched.txt b/Documentation/block/switching-sched.txt
deleted file mode 100644
index 3b2612e342f1..000000000000
--- a/Documentation/block/switching-sched.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-To choose IO schedulers at boot time, use the argument 'elevator=deadline'.
-'noop' and 'cfq' (the default) are also available. IO schedulers are assigned
-globally at boot time only presently.
-
-Each io queue has a set of io scheduler tunables associated with it. These
-tunables control how the io scheduler works. You can find these entries
-in:
-
-/sys/block/<device>/queue/iosched
-
-assuming that you have sysfs mounted on /sys. If you don't have sysfs mounted,
-you can do so by typing:
-
-# mount none /sys -t sysfs
-
-As of the Linux 2.6.10 kernel, it is now possible to change the
-IO scheduler for a given block device on the fly (thus making it possible,
-for instance, to set the CFQ scheduler for the system default, but
-set a specific device to use the deadline or noop schedulers - which
-can improve that device's throughput).
-
-To set a specific scheduler, simply do this:
-
-echo SCHEDNAME > /sys/block/DEV/queue/scheduler
-
-where SCHEDNAME is the name of a defined IO scheduler, and DEV is the
-device name (hda, hdb, sga, or whatever you happen to have).
-
-The list of defined schedulers can be found by simply doing
-a "cat /sys/block/DEV/queue/scheduler" - the list of valid names
-will be displayed, with the currently selected scheduler in brackets:
-
-# cat /sys/block/hda/queue/scheduler
-noop deadline [cfq]
-# echo deadline > /sys/block/hda/queue/scheduler
-# cat /sys/block/hda/queue/scheduler
-noop [deadline] cfq
diff --git a/Documentation/block/writeback_cache_control.rst b/Documentation/block/writeback_cache_control.rst
new file mode 100644
index 000000000000..2c752c57c14c
--- /dev/null
+++ b/Documentation/block/writeback_cache_control.rst
@@ -0,0 +1,86 @@
+==========================================
+Explicit volatile write back cache control
+==========================================
+
+Introduction
+------------
+
+Many storage devices, especially in the consumer market, come with volatile
+write back caches. That means the devices signal I/O completion to the
+operating system before data actually has hit the non-volatile storage. This
+behavior obviously speeds up various workloads, but it means the operating
+system needs to force data out to the non-volatile storage when it performs
+a data integrity operation like fsync, sync or an unmount.
+
+The Linux block layer provides two simple mechanisms that let filesystems
+control the caching behavior of the storage device. These mechanisms are
+a forced cache flush, and the Force Unit Access (FUA) flag for requests.
+
+
+Explicit cache flushes
+----------------------
+
+The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from
+the filesystem and will make sure the volatile cache of the storage device
+has been flushed before the actual I/O operation is started. This explicitly
+guarantees that previously completed write requests are on non-volatile
+storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be
+set on an otherwise empty bio structure, which causes only an explicit cache
+flush without any dependent I/O. It is recommend to use
+the blkdev_issue_flush() helper for a pure cache flush.
+
+
+Forced Unit Access
+------------------
+
+The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the
+filesystem and will make sure that I/O completion for this request is only
+signaled after the data has been committed to non-volatile storage.
+
+
+Implementation details for filesystems
+--------------------------------------
+
+Filesystems can simply set the REQ_PREFLUSH and REQ_FUA bits and do not have to
+worry if the underlying devices need any explicit cache flushing and how
+the Forced Unit Access is implemented. The REQ_PREFLUSH and REQ_FUA flags
+may both be set on a single bio.
+
+
+Implementation details for make_request_fn based block drivers
+--------------------------------------------------------------
+
+These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit
+directly below the submit_bio interface. For remapping drivers the REQ_FUA
+bits need to be propagated to underlying devices, and a global flush needs
+to be implemented for bios with the REQ_PREFLUSH bit set. For real device
+drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits
+on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without
+data can be completed successfully without doing any work. Drivers for
+devices with volatile caches need to implement the support for these
+flags themselves without any help from the block layer.
+
+
+Implementation details for request_fn based block drivers
+---------------------------------------------------------
+
+For devices that do not support volatile write caches there is no driver
+support required, the block layer completes empty REQ_PREFLUSH requests before
+entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from
+requests that have a payload. For devices with volatile write caches the
+driver needs to tell the block layer that it supports flushing caches by
+doing::
+
+ blk_queue_write_cache(sdkp->disk->queue, true, false);
+
+and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn. Note that
+REQ_PREFLUSH requests with a payload are automatically turned into a sequence
+of an empty REQ_OP_FLUSH request followed by the actual write by the block
+layer. For devices that also support the FUA bit the block layer needs
+to be told to pass through the REQ_FUA bit using::
+
+ blk_queue_write_cache(sdkp->disk->queue, true, true);
+
+and the driver must handle write requests that have the REQ_FUA bit set
+in prep_fn/request_fn. If the FUA bit is not natively supported the block
+layer turns it into an empty REQ_OP_FLUSH request after the actual write.
diff --git a/Documentation/block/writeback_cache_control.txt b/Documentation/block/writeback_cache_control.txt
deleted file mode 100644
index 8a6bdada5f6b..000000000000
--- a/Documentation/block/writeback_cache_control.txt
+++ /dev/null
@@ -1,86 +0,0 @@
-
-Explicit volatile write back cache control
-=====================================
-
-Introduction
-------------
-
-Many storage devices, especially in the consumer market, come with volatile
-write back caches. That means the devices signal I/O completion to the
-operating system before data actually has hit the non-volatile storage. This
-behavior obviously speeds up various workloads, but it means the operating
-system needs to force data out to the non-volatile storage when it performs
-a data integrity operation like fsync, sync or an unmount.
-
-The Linux block layer provides two simple mechanisms that let filesystems
-control the caching behavior of the storage device. These mechanisms are
-a forced cache flush, and the Force Unit Access (FUA) flag for requests.
-
-
-Explicit cache flushes
-----------------------
-
-The REQ_PREFLUSH flag can be OR ed into the r/w flags of a bio submitted from
-the filesystem and will make sure the volatile cache of the storage device
-has been flushed before the actual I/O operation is started. This explicitly
-guarantees that previously completed write requests are on non-volatile
-storage before the flagged bio starts. In addition the REQ_PREFLUSH flag can be
-set on an otherwise empty bio structure, which causes only an explicit cache
-flush without any dependent I/O. It is recommend to use
-the blkdev_issue_flush() helper for a pure cache flush.
-
-
-Forced Unit Access
------------------
-
-The REQ_FUA flag can be OR ed into the r/w flags of a bio submitted from the
-filesystem and will make sure that I/O completion for this request is only
-signaled after the data has been committed to non-volatile storage.
-
-
-Implementation details for filesystems
---------------------------------------
-
-Filesystems can simply set the REQ_PREFLUSH and REQ_FUA bits and do not have to
-worry if the underlying devices need any explicit cache flushing and how
-the Forced Unit Access is implemented. The REQ_PREFLUSH and REQ_FUA flags
-may both be set on a single bio.
-
-
-Implementation details for make_request_fn based block drivers
---------------------------------------------------------------
-
-These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit
-directly below the submit_bio interface. For remapping drivers the REQ_FUA
-bits need to be propagated to underlying devices, and a global flush needs
-to be implemented for bios with the REQ_PREFLUSH bit set. For real device
-drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits
-on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without
-data can be completed successfully without doing any work. Drivers for
-devices with volatile caches need to implement the support for these
-flags themselves without any help from the block layer.
-
-
-Implementation details for request_fn based block drivers
---------------------------------------------------------------
-
-For devices that do not support volatile write caches there is no driver
-support required, the block layer completes empty REQ_PREFLUSH requests before
-entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from
-requests that have a payload. For devices with volatile write caches the
-driver needs to tell the block layer that it supports flushing caches by
-doing:
-
- blk_queue_write_cache(sdkp->disk->queue, true, false);
-
-and handle empty REQ_OP_FLUSH requests in its prep_fn/request_fn. Note that
-REQ_PREFLUSH requests with a payload are automatically turned into a sequence
-of an empty REQ_OP_FLUSH request followed by the actual write by the block
-layer. For devices that also support the FUA bit the block layer needs
-to be told to pass through the REQ_FUA bit using:
-
- blk_queue_write_cache(sdkp->disk->queue, true, true);
-
-and the driver must handle write requests that have the REQ_FUA bit set
-in prep_fn/request_fn. If the FUA bit is not natively supported the block
-layer turns it into an empty REQ_OP_FLUSH request after the actual write.
diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt
deleted file mode 100644
index 627b0a1bf35e..000000000000
--- a/Documentation/blockdev/drbd/README.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-Description
-
- DRBD is a shared-nothing, synchronously replicated block device. It
- is designed to serve as a building block for high availability
- clusters and in this context, is a "drop-in" replacement for shared
- storage. Simplistically, you could see it as a network RAID 1.
-
- Please visit http://www.drbd.org to find out more.
-
-The here included files are intended to help understand the implementation
-
-DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
- relates some functions, and write packets.
-
-conn-states-8.dot, disk-states-8.dot, node-states-8.dot
- The sub graphs of DRBD's state transitions
diff --git a/Documentation/blockdev/drbd/data-structure-v9.txt b/Documentation/blockdev/drbd/data-structure-v9.txt
deleted file mode 100644
index 1e52a0e32624..000000000000
--- a/Documentation/blockdev/drbd/data-structure-v9.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-This describes the in kernel data structure for DRBD-9. Starting with
-Linux v3.14 we are reorganizing DRBD to use this data structure.
-
-Basic Data Structure
-====================
-
-A node has a number of DRBD resources. Each such resource has a number of
-devices (aka volumes) and connections to other nodes ("peer nodes"). Each DRBD
-device is represented by a block device locally.
-
-The DRBD objects are interconnected to form a matrix as depicted below; a
-drbd_peer_device object sits at each intersection between a drbd_device and a
-drbd_connection:
-
- /--------------+---------------+.....+---------------\
- | resource | device | | device |
- +--------------+---------------+.....+---------------+
- | connection | peer_device | | peer_device |
- +--------------+---------------+.....+---------------+
- : : : : :
- : : : : :
- +--------------+---------------+.....+---------------+
- | connection | peer_device | | peer_device |
- \--------------+---------------+.....+---------------/
-
-In this table, horizontally, devices can be accessed from resources by their
-volume number. Likewise, peer_devices can be accessed from connections by
-their volume number. Objects in the vertical direction are connected by double
-linked lists. There are back pointers from peer_devices to their connections a
-devices, and from connections and devices to their resource.
-
-All resources are in the drbd_resources double-linked list. In addition, all
-devices can be accessed by their minor device number via the drbd_devices idr.
-
-The drbd_resource, drbd_connection, and drbd_device objects are reference
-counted. The peer_device objects only serve to establish the links between
-devices and connections; their lifetime is determined by the lifetime of the
-device and connection which they reference.
diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot
deleted file mode 100644
index 4a2b00c23547..000000000000
--- a/Documentation/blockdev/drbd/node-states-8.dot
+++ /dev/null
@@ -1,14 +0,0 @@
-digraph node_states {
- Secondary -> Primary [ label = "ioctl_set_state()" ]
- Primary -> Secondary [ label = "ioctl_set_state()" ]
-}
-
-digraph peer_states {
- Secondary -> Primary [ label = "recv state packet" ]
- Primary -> Secondary [ label = "recv state packet" ]
- Primary -> Unknown [ label = "connection lost" ]
- Secondary -> Unknown [ label = "connection lost" ]
- Unknown -> Primary [ label = "connected" ]
- Unknown -> Secondary [ label = "connected" ]
-}
-
diff --git a/Documentation/blockdev/floppy.txt b/Documentation/blockdev/floppy.txt
deleted file mode 100644
index e2240f5ab64d..000000000000
--- a/Documentation/blockdev/floppy.txt
+++ /dev/null
@@ -1,245 +0,0 @@
-This file describes the floppy driver.
-
-FAQ list:
-=========
-
- A FAQ list may be found in the fdutils package (see below), and also
-at <http://fdutils.linux.lu/faq.html>.
-
-
-LILO configuration options (Thinkpad users, read this)
-======================================================
-
- The floppy driver is configured using the 'floppy=' option in
-lilo. This option can be typed at the boot prompt, or entered in the
-lilo configuration file.
-
- Example: If your kernel is called linux-2.6.9, type the following line
-at the lilo boot prompt (if you have a thinkpad):
-
- linux-2.6.9 floppy=thinkpad
-
-You may also enter the following line in /etc/lilo.conf, in the description
-of linux-2.6.9:
-
- append = "floppy=thinkpad"
-
- Several floppy related options may be given, example:
-
- linux-2.6.9 floppy=daring floppy=two_fdc
- append = "floppy=daring floppy=two_fdc"
-
- If you give options both in the lilo config file and on the boot
-prompt, the option strings of both places are concatenated, the boot
-prompt options coming last. That's why there are also options to
-restore the default behavior.
-
-
-Module configuration options
-============================
-
- If you use the floppy driver as a module, use the following syntax:
-modprobe floppy floppy="<options>"
-
-Example:
- modprobe floppy floppy="omnibook messages"
-
- If you need certain options enabled every time you load the floppy driver,
-you can put:
-
- options floppy floppy="omnibook messages"
-
-in a configuration file in /etc/modprobe.d/.
-
-
- The floppy driver related options are:
-
- floppy=asus_pci
- Sets the bit mask to allow only units 0 and 1. (default)
-
- floppy=daring
- Tells the floppy driver that you have a well behaved floppy controller.
- This allows more efficient and smoother operation, but may fail on
- certain controllers. This may speed up certain operations.
-
- floppy=0,daring
- Tells the floppy driver that your floppy controller should be used
- with caution.
-
- floppy=one_fdc
- Tells the floppy driver that you have only one floppy controller.
- (default)
-
- floppy=two_fdc
- floppy=<address>,two_fdc
- Tells the floppy driver that you have two floppy controllers.
- The second floppy controller is assumed to be at <address>.
- This option is not needed if the second controller is at address
- 0x370, and if you use the 'cmos' option.
-
- floppy=thinkpad
- Tells the floppy driver that you have a Thinkpad. Thinkpads use an
- inverted convention for the disk change line.
-
- floppy=0,thinkpad
- Tells the floppy driver that you don't have a Thinkpad.
-
- floppy=omnibook
- floppy=nodma
- Tells the floppy driver not to use Dma for data transfers.
- This is needed on HP Omnibooks, which don't have a workable
- DMA channel for the floppy driver. This option is also useful
- if you frequently get "Unable to allocate DMA memory" messages.
- Indeed, dma memory needs to be continuous in physical memory,
- and is thus harder to find, whereas non-dma buffers may be
- allocated in virtual memory. However, I advise against this if
- you have an FDC without a FIFO (8272A or 82072). 82072A and
- later are OK. You also need at least a 486 to use nodma.
- If you use nodma mode, I suggest you also set the FIFO
- threshold to 10 or lower, in order to limit the number of data
- transfer interrupts.
-
- If you have a FIFO-able FDC, the floppy driver automatically
- falls back on non DMA mode if no DMA-able memory can be found.
- If you want to avoid this, explicitly ask for 'yesdma'.
-
- floppy=yesdma
- Tells the floppy driver that a workable DMA channel is available.
- (default)
-
- floppy=nofifo
- Disables the FIFO entirely. This is needed if you get "Bus
- master arbitration error" messages from your Ethernet card (or
- from other devices) while accessing the floppy.
-
- floppy=usefifo
- Enables the FIFO. (default)
-
- floppy=<threshold>,fifo_depth
- Sets the FIFO threshold. This is mostly relevant in DMA
- mode. If this is higher, the floppy driver tolerates more
- interrupt latency, but it triggers more interrupts (i.e. it
- imposes more load on the rest of the system). If this is
- lower, the interrupt latency should be lower too (faster
- processor). The benefit of a lower threshold is less
- interrupts.
-
- To tune the fifo threshold, switch on over/underrun messages
- using 'floppycontrol --messages'. Then access a floppy
- disk. If you get a huge amount of "Over/Underrun - retrying"
- messages, then the fifo threshold is too low. Try with a
- higher value, until you only get an occasional Over/Underrun.
- It is a good idea to compile the floppy driver as a module
- when doing this tuning. Indeed, it allows to try different
- fifo values without rebooting the machine for each test. Note
- that you need to do 'floppycontrol --messages' every time you
- re-insert the module.
-
- Usually, tuning the fifo threshold should not be needed, as
- the default (0xa) is reasonable.
-
- floppy=<drive>,<type>,cmos
- Sets the CMOS type of <drive> to <type>. This is mandatory if
- you have more than two floppy drives (only two can be
- described in the physical CMOS), or if your BIOS uses
- non-standard CMOS types. The CMOS types are:
-
- 0 - Use the value of the physical CMOS
- 1 - 5 1/4 DD
- 2 - 5 1/4 HD
- 3 - 3 1/2 DD
- 4 - 3 1/2 HD
- 5 - 3 1/2 ED
- 6 - 3 1/2 ED
- 16 - unknown or not installed
-
- (Note: there are two valid types for ED drives. This is because 5 was
- initially chosen to represent floppy *tapes*, and 6 for ED drives.
- AMI ignored this, and used 5 for ED drives. That's why the floppy
- driver handles both.)
-
- floppy=unexpected_interrupts
- Print a warning message when an unexpected interrupt is received.
- (default)
-
- floppy=no_unexpected_interrupts
- floppy=L40SX
- Don't print a message when an unexpected interrupt is received. This
- is needed on IBM L40SX laptops in certain video modes. (There seems
- to be an interaction between video and floppy. The unexpected
- interrupts affect only performance, and can be safely ignored.)
-
- floppy=broken_dcl
- Don't use the disk change line, but assume that the disk was
- changed whenever the device node is reopened. Needed on some
- boxes where the disk change line is broken or unsupported.
- This should be regarded as a stopgap measure, indeed it makes
- floppy operation less efficient due to unneeded cache
- flushings, and slightly more unreliable. Please verify your
- cable, connection and jumper settings if you have any DCL
- problems. However, some older drives, and also some laptops
- are known not to have a DCL.
-
- floppy=debug
- Print debugging messages.
-
- floppy=messages
- Print informational messages for some operations (disk change
- notifications, warnings about over and underruns, and about
- autodetection).
-
- floppy=silent_dcl_clear
- Uses a less noisy way to clear the disk change line (which
- doesn't involve seeks). Implied by 'daring' option.
-
- floppy=<nr>,irq
- Sets the floppy IRQ to <nr> instead of 6.
-
- floppy=<nr>,dma
- Sets the floppy DMA channel to <nr> instead of 2.
-
- floppy=slow
- Use PS/2 stepping rate:
- " PS/2 floppies have much slower step rates than regular floppies.
- It's been recommended that take about 1/4 of the default speed
- in some more extreme cases."
-
-
-Supporting utilities and additional documentation:
-==================================================
-
- Additional parameters of the floppy driver can be configured at
-runtime. Utilities which do this can be found in the fdutils package.
-This package also contains a new version of mtools which allows to
-access high capacity disks (up to 1992K on a high density 3 1/2 disk!).
-It also contains additional documentation about the floppy driver.
-
-The latest version can be found at fdutils homepage:
- http://fdutils.linux.lu
-
-The fdutils releases can be found at:
- http://fdutils.linux.lu/download.html
- http://www.tux.org/pub/knaff/fdutils/
- ftp://metalab.unc.edu/pub/Linux/utils/disk-management/
-
-Reporting problems about the floppy driver
-==========================================
-
- If you have a question or a bug report about the floppy driver, mail
-me at Alain.Knaff@poboxes.com . If you post to Usenet, preferably use
-comp.os.linux.hardware. As the volume in these groups is rather high,
-be sure to include the word "floppy" (or "FLOPPY") in the subject
-line. If the reported problem happens when mounting floppy disks, be
-sure to mention also the type of the filesystem in the subject line.
-
- Be sure to read the FAQ before mailing/posting any bug reports!
-
- Alain
-
-Changelog
-=========
-
-10-30-2004 : Cleanup, updating, add reference to module configuration.
- James Nelson <james4765@gmail.com>
-
-6-3-2000 : Original Document
diff --git a/Documentation/blockdev/nbd.txt b/Documentation/blockdev/nbd.txt
deleted file mode 100644
index db242ea2bce8..000000000000
--- a/Documentation/blockdev/nbd.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-Network Block Device (TCP version)
-==================================
-
-1) Overview
------------
-
-What is it: With this compiled in the kernel (or as a module), Linux
-can use a remote server as one of its block devices. So every time
-the client computer wants to read, e.g., /dev/nb0, it sends a
-request over TCP to the server, which will reply with the data read.
-This can be used for stations with low disk space (or even diskless)
-to borrow disk space from another computer.
-Unlike NFS, it is possible to put any filesystem on it, etc.
-
-For more information, or to download the nbd-client and nbd-server
-tools, go to http://nbd.sf.net/.
-
-The nbd kernel module need only be installed on the client
-system, as the nbd-server is completely in userspace. In fact,
-the nbd-server has been successfully ported to other operating
-systems, including Windows.
-
-A) NBD parameters
------------------
-
-max_part
- Number of partitions per device (default: 0).
-
-nbds_max
- Number of block devices that should be initialized (default: 16).
-
diff --git a/Documentation/blockdev/paride.txt b/Documentation/blockdev/paride.txt
deleted file mode 100644
index ee6717e3771d..000000000000
--- a/Documentation/blockdev/paride.txt
+++ /dev/null
@@ -1,417 +0,0 @@
-
- Linux and parallel port IDE devices
-
-PARIDE v1.03 (c) 1997-8 Grant Guenther <grant@torque.net>
-
-1. Introduction
-
-Owing to the simplicity and near universality of the parallel port interface
-to personal computers, many external devices such as portable hard-disk,
-CD-ROM, LS-120 and tape drives use the parallel port to connect to their
-host computer. While some devices (notably scanners) use ad-hoc methods
-to pass commands and data through the parallel port interface, most
-external devices are actually identical to an internal model, but with
-a parallel-port adapter chip added in. Some of the original parallel port
-adapters were little more than mechanisms for multiplexing a SCSI bus.
-(The Iomega PPA-3 adapter used in the ZIP drives is an example of this
-approach). Most current designs, however, take a different approach.
-The adapter chip reproduces a small ISA or IDE bus in the external device
-and the communication protocol provides operations for reading and writing
-device registers, as well as data block transfer functions. Sometimes,
-the device being addressed via the parallel cable is a standard SCSI
-controller like an NCR 5380. The "ditto" family of external tape
-drives use the ISA replicator to interface a floppy disk controller,
-which is then connected to a floppy-tape mechanism. The vast majority
-of external parallel port devices, however, are now based on standard
-IDE type devices, which require no intermediate controller. If one
-were to open up a parallel port CD-ROM drive, for instance, one would
-find a standard ATAPI CD-ROM drive, a power supply, and a single adapter
-that interconnected a standard PC parallel port cable and a standard
-IDE cable. It is usually possible to exchange the CD-ROM device with
-any other device using the IDE interface.
-
-The document describes the support in Linux for parallel port IDE
-devices. It does not cover parallel port SCSI devices, "ditto" tape
-drives or scanners. Many different devices are supported by the
-parallel port IDE subsystem, including:
-
- MicroSolutions backpack CD-ROM
- MicroSolutions backpack PD/CD
- MicroSolutions backpack hard-drives
- MicroSolutions backpack 8000t tape drive
- SyQuest EZ-135, EZ-230 & SparQ drives
- Avatar Shark
- Imation Superdisk LS-120
- Maxell Superdisk LS-120
- FreeCom Power CD
- Hewlett-Packard 5GB and 8GB tape drives
- Hewlett-Packard 7100 and 7200 CD-RW drives
-
-as well as most of the clone and no-name products on the market.
-
-To support such a wide range of devices, PARIDE, the parallel port IDE
-subsystem, is actually structured in three parts. There is a base
-paride module which provides a registry and some common methods for
-accessing the parallel ports. The second component is a set of
-high-level drivers for each of the different types of supported devices:
-
- pd IDE disk
- pcd ATAPI CD-ROM
- pf ATAPI disk
- pt ATAPI tape
- pg ATAPI generic
-
-(Currently, the pg driver is only used with CD-R drives).
-
-The high-level drivers function according to the relevant standards.
-The third component of PARIDE is a set of low-level protocol drivers
-for each of the parallel port IDE adapter chips. Thanks to the interest
-and encouragement of Linux users from many parts of the world,
-support is available for almost all known adapter protocols:
-
- aten ATEN EH-100 (HK)
- bpck Microsolutions backpack (US)
- comm DataStor (old-type) "commuter" adapter (TW)
- dstr DataStor EP-2000 (TW)
- epat Shuttle EPAT (UK)
- epia Shuttle EPIA (UK)
- fit2 FIT TD-2000 (US)
- fit3 FIT TD-3000 (US)
- friq Freecom IQ cable (DE)
- frpw Freecom Power (DE)
- kbic KingByte KBIC-951A and KBIC-971A (TW)
- ktti KT Technology PHd adapter (SG)
- on20 OnSpec 90c20 (US)
- on26 OnSpec 90c26 (US)
-
-
-2. Using the PARIDE subsystem
-
-While configuring the Linux kernel, you may choose either to build
-the PARIDE drivers into your kernel, or to build them as modules.
-
-In either case, you will need to select "Parallel port IDE device support"
-as well as at least one of the high-level drivers and at least one
-of the parallel port communication protocols. If you do not know
-what kind of parallel port adapter is used in your drive, you could
-begin by checking the file names and any text files on your DOS
-installation floppy. Alternatively, you can look at the markings on
-the adapter chip itself. That's usually sufficient to identify the
-correct device.
-
-You can actually select all the protocol modules, and allow the PARIDE
-subsystem to try them all for you.
-
-For the "brand-name" products listed above, here are the protocol
-and high-level drivers that you would use:
-
- Manufacturer Model Driver Protocol
-
- MicroSolutions CD-ROM pcd bpck
- MicroSolutions PD drive pf bpck
- MicroSolutions hard-drive pd bpck
- MicroSolutions 8000t tape pt bpck
- SyQuest EZ, SparQ pd epat
- Imation Superdisk pf epat
- Maxell Superdisk pf friq
- Avatar Shark pd epat
- FreeCom CD-ROM pcd frpw
- Hewlett-Packard 5GB Tape pt epat
- Hewlett-Packard 7200e (CD) pcd epat
- Hewlett-Packard 7200e (CD-R) pg epat
-
-2.1 Configuring built-in drivers
-
-We recommend that you get to know how the drivers work and how to
-configure them as loadable modules, before attempting to compile a
-kernel with the drivers built-in.
-
-If you built all of your PARIDE support directly into your kernel,
-and you have just a single parallel port IDE device, your kernel should
-locate it automatically for you. If you have more than one device,
-you may need to give some command line options to your bootloader
-(eg: LILO), how to do that is beyond the scope of this document.
-
-The high-level drivers accept a number of command line parameters, all
-of which are documented in the source files in linux/drivers/block/paride.
-By default, each driver will automatically try all parallel ports it
-can find, and all protocol types that have been installed, until it finds
-a parallel port IDE adapter. Once it finds one, the probe stops. So,
-if you have more than one device, you will need to tell the drivers
-how to identify them. This requires specifying the port address, the
-protocol identification number and, for some devices, the drive's
-chain ID. While your system is booting, a number of messages are
-displayed on the console. Like all such messages, they can be
-reviewed with the 'dmesg' command. Among those messages will be
-some lines like:
-
- paride: bpck registered as protocol 0
- paride: epat registered as protocol 1
-
-The numbers will always be the same until you build a new kernel with
-different protocol selections. You should note these numbers as you
-will need them to identify the devices.
-
-If you happen to be using a MicroSolutions backpack device, you will
-also need to know the unit ID number for each drive. This is usually
-the last two digits of the drive's serial number (but read MicroSolutions'
-documentation about this).
-
-As an example, let's assume that you have a MicroSolutions PD/CD drive
-with unit ID number 36 connected to the parallel port at 0x378, a SyQuest
-EZ-135 connected to the chained port on the PD/CD drive and also an
-Imation Superdisk connected to port 0x278. You could give the following
-options on your boot command:
-
- pd.drive0=0x378,1 pf.drive0=0x278,1 pf.drive1=0x378,0,36
-
-In the last option, pf.drive1 configures device /dev/pf1, the 0x378
-is the parallel port base address, the 0 is the protocol registration
-number and 36 is the chain ID.
-
-Please note: while PARIDE will work both with and without the
-PARPORT parallel port sharing system that is included by the
-"Parallel port support" option, PARPORT must be included and enabled
-if you want to use chains of devices on the same parallel port.
-
-2.2 Loading and configuring PARIDE as modules
-
-It is much faster and simpler to get to understand the PARIDE drivers
-if you use them as loadable kernel modules.
-
-Note 1: using these drivers with the "kerneld" automatic module loading
-system is not recommended for beginners, and is not documented here.
-
-Note 2: if you build PARPORT support as a loadable module, PARIDE must
-also be built as loadable modules, and PARPORT must be loaded before the
-PARIDE modules.
-
-To use PARIDE, you must begin by
-
- insmod paride
-
-this loads a base module which provides a registry for the protocols,
-among other tasks.
-
-Then, load as many of the protocol modules as you think you might need.
-As you load each module, it will register the protocols that it supports,
-and print a log message to your kernel log file and your console. For
-example:
-
- # insmod epat
- paride: epat registered as protocol 0
- # insmod kbic
- paride: k951 registered as protocol 1
- paride: k971 registered as protocol 2
-
-Finally, you can load high-level drivers for each kind of device that
-you have connected. By default, each driver will autoprobe for a single
-device, but you can support up to four similar devices by giving their
-individual co-ordinates when you load the driver.
-
-For example, if you had two no-name CD-ROM drives both using the
-KingByte KBIC-951A adapter, one on port 0x378 and the other on 0x3bc
-you could give the following command:
-
- # insmod pcd drive0=0x378,1 drive1=0x3bc,1
-
-For most adapters, giving a port address and protocol number is sufficient,
-but check the source files in linux/drivers/block/paride for more
-information. (Hopefully someone will write some man pages one day !).
-
-As another example, here's what happens when PARPORT is installed, and
-a SyQuest EZ-135 is attached to port 0x378:
-
- # insmod paride
- paride: version 1.0 installed
- # insmod epat
- paride: epat registered as protocol 0
- # insmod pd
- pd: pd version 1.0, major 45, cluster 64, nice 0
- pda: Sharing parport1 at 0x378
- pda: epat 1.0, Shuttle EPAT chip c3 at 0x378, mode 5 (EPP-32), delay 1
- pda: SyQuest EZ135A, 262144 blocks [128M], (512/16/32), removable media
- pda: pda1
-
-Note that the last line is the output from the generic partition table
-scanner - in this case it reports that it has found a disk with one partition.
-
-2.3 Using a PARIDE device
-
-Once the drivers have been loaded, you can access PARIDE devices in the
-same way as their traditional counterparts. You will probably need to
-create the device "special files". Here is a simple script that you can
-cut to a file and execute:
-
-#!/bin/bash
-#
-# mkd -- a script to create the device special files for the PARIDE subsystem
-#
-function mkdev {
- mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
-}
-#
-function pd {
- D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
- mkdev pd$D b 45 $[ $1 * 16 ]
- for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
- do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
- done
-}
-#
-cd /dev
-#
-for u in 0 1 2 3 ; do pd $u ; done
-for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done
-for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done
-for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done
-for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done
-for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done
-#
-# end of mkd
-
-With the device files and drivers in place, you can access PARIDE devices
-like any other Linux device. For example, to mount a CD-ROM in pcd0, use:
-
- mount /dev/pcd0 /cdrom
-
-If you have a fresh Avatar Shark cartridge, and the drive is pda, you
-might do something like:
-
- fdisk /dev/pda -- make a new partition table with
- partition 1 of type 83
-
- mke2fs /dev/pda1 -- to build the file system
-
- mkdir /shark -- make a place to mount the disk
-
- mount /dev/pda1 /shark
-
-Devices like the Imation superdisk work in the same way, except that
-they do not have a partition table. For example to make a 120MB
-floppy that you could share with a DOS system:
-
- mkdosfs /dev/pf0
- mount /dev/pf0 /mnt
-
-
-2.4 The pf driver
-
-The pf driver is intended for use with parallel port ATAPI disk
-devices. The most common devices in this category are PD drives
-and LS-120 drives. Traditionally, media for these devices are not
-partitioned. Consequently, the pf driver does not support partitioned
-media. This may be changed in a future version of the driver.
-
-2.5 Using the pt driver
-
-The pt driver for parallel port ATAPI tape drives is a minimal driver.
-It does not yet support many of the standard tape ioctl operations.
-For best performance, a block size of 32KB should be used. You will
-probably want to set the parallel port delay to 0, if you can.
-
-2.6 Using the pg driver
-
-The pg driver can be used in conjunction with the cdrecord program
-to create CD-ROMs. Please get cdrecord version 1.6.1 or later
-from ftp://ftp.fokus.gmd.de/pub/unix/cdrecord/ . To record CD-R media
-your parallel port should ideally be set to EPP mode, and the "port delay"
-should be set to 0. With those settings it is possible to record at 2x
-speed without any buffer underruns. If you cannot get the driver to work
-in EPP mode, try to use "bidirectional" or "PS/2" mode and 1x speeds only.
-
-
-3. Troubleshooting
-
-3.1 Use EPP mode if you can
-
-The most common problems that people report with the PARIDE drivers
-concern the parallel port CMOS settings. At this time, none of the
-PARIDE protocol modules support ECP mode, or any ECP combination modes.
-If you are able to do so, please set your parallel port into EPP mode
-using your CMOS setup procedure.
-
-3.2 Check the port delay
-
-Some parallel ports cannot reliably transfer data at full speed. To
-offset the errors, the PARIDE protocol modules introduce a "port
-delay" between each access to the i/o ports. Each protocol sets
-a default value for this delay. In most cases, the user can override
-the default and set it to 0 - resulting in somewhat higher transfer
-rates. In some rare cases (especially with older 486 systems) the
-default delays are not long enough. if you experience corrupt data
-transfers, or unexpected failures, you may wish to increase the
-port delay. The delay can be programmed using the "driveN" parameters
-to each of the high-level drivers. Please see the notes above, or
-read the comments at the beginning of the driver source files in
-linux/drivers/block/paride.
-
-3.3 Some drives need a printer reset
-
-There appear to be a number of "noname" external drives on the market
-that do not always power up correctly. We have noticed this with some
-drives based on OnSpec and older Freecom adapters. In these rare cases,
-the adapter can often be reinitialised by issuing a "printer reset" on
-the parallel port. As the reset operation is potentially disruptive in
-multiple device environments, the PARIDE drivers will not do it
-automatically. You can however, force a printer reset by doing:
-
- insmod lp reset=1
- rmmod lp
-
-If you have one of these marginal cases, you should probably build
-your paride drivers as modules, and arrange to do the printer reset
-before loading the PARIDE drivers.
-
-3.4 Use the verbose option and dmesg if you need help
-
-While a lot of testing has gone into these drivers to make them work
-as smoothly as possible, problems will arise. If you do have problems,
-please check all the obvious things first: does the drive work in
-DOS with the manufacturer's drivers ? If that doesn't yield any useful
-clues, then please make sure that only one drive is hooked to your system,
-and that either (a) PARPORT is enabled or (b) no other device driver
-is using your parallel port (check in /proc/ioports). Then, load the
-appropriate drivers (you can load several protocol modules if you want)
-as in:
-
- # insmod paride
- # insmod epat
- # insmod bpck
- # insmod kbic
- ...
- # insmod pd verbose=1
-
-(using the correct driver for the type of device you have, of course).
-The verbose=1 parameter will cause the drivers to log a trace of their
-activity as they attempt to locate your drive.
-
-Use 'dmesg' to capture a log of all the PARIDE messages (any messages
-beginning with paride:, a protocol module's name or a driver's name) and
-include that with your bug report. You can submit a bug report in one
-of two ways. Either send it directly to the author of the PARIDE suite,
-by e-mail to grant@torque.net, or join the linux-parport mailing list
-and post your report there.
-
-3.5 For more information or help
-
-You can join the linux-parport mailing list by sending a mail message
-to
- linux-parport-request@torque.net
-
-with the single word
-
- subscribe
-
-in the body of the mail message (not in the subject line). Please be
-sure that your mail program is correctly set up when you do this, as
-the list manager is a robot that will subscribe you using the reply
-address in your mail headers. REMOVE any anti-spam gimmicks you may
-have in your mail headers, when sending mail to the list server.
-
-You might also find some useful information on the linux-parport
-web pages (although they are not always up to date) at
-
- http://web.archive.org/web/*/http://www.torque.net/parport/
-
-
diff --git a/Documentation/blockdev/ramdisk.txt b/Documentation/blockdev/ramdisk.txt
deleted file mode 100644
index 501e12e0323e..000000000000
--- a/Documentation/blockdev/ramdisk.txt
+++ /dev/null
@@ -1,174 +0,0 @@
-Using the RAM disk block device with Linux
-------------------------------------------
-
-Contents:
-
- 1) Overview
- 2) Kernel Command Line Parameters
- 3) Using "rdev -r"
- 4) An Example of Creating a Compressed RAM Disk
-
-
-1) Overview
------------
-
-The RAM disk driver is a way to use main system memory as a block device. It
-is required for initrd, an initial filesystem used if you need to load modules
-in order to access the root filesystem (see Documentation/admin-guide/initrd.rst). It can
-also be used for a temporary filesystem for crypto work, since the contents
-are erased on reboot.
-
-The RAM disk dynamically grows as more space is required. It does this by using
-RAM from the buffer cache. The driver marks the buffers it is using as dirty
-so that the VM subsystem does not try to reclaim them later.
-
-The RAM disk supports up to 16 RAM disks by default, and can be reconfigured
-to support an unlimited number of RAM disks (at your own risk). Just change
-the configuration symbol BLK_DEV_RAM_COUNT in the Block drivers config menu
-and (re)build the kernel.
-
-To use RAM disk support with your system, run './MAKEDEV ram' from the /dev
-directory. RAM disks are all major number 1, and start with minor number 0
-for /dev/ram0, etc. If used, modern kernels use /dev/ram0 for an initrd.
-
-The new RAM disk also has the ability to load compressed RAM disk images,
-allowing one to squeeze more programs onto an average installation or
-rescue floppy disk.
-
-
-2) Parameters
----------------------------------
-
-2a) Kernel Command Line Parameters
-
- ramdisk_size=N
- ==============
-
-This parameter tells the RAM disk driver to set up RAM disks of N k size. The
-default is 4096 (4 MB).
-
-2b) Module parameters
-
- rd_nr
- =====
- /dev/ramX devices created.
-
- max_part
- ========
- Maximum partition number.
-
- rd_size
- =======
- See ramdisk_size.
-
-3) Using "rdev -r"
-------------------
-
-The usage of the word (two bytes) that "rdev -r" sets in the kernel image is
-as follows. The low 11 bits (0 -> 10) specify an offset (in 1 k blocks) of up
-to 2 MB (2^11) of where to find the RAM disk (this used to be the size). Bit
-14 indicates that a RAM disk is to be loaded, and bit 15 indicates whether a
-prompt/wait sequence is to be given before trying to read the RAM disk. Since
-the RAM disk dynamically grows as data is being written into it, a size field
-is not required. Bits 11 to 13 are not currently used and may as well be zero.
-These numbers are no magical secrets, as seen below:
-
-./arch/x86/kernel/setup.c:#define RAMDISK_IMAGE_START_MASK 0x07FF
-./arch/x86/kernel/setup.c:#define RAMDISK_PROMPT_FLAG 0x8000
-./arch/x86/kernel/setup.c:#define RAMDISK_LOAD_FLAG 0x4000
-
-Consider a typical two floppy disk setup, where you will have the
-kernel on disk one, and have already put a RAM disk image onto disk #2.
-
-Hence you want to set bits 0 to 13 as 0, meaning that your RAM disk
-starts at an offset of 0 kB from the beginning of the floppy.
-The command line equivalent is: "ramdisk_start=0"
-
-You want bit 14 as one, indicating that a RAM disk is to be loaded.
-The command line equivalent is: "load_ramdisk=1"
-
-You want bit 15 as one, indicating that you want a prompt/keypress
-sequence so that you have a chance to switch floppy disks.
-The command line equivalent is: "prompt_ramdisk=1"
-
-Putting that together gives 2^15 + 2^14 + 0 = 49152 for an rdev word.
-So to create disk one of the set, you would do:
-
- /usr/src/linux# cat arch/x86/boot/zImage > /dev/fd0
- /usr/src/linux# rdev /dev/fd0 /dev/fd0
- /usr/src/linux# rdev -r /dev/fd0 49152
-
-If you make a boot disk that has LILO, then for the above, you would use:
- append = "ramdisk_start=0 load_ramdisk=1 prompt_ramdisk=1"
-Since the default start = 0 and the default prompt = 1, you could use:
- append = "load_ramdisk=1"
-
-
-4) An Example of Creating a Compressed RAM Disk
-----------------------------------------------
-
-To create a RAM disk image, you will need a spare block device to
-construct it on. This can be the RAM disk device itself, or an
-unused disk partition (such as an unmounted swap partition). For this
-example, we will use the RAM disk device, "/dev/ram0".
-
-Note: This technique should not be done on a machine with less than 8 MB
-of RAM. If using a spare disk partition instead of /dev/ram0, then this
-restriction does not apply.
-
-a) Decide on the RAM disk size that you want. Say 2 MB for this example.
- Create it by writing to the RAM disk device. (This step is not currently
- required, but may be in the future.) It is wise to zero out the
- area (esp. for disks) so that maximal compression is achieved for
- the unused blocks of the image that you are about to create.
-
- dd if=/dev/zero of=/dev/ram0 bs=1k count=2048
-
-b) Make a filesystem on it. Say ext2fs for this example.
-
- mke2fs -vm0 /dev/ram0 2048
-
-c) Mount it, copy the files you want to it (eg: /etc/* /dev/* ...)
- and unmount it again.
-
-d) Compress the contents of the RAM disk. The level of compression
- will be approximately 50% of the space used by the files. Unused
- space on the RAM disk will compress to almost nothing.
-
- dd if=/dev/ram0 bs=1k count=2048 | gzip -v9 > /tmp/ram_image.gz
-
-e) Put the kernel onto the floppy
-
- dd if=zImage of=/dev/fd0 bs=1k
-
-f) Put the RAM disk image onto the floppy, after the kernel. Use an offset
- that is slightly larger than the kernel, so that you can put another
- (possibly larger) kernel onto the same floppy later without overlapping
- the RAM disk image. An offset of 400 kB for kernels about 350 kB in
- size would be reasonable. Make sure offset+size of ram_image.gz is
- not larger than the total space on your floppy (usually 1440 kB).
-
- dd if=/tmp/ram_image.gz of=/dev/fd0 bs=1k seek=400
-
-g) Use "rdev" to set the boot device, RAM disk offset, prompt flag, etc.
- For prompt_ramdisk=1, load_ramdisk=1, ramdisk_start=400, one would
- have 2^15 + 2^14 + 400 = 49552.
-
- rdev /dev/fd0 /dev/fd0
- rdev -r /dev/fd0 49552
-
-That is it. You now have your boot/root compressed RAM disk floppy. Some
-users may wish to combine steps (d) and (f) by using a pipe.
-
---------------------------------------------------------------------------
- Paul Gortmaker 12/95
-
-Changelog:
-----------
-
-10-22-04 : Updated to reflect changes in command line options, remove
- obsolete references, general cleanup.
- James Nelson (james4765@gmail.com)
-
-
-12-95 : Original Document
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
deleted file mode 100644
index 4df0ce271085..000000000000
--- a/Documentation/blockdev/zram.txt
+++ /dev/null
@@ -1,355 +0,0 @@
-zram: Compressed RAM based block devices
-----------------------------------------
-
-* Introduction
-
-The zram module creates RAM based block devices named /dev/zram<id>
-(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
-in memory itself. These disks allow very fast I/O and compression provides
-good amounts of memory savings. Some of the usecases include /tmp storage,
-use as swap disks, various caches under /var and maybe many more :)
-
-Statistics for individual zram devices are exported through sysfs nodes at
-/sys/block/zram<id>/
-
-* Usage
-
-There are several ways to configure and manage zram device(-s):
-a) using zram and zram_control sysfs attributes
-b) using zramctl utility, provided by util-linux (util-linux@vger.kernel.org).
-
-In this document we will describe only 'manual' zram configuration steps,
-IOW, zram and zram_control sysfs attributes.
-
-In order to get a better idea about zramctl please consult util-linux
-documentation, zramctl man-page or `zramctl --help'. Please be informed
-that zram maintainers do not develop/maintain util-linux or zramctl, should
-you have any questions please contact util-linux@vger.kernel.org
-
-Following shows a typical sequence of steps for using zram.
-
-WARNING
-=======
-For the sake of simplicity we skip error checking parts in most of the
-examples below. However, it is your sole responsibility to handle errors.
-
-zram sysfs attributes always return negative values in case of errors.
-The list of possible return codes:
--EBUSY -- an attempt to modify an attribute that cannot be changed once
-the device has been initialised. Please reset device first;
--ENOMEM -- zram was not able to allocate enough memory to fulfil your
-needs;
--EINVAL -- invalid input has been provided.
-
-If you use 'echo', the returned value that is changed by 'echo' utility,
-and, in general case, something like:
-
- echo 3 > /sys/block/zram0/max_comp_streams
- if [ $? -ne 0 ];
- handle_error
- fi
-
-should suffice.
-
-1) Load Module:
- modprobe zram num_devices=4
- This creates 4 devices: /dev/zram{0,1,2,3}
-
-num_devices parameter is optional and tells zram how many devices should be
-pre-created. Default: 1.
-
-2) Set max number of compression streams
-Regardless the value passed to this attribute, ZRAM will always
-allocate multiple compression streams - one per online CPUs - thus
-allowing several concurrent compression operations. The number of
-allocated compression streams goes down when some of the CPUs
-become offline. There is no single-compression-stream mode anymore,
-unless you are running a UP system or has only 1 CPU online.
-
-To find out how many streams are currently available:
- cat /sys/block/zram0/max_comp_streams
-
-3) Select compression algorithm
-Using comp_algorithm device attribute one can see available and
-currently selected (shown in square brackets) compression algorithms,
-change selected compression algorithm (once the device is initialised
-there is no way to change compression algorithm).
-
-Examples:
- #show supported compression algorithms
- cat /sys/block/zram0/comp_algorithm
- lzo [lz4]
-
- #select lzo compression algorithm
- echo lzo > /sys/block/zram0/comp_algorithm
-
-For the time being, the `comp_algorithm' content does not necessarily
-show every compression algorithm supported by the kernel. We keep this
-list primarily to simplify device configuration and one can configure
-a new device with a compression algorithm that is not listed in
-`comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API
-and, if some of the algorithms were built as modules, it's impossible
-to list all of them using, for instance, /proc/crypto or any other
-method. This, however, has an advantage of permitting the usage of
-custom crypto compression modules (implementing S/W or H/W compression).
-
-4) Set Disksize
-Set disk size by writing the value to sysfs node 'disksize'.
-The value can be either in bytes or you can use mem suffixes.
-Examples:
- # Initialize /dev/zram0 with 50MB disksize
- echo $((50*1024*1024)) > /sys/block/zram0/disksize
-
- # Using mem suffixes
- echo 256K > /sys/block/zram0/disksize
- echo 512M > /sys/block/zram0/disksize
- echo 1G > /sys/block/zram0/disksize
-
-Note:
-There is little point creating a zram of greater than twice the size of memory
-since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
-size of the disk when not in use so a huge zram is wasteful.
-
-5) Set memory limit: Optional
-Set memory limit by writing the value to sysfs node 'mem_limit'.
-The value can be either in bytes or you can use mem suffixes.
-In addition, you could change the value in runtime.
-Examples:
- # limit /dev/zram0 with 50MB memory
- echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
-
- # Using mem suffixes
- echo 256K > /sys/block/zram0/mem_limit
- echo 512M > /sys/block/zram0/mem_limit
- echo 1G > /sys/block/zram0/mem_limit
-
- # To disable memory limit
- echo 0 > /sys/block/zram0/mem_limit
-
-6) Activate:
- mkswap /dev/zram0
- swapon /dev/zram0
-
- mkfs.ext4 /dev/zram1
- mount /dev/zram1 /tmp
-
-7) Add/remove zram devices
-
-zram provides a control interface, which enables dynamic (on-demand) device
-addition and removal.
-
-In order to add a new /dev/zramX device, perform read operation on hot_add
-attribute. This will return either new device's device id (meaning that you
-can use /dev/zram<id>) or error code.
-
-Example:
- cat /sys/class/zram-control/hot_add
- 1
-
-To remove the existing /dev/zramX device (where X is a device id)
-execute
- echo X > /sys/class/zram-control/hot_remove
-
-8) Stats:
-Per-device statistics are exported as various nodes under /sys/block/zram<id>/
-
-A brief description of exported device attributes. For more details please
-read Documentation/ABI/testing/sysfs-block-zram.
-
-Name access description
----- ------ -----------
-disksize RW show and set the device's disk size
-initstate RO shows the initialization state of the device
-reset WO trigger device reset
-mem_used_max WO reset the `mem_used_max' counter (see later)
-mem_limit WO specifies the maximum amount of memory ZRAM can use
- to store the compressed data
-writeback_limit WO specifies the maximum amount of write IO zram can
- write out to backing device as 4KB unit
-writeback_limit_enable RW show and set writeback_limit feature
-max_comp_streams RW the number of possible concurrent compress operations
-comp_algorithm RW show and change the compression algorithm
-compact WO trigger memory compaction
-debug_stat RO this file is used for zram debugging purposes
-backing_dev RW set up backend storage for zram to write out
-idle WO mark allocated slot as idle
-
-
-User space is advised to use the following files to read the device statistics.
-
-File /sys/block/zram<id>/stat
-
-Represents block layer statistics. Read Documentation/block/stat.txt for
-details.
-
-File /sys/block/zram<id>/io_stat
-
-The stat file represents device's I/O statistics not accounted by block
-layer and, thus, not available in zram<id>/stat file. It consists of a
-single line of text and contains the following stats separated by
-whitespace:
- failed_reads the number of failed reads
- failed_writes the number of failed writes
- invalid_io the number of non-page-size-aligned I/O requests
- notify_free Depending on device usage scenario it may account
- a) the number of pages freed because of swap slot free
- notifications or b) the number of pages freed because of
- REQ_OP_DISCARD requests sent by bio. The former ones are
- sent to a swap block device when a swap slot is freed,
- which implies that this disk is being used as a swap disk.
- The latter ones are sent by filesystem mounted with
- discard option, whenever some data blocks are getting
- discarded.
-
-File /sys/block/zram<id>/mm_stat
-
-The stat file represents device's mm statistics. It consists of a single
-line of text and contains the following stats separated by whitespace:
- orig_data_size uncompressed size of data stored in this disk.
- This excludes same-element-filled pages (same_pages) since
- no memory is allocated for them.
- Unit: bytes
- compr_data_size compressed size of data stored in this disk
- mem_used_total the amount of memory allocated for this disk. This
- includes allocator fragmentation and metadata overhead,
- allocated for this disk. So, allocator space efficiency
- can be calculated using compr_data_size and this statistic.
- Unit: bytes
- mem_limit the maximum amount of memory ZRAM can use to store
- the compressed data
- mem_used_max the maximum amount of memory zram have consumed to
- store the data
- same_pages the number of same element filled pages written to this disk.
- No memory is allocated for such pages.
- pages_compacted the number of pages freed during compaction
- huge_pages the number of incompressible pages
-
-File /sys/block/zram<id>/bd_stat
-
-The stat file represents device's backing device statistics. It consists of
-a single line of text and contains the following stats separated by whitespace:
- bd_count size of data written in backing device.
- Unit: 4K bytes
- bd_reads the number of reads from backing device
- Unit: 4K bytes
- bd_writes the number of writes to backing device
- Unit: 4K bytes
-
-9) Deactivate:
- swapoff /dev/zram0
- umount /dev/zram1
-
-10) Reset:
- Write any positive value to 'reset' sysfs node
- echo 1 > /sys/block/zram0/reset
- echo 1 > /sys/block/zram1/reset
-
- This frees all the memory allocated for the given device and
- resets the disksize to zero. You must set the disksize again
- before reusing the device.
-
-* Optional Feature
-
-= writeback
-
-With CONFIG_ZRAM_WRITEBACK, zram can write idle/incompressible page
-to backing storage rather than keeping it in memory.
-To use the feature, admin should set up backing device via
-
- "echo /dev/sda5 > /sys/block/zramX/backing_dev"
-
-before disksize setting. It supports only partition at this moment.
-If admin want to use incompressible page writeback, they could do via
-
- "echo huge > /sys/block/zramX/write"
-
-To use idle page writeback, first, user need to declare zram pages
-as idle.
-
- "echo all > /sys/block/zramX/idle"
-
-From now on, any pages on zram are idle pages. The idle mark
-will be removed until someone request access of the block.
-IOW, unless there is access request, those pages are still idle pages.
-
-Admin can request writeback of those idle pages at right timing via
-
- "echo idle > /sys/block/zramX/writeback"
-
-With the command, zram writeback idle pages from memory to the storage.
-
-If there are lots of write IO with flash device, potentially, it has
-flash wearout problem so that admin needs to design write limitation
-to guarantee storage health for entire product life.
-
-To overcome the concern, zram supports "writeback_limit" feature.
-The "writeback_limit_enable"'s default value is 0 so that it doesn't limit
-any writeback. IOW, if admin want to apply writeback budget, he should
-enable writeback_limit_enable via
-
- $ echo 1 > /sys/block/zramX/writeback_limit_enable
-
-Once writeback_limit_enable is set, zram doesn't allow any writeback
-until admin set the budget via /sys/block/zramX/writeback_limit.
-
-(If admin doesn't enable writeback_limit_enable, writeback_limit's value
-assigned via /sys/block/zramX/writeback_limit is meaninless.)
-
-If admin want to limit writeback as per-day 400M, he could do it
-like below.
-
- $ MB_SHIFT=20
- $ 4K_SHIFT=12
- $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
- /sys/block/zram0/writeback_limit.
- $ echo 1 > /sys/block/zram0/writeback_limit_enable
-
-If admin want to allow further write again once the bugdet is exausted,
-he could do it like below
-
- $ echo $((400<<MB_SHIFT>>4K_SHIFT)) > \
- /sys/block/zram0/writeback_limit
-
-If admin want to see remaining writeback budget since he set,
-
- $ cat /sys/block/zramX/writeback_limit
-
-If admin want to disable writeback limit, he could do
-
- $ echo 0 > /sys/block/zramX/writeback_limit_enable
-
-The writeback_limit count will reset whenever you reset zram(e.g.,
-system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of
-writeback happened until you reset the zram to allocate extra writeback
-budget in next setting is user's job.
-
-If admin want to measure writeback count in a certain period, he could
-know it via /sys/block/zram0/bd_stat's 3rd column.
-
-= memory tracking
-
-With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
-zram block. It could be useful to catch cold or incompressible
-pages of the process with*pagemap.
-If you enable the feature, you could see block state via
-/sys/kernel/debug/zram/zram0/block_state". The output is as follows,
-
- 300 75.033841 .wh.
- 301 63.806904 s...
- 302 63.806919 ..hi
-
-First column is zram's block index.
-Second column is access time since the system was booted
-Third column is state of the block.
-(s: same page
-w: written page to backing store
-h: huge page
-i: idle page)
-
-First line of above example says 300th block is accessed at 75.033841sec
-and the block's state is huge so it is written back to the backing
-storage. It's a debugging feature so anyone shouldn't rely on it to work
-properly.
-
-Nitin Gupta
-ngupta@vflare.org
diff --git a/Documentation/bpf/bpf_design_QA.rst b/Documentation/bpf/bpf_design_QA.rst
index cb402c59eca5..12a246fcf6cb 100644
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -172,11 +172,31 @@ registers which makes BPF inefficient virtual machine for 32-bit
CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
be added to BPF in the future?
-A: NO. The first thing to improve performance on 32-bit archs is to teach
-LLVM to generate code that uses 32-bit subregisters. Then second step
-is to teach verifier to mark operations where zero-ing upper bits
-is unnecessary. Then JITs can take advantage of those markings and
-drastically reduce size of generated code and improve performance.
+A: NO.
+
+But some optimizations on zero-ing the upper 32 bits for BPF registers are
+available, and can be leveraged to improve the performance of JITed BPF
+programs for 32-bit architectures.
+
+Starting with version 7, LLVM is able to generate instructions that operate
+on 32-bit subregisters, provided the option -mattr=+alu32 is passed for
+compiling a program. Furthermore, the verifier can now mark the
+instructions for which zero-ing the upper bits of the destination register
+is required, and insert an explicit zero-extension (zext) instruction
+(a mov32 variant). This means that for architectures without zext hardware
+support, the JIT back-ends do not need to clear the upper bits for
+subregisters written by alu32 instructions or narrow loads. Instead, the
+back-ends simply need to support code generation for that mov32 variant,
+and to overwrite bpf_jit_needs_zext() to make it return "true" (in order to
+enable zext insertion in the verifier).
+
+Note that it is possible for a JIT back-end to have partial hardware
+support for zext. In that case, if verifier zext insertion is enabled,
+it could lead to the insertion of unnecessary zext instructions. Such
+instructions could be removed by creating a simple peephole inside the JIT
+back-end: if one instruction has hardware support for zext and if the next
+instruction is an explicit zext, then the latter can be skipped when doing
+the code generation.
Q: Does BPF have a stable ABI?
------------------------------
diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst
index d3fe4cac0c90..801a6ed3f2e5 100644
--- a/Documentation/bpf/index.rst
+++ b/Documentation/bpf/index.rst
@@ -42,6 +42,7 @@ Program types
.. toctree::
:maxdepth: 1
+ prog_cgroup_sockopt
prog_cgroup_sysctl
prog_flow_dissector
diff --git a/Documentation/bpf/prog_cgroup_sockopt.rst b/Documentation/bpf/prog_cgroup_sockopt.rst
new file mode 100644
index 000000000000..c47d974629ae
--- /dev/null
+++ b/Documentation/bpf/prog_cgroup_sockopt.rst
@@ -0,0 +1,93 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================
+BPF_PROG_TYPE_CGROUP_SOCKOPT
+============================
+
+``BPF_PROG_TYPE_CGROUP_SOCKOPT`` program type can be attached to two
+cgroup hooks:
+
+* ``BPF_CGROUP_GETSOCKOPT`` - called every time process executes ``getsockopt``
+ system call.
+* ``BPF_CGROUP_SETSOCKOPT`` - called every time process executes ``setsockopt``
+ system call.
+
+The context (``struct bpf_sockopt``) has associated socket (``sk``) and
+all input arguments: ``level``, ``optname``, ``optval`` and ``optlen``.
+
+BPF_CGROUP_SETSOCKOPT
+=====================
+
+``BPF_CGROUP_SETSOCKOPT`` is triggered *before* the kernel handling of
+sockopt and it has writable context: it can modify the supplied arguments
+before passing them down to the kernel. This hook has access to the cgroup
+and socket local storage.
+
+If BPF program sets ``optlen`` to -1, the control will be returned
+back to the userspace after all other BPF programs in the cgroup
+chain finish (i.e. kernel ``setsockopt`` handling will *not* be executed).
+
+Note, that ``optlen`` can not be increased beyond the user-supplied
+value. It can only be decreased or set to -1. Any other value will
+trigger ``EFAULT``.
+
+Return Type
+-----------
+
+* ``0`` - reject the syscall, ``EPERM`` will be returned to the userspace.
+* ``1`` - success, continue with next BPF program in the cgroup chain.
+
+BPF_CGROUP_GETSOCKOPT
+=====================
+
+``BPF_CGROUP_GETSOCKOPT`` is triggered *after* the kernel handing of
+sockopt. The BPF hook can observe ``optval``, ``optlen`` and ``retval``
+if it's interested in whatever kernel has returned. BPF hook can override
+the values above, adjust ``optlen`` and reset ``retval`` to 0. If ``optlen``
+has been increased above initial ``getsockopt`` value (i.e. userspace
+buffer is too small), ``EFAULT`` is returned.
+
+This hook has access to the cgroup and socket local storage.
+
+Note, that the only acceptable value to set to ``retval`` is 0 and the
+original value that the kernel returned. Any other value will trigger
+``EFAULT``.
+
+Return Type
+-----------
+
+* ``0`` - reject the syscall, ``EPERM`` will be returned to the userspace.
+* ``1`` - success: copy ``optval`` and ``optlen`` to userspace, return
+ ``retval`` from the syscall (note that this can be overwritten by
+ the BPF program from the parent cgroup).
+
+Cgroup Inheritance
+==================
+
+Suppose, there is the following cgroup hierarchy where each cgroup
+has ``BPF_CGROUP_GETSOCKOPT`` attached at each level with
+``BPF_F_ALLOW_MULTI`` flag::
+
+ A (root, parent)
+ \
+ B (child)
+
+When the application calls ``getsockopt`` syscall from the cgroup B,
+the programs are executed from the bottom up: B, A. First program
+(B) sees the result of kernel's ``getsockopt``. It can optionally
+adjust ``optval``, ``optlen`` and reset ``retval`` to 0. After that
+control will be passed to the second (A) program which will see the
+same context as B including any potential modifications.
+
+Same for ``BPF_CGROUP_SETSOCKOPT``: if the program is attached to
+A and B, the trigger order is B, then A. If B does any changes
+to the input arguments (``level``, ``optname``, ``optval``, ``optlen``),
+then the next program in the chain (A) will see those changes,
+*not* the original input ``setsockopt`` arguments. The potentially
+modified values will be then passed down to the kernel.
+
+Example
+=======
+
+See ``tools/testing/selftests/bpf/progs/sockopt_sk.c`` for an example
+of BPF program that handles socket options.
diff --git a/Documentation/bus-devices/ti-gpmc.txt b/Documentation/bus-devices/ti-gpmc.txt
deleted file mode 100644
index cc9ce57e0a26..000000000000
--- a/Documentation/bus-devices/ti-gpmc.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-GPMC (General Purpose Memory Controller):
-=========================================
-
-GPMC is an unified memory controller dedicated to interfacing external
-memory devices like
- * Asynchronous SRAM like memories and application specific integrated
- circuit devices.
- * Asynchronous, synchronous, and page mode burst NOR flash devices
- NAND flash
- * Pseudo-SRAM devices
-
-GPMC is found on Texas Instruments SoC's (OMAP based)
-IP details: http://www.ti.com/lit/pdf/spruh73 section 7.1
-
-
-GPMC generic timing calculation:
-================================
-
-GPMC has certain timings that has to be programmed for proper
-functioning of the peripheral, while peripheral has another set of
-timings. To have peripheral work with gpmc, peripheral timings has to
-be translated to the form gpmc can understand. The way it has to be
-translated depends on the connected peripheral. Also there is a
-dependency for certain gpmc timings on gpmc clock frequency. Hence a
-generic timing routine was developed to achieve above requirements.
-
-Generic routine provides a generic method to calculate gpmc timings
-from gpmc peripheral timings. struct gpmc_device_timings fields has to
-be updated with timings from the datasheet of the peripheral that is
-connected to gpmc. A few of the peripheral timings can be fed either
-in time or in cycles, provision to handle this scenario has been
-provided (refer struct gpmc_device_timings definition). It may so
-happen that timing as specified by peripheral datasheet is not present
-in timing structure, in this scenario, try to correlate peripheral
-timing to the one available. If that doesn't work, try to add a new
-field as required by peripheral, educate generic timing routine to
-handle it, make sure that it does not break any of the existing.
-Then there may be cases where peripheral datasheet doesn't mention
-certain fields of struct gpmc_device_timings, zero those entries.
-
-Generic timing routine has been verified to work properly on
-multiple onenand's and tusb6010 peripherals.
-
-A word of caution: generic timing routine has been developed based
-on understanding of gpmc timings, peripheral timings, available
-custom timing routines, a kind of reverse engineering without
-most of the datasheets & hardware (to be exact none of those supported
-in mainline having custom timing routine) and by simulation.
-
-gpmc timing dependency on peripheral timings:
-[<gpmc_timing>: <peripheral timing1>, <peripheral timing2> ...]
-
-1. common
-cs_on: t_ceasu
-adv_on: t_avdasu, t_ceavd
-
-2. sync common
-sync_clk: clk
-page_burst_access: t_bacc
-clk_activation: t_ces, t_avds
-
-3. read async muxed
-adv_rd_off: t_avdp_r
-oe_on: t_oeasu, t_aavdh
-access: t_iaa, t_oe, t_ce, t_aa
-rd_cycle: t_rd_cycle, t_cez_r, t_oez
-
-4. read async non-muxed
-adv_rd_off: t_avdp_r
-oe_on: t_oeasu
-access: t_iaa, t_oe, t_ce, t_aa
-rd_cycle: t_rd_cycle, t_cez_r, t_oez
-
-5. read sync muxed
-adv_rd_off: t_avdp_r, t_avdh
-oe_on: t_oeasu, t_ach, cyc_aavdh_oe
-access: t_iaa, cyc_iaa, cyc_oe
-rd_cycle: t_cez_r, t_oez, t_ce_rdyz
-
-6. read sync non-muxed
-adv_rd_off: t_avdp_r
-oe_on: t_oeasu
-access: t_iaa, cyc_iaa, cyc_oe
-rd_cycle: t_cez_r, t_oez, t_ce_rdyz
-
-7. write async muxed
-adv_wr_off: t_avdp_w
-we_on, wr_data_mux_bus: t_weasu, t_aavdh, cyc_aavhd_we
-we_off: t_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_wr_cycle
-
-8. write async non-muxed
-adv_wr_off: t_avdp_w
-we_on, wr_data_mux_bus: t_weasu
-we_off: t_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_wr_cycle
-
-9. write sync muxed
-adv_wr_off: t_avdp_w, t_avdh
-we_on, wr_data_mux_bus: t_weasu, t_rdyo, t_aavdh, cyc_aavhd_we
-we_off: t_wpl, cyc_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_ce_rdyz
-
-10. write sync non-muxed
-adv_wr_off: t_avdp_w
-we_on, wr_data_mux_bus: t_weasu, t_rdyo
-we_off: t_wpl, cyc_wpl
-cs_wr_off: t_wph
-wr_cycle: t_cez_w, t_ce_rdyz
-
-
-Note: Many of gpmc timings are dependent on other gpmc timings (a few
-gpmc timings purely dependent on other gpmc timings, a reason that
-some of the gpmc timings are missing above), and it will result in
-indirect dependency of peripheral timings to gpmc timings other than
-mentioned above, refer timing routine for more details. To know what
-these peripheral timings correspond to, please see explanations in
-struct gpmc_device_timings definition. And for gpmc timings refer
-IP details (link above).
diff --git a/Documentation/cdrom/index.rst b/Documentation/cdrom/index.rst
index efbd5d111825..338ad5f94e7c 100644
--- a/Documentation/cdrom/index.rst
+++ b/Documentation/cdrom/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
=====
cdrom
diff --git a/Documentation/cgroup-v1/blkio-controller.txt b/Documentation/cgroup-v1/blkio-controller.txt
deleted file mode 100644
index 673dc34d3f78..000000000000
--- a/Documentation/cgroup-v1/blkio-controller.txt
+++ /dev/null
@@ -1,375 +0,0 @@
- Block IO Controller
- ===================
-Overview
-========
-cgroup subsys "blkio" implements the block io controller. There seems to be
-a need of various kinds of IO control policies (like proportional BW, max BW)
-both at leaf nodes as well as at intermediate nodes in a storage hierarchy.
-Plan is to use the same cgroup based management interface for blkio controller
-and based on user options switch IO policies in the background.
-
-Currently two IO control policies are implemented. First one is proportional
-weight time based division of disk policy. It is implemented in CFQ. Hence
-this policy takes effect only on leaf nodes when CFQ is being used. The second
-one is throttling policy which can be used to specify upper IO rate limits
-on devices. This policy is implemented in generic block layer and can be
-used on leaf nodes as well as higher level logical devices like device mapper.
-
-HOWTO
-=====
-Proportional Weight division of bandwidth
------------------------------------------
-You can do a very simple testing of running two dd threads in two different
-cgroups. Here is what you can do.
-
-- Enable Block IO controller
- CONFIG_BLK_CGROUP=y
-
-- Enable group scheduling in CFQ
- CONFIG_CFQ_GROUP_IOSCHED=y
-
-- Compile and boot into kernel and mount IO controller (blkio); see
- cgroups.txt, Why are cgroups needed?.
-
- mount -t tmpfs cgroup_root /sys/fs/cgroup
- mkdir /sys/fs/cgroup/blkio
- mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Create two cgroups
- mkdir -p /sys/fs/cgroup/blkio/test1/ /sys/fs/cgroup/blkio/test2
-
-- Set weights of group test1 and test2
- echo 1000 > /sys/fs/cgroup/blkio/test1/blkio.weight
- echo 500 > /sys/fs/cgroup/blkio/test2/blkio.weight
-
-- Create two same size files (say 512MB each) on same disk (file1, file2) and
- launch two dd threads in different cgroup to read those files.
-
- sync
- echo 3 > /proc/sys/vm/drop_caches
-
- dd if=/mnt/sdb/zerofile1 of=/dev/null &
- echo $! > /sys/fs/cgroup/blkio/test1/tasks
- cat /sys/fs/cgroup/blkio/test1/tasks
-
- dd if=/mnt/sdb/zerofile2 of=/dev/null &
- echo $! > /sys/fs/cgroup/blkio/test2/tasks
- cat /sys/fs/cgroup/blkio/test2/tasks
-
-- At macro level, first dd should finish first. To get more precise data, keep
- on looking at (with the help of script), at blkio.disk_time and
- blkio.disk_sectors files of both test1 and test2 groups. This will tell how
- much disk time (in milliseconds), each group got and how many sectors each
- group dispatched to the disk. We provide fairness in terms of disk time, so
- ideally io.disk_time of cgroups should be in proportion to the weight.
-
-Throttling/Upper Limit policy
------------------------------
-- Enable Block IO controller
- CONFIG_BLK_CGROUP=y
-
-- Enable throttling in block layer
- CONFIG_BLK_DEV_THROTTLING=y
-
-- Mount blkio controller (see cgroups.txt, Why are cgroups needed?)
- mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
-
-- Specify a bandwidth rate on particular device for root group. The format
- for policy is "<major>:<minor> <bytes_per_second>".
-
- echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device
-
- Above will put a limit of 1MB/second on reads happening for root group
- on device having major/minor number 8:16.
-
-- Run dd to read a file and see if rate is throttled to 1MB/s or not.
-
- # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024
- 1024+0 records in
- 1024+0 records out
- 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s
-
- Limits for writes can be put using blkio.throttle.write_bps_device file.
-
-Hierarchical Cgroups
-====================
-
-Both CFQ and throttling implement hierarchy support; however,
-throttling's hierarchy support is enabled iff "sane_behavior" is
-enabled from cgroup side, which currently is a development option and
-not publicly available.
-
-If somebody created a hierarchy like as follows.
-
- root
- / \
- test1 test2
- |
- test3
-
-CFQ by default and throttling with "sane_behavior" will handle the
-hierarchy correctly. For details on CFQ hierarchy support, refer to
-Documentation/block/cfq-iosched.txt. For throttling, all limits apply
-to the whole subtree while all statistics are local to the IOs
-directly generated by tasks in that cgroup.
-
-Throttling without "sane_behavior" enabled from cgroup side will
-practically treat all groups at same level as if it looks like the
-following.
-
- pivot
- / / \ \
- root test1 test2 test3
-
-Various user visible config options
-===================================
-CONFIG_BLK_CGROUP
- - Block IO controller.
-
-CONFIG_DEBUG_BLK_CGROUP
- - Debug help. Right now some additional stats file show up in cgroup
- if this option is enabled.
-
-CONFIG_CFQ_GROUP_IOSCHED
- - Enables group scheduling in CFQ. Currently only 1 level of group
- creation is allowed.
-
-CONFIG_BLK_DEV_THROTTLING
- - Enable block device throttling support in block layer.
-
-Details of cgroup files
-=======================
-Proportional weight policy files
---------------------------------
-- blkio.weight
- - Specifies per cgroup weight. This is default weight of the group
- on all the devices until and unless overridden by per device rule.
- (See blkio.weight_device).
- Currently allowed range of weights is from 10 to 1000.
-
-- blkio.weight_device
- - One can specify per cgroup per device rules using this interface.
- These rules override the default value of group weight as specified
- by blkio.weight.
-
- Following is the format.
-
- # echo dev_maj:dev_minor weight > blkio.weight_device
- Configure weight=300 on /dev/sdb (8:16) in this cgroup
- # echo 8:16 300 > blkio.weight_device
- # cat blkio.weight_device
- dev weight
- 8:16 300
-
- Configure weight=500 on /dev/sda (8:0) in this cgroup
- # echo 8:0 500 > blkio.weight_device
- # cat blkio.weight_device
- dev weight
- 8:0 500
- 8:16 300
-
- Remove specific weight for /dev/sda in this cgroup
- # echo 8:0 0 > blkio.weight_device
- # cat blkio.weight_device
- dev weight
- 8:16 300
-
-- blkio.leaf_weight[_device]
- - Equivalents of blkio.weight[_device] for the purpose of
- deciding how much weight tasks in the given cgroup has while
- competing with the cgroup's child cgroups. For details,
- please refer to Documentation/block/cfq-iosched.txt.
-
-- blkio.time
- - disk time allocated to cgroup per device in milliseconds. First
- two fields specify the major and minor number of the device and
- third field specifies the disk time allocated to group in
- milliseconds.
-
-- blkio.sectors
- - number of sectors transferred to/from disk by the group. First
- two fields specify the major and minor number of the device and
- third field specifies the number of sectors transferred by the
- group to/from the device.
-
-- blkio.io_service_bytes
- - Number of bytes transferred to/from the disk by the group. These
- are further divided by the type of operation - read or write, sync
- or async. First two fields specify the major and minor number of the
- device, third field specifies the operation type and the fourth field
- specifies the number of bytes.
-
-- blkio.io_serviced
- - Number of IOs (bio) issued to the disk by the group. These
- are further divided by the type of operation - read or write, sync
- or async. First two fields specify the major and minor number of the
- device, third field specifies the operation type and the fourth field
- specifies the number of IOs.
-
-- blkio.io_service_time
- - Total amount of time between request dispatch and request completion
- for the IOs done by this cgroup. This is in nanoseconds to make it
- meaningful for flash devices too. For devices with queue depth of 1,
- this time represents the actual service time. When queue_depth > 1,
- that is no longer true as requests may be served out of order. This
- may cause the service time for a given IO to include the service time
- of multiple IOs when served out of order which may result in total
- io_service_time > actual time elapsed. This time is further divided by
- the type of operation - read or write, sync or async. First two fields
- specify the major and minor number of the device, third field
- specifies the operation type and the fourth field specifies the
- io_service_time in ns.
-
-- blkio.io_wait_time
- - Total amount of time the IOs for this cgroup spent waiting in the
- scheduler queues for service. This can be greater than the total time
- elapsed since it is cumulative io_wait_time for all IOs. It is not a
- measure of total time the cgroup spent waiting but rather a measure of
- the wait_time for its individual IOs. For devices with queue_depth > 1
- this metric does not include the time spent waiting for service once
- the IO is dispatched to the device but till it actually gets serviced
- (there might be a time lag here due to re-ordering of requests by the
- device). This is in nanoseconds to make it meaningful for flash
- devices too. This time is further divided by the type of operation -
- read or write, sync or async. First two fields specify the major and
- minor number of the device, third field specifies the operation type
- and the fourth field specifies the io_wait_time in ns.
-
-- blkio.io_merged
- - Total number of bios/requests merged into requests belonging to this
- cgroup. This is further divided by the type of operation - read or
- write, sync or async.
-
-- blkio.io_queued
- - Total number of requests queued up at any given instant for this
- cgroup. This is further divided by the type of operation - read or
- write, sync or async.
-
-- blkio.avg_queue_size
- - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
- The average queue size for this cgroup over the entire time of this
- cgroup's existence. Queue size samples are taken each time one of the
- queues of this cgroup gets a timeslice.
-
-- blkio.group_wait_time
- - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
- This is the amount of time the cgroup had to wait since it became busy
- (i.e., went from 0 to 1 request queued) to get a timeslice for one of
- its queues. This is different from the io_wait_time which is the
- cumulative total of the amount of time spent by each IO in that cgroup
- waiting in the scheduler queue. This is in nanoseconds. If this is
- read when the cgroup is in a waiting (for timeslice) state, the stat
- will only report the group_wait_time accumulated till the last time it
- got a timeslice and will not include the current delta.
-
-- blkio.empty_time
- - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
- This is the amount of time a cgroup spends without any pending
- requests when not being served, i.e., it does not include any time
- spent idling for one of the queues of the cgroup. This is in
- nanoseconds. If this is read when the cgroup is in an empty state,
- the stat will only report the empty_time accumulated till the last
- time it had a pending request and will not include the current delta.
-
-- blkio.idle_time
- - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
- This is the amount of time spent by the IO scheduler idling for a
- given cgroup in anticipation of a better request than the existing ones
- from other queues/cgroups. This is in nanoseconds. If this is read
- when the cgroup is in an idling state, the stat will only report the
- idle_time accumulated till the last idle period and will not include
- the current delta.
-
-- blkio.dequeue
- - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
- gives the statistics about how many a times a group was dequeued
- from service tree of the device. First two fields specify the major
- and minor number of the device and third field specifies the number
- of times a group was dequeued from a particular device.
-
-- blkio.*_recursive
- - Recursive version of various stats. These files show the
- same information as their non-recursive counterparts but
- include stats from all the descendant cgroups.
-
-Throttling/Upper limit policy files
------------------------------------
-- blkio.throttle.read_bps_device
- - Specifies upper limit on READ rate from the device. IO rate is
- specified in bytes per second. Rules are per device. Following is
- the format.
-
- echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.read_bps_device
-
-- blkio.throttle.write_bps_device
- - Specifies upper limit on WRITE rate to the device. IO rate is
- specified in bytes per second. Rules are per device. Following is
- the format.
-
- echo "<major>:<minor> <rate_bytes_per_second>" > /cgrp/blkio.throttle.write_bps_device
-
-- blkio.throttle.read_iops_device
- - Specifies upper limit on READ rate from the device. IO rate is
- specified in IO per second. Rules are per device. Following is
- the format.
-
- echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.read_iops_device
-
-- blkio.throttle.write_iops_device
- - Specifies upper limit on WRITE rate to the device. IO rate is
- specified in io per second. Rules are per device. Following is
- the format.
-
- echo "<major>:<minor> <rate_io_per_second>" > /cgrp/blkio.throttle.write_iops_device
-
-Note: If both BW and IOPS rules are specified for a device, then IO is
- subjected to both the constraints.
-
-- blkio.throttle.io_serviced
- - Number of IOs (bio) issued to the disk by the group. These
- are further divided by the type of operation - read or write, sync
- or async. First two fields specify the major and minor number of the
- device, third field specifies the operation type and the fourth field
- specifies the number of IOs.
-
-- blkio.throttle.io_service_bytes
- - Number of bytes transferred to/from the disk by the group. These
- are further divided by the type of operation - read or write, sync
- or async. First two fields specify the major and minor number of the
- device, third field specifies the operation type and the fourth field
- specifies the number of bytes.
-
-Common files among various policies
------------------------------------
-- blkio.reset_stats
- - Writing an int to this file will result in resetting all the stats
- for that cgroup.
-
-CFQ sysfs tunable
-=================
-/sys/block/<disk>/queue/iosched/slice_idle
-------------------------------------------
-On a faster hardware CFQ can be slow, especially with sequential workload.
-This happens because CFQ idles on a single queue and single queue might not
-drive deeper request queue depths to keep the storage busy. In such scenarios
-one can try setting slice_idle=0 and that would switch CFQ to IOPS
-(IO operations per second) mode on NCQ supporting hardware.
-
-That means CFQ will not idle between cfq queues of a cfq group and hence be
-able to driver higher queue depth and achieve better throughput. That also
-means that cfq provides fairness among groups in terms of IOPS and not in
-terms of disk time.
-
-/sys/block/<disk>/queue/iosched/group_idle
-------------------------------------------
-If one disables idling on individual cfq queues and cfq service trees by
-setting slice_idle=0, group_idle kicks in. That means CFQ will still idle
-on the group in an attempt to provide fairness among groups.
-
-By default group_idle is same as slice_idle and does not do anything if
-slice_idle is enabled.
-
-One can experience an overall throughput drop if you have created multiple
-groups and put applications in that group which are not driving enough
-IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
-on individual groups and throughput should improve.
diff --git a/Documentation/cgroup-v1/cgroups.txt b/Documentation/cgroup-v1/cgroups.txt
deleted file mode 100644
index 059f7063eea6..000000000000
--- a/Documentation/cgroup-v1/cgroups.txt
+++ /dev/null
@@ -1,677 +0,0 @@
- CGROUPS
- -------
-
-Written by Paul Menage <menage@google.com> based on
-Documentation/cgroup-v1/cpusets.txt
-
-Original copyright statements from cpusets.txt:
-Portions Copyright (C) 2004 BULL SA.
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-
-CONTENTS:
-=========
-
-1. Control Groups
- 1.1 What are cgroups ?
- 1.2 Why are cgroups needed ?
- 1.3 How are cgroups implemented ?
- 1.4 What does notify_on_release do ?
- 1.5 What does clone_children do ?
- 1.6 How do I use cgroups ?
-2. Usage Examples and Syntax
- 2.1 Basic Usage
- 2.2 Attaching processes
- 2.3 Mounting hierarchies by name
-3. Kernel API
- 3.1 Overview
- 3.2 Synchronization
- 3.3 Subsystem API
-4. Extended attributes usage
-5. Questions
-
-1. Control Groups
-=================
-
-1.1 What are cgroups ?
-----------------------
-
-Control Groups provide a mechanism for aggregating/partitioning sets of
-tasks, and all their future children, into hierarchical groups with
-specialized behaviour.
-
-Definitions:
-
-A *cgroup* associates a set of tasks with a set of parameters for one
-or more subsystems.
-
-A *subsystem* is a module that makes use of the task grouping
-facilities provided by cgroups to treat groups of tasks in
-particular ways. A subsystem is typically a "resource controller" that
-schedules a resource or applies per-cgroup limits, but it may be
-anything that wants to act on a group of processes, e.g. a
-virtualization subsystem.
-
-A *hierarchy* is a set of cgroups arranged in a tree, such that
-every task in the system is in exactly one of the cgroups in the
-hierarchy, and a set of subsystems; each subsystem has system-specific
-state attached to each cgroup in the hierarchy. Each hierarchy has
-an instance of the cgroup virtual filesystem associated with it.
-
-At any one time there may be multiple active hierarchies of task
-cgroups. Each hierarchy is a partition of all tasks in the system.
-
-User-level code may create and destroy cgroups by name in an
-instance of the cgroup virtual file system, specify and query to
-which cgroup a task is assigned, and list the task PIDs assigned to
-a cgroup. Those creations and assignments only affect the hierarchy
-associated with that instance of the cgroup file system.
-
-On their own, the only use for cgroups is for simple job
-tracking. The intention is that other subsystems hook into the generic
-cgroup support to provide new attributes for cgroups, such as
-accounting/limiting the resources which processes in a cgroup can
-access. For example, cpusets (see Documentation/cgroup-v1/cpusets.txt) allow
-you to associate a set of CPUs and a set of memory nodes with the
-tasks in each cgroup.
-
-1.2 Why are cgroups needed ?
-----------------------------
-
-There are multiple efforts to provide process aggregations in the
-Linux kernel, mainly for resource-tracking purposes. Such efforts
-include cpusets, CKRM/ResGroups, UserBeanCounters, and virtual server
-namespaces. These all require the basic notion of a
-grouping/partitioning of processes, with newly forked processes ending
-up in the same group (cgroup) as their parent process.
-
-The kernel cgroup patch provides the minimum essential kernel
-mechanisms required to efficiently implement such groups. It has
-minimal impact on the system fast paths, and provides hooks for
-specific subsystems such as cpusets to provide additional behaviour as
-desired.
-
-Multiple hierarchy support is provided to allow for situations where
-the division of tasks into cgroups is distinctly different for
-different subsystems - having parallel hierarchies allows each
-hierarchy to be a natural division of tasks, without having to handle
-complex combinations of tasks that would be present if several
-unrelated subsystems needed to be forced into the same tree of
-cgroups.
-
-At one extreme, each resource controller or subsystem could be in a
-separate hierarchy; at the other extreme, all subsystems
-would be attached to the same hierarchy.
-
-As an example of a scenario (originally proposed by vatsa@in.ibm.com)
-that can benefit from multiple hierarchies, consider a large
-university server with various users - students, professors, system
-tasks etc. The resource planning for this server could be along the
-following lines:
-
- CPU : "Top cpuset"
- / \
- CPUSet1 CPUSet2
- | |
- (Professors) (Students)
-
- In addition (system tasks) are attached to topcpuset (so
- that they can run anywhere) with a limit of 20%
-
- Memory : Professors (50%), Students (30%), system (20%)
-
- Disk : Professors (50%), Students (30%), system (20%)
-
- Network : WWW browsing (20%), Network File System (60%), others (20%)
- / \
- Professors (15%) students (5%)
-
-Browsers like Firefox/Lynx go into the WWW network class, while (k)nfsd goes
-into the NFS network class.
-
-At the same time Firefox/Lynx will share an appropriate CPU/Memory class
-depending on who launched it (prof/student).
-
-With the ability to classify tasks differently for different resources
-(by putting those resource subsystems in different hierarchies),
-the admin can easily set up a script which receives exec notifications
-and depending on who is launching the browser he can
-
- # echo browser_pid > /sys/fs/cgroup/<restype>/<userclass>/tasks
-
-With only a single hierarchy, he now would potentially have to create
-a separate cgroup for every browser launched and associate it with
-appropriate network and other resource class. This may lead to
-proliferation of such cgroups.
-
-Also let's say that the administrator would like to give enhanced network
-access temporarily to a student's browser (since it is night and the user
-wants to do online gaming :)) OR give one of the student's simulation
-apps enhanced CPU power.
-
-With ability to write PIDs directly to resource classes, it's just a
-matter of:
-
- # echo pid > /sys/fs/cgroup/network/<new_class>/tasks
- (after some time)
- # echo pid > /sys/fs/cgroup/network/<orig_class>/tasks
-
-Without this ability, the administrator would have to split the cgroup into
-multiple separate ones and then associate the new cgroups with the
-new resource classes.
-
-
-
-1.3 How are cgroups implemented ?
----------------------------------
-
-Control Groups extends the kernel as follows:
-
- - Each task in the system has a reference-counted pointer to a
- css_set.
-
- - A css_set contains a set of reference-counted pointers to
- cgroup_subsys_state objects, one for each cgroup subsystem
- registered in the system. There is no direct link from a task to
- the cgroup of which it's a member in each hierarchy, but this
- can be determined by following pointers through the
- cgroup_subsys_state objects. This is because accessing the
- subsystem state is something that's expected to happen frequently
- and in performance-critical code, whereas operations that require a
- task's actual cgroup assignments (in particular, moving between
- cgroups) are less common. A linked list runs through the cg_list
- field of each task_struct using the css_set, anchored at
- css_set->tasks.
-
- - A cgroup hierarchy filesystem can be mounted for browsing and
- manipulation from user space.
-
- - You can list all the tasks (by PID) attached to any cgroup.
-
-The implementation of cgroups requires a few, simple hooks
-into the rest of the kernel, none in performance-critical paths:
-
- - in init/main.c, to initialize the root cgroups and initial
- css_set at system boot.
-
- - in fork and exit, to attach and detach a task from its css_set.
-
-In addition, a new file system of type "cgroup" may be mounted, to
-enable browsing and modifying the cgroups presently known to the
-kernel. When mounting a cgroup hierarchy, you may specify a
-comma-separated list of subsystems to mount as the filesystem mount
-options. By default, mounting the cgroup filesystem attempts to
-mount a hierarchy containing all registered subsystems.
-
-If an active hierarchy with exactly the same set of subsystems already
-exists, it will be reused for the new mount. If no existing hierarchy
-matches, and any of the requested subsystems are in use in an existing
-hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy
-is activated, associated with the requested subsystems.
-
-It's not currently possible to bind a new subsystem to an active
-cgroup hierarchy, or to unbind a subsystem from an active cgroup
-hierarchy. This may be possible in future, but is fraught with nasty
-error-recovery issues.
-
-When a cgroup filesystem is unmounted, if there are any
-child cgroups created below the top-level cgroup, that hierarchy
-will remain active even though unmounted; if there are no
-child cgroups then the hierarchy will be deactivated.
-
-No new system calls are added for cgroups - all support for
-querying and modifying cgroups is via this cgroup file system.
-
-Each task under /proc has an added file named 'cgroup' displaying,
-for each active hierarchy, the subsystem names and the cgroup name
-as the path relative to the root of the cgroup file system.
-
-Each cgroup is represented by a directory in the cgroup file system
-containing the following files describing that cgroup:
-
- - tasks: list of tasks (by PID) attached to that cgroup. This list
- is not guaranteed to be sorted. Writing a thread ID into this file
- moves the thread into this cgroup.
- - cgroup.procs: list of thread group IDs in the cgroup. This list is
- not guaranteed to be sorted or free of duplicate TGIDs, and userspace
- should sort/uniquify the list if this property is required.
- Writing a thread group ID into this file moves all threads in that
- group into this cgroup.
- - notify_on_release flag: run the release agent on exit?
- - release_agent: the path to use for release notifications (this file
- exists in the top cgroup only)
-
-Other subsystems such as cpusets may add additional files in each
-cgroup dir.
-
-New cgroups are created using the mkdir system call or shell
-command. The properties of a cgroup, such as its flags, are
-modified by writing to the appropriate file in that cgroups
-directory, as listed above.
-
-The named hierarchical structure of nested cgroups allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cgroup allows organizing the work load
-on a system into related sets of tasks. A task may be re-attached to
-any other cgroup, if allowed by the permissions on the necessary
-cgroup file system directories.
-
-When a task is moved from one cgroup to another, it gets a new
-css_set pointer - if there's an already existing css_set with the
-desired collection of cgroups then that group is reused, otherwise a new
-css_set is allocated. The appropriate existing css_set is located by
-looking into a hash table.
-
-To allow access from a cgroup to the css_sets (and hence tasks)
-that comprise it, a set of cg_cgroup_link objects form a lattice;
-each cg_cgroup_link is linked into a list of cg_cgroup_links for
-a single cgroup on its cgrp_link_list field, and a list of
-cg_cgroup_links for a single css_set on its cg_link_list.
-
-Thus the set of tasks in a cgroup can be listed by iterating over
-each css_set that references the cgroup, and sub-iterating over
-each css_set's task set.
-
-The use of a Linux virtual file system (vfs) to represent the
-cgroup hierarchy provides for a familiar permission and name space
-for cgroups, with a minimum of additional kernel code.
-
-1.4 What does notify_on_release do ?
-------------------------------------
-
-If the notify_on_release flag is enabled (1) in a cgroup, then
-whenever the last task in the cgroup leaves (exits or attaches to
-some other cgroup) and the last child cgroup of that cgroup
-is removed, then the kernel runs the command specified by the contents
-of the "release_agent" file in that hierarchy's root directory,
-supplying the pathname (relative to the mount point of the cgroup
-file system) of the abandoned cgroup. This enables automatic
-removal of abandoned cgroups. The default value of
-notify_on_release in the root cgroup at system boot is disabled
-(0). The default value of other cgroups at creation is the current
-value of their parents' notify_on_release settings. The default value of
-a cgroup hierarchy's release_agent path is empty.
-
-1.5 What does clone_children do ?
----------------------------------
-
-This flag only affects the cpuset controller. If the clone_children
-flag is enabled (1) in a cgroup, a new cpuset cgroup will copy its
-configuration from the parent during initialization.
-
-1.6 How do I use cgroups ?
---------------------------
-
-To start a new job that is to be contained within a cgroup, using
-the "cpuset" cgroup subsystem, the steps are something like:
-
- 1) mount -t tmpfs cgroup_root /sys/fs/cgroup
- 2) mkdir /sys/fs/cgroup/cpuset
- 3) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 4) Create the new cgroup by doing mkdir's and write's (or echo's) in
- the /sys/fs/cgroup/cpuset virtual file system.
- 5) Start a task that will be the "founding father" of the new job.
- 6) Attach that task to the new cgroup by writing its PID to the
- /sys/fs/cgroup/cpuset tasks file for that cgroup.
- 7) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cgroup
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cgroup:
-
- mount -t tmpfs cgroup_root /sys/fs/cgroup
- mkdir /sys/fs/cgroup/cpuset
- mount -t cgroup cpuset -ocpuset /sys/fs/cgroup/cpuset
- cd /sys/fs/cgroup/cpuset
- mkdir Charlie
- cd Charlie
- /bin/echo 2-3 > cpuset.cpus
- /bin/echo 1 > cpuset.mems
- /bin/echo $$ > tasks
- sh
- # The subshell 'sh' is now running in cgroup Charlie
- # The next line should display '/Charlie'
- cat /proc/self/cgroup
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using cgroups can be done through the cgroup
-virtual filesystem.
-
-To mount a cgroup hierarchy with all available subsystems, type:
-# mount -t cgroup xxx /sys/fs/cgroup
-
-The "xxx" is not interpreted by the cgroup code, but will appear in
-/proc/mounts so may be any useful identifying string that you like.
-
-Note: Some subsystems do not work without some user input first. For instance,
-if cpusets are enabled the user will have to populate the cpus and mems files
-for each new cgroup created before that group can be used.
-
-As explained in section `1.2 Why are cgroups needed?' you should create
-different hierarchies of cgroups for each single resource or group of
-resources you want to control. Therefore, you should mount a tmpfs on
-/sys/fs/cgroup and create directories for each cgroup resource or resource
-group.
-
-# mount -t tmpfs cgroup_root /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/rg1
-
-To mount a cgroup hierarchy with just the cpuset and memory
-subsystems, type:
-# mount -t cgroup -o cpuset,memory hier1 /sys/fs/cgroup/rg1
-
-While remounting cgroups is currently supported, it is not recommend
-to use it. Remounting allows changing bound subsystems and
-release_agent. Rebinding is hardly useful as it only works when the
-hierarchy is empty and release_agent itself should be replaced with
-conventional fsnotify. The support for remounting will be removed in
-the future.
-
-To Specify a hierarchy's release_agent:
-# mount -t cgroup -o cpuset,release_agent="/sbin/cpuset_release_agent" \
- xxx /sys/fs/cgroup/rg1
-
-Note that specifying 'release_agent' more than once will return failure.
-
-Note that changing the set of subsystems is currently only supported
-when the hierarchy consists of a single (root) cgroup. Supporting
-the ability to arbitrarily bind/unbind subsystems from an existing
-cgroup hierarchy is intended to be implemented in the future.
-
-Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the
-tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1
-is the cgroup that holds the whole system.
-
-If you want to change the value of release_agent:
-# echo "/sbin/new_release_agent" > /sys/fs/cgroup/rg1/release_agent
-
-It can also be changed via remount.
-
-If you want to create a new cgroup under /sys/fs/cgroup/rg1:
-# cd /sys/fs/cgroup/rg1
-# mkdir my_cgroup
-
-Now you want to do something with this cgroup.
-# cd my_cgroup
-
-In this directory you can find several files:
-# ls
-cgroup.procs notify_on_release tasks
-(plus whatever files added by the attached subsystems)
-
-Now attach your shell to this cgroup:
-# /bin/echo $$ > tasks
-
-You can also create cgroups inside your cgroup by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cgroup, just use rmdir:
-# rmdir my_sub_cs
-
-This will fail if the cgroup is in use (has cgroups inside, or
-has processes attached, or is held alive by other subsystem-specific
-reference).
-
-2.2 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
- ...
-# /bin/echo PIDn > tasks
-
-You can attach the current shell task by echoing 0:
-
-# echo 0 > tasks
-
-You can use the cgroup.procs file instead of the tasks file to move all
-threads in a threadgroup at once. Echoing the PID of any task in a
-threadgroup to cgroup.procs causes all tasks in that threadgroup to be
-attached to the cgroup. Writing 0 to cgroup.procs moves all tasks
-in the writing task's threadgroup.
-
-Note: Since every task is always a member of exactly one cgroup in each
-mounted hierarchy, to remove a task from its current cgroup you must
-move it into a new cgroup (possibly the root cgroup) by writing to the
-new cgroup's tasks file.
-
-Note: Due to some restrictions enforced by some cgroup subsystems, moving
-a process to another cgroup can fail.
-
-2.3 Mounting hierarchies by name
---------------------------------
-
-Passing the name=<x> option when mounting a cgroups hierarchy
-associates the given name with the hierarchy. This can be used when
-mounting a pre-existing hierarchy, in order to refer to it by name
-rather than by its set of active subsystems. Each hierarchy is either
-nameless, or has a unique name.
-
-The name should match [\w.-]+
-
-When passing a name=<x> option for a new hierarchy, you need to
-specify subsystems manually; the legacy behaviour of mounting all
-subsystems when none are explicitly specified is not supported when
-you give a subsystem a name.
-
-The name of the subsystem appears as part of the hierarchy description
-in /proc/mounts and /proc/<pid>/cgroups.
-
-
-3. Kernel API
-=============
-
-3.1 Overview
-------------
-
-Each kernel subsystem that wants to hook into the generic cgroup
-system needs to create a cgroup_subsys object. This contains
-various methods, which are callbacks from the cgroup system, along
-with a subsystem ID which will be assigned by the cgroup system.
-
-Other fields in the cgroup_subsys object include:
-
-- subsys_id: a unique array index for the subsystem, indicating which
- entry in cgroup->subsys[] this subsystem should be managing.
-
-- name: should be initialized to a unique subsystem name. Should be
- no longer than MAX_CGROUP_TYPE_NAMELEN.
-
-- early_init: indicate if the subsystem needs early initialization
- at system boot.
-
-Each cgroup object created by the system has an array of pointers,
-indexed by subsystem ID; this pointer is entirely managed by the
-subsystem; the generic cgroup code will never touch this pointer.
-
-3.2 Synchronization
--------------------
-
-There is a global mutex, cgroup_mutex, used by the cgroup
-system. This should be taken by anything that wants to modify a
-cgroup. It may also be taken to prevent cgroups from being
-modified, but more specific locks may be more appropriate in that
-situation.
-
-See kernel/cgroup.c for more details.
-
-Subsystems can take/release the cgroup_mutex via the functions
-cgroup_lock()/cgroup_unlock().
-
-Accessing a task's cgroup pointer may be done in the following ways:
-- while holding cgroup_mutex
-- while holding the task's alloc_lock (via task_lock())
-- inside an rcu_read_lock() section via rcu_dereference()
-
-3.3 Subsystem API
------------------
-
-Each subsystem should:
-
-- add an entry in linux/cgroup_subsys.h
-- define a cgroup_subsys object called <name>_cgrp_subsys
-
-Each subsystem may export the following methods. The only mandatory
-methods are css_alloc/free. Any others that are null are presumed to
-be successful no-ops.
-
-struct cgroup_subsys_state *css_alloc(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called to allocate a subsystem state object for a cgroup. The
-subsystem should allocate its subsystem state object for the passed
-cgroup, returning a pointer to the new object on success or a
-ERR_PTR() value. On success, the subsystem pointer should point to
-a structure of type cgroup_subsys_state (typically embedded in a
-larger subsystem-specific object), which will be initialized by the
-cgroup system. Note that this will be called at initialization to
-create the root subsystem state for this subsystem; this case can be
-identified by the passed cgroup object having a NULL parent (since
-it's the root of the hierarchy) and may be an appropriate place for
-initialization code.
-
-int css_online(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-Called after @cgrp successfully completed all allocations and made
-visible to cgroup_for_each_child/descendant_*() iterators. The
-subsystem may choose to fail creation by returning -errno. This
-callback can be used to implement reliable state sharing and
-propagation along the hierarchy. See the comment on
-cgroup_for_each_descendant_pre() for details.
-
-void css_offline(struct cgroup *cgrp);
-(cgroup_mutex held by caller)
-
-This is the counterpart of css_online() and called iff css_online()
-has succeeded on @cgrp. This signifies the beginning of the end of
-@cgrp. @cgrp is being removed and the subsystem should start dropping
-all references it's holding on @cgrp. When all references are dropped,
-cgroup removal will proceed to the next step - css_free(). After this
-callback, @cgrp should be considered dead to the subsystem.
-
-void css_free(struct cgroup *cgrp)
-(cgroup_mutex held by caller)
-
-The cgroup system is about to free @cgrp; the subsystem should free
-its subsystem state object. By the time this method is called, @cgrp
-is completely unused; @cgrp->parent is still valid. (Note - can also
-be called for a newly-created cgroup if an error occurs after this
-subsystem's create() method has been called for the new cgroup).
-
-int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called prior to moving one or more tasks into a cgroup; if the
-subsystem returns an error, this will abort the attach operation.
-@tset contains the tasks to be attached and is guaranteed to have at
-least one task in it.
-
-If there are multiple tasks in the taskset, then:
- - it's guaranteed that all are from the same thread group
- - @tset contains all tasks from the thread group whether or not
- they're switching cgroups
- - the first task is the leader
-
-Each @tset entry also contains the task's old cgroup and tasks which
-aren't switching cgroup can be skipped easily using the
-cgroup_taskset_for_each() iterator. Note that this isn't called on a
-fork. If this method returns 0 (success) then this should remain valid
-while the caller holds cgroup_mutex and it is ensured that either
-attach() or cancel_attach() will be called in future.
-
-void css_reset(struct cgroup_subsys_state *css)
-(cgroup_mutex held by caller)
-
-An optional operation which should restore @css's configuration to the
-initial state. This is currently only used on the unified hierarchy
-when a subsystem is disabled on a cgroup through
-"cgroup.subtree_control" but should remain enabled because other
-subsystems depend on it. cgroup core makes such a css invisible by
-removing the associated interface files and invokes this callback so
-that the hidden subsystem can return to the initial neutral state.
-This prevents unexpected resource control from a hidden css and
-ensures that the configuration is in the initial state when it is made
-visible again later.
-
-void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called when a task attach operation has failed after can_attach() has succeeded.
-A subsystem whose can_attach() has some side-effects should provide this
-function, so that the subsystem can implement a rollback. If not, not necessary.
-This will be called only about subsystems whose can_attach() operation have
-succeeded. The parameters are identical to can_attach().
-
-void attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
-(cgroup_mutex held by caller)
-
-Called after the task has been attached to the cgroup, to allow any
-post-attachment activity that requires memory allocations or blocking.
-The parameters are identical to can_attach().
-
-void fork(struct task_struct *task)
-
-Called when a task is forked into a cgroup.
-
-void exit(struct task_struct *task)
-
-Called during task exit.
-
-void free(struct task_struct *task)
-
-Called when the task_struct is freed.
-
-void bind(struct cgroup *root)
-(cgroup_mutex held by caller)
-
-Called when a cgroup subsystem is rebound to a different hierarchy
-and root cgroup. Currently this will only involve movement between
-the default hierarchy (which never has sub-cgroups) and a hierarchy
-that is being created/destroyed (and hence has no sub-cgroups).
-
-4. Extended attribute usage
-===========================
-
-cgroup filesystem supports certain types of extended attributes in its
-directories and files. The current supported types are:
- - Trusted (XATTR_TRUSTED)
- - Security (XATTR_SECURITY)
-
-Both require CAP_SYS_ADMIN capability to set.
-
-Like in tmpfs, the extended attributes in cgroup filesystem are stored
-using kernel memory and it's advised to keep the usage at minimum. This
-is the reason why user defined extended attributes are not supported, since
-any user can do it and there's no limit in the value size.
-
-The current known users for this feature are SELinux to limit cgroup usage
-in containers and systemd for assorted meta data like main PID in a cgroup
-(systemd creates a cgroup per service).
-
-5. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
- errors. If you use it in the cgroup file system, you won't be
- able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
- put only ONE PID.
-
diff --git a/Documentation/cgroup-v1/cpuacct.txt b/Documentation/cgroup-v1/cpuacct.txt
deleted file mode 100644
index 9d73cc0cadb9..000000000000
--- a/Documentation/cgroup-v1/cpuacct.txt
+++ /dev/null
@@ -1,49 +0,0 @@
-CPU Accounting Controller
--------------------------
-
-The CPU accounting controller is used to group tasks using cgroups and
-account the CPU usage of these groups of tasks.
-
-The CPU accounting controller supports multi-hierarchy groups. An accounting
-group accumulates the CPU usage of all of its child groups and the tasks
-directly present in its group.
-
-Accounting groups can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -ocpuacct none /sys/fs/cgroup
-
-With the above step, the initial or the parent accounting group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-/sys/fs/cgroup/cpuacct.usage gives the CPU time (in nanoseconds) obtained
-by this group which is essentially the CPU time obtained by all the tasks
-in the system.
-
-New accounting groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it. CPU time consumed by this bash and its children
-can be obtained from g1/cpuacct.usage and the same is accumulated in
-/sys/fs/cgroup/cpuacct.usage also.
-
-cpuacct.stat file lists a few statistics which further divide the
-CPU time obtained by the cgroup into user and system times. Currently
-the following statistics are supported:
-
-user: Time spent by tasks of the cgroup in user mode.
-system: Time spent by tasks of the cgroup in kernel mode.
-
-user and system are in USER_HZ unit.
-
-cpuacct controller uses percpu_counter interface to collect user and
-system times. This has two side effects:
-
-- It is theoretically possible to see wrong values for user and system times.
- This is because percpu_counter_read() on 32bit systems isn't safe
- against concurrent writes.
-- It is possible to see slightly outdated values for user and system times
- due to the batch processing nature of percpu_counter.
diff --git a/Documentation/cgroup-v1/cpusets.txt b/Documentation/cgroup-v1/cpusets.txt
deleted file mode 100644
index 8402dd6de8df..000000000000
--- a/Documentation/cgroup-v1/cpusets.txt
+++ /dev/null
@@ -1,839 +0,0 @@
- CPUSETS
- -------
-
-Copyright (C) 2004 BULL SA.
-Written by Simon.Derr@bull.net
-
-Portions Copyright (c) 2004-2006 Silicon Graphics, Inc.
-Modified by Paul Jackson <pj@sgi.com>
-Modified by Christoph Lameter <cl@linux.com>
-Modified by Paul Menage <menage@google.com>
-Modified by Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
-
-CONTENTS:
-=========
-
-1. Cpusets
- 1.1 What are cpusets ?
- 1.2 Why are cpusets needed ?
- 1.3 How are cpusets implemented ?
- 1.4 What are exclusive cpusets ?
- 1.5 What is memory_pressure ?
- 1.6 What is memory spread ?
- 1.7 What is sched_load_balance ?
- 1.8 What is sched_relax_domain_level ?
- 1.9 How do I use cpusets ?
-2. Usage Examples and Syntax
- 2.1 Basic Usage
- 2.2 Adding/removing cpus
- 2.3 Setting flags
- 2.4 Attaching processes
-3. Questions
-4. Contact
-
-1. Cpusets
-==========
-
-1.1 What are cpusets ?
-----------------------
-
-Cpusets provide a mechanism for assigning a set of CPUs and Memory
-Nodes to a set of tasks. In this document "Memory Node" refers to
-an on-line node that contains memory.
-
-Cpusets constrain the CPU and Memory placement of tasks to only
-the resources within a task's current cpuset. They form a nested
-hierarchy visible in a virtual file system. These are the essential
-hooks, beyond what is already present, required to manage dynamic
-job placement on large systems.
-
-Cpusets use the generic cgroup subsystem described in
-Documentation/cgroup-v1/cgroups.txt.
-
-Requests by a task, using the sched_setaffinity(2) system call to
-include CPUs in its CPU affinity mask, and using the mbind(2) and
-set_mempolicy(2) system calls to include Memory Nodes in its memory
-policy, are both filtered through that task's cpuset, filtering out any
-CPUs or Memory Nodes not in that cpuset. The scheduler will not
-schedule a task on a CPU that is not allowed in its cpus_allowed
-vector, and the kernel page allocator will not allocate a page on a
-node that is not allowed in the requesting task's mems_allowed vector.
-
-User level code may create and destroy cpusets by name in the cgroup
-virtual file system, manage the attributes and permissions of these
-cpusets and which CPUs and Memory Nodes are assigned to each cpuset,
-specify and query to which cpuset a task is assigned, and list the
-task pids assigned to a cpuset.
-
-
-1.2 Why are cpusets needed ?
-----------------------------
-
-The management of large computer systems, with many processors (CPUs),
-complex memory cache hierarchies and multiple Memory Nodes having
-non-uniform access times (NUMA) presents additional challenges for
-the efficient scheduling and memory placement of processes.
-
-Frequently more modest sized systems can be operated with adequate
-efficiency just by letting the operating system automatically share
-the available CPU and Memory resources amongst the requesting tasks.
-
-But larger systems, which benefit more from careful processor and
-memory placement to reduce memory access times and contention,
-and which typically represent a larger investment for the customer,
-can benefit from explicitly placing jobs on properly sized subsets of
-the system.
-
-This can be especially valuable on:
-
- * Web Servers running multiple instances of the same web application,
- * Servers running different applications (for instance, a web server
- and a database), or
- * NUMA systems running large HPC applications with demanding
- performance characteristics.
-
-These subsets, or "soft partitions" must be able to be dynamically
-adjusted, as the job mix changes, without impacting other concurrently
-executing jobs. The location of the running jobs pages may also be moved
-when the memory locations are changed.
-
-The kernel cpuset patch provides the minimum essential kernel
-mechanisms required to efficiently implement such subsets. It
-leverages existing CPU and Memory Placement facilities in the Linux
-kernel to avoid any additional impact on the critical scheduler or
-memory allocator code.
-
-
-1.3 How are cpusets implemented ?
----------------------------------
-
-Cpusets provide a Linux kernel mechanism to constrain which CPUs and
-Memory Nodes are used by a process or set of processes.
-
-The Linux kernel already has a pair of mechanisms to specify on which
-CPUs a task may be scheduled (sched_setaffinity) and on which Memory
-Nodes it may obtain memory (mbind, set_mempolicy).
-
-Cpusets extends these two mechanisms as follows:
-
- - Cpusets are sets of allowed CPUs and Memory Nodes, known to the
- kernel.
- - Each task in the system is attached to a cpuset, via a pointer
- in the task structure to a reference counted cgroup structure.
- - Calls to sched_setaffinity are filtered to just those CPUs
- allowed in that task's cpuset.
- - Calls to mbind and set_mempolicy are filtered to just
- those Memory Nodes allowed in that task's cpuset.
- - The root cpuset contains all the systems CPUs and Memory
- Nodes.
- - For any cpuset, one can define child cpusets containing a subset
- of the parents CPU and Memory Node resources.
- - The hierarchy of cpusets can be mounted at /dev/cpuset, for
- browsing and manipulation from user space.
- - A cpuset may be marked exclusive, which ensures that no other
- cpuset (except direct ancestors and descendants) may contain
- any overlapping CPUs or Memory Nodes.
- - You can list all the tasks (by pid) attached to any cpuset.
-
-The implementation of cpusets requires a few, simple hooks
-into the rest of the kernel, none in performance critical paths:
-
- - in init/main.c, to initialize the root cpuset at system boot.
- - in fork and exit, to attach and detach a task from its cpuset.
- - in sched_setaffinity, to mask the requested CPUs by what's
- allowed in that task's cpuset.
- - in sched.c migrate_live_tasks(), to keep migrating tasks within
- the CPUs allowed by their cpuset, if possible.
- - in the mbind and set_mempolicy system calls, to mask the requested
- Memory Nodes by what's allowed in that task's cpuset.
- - in page_alloc.c, to restrict memory to allowed nodes.
- - in vmscan.c, to restrict page recovery to the current cpuset.
-
-You should mount the "cgroup" filesystem type in order to enable
-browsing and modifying the cpusets presently known to the kernel. No
-new system calls are added for cpusets - all support for querying and
-modifying cpusets is via this cpuset file system.
-
-The /proc/<pid>/status file for each task has four added lines,
-displaying the task's cpus_allowed (on which CPUs it may be scheduled)
-and mems_allowed (on which Memory Nodes it may obtain memory),
-in the two formats seen in the following example:
-
- Cpus_allowed: ffffffff,ffffffff,ffffffff,ffffffff
- Cpus_allowed_list: 0-127
- Mems_allowed: ffffffff,ffffffff
- Mems_allowed_list: 0-63
-
-Each cpuset is represented by a directory in the cgroup file system
-containing (on top of the standard cgroup files) the following
-files describing that cpuset:
-
- - cpuset.cpus: list of CPUs in that cpuset
- - cpuset.mems: list of Memory Nodes in that cpuset
- - cpuset.memory_migrate flag: if set, move pages to cpusets nodes
- - cpuset.cpu_exclusive flag: is cpu placement exclusive?
- - cpuset.mem_exclusive flag: is memory placement exclusive?
- - cpuset.mem_hardwall flag: is memory allocation hardwalled
- - cpuset.memory_pressure: measure of how much paging pressure in cpuset
- - cpuset.memory_spread_page flag: if set, spread page cache evenly on allowed nodes
- - cpuset.memory_spread_slab flag: if set, spread slab cache evenly on allowed nodes
- - cpuset.sched_load_balance flag: if set, load balance within CPUs on that cpuset
- - cpuset.sched_relax_domain_level: the searching range when migrating tasks
-
-In addition, only the root cpuset has the following file:
- - cpuset.memory_pressure_enabled flag: compute memory_pressure?
-
-New cpusets are created using the mkdir system call or shell
-command. The properties of a cpuset, such as its flags, allowed
-CPUs and Memory Nodes, and attached tasks, are modified by writing
-to the appropriate file in that cpusets directory, as listed above.
-
-The named hierarchical structure of nested cpusets allows partitioning
-a large system into nested, dynamically changeable, "soft-partitions".
-
-The attachment of each task, automatically inherited at fork by any
-children of that task, to a cpuset allows organizing the work load
-on a system into related sets of tasks such that each set is constrained
-to using the CPUs and Memory Nodes of a particular cpuset. A task
-may be re-attached to any other cpuset, if allowed by the permissions
-on the necessary cpuset file system directories.
-
-Such management of a system "in the large" integrates smoothly with
-the detailed placement done on individual tasks and memory regions
-using the sched_setaffinity, mbind and set_mempolicy system calls.
-
-The following rules apply to each cpuset:
-
- - Its CPUs and Memory Nodes must be a subset of its parents.
- - It can't be marked exclusive unless its parent is.
- - If its cpu or memory is exclusive, they may not overlap any sibling.
-
-These rules, and the natural hierarchy of cpusets, enable efficient
-enforcement of the exclusive guarantee, without having to scan all
-cpusets every time any of them change to ensure nothing overlaps a
-exclusive cpuset. Also, the use of a Linux virtual file system (vfs)
-to represent the cpuset hierarchy provides for a familiar permission
-and name space for cpusets, with a minimum of additional kernel code.
-
-The cpus and mems files in the root (top_cpuset) cpuset are
-read-only. The cpus file automatically tracks the value of
-cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_MEMORY]--i.e.,
-nodes with memory--using the cpuset_track_online_nodes() hook.
-
-
-1.4 What are exclusive cpusets ?
---------------------------------
-
-If a cpuset is cpu or mem exclusive, no other cpuset, other than
-a direct ancestor or descendant, may share any of the same CPUs or
-Memory Nodes.
-
-A cpuset that is cpuset.mem_exclusive *or* cpuset.mem_hardwall is "hardwalled",
-i.e. it restricts kernel allocations for page, buffer and other data
-commonly shared by the kernel across multiple users. All cpusets,
-whether hardwalled or not, restrict allocations of memory for user
-space. This enables configuring a system so that several independent
-jobs can share common kernel data, such as file system pages, while
-isolating each job's user allocation in its own cpuset. To do this,
-construct a large mem_exclusive cpuset to hold all the jobs, and
-construct child, non-mem_exclusive cpusets for each individual job.
-Only a small amount of typical kernel memory, such as requests from
-interrupt handlers, is allowed to be taken outside even a
-mem_exclusive cpuset.
-
-
-1.5 What is memory_pressure ?
------------------------------
-The memory_pressure of a cpuset provides a simple per-cpuset metric
-of the rate that the tasks in a cpuset are attempting to free up in
-use memory on the nodes of the cpuset to satisfy additional memory
-requests.
-
-This enables batch managers monitoring jobs running in dedicated
-cpusets to efficiently detect what level of memory pressure that job
-is causing.
-
-This is useful both on tightly managed systems running a wide mix of
-submitted jobs, which may choose to terminate or re-prioritize jobs that
-are trying to use more memory than allowed on the nodes assigned to them,
-and with tightly coupled, long running, massively parallel scientific
-computing jobs that will dramatically fail to meet required performance
-goals if they start to use more memory than allowed to them.
-
-This mechanism provides a very economical way for the batch manager
-to monitor a cpuset for signs of memory pressure. It's up to the
-batch manager or other user code to decide what to do about it and
-take action.
-
-==> Unless this feature is enabled by writing "1" to the special file
- /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
- code of __alloc_pages() for this metric reduces to simply noticing
- that the cpuset_memory_pressure_enabled flag is zero. So only
- systems that enable this feature will compute the metric.
-
-Why a per-cpuset, running average:
-
- Because this meter is per-cpuset, rather than per-task or mm,
- the system load imposed by a batch scheduler monitoring this
- metric is sharply reduced on large systems, because a scan of
- the tasklist can be avoided on each set of queries.
-
- Because this meter is a running average, instead of an accumulating
- counter, a batch scheduler can detect memory pressure with a
- single read, instead of having to read and accumulate results
- for a period of time.
-
- Because this meter is per-cpuset rather than per-task or mm,
- the batch scheduler can obtain the key information, memory
- pressure in a cpuset, with a single read, rather than having to
- query and accumulate results over all the (dynamically changing)
- set of tasks in the cpuset.
-
-A per-cpuset simple digital filter (requires a spinlock and 3 words
-of data per-cpuset) is kept, and updated by any task attached to that
-cpuset, if it enters the synchronous (direct) page reclaim code.
-
-A per-cpuset file provides an integer number representing the recent
-(half-life of 10 seconds) rate of direct page reclaims caused by
-the tasks in the cpuset, in units of reclaims attempted per second,
-times 1000.
-
-
-1.6 What is memory spread ?
----------------------------
-There are two boolean flag files per cpuset that control where the
-kernel allocates pages for the file system buffers and related in
-kernel data structures. They are called 'cpuset.memory_spread_page' and
-'cpuset.memory_spread_slab'.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_page' is set, then
-the kernel will spread the file system buffers (page cache) evenly
-over all the nodes that the faulting task is allowed to use, instead
-of preferring to put those pages on the node where the task is running.
-
-If the per-cpuset boolean flag file 'cpuset.memory_spread_slab' is set,
-then the kernel will spread some file system related slab caches,
-such as for inodes and dentries evenly over all the nodes that the
-faulting task is allowed to use, instead of preferring to put those
-pages on the node where the task is running.
-
-The setting of these flags does not affect anonymous data segment or
-stack segment pages of a task.
-
-By default, both kinds of memory spreading are off, and memory
-pages are allocated on the node local to where the task is running,
-except perhaps as modified by the task's NUMA mempolicy or cpuset
-configuration, so long as sufficient free memory pages are available.
-
-When new cpusets are created, they inherit the memory spread settings
-of their parent.
-
-Setting memory spreading causes allocations for the affected page
-or slab caches to ignore the task's NUMA mempolicy and be spread
-instead. Tasks using mbind() or set_mempolicy() calls to set NUMA
-mempolicies will not notice any change in these calls as a result of
-their containing task's memory spread settings. If memory spreading
-is turned off, then the currently specified NUMA mempolicy once again
-applies to memory page allocations.
-
-Both 'cpuset.memory_spread_page' and 'cpuset.memory_spread_slab' are boolean flag
-files. By default they contain "0", meaning that the feature is off
-for that cpuset. If a "1" is written to that file, then that turns
-the named feature on.
-
-The implementation is simple.
-
-Setting the flag 'cpuset.memory_spread_page' turns on a per-process flag
-PFA_SPREAD_PAGE for each task that is in that cpuset or subsequently
-joins that cpuset. The page allocation calls for the page cache
-is modified to perform an inline check for this PFA_SPREAD_PAGE task
-flag, and if set, a call to a new routine cpuset_mem_spread_node()
-returns the node to prefer for the allocation.
-
-Similarly, setting 'cpuset.memory_spread_slab' turns on the flag
-PFA_SPREAD_SLAB, and appropriately marked slab caches will allocate
-pages from the node returned by cpuset_mem_spread_node().
-
-The cpuset_mem_spread_node() routine is also simple. It uses the
-value of a per-task rotor cpuset_mem_spread_rotor to select the next
-node in the current task's mems_allowed to prefer for the allocation.
-
-This memory placement policy is also known (in other contexts) as
-round-robin or interleave.
-
-This policy can provide substantial improvements for jobs that need
-to place thread local data on the corresponding node, but that need
-to access large file system data sets that need to be spread across
-the several nodes in the jobs cpuset in order to fit. Without this
-policy, especially for jobs that might have one thread reading in the
-data set, the memory allocation across the nodes in the jobs cpuset
-can become very uneven.
-
-1.7 What is sched_load_balance ?
---------------------------------
-
-The kernel scheduler (kernel/sched/core.c) automatically load balances
-tasks. If one CPU is underutilized, kernel code running on that
-CPU will look for tasks on other more overloaded CPUs and move those
-tasks to itself, within the constraints of such placement mechanisms
-as cpusets and sched_setaffinity.
-
-The algorithmic cost of load balancing and its impact on key shared
-kernel data structures such as the task list increases more than
-linearly with the number of CPUs being balanced. So the scheduler
-has support to partition the systems CPUs into a number of sched
-domains such that it only load balances within each sched domain.
-Each sched domain covers some subset of the CPUs in the system;
-no two sched domains overlap; some CPUs might not be in any sched
-domain and hence won't be load balanced.
-
-Put simply, it costs less to balance between two smaller sched domains
-than one big one, but doing so means that overloads in one of the
-two domains won't be load balanced to the other one.
-
-By default, there is one sched domain covering all CPUs, including those
-marked isolated using the kernel boot time "isolcpus=" argument. However,
-the isolated CPUs will not participate in load balancing, and will not
-have tasks running on them unless explicitly assigned.
-
-This default load balancing across all CPUs is not well suited for
-the following two situations:
- 1) On large systems, load balancing across many CPUs is expensive.
- If the system is managed using cpusets to place independent jobs
- on separate sets of CPUs, full load balancing is unnecessary.
- 2) Systems supporting realtime on some CPUs need to minimize
- system overhead on those CPUs, including avoiding task load
- balancing if that is not needed.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is enabled (the default
-setting), it requests that all the CPUs in that cpusets allowed 'cpuset.cpus'
-be contained in a single sched domain, ensuring that load balancing
-can move a task (not otherwised pinned, as by sched_setaffinity)
-from any CPU in that cpuset to any other.
-
-When the per-cpuset flag "cpuset.sched_load_balance" is disabled, then the
-scheduler will avoid load balancing across the CPUs in that cpuset,
---except-- in so far as is necessary because some overlapping cpuset
-has "sched_load_balance" enabled.
-
-So, for example, if the top cpuset has the flag "cpuset.sched_load_balance"
-enabled, then the scheduler will have one sched domain covering all
-CPUs, and the setting of the "cpuset.sched_load_balance" flag in any other
-cpusets won't matter, as we're already fully load balancing.
-
-Therefore in the above two situations, the top cpuset flag
-"cpuset.sched_load_balance" should be disabled, and only some of the smaller,
-child cpusets have this flag enabled.
-
-When doing this, you don't usually want to leave any unpinned tasks in
-the top cpuset that might use non-trivial amounts of CPU, as such tasks
-may be artificially constrained to some subset of CPUs, depending on
-the particulars of this flag setting in descendant cpusets. Even if
-such a task could use spare CPU cycles in some other CPUs, the kernel
-scheduler might not consider the possibility of load balancing that
-task to that underused CPU.
-
-Of course, tasks pinned to a particular CPU can be left in a cpuset
-that disables "cpuset.sched_load_balance" as those tasks aren't going anywhere
-else anyway.
-
-There is an impedance mismatch here, between cpusets and sched domains.
-Cpusets are hierarchical and nest. Sched domains are flat; they don't
-overlap and each CPU is in at most one sched domain.
-
-It is necessary for sched domains to be flat because load balancing
-across partially overlapping sets of CPUs would risk unstable dynamics
-that would be beyond our understanding. So if each of two partially
-overlapping cpusets enables the flag 'cpuset.sched_load_balance', then we
-form a single sched domain that is a superset of both. We won't move
-a task to a CPU outside its cpuset, but the scheduler load balancing
-code might waste some compute cycles considering that possibility.
-
-This mismatch is why there is not a simple one-to-one relation
-between which cpusets have the flag "cpuset.sched_load_balance" enabled,
-and the sched domain configuration. If a cpuset enables the flag, it
-will get balancing across all its CPUs, but if it disables the flag,
-it will only be assured of no load balancing if no other overlapping
-cpuset enables the flag.
-
-If two cpusets have partially overlapping 'cpuset.cpus' allowed, and only
-one of them has this flag enabled, then the other may find its
-tasks only partially load balanced, just on the overlapping CPUs.
-This is just the general case of the top_cpuset example given a few
-paragraphs above. In the general case, as in the top cpuset case,
-don't leave tasks that might use non-trivial amounts of CPU in
-such partially load balanced cpusets, as they may be artificially
-constrained to some subset of the CPUs allowed to them, for lack of
-load balancing to the other CPUs.
-
-CPUs in "cpuset.isolcpus" were excluded from load balancing by the
-isolcpus= kernel boot option, and will never be load balanced regardless
-of the value of "cpuset.sched_load_balance" in any cpuset.
-
-1.7.1 sched_load_balance implementation details.
-------------------------------------------------
-
-The per-cpuset flag 'cpuset.sched_load_balance' defaults to enabled (contrary
-to most cpuset flags.) When enabled for a cpuset, the kernel will
-ensure that it can load balance across all the CPUs in that cpuset
-(makes sure that all the CPUs in the cpus_allowed of that cpuset are
-in the same sched domain.)
-
-If two overlapping cpusets both have 'cpuset.sched_load_balance' enabled,
-then they will be (must be) both in the same sched domain.
-
-If, as is the default, the top cpuset has 'cpuset.sched_load_balance' enabled,
-then by the above that means there is a single sched domain covering
-the whole system, regardless of any other cpuset settings.
-
-The kernel commits to user space that it will avoid load balancing
-where it can. It will pick as fine a granularity partition of sched
-domains as it can while still providing load balancing for any set
-of CPUs allowed to a cpuset having 'cpuset.sched_load_balance' enabled.
-
-The internal kernel cpuset to scheduler interface passes from the
-cpuset code to the scheduler code a partition of the load balanced
-CPUs in the system. This partition is a set of subsets (represented
-as an array of struct cpumask) of CPUs, pairwise disjoint, that cover
-all the CPUs that must be load balanced.
-
-The cpuset code builds a new such partition and passes it to the
-scheduler sched domain setup code, to have the sched domains rebuilt
-as necessary, whenever:
- - the 'cpuset.sched_load_balance' flag of a cpuset with non-empty CPUs changes,
- - or CPUs come or go from a cpuset with this flag enabled,
- - or 'cpuset.sched_relax_domain_level' value of a cpuset with non-empty CPUs
- and with this flag enabled changes,
- - or a cpuset with non-empty CPUs and with this flag enabled is removed,
- - or a cpu is offlined/onlined.
-
-This partition exactly defines what sched domains the scheduler should
-setup - one sched domain for each element (struct cpumask) in the
-partition.
-
-The scheduler remembers the currently active sched domain partitions.
-When the scheduler routine partition_sched_domains() is invoked from
-the cpuset code to update these sched domains, it compares the new
-partition requested with the current, and updates its sched domains,
-removing the old and adding the new, for each change.
-
-
-1.8 What is sched_relax_domain_level ?
---------------------------------------
-
-In sched domain, the scheduler migrates tasks in 2 ways; periodic load
-balance on tick, and at time of some schedule events.
-
-When a task is woken up, scheduler try to move the task on idle CPU.
-For example, if a task A running on CPU X activates another task B
-on the same CPU X, and if CPU Y is X's sibling and performing idle,
-then scheduler migrate task B to CPU Y so that task B can start on
-CPU Y without waiting task A on CPU X.
-
-And if a CPU run out of tasks in its runqueue, the CPU try to pull
-extra tasks from other busy CPUs to help them before it is going to
-be idle.
-
-Of course it takes some searching cost to find movable tasks and/or
-idle CPUs, the scheduler might not search all CPUs in the domain
-every time. In fact, in some architectures, the searching ranges on
-events are limited in the same socket or node where the CPU locates,
-while the load balance on tick searches all.
-
-For example, assume CPU Z is relatively far from CPU X. Even if CPU Z
-is idle while CPU X and the siblings are busy, scheduler can't migrate
-woken task B from X to Z since it is out of its searching range.
-As the result, task B on CPU X need to wait task A or wait load balance
-on the next tick. For some applications in special situation, waiting
-1 tick may be too long.
-
-The 'cpuset.sched_relax_domain_level' file allows you to request changing
-this searching range as you like. This file takes int value which
-indicates size of searching range in levels ideally as follows,
-otherwise initial value -1 that indicates the cpuset has no request.
-
- -1 : no request. use system default or follow request of others.
- 0 : no search.
- 1 : search siblings (hyperthreads in a core).
- 2 : search cores in a package.
- 3 : search cpus in a node [= system wide on non-NUMA system]
- 4 : search nodes in a chunk of node [on NUMA system]
- 5 : search system wide [on NUMA system]
-
-The system default is architecture dependent. The system default
-can be changed using the relax_domain_level= boot parameter.
-
-This file is per-cpuset and affect the sched domain where the cpuset
-belongs to. Therefore if the flag 'cpuset.sched_load_balance' of a cpuset
-is disabled, then 'cpuset.sched_relax_domain_level' have no effect since
-there is no sched domain belonging the cpuset.
-
-If multiple cpusets are overlapping and hence they form a single sched
-domain, the largest value among those is used. Be careful, if one
-requests 0 and others are -1 then 0 is used.
-
-Note that modifying this file will have both good and bad effects,
-and whether it is acceptable or not depends on your situation.
-Don't modify this file if you are not sure.
-
-If your situation is:
- - The migration costs between each cpu can be assumed considerably
- small(for you) due to your special application's behavior or
- special hardware support for CPU cache etc.
- - The searching cost doesn't have impact(for you) or you can make
- the searching cost enough small by managing cpuset to compact etc.
- - The latency is required even it sacrifices cache hit rate etc.
-then increasing 'sched_relax_domain_level' would benefit you.
-
-
-1.9 How do I use cpusets ?
---------------------------
-
-In order to minimize the impact of cpusets on critical kernel
-code, such as the scheduler, and due to the fact that the kernel
-does not support one task updating the memory placement of another
-task directly, the impact on a task of changing its cpuset CPU
-or Memory Node placement, or of changing to which cpuset a task
-is attached, is subtle.
-
-If a cpuset has its Memory Nodes modified, then for each task attached
-to that cpuset, the next time that the kernel attempts to allocate
-a page of memory for that task, the kernel will notice the change
-in the task's cpuset, and update its per-task memory placement to
-remain within the new cpusets memory placement. If the task was using
-mempolicy MPOL_BIND, and the nodes to which it was bound overlap with
-its new cpuset, then the task will continue to use whatever subset
-of MPOL_BIND nodes are still allowed in the new cpuset. If the task
-was using MPOL_BIND and now none of its MPOL_BIND nodes are allowed
-in the new cpuset, then the task will be essentially treated as if it
-was MPOL_BIND bound to the new cpuset (even though its NUMA placement,
-as queried by get_mempolicy(), doesn't change). If a task is moved
-from one cpuset to another, then the kernel will adjust the task's
-memory placement, as above, the next time that the kernel attempts
-to allocate a page of memory for that task.
-
-If a cpuset has its 'cpuset.cpus' modified, then each task in that cpuset
-will have its allowed CPU placement changed immediately. Similarly,
-if a task's pid is written to another cpuset's 'tasks' file, then its
-allowed CPU placement is changed immediately. If such a task had been
-bound to some subset of its cpuset using the sched_setaffinity() call,
-the task will be allowed to run on any CPU allowed in its new cpuset,
-negating the effect of the prior sched_setaffinity() call.
-
-In summary, the memory placement of a task whose cpuset is changed is
-updated by the kernel, on the next allocation of a page for that task,
-and the processor placement is updated immediately.
-
-Normally, once a page is allocated (given a physical page
-of main memory) then that page stays on whatever node it
-was allocated, so long as it remains allocated, even if the
-cpusets memory placement policy 'cpuset.mems' subsequently changes.
-If the cpuset flag file 'cpuset.memory_migrate' is set true, then when
-tasks are attached to that cpuset, any pages that task had
-allocated to it on nodes in its previous cpuset are migrated
-to the task's new cpuset. The relative placement of the page within
-the cpuset is preserved during these migration operations if possible.
-For example if the page was on the second valid node of the prior cpuset
-then the page will be placed on the second valid node of the new cpuset.
-
-Also if 'cpuset.memory_migrate' is set true, then if that cpuset's
-'cpuset.mems' file is modified, pages allocated to tasks in that
-cpuset, that were on nodes in the previous setting of 'cpuset.mems',
-will be moved to nodes in the new setting of 'mems.'
-Pages that were not in the task's prior cpuset, or in the cpuset's
-prior 'cpuset.mems' setting, will not be moved.
-
-There is an exception to the above. If hotplug functionality is used
-to remove all the CPUs that are currently assigned to a cpuset,
-then all the tasks in that cpuset will be moved to the nearest ancestor
-with non-empty cpus. But the moving of some (or all) tasks might fail if
-cpuset is bound with another cgroup subsystem which has some restrictions
-on task attaching. In this failing case, those tasks will stay
-in the original cpuset, and the kernel will automatically update
-their cpus_allowed to allow all online CPUs. When memory hotplug
-functionality for removing Memory Nodes is available, a similar exception
-is expected to apply there as well. In general, the kernel prefers to
-violate cpuset placement, over starving a task that has had all
-its allowed CPUs or Memory Nodes taken offline.
-
-There is a second exception to the above. GFP_ATOMIC requests are
-kernel internal allocations that must be satisfied, immediately.
-The kernel may drop some request, in rare cases even panic, if a
-GFP_ATOMIC alloc fails. If the request cannot be satisfied within
-the current task's cpuset, then we relax the cpuset, and look for
-memory anywhere we can find it. It's better to violate the cpuset
-than stress the kernel.
-
-To start a new job that is to be contained within a cpuset, the steps are:
-
- 1) mkdir /sys/fs/cgroup/cpuset
- 2) mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- 3) Create the new cpuset by doing mkdir's and write's (or echo's) in
- the /sys/fs/cgroup/cpuset virtual file system.
- 4) Start a task that will be the "founding father" of the new job.
- 5) Attach that task to the new cpuset by writing its pid to the
- /sys/fs/cgroup/cpuset tasks file for that cpuset.
- 6) fork, exec or clone the job tasks from this founding father task.
-
-For example, the following sequence of commands will setup a cpuset
-named "Charlie", containing just CPUs 2 and 3, and Memory Node 1,
-and then start a subshell 'sh' in that cpuset:
-
- mount -t cgroup -ocpuset cpuset /sys/fs/cgroup/cpuset
- cd /sys/fs/cgroup/cpuset
- mkdir Charlie
- cd Charlie
- /bin/echo 2-3 > cpuset.cpus
- /bin/echo 1 > cpuset.mems
- /bin/echo $$ > tasks
- sh
- # The subshell 'sh' is now running in cpuset Charlie
- # The next line should display '/Charlie'
- cat /proc/self/cpuset
-
-There are ways to query or modify cpusets:
- - via the cpuset file system directly, using the various cd, mkdir, echo,
- cat, rmdir commands from the shell, or their equivalent from C.
- - via the C library libcpuset.
- - via the C library libcgroup.
- (http://sourceforge.net/projects/libcg/)
- - via the python application cset.
- (http://code.google.com/p/cpuset/)
-
-The sched_setaffinity calls can also be done at the shell prompt using
-SGI's runon or Robert Love's taskset. The mbind and set_mempolicy
-calls can be done at the shell prompt using the numactl command
-(part of Andi Kleen's numa package).
-
-2. Usage Examples and Syntax
-============================
-
-2.1 Basic Usage
----------------
-
-Creating, modifying, using the cpusets can be done through the cpuset
-virtual filesystem.
-
-To mount it, type:
-# mount -t cgroup -o cpuset cpuset /sys/fs/cgroup/cpuset
-
-Then under /sys/fs/cgroup/cpuset you can find a tree that corresponds to the
-tree of the cpusets in the system. For instance, /sys/fs/cgroup/cpuset
-is the cpuset that holds the whole system.
-
-If you want to create a new cpuset under /sys/fs/cgroup/cpuset:
-# cd /sys/fs/cgroup/cpuset
-# mkdir my_cpuset
-
-Now you want to do something with this cpuset.
-# cd my_cpuset
-
-In this directory you can find several files:
-# ls
-cgroup.clone_children cpuset.memory_pressure
-cgroup.event_control cpuset.memory_spread_page
-cgroup.procs cpuset.memory_spread_slab
-cpuset.cpu_exclusive cpuset.mems
-cpuset.cpus cpuset.sched_load_balance
-cpuset.mem_exclusive cpuset.sched_relax_domain_level
-cpuset.mem_hardwall notify_on_release
-cpuset.memory_migrate tasks
-
-Reading them will give you information about the state of this cpuset:
-the CPUs and Memory Nodes it can use, the processes that are using
-it, its properties. By writing to these files you can manipulate
-the cpuset.
-
-Set some flags:
-# /bin/echo 1 > cpuset.cpu_exclusive
-
-Add some cpus:
-# /bin/echo 0-7 > cpuset.cpus
-
-Add some mems:
-# /bin/echo 0-7 > cpuset.mems
-
-Now attach your shell to this cpuset:
-# /bin/echo $$ > tasks
-
-You can also create cpusets inside your cpuset by using mkdir in this
-directory.
-# mkdir my_sub_cs
-
-To remove a cpuset, just use rmdir:
-# rmdir my_sub_cs
-This will fail if the cpuset is in use (has cpusets inside, or has
-processes attached).
-
-Note that for legacy reasons, the "cpuset" filesystem exists as a
-wrapper around the cgroup filesystem.
-
-The command
-
-mount -t cpuset X /sys/fs/cgroup/cpuset
-
-is equivalent to
-
-mount -t cgroup -ocpuset,noprefix X /sys/fs/cgroup/cpuset
-echo "/sbin/cpuset_release_agent" > /sys/fs/cgroup/cpuset/release_agent
-
-2.2 Adding/removing cpus
-------------------------
-
-This is the syntax to use when writing in the cpus or mems files
-in cpuset directories:
-
-# /bin/echo 1-4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
-# /bin/echo 1,2,3,4 > cpuset.cpus -> set cpus list to cpus 1,2,3,4
-
-To add a CPU to a cpuset, write the new list of CPUs including the
-CPU to be added. To add 6 to the above cpuset:
-
-# /bin/echo 1-4,6 > cpuset.cpus -> set cpus list to cpus 1,2,3,4,6
-
-Similarly to remove a CPU from a cpuset, write the new list of CPUs
-without the CPU to be removed.
-
-To remove all the CPUs:
-
-# /bin/echo "" > cpuset.cpus -> clear cpus list
-
-2.3 Setting flags
------------------
-
-The syntax is very simple:
-
-# /bin/echo 1 > cpuset.cpu_exclusive -> set flag 'cpuset.cpu_exclusive'
-# /bin/echo 0 > cpuset.cpu_exclusive -> unset flag 'cpuset.cpu_exclusive'
-
-2.4 Attaching processes
------------------------
-
-# /bin/echo PID > tasks
-
-Note that it is PID, not PIDs. You can only attach ONE task at a time.
-If you have several tasks to attach, you have to do it one after another:
-
-# /bin/echo PID1 > tasks
-# /bin/echo PID2 > tasks
- ...
-# /bin/echo PIDn > tasks
-
-
-3. Questions
-============
-
-Q: what's up with this '/bin/echo' ?
-A: bash's builtin 'echo' command does not check calls to write() against
- errors. If you use it in the cpuset file system, you won't be
- able to tell whether a command succeeded or failed.
-
-Q: When I attach processes, only the first of the line gets really attached !
-A: We can only return one error code per call to write(). So you should also
- put only ONE pid.
-
-4. Contact
-==========
-
-Web: http://www.bullopensource.org/cpuset
diff --git a/Documentation/cgroup-v1/devices.txt b/Documentation/cgroup-v1/devices.txt
deleted file mode 100644
index 3c1095ca02ea..000000000000
--- a/Documentation/cgroup-v1/devices.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-Device Whitelist Controller
-
-1. Description:
-
-Implement a cgroup to track and enforce open and mknod restrictions
-on device files. A device cgroup associates a device access
-whitelist with each cgroup. A whitelist entry has 4 fields.
-'type' is a (all), c (char), or b (block). 'all' means it applies
-to all types and all major and minor numbers. Major and minor are
-either an integer or * for all. Access is a composition of r
-(read), w (write), and m (mknod).
-
-The root device cgroup starts with rwm to 'all'. A child device
-cgroup gets a copy of the parent. Administrators can then remove
-devices from the whitelist or add new entries. A child cgroup can
-never receive a device access which is denied by its parent.
-
-2. User Interface
-
-An entry is added using devices.allow, and removed using
-devices.deny. For instance
-
- echo 'c 1:3 mr' > /sys/fs/cgroup/1/devices.allow
-
-allows cgroup 1 to read and mknod the device usually known as
-/dev/null. Doing
-
- echo a > /sys/fs/cgroup/1/devices.deny
-
-will remove the default 'a *:* rwm' entry. Doing
-
- echo a > /sys/fs/cgroup/1/devices.allow
-
-will add the 'a *:* rwm' entry to the whitelist.
-
-3. Security
-
-Any task can move itself between cgroups. This clearly won't
-suffice, but we can decide the best way to adequately restrict
-movement as people get some experience with this. We may just want
-to require CAP_SYS_ADMIN, which at least is a separate bit from
-CAP_MKNOD. We may want to just refuse moving to a cgroup which
-isn't a descendant of the current one. Or we may want to use
-CAP_MAC_ADMIN, since we really are trying to lock down root.
-
-CAP_SYS_ADMIN is needed to modify the whitelist or move another
-task to a new cgroup. (Again we'll probably want to change that).
-
-A cgroup may not be granted more permissions than the cgroup's
-parent has.
-
-4. Hierarchy
-
-device cgroups maintain hierarchy by making sure a cgroup never has more
-access permissions than its parent. Every time an entry is written to
-a cgroup's devices.deny file, all its children will have that entry removed
-from their whitelist and all the locally set whitelist entries will be
-re-evaluated. In case one of the locally set whitelist entries would provide
-more access than the cgroup's parent, it'll be removed from the whitelist.
-
-Example:
- A
- / \
- B
-
- group behavior exceptions
- A allow "b 8:* rwm", "c 116:1 rw"
- B deny "c 1:3 rwm", "c 116:2 rwm", "b 3:* rwm"
-
-If a device is denied in group A:
- # echo "c 116:* r" > A/devices.deny
-it'll propagate down and after revalidating B's entries, the whitelist entry
-"c 116:2 rwm" will be removed:
-
- group whitelist entries denied devices
- A all "b 8:* rwm", "c 116:* rw"
- B "c 1:3 rwm", "b 3:* rwm" all the rest
-
-In case parent's exceptions change and local exceptions are not allowed
-anymore, they'll be deleted.
-
-Notice that new whitelist entries will not be propagated:
- A
- / \
- B
-
- group whitelist entries denied devices
- A "c 1:3 rwm", "c 1:5 r" all the rest
- B "c 1:3 rwm", "c 1:5 r" all the rest
-
-when adding "c *:3 rwm":
- # echo "c *:3 rwm" >A/devices.allow
-
-the result:
- group whitelist entries denied devices
- A "c *:3 rwm", "c 1:5 r" all the rest
- B "c 1:3 rwm", "c 1:5 r" all the rest
-
-but now it'll be possible to add new entries to B:
- # echo "c 2:3 rwm" >B/devices.allow
- # echo "c 50:3 r" >B/devices.allow
-or even
- # echo "c *:3 rwm" >B/devices.allow
-
-Allowing or denying all by writing 'a' to devices.allow or devices.deny will
-not be possible once the device cgroups has children.
-
-4.1 Hierarchy (internal implementation)
-
-device cgroups is implemented internally using a behavior (ALLOW, DENY) and a
-list of exceptions. The internal state is controlled using the same user
-interface to preserve compatibility with the previous whitelist-only
-implementation. Removal or addition of exceptions that will reduce the access
-to devices will be propagated down the hierarchy.
-For every propagated exception, the effective rules will be re-evaluated based
-on current parent's access rules.
diff --git a/Documentation/cgroup-v1/freezer-subsystem.txt b/Documentation/cgroup-v1/freezer-subsystem.txt
deleted file mode 100644
index e831cb2b8394..000000000000
--- a/Documentation/cgroup-v1/freezer-subsystem.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-The cgroup freezer is useful to batch job management system which start
-and stop sets of tasks in order to schedule the resources of a machine
-according to the desires of a system administrator. This sort of program
-is often used on HPC clusters to schedule access to the cluster as a
-whole. The cgroup freezer uses cgroups to describe the set of tasks to
-be started/stopped by the batch job management system. It also provides
-a means to start and stop the tasks composing the job.
-
-The cgroup freezer will also be useful for checkpointing running groups
-of tasks. The freezer allows the checkpoint code to obtain a consistent
-image of the tasks by attempting to force the tasks in a cgroup into a
-quiescent state. Once the tasks are quiescent another task can
-walk /proc or invoke a kernel interface to gather information about the
-quiesced tasks. Checkpointed tasks can be restarted later should a
-recoverable error occur. This also allows the checkpointed tasks to be
-migrated between nodes in a cluster by copying the gathered information
-to another node and restarting the tasks there.
-
-Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping
-and resuming tasks in userspace. Both of these signals are observable
-from within the tasks we wish to freeze. While SIGSTOP cannot be caught,
-blocked, or ignored it can be seen by waiting or ptracing parent tasks.
-SIGCONT is especially unsuitable since it can be caught by the task. Any
-programs designed to watch for SIGSTOP and SIGCONT could be broken by
-attempting to use SIGSTOP and SIGCONT to stop and resume tasks. We can
-demonstrate this problem using nested bash shells:
-
- $ echo $$
- 16644
- $ bash
- $ echo $$
- 16690
-
- From a second, unrelated bash shell:
- $ kill -SIGSTOP 16690
- $ kill -SIGCONT 16690
-
- <at this point 16690 exits and causes 16644 to exit too>
-
-This happens because bash can observe both signals and choose how it
-responds to them.
-
-Another example of a program which catches and responds to these
-signals is gdb. In fact any program designed to use ptrace is likely to
-have a problem with this method of stopping and resuming tasks.
-
-In contrast, the cgroup freezer uses the kernel freezer code to
-prevent the freeze/unfreeze cycle from becoming visible to the tasks
-being frozen. This allows the bash example above and gdb to run as
-expected.
-
-The cgroup freezer is hierarchical. Freezing a cgroup freezes all
-tasks belonging to the cgroup and all its descendant cgroups. Each
-cgroup has its own state (self-state) and the state inherited from the
-parent (parent-state). Iff both states are THAWED, the cgroup is
-THAWED.
-
-The following cgroupfs files are created by cgroup freezer.
-
-* freezer.state: Read-write.
-
- When read, returns the effective state of the cgroup - "THAWED",
- "FREEZING" or "FROZEN". This is the combined self and parent-states.
- If any is freezing, the cgroup is freezing (FREEZING or FROZEN).
-
- FREEZING cgroup transitions into FROZEN state when all tasks
- belonging to the cgroup and its descendants become frozen. Note that
- a cgroup reverts to FREEZING from FROZEN after a new task is added
- to the cgroup or one of its descendant cgroups until the new task is
- frozen.
-
- When written, sets the self-state of the cgroup. Two values are
- allowed - "FROZEN" and "THAWED". If FROZEN is written, the cgroup,
- if not already freezing, enters FREEZING state along with all its
- descendant cgroups.
-
- If THAWED is written, the self-state of the cgroup is changed to
- THAWED. Note that the effective state may not change to THAWED if
- the parent-state is still freezing. If a cgroup's effective state
- becomes THAWED, all its descendants which are freezing because of
- the cgroup also leave the freezing state.
-
-* freezer.self_freezing: Read only.
-
- Shows the self-state. 0 if the self-state is THAWED; otherwise, 1.
- This value is 1 iff the last write to freezer.state was "FROZEN".
-
-* freezer.parent_freezing: Read only.
-
- Shows the parent-state. 0 if none of the cgroup's ancestors is
- frozen; otherwise, 1.
-
-The root cgroup is non-freezable and the above interface files don't
-exist.
-
-* Examples of usage :
-
- # mkdir /sys/fs/cgroup/freezer
- # mount -t cgroup -ofreezer freezer /sys/fs/cgroup/freezer
- # mkdir /sys/fs/cgroup/freezer/0
- # echo $some_pid > /sys/fs/cgroup/freezer/0/tasks
-
-to get status of the freezer subsystem :
-
- # cat /sys/fs/cgroup/freezer/0/freezer.state
- THAWED
-
-to freeze all tasks in the container :
-
- # echo FROZEN > /sys/fs/cgroup/freezer/0/freezer.state
- # cat /sys/fs/cgroup/freezer/0/freezer.state
- FREEZING
- # cat /sys/fs/cgroup/freezer/0/freezer.state
- FROZEN
-
-to unfreeze all tasks in the container :
-
- # echo THAWED > /sys/fs/cgroup/freezer/0/freezer.state
- # cat /sys/fs/cgroup/freezer/0/freezer.state
- THAWED
-
-This is the basic mechanism which should do the right thing for user space task
-in a simple scenario.
diff --git a/Documentation/cgroup-v1/hugetlb.txt b/Documentation/cgroup-v1/hugetlb.txt
deleted file mode 100644
index 106245c3aecc..000000000000
--- a/Documentation/cgroup-v1/hugetlb.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-HugeTLB Controller
--------------------
-
-The HugeTLB controller allows to limit the HugeTLB usage per control group and
-enforces the controller limit during page fault. Since HugeTLB doesn't
-support page reclaim, enforcing the limit at page fault time implies that,
-the application will get SIGBUS signal if it tries to access HugeTLB pages
-beyond its limit. This requires the application to know beforehand how much
-HugeTLB pages it would require for its use.
-
-HugeTLB controller can be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -o hugetlb none /sys/fs/cgroup
-
-With the above step, the initial or the parent HugeTLB group becomes
-visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in
-the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup.
-
-New groups can be created under the parent group /sys/fs/cgroup.
-
-# cd /sys/fs/cgroup
-# mkdir g1
-# echo $$ > g1/tasks
-
-The above steps create a new group g1 and move the current shell
-process (bash) into it.
-
-Brief summary of control files
-
- hugetlb.<hugepagesize>.limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage
- hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
- hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
- hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit
-
-For a system supporting two hugepage size (16M and 16G) the control
-files include:
-
-hugetlb.16GB.limit_in_bytes
-hugetlb.16GB.max_usage_in_bytes
-hugetlb.16GB.usage_in_bytes
-hugetlb.16GB.failcnt
-hugetlb.16MB.limit_in_bytes
-hugetlb.16MB.max_usage_in_bytes
-hugetlb.16MB.usage_in_bytes
-hugetlb.16MB.failcnt
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
deleted file mode 100644
index 621e29ffb358..000000000000
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ /dev/null
@@ -1,280 +0,0 @@
-Memory Resource Controller(Memcg) Implementation Memo.
-Last Updated: 2010/2
-Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34).
-
-Because VM is getting complex (one of reasons is memcg...), memcg's behavior
-is complex. This is a document for memcg's internal behavior.
-Please note that implementation details can be changed.
-
-(*) Topics on API should be in Documentation/cgroup-v1/memory.txt)
-
-0. How to record usage ?
- 2 objects are used.
-
- page_cgroup ....an object per page.
- Allocated at boot or memory hotplug. Freed at memory hot removal.
-
- swap_cgroup ... an entry per swp_entry.
- Allocated at swapon(). Freed at swapoff().
-
- The page_cgroup has USED bit and double count against a page_cgroup never
- occurs. swap_cgroup is used only when a charged page is swapped-out.
-
-1. Charge
-
- a page/swp_entry may be charged (usage += PAGE_SIZE) at
-
- mem_cgroup_try_charge()
-
-2. Uncharge
- a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
-
- mem_cgroup_uncharge()
- Called when a page's refcount goes down to 0.
-
- mem_cgroup_uncharge_swap()
- Called when swp_entry's refcnt goes down to 0. A charge against swap
- disappears.
-
-3. charge-commit-cancel
- Memcg pages are charged in two steps:
- mem_cgroup_try_charge()
- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
-
- At try_charge(), there are no flags to say "this page is charged".
- at this point, usage += PAGE_SIZE.
-
- At commit(), the page is associated with the memcg.
-
- At cancel(), simply usage -= PAGE_SIZE.
-
-Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
-
-4. Anonymous
- Anonymous page is newly allocated at
- - page fault into MAP_ANONYMOUS mapping.
- - Copy-On-Write.
-
- 4.1 Swap-in.
- At swap-in, the page is taken from swap-cache. There are 2 cases.
-
- (a) If the SwapCache is newly allocated and read, it has no charges.
- (b) If the SwapCache has been mapped by processes, it has been
- charged already.
-
- 4.2 Swap-out.
- At swap-out, typical state transition is below.
-
- (a) add to swap cache. (marked as SwapCache)
- swp_entry's refcnt += 1.
- (b) fully unmapped.
- swp_entry's refcnt += # of ptes.
- (c) write back to swap.
- (d) delete from swap cache. (remove from SwapCache)
- swp_entry's refcnt -= 1.
-
-
- Finally, at task exit,
- (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-
-5. Page Cache
- Page Cache is charged at
- - add_to_page_cache_locked().
-
- The logic is very clear. (About migration, see below)
- Note: __remove_from_page_cache() is called by remove_from_page_cache()
- and __remove_mapping().
-
-6. Shmem(tmpfs) Page Cache
- The best way to understand shmem's page state transition is to read
- mm/shmem.c.
- But brief explanation of the behavior of memcg around shmem will be
- helpful to understand the logic.
-
- Shmem's page (just leaf page, not direct/indirect block) can be on
- - radix-tree of shmem's inode.
- - SwapCache.
- - Both on radix-tree and SwapCache. This happens at swap-in
- and swap-out,
-
- It's charged when...
- - A new page is added to shmem's radix-tree.
- - A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-
-7. Page Migration
-
- mem_cgroup_migrate()
-
-8. LRU
- Each memcg has its own private LRU. Now, its handling is under global
- VM's control (means that it's handled under global pgdat->lru_lock).
- Almost all routines around memcg's LRU is called by global LRU's
- list management functions under pgdat->lru_lock.
-
- A special function is mem_cgroup_isolate_pages(). This scans
- memcg's private LRU and call __isolate_lru_page() to extract a page
- from LRU.
- (By __isolate_lru_page(), the page is removed from both of global and
- private LRU.)
-
-
-9. Typical Tests.
-
- Tests for racy cases.
-
- 9.1 Small limit to memcg.
- When you do test to do racy case, it's good test to set memcg's limit
- to be very small rather than GB. Many races found in the test under
- xKB or xxMB limits.
- (Memory behavior under GB and Memory behavior under MB shows very
- different situation.)
-
- 9.2 Shmem
- Historically, memcg's shmem handling was poor and we saw some amount
- of troubles here. This is because shmem is page-cache but can be
- SwapCache. Test with shmem/tmpfs is always good test.
-
- 9.3 Migration
- For NUMA, migration is an another special case. To do easy test, cpuset
- is useful. Following is a sample script to do migration.
-
- mount -t cgroup -o cpuset none /opt/cpuset
-
- mkdir /opt/cpuset/01
- echo 1 > /opt/cpuset/01/cpuset.cpus
- echo 0 > /opt/cpuset/01/cpuset.mems
- echo 1 > /opt/cpuset/01/cpuset.memory_migrate
- mkdir /opt/cpuset/02
- echo 1 > /opt/cpuset/02/cpuset.cpus
- echo 1 > /opt/cpuset/02/cpuset.mems
- echo 1 > /opt/cpuset/02/cpuset.memory_migrate
-
- In above set, when you moves a task from 01 to 02, page migration to
- node 0 to node 1 will occur. Following is a script to migrate all
- under cpuset.
- --
- move_task()
- {
- for pid in $1
- do
- /bin/echo $pid >$2/tasks 2>/dev/null
- echo -n $pid
- echo -n " "
- done
- echo END
- }
-
- G1_TASK=`cat ${G1}/tasks`
- G2_TASK=`cat ${G2}/tasks`
- move_task "${G1_TASK}" ${G2} &
- --
- 9.4 Memory hotplug.
- memory hotplug test is one of good test.
- to offline memory, do following.
- # echo offline > /sys/devices/system/memory/memoryXXX/state
- (XXX is the place of memory)
- This is an easy way to test page migration, too.
-
- 9.5 mkdir/rmdir
- When using hierarchy, mkdir/rmdir test should be done.
- Use tests like the following.
-
- echo 1 >/opt/cgroup/01/memory/use_hierarchy
- mkdir /opt/cgroup/01/child_a
- mkdir /opt/cgroup/01/child_b
-
- set limit to 01.
- add limit to 01/child_b
- run jobs under child_a and child_b
-
- create/delete following groups at random while jobs are running.
- /opt/cgroup/01/child_a/child_aa
- /opt/cgroup/01/child_b/child_bb
- /opt/cgroup/01/child_c
-
- running new jobs in new group is also good.
-
- 9.6 Mount with other subsystems.
- Mounting with other subsystems is a good test because there is a
- race and lock dependency with other cgroup subsystems.
-
- example)
- # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices
-
- and do task move, mkdir, rmdir etc...under this.
-
- 9.7 swapoff.
- Besides management of swap is one of complicated parts of memcg,
- call path of swap-in at swapoff is not same as usual swap-in path..
- It's worth to be tested explicitly.
-
- For example, test like following is good.
- (Shell-A)
- # mount -t cgroup none /cgroup -o memory
- # mkdir /cgroup/test
- # echo 40M > /cgroup/test/memory.limit_in_bytes
- # echo 0 > /cgroup/test/tasks
- Run malloc(100M) program under this. You'll see 60M of swaps.
- (Shell-B)
- # move all tasks in /cgroup/test to /cgroup
- # /sbin/swapoff -a
- # rmdir /cgroup/test
- # kill malloc task.
-
- Of course, tmpfs v.s. swapoff test should be tested, too.
-
- 9.8 OOM-Killer
- Out-of-memory caused by memcg's limit will kill tasks under
- the memcg. When hierarchy is used, a task under hierarchy
- will be killed by the kernel.
- In this case, panic_on_oom shouldn't be invoked and tasks
- in other groups shouldn't be killed.
-
- It's not difficult to cause OOM under memcg as following.
- Case A) when you can swapoff
- #swapoff -a
- #echo 50M > /memory.limit_in_bytes
- run 51M of malloc
-
- Case B) when you use mem+swap limitation.
- #echo 50M > memory.limit_in_bytes
- #echo 50M > memory.memsw.limit_in_bytes
- run 51M of malloc
-
- 9.9 Move charges at task migration
- Charges associated with a task can be moved along with task migration.
-
- (Shell-A)
- #mkdir /cgroup/A
- #echo $$ >/cgroup/A/tasks
- run some programs which uses some amount of memory in /cgroup/A.
-
- (Shell-B)
- #mkdir /cgroup/B
- #echo 1 >/cgroup/B/memory.move_charge_at_immigrate
- #echo "pid of the program running in group A" >/cgroup/B/tasks
-
- You can see charges have been moved by reading *.usage_in_bytes or
- memory.stat of both A and B.
- See 8.2 of Documentation/cgroup-v1/memory.txt to see what value should be
- written to move_charge_at_immigrate.
-
- 9.10 Memory thresholds
- Memory controller implements memory thresholds using cgroups notification
- API. You can use tools/cgroup/cgroup_event_listener.c to test it.
-
- (Shell-A) Create cgroup and run event listener
- # mkdir /cgroup/A
- # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M
-
- (Shell-B) Add task to cgroup and try to allocate and free memory
- # echo $$ >/cgroup/A/tasks
- # a="$(dd if=/dev/zero bs=1M count=10)"
- # a=
-
- You will see message from cgroup_event_listener every time you cross
- the thresholds.
-
- Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds.
-
- It's good idea to test root cgroup as well.
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
deleted file mode 100644
index a33cedf85427..000000000000
--- a/Documentation/cgroup-v1/memory.txt
+++ /dev/null
@@ -1,892 +0,0 @@
-Memory Resource Controller
-
-NOTE: This document is hopelessly outdated and it asks for a complete
- rewrite. It still contains a useful information so we are keeping it
- here but make sure to check the current code if you need a deeper
- understanding.
-
-NOTE: The Memory Resource Controller has generically been referred to as the
- memory controller in this document. Do not confuse memory controller
- used here with the memory controller that is used in hardware.
-
-(For editors)
-In this document:
- When we mention a cgroup (cgroupfs's directory) with memory controller,
- we call it "memory cgroup". When you see git-log and source code, you'll
- see patch's title and function names tend to use "memcg".
- In this document, we avoid using it.
-
-Benefits and Purpose of the memory controller
-
-The memory controller isolates the memory behaviour of a group of tasks
-from the rest of the system. The article on LWN [12] mentions some probable
-uses of the memory controller. The memory controller can be used to
-
-a. Isolate an application or a group of applications
- Memory-hungry applications can be isolated and limited to a smaller
- amount of memory.
-b. Create a cgroup with a limited amount of memory; this can be used
- as a good alternative to booting with mem=XXXX.
-c. Virtualization solutions can control the amount of memory they want
- to assign to a virtual machine instance.
-d. A CD/DVD burner could control the amount of memory used by the
- rest of the system to ensure that burning does not fail due to lack
- of available memory.
-e. There are several other use cases; find one or use the controller just
- for fun (to learn and hack on the VM subsystem).
-
-Current Status: linux-2.6.34-mmotm(development version of 2010/April)
-
-Features:
- - accounting anonymous pages, file caches, swap caches usage and limiting them.
- - pages are linked to per-memcg LRU exclusively, and there is no global LRU.
- - optionally, memory+swap usage can be accounted and limited.
- - hierarchical accounting
- - soft limit
- - moving (recharging) account at moving a task is selectable.
- - usage threshold notifier
- - memory pressure notifier
- - oom-killer disable knob and oom-notifier
- - Root cgroup has no limit controls.
-
- Kernel memory support is a work in progress, and the current version provides
- basically functionality. (See Section 2.7)
-
-Brief summary of control files.
-
- tasks # attach a task(thread) and show list of threads
- cgroup.procs # show list of processes
- cgroup.event_control # an interface for event_fd()
- memory.usage_in_bytes # show current usage for memory
- (See 5.5 for details)
- memory.memsw.usage_in_bytes # show current usage for memory+Swap
- (See 5.5 for details)
- memory.limit_in_bytes # set/show limit of memory usage
- memory.memsw.limit_in_bytes # set/show limit of memory+Swap usage
- memory.failcnt # show the number of memory usage hits limits
- memory.memsw.failcnt # show the number of memory+Swap hits limits
- memory.max_usage_in_bytes # show max memory usage recorded
- memory.memsw.max_usage_in_bytes # show max memory+Swap usage recorded
- memory.soft_limit_in_bytes # set/show soft limit of memory usage
- memory.stat # show various statistics
- memory.use_hierarchy # set/show hierarchical account enabled
- memory.force_empty # trigger forced page reclaim
- memory.pressure_level # set memory pressure notifications
- memory.swappiness # set/show swappiness parameter of vmscan
- (See sysctl's vm.swappiness)
- memory.move_charge_at_immigrate # set/show controls of moving charges
- memory.oom_control # set/show oom controls.
- memory.numa_stat # show the number of memory usage per numa node
-
- memory.kmem.limit_in_bytes # set/show hard limit for kernel memory
- memory.kmem.usage_in_bytes # show current kernel memory allocation
- memory.kmem.failcnt # show the number of kernel memory usage hits limits
- memory.kmem.max_usage_in_bytes # show max kernel memory usage recorded
-
- memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory
- memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation
- memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits
- memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded
-
-1. History
-
-The memory controller has a long history. A request for comments for the memory
-controller was posted by Balbir Singh [1]. At the time the RFC was posted
-there were several implementations for memory control. The goal of the
-RFC was to build consensus and agreement for the minimal features required
-for memory control. The first RSS controller was posted by Balbir Singh[2]
-in Feb 2007. Pavel Emelianov [3][4][5] has since posted three versions of the
-RSS controller. At OLS, at the resource management BoF, everyone suggested
-that we handle both page cache and RSS together. Another request was raised
-to allow user space handling of OOM. The current memory controller is
-at version 6; it combines both mapped (RSS) and unmapped Page
-Cache Control [11].
-
-2. Memory Control
-
-Memory is a unique resource in the sense that it is present in a limited
-amount. If a task requires a lot of CPU processing, the task can spread
-its processing over a period of hours, days, months or years, but with
-memory, the same physical memory needs to be reused to accomplish the task.
-
-The memory controller implementation has been divided into phases. These
-are:
-
-1. Memory controller
-2. mlock(2) controller
-3. Kernel user memory accounting and slab control
-4. user mappings length controller
-
-The memory controller is the first controller developed.
-
-2.1. Design
-
-The core of the design is a counter called the page_counter. The
-page_counter tracks the current memory usage and limit of the group of
-processes associated with the controller. Each cgroup has a memory controller
-specific data structure (mem_cgroup) associated with it.
-
-2.2. Accounting
-
- +--------------------+
- | mem_cgroup |
- | (page_counter) |
- +--------------------+
- / ^ \
- / | \
- +---------------+ | +---------------+
- | mm_struct | |.... | mm_struct |
- | | | | |
- +---------------+ | +---------------+
- |
- + --------------+
- |
- +---------------+ +------+--------+
- | page +----------> page_cgroup|
- | | | |
- +---------------+ +---------------+
-
- (Figure 1: Hierarchy of Accounting)
-
-
-Figure 1 shows the important aspects of the controller
-
-1. Accounting happens per cgroup
-2. Each mm_struct knows about which cgroup it belongs to
-3. Each page has a pointer to the page_cgroup, which in turn knows the
- cgroup it belongs to
-
-The accounting is done as follows: mem_cgroup_charge_common() is invoked to
-set up the necessary data structures and check if the cgroup that is being
-charged is over its limit. If it is, then reclaim is invoked on the cgroup.
-More details can be found in the reclaim section of this document.
-If everything goes well, a page meta-data-structure called page_cgroup is
-updated. page_cgroup has its own LRU on cgroup.
-(*) page_cgroup structure is allocated at boot/memory-hotplug time.
-
-2.2.1 Accounting details
-
-All mapped anon pages (RSS) and cache pages (Page Cache) are accounted.
-Some pages which are never reclaimable and will not be on the LRU
-are not accounted. We just account pages under usual VM management.
-
-RSS pages are accounted at page_fault unless they've already been accounted
-for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
-processes, duplicate accounting is carefully avoided.
-
-An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
-unmapped (by kswapd), they may exist as SwapCache in the system until they
-are really freed. Such SwapCaches are also accounted.
-A swapped-in page is not accounted until it's mapped.
-
-Note: The kernel does swapin-readahead and reads multiple swaps at once.
-This means swapped-in pages may contain pages for other tasks than a task
-causing page fault. So, we avoid accounting at swap-in I/O.
-
-At page migration, accounting information is kept.
-
-Note: we just account pages-on-LRU because our purpose is to control amount
-of used pages; not-on-LRU pages tend to be out-of-control from VM view.
-
-2.3 Shared Page Accounting
-
-Shared pages are accounted on the basis of the first touch approach. The
-cgroup that first touches a page is accounted for the page. The principle
-behind this approach is that a cgroup that aggressively uses a shared
-page will eventually get charged for it (once it is uncharged from
-the cgroup that brought it in -- this will happen on memory pressure).
-
-But see section 8.2: when moving a task to another cgroup, its pages may
-be recharged to the new cgroup, if move_charge_at_immigrate has been chosen.
-
-Exception: If CONFIG_MEMCG_SWAP is not used.
-When you do swapoff and make swapped-out pages of shmem(tmpfs) to
-be backed into memory in force, charges for pages are accounted against the
-caller of swapoff rather than the users of shmem.
-
-2.4 Swap Extension (CONFIG_MEMCG_SWAP)
-
-Swap Extension allows you to record charge for swap. A swapped-in page is
-charged back to original page allocator if possible.
-
-When swap is accounted, following files are added.
- - memory.memsw.usage_in_bytes.
- - memory.memsw.limit_in_bytes.
-
-memsw means memory+swap. Usage of memory+swap is limited by
-memsw.limit_in_bytes.
-
-Example: Assume a system with 4G of swap. A task which allocates 6G of memory
-(by mistake) under 2G memory limitation will use all swap.
-In this case, setting memsw.limit_in_bytes=3G will prevent bad use of swap.
-By using the memsw limit, you can avoid system OOM which can be caused by swap
-shortage.
-
-* why 'memory+swap' rather than swap.
-The global LRU(kswapd) can swap out arbitrary pages. Swap-out means
-to move account from memory to swap...there is no change in usage of
-memory+swap. In other words, when we want to limit the usage of swap without
-affecting global LRU, memory+swap limit is better than just limiting swap from
-an OS point of view.
-
-* What happens when a cgroup hits memory.memsw.limit_in_bytes
-When a cgroup hits memory.memsw.limit_in_bytes, it's useless to do swap-out
-in this cgroup. Then, swap-out will not be done by cgroup routine and file
-caches are dropped. But as mentioned above, global LRU can do swapout memory
-from it for sanity of the system's memory management state. You can't forbid
-it by cgroup.
-
-2.5 Reclaim
-
-Each cgroup maintains a per cgroup LRU which has the same structure as
-global VM. When a cgroup goes over its limit, we first try
-to reclaim memory from the cgroup so as to make space for the new
-pages that the cgroup has touched. If the reclaim is unsuccessful,
-an OOM routine is invoked to select and kill the bulkiest task in the
-cgroup. (See 10. OOM Control below.)
-
-The reclaim algorithm has not been modified for cgroups, except that
-pages that are selected for reclaiming come from the per-cgroup LRU
-list.
-
-NOTE: Reclaim does not work for the root cgroup, since we cannot set any
-limits on the root cgroup.
-
-Note2: When panic_on_oom is set to "2", the whole system will panic.
-
-When oom event notifier is registered, event will be delivered.
-(See oom_control section)
-
-2.6 Locking
-
- lock_page_cgroup()/unlock_page_cgroup() should not be called under
- the i_pages lock.
-
- Other lock order is following:
- PG_locked.
- mm->page_table_lock
- pgdat->lru_lock
- lock_page_cgroup.
- In many cases, just lock_page_cgroup() is called.
- per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
- pgdat->lru_lock, it has no lock of its own.
-
-2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
-
-With the Kernel memory extension, the Memory Controller is able to limit
-the amount of kernel memory used by the system. Kernel memory is fundamentally
-different than user memory, since it can't be swapped out, which makes it
-possible to DoS the system by consuming too much of this precious resource.
-
-Kernel memory accounting is enabled for all memory cgroups by default. But
-it can be disabled system-wide by passing cgroup.memory=nokmem to the kernel
-at boot time. In this case, kernel memory will not be accounted at all.
-
-Kernel memory limits are not imposed for the root cgroup. Usage for the root
-cgroup may or may not be accounted. The memory used is accumulated into
-memory.kmem.usage_in_bytes, or in a separate counter when it makes sense.
-(currently only for tcp).
-The main "kmem" counter is fed into the main counter, so kmem charges will
-also be visible from the user counter.
-
-Currently no soft limit is implemented for kernel memory. It is future work
-to trigger slab reclaim when those limits are reached.
-
-2.7.1 Current Kernel Memory resources accounted
-
-* stack pages: every process consumes some stack pages. By accounting into
-kernel memory, we prevent new processes from being created when the kernel
-memory usage is too high.
-
-* slab pages: pages allocated by the SLAB or SLUB allocator are tracked. A copy
-of each kmem_cache is created every time the cache is touched by the first time
-from inside the memcg. The creation is done lazily, so some objects can still be
-skipped while the cache is being created. All objects in a slab page should
-belong to the same memcg. This only fails to hold when a task is migrated to a
-different memcg during the page allocation by the cache.
-
-* sockets memory pressure: some sockets protocols have memory pressure
-thresholds. The Memory Controller allows them to be controlled individually
-per cgroup, instead of globally.
-
-* tcp memory pressure: sockets memory pressure for the tcp protocol.
-
-2.7.2 Common use cases
-
-Because the "kmem" counter is fed to the main user counter, kernel memory can
-never be limited completely independently of user memory. Say "U" is the user
-limit, and "K" the kernel limit. There are three possible ways limits can be
-set:
-
- U != 0, K = unlimited:
- This is the standard memcg limitation mechanism already present before kmem
- accounting. Kernel memory is completely ignored.
-
- U != 0, K < U:
- Kernel memory is a subset of the user memory. This setup is useful in
- deployments where the total amount of memory per-cgroup is overcommited.
- Overcommiting kernel memory limits is definitely not recommended, since the
- box can still run out of non-reclaimable memory.
- In this case, the admin could set up K so that the sum of all groups is
- never greater than the total memory, and freely set U at the cost of his
- QoS.
- WARNING: In the current implementation, memory reclaim will NOT be
- triggered for a cgroup when it hits K while staying below U, which makes
- this setup impractical.
-
- U != 0, K >= U:
- Since kmem charges will also be fed to the user counter and reclaim will be
- triggered for the cgroup for both kinds of memory. This setup gives the
- admin a unified view of memory, and it is also useful for people who just
- want to track kernel memory usage.
-
-3. User Interface
-
-3.0. Configuration
-
-a. Enable CONFIG_CGROUPS
-b. Enable CONFIG_MEMCG
-c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
-d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
-
-3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
-# mount -t tmpfs none /sys/fs/cgroup
-# mkdir /sys/fs/cgroup/memory
-# mount -t cgroup none /sys/fs/cgroup/memory -o memory
-
-3.2. Make the new group and move bash into it
-# mkdir /sys/fs/cgroup/memory/0
-# echo $$ > /sys/fs/cgroup/memory/0/tasks
-
-Since now we're in the 0 cgroup, we can alter the memory limit:
-# echo 4M > /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-
-NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
-mega or gigabytes. (Here, Kilo, Mega, Giga are Kibibytes, Mebibytes, Gibibytes.)
-
-NOTE: We can write "-1" to reset the *.limit_in_bytes(unlimited).
-NOTE: We cannot set limits on the root cgroup any more.
-
-# cat /sys/fs/cgroup/memory/0/memory.limit_in_bytes
-4194304
-
-We can check the usage:
-# cat /sys/fs/cgroup/memory/0/memory.usage_in_bytes
-1216512
-
-A successful write to this file does not guarantee a successful setting of
-this limit to the value written into the file. This can be due to a
-number of factors, such as rounding up to page boundaries or the total
-availability of memory on the system. The user is required to re-read
-this file after a write to guarantee the value committed by the kernel.
-
-# echo 1 > memory.limit_in_bytes
-# cat memory.limit_in_bytes
-4096
-
-The memory.failcnt field gives the number of times that the cgroup limit was
-exceeded.
-
-The memory.stat file gives accounting information. Now, the number of
-caches, RSS and Active pages/Inactive pages are shown.
-
-4. Testing
-
-For testing features and implementation, see memcg_test.txt.
-
-Performance test is also important. To see pure memory controller's overhead,
-testing on tmpfs will give you good numbers of small overheads.
-Example: do kernel make on tmpfs.
-
-Page-fault scalability is also important. At measuring parallel
-page fault test, multi-process test may be better than multi-thread
-test because it has noise of shared objects/status.
-
-But the above two are testing extreme situations.
-Trying usual test under memory controller is always helpful.
-
-4.1 Troubleshooting
-
-Sometimes a user might find that the application under a cgroup is
-terminated by the OOM killer. There are several causes for this:
-
-1. The cgroup limit is too low (just too low to do anything useful)
-2. The user is using anonymous memory and swap is turned off or too low
-
-A sync followed by echo 1 > /proc/sys/vm/drop_caches will help get rid of
-some of the pages cached in the cgroup (page cache pages).
-
-To know what happens, disabling OOM_Kill as per "10. OOM Control" (below) and
-seeing what happens will be helpful.
-
-4.2 Task migration
-
-When a task migrates from one cgroup to another, its charge is not
-carried forward by default. The pages allocated from the original cgroup still
-remain charged to it, the charge is dropped when the page is freed or
-reclaimed.
-
-You can move charges of a task along with task migration.
-See 8. "Move charges at task migration"
-
-4.3 Removing a cgroup
-
-A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a
-cgroup might have some charge associated with it, even though all
-tasks have migrated away from it. (because we charge against pages, not
-against tasks.)
-
-We move the stats to root (if use_hierarchy==0) or parent (if
-use_hierarchy==1), and no change on the charge except uncharging
-from the child.
-
-Charges recorded in swap information is not updated at removal of cgroup.
-Recorded information is discarded and a cgroup which uses swap (swapcache)
-will be charged as a new owner of it.
-
-About use_hierarchy, see Section 6.
-
-5. Misc. interfaces.
-
-5.1 force_empty
- memory.force_empty interface is provided to make cgroup's memory usage empty.
- When writing anything to this
-
- # echo 0 > memory.force_empty
-
- the cgroup will be reclaimed and as many pages reclaimed as possible.
-
- The typical use case for this interface is before calling rmdir().
- Though rmdir() offlines memcg, but the memcg may still stay there due to
- charged file caches. Some out-of-use page caches may keep charged until
- memory pressure happens. If you want to avoid that, force_empty will be useful.
-
- Also, note that when memory.kmem.limit_in_bytes is set the charges due to
- kernel pages will still be seen. This is not considered a failure and the
- write will still return success. In this case, it is expected that
- memory.kmem.usage_in_bytes == memory.usage_in_bytes.
-
- About use_hierarchy, see Section 6.
-
-5.2 stat file
-
-memory.stat file includes following statistics
-
-# per-memory cgroup local status
-cache - # of bytes of page cache memory.
-rss - # of bytes of anonymous and swap cache memory (includes
- transparent hugepages).
-rss_huge - # of bytes of anonymous transparent hugepages.
-mapped_file - # of bytes of mapped file (includes tmpfs/shmem)
-pgpgin - # of charging events to the memory cgroup. The charging
- event happens each time a page is accounted as either mapped
- anon page(RSS) or cache page(Page Cache) to the cgroup.
-pgpgout - # of uncharging events to the memory cgroup. The uncharging
- event happens each time a page is unaccounted from the cgroup.
-swap - # of bytes of swap usage
-dirty - # of bytes that are waiting to get written back to the disk.
-writeback - # of bytes of file/anon cache that are queued for syncing to
- disk.
-inactive_anon - # of bytes of anonymous and swap cache memory on inactive
- LRU list.
-active_anon - # of bytes of anonymous and swap cache memory on active
- LRU list.
-inactive_file - # of bytes of file-backed memory on inactive LRU list.
-active_file - # of bytes of file-backed memory on active LRU list.
-unevictable - # of bytes of memory that cannot be reclaimed (mlocked etc).
-
-# status considering hierarchy (see memory.use_hierarchy settings)
-
-hierarchical_memory_limit - # of bytes of memory limit with regard to hierarchy
- under which the memory cgroup is
-hierarchical_memsw_limit - # of bytes of memory+swap limit with regard to
- hierarchy under which memory cgroup is.
-
-total_<counter> - # hierarchical version of <counter>, which in
- addition to the cgroup's own value includes the
- sum of all hierarchical children's values of
- <counter>, i.e. total_cache
-
-# The following additional stats are dependent on CONFIG_DEBUG_VM.
-
-recent_rotated_anon - VM internal parameter. (see mm/vmscan.c)
-recent_rotated_file - VM internal parameter. (see mm/vmscan.c)
-recent_scanned_anon - VM internal parameter. (see mm/vmscan.c)
-recent_scanned_file - VM internal parameter. (see mm/vmscan.c)
-
-Memo:
- recent_rotated means recent frequency of LRU rotation.
- recent_scanned means recent # of scans to LRU.
- showing for better debug please see the code for meanings.
-
-Note:
- Only anonymous and swap cache memory is listed as part of 'rss' stat.
- This should not be confused with the true 'resident set size' or the
- amount of physical memory used by the cgroup.
- 'rss + mapped_file" will give you resident set size of cgroup.
- (Note: file and shmem may be shared among other cgroups. In that case,
- mapped_file is accounted only when the memory cgroup is owner of page
- cache.)
-
-5.3 swappiness
-
-Overrides /proc/sys/vm/swappiness for the particular group. The tunable
-in the root cgroup corresponds to the global swappiness setting.
-
-Please note that unlike during the global reclaim, limit reclaim
-enforces that 0 swappiness really prevents from any swapping even if
-there is a swap storage available. This might lead to memcg OOM killer
-if there are no file pages to reclaim.
-
-5.4 failcnt
-
-A memory cgroup provides memory.failcnt and memory.memsw.failcnt files.
-This failcnt(== failure count) shows the number of times that a usage counter
-hit its limit. When a memory cgroup hits a limit, failcnt increases and
-memory under it will be reclaimed.
-
-You can reset failcnt by writing 0 to failcnt file.
-# echo 0 > .../memory.failcnt
-
-5.5 usage_in_bytes
-
-For efficiency, as other kernel components, memory cgroup uses some optimization
-to avoid unnecessary cacheline false sharing. usage_in_bytes is affected by the
-method and doesn't show 'exact' value of memory (and swap) usage, it's a fuzz
-value for efficient access. (Of course, when necessary, it's synchronized.)
-If you want to know more exact memory usage, you should use RSS+CACHE(+SWAP)
-value in memory.stat(see 5.2).
-
-5.6 numa_stat
-
-This is similar to numa_maps but operates on a per-memcg basis. This is
-useful for providing visibility into the numa locality information within
-an memcg since the pages are allowed to be allocated from any physical
-node. One of the use cases is evaluating application performance by
-combining this information with the application's CPU allocation.
-
-Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable"
-per-node page counts including "hierarchical_<counter>" which sums up all
-hierarchical children's values in addition to the memcg's own value.
-
-The output format of memory.numa_stat is:
-
-total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
-file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
-anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
-hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
-
-The "total" count is sum of file + anon + unevictable.
-
-6. Hierarchy support
-
-The memory controller supports a deep hierarchy and hierarchical accounting.
-The hierarchy is created by creating the appropriate cgroups in the
-cgroup filesystem. Consider for example, the following cgroup filesystem
-hierarchy
-
- root
- / | \
- / | \
- a b c
- | \
- | \
- d e
-
-In the diagram above, with hierarchical accounting enabled, all memory
-usage of e, is accounted to its ancestors up until the root (i.e, c and root),
-that has memory.use_hierarchy enabled. If one of the ancestors goes over its
-limit, the reclaim algorithm reclaims from the tasks in the ancestor and the
-children of the ancestor.
-
-6.1 Enabling hierarchical accounting and reclaim
-
-A memory cgroup by default disables the hierarchy feature. Support
-can be enabled by writing 1 to memory.use_hierarchy file of the root cgroup
-
-# echo 1 > memory.use_hierarchy
-
-The feature can be disabled by
-
-# echo 0 > memory.use_hierarchy
-
-NOTE1: Enabling/disabling will fail if either the cgroup already has other
- cgroups created below it, or if the parent cgroup has use_hierarchy
- enabled.
-
-NOTE2: When panic_on_oom is set to "2", the whole system will panic in
- case of an OOM event in any cgroup.
-
-7. Soft limits
-
-Soft limits allow for greater sharing of memory. The idea behind soft limits
-is to allow control groups to use as much of the memory as needed, provided
-
-a. There is no memory contention
-b. They do not exceed their hard limit
-
-When the system detects memory contention or low memory, control groups
-are pushed back to their soft limits. If the soft limit of each control
-group is very high, they are pushed back as much as possible to make
-sure that one control group does not starve the others of memory.
-
-Please note that soft limits is a best-effort feature; it comes with
-no guarantees, but it does its best to make sure that when memory is
-heavily contended for, memory is allocated based on the soft limit
-hints/setup. Currently soft limit based reclaim is set up such that
-it gets invoked from balance_pgdat (kswapd).
-
-7.1 Interface
-
-Soft limits can be setup by using the following commands (in this example we
-assume a soft limit of 256 MiB)
-
-# echo 256M > memory.soft_limit_in_bytes
-
-If we want to change this to 1G, we can at any time use
-
-# echo 1G > memory.soft_limit_in_bytes
-
-NOTE1: Soft limits take effect over a long period of time, since they involve
- reclaiming memory for balancing between memory cgroups
-NOTE2: It is recommended to set the soft limit always below the hard limit,
- otherwise the hard limit will take precedence.
-
-8. Move charges at task migration
-
-Users can move charges associated with a task along with task migration, that
-is, uncharge task's pages from the old cgroup and charge them to the new cgroup.
-This feature is not supported in !CONFIG_MMU environments because of lack of
-page tables.
-
-8.1 Interface
-
-This feature is disabled by default. It can be enabled (and disabled again) by
-writing to memory.move_charge_at_immigrate of the destination cgroup.
-
-If you want to enable it:
-
-# echo (some positive value) > memory.move_charge_at_immigrate
-
-Note: Each bits of move_charge_at_immigrate has its own meaning about what type
- of charges should be moved. See 8.2 for details.
-Note: Charges are moved only when you move mm->owner, in other words,
- a leader of a thread group.
-Note: If we cannot find enough space for the task in the destination cgroup, we
- try to make space by reclaiming memory. Task migration may fail if we
- cannot make enough space.
-Note: It can take several seconds if you move charges much.
-
-And if you want disable it again:
-
-# echo 0 > memory.move_charge_at_immigrate
-
-8.2 Type of charges which can be moved
-
-Each bit in move_charge_at_immigrate has its own meaning about what type of
-charges should be moved. But in any case, it must be noted that an account of
-a page or a swap can be moved only when it is charged to the task's current
-(old) memory cgroup.
-
- bit | what type of charges would be moved ?
- -----+------------------------------------------------------------------------
- 0 | A charge of an anonymous page (or swap of it) used by the target task.
- | You must enable Swap Extension (see 2.4) to enable move of swap charges.
- -----+------------------------------------------------------------------------
- 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
- | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
- | anonymous pages, file pages (and swaps) in the range mmapped by the task
- | will be moved even if the task hasn't done page fault, i.e. they might
- | not be the task's "RSS", but other task's "RSS" that maps the same file.
- | And mapcount of the page is ignored (the page can be moved even if
- | page_mapcount(page) > 1). You must enable Swap Extension (see 2.4) to
- | enable move of swap charges.
-
-8.3 TODO
-
-- All of moving charge operations are done under cgroup_mutex. It's not good
- behavior to hold the mutex too long, so we may need some trick.
-
-9. Memory thresholds
-
-Memory cgroup implements memory thresholds using the cgroups notification
-API (see cgroups.txt). It allows to register multiple memory and memsw
-thresholds and gets notifications when it crosses.
-
-To register a threshold, an application must:
-- create an eventfd using eventfd(2);
-- open memory.usage_in_bytes or memory.memsw.usage_in_bytes;
-- write string like "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to
- cgroup.event_control.
-
-Application will be notified through eventfd when memory usage crosses
-threshold in any direction.
-
-It's applicable for root and non-root cgroup.
-
-10. OOM Control
-
-memory.oom_control file is for OOM notification and other controls.
-
-Memory cgroup implements OOM notifier using the cgroup notification
-API (See cgroups.txt). It allows to register multiple OOM notification
-delivery and gets notification when OOM happens.
-
-To register a notifier, an application must:
- - create an eventfd using eventfd(2)
- - open memory.oom_control file
- - write string like "<event_fd> <fd of memory.oom_control>" to
- cgroup.event_control
-
-The application will be notified through eventfd when OOM happens.
-OOM notification doesn't work for the root cgroup.
-
-You can disable the OOM-killer by writing "1" to memory.oom_control file, as:
-
- #echo 1 > memory.oom_control
-
-If OOM-killer is disabled, tasks under cgroup will hang/sleep
-in memory cgroup's OOM-waitqueue when they request accountable memory.
-
-For running them, you have to relax the memory cgroup's OOM status by
- * enlarge limit or reduce usage.
-To reduce usage,
- * kill some tasks.
- * move some tasks to other group with account migration.
- * remove some files (on tmpfs?)
-
-Then, stopped tasks will work again.
-
-At reading, current status of OOM is shown.
- oom_kill_disable 0 or 1 (if 1, oom-killer is disabled)
- under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may
- be stopped.)
-
-11. Memory Pressure
-
-The pressure level notifications can be used to monitor the memory
-allocation cost; based on the pressure, applications can implement
-different strategies of managing their memory resources. The pressure
-levels are defined as following:
-
-The "low" level means that the system is reclaiming memory for new
-allocations. Monitoring this reclaiming activity might be useful for
-maintaining cache level. Upon notification, the program (typically
-"Activity Manager") might analyze vmstat and act in advance (i.e.
-prematurely shutdown unimportant services).
-
-The "medium" level means that the system is experiencing medium memory
-pressure, the system might be making swap, paging out active file caches,
-etc. Upon this event applications may decide to further analyze
-vmstat/zoneinfo/memcg or internal memory usage statistics and free any
-resources that can be easily reconstructed or re-read from a disk.
-
-The "critical" level means that the system is actively thrashing, it is
-about to out of memory (OOM) or even the in-kernel OOM killer is on its
-way to trigger. Applications should do whatever they can to help the
-system. It might be too late to consult with vmstat or any other
-statistics, so it's advisable to take an immediate action.
-
-By default, events are propagated upward until the event is handled, i.e. the
-events are not pass-through. For example, you have three cgroups: A->B->C. Now
-you set up an event listener on cgroups A, B and C, and suppose group C
-experiences some pressure. In this situation, only group C will receive the
-notification, i.e. groups A and B will not receive it. This is done to avoid
-excessive "broadcasting" of messages, which disturbs the system and which is
-especially bad if we are low on memory or thrashing. Group B, will receive
-notification only if there are no event listers for group C.
-
-There are three optional modes that specify different propagation behavior:
-
- - "default": this is the default behavior specified above. This mode is the
- same as omitting the optional mode parameter, preserved by backwards
- compatibility.
-
- - "hierarchy": events always propagate up to the root, similar to the default
- behavior, except that propagation continues regardless of whether there are
- event listeners at each level, with the "hierarchy" mode. In the above
- example, groups A, B, and C will receive notification of memory pressure.
-
- - "local": events are pass-through, i.e. they only receive notifications when
- memory pressure is experienced in the memcg for which the notification is
- registered. In the above example, group C will receive notification if
- registered for "local" notification and the group experiences memory
- pressure. However, group B will never receive notification, regardless if
- there is an event listener for group C or not, if group B is registered for
- local notification.
-
-The level and event notification mode ("hierarchy" or "local", if necessary) are
-specified by a comma-delimited string, i.e. "low,hierarchy" specifies
-hierarchical, pass-through, notification for all ancestor memcgs. Notification
-that is the default, non pass-through behavior, does not specify a mode.
-"medium,local" specifies pass-through notification for the medium level.
-
-The file memory.pressure_level is only used to setup an eventfd. To
-register a notification, an application must:
-
-- create an eventfd using eventfd(2);
-- open memory.pressure_level;
-- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
- to cgroup.event_control.
-
-Application will be notified through eventfd when memory pressure is at
-the specific level (or higher). Read/write operations to
-memory.pressure_level are no implemented.
-
-Test:
-
- Here is a small script example that makes a new cgroup, sets up a
- memory limit, sets up a notification in the cgroup and then makes child
- cgroup experience a critical pressure:
-
- # cd /sys/fs/cgroup/memory/
- # mkdir foo
- # cd foo
- # cgroup_event_listener memory.pressure_level low,hierarchy &
- # echo 8000000 > memory.limit_in_bytes
- # echo 8000000 > memory.memsw.limit_in_bytes
- # echo $$ > tasks
- # dd if=/dev/zero | read x
-
- (Expect a bunch of notifications, and eventually, the oom-killer will
- trigger.)
-
-12. TODO
-
-1. Make per-cgroup scanner reclaim not-shared pages first
-2. Teach controller to account for shared-pages
-3. Start reclamation in the background when the limit is
- not yet hit but the usage is getting closer
-
-Summary
-
-Overall, the memory controller has been a stable controller and has been
-commented and discussed quite extensively in the community.
-
-References
-
-1. Singh, Balbir. RFC: Memory Controller, http://lwn.net/Articles/206697/
-2. Singh, Balbir. Memory Controller (RSS Control),
- http://lwn.net/Articles/222762/
-3. Emelianov, Pavel. Resource controllers based on process cgroups
- http://lkml.org/lkml/2007/3/6/198
-4. Emelianov, Pavel. RSS controller based on process cgroups (v2)
- http://lkml.org/lkml/2007/4/9/78
-5. Emelianov, Pavel. RSS controller based on process cgroups (v3)
- http://lkml.org/lkml/2007/5/30/244
-6. Menage, Paul. Control Groups v10, http://lwn.net/Articles/236032/
-7. Vaidyanathan, Srinivasan, Control Groups: Pagecache accounting and control
- subsystem (v3), http://lwn.net/Articles/235534/
-8. Singh, Balbir. RSS controller v2 test results (lmbench),
- http://lkml.org/lkml/2007/5/17/232
-9. Singh, Balbir. RSS controller v2 AIM9 results
- http://lkml.org/lkml/2007/5/18/1
-10. Singh, Balbir. Memory controller v6 test results,
- http://lkml.org/lkml/2007/8/19/36
-11. Singh, Balbir. Memory controller introduction (v6),
- http://lkml.org/lkml/2007/8/17/69
-12. Corbet, Jonathan, Controlling memory use in cgroups,
- http://lwn.net/Articles/243795/
diff --git a/Documentation/cgroup-v1/net_cls.txt b/Documentation/cgroup-v1/net_cls.txt
deleted file mode 100644
index ec182346dea2..000000000000
--- a/Documentation/cgroup-v1/net_cls.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Network classifier cgroup
--------------------------
-
-The Network classifier cgroup provides an interface to
-tag network packets with a class identifier (classid).
-
-The Traffic Controller (tc) can be used to assign
-different priorities to packets from different cgroups.
-Also, Netfilter (iptables) can use this tag to perform
-actions on such packets.
-
-Creating a net_cls cgroups instance creates a net_cls.classid file.
-This net_cls.classid value is initialized to 0.
-
-You can write hexadecimal values to net_cls.classid; the format for these
-values is 0xAAAABBBB; AAAA is the major handle number and BBBB
-is the minor handle number.
-Reading net_cls.classid yields a decimal result.
-
-Example:
-mkdir /sys/fs/cgroup/net_cls
-mount -t cgroup -onet_cls net_cls /sys/fs/cgroup/net_cls
-mkdir /sys/fs/cgroup/net_cls/0
-echo 0x100001 > /sys/fs/cgroup/net_cls/0/net_cls.classid
- - setting a 10:1 handle.
-
-cat /sys/fs/cgroup/net_cls/0/net_cls.classid
-1048577
-
-configuring tc:
-tc qdisc add dev eth0 root handle 10: htb
-
-tc class add dev eth0 parent 10: classid 10:1 htb rate 40mbit
- - creating traffic class 10:1
-
-tc filter add dev eth0 parent 10: protocol ip prio 10 handle 1: cgroup
-
-configuring iptables, basic example:
-iptables -A OUTPUT -m cgroup ! --cgroup 0x100001 -j DROP
diff --git a/Documentation/cgroup-v1/net_prio.txt b/Documentation/cgroup-v1/net_prio.txt
deleted file mode 100644
index a82cbd28ea8a..000000000000
--- a/Documentation/cgroup-v1/net_prio.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Network priority cgroup
--------------------------
-
-The Network priority cgroup provides an interface to allow an administrator to
-dynamically set the priority of network traffic generated by various
-applications
-
-Nominally, an application would set the priority of its traffic via the
-SO_PRIORITY socket option. This however, is not always possible because:
-
-1) The application may not have been coded to set this value
-2) The priority of application traffic is often a site-specific administrative
- decision rather than an application defined one.
-
-This cgroup allows an administrator to assign a process to a group which defines
-the priority of egress traffic on a given interface. Network priority groups can
-be created by first mounting the cgroup filesystem.
-
-# mount -t cgroup -onet_prio none /sys/fs/cgroup/net_prio
-
-With the above step, the initial group acting as the parent accounting group
-becomes visible at '/sys/fs/cgroup/net_prio'. This group includes all tasks in
-the system. '/sys/fs/cgroup/net_prio/tasks' lists the tasks in this cgroup.
-
-Each net_prio cgroup contains two files that are subsystem specific
-
-net_prio.prioidx
-This file is read-only, and is simply informative. It contains a unique integer
-value that the kernel uses as an internal representation of this cgroup.
-
-net_prio.ifpriomap
-This file contains a map of the priorities assigned to traffic originating from
-processes in this group and egressing the system on various interfaces. It
-contains a list of tuples in the form <ifname priority>. Contents of this file
-can be modified by echoing a string into the file using the same tuple format.
-for example:
-
-echo "eth0 5" > /sys/fs/cgroups/net_prio/iscsi/net_prio.ifpriomap
-
-This command would force any traffic originating from processes belonging to the
-iscsi net_prio cgroup and egressing on interface eth0 to have the priority of
-said traffic set to the value 5. The parent accounting group also has a
-writeable 'net_prio.ifpriomap' file that can be used to set a system default
-priority.
-
-Priorities are set immediately prior to queueing a frame to the device
-queueing discipline (qdisc) so priorities will be assigned prior to the hardware
-queue selection being made.
-
-One usage for the net_prio cgroup is with mqprio qdisc allowing application
-traffic to be steered to hardware/driver based traffic classes. These mappings
-can then be managed by administrators or other networking protocols such as
-DCBX.
-
-A new net_prio cgroup inherits the parent's configuration.
diff --git a/Documentation/cgroup-v1/pids.txt b/Documentation/cgroup-v1/pids.txt
deleted file mode 100644
index e105d708ccde..000000000000
--- a/Documentation/cgroup-v1/pids.txt
+++ /dev/null
@@ -1,88 +0,0 @@
- Process Number Controller
- =========================
-
-Abstract
---------
-
-The process number controller is used to allow a cgroup hierarchy to stop any
-new tasks from being fork()'d or clone()'d after a certain limit is reached.
-
-Since it is trivial to hit the task limit without hitting any kmemcg limits in
-place, PIDs are a fundamental resource. As such, PID exhaustion must be
-preventable in the scope of a cgroup hierarchy by allowing resource limiting of
-the number of tasks in a cgroup.
-
-Usage
------
-
-In order to use the `pids` controller, set the maximum number of tasks in
-pids.max (this is not available in the root cgroup for obvious reasons). The
-number of processes currently in the cgroup is given by pids.current.
-
-Organisational operations are not blocked by cgroup policies, so it is possible
-to have pids.current > pids.max. This can be done by either setting the limit to
-be smaller than pids.current, or attaching enough processes to the cgroup such
-that pids.current > pids.max. However, it is not possible to violate a cgroup
-policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
-creation of a new process would cause a cgroup policy to be violated.
-
-To set a cgroup to have no limit, set pids.max to "max". This is the default for
-all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
-limit in the hierarchy is followed).
-
-pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
-superset of parent/child/pids.current.
-
-The pids.events file contains event counters:
- - max: Number of times fork failed because limit was hit.
-
-Example
--------
-
-First, we mount the pids controller:
-# mkdir -p /sys/fs/cgroup/pids
-# mount -t cgroup -o pids none /sys/fs/cgroup/pids
-
-Then we create a hierarchy, set limits and attach processes to it:
-# mkdir -p /sys/fs/cgroup/pids/parent/child
-# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
-# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-#
-
-It should be noted that attempts to overcome the set limit (2 in this case) will
-fail:
-
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-Even if we migrate to a child cgroup (which doesn't have a set limit), we will
-not be able to overcome the most stringent limit in the hierarchy (in this case,
-parent's):
-
-# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
-# cat /sys/fs/cgroup/pids/parent/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.current
-2
-# cat /sys/fs/cgroup/pids/parent/child/pids.max
-max
-# ( /bin/echo "Here's some processes for you." | cat )
-sh: fork: Resource temporary unavailable
-#
-
-We can set a limit that is smaller than pids.current, which will stop any new
-processes from being forked at all (note that the shell itself counts towards
-pids.current):
-
-# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
-# /bin/echo "We can't even spawn a single process now."
-sh: fork: Resource temporary unavailable
-#
diff --git a/Documentation/cgroup-v1/rdma.txt b/Documentation/cgroup-v1/rdma.txt
deleted file mode 100644
index 9bdb7fd03f83..000000000000
--- a/Documentation/cgroup-v1/rdma.txt
+++ /dev/null
@@ -1,109 +0,0 @@
- RDMA Controller
- ----------------
-
-Contents
---------
-
-1. Overview
- 1-1. What is RDMA controller?
- 1-2. Why RDMA controller needed?
- 1-3. How is RDMA controller implemented?
-2. Usage Examples
-
-1. Overview
-
-1-1. What is RDMA controller?
------------------------------
-
-RDMA controller allows user to limit RDMA/IB specific resources that a given
-set of processes can use. These processes are grouped using RDMA controller.
-
-RDMA controller defines two resources which can be limited for processes of a
-cgroup.
-
-1-2. Why RDMA controller needed?
---------------------------------
-
-Currently user space applications can easily take away all the rdma verb
-specific resources such as AH, CQ, QP, MR etc. Due to which other applications
-in other cgroup or kernel space ULPs may not even get chance to allocate any
-rdma resources. This can lead to service unavailability.
-
-Therefore RDMA controller is needed through which resource consumption
-of processes can be limited. Through this controller different rdma
-resources can be accounted.
-
-1-3. How is RDMA controller implemented?
-----------------------------------------
-
-RDMA cgroup allows limit configuration of resources. Rdma cgroup maintains
-resource accounting per cgroup, per device using resource pool structure.
-Each such resource pool is limited up to 64 resources in given resource pool
-by rdma cgroup, which can be extended later if required.
-
-This resource pool object is linked to the cgroup css. Typically there
-are 0 to 4 resource pool instances per cgroup, per device in most use cases.
-But nothing limits to have it more. At present hundreds of RDMA devices per
-single cgroup may not be handled optimally, however there is no
-known use case or requirement for such configuration either.
-
-Since RDMA resources can be allocated from any process and can be freed by any
-of the child processes which shares the address space, rdma resources are
-always owned by the creator cgroup css. This allows process migration from one
-to other cgroup without major complexity of transferring resource ownership;
-because such ownership is not really present due to shared nature of
-rdma resources. Linking resources around css also ensures that cgroups can be
-deleted after processes migrated. This allow progress migration as well with
-active resources, even though that is not a primary use case.
-
-Whenever RDMA resource charging occurs, owner rdma cgroup is returned to
-the caller. Same rdma cgroup should be passed while uncharging the resource.
-This also allows process migrated with active RDMA resource to charge
-to new owner cgroup for new resource. It also allows to uncharge resource of
-a process from previously charged cgroup which is migrated to new cgroup,
-even though that is not a primary use case.
-
-Resource pool object is created in following situations.
-(a) User sets the limit and no previous resource pool exist for the device
-of interest for the cgroup.
-(b) No resource limits were configured, but IB/RDMA stack tries to
-charge the resource. So that it correctly uncharge them when applications are
-running without limits and later on when limits are enforced during uncharging,
-otherwise usage count will drop to negative.
-
-Resource pool is destroyed if all the resource limits are set to max and
-it is the last resource getting deallocated.
-
-User should set all the limit to max value if it intents to remove/unconfigure
-the resource pool for a particular device.
-
-IB stack honors limits enforced by the rdma controller. When application
-query about maximum resource limits of IB device, it returns minimum of
-what is configured by user for a given cgroup and what is supported by
-IB device.
-
-Following resources can be accounted by rdma controller.
- hca_handle Maximum number of HCA Handles
- hca_object Maximum number of HCA Objects
-
-2. Usage Examples
------------------
-
-(a) Configure resource limit:
-echo mlx4_0 hca_handle=2 hca_object=2000 > /sys/fs/cgroup/rdma/1/rdma.max
-echo ocrdma1 hca_handle=3 > /sys/fs/cgroup/rdma/2/rdma.max
-
-(b) Query resource limit:
-cat /sys/fs/cgroup/rdma/2/rdma.max
-#Output:
-mlx4_0 hca_handle=2 hca_object=2000
-ocrdma1 hca_handle=3 hca_object=max
-
-(c) Query current usage:
-cat /sys/fs/cgroup/rdma/2/rdma.current
-#Output:
-mlx4_0 hca_handle=1 hca_object=20
-ocrdma1 hca_handle=1 hca_object=23
-
-(d) Delete resource limit:
-echo echo mlx4_0 hca_handle=max hca_object=max > /sys/fs/cgroup/rdma/1/rdma.max
diff --git a/Documentation/cma/debugfs.txt b/Documentation/cma/debugfs.txt
deleted file mode 100644
index 6cef20a8cedc..000000000000
--- a/Documentation/cma/debugfs.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-The CMA debugfs interface is useful to retrieve basic information out of the
-different CMA areas and to test allocation/release in each of the areas.
-
-Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
-kernel's CMA index. So the first CMA zone would be:
-
- <debugfs>/cma/cma-0
-
-The structure of the files created under that directory is as follows:
-
- - [RO] base_pfn: The base PFN (Page Frame Number) of the zone.
- - [RO] count: Amount of memory in the CMA area.
- - [RO] order_per_bit: Order of pages represented by one bit.
- - [RO] bitmap: The bitmap of page states in the zone.
- - [WO] alloc: Allocate N pages from that CMA area. For example:
-
- echo 5 > <debugfs>/cma/cma-2/alloc
-
-would try to allocate 5 pages from the cma-2 area.
-
- - [WO] free: Free N pages from that CMA area, similar to the above.
diff --git a/Documentation/connector/connector.txt b/Documentation/connector/connector.txt
deleted file mode 100644
index ab7ca897fab7..000000000000
--- a/Documentation/connector/connector.txt
+++ /dev/null
@@ -1,196 +0,0 @@
-/*****************************************/
-Kernel Connector.
-/*****************************************/
-
-Kernel connector - new netlink based userspace <-> kernel space easy
-to use communication module.
-
-The Connector driver makes it easy to connect various agents using a
-netlink based network. One must register a callback and an identifier.
-When the driver receives a special netlink message with the appropriate
-identifier, the appropriate callback will be called.
-
-From the userspace point of view it's quite straightforward:
-
- socket();
- bind();
- send();
- recv();
-
-But if kernelspace wants to use the full power of such connections, the
-driver writer must create special sockets, must know about struct sk_buff
-handling, etc... The Connector driver allows any kernelspace agents to use
-netlink based networking for inter-process communication in a significantly
-easier way:
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
-void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
-
-struct cb_id
-{
- __u32 idx;
- __u32 val;
-};
-
-idx and val are unique identifiers which must be registered in the
-connector.h header for in-kernel usage. void (*callback) (void *) is a
-callback function which will be called when a message with above idx.val
-is received by the connector core. The argument for that function must
-be dereferenced to struct cn_msg *.
-
-struct cn_msg
-{
- struct cb_id id;
-
- __u32 seq;
- __u32 ack;
-
- __u32 len; /* Length of the following data */
- __u8 data[0];
-};
-
-/*****************************************/
-Connector interfaces.
-/*****************************************/
-
-int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
-
- Registers new callback with connector core.
-
- struct cb_id *id - unique connector's user identifier.
- It must be registered in connector.h for legal in-kernel users.
- char *name - connector's callback symbolic name.
- void (*callback) (struct cn..) - connector's callback.
- cn_msg and the sender's credentials
-
-
-void cn_del_callback(struct cb_id *id);
-
- Unregisters new callback with connector core.
-
- struct cb_id *id - unique connector's user identifier.
-
-
-int cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __groups, int gfp_mask);
-int cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __groups, int gfp_mask);
-
- Sends message to the specified groups. It can be safely called from
- softirq context, but may silently fail under strong memory pressure.
- If there are no listeners for given group -ESRCH can be returned.
-
- struct cn_msg * - message header(with attached data).
- u16 len - for *_multi multiple cn_msg messages can be sent
- u32 port - destination port.
- If non-zero the message will be sent to the
- given port, which should be set to the
- original sender.
- u32 __group - destination group.
- If port and __group is zero, then appropriate group will
- be searched through all registered connector users,
- and message will be delivered to the group which was
- created for user with the same ID as in msg.
- If __group is not zero, then message will be delivered
- to the specified group.
- int gfp_mask - GFP mask.
-
- Note: When registering new callback user, connector core assigns
- netlink group to the user which is equal to its id.idx.
-
-/*****************************************/
-Protocol description.
-/*****************************************/
-
-The current framework offers a transport layer with fixed headers. The
-recommended protocol which uses such a header is as following:
-
-msg->seq and msg->ack are used to determine message genealogy. When
-someone sends a message, they use a locally unique sequence and random
-acknowledge number. The sequence number may be copied into
-nlmsghdr->nlmsg_seq too.
-
-The sequence number is incremented with each message sent.
-
-If you expect a reply to the message, then the sequence number in the
-received message MUST be the same as in the original message, and the
-acknowledge number MUST be the same + 1.
-
-If we receive a message and its sequence number is not equal to one we
-are expecting, then it is a new message. If we receive a message and
-its sequence number is the same as one we are expecting, but its
-acknowledge is not equal to the sequence number in the original
-message + 1, then it is a new message.
-
-Obviously, the protocol header contains the above id.
-
-The connector allows event notification in the following form: kernel
-driver or userspace process can ask connector to notify it when
-selected ids will be turned on or off (registered or unregistered its
-callback). It is done by sending a special command to the connector
-driver (it also registers itself with id={-1, -1}).
-
-As example of this usage can be found in the cn_test.c module which
-uses the connector to request notification and to send messages.
-
-/*****************************************/
-Reliability.
-/*****************************************/
-
-Netlink itself is not a reliable protocol. That means that messages can
-be lost due to memory pressure or process' receiving queue overflowed,
-so caller is warned that it must be prepared. That is why the struct
-cn_msg [main connector's message header] contains u32 seq and u32 ack
-fields.
-
-/*****************************************/
-Userspace usage.
-/*****************************************/
-
-2.6.14 has a new netlink socket implementation, which by default does not
-allow people to send data to netlink groups other than 1.
-So, if you wish to use a netlink socket (for example using connector)
-with a different group number, the userspace application must subscribe to
-that group first. It can be achieved by the following pseudocode:
-
-s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
-
-l_local.nl_family = AF_NETLINK;
-l_local.nl_groups = 12345;
-l_local.nl_pid = 0;
-
-if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
- perror("bind");
- close(s);
- return -1;
-}
-
-{
- int on = l_local.nl_groups;
- setsockopt(s, 270, 1, &on, sizeof(on));
-}
-
-Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
-option. To drop a multicast subscription, one should call the above socket
-option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
-
-2.6.14 netlink code only allows to select a group which is less or equal to
-the maximum group number, which is used at netlink_kernel_create() time.
-In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
-group number 12345, you must increment CN_NETLINK_USERS to that number.
-Additional 0xf numbers are allocated to be used by non-in-kernel users.
-
-Due to this limitation, group 0xffffffff does not work now, so one can
-not use add/remove connector's group notifications, but as far as I know,
-only cn_test.c test module used it.
-
-Some work in netlink area is still being done, so things can be changed in
-2.6.15 timeframe, if it will happen, documentation will be updated for that
-kernel.
-
-/*****************************************/
-Code samples
-/*****************************************/
-
-Sample code for a connector test module and user space can be found
-in samples/connector/. To build this code, enable CONFIG_CONNECTOR
-and CONFIG_SAMPLES.
diff --git a/Documentation/console/console.txt b/Documentation/console/console.txt
deleted file mode 100644
index d73c2ab4beda..000000000000
--- a/Documentation/console/console.txt
+++ /dev/null
@@ -1,145 +0,0 @@
-Console Drivers
-===============
-
-The Linux kernel has 2 general types of console drivers. The first type is
-assigned by the kernel to all the virtual consoles during the boot process.
-This type will be called 'system driver', and only one system driver is allowed
-to exist. The system driver is persistent and it can never be unloaded, though
-it may become inactive.
-
-The second type has to be explicitly loaded and unloaded. This will be called
-'modular driver' by this document. Multiple modular drivers can coexist at
-any time with each driver sharing the console with other drivers including
-the system driver. However, modular drivers cannot take over the console
-that is currently occupied by another modular driver. (Exception: Drivers that
-call do_take_over_console() will succeed in the takeover regardless of the type
-of driver occupying the consoles.) They can only take over the console that is
-occupied by the system driver. In the same token, if the modular driver is
-released by the console, the system driver will take over.
-
-Modular drivers, from the programmer's point of view, have to call:
-
- do_take_over_console() - load and bind driver to console layer
- give_up_console() - unload driver; it will only work if driver
- is fully unbound
-
-In newer kernels, the following are also available:
-
- do_register_con_driver()
- do_unregister_con_driver()
-
-If sysfs is enabled, the contents of /sys/class/vtconsole can be
-examined. This shows the console backends currently registered by the
-system which are named vtcon<n> where <n> is an integer from 0 to 15. Thus:
-
- ls /sys/class/vtconsole
- . .. vtcon0 vtcon1
-
-Each directory in /sys/class/vtconsole has 3 files:
-
- ls /sys/class/vtconsole/vtcon0
- . .. bind name uevent
-
-What do these files signify?
-
- 1. bind - this is a read/write file. It shows the status of the driver if
- read, or acts to bind or unbind the driver to the virtual consoles
- when written to. The possible values are:
-
- 0 - means the driver is not bound and if echo'ed, commands the driver
- to unbind
-
- 1 - means the driver is bound and if echo'ed, commands the driver to
- bind
-
- 2. name - read-only file. Shows the name of the driver in this format:
-
- cat /sys/class/vtconsole/vtcon0/name
- (S) VGA+
-
- '(S)' stands for a (S)ystem driver, i.e., it cannot be directly
- commanded to bind or unbind
-
- 'VGA+' is the name of the driver
-
- cat /sys/class/vtconsole/vtcon1/name
- (M) frame buffer device
-
- In this case, '(M)' stands for a (M)odular driver, one that can be
- directly commanded to bind or unbind.
-
- 3. uevent - ignore this file
-
-When unbinding, the modular driver is detached first, and then the system
-driver takes over the consoles vacated by the driver. Binding, on the other
-hand, will bind the driver to the consoles that are currently occupied by a
-system driver.
-
-NOTE1: Binding and unbinding must be selected in Kconfig. It's under:
-
-Device Drivers -> Character devices -> Support for binding and unbinding
-console drivers
-
-NOTE2: If any of the virtual consoles are in KD_GRAPHICS mode, then binding or
-unbinding will not succeed. An example of an application that sets the console
-to KD_GRAPHICS is X.
-
-How useful is this feature? This is very useful for console driver
-developers. By unbinding the driver from the console layer, one can unload the
-driver, make changes, recompile, reload and rebind the driver without any need
-for rebooting the kernel. For regular users who may want to switch from
-framebuffer console to VGA console and vice versa, this feature also makes
-this possible. (NOTE NOTE NOTE: Please read fbcon.txt under Documentation/fb
-for more details.)
-
-Notes for developers:
-=====================
-
-do_take_over_console() is now broken up into:
-
- do_register_con_driver()
- do_bind_con_driver() - private function
-
-give_up_console() is a wrapper to do_unregister_con_driver(), and a driver must
-be fully unbound for this call to succeed. con_is_bound() will check if the
-driver is bound or not.
-
-Guidelines for console driver writers:
-=====================================
-
-In order for binding to and unbinding from the console to properly work,
-console drivers must follow these guidelines:
-
-1. All drivers, except system drivers, must call either do_register_con_driver()
- or do_take_over_console(). do_register_con_driver() will just add the driver
- to the console's internal list. It won't take over the
- console. do_take_over_console(), as it name implies, will also take over (or
- bind to) the console.
-
-2. All resources allocated during con->con_init() must be released in
- con->con_deinit().
-
-3. All resources allocated in con->con_startup() must be released when the
- driver, which was previously bound, becomes unbound. The console layer
- does not have a complementary call to con->con_startup() so it's up to the
- driver to check when it's legal to release these resources. Calling
- con_is_bound() in con->con_deinit() will help. If the call returned
- false(), then it's safe to release the resources. This balance has to be
- ensured because con->con_startup() can be called again when a request to
- rebind the driver to the console arrives.
-
-4. Upon exit of the driver, ensure that the driver is totally unbound. If the
- condition is satisfied, then the driver must call do_unregister_con_driver()
- or give_up_console().
-
-5. do_unregister_con_driver() can also be called on conditions which make it
- impossible for the driver to service console requests. This can happen
- with the framebuffer console that suddenly lost all of its drivers.
-
-The current crop of console drivers should still work correctly, but binding
-and unbinding them may cause problems. With minimal fixes, these drivers can
-be made to work correctly.
-
-==========================
-Antonino Daplas <adaplas@pol.net>
-
diff --git a/Documentation/core-api/circular-buffers.rst b/Documentation/core-api/circular-buffers.rst
index 53e51caa3347..50966f66e398 100644
--- a/Documentation/core-api/circular-buffers.rst
+++ b/Documentation/core-api/circular-buffers.rst
@@ -3,7 +3,7 @@ Circular Buffers
================
:Author: David Howells <dhowells@redhat.com>
-:Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+:Author: Paul E. McKenney <paulmck@linux.ibm.com>
Linux provides a number of features that can be used to implement circular
diff --git a/Documentation/gcc-plugins.txt b/Documentation/core-api/gcc-plugins.rst
index 8502f24396fb..8502f24396fb 100644
--- a/Documentation/gcc-plugins.txt
+++ b/Documentation/core-api/gcc-plugins.rst
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 322ac954b390..da0ed972d224 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -36,6 +36,7 @@ Core utilities
memory-hotplug
protection-keys
../RCU/index
+ gcc-plugins
Interfaces for kernel debugging
diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 824f24ccf401..08af5caf036d 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -54,7 +54,7 @@ The Linux kernel provides more basic utility functions.
Bit Operations
--------------
-.. kernel-doc:: arch/x86/include/asm/bitops.h
+.. kernel-doc:: include/asm-generic/bitops-instrumented.h
:internal:
Bitmap Operations
diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index 75d2bbe9813f..c6224d039bcb 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -119,7 +119,7 @@ Kernel Pointers
For printing kernel pointers which should be hidden from unprivileged
users. The behaviour of %pK depends on the kptr_restrict sysctl - see
-Documentation/sysctl/kernel.txt for more details.
+Documentation/admin-guide/sysctl/kernel.rst for more details.
Unmodified Addresses
--------------------
diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst
index 5f87d9c8b04d..c0ffa30c7c37 100644
--- a/Documentation/core-api/timekeeping.rst
+++ b/Documentation/core-api/timekeeping.rst
@@ -65,7 +65,7 @@ different format depending on what is required by the user:
.. c:function:: u64 ktime_get_ns( void )
u64 ktime_get_boottime_ns( void )
u64 ktime_get_real_ns( void )
- u64 ktime_get_tai_ns( void )
+ u64 ktime_get_clocktai_ns( void )
u64 ktime_get_raw_ns( void )
Same as the plain ktime_get functions, but returning a u64 number
@@ -99,16 +99,20 @@ Coarse and fast_ns access
Some additional variants exist for more specialized cases:
-.. c:function:: ktime_t ktime_get_coarse_boottime( void )
+.. c:function:: ktime_t ktime_get_coarse( void )
+ ktime_t ktime_get_coarse_boottime( void )
ktime_t ktime_get_coarse_real( void )
ktime_t ktime_get_coarse_clocktai( void )
- ktime_t ktime_get_coarse_raw( void )
+
+.. c:function:: u64 ktime_get_coarse_ns( void )
+ u64 ktime_get_coarse_boottime_ns( void )
+ u64 ktime_get_coarse_real_ns( void )
+ u64 ktime_get_coarse_clocktai_ns( void )
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
void ktime_get_coarse_real_ts64( struct timespec64 * )
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
- void ktime_get_coarse_raw_ts64( struct timespec64 * )
These are quicker than the non-coarse versions, but less accurate,
corresponding to CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE
diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt
index 073f128af5a7..55193e680250 100644
--- a/Documentation/cpu-freq/core.txt
+++ b/Documentation/cpu-freq/core.txt
@@ -95,7 +95,7 @@ flags - flags of the cpufreq driver
3. CPUFreq Table Generation with Operating Performance Point (OPP)
==================================================================
-For details about OPP, see Documentation/power/opp.txt
+For details about OPP, see Documentation/power/opp.rst
dev_pm_opp_init_cpufreq_table -
This function provides a ready to use conversion routine to translate
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
deleted file mode 100644
index cb61277e2308..000000000000
--- a/Documentation/cputopology.txt
+++ /dev/null
@@ -1,159 +0,0 @@
-===========================================
-How CPU topology info is exported via sysfs
-===========================================
-
-Export CPU topology info via sysfs. Items (attributes) are similar
-to /proc/cpuinfo output of some architectures. They reside in
-/sys/devices/system/cpu/cpuX/topology/:
-
-physical_package_id:
-
- physical package id of cpuX. Typically corresponds to a physical
- socket number, but the actual value is architecture and platform
- dependent.
-
-core_id:
-
- the CPU core ID of cpuX. Typically it is the hardware platform's
- identifier (rather than the kernel's). The actual value is
- architecture and platform dependent.
-
-book_id:
-
- the book ID of cpuX. Typically it is the hardware platform's
- identifier (rather than the kernel's). The actual value is
- architecture and platform dependent.
-
-drawer_id:
-
- the drawer ID of cpuX. Typically it is the hardware platform's
- identifier (rather than the kernel's). The actual value is
- architecture and platform dependent.
-
-thread_siblings:
-
- internal kernel map of cpuX's hardware threads within the same
- core as cpuX.
-
-thread_siblings_list:
-
- human-readable list of cpuX's hardware threads within the same
- core as cpuX.
-
-core_siblings:
-
- internal kernel map of cpuX's hardware threads within the same
- physical_package_id.
-
-core_siblings_list:
-
- human-readable list of cpuX's hardware threads within the same
- physical_package_id.
-
-book_siblings:
-
- internal kernel map of cpuX's hardware threads within the same
- book_id.
-
-book_siblings_list:
-
- human-readable list of cpuX's hardware threads within the same
- book_id.
-
-drawer_siblings:
-
- internal kernel map of cpuX's hardware threads within the same
- drawer_id.
-
-drawer_siblings_list:
-
- human-readable list of cpuX's hardware threads within the same
- drawer_id.
-
-Architecture-neutral, drivers/base/topology.c, exports these attributes.
-However, the book and drawer related sysfs files will only be created if
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively.
-
-CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390,
-where they reflect the cpu and cache hierarchy.
-
-For an architecture to support this feature, it must define some of
-these macros in include/asm-XXX/topology.h::
-
- #define topology_physical_package_id(cpu)
- #define topology_core_id(cpu)
- #define topology_book_id(cpu)
- #define topology_drawer_id(cpu)
- #define topology_sibling_cpumask(cpu)
- #define topology_core_cpumask(cpu)
- #define topology_book_cpumask(cpu)
- #define topology_drawer_cpumask(cpu)
-
-The type of ``**_id macros`` is int.
-The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter
-correspond with appropriate ``**_siblings`` sysfs attributes (except for
-topology_sibling_cpumask() which corresponds with thread_siblings).
-
-To be consistent on all architectures, include/linux/topology.h
-provides default definitions for any of the above macros that are
-not defined by include/asm-XXX/topology.h:
-
-1) topology_physical_package_id: -1
-2) topology_core_id: 0
-3) topology_sibling_cpumask: just the given CPU
-4) topology_core_cpumask: just the given CPU
-
-For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
-default definitions for topology_book_id() and topology_book_cpumask().
-For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are
-no default definitions for topology_drawer_id() and topology_drawer_cpumask().
-
-Additionally, CPU topology information is provided under
-/sys/devices/system/cpu and includes these files. The internal
-source for the output is in brackets ("[]").
-
- =========== ==========================================================
- kernel_max: the maximum CPU index allowed by the kernel configuration.
- [NR_CPUS-1]
-
- offline: CPUs that are not online because they have been
- HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit
- of CPUs allowed by the kernel configuration (kernel_max
- above). [~cpu_online_mask + cpus >= NR_CPUS]
-
- online: CPUs that are online and being scheduled [cpu_online_mask]
-
- possible: CPUs that have been allocated resources and can be
- brought online if they are present. [cpu_possible_mask]
-
- present: CPUs that have been identified as being present in the
- system. [cpu_present_mask]
- =========== ==========================================================
-
-The format for the above output is compatible with cpulist_parse()
-[see <linux/cpumask.h>]. Some examples follow.
-
-In this example, there are 64 CPUs in the system but cpus 32-63 exceed
-the kernel max which is limited to 0..31 by the NR_CPUS config option
-being 32. Note also that CPUs 2 and 4-31 are not online but could be
-brought online as they are both present and possible::
-
- kernel_max: 31
- offline: 2,4-31,32-63
- online: 0-1,3
- possible: 0-31
- present: 0-31
-
-In this example, the NR_CPUS config option is 128, but the kernel was
-started with possible_cpus=144. There are 4 CPUs in the system and cpu2
-was manually taken offline (and is the only CPU that can be brought
-online.)::
-
- kernel_max: 127
- offline: 2,4-127,128-143
- online: 0-1,3
- possible: 0-127
- present: 0-3
-
-See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter
-as well as more information on the various cpumasks.
diff --git a/Documentation/crypto/api-samples.rst b/Documentation/crypto/api-samples.rst
index f14afaaf2f32..e923f17bc2bd 100644
--- a/Documentation/crypto/api-samples.rst
+++ b/Documentation/crypto/api-samples.rst
@@ -4,111 +4,89 @@ Code Examples
Code Example For Symmetric Key Cipher Operation
-----------------------------------------------
-::
-
-
- /* tie all data structures together */
- struct skcipher_def {
- struct scatterlist sg;
- struct crypto_skcipher *tfm;
- struct skcipher_request *req;
- struct crypto_wait wait;
- };
-
- /* Perform cipher operation */
- static unsigned int test_skcipher_encdec(struct skcipher_def *sk,
- int enc)
- {
- int rc;
-
- if (enc)
- rc = crypto_wait_req(crypto_skcipher_encrypt(sk->req), &sk->wait);
- else
- rc = crypto_wait_req(crypto_skcipher_decrypt(sk->req), &sk->wait);
-
- if (rc)
- pr_info("skcipher encrypt returned with result %d\n", rc);
+This code encrypts some data with AES-256-XTS. For sake of example,
+all inputs are random bytes, the encryption is done in-place, and it's
+assumed the code is running in a context where it can sleep.
- return rc;
- }
+::
- /* Initialize and trigger cipher operation */
static int test_skcipher(void)
{
- struct skcipher_def sk;
- struct crypto_skcipher *skcipher = NULL;
- struct skcipher_request *req = NULL;
- char *scratchpad = NULL;
- char *ivdata = NULL;
- unsigned char key[32];
- int ret = -EFAULT;
-
- skcipher = crypto_alloc_skcipher("cbc-aes-aesni", 0, 0);
- if (IS_ERR(skcipher)) {
- pr_info("could not allocate skcipher handle\n");
- return PTR_ERR(skcipher);
- }
-
- req = skcipher_request_alloc(skcipher, GFP_KERNEL);
- if (!req) {
- pr_info("could not allocate skcipher request\n");
- ret = -ENOMEM;
- goto out;
- }
-
- skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- crypto_req_done,
- &sk.wait);
-
- /* AES 256 with random key */
- get_random_bytes(&key, 32);
- if (crypto_skcipher_setkey(skcipher, key, 32)) {
- pr_info("key could not be set\n");
- ret = -EAGAIN;
- goto out;
- }
-
- /* IV will be random */
- ivdata = kmalloc(16, GFP_KERNEL);
- if (!ivdata) {
- pr_info("could not allocate ivdata\n");
- goto out;
- }
- get_random_bytes(ivdata, 16);
-
- /* Input data will be random */
- scratchpad = kmalloc(16, GFP_KERNEL);
- if (!scratchpad) {
- pr_info("could not allocate scratchpad\n");
- goto out;
- }
- get_random_bytes(scratchpad, 16);
-
- sk.tfm = skcipher;
- sk.req = req;
-
- /* We encrypt one block */
- sg_init_one(&sk.sg, scratchpad, 16);
- skcipher_request_set_crypt(req, &sk.sg, &sk.sg, 16, ivdata);
- crypto_init_wait(&sk.wait);
-
- /* encrypt data */
- ret = test_skcipher_encdec(&sk, 1);
- if (ret)
- goto out;
-
- pr_info("Encryption triggered successfully\n");
-
+ struct crypto_skcipher *tfm = NULL;
+ struct skcipher_request *req = NULL;
+ u8 *data = NULL;
+ const size_t datasize = 512; /* data size in bytes */
+ struct scatterlist sg;
+ DECLARE_CRYPTO_WAIT(wait);
+ u8 iv[16]; /* AES-256-XTS takes a 16-byte IV */
+ u8 key[64]; /* AES-256-XTS takes a 64-byte key */
+ int err;
+
+ /*
+ * Allocate a tfm (a transformation object) and set the key.
+ *
+ * In real-world use, a tfm and key are typically used for many
+ * encryption/decryption operations. But in this example, we'll just do a
+ * single encryption operation with it (which is not very efficient).
+ */
+
+ tfm = crypto_alloc_skcipher("xts(aes)", 0, 0);
+ if (IS_ERR(tfm)) {
+ pr_err("Error allocating xts(aes) handle: %ld\n", PTR_ERR(tfm));
+ return PTR_ERR(tfm);
+ }
+
+ get_random_bytes(key, sizeof(key));
+ err = crypto_skcipher_setkey(tfm, key, sizeof(key));
+ if (err) {
+ pr_err("Error setting key: %d\n", err);
+ goto out;
+ }
+
+ /* Allocate a request object */
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Prepare the input data */
+ data = kmalloc(datasize, GFP_KERNEL);
+ if (!data) {
+ err = -ENOMEM;
+ goto out;
+ }
+ get_random_bytes(data, datasize);
+
+ /* Initialize the IV */
+ get_random_bytes(iv, sizeof(iv));
+
+ /*
+ * Encrypt the data in-place.
+ *
+ * For simplicity, in this example we wait for the request to complete
+ * before proceeding, even if the underlying implementation is asynchronous.
+ *
+ * To decrypt instead of encrypt, just change crypto_skcipher_encrypt() to
+ * crypto_skcipher_decrypt().
+ */
+ sg_init_one(&sg, data, datasize);
+ skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+ CRYPTO_TFM_REQ_MAY_SLEEP,
+ crypto_req_done, &wait);
+ skcipher_request_set_crypt(req, &sg, &sg, datasize, iv);
+ err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+ if (err) {
+ pr_err("Error encrypting data: %d\n", err);
+ goto out;
+ }
+
+ pr_debug("Encryption was successful\n");
out:
- if (skcipher)
- crypto_free_skcipher(skcipher);
- if (req)
+ crypto_free_skcipher(tfm);
skcipher_request_free(req);
- if (ivdata)
- kfree(ivdata);
- if (scratchpad)
- kfree(scratchpad);
- return ret;
+ kfree(data);
+ return err;
}
diff --git a/Documentation/crypto/api-skcipher.rst b/Documentation/crypto/api-skcipher.rst
index 4eec4a93f7e3..20ba08dddf2e 100644
--- a/Documentation/crypto/api-skcipher.rst
+++ b/Documentation/crypto/api-skcipher.rst
@@ -5,7 +5,7 @@ Block Cipher Algorithm Definitions
:doc: Block Cipher Algorithm Definitions
.. kernel-doc:: include/linux/crypto.h
- :functions: crypto_alg ablkcipher_alg blkcipher_alg cipher_alg
+ :functions: crypto_alg ablkcipher_alg blkcipher_alg cipher_alg compress_alg
Symmetric Key Cipher API
------------------------
diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst
index ee8ff0762d7f..3eae1ae7f798 100644
--- a/Documentation/crypto/architecture.rst
+++ b/Documentation/crypto/architecture.rst
@@ -208,9 +208,7 @@ the aforementioned cipher types:
- CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as
an ECDH or DH implementation
-- CRYPTO_ALG_TYPE_DIGEST Raw message digest
-
-- CRYPTO_ALG_TYPE_HASH Alias for CRYPTO_ALG_TYPE_DIGEST
+- CRYPTO_ALG_TYPE_HASH Raw message digest
- CRYPTO_ALG_TYPE_SHASH Synchronous multi-block hash
diff --git a/Documentation/crypto/crypto_engine.rst b/Documentation/crypto/crypto_engine.rst
index 1d56221dfe35..236c674d6897 100644
--- a/Documentation/crypto/crypto_engine.rst
+++ b/Documentation/crypto/crypto_engine.rst
@@ -1,50 +1,85 @@
-=============
-CRYPTO ENGINE
+.. SPDX-License-Identifier: GPL-2.0
+Crypto Engine
=============
Overview
--------
-The crypto engine API (CE), is a crypto queue manager.
+The crypto engine (CE) API is a crypto queue manager.
Requirement
-----------
-You have to put at start of your tfm_ctx the struct crypto_engine_ctx::
+You must put, at the start of your transform context your_tfm_ctx, the structure
+crypto_engine:
+
+::
- struct your_tfm_ctx {
- struct crypto_engine_ctx enginectx;
- ...
- };
+ struct your_tfm_ctx {
+ struct crypto_engine engine;
+ ...
+ };
-Why: Since CE manage only crypto_async_request, it cannot know the underlying
-request_type and so have access only on the TFM.
-So using container_of for accessing __ctx is impossible.
-Furthermore, the crypto engine cannot know the "struct your_tfm_ctx",
-so it must assume that crypto_engine_ctx is at start of it.
+The crypto engine only manages asynchronous requests in the form of
+crypto_async_request. It cannot know the underlying request type and thus only
+has access to the transform structure. It is not possible to access the context
+using container_of. In addition, the engine knows nothing about your
+structure "``struct your_tfm_ctx``". The engine assumes (requires) the placement
+of the known member ``struct crypto_engine`` at the beginning.
Order of operations
-------------------
-You have to obtain a struct crypto_engine via crypto_engine_alloc_init().
-And start it via crypto_engine_start().
-
-Before transferring any request, you have to fill the enginectx.
-- prepare_request: (taking a function pointer) If you need to do some processing before doing the request
-- unprepare_request: (taking a function pointer) Undoing what's done in prepare_request
-- do_one_request: (taking a function pointer) Do encryption for current request
-
-Note: that those three functions get the crypto_async_request associated with the received request.
-So your need to get the original request via container_of(areq, struct yourrequesttype_request, base);
-
-When your driver receive a crypto_request, you have to transfer it to
-the cryptoengine via one of:
-- crypto_transfer_ablkcipher_request_to_engine()
-- crypto_transfer_aead_request_to_engine()
-- crypto_transfer_akcipher_request_to_engine()
-- crypto_transfer_hash_request_to_engine()
-- crypto_transfer_skcipher_request_to_engine()
-
-At the end of the request process, a call to one of the following function is needed:
-- crypto_finalize_ablkcipher_request
-- crypto_finalize_aead_request
-- crypto_finalize_akcipher_request
-- crypto_finalize_hash_request
-- crypto_finalize_skcipher_request
+You are required to obtain a struct crypto_engine via ``crypto_engine_alloc_init()``.
+Start it via ``crypto_engine_start()``. When finished with your work, shut down the
+engine using ``crypto_engine_stop()`` and destroy the engine with
+``crypto_engine_exit()``.
+
+Before transferring any request, you have to fill the context enginectx by
+providing functions for the following:
+
+* ``prepare_crypt_hardware``: Called once before any prepare functions are
+ called.
+
+* ``unprepare_crypt_hardware``: Called once after all unprepare functions have
+ been called.
+
+* ``prepare_cipher_request``/``prepare_hash_request``: Called before each
+ corresponding request is performed. If some processing or other preparatory
+ work is required, do it here.
+
+* ``unprepare_cipher_request``/``unprepare_hash_request``: Called after each
+ request is handled. Clean up / undo what was done in the prepare function.
+
+* ``cipher_one_request``/``hash_one_request``: Handle the current request by
+ performing the operation.
+
+Note that these functions access the crypto_async_request structure
+associated with the received request. You are able to retrieve the original
+request by using:
+
+::
+
+ container_of(areq, struct yourrequesttype_request, base);
+
+When your driver receives a crypto_request, you must to transfer it to
+the crypto engine via one of:
+
+* crypto_transfer_ablkcipher_request_to_engine()
+
+* crypto_transfer_aead_request_to_engine()
+
+* crypto_transfer_akcipher_request_to_engine()
+
+* crypto_transfer_hash_request_to_engine()
+
+* crypto_transfer_skcipher_request_to_engine()
+
+At the end of the request process, a call to one of the following functions is needed:
+
+* crypto_finalize_ablkcipher_request()
+
+* crypto_finalize_aead_request()
+
+* crypto_finalize_akcipher_request()
+
+* crypto_finalize_hash_request()
+
+* crypto_finalize_skcipher_request()
diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index e6f51260ff32..3621cd5e1eef 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -2,8 +2,8 @@ Kernel Memory Leak Detector
===========================
Kmemleak provides a way of detecting possible kernel memory leaks in a
-way similar to a tracing garbage collector
-(https://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors),
+way similar to a `tracing garbage collector
+<https://en.wikipedia.org/wiki/Tracing_garbage_collection>`_,
with the difference that the orphan objects are not freed but only
reported via /sys/kernel/debug/kmemleak. A similar method is used by the
Valgrind tool (``memcheck --leak-check``) to detect the memory leaks in
@@ -15,10 +15,13 @@ Usage
CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
thread scans the memory every 10 minutes (by default) and prints the
-number of new unreferenced objects found. To display the details of all
-the possible memory leaks::
+number of new unreferenced objects found. If the ``debugfs`` isn't already
+mounted, mount with::
# mount -t debugfs nodev /sys/kernel/debug/
+
+To display the details of all the possible scanned memory leaks::
+
# cat /sys/kernel/debug/kmemleak
To trigger an intermediate memory scan::
@@ -72,6 +75,9 @@ If CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF are enabled, the kmemleak is
disabled by default. Passing ``kmemleak=on`` on the kernel command
line enables the function.
+If you are getting errors like "Error while writing to stdout" or "write_loop:
+Invalid argument", make sure kmemleak is properly enabled.
+
Basic Algorithm
---------------
@@ -218,3 +224,37 @@ the pointer is calculated by other methods than the usual container_of
macro or the pointer is stored in a location not scanned by kmemleak.
Page allocations and ioremap are not tracked.
+
+Testing with kmemleak-test
+--------------------------
+
+To check if you have all set up to use kmemleak, you can use the kmemleak-test
+module, a module that deliberately leaks memory. Set CONFIG_DEBUG_KMEMLEAK_TEST
+as module (it can't be used as bult-in) and boot the kernel with kmemleak
+enabled. Load the module and perform a scan with::
+
+ # modprobe kmemleak-test
+ # echo scan > /sys/kernel/debug/kmemleak
+
+Note that the you may not get results instantly or on the first scanning. When
+kmemleak gets results, it'll log ``kmemleak: <count of leaks> new suspected
+memory leaks``. Then read the file to see then::
+
+ # cat /sys/kernel/debug/kmemleak
+ unreferenced object 0xffff89862ca702e8 (size 32):
+ comm "modprobe", pid 2088, jiffies 4294680594 (age 375.486s)
+ hex dump (first 32 bytes):
+ 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk
+ 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk.
+ backtrace:
+ [<00000000e0a73ec7>] 0xffffffffc01d2036
+ [<000000000c5d2a46>] do_one_initcall+0x41/0x1df
+ [<0000000046db7e0a>] do_init_module+0x55/0x200
+ [<00000000542b9814>] load_module+0x203c/0x2480
+ [<00000000c2850256>] __do_sys_finit_module+0xba/0xe0
+ [<000000006564e7ef>] do_syscall_64+0x43/0x110
+ [<000000007c873fa6>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
+ ...
+
+Removing the module with ``rmmod kmemleak_test`` should also trigger some
+kmemleak results.
diff --git a/Documentation/dev-tools/sparse.rst b/Documentation/dev-tools/sparse.rst
index c401c952a340..6f4870528226 100644
--- a/Documentation/dev-tools/sparse.rst
+++ b/Documentation/dev-tools/sparse.rst
@@ -81,11 +81,6 @@ of sparse using git to clone::
git://git.kernel.org/pub/scm/devel/sparse/sparse.git
-DaveJ has hourly generated tarballs of the git tree available at::
-
- http://www.codemonkey.org.uk/projects/git-snapshots/sparse/
-
-
Once you have it, just do::
make
diff --git a/Documentation/device-mapper/index.rst b/Documentation/device-mapper/index.rst
deleted file mode 100644
index 105e253bc231..000000000000
--- a/Documentation/device-mapper/index.rst
+++ /dev/null
@@ -1,44 +0,0 @@
-:orphan:
-
-=============
-Device Mapper
-=============
-
-.. toctree::
- :maxdepth: 1
-
- cache-policies
- cache
- delay
- dm-crypt
- dm-flakey
- dm-init
- dm-integrity
- dm-io
- dm-log
- dm-queue-length
- dm-raid
- dm-service-time
- dm-uevent
- dm-zoned
- era
- kcopyd
- linear
- log-writes
- persistent-data
- snapshot
- statistics
- striped
- switch
- thin-provisioning
- unstriped
- verity
- writecache
- zero
-
-.. only:: subproject and html
-
- Indices
- =======
-
- * :ref:`genindex`
diff --git a/Documentation/device-mapper/snapshot.rst b/Documentation/device-mapper/snapshot.rst
deleted file mode 100644
index 4c53304e72f1..000000000000
--- a/Documentation/device-mapper/snapshot.rst
+++ /dev/null
@@ -1,180 +0,0 @@
-==============================
-Device-mapper snapshot support
-==============================
-
-Device-mapper allows you, without massive data copying:
-
-- To create snapshots of any block device i.e. mountable, saved states of
- the block device which are also writable without interfering with the
- original content;
-- To create device "forks", i.e. multiple different versions of the
- same data stream.
-- To merge a snapshot of a block device back into the snapshot's origin
- device.
-
-In the first two cases, dm copies only the chunks of data that get
-changed and uses a separate copy-on-write (COW) block device for
-storage.
-
-For snapshot merge the contents of the COW storage are merged back into
-the origin device.
-
-
-There are three dm targets available:
-snapshot, snapshot-origin, and snapshot-merge.
-
-- snapshot-origin <origin>
-
-which will normally have one or more snapshots based on it.
-Reads will be mapped directly to the backing device. For each write, the
-original data will be saved in the <COW device> of each snapshot to keep
-its visible content unchanged, at least until the <COW device> fills up.
-
-
-- snapshot <origin> <COW device> <persistent?> <chunksize>
-
-A snapshot of the <origin> block device is created. Changed chunks of
-<chunksize> sectors will be stored on the <COW device>. Writes will
-only go to the <COW device>. Reads will come from the <COW device> or
-from <origin> for unchanged data. <COW device> will often be
-smaller than the origin and if it fills up the snapshot will become
-useless and be disabled, returning errors. So it is important to monitor
-the amount of free space and expand the <COW device> before it fills up.
-
-<persistent?> is P (Persistent) or N (Not persistent - will not survive
-after reboot). O (Overflow) can be added as a persistent store option
-to allow userspace to advertise its support for seeing "Overflow" in the
-snapshot status. So supported store types are "P", "PO" and "N".
-
-The difference between persistent and transient is with transient
-snapshots less metadata must be saved on disk - they can be kept in
-memory by the kernel.
-
-When loading or unloading the snapshot target, the corresponding
-snapshot-origin or snapshot-merge target must be suspended. A failure to
-suspend the origin target could result in data corruption.
-
-
-* snapshot-merge <origin> <COW device> <persistent> <chunksize>
-
-takes the same table arguments as the snapshot target except it only
-works with persistent snapshots. This target assumes the role of the
-"snapshot-origin" target and must not be loaded if the "snapshot-origin"
-is still present for <origin>.
-
-Creates a merging snapshot that takes control of the changed chunks
-stored in the <COW device> of an existing snapshot, through a handover
-procedure, and merges these chunks back into the <origin>. Once merging
-has started (in the background) the <origin> may be opened and the merge
-will continue while I/O is flowing to it. Changes to the <origin> are
-deferred until the merging snapshot's corresponding chunk(s) have been
-merged. Once merging has started the snapshot device, associated with
-the "snapshot" target, will return -EIO when accessed.
-
-
-How snapshot is used by LVM2
-============================
-When you create the first LVM2 snapshot of a volume, four dm devices are used:
-
-1) a device containing the original mapping table of the source volume;
-2) a device used as the <COW device>;
-3) a "snapshot" device, combining #1 and #2, which is the visible snapshot
- volume;
-4) the "original" volume (which uses the device number used by the original
- source volume), whose table is replaced by a "snapshot-origin" mapping
- from device #1.
-
-A fixed naming scheme is used, so with the following commands::
-
- lvcreate -L 1G -n base volumeGroup
- lvcreate -L 100M --snapshot -n snap volumeGroup/base
-
-we'll have this situation (with volumes in above order)::
-
- # dmsetup table|grep volumeGroup
-
- volumeGroup-base-real: 0 2097152 linear 8:19 384
- volumeGroup-snap-cow: 0 204800 linear 8:19 2097536
- volumeGroup-snap: 0 2097152 snapshot 254:11 254:12 P 16
- volumeGroup-base: 0 2097152 snapshot-origin 254:11
-
- # ls -lL /dev/mapper/volumeGroup-*
- brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
- brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
- brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
- brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
-
-
-How snapshot-merge is used by LVM2
-==================================
-A merging snapshot assumes the role of the "snapshot-origin" while
-merging. As such the "snapshot-origin" is replaced with
-"snapshot-merge". The "-real" device is not changed and the "-cow"
-device is renamed to <origin name>-cow to aid LVM2's cleanup of the
-merging snapshot after it completes. The "snapshot" that hands over its
-COW device to the "snapshot-merge" is deactivated (unless using lvchange
---refresh); but if it is left active it will simply return I/O errors.
-
-A snapshot will merge into its origin with the following command::
-
- lvconvert --merge volumeGroup/snap
-
-we'll now have this situation::
-
- # dmsetup table|grep volumeGroup
-
- volumeGroup-base-real: 0 2097152 linear 8:19 384
- volumeGroup-base-cow: 0 204800 linear 8:19 2097536
- volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
-
- # ls -lL /dev/mapper/volumeGroup-*
- brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
- brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
- brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base
-
-
-How to determine when a merging is complete
-===========================================
-The snapshot-merge and snapshot status lines end with:
-
- <sectors_allocated>/<total_sectors> <metadata_sectors>
-
-Both <sectors_allocated> and <total_sectors> include both data and metadata.
-During merging, the number of sectors allocated gets smaller and
-smaller. Merging has finished when the number of sectors holding data
-is zero, in other words <sectors_allocated> == <metadata_sectors>.
-
-Here is a practical example (using a hybrid of lvm and dmsetup commands)::
-
- # lvs
- LV VG Attr LSize Origin Snap% Move Log Copy% Convert
- base volumeGroup owi-a- 4.00g
- snap volumeGroup swi-a- 1.00g base 18.97
-
- # dmsetup status volumeGroup-snap
- 0 8388608 snapshot 397896/2097152 1560
- ^^^^ metadata sectors
-
- # lvconvert --merge -b volumeGroup/snap
- Merging of volume snap started.
-
- # lvs volumeGroup/snap
- LV VG Attr LSize Origin Snap% Move Log Copy% Convert
- base volumeGroup Owi-a- 4.00g 17.23
-
- # dmsetup status volumeGroup-base
- 0 8388608 snapshot-merge 281688/2097152 1104
-
- # dmsetup status volumeGroup-base
- 0 8388608 snapshot-merge 180480/2097152 712
-
- # dmsetup status volumeGroup-base
- 0 8388608 snapshot-merge 16/2097152 16
-
-Merging has finished.
-
-::
-
- # lvs
- LV VG Attr LSize Origin Snap% Move Log Copy% Convert
- base volumeGroup owi-a- 4.00g
diff --git a/Documentation/device-mapper/statistics.rst b/Documentation/device-mapper/statistics.rst
deleted file mode 100644
index 3d80a9f850cc..000000000000
--- a/Documentation/device-mapper/statistics.rst
+++ /dev/null
@@ -1,225 +0,0 @@
-=============
-DM statistics
-=============
-
-Device Mapper supports the collection of I/O statistics on user-defined
-regions of a DM device. If no regions are defined no statistics are
-collected so there isn't any performance impact. Only bio-based DM
-devices are currently supported.
-
-Each user-defined region specifies a starting sector, length and step.
-Individual statistics will be collected for each step-sized area within
-the range specified.
-
-The I/O statistics counters for each step-sized area of a region are
-in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see:
-Documentation/iostats.txt). But two extra counters (12 and 13) are
-provided: total time spent reading and writing. When the histogram
-argument is used, the 14th parameter is reported that represents the
-histogram of latencies. All these counters may be accessed by sending
-the @stats_print message to the appropriate DM device via dmsetup.
-
-The reported times are in milliseconds and the granularity depends on
-the kernel ticks. When the option precise_timestamps is used, the
-reported times are in nanoseconds.
-
-Each region has a corresponding unique identifier, which we call a
-region_id, that is assigned when the region is created. The region_id
-must be supplied when querying statistics about the region, deleting the
-region, etc. Unique region_ids enable multiple userspace programs to
-request and process statistics for the same DM device without stepping
-on each other's data.
-
-The creation of DM statistics will allocate memory via kmalloc or
-fallback to using vmalloc space. At most, 1/4 of the overall system
-memory may be allocated by DM statistics. The admin can see how much
-memory is used by reading:
-
- /sys/module/dm_mod/parameters/stats_current_allocated_bytes
-
-Messages
-========
-
- @stats_create <range> <step> [<number_of_optional_arguments> <optional_arguments>...] [<program_id> [<aux_data>]]
- Create a new region and return the region_id.
-
- <range>
- "-"
- whole device
- "<start_sector>+<length>"
- a range of <length> 512-byte sectors
- starting with <start_sector>.
-
- <step>
- "<area_size>"
- the range is subdivided into areas each containing
- <area_size> sectors.
- "/<number_of_areas>"
- the range is subdivided into the specified
- number of areas.
-
- <number_of_optional_arguments>
- The number of optional arguments
-
- <optional_arguments>
- The following optional arguments are supported:
-
- precise_timestamps
- use precise timer with nanosecond resolution
- instead of the "jiffies" variable. When this argument is
- used, the resulting times are in nanoseconds instead of
- milliseconds. Precise timestamps are a little bit slower
- to obtain than jiffies-based timestamps.
- histogram:n1,n2,n3,n4,...
- collect histogram of latencies. The
- numbers n1, n2, etc are times that represent the boundaries
- of the histogram. If precise_timestamps is not used, the
- times are in milliseconds, otherwise they are in
- nanoseconds. For each range, the kernel will report the
- number of requests that completed within this range. For
- example, if we use "histogram:10,20,30", the kernel will
- report four numbers a:b:c:d. a is the number of requests
- that took 0-10 ms to complete, b is the number of requests
- that took 10-20 ms to complete, c is the number of requests
- that took 20-30 ms to complete and d is the number of
- requests that took more than 30 ms to complete.
-
- <program_id>
- An optional parameter. A name that uniquely identifies
- the userspace owner of the range. This groups ranges together
- so that userspace programs can identify the ranges they
- created and ignore those created by others.
- The kernel returns this string back in the output of
- @stats_list message, but it doesn't use it for anything else.
- If we omit the number of optional arguments, program id must not
- be a number, otherwise it would be interpreted as the number of
- optional arguments.
-
- <aux_data>
- An optional parameter. A word that provides auxiliary data
- that is useful to the client program that created the range.
- The kernel returns this string back in the output of
- @stats_list message, but it doesn't use this value for anything.
-
- @stats_delete <region_id>
- Delete the region with the specified id.
-
- <region_id>
- region_id returned from @stats_create
-
- @stats_clear <region_id>
- Clear all the counters except the in-flight i/o counters.
-
- <region_id>
- region_id returned from @stats_create
-
- @stats_list [<program_id>]
- List all regions registered with @stats_create.
-
- <program_id>
- An optional parameter.
- If this parameter is specified, only matching regions
- are returned.
- If it is not specified, all regions are returned.
-
- Output format:
- <region_id>: <start_sector>+<length> <step> <program_id> <aux_data>
- precise_timestamps histogram:n1,n2,n3,...
-
- The strings "precise_timestamps" and "histogram" are printed only
- if they were specified when creating the region.
-
- @stats_print <region_id> [<starting_line> <number_of_lines>]
- Print counters for each step-sized area of a region.
-
- <region_id>
- region_id returned from @stats_create
-
- <starting_line>
- The index of the starting line in the output.
- If omitted, all lines are returned.
-
- <number_of_lines>
- The number of lines to include in the output.
- If omitted, all lines are returned.
-
- Output format for each step-sized area of a region:
-
- <start_sector>+<length>
- counters
-
- The first 11 counters have the same meaning as
- `/sys/block/*/stat or /proc/diskstats`.
-
- Please refer to Documentation/iostats.txt for details.
-
- 1. the number of reads completed
- 2. the number of reads merged
- 3. the number of sectors read
- 4. the number of milliseconds spent reading
- 5. the number of writes completed
- 6. the number of writes merged
- 7. the number of sectors written
- 8. the number of milliseconds spent writing
- 9. the number of I/Os currently in progress
- 10. the number of milliseconds spent doing I/Os
- 11. the weighted number of milliseconds spent doing I/Os
-
- Additional counters:
-
- 12. the total time spent reading in milliseconds
- 13. the total time spent writing in milliseconds
-
- @stats_print_clear <region_id> [<starting_line> <number_of_lines>]
- Atomically print and then clear all the counters except the
- in-flight i/o counters. Useful when the client consuming the
- statistics does not want to lose any statistics (those updated
- between printing and clearing).
-
- <region_id>
- region_id returned from @stats_create
-
- <starting_line>
- The index of the starting line in the output.
- If omitted, all lines are printed and then cleared.
-
- <number_of_lines>
- The number of lines to process.
- If omitted, all lines are printed and then cleared.
-
- @stats_set_aux <region_id> <aux_data>
- Store auxiliary data aux_data for the specified region.
-
- <region_id>
- region_id returned from @stats_create
-
- <aux_data>
- The string that identifies data which is useful to the client
- program that created the range. The kernel returns this
- string back in the output of @stats_list message, but it
- doesn't use this value for anything.
-
-Examples
-========
-
-Subdivide the DM device 'vol' into 100 pieces and start collecting
-statistics on them::
-
- dmsetup message vol 0 @stats_create - /100
-
-Set the auxiliary data string to "foo bar baz" (the escape for each
-space must also be escaped, otherwise the shell will consume them)::
-
- dmsetup message vol 0 @stats_set_aux 0 foo\\ bar\\ baz
-
-List the statistics::
-
- dmsetup message vol 0 @stats_list
-
-Print the statistics::
-
- dmsetup message vol 0 @stats_print 0
-
-Delete the statistics::
-
- dmsetup message vol 0 @stats_delete 0
diff --git a/Documentation/devicetree/bindings/Makefile b/Documentation/devicetree/bindings/Makefile
index 8a2774b5834b..6b0dfd5c17ba 100644
--- a/Documentation/devicetree/bindings/Makefile
+++ b/Documentation/devicetree/bindings/Makefile
@@ -25,7 +25,7 @@ DT_DOCS = $(shell \
DT_SCHEMA_FILES ?= $(addprefix $(src)/,$(DT_DOCS))
extra-y += $(patsubst $(src)/%.yaml,%.example.dts, $(DT_SCHEMA_FILES))
-extra-y += $(patsubst $(src)/%.yaml,%.example.dtb, $(DT_SCHEMA_FILES))
+extra-y += $(patsubst $(src)/%.yaml,%.example.dt.yaml, $(DT_SCHEMA_FILES))
$(obj)/$(DT_TMP_SCHEMA): $(DT_SCHEMA_FILES) FORCE
$(call if_changed,mk_schema)
diff --git a/Documentation/devicetree/bindings/arm/al,alpine.txt b/Documentation/devicetree/bindings/arm/al,alpine.txt
deleted file mode 100644
index d00debe2e86f..000000000000
--- a/Documentation/devicetree/bindings/arm/al,alpine.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-Annapurna Labs Alpine Platform Device Tree Bindings
----------------------------------------------------------------
-
-Boards in the Alpine family shall have the following properties:
-
-* Required root node properties:
-compatible: must contain "al,alpine"
-
-* Example:
-
-/ {
- model = "Annapurna Labs Alpine Dev Board";
- compatible = "al,alpine";
-
- ...
-}
diff --git a/Documentation/devicetree/bindings/arm/al,alpine.yaml b/Documentation/devicetree/bindings/arm/al,alpine.yaml
new file mode 100644
index 000000000000..a70dff277e05
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/al,alpine.yaml
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/al,alpine.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Annapurna Labs Alpine Platform Device Tree Bindings
+
+maintainers:
+ - Tsahee Zidenberg <tsahee@annapurnalabs.com>
+ - Antoine Tenart <antoine.tenart@bootlin.com>
+
+properties:
+ compatible:
+ items:
+ - const: al,alpine
+ model:
+ items:
+ - const: "Annapurna Labs Alpine Dev Board"
+
+...
diff --git a/Documentation/devicetree/bindings/arm/amlogic.txt b/Documentation/devicetree/bindings/arm/amlogic.txt
deleted file mode 100644
index 061f7b98a07f..000000000000
--- a/Documentation/devicetree/bindings/arm/amlogic.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Amlogic MesonX device tree bindings
--------------------------------------------
-
-Work in progress statement:
-
-Device tree files and bindings applying to Amlogic SoCs and boards are
-considered "unstable". Any Amlogic device tree binding may change at
-any time. Be sure to use a device tree binary and a kernel image
-generated from the same source tree.
-
-Please refer to Documentation/devicetree/bindings/ABI.txt for a definition of a
-stable binding/ABI.
-
----------------------------------------------------------------
-
-Boards with the Amlogic Meson6 SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,meson6"
-
-Boards with the Amlogic Meson8 SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,meson8";
-
-Boards with the Amlogic Meson8b SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,meson8b";
-
-Boards with the Amlogic Meson8m2 SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,meson8m2";
-
-Boards with the Amlogic Meson GXBaby SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,meson-gxbb";
-
-Boards with the Amlogic Meson GXL S905X SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,s905x", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXL S905D SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,s905d", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXL S805X SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,s805x", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXL S905W SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,s905w", "amlogic,meson-gxl";
-
-Boards with the Amlogic Meson GXM S912 SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,s912", "amlogic,meson-gxm";
-
-Boards with the Amlogic Meson AXG A113D SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,a113d", "amlogic,meson-axg";
-
-Boards with the Amlogic Meson G12A S905D2 SoC shall have the following properties:
- Required root node property:
- compatible: "amlogic,g12a";
-
-Board compatible values (alphabetically, grouped by SoC):
-
- - "geniatech,atv1200" (Meson6)
-
- - "minix,neo-x8" (Meson8)
-
- - "endless,ec100" (Meson8b)
- - "hardkernel,odroid-c1" (Meson8b)
- - "tronfy,mxq" (Meson8b)
-
- - "tronsmart,mxiii-plus" (Meson8m2)
-
- - "amlogic,p200" (Meson gxbb)
- - "amlogic,p201" (Meson gxbb)
- - "friendlyarm,nanopi-k2" (Meson gxbb)
- - "hardkernel,odroid-c2" (Meson gxbb)
- - "nexbox,a95x" (Meson gxbb or Meson gxl s905x)
- - "tronsmart,vega-s95-pro", "tronsmart,vega-s95" (Meson gxbb)
- - "tronsmart,vega-s95-meta", "tronsmart,vega-s95" (Meson gxbb)
- - "tronsmart,vega-s95-telos", "tronsmart,vega-s95" (Meson gxbb)
- - "wetek,hub" (Meson gxbb)
- - "wetek,play2" (Meson gxbb)
-
- - "amlogic,p212" (Meson gxl s905x)
- - "hwacom,amazetv" (Meson gxl s905x)
- - "khadas,vim" (Meson gxl s905x)
- - "libretech,cc" (Meson gxl s905x)
-
- - "amlogic,p230" (Meson gxl s905d)
- - "amlogic,p231" (Meson gxl s905d)
- - "phicomm,n1" (Meson gxl s905d)
-
- - "amlogic,p241" (Meson gxl s805x)
- - "libretech,aml-s805x-ac" (Meson gxl s805x)
-
- - "amlogic,p281" (Meson gxl s905w)
- - "oranth,tx3-mini" (Meson gxl s905w)
-
- - "amlogic,q200" (Meson gxm s912)
- - "amlogic,q201" (Meson gxm s912)
- - "khadas,vim2" (Meson gxm s912)
- - "kingnovel,r-box-pro" (Meson gxm S912)
- - "nexbox,a1" (Meson gxm s912)
- - "tronsmart,vega-s96" (Meson gxm s912)
-
- - "amlogic,s400" (Meson axg a113d)
-
- - "amlogic,u200" (Meson g12a s905d2)
- - "amediatech,x96-max" (Meson g12a s905x2)
- - "seirobotics,sei510" (Meson g12a s905x2)
-
-Amlogic Meson Firmware registers Interface
-------------------------------------------
-
-The Meson SoCs have a register bank with status and data shared with the
-secure firmware.
-
-Required properties:
- - compatible: For Meson GX SoCs, must be "amlogic,meson-gx-ao-secure", "syscon"
-
-Properties should indentify components of this register interface :
-
-Meson GX SoC Information
-------------------------
-A firmware register encodes the SoC type, package and revision information on
-the Meson GX SoCs.
-If present, the following property should be added :
-
-Optional properties:
- - amlogic,has-chip-id: If present, the interface gives the current SoC version.
-
-Example
--------
-
-ao-secure@140 {
- compatible = "amlogic,meson-gx-ao-secure", "syscon";
- reg = <0x0 0x140 0x0 0x140>;
- amlogic,has-chip-id;
-};
diff --git a/Documentation/devicetree/bindings/arm/amlogic.yaml b/Documentation/devicetree/bindings/arm/amlogic.yaml
new file mode 100644
index 000000000000..325c6fd3566d
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/amlogic.yaml
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/amlogic.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Amlogic MesonX device tree bindings
+
+maintainers:
+ - Kevin Hilman <khilman@baylibre.com>
+
+description: |+
+ Work in progress statement:
+
+ Device tree files and bindings applying to Amlogic SoCs and boards are
+ considered "unstable". Any Amlogic device tree binding may change at
+ any time. Be sure to use a device tree binary and a kernel image
+ generated from the same source tree.
+
+ Please refer to Documentation/devicetree/bindings/ABI.txt for a definition of a
+ stable binding/ABI.
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - description: Boards with the Amlogic Meson6 SoC
+ items:
+ - enum:
+ - geniatech,atv1200
+ - const: amlogic,meson6
+
+ - description: Boards with the Amlogic Meson8 SoC
+ items:
+ - enum:
+ - minix,neo-x8
+ - const: amlogic,meson8
+
+ - description: Boards with the Amlogic Meson8m2 SoC
+ items:
+ - enum:
+ - tronsmart,mxiii-plus
+ - const: amlogic,meson8m2
+
+ - description: Boards with the Amlogic Meson8b SoC
+ items:
+ - enum:
+ - endless,ec100
+ - hardkernel,odroid-c1
+ - tronfy,mxq
+ - const: amlogic,meson8b
+
+ - description: Boards with the Amlogic Meson GXBaby SoC
+ items:
+ - enum:
+ - amlogic,p200
+ - amlogic,p201
+ - friendlyarm,nanopi-k2
+ - hardkernel,odroid-c2
+ - nexbox,a95x
+ - wetek,hub
+ - wetek,play2
+ - const: amlogic,meson-gxbb
+
+ - description: Tronsmart Vega S95 devices
+ items:
+ - enum:
+ - tronsmart,vega-s95-pro
+ - tronsmart,vega-s95-meta
+ - tronsmart,vega-s95-telos
+ - const: tronsmart,vega-s95
+ - const: amlogic,meson-gxbb
+
+ - description: Boards with the Amlogic Meson GXL S805X SoC
+ items:
+ - enum:
+ - amlogic,p241
+ - libretech,aml-s805x-ac
+ - const: amlogic,s805x
+ - const: amlogic,meson-gxl
+
+ - description: Boards with the Amlogic Meson GXL S905W SoC
+ items:
+ - enum:
+ - amlogic,p281
+ - oranth,tx3-mini
+ - const: amlogic,s905w
+ - const: amlogic,meson-gxl
+
+ - description: Boards with the Amlogic Meson GXL S905X SoC
+ items:
+ - enum:
+ - amediatech,x96-max
+ - amlogic,p212
+ - hwacom,amazetv
+ - khadas,vim
+ - libretech,cc
+ - nexbox,a95x
+ - seirobotics,sei510
+ - const: amlogic,s905x
+ - const: amlogic,meson-gxl
+
+ - description: Boards with the Amlogic Meson GXL S905D SoC
+ items:
+ - enum:
+ - amlogic,p230
+ - amlogic,p231
+ - phicomm,n1
+ - const: amlogic,s905d
+ - const: amlogic,meson-gxl
+
+ - description: Boards with the Amlogic Meson GXM S912 SoC
+ items:
+ - enum:
+ - amlogic,q200
+ - amlogic,q201
+ - khadas,vim2
+ - kingnovel,r-box-pro
+ - nexbox,a1
+ - tronsmart,vega-s96
+ - const: amlogic,s912
+ - const: amlogic,meson-gxm
+
+ - description: Boards with the Amlogic Meson AXG A113D SoC
+ items:
+ - enum:
+ - amlogic,s400
+ - const: amlogic,a113d
+ - const: amlogic,meson-axg
+
+ - description: Boards with the Amlogic Meson G12A S905D2 SoC
+ items:
+ - enum:
+ - amlogic,u200
+ - const: amlogic,g12a
+
+ - description: Boards with the Amlogic Meson G12B S922X SoC
+ items:
+ - enum:
+ - hardkernel,odroid-n2
+ - const: amlogic,g12b
+
+...
diff --git a/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt
new file mode 100644
index 000000000000..c67d9f48fb91
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/amlogic/amlogic,meson-gx-ao-secure.txt
@@ -0,0 +1,28 @@
+Amlogic Meson Firmware registers Interface
+------------------------------------------
+
+The Meson SoCs have a register bank with status and data shared with the
+secure firmware.
+
+Required properties:
+ - compatible: For Meson GX SoCs, must be "amlogic,meson-gx-ao-secure", "syscon"
+
+Properties should indentify components of this register interface :
+
+Meson GX SoC Information
+------------------------
+A firmware register encodes the SoC type, package and revision information on
+the Meson GX SoCs.
+If present, the following property should be added :
+
+Optional properties:
+ - amlogic,has-chip-id: If present, the interface gives the current SoC version.
+
+Example
+-------
+
+ao-secure@140 {
+ compatible = "amlogic,meson-gx-ao-secure", "syscon";
+ reg = <0x0 0x140 0x0 0x140>;
+ amlogic,has-chip-id;
+};
diff --git a/Documentation/devicetree/bindings/arm/arm,scmi.txt b/Documentation/devicetree/bindings/arm/arm,scmi.txt
index 5f3719ab7075..317a2fc3667a 100644
--- a/Documentation/devicetree/bindings/arm/arm,scmi.txt
+++ b/Documentation/devicetree/bindings/arm/arm,scmi.txt
@@ -6,7 +6,7 @@ that are provided by the hardware platform it is running on, including power
and performance functions.
This binding is intended to define the interface the firmware implementing
-the SCMI as described in ARM document number ARM DUI 0922B ("ARM System Control
+the SCMI as described in ARM document number ARM DEN 0056A ("ARM System Control
and Management Interface Platform Design Document")[0] provide for OSPM in
the device tree.
diff --git a/Documentation/devicetree/bindings/arm/arm-boards b/Documentation/devicetree/bindings/arm/arm-boards
index abff8d834a6a..6758ece324b1 100644
--- a/Documentation/devicetree/bindings/arm/arm-boards
+++ b/Documentation/devicetree/bindings/arm/arm-boards
@@ -197,7 +197,7 @@ Required nodes:
The description for the board must include:
- a "psci" node describing the boot method used for the secondary CPUs.
A detailed description of the bindings used for "psci" nodes is present
- in the psci.txt file.
+ in the psci.yaml file.
- a "cpus" node describing the available cores and their associated
"enable-method"s. For more details see cpus.txt file.
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.txt b/Documentation/devicetree/bindings/arm/atmel-at91.txt
deleted file mode 100644
index 99dee23c74a4..000000000000
--- a/Documentation/devicetree/bindings/arm/atmel-at91.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-Atmel AT91 device tree bindings.
-================================
-
-Boards with a SoC of the Atmel AT91 or SMART family shall have the following
-properties:
-
-Required root node properties:
-compatible: must be one of:
- * "atmel,at91rm9200"
-
- * "atmel,at91sam9" for SoCs using an ARM926EJ-S core, shall be extended with
- the specific SoC family or compatible:
- o "atmel,at91sam9260"
- o "atmel,at91sam9261"
- o "atmel,at91sam9263"
- o "atmel,at91sam9x5" for the 5 series, shall be extended with the specific
- SoC compatible:
- - "atmel,at91sam9g15"
- - "atmel,at91sam9g25"
- - "atmel,at91sam9g35"
- - "atmel,at91sam9x25"
- - "atmel,at91sam9x35"
- o "atmel,at91sam9g20"
- o "atmel,at91sam9g45"
- o "atmel,at91sam9n12"
- o "atmel,at91sam9rl"
- o "atmel,at91sam9xe"
- o "microchip,sam9x60"
- * "atmel,sama5" for SoCs using a Cortex-A5, shall be extended with the specific
- SoC family:
- o "atmel,sama5d2" shall be extended with the specific SoC compatible:
- - "atmel,sama5d27"
- o "atmel,sama5d3" shall be extended with the specific SoC compatible:
- - "atmel,sama5d31"
- - "atmel,sama5d33"
- - "atmel,sama5d34"
- - "atmel,sama5d35"
- - "atmel,sama5d36"
- o "atmel,sama5d4" shall be extended with the specific SoC compatible:
- - "atmel,sama5d41"
- - "atmel,sama5d42"
- - "atmel,sama5d43"
- - "atmel,sama5d44"
-
- * "atmel,samv7" for MCUs using a Cortex-M7, shall be extended with the specific
- SoC family:
- o "atmel,sams70" shall be extended with the specific MCU compatible:
- - "atmel,sams70j19"
- - "atmel,sams70j20"
- - "atmel,sams70j21"
- - "atmel,sams70n19"
- - "atmel,sams70n20"
- - "atmel,sams70n21"
- - "atmel,sams70q19"
- - "atmel,sams70q20"
- - "atmel,sams70q21"
- o "atmel,samv70" shall be extended with the specific MCU compatible:
- - "atmel,samv70j19"
- - "atmel,samv70j20"
- - "atmel,samv70n19"
- - "atmel,samv70n20"
- - "atmel,samv70q19"
- - "atmel,samv70q20"
- o "atmel,samv71" shall be extended with the specific MCU compatible:
- - "atmel,samv71j19"
- - "atmel,samv71j20"
- - "atmel,samv71j21"
- - "atmel,samv71n19"
- - "atmel,samv71n20"
- - "atmel,samv71n21"
- - "atmel,samv71q19"
- - "atmel,samv71q20"
- - "atmel,samv71q21"
diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.yaml b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
new file mode 100644
index 000000000000..6e168abcd4d1
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/atmel-at91.yaml
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/atmel-at91.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Atmel AT91 device tree bindings.
+
+maintainers:
+ - Alexandre Belloni <alexandre.belloni@bootlin.com>
+ - Ludovic Desroches <ludovic.desroches@microchip.com>
+
+description: |
+ Boards with a SoC of the Atmel AT91 or SMART family shall have the following
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - items:
+ - const: atmel,at91rm9200
+ - items:
+ - enum:
+ - olimex,sam9-l9260
+ - enum:
+ - atmel,at91sam9260
+ - atmel,at91sam9261
+ - atmel,at91sam9263
+ - atmel,at91sam9g20
+ - atmel,at91sam9g45
+ - atmel,at91sam9n12
+ - atmel,at91sam9rl
+ - atmel,at91sam9xe
+ - atmel,at91sam9x60
+ - const: atmel,at91sam9
+
+ - items:
+ - enum:
+ - atmel,at91sam9g15
+ - atmel,at91sam9g25
+ - atmel,at91sam9g35
+ - atmel,at91sam9x25
+ - atmel,at91sam9x35
+ - const: atmel,at91sam9x5
+ - const: atmel,at91sam9
+
+ - items:
+ - const: atmel,sama5d27
+ - const: atmel,sama5d2
+ - const: atmel,sama5
+
+ - description: Nattis v2 board with Natte v2 power board
+ items:
+ - const: axentia,nattis-2
+ - const: axentia,natte-2
+ - const: axentia,linea
+ - const: atmel,sama5d31
+ - const: atmel,sama5d3
+ - const: atmel,sama5
+
+ - description: TSE-850 v3 board
+ items:
+ - const: axentia,tse850v3
+ - const: axentia,linea
+ - const: atmel,sama5d31
+ - const: atmel,sama5d3
+ - const: atmel,sama5
+
+ - items:
+ - const: axentia,linea
+ - const: atmel,sama5d31
+ - const: atmel,sama5d3
+ - const: atmel,sama5
+
+ - items:
+ - enum:
+ - atmel,sama5d31
+ - atmel,sama5d33
+ - atmel,sama5d34
+ - atmel,sama5d35
+ - atmel,sama5d36
+ - const: atmel,sama5d3
+ - const: atmel,sama5
+
+ - items:
+ - enum:
+ - atmel,sama5d41
+ - atmel,sama5d42
+ - atmel,sama5d43
+ - atmel,sama5d44
+ - const: atmel,sama5d4
+ - const: atmel,sama5
+
+ - items:
+ - enum:
+ - atmel,sams70j19
+ - atmel,sams70j20
+ - atmel,sams70j21
+ - atmel,sams70n19
+ - atmel,sams70n20
+ - atmel,sams70n21
+ - atmel,sams70q19
+ - atmel,sams70q20
+ - atmel,sams70q21
+ - const: atmel,sams70
+ - const: atmel,samv7
+
+ - items:
+ - enum:
+ - atmel,samv70j19
+ - atmel,samv70j20
+ - atmel,samv70n19
+ - atmel,samv70n20
+ - atmel,samv70q19
+ - atmel,samv70q20
+ - const: atmel,samv70
+ - const: atmel,samv7
+
+ - items:
+ - enum:
+ - atmel,samv71j19
+ - atmel,samv71j20
+ - atmel,samv71j21
+ - atmel,samv71n19
+ - atmel,samv71n20
+ - atmel,samv71n21
+ - atmel,samv71q19
+ - atmel,samv71q20
+ - atmel,samv71q21
+ - const: atmel,samv71
+ - const: atmel,samv7
+
+...
diff --git a/Documentation/devicetree/bindings/arm/axxia.txt b/Documentation/devicetree/bindings/arm/axxia.txt
deleted file mode 100644
index 7b4ef9c07696..000000000000
--- a/Documentation/devicetree/bindings/arm/axxia.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Axxia AXM55xx device tree bindings
-
-Boards using the AXM55xx SoC need to have the following properties:
-
-Required root node property:
-
- - compatible = "lsi,axm5516"
-
-Boards:
-
- LSI AXM5516 Validation board (Amarillo)
- compatible = "lsi,axm5516-amarillo", "lsi,axm5516"
diff --git a/Documentation/devicetree/bindings/arm/axxia.yaml b/Documentation/devicetree/bindings/arm/axxia.yaml
new file mode 100644
index 000000000000..98780a569f22
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/axxia.yaml
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/axxia.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Axxia AXM55xx device tree bindings
+
+maintainers:
+ - Anders Berg <anders.berg@lsi.com>
+
+properties:
+ compatible:
+ description: LSI AXM5516 Validation board (Amarillo)
+ items:
+ - const: lsi,axm5516-amarillo
+ - const: lsi,axm5516
+
+...
diff --git a/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt b/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
index 298291211ea4..f1de3247c1b7 100644
--- a/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
+++ b/Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
@@ -26,8 +26,8 @@ Required properties:
processor core is clocked by the internal CPU clock, so it
is enabled with CPU clock by default.
-- cpu : the CPU phandle the debug module is affined to. When omitted
- the module is considered to belong to CPU0.
+- cpu : the CPU phandle the debug module is affined to. Do not assume it
+ to default to CPU0 if omitted.
Optional properties:
diff --git a/Documentation/devicetree/bindings/arm/coresight.txt b/Documentation/devicetree/bindings/arm/coresight.txt
index 8a88ddebc1a2..fcc3bacfd8bc 100644
--- a/Documentation/devicetree/bindings/arm/coresight.txt
+++ b/Documentation/devicetree/bindings/arm/coresight.txt
@@ -59,6 +59,11 @@ its hardware characteristcs.
* port or ports: see "Graph bindings for Coresight" below.
+* Additional required property for Embedded Trace Macrocell (version 3.x and
+ version 4.x):
+ * cpu: the cpu phandle this ETM/PTM is affined to. Do not
+ assume it to default to CPU0 if omitted.
+
* Additional required properties for System Trace Macrocells (STM):
* reg: along with the physical base address and length of the register
set as described above, another entry is required to describe the
@@ -87,9 +92,6 @@ its hardware characteristcs.
* arm,cp14: must be present if the system accesses ETM/PTM management
registers via co-processor 14.
- * cpu: the cpu phandle this ETM/PTM is affined to. When omitted the
- source is considered to belong to CPU0.
-
* Optional property for TMC:
* arm,buffer-size: size of contiguous buffer space for TMC ETR
diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml
index 591bbd012d63..aa40b074b864 100644
--- a/Documentation/devicetree/bindings/arm/cpus.yaml
+++ b/Documentation/devicetree/bindings/arm/cpus.yaml
@@ -39,281 +39,242 @@ description: |+
described below.
properties:
- $nodename:
- const: cpus
- description: Container of cpu nodes
-
- '#address-cells':
- enum: [1, 2]
+ reg:
+ maxItems: 1
description: |
- Definition depends on ARM architecture version and configuration:
+ Usage and definition depend on ARM architecture version and
+ configuration:
On uniprocessor ARM architectures previous to v7
- value must be 1, to enable a simple enumeration
- scheme for processors that do not have a HW CPU
- identification register.
- On 32-bit ARM 11 MPcore, ARM v7 or later systems
- value must be 1, that corresponds to CPUID/MPIDR
- registers sizes.
- On ARM v8 64-bit systems value should be set to 2,
- that corresponds to the MPIDR_EL1 register size.
- If MPIDR_EL1[63:32] value is equal to 0 on all CPUs
- in the system, #address-cells can be set to 1, since
- MPIDR_EL1[63:32] bits are not used for CPUs
- identification.
-
- '#size-cells':
- const: 0
-
-patternProperties:
- '^cpu@[0-9a-f]+$':
- type: object
- properties:
- device_type:
- const: cpu
-
- reg:
- maxItems: 1
- description: |
- Usage and definition depend on ARM architecture version and
- configuration:
-
- On uniprocessor ARM architectures previous to v7
- this property is required and must be set to 0.
-
- On ARM 11 MPcore based systems this property is
- required and matches the CPUID[11:0] register bits.
-
- Bits [11:0] in the reg cell must be set to
- bits [11:0] in CPU ID register.
-
- All other bits in the reg cell must be set to 0.
-
- On 32-bit ARM v7 or later systems this property is
- required and matches the CPU MPIDR[23:0] register
- bits.
-
- Bits [23:0] in the reg cell must be set to
- bits [23:0] in MPIDR.
-
- All other bits in the reg cell must be set to 0.
-
- On ARM v8 64-bit systems this property is required
- and matches the MPIDR_EL1 register affinity bits.
+ this property is required and must be set to 0.
+
+ On ARM 11 MPcore based systems this property is
+ required and matches the CPUID[11:0] register bits.
+
+ Bits [11:0] in the reg cell must be set to
+ bits [11:0] in CPU ID register.
+
+ All other bits in the reg cell must be set to 0.
+
+ On 32-bit ARM v7 or later systems this property is
+ required and matches the CPU MPIDR[23:0] register
+ bits.
+
+ Bits [23:0] in the reg cell must be set to
+ bits [23:0] in MPIDR.
+
+ All other bits in the reg cell must be set to 0.
+
+ On ARM v8 64-bit systems this property is required
+ and matches the MPIDR_EL1 register affinity bits.
+
+ * If cpus node's #address-cells property is set to 2
+
+ The first reg cell bits [7:0] must be set to
+ bits [39:32] of MPIDR_EL1.
+
+ The second reg cell bits [23:0] must be set to
+ bits [23:0] of MPIDR_EL1.
+
+ * If cpus node's #address-cells property is set to 1
+
+ The reg cell bits [23:0] must be set to bits [23:0]
+ of MPIDR_EL1.
+
+ All other bits in the reg cells must be set to 0.
+
+ compatible:
+ enum:
+ - arm,arm710t
+ - arm,arm720t
+ - arm,arm740t
+ - arm,arm7ej-s
+ - arm,arm7tdmi
+ - arm,arm7tdmi-s
+ - arm,arm9es
+ - arm,arm9ej-s
+ - arm,arm920t
+ - arm,arm922t
+ - arm,arm925
+ - arm,arm926e-s
+ - arm,arm926ej-s
+ - arm,arm940t
+ - arm,arm946e-s
+ - arm,arm966e-s
+ - arm,arm968e-s
+ - arm,arm9tdmi
+ - arm,arm1020e
+ - arm,arm1020t
+ - arm,arm1022e
+ - arm,arm1026ej-s
+ - arm,arm1136j-s
+ - arm,arm1136jf-s
+ - arm,arm1156t2-s
+ - arm,arm1156t2f-s
+ - arm,arm1176jzf
+ - arm,arm1176jz-s
+ - arm,arm1176jzf-s
+ - arm,arm11mpcore
+ - arm,armv8 # Only for s/w models
+ - arm,cortex-a5
+ - arm,cortex-a7
+ - arm,cortex-a8
+ - arm,cortex-a9
+ - arm,cortex-a12
+ - arm,cortex-a15
+ - arm,cortex-a17
+ - arm,cortex-a53
+ - arm,cortex-a57
+ - arm,cortex-a72
+ - arm,cortex-a73
+ - arm,cortex-m0
+ - arm,cortex-m0+
+ - arm,cortex-m1
+ - arm,cortex-m3
+ - arm,cortex-m4
+ - arm,cortex-r4
+ - arm,cortex-r5
+ - arm,cortex-r7
+ - brcm,brahma-b15
+ - brcm,brahma-b53
+ - brcm,vulcan
+ - cavium,thunder
+ - cavium,thunder2
+ - faraday,fa526
+ - intel,sa110
+ - intel,sa1100
+ - marvell,feroceon
+ - marvell,mohawk
+ - marvell,pj4a
+ - marvell,pj4b
+ - marvell,sheeva-v5
+ - marvell,sheeva-v7
+ - nvidia,tegra132-denver
+ - nvidia,tegra186-denver
+ - nvidia,tegra194-carmel
+ - qcom,krait
+ - qcom,kryo
+ - qcom,kryo385
+ - qcom,scorpion
+
+ enable-method:
+ allOf:
+ - $ref: '/schemas/types.yaml#/definitions/string'
+ - oneOf:
+ # On ARM v8 64-bit this property is required
+ - enum:
+ - psci
+ - spin-table
+ # On ARM 32-bit systems this property is optional
+ - enum:
+ - actions,s500-smp
+ - allwinner,sun6i-a31
+ - allwinner,sun8i-a23
+ - allwinner,sun9i-a80-smp
+ - allwinner,sun8i-a83t-smp
+ - amlogic,meson8-smp
+ - amlogic,meson8b-smp
+ - arm,realview-smp
+ - brcm,bcm11351-cpu-method
+ - brcm,bcm23550
+ - brcm,bcm2836-smp
+ - brcm,bcm63138
+ - brcm,bcm-nsp-smp
+ - brcm,brahma-b15
+ - marvell,armada-375-smp
+ - marvell,armada-380-smp
+ - marvell,armada-390-smp
+ - marvell,armada-xp-smp
+ - marvell,98dx3236-smp
+ - mediatek,mt6589-smp
+ - mediatek,mt81xx-tz-smp
+ - qcom,gcc-msm8660
+ - qcom,kpss-acc-v1
+ - qcom,kpss-acc-v2
+ - renesas,apmu
+ - renesas,r9a06g032-smp
+ - rockchip,rk3036-smp
+ - rockchip,rk3066-smp
+ - socionext,milbeaut-m10v-smp
+ - ste,dbx500-smp
+
+ cpu-release-addr:
+ $ref: '/schemas/types.yaml#/definitions/uint64'
+
+ description:
+ Required for systems that have an "enable-method"
+ property value of "spin-table".
+ On ARM v8 64-bit systems must be a two cell
+ property identifying a 64-bit zero-initialised
+ memory location.
+
+ cpu-idle-states:
+ $ref: '/schemas/types.yaml#/definitions/phandle-array'
+ description: |
+ List of phandles to idle state nodes supported
+ by this cpu (see ./idle-states.txt).
+
+ capacity-dmips-mhz:
+ $ref: '/schemas/types.yaml#/definitions/uint32'
+ description:
+ u32 value representing CPU capacity (see ./cpu-capacity.txt) in
+ DMIPS/MHz, relative to highest capacity-dmips-mhz
+ in the system.
+
+ dynamic-power-coefficient:
+ $ref: '/schemas/types.yaml#/definitions/uint32'
+ description:
+ A u32 value that represents the running time dynamic
+ power coefficient in units of uW/MHz/V^2. The
+ coefficient can either be calculated from power
+ measurements or derived by analysis.
+
+ The dynamic power consumption of the CPU is
+ proportional to the square of the Voltage (V) and
+ the clock frequency (f). The coefficient is used to
+ calculate the dynamic power as below -
+
+ Pdyn = dynamic-power-coefficient * V^2 * f
+
+ where voltage is in V, frequency is in MHz.
+
+ qcom,saw:
+ $ref: '/schemas/types.yaml#/definitions/phandle'
+ description: |
+ Specifies the SAW* node associated with this CPU.
- * If cpus node's #address-cells property is set to 2
+ Required for systems that have an "enable-method" property
+ value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
- The first reg cell bits [7:0] must be set to
- bits [39:32] of MPIDR_EL1.
+ * arm/msm/qcom,saw2.txt
- The second reg cell bits [23:0] must be set to
- bits [23:0] of MPIDR_EL1.
+ qcom,acc:
+ $ref: '/schemas/types.yaml#/definitions/phandle'
+ description: |
+ Specifies the ACC* node associated with this CPU.
- * If cpus node's #address-cells property is set to 1
+ Required for systems that have an "enable-method" property
+ value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
- The reg cell bits [23:0] must be set to bits [23:0]
- of MPIDR_EL1.
+ * arm/msm/qcom,kpss-acc.txt
- All other bits in the reg cells must be set to 0.
+ rockchip,pmu:
+ $ref: '/schemas/types.yaml#/definitions/phandle'
+ description: |
+ Specifies the syscon node controlling the cpu core power domains.
- compatible:
- items:
- - enum:
- - arm,arm710t
- - arm,arm720t
- - arm,arm740t
- - arm,arm7ej-s
- - arm,arm7tdmi
- - arm,arm7tdmi-s
- - arm,arm9es
- - arm,arm9ej-s
- - arm,arm920t
- - arm,arm922t
- - arm,arm925
- - arm,arm926e-s
- - arm,arm926ej-s
- - arm,arm940t
- - arm,arm946e-s
- - arm,arm966e-s
- - arm,arm968e-s
- - arm,arm9tdmi
- - arm,arm1020e
- - arm,arm1020t
- - arm,arm1022e
- - arm,arm1026ej-s
- - arm,arm1136j-s
- - arm,arm1136jf-s
- - arm,arm1156t2-s
- - arm,arm1156t2f-s
- - arm,arm1176jzf
- - arm,arm1176jz-s
- - arm,arm1176jzf-s
- - arm,arm11mpcore
- - arm,armv8 # Only for s/w models
- - arm,cortex-a5
- - arm,cortex-a7
- - arm,cortex-a8
- - arm,cortex-a9
- - arm,cortex-a12
- - arm,cortex-a15
- - arm,cortex-a17
- - arm,cortex-a53
- - arm,cortex-a57
- - arm,cortex-a72
- - arm,cortex-a73
- - arm,cortex-m0
- - arm,cortex-m0+
- - arm,cortex-m1
- - arm,cortex-m3
- - arm,cortex-m4
- - arm,cortex-r4
- - arm,cortex-r5
- - arm,cortex-r7
- - brcm,brahma-b15
- - brcm,brahma-b53
- - brcm,vulcan
- - cavium,thunder
- - cavium,thunder2
- - faraday,fa526
- - intel,sa110
- - intel,sa1100
- - marvell,feroceon
- - marvell,mohawk
- - marvell,pj4a
- - marvell,pj4b
- - marvell,sheeva-v5
- - marvell,sheeva-v7
- - nvidia,tegra132-denver
- - nvidia,tegra186-denver
- - nvidia,tegra194-carmel
- - qcom,krait
- - qcom,kryo
- - qcom,kryo385
- - qcom,scorpion
-
- enable-method:
- allOf:
- - $ref: '/schemas/types.yaml#/definitions/string'
- - oneOf:
- # On ARM v8 64-bit this property is required
- - enum:
- - psci
- - spin-table
- # On ARM 32-bit systems this property is optional
- - enum:
- - actions,s500-smp
- - allwinner,sun6i-a31
- - allwinner,sun8i-a23
- - allwinner,sun9i-a80-smp
- - allwinner,sun8i-a83t-smp
- - amlogic,meson8-smp
- - amlogic,meson8b-smp
- - arm,realview-smp
- - brcm,bcm11351-cpu-method
- - brcm,bcm23550
- - brcm,bcm2836-smp
- - brcm,bcm63138
- - brcm,bcm-nsp-smp
- - brcm,brahma-b15
- - marvell,armada-375-smp
- - marvell,armada-380-smp
- - marvell,armada-390-smp
- - marvell,armada-xp-smp
- - marvell,98dx3236-smp
- - mediatek,mt6589-smp
- - mediatek,mt81xx-tz-smp
- - qcom,gcc-msm8660
- - qcom,kpss-acc-v1
- - qcom,kpss-acc-v2
- - renesas,apmu
- - renesas,r9a06g032-smp
- - rockchip,rk3036-smp
- - rockchip,rk3066-smp
- - socionext,milbeaut-m10v-smp
- - ste,dbx500-smp
-
- cpu-release-addr:
- $ref: '/schemas/types.yaml#/definitions/uint64'
-
- description:
- Required for systems that have an "enable-method"
- property value of "spin-table".
- On ARM v8 64-bit systems must be a two cell
- property identifying a 64-bit zero-initialised
- memory location.
-
- cpu-idle-states:
- $ref: '/schemas/types.yaml#/definitions/phandle-array'
- description: |
- List of phandles to idle state nodes supported
- by this cpu (see ./idle-states.txt).
-
- capacity-dmips-mhz:
- $ref: '/schemas/types.yaml#/definitions/uint32'
- description:
- u32 value representing CPU capacity (see ./cpu-capacity.txt) in
- DMIPS/MHz, relative to highest capacity-dmips-mhz
- in the system.
-
- dynamic-power-coefficient:
- $ref: '/schemas/types.yaml#/definitions/uint32'
- description:
- A u32 value that represents the running time dynamic
- power coefficient in units of uW/MHz/V^2. The
- coefficient can either be calculated from power
- measurements or derived by analysis.
-
- The dynamic power consumption of the CPU is
- proportional to the square of the Voltage (V) and
- the clock frequency (f). The coefficient is used to
- calculate the dynamic power as below -
-
- Pdyn = dynamic-power-coefficient * V^2 * f
-
- where voltage is in V, frequency is in MHz.
-
- qcom,saw:
- $ref: '/schemas/types.yaml#/definitions/phandle'
- description: |
- Specifies the SAW* node associated with this CPU.
-
- Required for systems that have an "enable-method" property
- value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
-
- * arm/msm/qcom,saw2.txt
-
- qcom,acc:
- $ref: '/schemas/types.yaml#/definitions/phandle'
- description: |
- Specifies the ACC* node associated with this CPU.
-
- Required for systems that have an "enable-method" property
- value of "qcom,kpss-acc-v1" or "qcom,kpss-acc-v2"
-
- * arm/msm/qcom,kpss-acc.txt
-
- rockchip,pmu:
- $ref: '/schemas/types.yaml#/definitions/phandle'
- description: |
- Specifies the syscon node controlling the cpu core power domains.
-
- Optional for systems that have an "enable-method"
- property value of "rockchip,rk3066-smp"
- While optional, it is the preferred way to get access to
- the cpu-core power-domains.
-
- required:
- - device_type
- - reg
- - compatible
-
- dependencies:
- cpu-release-addr: [enable-method]
- rockchip,pmu: [enable-method]
+ Optional for systems that have an "enable-method"
+ property value of "rockchip,rk3066-smp"
+ While optional, it is the preferred way to get access to
+ the cpu-core power-domains.
required:
- - '#address-cells'
- - '#size-cells'
+ - device_type
+ - reg
+ - compatible
+
+dependencies:
+ rockchip,pmu: [enable-method]
examples:
- |
diff --git a/Documentation/devicetree/bindings/arm/digicolor.txt b/Documentation/devicetree/bindings/arm/digicolor.txt
deleted file mode 100644
index 658553f40b23..000000000000
--- a/Documentation/devicetree/bindings/arm/digicolor.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-Conexant Digicolor Platforms Device Tree Bindings
-
-Each device tree must specify which Conexant Digicolor SoC it uses.
-Must be the following compatible string:
-
- cnxt,cx92755
diff --git a/Documentation/devicetree/bindings/arm/digicolor.yaml b/Documentation/devicetree/bindings/arm/digicolor.yaml
new file mode 100644
index 000000000000..d9c80b827e9b
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/digicolor.yaml
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/digicolor.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Conexant Digicolor Platforms Device Tree Bindings
+
+maintainers:
+ - Baruch Siach <baruch@tkos.co.il>
+
+properties:
+ compatible:
+ const: cnxt,cx92755
+
+...
diff --git a/Documentation/devicetree/bindings/arm/emtrion.txt b/Documentation/devicetree/bindings/arm/emtrion.txt
deleted file mode 100644
index 83329aefc483..000000000000
--- a/Documentation/devicetree/bindings/arm/emtrion.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-Emtrion Devicetree Bindings
-===========================
-
-emCON Series:
--------------
-
-Required root node properties
- - compatible:
- - "emtrion,emcon-mx6", "fsl,imx6q"; : emCON-MX6D or emCON-MX6Q SoM
- - "emtrion,emcon-mx6-avari", "fsl,imx6q"; : emCON-MX6D or emCON-MX6Q SoM on Avari Base
- - "emtrion,emcon-mx6", "fsl,imx6dl"; : emCON-MX6S or emCON-MX6DL SoM
- - "emtrion,emcon-mx6-avari", "fsl,imx6dl"; : emCON-MX6S or emCON-MX6DL SoM on Avari Base
diff --git a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
index 5d7dbabbb784..a575e42f7fec 100644
--- a/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
+++ b/Documentation/devicetree/bindings/arm/freescale/fsl,scu.txt
@@ -133,6 +133,28 @@ RTC bindings based on SCU Message Protocol
Required properties:
- compatible: should be "fsl,imx8qxp-sc-rtc";
+OCOTP bindings based on SCU Message Protocol
+------------------------------------------------------------
+Required properties:
+- compatible: Should be "fsl,imx8qxp-scu-ocotp"
+- #address-cells: Must be 1. Contains byte index
+- #size-cells: Must be 1. Contains byte length
+
+Optional Child nodes:
+
+- Data cells of ocotp:
+ Detailed bindings are described in bindings/nvmem/nvmem.txt
+
+Watchdog bindings based on SCU Message Protocol
+------------------------------------------------------------
+
+Required properties:
+- compatible: should be:
+ "fsl,imx8qxp-sc-wdt"
+ followed by "fsl,imx-sc-wdt";
+Optional properties:
+- timeout-sec: contains the watchdog timeout in seconds.
+
Example (imx8qxp):
-------------
aliases {
@@ -177,6 +199,16 @@ firmware {
...
};
+ ocotp: imx8qx-ocotp {
+ compatible = "fsl,imx8qxp-scu-ocotp";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ fec_mac0: mac@2c4 {
+ reg = <0x2c4 8>;
+ };
+ };
+
pd: imx8qx-pd {
compatible = "fsl,imx8qxp-scu-pd", "fsl,scu-pd";
#power-domain-cells = <1>;
@@ -185,6 +217,11 @@ firmware {
rtc: rtc {
compatible = "fsl,imx8qxp-sc-rtc";
};
+
+ watchdog {
+ compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
+ timeout-sec = <60>;
+ };
};
};
diff --git a/Documentation/devicetree/bindings/arm/fsl.yaml b/Documentation/devicetree/bindings/arm/fsl.yaml
index 407138ebc0d0..7294ac36f4c0 100644
--- a/Documentation/devicetree/bindings/arm/fsl.yaml
+++ b/Documentation/devicetree/bindings/arm/fsl.yaml
@@ -15,6 +15,13 @@ properties:
const: '/'
compatible:
oneOf:
+ - description: i.MX1 based Boards
+ items:
+ - enum:
+ - armadeus,imx1-apf9328
+ - fsl,imx1ads
+ - const: fsl,imx1
+
- description: i.MX23 based Boards
items:
- enum:
@@ -51,6 +58,25 @@ properties:
- const: i2se,duckbill-2
- const: fsl,imx28
+ - description: i.MX31 based Boards
+ items:
+ - enum:
+ - buglabs,imx31-bug
+ - logicpd,imx31-lite
+ - const: fsl,imx31
+
+ - description: i.MX35 based Boards
+ items:
+ - enum:
+ - fsl,imx35-pdk
+ - const: fsl,imx35
+
+ - description: i.MX35 Eukrea CPUIMX35 Board
+ items:
+ - const: eukrea,mbimxsd35-baseboard
+ - const: eukrea,cpuimx35
+ - const: fsl,imx35
+
- description: i.MX50 based Boards
items:
- enum:
@@ -80,6 +106,8 @@ properties:
- description: i.MX6Q based Boards
items:
- enum:
+ - emtrion,emcon-mx6 # emCON-MX6D or emCON-MX6Q SoM
+ - emtrion,emcon-mx6-avari # emCON-MX6D or emCON-MX6Q SoM on Avari Base
- fsl,imx6q-arm2
- fsl,imx6q-sabreauto
- fsl,imx6q-sabrelite
@@ -99,6 +127,8 @@ properties:
items:
- enum:
- eckelmann,imx6dl-ci4x10
+ - emtrion,emcon-mx6 # emCON-MX6S or emCON-MX6DL SoM
+ - emtrion,emcon-mx6-avari # emCON-MX6S or emCON-MX6DL SoM on Avari Base
- fsl,imx6dl-sabreauto # i.MX6 DualLite/Solo SABRE Automotive Board
- fsl,imx6dl-sabresd # i.MX6 DualLite SABRE Smart Device Board
- technologic,imx6dl-ts4900
@@ -156,6 +186,7 @@ properties:
items:
- enum:
- fsl,imx7d-sdb # i.MX7 SabreSD Board
+ - novtech,imx7d-meerkat96 # i.MX7 Meerkat96 Board
- tq,imx7d-mba7 # i.MX7D TQ MBa7 with TQMa7D SoM
- zii,imx7d-rpu2 # ZII RPU2 Board
- const: fsl,imx7d
@@ -171,12 +202,25 @@ properties:
- const: compulab,cl-som-imx7
- const: fsl,imx7d
+ - description: i.MX7ULP based Boards
+ items:
+ - enum:
+ - fsl,imx7ulp-evk # i.MX7ULP Evaluation Kit
+ - const: fsl,imx7ulp
+
- description: i.MX8MM based Boards
items:
- enum:
- fsl,imx8mm-evk # i.MX8MM EVK Board
- const: fsl,imx8mm
+ - description: i.MX8MQ based Boards
+ items:
+ - enum:
+ - fsl,imx8mq-evk # i.MX8MQ EVK Board
+ - purism,librem5-devkit # Purism Librem5 devkit
+ - const: fsl,imx8mq
+
- description: i.MX8QXP based Boards
items:
- enum:
diff --git a/Documentation/devicetree/bindings/arm/idle-states.txt b/Documentation/devicetree/bindings/arm/idle-states.txt
index 45730ba60af5..326f29b270ad 100644
--- a/Documentation/devicetree/bindings/arm/idle-states.txt
+++ b/Documentation/devicetree/bindings/arm/idle-states.txt
@@ -241,9 +241,13 @@ processor idle states, defined as device tree nodes, are listed.
- "psci"
# On ARM 32-bit systems this property is optional
-The nodes describing the idle states (state) can only be defined within the
-idle-states node, any other configuration is considered invalid and therefore
-must be ignored.
+This assumes that the "enable-method" property is set to "psci" in the cpu
+node[6] that is responsible for setting up CPU idle management in the OS
+implementation.
+
+The nodes describing the idle states (state) can only be defined
+within the idle-states node, any other configuration is considered invalid
+and therefore must be ignored.
===========================================
4 - state node
@@ -687,7 +691,7 @@ cpus {
Documentation/devicetree/bindings/arm/cpus.yaml
[2] ARM Linux Kernel documentation - PSCI bindings
- Documentation/devicetree/bindings/arm/psci.txt
+ Documentation/devicetree/bindings/arm/psci.yaml
[3] ARM Server Base System Architecture (SBSA)
http://infocenter.arm.com/help/index.jsp
@@ -697,3 +701,6 @@ cpus {
[5] Devicetree Specification
https://www.devicetree.org/specifications/
+
+[6] ARM Linux Kernel documentation - Booting AArch64 Linux
+ Documentation/arm64/booting.txt
diff --git a/Documentation/devicetree/bindings/arm/mediatek.txt b/Documentation/devicetree/bindings/arm/mediatek.txt
deleted file mode 100644
index 56ac7896d6d8..000000000000
--- a/Documentation/devicetree/bindings/arm/mediatek.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-MediaTek SoC based Platforms Device Tree Bindings
-
-Boards with a MediaTek SoC shall have the following property:
-
-Required root node property:
-
-compatible: Must contain one of
- "mediatek,mt2701"
- "mediatek,mt2712"
- "mediatek,mt6580"
- "mediatek,mt6589"
- "mediatek,mt6592"
- "mediatek,mt6755"
- "mediatek,mt6765"
- "mediatek,mt6795"
- "mediatek,mt6797"
- "mediatek,mt7622"
- "mediatek,mt7623"
- "mediatek,mt7629"
- "mediatek,mt8127"
- "mediatek,mt8135"
- "mediatek,mt8173"
- "mediatek,mt8183"
-
-
-Supported boards:
-
-- Evaluation board for MT2701:
- Required root node properties:
- - compatible = "mediatek,mt2701-evb", "mediatek,mt2701";
-- Evaluation board for MT2712:
- Required root node properties:
- - compatible = "mediatek,mt2712-evb", "mediatek,mt2712";
-- Evaluation board for MT6580:
- Required root node properties:
- - compatible = "mediatek,mt6580-evbp1", "mediatek,mt6580";
-- bq Aquaris5 smart phone:
- Required root node properties:
- - compatible = "mundoreader,bq-aquaris5", "mediatek,mt6589";
-- Evaluation board for MT6592:
- Required root node properties:
- - compatible = "mediatek,mt6592-evb", "mediatek,mt6592";
-- Evaluation phone for MT6755(Helio P10):
- Required root node properties:
- - compatible = "mediatek,mt6755-evb", "mediatek,mt6755";
-- Evaluation board for MT6765(Helio P22):
- Required root node properties:
- - compatible = "mediatek,mt6765-evb", "mediatek,mt6765";
-- Evaluation board for MT6795(Helio X10):
- Required root node properties:
- - compatible = "mediatek,mt6795-evb", "mediatek,mt6795";
-- Evaluation board for MT6797(Helio X20):
- Required root node properties:
- - compatible = "mediatek,mt6797-evb", "mediatek,mt6797";
-- Mediatek X20 Development Board:
- Required root node properties:
- - compatible = "archermind,mt6797-x20-dev", "mediatek,mt6797";
-- Reference board variant 1 for MT7622:
- Required root node properties:
- - compatible = "mediatek,mt7622-rfb1", "mediatek,mt7622";
-- Bananapi BPI-R64 for MT7622:
- Required root node properties:
- - compatible = "bananapi,bpi-r64", "mediatek,mt7622";
-- Reference board for MT7623a with eMMC:
- Required root node properties:
- - compatible = "mediatek,mt7623a-rfb-emmc", "mediatek,mt7623";
-- Reference board for MT7623a with NAND:
- Required root node properties:
- - compatible = "mediatek,mt7623a-rfb-nand", "mediatek,mt7623";
-- Reference board for MT7623n with eMMC:
- Required root node properties:
- - compatible = "mediatek,mt7623n-rfb-emmc", "mediatek,mt7623";
-- Bananapi BPI-R2 board:
- - compatible = "bananapi,bpi-r2", "mediatek,mt7623";
-- Reference board for MT7629:
- Required root node properties:
- - compatible = "mediatek,mt7629-rfb", "mediatek,mt7629";
-- MTK mt8127 tablet moose EVB:
- Required root node properties:
- - compatible = "mediatek,mt8127-moose", "mediatek,mt8127";
-- MTK mt8135 tablet EVB:
- Required root node properties:
- - compatible = "mediatek,mt8135-evbp1", "mediatek,mt8135";
-- MTK mt8173 tablet EVB:
- Required root node properties:
- - compatible = "mediatek,mt8173-evb", "mediatek,mt8173";
-- Evaluation board for MT8183:
- Required root node properties:
- - compatible = "mediatek,mt8183-evb", "mediatek,mt8183";
diff --git a/Documentation/devicetree/bindings/arm/mediatek.yaml b/Documentation/devicetree/bindings/arm/mediatek.yaml
new file mode 100644
index 000000000000..a4ad2eb926f9
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/mediatek.yaml
@@ -0,0 +1,91 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/mediatek.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MediaTek SoC based Platforms Device Tree Bindings
+
+maintainers:
+ - Sean Wang <sean.wang@mediatek.com>
+ - Matthias Brugger <matthias.bgg@gmail.com>
+description: |
+ Boards with a MediaTek SoC shall have the following properties.
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ oneOf:
+ - items:
+ - enum:
+ - mediatek,mt2701-evb
+ - const: mediatek,mt2701
+
+ - items:
+ - enum:
+ - mediatek,mt2712-evb
+ - const: mediatek,mt2712
+ - items:
+ - enum:
+ - mediatek,mt6580-evbp1
+ - const: mediatek,mt6580
+ - items:
+ - enum:
+ - mundoreader,bq-aquaris5
+ - const: mediatek,mt6589
+ - items:
+ - enum:
+ - mediatek,mt6592-evb
+ - const: mediatek,mt6592
+ - items:
+ - enum:
+ - mediatek,mt6755-evb
+ - const: mediatek,mt6755
+ - items:
+ - enum:
+ - mediatek,mt6765-evb
+ - const: mediatek,mt6765
+ - items:
+ - enum:
+ - mediatek,mt6795-evb
+ - const: mediatek,mt6795
+ - items:
+ - enum:
+ - archermind,mt6797-x20-dev
+ - mediatek,mt6797-evb
+ - const: mediatek,mt6797
+ - items:
+ - enum:
+ - bananapi,bpi-r64
+ - mediatek,mt7622-rfb1
+ - const: mediatek,mt7622
+ - items:
+ - enum:
+ - mediatek,mt7623a-rfb-emmc
+ - mediatek,mt7623a-rfb-nand
+ - mediatek,mt7623n-rfb-emmc
+ - bananapi,bpi-r2
+ - const: mediatek,mt7623
+
+ - items:
+ - enum:
+ - mediatek,mt7629-rfb
+ - const: mediatek,mt7629
+ - items:
+ - enum:
+ - mediatek,mt8127-moose
+ - const: mediatek,mt8127
+ - items:
+ - enum:
+ - mediatek,mt8135-evbp1
+ - const: mediatek,mt8135
+ - items:
+ - enum:
+ - mediatek,mt8173-evb
+ - const: mediatek,mt8173
+ - items:
+ - enum:
+ - mediatek,mt8183-evb
+ - const: mediatek,mt8183
+...
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
index f3cef1a6d95c..07c9d813465c 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,audsys.txt
@@ -10,6 +10,7 @@ Required Properties:
- "mediatek,mt7622-audsys", "syscon"
- "mediatek,mt7623-audsys", "mediatek,mt2701-audsys", "syscon"
- "mediatek,mt8183-audiosys", "syscon"
+ - "mediatek,mt8516-audsys", "syscon"
- #clock-cells: Must be 1
The AUDSYS controller uses the common clk binding from
diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
index 30cb645c0e54..f5518f26a914 100644
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,sgmiisys.txt
@@ -9,6 +9,8 @@ Required Properties:
- "mediatek,mt7622-sgmiisys", "syscon"
- "mediatek,mt7629-sgmiisys", "syscon"
- #clock-cells: Must be 1
+- mediatek,physpeed: Should be one of "auto", "1000" or "2500" to match up
+ the capability of the target PHY.
The SGMIISYS controller uses the common clk binding from
Documentation/devicetree/bindings/clock/clock-bindings.txt
diff --git a/Documentation/devicetree/bindings/arm/moxart.txt b/Documentation/devicetree/bindings/arm/moxart.txt
deleted file mode 100644
index 11087edb0658..000000000000
--- a/Documentation/devicetree/bindings/arm/moxart.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-MOXA ART device tree bindings
-
-Boards with the MOXA ART SoC shall have the following properties:
-
-Required root node property:
-
-compatible = "moxa,moxart";
-
-Boards:
-
-- UC-7112-LX: embedded computer
- compatible = "moxa,moxart-uc-7112-lx", "moxa,moxart"
diff --git a/Documentation/devicetree/bindings/arm/moxart.yaml b/Documentation/devicetree/bindings/arm/moxart.yaml
new file mode 100644
index 000000000000..c068df59fad2
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/moxart.yaml
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/moxart.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MOXA ART device tree bindings
+
+maintainers:
+ - Jonas Jensen <jonas.jensen@gmail.com>
+
+properties:
+ compatible:
+ description: UC-7112-LX embedded computer
+ items:
+ - const: moxa,moxart-uc-7112-lx
+ - const: moxa,moxart
+
+...
diff --git a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.txt b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.txt
deleted file mode 100644
index 56ec8ddc4a3b..000000000000
--- a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-NXP LPC32xx Platforms Device Tree Bindings
-------------------------------------------
-
-Boards with the NXP LPC32xx SoC shall have the following properties:
-
-Required root node property:
-
-compatible: must be "nxp,lpc3220", "nxp,lpc3230", "nxp,lpc3240" or "nxp,lpc3250"
diff --git a/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml
new file mode 100644
index 000000000000..07f39d3eee7e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/nxp/lpc32xx.yaml
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/nxp/lpc32xx.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP LPC32xx Platforms Device Tree Bindings
+
+maintainers:
+ - Roland Stigge <stigge@antcom.de>
+
+properties:
+ compatible:
+ oneOf:
+ - enum:
+ - nxp,lpc3220
+ - nxp,lpc3230
+ - nxp,lpc3240
+ - items:
+ - enum:
+ - ea,ea3250
+ - phytec,phy3250
+ - const: nxp,lpc3250
+
+...
diff --git a/Documentation/devicetree/bindings/arm/omap/omap.txt b/Documentation/devicetree/bindings/arm/omap/omap.txt
index 1c1e48fd94b5..b301f753ed2c 100644
--- a/Documentation/devicetree/bindings/arm/omap/omap.txt
+++ b/Documentation/devicetree/bindings/arm/omap/omap.txt
@@ -160,6 +160,9 @@ Boards:
- AM335X phyCORE-AM335x: Development kit
compatible = "phytec,am335x-pcm-953", "phytec,am335x-phycore-som", "ti,am33xx"
+- AM335x phyBOARD-REGOR: Single Board Computer
+ compatible = "phytec,am335x-regor", "phytec,am335x-phycore-som", "ti,am33xx"
+
- AM335X UC-8100-ME-T: Communication-centric industrial computing platform
compatible = "moxa,uc-8100-me-t", "ti,am33xx";
diff --git a/Documentation/devicetree/bindings/arm/psci.txt b/Documentation/devicetree/bindings/arm/psci.txt
deleted file mode 100644
index a2c4f1d52492..000000000000
--- a/Documentation/devicetree/bindings/arm/psci.txt
+++ /dev/null
@@ -1,111 +0,0 @@
-* Power State Coordination Interface (PSCI)
-
-Firmware implementing the PSCI functions described in ARM document number
-ARM DEN 0022A ("Power State Coordination Interface System Software on ARM
-processors") can be used by Linux to initiate various CPU-centric power
-operations.
-
-Issue A of the specification describes functions for CPU suspend, hotplug
-and migration of secure software.
-
-Functions are invoked by trapping to the privilege level of the PSCI
-firmware (specified as part of the binding below) and passing arguments
-in a manner similar to that specified by AAPCS:
-
- r0 => 32-bit Function ID / return value
- {r1 - r3} => Parameters
-
-Note that the immediate field of the trapping instruction must be set
-to #0.
-
-
-Main node required properties:
-
- - compatible : should contain at least one of:
-
- * "arm,psci" : For implementations complying to PSCI versions prior
- to 0.2.
- For these cases function IDs must be provided.
-
- * "arm,psci-0.2" : For implementations complying to PSCI 0.2.
- Function IDs are not required and should be ignored by
- an OS with PSCI 0.2 support, but are permitted to be
- present for compatibility with existing software when
- "arm,psci" is later in the compatible list.
-
- * "arm,psci-1.0" : For implementations complying to PSCI 1.0.
- PSCI 1.0 is backward compatible with PSCI 0.2 with
- minor specification updates, as defined in the PSCI
- specification[2].
-
- - method : The method of calling the PSCI firmware. Permitted
- values are:
-
- "smc" : SMC #0, with the register assignments specified
- in this binding.
-
- "hvc" : HVC #0, with the register assignments specified
- in this binding.
-
-Main node optional properties:
-
- - cpu_suspend : Function ID for CPU_SUSPEND operation
-
- - cpu_off : Function ID for CPU_OFF operation
-
- - cpu_on : Function ID for CPU_ON operation
-
- - migrate : Function ID for MIGRATE operation
-
-Device tree nodes that require usage of PSCI CPU_SUSPEND function (ie idle
-state nodes, as per bindings in [1]) must specify the following properties:
-
-- arm,psci-suspend-param
- Usage: Required for state nodes[1] if the corresponding
- idle-states node entry-method property is set
- to "psci".
- Value type: <u32>
- Definition: power_state parameter to pass to the PSCI
- suspend call.
-
-Example:
-
-Case 1: PSCI v0.1 only.
-
- psci {
- compatible = "arm,psci";
- method = "smc";
- cpu_suspend = <0x95c10000>;
- cpu_off = <0x95c10001>;
- cpu_on = <0x95c10002>;
- migrate = <0x95c10003>;
- };
-
-Case 2: PSCI v0.2 only
-
- psci {
- compatible = "arm,psci-0.2";
- method = "smc";
- };
-
-Case 3: PSCI v0.2 and PSCI v0.1.
-
- A DTB may provide IDs for use by kernels without PSCI 0.2 support,
- enabling firmware and hypervisors to support existing and new kernels.
- These IDs will be ignored by kernels with PSCI 0.2 support, which will
- use the standard PSCI 0.2 IDs exclusively.
-
- psci {
- compatible = "arm,psci-0.2", "arm,psci";
- method = "hvc";
-
- cpu_on = < arbitrary value >;
- cpu_off = < arbitrary value >;
-
- ...
- };
-
-[1] Kernel documentation - ARM idle states bindings
- Documentation/devicetree/bindings/arm/idle-states.txt
-[2] Power State Coordination Interface (PSCI) specification
- http://infocenter.arm.com/help/topic/com.arm.doc.den0022c/DEN0022C_Power_State_Coordination_Interface.pdf
diff --git a/Documentation/devicetree/bindings/arm/psci.yaml b/Documentation/devicetree/bindings/arm/psci.yaml
new file mode 100644
index 000000000000..7abdf58b335e
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/psci.yaml
@@ -0,0 +1,163 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/psci.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Power State Coordination Interface (PSCI)
+
+maintainers:
+ - Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+
+description: |+
+ Firmware implementing the PSCI functions described in ARM document number
+ ARM DEN 0022A ("Power State Coordination Interface System Software on ARM
+ processors") can be used by Linux to initiate various CPU-centric power
+ operations.
+
+ Issue A of the specification describes functions for CPU suspend, hotplug
+ and migration of secure software.
+
+ Functions are invoked by trapping to the privilege level of the PSCI
+ firmware (specified as part of the binding below) and passing arguments
+ in a manner similar to that specified by AAPCS:
+
+ r0 => 32-bit Function ID / return value
+ {r1 - r3} => Parameters
+
+ Note that the immediate field of the trapping instruction must be set
+ to #0.
+
+ [2] Power State Coordination Interface (PSCI) specification
+ http://infocenter.arm.com/help/topic/com.arm.doc.den0022c/DEN0022C_Power_State_Coordination_Interface.pdf
+
+properties:
+ compatible:
+ oneOf:
+ - description:
+ For implementations complying to PSCI versions prior to 0.2.
+ const: arm,psci
+
+ - description:
+ For implementations complying to PSCI 0.2.
+ const: arm,psci-0.2
+
+ - description:
+ For implementations complying to PSCI 0.2.
+ Function IDs are not required and should be ignored by an OS with
+ PSCI 0.2 support, but are permitted to be present for compatibility
+ with existing software when "arm,psci" is later in the compatible
+ list.
+ items:
+ - const: arm,psci-0.2
+ - const: arm,psci
+
+ - description:
+ For implementations complying to PSCI 1.0.
+ const: arm,psci-1.0
+
+ - description:
+ For implementations complying to PSCI 1.0.
+ PSCI 1.0 is backward compatible with PSCI 0.2 with minor
+ specification updates, as defined in the PSCI specification[2].
+ items:
+ - const: arm,psci-1.0
+ - const: arm,psci-0.2
+
+ method:
+ description: The method of calling the PSCI firmware.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/string-array
+ - enum:
+ # SMC #0, with the register assignments specified in this binding.
+ - smc
+ # HVC #0, with the register assignments specified in this binding.
+ - hvc
+
+ cpu_suspend:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Function ID for CPU_SUSPEND operation
+
+ cpu_off:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Function ID for CPU_OFF operation
+
+ cpu_on:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Function ID for CPU_ON operation
+
+ migrate:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: Function ID for MIGRATE operation
+
+ arm,psci-suspend-param:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description: |
+ power_state parameter to pass to the PSCI suspend call.
+
+ Device tree nodes that require usage of PSCI CPU_SUSPEND function (ie
+ idle state nodes with entry-method property is set to "psci", as per
+ bindings in [1]) must specify this property.
+
+ [1] Kernel documentation - ARM idle states bindings
+ Documentation/devicetree/bindings/arm/idle-states.txt
+
+
+required:
+ - compatible
+ - method
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: arm,psci
+ then:
+ required:
+ - cpu_off
+ - cpu_on
+
+examples:
+ - |+
+
+ // Case 1: PSCI v0.1 only.
+
+ psci {
+ compatible = "arm,psci";
+ method = "smc";
+ cpu_suspend = <0x95c10000>;
+ cpu_off = <0x95c10001>;
+ cpu_on = <0x95c10002>;
+ migrate = <0x95c10003>;
+ };
+
+ - |+
+
+ // Case 2: PSCI v0.2 only
+
+ psci {
+ compatible = "arm,psci-0.2";
+ method = "smc";
+ };
+
+
+ - |+
+
+ // Case 3: PSCI v0.2 and PSCI v0.1.
+
+ /*
+ * A DTB may provide IDs for use by kernels without PSCI 0.2 support,
+ * enabling firmware and hypervisors to support existing and new kernels.
+ * These IDs will be ignored by kernels with PSCI 0.2 support, which will
+ * use the standard PSCI 0.2 IDs exclusively.
+ */
+
+ psci {
+ compatible = "arm,psci-0.2", "arm,psci";
+ method = "hvc";
+
+ cpu_on = <0x95c10002>;
+ cpu_off = <0x95c10001>;
+ };
+...
diff --git a/Documentation/devicetree/bindings/arm/qcom.yaml b/Documentation/devicetree/bindings/arm/qcom.yaml
index f6316ab66385..54ef6b6b9189 100644
--- a/Documentation/devicetree/bindings/arm/qcom.yaml
+++ b/Documentation/devicetree/bindings/arm/qcom.yaml
@@ -102,6 +102,15 @@ properties:
- const: qcom,msm8960
- items:
+ - enum:
+ - fairphone,fp2
+ - lge,hammerhead
+ - sony,xperia-amami
+ - sony,xperia-castor
+ - sony,xperia-honami
+ - const: qcom,msm8974
+
+ - items:
- const: qcom,msm8916-mtp/1
- const: qcom,msm8916-mtp
- const: qcom,msm8916
@@ -110,6 +119,11 @@ properties:
- const: qcom,msm8996-mtp
- items:
+ - enum:
+ - qcom,ipq4019-ap-dk04.1-c3
+ - qcom,ipq4019-ap-dk07.1-c1
+ - qcom,ipq4019-ap-dk07.1-c2
+ - qcom,ipq4019-dk04.1-c1
- const: qcom,ipq4019
- items:
diff --git a/Documentation/devicetree/bindings/arm/rda.txt b/Documentation/devicetree/bindings/arm/rda.txt
deleted file mode 100644
index 43c80762c428..000000000000
--- a/Documentation/devicetree/bindings/arm/rda.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-RDA Micro platforms device tree bindings
-----------------------------------------
-
-RDA8810PL SoC
-=============
-
-Required root node properties:
-
- - compatible : must contain "rda,8810pl"
-
-
-Boards:
-
-Root node property compatible must contain, depending on board:
-
- - Orange Pi 2G-IoT: "xunlong,orangepi-2g-iot"
- - Orange Pi i96: "xunlong,orangepi-i96"
diff --git a/Documentation/devicetree/bindings/arm/rda.yaml b/Documentation/devicetree/bindings/arm/rda.yaml
new file mode 100644
index 000000000000..51cec2b63b04
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/rda.yaml
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/rda.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RDA Micro platforms device tree bindings
+
+maintainers:
+ - Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - xunlong,orangepi-2g-iot # Orange Pi 2G-IoT
+ - xunlong,orangepi-i96 # Orange Pi i96
+ - const: rda,8810pl
+
+...
diff --git a/Documentation/devicetree/bindings/arm/renesas.yaml b/Documentation/devicetree/bindings/arm/renesas.yaml
index 19f379863d50..08c923f8c257 100644
--- a/Documentation/devicetree/bindings/arm/renesas.yaml
+++ b/Documentation/devicetree/bindings/arm/renesas.yaml
@@ -106,6 +106,14 @@ properties:
- description: RZ/G2M (R8A774A1)
items:
+ - enum:
+ - hoperun,hihope-rzg2m # HopeRun HiHope RZ/G2M platform
+ - const: renesas,r8a774a1
+
+ - items:
+ - enum:
+ - hoperun,hihope-rzg2-ex # HopeRun expansion board for HiHope RZ/G2 platforms
+ - const: hoperun,hihope-rzg2m
- const: renesas,r8a774a1
- description: RZ/G2E (R8A774C0)
diff --git a/Documentation/devicetree/bindings/arm/rockchip.yaml b/Documentation/devicetree/bindings/arm/rockchip.yaml
index 5c6bbf10abc9..34865042f4e4 100644
--- a/Documentation/devicetree/bindings/arm/rockchip.yaml
+++ b/Documentation/devicetree/bindings/arm/rockchip.yaml
@@ -316,6 +316,19 @@ properties:
- const: haoyu,marsboard-rk3066
- const: rockchip,rk3066a
+ - description: Hugsun X99 TV Box
+ items:
+ - const: hugsun,x99
+ - const: rockchip,rk3399
+
+ - description: Khadas Edge series boards
+ items:
+ - enum:
+ - khadas,edge
+ - khadas,edge-captain
+ - khadas,edge-v
+ - const: rockchip,rk3399
+
- description: mqmaker MiQi
items:
- const: mqmaker,miqi
diff --git a/Documentation/devicetree/bindings/arm/stm32/mlahb.txt b/Documentation/devicetree/bindings/arm/stm32/mlahb.txt
new file mode 100644
index 000000000000..25307aa1eb9b
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/mlahb.txt
@@ -0,0 +1,37 @@
+ML-AHB interconnect bindings
+
+These bindings describe the STM32 SoCs ML-AHB interconnect bus which connects
+a Cortex-M subsystem with dedicated memories.
+The MCU SRAM and RETRAM memory parts can be accessed through different addresses
+(see "RAM aliases" in [1]) using different buses (see [2]) : balancing the
+Cortex-M firmware accesses among those ports allows to tune the system
+performance.
+
+[1]: https://www.st.com/resource/en/reference_manual/dm00327659.pdf
+[2]: https://wiki.st.com/stm32mpu/wiki/STM32MP15_RAM_mapping
+
+Required properties:
+- compatible: should be "simple-bus"
+- dma-ranges: describes memory addresses translation between the local CPU and
+ the remote Cortex-M processor. Each memory region, is declared with
+ 3 parameters:
+ - param 1: device base address (Cortex-M processor address)
+ - param 2: physical base address (local CPU address)
+ - param 3: size of the memory region.
+
+The Cortex-M remote processor accessed via the mlahb interconnect is described
+by a child node.
+
+Example:
+mlahb {
+ compatible = "simple-bus";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ dma-ranges = <0x00000000 0x38000000 0x10000>,
+ <0x10000000 0x10000000 0x60000>,
+ <0x30000000 0x30000000 0x60000>;
+
+ m4_rproc: m4@10000000 {
+ ...
+ };
+};
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32.txt b/Documentation/devicetree/bindings/arm/stm32/stm32.txt
deleted file mode 100644
index 6808ed9ddfd5..000000000000
--- a/Documentation/devicetree/bindings/arm/stm32/stm32.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-STMicroelectronics STM32 Platforms Device Tree Bindings
-
-Each device tree must specify which STM32 SoC it uses,
-using one of the following compatible strings:
-
- st,stm32f429
- st,stm32f469
- st,stm32f746
- st,stm32h743
- st,stm32mp157
diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32.yaml b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml
new file mode 100644
index 000000000000..4d194f1eb03a
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml
@@ -0,0 +1,31 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/stm32/stm32.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: STMicroelectronics STM32 Platforms Device Tree Bindings
+
+maintainers:
+ - Alexandre Torgue <alexandre.torgue@st.com>
+
+properties:
+ compatible:
+ oneOf:
+ - items:
+ - const: st,stm32f429
+
+ - items:
+ - const: st,stm32f469
+
+ - items:
+ - const: st,stm32f746
+
+ - items:
+ - const: st,stm32h743
+
+ - items:
+ - enum:
+ - arrow,stm32mp157a-avenger96 # Avenger96
+ - const: st,stm32mp157
+...
diff --git a/Documentation/devicetree/bindings/arm/sunxi.yaml b/Documentation/devicetree/bindings/arm/sunxi.yaml
index 285f4fc8519d..000a00d12d6a 100644
--- a/Documentation/devicetree/bindings/arm/sunxi.yaml
+++ b/Documentation/devicetree/bindings/arm/sunxi.yaml
@@ -263,7 +263,7 @@ properties:
- description: ICNova A20 SWAC
items:
- - const: swac,icnova-a20-swac
+ - const: incircuit,icnova-a20-swac
- const: incircuit,icnova-a20
- const: allwinner,sun7i-a20
diff --git a/Documentation/devicetree/bindings/arm/ti/k3.txt b/Documentation/devicetree/bindings/arm/ti/k3.txt
index 6a059cabb2da..333e7256126a 100644
--- a/Documentation/devicetree/bindings/arm/ti/k3.txt
+++ b/Documentation/devicetree/bindings/arm/ti/k3.txt
@@ -13,6 +13,9 @@ architecture it uses, using one of the following compatible values:
- AM654
compatible = "ti,am654";
+- J721E
+ compatible = "ti,j721e";
+
Boards
------
diff --git a/Documentation/devicetree/bindings/arm/xen.txt b/Documentation/devicetree/bindings/arm/xen.txt
index c9b9321434ea..db5c56db30ec 100644
--- a/Documentation/devicetree/bindings/arm/xen.txt
+++ b/Documentation/devicetree/bindings/arm/xen.txt
@@ -54,7 +54,7 @@ hypervisor {
};
The format and meaning of the "xen,uefi-*" parameters are similar to those in
-Documentation/arm/uefi.txt, which are provided by the regular UEFI stub. However
+Documentation/arm/uefi.rst, which are provided by the regular UEFI stub. However
they differ because they are provided by the Xen hypervisor, together with a set
of UEFI runtime services implemented via hypercalls, see
http://xenbits.xen.org/docs/unstable/hypercall/x86_64/include,public,platform.h.html.
diff --git a/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml
new file mode 100644
index 000000000000..be32f087c529
--- /dev/null
+++ b/Documentation/devicetree/bindings/bus/allwinner,sun8i-a23-rsb.yaml
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bus/allwinner,sun8i-a23-rsb.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A23 RSB Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun8i-a23-rsb
+ - items:
+ - const: allwinner,sun8i-a83t-rsb
+ - const: allwinner,sun8i-a23-rsb
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ clock-frequency:
+ minimum: 1
+ maximum: 20000000
+
+patternProperties:
+ "^.*@[0-9a-fA-F]+$":
+ type: object
+ properties:
+ reg:
+ maxItems: 1
+
+ required:
+ - reg
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - resets
+
+examples:
+ - |
+ rsb@1f03400 {
+ compatible = "allwinner,sun8i-a23-rsb";
+ reg = <0x01f03400 0x400>;
+ interrupts = <0 39 4>;
+ clocks = <&apb0_gates 3>;
+ clock-frequency = <3000000>;
+ resets = <&apb0_rst 3>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ pmic@3e3 {
+ compatible = "...";
+ reg = <0x3e3>;
+
+ /* ... */
+ };
+ };
+
+additionalProperties: false
diff --git a/Documentation/devicetree/bindings/bus/sunxi-rsb.txt b/Documentation/devicetree/bindings/bus/sunxi-rsb.txt
deleted file mode 100644
index eb3ed628c6f1..000000000000
--- a/Documentation/devicetree/bindings/bus/sunxi-rsb.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-Allwinner Reduced Serial Bus (RSB) controller
-
-The RSB controller found on later Allwinner SoCs is an SMBus like 2 wire
-serial bus with 1 master and up to 15 slaves. It is represented by a node
-for the controller itself, and child nodes representing the slave devices.
-
-Required properties :
-
- - reg : Offset and length of the register set for the controller.
- - compatible : Shall be "allwinner,sun8i-a23-rsb".
- - interrupts : The interrupt line associated to the RSB controller.
- - clocks : The gate clk associated to the RSB controller.
- - resets : The reset line associated to the RSB controller.
- - #address-cells : shall be 1
- - #size-cells : shall be 0
-
-Optional properties :
-
- - clock-frequency : Desired RSB bus clock frequency in Hz. Maximum is 20MHz.
- If not set this defaults to 3MHz.
-
-Child nodes:
-
-An RSB controller node can contain zero or more child nodes representing
-slave devices on the bus. Child 'reg' properties should contain the slave
-device's hardware address. The hardware address is hardwired in the device,
-which can normally be found in the datasheet.
-
-Example:
-
- rsb@1f03400 {
- compatible = "allwinner,sun8i-a23-rsb";
- reg = <0x01f03400 0x400>;
- interrupts = <0 39 4>;
- clocks = <&apb0_gates 3>;
- clock-frequency = <3000000>;
- resets = <&apb0_rst 3>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- pmic@3e3 {
- compatible = "...";
- reg = <0x3e3>;
-
- /* ... */
- };
- };
diff --git a/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml
new file mode 100644
index 000000000000..c935405458fe
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/allwinner,sun4i-a10-ccu.yaml
@@ -0,0 +1,141 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/phy/allwinner,sun4i-a10-ccu.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner Clock Control Unit Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#clock-cells":
+ const: 1
+
+ "#reset-cells":
+ const: 1
+
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-ccu
+ - allwinner,sun5i-a10s-ccu
+ - allwinner,sun5i-a13-ccu
+ - allwinner,sun6i-a31-ccu
+ - allwinner,sun7i-a20-ccu
+ - allwinner,sun8i-a23-ccu
+ - allwinner,sun8i-a33-ccu
+ - allwinner,sun8i-a83t-ccu
+ - allwinner,sun8i-a83t-r-ccu
+ - allwinner,sun8i-h3-ccu
+ - allwinner,sun8i-h3-r-ccu
+ - allwinner,sun8i-r40-ccu
+ - allwinner,sun8i-v3s-ccu
+ - allwinner,sun9i-a80-ccu
+ - allwinner,sun50i-a64-ccu
+ - allwinner,sun50i-a64-r-ccu
+ - allwinner,sun50i-h5-ccu
+ - allwinner,sun50i-h6-ccu
+ - allwinner,sun50i-h6-r-ccu
+ - allwinner,suniv-f1c100s-ccu
+ - nextthing,gr8-ccu
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 4
+ items:
+ - description: High Frequency Oscillator (usually at 24MHz)
+ - description: Low Frequency Oscillator (usually at 32kHz)
+ - description: Internal Oscillator
+ - description: Peripherals PLL
+
+ clock-names:
+ minItems: 2
+ maxItems: 4
+ items:
+ - const: hosc
+ - const: losc
+ - const: iosc
+ - const: pll-periph
+
+required:
+ - "#clock-cells"
+ - "#reset-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+if:
+ properties:
+ compatible:
+ enum:
+ - allwinner,sun8i-a83t-r-ccu
+ - allwinner,sun8i-h3-r-ccu
+ - allwinner,sun50i-a64-r-ccu
+ - allwinner,sun50i-h6-r-ccu
+
+then:
+ properties:
+ clocks:
+ minItems: 4
+ maxItems: 4
+
+ clock-names:
+ minItems: 4
+ maxItems: 4
+
+else:
+ if:
+ properties:
+ compatible:
+ const: allwinner,sun50i-h6-ccu
+
+ then:
+ properties:
+ clocks:
+ minItems: 3
+ maxItems: 3
+
+ clock-names:
+ minItems: 3
+ maxItems: 3
+
+ else:
+ properties:
+ clocks:
+ minItems: 2
+ maxItems: 2
+
+ clock-names:
+ minItems: 2
+ maxItems: 2
+
+additionalProperties: false
+
+examples:
+ - |
+ ccu: clock@1c20000 {
+ compatible = "allwinner,sun8i-h3-ccu";
+ reg = <0x01c20000 0x400>;
+ clocks = <&osc24M>, <&osc32k>;
+ clock-names = "hosc", "losc";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
+
+ - |
+ r_ccu: clock@1f01400 {
+ compatible = "allwinner,sun50i-a64-r-ccu";
+ reg = <0x01f01400 0x100>;
+ clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu 11>;
+ clock-names = "hosc", "losc", "iosc", "pll-periph";
+ #clock-cells = <1>;
+ #reset-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
index 5c8b105be4d6..6eaa52092313 100644
--- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
+++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
@@ -10,6 +10,7 @@ Required Properties:
"amlogic,gxl-clkc" for GXL and GXM SoC,
"amlogic,axg-clkc" for AXG SoC.
"amlogic,g12a-clkc" for G12A SoC.
+ "amlogic,g12b-clkc" for G12B SoC.
- clocks : list of clock phandle, one for each entry clock-names.
- clock-names : should contain the following:
* "xtal": the platform xtal
diff --git a/Documentation/devicetree/bindings/clock/at91-clock.txt b/Documentation/devicetree/bindings/clock/at91-clock.txt
index b520280e33ff..13f45db3b66d 100644
--- a/Documentation/devicetree/bindings/clock/at91-clock.txt
+++ b/Documentation/devicetree/bindings/clock/at91-clock.txt
@@ -9,10 +9,11 @@ Slow Clock controller:
Required properties:
- compatible : shall be one of the following:
"atmel,at91sam9x5-sckc",
- "atmel,sama5d3-sckc" or
- "atmel,sama5d4-sckc":
+ "atmel,sama5d3-sckc",
+ "atmel,sama5d4-sckc" or
+ "microchip,sam9x60-sckc":
at91 SCKC (Slow Clock Controller)
-- #clock-cells : shall be 0.
+- #clock-cells : shall be 1 for "microchip,sam9x60-sckc" otherwise shall be 0.
- clocks : shall be the input parent clock phandle for the clock.
Optional properties:
diff --git a/Documentation/devicetree/bindings/clock/brcm,bcm63xx-clocks.txt b/Documentation/devicetree/bindings/clock/brcm,bcm63xx-clocks.txt
new file mode 100644
index 000000000000..3041657e2f96
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/brcm,bcm63xx-clocks.txt
@@ -0,0 +1,22 @@
+Gated Clock Controller Bindings for MIPS based BCM63XX SoCs
+
+Required properties:
+- compatible: must be one of:
+ "brcm,bcm3368-clocks"
+ "brcm,bcm6328-clocks"
+ "brcm,bcm6358-clocks"
+ "brcm,bcm6362-clocks"
+ "brcm,bcm6368-clocks"
+ "brcm,bcm63268-clocks"
+
+- reg: Address and length of the register set
+- #clock-cells: must be <1>
+
+
+Example:
+
+clkctl: clock-controller@10000004 {
+ compatible = "brcm,bcm6328-clocks";
+ reg = <0x10000004 0x4>;
+ #clock-cells = <1>;
+};
diff --git a/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt b/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt
index b8d8ef3bdc5f..52a064c789ee 100644
--- a/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt
+++ b/Documentation/devicetree/bindings/clock/cirrus,lochnagar.txt
@@ -40,6 +40,7 @@ Optional properties:
input audio clocks from host system.
- ln-psia1-mclk, ln-psia2-mclk : Optional input audio clocks from
external connector.
+ - ln-spdif-mclk : Optional input audio clock from SPDIF.
- ln-spdif-clkout : Optional input audio clock from SPDIF.
- ln-adat-mclk : Optional input audio clock from ADAT.
- ln-pmic-32k : On board fixed clock.
diff --git a/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt b/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt
index 796c260c183d..d8f5c490f893 100644
--- a/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt
+++ b/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt
@@ -59,6 +59,7 @@ Required properties:
"marvell,dove-core-clock" - for Dove SoC core clocks
"marvell,kirkwood-core-clock" - for Kirkwood SoC (except mv88f6180)
"marvell,mv88f6180-core-clock" - for Kirkwood MV88f6180 SoC
+ "marvell,mv98dx1135-core-clock" - for Kirkwood 98dx1135 SoC
"marvell,mv88f5181-core-clock" - for Orion MV88F5181 SoC
"marvell,mv88f5182-core-clock" - for Orion MV88F5182 SoC
"marvell,mv88f5281-core-clock" - for Orion MV88F5281 SoC
diff --git a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
index 4e5215ef1acd..269afe8a757e 100644
--- a/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
+++ b/Documentation/devicetree/bindings/clock/qcom,gpucc.txt
@@ -2,13 +2,15 @@ Qualcomm Graphics Clock & Reset Controller Binding
--------------------------------------------------
Required properties :
-- compatible : shall contain "qcom,sdm845-gpucc"
+- compatible : shall contain "qcom,sdm845-gpucc" or "qcom,msm8998-gpucc"
- reg : shall contain base register location and length
- #clock-cells : from common clock binding, shall contain 1
- #reset-cells : from common reset binding, shall contain 1
- #power-domain-cells : from generic power domain binding, shall contain 1
- clocks : shall contain the XO clock
+ shall contain the gpll0 out main clock (msm8998)
- clock-names : shall be "xo"
+ shall be "gpll0" (msm8998)
Example:
gpucc: clock-controller@5090000 {
diff --git a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt
index d60b99756bb9..aed713cf0831 100644
--- a/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt
+++ b/Documentation/devicetree/bindings/clock/renesas,r9a06g032-sysctrl.txt
@@ -13,6 +13,7 @@ Required Properties:
- external (optional) RGMII_REFCLK
- clock-names: Must be:
clock-names = "mclk", "rtc", "jtag", "rgmii_ref_ext";
+ - #power-domain-cells: Must be 0
Examples
--------
@@ -27,6 +28,7 @@ Examples
clocks = <&ext_mclk>, <&ext_rtc_clk>,
<&ext_jtag_clk>, <&ext_rgmii_ref>;
clock-names = "mclk", "rtc", "jtag", "rgmii_ref_ext";
+ #power-domain-cells = <0>;
};
- Other nodes can use the clocks provided by SYSCTRL as in:
@@ -38,6 +40,7 @@ Examples
interrupts = <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>;
reg-shift = <2>;
reg-io-width = <4>;
- clocks = <&sysctrl R9A06G032_CLK_UART0>;
- clock-names = "baudclk";
+ clocks = <&sysctrl R9A06G032_CLK_UART0>, <&sysctrl R9A06G032_HCLK_UART0>;
+ clock-names = "baudclk", "apb_pclk";
+ power-domains = <&sysctrl>;
};
diff --git a/Documentation/devicetree/bindings/clock/silabs,si5341.txt b/Documentation/devicetree/bindings/clock/silabs,si5341.txt
new file mode 100644
index 000000000000..a70c333e4cd4
--- /dev/null
+++ b/Documentation/devicetree/bindings/clock/silabs,si5341.txt
@@ -0,0 +1,162 @@
+Binding for Silicon Labs Si5341 and Si5340 programmable i2c clock generator.
+
+Reference
+[1] Si5341 Data Sheet
+ https://www.silabs.com/documents/public/data-sheets/Si5341-40-D-DataSheet.pdf
+[2] Si5341 Reference Manual
+ https://www.silabs.com/documents/public/reference-manuals/Si5341-40-D-RM.pdf
+
+The Si5341 and Si5340 are programmable i2c clock generators with up to 10 output
+clocks. The chip contains a PLL that sources 5 (or 4) multisynth clocks, which
+in turn can be directed to any of the 10 (or 4) outputs through a divider.
+The internal structure of the clock generators can be found in [2].
+
+The driver can be used in "as is" mode, reading the current settings from the
+chip at boot, in case you have a (pre-)programmed device. If the PLL is not
+configured when the driver probes, it assumes the driver must fully initialize
+it.
+
+The device type, speed grade and revision are determined runtime by probing.
+
+The driver currently only supports XTAL input mode, and does not support any
+fancy input configurations. They can still be programmed into the chip and
+the driver will leave them "as is".
+
+==I2C device node==
+
+Required properties:
+- compatible: shall be one of the following:
+ "silabs,si5340" - Si5340 A/B/C/D
+ "silabs,si5341" - Si5341 A/B/C/D
+- reg: i2c device address, usually 0x74
+- #clock-cells: from common clock binding; shall be set to 2.
+ The first value is "0" for outputs, "1" for synthesizers.
+ The second value is the output or synthesizer index.
+- clocks: from common clock binding; list of parent clock handles,
+ corresponding to inputs. Use a fixed clock for the "xtal" input.
+ At least one must be present.
+- clock-names: One of: "xtal", "in0", "in1", "in2"
+- vdd-supply: Regulator node for VDD
+
+Optional properties:
+- vdda-supply: Regulator node for VDDA
+- vdds-supply: Regulator node for VDDS
+- silabs,pll-m-num, silabs,pll-m-den: Numerator and denominator for PLL
+ feedback divider. Must be such that the PLL output is in the valid range. For
+ example, to create 14GHz from a 48MHz xtal, use m-num=14000 and m-den=48. Only
+ the fraction matters, using 3500 and 12 will deliver the exact same result.
+ If these are not specified, and the PLL is not yet programmed when the driver
+ probes, the PLL will be set to 14GHz.
+- silabs,reprogram: When present, the driver will always assume the device must
+ be initialized, and always performs the soft-reset routine. Since this will
+ temporarily stop all output clocks, don't do this if the chip is generating
+ the CPU clock for example.
+- interrupts: Interrupt for INTRb pin.
+- #address-cells: shall be set to 1.
+- #size-cells: shall be set to 0.
+
+
+== Child nodes: Outputs ==
+
+The child nodes list the output clocks.
+
+Each of the clock outputs can be overwritten individually by using a child node.
+If a child node for a clock output is not set, the configuration remains
+unchanged.
+
+Required child node properties:
+- reg: number of clock output.
+
+Optional child node properties:
+- vdd-supply: Regulator node for VDD for this output. The driver selects default
+ values for common-mode and amplitude based on the voltage.
+- silabs,format: Output format, one of:
+ 1 = differential (defaults to LVDS levels)
+ 2 = low-power (defaults to HCSL levels)
+ 4 = LVCMOS
+- silabs,common-mode: Manually override output common mode, see [2] for values
+- silabs,amplitude: Manually override output amplitude, see [2] for values
+- silabs,synth-master: boolean. If present, this output is allowed to change the
+ multisynth frequency dynamically.
+- silabs,silabs,disable-high: boolean. If set, the clock output is driven HIGH
+ when disabled, otherwise it's driven LOW.
+
+==Example==
+
+/* 48MHz reference crystal */
+ref48: ref48M {
+ compatible = "fixed-clock";
+ #clock-cells = <0>;
+ clock-frequency = <48000000>;
+};
+
+i2c-master-node {
+ /* Programmable clock (for logic) */
+ si5341: clock-generator@74 {
+ reg = <0x74>;
+ compatible = "silabs,si5341";
+ #clock-cells = <2>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ clocks = <&ref48>;
+ clock-names = "xtal";
+
+ silabs,pll-m-num = <14000>; /* PLL at 14.0 GHz */
+ silabs,pll-m-den = <48>;
+ silabs,reprogram; /* Chips are not programmed, always reset */
+
+ out@0 {
+ reg = <0>;
+ silabs,format = <1>; /* LVDS 3v3 */
+ silabs,common-mode = <3>;
+ silabs,amplitude = <3>;
+ silabs,synth-master;
+ };
+
+ /*
+ * Output 6 configuration:
+ * LVDS 1v8
+ */
+ out@6 {
+ reg = <6>;
+ silabs,format = <1>; /* LVDS 1v8 */
+ silabs,common-mode = <13>;
+ silabs,amplitude = <3>;
+ };
+
+ /*
+ * Output 8 configuration:
+ * HCSL 3v3
+ */
+ out@8 {
+ reg = <8>;
+ silabs,format = <2>;
+ silabs,common-mode = <11>;
+ silabs,amplitude = <3>;
+ };
+ };
+};
+
+some-video-node {
+ /* Standard clock bindings */
+ clock-names = "pixel";
+ clocks = <&si5341 0 7>; /* Output 7 */
+
+ /* Set output 7 to use syntesizer 3 as its parent */
+ assigned-clocks = <&si5341 0 7>, <&si5341 1 3>;
+ assigned-clock-parents = <&si5341 1 3>;
+ /* Set output 7 to 148.5 MHz using a synth frequency of 594 MHz */
+ assigned-clock-rates = <148500000>, <594000000>;
+};
+
+some-audio-node {
+ clock-names = "i2s-clk";
+ clocks = <&si5341 0 0>;
+ /*
+ * since output 0 is a synth-master, the synth will be automatically set
+ * to an appropriate frequency when the audio driver requests another
+ * frequency. We give control over synth 2 to this output here.
+ */
+ assigned-clocks = <&si5341 0 0>;
+ assigned-clock-parents = <&si5341 1 2>;
+};
diff --git a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
deleted file mode 100644
index e3bd88ae456b..000000000000
--- a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-Allwinner Clock Control Unit Binding
-------------------------------------
-
-Required properties :
-- compatible: must contain one of the following compatibles:
- - "allwinner,sun4i-a10-ccu"
- - "allwinner,sun5i-a10s-ccu"
- - "allwinner,sun5i-a13-ccu"
- - "allwinner,sun6i-a31-ccu"
- - "allwinner,sun7i-a20-ccu"
- - "allwinner,sun8i-a23-ccu"
- - "allwinner,sun8i-a33-ccu"
- - "allwinner,sun8i-a83t-ccu"
- - "allwinner,sun8i-a83t-r-ccu"
- - "allwinner,sun8i-h3-ccu"
- - "allwinner,sun8i-h3-r-ccu"
-+ - "allwinner,sun8i-r40-ccu"
- - "allwinner,sun8i-v3s-ccu"
- - "allwinner,sun9i-a80-ccu"
- - "allwinner,sun50i-a64-ccu"
- - "allwinner,sun50i-a64-r-ccu"
- - "allwinner,sun50i-h5-ccu"
- - "allwinner,sun50i-h6-ccu"
- - "allwinner,sun50i-h6-r-ccu"
- - "allwinner,suniv-f1c100s-ccu"
- - "nextthing,gr8-ccu"
-
-- reg: Must contain the registers base address and length
-- clocks: phandle to the oscillators feeding the CCU. Two are needed:
- - "hosc": the high frequency oscillator (usually at 24MHz)
- - "losc": the low frequency oscillator (usually at 32kHz)
- On the A83T, this is the internal 16MHz oscillator divided by 512
-- clock-names: Must contain the clock names described just above
-- #clock-cells : must contain 1
-- #reset-cells : must contain 1
-
-For the main CCU on H6, one more clock is needed:
-- "iosc": the SoC's internal frequency oscillator
-
-For the PRCM CCUs on A83T/H3/A64/H6, two more clocks are needed:
-- "pll-periph": the SoC's peripheral PLL from the main CCU
-- "iosc": the SoC's internal frequency oscillator
-
-Example for generic CCU:
-ccu: clock@1c20000 {
- compatible = "allwinner,sun8i-h3-ccu";
- reg = <0x01c20000 0x400>;
- clocks = <&osc24M>, <&osc32k>;
- clock-names = "hosc", "losc";
- #clock-cells = <1>;
- #reset-cells = <1>;
-};
-
-Example for PRCM CCU:
-r_ccu: clock@1f01400 {
- compatible = "allwinner,sun50i-a64-r-ccu";
- reg = <0x01f01400 0x100>;
- clocks = <&osc24M>, <&osc32k>, <&iosc>, <&ccu CLK_PLL_PERIPH0>;
- clock-names = "hosc", "losc", "iosc", "pll-periph";
- #clock-cells = <1>;
- #reset-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/common-properties.txt b/Documentation/devicetree/bindings/common-properties.txt
index a3448bfa1c82..98a28130e100 100644
--- a/Documentation/devicetree/bindings/common-properties.txt
+++ b/Documentation/devicetree/bindings/common-properties.txt
@@ -5,30 +5,29 @@ Endianness
----------
The Devicetree Specification does not define any properties related to hardware
-byteswapping, but endianness issues show up frequently in porting Linux to
+byte swapping, but endianness issues show up frequently in porting drivers to
different machine types. This document attempts to provide a consistent
-way of handling byteswapping across drivers.
+way of handling byte swapping across drivers.
Optional properties:
- big-endian: Boolean; force big endian register accesses
unconditionally (e.g. ioread32be/iowrite32be). Use this if you
- know the peripheral always needs to be accessed in BE mode.
+ know the peripheral always needs to be accessed in big endian (BE) mode.
- little-endian: Boolean; force little endian register accesses
unconditionally (e.g. readl/writel). Use this if you know the
- peripheral always needs to be accessed in LE mode.
+ peripheral always needs to be accessed in little endian (LE) mode.
- native-endian: Boolean; always use register accesses matched to the
endianness of the kernel binary (e.g. LE vmlinux -> readl/writel,
- BE vmlinux -> ioread32be/iowrite32be). In this case no byteswaps
+ BE vmlinux -> ioread32be/iowrite32be). In this case no byte swaps
will ever be performed. Use this if the hardware "self-adjusts"
register endianness based on the CPU's configured endianness.
If a binding supports these properties, then the binding should also
specify the default behavior if none of these properties are present.
In such cases, little-endian is the preferred default, but it is not
-a requirement. The of_device_is_big_endian() and of_fdt_is_big_endian()
-helper functions do assume that little-endian is the default, because
-most existing (PCI-based) drivers implicitly default to LE by using
-readl/writel for MMIO accesses.
+a requirement. Some implementations assume that little-endian is
+the default, because most existing (PCI-based) drivers implicitly
+default to LE for their MMIO accesses.
Examples:
Scenario 1 : CPU in LE mode & device in LE mode.
diff --git a/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt b/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt
new file mode 100644
index 000000000000..87bff5add3f9
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/imx-cpufreq-dt.txt
@@ -0,0 +1,37 @@
+i.MX CPUFreq-DT OPP bindings
+================================
+
+Certain i.MX SoCs support different OPPs depending on the "market segment" and
+"speed grading" value which are written in fuses. These bits are combined with
+the opp-supported-hw values for each OPP to check if the OPP is allowed.
+
+Required properties:
+--------------------
+
+For each opp entry in 'operating-points-v2' table:
+- opp-supported-hw: Two bitmaps indicating:
+ - Supported speed grade mask
+ - Supported market segment mask
+ 0: Consumer
+ 1: Extended Consumer
+ 2: Industrial
+ 3: Automotive
+
+Example:
+--------
+
+opp_table {
+ compatible = "operating-points-v2";
+ opp-1000000000 {
+ opp-hz = /bits/ 64 <1000000000>;
+ /* grade >= 0, consumer only */
+ opp-supported-hw = <0xf>, <0x3>;
+ };
+
+ opp-1300000000 {
+ opp-hz = /bits/ 64 <1300000000>;
+ opp-microvolt = <1000000>;
+ /* grade >= 1, all segments */
+ opp-supported-hw = <0xe>, <0x7>;
+ };
+}
diff --git a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt b/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
index 6b458bb2440d..f2aab3dc2b52 100644
--- a/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
+++ b/Documentation/devicetree/bindings/crypto/atmel-crypto.txt
@@ -66,16 +66,3 @@ sha@f8034000 {
dmas = <&dma1 2 17>;
dma-names = "tx";
};
-
-* Eliptic Curve Cryptography (I2C)
-
-Required properties:
-- compatible : must be "atmel,atecc508a".
-- reg: I2C bus address of the device.
-- clock-frequency: must be present in the i2c controller node.
-
-Example:
-atecc508a@c0 {
- compatible = "atmel,atecc508a";
- reg = <0xC0>;
-};
diff --git a/Documentation/devicetree/bindings/csky/pmu.txt b/Documentation/devicetree/bindings/csky/pmu.txt
new file mode 100644
index 000000000000..728d05ca6a1c
--- /dev/null
+++ b/Documentation/devicetree/bindings/csky/pmu.txt
@@ -0,0 +1,38 @@
+===============================
+C-SKY Performance Monitor Units
+===============================
+
+C-SKY Performance Monitor is designed for ck807/ck810/ck860 SMP soc and
+it could count cpu's events for helping analysis performance issues.
+
+============================
+PMU node bindings definition
+============================
+
+ Description: Describes PMU
+
+ PROPERTIES
+
+ - compatible
+ Usage: required
+ Value type: <string>
+ Definition: must be "csky,csky-pmu"
+ - interrupts
+ Usage: required
+ Value type: <u32 IRQ_TYPE_XXX>
+ Definition: must be pmu irq num defined by soc
+ - count-width
+ Usage: optional
+ Value type: <u32>
+ Definition: the width of pmu counter
+
+Examples:
+---------
+#include <dt-bindings/interrupt-controller/irq.h>
+
+ pmu: performace-monitor {
+ compatible = "csky,csky-pmu";
+ interrupts = <23 IRQ_TYPE_EDGE_RISING>;
+ interrupt-parent = <&intc>;
+ count-width = <48>;
+ };
diff --git a/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml
new file mode 100644
index 000000000000..47950fced28d
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/allwinner,sun6i-a31-mipi-dsi.yaml
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/allwinner,sun6i-a31-mipi-dsi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 MIPI-DSI Controller Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#address-cells": true
+ "#size-cells": true
+
+ compatible:
+ const: allwinner,sun6i-a31-mipi-dsi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: mod
+
+ resets:
+ maxItems: 1
+
+ phys:
+ maxItems: 1
+
+ phy-names:
+ const: dphy
+
+ port:
+ type: object
+ description:
+ A port node with endpoint definitions as defined in
+ Documentation/devicetree/bindings/media/video-interfaces.txt. That
+ port should be the input endpoint, usually coming from the
+ associated TCON.
+
+patternProperties:
+ "^panel@[0-9]+$": true
+
+required:
+ - "#address-cells"
+ - "#size-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - phys
+ - phy-names
+ - resets
+ - port
+
+additionalProperties: false
+
+examples:
+ - |
+ dsi0: dsi@1ca0000 {
+ compatible = "allwinner,sun6i-a31-mipi-dsi";
+ reg = <0x01ca0000 0x1000>;
+ interrupts = <0 89 4>;
+ clocks = <&ccu 23>, <&ccu 96>;
+ clock-names = "bus", "mod";
+ resets = <&ccu 4>;
+ phys = <&dphy0>;
+ phy-names = "dphy";
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ panel@0 {
+ compatible = "bananapi,lhr050h41", "ilitek,ili9881c";
+ reg = <0>;
+ power-gpios = <&pio 1 7 0>; /* PB07 */
+ reset-gpios = <&r_pio 0 5 1>; /* PL05 */
+ backlight = <&pwm_bl>;
+ };
+
+ port {
+ dsi0_in_tcon0: endpoint {
+ remote-endpoint = <&tcon0_out_dsi0>;
+ };
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/display/arm,komeda.txt b/Documentation/devicetree/bindings/display/arm,komeda.txt
index 02b226532ebd..8513695ee47f 100644
--- a/Documentation/devicetree/bindings/display/arm,komeda.txt
+++ b/Documentation/devicetree/bindings/display/arm,komeda.txt
@@ -7,10 +7,13 @@ Required properties:
- clocks: A list of phandle + clock-specifier pairs, one for each entry
in 'clock-names'
- clock-names: A list of clock names. It should contain:
- - "mclk": for the main processor clock
- - "pclk": for the APB interface clock
+ - "aclk": for the main processor clock
- #address-cells: Must be 1
- #size-cells: Must be 0
+- iommus: configure the stream id to IOMMU, Must be configured if want to
+ enable iommu in display. for how to configure this node please reference
+ devicetree/bindings/iommu/arm,smmu-v3.txt,
+ devicetree/bindings/iommu/iommu.txt
Required properties for sub-node: pipeline@nq
Each device contains one or two pipeline sub-nodes (at least one), each
@@ -20,7 +23,6 @@ pipeline node should provide properties:
in 'clock-names'
- clock-names: should contain:
- "pxclk": pixel clock
- - "aclk": AXI interface clock
- port: each pipeline connect to an encoder input port. The connection is
modeled using the OF graph bindings specified in
@@ -42,12 +44,15 @@ Example:
compatible = "arm,mali-d71";
reg = <0xc00000 0x20000>;
interrupts = <0 168 4>;
- clocks = <&dpu_mclk>, <&dpu_aclk>;
- clock-names = "mclk", "pclk";
+ clocks = <&dpu_aclk>;
+ clock-names = "aclk";
+ iommus = <&smmu 0>, <&smmu 1>, <&smmu 2>, <&smmu 3>,
+ <&smmu 4>, <&smmu 5>, <&smmu 6>, <&smmu 7>,
+ <&smmu 8>, <&smmu 9>;
dp0_pipe0: pipeline@0 {
- clocks = <&fpgaosc2>, <&dpu_aclk>;
- clock-names = "pxclk", "aclk";
+ clocks = <&fpgaosc2>;
+ clock-names = "pxclk";
reg = <0>;
port {
@@ -58,8 +63,8 @@ Example:
};
dp0_pipe1: pipeline@1 {
- clocks = <&fpgaosc2>, <&dpu_aclk>;
- clock-names = "pxclk", "aclk";
+ clocks = <&fpgaosc2>;
+ clock-names = "pxclk";
reg = <1>;
port {
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt b/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt
index a41d280c3f9f..db680413e89c 100644
--- a/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,dw-hdmi.txt
@@ -12,10 +12,12 @@ following device-specific properties.
Required properties:
- compatible : Shall contain one or more of
+ - "renesas,r8a774a1-hdmi" for R8A774A1 (RZ/G2M) compatible HDMI TX
- "renesas,r8a7795-hdmi" for R8A7795 (R-Car H3) compatible HDMI TX
- "renesas,r8a7796-hdmi" for R8A7796 (R-Car M3-W) compatible HDMI TX
- "renesas,r8a77965-hdmi" for R8A77965 (R-Car M3-N) compatible HDMI TX
- - "renesas,rcar-gen3-hdmi" for the generic R-Car Gen3 compatible HDMI TX
+ - "renesas,rcar-gen3-hdmi" for the generic R-Car Gen3 and RZ/G2 compatible
+ HDMI TX
When compatible with generic versions, nodes must list the SoC-specific
version corresponding to the platform first, followed by the
diff --git a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt
index 900a884ad9f5..c6a196d0b075 100644
--- a/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt
+++ b/Documentation/devicetree/bindings/display/bridge/renesas,lvds.txt
@@ -9,6 +9,7 @@ Required properties:
- compatible : Shall contain one of
- "renesas,r8a7743-lvds" for R8A7743 (RZ/G1M) compatible LVDS encoders
- "renesas,r8a7744-lvds" for R8A7744 (RZ/G1N) compatible LVDS encoders
+ - "renesas,r8a774a1-lvds" for R8A774A1 (RZ/G2M) compatible LVDS encoders
- "renesas,r8a774c0-lvds" for R8A774C0 (RZ/G2E) compatible LVDS encoders
- "renesas,r8a7790-lvds" for R8A7790 (R-Car H2) compatible LVDS encoders
- "renesas,r8a7791-lvds" for R8A7791 (R-Car M2-W) compatible LVDS encoders
@@ -45,14 +46,24 @@ OF graph bindings specified in Documentation/devicetree/bindings/graph.txt.
Each port shall have a single endpoint.
+Optional properties:
+
+- renesas,companion : phandle to the companion LVDS encoder. This property is
+ mandatory for the first LVDS encoder on D3 and E3 SoCs, and shall point to
+ the second encoder to be used as a companion in dual-link mode. It shall not
+ be set for any other LVDS encoder.
+
Example:
lvds0: lvds@feb90000 {
- compatible = "renesas,r8a7790-lvds";
- reg = <0 0xfeb90000 0 0x1c>;
- clocks = <&cpg CPG_MOD 726>;
- resets = <&cpg 726>;
+ compatible = "renesas,r8a77990-lvds";
+ reg = <0 0xfeb90000 0 0x20>;
+ clocks = <&cpg CPG_MOD 727>;
+ power-domains = <&sysc R8A77990_PD_ALWAYS_ON>;
+ resets = <&cpg 727>;
+
+ renesas,companion = <&lvds1>;
ports {
#address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/display/bridge/sii902x.txt b/Documentation/devicetree/bindings/display/bridge/sii902x.txt
index 72d2dc6c3e6b..2df44b7d3821 100644
--- a/Documentation/devicetree/bindings/display/bridge/sii902x.txt
+++ b/Documentation/devicetree/bindings/display/bridge/sii902x.txt
@@ -5,10 +5,44 @@ Required properties:
- reg: i2c address of the bridge
Optional properties:
- - interrupts: describe the interrupt line used to inform the host
+ - interrupts: describe the interrupt line used to inform the host
about hotplug events.
- reset-gpios: OF device-tree gpio specification for RST_N pin.
+ HDMI audio properties:
+ - #sound-dai-cells: <0> or <1>. <0> if only i2s or spdif pin
+ is wired, <1> if the both are wired. HDMI audio is
+ configured only if this property is found.
+ - sil,i2s-data-lanes: Array of up to 4 integers with values of 0-3
+ Each integer indicates which i2s pin is connected to which
+ audio fifo. The first integer selects i2s audio pin for the
+ first audio fifo#0 (HDMI channels 1&2), second for fifo#1
+ (HDMI channels 3&4), and so on. There is 4 fifos and 4 i2s
+ pins (SD0 - SD3). Any i2s pin can be connected to any fifo,
+ but there can be no gaps. E.g. an i2s pin must be mapped to
+ fifo#0 and fifo#1 before mapping a channel to fifo#2. Default
+ value is <0>, describing SD0 pin beiging routed to hdmi audio
+ fifo #0.
+ - clocks: phandle and clock specifier for each clock listed in
+ the clock-names property
+ - clock-names: "mclk"
+ Describes SII902x MCLK input. MCLK is used to produce
+ HDMI audio CTS values. This property is required if
+ "#sound-dai-cells"-property is present. This property follows
+ Documentation/devicetree/bindings/clock/clock-bindings.txt
+ consumer binding.
+
+ If HDMI audio is configured the sii902x device becomes an I2S
+ and/or spdif audio codec component (e.g a digital audio sink),
+ that can be used in configuring a full audio devices with
+ simple-card or audio-graph-card binding. See their binding
+ documents on how to describe the way the sii902x device is
+ connected to the rest of the audio system:
+ Documentation/devicetree/bindings/sound/simple-card.txt
+ Documentation/devicetree/bindings/sound/audio-graph-card.txt
+ Note: In case of the audio-graph-card binding the used port
+ index should be 3.
+
Optional subnodes:
- video input: this subnode can contain a video input port node
to connect the bridge to a display controller output (See this
@@ -21,6 +55,12 @@ Example:
compatible = "sil,sii9022";
reg = <0x39>;
reset-gpios = <&pioA 1 0>;
+
+ #sound-dai-cells = <0>;
+ sil,i2s-data-lanes = < 0 1 2 >;
+ clocks = <&mclk>;
+ clock-names = "mclk";
+
ports {
#address-cells = <1>;
#size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt b/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt
index 37f0c04d5a28..d17d1e5820d7 100644
--- a/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt
+++ b/Documentation/devicetree/bindings/display/bridge/thine,thc63lvd1024.txt
@@ -28,6 +28,12 @@ Optional video port nodes:
- port@1: Second LVDS input port
- port@3: Second digital CMOS/TTL parallel output
+The device can operate in single-link mode or dual-link mode. In single-link
+mode, all pixels are received on port@0, and port@1 shall not contain any
+endpoint. In dual-link mode, even-numbered pixels are received on port@0 and
+odd-numbered pixels on port@1, and both port@0 and port@1 shall contain
+endpoints.
+
Example:
--------
diff --git a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
index e3f6aa6a214d..583c5e9dbe6b 100644
--- a/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
+++ b/Documentation/devicetree/bindings/display/bridge/toshiba,tc358767.txt
@@ -12,6 +12,7 @@ Optional properties:
(active high shutdown input)
- reset-gpios: OF device-tree gpio specification for RSTX pin
(active low system reset)
+ - toshiba,hpd-pin: TC358767 GPIO pin number to which HPD is connected to (0 or 1)
- ports: the ports node can contain video interface port nodes to connect
to a DPI/DSI source and to an eDP/DP sink according to [1][2]:
- port@0: DSI input port
diff --git a/Documentation/devicetree/bindings/display/ingenic,lcd.txt b/Documentation/devicetree/bindings/display/ingenic,lcd.txt
new file mode 100644
index 000000000000..7b536c8c6dde
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/ingenic,lcd.txt
@@ -0,0 +1,44 @@
+Ingenic JZ47xx LCD driver
+
+Required properties:
+- compatible: one of:
+ * ingenic,jz4740-lcd
+ * ingenic,jz4725b-lcd
+- reg: LCD registers location and length
+- clocks: LCD pixclock and device clock specifiers.
+ The device clock is only required on the JZ4740.
+- clock-names: "lcd_pclk" and "lcd"
+- interrupts: Specifies the interrupt line the LCD controller is connected to.
+
+Example:
+
+panel {
+ compatible = "sharp,ls020b1dd01d";
+
+ backlight = <&backlight>;
+ power-supply = <&vcc>;
+
+ port {
+ panel_input: endpoint {
+ remote-endpoint = <&panel_output>;
+ };
+ };
+};
+
+
+lcd: lcd-controller@13050000 {
+ compatible = "ingenic,jz4725b-lcd";
+ reg = <0x13050000 0x1000>;
+
+ interrupt-parent = <&intc>;
+ interrupts = <31>;
+
+ clocks = <&cgu JZ4725B_CLK_LCD>;
+ clock-names = "lcd";
+
+ port {
+ panel_output: endpoint {
+ remote-endpoint = <&panel_input>;
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/display/msm/dpu.txt b/Documentation/devicetree/bindings/display/msm/dpu.txt
index ad2e8830324e..a61dd40f3792 100644
--- a/Documentation/devicetree/bindings/display/msm/dpu.txt
+++ b/Documentation/devicetree/bindings/display/msm/dpu.txt
@@ -28,6 +28,11 @@ Required properties:
- #address-cells: number of address cells for the MDSS children. Should be 1.
- #size-cells: Should be 1.
- ranges: parent bus address space is the same as the child bus address space.
+- interconnects : interconnect path specifier for MDSS according to
+ Documentation/devicetree/bindings/interconnect/interconnect.txt. Should be
+ 2 paths corresponding to 2 AXI ports.
+- interconnect-names : MDSS will have 2 port names to differentiate between the
+ 2 interconnect paths defined with interconnect specifier.
Optional properties:
- assigned-clocks: list of clock specifiers for clocks needing rate assignment
@@ -86,6 +91,11 @@ Example:
interrupt-controller;
#interrupt-cells = <1>;
+ interconnects = <&rsc_hlos MASTER_MDP0 &rsc_hlos SLAVE_EBI1>,
+ <&rsc_hlos MASTER_MDP1 &rsc_hlos SLAVE_EBI1>;
+
+ interconnect-names = "mdp0-mem", "mdp1-mem";
+
iommus = <&apps_iommu 0>;
#address-cells = <2>;
diff --git a/Documentation/devicetree/bindings/display/msm/dsi.txt b/Documentation/devicetree/bindings/display/msm/dsi.txt
index 9ae946942720..af95586c898f 100644
--- a/Documentation/devicetree/bindings/display/msm/dsi.txt
+++ b/Documentation/devicetree/bindings/display/msm/dsi.txt
@@ -88,6 +88,7 @@ Required properties:
* "qcom,dsi-phy-28nm-8960"
* "qcom,dsi-phy-14nm"
* "qcom,dsi-phy-10nm"
+ * "qcom,dsi-phy-10nm-8998"
- reg: Physical base address and length of the registers of PLL, PHY. Some
revisions require the PHY regulator base address, whereas others require the
PHY lane base address. See below for each PHY revision.
diff --git a/Documentation/devicetree/bindings/display/panel/armadeus,st0700-adapt.txt b/Documentation/devicetree/bindings/display/panel/armadeus,st0700-adapt.txt
new file mode 100644
index 000000000000..a30d63db3c8f
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/armadeus,st0700-adapt.txt
@@ -0,0 +1,9 @@
+Armadeus ST0700 Adapt. A Santek ST0700I5Y-RBSLW 7.0" WVGA (800x480) TFT with
+an adapter board.
+
+Required properties:
+- compatible: "armadeus,st0700-adapt"
+- power-supply: see panel-common.txt
+
+Optional properties:
+- backlight: see panel-common.txt
diff --git a/Documentation/devicetree/bindings/display/panel/edt,et-series.txt b/Documentation/devicetree/bindings/display/panel/edt,et-series.txt
index f56b99ebd9be..be8684327ee4 100644
--- a/Documentation/devicetree/bindings/display/panel/edt,et-series.txt
+++ b/Documentation/devicetree/bindings/display/panel/edt,et-series.txt
@@ -6,6 +6,22 @@ Display bindings for EDT Display Technology Corp. Displays which are
compatible with the simple-panel binding, which is specified in
simple-panel.txt
+3,5" QVGA TFT Panels
+--------------------
++-----------------+---------------------+-------------------------------------+
+| Identifier | compatbile | description |
++=================+=====================+=====================================+
+| ET035012DM6 | edt,et035012dm6 | 3.5" QVGA TFT LCD panel |
++-----------------+---------------------+-------------------------------------+
+
+4,3" WVGA TFT Panels
+--------------------
+
++-----------------+---------------------+-------------------------------------+
+| Identifier | compatbile | description |
++=================+=====================+=====================================+
+| ETM0430G0DH6 | edt,etm0430g0dh6 | 480x272 TFT Display |
++-----------------+---------------------+-------------------------------------+
5,7" WVGA TFT Panels
--------------------
diff --git a/Documentation/devicetree/bindings/display/panel/evervision,vgg804821.txt b/Documentation/devicetree/bindings/display/panel/evervision,vgg804821.txt
new file mode 100644
index 000000000000..82d22e191ac3
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/evervision,vgg804821.txt
@@ -0,0 +1,12 @@
+Evervision Electronics Co. Ltd. VGG804821 5.0" WVGA TFT LCD Panel
+
+Required properties:
+- compatible: should be "evervision,vgg804821"
+- power-supply: See simple-panel.txt
+
+Optional properties:
+- backlight: See simple-panel.txt
+- enable-gpios: See simple-panel.txt
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/friendlyarm,hd702e.txt b/Documentation/devicetree/bindings/display/panel/friendlyarm,hd702e.txt
new file mode 100644
index 000000000000..6c9156fc3478
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/friendlyarm,hd702e.txt
@@ -0,0 +1,32 @@
+FriendlyELEC HD702E 800x1280 LCD panel
+
+HD702E lcd is FriendlyELEC developed eDP LCD panel with 800x1280
+resolution. It has built in Goodix, GT9271 captive touchscreen
+with backlight adjustable via PWM.
+
+Required properties:
+- compatible: should be "friendlyarm,hd702e"
+- power-supply: regulator to provide the supply voltage
+
+Optional properties:
+- backlight: phandle of the backlight device attached to the panel
+
+Optional nodes:
+- Video port for LCD panel input.
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
+
+Example:
+
+ panel {
+ compatible ="friendlyarm,hd702e", "simple-panel";
+ backlight = <&backlight>;
+ power-supply = <&vcc3v3_sys>;
+
+ port {
+ panel_in_edp: endpoint {
+ remote-endpoint = <&edp_out_panel>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/koe,tx14d24vm1bpa.txt b/Documentation/devicetree/bindings/display/panel/koe,tx14d24vm1bpa.txt
new file mode 100644
index 000000000000..be7ac666807b
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/koe,tx14d24vm1bpa.txt
@@ -0,0 +1,42 @@
+Kaohsiung Opto-Electronics Inc. 5.7" QVGA (320 x 240) TFT LCD panel
+
+Required properties:
+- compatible: should be "koe,tx14d24vm1bpa"
+- backlight: phandle of the backlight device attached to the panel
+- power-supply: single regulator to provide the supply voltage
+
+Required nodes:
+- port: Parallel port mapping to connect this display
+
+This panel needs single power supply voltage. Its backlight is conntrolled
+via PWM signal.
+
+Example:
+--------
+
+Example device-tree definition when connected to iMX53 based board
+
+ lcd_panel: lcd-panel {
+ compatible = "koe,tx14d24vm1bpa";
+ backlight = <&backlight_lcd>;
+ power-supply = <&reg_3v3>;
+
+ port {
+ lcd_panel_in: endpoint {
+ remote-endpoint = <&lcd_display_out>;
+ };
+ };
+ };
+
+Then one needs to extend the dispX node:
+
+ lcd_display: disp1 {
+
+ port@1 {
+ reg = <1>;
+
+ lcd_display_out: endpoint {
+ remote-endpoint = <&lcd_panel_in>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2045-53ts.txt b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2045-53ts.txt
new file mode 100644
index 000000000000..85c0b2cacfda
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2045-53ts.txt
@@ -0,0 +1,11 @@
+One Stop Displays OSD101T2045-53TS 10.1" 1920x1200 panel
+
+Required properties:
+- compatible: should be "osddisplays,osd101t2045-53ts"
+- power-supply: as specified in the base binding
+
+Optional properties:
+- backlight: as specified in the base binding
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2587-53ts.txt b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2587-53ts.txt
new file mode 100644
index 000000000000..9d88e96003fc
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/osddisplays,osd101t2587-53ts.txt
@@ -0,0 +1,14 @@
+One Stop Displays OSD101T2587-53TS 10.1" 1920x1200 panel
+
+The panel is similar to OSD101T2045-53TS, but it needs additional
+MIPI_DSI_TURN_ON_PERIPHERAL message from the host.
+
+Required properties:
+- compatible: should be "osddisplays,osd101t2587-53ts"
+- power-supply: as specified in the base binding
+
+Optional properties:
+- backlight: as specified in the base binding
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.txt b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.txt
new file mode 100644
index 000000000000..9fb9ebeef8e4
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/samsung,s6e63m0.txt
@@ -0,0 +1,33 @@
+Samsung s6e63m0 AMOLED LCD panel
+
+Required properties:
+ - compatible: "samsung,s6e63m0"
+ - reset-gpios: GPIO spec for reset pin
+ - vdd3-supply: VDD regulator
+ - vci-supply: VCI regulator
+
+The panel must obey rules for SPI slave device specified in document [1].
+
+The device node can contain one 'port' child node with one child
+'endpoint' node, according to the bindings defined in [2]. This
+node should describe panel's video bus.
+
+[1]: Documentation/devicetree/bindings/spi/spi-bus.txt
+[2]: Documentation/devicetree/bindings/media/video-interfaces.txt
+
+Example:
+
+ s6e63m0: display@0 {
+ compatible = "samsung,s6e63m0";
+ reg = <0>;
+ reset-gpio = <&mp05 5 1>;
+ vdd3-supply = <&ldo12_reg>;
+ vci-supply = <&ldo11_reg>;
+ spi-max-frequency = <1200000>;
+
+ port {
+ lcd_ep: endpoint {
+ remote-endpoint = <&fimd_ep>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/display/panel/tfc,s9700rtwv43tr-01b.txt b/Documentation/devicetree/bindings/display/panel/tfc,s9700rtwv43tr-01b.txt
new file mode 100644
index 000000000000..dfb572f085eb
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/tfc,s9700rtwv43tr-01b.txt
@@ -0,0 +1,15 @@
+TFC S9700RTWV43TR-01B 7" Three Five Corp 800x480 LCD panel with
+resistive touch
+
+The panel is found on TI AM335x-evm.
+
+Required properties:
+- compatible: should be "tfc,s9700rtwv43tr-01b"
+- power-supply: See panel-common.txt
+
+Optional properties:
+- enable-gpios: GPIO pin to enable or disable the panel, if there is one
+- backlight: phandle of the backlight device attached to the panel
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/panel/vl050_8048nt_c01.txt b/Documentation/devicetree/bindings/display/panel/vl050_8048nt_c01.txt
new file mode 100644
index 000000000000..b42bf06bbd99
--- /dev/null
+++ b/Documentation/devicetree/bindings/display/panel/vl050_8048nt_c01.txt
@@ -0,0 +1,12 @@
+VXT 800x480 color TFT LCD panel
+
+Required properties:
+- compatible: should be "vxt,vl050-8048nt-c01"
+- power-supply: as specified in the base binding
+
+Optional properties:
+- backlight: as specified in the base binding
+- enable-gpios: as specified in the base binding
+
+This binding is compatible with the simple-panel binding, which is specified
+in simple-panel.txt in this directory.
diff --git a/Documentation/devicetree/bindings/display/renesas,du.txt b/Documentation/devicetree/bindings/display/renesas,du.txt
index aedb22b4d161..c97dfacad281 100644
--- a/Documentation/devicetree/bindings/display/renesas,du.txt
+++ b/Documentation/devicetree/bindings/display/renesas,du.txt
@@ -7,6 +7,7 @@ Required Properties:
- "renesas,du-r8a7744" for R8A7744 (RZ/G1N) compatible DU
- "renesas,du-r8a7745" for R8A7745 (RZ/G1E) compatible DU
- "renesas,du-r8a77470" for R8A77470 (RZ/G1C) compatible DU
+ - "renesas,du-r8a774a1" for R8A774A1 (RZ/G2M) compatible DU
- "renesas,du-r8a774c0" for R8A774C0 (RZ/G2E) compatible DU
- "renesas,du-r8a7779" for R8A7779 (R-Car H1) compatible DU
- "renesas,du-r8a7790" for R8A7790 (R-Car H2) compatible DU
@@ -58,6 +59,7 @@ corresponding to each DU output.
R8A7744 (RZ/G1N) DPAD 0 LVDS 0 - -
R8A7745 (RZ/G1E) DPAD 0 DPAD 1 - -
R8A77470 (RZ/G1C) DPAD 0 DPAD 1 LVDS 0 -
+ R8A774A1 (RZ/G2M) DPAD 0 HDMI 0 LVDS 0 -
R8A774C0 (RZ/G2E) DPAD 0 LVDS 0 LVDS 1 -
R8A7779 (R-Car H1) DPAD 0 DPAD 1 - -
R8A7790 (R-Car H2) DPAD 0 LVDS 0 LVDS 1 -
diff --git a/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt b/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
index 39143424a474..3d32ce137e7f 100644
--- a/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
+++ b/Documentation/devicetree/bindings/display/rockchip/dw_hdmi-rockchip.txt
@@ -12,6 +12,7 @@ following device-specific properties.
Required properties:
- compatible: should be one of the following:
+ "rockchip,rk3228-dw-hdmi"
"rockchip,rk3288-dw-hdmi"
"rockchip,rk3328-dw-hdmi"
"rockchip,rk3399-dw-hdmi"
@@ -38,6 +39,13 @@ Optional properties
- phys: from general PHY binding: the phandle for the PHY device.
- phy-names: Should be "hdmi" if phys references an external phy.
+Optional pinctrl entry:
+- If you have both a "unwedge" and "default" pinctrl entry, dw_hdmi
+ will switch to the unwedge pinctrl state for 10ms if it ever gets an
+ i2c timeout. It's intended that this unwedge pinctrl entry will
+ cause the SDA line to be driven low to work around a hardware
+ errata.
+
Example:
hdmi: hdmi@ff980000 {
diff --git a/Documentation/devicetree/bindings/display/simple-framebuffer.yaml b/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
index b052d76cf8b6..678776b6012a 100644
--- a/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
+++ b/Documentation/devicetree/bindings/display/simple-framebuffer.yaml
@@ -126,6 +126,28 @@ required:
# but usually they will be filled by the bootloader.
- compatible
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,simple-framebuffer
+
+ then:
+ required:
+ - allwinner,pipeline
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: amlogic,simple-framebuffer
+
+ then:
+ required:
+ - amlogic,pipeline
+
+
additionalProperties: false
examples:
@@ -139,7 +161,8 @@ examples:
#size-cells = <1>;
stdout-path = "display0";
framebuffer0: framebuffer@1d385000 {
- compatible = "simple-framebuffer";
+ compatible = "allwinner,simple-framebuffer", "simple-framebuffer";
+ allwinner,pipeline = "de_be0-lcd0";
reg = <0x1d385000 3840000>;
width = <1600>;
height = <1200>;
diff --git a/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt b/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
index 3eb1b48b47dd..60c54da4e526 100644
--- a/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
+++ b/Documentation/devicetree/bindings/display/st,stm32-ltdc.txt
@@ -40,6 +40,8 @@ Mandatory nodes specific to STM32 DSI:
- panel or bridge node: A node containing the panel or bridge description as
documented in [6].
- port: panel or bridge port node, connected to the DSI output port (port@1).
+Optional properties:
+- phy-dsi-supply: phandle of the regulator that provides the supply voltage.
Note: You can find more documentation in the following references
[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
@@ -101,6 +103,7 @@ Example 2: DSI panel
clock-names = "pclk", "ref";
resets = <&rcc STM32F4_APB2_RESET(DSI)>;
reset-names = "apb";
+ phy-dsi-supply = <&reg18>;
ports {
#address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/display/sunxi/sun6i-dsi.txt b/Documentation/devicetree/bindings/display/sunxi/sun6i-dsi.txt
deleted file mode 100644
index 6a6cf5de08b0..000000000000
--- a/Documentation/devicetree/bindings/display/sunxi/sun6i-dsi.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-Allwinner A31 DSI Encoder
-=========================
-
-The DSI pipeline consists of two separate blocks: the DSI controller
-itself, and its associated D-PHY.
-
-DSI Encoder
------------
-
-The DSI Encoder generates the DSI signal from the TCON's.
-
-Required properties:
- - compatible: value must be one of:
- * allwinner,sun6i-a31-mipi-dsi
- - reg: base address and size of memory-mapped region
- - interrupts: interrupt associated to this IP
- - clocks: phandles to the clocks feeding the DSI encoder
- * bus: the DSI interface clock
- * mod: the DSI module clock
- - clock-names: the clock names mentioned above
- - phys: phandle to the D-PHY
- - phy-names: must be "dphy"
- - resets: phandle to the reset controller driving the encoder
-
- - ports: A ports node with endpoint definitions as defined in
- Documentation/devicetree/bindings/media/video-interfaces.txt. The
- first port should be the input endpoint, usually coming from the
- associated TCON.
-
-Any MIPI-DSI device attached to this should be described according to
-the bindings defined in ../mipi-dsi-bus.txt
-
-D-PHY
------
-
-Required properties:
- - compatible: value must be one of:
- * allwinner,sun6i-a31-mipi-dphy
- - reg: base address and size of memory-mapped region
- - clocks: phandles to the clocks feeding the DSI encoder
- * bus: the DSI interface clock
- * mod: the DSI module clock
- - clock-names: the clock names mentioned above
- - resets: phandle to the reset controller driving the encoder
-
-Example:
-
-dsi0: dsi@1ca0000 {
- compatible = "allwinner,sun6i-a31-mipi-dsi";
- reg = <0x01ca0000 0x1000>;
- interrupts = <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&ccu CLK_BUS_MIPI_DSI>,
- <&ccu CLK_DSI_SCLK>;
- clock-names = "bus", "mod";
- resets = <&ccu RST_BUS_MIPI_DSI>;
- phys = <&dphy0>;
- phy-names = "dphy";
- #address-cells = <1>;
- #size-cells = <0>;
-
- panel@0 {
- compatible = "bananapi,lhr050h41", "ilitek,ili9881c";
- reg = <0>;
- power-gpios = <&pio 1 7 GPIO_ACTIVE_HIGH>; /* PB07 */
- reset-gpios = <&r_pio 0 5 GPIO_ACTIVE_LOW>; /* PL05 */
- backlight = <&pwm_bl>;
- };
-
- ports {
- #address-cells = <1>;
- #size-cells = <0>;
-
- port@0 {
- #address-cells = <1>;
- #size-cells = <0>;
- reg = <0>;
-
- dsi0_in_tcon0: endpoint {
- remote-endpoint = <&tcon0_out_dsi0>;
- };
- };
- };
-};
-
-dphy0: d-phy@1ca1000 {
- compatible = "allwinner,sun6i-a31-mipi-dphy";
- reg = <0x01ca1000 0x1000>;
- clocks = <&ccu CLK_BUS_MIPI_DSI>,
- <&ccu CLK_DSI_DPHY>;
- clock-names = "bus", "mod";
- resets = <&ccu RST_BUS_MIPI_DSI>;
- #phy-cells = <0>;
-};
diff --git a/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt b/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt
deleted file mode 100644
index 3fe0961bcf64..000000000000
--- a/Documentation/devicetree/bindings/dma/8250_mtk_dma.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-* Mediatek UART APDMA Controller
-
-Required properties:
-- compatible should contain:
- * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA
- * "mediatek,mt6577-uart-dma" for MT6577 and all of the above
-
-- reg: The base address of the APDMA register bank.
-
-- interrupts: A single interrupt specifier.
-
-- clocks : Must contain an entry for each entry in clock-names.
- See ../clocks/clock-bindings.txt for details.
-- clock-names: The APDMA clock for register accesses
-
-Examples:
-
- apdma: dma-controller@11000380 {
- compatible = "mediatek,mt2712-uart-dma";
- reg = <0 0x11000380 0 0x400>;
- interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 64 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 65 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 66 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 67 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 68 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 69 IRQ_TYPE_LEVEL_LOW>,
- <GIC_SPI 70 IRQ_TYPE_LEVEL_LOW>;
- clocks = <&pericfg CLK_PERI_AP_DMA>;
- clock-names = "apdma";
- #dma-cells = <1>;
- };
-
diff --git a/Documentation/devicetree/bindings/dma/arm-pl330.txt b/Documentation/devicetree/bindings/dma/arm-pl330.txt
index db7e2260f9c5..2c7fd1941abb 100644
--- a/Documentation/devicetree/bindings/dma/arm-pl330.txt
+++ b/Documentation/devicetree/bindings/dma/arm-pl330.txt
@@ -16,6 +16,9 @@ Optional properties:
- dma-channels: contains the total number of DMA channels supported by the DMAC
- dma-requests: contains the total number of DMA requests supported by the DMAC
- arm,pl330-broken-no-flushp: quirk for avoiding to execute DMAFLUSHP
+ - resets: contains an entry for each entry in reset-names.
+ See ../reset/reset.txt for details.
+ - reset-names: must contain at least "dma", and optional is "dma-ocp".
Example:
diff --git a/Documentation/devicetree/bindings/dma/fsl-edma.txt b/Documentation/devicetree/bindings/dma/fsl-edma.txt
index 97e213e07660..29dd3ccb1235 100644
--- a/Documentation/devicetree/bindings/dma/fsl-edma.txt
+++ b/Documentation/devicetree/bindings/dma/fsl-edma.txt
@@ -9,15 +9,16 @@ group, DMAMUX0 or DMAMUX1, but not both.
Required properties:
- compatible :
- "fsl,vf610-edma" for eDMA used similar to that on Vybrid vf610 SoC
+ - "fsl,imx7ulp-edma" for eDMA2 used similar to that on i.mx7ulp
- reg : Specifies base physical address(s) and size of the eDMA registers.
The 1st region is eDMA control register's address and size.
The 2nd and the 3rd regions are programmable channel multiplexing
control register's address and size.
- interrupts : A list of interrupt-specifiers, one for each entry in
- interrupt-names.
-- interrupt-names : Should contain:
- "edma-tx" - the transmission interrupt
- "edma-err" - the error interrupt
+ interrupt-names on vf610 similar SoC. But for i.mx7ulp per channel
+ per transmission interrupt, total 16 channel interrupt and 1
+ error interrupt(located in the last), no interrupt-names list on
+ i.mx7ulp for clean on dts.
- #dma-cells : Must be <2>.
The 1st cell specifies the DMAMUX(0 for DMAMUX0 and 1 for DMAMUX1).
Specific request source can only be multiplexed by specific channels
@@ -28,6 +29,7 @@ Required properties:
- clock-names : A list of channel group clock names. Should contain:
"dmamux0" - clock name of mux0 group
"dmamux1" - clock name of mux1 group
+ Note: No dmamux0 on i.mx7ulp, but another 'dma' clk added on i.mx7ulp.
- clocks : A list of phandle and clock-specifier pairs, one for each entry in
clock-names.
@@ -35,6 +37,10 @@ Optional properties:
- big-endian: If present registers and hardware scatter/gather descriptors
of the eDMA are implemented in big endian mode, otherwise in little
mode.
+- interrupt-names : Should contain the below on vf610 similar SoC but not used
+ on i.mx7ulp similar SoC:
+ "edma-tx" - the transmission interrupt
+ "edma-err" - the error interrupt
Examples:
@@ -52,8 +58,36 @@ edma0: dma-controller@40018000 {
clock-names = "dmamux0", "dmamux1";
clocks = <&clks VF610_CLK_DMAMUX0>,
<&clks VF610_CLK_DMAMUX1>;
-};
+}; /* vf610 */
+edma1: dma-controller@40080000 {
+ #dma-cells = <2>;
+ compatible = "fsl,imx7ulp-edma";
+ reg = <0x40080000 0x2000>,
+ <0x40210000 0x1000>;
+ dma-channels = <32>;
+ interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 9 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 12 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>,
+ /* last is eDMA2-ERR interrupt */
+ <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
+ clock-names = "dma", "dmamux0";
+ clocks = <&pcc2 IMX7ULP_CLK_DMA1>,
+ <&pcc2 IMX7ULP_CLK_DMA_MUX1>;
+}; /* i.mx7ulp */
* DMA clients
DMA client drivers that uses the DMA function must use the format described
diff --git a/Documentation/devicetree/bindings/dma/fsl-qdma.txt b/Documentation/devicetree/bindings/dma/fsl-qdma.txt
index 6a0ff9059e72..da371c4d406c 100644
--- a/Documentation/devicetree/bindings/dma/fsl-qdma.txt
+++ b/Documentation/devicetree/bindings/dma/fsl-qdma.txt
@@ -7,6 +7,7 @@ Required properties:
- compatible: Must be one of
"fsl,ls1021a-qdma": for LS1021A Board
+ "fsl,ls1028a-qdma": for LS1028A Board
"fsl,ls1043a-qdma": for ls1043A Board
"fsl,ls1046a-qdma": for ls1046A Board
- reg: Should contain the register's base address and length.
diff --git a/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
new file mode 100644
index 000000000000..5d6f98c43e3d
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma/mtk-uart-apdma.txt
@@ -0,0 +1,54 @@
+* Mediatek UART APDMA Controller
+
+Required properties:
+- compatible should contain:
+ * "mediatek,mt2712-uart-dma" for MT2712 compatible APDMA
+ * "mediatek,mt6577-uart-dma" for MT6577 and all of the above
+
+- reg: The base address of the APDMA register bank.
+
+- interrupts: A single interrupt specifier.
+ One interrupt per dma-requests, or 8 if no dma-requests property is present
+
+- dma-requests: The number of DMA channels
+
+- clocks : Must contain an entry for each entry in clock-names.
+ See ../clocks/clock-bindings.txt for details.
+- clock-names: The APDMA clock for register accesses
+
+- mediatek,dma-33bits: Present if the DMA requires support
+
+Examples:
+
+ apdma: dma-controller@11000400 {
+ compatible = "mediatek,mt2712-uart-dma";
+ reg = <0 0x11000400 0 0x80>,
+ <0 0x11000480 0 0x80>,
+ <0 0x11000500 0 0x80>,
+ <0 0x11000580 0 0x80>,
+ <0 0x11000600 0 0x80>,
+ <0 0x11000680 0 0x80>,
+ <0 0x11000700 0 0x80>,
+ <0 0x11000780 0 0x80>,
+ <0 0x11000800 0 0x80>,
+ <0 0x11000880 0 0x80>,
+ <0 0x11000900 0 0x80>,
+ <0 0x11000980 0 0x80>;
+ interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 104 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 105 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 106 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 107 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 108 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 109 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 110 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 111 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 112 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 113 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 114 IRQ_TYPE_LEVEL_LOW>;
+ dma-requests = <12>;
+ clocks = <&pericfg CLK_PERI_AP_DMA>;
+ clock-names = "apdma";
+ mediatek,dma-33bits;
+ #dma-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/dma/sun6i-dma.txt b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
index 7fccc20d8331..cae31f4e77ba 100644
--- a/Documentation/devicetree/bindings/dma/sun6i-dma.txt
+++ b/Documentation/devicetree/bindings/dma/sun6i-dma.txt
@@ -28,12 +28,17 @@ Example:
};
------------------------------------------------------------------------------
-For A64 DMA controller:
+For A64 and H6 DMA controller:
Required properties:
-- compatible: "allwinner,sun50i-a64-dma"
+- compatible: Must be one of
+ "allwinner,sun50i-a64-dma"
+ "allwinner,sun50i-h6-dma"
- dma-channels: Number of DMA channels supported by the controller.
Refer to Documentation/devicetree/bindings/dma/dma.txt
+- clocks: In addition to parent AHB clock, it should also contain mbus
+ clock (H6 only)
+- clock-names: Should contain "bus" and "mbus" (H6 only)
- all properties above, i.e. reg, interrupts, clocks, resets and #dma-cells
Optional properties:
diff --git a/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt b/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt
new file mode 100644
index 000000000000..d592c21245f2
--- /dev/null
+++ b/Documentation/devicetree/bindings/extcon/extcon-fsa9480.txt
@@ -0,0 +1,19 @@
+FAIRCHILD SEMICONDUCTOR FSA9480 MICROUSB SWITCH
+
+The FSA9480 is a USB port accessory detector and switch. The FSA9480 is fully
+controlled using I2C and enables USB data, stereo and mono audio, video,
+microphone, and UART data to use a common connector port.
+
+Required properties:
+ - compatible : Must be "fcs,fsa9480"
+ - reg : Specifies i2c slave address. Must be 0x25.
+ - interrupts : Should contain one entry specifying interrupt signal of
+ interrupt parent to which interrupt pin of the chip is connected.
+
+ Example:
+ musb@25 {
+ compatible = "fcs,fsa9480";
+ reg = <0x25>;
+ interrupt-parent = <&gph2>;
+ interrupts = <7 0>;
+ };
diff --git a/Documentation/devicetree/bindings/gpio/gpio-davinci.txt b/Documentation/devicetree/bindings/gpio/gpio-davinci.txt
index 553b92a7e87b..bc6b4b62df83 100644
--- a/Documentation/devicetree/bindings/gpio/gpio-davinci.txt
+++ b/Documentation/devicetree/bindings/gpio/gpio-davinci.txt
@@ -5,6 +5,7 @@ Required Properties:
"ti,keystone-gpio": for Keystone 2 66AK2H/K, 66AK2L,
66AK2E SoCs
"ti,k2g-gpio", "ti,keystone-gpio": for 66AK2G
+ "ti,am654-gpio", "ti,keystone-gpio": for TI K3 AM654
- reg: Physical base address of the controller and the size of memory mapped
registers.
@@ -145,3 +146,20 @@ gpio0: gpio@260bf00 {
ti,ngpio = <32>;
ti,davinci-gpio-unbanked = <32>;
};
+
+Example for K3 AM654:
+
+wkup_gpio0: wkup_gpio0@42110000 {
+ compatible = "ti,am654-gpio", "ti,keystone-gpio";
+ reg = <0x42110000 0x100>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ interrupt-parent = <&intr_wkup_gpio>;
+ interrupts = <59 128>, <59 129>, <59 130>, <59 131>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ ti,ngpio = <56>;
+ ti,davinci-gpio-unbanked = <0>;
+ clocks = <&k3_clks 59 0>;
+ clock-names = "gpio";
+};
diff --git a/Documentation/devicetree/bindings/gpio/pl061-gpio.txt b/Documentation/devicetree/bindings/gpio/pl061-gpio.txt
deleted file mode 100644
index 89058d375b7c..000000000000
--- a/Documentation/devicetree/bindings/gpio/pl061-gpio.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-ARM PL061 GPIO controller
-
-Required properties:
-- compatible : "arm,pl061", "arm,primecell"
-- #gpio-cells : Should be two. The first cell is the pin number and the
- second cell is used to specify optional parameters:
- - bit 0 specifies polarity (0 for normal, 1 for inverted)
-- gpio-controller : Marks the device node as a GPIO controller.
-- interrupts : Interrupt mapping for GPIO IRQ.
-- gpio-ranges : Interaction with the PINCTRL subsystem.
diff --git a/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml b/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml
new file mode 100644
index 000000000000..313b17229247
--- /dev/null
+++ b/Documentation/devicetree/bindings/gpio/pl061-gpio.yaml
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/gpio/pl061-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM PL061 GPIO controller
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+ - Rob Herring <robh@kernel.org>
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,pl061
+ required:
+ - compatible
+
+properties:
+ $nodename:
+ pattern: "^gpio@[0-9a-f]+$"
+
+ compatible:
+ items:
+ - const: arm,pl061
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ oneOf:
+ - maxItems: 1
+ - maxItems: 8
+
+ interrupt-controller: true
+
+ "#interrupt-cells":
+ const: 2
+
+ clocks:
+ maxItems: 1
+
+ clock-names: true
+
+ "#gpio-cells":
+ const: 2
+
+ gpio-controller: true
+
+ gpio-ranges:
+ maxItems: 8
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - interrupt-controller
+ - "#interrupt-cells"
+ - clocks
+ - "#gpio-cells"
+ - gpio-controller
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt b/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt
index 1b1a74129141..9b298edec5b2 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-midgard.txt
@@ -15,7 +15,9 @@ Required properties:
+ "arm,mali-t860"
+ "arm,mali-t880"
* which must be preceded by one of the following vendor specifics:
+ + "allwinner,sun50i-h6-mali"
+ "amlogic,meson-gxm-mali"
+ + "samsung,exynos5433-mali"
+ "rockchip,rk3288-mali"
+ "rockchip,rk3399-mali"
@@ -31,21 +33,36 @@ Optional properties:
- clocks : Phandle to clock for the Mali Midgard device.
+- clock-names : Specify the names of the clocks specified in clocks
+ when multiple clocks are present.
+ * core: clock driving the GPU itself (When only one clock is present,
+ assume it's this clock.)
+ * bus: bus clock for the GPU
+
- mali-supply : Phandle to regulator for the Mali device. Refer to
Documentation/devicetree/bindings/regulator/regulator.txt for details.
- operating-points-v2 : Refer to Documentation/devicetree/bindings/opp/opp.txt
for details.
+- #cooling-cells: Refer to Documentation/devicetree/bindings/thermal/thermal.txt
+ for details.
+
- resets : Phandle of the GPU reset line.
Vendor-specific bindings
------------------------
The Mali GPU is integrated very differently from one SoC to
-another. In order to accomodate those differences, you have the option
+another. In order to accommodate those differences, you have the option
to specify one more vendor-specific compatible, among:
+- "allwinner,sun50i-h6-mali"
+ Required properties:
+ - clocks : phandles to core and bus clocks
+ - clock-names : must contain "core" and "bus"
+ - resets: phandle to GPU reset line
+
- "amlogic,meson-gxm-mali"
Required properties:
- resets : Should contain phandles of :
@@ -65,6 +82,7 @@ gpu@ffa30000 {
mali-supply = <&vdd_gpu>;
operating-points-v2 = <&gpu_opp_table>;
power-domains = <&power RK3288_PD_GPU>;
+ #cooling-cells = <2>;
};
gpu_opp_table: opp_table0 {
diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt
index ae63f09fda7d..b352a6851a06 100644
--- a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt
+++ b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt
@@ -17,6 +17,7 @@ Required properties:
+ amlogic,meson8b-mali
+ amlogic,meson-gxbb-mali
+ amlogic,meson-gxl-mali
+ + samsung,exynos4210-mali
+ rockchip,rk3036-mali
+ rockchip,rk3066-mali
+ rockchip,rk3188-mali
diff --git a/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt b/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt
index 2c9804f4f4ac..8d365f89694c 100644
--- a/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt
+++ b/Documentation/devicetree/bindings/hwlock/omap-hwspinlock.txt
@@ -1,12 +1,16 @@
-OMAP4+ HwSpinlock Driver
-========================
+TI HwSpinlock for OMAP and K3 based SoCs
+=========================================
Required properties:
-- compatible: Should be "ti,omap4-hwspinlock" for
- OMAP44xx, OMAP54xx, AM33xx, AM43xx, DRA7xx SoCs
+- compatible: Should be one of the following,
+ "ti,omap4-hwspinlock" for
+ OMAP44xx, OMAP54xx, AM33xx, AM43xx, DRA7xx SoCs
+ "ti,am654-hwspinlock" for
+ K3 AM65x and J721E SoCs
- reg: Contains the hwspinlock module register address space
(base address and length)
- ti,hwmods: Name of the hwmod associated with the hwspinlock device
+ (for OMAP architecture based SoCs only)
- #hwlock-cells: Should be 1. The OMAP hwspinlock users will use a
0-indexed relative hwlock number as the argument
specifier value for requesting a specific hwspinlock
@@ -17,10 +21,21 @@ Please look at the generic hwlock binding for usage information for consumers,
Example:
-/* OMAP4 */
+1. OMAP4 SoCs
hwspinlock: spinlock@4a0f6000 {
compatible = "ti,omap4-hwspinlock";
reg = <0x4a0f6000 0x1000>;
ti,hwmods = "spinlock";
#hwlock-cells = <1>;
};
+
+2. AM65x SoCs and J721E SoCs
+&cbass_main {
+ cbass_main_navss: interconnect0 {
+ hwspinlock: spinlock@30e00000 {
+ compatible = "ti,am654-hwspinlock";
+ reg = <0x00 0x30e00000 0x00 0x1000>;
+ #hwlock-cells = <1>;
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml b/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml
new file mode 100644
index 000000000000..f9d526b7da01
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/allwinner,sun6i-a31-p2wi.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/i2c/allwinner,sun6i-a31-p2wi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 P2WI (Push/Pull 2 Wires Interface) Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+allOf:
+ - $ref: /schemas/i2c/i2c-controller.yaml#
+
+properties:
+ compatible:
+ const: allwinner,sun6i-a31-p2wi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ clock-frequency:
+ minimum: 1
+ maximum: 6000000
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - resets
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+examples:
+ - |
+ i2c@1f03400 {
+ compatible = "allwinner,sun6i-a31-p2wi";
+ reg = <0x01f03400 0x400>;
+ interrupts = <0 39 4>;
+ clocks = <&apb0_gates 3>;
+ clock-frequency = <100000>;
+ resets = <&apb0_rst 3>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ axp221: pmic@68 {
+ compatible = "x-powers,axp221";
+ reg = <0x68>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/i2c/i2c-mt7621.txt b/Documentation/devicetree/bindings/i2c/i2c-mt7621.txt
new file mode 100644
index 000000000000..bc36f0eb94cd
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/i2c-mt7621.txt
@@ -0,0 +1,25 @@
+MediaTek MT7621/MT7628 I2C master controller
+
+Required properties:
+
+- compatible: Should be one of the following:
+ - "mediatek,mt7621-i2c": for MT7621/MT7628/MT7688 platforms
+- #address-cells: should be 1.
+- #size-cells: should be 0.
+- reg: Address and length of the register set for the device
+- resets: phandle to the reset controller asserting this device in
+ reset
+ See ../reset/reset.txt for details.
+
+Optional properties :
+
+Example:
+
+i2c: i2c@900 {
+ compatible = "mediatek,mt7621-i2c";
+ reg = <0x900 0x100>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ resets = <&rstctrl 16>;
+ reset-names = "i2c";
+};
diff --git a/Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt b/Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt
deleted file mode 100644
index 0ffe65a316ae..000000000000
--- a/Documentation/devicetree/bindings/i2c/i2c-mv64xxx.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-
-* Marvell MV64XXX I2C controller
-
-Required properties :
-
- - reg : Offset and length of the register set for the device
- - compatible : Should be either:
- - "allwinner,sun4i-a10-i2c"
- - "allwinner,sun6i-a31-i2c"
- - "marvell,mv64xxx-i2c"
- - "marvell,mv78230-i2c"
- - "marvell,mv78230-a0-i2c"
- * Note: Only use "marvell,mv78230-a0-i2c" for a
- very rare, initial version of the SoC which
- had broken offload support. Linux
- auto-detects this and sets it appropriately.
- - interrupts : The interrupt number
-
-Optional properties :
-
- - clock-frequency : Desired I2C bus clock frequency in Hz. If not set the
-default frequency is 100kHz
-
- - resets : phandle to the parent reset controller. Mandatory
- whenever you're using the "allwinner,sun6i-a31-i2c"
- compatible.
-
- - clocks: : pointers to the reference clocks for this device, the
- first one is the one used for the clock on the i2c bus,
- the second one is the clock used to acces the registers
- of the controller
-
- - clock-names : names of used clocks, mandatory if the second clock is
- used, the name must be "core", and "reg" (the latter is
- only for Armada 7K/8K).
-
-Examples:
-
- i2c@11000 {
- compatible = "marvell,mv64xxx-i2c";
- reg = <0x11000 0x20>;
- interrupts = <29>;
- clock-frequency = <100000>;
- };
-
-For the Armada XP:
-
- i2c@11000 {
- compatible = "marvell,mv78230-i2c", "marvell,mv64xxx-i2c";
- reg = <0x11000 0x100>;
- interrupts = <29>;
- clock-frequency = <100000>;
- };
-
-For the Armada 7040:
-
- i2c@701000 {
- compatible = "marvell,mv78230-i2c";
- reg = <0x701000 0x20>;
- interrupts = <29>;
- clock-frequency = <100000>;
- clock-names = "core", "reg";
- clocks = <&core_clock>, <&reg_clock>;
- };
diff --git a/Documentation/devicetree/bindings/i2c/i2c-ocores.txt b/Documentation/devicetree/bindings/i2c/i2c-ocores.txt
index 17bef9a34e50..6b25a80ae8d3 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-ocores.txt
+++ b/Documentation/devicetree/bindings/i2c/i2c-ocores.txt
@@ -1,9 +1,13 @@
Device tree configuration for i2c-ocores
Required properties:
-- compatible : "opencores,i2c-ocores" or "aeroflexgaisler,i2cmst"
+- compatible : "opencores,i2c-ocores"
+ "aeroflexgaisler,i2cmst"
+ "sifive,fu540-c000-i2c", "sifive,i2c0"
+ For Opencore based I2C IP block reimplemented in
+ FU540-C000 SoC. Please refer to sifive-blocks-ip-versioning.txt
+ for additional details.
- reg : bus address start and address range size of device
-- interrupts : interrupt number
- clocks : handle to the controller clock; see the note below.
Mutually exclusive with opencores,ip-clock-frequency
- opencores,ip-clock-frequency: frequency of the controller clock in Hz;
@@ -12,6 +16,7 @@ Required properties:
- #size-cells : should be <0>
Optional properties:
+- interrupts : interrupt number.
- clock-frequency : frequency of bus clock in Hz; see the note below.
Defaults to 100 KHz when the property is not specified
- reg-shift : device register offsets are shifted by this value
diff --git a/Documentation/devicetree/bindings/i2c/i2c-omap.txt b/Documentation/devicetree/bindings/i2c/i2c-omap.txt
index 4b90ba9f31b7..a44573d7c118 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-omap.txt
+++ b/Documentation/devicetree/bindings/i2c/i2c-omap.txt
@@ -7,6 +7,7 @@ Required properties :
"ti,omap3-i2c" for OMAP3 SoCs
"ti,omap4-i2c" for OMAP4+ SoCs
"ti,am654-i2c", "ti,omap4-i2c" for AM654 SoCs
+ "ti,j721e-i2c", "ti,omap4-i2c" for J721E SoCs
- ti,hwmods : Must be "i2c<n>", n being the instance number (1-based)
- #address-cells = <1>;
- #size-cells = <0>;
diff --git a/Documentation/devicetree/bindings/i2c/i2c-stm32.txt b/Documentation/devicetree/bindings/i2c/i2c-stm32.txt
index f334738f7a35..ce3df2fff6c8 100644
--- a/Documentation/devicetree/bindings/i2c/i2c-stm32.txt
+++ b/Documentation/devicetree/bindings/i2c/i2c-stm32.txt
@@ -21,6 +21,8 @@ Optional properties:
100000 and 400000.
For STM32F7, STM32H7 and STM32MP1 SoCs, Standard-mode, Fast-mode and Fast-mode
Plus are supported, possible values are 100000, 400000 and 1000000.
+- dmas: List of phandles to rx and tx DMA channels. Refer to stm32-dma.txt.
+- dma-names: List of dma names. Valid names are: "rx" and "tx".
- i2c-scl-rising-time-ns: I2C SCL Rising time for the board (default: 25)
For STM32F7, STM32H7 and STM32MP1 only.
- i2c-scl-falling-time-ns: I2C SCL Falling time for the board (default: 10)
diff --git a/Documentation/devicetree/bindings/i2c/i2c-sun6i-p2wi.txt b/Documentation/devicetree/bindings/i2c/i2c-sun6i-p2wi.txt
deleted file mode 100644
index 49df0053347a..000000000000
--- a/Documentation/devicetree/bindings/i2c/i2c-sun6i-p2wi.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-
-* Allwinner P2WI (Push/Pull 2 Wire Interface) controller
-
-Required properties :
-
- - reg : Offset and length of the register set for the device.
- - compatible : Should one of the following:
- - "allwinner,sun6i-a31-p2wi"
- - interrupts : The interrupt line connected to the P2WI peripheral.
- - clocks : The gate clk connected to the P2WI peripheral.
- - resets : The reset line connected to the P2WI peripheral.
-
-Optional properties :
-
- - clock-frequency : Desired P2WI bus clock frequency in Hz. If not set the
-default frequency is 100kHz
-
-A P2WI may contain one child node encoding a P2WI slave device.
-
-Slave device properties:
- Required properties:
- - reg : the I2C slave address used during the initialization
- process to switch from I2C to P2WI mode
-
-Example:
-
- p2wi@1f03400 {
- compatible = "allwinner,sun6i-a31-p2wi";
- reg = <0x01f03400 0x400>;
- interrupts = <0 39 4>;
- clocks = <&apb0_gates 3>;
- clock-frequency = <6000000>;
- resets = <&apb0_rst 3>;
-
- axp221: pmic@68 {
- compatible = "x-powers,axp221";
- reg = <0x68>;
-
- /* ... */
- };
- };
diff --git a/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml b/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml
new file mode 100644
index 000000000000..001f2b7abad0
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/marvell,mv64xxx-i2c.yaml
@@ -0,0 +1,124 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/i2c/marvell,mv64xxx-i2c.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell MV64XXX I2C Controller Device Tree Bindings
+
+maintainers:
+ - Gregory CLEMENT <gregory.clement@bootlin.com>
+
+properties:
+ compatible:
+ oneOf:
+ - const: allwinner,sun4i-a10-i2c
+ - items:
+ - const: allwinner,sun7i-a20-i2c
+ - const: allwinner,sun4i-a10-i2c
+ - const: allwinner,sun6i-a31-i2c
+ - items:
+ - const: allwinner,sun8i-a23-i2c
+ - const: allwinner,sun6i-a31-i2c
+ - items:
+ - const: allwinner,sun8i-a83t-i2c
+ - const: allwinner,sun6i-a31-i2c
+ - items:
+ - const: allwinner,sun50i-a64-i2c
+ - const: allwinner,sun6i-a31-i2c
+
+ - const: marvell,mv64xxx-i2c
+ - const: marvell,mv78230-i2c
+ - const: marvell,mv78230-a0-i2c
+
+ description:
+ Only use "marvell,mv78230-a0-i2c" for a very rare, initial
+ version of the SoC which had broken offload support. Linux
+ auto-detects this and sets it appropriately.
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 1
+ maxItems: 2
+ items:
+ - description: Reference clock for the I2C bus
+ - description: Bus clock (Only for Armada 7K/8K)
+
+ clock-names:
+ minItems: 1
+ maxItems: 2
+ items:
+ - const: core
+ - const: reg
+ description:
+ Mandatory if two clocks are used (only for Armada 7k and 8k).
+
+ resets:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+allOf:
+ - $ref: /schemas/i2c/i2c-controller.yaml#
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun4i-a10-i2c
+ - allwinner,sun6i-a31-i2c
+
+ then:
+ required:
+ - clocks
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun6i-a31-i2c
+
+ then:
+ required:
+ - resets
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+examples:
+ - |
+ i2c@11000 {
+ compatible = "marvell,mv64xxx-i2c";
+ reg = <0x11000 0x20>;
+ interrupts = <29>;
+ clock-frequency = <100000>;
+ };
+
+ - |
+ i2c@11000 {
+ compatible = "marvell,mv78230-i2c";
+ reg = <0x11000 0x100>;
+ interrupts = <29>;
+ clock-frequency = <100000>;
+ };
+
+ - |
+ i2c@701000 {
+ compatible = "marvell,mv78230-i2c";
+ reg = <0x701000 0x20>;
+ interrupts = <29>;
+ clock-frequency = <100000>;
+ clock-names = "core", "reg";
+ clocks = <&core_clock>, <&reg_clock>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt
index 69da2115abdc..1cf6182f888c 100644
--- a/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt
+++ b/Documentation/devicetree/bindings/i3c/cdns,i3c-master.txt
@@ -38,6 +38,6 @@ Example:
nunchuk: nunchuk@52 {
compatible = "nintendo,nunchuk";
- reg = <0x52 0x80000010 0>;
+ reg = <0x52 0x0 0x10>;
};
};
diff --git a/Documentation/devicetree/bindings/i3c/i3c.txt b/Documentation/devicetree/bindings/i3c/i3c.txt
index ab729a0a86ae..4ffe059f0fec 100644
--- a/Documentation/devicetree/bindings/i3c/i3c.txt
+++ b/Documentation/devicetree/bindings/i3c/i3c.txt
@@ -39,7 +39,9 @@ valid here, but several new properties have been added.
New constraint on existing properties:
--------------------------------------
- reg: contains 3 cells
- + first cell : still encoding the I2C address
+ + first cell : still encoding the I2C address. 10 bit addressing is not
+ supported. Devices with 10 bit address can't be properly passed through
+ DEFSLVS command.
+ second cell: shall be 0
diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml
new file mode 100644
index 000000000000..7ba167e2e1ea
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl345.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers
+
+maintainers:
+ - Michael Hennerich <michael.hennerich@analog.com>
+
+description: |
+ Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers that supports
+ both I2C & SPI interfaces.
+ http://www.analog.com/en/products/mems/accelerometers/adxl345.html
+ http://www.analog.com/en/products/sensors-mems/accelerometers/adxl375.html
+
+properties:
+ compatible:
+ enum:
+ - adi,adxl345
+ - adi,adxl375
+
+ reg:
+ maxItems: 1
+
+ spi-cpha: true
+
+ spi-cpol: true
+
+ interrupts:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ i2c0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* Example for a I2C device node */
+ accelerometer@2a {
+ compatible = "adi,adxl345";
+ reg = <0x53>;
+ interrupt-parent = <&gpio0>;
+ interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ spi0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* Example for a SPI device node */
+ accelerometer@0 {
+ compatible = "adi,adxl345";
+ reg = <0>;
+ spi-max-frequency = <5000000>;
+ spi-cpol;
+ spi-cpha;
+ interrupt-parent = <&gpio0>;
+ interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml
new file mode 100644
index 000000000000..a7fafb9bf5c6
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl372.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer
+
+maintainers:
+ - Stefan Popa <stefan.popa@analog.com>
+
+description: |
+ Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer that supports
+ both I2C & SPI interfaces
+ https://www.analog.com/en/products/adxl372.html
+
+properties:
+ compatible:
+ enum:
+ - adi,adxl372
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ i2c0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ /* Example for a I2C device node */
+ accelerometer@53 {
+ compatible = "adi,adxl372";
+ reg = <0x53>;
+ interrupt-parent = <&gpio>;
+ interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
+ };
+ };
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ #include <dt-bindings/interrupt-controller/irq.h>
+ spi0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ accelerometer@0 {
+ compatible = "adi,adxl372";
+ reg = <0>;
+ spi-max-frequency = <1000000>;
+ interrupt-parent = <&gpio>;
+ interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/iio/accel/adxl345.txt b/Documentation/devicetree/bindings/iio/accel/adxl345.txt
deleted file mode 100644
index f9525f6e3d43..000000000000
--- a/Documentation/devicetree/bindings/iio/accel/adxl345.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers
-
-http://www.analog.com/en/products/mems/accelerometers/adxl345.html
-http://www.analog.com/en/products/sensors-mems/accelerometers/adxl375.html
-
-Required properties:
- - compatible : should be one of
- "adi,adxl345"
- "adi,adxl375"
- - reg : the I2C address or SPI chip select number of the sensor
-
-Required properties for SPI bus usage:
- - spi-max-frequency : set maximum clock frequency, must be 5000000
- - spi-cpol and spi-cpha : must be defined for adxl345 to enable SPI mode 3
-
-Optional properties:
- - interrupts: interrupt mapping for IRQ as documented in
- Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-
-Example for a I2C device node:
-
- accelerometer@2a {
- compatible = "adi,adxl345";
- reg = <0x53>;
- interrupt-parent = <&gpio1>;
- interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
- };
-
-Example for a SPI device node:
-
- accelerometer@0 {
- compatible = "adi,adxl345";
- reg = <0>;
- spi-max-frequency = <5000000>;
- spi-cpol;
- spi-cpha;
- interrupt-parent = <&gpio1>;
- interrupts = <0 IRQ_TYPE_LEVEL_HIGH>;
- };
diff --git a/Documentation/devicetree/bindings/iio/accel/adxl372.txt b/Documentation/devicetree/bindings/iio/accel/adxl372.txt
deleted file mode 100644
index a289964756a7..000000000000
--- a/Documentation/devicetree/bindings/iio/accel/adxl372.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer
-
-http://www.analog.com/media/en/technical-documentation/data-sheets/adxl372.pdf
-
-Required properties:
- - compatible : should be "adi,adxl372"
- - reg: the I2C address or SPI chip select number for the device
-
-Required properties for SPI bus usage:
- - spi-max-frequency: Max SPI frequency to use
-
-Optional properties:
- - interrupts: interrupt mapping for IRQ as documented in
- Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-
-Example for a I2C device node:
-
- accelerometer@53 {
- compatible = "adi,adxl372";
- reg = <0x53>;
- interrupt-parent = <&gpio>;
- interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
- };
-
-Example for a SPI device node:
-
- accelerometer@0 {
- compatible = "adi,adxl372";
- reg = <0>;
- spi-max-frequency = <1000000>;
- interrupt-parent = <&gpio>;
- interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
- };
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt
deleted file mode 100644
index 416273dce569..000000000000
--- a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.txt
+++ /dev/null
@@ -1,75 +0,0 @@
-Analog Devices AD7124 ADC device driver
-
-Required properties for the AD7124:
- - compatible: Must be one of "adi,ad7124-4" or "adi,ad7124-8"
- - reg: SPI chip select number for the device
- - spi-max-frequency: Max SPI frequency to use
- see: Documentation/devicetree/bindings/spi/spi-bus.txt
- - clocks: phandle to the master clock (mclk)
- see: Documentation/devicetree/bindings/clock/clock-bindings.txt
- - clock-names: Must be "mclk".
- - interrupts: IRQ line for the ADC
- see: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
-
- Required properties:
- * #address-cells: Must be 1.
- * #size-cells: Must be 0.
-
- Subnode(s) represent the external channels which are connected to the ADC.
- Each subnode represents one channel and has the following properties:
- Required properties:
- * reg: The channel number. It can have up to 4 channels on ad7124-4
- and 8 channels on ad7124-8, numbered from 0 to 15.
- * diff-channels: see: Documentation/devicetree/bindings/iio/adc/adc.txt
-
- Optional properties:
- * bipolar: see: Documentation/devicetree/bindings/iio/adc/adc.txt
- * adi,reference-select: Select the reference source to use when
- converting on the the specific channel. Valid values are:
- 0: REFIN1(+)/REFIN1(−).
- 1: REFIN2(+)/REFIN2(−).
- 3: AVDD
- If this field is left empty, internal reference is selected.
-
-Optional properties:
- - refin1-supply: refin1 supply can be used as reference for conversion.
- - refin2-supply: refin2 supply can be used as reference for conversion.
- - avdd-supply: avdd supply can be used as reference for conversion.
-
-Example:
- adc@0 {
- compatible = "adi,ad7124-4";
- reg = <0>;
- spi-max-frequency = <5000000>;
- interrupts = <25 2>;
- interrupt-parent = <&gpio>;
- refin1-supply = <&adc_vref>;
- clocks = <&ad7124_mclk>;
- clock-names = "mclk";
-
- #address-cells = <1>;
- #size-cells = <0>;
-
- channel@0 {
- reg = <0>;
- diff-channels = <0 1>;
- adi,reference-select = <0>;
- };
-
- channel@1 {
- reg = <1>;
- bipolar;
- diff-channels = <2 3>;
- adi,reference-select = <0>;
- };
-
- channel@2 {
- reg = <2>;
- diff-channels = <4 5>;
- };
-
- channel@3 {
- reg = <3>;
- diff-channels = <6 7>;
- };
- };
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml
new file mode 100644
index 000000000000..9692b7f719f5
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7124.yaml
@@ -0,0 +1,160 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+# Copyright 2019 Analog Devices Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/bindings/iio/adc/adi,ad7124.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices AD7124 ADC device driver
+
+maintainers:
+ - Stefan Popa <stefan.popa@analog.com>
+
+description: |
+ Bindings for the Analog Devices AD7124 ADC device. Datasheet can be
+ found here:
+ https://www.analog.com/media/en/technical-documentation/data-sheets/AD7124-8.pdf
+
+properties:
+ compatible:
+ enum:
+ - adi,ad7124-4
+ - adi,ad7124-8
+
+ reg:
+ description: SPI chip select number for the device
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+ description: phandle to the master clock (mclk)
+
+ clock-names:
+ items:
+ - const: mclk
+
+ interrupts:
+ description: IRQ line for the ADC
+ maxItems: 1
+
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
+
+ refin1-supply:
+ description: refin1 supply can be used as reference for conversion.
+ maxItems: 1
+
+ refin2-supply:
+ description: refin2 supply can be used as reference for conversion.
+ maxItems: 1
+
+ avdd-supply:
+ description: avdd supply can be used as reference for conversion.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - interrupts
+
+patternProperties:
+ "^channel@([0-9]|1[0-5])$":
+ type: object
+ description: |
+ Represents the external channels which are connected to the ADC.
+ See Documentation/devicetree/bindings/iio/adc/adc.txt.
+
+ properties:
+ reg:
+ description: |
+ The channel number. It can have up to 8 channels on ad7124-4
+ and 16 channels on ad7124-8, numbered from 0 to 15.
+ items:
+ minimum: 0
+ maximum: 15
+
+ adi,reference-select:
+ description: |
+ Select the reference source to use when converting on
+ the specific channel. Valid values are:
+ 0: REFIN1(+)/REFIN1(−).
+ 1: REFIN2(+)/REFIN2(−).
+ 3: AVDD
+ If this field is left empty, internal reference is selected.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - enum: [0, 1, 3]
+
+ diff-channels:
+ description: see Documentation/devicetree/bindings/iio/adc/adc.txt
+ items:
+ minimum: 0
+ maximum: 15
+
+ bipolar:
+ description: see Documentation/devicetree/bindings/iio/adc/adc.txt
+ type: boolean
+
+ adi,buffered-positive:
+ description: Enable buffered mode for positive input.
+ type: boolean
+
+ adi,buffered-negative:
+ description: Enable buffered mode for negative input.
+ type: boolean
+
+ required:
+ - reg
+ - diff-channels
+
+examples:
+ - |
+ spi {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ adc@0 {
+ compatible = "adi,ad7124-4";
+ reg = <0>;
+ spi-max-frequency = <5000000>;
+ interrupts = <25 2>;
+ interrupt-parent = <&gpio>;
+ refin1-supply = <&adc_vref>;
+ clocks = <&ad7124_mclk>;
+ clock-names = "mclk";
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ channel@0 {
+ reg = <0>;
+ diff-channels = <0 1>;
+ adi,reference-select = <0>;
+ adi,buffered-positive;
+ };
+
+ channel@1 {
+ reg = <1>;
+ bipolar;
+ diff-channels = <2 3>;
+ adi,reference-select = <0>;
+ adi,buffered-positive;
+ adi,buffered-negative;
+ };
+
+ channel@2 {
+ reg = <2>;
+ diff-channels = <4 5>;
+ };
+
+ channel@3 {
+ reg = <3>;
+ diff-channels = <6 7>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7780.txt b/Documentation/devicetree/bindings/iio/adc/adi,ad7780.txt
deleted file mode 100644
index 440e52555349..000000000000
--- a/Documentation/devicetree/bindings/iio/adc/adi,ad7780.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-* Analog Devices AD7170/AD7171/AD7780/AD7781
-
-Data sheets:
-
-- AD7170:
- * https://www.analog.com/media/en/technical-documentation/data-sheets/AD7170.pdf
-- AD7171:
- * https://www.analog.com/media/en/technical-documentation/data-sheets/AD7171.pdf
-- AD7780:
- * https://www.analog.com/media/en/technical-documentation/data-sheets/ad7780.pdf
-- AD7781:
- * https://www.analog.com/media/en/technical-documentation/data-sheets/AD7781.pdf
-
-Required properties:
-
-- compatible: should be one of
- * "adi,ad7170"
- * "adi,ad7171"
- * "adi,ad7780"
- * "adi,ad7781"
-- reg: spi chip select number for the device
-- vref-supply: the regulator supply for the ADC reference voltage
-
-Optional properties:
-
-- powerdown-gpios: must be the device tree identifier of the PDRST pin. If
- specified, it will be asserted during driver probe. As the
- line is active high, it should be marked GPIO_ACTIVE_HIGH.
-- adi,gain-gpios: must be the device tree identifier of the GAIN pin. Only for
- the ad778x chips. If specified, it will be asserted during
- driver probe. As the line is active low, it should be marked
- GPIO_ACTIVE_LOW.
-- adi,filter-gpios: must be the device tree identifier of the FILTER pin. Only
- for the ad778x chips. If specified, it will be asserted
- during driver probe. As the line is active low, it should be
- marked GPIO_ACTIVE_LOW.
-
-Example:
-
-adc@0 {
- compatible = "adi,ad7780";
- reg = <0>;
- vref-supply = <&vdd_supply>
-
- powerdown-gpios = <&gpio 12 GPIO_ACTIVE_HIGH>;
- adi,gain-gpios = <&gpio 5 GPIO_ACTIVE_LOW>;
- adi,filter-gpios = <&gpio 15 GPIO_ACTIVE_LOW>;
-};
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml
new file mode 100644
index 000000000000..d1109416963c
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7780.yaml
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/adc/adi,ad7780.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices AD7170/AD7171/AD7780/AD7781 analog to digital converters
+
+maintainers:
+ - Michael Hennerich <michael.hennerich@analog.com>
+
+description: |
+ The ad7780 is a sigma-delta analog to digital converter. This driver provides
+ reading voltage values and status bits from both the ad778x and ad717x series.
+ Its interface also allows writing on the FILTER and GAIN GPIO pins on the
+ ad778x.
+
+ Specifications on the converters can be found at:
+ AD7170:
+ https://www.analog.com/media/en/technical-documentation/data-sheets/AD7170.pdf
+ AD7171:
+ https://www.analog.com/media/en/technical-documentation/data-sheets/AD7171.pdf
+ AD7780:
+ https://www.analog.com/media/en/technical-documentation/data-sheets/ad7780.pdf
+ AD7781:
+ https://www.analog.com/media/en/technical-documentation/data-sheets/AD7781.pdf
+
+properties:
+ compatible:
+ enum:
+ - adi,ad7170
+ - adi,ad7171
+ - adi,ad7780
+ - adi,ad7781
+
+ reg:
+ maxItems: 1
+
+ avdd-supply:
+ description:
+ The regulator supply for the ADC reference voltage.
+ maxItems: 1
+
+ powerdown-gpios:
+ description:
+ Must be the device tree identifier of the PDRST pin. If
+ specified, it will be asserted during driver probe. As the
+ line is active high, it should be marked GPIO_ACTIVE_HIGH.
+ maxItems: 1
+
+ adi,gain-gpios:
+ description:
+ Must be the device tree identifier of the GAIN pin. Only for
+ the ad778x chips. If specified, it will be asserted during
+ driver probe. As the line is active low, it should be marked
+ GPIO_ACTIVE_LOW.
+ maxItems: 1
+
+ adi,filter-gpios:
+ description:
+ Must be the device tree identifier of the FILTER pin. Only
+ for the ad778x chips. If specified, it will be asserted
+ during driver probe. As the line is active low, it should be
+ marked GPIO_ACTIVE_LOW.
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ #include <dt-bindings/gpio/gpio.h>
+ spi0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ adc@0 {
+ compatible = "adi,ad7780";
+ reg = <0>;
+
+ avdd-supply = <&vdd_supply>;
+ powerdown-gpios = <&gpio0 12 GPIO_ACTIVE_HIGH>;
+ adi,gain-gpios = <&gpio1 5 GPIO_ACTIVE_LOW>;
+ adi,filter-gpios = <&gpio2 15 GPIO_ACTIVE_LOW>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml b/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml
index 8a4100ceeaf2..d76ece97c76c 100644
--- a/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml
+++ b/Documentation/devicetree/bindings/iio/adc/avia-hx711.yaml
@@ -61,6 +61,6 @@ examples:
compatible = "avia,hx711";
sck-gpios = <&gpio3 10 GPIO_ACTIVE_HIGH>;
dout-gpios = <&gpio0 7 GPIO_ACTIVE_HIGH>;
- avdd-suppy = <&avdd>;
+ avdd-supply = <&avdd>;
clock-frequency = <100000>;
};
diff --git a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt
index 0df9befdaecc..78c06e05c8e5 100644
--- a/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt
+++ b/Documentation/devicetree/bindings/iio/adc/mt6577_auxadc.txt
@@ -13,8 +13,10 @@ Required properties:
- compatible: Should be one of:
- "mediatek,mt2701-auxadc": For MT2701 family of SoCs
- "mediatek,mt2712-auxadc": For MT2712 family of SoCs
+ - "mediatek,mt6765-auxadc": For MT6765 family of SoCs
- "mediatek,mt7622-auxadc": For MT7622 family of SoCs
- "mediatek,mt8173-auxadc": For MT8173 family of SoCs
+ - "mediatek,mt8183-auxadc", "mediatek,mt8173-auxadc": For MT8183 family of SoCs
- reg: Address range of the AUXADC unit.
- clocks: Should contain a clock specifier for each entry in clock-names
- clock-names: Should contain "main".
diff --git a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
index 8346bcb04ad7..93a0bd2efc05 100644
--- a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
+++ b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt
@@ -38,6 +38,7 @@ Required properties:
It's required on stm32h7.
- clock-names: Must be "adc" and/or "bus" depending on part used.
- interrupt-controller: Identifies the controller node as interrupt-parent
+- vdda-supply: Phandle to the vdda input analog voltage.
- vref-supply: Phandle to the vref input analog reference voltage.
- #interrupt-cells = <1>;
- #address-cells = <1>;
diff --git a/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.txt b/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.txt
deleted file mode 100644
index 6eee2709b5b6..000000000000
--- a/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-* Sensirion SPS30 particulate matter sensor
-
-Required properties:
-- compatible: must be "sensirion,sps30"
-- reg: the I2C address of the sensor
-
-Example:
-
-sps30@69 {
- compatible = "sensirion,sps30";
- reg = <0x69>;
-};
diff --git a/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.yaml b/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.yaml
new file mode 100644
index 000000000000..50a50a0d7070
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/chemical/sensirion,sps30.yaml
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/chemical/sensirion,sps30.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Sensirion SPS30 particulate matter sensor
+
+maintainers:
+ - Tomasz Duszynski <tduszyns@gmail.com>
+
+description: |
+ Air pollution sensor capable of measuring mass concentration of dust
+ particles.
+
+properties:
+ compatible:
+ enum:
+ - sensirion,sps30
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ air-pollution-sensor@69 {
+ compatible = "sensirion,sps30";
+ reg = <0x69>;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/iio/frequency/adf4371.yaml b/Documentation/devicetree/bindings/iio/frequency/adf4371.yaml
new file mode 100644
index 000000000000..7ec3ec94356b
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/frequency/adf4371.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/frequency/adf4371.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Analog Devices ADF4371/ADF4372 Wideband Synthesizers
+
+maintainers:
+ - Popa Stefan <stefan.popa@analog.com>
+
+description: |
+ Analog Devices ADF4371/ADF4372 SPI Wideband Synthesizers
+ https://www.analog.com/media/en/technical-documentation/data-sheets/adf4371.pdf
+ https://www.analog.com/media/en/technical-documentation/data-sheets/adf4372.pdf
+
+properties:
+ compatible:
+ enum:
+ - adi,adf4371
+ - adi,adf4372
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ description:
+ Definition of the external clock (see clock/clock-bindings.txt)
+ maxItems: 1
+
+ clock-names:
+ description:
+ Must be "clkin"
+ maxItems: 1
+
+ adi,mute-till-lock-en:
+ type: boolean
+ description:
+ If this property is present, then the supply current to RF8P and RF8N
+ output stage will shut down until the ADF4371/ADF4372 achieves lock as
+ measured by the digital lock detect circuitry.
+
+required:
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+
+examples:
+ - |
+ spi0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ frequency@0 {
+ compatible = "adi,adf4371";
+ reg = <0>;
+ spi-max-frequency = <1000000>;
+ clocks = <&adf4371_clkin>;
+ clock-names = "clkin";
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/iio/light/isl29018.txt b/Documentation/devicetree/bindings/iio/light/isl29018.txt
deleted file mode 100644
index b9bbde3e13ed..000000000000
--- a/Documentation/devicetree/bindings/iio/light/isl29018.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* ISL 29018/29023/29035 I2C ALS, Proximity, and Infrared sensor
-
-Required properties:
-
- - compatible: Should be one of
- "isil,isl29018"
- "isil,isl29023"
- "isil,isl29035"
- - reg: the I2C address of the device
-
-Optional properties:
-
- - interrupts: the sole interrupt generated by the device
-
- Refer to interrupt-controller/interrupts.txt for generic interrupt client
- node bindings.
-
- - vcc-supply: phandle to the regulator that provides power to the sensor.
-
-Example:
-
-isl29018@44 {
- compatible = "isil,isl29018";
- reg = <0x44>;
- interrupt-parent = <&gpio>;
- interrupts = <TEGRA_GPIO(Z, 2) IRQ_TYPE_LEVEL_HIGH>;
-};
diff --git a/Documentation/devicetree/bindings/iio/light/isl29018.yaml b/Documentation/devicetree/bindings/iio/light/isl29018.yaml
new file mode 100644
index 000000000000..cbb00be8f359
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/light/isl29018.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/light/isl29018.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: |
+ Intersil 29018/29023/29035 Ambient Light, Infrared Light, and Proximity Sensor
+
+maintainers:
+ - Brian Masney <masneyb@onstation.org>
+
+description: |
+ Ambient and infrared light sensing with proximity detection over an i2c
+ interface.
+
+ https://www.renesas.com/us/en/www/doc/datasheet/isl29018.pdf
+ https://www.renesas.com/us/en/www/doc/datasheet/isl29023.pdf
+ https://www.renesas.com/us/en/www/doc/datasheet/isl29035.pdf
+
+properties:
+ compatible:
+ enum:
+ - isil,isl29018
+ - isil,isl29023
+ - isil,isl29035
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ vcc-supply:
+ description: Regulator that provides power to the sensor
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@44 {
+ compatible = "isil,isl29018";
+ reg = <0x44>;
+ interrupts-extended = <&msmgpio 61 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2583.txt b/Documentation/devicetree/bindings/iio/light/tsl2583.txt
deleted file mode 100644
index 059dffa1829a..000000000000
--- a/Documentation/devicetree/bindings/iio/light/tsl2583.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-* TAOS TSL 2580/2581/2583 ALS sensor
-
-Required properties:
-
- - compatible: Should be one of
- "amstaos,tsl2580"
- "amstaos,tsl2581"
- "amstaos,tsl2583"
- - reg: the I2C address of the device
-
-Optional properties:
-
- - interrupts: the sole interrupt generated by the device
-
- Refer to interrupt-controller/interrupts.txt for generic interrupt client
- node bindings.
-
- - vcc-supply: phandle to the regulator that provides power to the sensor.
-
-Example:
-
-tsl2581@29 {
- compatible = "amstaos,tsl2581";
- reg = <0x29>;
-};
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2583.yaml b/Documentation/devicetree/bindings/iio/light/tsl2583.yaml
new file mode 100644
index 000000000000..e86ef64ecf03
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/light/tsl2583.yaml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/light/tsl2583.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: AMS/TAOS Ambient Light Sensor (ALS)
+
+maintainers:
+ - Brian Masney <masneyb@onstation.org>
+
+description: |
+ Ambient light sensing with an i2c interface.
+
+properties:
+ compatible:
+ enum:
+ - amstaos,tsl2580
+ - amstaos,tsl2581
+ - amstaos,tsl2583
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ vcc-supply:
+ description: Regulator that provides power to the sensor
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ light-sensor@29 {
+ compatible = "amstaos,tsl2581";
+ reg = <0x29>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2772.txt b/Documentation/devicetree/bindings/iio/light/tsl2772.txt
deleted file mode 100644
index 1c5e6f17a1df..000000000000
--- a/Documentation/devicetree/bindings/iio/light/tsl2772.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-* AMS/TAOS ALS and proximity sensor
-
-Required properties:
-
- - compatible: Should be one of
- "amstaos,tsl2571"
- "amstaos,tsl2671"
- "amstaos,tmd2671"
- "amstaos,tsl2771"
- "amstaos,tmd2771"
- "amstaos,tsl2572"
- "amstaos,tsl2672"
- "amstaos,tmd2672"
- "amstaos,tsl2772"
- "amstaos,tmd2772"
- "avago,apds9930"
- - reg: the I2C address of the device
-
-Optional properties:
-
- - amstaos,proximity-diodes - proximity diodes to enable. <0>, <1>, or <0 1>
- are the only valid values.
- - led-max-microamp - current for the proximity LED. Must be 100000, 50000,
- 25000, or 13000.
- - vdd-supply: phandle to the regulator that provides power to the sensor.
- - vddio-supply: phandle to the regulator that provides power to the bus.
- - interrupts: the sole interrupt generated by the device
-
- Refer to interrupt-controller/interrupts.txt for generic interrupt client
- node bindings.
-
-Example:
-
-tsl2772@39 {
- compatible = "amstaos,tsl2772";
- reg = <0x39>;
- interrupts-extended = <&msmgpio 61 IRQ_TYPE_EDGE_FALLING>;
- vdd-supply = <&pm8941_l17>;
- vddio-supply = <&pm8941_lvs1>;
- amstaos,proximity-diodes = <0>;
- led-max-microamp = <100000>;
-};
diff --git a/Documentation/devicetree/bindings/iio/light/tsl2772.yaml b/Documentation/devicetree/bindings/iio/light/tsl2772.yaml
new file mode 100644
index 000000000000..ed2c3d5eadf5
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/light/tsl2772.yaml
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/light/tsl2772.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: AMS/TAOS Ambient Light Sensor (ALS) and Proximity Detector
+
+maintainers:
+ - Brian Masney <masneyb@onstation.org>
+
+description: |
+ Ambient light sensing and proximity detection with an i2c interface.
+ https://ams.com/documents/20143/36005/TSL2772_DS000181_2-00.pdf
+
+properties:
+ compatible:
+ enum:
+ - amstaos,tsl2571
+ - amstaos,tsl2671
+ - amstaos,tmd2671
+ - amstaos,tsl2771
+ - amstaos,tmd2771
+ - amstaos,tsl2572
+ - amstaos,tsl2672
+ - amstaos,tmd2672
+ - amstaos,tsl2772
+ - amstaos,tmd2772
+ - avago,apds9930
+
+ reg:
+ maxItems: 1
+
+ amstaos,proximity-diodes:
+ description: Proximity diodes to enable
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32-array
+ - minItems: 1
+ maxItems: 2
+ items:
+ minimum: 0
+ maximum: 1
+
+ interrupts:
+ maxItems: 1
+
+ led-max-microamp:
+ description: Current for the proximity LED
+ enum:
+ - 13000
+ - 25000
+ - 50000
+ - 100000
+
+ vdd-supply:
+ description: Regulator that provides power to the sensor
+
+ vddio-supply:
+ description: Regulator that provides power to the bus
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ #include <dt-bindings/interrupt-controller/irq.h>
+
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ sensor@39 {
+ compatible = "amstaos,tsl2772";
+ reg = <0x39>;
+ interrupts-extended = <&msmgpio 61 IRQ_TYPE_EDGE_FALLING>;
+ vdd-supply = <&pm8941_l17>;
+ vddio-supply = <&pm8941_lvs1>;
+ amstaos,proximity-diodes = <0>;
+ led-max-microamp = <100000>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/input/elan_i2c.txt b/Documentation/devicetree/bindings/input/elan_i2c.txt
index 797607460735..9963247706f2 100644
--- a/Documentation/devicetree/bindings/input/elan_i2c.txt
+++ b/Documentation/devicetree/bindings/input/elan_i2c.txt
@@ -13,9 +13,20 @@ Optional properties:
pinctrl binding [1]).
- vcc-supply: a phandle for the regulator supplying 3.3V power.
- elan,trackpoint: touchpad can support a trackpoint (boolean)
+- elan,clickpad: touchpad is a clickpad (the entire surface is a button)
+- elan,middle-button: touchpad has a physical middle button
+- elan,x_traces: number of antennas on the x axis
+- elan,y_traces: number of antennas on the y axis
+- some generic touchscreen properties [2]:
+ * touchscreen-size-x
+ * touchscreen-size-y
+ * touchscreen-x-mm
+ * touchscreen-y-mm
+
[0]: Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
[1]: Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+[2]: Documentation/devicetree/bindings/input/touchscreen/touchscreen.txt
Example:
&i2c1 {
diff --git a/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt b/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt
index 496125c6bfb7..507b737612ea 100644
--- a/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt
+++ b/Documentation/devicetree/bindings/input/sun4i-lradc-keys.txt
@@ -5,6 +5,7 @@ Required properties:
- compatible: should be one of the following string:
"allwinner,sun4i-a10-lradc-keys"
"allwinner,sun8i-a83t-r-lradc"
+ "allwinner,sun50i-a64-lradc", "allwinner,sun8i-a83t-r-lradc"
- reg: mmio address range of the chip
- interrupts: interrupt to which the chip is connected
- vref-supply: powersupply for the lradc reference voltage
diff --git a/Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt b/Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt
new file mode 100644
index 000000000000..4e82fd575cec
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/amazon,al-fic.txt
@@ -0,0 +1,29 @@
+Amazon's Annapurna Labs Fabric Interrupt Controller
+
+Required properties:
+
+- compatible: should be "amazon,al-fic"
+- reg: physical base address and size of the registers
+- interrupt-controller: identifies the node as an interrupt controller
+- #interrupt-cells: must be 2.
+ First cell defines the index of the interrupt within the controller.
+ Second cell is used to specify the trigger type and must be one of the
+ following:
+ - bits[3:0] trigger type and level flags
+ 1 = low-to-high edge triggered
+ 4 = active high level-sensitive
+- interrupt-parent: specifies the parent interrupt controller.
+- interrupts: describes which input line in the interrupt parent, this
+ fic's output is connected to. This field property depends on the parent's
+ binding
+
+Example:
+
+amazon_fic: interrupt-controller@0xfd8a8500 {
+ compatible = "amazon,al-fic";
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ reg = <0x0 0xfd8a8500 0x0 0x1000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SPI 0x0 IRQ_TYPE_LEVEL_HIGH>;
+};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
index 1502a51548bb..7d531d5fff29 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/amlogic,meson-gpio-intc.txt
@@ -15,6 +15,7 @@ Required properties:
"amlogic,meson-gxbb-gpio-intc" for GXBB SoCs (S905) or
"amlogic,meson-gxl-gpio-intc" for GXL SoCs (S905X, S912)
"amlogic,meson-axg-gpio-intc" for AXG SoCs (A113D, A113X)
+ "amlogic,meson-g12a-gpio-intc" for G12A SoCs (S905D2, S905X2, S905Y2)
- reg : Specifies base physical address and size of the registers.
- interrupt-controller : Identifies the node as an interrupt controller.
- #interrupt-cells : Specifies the number of cells needed to encode an
diff --git a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt
index ab921f1698fb..e13405355166 100644
--- a/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt
+++ b/Documentation/devicetree/bindings/interrupt-controller/csky,mpintc.txt
@@ -6,11 +6,16 @@ C-SKY Multi-processors Interrupt Controller is designed for ck807/ck810/ck860
SMP soc, and it also could be used in non-SMP system.
Interrupt number definition:
-
0-15 : software irq, and we use 15 as our IPI_IRQ.
16-31 : private irq, and we use 16 as the co-processor timer.
31-1024: common irq for soc ip.
+Interrupt triger mode: (Defined in dt-bindings/interrupt-controller/irq.h)
+ IRQ_TYPE_LEVEL_HIGH (default)
+ IRQ_TYPE_LEVEL_LOW
+ IRQ_TYPE_EDGE_RISING
+ IRQ_TYPE_EDGE_FALLING
+
=============================
intc node bindings definition
=============================
@@ -26,15 +31,22 @@ intc node bindings definition
- #interrupt-cells
Usage: required
Value type: <u32>
- Definition: must be <1>
+ Definition: <2>
- interrupt-controller:
Usage: required
-Examples:
+Examples: ("interrupts = <irq_num IRQ_TYPE_XXX>")
---------
+#include <dt-bindings/interrupt-controller/irq.h>
intc: interrupt-controller {
compatible = "csky,mpintc";
- #interrupt-cells = <1>;
+ #interrupt-cells = <2>;
interrupt-controller;
};
+
+ device: device-example {
+ ...
+ interrupts = <34 IRQ_TYPE_EDGE_RISING>;
+ interrupt-parent = <&intc>;
+ };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt b/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt
new file mode 100644
index 000000000000..727b7e4cd6e0
--- /dev/null
+++ b/Documentation/devicetree/bindings/interrupt-controller/renesas,rza1-irqc.txt
@@ -0,0 +1,43 @@
+DT bindings for the Renesas RZ/A1 Interrupt Controller
+
+The RZ/A1 Interrupt Controller is a front-end for the GIC found on Renesas
+RZ/A1 and RZ/A2 SoCs:
+ - IRQ sense select for 8 external interrupts, 1:1-mapped to 8 GIC SPI
+ interrupts,
+ - NMI edge select.
+
+Required properties:
+ - compatible: Must be "renesas,<soctype>-irqc", and "renesas,rza1-irqc" as
+ fallback.
+ Examples with soctypes are:
+ - "renesas,r7s72100-irqc" (RZ/A1H)
+ - "renesas,r7s9210-irqc" (RZ/A2M)
+ - #interrupt-cells: Must be 2 (an interrupt index and flags, as defined
+ in interrupts.txt in this directory)
+ - #address-cells: Must be zero
+ - interrupt-controller: Marks the device as an interrupt controller
+ - reg: Base address and length of the memory resource used by the interrupt
+ controller
+ - interrupt-map: Specifies the mapping from external interrupts to GIC
+ interrupts
+ - interrupt-map-mask: Must be <7 0>
+
+Example:
+
+ irqc: interrupt-controller@fcfef800 {
+ compatible = "renesas,r7s72100-irqc", "renesas,rza1-irqc";
+ #interrupt-cells = <2>;
+ #address-cells = <0>;
+ interrupt-controller;
+ reg = <0xfcfef800 0x6>;
+ interrupt-map =
+ <0 0 &gic GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>,
+ <1 0 &gic GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>,
+ <2 0 &gic GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>,
+ <3 0 &gic GIC_SPI 3 IRQ_TYPE_LEVEL_HIGH>,
+ <4 0 &gic GIC_SPI 4 IRQ_TYPE_LEVEL_HIGH>,
+ <5 0 &gic GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>,
+ <6 0 &gic GIC_SPI 6 IRQ_TYPE_LEVEL_HIGH>,
+ <7 0 &gic GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-map-mask = <7 0>;
+ };
diff --git a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt
index 3538a214fff1..352f5e9c759b 100644
--- a/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt
+++ b/Documentation/devicetree/bindings/ipmi/npcm7xx-kcs-bmc.txt
@@ -36,4 +36,4 @@ Example:
kcs_chan = <2>;
status = "disabled";
};
- }; \ No newline at end of file
+ };
diff --git a/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml b/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml
index 4d61fe0a98a4..dc129d9a329e 100644
--- a/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml
+++ b/Documentation/devicetree/bindings/leds/backlight/lm3630a-backlight.yaml
@@ -23,16 +23,17 @@ properties:
reg:
maxItems: 1
- ti,linear-mapping-mode:
- description: |
- Enable linear mapping mode. If disabled, then it will use exponential
- mapping mode in which the ramp up/down appears to have a more uniform
- transition to the human eye.
- type: boolean
+ '#address-cells':
+ const: 1
+
+ '#size-cells':
+ const: 0
required:
- compatible
- reg
+ - '#address-cells'
+ - '#size-cells'
patternProperties:
"^led@[01]$":
@@ -48,7 +49,6 @@ patternProperties:
in this property. The two current sinks can be controlled
independently with both banks, or bank A can be configured to control
both sinks with the led-sources property.
- maxItems: 1
minimum: 0
maximum: 1
@@ -73,6 +73,13 @@ patternProperties:
minimum: 0
maximum: 255
+ ti,linear-mapping-mode:
+ description: |
+ Enable linear mapping mode. If disabled, then it will use exponential
+ mapping mode in which the ramp up/down appears to have a more uniform
+ transition to the human eye.
+ type: boolean
+
required:
- reg
diff --git a/Documentation/devicetree/bindings/leds/leds-lm36274.txt b/Documentation/devicetree/bindings/leds/leds-lm36274.txt
new file mode 100644
index 000000000000..39c230d59a4d
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-lm36274.txt
@@ -0,0 +1,85 @@
+* Texas Instruments LM36274 4-Channel LCD Backlight Driver w/Integrated Bias
+
+The LM36274 is an integrated four-channel WLED driver and LCD bias supply.
+The backlight boost provides the power to bias four parallel LED strings with
+up to 29V total output voltage. The 11-bit LED current is programmable via
+the I2C bus and/or controlled via a logic level PWM input from 60 uA to 30 mA.
+
+Parent device properties are documented in
+Documentation/devicetree/bindings/mfd/ti-lmu.txt
+
+Regulator properties are documented in
+Documentation/devicetree/bindings/regulator/lm363x-regulator.txt
+
+Required backlight properties:
+ - compatible:
+ "ti,lm36274-backlight"
+ - reg : 0
+ - #address-cells : 1
+ - #size-cells : 0
+ - led-sources : Indicates which LED strings will be enabled.
+ Values from 0-3, sources is 0 based so strings will be
+ source value + 1.
+
+Optional backlight properties:
+ - label : see Documentation/devicetree/bindings/leds/common.txt
+ - linux,default-trigger :
+ see Documentation/devicetree/bindings/leds/common.txt
+
+Example:
+
+HVLED string 1 and 3 are controlled by control bank A and HVLED 2 string is
+controlled by control bank B.
+
+lm36274@11 {
+ compatible = "ti,lm36274";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0x11>;
+
+ enable-gpios = <&gpio1 28 GPIO_ACTIVE_HIGH>;
+
+ regulators {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "ti,lm363x-regulator";
+
+ enable-gpios = <&pioC 0 GPIO_ACTIVE_HIGH>,
+ <&pioC 1 GPIO_ACTIVE_HIGH>;
+
+ vboost {
+ regulator-name = "lcd_boost";
+ regulator-min-microvolt = <4000000>;
+ regulator-max-microvolt = <7150000>;
+ regulator-always-on;
+ };
+
+ vpos {
+ regulator-name = "lcd_vpos";
+ regulator-min-microvolt = <4000000>;
+ regulator-max-microvolt = <6500000>;
+ };
+
+ vneg {
+ regulator-name = "lcd_vneg";
+ regulator-min-microvolt = <4000000>;
+ regulator-max-microvolt = <6500000>;
+ };
+ };
+
+ backlight {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "ti,lm36274-backlight";
+
+ led@0 {
+ reg = <0>;
+ led-sources = <0 2>;
+ label = "white:backlight_cluster";
+ linux,default-trigger = "backlight";
+ };
+ };
+};
+
+For more product information please see the link below:
+http://www.ti.com/lit/ds/symlink/lm36274.pdf
diff --git a/Documentation/devicetree/bindings/leds/leds-lm3697.txt b/Documentation/devicetree/bindings/leds/leds-lm3697.txt
new file mode 100644
index 000000000000..63992d732959
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-lm3697.txt
@@ -0,0 +1,73 @@
+* Texas Instruments - LM3697 Highly Efficient White LED Driver
+
+The LM3697 11-bit LED driver provides high-
+performance backlight dimming for 1, 2, or 3 series
+LED strings while delivering up to 90% efficiency.
+
+This device is suitable for display and keypad lighting
+
+Required properties:
+ - compatible:
+ "ti,lm3697"
+ - reg : I2C slave address
+ - #address-cells : 1
+ - #size-cells : 0
+
+Optional properties:
+ - enable-gpios : GPIO pin to enable/disable the device
+ - vled-supply : LED supply
+
+Required child properties:
+ - reg : 0 - LED is Controlled by bank A
+ 1 - LED is Controlled by bank B
+ - led-sources : Indicates which HVLED string is associated to which
+ control bank. This is a zero based property so
+ HVLED1 = 0, HVLED2 = 1, HVLED3 = 2.
+ Additional information is contained
+ in Documentation/devicetree/bindings/leds/common.txt
+
+Optional child properties:
+ - ti,brightness-resolution - see Documentation/devicetree/bindings/mfd/ti-lmu.txt
+ - ramp-up-us: see Documentation/devicetree/bindings/mfd/ti-lmu.txt
+ - ramp-down-us: see Documentation/devicetree/bindings/mfd/ti-lmu.txt
+ - label : see Documentation/devicetree/bindings/leds/common.txt
+ - linux,default-trigger :
+ see Documentation/devicetree/bindings/leds/common.txt
+
+Example:
+
+HVLED string 1 and 3 are controlled by control bank A and HVLED 2 string is
+controlled by control bank B.
+
+led-controller@36 {
+ compatible = "ti,lm3697";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0x36>;
+
+ enable-gpios = <&gpio1 28 GPIO_ACTIVE_HIGH>;
+ vled-supply = <&vbatt>;
+
+ led@0 {
+ reg = <0>;
+ led-sources = <0 2>;
+ ti,brightness-resolution = <2047>;
+ ramp-up-us = <5000>;
+ ramp-down-us = <1000>;
+ label = "white:first_backlight_cluster";
+ linux,default-trigger = "backlight";
+ };
+
+ led@1 {
+ reg = <1>;
+ led-sources = <1>;
+ ti,brightness-resolution = <255>;
+ ramp-up-us = <500>;
+ ramp-down-us = <1000>;
+ label = "white:second_backlight_cluster";
+ linux,default-trigger = "backlight";
+ };
+}
+
+For more product information please see the link below:
+http://www.ti.com/lit/ds/symlink/lm3697.pdf
diff --git a/Documentation/devicetree/bindings/leds/leds-spi-byte.txt b/Documentation/devicetree/bindings/leds/leds-spi-byte.txt
new file mode 100644
index 000000000000..28b6b2d9091e
--- /dev/null
+++ b/Documentation/devicetree/bindings/leds/leds-spi-byte.txt
@@ -0,0 +1,44 @@
+* Single Byte SPI LED Device Driver.
+
+The driver can be used for controllers with a very simple SPI protocol:
+- one LED is controlled by a single byte on MOSI
+- the value of the byte gives the brightness between two values (lowest to
+ highest)
+- no return value is necessary (no MISO signal)
+
+The value for lowest and highest brightness is dependent on the device and
+therefore on the compatible string.
+
+Depending on the compatible string some special functions (like hardware
+accelerated blinking) might can be supported too.
+
+The driver currently only supports one LED. The properties of the LED are
+configured in a sub-node in the device node.
+
+Required properties:
+- compatible: should be one of
+ * "ubnt,acb-spi-led" microcontroller (SONiX 8F26E611LA) based device
+ used for example in Ubiquiti airCube ISP
+
+Property rules described in Documentation/devicetree/bindings/spi/spi-bus.txt
+apply.
+
+LED sub-node properties:
+- label:
+ see Documentation/devicetree/bindings/leds/common.txt
+- default-state:
+ see Documentation/devicetree/bindings/leds/common.txt
+ Only "on" and "off" are supported.
+
+Example:
+
+led-controller@0 {
+ compatible = "ubnt,acb-spi-led";
+ reg = <0>;
+ spi-max-frequency = <100000>;
+
+ led {
+ label = "white:status";
+ default-state = "on";
+ };
+};
diff --git a/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt b/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt
index 0ef372656a3e..35c3f56b7f7b 100644
--- a/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt
+++ b/Documentation/devicetree/bindings/mailbox/omap-mailbox.txt
@@ -1,4 +1,4 @@
-OMAP2+ Mailbox Driver
+OMAP2+ and K3 Mailbox
=====================
The OMAP mailbox hardware facilitates communication between different processors
@@ -7,7 +7,7 @@ various processor subsystems and is connected on an interconnect bus. The
communication is achieved through a set of registers for message storage and
interrupt configuration registers.
-Each mailbox IP block has a certain number of h/w fifo queues and output
+Each mailbox IP block/cluster has a certain number of h/w fifo queues and output
interrupt lines. An output interrupt line is routed to an interrupt controller
within a processor subsystem, and there can be more than one line going to a
specific processor's interrupt controller. The interrupt line connections are
@@ -23,12 +23,16 @@ All the current OMAP SoCs except for the newest DRA7xx SoC has a single IP
instance. DRA7xx has multiple instances with different number of h/w fifo queues
and interrupt lines between different instances. The interrupt lines can also be
routed to different processor sub-systems on DRA7xx as they are routed through
-the Crossbar, a kind of interrupt router/multiplexer.
+the Crossbar, a kind of interrupt router/multiplexer. The K3 AM65x and J721E
+SoCs has each of these instances form a cluster and combine multiple clusters
+into a single IP block present within the Main NavSS. The interrupt lines from
+all these clusters are multiplexed and routed to different processor subsystems
+over a limited number of common interrupt output lines of an Interrupt Router.
Mailbox Device Node:
====================
-A Mailbox device node is used to represent a Mailbox IP instance within a SoC.
-The sub-mailboxes are represented as child nodes of this parent node.
+A Mailbox device node is used to represent a Mailbox IP instance/cluster within
+a SoC. The sub-mailboxes are represented as child nodes of this parent node.
Required properties:
--------------------
@@ -37,12 +41,12 @@ Required properties:
"ti,omap3-mailbox" for OMAP3430, OMAP3630 SoCs
"ti,omap4-mailbox" for OMAP44xx, OMAP54xx, AM33xx,
AM43xx and DRA7xx SoCs
+ "ti,am654-mailbox" for K3 AM65x and J721E SoCs
- reg: Contains the mailbox register address range (base
address and length)
- interrupts: Contains the interrupt information for the mailbox
device. The format is dependent on which interrupt
- controller the OMAP device uses
-- ti,hwmods: Name of the hwmod associated with the mailbox
+ controller the Mailbox device uses
- #mbox-cells: Common mailbox binding property to identify the number
of cells required for the mailbox specifier. Should be
1
@@ -50,6 +54,23 @@ Required properties:
device can interrupt
- ti,mbox-num-fifos: Number of h/w fifo queues within the mailbox IP block
+SoC-specific Required properties:
+---------------------------------
+The following are mandatory properties for the OMAP architecture based SoCs
+only:
+- ti,hwmods: Name of the hwmod associated with the mailbox. This
+ should be defined in the mailbox node only if the node
+ is not defined as a child node of a corresponding sysc
+ interconnect node.
+
+The following are mandatory properties for the K3 AM65x and J721E SoCs only:
+- interrupt-parent: Should contain a phandle to the TI-SCI interrupt
+ controller node that is used to dynamically program
+ the interrupt routes between the IP and the main GIC
+ controllers. See the following binding for additional
+ details,
+ Documentation/devicetree/bindings/interrupt-controller/ti,sci-intr.txt
+
Child Nodes:
============
A child node is used for representing the actual sub-mailbox device that is
@@ -98,7 +119,7 @@ to be used by the client user.
Example:
--------
-/* OMAP4 */
+1. /* OMAP4 */
mailbox: mailbox@4a0f4000 {
compatible = "ti,omap4-mailbox";
reg = <0x4a0f4000 0x200>;
@@ -123,7 +144,7 @@ dsp {
...
};
-/* AM33xx */
+2. /* AM33xx */
mailbox: mailbox@480c8000 {
compatible = "ti,omap4-mailbox";
reg = <0x480C8000 0x200>;
@@ -137,3 +158,23 @@ mailbox: mailbox@480c8000 {
ti,mbox-rx = <0 0 3>;
};
};
+
+3. /* AM65x */
+&cbass_main {
+ cbass_main_navss: interconnect0 {
+ mailbox0_cluster0: mailbox@31f80000 {
+ compatible = "ti,am654-mailbox";
+ reg = <0x00 0x31f80000 0x00 0x200>;
+ #mbox-cells = <1>;
+ ti,mbox-num-users = <4>;
+ ti,mbox-num-fifos = <16>;
+ interrupt-parent = <&intr_main_navss>;
+ interrupts = <164 0>;
+
+ mbox_mcu_r5fss0_core0: mbox-mcu-r5fss0-core0 {
+ ti,mbox-tx = <1 0 0>;
+ ti,mbox-rx = <0 0 0>;
+ };
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/media/allegro.txt b/Documentation/devicetree/bindings/media/allegro.txt
new file mode 100644
index 000000000000..a92e2fbf26c9
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/allegro.txt
@@ -0,0 +1,43 @@
+Device-tree bindings for the Allegro DVT video IP codecs present in the Xilinx
+ZynqMP SoC. The IP core may either be a H.264/H.265 encoder or H.264/H.265
+decoder ip core.
+
+Each actual codec engines is controlled by a microcontroller (MCU). Host
+software uses a provided mailbox interface to communicate with the MCU. The
+MCU share an interrupt.
+
+Required properties:
+ - compatible: value should be one of the following
+ "allegro,al5e-1.1", "allegro,al5e": encoder IP core
+ "allegro,al5d-1.1", "allegro,al5d": decoder IP core
+ - reg: base and length of the memory mapped register region and base and
+ length of the memory mapped sram
+ - reg-names: must include "regs" and "sram"
+ - interrupts: shared interrupt from the MCUs to the processing system
+ - clocks: must contain an entry for each entry in clock-names
+ - clock-names: must include "core_clk", "mcu_clk", "m_axi_core_aclk",
+ "m_axi_mcu_aclk", "s_axi_lite_aclk"
+
+Example:
+ al5e: video-codec@a0009000 {
+ compatible = "allegro,al5e-1.1", "allegro,al5e";
+ reg = <0 0xa0009000 0 0x1000>,
+ <0 0xa0000000 0 0x8000>;
+ reg-names = "regs", "sram";
+ interrupts = <0 96 4>;
+ clocks = <&xlnx_vcu 0>, <&xlnx_vcu 1>,
+ <&clkc 71>, <&clkc 71>, <&clkc 71>;
+ clock-names = "core_clk", "mcu_clk", "m_axi_core_aclk",
+ "m_axi_mcu_aclk", "s_axi_lite_aclk"
+ };
+ al5d: video-codec@a0029000 {
+ compatible = "allegro,al5d-1.1", "allegro,al5d";
+ reg = <0 0xa0029000 0 0x1000>,
+ <0 0xa0020000 0 0x8000>;
+ reg-names = "regs", "sram";
+ interrupts = <0 96 4>;
+ clocks = <&xlnx_vcu 2>, <&xlnx_vcu 3>,
+ <&clkc 71>, <&clkc 71>, <&clkc 71>;
+ clock-names = "core_clk", "mcu_clk", "m_axi_core_aclk",
+ "m_axi_mcu_aclk", "s_axi_lite_aclk"
+ };
diff --git a/Documentation/devicetree/bindings/media/amlogic,vdec.txt b/Documentation/devicetree/bindings/media/amlogic,vdec.txt
new file mode 100644
index 000000000000..aabdd01bcf32
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/amlogic,vdec.txt
@@ -0,0 +1,71 @@
+Amlogic Video Decoder
+================================
+
+The video decoding IP lies within the DOS memory region,
+except for the hardware bitstream parser that makes use of an undocumented
+region.
+
+It makes use of the following blocks:
+
+- ESPARSER is a bitstream parser that outputs to a VIFIFO. Further VDEC blocks
+then feed from this VIFIFO.
+- VDEC_1 can decode MPEG-1, MPEG-2, MPEG-4 part 2, MJPEG, H.263, H.264, VC-1.
+- VDEC_HEVC can decode HEVC and VP9.
+
+Both VDEC_1 and VDEC_HEVC share the "vdec" IRQ and as such cannot run
+concurrently.
+
+Device Tree Bindings:
+---------------------
+
+VDEC: Video Decoder
+--------------------------
+
+Required properties:
+- compatible: value should be different for each SoC family as :
+ - GXBB (S905) : "amlogic,gxbb-vdec"
+ - GXL (S905X, S905D) : "amlogic,gxl-vdec"
+ - GXM (S912) : "amlogic,gxm-vdec"
+- reg: base address and size of he following memory-mapped regions :
+ - dos
+ - esparser
+- reg-names: should contain the names of the previous memory regions
+- interrupts: should contain the following IRQs:
+ - vdec
+ - esparser
+- interrupt-names: should contain the names of the previous interrupts
+- amlogic,ao-sysctrl: should point to the AOBUS sysctrl node
+- amlogic,canvas: should point to a canvas provider node
+- clocks: should contain the following clocks :
+ - dos_parser
+ - dos
+ - vdec_1
+ - vdec_hevc
+- clock-names: should contain the names of the previous clocks
+- resets: should contain the parser reset
+- reset-names: should be "esparser"
+
+Example:
+
+vdec: video-decoder@c8820000 {
+ compatible = "amlogic,gxbb-vdec";
+ reg = <0x0 0xc8820000 0x0 0x10000>,
+ <0x0 0xc110a580 0x0 0xe4>;
+ reg-names = "dos", "esparser";
+
+ interrupts = <GIC_SPI 44 IRQ_TYPE_EDGE_RISING>,
+ <GIC_SPI 32 IRQ_TYPE_EDGE_RISING>;
+ interrupt-names = "vdec", "esparser";
+
+ amlogic,ao-sysctrl = <&sysctrl_AO>;
+ amlogic,canvas = <&canvas>;
+
+ clocks = <&clkc CLKID_DOS_PARSER>,
+ <&clkc CLKID_DOS>,
+ <&clkc CLKID_VDEC_1>,
+ <&clkc CLKID_VDEC_HEVC>;
+ clock-names = "dos_parser", "dos", "vdec_1", "vdec_hevc";
+
+ resets = <&reset RESET_PARSER>;
+ reset-names = "esparser";
+};
diff --git a/Documentation/devicetree/bindings/media/imx7-csi.txt b/Documentation/devicetree/bindings/media/imx7-csi.txt
index 3c07bc676bc3..443aef07356e 100644
--- a/Documentation/devicetree/bindings/media/imx7-csi.txt
+++ b/Documentation/devicetree/bindings/media/imx7-csi.txt
@@ -14,8 +14,7 @@ Required properties:
- interrupts : should contain CSI interrupt;
- clocks : list of clock specifiers, see
Documentation/devicetree/bindings/clock/clock-bindings.txt for details;
-- clock-names : must contain "axi", "mclk" and "dcic" entries, matching
- entries in the clock property;
+- clock-names : must contain "mclk";
The device node shall contain one 'port' child node with one child 'endpoint'
node, according to the bindings defined in:
@@ -32,10 +31,8 @@ example:
compatible = "fsl,imx7-csi";
reg = <0x30710000 0x10000>;
interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&clks IMX7D_CLK_DUMMY>,
- <&clks IMX7D_CSI_MCLK_ROOT_CLK>,
- <&clks IMX7D_CLK_DUMMY>;
- clock-names = "axi", "mclk", "dcic";
+ clocks = <&clks IMX7D_CSI_MCLK_ROOT_CLK>;
+ clock-names = "mclk";
port {
csi_from_csi_mux: endpoint {
diff --git a/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt b/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt
new file mode 100644
index 000000000000..7ec2c8c8a3b9
--- /dev/null
+++ b/Documentation/devicetree/bindings/media/marvell,mmp2-ccic.txt
@@ -0,0 +1,50 @@
+Marvell MMP2 camera host interface
+
+Required properties:
+ - compatible: Should be "marvell,mmp2-ccic".
+ - reg: Register base and size.
+ - interrupts: The interrupt number.
+ - #clock-cells: Must be 0.
+
+Optional properties:
+ - clocks: Reference to the input clock as specified by
+ Documentation/devicetree/bindings/clock/clock-bindings.txt.
+ - clock-names: Names of the clocks used; "axi" for the AXI bus interface,
+ "func" for the peripheral clock and "phy" for the parallel
+ video bus interface.
+ - clock-output-names: Optional clock source for sensors. Shall be "mclk".
+
+Required subnodes:
+ - port: The parallel bus interface port with a single endpoint linked to
+ the sensor's endpoint as described in
+ Documentation/devicetree/bindings/media/video-interfaces.txt.
+
+Required endpoint properties:
+ - bus-type: data bus type, <5> or <6> for Parallel or Bt.656 respectively
+ - pclk-sample: pixel clock polarity
+ - hsync-active: horizontal synchronization polarity (only required for
+ parallel bus)
+ - vsync-active: vertical synchronization polarity (only required for
+ parallel bus)
+
+Example:
+
+ camera0: camera@d420a000 {
+ compatible = "marvell,mmp2-ccic";
+ reg = <0xd420a000 0x800>;
+ interrupts = <42>;
+ clocks = <&soc_clocks MMP2_CLK_CCIC0>;
+ clock-names = "axi";
+ #clock-cells = <0>;
+ clock-output-names = "mclk";
+
+ port {
+ camera0_0: endpoint {
+ remote-endpoint = <&ov7670_0>;
+ bus-type = <5>; /* Parallel */
+ hsync-active = <1>; /* Active high */
+ vsync-active = <1>; /* Active high */
+ pclk-sample = <0>; /* Falling */
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt b/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt
index 249790a93017..3122ded82eb4 100644
--- a/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt
+++ b/Documentation/devicetree/bindings/media/st,stm32-dcmi.txt
@@ -11,7 +11,7 @@ Required properties:
- clock-names: must contain "mclk", which is the DCMI peripherial clock
- pinctrl: the pincontrol settings to configure muxing properly
for pins that connect to DCMI device.
- See Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt.
+ See Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml.
- dmas: phandle to DMA controller node,
see Documentation/devicetree/bindings/dma/stm32-dma.txt
- dma-names: must contain "tx", which is the transmit channel from DCMI to DMA
diff --git a/Documentation/devicetree/bindings/media/sun6i-csi.txt b/Documentation/devicetree/bindings/media/sun6i-csi.txt
index 0dd540bb03db..a2e3e56f0257 100644
--- a/Documentation/devicetree/bindings/media/sun6i-csi.txt
+++ b/Documentation/devicetree/bindings/media/sun6i-csi.txt
@@ -6,6 +6,7 @@ Allwinner V3s SoC features a CSI module(CSI1) with parallel interface.
Required properties:
- compatible: value must be one of:
* "allwinner,sun6i-a31-csi"
+ * "allwinner,sun8i-a83t-csi"
* "allwinner,sun8i-h3-csi"
* "allwinner,sun8i-v3s-csi"
* "allwinner,sun50i-a64-csi"
diff --git a/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt b/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt
index f936b5589b19..59b8dcc118ee 100644
--- a/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt
+++ b/Documentation/devicetree/bindings/memory-controllers/ingenic,jz4780-nemc.txt
@@ -5,6 +5,7 @@ controller in Ingenic JZ4780
Required properties:
- compatible: Should be set to one of:
+ "ingenic,jz4740-nemc" (JZ4740)
"ingenic,jz4780-nemc" (JZ4780)
- reg: Should specify the NEMC controller registers location and length.
- clocks: Clock for the NEMC controller.
diff --git a/Documentation/devicetree/bindings/mfd/atmel-usart.txt b/Documentation/devicetree/bindings/mfd/atmel-usart.txt
index 7f0cd72f47d2..699fd3c9ace8 100644
--- a/Documentation/devicetree/bindings/mfd/atmel-usart.txt
+++ b/Documentation/devicetree/bindings/mfd/atmel-usart.txt
@@ -17,17 +17,24 @@ Required properties for USART in SPI mode:
- cs-gpios: chipselects (internal cs not supported)
- atmel,usart-mode : Must be <AT91_USART_MODE_SPI> (found in dt-bindings/mfd/at91-usart.h)
+Optional properties in serial and SPI mode:
+- dma bindings for dma transfer:
+ - dmas: DMA specifier, consisting of a phandle to DMA controller node,
+ memory peripheral interface and USART DMA channel ID, FIFO configuration.
+ The order of DMA channels is fixed. The first DMA channel must be TX
+ associated channel and the second one must be RX associated channel.
+ Refer to dma.txt and atmel-dma.txt for details.
+ - dma-names: "tx" for TX channel.
+ "rx" for RX channel.
+ The order of dma-names is also fixed. The first name must be "tx"
+ and the second one must be "rx" as in the examples below.
+
Optional properties in serial mode:
- atmel,use-dma-rx: use of PDC or DMA for receiving data
- atmel,use-dma-tx: use of PDC or DMA for transmitting data
- {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD line respectively.
It will use specified PIO instead of the peripheral function pin for the USART feature.
If unsure, don't specify this property.
-- add dma bindings for dma transfer:
- - dmas: DMA specifier, consisting of a phandle to DMA controller node,
- memory peripheral interface and USART DMA channel ID, FIFO configuration.
- Refer to dma.txt and atmel-dma.txt for details.
- - dma-names: "rx" for RX channel, "tx" for TX channel.
- atmel,fifo-size: maximum number of data the RX and TX FIFOs can store for FIFO
capable USARTs.
- rs485-rts-delay, rs485-rx-during-tx, linux,rs485-enabled-at-boot-time: see rs485.txt
@@ -81,5 +88,8 @@ Example:
interrupts = <12 IRQ_TYPE_LEVEL_HIGH 5>;
clocks = <&usart0_clk>;
clock-names = "usart";
+ dmas = <&dma0 2 AT91_DMA_CFG_PER_ID(3)>,
+ <&dma0 2 (AT91_DMA_CFG_PER_ID(4) | AT91_DMA_CFG_FIFOCFG_ASAP)>;
+ dma-names = "tx", "rx";
cs-gpios = <&pioB 3 0>;
};
diff --git a/Documentation/devicetree/bindings/mfd/cros-ec.txt b/Documentation/devicetree/bindings/mfd/cros-ec.txt
index 6245c9b1a68b..4860eabd0f72 100644
--- a/Documentation/devicetree/bindings/mfd/cros-ec.txt
+++ b/Documentation/devicetree/bindings/mfd/cros-ec.txt
@@ -3,7 +3,7 @@ ChromeOS Embedded Controller
Google's ChromeOS EC is a Cortex-M device which talks to the AP and
implements various function such as keyboard and battery charging.
-The EC can be connect through various means (I2C, SPI, LPC) and the
+The EC can be connect through various means (I2C, SPI, LPC, RPMSG) and the
compatible string used depends on the interface. Each connection method has
its own driver which connects to the top level interface-agnostic EC driver.
Other Linux driver (such as cros-ec-keyb for the matrix keyboard) connect to
@@ -17,6 +17,9 @@ Required properties (SPI):
- compatible: "google,cros-ec-spi"
- reg: SPI chip select
+Required properties (RPMSG):
+- compatible: "google,cros-ec-rpmsg"
+
Optional properties (SPI):
- google,cros-ec-spi-pre-delay: Some implementations of the EC need a little
time to wake up from sleep before they can receive SPI transfers at a high
diff --git a/Documentation/devicetree/bindings/mfd/lp87565.txt b/Documentation/devicetree/bindings/mfd/lp87565.txt
index a48df7c08ab0..41671e0dc26b 100644
--- a/Documentation/devicetree/bindings/mfd/lp87565.txt
+++ b/Documentation/devicetree/bindings/mfd/lp87565.txt
@@ -41,3 +41,39 @@ lp87565_pmic: pmic@60 {
};
};
};
+
+TI LP87561 PMIC:
+
+This is a single output 4-phase regulator configuration
+
+Required properties:
+ - compatible: "ti,lp87561-q1"
+ - reg: I2C slave address.
+ - gpio-controller: Marks the device node as a GPIO Controller.
+ - #gpio-cells: Should be two. The first cell is the pin number and
+ the second cell is used to specify flags.
+ See ../gpio/gpio.txt for more information.
+ - xxx-in-supply: Phandle to parent supply node of each regulator
+ populated under regulators node. xxx should match
+ the supply_name populated in driver.
+Example:
+
+lp87561_pmic: pmic@62 {
+ compatible = "ti,lp87561-q1";
+ reg = <0x62>;
+ gpio-controller;
+ #gpio-cells = <2>;
+
+ buck3210-in-supply = <&vsys_3v3>;
+
+ regulators: regulators {
+ buck3210_reg: buck3210 {
+ /* VDD_CORE */
+ regulator-name = "buck3210";
+ regulator-min-microvolt = <800000>;
+ regulator-max-microvolt = <800000>;
+ regulator-always-on;
+ regulator-boot-on;
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/mfd/madera.txt b/Documentation/devicetree/bindings/mfd/madera.txt
index db3266088386..cad0f2800502 100644
--- a/Documentation/devicetree/bindings/mfd/madera.txt
+++ b/Documentation/devicetree/bindings/mfd/madera.txt
@@ -11,10 +11,14 @@ bindings/sound/madera.txt
Required properties:
- compatible : One of the following chip-specific strings:
+ "cirrus,cs47l15"
"cirrus,cs47l35"
"cirrus,cs47l85"
"cirrus,cs47l90"
"cirrus,cs47l91"
+ "cirrus,cs42l92"
+ "cirrus,cs47l92"
+ "cirrus,cs47l93"
"cirrus,wm1840"
- reg : I2C slave address when connected using I2C, chip select number when
@@ -22,7 +26,7 @@ Required properties:
- DCVDD-supply : Power supply for the device as defined in
bindings/regulator/regulator.txt
- Mandatory on CS47L35, CS47L90, CS47L91
+ Mandatory on CS47L15, CS47L35, CS47L90, CS47L91, CS42L92, CS47L92, CS47L93
Optional on CS47L85, WM1840
- AVDD-supply, DBVDD1-supply, DBVDD2-supply, CPVDD1-supply, CPVDD2-supply :
@@ -35,7 +39,7 @@ Required properties:
(CS47L85, WM1840)
- SPKVDD-supply : Power supply for the device
- (CS47L35)
+ (CS47L15, CS47L35)
- interrupt-controller : Indicates that this device is an interrupt controller
diff --git a/Documentation/devicetree/bindings/mfd/rk808.txt b/Documentation/devicetree/bindings/mfd/rk808.txt
index 1683ec3245bc..04df07f6f793 100644
--- a/Documentation/devicetree/bindings/mfd/rk808.txt
+++ b/Documentation/devicetree/bindings/mfd/rk808.txt
@@ -3,11 +3,15 @@ RK8XX Power Management Integrated Circuit
The rk8xx family current members:
rk805
rk808
+rk809
+rk817
rk818
Required properties:
- compatible: "rockchip,rk805"
- compatible: "rockchip,rk808"
+- compatible: "rockchip,rk809"
+- compatible: "rockchip,rk817"
- compatible: "rockchip,rk818"
- reg: I2C slave address
- interrupts: the interrupt outputs of the controller.
@@ -45,6 +49,23 @@ Optional RK808 properties:
the gpio controller. If DVS GPIOs aren't present, voltage changes will happen
very quickly with no slow ramp time.
+Optional shared RK809 and RK817 properties:
+- vcc1-supply: The input supply for DCDC_REG1
+- vcc2-supply: The input supply for DCDC_REG2
+- vcc3-supply: The input supply for DCDC_REG3
+- vcc4-supply: The input supply for DCDC_REG4
+- vcc5-supply: The input supply for LDO_REG1, LDO_REG2, LDO_REG3
+- vcc6-supply: The input supply for LDO_REG4, LDO_REG5, LDO_REG6
+- vcc7-supply: The input supply for LDO_REG7, LDO_REG8, LDO_REG9
+
+Optional RK809 properties:
+- vcc8-supply: The input supply for SWITCH_REG1
+- vcc9-supply: The input supply for DCDC_REG5, SWITCH_REG2
+
+Optional RK817 properties:
+- vcc8-supply: The input supply for BOOST
+- vcc9-supply: The input supply for OTG_SWITCH
+
Optional RK818 properties:
- vcc1-supply: The input supply for DCDC_REG1
- vcc2-supply: The input supply for DCDC_REG2
@@ -86,6 +107,21 @@ number as described in RK808 datasheet.
- SWITCH_REGn
- valid values for n are 1 to 2
+Following regulators of the RK809 and RK817 PMIC blocks are supported. Note that
+the 'n' in regulator name, as in DCDC_REGn or LDOn, represents the DCDC or LDO
+number as described in RK809 and RK817 datasheets.
+
+ - DCDC_REGn
+ - valid values for n are 1 to 5 for RK809.
+ - valid values for n are 1 to 4 for RK817.
+ - LDO_REGn
+ - valid values for n are 1 to 9 for RK809.
+ - valid values for n are 1 to 9 for RK817.
+ - SWITCH_REGn
+ - valid values for n are 1 to 2 for RK809.
+ - BOOST for RK817
+ - OTG_SWITCH for RK817
+
Following regulators of the RK818 PMIC block are supported. Note that
the 'n' in regulator name, as in DCDC_REGn or LDOn, represents the DCDC or LDO
number as described in RK818 datasheet.
@@ -98,6 +134,14 @@ number as described in RK818 datasheet.
- HDMI_SWITCH
- OTG_SWITCH
+It is necessary to configure three pins for both the RK809 and RK817, the three
+pins are "gpio_ts" "gpio_gt" "gpio_slp".
+ The gpio_gt and gpio_ts pins support the gpio function.
+ The gpio_slp pin is for controlling the pmic states, as below:
+ - reset
+ - power down
+ - sleep
+
Standard regulator bindings are used inside regulator subnodes. Check
Documentation/devicetree/bindings/regulator/regulator.txt
for more details
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt b/Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt
new file mode 100644
index 000000000000..c3c02ce73cde
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/rohm,bd70528-pmic.txt
@@ -0,0 +1,102 @@
+* ROHM BD70528 Power Management Integrated Circuit bindings
+
+BD70528MWV is an ultra-low quiescent current general purpose, single-chip,
+power management IC for battery-powered portable devices. The IC
+integrates 3 ultra-low current consumption buck converters, 3 LDOs and 2
+LED Drivers. Also included are 4 GPIOs, a real-time clock (RTC), a 32kHz
+clock gate, high-accuracy VREF for use with an external ADC, flexible
+dual-input power path, 10 bit SAR ADC for battery temperature monitor and
+1S battery charger with scalable charge currents.
+
+Required properties:
+ - compatible : Should be "rohm,bd70528"
+ - reg : I2C slave address.
+ - interrupts : The interrupt line the device is connected to.
+ - interrupt-controller : To indicate BD70528 acts as an interrupt controller.
+ - #interrupt-cells : Should be 2. Usage is compliant to the 2 cells
+ variant of ../interrupt-controller/interrupts.txt
+ - gpio-controller : To indicate BD70528 acts as a GPIO controller.
+ - #gpio-cells : Should be 2. The first cell is the pin number and
+ the second cell is used to specify flags. See
+ ../gpio/gpio.txt for more information.
+ - #clock-cells : Should be 0.
+ - regulators: : List of child nodes that specify the regulators.
+ Please see ../regulator/rohm,bd70528-regulator.txt
+
+Optional properties:
+ - clock-output-names : Should contain name for output clock.
+
+Example:
+/* External oscillator */
+osc: oscillator {
+ compatible = "fixed-clock";
+ #clock-cells = <1>;
+ clock-frequency = <32768>;
+ clock-output-names = "osc";
+};
+
+pmic: pmic@4b {
+ compatible = "rohm,bd70528";
+ reg = <0x4b>;
+ interrupt-parent = <&gpio1>;
+ interrupts = <29 GPIO_ACTIVE_LOW>;
+ clocks = <&osc 0>;
+ #clock-cells = <0>;
+ clock-output-names = "bd70528-32k-out";
+ #gpio-cells = <2>;
+ gpio-controller;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+
+ regulators {
+ buck1: BUCK1 {
+ regulator-name = "buck1";
+ regulator-min-microvolt = <1200000>;
+ regulator-max-microvolt = <3400000>;
+ regulator-boot-on;
+ regulator-ramp-delay = <125>;
+ };
+ buck2: BUCK2 {
+ regulator-name = "buck2";
+ regulator-min-microvolt = <1200000>;
+ regulator-max-microvolt = <3300000>;
+ regulator-boot-on;
+ regulator-ramp-delay = <125>;
+ };
+ buck3: BUCK3 {
+ regulator-name = "buck3";
+ regulator-min-microvolt = <800000>;
+ regulator-max-microvolt = <1800000>;
+ regulator-boot-on;
+ regulator-ramp-delay = <250>;
+ };
+ ldo1: LDO1 {
+ regulator-name = "ldo1";
+ regulator-min-microvolt = <1650000>;
+ regulator-max-microvolt = <3300000>;
+ regulator-boot-on;
+ };
+ ldo2: LDO2 {
+ regulator-name = "ldo2";
+ regulator-min-microvolt = <1650000>;
+ regulator-max-microvolt = <3300000>;
+ regulator-boot-on;
+ };
+
+ ldo3: LDO3 {
+ regulator-name = "ldo3";
+ regulator-min-microvolt = <1650000>;
+ regulator-max-microvolt = <3300000>;
+ };
+ led_ldo1: LED_LDO1 {
+ regulator-name = "led_ldo1";
+ regulator-min-microvolt = <200000>;
+ regulator-max-microvolt = <300000>;
+ };
+ led_ldo2: LED_LDO2 {
+ regulator-name = "led_ldo2";
+ regulator-min-microvolt = <200000>;
+ regulator-max-microvolt = <300000>;
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
index d5f68ac78d15..f22d74c7a8db 100644
--- a/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
+++ b/Documentation/devicetree/bindings/mfd/rohm,bd71837-pmic.txt
@@ -8,6 +8,8 @@ and 6 LDOs.
Datasheet for BD71837 is available at:
https://www.rohm.com/datasheet/BD71837MWV/bd71837mwv-e
+Datasheet for BD71847 is available at:
+https://www.rohm.com/datasheet/BD71847AMWV/bd71847amwv-e
Required properties:
- compatible : Should be "rohm,bd71837" for bd71837
@@ -38,6 +40,14 @@ target state is set to READY by default. If SNVS state is used the boot
crucial regulators must have the regulator-always-on and regulator-boot-on
properties set in regulator node.
+- rohm,short-press-ms : Short press duration in milliseconds
+- rohm,long-press-ms : Long press duration in milliseconds
+
+Configure the "short press" and "long press" timers for the power button.
+Values are rounded to what hardware supports (500ms multiple for short and
+1000ms multiple for long). If these properties are not present the existing
+configuration (from bootloader or OTP) is not touched.
+
Example:
/* external oscillator node */
diff --git a/Documentation/devicetree/bindings/mfd/ti-lmu.txt b/Documentation/devicetree/bindings/mfd/ti-lmu.txt
index 86ca786d54fc..2296b8f24de4 100644
--- a/Documentation/devicetree/bindings/mfd/ti-lmu.txt
+++ b/Documentation/devicetree/bindings/mfd/ti-lmu.txt
@@ -8,7 +8,7 @@ TI LMU driver supports lighting devices below.
LM3632 Backlight and regulator
LM3633 Backlight, LED and fault monitor
LM3695 Backlight
- LM3697 Backlight and fault monitor
+ LM36274 Backlight and regulator
Required properties:
- compatible: Should be one of:
@@ -16,15 +16,32 @@ Required properties:
"ti,lm3632"
"ti,lm3633"
"ti,lm3695"
- "ti,lm3697"
+ "ti,lm36274"
- reg: I2C slave address.
0x11 for LM3632
0x29 for LM3631
- 0x36 for LM3633, LM3697
+ 0x36 for LM3633
0x63 for LM3695
+ 0x11 for LM36274
-Optional property:
+Optional properties:
- enable-gpios: A GPIO specifier for hardware enable pin.
+ - ramp-up-us: Current ramping from one brightness level to
+ the a higher brightness level.
+ Range from 2048 us - 117.44 s
+ - ramp-down-us: Current ramping from one brightness level to
+ the a lower brightness level.
+ Range from 2048 us - 117.44 s
+ - ti,brightness-resolution - This determines whether to use 8 bit brightness
+ mode or 11 bit brightness mode. If this value is
+ not set the device is defaulted to the preferred
+ 8bit brightness mode per 7.3.4.1 of the data
+ sheet. This setting can either be in the parent
+ node or as part of the LED child nodes. This
+ is determined by the part itself if the strings
+ have a common brightness register or individual
+ brightness registers.
+ The values are 255 (8bit) or 2047 (11bit).
Required node:
- backlight: All LMU devices have backlight child nodes.
@@ -35,14 +52,15 @@ Optional nodes:
Required properties:
- compatible: Should be one of:
"ti,lm3633-fault-monitor"
- "ti,lm3697-fault-monitor"
- leds: LED properties for LM3633. Please refer to [2].
+ LED properties for LM36274. Please refer to [4].
- regulators: Regulator properties for LM3631 and LM3632.
Please refer to [3].
[1] ../leds/backlight/ti-lmu-backlight.txt
[2] ../leds/leds-lm3633.txt
[3] ../regulator/lm363x-regulator.txt
+[4] ../leds/leds-lm36274.txt
lm3631@29 {
compatible = "ti,lm3631";
@@ -90,7 +108,7 @@ lm3631@29 {
lcd_bl {
led-sources = <0 1>;
- ramp-up-msec = <300>;
+ ramp-up-us = <300000>;
};
};
};
@@ -152,15 +170,15 @@ lm3633@36 {
main {
label = "main_lcd";
led-sources = <1 2>;
- ramp-up-msec = <500>;
- ramp-down-msec = <500>;
+ ramp-up-us = <500000>;
+ ramp-down-us = <500000>;
};
front {
label = "front_lcd";
led-sources = <0>;
- ramp-up-msec = <1000>;
- ramp-down-msec = <0>;
+ ramp-up-us = <1000000>;
+ ramp-down-us = <0>;
};
};
@@ -201,23 +219,51 @@ lm3695@63 {
};
};
-lm3697@36 {
- compatible = "ti,lm3697";
- reg = <0x36>;
+lm36274@11 {
+ compatible = "ti,lm36274";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0x11>;
enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>;
+ regulators {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "ti,lm363x-regulator";
- backlight {
- compatible = "ti,lm3697-backlight";
+ enable-gpios = <&pioC 0 GPIO_ACTIVE_HIGH>,
+ <&pioC 1 GPIO_ACTIVE_HIGH>;
- lcd {
- led-sources = <0 1 2>;
- ramp-up-msec = <200>;
- ramp-down-msec = <200>;
+ vboost {
+ regulator-name = "lcd_boost";
+ regulator-min-microvolt = <4000000>;
+ regulator-max-microvolt = <7150000>;
+ regulator-always-on;
+ };
+
+ vpos {
+ regulator-name = "lcd_vpos";
+ regulator-min-microvolt = <4000000>;
+ regulator-max-microvolt = <6500000>;
+ };
+
+ vneg {
+ regulator-name = "lcd_vneg";
+ regulator-min-microvolt = <4000000>;
+ regulator-max-microvolt = <6500000>;
};
};
- fault-monitor {
- compatible = "ti,lm3697-fault-monitor";
+ backlight {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "ti,lm36274-backlight";
+
+ led@0 {
+ reg = <0>;
+ led-sources = <0 2>;
+ label = "white:backlight_cluster";
+ linux,default-trigger = "backlight";
+ };
};
};
diff --git a/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt b/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt
new file mode 100644
index 000000000000..1442ba5d2d98
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/fsl,dpaa2-console.txt
@@ -0,0 +1,11 @@
+DPAA2 console support
+
+Required properties:
+
+ - compatible
+ Value type: <string>
+ Definition: Must be "fsl,dpaa2-console".
+ - reg
+ Value type: <prop-encoded-array>
+ Definition: A standard property. Specifies the region where the MCFBA
+ (MC firmware base address) register can be found.
diff --git a/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt b/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt
new file mode 100644
index 000000000000..8c4d649cdd8f
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/olpc,xo1.75-ec.txt
@@ -0,0 +1,23 @@
+OLPC XO-1.75 Embedded Controller
+
+Required properties:
+- compatible: Should be "olpc,xo1.75-ec".
+- cmd-gpios: gpio specifier of the CMD pin
+
+The embedded controller requires the SPI controller driver to signal readiness
+to receive a transfer (that is, when TX FIFO contains the response data) by
+strobing the ACK pin with the ready signal. See the "ready-gpios" property of the
+SSP binding as documented in:
+<Documentation/devicetree/bindings/spi/spi-pxa2xx.txt>.
+
+Example:
+ &ssp3 {
+ spi-slave;
+ ready-gpios = <&gpio 125 GPIO_ACTIVE_HIGH>;
+
+ slave {
+ compatible = "olpc,xo1.75-ec";
+ spi-cpha;
+ cmd-gpios = <&gpio 155 GPIO_ACTIVE_HIGH>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt b/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt
new file mode 100644
index 000000000000..e3289634fa30
--- /dev/null
+++ b/Documentation/devicetree/bindings/misc/xlnx,sd-fec.txt
@@ -0,0 +1,58 @@
+* Xilinx SDFEC(16nm) IP *
+
+The Soft Decision Forward Error Correction (SDFEC) Engine is a Hard IP block
+which provides high-throughput LDPC and Turbo Code implementations.
+The LDPC decode & encode functionality is capable of covering a range of
+customer specified Quasi-cyclic (QC) codes. The Turbo decode functionality
+principally covers codes used by LTE. The FEC Engine offers significant
+power and area savings versus implementations done in the FPGA fabric.
+
+
+Required properties:
+- compatible: Must be "xlnx,sd-fec-1.1"
+- clock-names : List of input clock names from the following:
+ - "core_clk", Main processing clock for processing core (required)
+ - "s_axi_aclk", AXI4-Lite memory-mapped slave interface clock (required)
+ - "s_axis_din_aclk", DIN AXI4-Stream Slave interface clock (optional)
+ - "s_axis_din_words-aclk", DIN_WORDS AXI4-Stream Slave interface clock (optional)
+ - "s_axis_ctrl_aclk", Control input AXI4-Stream Slave interface clock (optional)
+ - "m_axis_dout_aclk", DOUT AXI4-Stream Master interface clock (optional)
+ - "m_axis_dout_words_aclk", DOUT_WORDS AXI4-Stream Master interface clock (optional)
+ - "m_axis_status_aclk", Status output AXI4-Stream Master interface clock (optional)
+- clocks : Clock phandles (see clock_bindings.txt for details).
+- reg: Should contain Xilinx SDFEC 16nm Hardened IP block registers
+ location and length.
+- xlnx,sdfec-code : Should contain "ldpc" or "turbo" to describe the codes
+ being used.
+- xlnx,sdfec-din-words : A value 0 indicates that the DIN_WORDS interface is
+ driven with a fixed value and is not present on the device, a value of 1
+ configures the DIN_WORDS to be block based, while a value of 2 configures the
+ DIN_WORDS input to be supplied for each AXI transaction.
+- xlnx,sdfec-din-width : Configures the DIN AXI stream where a value of 1
+ configures a width of "1x128b", 2 a width of "2x128b" and 4 configures a width
+ of "4x128b".
+- xlnx,sdfec-dout-words : A value 0 indicates that the DOUT_WORDS interface is
+ driven with a fixed value and is not present on the device, a value of 1
+ configures the DOUT_WORDS to be block based, while a value of 2 configures the
+ DOUT_WORDS input to be supplied for each AXI transaction.
+- xlnx,sdfec-dout-width : Configures the DOUT AXI stream where a value of 1
+ configures a width of "1x128b", 2 a width of "2x128b" and 4 configures a width
+ of "4x128b".
+Optional properties:
+- interrupts: should contain SDFEC interrupt number
+
+Example
+---------------------------------------
+ sd_fec_0: sd-fec@a0040000 {
+ compatible = "xlnx,sd-fec-1.1";
+ clock-names = "core_clk","s_axi_aclk","s_axis_ctrl_aclk","s_axis_din_aclk","m_axis_status_aclk","m_axis_dout_aclk";
+ clocks = <&misc_clk_2>,<&misc_clk_0>,<&misc_clk_1>,<&misc_clk_1>,<&misc_clk_1>, <&misc_clk_1>;
+ reg = <0x0 0xa0040000 0x0 0x40000>;
+ interrupt-parent = <&axi_intc>;
+ interrupts = <1 0>;
+ xlnx,sdfec-code = "ldpc";
+ xlnx,sdfec-din-words = <0>;
+ xlnx,sdfec-din-width = <2>;
+ xlnx,sdfec-dout-words = <0>;
+ xlnx,sdfec-dout-width = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml b/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml
new file mode 100644
index 000000000000..df0280edef97
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/allwinner,sun4i-a10-mmc.yaml
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mmc/allwinner,sun4i-a10-mmc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 MMC Controller Device Tree Bindings
+
+allOf:
+ - $ref: "mmc-controller.yaml"
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#address-cells": true
+ "#size-cells": true
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun4i-a10-mmc
+ - const: allwinner,sun5i-a13-mmc
+ - const: allwinner,sun7i-a20-mmc
+ - const: allwinner,sun8i-a83t-emmc
+ - const: allwinner,sun9i-a80-mmc
+ - const: allwinner,sun50i-a64-emmc
+ - const: allwinner,sun50i-a64-mmc
+ - items:
+ - const: allwinner,sun8i-a83t-mmc
+ - const: allwinner,sun7i-a20-mmc
+ - items:
+ - const: allwinner,sun50i-h6-emmc
+ - const: allwinner,sun50i-a64-emmc
+ - items:
+ - const: allwinner,sun50i-h6-mmc
+ - const: allwinner,sun50i-a64-mmc
+ - items:
+ - const: allwinner,sun8i-r40-emmc
+ - const: allwinner,sun50i-a64-emmc
+ - items:
+ - const: allwinner,sun8i-r40-mmc
+ - const: allwinner,sun50i-a64-mmc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ minItems: 2
+ maxItems: 4
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+ - description: Output Clock
+ - description: Sample Clock
+
+ clock-names:
+ minItems: 2
+ maxItems: 4
+ items:
+ - const: ahb
+ - const: mmc
+ - const: output
+ - const: sample
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ const: ahb
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+
+examples:
+ - |
+ mmc0: mmc@1c0f000 {
+ compatible = "allwinner,sun5i-a13-mmc";
+ reg = <0x01c0f000 0x1000>;
+ clocks = <&ahb_gates 8>, <&mmc0_clk>;
+ clock-names = "ahb", "mmc";
+ interrupts = <32>;
+ bus-width = <4>;
+ cd-gpios = <&pio 7 1 0>;
+ };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt b/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt
index 13e70409e8ac..ccc5358db131 100644
--- a/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt
+++ b/Documentation/devicetree/bindings/mmc/amlogic,meson-gx.txt
@@ -22,6 +22,10 @@ Required properties:
clock rate requested by the MMC core.
- resets : phandle of the internal reset line
+Optional properties:
+- amlogic,dram-access-quirk: set when controller's internal DMA engine cannot access the
+ DRAM memory, like on the G12A dedicated SDIO controller.
+
Example:
sd_emmc_a: mmc@70000 {
diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml
new file mode 100644
index 000000000000..080754e0ef35
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml
@@ -0,0 +1,374 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mmc/mmc-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MMC Controller Generic Binding
+
+maintainers:
+ - Ulf Hansson <ulf.hansson@linaro.org>
+
+description: |
+ These properties are common to multiple MMC host controllers. Any host
+ that requires the respective functionality should implement them using
+ these definitions.
+
+properties:
+ $nodename:
+ pattern: "^mmc(@.*)?$"
+
+ "#address-cells":
+ const: 1
+ description: |
+ The cell is the slot ID if a function subnode is used.
+
+ "#size-cells":
+ const: 0
+
+ # Card Detection.
+ # If none of these properties are supplied, the host native card
+ # detect will be used. Only one of them should be provided.
+
+ broken-cd:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ There is no card detection available; polling must be used.
+
+ cd-gpios:
+ description:
+ The card detection will be done using the GPIO provided.
+
+ non-removable:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Non-removable slot (like eMMC); assume always present.
+
+ # *NOTE* on CD and WP polarity. To use common for all SD/MMC host
+ # controllers line polarity properties, we have to fix the meaning
+ # of the "normal" and "inverted" line levels. We choose to follow
+ # the SDHCI standard, which specifies both those lines as "active
+ # low." Therefore, using the "cd-inverted" property means, that the
+ # CD line is active high, i.e. it is high, when a card is
+ # inserted. Similar logic applies to the "wp-inverted" property.
+ #
+ # CD and WP lines can be implemented on the hardware in one of two
+ # ways: as GPIOs, specified in cd-gpios and wp-gpios properties, or
+ # as dedicated pins. Polarity of dedicated pins can be specified,
+ # using *-inverted properties. GPIO polarity can also be specified
+ # using the GPIO_ACTIVE_LOW flag. This creates an ambiguity in the
+ # latter case. We choose to use the XOR logic for GPIO CD and WP
+ # lines. This means, the two properties are "superimposed," for
+ # example leaving the GPIO_ACTIVE_LOW flag clear and specifying the
+ # respective *-inverted property property results in a
+ # double-inversion and actually means the "normal" line polarity is
+ # in effect.
+ wp-inverted:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The Write Protect line polarity is inverted.
+
+ cd-inverted:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The CD line polarity is inverted.
+
+ # Other properties
+
+ bus-width:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - enum: [1, 4, 8]
+ default: 1
+ description:
+ Number of data lines.
+
+ max-frequency:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - minimum: 400000
+ - maximum: 200000000
+ description:
+ Maximum operating frequency of the bus.
+
+ disable-wp:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ When set, no physical write-protect line is present. This
+ property should only be specified when the controller has a
+ dedicated write-protect detection logic. If a GPIO is always
+ used for the write-protect detection. If a GPIO is always used
+ for the write-protect detection logic, it is sufficient to not
+ specify the wp-gpios property in the absence of a write-protect
+ line.
+
+ wp-gpios:
+ description:
+ GPIO to use for the write-protect detection.
+
+ cd-debounce-delay-ms:
+ description:
+ Set delay time before detecting card after card insert
+ interrupt.
+
+ no-1-8-v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ When specified, denotes that 1.8V card voltage is not supported
+ on this system, even if the controller claims it.
+
+ cap-sd-highspeed:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SD high-speed timing is supported.
+
+ cap-mmc-highspeed:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ MMC high-speed timing is supported.
+
+ sd-uhs-sdr12:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SD UHS SDR12 speed is supported.
+
+ sd-uhs-sdr25:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SD UHS SDR25 speed is supported.
+
+ sd-uhs-sdr50:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SD UHS SDR50 speed is supported.
+
+ sd-uhs-sdr104:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SD UHS SDR104 speed is supported.
+
+ sd-uhs-ddr50:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SD UHS DDR50 speed is supported.
+
+ cap-power-off-card:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Powering off the card is safe.
+
+ cap-mmc-hw-reset:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC hardware reset is supported
+
+ cap-sdio-irq:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ enable SDIO IRQ signalling on this interface
+
+ full-pwr-cycle:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Full power cycle of the card is supported.
+
+ mmc-ddr-1_2v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC high-speed DDR mode (1.2V I/O) is supported.
+
+ mmc-ddr-1_8v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC high-speed DDR mode (1.8V I/O) is supported.
+
+ mmc-ddr-3_3v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC high-speed DDR mode (3.3V I/O) is supported.
+
+ mmc-hs200-1_2v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC HS200 mode (1.2V I/O) is supported.
+
+ mmc-hs200-1_8v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC HS200 mode (1.8V I/O) is supported.
+
+ mmc-hs400-1_2v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC HS400 mode (1.2V I/O) is supported.
+
+ mmc-hs400-1_8v:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC HS400 mode (1.8V I/O) is supported.
+
+ mmc-hs400-enhanced-strobe:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ eMMC HS400 enhanced strobe mode is supported
+
+ dsr:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - minimum: 0
+ - maximum: 0xffff
+ description:
+ Value the card Driver Stage Register (DSR) should be programmed
+ with.
+
+ no-sdio:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Controller is limited to send SDIO commands during
+ initialization.
+
+ no-sd:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Controller is limited to send SD commands during initialization.
+
+ no-mmc:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Controller is limited to send MMC commands during
+ initialization.
+
+ fixed-emmc-driver-type:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - minimum: 0
+ - maximum: 4
+ description:
+ For non-removable eMMC, enforce this driver type. The value is
+ the driver type as specified in the eMMC specification (table
+ 206 in spec version 5.1)
+
+ post-power-on-delay-ms:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - default: 10
+ description:
+ It was invented for MMC pwrseq-simple which could be referred to
+ mmc-pwrseq-simple.txt. But now it\'s reused as a tunable delay
+ waiting for I/O signalling and card power supply to be stable,
+ regardless of whether pwrseq-simple is used. Default to 10ms if
+ no available.
+
+ supports-cqe:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The presence of this property indicates that the corresponding
+ MMC host controller supports HW command queue feature.
+
+ disable-cqe-dcmd:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The presence of this property indicates that the MMC
+ controller\'s command queue engine (CQE) does not support direct
+ commands (DCMDs).
+
+ keep-power-in-suspend:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SDIO only. Preserves card power during a suspend/resume cycle.
+
+ # Deprecated: enable-sdio-wakeup
+ wakeup-source:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ SDIO only. Enables wake up of host system on SDIO IRQ assertion.
+
+ vmmc-supply:
+ description:
+ Supply for the card power
+
+ vqmmc-supply:
+ description:
+ Supply for the bus IO line power
+
+ mmc-pwrseq:
+ $ref: /schemas/types.yaml#/definitions/phandle
+ description:
+ System-on-Chip designs may specify a specific MMC power
+ sequence. To successfully detect an (e)MMC/SD/SDIO card, that
+ power sequence must be maintained while initializing the card.
+
+patternProperties:
+ "^.*@[0-9]+$":
+ type: object
+ description: |
+ On embedded systems the cards connected to a host may need
+ additional properties. These can be specified in subnodes to the
+ host controller node. The subnodes are identified by the
+ standard \'reg\' property. Which information exactly can be
+ specified depends on the bindings for the SDIO function driver
+ for the subnode, as specified by the compatible string.
+
+ properties:
+ compatible:
+ description: |
+ Name of SDIO function following generic names recommended
+ practice
+
+ reg:
+ items:
+ - minimum: 0
+ maximum: 7
+ description:
+ Must contain the SDIO function number of the function this
+ subnode describes. A value of 0 denotes the memory SD
+ function, values from 1 to 7 denote the SDIO functions.
+
+ broken-hpi:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Use this to indicate that the mmc-card has a broken hpi
+ implementation, and that hpi should not be used.
+
+ required:
+ - reg
+
+dependencies:
+ cd-debounce-delay-ms: [ cd-gpios ]
+ fixed-emmc-driver-type: [ non-removable ]
+
+examples:
+ - |
+ sdhci@ab000000 {
+ compatible = "sdhci";
+ reg = <0xab000000 0x200>;
+ interrupts = <23>;
+ bus-width = <4>;
+ cd-gpios = <&gpio 69 0>;
+ cd-inverted;
+ wp-gpios = <&gpio 70 0>;
+ max-frequency = <50000000>;
+ keep-power-in-suspend;
+ wakeup-source;
+ mmc-pwrseq = <&sdhci0_pwrseq>;
+ };
+
+ - |
+ mmc3: mmc@1c12000 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&mmc3_pins_a>;
+ vmmc-supply = <&reg_vmmc3>;
+ bus-width = <4>;
+ non-removable;
+ mmc-pwrseq = <&sdhci0_pwrseq>;
+
+ brcmf: bcrmf@1 {
+ reg = <1>;
+ compatible = "brcm,bcm43xx-fmac";
+ interrupt-parent = <&pio>;
+ interrupts = <10 8>;
+ interrupt-names = "host-wake";
+ };
+ };
diff --git a/Documentation/devicetree/bindings/mmc/mmc.txt b/Documentation/devicetree/bindings/mmc/mmc.txt
index c269dbe384fe..bf9d7d3febf1 100644
--- a/Documentation/devicetree/bindings/mmc/mmc.txt
+++ b/Documentation/devicetree/bindings/mmc/mmc.txt
@@ -1,177 +1 @@
-These properties are common to multiple MMC host controllers. Any host
-that requires the respective functionality should implement them using
-these definitions.
-
-Interpreted by the OF core:
-- reg: Registers location and length.
-- interrupts: Interrupts used by the MMC controller.
-
-Card detection:
-If no property below is supplied, host native card detect is used.
-Only one of the properties in this section should be supplied:
- - broken-cd: There is no card detection available; polling must be used.
- - cd-gpios: Specify GPIOs for card detection, see gpio binding
- - non-removable: non-removable slot (like eMMC); assume always present.
-
-Optional properties:
-- bus-width: Number of data lines, can be <1>, <4>, or <8>. The default
- will be <1> if the property is absent.
-- wp-gpios: Specify GPIOs for write protection, see gpio binding
-- cd-inverted: when present, polarity on the CD line is inverted. See the note
- below for the case, when a GPIO is used for the CD line
-- cd-debounce-delay-ms: Set delay time before detecting card after card insert interrupt.
- It's only valid when cd-gpios is present.
-- wp-inverted: when present, polarity on the WP line is inverted. See the note
- below for the case, when a GPIO is used for the WP line
-- disable-wp: When set no physical WP line is present. This property should
- only be specified when the controller has a dedicated write-protect
- detection logic. If a GPIO is always used for the write-protect detection
- logic it is sufficient to not specify wp-gpios property in the absence of a WP
- line.
-- max-frequency: maximum operating clock frequency
-- no-1-8-v: when present, denotes that 1.8v card voltage is not supported on
- this system, even if the controller claims it is.
-- cap-sd-highspeed: SD high-speed timing is supported
-- cap-mmc-highspeed: MMC high-speed timing is supported
-- sd-uhs-sdr12: SD UHS SDR12 speed is supported
-- sd-uhs-sdr25: SD UHS SDR25 speed is supported
-- sd-uhs-sdr50: SD UHS SDR50 speed is supported
-- sd-uhs-sdr104: SD UHS SDR104 speed is supported
-- sd-uhs-ddr50: SD UHS DDR50 speed is supported
-- cap-power-off-card: powering off the card is safe
-- cap-mmc-hw-reset: eMMC hardware reset is supported
-- cap-sdio-irq: enable SDIO IRQ signalling on this interface
-- full-pwr-cycle: full power cycle of the card is supported
-- mmc-ddr-3_3v: eMMC high-speed DDR mode(3.3V I/O) is supported
-- mmc-ddr-1_8v: eMMC high-speed DDR mode(1.8V I/O) is supported
-- mmc-ddr-1_2v: eMMC high-speed DDR mode(1.2V I/O) is supported
-- mmc-hs200-1_8v: eMMC HS200 mode(1.8V I/O) is supported
-- mmc-hs200-1_2v: eMMC HS200 mode(1.2V I/O) is supported
-- mmc-hs400-1_8v: eMMC HS400 mode(1.8V I/O) is supported
-- mmc-hs400-1_2v: eMMC HS400 mode(1.2V I/O) is supported
-- mmc-hs400-enhanced-strobe: eMMC HS400 enhanced strobe mode is supported
-- dsr: Value the card's (optional) Driver Stage Register (DSR) should be
- programmed with. Valid range: [0 .. 0xffff].
-- no-sdio: controller is limited to send sdio cmd during initialization
-- no-sd: controller is limited to send sd cmd during initialization
-- no-mmc: controller is limited to send mmc cmd during initialization
-- fixed-emmc-driver-type: for non-removable eMMC, enforce this driver type.
- The value <n> is the driver type as specified in the eMMC specification
- (table 206 in spec version 5.1).
-- post-power-on-delay-ms : It was invented for MMC pwrseq-simple which could
- be referred to mmc-pwrseq-simple.txt. But now it's reused as a tunable delay
- waiting for I/O signalling and card power supply to be stable, regardless of
- whether pwrseq-simple is used. Default to 10ms if no available.
-- supports-cqe : The presence of this property indicates that the corresponding
- MMC host controller supports HW command queue feature.
-- disable-cqe-dcmd: This property indicates that the MMC controller's command
- queue engine (CQE) does not support direct commands (DCMDs).
-
-*NOTE* on CD and WP polarity. To use common for all SD/MMC host controllers line
-polarity properties, we have to fix the meaning of the "normal" and "inverted"
-line levels. We choose to follow the SDHCI standard, which specifies both those
-lines as "active low." Therefore, using the "cd-inverted" property means, that
-the CD line is active high, i.e. it is high, when a card is inserted. Similar
-logic applies to the "wp-inverted" property.
-
-CD and WP lines can be implemented on the hardware in one of two ways: as GPIOs,
-specified in cd-gpios and wp-gpios properties, or as dedicated pins. Polarity of
-dedicated pins can be specified, using *-inverted properties. GPIO polarity can
-also be specified using the GPIO_ACTIVE_LOW flag. This creates an ambiguity
-in the latter case. We choose to use the XOR logic for GPIO CD and WP lines.
-This means, the two properties are "superimposed," for example leaving the
-GPIO_ACTIVE_LOW flag clear and specifying the respective *-inverted property
-property results in a double-inversion and actually means the "normal" line
-polarity is in effect.
-
-Optional SDIO properties:
-- keep-power-in-suspend: Preserves card power during a suspend/resume cycle
-- wakeup-source: Enables wake up of host system on SDIO IRQ assertion
- (Legacy property supported: "enable-sdio-wakeup")
-
-MMC power
----------
-
-Controllers may implement power control from both the connected cards and
-the IO signaling (for example to change to high-speed 1.8V signalling). If
-the system supports this, then the following two properties should point
-to valid regulator nodes:
-
-- vqmmc-supply: supply node for IO line power
-- vmmc-supply: supply node for card's power
-
-
-MMC power sequences:
---------------------
-
-System on chip designs may specify a specific MMC power sequence. To
-successfully detect an (e)MMC/SD/SDIO card, that power sequence must be
-maintained while initializing the card.
-
-Optional property:
-- mmc-pwrseq: phandle to the MMC power sequence node. See "mmc-pwrseq-*"
- for documentation of MMC power sequence bindings.
-
-
-Use of Function subnodes
-------------------------
-
-On embedded systems the cards connected to a host may need additional
-properties. These can be specified in subnodes to the host controller node.
-The subnodes are identified by the standard 'reg' property.
-Which information exactly can be specified depends on the bindings for the
-SDIO function driver for the subnode, as specified by the compatible string.
-
-Required host node properties when using function subnodes:
-- #address-cells: should be one. The cell is the slot id.
-- #size-cells: should be zero.
-
-Required function subnode properties:
-- reg: Must contain the SDIO function number of the function this subnode
- describes. A value of 0 denotes the memory SD function, values from
- 1 to 7 denote the SDIO functions.
-
-Optional function subnode properties:
-- compatible: name of SDIO function following generic names recommended practice
-
-
-Examples
---------
-
-Basic example:
-
-sdhci@ab000000 {
- compatible = "sdhci";
- reg = <0xab000000 0x200>;
- interrupts = <23>;
- bus-width = <4>;
- cd-gpios = <&gpio 69 0>;
- cd-inverted;
- wp-gpios = <&gpio 70 0>;
- max-frequency = <50000000>;
- keep-power-in-suspend;
- wakeup-source;
- mmc-pwrseq = <&sdhci0_pwrseq>
-}
-
-Example with sdio function subnode:
-
-mmc3: mmc@1c12000 {
- #address-cells = <1>;
- #size-cells = <0>;
-
- pinctrl-names = "default";
- pinctrl-0 = <&mmc3_pins_a>;
- vmmc-supply = <&reg_vmmc3>;
- bus-width = <4>;
- non-removable;
- mmc-pwrseq = <&sdhci0_pwrseq>
-
- brcmf: bcrmf@1 {
- reg = <1>;
- compatible = "brcm,bcm43xx-fmac";
- interrupt-parent = <&pio>;
- interrupts = <10 8>; /* PH10 / EINT10 */
- interrupt-names = "host-wake";
- };
-};
+This file has moved to mmc-controller.yaml.
diff --git a/Documentation/devicetree/bindings/mmc/renesas,sdhi.txt b/Documentation/devicetree/bindings/mmc/renesas,sdhi.txt
new file mode 100644
index 000000000000..dd08d038a65c
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/renesas,sdhi.txt
@@ -0,0 +1,111 @@
+* Renesas SDHI SD/MMC controller
+
+Required properties:
+- compatible: should contain one or more of the following:
+ "renesas,sdhi-sh73a0" - SDHI IP on SH73A0 SoC
+ "renesas,sdhi-r7s72100" - SDHI IP on R7S72100 SoC
+ "renesas,sdhi-r7s9210" - SDHI IP on R7S9210 SoC
+ "renesas,sdhi-r8a73a4" - SDHI IP on R8A73A4 SoC
+ "renesas,sdhi-r8a7740" - SDHI IP on R8A7740 SoC
+ "renesas,sdhi-r8a7743" - SDHI IP on R8A7743 SoC
+ "renesas,sdhi-r8a7744" - SDHI IP on R8A7744 SoC
+ "renesas,sdhi-r8a7745" - SDHI IP on R8A7745 SoC
+ "renesas,sdhi-r8a774a1" - SDHI IP on R8A774A1 SoC
+ "renesas,sdhi-r8a774c0" - SDHI IP on R8A774C0 SoC
+ "renesas,sdhi-r8a77470" - SDHI IP on R8A77470 SoC
+ "renesas,sdhi-mmc-r8a77470" - SDHI/MMC IP on R8A77470 SoC
+ "renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC
+ "renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC
+ "renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC
+ "renesas,sdhi-r8a7791" - SDHI IP on R8A7791 SoC
+ "renesas,sdhi-r8a7792" - SDHI IP on R8A7792 SoC
+ "renesas,sdhi-r8a7793" - SDHI IP on R8A7793 SoC
+ "renesas,sdhi-r8a7794" - SDHI IP on R8A7794 SoC
+ "renesas,sdhi-r8a7795" - SDHI IP on R8A7795 SoC
+ "renesas,sdhi-r8a7796" - SDHI IP on R8A7796 SoC
+ "renesas,sdhi-r8a77965" - SDHI IP on R8A77965 SoC
+ "renesas,sdhi-r8a77970" - SDHI IP on R8A77970 SoC
+ "renesas,sdhi-r8a77980" - SDHI IP on R8A77980 SoC
+ "renesas,sdhi-r8a77990" - SDHI IP on R8A77990 SoC
+ "renesas,sdhi-r8a77995" - SDHI IP on R8A77995 SoC
+ "renesas,sdhi-shmobile" - a generic sh-mobile SDHI controller
+ "renesas,rcar-gen1-sdhi" - a generic R-Car Gen1 SDHI controller
+ "renesas,rcar-gen2-sdhi" - a generic R-Car Gen2 and RZ/G1 SDHI
+ (not SDHI/MMC) controller
+ "renesas,rcar-gen3-sdhi" - a generic R-Car Gen3 or RZ/G2
+ SDHI controller
+
+
+ When compatible with the generic version, nodes must list
+ the SoC-specific version corresponding to the platform
+ first followed by the generic version.
+
+- clocks: Most controllers only have 1 clock source per channel. However, on
+ some variations of this controller, the internal card detection
+ logic that exists in this controller is sectioned off to be run by a
+ separate second clock source to allow the main core clock to be turned
+ off to save power.
+ If 2 clocks are specified by the hardware, you must name them as
+ "core" and "cd". If the controller only has 1 clock, naming is not
+ required.
+ Devices which have more than 1 clock are listed below:
+ 2: R7S72100, R7S9210
+
+Optional properties:
+- pinctrl-names: should be "default", "state_uhs"
+- pinctrl-0: should contain default/high speed pin ctrl
+- pinctrl-1: should contain uhs mode pin ctrl
+
+Example: R8A7790 (R-Car H2) SDHI controller nodes
+
+ sdhi0: sd@ee100000 {
+ compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+ reg = <0 0xee100000 0 0x328>;
+ interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cpg CPG_MOD 314>;
+ dmas = <&dmac0 0xcd>, <&dmac0 0xce>,
+ <&dmac1 0xcd>, <&dmac1 0xce>;
+ dma-names = "tx", "rx", "tx", "rx";
+ max-frequency = <195000000>;
+ power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+ resets = <&cpg 314>;
+ };
+
+ sdhi1: sd@ee120000 {
+ compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+ reg = <0 0xee120000 0 0x328>;
+ interrupts = <GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cpg CPG_MOD 313>;
+ dmas = <&dmac0 0xc9>, <&dmac0 0xca>,
+ <&dmac1 0xc9>, <&dmac1 0xca>;
+ dma-names = "tx", "rx", "tx", "rx";
+ max-frequency = <195000000>;
+ power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+ resets = <&cpg 313>;
+ };
+
+ sdhi2: sd@ee140000 {
+ compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+ reg = <0 0xee140000 0 0x100>;
+ interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cpg CPG_MOD 312>;
+ dmas = <&dmac0 0xc1>, <&dmac0 0xc2>,
+ <&dmac1 0xc1>, <&dmac1 0xc2>;
+ dma-names = "tx", "rx", "tx", "rx";
+ max-frequency = <97500000>;
+ power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+ resets = <&cpg 312>;
+ };
+
+ sdhi3: sd@ee160000 {
+ compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
+ reg = <0 0xee160000 0 0x100>;
+ interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&cpg CPG_MOD 311>;
+ dmas = <&dmac0 0xd3>, <&dmac0 0xd4>,
+ <&dmac1 0xd3>, <&dmac1 0xd4>;
+ dma-names = "tx", "rx", "tx", "rx";
+ max-frequency = <97500000>;
+ power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
+ resets = <&cpg 311>;
+ };
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt
index 15dbbbace27e..50e87df47971 100644
--- a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt
+++ b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt
@@ -8,7 +8,10 @@ Only deviations are documented here.
[3] Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
Required Properties:
- - compatible: should be "ti,am654-sdhci-5.1"
+ - compatible: should be one of:
+ "ti,am654-sdhci-5.1": SDHCI on AM654 device.
+ "ti,j721e-sdhci-8bit": 8 bit SDHCI on J721E device.
+ "ti,j721e-sdhci-4bit": 4 bit SDHCI on J721E device.
- reg: Must be two entries.
- The first should be the sdhci register space
- The second should the subsystem/phy register space
@@ -16,9 +19,13 @@ Required Properties:
- clock-names: Tuple including "clk_xin" and "clk_ahb"
- interrupts: Interrupt specifiers
- ti,otap-del-sel: Output Tap Delay select
+
+Optional Properties (Required for ti,am654-sdhci-5.1 and ti,j721e-sdhci-8bit):
- ti,trm-icp: DLL trim select
- ti,driver-strength-ohm: driver strength in ohms.
Valid values are 33, 40, 50, 66 and 100 ohms.
+Optional Properties:
+ - ti,strobe-sel: strobe select delay for HS400 speed mode. Default value: 0x0.
Example:
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt
index 45c9978aad7b..eb7eb1b529f0 100644
--- a/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt
+++ b/Documentation/devicetree/bindings/mmc/sdhci-sprd.txt
@@ -14,10 +14,31 @@ Required properties:
- clock-names: Should contain the following:
"sdio" - SDIO source clock (required)
"enable" - gate clock which used for enabling/disabling the device (required)
+ "2x_enable" - gate clock controlling the device for some special platforms (optional)
Optional properties:
- assigned-clocks: the same with "sdio" clock
- assigned-clock-parents: the default parent of "sdio" clock
+- pinctrl-names: should be "default", "state_uhs"
+- pinctrl-0: should contain default/high speed pin control
+- pinctrl-1: should contain uhs mode pin control
+
+PHY DLL delays are used to delay the data valid window, and align the window
+to sampling clock. PHY DLL delays can be configured by following properties,
+and each property contains 4 cells which are used to configure the clock data
+write line delay value, clock read command line delay value, clock read data
+positive edge delay value and clock read data negative edge delay value.
+Each cell's delay value unit is cycle of the PHY clock.
+
+- sprd,phy-delay-legacy: Delay value for legacy timing.
+- sprd,phy-delay-sd-highspeed: Delay value for SD high-speed timing.
+- sprd,phy-delay-sd-uhs-sdr50: Delay value for SD UHS SDR50 timing.
+- sprd,phy-delay-sd-uhs-sdr104: Delay value for SD UHS SDR50 timing.
+- sprd,phy-delay-mmc-highspeed: Delay value for MMC high-speed timing.
+- sprd,phy-delay-mmc-ddr52: Delay value for MMC DDR52 timing.
+- sprd,phy-delay-mmc-hs200: Delay value for MMC HS200 timing.
+- sprd,phy-delay-mmc-hs400: Delay value for MMC HS400 timing.
+- sprd,phy-delay-mmc-hs400es: Delay value for MMC HS400 enhanced strobe timing.
Examples:
@@ -32,6 +53,11 @@ sdio0: sdio@20600000 {
assigned-clocks = <&ap_clk CLK_EMMC_2X>;
assigned-clock-parents = <&rpll CLK_RPLL_390M>;
+ pinctrl-names = "default", "state_uhs";
+ pinctrl-0 = <&sd0_pins_default>;
+ pinctrl-1 = <&sd0_pins_uhs>;
+
+ sprd,phy-delay-sd-uhs-sdr104 = <0x3f 0x7f 0x2e 0x2e>;
bus-width = <8>;
non-removable;
no-sdio;
diff --git a/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt b/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt
deleted file mode 100644
index e9cb3ec5e502..000000000000
--- a/Documentation/devicetree/bindings/mmc/sunxi-mmc.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-* Allwinner sunxi MMC controller
-
-The highspeed MMC host controller on Allwinner SoCs provides an interface
-for MMC, SD and SDIO types of memory cards.
-
-Supported maximum speeds are the ones of the eMMC standard 4.5 as well
-as the speed of SD standard 3.0.
-Absolute maximum transfer rate is 200MB/s
-
-Required properties:
- - compatible : should be one of:
- * "allwinner,sun4i-a10-mmc"
- * "allwinner,sun5i-a13-mmc"
- * "allwinner,sun7i-a20-mmc"
- * "allwinner,sun8i-a83t-emmc"
- * "allwinner,sun9i-a80-mmc"
- * "allwinner,sun50i-a64-emmc"
- * "allwinner,sun50i-a64-mmc"
- * "allwinner,sun50i-h6-emmc", "allwinner.sun50i-a64-emmc"
- * "allwinner,sun50i-h6-mmc", "allwinner.sun50i-a64-mmc"
- - reg : mmc controller base registers
- - clocks : a list with 4 phandle + clock specifier pairs
- - clock-names : must contain "ahb", "mmc", "output" and "sample"
- - interrupts : mmc controller interrupt
-
-Optional properties:
- - resets : phandle + reset specifier pair
- - reset-names : must contain "ahb"
- - for cd, bus-width and additional generic mmc parameters
- please refer to mmc.txt within this directory
-
-Examples:
- - Within .dtsi:
- mmc0: mmc@1c0f000 {
- compatible = "allwinner,sun5i-a13-mmc";
- reg = <0x01c0f000 0x1000>;
- clocks = <&ahb_gates 8>, <&mmc0_clk>, <&mmc0_output_clk>, <&mmc0_sample_clk>;
- clock-names = "ahb", "mod", "output", "sample";
- interrupts = <0 32 4>;
- status = "disabled";
- };
-
- - Within dts:
- mmc0: mmc@1c0f000 {
- pinctrl-names = "default", "default";
- pinctrl-0 = <&mmc0_pins_a>;
- pinctrl-1 = <&mmc0_cd_pin_reference_design>;
- bus-width = <4>;
- cd-gpios = <&pio 7 1 0>; /* PH1 */
- cd-inverted;
- status = "okay";
- };
diff --git a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt b/Documentation/devicetree/bindings/mmc/tmio_mmc.txt
deleted file mode 100644
index 2b4f17ca9087..000000000000
--- a/Documentation/devicetree/bindings/mmc/tmio_mmc.txt
+++ /dev/null
@@ -1,120 +0,0 @@
-* Toshiba Mobile IO SD/MMC controller
-
-The tmio-mmc driver doesn't probe its devices actively, instead its binding to
-devices is managed by either MFD drivers or by the sh_mobile_sdhi platform
-driver. Those drivers supply the tmio-mmc driver with platform data, that either
-describe hardware capabilities, known to them, or are obtained by them from
-their own platform data or from their DT information. In the latter case all
-compulsory and any optional properties, common to all SD/MMC drivers, as
-described in mmc.txt, can be used. Additionally the following tmio_mmc-specific
-optional bindings can be used.
-
-Required properties:
-- compatible: should contain one or more of the following:
- "renesas,sdhi-sh73a0" - SDHI IP on SH73A0 SoC
- "renesas,sdhi-r7s72100" - SDHI IP on R7S72100 SoC
- "renesas,sdhi-r7s9210" - SDHI IP on R7S9210 SoC
- "renesas,sdhi-r8a73a4" - SDHI IP on R8A73A4 SoC
- "renesas,sdhi-r8a7740" - SDHI IP on R8A7740 SoC
- "renesas,sdhi-r8a7743" - SDHI IP on R8A7743 SoC
- "renesas,sdhi-r8a7744" - SDHI IP on R8A7744 SoC
- "renesas,sdhi-r8a7745" - SDHI IP on R8A7745 SoC
- "renesas,sdhi-r8a774a1" - SDHI IP on R8A774A1 SoC
- "renesas,sdhi-r8a774c0" - SDHI IP on R8A774C0 SoC
- "renesas,sdhi-r8a77470" - SDHI IP on R8A77470 SoC
- "renesas,sdhi-mmc-r8a77470" - SDHI/MMC IP on R8A77470 SoC
- "renesas,sdhi-r8a7778" - SDHI IP on R8A7778 SoC
- "renesas,sdhi-r8a7779" - SDHI IP on R8A7779 SoC
- "renesas,sdhi-r8a7790" - SDHI IP on R8A7790 SoC
- "renesas,sdhi-r8a7791" - SDHI IP on R8A7791 SoC
- "renesas,sdhi-r8a7792" - SDHI IP on R8A7792 SoC
- "renesas,sdhi-r8a7793" - SDHI IP on R8A7793 SoC
- "renesas,sdhi-r8a7794" - SDHI IP on R8A7794 SoC
- "renesas,sdhi-r8a7795" - SDHI IP on R8A7795 SoC
- "renesas,sdhi-r8a7796" - SDHI IP on R8A7796 SoC
- "renesas,sdhi-r8a77965" - SDHI IP on R8A77965 SoC
- "renesas,sdhi-r8a77970" - SDHI IP on R8A77970 SoC
- "renesas,sdhi-r8a77980" - SDHI IP on R8A77980 SoC
- "renesas,sdhi-r8a77990" - SDHI IP on R8A77990 SoC
- "renesas,sdhi-r8a77995" - SDHI IP on R8A77995 SoC
- "renesas,sdhi-shmobile" - a generic sh-mobile SDHI controller
- "renesas,rcar-gen1-sdhi" - a generic R-Car Gen1 SDHI controller
- "renesas,rcar-gen2-sdhi" - a generic R-Car Gen2 and RZ/G1 SDHI
- (not SDHI/MMC) controller
- "renesas,rcar-gen3-sdhi" - a generic R-Car Gen3 or RZ/G2
- SDHI controller
-
-
- When compatible with the generic version, nodes must list
- the SoC-specific version corresponding to the platform
- first followed by the generic version.
-
-- clocks: Most controllers only have 1 clock source per channel. However, on
- some variations of this controller, the internal card detection
- logic that exists in this controller is sectioned off to be run by a
- separate second clock source to allow the main core clock to be turned
- off to save power.
- If 2 clocks are specified by the hardware, you must name them as
- "core" and "cd". If the controller only has 1 clock, naming is not
- required.
- Devices which have more than 1 clock are listed below:
- 2: R7S72100, R7S9210
-
-Optional properties:
-- pinctrl-names: should be "default", "state_uhs"
-- pinctrl-0: should contain default/high speed pin ctrl
-- pinctrl-1: should contain uhs mode pin ctrl
-
-Example: R8A7790 (R-Car H2) SDHI controller nodes
-
- sdhi0: sd@ee100000 {
- compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
- reg = <0 0xee100000 0 0x328>;
- interrupts = <GIC_SPI 165 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cpg CPG_MOD 314>;
- dmas = <&dmac0 0xcd>, <&dmac0 0xce>,
- <&dmac1 0xcd>, <&dmac1 0xce>;
- dma-names = "tx", "rx", "tx", "rx";
- max-frequency = <195000000>;
- power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
- resets = <&cpg 314>;
- };
-
- sdhi1: sd@ee120000 {
- compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
- reg = <0 0xee120000 0 0x328>;
- interrupts = <GIC_SPI 166 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cpg CPG_MOD 313>;
- dmas = <&dmac0 0xc9>, <&dmac0 0xca>,
- <&dmac1 0xc9>, <&dmac1 0xca>;
- dma-names = "tx", "rx", "tx", "rx";
- max-frequency = <195000000>;
- power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
- resets = <&cpg 313>;
- };
-
- sdhi2: sd@ee140000 {
- compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
- reg = <0 0xee140000 0 0x100>;
- interrupts = <GIC_SPI 167 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cpg CPG_MOD 312>;
- dmas = <&dmac0 0xc1>, <&dmac0 0xc2>,
- <&dmac1 0xc1>, <&dmac1 0xc2>;
- dma-names = "tx", "rx", "tx", "rx";
- max-frequency = <97500000>;
- power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
- resets = <&cpg 312>;
- };
-
- sdhi3: sd@ee160000 {
- compatible = "renesas,sdhi-r8a7790", "renesas,rcar-gen2-sdhi";
- reg = <0 0xee160000 0 0x100>;
- interrupts = <GIC_SPI 168 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&cpg CPG_MOD 311>;
- dmas = <&dmac0 0xd3>, <&dmac0 0xd4>,
- <&dmac1 0xd3>, <&dmac1 0xd4>;
- dma-names = "tx", "rx", "tx", "rx";
- max-frequency = <97500000>;
- power-domains = <&sysc R8A7790_PD_ALWAYS_ON>;
- resets = <&cpg 311>;
- };
diff --git a/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml b/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml
index fbd4da3684fc..b5b3cf5b1ac2 100644
--- a/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml
+++ b/Documentation/devicetree/bindings/mtd/allwinner,sun4i-a10-nand.yaml
@@ -55,9 +55,9 @@ patternProperties:
"^pinctrl-[0-9]+$": true
"^nand@[a-f0-9]+$":
+ type: object
properties:
reg:
- maxItems: 1
minimum: 0
maximum: 7
diff --git a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
index 0b7c3738b66c..82156dc8f304 100644
--- a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
+++ b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
@@ -28,6 +28,7 @@ Required properties:
brcm,brcmnand-v7.0
brcm,brcmnand-v7.1
brcm,brcmnand-v7.2
+ brcm,brcmnand-v7.3
brcm,brcmnand
- reg : the register start and length for NAND register region.
(optional) Flash DMA register range (if present)
@@ -101,10 +102,10 @@ Required properties:
number (e.g., 0, 1, 2, etc.)
- #address-cells : see partition.txt
- #size-cells : see partition.txt
-- nand-ecc-strength : see nand-controller.yaml
-- nand-ecc-step-size : must be 512 or 1024. See nand-controller.yaml
Optional properties:
+- nand-ecc-strength : see nand-controller.yaml
+- nand-ecc-step-size : must be 512 or 1024. See nand-controller.yaml
- nand-on-flash-bbt : boolean, to enable the on-flash BBT for this
chip-select. See nand-controller.yaml
- brcm,nand-oob-sector-size : integer, to denote the spare area sector size
diff --git a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
index 4345c3a6f530..945be7d5b236 100644
--- a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
+++ b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
@@ -35,6 +35,9 @@ custom properties:
(qspi_n_ss_out).
- cdns,tslch-ns : Delay in nanoseconds between setting qspi_n_ss_out low
and first bit transfer.
+- resets : Must contain an entry for each entry in reset-names.
+ See ../reset/reset.txt for details.
+- reset-names : Must include either "qspi" and/or "qspi-ocp".
Example:
@@ -50,6 +53,8 @@ Example:
cdns,fifo-depth = <128>;
cdns,fifo-width = <4>;
cdns,trigger-address = <0x00000000>;
+ resets = <&rst QSPI_RESET>, <&rst QSPI_OCP_RESET>;
+ reset-names = "qspi", "qspi-ocp";
flash0: n25q00@0 {
...
diff --git a/Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt b/Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt
new file mode 100644
index 000000000000..ad42f4db32f1
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/cypress,hyperflash.txt
@@ -0,0 +1,13 @@
+Bindings for HyperFlash NOR flash chips compliant with Cypress HyperBus
+specification and supports Cypress CFI specification 1.5 command set.
+
+Required properties:
+- compatible : "cypress,hyperflash", "cfi-flash" for HyperFlash NOR chips
+- reg : Address of flash's memory map
+
+Example:
+
+ flash@0 {
+ compatible = "cypress,hyperflash", "cfi-flash";
+ reg = <0x0 0x4000000>;
+ };
diff --git a/Documentation/devicetree/bindings/mtd/nand-controller.yaml b/Documentation/devicetree/bindings/mtd/nand-controller.yaml
index 199ba5ac2a06..d261b7096c69 100644
--- a/Documentation/devicetree/bindings/mtd/nand-controller.yaml
+++ b/Documentation/devicetree/bindings/mtd/nand-controller.yaml
@@ -40,6 +40,7 @@ properties:
patternProperties:
"^nand@[a-f0-9]$":
+ type: object
properties:
reg:
description:
diff --git a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt b/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
deleted file mode 100644
index ddd18c135148..000000000000
--- a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-* STMicroelectronics Quad Serial Peripheral Interface(QuadSPI)
-
-Required properties:
-- compatible: should be "st,stm32f469-qspi"
-- reg: the first contains the register location and length.
- the second contains the memory mapping address and length
-- reg-names: should contain the reg names "qspi" "qspi_mm"
-- interrupts: should contain the interrupt for the device
-- clocks: the phandle of the clock needed by the QSPI controller
-- A pinctrl must be defined to set pins in mode of operation for QSPI transfer
-
-Optional properties:
-- resets: must contain the phandle to the reset controller.
-
-A spi flash must be a child of the nor_flash node and could have some
-properties. Also see jedec,spi-nor.txt.
-
-Required properties:
-- reg: chip-Select number (QSPI controller may connect 2 nor flashes)
-- spi-max-frequency: max frequency of spi bus
-
-Optional property:
-- spi-rx-bus-width: see ../spi/spi-bus.txt for the description
-
-Example:
-
-qspi: spi@a0001000 {
- compatible = "st,stm32f469-qspi";
- reg = <0xa0001000 0x1000>, <0x90000000 0x10000000>;
- reg-names = "qspi", "qspi_mm";
- interrupts = <91>;
- resets = <&rcc STM32F4_AHB3_RESET(QSPI)>;
- clocks = <&rcc 0 STM32F4_AHB3_CLOCK(QSPI)>;
- pinctrl-names = "default";
- pinctrl-0 = <&pinctrl_qspi0>;
-
- flash@0 {
- reg = <0>;
- spi-rx-bus-width = <4>;
- spi-max-frequency = <108000000>;
- ...
- };
-};
diff --git a/Documentation/devicetree/bindings/mtd/ti,am654-hbmc.txt b/Documentation/devicetree/bindings/mtd/ti,am654-hbmc.txt
new file mode 100644
index 000000000000..faa81c2e5da6
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/ti,am654-hbmc.txt
@@ -0,0 +1,51 @@
+Bindings for HyperBus Memory Controller (HBMC) on TI's K3 family of SoCs
+
+Required properties:
+- compatible : "ti,am654-hbmc" for AM654 SoC
+- reg : Two entries:
+ First entry pointed to the register space of HBMC controller
+ Second entry pointing to the memory map region dedicated for
+ MMIO access to attached flash devices
+- ranges : Address translation from offset within CS to allocated MMIO
+ space in SoC
+
+Optional properties:
+- mux-controls : phandle to the multiplexer that controls selection of
+ HBMC vs OSPI inside Flash SubSystem (FSS). Default is OSPI,
+ if property is absent.
+ See Documentation/devicetree/bindings/mux/reg-mux.txt
+ for mmio-mux binding details
+
+Example:
+
+ system-controller@47000000 {
+ compatible = "syscon", "simple-mfd";
+ reg = <0x0 0x47000000 0x0 0x100>;
+ #address-cells = <2>;
+ #size-cells = <2>;
+ ranges;
+
+ hbmc_mux: multiplexer {
+ compatible = "mmio-mux";
+ #mux-control-cells = <1>;
+ mux-reg-masks = <0x4 0x2>; /* 0: reg 0x4, bit 1 */
+ };
+ };
+
+ hbmc: hyperbus@47034000 {
+ compatible = "ti,am654-hbmc";
+ reg = <0x0 0x47034000 0x0 0x100>,
+ <0x5 0x00000000 0x1 0x0000000>;
+ power-domains = <&k3_pds 55>;
+ #address-cells = <2>;
+ #size-cells = <1>;
+ ranges = <0x0 0x0 0x5 0x00000000 0x4000000>, /* CS0 - 64MB */
+ <0x1 0x0 0x5 0x04000000 0x4000000>; /* CS1 - 64MB */
+ mux-controls = <&hbmc_mux 0>;
+
+ /* Slave flash node */
+ flash@0,0 {
+ compatible = "cypress,hyperflash", "cfi-flash";
+ reg = <0x0 0x0 0x4000000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/mux/mmio-mux.txt b/Documentation/devicetree/bindings/mux/mmio-mux.txt
deleted file mode 100644
index a9bfb4d8b6ac..000000000000
--- a/Documentation/devicetree/bindings/mux/mmio-mux.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-MMIO register bitfield-based multiplexer controller bindings
-
-Define register bitfields to be used to control multiplexers. The parent
-device tree node must be a syscon node to provide register access.
-
-Required properties:
-- compatible : "mmio-mux"
-- #mux-control-cells : <1>
-- mux-reg-masks : an array of register offset and pre-shifted bitfield mask
- pairs, each describing a single mux control.
-* Standard mux-controller bindings as decribed in mux-controller.txt
-
-Optional properties:
-- idle-states : if present, the state the muxes will have when idle. The
- special state MUX_IDLE_AS_IS is the default.
-
-The multiplexer state of each multiplexer is defined as the value of the
-bitfield described by the corresponding register offset and bitfield mask pair
-in the mux-reg-masks array, accessed through the parent syscon.
-
-Example:
-
- syscon {
- compatible = "syscon";
-
- mux: mux-controller {
- compatible = "mmio-mux";
- #mux-control-cells = <1>;
-
- mux-reg-masks = <0x3 0x30>, /* 0: reg 0x3, bits 5:4 */
- <0x3 0x40>, /* 1: reg 0x3, bit 6 */
- idle-states = <MUX_IDLE_AS_IS>, <0>;
- };
- };
-
- video-mux {
- compatible = "video-mux";
- mux-controls = <&mux 0>;
-
- ports {
- /* inputs 0..3 */
- port@0 {
- reg = <0>;
- };
- port@1 {
- reg = <1>;
- };
- port@2 {
- reg = <2>;
- };
- port@3 {
- reg = <3>;
- };
-
- /* output */
- port@4 {
- reg = <4>;
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/mux/reg-mux.txt b/Documentation/devicetree/bindings/mux/reg-mux.txt
new file mode 100644
index 000000000000..4afd7ba73d60
--- /dev/null
+++ b/Documentation/devicetree/bindings/mux/reg-mux.txt
@@ -0,0 +1,129 @@
+Generic register bitfield-based multiplexer controller bindings
+
+Define register bitfields to be used to control multiplexers. The parent
+device tree node must be a device node to provide register r/w access.
+
+Required properties:
+- compatible : should be one of
+ "reg-mux" : if parent device of mux controller is not syscon device
+ "mmio-mux" : if parent device of mux controller is syscon device
+- #mux-control-cells : <1>
+- mux-reg-masks : an array of register offset and pre-shifted bitfield mask
+ pairs, each describing a single mux control.
+* Standard mux-controller bindings as decribed in mux-controller.txt
+
+Optional properties:
+- idle-states : if present, the state the muxes will have when idle. The
+ special state MUX_IDLE_AS_IS is the default.
+
+The multiplexer state of each multiplexer is defined as the value of the
+bitfield described by the corresponding register offset and bitfield mask
+pair in the mux-reg-masks array.
+
+Example 1:
+The parent device of mux controller is not a syscon device.
+
+&i2c0 {
+ fpga@66 { // fpga connected to i2c
+ compatible = "fsl,lx2160aqds-fpga", "fsl,fpga-qixis-i2c",
+ "simple-mfd";
+ reg = <0x66>;
+
+ mux: mux-controller {
+ compatible = "reg-mux";
+ #mux-control-cells = <1>;
+ mux-reg-masks = <0x54 0xf8>, /* 0: reg 0x54, bits 7:3 */
+ <0x54 0x07>; /* 1: reg 0x54, bits 2:0 */
+ };
+ };
+};
+
+mdio-mux-1 {
+ compatible = "mdio-mux-multiplexer";
+ mux-controls = <&mux 0>;
+ mdio-parent-bus = <&emdio1>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ mdio@0 {
+ reg = <0x0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+ mdio@8 {
+ reg = <0x8>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+ ..
+ ..
+};
+
+mdio-mux-2 {
+ compatible = "mdio-mux-multiplexer";
+ mux-controls = <&mux 1>;
+ mdio-parent-bus = <&emdio2>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ mdio@0 {
+ reg = <0x0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+ mdio@1 {
+ reg = <0x1>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+ ..
+ ..
+};
+
+Example 2:
+The parent device of mux controller is syscon device.
+
+syscon {
+ compatible = "syscon";
+
+ mux: mux-controller {
+ compatible = "mmio-mux";
+ #mux-control-cells = <1>;
+
+ mux-reg-masks = <0x3 0x30>, /* 0: reg 0x3, bits 5:4 */
+ <0x3 0x40>, /* 1: reg 0x3, bit 6 */
+ idle-states = <MUX_IDLE_AS_IS>, <0>;
+ };
+};
+
+video-mux {
+ compatible = "video-mux";
+ mux-controls = <&mux 0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ports {
+ /* inputs 0..3 */
+ port@0 {
+ reg = <0>;
+ };
+ port@1 {
+ reg = <1>;
+ };
+ port@2 {
+ reg = <2>;
+ };
+ port@3 {
+ reg = <3>;
+ };
+
+ /* output */
+ port@4 {
+ reg = <4>;
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml
new file mode 100644
index 000000000000..792196bf4abd
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-emac.yaml
@@ -0,0 +1,56 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-emac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 EMAC Ethernet Controller Device Tree Bindings
+
+allOf:
+ - $ref: "ethernet-controller.yaml#"
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ compatible:
+ const: allwinner,sun4i-a10-emac
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+ allwinner,sram:
+ description: Phandle to the device SRAM
+ $ref: /schemas/types.yaml#/definitions/phandle-array
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - phy-handle
+ - allwinner,sram
+
+examples:
+ - |
+ emac: ethernet@1c0b000 {
+ compatible = "allwinner,sun4i-a10-emac";
+ reg = <0x01c0b000 0x1000>;
+ interrupts = <55>;
+ clocks = <&ahb_gates 17>;
+ phy-handle = <&phy0>;
+ allwinner,sram = <&emac_sram 1>;
+ };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml
new file mode 100644
index 000000000000..df24d9d969f7
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun4i-a10-mdio.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun4i-a10-mdio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 MDIO Controller Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+allOf:
+ - $ref: "mdio.yaml#"
+
+# Select every compatible, including the deprecated ones. This way, we
+# will be able to report a warning when we have that compatible, since
+# we will validate the node thanks to the select, but won't report it
+# as a valid value in the compatible property description
+select:
+ properties:
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-mdio
+
+ # Deprecated
+ - allwinner,sun4i-mdio
+
+ required:
+ - compatible
+
+properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun4i-a10-mdio
+
+ reg:
+ maxItems: 1
+
+ phy-supply:
+ description: PHY regulator
+
+required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ mdio@1c0b080 {
+ compatible = "allwinner,sun4i-a10-mdio";
+ reg = <0x01c0b080 0x14>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ phy-supply = <&reg_emac_3v3>;
+
+ phy0: ethernet-phy@0 {
+ reg = <0>;
+ };
+ };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt b/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt
deleted file mode 100644
index e98118aef5f6..000000000000
--- a/Documentation/devicetree/bindings/net/allwinner,sun4i-emac.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-* Allwinner EMAC ethernet controller
-
-Required properties:
-- compatible: should be "allwinner,sun4i-a10-emac" (Deprecated:
- "allwinner,sun4i-emac")
-- reg: address and length of the register set for the device.
-- interrupts: interrupt for the device
-- phy: see ethernet.txt file in the same directory.
-- clocks: A phandle to the reference clock for this device
-
-Example:
-
-emac: ethernet@1c0b000 {
- compatible = "allwinner,sun4i-a10-emac";
- reg = <0x01c0b000 0x1000>;
- interrupts = <55>;
- clocks = <&ahb_gates 17>;
- phy = <&phy0>;
-};
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt b/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt
deleted file mode 100644
index ab5b8613b0ef..000000000000
--- a/Documentation/devicetree/bindings/net/allwinner,sun4i-mdio.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* Allwinner A10 MDIO Ethernet Controller interface
-
-Required properties:
-- compatible: should be "allwinner,sun4i-a10-mdio"
- (Deprecated: "allwinner,sun4i-mdio").
-- reg: address and length of the register set for the device.
-
-Optional properties:
-- phy-supply: phandle to a regulator if the PHY needs one
-
-Example at the SoC level:
-mdio@1c0b080 {
- compatible = "allwinner,sun4i-a10-mdio";
- reg = <0x01c0b080 0x14>;
- #address-cells = <1>;
- #size-cells = <0>;
-};
-
-And at the board level:
-
-mdio@1c0b080 {
- phy-supply = <&reg_emac_3v3>;
-
- phy0: ethernet-phy@0 {
- reg = <0>;
- };
-};
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt
deleted file mode 100644
index 8b3f953656e3..000000000000
--- a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-* Allwinner GMAC ethernet controller
-
-This device is a platform glue layer for stmmac.
-Please see stmmac.txt for the other unchanged properties.
-
-Required properties:
- - compatible: Should be "allwinner,sun7i-a20-gmac"
- - clocks: Should contain the GMAC main clock, and tx clock
- The tx clock type should be "allwinner,sun7i-a20-gmac-clk"
- - clock-names: Should contain the clock names "stmmaceth",
- and "allwinner_gmac_tx"
-
-Optional properties:
-- phy-supply: phandle to a regulator if the PHY needs one
-
-Examples:
-
- gmac: ethernet@1c50000 {
- compatible = "allwinner,sun7i-a20-gmac";
- reg = <0x01c50000 0x10000>,
- <0x01c20164 0x4>;
- interrupts = <0 85 1>;
- interrupt-names = "macirq";
- clocks = <&ahb_gates 49>, <&gmac_tx>;
- clock-names = "stmmaceth", "allwinner_gmac_tx";
- phy-mode = "mii";
- };
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml
new file mode 100644
index 000000000000..06b1cc8bea14
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun7i-a20-gmac.yaml
@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun7i-a20-gmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A20 GMAC Device Tree Bindings
+
+allOf:
+ - $ref: "snps,dwmac.yaml#"
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ compatible:
+ const: allwinner,sun7i-a20-gmac
+
+ interrupts:
+ maxItems: 1
+
+ interrupt-names:
+ const: macirq
+
+ clocks:
+ items:
+ - description: GMAC main clock
+ - description: TX clock
+
+ clock-names:
+ items:
+ - const: stmmaceth
+ - const: allwinner_gmac_tx
+
+ phy-supply:
+ description:
+ PHY regulator
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - interrupt-names
+ - clocks
+ - clock-names
+ - phy-mode
+
+examples:
+ - |
+ gmac: ethernet@1c50000 {
+ compatible = "allwinner,sun7i-a20-gmac";
+ reg = <0x01c50000 0x10000>;
+ interrupts = <0 85 1>;
+ interrupt-names = "macirq";
+ clocks = <&ahb_gates 49>, <&gmac_tx>;
+ clock-names = "stmmaceth", "allwinner_gmac_tx";
+ phy-mode = "mii";
+ };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml
new file mode 100644
index 000000000000..d4084c149768
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml
@@ -0,0 +1,321 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/allwinner,sun8i-a83t-gmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A83t EMAC Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ compatible:
+ oneOf:
+ - const: allwinner,sun8i-a83t-emac
+ - const: allwinner,sun8i-h3-emac
+ - const: allwinner,sun8i-r40-emac
+ - const: allwinner,sun8i-v3s-emac
+ - const: allwinner,sun50i-a64-emac
+ - items:
+ - const: allwinner,sun50i-h6-emac
+ - const: allwinner,sun50i-a64-emac
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ interrupt-names:
+ const: macirq
+
+ clocks:
+ maxItems: 1
+
+ clock-names:
+ const: stmmaceth
+
+ syscon:
+ $ref: /schemas/types.yaml#definitions/phandle
+ description:
+ Phandle to the device containing the EMAC or GMAC clock
+ register
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - interrupt-names
+ - clocks
+ - clock-names
+ - resets
+ - reset-names
+ - phy-handle
+ - phy-mode
+ - syscon
+
+allOf:
+ - $ref: "snps,dwmac.yaml#"
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun8i-a83t-emac
+ - allwinner,sun8i-h3-emac
+ - allwinner,sun8i-v3s-emac
+ - allwinner,sun50i-a64-emac
+
+ then:
+ properties:
+ allwinner,tx-delay-ps:
+ default: 0
+ minimum: 0
+ maximum: 700
+ multipleOf: 100
+ description:
+ External RGMII PHY TX clock delay chain value in ps.
+
+ allwinner,rx-delay-ps:
+ default: 0
+ minimum: 0
+ maximum: 3100
+ multipleOf: 100
+ description:
+ External RGMII PHY TX clock delay chain value in ps.
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun8i-r40-emac
+
+ then:
+ properties:
+ allwinner,rx-delay-ps:
+ default: 0
+ minimum: 0
+ maximum: 700
+ multipleOf: 100
+ description:
+ External RGMII PHY TX clock delay chain value in ps.
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun8i-h3-emac
+ - allwinner,sun8i-v3s-emac
+
+ then:
+ properties:
+ allwinner,leds-active-low:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ EPHY LEDs are active low.
+
+ mdio-mux:
+ type: object
+
+ properties:
+ compatible:
+ const: allwinner,sun8i-h3-mdio-mux
+
+ mdio-parent-bus:
+ $ref: /schemas/types.yaml#definitions/phandle
+ description:
+ Phandle to EMAC MDIO.
+
+ mdio@1:
+ type: object
+ description: Internal MDIO Bus
+
+ properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun8i-h3-mdio-internal
+
+ reg:
+ const: 1
+
+ patternProperties:
+ "^ethernet-phy@[0-9a-f]$":
+ type: object
+ description:
+ Integrated PHY node
+
+ properties:
+ clocks:
+ maxItems: 1
+
+ resets:
+ maxItems: 1
+
+ required:
+ - clocks
+ - resets
+
+
+ mdio@2:
+ type: object
+ description: External MDIO Bus (H3 only)
+
+ properties:
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ reg:
+ const: 2
+
+ required:
+ - compatible
+ - mdio-parent-bus
+ - mdio@1
+
+examples:
+ - |
+ ethernet@1c0b000 {
+ compatible = "allwinner,sun8i-h3-emac";
+ syscon = <&syscon>;
+ reg = <0x01c0b000 0x104>;
+ interrupts = <0 82 1>;
+ interrupt-names = "macirq";
+ resets = <&ccu 12>;
+ reset-names = "stmmaceth";
+ clocks = <&ccu 27>;
+ clock-names = "stmmaceth";
+
+ phy-handle = <&int_mii_phy>;
+ phy-mode = "mii";
+ allwinner,leds-active-low;
+
+ mdio1: mdio {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "snps,dwmac-mdio";
+ };
+
+ mdio-mux {
+ compatible = "allwinner,sun8i-h3-mdio-mux";
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ mdio-parent-bus = <&mdio1>;
+
+ int_mii_phy: mdio@1 {
+ compatible = "allwinner,sun8i-h3-mdio-internal";
+ reg = <1>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ethernet-phy@1 {
+ reg = <1>;
+ clocks = <&ccu 67>;
+ resets = <&ccu 39>;
+ phy-is-integrated;
+ };
+ };
+
+ mdio@2 {
+ reg = <2>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+ };
+ };
+
+ - |
+ ethernet@1c0b000 {
+ compatible = "allwinner,sun8i-h3-emac";
+ syscon = <&syscon>;
+ reg = <0x01c0b000 0x104>;
+ interrupts = <0 82 1>;
+ interrupt-names = "macirq";
+ resets = <&ccu 12>;
+ reset-names = "stmmaceth";
+ clocks = <&ccu 27>;
+ clock-names = "stmmaceth";
+
+ phy-handle = <&ext_rgmii_phy>;
+ phy-mode = "rgmii";
+ allwinner,leds-active-low;
+
+ mdio2: mdio {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "snps,dwmac-mdio";
+ };
+
+ mdio-mux {
+ compatible = "allwinner,sun8i-h3-mdio-mux";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ mdio-parent-bus = <&mdio2>;
+
+ mdio@1 {
+ compatible = "allwinner,sun8i-h3-mdio-internal";
+ reg = <1>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ethernet-phy@1 {
+ reg = <1>;
+ clocks = <&ccu 67>;
+ resets = <&ccu 39>;
+ };
+ };
+
+ mdio@2 {
+ reg = <2>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ext_rgmii_phy: ethernet-phy@1 {
+ reg = <1>;
+ };
+ };
+ };
+ };
+
+ - |
+ ethernet@1c0b000 {
+ compatible = "allwinner,sun8i-a83t-emac";
+ syscon = <&syscon>;
+ reg = <0x01c0b000 0x104>;
+ interrupts = <0 82 1>;
+ interrupt-names = "macirq";
+ resets = <&ccu 13>;
+ reset-names = "stmmaceth";
+ clocks = <&ccu 27>;
+ clock-names = "stmmaceth";
+ phy-handle = <&ext_rgmii_phy1>;
+ phy-mode = "rgmii";
+
+ mdio {
+ compatible = "snps,dwmac-mdio";
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ext_rgmii_phy1: ethernet-phy@1 {
+ reg = <1>;
+ };
+ };
+ };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt b/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt
index 188c8bd4eb67..5a0111d4de58 100644
--- a/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt
+++ b/Documentation/devicetree/bindings/net/can/microchip,mcp251x.txt
@@ -4,6 +4,7 @@ Required properties:
- compatible: Should be one of the following:
- "microchip,mcp2510" for MCP2510.
- "microchip,mcp2515" for MCP2515.
+ - "microchip,mcp25625" for MCP25625.
- reg: SPI chip select.
- clocks: The clock feeding the CAN controller.
- interrupts: Should contain IRQ line for the CAN controller.
diff --git a/Documentation/devicetree/bindings/net/can/rcar_can.txt b/Documentation/devicetree/bindings/net/can/rcar_can.txt
index 9936b9ee67c3..b463e1268ac4 100644
--- a/Documentation/devicetree/bindings/net/can/rcar_can.txt
+++ b/Documentation/devicetree/bindings/net/can/rcar_can.txt
@@ -6,6 +6,7 @@ Required properties:
"renesas,can-r8a7744" if CAN controller is a part of R8A7744 SoC.
"renesas,can-r8a7745" if CAN controller is a part of R8A7745 SoC.
"renesas,can-r8a774a1" if CAN controller is a part of R8A774A1 SoC.
+ "renesas,can-r8a774c0" if CAN controller is a part of R8A774C0 SoC.
"renesas,can-r8a7778" if CAN controller is a part of R8A7778 SoC.
"renesas,can-r8a7779" if CAN controller is a part of R8A7779 SoC.
"renesas,can-r8a7790" if CAN controller is a part of R8A7790 SoC.
@@ -27,13 +28,8 @@ Required properties:
- reg: physical base address and size of the R-Car CAN register map.
- interrupts: interrupt specifier for the sole interrupt.
-- clocks: phandles and clock specifiers for 2 CAN clock inputs for RZ/G2
- devices.
- phandles and clock specifiers for 3 CAN clock inputs for every other
- SoC.
-- clock-names: 2 clock input name strings for RZ/G2: "clkp1", "can_clk".
- 3 clock input name strings for every other SoC: "clkp1", "clkp2",
- "can_clk".
+- clocks: phandles and clock specifiers for 3 CAN clock inputs.
+- clock-names: 3 clock input name strings: "clkp1", "clkp2", and "can_clk".
- pinctrl-0: pin control group to be used for this controller.
- pinctrl-names: must be "default".
@@ -49,8 +45,7 @@ using the below properties:
Optional properties:
- renesas,can-clock-select: R-Car CAN Clock Source Select. Valid values are:
<0x0> (default) : Peripheral clock (clkp1)
- <0x1> : Peripheral clock (clkp2) (not supported by
- RZ/G2 devices)
+ <0x1> : Peripheral clock (clkp2)
<0x3> : External input clock
Example
diff --git a/Documentation/devicetree/bindings/net/can/rcar_canfd.txt b/Documentation/devicetree/bindings/net/can/rcar_canfd.txt
index ac71daa46195..32f051f6d338 100644
--- a/Documentation/devicetree/bindings/net/can/rcar_canfd.txt
+++ b/Documentation/devicetree/bindings/net/can/rcar_canfd.txt
@@ -3,11 +3,14 @@ Renesas R-Car CAN FD controller Device Tree Bindings
Required properties:
- compatible: Must contain one or more of the following:
- - "renesas,rcar-gen3-canfd" for R-Car Gen3 compatible controller.
+ - "renesas,rcar-gen3-canfd" for R-Car Gen3 and RZ/G2 compatible controllers.
+ - "renesas,r8a774c0-canfd" for R8A774C0 (RZ/G2E) compatible controller.
- "renesas,r8a7795-canfd" for R8A7795 (R-Car H3) compatible controller.
- "renesas,r8a7796-canfd" for R8A7796 (R-Car M3-W) compatible controller.
+ - "renesas,r8a77965-canfd" for R8A77965 (R-Car M3-N) compatible controller.
- "renesas,r8a77970-canfd" for R8A77970 (R-Car V3M) compatible controller.
- "renesas,r8a77980-canfd" for R8A77980 (R-Car V3H) compatible controller.
+ - "renesas,r8a77990-canfd" for R8A77990 (R-Car E3) compatible controller.
When compatible with the generic version, nodes must list the
SoC-specific version corresponding to the platform first, followed by the
@@ -26,12 +29,13 @@ The name of the child nodes are "channel0" and "channel1" respectively. Each
child node supports the "status" property only, which is used to
enable/disable the respective channel.
-Required properties for "renesas,r8a7795-canfd" and "renesas,r8a7796-canfd"
+Required properties for "renesas,r8a774c0-canfd", "renesas,r8a7795-canfd",
+"renesas,r8a7796-canfd", "renesas,r8a77965-canfd", and "renesas,r8a77990-canfd"
compatible:
-In R8A7795 and R8A7796 SoCs, canfd clock is a div6 clock and can be used by both
-CAN and CAN FD controller at the same time. It needs to be scaled to maximum
-frequency if any of these controllers use it. This is done using the below
-properties:
+In R8A774C0, R8A7795, R8A7796, R8A77965, and R8A77990 SoCs, canfd clock is a
+div6 clock and can be used by both CAN and CAN FD controller at the same time.
+It needs to be scaled to maximum frequency if any of these controllers use it.
+This is done using the below properties:
- assigned-clocks: phandle of canfd clock.
- assigned-clock-rates: maximum frequency of this clock.
diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt
index e7db7268fd0f..4ac21cef370e 100644
--- a/Documentation/devicetree/bindings/net/dsa/ksz.txt
+++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt
@@ -16,6 +16,8 @@ Required properties:
Optional properties:
- reset-gpios : Should be a gpio specifier for a reset line
+- microchip,synclko-125 : Set if the output SYNCLKO frequency should be set to
+ 125MHz instead of 25MHz.
See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of additional
required and optional properties.
diff --git a/Documentation/devicetree/bindings/net/dsa/marvell.txt b/Documentation/devicetree/bindings/net/dsa/marvell.txt
index feb007af13cb..6f9538974bb9 100644
--- a/Documentation/devicetree/bindings/net/dsa/marvell.txt
+++ b/Documentation/devicetree/bindings/net/dsa/marvell.txt
@@ -21,10 +21,13 @@ which is at a different MDIO base address in different switch families.
6341, 6350, 6351, 6352
- "marvell,mv88e6190" : Switch has base address 0x00. Use with models:
6190, 6190X, 6191, 6290, 6390, 6390X
+- "marvell,mv88e6250" : Switch has base address 0x08 or 0x18. Use with model:
+ 6250
Required properties:
-- compatible : Should be one of "marvell,mv88e6085" or
- "marvell,mv88e6190" as indicated above
+- compatible : Should be one of "marvell,mv88e6085",
+ "marvell,mv88e6190" or "marvell,mv88e6250" as
+ indicated above
- reg : Address on the MII bus for the switch.
Optional properties:
diff --git a/Documentation/devicetree/bindings/net/dsa/qca8k.txt b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
index 93a7469e70d4..ccbc6d89325d 100644
--- a/Documentation/devicetree/bindings/net/dsa/qca8k.txt
+++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
@@ -9,6 +9,10 @@ Required properties:
- #size-cells: must be 0
- #address-cells: must be 1
+Optional properties:
+
+- reset-gpios: GPIO to be used to reset the whole device
+
Subnodes:
The integrated switch subnode should be specified according to the binding
@@ -66,6 +70,7 @@ for the external mdio-bus configuration:
#address-cells = <1>;
#size-cells = <0>;
+ reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>;
reg = <0x10>;
ports {
@@ -123,6 +128,7 @@ for the internal master mdio-bus configuration:
#address-cells = <1>;
#size-cells = <0>;
+ reset-gpios = <&gpio 42 GPIO_ACTIVE_LOW>;
reg = <0x10>;
ports {
diff --git a/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
index ed4710c40641..bbf4a13f6d75 100644
--- a/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
+++ b/Documentation/devicetree/bindings/net/dsa/vitesse,vsc73xx.txt
@@ -2,8 +2,8 @@ Vitesse VSC73xx Switches
========================
This defines device tree bindings for the Vitesse VSC73xx switch chips.
-The Vitesse company has been acquired by Microsemi and Microsemi in turn
-acquired by Microchip but retains this vendor branding.
+The Vitesse company has been acquired by Microsemi and Microsemi has
+been acquired Microchip but retains this vendor branding.
The currently supported switch chips are:
Vitesse VSC7385 SparX-G5 5+1-port Integrated Gigabit Ethernet Switch
@@ -11,8 +11,14 @@ Vitesse VSC7388 SparX-G8 8-port Integrated Gigabit Ethernet Switch
Vitesse VSC7395 SparX-G5e 5+1-port Integrated Gigabit Ethernet Switch
Vitesse VSC7398 SparX-G8e 8-port Integrated Gigabit Ethernet Switch
-The device tree node is an SPI device so it must reside inside a SPI bus
-device tree node, see spi/spi-bus.txt
+This switch could have two different management interface.
+
+If SPI interface is used, the device tree node is an SPI device so it must
+reside inside a SPI bus device tree node, see spi/spi-bus.txt
+
+When the chip is connected to a parallel memory bus and work in memory-mapped
+I/O mode, a platform device is used to represent the vsc73xx. In this case it
+must reside inside a platform bus device tree node.
Required properties:
@@ -38,6 +44,7 @@ and subnodes of DSA switches.
Examples:
+SPI:
switch@0 {
compatible = "vitesse,vsc7395";
reg = <0>;
@@ -79,3 +86,46 @@ switch@0 {
};
};
};
+
+Platform:
+switch@2,0 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "vitesse,vsc7385";
+ reg = <0x2 0x0 0x20000>;
+ reset-gpios = <&gpio0 12 GPIO_ACTIVE_LOW>;
+
+ ports {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ port@0 {
+ reg = <0>;
+ label = "lan1";
+ };
+ port@1 {
+ reg = <1>;
+ label = "lan2";
+ };
+ port@2 {
+ reg = <2>;
+ label = "lan3";
+ };
+ port@3 {
+ reg = <3>;
+ label = "lan4";
+ };
+ vsc: port@6 {
+ reg = <6>;
+ label = "cpu";
+ ethernet = <&enet0>;
+ phy-mode = "rgmii";
+ fixed-link {
+ speed = <1000>;
+ full-duplex;
+ pause;
+ };
+ };
+ };
+
+};
diff --git a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt b/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
deleted file mode 100644
index 54c66d0611cb..000000000000
--- a/Documentation/devicetree/bindings/net/dwmac-sun8i.txt
+++ /dev/null
@@ -1,201 +0,0 @@
-* Allwinner sun8i GMAC ethernet controller
-
-This device is a platform glue layer for stmmac.
-Please see stmmac.txt for the other unchanged properties.
-
-Required properties:
-- compatible: must be one of the following string:
- "allwinner,sun8i-a83t-emac"
- "allwinner,sun8i-h3-emac"
- "allwinner,sun8i-r40-gmac"
- "allwinner,sun8i-v3s-emac"
- "allwinner,sun50i-a64-emac"
- "allwinner,sun50i-h6-emac", "allwinner-sun50i-a64-emac"
-- reg: address and length of the register for the device.
-- interrupts: interrupt for the device
-- interrupt-names: must be "macirq"
-- clocks: A phandle to the reference clock for this device
-- clock-names: must be "stmmaceth"
-- resets: A phandle to the reset control for this device
-- reset-names: must be "stmmaceth"
-- phy-mode: See ethernet.txt
-- phy-handle: See ethernet.txt
-- syscon: A phandle to the device containing the EMAC or GMAC clock register
-
-Optional properties:
-- allwinner,tx-delay-ps: TX clock delay chain value in ps.
- Range is 0-700. Default is 0.
- Unavailable for allwinner,sun8i-r40-gmac
-- allwinner,rx-delay-ps: RX clock delay chain value in ps.
- Range is 0-3100. Default is 0.
- Range is 0-700 for allwinner,sun8i-r40-gmac
-Both delay properties need to be a multiple of 100. They control the
-clock delay for external RGMII PHY. They do not apply to the internal
-PHY or external non-RGMII PHYs.
-
-Optional properties for the following compatibles:
- - "allwinner,sun8i-h3-emac",
- - "allwinner,sun8i-v3s-emac":
-- allwinner,leds-active-low: EPHY LEDs are active low
-
-Required child node of emac:
-- mdio bus node: should be named mdio with compatible "snps,dwmac-mdio"
-
-Required properties of the mdio node:
-- #address-cells: shall be 1
-- #size-cells: shall be 0
-
-The device node referenced by "phy" or "phy-handle" must be a child node
-of the mdio node. See phy.txt for the generic PHY bindings.
-
-The following compatibles require that the emac node have a mdio-mux child
-node called "mdio-mux":
- - "allwinner,sun8i-h3-emac"
- - "allwinner,sun8i-v3s-emac":
-Required properties for the mdio-mux node:
- - compatible = "allwinner,sun8i-h3-mdio-mux"
- - mdio-parent-bus: a phandle to EMAC mdio
- - one child mdio for the integrated mdio with the compatible
- "allwinner,sun8i-h3-mdio-internal"
- - one child mdio for the external mdio if present (V3s have none)
-Required properties for the mdio-mux children node:
- - reg: 1 for internal MDIO bus, 2 for external MDIO bus
-
-The following compatibles require a PHY node representing the integrated
-PHY, under the integrated MDIO bus node if an mdio-mux node is used:
- - "allwinner,sun8i-h3-emac",
- - "allwinner,sun8i-v3s-emac":
-
-Additional information regarding generic multiplexer properties can be found
-at Documentation/devicetree/bindings/net/mdio-mux.txt
-
-Required properties of the integrated phy node:
-- clocks: a phandle to the reference clock for the EPHY
-- resets: a phandle to the reset control for the EPHY
-- Must be a child of the integrated mdio
-
-Example with integrated PHY:
-emac: ethernet@1c0b000 {
- compatible = "allwinner,sun8i-h3-emac";
- syscon = <&syscon>;
- reg = <0x01c0b000 0x104>;
- interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "macirq";
- resets = <&ccu RST_BUS_EMAC>;
- reset-names = "stmmaceth";
- clocks = <&ccu CLK_BUS_EMAC>;
- clock-names = "stmmaceth";
-
- phy-handle = <&int_mii_phy>;
- phy-mode = "mii";
- allwinner,leds-active-low;
-
- mdio: mdio {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "snps,dwmac-mdio";
- };
-
- mdio-mux {
- compatible = "mdio-mux", "allwinner,sun8i-h3-mdio-mux";
- #address-cells = <1>;
- #size-cells = <0>;
-
- mdio-parent-bus = <&mdio>;
-
- int_mdio: mdio@1 {
- compatible = "allwinner,sun8i-h3-mdio-internal";
- reg = <1>;
- #address-cells = <1>;
- #size-cells = <0>;
- int_mii_phy: ethernet-phy@1 {
- reg = <1>;
- clocks = <&ccu CLK_BUS_EPHY>;
- resets = <&ccu RST_BUS_EPHY>;
- phy-is-integrated;
- };
- };
- ext_mdio: mdio@2 {
- reg = <2>;
- #address-cells = <1>;
- #size-cells = <0>;
- };
- };
-};
-
-Example with external PHY:
-emac: ethernet@1c0b000 {
- compatible = "allwinner,sun8i-h3-emac";
- syscon = <&syscon>;
- reg = <0x01c0b000 0x104>;
- interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "macirq";
- resets = <&ccu RST_BUS_EMAC>;
- reset-names = "stmmaceth";
- clocks = <&ccu CLK_BUS_EMAC>;
- clock-names = "stmmaceth";
-
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
- allwinner,leds-active-low;
-
- mdio: mdio {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "snps,dwmac-mdio";
- };
-
- mdio-mux {
- compatible = "allwinner,sun8i-h3-mdio-mux";
- #address-cells = <1>;
- #size-cells = <0>;
-
- mdio-parent-bus = <&mdio>;
-
- int_mdio: mdio@1 {
- compatible = "allwinner,sun8i-h3-mdio-internal";
- reg = <1>;
- #address-cells = <1>;
- #size-cells = <0>;
- int_mii_phy: ethernet-phy@1 {
- reg = <1>;
- clocks = <&ccu CLK_BUS_EPHY>;
- resets = <&ccu RST_BUS_EPHY>;
- };
- };
- ext_mdio: mdio@2 {
- reg = <2>;
- #address-cells = <1>;
- #size-cells = <0>;
- ext_rgmii_phy: ethernet-phy@1 {
- reg = <1>;
- };
- }:
- };
-};
-
-Example with SoC without integrated PHY
-
-emac: ethernet@1c0b000 {
- compatible = "allwinner,sun8i-a83t-emac";
- syscon = <&syscon>;
- reg = <0x01c0b000 0x104>;
- interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>;
- interrupt-names = "macirq";
- resets = <&ccu RST_BUS_EMAC>;
- reset-names = "stmmaceth";
- clocks = <&ccu CLK_BUS_EMAC>;
- clock-names = "stmmaceth";
-
- phy-handle = <&ext_rgmii_phy>;
- phy-mode = "rgmii";
-
- mdio: mdio {
- compatible = "snps,dwmac-mdio";
- #address-cells = <1>;
- #size-cells = <0>;
- ext_rgmii_phy: ethernet-phy@1 {
- reg = <1>;
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
new file mode 100644
index 000000000000..0e7c31794ae6
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
@@ -0,0 +1,206 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/ethernet-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ethernet Controller Generic Binding
+
+maintainers:
+ - David S. Miller <davem@davemloft.net>
+
+properties:
+ $nodename:
+ pattern: "^ethernet(@.*)?$"
+
+ local-mac-address:
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/uint8-array
+ - items:
+ - minItems: 6
+ maxItems: 6
+ description:
+ Specifies the MAC address that was assigned to the network device.
+
+ mac-address:
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/uint8-array
+ - items:
+ - minItems: 6
+ maxItems: 6
+ description:
+ Specifies the MAC address that was last used by the boot
+ program; should be used in cases where the MAC address assigned
+ to the device by the boot program is different from the
+ local-mac-address property.
+
+ max-frame-size:
+ $ref: /schemas/types.yaml#definitions/uint32
+ description:
+ Maximum transfer unit (IEEE defined MTU), rather than the
+ maximum frame size (there\'s contradiction in the Devicetree
+ Specification).
+
+ max-speed:
+ $ref: /schemas/types.yaml#definitions/uint32
+ description:
+ Specifies maximum speed in Mbit/s supported by the device.
+
+ nvmem-cells:
+ maxItems: 1
+ description:
+ Reference to an nvmem node for the MAC address
+
+ nvmem-cells-names:
+ const: mac-address
+
+ phy-connection-type:
+ description:
+ Operation mode of the PHY interface
+ enum:
+ # There is not a standard bus between the MAC and the PHY,
+ # something proprietary is being used to embed the PHY in the
+ # MAC.
+ - internal
+ - mii
+ - gmii
+ - sgmii
+ - qsgmii
+ - tbi
+ - rev-mii
+ - rmii
+
+ # RX and TX delays are added by the MAC when required
+ - rgmii
+
+ # RGMII with internal RX and TX delays provided by the PHY,
+ # the MAC should not add the RX or TX delays in this case
+ - rgmii-id
+
+ # RGMII with internal RX delay provided by the PHY, the MAC
+ # should not add an RX delay in this case
+ - rgmii-rxid
+
+ # RGMII with internal TX delay provided by the PHY, the MAC
+ # should not add an TX delay in this case
+ - rgmii-txid
+ - rtbi
+ - smii
+ - xgmii
+ - trgmii
+ - 1000base-x
+ - 2500base-x
+ - rxaui
+ - xaui
+
+ # 10GBASE-KR, XFI, SFI
+ - 10gbase-kr
+ - usxgmii
+
+ phy-mode:
+ $ref: "#/properties/phy-connection-type"
+
+ phy-handle:
+ $ref: /schemas/types.yaml#definitions/phandle
+ description:
+ Specifies a reference to a node representing a PHY device.
+
+ phy:
+ $ref: "#/properties/phy-handle"
+ deprecated: true
+
+ phy-device:
+ $ref: "#/properties/phy-handle"
+ deprecated: true
+
+ rx-fifo-depth:
+ $ref: /schemas/types.yaml#definitions/uint32
+ description:
+ The size of the controller\'s receive fifo in bytes. This is used
+ for components that can have configurable receive fifo sizes,
+ and is useful for determining certain configuration settings
+ such as flow control thresholds.
+
+ tx-fifo-depth:
+ $ref: /schemas/types.yaml#definitions/uint32
+ description:
+ The size of the controller\'s transmit fifo in bytes. This
+ is used for components that can have configurable fifo sizes.
+
+ managed:
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/string
+ - default: auto
+ enum:
+ - auto
+ - in-band-status
+ description:
+ Specifies the PHY management type. If auto is set and fixed-link
+ is not specified, it uses MDIO for management.
+
+ fixed-link:
+ allOf:
+ - if:
+ type: array
+ then:
+ deprecated: true
+ minItems: 1
+ maxItems: 1
+ items:
+ items:
+ - minimum: 0
+ maximum: 31
+ description:
+ Emulated PHY ID, choose any but unique to the all
+ specified fixed-links
+
+ - enum: [0, 1]
+ description:
+ Duplex configuration. 0 for half duplex or 1 for
+ full duplex
+
+ - enum: [10, 100, 1000]
+ description:
+ Link speed in Mbits/sec.
+
+ - enum: [0, 1]
+ description:
+ Pause configuration. 0 for no pause, 1 for pause
+
+ - enum: [0, 1]
+ description:
+ Asymmetric pause configuration. 0 for no asymmetric
+ pause, 1 for asymmetric pause
+
+
+ - if:
+ type: object
+ then:
+ properties:
+ speed:
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/uint32
+ - enum: [10, 100, 1000]
+ description:
+ Link speed.
+
+ full-duplex:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Indicates that full-duplex is used. When absent, half
+ duplex is assumed.
+
+ asym-pause:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Indicates that asym_pause should be enabled.
+
+ link-gpios:
+ maxItems: 1
+ description:
+ GPIO to determine if the link is up
+
+ required:
+ - speed
+
+...
diff --git a/Documentation/devicetree/bindings/net/ethernet-phy.yaml b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
new file mode 100644
index 000000000000..f70f18ff821f
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ethernet-phy.yaml
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/ethernet-phy.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Ethernet PHY Generic Binding
+
+maintainers:
+ - Andrew Lunn <andrew@lunn.ch>
+ - Florian Fainelli <f.fainelli@gmail.com>
+ - Heiner Kallweit <hkallweit1@gmail.com>
+
+# The dt-schema tools will generate a select statement first by using
+# the compatible, and second by using the node name if any. In our
+# case, the node name is the one we want to match on, while the
+# compatible is optional.
+select:
+ properties:
+ $nodename:
+ pattern: "^ethernet-phy(@[a-f0-9]+)?$"
+
+ required:
+ - $nodename
+
+properties:
+ $nodename:
+ pattern: "^ethernet-phy(@[a-f0-9]+)?$"
+
+ compatible:
+ oneOf:
+ - const: ethernet-phy-ieee802.3-c22
+ description: PHYs that implement IEEE802.3 clause 22
+ - const: ethernet-phy-ieee802.3-c45
+ description: PHYs that implement IEEE802.3 clause 45
+ - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
+ description:
+ If the PHY reports an incorrect ID (or none at all) then the
+ compatible list may contain an entry with the correct PHY ID
+ in the above form.
+ The first group of digits is the 16 bit Phy Identifier 1
+ register, this is the chip vendor OUI bits 3:18. The
+ second group of digits is the Phy Identifier 2 register,
+ this is the chip vendor OUI bits 19:24, followed by 10
+ bits of a vendor specific ID.
+ - items:
+ - pattern: "^ethernet-phy-id[a-f0-9]{4}\\.[a-f0-9]{4}$"
+ - const: ethernet-phy-ieee802.3-c45
+
+ reg:
+ minimum: 0
+ maximum: 31
+ description:
+ The ID number for the PHY.
+
+ interrupts:
+ maxItems: 1
+
+ max-speed:
+ enum:
+ - 10
+ - 100
+ - 1000
+ - 2500
+ - 5000
+ - 10000
+ - 20000
+ - 25000
+ - 40000
+ - 50000
+ - 56000
+ - 100000
+ - 200000
+ description:
+ Maximum PHY supported speed in Mbits / seconds.
+
+ broken-turn-around:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ If set, indicates the PHY device does not correctly release
+ the turn around line low at the end of a MDIO transaction.
+
+ enet-phy-lane-swap:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ If set, indicates the PHY will swap the TX/RX lanes to
+ compensate for the board being designed with the lanes
+ swapped.
+
+ eee-broken-100tx:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Mark the corresponding energy efficient ethernet mode as
+ broken and request the ethernet to stop advertising it.
+
+ eee-broken-1000t:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Mark the corresponding energy efficient ethernet mode as
+ broken and request the ethernet to stop advertising it.
+
+ eee-broken-10gt:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Mark the corresponding energy efficient ethernet mode as
+ broken and request the ethernet to stop advertising it.
+
+ eee-broken-1000kx:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Mark the corresponding energy efficient ethernet mode as
+ broken and request the ethernet to stop advertising it.
+
+ eee-broken-10gkx4:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Mark the corresponding energy efficient ethernet mode as
+ broken and request the ethernet to stop advertising it.
+
+ eee-broken-10gkr:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Mark the corresponding energy efficient ethernet mode as
+ broken and request the ethernet to stop advertising it.
+
+ phy-is-integrated:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ If set, indicates that the PHY is integrated into the same
+ physical package as the Ethernet MAC. If needed, muxers
+ should be configured to ensure the integrated PHY is
+ used. The absence of this property indicates the muxers
+ should be configured so that the external PHY is used.
+
+ resets:
+ maxItems: 1
+
+ reset-names:
+ const: phy
+
+ reset-gpios:
+ maxItems: 1
+ description:
+ The GPIO phandle and specifier for the PHY reset signal.
+
+ reset-assert-us:
+ description:
+ Delay after the reset was asserted in microseconds. If this
+ property is missing the delay will be skipped.
+
+ reset-deassert-us:
+ description:
+ Delay after the reset was deasserted in microseconds. If
+ this property is missing the delay will be skipped.
+
+required:
+ - reg
+
+examples:
+ - |
+ ethernet {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ ethernet-phy@0 {
+ compatible = "ethernet-phy-id0141.0e90", "ethernet-phy-ieee802.3-c45";
+ interrupt-parent = <&PIC>;
+ interrupts = <35 1>;
+ reg = <0>;
+
+ resets = <&rst 8>;
+ reset-names = "phy";
+ reset-gpios = <&gpio1 4 1>;
+ reset-assert-us = <1000>;
+ reset-deassert-us = <2000>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/net/ethernet.txt b/Documentation/devicetree/bindings/net/ethernet.txt
index e88c3641d613..5df413d01be2 100644
--- a/Documentation/devicetree/bindings/net/ethernet.txt
+++ b/Documentation/devicetree/bindings/net/ethernet.txt
@@ -1,67 +1 @@
-The following properties are common to the Ethernet controllers:
-
-NOTE: All 'phy*' properties documented below are Ethernet specific. For the
-generic PHY 'phys' property, see
-Documentation/devicetree/bindings/phy/phy-bindings.txt.
-
-- mac-address: array of 6 bytes, specifies the MAC address that was last used by
- the boot program; should be used in cases where the MAC address assigned to
- the device by the boot program is different from the "local-mac-address"
- property;
-- local-mac-address: array of 6 bytes, specifies the MAC address that was
- assigned to the network device;
-- nvmem-cells: phandle, reference to an nvmem node for the MAC address
-- nvmem-cell-names: string, should be "mac-address" if nvmem is to be used
-- max-speed: number, specifies maximum speed in Mbit/s supported by the device;
-- max-frame-size: number, maximum transfer unit (IEEE defined MTU), rather than
- the maximum frame size (there's contradiction in the Devicetree
- Specification).
-- phy-mode: string, operation mode of the PHY interface. This is now a de-facto
- standard property; supported values are:
- * "internal" (Internal means there is not a standard bus between the MAC and
- the PHY, something proprietary is being used to embed the PHY in the MAC.)
- * "mii"
- * "gmii"
- * "sgmii"
- * "qsgmii"
- * "tbi"
- * "rev-mii"
- * "rmii"
- * "rgmii" (RX and TX delays are added by the MAC when required)
- * "rgmii-id" (RGMII with internal RX and TX delays provided by the PHY, the
- MAC should not add the RX or TX delays in this case)
- * "rgmii-rxid" (RGMII with internal RX delay provided by the PHY, the MAC
- should not add an RX delay in this case)
- * "rgmii-txid" (RGMII with internal TX delay provided by the PHY, the MAC
- should not add an TX delay in this case)
- * "rtbi"
- * "smii"
- * "xgmii"
- * "trgmii"
- * "1000base-x",
- * "2500base-x",
- * "rxaui"
- * "xaui"
- * "10gbase-kr" (10GBASE-KR, XFI, SFI)
-- phy-connection-type: the same as "phy-mode" property but described in the
- Devicetree Specification;
-- phy-handle: phandle, specifies a reference to a node representing a PHY
- device; this property is described in the Devicetree Specification and so
- preferred;
-- phy: the same as "phy-handle" property, not recommended for new bindings.
-- phy-device: the same as "phy-handle" property, not recommended for new
- bindings.
-- rx-fifo-depth: the size of the controller's receive fifo in bytes. This
- is used for components that can have configurable receive fifo sizes,
- and is useful for determining certain configuration settings such as
- flow control thresholds.
-- tx-fifo-depth: the size of the controller's transmit fifo in bytes. This
- is used for components that can have configurable fifo sizes.
-- managed: string, specifies the PHY management type. Supported values are:
- "auto", "in-band-status". "auto" is the default, it usess MDIO for
- management if fixed-link is not specified.
-
-Child nodes of the Ethernet controller are typically the individual PHY devices
-connected via the MDIO bus (sometimes the MDIO bus controller is separate).
-They are described in the phy.txt file in this same directory.
-For non-MDIO PHY management see fixed-link.txt.
+This file has moved to ethernet-controller.yaml.
diff --git a/Documentation/devicetree/bindings/net/fixed-link.txt b/Documentation/devicetree/bindings/net/fixed-link.txt
index ec5d889fe3d8..5df413d01be2 100644
--- a/Documentation/devicetree/bindings/net/fixed-link.txt
+++ b/Documentation/devicetree/bindings/net/fixed-link.txt
@@ -1,54 +1 @@
-Fixed link Device Tree binding
-------------------------------
-
-Some Ethernet MACs have a "fixed link", and are not connected to a
-normal MDIO-managed PHY device. For those situations, a Device Tree
-binding allows to describe a "fixed link".
-
-Such a fixed link situation is described by creating a 'fixed-link'
-sub-node of the Ethernet MAC device node, with the following
-properties:
-
-* 'speed' (integer, mandatory), to indicate the link speed. Accepted
- values are 10, 100 and 1000
-* 'full-duplex' (boolean, optional), to indicate that full duplex is
- used. When absent, half duplex is assumed.
-* 'pause' (boolean, optional), to indicate that pause should be
- enabled.
-* 'asym-pause' (boolean, optional), to indicate that asym_pause should
- be enabled.
-* 'link-gpios' ('gpio-list', optional), to indicate if a gpio can be read
- to determine if the link is up.
-
-Old, deprecated 'fixed-link' binding:
-
-* A 'fixed-link' property in the Ethernet MAC node, with 5 cells, of the
- form <a b c d e> with the following accepted values:
- - a: emulated PHY ID, choose any but but unique to the all specified
- fixed-links, from 0 to 31
- - b: duplex configuration: 0 for half duplex, 1 for full duplex
- - c: link speed in Mbits/sec, accepted values are: 10, 100 and 1000
- - d: pause configuration: 0 for no pause, 1 for pause
- - e: asymmetric pause configuration: 0 for no asymmetric pause, 1 for
- asymmetric pause
-
-Examples:
-
-ethernet@0 {
- ...
- fixed-link {
- speed = <1000>;
- full-duplex;
- };
- ...
-};
-
-ethernet@1 {
- ...
- fixed-link {
- speed = <1000>;
- pause;
- link-gpios = <&gpio0 12 GPIO_ACTIVE_HIGH>;
- };
- ...
-};
+This file has moved to ethernet-controller.yaml.
diff --git a/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt b/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
index d1df8a00e1f3..464c0dafc617 100644
--- a/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
+++ b/Documentation/devicetree/bindings/net/hisilicon-hip04-net.txt
@@ -10,6 +10,7 @@ Required properties:
phandle, specifies a reference to the syscon ppe node
port, port number connected to the controller
channel, recv channel start from channel * number (RX_DESC_NUM)
+ group, field in the pkg desc, in general, it is the same as the port.
- phy-mode: see ethernet.txt [1].
Optional properties:
@@ -66,7 +67,7 @@ Example:
reg = <0x28b0000 0x10000>;
interrupts = <0 413 4>;
phy-mode = "mii";
- port-handle = <&ppe 31 0>;
+ port-handle = <&ppe 31 0 31>;
};
ge0: ethernet@2800000 {
@@ -74,7 +75,7 @@ Example:
reg = <0x2800000 0x10000>;
interrupts = <0 402 4>;
phy-mode = "sgmii";
- port-handle = <&ppe 0 1>;
+ port-handle = <&ppe 0 1 0>;
phy-handle = <&phy0>;
};
@@ -83,6 +84,6 @@ Example:
reg = <0x2880000 0x10000>;
interrupts = <0 410 4>;
phy-mode = "sgmii";
- port-handle = <&ppe 8 2>;
+ port-handle = <&ppe 8 2 8>;
phy-handle = <&phy1>;
};
diff --git a/Documentation/devicetree/bindings/net/keystone-netcp.txt b/Documentation/devicetree/bindings/net/keystone-netcp.txt
index 6262c2f293b0..24f11e042f8d 100644
--- a/Documentation/devicetree/bindings/net/keystone-netcp.txt
+++ b/Documentation/devicetree/bindings/net/keystone-netcp.txt
@@ -104,6 +104,23 @@ Required properties:
- 10Gb mac<->mac forced mode : 11
----phy-handle: phandle to PHY device
+- cpts: sub-node time synchronization (CPTS) submodule configuration
+-- clocks: CPTS reference clock. Should point on cpts-refclk-mux clock.
+-- clock-names: should be "cpts"
+-- cpts-refclk-mux: multiplexer clock definition sub-node for CPTS reference (RFTCLK) clock
+--- #clock-cells: should be 0
+--- clocks: list of CPTS reference (RFTCLK) clock's parents as defined in Data manual
+--- ti,mux-tbl: array of multiplexer indexes as defined in Data manual
+--- assigned-clocks: should point on cpts-refclk-mux clock
+--- assigned-clock-parents: should point on required RFTCLK clock parent to be selected
+-- cpts_clock_mult: (optional) Numerator to convert input clock ticks
+ into nanoseconds
+-- cpts_clock_shift: (optional) Denominator to convert input clock ticks into
+ nanoseconds.
+ Mult and shift will be calculated basing on CPTS
+ rftclk frequency if both cpts_clock_shift and
+ cpts_clock_mult properties are not provided.
+
Optional properties:
- enable-ale: NetCP driver keeps the address learning feature in the ethernet
switch module disabled. This attribute is to enable the address
@@ -168,6 +185,23 @@ netcp: netcp@2000000 {
tx-queue = <648>;
tx-channel = <8>;
+ cpts {
+ clocks = <&cpts_refclk_mux>;
+ clock-names = "cpts";
+
+ cpts_refclk_mux: cpts-refclk-mux {
+ #clock-cells = <0>;
+ clocks = <&chipclk12>, <&chipclk13>,
+ <&timi0>, <&timi1>,
+ <&tsipclka>, <&tsrefclk>,
+ <&tsipclkb>;
+ ti,mux-tbl = <0x0>, <0x1>, <0x2>,
+ <0x3>, <0x4>, <0x8>, <0xC>;
+ assigned-clocks = <&cpts_refclk_mux>;
+ assigned-clock-parents = <&chipclk12>;
+ };
+ };
+
interfaces {
gbe0: interface-0 {
slave-port = <0>;
@@ -219,3 +253,13 @@ netcp: netcp@2000000 {
};
};
};
+
+CPTS board configuration - select external CPTS RFTCLK:
+
+&tsrefclk{
+ clock-frequency = <500000000>;
+};
+
+&cpts_refclk_mux {
+ assigned-clock-parents = <&tsrefclk>;
+};
diff --git a/Documentation/devicetree/bindings/net/macb.txt b/Documentation/devicetree/bindings/net/macb.txt
index 9c5e94482b5f..63c73fafe26d 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -15,8 +15,11 @@ Required properties:
Use "atmel,sama5d4-gem" for the GEM IP (10/100) available on Atmel sama5d4 SoCs.
Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.
Use "cdns,zynqmp-gem" for Zynq Ultrascale+ MPSoC.
+ Use "sifive,fu540-macb" for SiFive FU540-C000 SoC.
Or the generic form: "cdns,emac".
- reg: Address and length of the register set for the device
+ For "sifive,fu540-macb", second range is required to specify the
+ address and length of the registers for GEMGXL Management block.
- interrupts: Should contain macb interrupt
- phy-mode: See ethernet.txt file in the same directory.
- clock-names: Tuple listing input clock names.
diff --git a/Documentation/devicetree/bindings/net/marvell-bluetooth.txt b/Documentation/devicetree/bindings/net/marvell-bluetooth.txt
new file mode 100644
index 000000000000..0e2842296032
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/marvell-bluetooth.txt
@@ -0,0 +1,25 @@
+Marvell Bluetooth Chips
+-----------------------
+
+This documents the binding structure and common properties for serial
+attached Marvell Bluetooth devices. The following chips are included in
+this binding:
+
+* Marvell 88W8897 Bluetooth devices
+
+Required properties:
+ - compatible: should be:
+ "mrvl,88w8897"
+
+Optional properties:
+None so far
+
+Example:
+
+&serial0 {
+ compatible = "ns16550a";
+ ...
+ bluetooth {
+ compatible = "mrvl,88w8897";
+ };
+};
diff --git a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt b/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
index 42cd81090a2c..3f3cfc1d8d4d 100644
--- a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
+++ b/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
@@ -16,7 +16,7 @@ Required properties:
Optional properties:
- interrupts: interrupt line number for the SMI error/done interrupt
-- clocks: phandle for up to three required clocks for the MDIO instance
+- clocks: phandle for up to four required clocks for the MDIO instance
The child nodes of the MDIO driver are the individual PHY devices
connected to this MDIO bus. They must have a "reg" property given the
diff --git a/Documentation/devicetree/bindings/net/mdio.txt b/Documentation/devicetree/bindings/net/mdio.txt
index e3e1603f256c..cf8a0105488e 100644
--- a/Documentation/devicetree/bindings/net/mdio.txt
+++ b/Documentation/devicetree/bindings/net/mdio.txt
@@ -1,37 +1 @@
-Common MDIO bus properties.
-
-These are generic properties that can apply to any MDIO bus.
-
-Optional properties:
-- reset-gpios: One GPIO that control the RESET lines of all PHYs on that MDIO
- bus.
-- reset-delay-us: RESET pulse width in microseconds.
-
-A list of child nodes, one per device on the bus is expected. These
-should follow the generic phy.txt, or a device specific binding document.
-
-The 'reset-delay-us' indicates the RESET signal pulse width in microseconds and
-applies to all PHY devices. It must therefore be appropriately determined based
-on all PHY requirements (maximum value of all per-PHY RESET pulse widths).
-
-Example :
-This example shows these optional properties, plus other properties
-required for the TI Davinci MDIO driver.
-
- davinci_mdio: ethernet@5c030000 {
- compatible = "ti,davinci_mdio";
- reg = <0x5c030000 0x1000>;
- #address-cells = <1>;
- #size-cells = <0>;
-
- reset-gpios = <&gpio2 5 GPIO_ACTIVE_LOW>;
- reset-delay-us = <2>;
-
- ethphy0: ethernet-phy@1 {
- reg = <1>;
- };
-
- ethphy1: ethernet-phy@3 {
- reg = <3>;
- };
- };
+This file has moved to mdio.yaml.
diff --git a/Documentation/devicetree/bindings/net/mdio.yaml b/Documentation/devicetree/bindings/net/mdio.yaml
new file mode 100644
index 000000000000..5d08d2ffd4eb
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/mdio.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/mdio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: MDIO Bus Generic Binding
+
+maintainers:
+ - Andrew Lunn <andrew@lunn.ch>
+ - Florian Fainelli <f.fainelli@gmail.com>
+ - Heiner Kallweit <hkallweit1@gmail.com>
+
+description:
+ These are generic properties that can apply to any MDIO bus. Any
+ MDIO bus must have a list of child nodes, one per device on the
+ bus. These should follow the generic ethernet-phy.yaml document, or
+ a device specific binding document.
+
+properties:
+ $nodename:
+ pattern: "^mdio(@.*)?"
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ reset-gpios:
+ maxItems: 1
+ description:
+ The phandle and specifier for the GPIO that controls the RESET
+ lines of all PHYs on that MDIO bus.
+
+ reset-delay-us:
+ description:
+ RESET pulse width in microseconds. It applies to all PHY devices
+ and must therefore be appropriately determined based on all PHY
+ requirements (maximum value of all per-PHY RESET pulse widths).
+
+patternProperties:
+ "^ethernet-phy@[0-9a-f]+$":
+ type: object
+
+ properties:
+ reg:
+ minimum: 0
+ maximum: 31
+ description:
+ The ID number for the PHY.
+
+ required:
+ - reg
+
+examples:
+ - |
+ davinci_mdio: mdio@5c030000 {
+ compatible = "ti,davinci_mdio";
+ reg = <0x5c030000 0x1000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ reset-gpios = <&gpio2 5 1>;
+ reset-delay-us = <2>;
+
+ ethphy0: ethernet-phy@1 {
+ reg = <1>;
+ };
+
+ ethphy1: ethernet-phy@3 {
+ reg = <3>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt b/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt
index 41a7dcc80f5b..112011c51d5e 100644
--- a/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt
+++ b/Documentation/devicetree/bindings/net/mediatek-bluetooth.txt
@@ -50,16 +50,33 @@ Required properties:
"mediatek,mt7663u-bluetooth": for MT7663U device
"mediatek,mt7668u-bluetooth": for MT7668U device
- vcc-supply: Main voltage regulator
+
+If the pin controller on the platform can support both pinmux and GPIO
+control such as the most of MediaTek platform. Please use below properties.
+
- pinctrl-names: Should be "default", "runtime"
- pinctrl-0: Should contain UART RXD low when the device is powered up to
enter proper bootstrap mode.
- pinctrl-1: Should contain UART mode pin ctrl
+Else, the pin controller on the platform only can support pinmux control and
+the GPIO control still has to rely on the dedicated GPIO controller such as
+a legacy MediaTek SoC, MT7621. Please use the below properties.
+
+- boot-gpios: GPIO same to the pin as UART RXD and used to keep LOW when
+ the device is powered up to enter proper bootstrap mode when
+- pinctrl-names: Should be "default"
+- pinctrl-0: Should contain UART mode pin ctrl
+
Optional properties:
- reset-gpios: GPIO used to reset the device whose initial state keeps low,
if the GPIO is missing, then board-level design should be
guaranteed.
+- clocks: Should be the clock specifiers corresponding to the entry in
+ clock-names property. If the clock is missing, then board-level
+ design should be guaranteed.
+- clock-names: Should contain "osc" entry for the external oscillator.
- current-speed: Current baud rate of the device whose defaults to 921600
Example:
diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt b/Documentation/devicetree/bindings/net/mediatek-net.txt
index 503f2b9194e2..770ff98d4524 100644
--- a/Documentation/devicetree/bindings/net/mediatek-net.txt
+++ b/Documentation/devicetree/bindings/net/mediatek-net.txt
@@ -11,6 +11,7 @@ Required properties:
"mediatek,mt2701-eth": for MT2701 SoC
"mediatek,mt7623-eth", "mediatek,mt2701-eth": for MT7623 SoC
"mediatek,mt7622-eth": for MT7622 SoC
+ "mediatek,mt7629-eth": for MT7629 SoC
- reg: Address and length of the register set for the device
- interrupts: Should contain the three frame engines interrupts in numeric
order. These are fe_int0, fe_int1 and fe_int2.
@@ -19,14 +20,23 @@ Required properties:
"ethif", "esw", "gp2", "gp1" : For MT2701 and MT7623 SoC
"ethif", "esw", "gp0", "gp1", "gp2", "sgmii_tx250m", "sgmii_rx250m",
"sgmii_cdr_ref", "sgmii_cdr_fb", "sgmii_ck", "eth2pll" : For MT7622 SoC
+ "ethif", "sgmiitop", "esw", "gp0", "gp1", "gp2", "fe", "sgmii_tx250m",
+ "sgmii_rx250m", "sgmii_cdr_ref", "sgmii_cdr_fb", "sgmii2_tx250m",
+ "sgmii2_rx250m", "sgmii2_cdr_ref", "sgmii2_cdr_fb", "sgmii_ck",
+ "eth2pll" : For MT7629 SoC.
- power-domains: phandle to the power domain that the ethernet is part of
- resets: Should contain phandles to the ethsys reset signals
- reset-names: Should contain the names of reset signal listed in the resets
property
These are "fe", "gmac" and "ppe"
- mediatek,ethsys: phandle to the syscon node that handles the port setup
-- mediatek,sgmiisys: phandle to the syscon node that handles the SGMII setup
- which is required for those SoCs equipped with SGMII such as MT7622 SoC.
+- mediatek,infracfg: phandle to the syscon node that handles the path from
+ GMAC to PHY variants, which is required for MT7629 SoC.
+- mediatek,sgmiisys: a list of phandles to the syscon node that handles the
+ SGMII setup which is required for those SoCs equipped with SGMII such
+ as MT7622 and MT7629 SoC. And MT7622 have only one set of SGMII shared
+ by GMAC1 and GMAC2; MT7629 have two independent sets of SGMII directed
+ to GMAC1 and GMAC2, respectively.
- mediatek,pctl: phandle to the syscon node that handles the ports slew rate
and driver current: only for MT2701 and MT7623 SoC
diff --git a/Documentation/devicetree/bindings/net/phy.txt b/Documentation/devicetree/bindings/net/phy.txt
index 9b9e5b1765dd..2399ee60caed 100644
--- a/Documentation/devicetree/bindings/net/phy.txt
+++ b/Documentation/devicetree/bindings/net/phy.txt
@@ -1,79 +1 @@
-PHY nodes
-
-Required properties:
-
- - interrupts : interrupt specifier for the sole interrupt.
- - reg : The ID number for the phy, usually a small integer
-
-Optional Properties:
-
-- compatible: Compatible list, may contain
- "ethernet-phy-ieee802.3-c22" or "ethernet-phy-ieee802.3-c45" for
- PHYs that implement IEEE802.3 clause 22 or IEEE802.3 clause 45
- specifications. If neither of these are specified, the default is to
- assume clause 22.
-
- If the PHY reports an incorrect ID (or none at all) then the
- "compatible" list may contain an entry with the correct PHY ID in the
- form: "ethernet-phy-idAAAA.BBBB" where
- AAAA - The value of the 16 bit Phy Identifier 1 register as
- 4 hex digits. This is the chip vendor OUI bits 3:18
- BBBB - The value of the 16 bit Phy Identifier 2 register as
- 4 hex digits. This is the chip vendor OUI bits 19:24,
- followed by 10 bits of a vendor specific ID.
-
- The compatible list should not contain other values than those
- listed here.
-
-- max-speed: Maximum PHY supported speed (10, 100, 1000...)
-
-- broken-turn-around: If set, indicates the PHY device does not correctly
- release the turn around line low at the end of a MDIO transaction.
-
-- enet-phy-lane-swap: If set, indicates the PHY will swap the TX/RX lanes to
- compensate for the board being designed with the lanes swapped.
-
-- enet-phy-lane-no-swap: If set, indicates that PHY will disable swap of the
- TX/RX lanes. This property allows the PHY to work correcly after e.g. wrong
- bootstrap configuration caused by issues in PCB layout design.
-
-- eee-broken-100tx:
-- eee-broken-1000t:
-- eee-broken-10gt:
-- eee-broken-1000kx:
-- eee-broken-10gkx4:
-- eee-broken-10gkr:
- Mark the corresponding energy efficient ethernet mode as broken and
- request the ethernet to stop advertising it.
-
-- phy-is-integrated: If set, indicates that the PHY is integrated into the same
- physical package as the Ethernet MAC. If needed, muxers should be configured
- to ensure the integrated PHY is used. The absence of this property indicates
- the muxers should be configured so that the external PHY is used.
-
-- resets: The reset-controller phandle and specifier for the PHY reset signal.
-
-- reset-names: Must be "phy" for the PHY reset signal.
-
-- reset-gpios: The GPIO phandle and specifier for the PHY reset signal.
-
-- reset-assert-us: Delay after the reset was asserted in microseconds.
- If this property is missing the delay will be skipped.
-
-- reset-deassert-us: Delay after the reset was deasserted in microseconds.
- If this property is missing the delay will be skipped.
-
-Example:
-
-ethernet-phy@0 {
- compatible = "ethernet-phy-id0141.0e90", "ethernet-phy-ieee802.3-c22";
- interrupt-parent = <&PIC>;
- interrupts = <35 IRQ_TYPE_EDGE_RISING>;
- reg = <0>;
-
- resets = <&rst 8>;
- reset-names = "phy";
- reset-gpios = <&gpio1 4 GPIO_ACTIVE_LOW>;
- reset-assert-us = <1000>;
- reset-deassert-us = <2000>;
-};
+This file has moved to ethernet-phy.yaml.
diff --git a/Documentation/devicetree/bindings/net/qca,ar71xx.txt b/Documentation/devicetree/bindings/net/qca,ar71xx.txt
new file mode 100644
index 000000000000..2a33e71ba72b
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/qca,ar71xx.txt
@@ -0,0 +1,45 @@
+Required properties:
+- compatible: Should be "qca,<soc>-eth". Currently support compatibles are:
+ qca,ar7100-eth - Atheros AR7100
+ qca,ar7240-eth - Atheros AR7240
+ qca,ar7241-eth - Atheros AR7241
+ qca,ar7242-eth - Atheros AR7242
+ qca,ar9130-eth - Atheros AR9130
+ qca,ar9330-eth - Atheros AR9330
+ qca,ar9340-eth - Atheros AR9340
+ qca,qca9530-eth - Qualcomm Atheros QCA9530
+ qca,qca9550-eth - Qualcomm Atheros QCA9550
+ qca,qca9560-eth - Qualcomm Atheros QCA9560
+
+- reg : Address and length of the register set for the device
+- interrupts : Should contain eth interrupt
+- phy-mode : See ethernet.txt file in the same directory
+- clocks: the clock used by the core
+- clock-names: the names of the clock listed in the clocks property. These are
+ "eth" and "mdio".
+- resets: Should contain phandles to the reset signals
+- reset-names: Should contain the names of reset signal listed in the resets
+ property. These are "mac" and "mdio"
+
+Optional properties:
+- phy-handle : phandle to the PHY device connected to this device.
+- fixed-link : Assume a fixed link. See fixed-link.txt in the same directory.
+ Use instead of phy-handle.
+
+Optional subnodes:
+- mdio : specifies the mdio bus, used as a container for phy nodes
+ according to phy.txt in the same directory
+
+Example:
+
+ethernet@1a000000 {
+ compatible = "qca,ar9330-eth";
+ reg = <0x1a000000 0x200>;
+ interrupts = <5>;
+ resets = <&rst 13>, <&rst 23>;
+ reset-names = "mac", "mdio";
+ clocks = <&pll ATH79_CLK_AHB>, <&pll ATH79_CLK_MDIO>;
+ clock-names = "eth", "mdio";
+
+ phy-mode = "gmii";
+};
diff --git a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
index 7ef6118abd3d..68b67d9db63a 100644
--- a/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
+++ b/Documentation/devicetree/bindings/net/qualcomm-bluetooth.txt
@@ -17,6 +17,7 @@ Optional properties for compatible string qcom,qca6174-bt:
- enable-gpios: gpio specifier used to enable chip
- clocks: clock provided to the controller (SUSCLK_32KHZ)
+ - firmware-name: specify the name of nvm firmware to load
Required properties for compatible string qcom,wcn399x-bt:
@@ -28,6 +29,7 @@ Required properties for compatible string qcom,wcn399x-bt:
Optional properties for compatible string qcom,wcn399x-bt:
- max-speed: see Documentation/devicetree/bindings/serial/slave-device.txt
+ - firmware-name: specify the name of nvm firmware to load
Examples:
@@ -40,6 +42,7 @@ serial@7570000 {
enable-gpios = <&pm8994_gpios 19 GPIO_ACTIVE_HIGH>;
clocks = <&divclk4>;
+ firmware-name = "nvm_00440302.bin";
};
};
@@ -52,5 +55,6 @@ serial@898000 {
vddrf-supply = <&vreg_l17a_1p3>;
vddch0-supply = <&vreg_l25a_3p3>;
max-speed = <3200000>;
+ firmware-name = "crnv21.bin";
};
};
diff --git a/Documentation/devicetree/bindings/net/snps,dwmac.yaml b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
new file mode 100644
index 000000000000..76fea2be66ac
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/snps,dwmac.yaml
@@ -0,0 +1,411 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/net/snps,dwmac.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Synopsys DesignWare MAC Device Tree Bindings
+
+maintainers:
+ - Alexandre Torgue <alexandre.torgue@st.com>
+ - Giuseppe Cavallaro <peppe.cavallaro@st.com>
+ - Jose Abreu <joabreu@synopsys.com>
+
+# Select every compatible, including the deprecated ones. This way, we
+# will be able to report a warning when we have that compatible, since
+# we will validate the node thanks to the select, but won't report it
+# as a valid value in the compatible property description
+select:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - snps,dwmac
+ - snps,dwmac-3.50a
+ - snps,dwmac-3.610
+ - snps,dwmac-3.70a
+ - snps,dwmac-3.710
+ - snps,dwmac-4.00
+ - snps,dwmac-4.10a
+ - snps,dwxgmac
+ - snps,dwxgmac-2.10
+
+ # Deprecated
+ - st,spear600-gmac
+
+ required:
+ - compatible
+
+properties:
+
+ # We need to include all the compatibles from schemas that will
+ # include that schemas, otherwise compatible won't validate for
+ # those.
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun7i-a20-gmac
+ - allwinner,sun8i-a83t-emac
+ - allwinner,sun8i-h3-emac
+ - allwinner,sun8i-r40-emac
+ - allwinner,sun8i-v3s-emac
+ - allwinner,sun50i-a64-emac
+ - snps,dwmac
+ - snps,dwmac-3.50a
+ - snps,dwmac-3.610
+ - snps,dwmac-3.70a
+ - snps,dwmac-3.710
+ - snps,dwmac-4.00
+ - snps,dwmac-4.10a
+ - snps,dwxgmac
+ - snps,dwxgmac-2.10
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ minItems: 1
+ maxItems: 3
+ items:
+ - description: Combined signal for various interrupt events
+ - description: The interrupt to manage the remote wake-up packet detection
+ - description: The interrupt that occurs when Rx exits the LPI state
+
+ interrupt-names:
+ minItems: 1
+ maxItems: 3
+ items:
+ - const: macirq
+ - const: eth_wake_irq
+ - const: eth_lpi
+
+ clocks:
+ minItems: 1
+ maxItems: 3
+ items:
+ - description: GMAC main clock
+ - description: Peripheral registers interface clock
+ - description:
+ PTP reference clock. This clock is used for programming the
+ Timestamp Addend Register. If not passed then the system
+ clock will be used and this is fine on some platforms.
+
+ clock-names:
+ additionalItems: true
+ contains:
+ enum:
+ - stmmaceth
+ - pclk
+ - ptp_ref
+
+ resets:
+ maxItems: 1
+ description:
+ MAC Reset signal.
+
+ reset-names:
+ const: stmmaceth
+
+ snps,axi-config:
+ $ref: /schemas/types.yaml#definitions/phandle
+ description:
+ AXI BUS Mode parameters. Phandle to a node that can contain the
+ following properties
+ * snps,lpi_en, enable Low Power Interface
+ * snps,xit_frm, unlock on WoL
+ * snps,wr_osr_lmt, max write outstanding req. limit
+ * snps,rd_osr_lmt, max read outstanding req. limit
+ * snps,kbbe, do not cross 1KiB boundary.
+ * snps,blen, this is a vector of supported burst length.
+ * snps,fb, fixed-burst
+ * snps,mb, mixed-burst
+ * snps,rb, rebuild INCRx Burst
+
+ snps,mtl-rx-config:
+ $ref: /schemas/types.yaml#definitions/phandle
+ description:
+ Multiple RX Queues parameters. Phandle to a node that can
+ contain the following properties
+ * snps,rx-queues-to-use, number of RX queues to be used in the
+ driver
+ * Choose one of these RX scheduling algorithms
+ * snps,rx-sched-sp, Strict priority
+ * snps,rx-sched-wsp, Weighted Strict priority
+ * For each RX queue
+ * Choose one of these modes
+ * snps,dcb-algorithm, Queue to be enabled as DCB
+ * snps,avb-algorithm, Queue to be enabled as AVB
+ * snps,map-to-dma-channel, Channel to map
+ * Specifiy specific packet routing
+ * snps,route-avcp, AV Untagged Control packets
+ * snps,route-ptp, PTP Packets
+ * snps,route-dcbcp, DCB Control Packets
+ * snps,route-up, Untagged Packets
+ * snps,route-multi-broad, Multicast & Broadcast Packets
+ * snps,priority, RX queue priority (Range 0x0 to 0xF)
+
+ snps,mtl-tx-config:
+ $ref: /schemas/types.yaml#definitions/phandle
+ description:
+ Multiple TX Queues parameters. Phandle to a node that can
+ contain the following properties
+ * snps,tx-queues-to-use, number of TX queues to be used in the
+ driver
+ * Choose one of these TX scheduling algorithms
+ * snps,tx-sched-wrr, Weighted Round Robin
+ * snps,tx-sched-wfq, Weighted Fair Queuing
+ * snps,tx-sched-dwrr, Deficit Weighted Round Robin
+ * snps,tx-sched-sp, Strict priority
+ * For each TX queue
+ * snps,weight, TX queue weight (if using a DCB weight
+ algorithm)
+ * Choose one of these modes
+ * snps,dcb-algorithm, TX queue will be working in DCB
+ * snps,avb-algorithm, TX queue will be working in AVB
+ [Attention] Queue 0 is reserved for legacy traffic
+ and so no AVB is available in this queue.
+ * Configure Credit Base Shaper (if AVB Mode selected)
+ * snps,send_slope, enable Low Power Interface
+ * snps,idle_slope, unlock on WoL
+ * snps,high_credit, max write outstanding req. limit
+ * snps,low_credit, max read outstanding req. limit
+ * snps,priority, TX queue priority (Range 0x0 to 0xF)
+
+ snps,reset-gpio:
+ deprecated: true
+ maxItems: 1
+ description:
+ PHY Reset GPIO
+
+ snps,reset-active-low:
+ deprecated: true
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Indicates that the PHY Reset is active low
+
+ snps,reset-delays-us:
+ deprecated: true
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/uint32-array
+ - minItems: 3
+ maxItems: 3
+ description:
+ Triplet of delays. The 1st cell is reset pre-delay in micro
+ seconds. The 2nd cell is reset pulse in micro seconds. The 3rd
+ cell is reset post-delay in micro seconds.
+
+ snps,aal:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Use Address-Aligned Beats
+
+ snps,fixed-burst:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Program the DMA to use the fixed burst mode
+
+ snps,mixed-burst:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Program the DMA to use the mixed burst mode
+
+ snps,force_thresh_dma_mode:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Force DMA to use the threshold mode for both tx and rx
+
+ snps,force_sf_dma_mode:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Force DMA to use the Store and Forward mode for both tx and
+ rx. This flag is ignored if force_thresh_dma_mode is set.
+
+ snps,en-tx-lpi-clockgating:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Enable gating of the MAC TX clock during TX low-power mode
+
+ snps,multicast-filter-bins:
+ $ref: /schemas/types.yaml#definitions/uint32
+ description:
+ Number of multicast filter hash bins supported by this device
+ instance
+
+ snps,perfect-filter-entries:
+ $ref: /schemas/types.yaml#definitions/uint32
+ description:
+ Number of perfect filter entries supported by this device
+ instance
+
+ snps,ps-speed:
+ $ref: /schemas/types.yaml#definitions/uint32
+ description:
+ Port selection speed that can be passed to the core when PCS
+ is supported. For example, this is used in case of SGMII and
+ MAC2MAC connection.
+
+ mdio:
+ type: object
+ description:
+ Creates and registers an MDIO bus.
+
+ properties:
+ compatible:
+ const: snps,dwmac-mdio
+
+ required:
+ - compatible
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - interrupt-names
+ - phy-mode
+
+dependencies:
+ snps,reset-active-low: ["snps,reset-gpio"]
+ snps,reset-delay-us: ["snps,reset-gpio"]
+
+allOf:
+ - $ref: "ethernet-controller.yaml#"
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun7i-a20-gmac
+ - allwinner,sun8i-a83t-emac
+ - allwinner,sun8i-h3-emac
+ - allwinner,sun8i-r40-emac
+ - allwinner,sun8i-v3s-emac
+ - allwinner,sun50i-a64-emac
+ - snps,dwxgmac
+ - snps,dwxgmac-2.10
+ - st,spear600-gmac
+
+ then:
+ properties:
+ snps,pbl:
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/uint32
+ - enum: [2, 4, 8]
+ description:
+ Programmable Burst Length (tx and rx)
+
+ snps,txpbl:
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/uint32
+ - enum: [2, 4, 8]
+ description:
+ Tx Programmable Burst Length. If set, DMA tx will use this
+ value rather than snps,pbl.
+
+ snps,rxpbl:
+ allOf:
+ - $ref: /schemas/types.yaml#definitions/uint32
+ - enum: [2, 4, 8]
+ description:
+ Rx Programmable Burst Length. If set, DMA rx will use this
+ value rather than snps,pbl.
+
+ snps,no-pbl-x8:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Don\'t multiply the pbl/txpbl/rxpbl values by 8. For core
+ rev < 3.50, don\'t multiply the values by 4.
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun7i-a20-gmac
+ - allwinner,sun8i-a83t-emac
+ - allwinner,sun8i-h3-emac
+ - allwinner,sun8i-r40-emac
+ - allwinner,sun8i-v3s-emac
+ - allwinner,sun50i-a64-emac
+ - snps,dwmac-4.00
+ - snps,dwmac-4.10a
+ - snps,dwxgmac
+ - snps,dwxgmac-2.10
+ - st,spear600-gmac
+
+ then:
+ snps,tso:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Enables the TSO feature otherwise it will be managed by
+ MAC HW capability register.
+
+examples:
+ - |
+ stmmac_axi_setup: stmmac-axi-config {
+ snps,wr_osr_lmt = <0xf>;
+ snps,rd_osr_lmt = <0xf>;
+ snps,blen = <256 128 64 32 0 0 0>;
+ };
+
+ mtl_rx_setup: rx-queues-config {
+ snps,rx-queues-to-use = <1>;
+ snps,rx-sched-sp;
+ queue0 {
+ snps,dcb-algorithm;
+ snps,map-to-dma-channel = <0x0>;
+ snps,priority = <0x0>;
+ };
+ };
+
+ mtl_tx_setup: tx-queues-config {
+ snps,tx-queues-to-use = <2>;
+ snps,tx-sched-wrr;
+ queue0 {
+ snps,weight = <0x10>;
+ snps,dcb-algorithm;
+ snps,priority = <0x0>;
+ };
+
+ queue1 {
+ snps,avb-algorithm;
+ snps,send_slope = <0x1000>;
+ snps,idle_slope = <0x1000>;
+ snps,high_credit = <0x3E800>;
+ snps,low_credit = <0xFFC18000>;
+ snps,priority = <0x1>;
+ };
+ };
+
+ gmac0: ethernet@e0800000 {
+ compatible = "snps,dwxgmac-2.10", "snps,dwxgmac";
+ reg = <0xe0800000 0x8000>;
+ interrupt-parent = <&vic1>;
+ interrupts = <24 23 22>;
+ interrupt-names = "macirq", "eth_wake_irq", "eth_lpi";
+ mac-address = [000000000000]; /* Filled in by U-Boot */
+ max-frame-size = <3800>;
+ phy-mode = "gmii";
+ snps,multicast-filter-bins = <256>;
+ snps,perfect-filter-entries = <128>;
+ rx-fifo-depth = <16384>;
+ tx-fifo-depth = <16384>;
+ clocks = <&clock>;
+ clock-names = "stmmaceth";
+ snps,axi-config = <&stmmac_axi_setup>;
+ snps,mtl-rx-config = <&mtl_rx_setup>;
+ snps,mtl-tx-config = <&mtl_tx_setup>;
+ mdio0 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "snps,dwmac-mdio";
+ phy1: ethernet-phy@0 {
+ reg = <0>;
+ };
+ };
+ };
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
index 17d6819669c8..612a8e8abc88 100644
--- a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
@@ -6,11 +6,17 @@ present in Documentation/devicetree/bindings/net/stmmac.txt.
The device node has additional properties:
Required properties:
- - compatible : Should contain "altr,socfpga-stmmac" along with
- "snps,dwmac" and any applicable more detailed
+ - compatible : For Cyclone5/Arria5 SoCs it should contain
+ "altr,socfpga-stmmac". For Arria10/Agilex/Stratix10 SoCs
+ "altr,socfpga-stmmac-a10-s10".
+ Along with "snps,dwmac" and any applicable more detailed
designware version numbers documented in stmmac.txt
- altr,sysmgr-syscon : Should be the phandle to the system manager node that
encompasses the glue register, the register offset, and the register shift.
+ On Cyclone5/Arria5, the register shift represents the PHY mode bits, while
+ on the Arria10/Stratix10/Agilex platforms, the register shift represents
+ bit for each emac to enable/disable signals from the FPGA fabric to the
+ EMAC modules.
- altr,f2h_ptp_ref_clk use f2h_ptp_ref_clk instead of default eosc1 clock
for ptp ref clk. This affects all emacs as the clock is common.
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index cb694062afff..7d48782767cb 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -1,178 +1 @@
-* STMicroelectronics 10/100/1000/2500/10000 Ethernet (GMAC/XGMAC)
-
-Required properties:
-- compatible: Should be "snps,dwmac-<ip_version>", "snps,dwmac" or
- "snps,dwxgmac-<ip_version>", "snps,dwxgmac".
- For backwards compatibility: "st,spear600-gmac" is also supported.
-- reg: Address and length of the register set for the device
-- interrupts: Should contain the STMMAC interrupts
-- interrupt-names: Should contain a list of interrupt names corresponding to
- the interrupts in the interrupts property, if available.
- Valid interrupt names are:
- - "macirq" (combined signal for various interrupt events)
- - "eth_wake_irq" (the interrupt to manage the remote wake-up packet detection)
- - "eth_lpi" (the interrupt that occurs when Rx exits the LPI state)
-- phy-mode: See ethernet.txt file in the same directory.
-- snps,reset-gpio gpio number for phy reset.
-- snps,reset-active-low boolean flag to indicate if phy reset is active low.
-- snps,reset-delays-us is triplet of delays
- The 1st cell is reset pre-delay in micro seconds.
- The 2nd cell is reset pulse in micro seconds.
- The 3rd cell is reset post-delay in micro seconds.
-
-Optional properties:
-- resets: Should contain a phandle to the STMMAC reset signal, if any
-- reset-names: Should contain the reset signal name "stmmaceth", if a
- reset phandle is given
-- max-frame-size: See ethernet.txt file in the same directory
-- clocks: If present, the first clock should be the GMAC main clock and
- the second clock should be peripheral's register interface clock. Further
- clocks may be specified in derived bindings.
-- clock-names: One name for each entry in the clocks property, the
- first one should be "stmmaceth" and the second one should be "pclk".
-- ptp_ref: this is the PTP reference clock; in case of the PTP is available
- this clock is used for programming the Timestamp Addend Register. If not
- passed then the system clock will be used and this is fine on some
- platforms.
-- tx-fifo-depth: See ethernet.txt file in the same directory
-- rx-fifo-depth: See ethernet.txt file in the same directory
-- snps,pbl Programmable Burst Length (tx and rx)
-- snps,txpbl Tx Programmable Burst Length. Only for GMAC and newer.
- If set, DMA tx will use this value rather than snps,pbl.
-- snps,rxpbl Rx Programmable Burst Length. Only for GMAC and newer.
- If set, DMA rx will use this value rather than snps,pbl.
-- snps,no-pbl-x8 Don't multiply the pbl/txpbl/rxpbl values by 8.
- For core rev < 3.50, don't multiply the values by 4.
-- snps,aal Address-Aligned Beats
-- snps,fixed-burst Program the DMA to use the fixed burst mode
-- snps,mixed-burst Program the DMA to use the mixed burst mode
-- snps,force_thresh_dma_mode Force DMA to use the threshold mode for
- both tx and rx
-- snps,force_sf_dma_mode Force DMA to use the Store and Forward
- mode for both tx and rx. This flag is
- ignored if force_thresh_dma_mode is set.
-- snps,en-tx-lpi-clockgating Enable gating of the MAC TX clock during
- TX low-power mode
-- snps,multicast-filter-bins: Number of multicast filter hash bins
- supported by this device instance
-- snps,perfect-filter-entries: Number of perfect filter entries supported
- by this device instance
-- snps,ps-speed: port selection speed that can be passed to the core when
- PCS is supported. For example, this is used in case of SGMII
- and MAC2MAC connection.
-- snps,tso: this enables the TSO feature otherwise it will be managed by
- MAC HW capability register. Only for GMAC4 and newer.
-- AXI BUS Mode parameters: below the list of all the parameters to program the
- AXI register inside the DMA module:
- - snps,lpi_en: enable Low Power Interface
- - snps,xit_frm: unlock on WoL
- - snps,wr_osr_lmt: max write outstanding req. limit
- - snps,rd_osr_lmt: max read outstanding req. limit
- - snps,kbbe: do not cross 1KiB boundary.
- - snps,blen: this is a vector of supported burst length.
- - snps,fb: fixed-burst
- - snps,mb: mixed-burst
- - snps,rb: rebuild INCRx Burst
-- mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus.
-- Multiple RX Queues parameters: below the list of all the parameters to
- configure the multiple RX queues:
- - snps,rx-queues-to-use: number of RX queues to be used in the driver
- - Choose one of these RX scheduling algorithms:
- - snps,rx-sched-sp: Strict priority
- - snps,rx-sched-wsp: Weighted Strict priority
- - For each RX queue
- - Choose one of these modes:
- - snps,dcb-algorithm: Queue to be enabled as DCB
- - snps,avb-algorithm: Queue to be enabled as AVB
- - snps,map-to-dma-channel: Channel to map
- - Specifiy specific packet routing:
- - snps,route-avcp: AV Untagged Control packets
- - snps,route-ptp: PTP Packets
- - snps,route-dcbcp: DCB Control Packets
- - snps,route-up: Untagged Packets
- - snps,route-multi-broad: Multicast & Broadcast Packets
- - snps,priority: RX queue priority (Range: 0x0 to 0xF)
-- Multiple TX Queues parameters: below the list of all the parameters to
- configure the multiple TX queues:
- - snps,tx-queues-to-use: number of TX queues to be used in the driver
- - Choose one of these TX scheduling algorithms:
- - snps,tx-sched-wrr: Weighted Round Robin
- - snps,tx-sched-wfq: Weighted Fair Queuing
- - snps,tx-sched-dwrr: Deficit Weighted Round Robin
- - snps,tx-sched-sp: Strict priority
- - For each TX queue
- - snps,weight: TX queue weight (if using a DCB weight algorithm)
- - Choose one of these modes:
- - snps,dcb-algorithm: TX queue will be working in DCB
- - snps,avb-algorithm: TX queue will be working in AVB
- [Attention] Queue 0 is reserved for legacy traffic
- and so no AVB is available in this queue.
- - Configure Credit Base Shaper (if AVB Mode selected):
- - snps,send_slope: enable Low Power Interface
- - snps,idle_slope: unlock on WoL
- - snps,high_credit: max write outstanding req. limit
- - snps,low_credit: max read outstanding req. limit
- - snps,priority: TX queue priority (Range: 0x0 to 0xF)
-Examples:
-
- stmmac_axi_setup: stmmac-axi-config {
- snps,wr_osr_lmt = <0xf>;
- snps,rd_osr_lmt = <0xf>;
- snps,blen = <256 128 64 32 0 0 0>;
- };
-
- mtl_rx_setup: rx-queues-config {
- snps,rx-queues-to-use = <1>;
- snps,rx-sched-sp;
- queue0 {
- snps,dcb-algorithm;
- snps,map-to-dma-channel = <0x0>;
- snps,priority = <0x0>;
- };
- };
-
- mtl_tx_setup: tx-queues-config {
- snps,tx-queues-to-use = <2>;
- snps,tx-sched-wrr;
- queue0 {
- snps,weight = <0x10>;
- snps,dcb-algorithm;
- snps,priority = <0x0>;
- };
-
- queue1 {
- snps,avb-algorithm;
- snps,send_slope = <0x1000>;
- snps,idle_slope = <0x1000>;
- snps,high_credit = <0x3E800>;
- snps,low_credit = <0xFFC18000>;
- snps,priority = <0x1>;
- };
- };
-
- gmac0: ethernet@e0800000 {
- compatible = "st,spear600-gmac";
- reg = <0xe0800000 0x8000>;
- interrupt-parent = <&vic1>;
- interrupts = <24 23 22>;
- interrupt-names = "macirq", "eth_wake_irq", "eth_lpi";
- mac-address = [000000000000]; /* Filled in by U-Boot */
- max-frame-size = <3800>;
- phy-mode = "gmii";
- snps,multicast-filter-bins = <256>;
- snps,perfect-filter-entries = <128>;
- rx-fifo-depth = <16384>;
- tx-fifo-depth = <16384>;
- clocks = <&clock>;
- clock-names = "stmmaceth";
- snps,axi-config = <&stmmac_axi_setup>;
- mdio0 {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "snps,dwmac-mdio";
- phy1: ethernet-phy@0 {
- };
- };
- snps,mtl-rx-config = <&mtl_rx_setup>;
- snps,mtl-tx-config = <&mtl_tx_setup>;
- };
+This file has moved to snps,dwmac.yaml.
diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt
index 9ef9338aaee1..db6aa3f2215b 100644
--- a/Documentation/devicetree/bindings/net/ti,dp83867.txt
+++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt
@@ -11,6 +11,14 @@ Required properties:
- ti,fifo-depth - Transmitt FIFO depth- see dt-bindings/net/ti-dp83867.h
for applicable values
+Note: If the interface type is PHY_INTERFACE_MODE_RGMII the TX/RX clock delays
+ will be left at their default values, as set by the PHY's pin strapping.
+ The default strapping will use a delay of 2.00 ns. Thus
+ PHY_INTERFACE_MODE_RGMII, by default, does not behave as RGMII with no
+ internal delay, but as PHY_INTERFACE_MODE_RGMII_ID. The device tree
+ should use "rgmii-id" if internal delays are desired as this may be
+ changed in future to cause "rgmii" mode to disable delays.
+
Optional property:
- ti,min-output-impedance - MAC Interface Impedance control to set
the programmable output impedance to
@@ -25,8 +33,10 @@ Optional property:
software needs to take when this pin is
strapped in these modes. See data manual
for details.
- - ti,clk-output-sel - Muxing option for CLK_OUT pin - see dt-bindings/net/ti-dp83867.h
- for applicable values.
+ - ti,clk-output-sel - Muxing option for CLK_OUT pin. See dt-bindings/net/ti-dp83867.h
+ for applicable values. The CLK_OUT pin can also
+ be disabled by this property. When omitted, the
+ PHY's default will be left as is.
Note: ti,min-output-impedance and ti,max-output-impedance are mutually
exclusive. When both properties are present ti,max-output-impedance
diff --git a/Documentation/devicetree/bindings/net/wiznet,w5x00.txt b/Documentation/devicetree/bindings/net/wiznet,w5x00.txt
new file mode 100644
index 000000000000..e9665798c4be
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/wiznet,w5x00.txt
@@ -0,0 +1,50 @@
+* Wiznet w5x00
+
+This is a standalone 10/100 MBit Ethernet controller with SPI interface.
+
+For each device connected to a SPI bus, define a child node within
+the SPI master node.
+
+Required properties:
+- compatible: Should be one of the following strings:
+ "wiznet,w5100"
+ "wiznet,w5200"
+ "wiznet,w5500"
+- reg: Specify the SPI chip select the chip is wired to.
+- interrupts: Specify the interrupt index within the interrupt controller (referred
+ to above in interrupt-parent) and interrupt type. w5x00 natively
+ generates falling edge interrupts, however, additional board logic
+ might invert the signal.
+- pinctrl-names: List of assigned state names, see pinctrl binding documentation.
+- pinctrl-0: List of phandles to configure the GPIO pin used as interrupt line,
+ see also generic and your platform specific pinctrl binding
+ documentation.
+
+Optional properties:
+- spi-max-frequency: Maximum frequency of the SPI bus when accessing the w5500.
+ According to the w5500 datasheet, the chip allows a maximum of 80 MHz, however,
+ board designs may need to limit this value.
+- local-mac-address: See ethernet.txt in the same directory.
+
+
+Example (for Raspberry Pi with pin control stuff for GPIO irq):
+
+&spi {
+ ethernet@0: w5500@0 {
+ compatible = "wiznet,w5500";
+ reg = <0>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&eth1_pins>;
+ interrupt-parent = <&gpio>;
+ interrupts = <25 IRQ_TYPE_EDGE_FALLING>;
+ spi-max-frequency = <30000000>;
+ };
+};
+
+&gpio {
+ eth1_pins: eth1_pins {
+ brcm,pins = <25>;
+ brcm,function = <0>; /* in */
+ brcm,pull = <0>; /* none */
+ };
+};
diff --git a/Documentation/devicetree/bindings/net/xilinx_axienet.txt b/Documentation/devicetree/bindings/net/xilinx_axienet.txt
index 38f9ec076743..7360617cdedb 100644
--- a/Documentation/devicetree/bindings/net/xilinx_axienet.txt
+++ b/Documentation/devicetree/bindings/net/xilinx_axienet.txt
@@ -17,8 +17,15 @@ For more details about mdio please refer phy.txt file in the same directory.
Required properties:
- compatible : Must be one of "xlnx,axi-ethernet-1.00.a",
"xlnx,axi-ethernet-1.01.a", "xlnx,axi-ethernet-2.01.a"
-- reg : Address and length of the IO space.
-- interrupts : Should be a list of two interrupt, TX and RX.
+- reg : Address and length of the IO space, as well as the address
+ and length of the AXI DMA controller IO space, unless
+ axistream-connected is specified, in which case the reg
+ attribute of the node referenced by it is used.
+- interrupts : Should be a list of 2 or 3 interrupts: TX DMA, RX DMA,
+ and optionally Ethernet core. If axistream-connected is
+ specified, the TX/RX DMA interrupts should be on that node
+ instead, and only the Ethernet core interrupt is optionally
+ specified here.
- phy-handle : Should point to the external phy device.
See ethernet.txt file in the same directory.
- xlnx,rxmem : Set to allocated memory buffer for Rx/Tx in the hardware
@@ -31,15 +38,29 @@ Optional properties:
1 to enable partial TX checksum offload,
2 to enable full TX checksum offload
- xlnx,rxcsum : Same values as xlnx,txcsum but for RX checksum offload
+- clocks : AXI bus clock for the device. Refer to common clock bindings.
+ Used to calculate MDIO clock divisor. If not specified, it is
+ auto-detected from the CPU clock (but only on platforms where
+ this is possible). New device trees should specify this - the
+ auto detection is only for backward compatibility.
+- axistream-connected: Reference to another node which contains the resources
+ for the AXI DMA controller used by this device.
+ If this is specified, the DMA-related resources from that
+ device (DMA registers and DMA TX/RX interrupts) rather
+ than this one will be used.
+ - mdio : Child node for MDIO bus. Must be defined if PHY access is
+ required through the core's MDIO interface (i.e. always,
+ unless the PHY is accessed through a different bus).
Example:
axi_ethernet_eth: ethernet@40c00000 {
compatible = "xlnx,axi-ethernet-1.00.a";
device_type = "network";
interrupt-parent = <&microblaze_0_axi_intc>;
- interrupts = <2 0>;
+ interrupts = <2 0 1>;
+ clocks = <&axi_clk>;
phy-mode = "mii";
- reg = <0x40c00000 0x40000>;
+ reg = <0x40c00000 0x40000 0x50c00000 0x40000>;
xlnx,rxcsum = <0x2>;
xlnx,rxmem = <0x800>;
xlnx,txcsum = <0x2>;
diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml b/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml
new file mode 100644
index 000000000000..c9efd6e2c134
--- /dev/null
+++ b/Documentation/devicetree/bindings/nvmem/allwinner,sun4i-a10-sid.yaml
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/nvmem/allwinner,sun4i-a10-sid.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 Security ID Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+allOf:
+ - $ref: "nvmem.yaml#"
+
+properties:
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-sid
+ - allwinner,sun7i-a20-sid
+ - allwinner,sun8i-a83t-sid
+ - allwinner,sun8i-h3-sid
+ - allwinner,sun50i-a64-sid
+ - allwinner,sun50i-h5-sid
+ - allwinner,sun50i-h6-sid
+
+ reg:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+
+# FIXME: We should set it, but it would report all the generic
+# properties as additional properties.
+# additionalProperties: false
+
+examples:
+ - |
+ sid@1c23800 {
+ compatible = "allwinner,sun4i-a10-sid";
+ reg = <0x01c23800 0x10>;
+ };
+
+ - |
+ sid@1c23800 {
+ compatible = "allwinner,sun7i-a20-sid";
+ reg = <0x01c23800 0x200>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt b/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt
deleted file mode 100644
index cfb18b4ef8f7..000000000000
--- a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-Allwinner sunxi-sid
-
-Required properties:
-- compatible: Should be one of the following:
- "allwinner,sun4i-a10-sid"
- "allwinner,sun7i-a20-sid"
- "allwinner,sun8i-a83t-sid"
- "allwinner,sun8i-h3-sid"
- "allwinner,sun50i-a64-sid"
- "allwinner,sun50i-h5-sid"
- "allwinner,sun50i-h6-sid"
-
-- reg: Should contain registers location and length
-
-= Data cells =
-Are child nodes of sunxi-sid, bindings of which as described in
-bindings/nvmem/nvmem.txt
-
-Example for sun4i:
- sid@1c23800 {
- compatible = "allwinner,sun4i-a10-sid";
- reg = <0x01c23800 0x10>
- };
-
-Example for sun7i:
- sid@1c23800 {
- compatible = "allwinner,sun7i-a20-sid";
- reg = <0x01c23800 0x200>
- };
diff --git a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt
index 68f7d6fdd140..96ffd06d2ca8 100644
--- a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt
+++ b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt
@@ -15,6 +15,7 @@ Required properties:
"fsl,imx6sll-ocotp" (i.MX6SLL),
"fsl,imx7ulp-ocotp" (i.MX7ULP),
"fsl,imx8mq-ocotp" (i.MX8MQ),
+ "fsl,imx8mm-ocotp" (i.MX8MM),
followed by "syscon".
- #address-cells : Should be 1
- #size-cells : Should be 1
diff --git a/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt b/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt
index b9165b72473c..3abeecf4983f 100644
--- a/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt
+++ b/Documentation/devicetree/bindings/pci/83xx-512x-pci.txt
@@ -9,7 +9,6 @@ Freescale 83xx and 512x SOCs include the same PCI bridge core.
Example (MPC8313ERDB)
pci0: pci@e0008500 {
- cell-index = <1>;
interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
interrupt-map = <
/* IDSEL 0x0E -mini PCI */
diff --git a/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt b/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
index a618d4787dd7..64156993e052 100644
--- a/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/mobiveil-pcie.txt
@@ -10,8 +10,10 @@ Required properties:
interrupt source. The value must be 1.
- compatible: Should contain "mbvl,gpex40-pcie"
- reg: Should contain PCIe registers location and length
+ Mandatory:
"config_axi_slave": PCIe controller registers
"csr_axi_slave" : Bridge config registers
+ Optional:
"gpio_slave" : GPIO registers to control slot power
"apb_csr" : MSI registers
diff --git a/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt b/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt
index 145a4f04194f..7939bca47861 100644
--- a/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/nvidia,tegra20-pcie.txt
@@ -65,6 +65,14 @@ Required properties:
- afi
- pcie_x
+Optional properties:
+- pinctrl-names: A list of pinctrl state names. Must contain the following
+ entries:
+ - "default": active state, puts PCIe I/O out of deep power down state
+ - "idle": puts PCIe I/O into deep power down state
+- pinctrl-0: phandle for the default/active state of pin configurations.
+- pinctrl-1: phandle for the idle state of pin configurations.
+
Required properties on Tegra124 and later (deprecated):
- phys: Must contain an entry for each entry in phy-names.
- phy-names: Must include the following entries:
diff --git a/Documentation/devicetree/bindings/pci/pci.txt b/Documentation/devicetree/bindings/pci/pci.txt
index 92c01db610df..2a5d91024059 100644
--- a/Documentation/devicetree/bindings/pci/pci.txt
+++ b/Documentation/devicetree/bindings/pci/pci.txt
@@ -24,6 +24,9 @@ driver implementation may support the following properties:
unsupported link speed, for instance, trying to do training for
unsupported link speed, etc. Must be '4' for gen4, '3' for gen3, '2'
for gen2, and '1' for gen1. Any other values are invalid.
+- reset-gpios:
+ If present this property specifies PERST# GPIO. Host drivers can parse the
+ GPIO and apply fundamental reset to endpoints.
PCI-PCI Bridge properties
-------------------------
diff --git a/Documentation/devicetree/bindings/pci/qcom,pcie.txt b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
index 1fd703bd73e0..ada80b01bf0c 100644
--- a/Documentation/devicetree/bindings/pci/qcom,pcie.txt
+++ b/Documentation/devicetree/bindings/pci/qcom,pcie.txt
@@ -10,6 +10,7 @@
- "qcom,pcie-msm8996" for msm8996 or apq8096
- "qcom,pcie-ipq4019" for ipq4019
- "qcom,pcie-ipq8074" for ipq8074
+ - "qcom,pcie-qcs404" for qcs404
- reg:
Usage: required
@@ -116,6 +117,15 @@
- "ahb" AHB clock
- "aux" Auxiliary clock
+- clock-names:
+ Usage: required for qcs404
+ Value type: <stringlist>
+ Definition: Should contain the following entries
+ - "iface" AHB clock
+ - "aux" Auxiliary clock
+ - "master_bus" AXI Master clock
+ - "slave_bus" AXI Slave clock
+
- resets:
Usage: required
Value type: <prop-encoded-array>
@@ -167,6 +177,17 @@
- "ahb" AHB Reset
- "axi_m_sticky" AXI Master Sticky reset
+- reset-names:
+ Usage: required for qcs404
+ Value type: <stringlist>
+ Definition: Should contain the following entries
+ - "axi_m" AXI Master reset
+ - "axi_s" AXI Slave reset
+ - "axi_m_sticky" AXI Master Sticky reset
+ - "pipe_sticky" PIPE sticky reset
+ - "pwr" PWR reset
+ - "ahb" AHB reset
+
- power-domains:
Usage: required for apq8084 and msm8996/apq8096
Value type: <prop-encoded-array>
@@ -195,12 +216,12 @@
Definition: A phandle to the PCIe endpoint power supply
- phys:
- Usage: required for apq8084
+ Usage: required for apq8084 and qcs404
Value type: <phandle>
Definition: List of phandle(s) as listed in phy-names property
- phy-names:
- Usage: required for apq8084
+ Usage: required for apq8084 and qcs404
Value type: <stringlist>
Definition: Should contain "pciephy"
diff --git a/Documentation/devicetree/bindings/pci/rcar-pci.txt b/Documentation/devicetree/bindings/pci/rcar-pci.txt
index 6904882a0e94..45bba9f88a51 100644
--- a/Documentation/devicetree/bindings/pci/rcar-pci.txt
+++ b/Documentation/devicetree/bindings/pci/rcar-pci.txt
@@ -3,6 +3,7 @@
Required properties:
compatible: "renesas,pcie-r8a7743" for the R8A7743 SoC;
"renesas,pcie-r8a7744" for the R8A7744 SoC;
+ "renesas,pcie-r8a774a1" for the R8A774A1 SoC;
"renesas,pcie-r8a774c0" for the R8A774C0 SoC;
"renesas,pcie-r8a7779" for the R8A7779 SoC;
"renesas,pcie-r8a7790" for the R8A7790 SoC;
diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt
new file mode 100644
index 000000000000..d77e3f26f9e6
--- /dev/null
+++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.txt
@@ -0,0 +1,21 @@
+* Freescale(NXP) IMX8 DDR performance monitor
+
+Required properties:
+
+- compatible: should be one of:
+ "fsl,imx8-ddr-pmu"
+ "fsl,imx8m-ddr-pmu"
+
+- reg: physical address and size
+
+- interrupts: single interrupt
+ generated by the control block
+
+Example:
+
+ ddr-pmu@5c020000 {
+ compatible = "fsl,imx8-ddr-pmu";
+ reg = <0x5c020000 0x10000>;
+ interrupt-parent = <&gic>;
+ interrupts = <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>;
+ };
diff --git a/Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml b/Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml
new file mode 100644
index 000000000000..250f9d5aabdf
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/display/allwinner,sun6i-a31-mipi-dphy.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 MIPI D-PHY Controller Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#phy-cells":
+ const: 0
+
+ compatible:
+ const: allwinner,sun6i-a31-mipi-dphy
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+
+ clock-names:
+ items:
+ - const: bus
+ - const: mod
+
+ resets:
+ maxItems: 1
+
+required:
+ - "#phy-cells"
+ - compatible
+ - reg
+ - clocks
+ - clock-names
+ - resets
+
+additionalProperties: false
+
+examples:
+ - |
+ dphy0: d-phy@1ca1000 {
+ compatible = "allwinner,sun6i-a31-mipi-dphy";
+ reg = <0x01ca1000 0x1000>;
+ clocks = <&ccu 23>, <&ccu 97>;
+ clock-names = "bus", "mod";
+ resets = <&ccu 4>;
+ #phy-cells = <0>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt b/Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt
new file mode 100644
index 000000000000..9b23407233c0
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/mixel,mipi-dsi-phy.txt
@@ -0,0 +1,29 @@
+Mixel DSI PHY for i.MX8
+
+The Mixel MIPI-DSI PHY IP block is e.g. found on i.MX8 platforms (along the
+MIPI-DSI IP from Northwest Logic). It represents the physical layer for the
+electrical signals for DSI.
+
+Required properties:
+- compatible: Must be:
+ - "fsl,imx8mq-mipi-dphy"
+- clocks: Must contain an entry for each entry in clock-names.
+- clock-names: Must contain the following entries:
+ - "phy_ref": phandle and specifier referring to the DPHY ref clock
+- reg: the register range of the PHY controller
+- #phy-cells: number of cells in PHY, as defined in
+ Documentation/devicetree/bindings/phy/phy-bindings.txt
+ this must be <0>
+
+Optional properties:
+- power-domains: phandle to power domain
+
+Example:
+ dphy: dphy@30a0030 {
+ compatible = "fsl,imx8mq-mipi-dphy";
+ clocks = <&clk IMX8MQ_CLK_DSI_PHY_REF>;
+ clock-names = "phy_ref";
+ reg = <0x30a00300 0x100>;
+ power-domains = <&pd_mipi0>;
+ #phy-cells = <0>;
+ };
diff --git a/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt b/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt
index 6ac98b3b5f57..c9f5c0caf8a9 100644
--- a/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt
+++ b/Documentation/devicetree/bindings/phy/mxs-usb-phy.txt
@@ -7,6 +7,7 @@ Required properties:
* "fsl,imx6sl-usbphy" for imx6sl
* "fsl,vf610-usbphy" for Vybrid vf610
* "fsl,imx6sx-usbphy" for imx6sx
+ * "fsl,imx7ulp-usbphy" for imx7ulp
"fsl,imx23-usbphy" is still a fallback for other strings
- reg: Should contain registers location and length
- interrupts: Should contain phy interrupt
@@ -23,7 +24,7 @@ Optional properties:
the 17.78mA TX reference current. Default: 100
Example:
-usbphy1: usbphy@20c9000 {
+usbphy1: usb-phy@20c9000 {
compatible = "fsl,imx6q-usbphy", "fsl,imx23-usbphy";
reg = <0x020c9000 0x1000>;
interrupts = <0 44 0x04>;
diff --git a/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt b/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt
index daedb15f322e..9fb682e47c29 100644
--- a/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt
+++ b/Documentation/devicetree/bindings/phy/nvidia,tegra124-xusb-padctl.txt
@@ -42,6 +42,18 @@ Required properties:
- reset-names: Must include the following entries:
- "padctl"
+For Tegra124:
+- avdd-pll-utmip-supply: UTMI PLL power supply. Must supply 1.8 V.
+- avdd-pll-erefe-supply: PLLE reference PLL power supply. Must supply 1.05 V.
+- avdd-pex-pll-supply: PCIe/USB3 PLL power supply. Must supply 1.05 V.
+- hvdd-pex-pll-e-supply: High-voltage PLLE power supply. Must supply 3.3 V.
+
+For Tegra210:
+- avdd-pll-utmip-supply: UTMI PLL power supply. Must supply 1.8 V.
+- avdd-pll-uerefe-supply: PLLE reference PLL power supply. Must supply 1.05 V.
+- dvdd-pex-pll-supply: PCIe/USB3 PLL power supply. Must supply 1.05 V.
+- hvdd-pex-pll-e-supply: High-voltage PLLE power supply. Must supply 1.8 V.
+
For Tegra186:
- avdd-pll-erefeut-supply: UPHY brick and reference clock as well as UTMI PHY
power supply. Must supply 1.8 V.
diff --git a/Documentation/devicetree/bindings/phy/phy-bindings.txt b/Documentation/devicetree/bindings/phy/phy-bindings.txt
index a403b81d0679..c4eb38902533 100644
--- a/Documentation/devicetree/bindings/phy/phy-bindings.txt
+++ b/Documentation/devicetree/bindings/phy/phy-bindings.txt
@@ -1,5 +1,5 @@
This document explains only the device tree data binding. For general
-information about PHY subsystem refer to Documentation/phy.txt
+information about PHY subsystem refer to Documentation/driver-api/phy/phy.rst
PHY device node
===============
diff --git a/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt b/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt
new file mode 100644
index 000000000000..d80e36a77ec5
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/phy-pxa-usb.txt
@@ -0,0 +1,18 @@
+Marvell PXA USB PHY
+-------------------
+
+Required properties:
+- compatible: one of: "marvell,mmp2-usb-phy", "marvell,pxa910-usb-phy",
+ "marvell,pxa168-usb-phy",
+- #phy-cells: must be 0
+
+Example:
+ usb-phy: usbphy@d4207000 {
+ compatible = "marvell,mmp2-usb-phy";
+ reg = <0xd4207000 0x40>;
+ #phy-cells = <0>;
+ status = "okay";
+ };
+
+This document explains the device tree binding. For general
+information about PHY subsystem refer to Documentation/driver-api/phy/phy.rst
diff --git a/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt b/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt
new file mode 100644
index 000000000000..30064253f290
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/qcom-pcie2-phy.txt
@@ -0,0 +1,42 @@
+Qualcomm PCIe2 PHY controller
+=============================
+
+The Qualcomm PCIe2 PHY is a Synopsys based phy found in a number of Qualcomm
+platforms.
+
+Required properties:
+ - compatible: compatible list, should be:
+ "qcom,qcs404-pcie2-phy", "qcom,pcie2-phy"
+
+ - reg: offset and length of the PHY register set.
+ - #phy-cells: must be 0.
+
+ - clocks: a clock-specifier pair for the "pipe" clock
+
+ - vdda-vp-supply: phandle to low voltage regulator
+ - vdda-vph-supply: phandle to high voltage regulator
+
+ - resets: reset-specifier pairs for the "phy" and "pipe" resets
+ - reset-names: list of resets, should contain:
+ "phy" and "pipe"
+
+ - clock-output-names: name of the outgoing clock signal from the PHY PLL
+ - #clock-cells: must be 0
+
+Example:
+ phy@7786000 {
+ compatible = "qcom,qcs404-pcie2-phy", "qcom,pcie2-phy";
+ reg = <0x07786000 0xb8>;
+
+ clocks = <&gcc GCC_PCIE_0_PIPE_CLK>;
+ resets = <&gcc GCC_PCIEPHY_0_PHY_BCR>,
+ <&gcc GCC_PCIE_0_PIPE_ARES>;
+ reset-names = "phy", "pipe";
+
+ vdda-vp-supply = <&vreg_l3_1p05>;
+ vdda-vph-supply = <&vreg_l5_1p8>;
+
+ clock-output-names = "pcie_0_pipe_clk";
+ #clock-cells = <0>;
+ #phy-cells = <0>;
+ };
diff --git a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt
index d46188f450bf..503a8cfb3184 100644
--- a/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt
+++ b/Documentation/devicetree/bindings/phy/rcar-gen3-phy-usb2.txt
@@ -1,10 +1,12 @@
* Renesas R-Car generation 3 USB 2.0 PHY
This file provides information on what the device node for the R-Car generation
-3, RZ/G1C and RZ/G2 USB 2.0 PHY contain.
+3, RZ/G1C, RZ/G2 and RZ/A2 USB 2.0 PHY contain.
Required properties:
-- compatible: "renesas,usb2-phy-r8a77470" if the device is a part of an R8A77470
+- compatible: "renesas,usb2-phy-r7s9210" if the device is a part of an R7S9210
+ SoC.
+ "renesas,usb2-phy-r8a77470" if the device is a part of an R8A77470
SoC.
"renesas,usb2-phy-r8a774a1" if the device is a part of an R8A774A1
SoC.
@@ -20,8 +22,8 @@ Required properties:
R8A77990 SoC.
"renesas,usb2-phy-r8a77995" if the device is a part of an
R8A77995 SoC.
- "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3 or RZ/G2
- compatible device.
+ "renesas,rcar-gen3-usb2-phy" for a generic R-Car Gen3, RZ/G2 or
+ RZ/A2 compatible device.
When compatible with the generic version, nodes must list the
SoC-specific version corresponding to the platform first
@@ -46,6 +48,9 @@ channel as USB OTG:
regulator will be managed during the PHY power on/off sequence.
- renesas,no-otg-pins: boolean, specify when a board does not provide proper
otg pins.
+- dr_mode: string, indicates the working mode for the PHY. Can be "host",
+ "peripheral", or "otg". Should be set if otg controller is not used.
+
Example (R-Car H3):
diff --git a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
index cf96b7c20e4d..328585c6da58 100644
--- a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt
@@ -24,6 +24,8 @@ Required properties:
"allwinner,sun8i-h3-pinctrl"
"allwinner,sun8i-h3-r-pinctrl"
"allwinner,sun8i-r40-pinctrl"
+ "allwinner,sun8i-v3-pinctrl"
+ "allwinner,sun8i-v3s-pinctrl"
"allwinner,sun50i-a64-pinctrl"
"allwinner,sun50i-a64-r-pinctrl"
"allwinner,sun50i-h5-pinctrl"
diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml
new file mode 100644
index 000000000000..125599a2dc5e
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2400-pinctrl.yaml
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/aspeed,ast2400-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ASPEED AST2400 Pin Controller
+
+maintainers:
+ - Andrew Jeffery <andrew@aj.id.au>
+
+description: |+
+ The pin controller node should be the child of a syscon node with the
+ required property:
+
+ - compatible: Should be one of the following:
+ "aspeed,ast2400-scu", "syscon", "simple-mfd"
+ "aspeed,g4-scu", "syscon", "simple-mfd"
+
+ Refer to the the bindings described in
+ Documentation/devicetree/bindings/mfd/syscon.txt
+
+properties:
+ compatible:
+ enum:
+ - aspeed,ast2400-pinctrl
+ - aspeed,g4-pinctrl
+
+patternProperties:
+ '^.*$':
+ if:
+ type: object
+ then:
+ patternProperties:
+ "^function|groups$":
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/string"
+ - enum: [ "ACPI", "ADC0", "ADC1", "ADC10", "ADC11", "ADC12", "ADC13",
+ "ADC14", "ADC15", "ADC2", "ADC3", "ADC4", "ADC5", "ADC6", "ADC7",
+ "ADC8", "ADC9", "BMCINT", "DDCCLK", "DDCDAT", "EXTRST", "FLACK",
+ "FLBUSY", "FLWP", "GPID", "GPID0", "GPID2", "GPID4", "GPID6",
+ "GPIE0", "GPIE2", "GPIE4", "GPIE6", "I2C10", "I2C11", "I2C12",
+ "I2C13", "I2C14", "I2C3", "I2C4", "I2C5", "I2C6", "I2C7", "I2C8",
+ "I2C9", "LPCPD", "LPCPME", "LPCRST", "LPCSMI", "MAC1LINK",
+ "MAC2LINK", "MDIO1", "MDIO2", "NCTS1", "NCTS2", "NCTS3", "NCTS4",
+ "NDCD1", "NDCD2", "NDCD3", "NDCD4", "NDSR1", "NDSR2", "NDSR3",
+ "NDSR4", "NDTR1", "NDTR2", "NDTR3", "NDTR4", "NDTS4", "NRI1",
+ "NRI2", "NRI3", "NRI4", "NRTS1", "NRTS2", "NRTS3", "OSCCLK",
+ "PWM0", "PWM1", "PWM2", "PWM3", "PWM4", "PWM5", "PWM6", "PWM7",
+ "RGMII1", "RGMII2", "RMII1", "RMII2", "ROM16", "ROM8", "ROMCS1",
+ "ROMCS2", "ROMCS3", "ROMCS4", "RXD1", "RXD2", "RXD3", "RXD4",
+ "SALT1", "SALT2", "SALT3", "SALT4", "SD1", "SD2", "SGPMCK",
+ "SGPMI", "SGPMLD", "SGPMO", "SGPSCK", "SGPSI0", "SGPSI1", "SGPSLD",
+ "SIOONCTRL", "SIOPBI", "SIOPBO", "SIOPWREQ", "SIOPWRGD", "SIOS3",
+ "SIOS5", "SIOSCI", "SPI1", "SPI1DEBUG", "SPI1PASSTHRU", "SPICS1",
+ "TIMER3", "TIMER4", "TIMER5", "TIMER6", "TIMER7", "TIMER8", "TXD1",
+ "TXD2", "TXD3", "TXD4", "UART6", "USB11D1", "USB11H2", "USB2D1",
+ "USB2H1", "USBCKI", "VGABIOS_ROM", "VGAHS", "VGAVS", "VPI18",
+ "VPI24", "VPI30", "VPO12", "VPO24", "WDTRST1", "WDTRST2" ]
+
+required:
+ - compatible
+
+examples:
+ - |
+ syscon: scu@1e6e2000 {
+ compatible = "aspeed,ast2400-scu", "syscon", "simple-mfd";
+ reg = <0x1e6e2000 0x1a8>;
+
+ pinctrl: pinctrl {
+ compatible = "aspeed,g4-pinctrl";
+
+ pinctrl_i2c3_default: i2c3_default {
+ function = "I2C3";
+ groups = "I2C3";
+ };
+
+ pinctrl_gpioh0_unbiased_default: gpioh0 {
+ pins = "A8";
+ bias-disable;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml
new file mode 100644
index 000000000000..3e6d85318577
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/aspeed,ast2500-pinctrl.yaml
@@ -0,0 +1,133 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/aspeed,ast2500-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ASPEED AST2500 Pin Controller
+
+maintainers:
+ - Andrew Jeffery <andrew@aj.id.au>
+
+description: |+
+ The pin controller node should be the child of a syscon node with the
+ required property:
+
+ - compatible: Should be one of the following:
+ "aspeed,ast2500-scu", "syscon", "simple-mfd"
+ "aspeed,g5-scu", "syscon", "simple-mfd"
+
+ Refer to the the bindings described in
+ Documentation/devicetree/bindings/mfd/syscon.txt
+
+properties:
+ compatible:
+ enum:
+ - aspeed,ast2500-pinctrl
+ - aspeed,g5-pinctrl
+ aspeed,external-nodes:
+ minItems: 2
+ maxItems: 2
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/phandle-array
+ description: |
+ A cell of phandles to external controller nodes:
+ 0: compatible with "aspeed,ast2500-gfx", "syscon"
+ 1: compatible with "aspeed,ast2500-lhc", "syscon"
+
+patternProperties:
+ '^.*$':
+ if:
+ type: object
+ then:
+ patternProperties:
+ "^function|groups$":
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/string"
+ - enum: [ "ACPI", "ADC0", "ADC1", "ADC10", "ADC11", "ADC12", "ADC13",
+ "ADC14", "ADC15", "ADC2", "ADC3", "ADC4", "ADC5", "ADC6", "ADC7",
+ "ADC8", "ADC9", "BMCINT", "DDCCLK", "DDCDAT", "ESPI", "FWSPICS1",
+ "FWSPICS2", "GPID0", "GPID2", "GPID4", "GPID6", "GPIE0", "GPIE2",
+ "GPIE4", "GPIE6", "I2C10", "I2C11", "I2C12", "I2C13", "I2C14",
+ "I2C3", "I2C4", "I2C5", "I2C6", "I2C7", "I2C8", "I2C9", "LAD0",
+ "LAD1", "LAD2", "LAD3", "LCLK", "LFRAME", "LPCHC", "LPCPD",
+ "LPCPLUS", "LPCPME", "LPCRST", "LPCSMI", "LSIRQ", "MAC1LINK",
+ "MAC2LINK", "MDIO1", "MDIO2", "NCTS1", "NCTS2", "NCTS3", "NCTS4",
+ "NDCD1", "NDCD2", "NDCD3", "NDCD4", "NDSR1", "NDSR2", "NDSR3",
+ "NDSR4", "NDTR1", "NDTR2", "NDTR3", "NDTR4", "NRI1", "NRI2",
+ "NRI3", "NRI4", "NRTS1", "NRTS2", "NRTS3", "NRTS4", "OSCCLK",
+ "PEWAKE", "PNOR", "PWM0", "PWM1", "PWM2", "PWM3", "PWM4", "PWM5",
+ "PWM6", "PWM7", "RGMII1", "RGMII2", "RMII1", "RMII2", "RXD1",
+ "RXD2", "RXD3", "RXD4", "SALT1", "SALT10", "SALT11", "SALT12",
+ "SALT13", "SALT14", "SALT2", "SALT3", "SALT4", "SALT5", "SALT6",
+ "SALT7", "SALT8", "SALT9", "SCL1", "SCL2", "SD1", "SD2", "SDA1",
+ "SDA2", "SGPS1", "SGPS2", "SIOONCTRL", "SIOPBI", "SIOPBO",
+ "SIOPWREQ", "SIOPWRGD", "SIOS3", "SIOS5", "SIOSCI", "SPI1",
+ "SPI1CS1", "SPI1DEBUG", "SPI1PASSTHRU", "SPI2CK", "SPI2CS0",
+ "SPI2CS1", "SPI2MISO", "SPI2MOSI", "TIMER3", "TIMER4", "TIMER5",
+ "TIMER6", "TIMER7", "TIMER8", "TXD1", "TXD2", "TXD3", "TXD4",
+ "UART6", "USB11BHID", "USB2AD", "USB2AH", "USB2BD", "USB2BH",
+ "USBCKI", "VGABIOSROM", "VGAHS", "VGAVS", "VPI24", "VPO",
+ "WDTRST1", "WDTRST2", ]
+
+required:
+ - compatible
+ - aspeed,external-nodes
+
+examples:
+ - |
+ apb {
+ compatible = "simple-bus";
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges;
+
+ syscon: scu@1e6e2000 {
+ compatible = "aspeed,ast2500-scu", "syscon", "simple-mfd";
+ reg = <0x1e6e2000 0x1a8>;
+
+ pinctrl: pinctrl {
+ compatible = "aspeed,g5-pinctrl";
+ aspeed,external-nodes = <&gfx>, <&lhc>;
+
+ pinctrl_i2c3_default: i2c3_default {
+ function = "I2C3";
+ groups = "I2C3";
+ };
+
+ pinctrl_gpioh0_unbiased_default: gpioh0 {
+ pins = "A18";
+ bias-disable;
+ };
+ };
+ };
+
+ gfx: display@1e6e6000 {
+ compatible = "aspeed,ast2500-gfx", "syscon";
+ reg = <0x1e6e6000 0x1000>;
+ };
+ };
+
+ lpc: lpc@1e789000 {
+ compatible = "aspeed,ast2500-lpc", "simple-mfd";
+ reg = <0x1e789000 0x1000>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0x0 0x1e789000 0x1000>;
+
+ lpc_host: lpc-host@80 {
+ compatible = "aspeed,ast2500-lpc-host", "simple-mfd", "syscon";
+ reg = <0x80 0x1e0>;
+ reg-io-width = <4>;
+
+ #address-cells = <1>;
+ #size-cells = <1>;
+ ranges = <0x0 0x80 0x1e0>;
+
+ lhc: lhc@20 {
+ compatible = "aspeed,ast2500-lhc";
+ reg = <0x20 0x24 0x48 0x8>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt
index ed34bb1ee81c..4980776122cc 100644
--- a/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/bitmain,bm1880-pinctrl.txt
@@ -14,7 +14,8 @@ phrase "pin configuration node".
The pin configuration nodes act as a container for an arbitrary number of
subnodes. Each of these subnodes represents some desired configuration for a
pin, a group, or a list of pins or groups. This configuration for BM1880 SoC
-includes only pinmux as there is no pinconf support available in SoC.
+includes pinmux and various pin configuration parameters, such as pull-up,
+slew rate etc...
Each configuration node can consist of multiple nodes describing the pinmux
options. The name of each subnode is not important; all subnodes should be
@@ -84,10 +85,37 @@ Required Properties:
gpio66, gpio67, eth1, i2s0, i2s0_mclkin, i2s1, i2s1_mclkin,
spi0
+Optional Properties:
+
+- bias-disable: No arguments. Disable pin bias.
+- bias-pull-down: No arguments. The specified pins should be configured as
+ pull down.
+- bias-pull-up: No arguments. The specified pins should be configured as
+ pull up.
+- input-schmitt-enable: No arguments: Enable schmitt trigger for the specified
+ pins
+- input-schmitt-disable: No arguments: Disable schmitt trigger for the specified
+ pins
+- slew-rate: Integer. Sets slew rate for the specified pins.
+ Valid values are:
+ <0> - Slow
+ <1> - Fast
+- drive-strength: Integer. Selects the drive strength for the specified
+ pins in mA.
+ Valid values are:
+ <4>
+ <8>
+ <12>
+ <16>
+ <20>
+ <24>
+ <28>
+ <32>
+
Example:
- pinctrl: pinctrl@50 {
+ pinctrl: pinctrl@400 {
compatible = "bitmain,bm1880-pinctrl";
- reg = <0x50 0x4B0>;
+ reg = <0x400 0x120>;
pinctrl_uart0_default: uart0-default {
pinmux {
diff --git a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt
index 3fac0a061bcc..ac6d614d74e0 100644
--- a/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt
+++ b/Documentation/devicetree/bindings/pinctrl/brcm,bcm2835-gpio.txt
@@ -5,6 +5,9 @@ controller, and pinmux/control device.
Required properties:
- compatible: "brcm,bcm2835-gpio"
+- compatible: should be one of:
+ "brcm,bcm2835-gpio" - BCM2835 compatible pinctrl
+ "brcm,bcm7211-gpio" - BCM7211 compatible pinctrl
- reg: Should contain the physical address of the GPIO module's registers.
- gpio-controller: Marks the device node as a GPIO controller.
- #gpio-cells : Should be two. The first cell is the pin number and the
diff --git a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt
index 524a16fca666..e4e01c05cf83 100644
--- a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mm-pinctrl.txt
@@ -12,7 +12,7 @@ Required properties in sub-nodes:
- fsl,pins: each entry consists of 6 integers and represents the mux and config
setting for one pin. The first 5 integers <mux_reg conf_reg input_reg mux_val
input_val> are specified using a PIN_FUNC_ID macro, which can be found in
- <dt-bindings/pinctrl/imx8mm-pinfunc.h>. The last integer CONFIG is
+ <arch/arm64/boot/dts/freescale/imx8mm-pinfunc.h>. The last integer CONFIG is
the pad setting value like pull-up on this pin. Please refer to i.MX8M Mini
Reference Manual for detailed CONFIG settings.
diff --git a/Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.txt
new file mode 100644
index 000000000000..330716c971b9
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/fsl,imx8mn-pinctrl.txt
@@ -0,0 +1,39 @@
+* Freescale IMX8MN IOMUX Controller
+
+Please refer to fsl,imx-pinctrl.txt and pinctrl-bindings.txt in this directory
+for common binding part and usage.
+
+Required properties:
+- compatible: "fsl,imx8mn-iomuxc"
+- reg: should contain the base physical address and size of the iomuxc
+ registers.
+
+Required properties in sub-nodes:
+- fsl,pins: each entry consists of 6 integers and represents the mux and config
+ setting for one pin. The first 5 integers <mux_reg conf_reg input_reg mux_val
+ input_val> are specified using a PIN_FUNC_ID macro, which can be found in
+ <arch/arm64/boot/dts/freescale/imx8mn-pinfunc.h>. The last integer CONFIG is
+ the pad setting value like pull-up on this pin. Please refer to i.MX8M Nano
+ Reference Manual for detailed CONFIG settings.
+
+Examples:
+
+&uart1 {
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_uart1>;
+};
+
+iomuxc: pinctrl@30330000 {
+ compatible = "fsl,imx8mn-iomuxc";
+ reg = <0x0 0x30330000 0x0 0x10000>;
+
+ pinctrl_uart1: uart1grp {
+ fsl,pins = <
+ MX8MN_IOMUXC_UART1_RXD_UART1_DCE_RX 0x140
+ MX8MN_IOMUXC_UART1_TXD_UART1_DCE_TX 0x140
+ MX8MN_IOMUXC_UART3_RXD_UART1_DCE_CTS_B 0x140
+ MX8MN_IOMUXC_UART3_TXD_UART1_DCE_RTS_B 0x140
+ MX8MN_IOMUXC_SD1_DATA4_GPIO2_IO6 0x19
+ >;
+ };
+};
diff --git a/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt
index 6c0ea155b708..2932f171ee85 100644
--- a/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/marvell,kirkwood-pinctrl.txt
@@ -6,8 +6,8 @@ part and usage.
Required properties:
- compatible: "marvell,88f6180-pinctrl",
"marvell,88f6190-pinctrl", "marvell,88f6192-pinctrl",
- "marvell,88f6281-pinctrl", "marvell,88f6282-pinctrl"
- "marvell,98dx4122-pinctrl"
+ "marvell,88f6281-pinctrl", "marvell,88f6282-pinctrl",
+ "marvell,98dx4122-pinctrl", "marvell,98dx1135-pinctrl"
- reg: register specifier of MPP registers
This driver supports all kirkwood variants, i.e. 88f6180, 88f619x, and 88f628x.
@@ -317,3 +317,43 @@ mpp44 44 gpio
mpp45 45 gpio
mpp49 49 gpio
+* Marvell Poncat2 98dx1135
+
+name pins functions
+================================================================================
+
+mpp0 0 gpio, nand(io2), spi(cs)
+mpp1 1 gpo, nand(io3), spi(mosi)
+mpp2 2 gpo, nand(io4), spi(sck)
+mpp3 3 gpo, nand(io5), spi(miso)
+mpp4 4 gpio, nand(io6), uart0(rxd)
+mpp5 5 gpo, nand(io7), uart0(txd)
+mpp6 6 sysrst(out)
+mpp7 7 gpo, spi(cs)
+mpp8 8 gpio, twsi0(sda), uart1(rts)
+mpp9 9 gpio, twsi(sck), uart1(cts)
+mpp10 10 gpo, uart0(txd)
+mpp11 11 gpio, uart0(rxd)
+mpp13 13 gpio, uart1(txd)
+mpp14 14 gpio, uart1(rxd)
+mpp15 15 gpio, uart0(rts)
+mpp16 16 gpio, uart0(cts)
+mpp17 17 gpio, nand(cle)
+mpp18 18 gpo, nand(io0)
+mpp19 19 gpo, nand(io1)
+mpp20 20 gpio
+mpp21 21 gpio
+mpp22 22 gpio
+mpp23 23 gpio
+mpp24 24 gpio
+mpp25 25 gpio
+mpp26 26 gpio
+mpp27 27 gpio
+mpp28 28 gpio, nand(ren)
+mpp29 29 gpio, nand(wen)
+mpp30 30 gpio
+mpp31 31 gpio
+mpp32 32 gpio
+mpp33 33 gpio
+mpp34 34 gpio, nand(ale)
+mpp35 35 gpio, nand(cen)
diff --git a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt
index a47dd990a8d3..10dc4f7176ca 100644
--- a/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/meson,pinctrl.txt
@@ -47,9 +47,19 @@ Required properties for pinmux nodes are:
Required properties for configuration nodes:
- pins: a list of pin names
-Configuration nodes support the generic properties "bias-disable",
-"bias-pull-up" and "bias-pull-down", described in file
-pinctrl-bindings.txt
+Configuration nodes support the following generic properties, as
+described in file pinctrl-bindings.txt:
+ - "bias-disable"
+ - "bias-pull-up"
+ - "bias-pull-down"
+ - "output-enable"
+ - "output-disable"
+ - "output-low"
+ - "output-high"
+
+Optional properties :
+ - drive-strength-microamp: Drive strength for the specified pins in uA.
+ This property is only valid for G12A and newer.
=== Example ===
diff --git a/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt
index 29b72e303ebf..51efd2085113 100644
--- a/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/microchip,pic32-pinctrl.txt
@@ -5,7 +5,7 @@ Please refer to pinctrl-bindings.txt, ../gpio/gpio.txt, and
pin controller, GPIO, and interrupt bindings.
PIC32 'pin configuration node' is a node of a group of pins which can be
-used for a specific device or function. This node represents configuraions of
+used for a specific device or function. This node represents configurations of
pins, optional function, and optional mux related configuration.
Required properties for pin controller node:
diff --git a/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt
index 83f4bbac94bb..a1264cc8660d 100644
--- a/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/nuvoton,npcm7xx-pinctrl.txt
@@ -213,4 +213,4 @@ pinctrl: pinctrl@f0800000 {
groups = "clkreq";
function = "clkreq";
};
-}; \ No newline at end of file
+};
diff --git a/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt
new file mode 100644
index 000000000000..8763f448c376
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/nvidia,tegra194-pinmux.txt
@@ -0,0 +1,107 @@
+NVIDIA Tegra194 pinmux controller
+
+Required properties:
+- compatible: "nvidia,tegra194-pinmux"
+- reg: Should contain a list of base address and size pairs for:
+ - first entry: The APB_MISC_GP_*_PADCTRL registers (pad control)
+ - second entry: The PINMUX_AUX_* registers (pinmux)
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+Tegra's pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, tristate, drive strength, etc.
+
+See the TRM to determine which properties and values apply to each pin/group.
+Macro values for property values are defined in
+include/dt-binding/pinctrl/pinctrl-tegra.h.
+
+Required subnode-properties:
+- nvidia,pins : An array of strings. Each string contains the name of a pin or
+ group. Valid values for these names are listed below.
+
+Optional subnode-properties:
+- nvidia,function: A string containing the name of the function to mux to the
+ pin or group.
+- nvidia,pull: Integer, representing the pull-down/up to apply to the pin.
+ 0: none, 1: down, 2: up.
+- nvidia,tristate: Integer.
+ 0: drive, 1: tristate.
+- nvidia,enable-input: Integer. Enable the pin's input path.
+ enable :TEGRA_PIN_ENABLE and
+ disable or output only: TEGRA_PIN_DISABLE.
+- nvidia,open-drain: Integer.
+ enable: TEGRA_PIN_ENABLE.
+ disable: TEGRA_PIN_DISABLE.
+- nvidia,lock: Integer. Lock the pin configuration against further changes
+ until reset.
+ enable: TEGRA_PIN_ENABLE.
+ disable: TEGRA_PIN_DISABLE.
+- nvidia,io-hv: Integer. Select high-voltage receivers.
+ normal: TEGRA_PIN_DISABLE
+ high: TEGRA_PIN_ENABLE
+- nvidia,schmitt: Integer. Enables Schmitt Trigger on the input.
+ normal: TEGRA_PIN_DISABLE
+ high: TEGRA_PIN_ENABLE
+- nvidia,drive-type: Integer. Valid range 0...3.
+- nvidia,pull-down-strength: Integer. Controls drive strength. 0 is weakest.
+ The range of valid values depends on the pingroup. See "CAL_DRVDN" in the
+ Tegra TRM.
+- nvidia,pull-up-strength: Integer. Controls drive strength. 0 is weakest.
+ The range of valid values depends on the pingroup. See "CAL_DRVUP" in the
+ Tegra TRM.
+
+Valid values for pin and group names (nvidia,pin) are:
+
+ These correspond to Tegra PADCTL_* (pinmux) registers.
+
+ Mux groups:
+
+ These correspond to Tegra PADCTL_* (pinmux) registers. Any property
+ that exists in those registers may be set for the following pin names.
+
+ pex_l5_clkreq_n_pgg0, pex_l5_rst_n_pgg1
+
+ Drive groups:
+
+ These registers controls a single pin for which a mux group exists.
+ See the list above for the pin name to use when configuring the pinmux.
+
+ pex_l5_clkreq_n_pgg0, pex_l5_rst_n_pgg1
+
+Valid values for nvidia,functions are:
+
+ pe5
+
+Power Domain:
+ pex_l5_clkreq_n_pgg0 and pex_l5_rst_n_pgg1 are part of PCIE C5 power
+ partition. Client devices must enable this partition before accessing
+ these pins here.
+
+
+Example:
+
+ tegra_pinctrl: pinmux: pinmux@2430000 {
+ compatible = "nvidia,tegra194-pinmux";
+ reg = <0x2430000 0x17000
+ 0xc300000 0x4000>;
+
+ pinctrl-names = "pex_rst";
+ pinctrl-0 = <&pex_rst_c5_out_state>;
+
+ pex_rst_c5_out_state: pex_rst_c5_out {
+ pex_rst {
+ nvidia,pins = "pex_l5_rst_n_pgg1";
+ nvidia,schmitt = <TEGRA_PIN_DISABLE>;
+ nvidia,lpdr = <TEGRA_PIN_ENABLE>;
+ nvidia,enable-input = <TEGRA_PIN_DISABLE>;
+ nvidia,io-high-voltage = <TEGRA_PIN_ENABLE>;
+ nvidia,tristate = <TEGRA_PIN_DISABLE>;
+ nvidia,pull = <TEGRA_PIN_PULL_NONE>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
deleted file mode 100644
index 3b7266c7c438..000000000000
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
+++ /dev/null
@@ -1,172 +0,0 @@
-======================
-Aspeed Pin Controllers
-======================
-
-The Aspeed SoCs vary in functionality inside a generation but have a common mux
-device register layout.
-
-Required properties for g4:
-- compatible : Should be one of the following:
- "aspeed,ast2400-pinctrl"
- "aspeed,g4-pinctrl"
-
-Required properties for g5:
-- compatible : Should be one of the following:
- "aspeed,ast2500-pinctrl"
- "aspeed,g5-pinctrl"
-
-- aspeed,external-nodes: A cell of phandles to external controller nodes:
- 0: compatible with "aspeed,ast2500-gfx", "syscon"
- 1: compatible with "aspeed,ast2500-lhc", "syscon"
-
-The pin controller node should be the child of a syscon node with the required
-property:
-
-- compatible : Should be one of the following:
- "aspeed,ast2400-scu", "syscon", "simple-mfd"
- "aspeed,g4-scu", "syscon", "simple-mfd"
- "aspeed,ast2500-scu", "syscon", "simple-mfd"
- "aspeed,g5-scu", "syscon", "simple-mfd"
-
-Refer to the the bindings described in
-Documentation/devicetree/bindings/mfd/syscon.txt
-
-Subnode Format
-==============
-
-The required properties of pinmux child nodes are:
-- function: the mux function to select
-- groups : the list of groups to select with this function
-
-Required properties of pinconf child nodes are:
-- groups: A list of groups to select (either this or "pins" must be
- specified)
-- pins : A list of ball names as strings, eg "D14" (either this or "groups"
- must be specified)
-
-Optional properties of pinconf child nodes are:
-- bias-disable : disable any pin bias
-- bias-pull-down: pull down the pin
-- drive-strength: sink or source at most X mA
-
-Definitions are as specified in
-Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt, with any
-further limitations as described above.
-
-For pinmux, each mux function has only one associated pin group. Each group is
-named by its function. The following values for the function and groups
-properties are supported:
-
-aspeed,ast2400-pinctrl, aspeed,g4-pinctrl:
-
-ACPI ADC0 ADC1 ADC10 ADC11 ADC12 ADC13 ADC14 ADC15 ADC2 ADC3 ADC4 ADC5 ADC6
-ADC7 ADC8 ADC9 BMCINT DDCCLK DDCDAT EXTRST FLACK FLBUSY FLWP GPID GPID0 GPID2
-GPID4 GPID6 GPIE0 GPIE2 GPIE4 GPIE6 I2C10 I2C11 I2C12 I2C13 I2C14 I2C3 I2C4
-I2C5 I2C6 I2C7 I2C8 I2C9 LPCPD LPCPME LPCRST LPCSMI MAC1LINK MAC2LINK MDIO1
-MDIO2 NCTS1 NCTS2 NCTS3 NCTS4 NDCD1 NDCD2 NDCD3 NDCD4 NDSR1 NDSR2 NDSR3 NDSR4
-NDTR1 NDTR2 NDTR3 NDTR4 NDTS4 NRI1 NRI2 NRI3 NRI4 NRTS1 NRTS2 NRTS3 OSCCLK PWM0
-PWM1 PWM2 PWM3 PWM4 PWM5 PWM6 PWM7 RGMII1 RGMII2 RMII1 RMII2 ROM16 ROM8 ROMCS1
-ROMCS2 ROMCS3 ROMCS4 RXD1 RXD2 RXD3 RXD4 SALT1 SALT2 SALT3 SALT4 SD1 SD2 SGPMCK
-SGPMI SGPMLD SGPMO SGPSCK SGPSI0 SGPSI1 SGPSLD SIOONCTRL SIOPBI SIOPBO SIOPWREQ
-SIOPWRGD SIOS3 SIOS5 SIOSCI SPI1 SPI1DEBUG SPI1PASSTHRU SPICS1 TIMER3 TIMER4
-TIMER5 TIMER6 TIMER7 TIMER8 TXD1 TXD2 TXD3 TXD4 UART6 USB11D1 USB11H2 USB2D1
-USB2H1 USBCKI VGABIOS_ROM VGAHS VGAVS VPI18 VPI24 VPI30 VPO12 VPO24 WDTRST1
-WDTRST2
-
-aspeed,ast2500-pinctrl, aspeed,g5-pinctrl:
-
-ACPI ADC0 ADC1 ADC10 ADC11 ADC12 ADC13 ADC14 ADC15 ADC2 ADC3 ADC4 ADC5 ADC6
-ADC7 ADC8 ADC9 BMCINT DDCCLK DDCDAT ESPI FWSPICS1 FWSPICS2 GPID0 GPID2 GPID4
-GPID6 GPIE0 GPIE2 GPIE4 GPIE6 I2C10 I2C11 I2C12 I2C13 I2C14 I2C3 I2C4 I2C5 I2C6
-I2C7 I2C8 I2C9 LAD0 LAD1 LAD2 LAD3 LCLK LFRAME LPCHC LPCPD LPCPLUS LPCPME
-LPCRST LPCSMI LSIRQ MAC1LINK MAC2LINK MDIO1 MDIO2 NCTS1 NCTS2 NCTS3 NCTS4 NDCD1
-NDCD2 NDCD3 NDCD4 NDSR1 NDSR2 NDSR3 NDSR4 NDTR1 NDTR2 NDTR3 NDTR4 NRI1 NRI2
-NRI3 NRI4 NRTS1 NRTS2 NRTS3 NRTS4 OSCCLK PEWAKE PNOR PWM0 PWM1 PWM2 PWM3 PWM4
-PWM5 PWM6 PWM7 RGMII1 RGMII2 RMII1 RMII2 RXD1 RXD2 RXD3 RXD4 SALT1 SALT10
-SALT11 SALT12 SALT13 SALT14 SALT2 SALT3 SALT4 SALT5 SALT6 SALT7 SALT8 SALT9
-SCL1 SCL2 SD1 SD2 SDA1 SDA2 SGPS1 SGPS2 SIOONCTRL SIOPBI SIOPBO SIOPWREQ
-SIOPWRGD SIOS3 SIOS5 SIOSCI SPI1 SPI1CS1 SPI1DEBUG SPI1PASSTHRU SPI2CK SPI2CS0
-SPI2CS1 SPI2MISO SPI2MOSI TIMER3 TIMER4 TIMER5 TIMER6 TIMER7 TIMER8 TXD1 TXD2
-TXD3 TXD4 UART6 USB11BHID USB2AD USB2AH USB2BD USB2BH USBCKI VGABIOSROM VGAHS
-VGAVS VPI24 VPO WDTRST1 WDTRST2
-
-Examples
-========
-
-g4 Example
-----------
-
-syscon: scu@1e6e2000 {
- compatible = "aspeed,ast2400-scu", "syscon", "simple-mfd";
- reg = <0x1e6e2000 0x1a8>;
-
- pinctrl: pinctrl {
- compatible = "aspeed,g4-pinctrl";
-
- pinctrl_i2c3_default: i2c3_default {
- function = "I2C3";
- groups = "I2C3";
- };
-
- pinctrl_gpioh0_unbiased_default: gpioh0 {
- pins = "A8";
- bias-disable;
- };
- };
-};
-
-g5 Example
-----------
-
-ahb {
- apb {
- syscon: scu@1e6e2000 {
- compatible = "aspeed,ast2500-scu", "syscon", "simple-mfd";
- reg = <0x1e6e2000 0x1a8>;
-
- pinctrl: pinctrl {
- compatible = "aspeed,g5-pinctrl";
- aspeed,external-nodes = <&gfx &lhc>;
-
- pinctrl_i2c3_default: i2c3_default {
- function = "I2C3";
- groups = "I2C3";
- };
-
- pinctrl_gpioh0_unbiased_default: gpioh0 {
- pins = "A18";
- bias-disable;
- };
- };
- };
-
- gfx: display@1e6e6000 {
- compatible = "aspeed,ast2500-gfx", "syscon";
- reg = <0x1e6e6000 0x1000>;
- };
- };
-
- lpc: lpc@1e789000 {
- compatible = "aspeed,ast2500-lpc", "simple-mfd";
- reg = <0x1e789000 0x1000>;
-
- #address-cells = <1>;
- #size-cells = <1>;
- ranges = <0x0 0x1e789000 0x1000>;
-
- lpc_host: lpc-host@80 {
- compatible = "aspeed,ast2500-lpc-host", "simple-mfd", "syscon";
- reg = <0x80 0x1e0>;
- reg-io-width = <4>;
-
- #address-cells = <1>;
- #size-cells = <1>;
- ranges = <0x0 0x80 0x1e0>;
-
- lhc: lhc@20 {
- compatible = "aspeed,ast2500-lhc";
- reg = <0x20 0x24 0x48 0x8>;
- };
- };
- };
-};
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
index cef2b5855d60..fcd37e93ed4d 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
@@ -258,6 +258,7 @@ drive-push-pull - drive actively high and low
drive-open-drain - drive with open drain
drive-open-source - drive with open source
drive-strength - sink or source at most X mA
+drive-strength-microamp - sink or source at most X uA
input-enable - enable input on pin (no effect on output, such as
enabling an input buffer)
input-disable - disable input on pin (no effect on output, such as
@@ -326,6 +327,8 @@ arguments are described below.
- drive-strength takes as argument the target strength in mA.
+- drive-strength-microamp takes as argument the target strength in uA.
+
- input-debounce takes the debounce time in usec as argument
or 0 to disable debouncing
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt
index 68e93d5b7ede..c9782397ff14 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,apq8084-pinctrl.txt
@@ -122,17 +122,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt
index 6dd72f8599e9..7b151894f5a0 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,ipq8074-pinctrl.txt
@@ -118,17 +118,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt
index 86ecdcfc4fb8..d46973968873 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,mdm9615-pinctrl.txt
@@ -97,17 +97,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt
index 195a7a0ef0cc..3354a63296d9 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8916-pinctrl.txt
@@ -130,17 +130,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt
index 5034eb6653c7..a7dd213c77c6 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8960-pinctrl.txt
@@ -124,17 +124,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
index f15443f6e78e..da52df6273bc 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8994-pinctrl.txt
@@ -128,17 +128,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt
index fa97f609fe45..a56cb882830c 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8996-pinctrl.txt
@@ -149,17 +149,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt
index e70c79bbbc5b..cdec1eeb2799 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,msm8998-pinctrl.txt
@@ -40,6 +40,14 @@ MSM8998 platform.
Definition: must be 2. Specifying the pin number and flags, as defined
in <dt-bindings/gpio/gpio.h>
+- gpio-ranges:
+ Usage: required
+ Definition: see ../gpio/gpio.txt
+
+- gpio-reserved-ranges:
+ Usage: optional
+ Definition: see ../gpio/gpio.txt
+
Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
a general description of GPIO and interrupt bindings.
@@ -135,17 +143,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
@@ -175,6 +183,8 @@ Example:
interrupts = <0 208 0>;
gpio-controller;
#gpio-cells = <2>;
+ gpio-ranges = <&tlmm 0 0 175>;
+ gpio-reserved-ranges = <0 4>, <81 4>;
interrupt-controller;
#interrupt-cells = <2>;
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt
index 2b8f77762edc..a50e74684195 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,qcs404-pinctrl.txt
@@ -150,17 +150,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
index 769ca83bb40d..be034d329e10 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm660-pinctrl.txt
@@ -142,17 +142,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt
index 665aadb5ea28..7462e3743c68 100644
--- a/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sdm845-pinctrl.txt
@@ -79,7 +79,7 @@ to specify in a pin configuration subnode:
gpio0-gpio149
Supports mux, bias and drive-strength
- sdc2_clk, sdc2_cmd, sdc2_data
+ sdc2_clk, sdc2_cmd, sdc2_data, ufs_reset
Supports bias and drive-strength
- function:
@@ -118,17 +118,17 @@ to specify in a pin configuration subnode:
- bias-disable:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as no pull.
+ Definition: The specified pins should be configured as no pull.
- bias-pull-down:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull down.
+ Definition: The specified pins should be configured as pull down.
- bias-pull-up:
Usage: optional
Value type: <none>
- Definition: The specified pins should be configued as pull up.
+ Definition: The specified pins should be configured as pull up.
- output-high:
Usage: optional
diff --git a/Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt
new file mode 100644
index 000000000000..fa37733e5102
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/qcom,sm8150-pinctrl.txt
@@ -0,0 +1,190 @@
+Qualcomm SM8150 TLMM block
+
+This binding describes the Top Level Mode Multiplexer block found in the
+QCS404 platform.
+
+- compatible:
+ Usage: required
+ Value type: <string>
+ Definition: must be "qcom,sm8150-pinctrl"
+
+- reg:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: the base address and size of the north, south, west
+ and east TLMM tiles.
+
+- reg-names:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Defintiion: names for the cells of reg, must contain "north", "south"
+ "west" and "east".
+
+- interrupts:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: should specify the TLMM summary IRQ.
+
+- interrupt-controller:
+ Usage: required
+ Value type: <none>
+ Definition: identifies this node as an interrupt controller
+
+- #interrupt-cells:
+ Usage: required
+ Value type: <u32>
+ Definition: must be 2. Specifying the pin number and flags, as defined
+ in <dt-bindings/interrupt-controller/irq.h>
+
+- gpio-controller:
+ Usage: required
+ Value type: <none>
+ Definition: identifies this node as a gpio controller
+
+- #gpio-cells:
+ Usage: required
+ Value type: <u32>
+ Definition: must be 2. Specifying the pin number and flags, as defined
+ in <dt-bindings/gpio/gpio.h>
+
+- gpio-ranges:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: see ../gpio/gpio.txt
+
+- gpio-reserved-ranges:
+ Usage: optional
+ Value type: <prop-encoded-array>
+ Definition: see ../gpio/gpio.txt
+
+Please refer to ../gpio/gpio.txt and ../interrupt-controller/interrupts.txt for
+a general description of GPIO and interrupt bindings.
+
+Please refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning of the
+phrase "pin configuration node".
+
+The pin configuration nodes act as a container for an arbitrary number of
+subnodes. Each of these subnodes represents some desired configuration for a
+pin, a group, or a list of pins or groups. This configuration can include the
+mux function to select on those pin(s)/group(s), and various pin configuration
+parameters, such as pull-up, drive strength, etc.
+
+
+PIN CONFIGURATION NODES:
+
+The name of each subnode is not important; all subnodes should be enumerated
+and processed purely based on their content.
+
+Each subnode only affects those parameters that are explicitly listed. In
+other words, a subnode that lists a mux function but no pin configuration
+parameters implies no information about any pin configuration parameters.
+Similarly, a pin subnode that describes a pullup parameter implies no
+information about e.g. the mux function.
+
+
+The following generic properties as defined in pinctrl-bindings.txt are valid
+to specify in a pin configuration subnode:
+
+- pins:
+ Usage: required
+ Value type: <string-array>
+ Definition: List of gpio pins affected by the properties specified in
+ this subnode.
+
+ Valid pins are:
+ gpio0-gpio149
+ Supports mux, bias and drive-strength
+
+ sdc1_clk, sdc1_cmd, sdc1_data sdc2_clk, sdc2_cmd,
+ sdc2_data sdc1_rclk
+ Supports bias and drive-strength
+
+ ufs_reset
+ Supports bias and drive-strength
+
+- function:
+ Usage: required
+ Value type: <string>
+ Definition: Specify the alternative function to be configured for the
+ specified pins. Functions are only valid for gpio pins.
+ Valid values are:
+
+ adsp_ext, agera_pll, aoss_cti, ddr_pxi2, atest_char,
+ atest_char0, atest_char1, atest_char2, atest_char3,
+ audio_ref, atest_usb1, atest_usb2, atest_usb10,
+ atest_usb11, atest_usb12, atest_usb13, atest_usb20,
+ atest_usb21, atest_usb22, atest_usb2, atest_usb23,
+ btfm_slimbus, cam_mclk, cci_async, cci_i2c, cci_timer0,
+ cci_timer1, cci_timer2, cci_timer3, cci_timer4,
+ cri_trng, cri_trng0, cri_trng1, dbg_out, ddr_bist,
+ ddr_pxi0, ddr_pxi1, ddr_pxi3, edp_hot, edp_lcd,
+ emac_phy, emac_pps, gcc_gp1, gcc_gp2, gcc_gp3, gpio,
+ hs1_mi2s, hs2_mi2s, hs3_mi2s, jitter_bist,
+ lpass_slimbus, mdp_vsync, mdp_vsync0, mdp_vsync1,
+ mdp_vsync2, mdp_vsync3, mss_lte, m_voc, nav_pps,
+ pa_indicator, pci_e0, phase_flag, pll_bypassnl,
+ pll_bist, pci_e1, pll_reset, pri_mi2s, pri_mi2s_ws,
+ prng_rosc, qdss, qdss_cti, qlink_request, qlink_enable,
+ qspi0, qspi1, qspi2, qspi3, qspi_clk, qspi_cs, qua_mi2s,
+ qup0, qup1, qup2, qup3, qup4, qup5, qup6, qup7, qup8,
+ qup9, qup10, qup11, qup12, qup13, qup14, qup15, qup16,
+ qup17, qup18, qup19, qup_l4, qup_l5, qup_l6, rgmii,
+ sdc4, sd_write, sec_mi2s, spkr_i2s, sp_cmu, ter_mi2s,
+ tgu_ch0, tgu_ch1, tgu_ch2, tgu_ch3, tsense_pwm1,
+ tsense_pwm2, tsif1, tsif2, uim1, uim2, uim_batt,
+ usb2phy_ac, usb_phy, vfr_1, vsense_trigger, wlan1_adc0,
+ wlan1_adc1, wlan2_adc0, wlan2_adc1, wmss_reset
+
+- bias-disable:
+ Usage: optional
+ Value type: <none>
+ Definition: The specified pins should be configued as no pull.
+
+- bias-pull-down:
+ Usage: optional
+ Value type: <none>
+ Definition: The specified pins should be configued as pull down.
+
+- bias-pull-up:
+ Usage: optional
+ Value type: <none>
+ Definition: The specified pins should be configued as pull up.
+
+- output-high:
+ Usage: optional
+ Value type: <none>
+ Definition: The specified pins are configured in output mode, driven
+ high.
+ Not valid for sdc pins.
+
+- output-low:
+ Usage: optional
+ Value type: <none>
+ Definition: The specified pins are configured in output mode, driven
+ low.
+ Not valid for sdc pins.
+
+- drive-strength:
+ Usage: optional
+ Value type: <u32>
+ Definition: Selects the drive strength for the specified pins, in mA.
+ Valid values are: 2, 4, 6, 8, 10, 12, 14 and 16
+
+Example:
+
+ tlmm: pinctrl@3000000 {
+ compatible = "qcom,sm8150-pinctrl";
+ reg = <0x03100000 0x300000>,
+ <0x03500000 0x300000>,
+ <0x03900000 0x300000>,
+ <0x03D00000 0x300000>;
+ reg-names = "west", "east", "north", "south";
+ interrupts = <GIC_SPI 208 IRQ_TYPE_LEVEL_HIGH>;
+ gpio-controller;
+ #gpio-cells = <2>;
+ gpio-ranges = <&tlmm 0 0 175>;
+ gpio-reserved-ranges = <0 4>, <126 4>;
+ interrupt-controller;
+ #interrupt-cells = <2>;
+ };
diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
deleted file mode 100644
index 00169255e48c..000000000000
--- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
+++ /dev/null
@@ -1,208 +0,0 @@
-* STM32 GPIO and Pin Mux/Config controller
-
-STMicroelectronics's STM32 MCUs intregrate a GPIO and Pin mux/config hardware
-controller. It controls the input/output settings on the available pins and
-also provides ability to multiplex and configure the output of various on-chip
-controllers onto these pads.
-
-Pin controller node:
-Required properies:
- - compatible: value should be one of the following:
- "st,stm32f429-pinctrl"
- "st,stm32f469-pinctrl"
- "st,stm32f746-pinctrl"
- "st,stm32f769-pinctrl"
- "st,stm32h743-pinctrl"
- "st,stm32mp157-pinctrl"
- "st,stm32mp157-z-pinctrl"
- - #address-cells: The value of this property must be 1
- - #size-cells : The value of this property must be 1
- - ranges : defines mapping between pin controller node (parent) to
- gpio-bank node (children).
- - pins-are-numbered: Specify the subnodes are using numbered pinmux to
- specify pins.
-
-GPIO controller/bank node:
-Required properties:
- - gpio-controller : Indicates this device is a GPIO controller
- - #gpio-cells : Should be two.
- The first cell is the pin number
- The second one is the polarity:
- - 0 for active high
- - 1 for active low
- - reg : The gpio address range, relative to the pinctrl range
- - clocks : clock that drives this bank
- - st,bank-name : Should be a name string for this bank as specified in
- the datasheet
-
-Optional properties:
- - reset: : Reference to the reset controller
- - st,syscfg: Should be phandle/offset/mask.
- -The phandle to the syscon node which includes IRQ mux selection register.
- -The offset of the IRQ mux selection register
- -The field mask of IRQ mux, needed if different of 0xf.
- - gpio-ranges: Define a dedicated mapping between a pin-controller and
- a gpio controller. Format is <&phandle a b c> with:
- -(phandle): phandle of pin-controller.
- -(a): gpio base offset in range.
- -(b): pin base offset in range.
- -(c): gpio count in range
- This entry has to be used either if there are holes inside a bank:
- GPIOB0/B1/B2/B14/B15 (see example 2)
- or if banks are not contiguous:
- GPIOA/B/C/E...
- NOTE: If "gpio-ranges" is used for a gpio controller, all gpio-controller
- have to use a "gpio-ranges" entry.
- More details in Documentation/devicetree/bindings/gpio/gpio.txt.
- - st,bank-ioport: should correspond to the EXTI IOport selection (EXTI line
- used to select GPIOs as interrupts).
- - hwlocks: reference to a phandle of a hardware spinlock provider node.
- - st,package: Indicates the SOC package used.
- More details in include/dt-bindings/pinctrl/stm32-pinfunc.h
-
-Example 1:
-#include <dt-bindings/pinctrl/stm32f429-pinfunc.h>
-...
-
- pin-controller {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "st,stm32f429-pinctrl";
- ranges = <0 0x40020000 0x3000>;
- pins-are-numbered;
-
- gpioa: gpio@40020000 {
- gpio-controller;
- #gpio-cells = <2>;
- reg = <0x0 0x400>;
- resets = <&reset_ahb1 0>;
- st,bank-name = "GPIOA";
- };
- ...
- pin-functions nodes follow...
- };
-
-Example 2:
-#include <dt-bindings/pinctrl/stm32f429-pinfunc.h>
-...
-
- pinctrl: pin-controller {
- #address-cells = <1>;
- #size-cells = <1>;
- compatible = "st,stm32f429-pinctrl";
- ranges = <0 0x40020000 0x3000>;
- pins-are-numbered;
-
- gpioa: gpio@40020000 {
- gpio-controller;
- #gpio-cells = <2>;
- reg = <0x0 0x400>;
- resets = <&reset_ahb1 0>;
- st,bank-name = "GPIOA";
- gpio-ranges = <&pinctrl 0 0 16>;
- };
-
- gpiob: gpio@40020400 {
- gpio-controller;
- #gpio-cells = <2>;
- reg = <0x0 0x400>;
- resets = <&reset_ahb1 0>;
- st,bank-name = "GPIOB";
- ngpios = 4;
- gpio-ranges = <&pinctrl 0 16 3>,
- <&pinctrl 14 30 2>;
- };
-
-
- ...
- pin-functions nodes follow...
- };
-
-
-Contents of function subnode node:
-----------------------------------
-Subnode format
-A pinctrl node should contain at least one subnode representing the
-pinctrl group available on the machine. Each subnode will list the
-pins it needs, and how they should be configured, with regard to muxer
-configuration, pullups, drive, output high/low and output speed.
-
- node {
- pinmux = <PIN_NUMBER_PINMUX>;
- GENERIC_PINCONFIG;
- };
-
-Required properties:
-- pinmux: integer array, represents gpio pin number and mux setting.
- Supported pin number and mux varies for different SoCs, and are defined in
- dt-bindings/pinctrl/<soc>-pinfunc.h directly.
- These defines are calculated as:
- ((port * 16 + line) << 8) | function
- With:
- - port: The gpio port index (PA = 0, PB = 1, ..., PK = 11)
- - line: The line offset within the port (PA0 = 0, PA1 = 1, ..., PA15 = 15)
- - function: The function number, can be:
- * 0 : GPIO
- * 1 : Alternate Function 0
- * 2 : Alternate Function 1
- * 3 : Alternate Function 2
- * ...
- * 16 : Alternate Function 15
- * 17 : Analog
-
- To simplify the usage, macro is available to generate "pinmux" field.
- This macro is available here:
- - include/dt-bindings/pinctrl/stm32-pinfunc.h
-
- Some examples of using macro:
- /* GPIO A9 set as alernate function 2 */
- ... {
- pinmux = <STM32_PINMUX('A', 9, AF2)>;
- };
- /* GPIO A9 set as GPIO */
- ... {
- pinmux = <STM32_PINMUX('A', 9, GPIO)>;
- };
- /* GPIO A9 set as analog */
- ... {
- pinmux = <STM32_PINMUX('A', 9, ANALOG)>;
- };
-
-Optional properties:
-- GENERIC_PINCONFIG: is the generic pinconfig options to use.
- Available options are:
- - bias-disable,
- - bias-pull-down,
- - bias-pull-up,
- - drive-push-pull,
- - drive-open-drain,
- - output-low
- - output-high
- - slew-rate = <x>, with x being:
- < 0 > : Low speed
- < 1 > : Medium speed
- < 2 > : Fast speed
- < 3 > : High speed
-
-Example:
-
-pin-controller {
-...
- usart1_pins_a: usart1@0 {
- pins1 {
- pinmux = <STM32_PINMUX('A', 9, AF7)>;
- bias-disable;
- drive-push-pull;
- slew-rate = <0>;
- };
- pins2 {
- pinmux = <STM32_PINMUX('A', 10, AF7)>;
- bias-disable;
- };
- };
-};
-
-&usart1 {
- pinctrl-0 = <&usart1_pins_a>;
- pinctrl-names = "default";
-};
diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
new file mode 100644
index 000000000000..91d3e78b3395
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
@@ -0,0 +1,271 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+# Copyright (C) STMicroelectronics 2019.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/st,stm32-pinctrl.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: STM32 GPIO and Pin Mux/Config controller
+
+maintainers:
+ - Alexandre TORGUE <alexandre.torgue@st.com>
+
+description: |
+ STMicroelectronics's STM32 MCUs intregrate a GPIO and Pin mux/config hardware
+ controller. It controls the input/output settings on the available pins and
+ also provides ability to multiplex and configure the output of various
+ on-chip controllers onto these pads.
+
+properties:
+ compatible:
+ enum:
+ - st,stm32f429-pinctrl
+ - st,stm32f469-pinctrl
+ - st,stm32f746-pinctrl
+ - st,stm32f769-pinctrl
+ - st,stm32h743-pinctrl
+ - st,stm32mp157-pinctrl
+ - st,stm32mp157-z-pinctrl
+
+ '#address-cells':
+ const: 1
+ '#size-cells':
+ const: 1
+
+ ranges: true
+ pins-are-numbered: true
+ hwlocks: true
+
+ st,syscfg:
+ $ref: "/schemas/types.yaml#/definitions/phandle-array"
+ description: Should be phandle/offset/mask
+ items:
+ - description: Phandle to the syscon node which includes IRQ mux selection.
+ - description: The offset of the IRQ mux selection register.
+ - description: The field mask of IRQ mux, needed if different of 0xf.
+
+ st,package:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - enum: [1, 2, 4, 8]
+ description:
+ Indicates the SOC package used.
+ More details in include/dt-bindings/pinctrl/stm32-pinfunc.h
+
+
+patternProperties:
+ '^gpio@[0-9a-f]*$':
+ type: object
+ properties:
+ gpio-controller: true
+ '#gpio-cells':
+ const: 2
+
+ reg:
+ maxItems: 1
+ clocks:
+ maxItems: 1
+ reset:
+ minItems: 1
+ maxItems: 1
+ gpio-ranges:
+ minItems: 1
+ maxItems: 16
+ ngpios:
+ description:
+ Number of available gpios in a bank.
+ minimum: 1
+ maximum: 16
+
+ st,bank-name:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/string"
+ - enum:
+ - GPIOA
+ - GPIOB
+ - GPIOC
+ - GPIOD
+ - GPIOE
+ - GPIOF
+ - GPIOG
+ - GPIOH
+ - GPIOI
+ - GPIOJ
+ - GPIOK
+ - GPIOZ
+ description:
+ Should be a name string for this bank as specified in the datasheet.
+
+ st,bank-ioport:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ - maximum: 11
+
+ description:
+ Should correspond to the EXTI IOport selection (EXTI line used
+ to select GPIOs as interrupts).
+
+ required:
+ - gpio-controller
+ - '#gpio-cells'
+ - reg
+ - clocks
+ - st,bank-name
+
+ '-[0-9]*$':
+ type: object
+ patternProperties:
+ '^pins':
+ type: object
+ description: |
+ A pinctrl node should contain at least one subnode representing the
+ pinctrl group available on the machine. Each subnode will list the
+ pins it needs, and how they should be configured, with regard to muxer
+ configuration, pullups, drive, output high/low and output speed.
+ properties:
+ pinmux:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32-array"
+ description: |
+ Integer array, represents gpio pin number and mux setting.
+ Supported pin number and mux varies for different SoCs, and are
+ defined in dt-bindings/pinctrl/<soc>-pinfunc.h directly.
+ These defines are calculated as: ((port * 16 + line) << 8) | function
+ With:
+ - port: The gpio port index (PA = 0, PB = 1, ..., PK = 11)
+ - line: The line offset within the port (PA0 = 0, PA1 = 1, ..., PA15 = 15)
+ - function: The function number, can be:
+ * 0 : GPIO
+ * 1 : Alternate Function 0
+ * 2 : Alternate Function 1
+ * 3 : Alternate Function 2
+ * ...
+ * 16 : Alternate Function 15
+ * 17 : Analog
+ To simplify the usage, macro is available to generate "pinmux" field.
+ This macro is available here:
+ - include/dt-bindings/pinctrl/stm32-pinfunc.h
+ Some examples of using macro:
+ /* GPIO A9 set as alernate function 2 */
+ ... {
+ pinmux = <STM32_PINMUX('A', 9, AF2)>;
+ };
+ /* GPIO A9 set as GPIO */
+ ... {
+ pinmux = <STM32_PINMUX('A', 9, GPIO)>;
+ };
+ /* GPIO A9 set as analog */
+ ... {
+ pinmux = <STM32_PINMUX('A', 9, ANALOG)>;
+ };
+
+ bias-disable:
+ type: boolean
+ bias-pull-down:
+ type: boolean
+ bias-pull-up:
+ type: boolean
+ drive-push-pull:
+ type: boolean
+ drive-open-drain:
+ type: boolean
+ output-low:
+ type: boolean
+ output-high:
+ type: boolean
+ slew-rate:
+ description: |
+ 0: Low speed
+ 1: Medium speed
+ 2: Fast speed
+ 3: High speed
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - enum: [0, 1, 2, 3]
+
+ required:
+ - pinmux
+
+required:
+ - compatible
+ - '#address-cells'
+ - '#size-cells'
+ - ranges
+ - pins-are-numbered
+
+examples:
+ - |
+ #include <dt-bindings/pinctrl/stm32-pinfunc.h>
+ #include <dt-bindings/mfd/stm32f4-rcc.h>
+ //Example 1
+ pinctrl@40020000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "st,stm32f429-pinctrl";
+ ranges = <0 0x40020000 0x3000>;
+ pins-are-numbered;
+
+ gpioa: gpio@0 {
+ gpio-controller;
+ #gpio-cells = <2>;
+ reg = <0x0 0x400>;
+ resets = <&reset_ahb1 0>;
+ clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOA)>;
+ st,bank-name = "GPIOA";
+ };
+ };
+
+ //Example 2 (using gpio-ranges)
+ pinctrl@50020000 {
+ #address-cells = <1>;
+ #size-cells = <1>;
+ compatible = "st,stm32f429-pinctrl";
+ ranges = <0 0x50020000 0x3000>;
+ pins-are-numbered;
+
+ gpiob: gpio@1000 {
+ gpio-controller;
+ #gpio-cells = <2>;
+ reg = <0x1000 0x400>;
+ resets = <&reset_ahb1 0>;
+ clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOB)>;
+ st,bank-name = "GPIOB";
+ gpio-ranges = <&pinctrl 0 0 16>;
+ };
+
+ gpioc: gpio@2000 {
+ gpio-controller;
+ #gpio-cells = <2>;
+ reg = <0x2000 0x400>;
+ resets = <&reset_ahb1 0>;
+ clocks = <&rcc 0 STM32F4_AHB1_CLOCK(GPIOC)>;
+ st,bank-name = "GPIOC";
+ ngpios = <5>;
+ gpio-ranges = <&pinctrl 0 16 3>,
+ <&pinctrl 14 30 2>;
+ };
+ };
+
+ //Example 3 pin groups
+ pinctrl@60020000 {
+ usart1_pins_a: usart1-0 {
+ pins1 {
+ pinmux = <STM32_PINMUX('A', 9, AF7)>;
+ bias-disable;
+ drive-push-pull;
+ slew-rate = <0>;
+ };
+ pins2 {
+ pinmux = <STM32_PINMUX('A', 10, AF7)>;
+ bias-disable;
+ };
+ };
+ };
+
+ usart1 {
+ pinctrl-0 = <&usart1_pins_a>;
+ pinctrl-names = "default";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
index 980e5413d18f..eb35b22f9e23 100644
--- a/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
+++ b/Documentation/devicetree/bindings/power/qcom,rpmpd.txt
@@ -6,6 +6,8 @@ which then translates it into a corresponding voltage on a rail
Required Properties:
- compatible: Should be one of the following
* qcom,msm8996-rpmpd: RPM Power domain for the msm8996 family of SoC
+ * qcom,msm8998-rpmpd: RPM Power domain for the msm8998 family of SoC
+ * qcom,qcs404-rpmpd: RPM Power domain for the qcs404 family of SoC
* qcom,sdm845-rpmhpd: RPMh Power domain for the sdm845 family of SoC
- #power-domain-cells: number of cells in Power domain specifier
must be 1.
diff --git a/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.txt b/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.txt
new file mode 100644
index 000000000000..752d6126d5da
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/reset/nvmem-reboot-mode.txt
@@ -0,0 +1,26 @@
+NVMEM reboot mode driver
+
+This driver gets reboot mode magic value from reboot-mode driver
+and stores it in a NVMEM cell named "reboot-mode". Then the bootloader
+can read it and take different action according to the magic
+value stored.
+
+Required properties:
+- compatible: should be "nvmem-reboot-mode".
+- nvmem-cells: A phandle to the reboot mode provided by a nvmem device.
+- nvmem-cell-names: Should be "reboot-mode".
+
+The rest of the properties should follow the generic reboot-mode description
+found in reboot-mode.txt
+
+Example:
+ reboot-mode {
+ compatible = "nvmem-reboot-mode";
+ nvmem-cells = <&reboot_mode>;
+ nvmem-cell-names = "reboot-mode";
+
+ mode-normal = <0xAAAA5501>;
+ mode-bootloader = <0xBBBB5500>;
+ mode-recovery = <0xCCCC5502>;
+ mode-test = <0xDDDD5503>;
+ };
diff --git a/Documentation/devicetree/bindings/power/reset/qcom,pon.txt b/Documentation/devicetree/bindings/power/reset/qcom,pon.txt
index 5705f575862d..0c0dc3a1e693 100644
--- a/Documentation/devicetree/bindings/power/reset/qcom,pon.txt
+++ b/Documentation/devicetree/bindings/power/reset/qcom,pon.txt
@@ -9,6 +9,7 @@ Required Properties:
-compatible: Must be one of:
"qcom,pm8916-pon"
"qcom,pms405-pon"
+ "qcom,pm8998-pon"
-reg: Specifies the physical address of the pon register
diff --git a/Documentation/devicetree/bindings/property-units.txt b/Documentation/devicetree/bindings/property-units.txt
index bfd33734faca..e9b8360b3288 100644
--- a/Documentation/devicetree/bindings/property-units.txt
+++ b/Documentation/devicetree/bindings/property-units.txt
@@ -12,32 +12,32 @@ unit prefixes.
Time/Frequency
----------------------------------------
-mhz : megahertz
--hz : Hertz (preferred)
--sec : seconds
--ms : milliseconds
--us : microseconds
--ns : nanoseconds
+-hz : hertz (preferred)
+-sec : second
+-ms : millisecond
+-us : microsecond
+-ns : nanosecond
Distance
----------------------------------------
--mm : millimeters
+-mm : millimeter
Electricity
----------------------------------------
--microamp : micro amps
--microamp-hours : micro amp-hours
--ohms : Ohms
--micro-ohms : micro Ohms
--microwatt-hours: micro Watt-hours
--microvolt : micro volts
--picofarads : picofarads
--femtofarads : femtofarads
+-microamp : microampere
+-microamp-hours : microampere hour
+-ohms : ohm
+-micro-ohms : microohm
+-microwatt-hours: microwatt hour
+-microvolt : microvolt
+-picofarads : picofarad
+-femtofarads : femtofarad
Temperature
----------------------------------------
--celsius : Degrees Celsius
--millicelsius : Degreee milli-Celsius
+-celsius : degree Celsius
+-millicelsius : millidegree Celsius
Pressure
----------------------------------------
--kpascal : kiloPascal
+-kpascal : kilopascal
diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
index 454c937076a2..d48f9eb3636e 100644
--- a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
+++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
@@ -4,6 +4,8 @@ General Properties:
- compatible Should be "fsl,etsec-ptp" for eTSEC
Should be "fsl,fman-ptp-timer" for DPAA FMan
+ Should be "fsl,dpaa2-ptp" for DPAA2
+ Should be "fsl,enetc-ptp" for ENETC
- reg Offset and length of the register set for the device
- interrupts There should be at least two interrupts. Some devices
have as many as four PTP related interrupts.
diff --git a/Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml b/Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml
new file mode 100644
index 000000000000..0ac52f83a58c
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/allwinner,sun4i-a10-pwm.yaml
@@ -0,0 +1,57 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pwm/allwinner,sun4i-a10-pwm.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 PWM Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#pwm-cells":
+ const: 3
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun4i-a10-pwm
+ - const: allwinner,sun5i-a10s-pwm
+ - const: allwinner,sun5i-a13-pwm
+ - const: allwinner,sun7i-a20-pwm
+ - const: allwinner,sun8i-h3-pwm
+ - items:
+ - const: allwinner,sun8i-a83t-pwm
+ - const: allwinner,sun8i-h3-pwm
+ - items:
+ - const: allwinner,sun50i-a64-pwm
+ - const: allwinner,sun5i-a13-pwm
+ - items:
+ - const: allwinner,sun50i-h5-pwm
+ - const: allwinner,sun5i-a13-pwm
+
+ reg:
+ maxItems: 1
+
+ clocks:
+ maxItems: 1
+
+required:
+ - "#pwm-cells"
+ - compatible
+ - reg
+ - clocks
+
+additionalProperties: false
+
+examples:
+ - |
+ pwm: pwm@1c20e00 {
+ compatible = "allwinner,sun7i-a20-pwm";
+ reg = <0x01c20e00 0xc>;
+ clocks = <&osc24M>;
+ #pwm-cells = <3>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt b/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt
index 7d9d3f90641b..493bec80d59b 100644
--- a/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt
+++ b/Documentation/devicetree/bindings/pwm/ingenic,jz47xx-pwm.txt
@@ -2,10 +2,7 @@ Ingenic JZ47xx PWM Controller
=============================
Required properties:
-- compatible: One of:
- * "ingenic,jz4740-pwm"
- * "ingenic,jz4770-pwm"
- * "ingenic,jz4780-pwm"
+- compatible: Should be "ingenic,jz4740-pwm"
- #pwm-cells: Should be 3. See pwm.txt in this directory for a description
of the cells format.
- clocks : phandle to the external clock.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-sifive.txt b/Documentation/devicetree/bindings/pwm/pwm-sifive.txt
new file mode 100644
index 000000000000..36447e3c9378
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/pwm-sifive.txt
@@ -0,0 +1,33 @@
+SiFive PWM controller
+
+Unlike most other PWM controllers, the SiFive PWM controller currently only
+supports one period for all channels in the PWM. All PWMs need to run at
+the same period. The period also has significant restrictions on the values
+it can achieve, which the driver rounds to the nearest achievable period.
+PWM RTL that corresponds to the IP block version numbers can be found
+here:
+
+https://github.com/sifive/sifive-blocks/tree/master/src/main/scala/devices/pwm
+
+Required properties:
+- compatible: Should be "sifive,<chip>-pwm" and "sifive,pwm<version>".
+ Supported compatible strings are: "sifive,fu540-c000-pwm" for the SiFive
+ PWM v0 as integrated onto the SiFive FU540 chip, and "sifive,pwm0" for the
+ SiFive PWM v0 IP block with no chip integration tweaks.
+ Please refer to sifive-blocks-ip-versioning.txt for details.
+- reg: physical base address and length of the controller's registers
+- clocks: Should contain a clock identifier for the PWM's parent clock.
+- #pwm-cells: Should be 3. See pwm.txt in this directory
+ for a description of the cell format.
+- interrupts: one interrupt per PWM channel
+
+Examples:
+
+pwm: pwm@10020000 {
+ compatible = "sifive,fu540-c000-pwm", "sifive,pwm0";
+ reg = <0x0 0x10020000 0x0 0x1000>;
+ clocks = <&tlclk>;
+ interrupt-parent = <&plic>;
+ interrupts = <42 43 44 45>;
+ #pwm-cells = <3>;
+};
diff --git a/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt b/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt
index bd23302e84be..6521bc44a74e 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-stm32-lp.txt
@@ -11,8 +11,10 @@ Required parameters:
bindings defined in pwm.txt.
Optional properties:
-- pinctrl-names: Set to "default".
-- pinctrl-0: Phandle pointing to pin configuration node for PWM.
+- pinctrl-names: Set to "default". An additional "sleep" state can be
+ defined to set pins in sleep state when in low power.
+- pinctrl-n: Phandle(s) pointing to pin configuration node for PWM,
+ respectively for "default" and "sleep" states.
Example:
timer@40002400 {
@@ -21,7 +23,8 @@ Example:
pwm {
compatible = "st,stm32-pwm-lp";
#pwm-cells = <3>;
- pinctrl-names = "default";
+ pinctrl-names = "default", "sleep";
pinctrl-0 = <&lppwm1_pins>;
+ pinctrl-1 = <&lppwm1_sleep_pins>;
};
};
diff --git a/Documentation/devicetree/bindings/pwm/pwm-stm32.txt b/Documentation/devicetree/bindings/pwm/pwm-stm32.txt
index 3e6d55018d7a..a8690bfa5e1f 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-stm32.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-stm32.txt
@@ -8,6 +8,8 @@ Required parameters:
- pinctrl-names: Set to "default".
- pinctrl-0: List of phandles pointing to pin configuration nodes for PWM module.
For Pinctrl properties see ../pinctrl/pinctrl-bindings.txt
+- #pwm-cells: Should be set to 3. This PWM chip uses the default 3 cells
+ bindings defined in pwm.txt.
Optional parameters:
- st,breakinput: One or two <index level filter> to describe break input configurations.
@@ -28,6 +30,7 @@ Example:
pwm {
compatible = "st,stm32-pwm";
+ #pwm-cells = <3>;
pinctrl-0 = <&pwm1_pins>;
pinctrl-names = "default";
st,breakinput = <0 1 5>;
diff --git a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt b/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
deleted file mode 100644
index 2a1affbff45e..000000000000
--- a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-Allwinner sun4i and sun7i SoC PWM controller
-
-Required properties:
- - compatible: should be one of:
- - "allwinner,sun4i-a10-pwm"
- - "allwinner,sun5i-a10s-pwm"
- - "allwinner,sun5i-a13-pwm"
- - "allwinner,sun7i-a20-pwm"
- - "allwinner,sun8i-h3-pwm"
- - "allwinner,sun50i-a64-pwm", "allwinner,sun5i-a13-pwm"
- - "allwinner,sun50i-h5-pwm", "allwinner,sun5i-a13-pwm"
- - reg: physical base address and length of the controller's registers
- - #pwm-cells: should be 3. See pwm.txt in this directory for a description of
- the cells format.
- - clocks: From common clock binding, handle to the parent clock.
-
-Example:
-
- pwm: pwm@1c20e00 {
- compatible = "allwinner,sun7i-a20-pwm";
- reg = <0x01c20e00 0xc>;
- clocks = <&osc24M>;
- #pwm-cells = <3>;
- };
diff --git a/Documentation/devicetree/bindings/regulator/arizona-regulator.txt b/Documentation/devicetree/bindings/regulator/arizona-regulator.txt
index 443564d7784f..69bf41949b01 100644
--- a/Documentation/devicetree/bindings/regulator/arizona-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/arizona-regulator.txt
@@ -5,7 +5,8 @@ of analogue I/O.
This document lists regulator specific bindings, see the primary binding
document:
- ../mfd/arizona.txt
+ For Wolfson Microelectronic Arizona codecs: ../mfd/arizona.txt
+ For Cirrus Logic Madera codecs: ../mfd/madera.txt
Optional properties:
- wlf,ldoena : GPIO specifier for the GPIO controlling LDOENA
diff --git a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml
index d289c2f7455a..a650b457085d 100644
--- a/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml
+++ b/Documentation/devicetree/bindings/regulator/fixed-regulator.yaml
@@ -12,10 +12,13 @@ maintainers:
description:
Any property defined as part of the core regulator binding, defined in
- regulator.txt, can also be used. However a fixed voltage regulator is
+ regulator.yaml, can also be used. However a fixed voltage regulator is
expected to have the regulator-min-microvolt and regulator-max-microvolt
to be the same.
+allOf:
+ - $ref: "regulator.yaml#"
+
properties:
compatible:
const: regulator-fixed
diff --git a/Documentation/devicetree/bindings/regulator/gpio-regulator.txt b/Documentation/devicetree/bindings/regulator/gpio-regulator.txt
deleted file mode 100644
index dd25e73b5d79..000000000000
--- a/Documentation/devicetree/bindings/regulator/gpio-regulator.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-GPIO controlled regulators
-
-Required properties:
-- compatible : Must be "regulator-gpio".
-- regulator-name : Defined in regulator.txt as optional, but required
- here.
-- gpios : Array of one or more GPIO pins used to select the
- regulator voltage/current listed in "states".
-- states : Selection of available voltages/currents provided by
- this regulator and matching GPIO configurations to
- achieve them. If there are no states in the "states"
- array, use a fixed regulator instead.
-
-Optional properties:
-- enable-gpios : GPIO used to enable/disable the regulator.
- Warning, the GPIO phandle flags are ignored and the
- GPIO polarity is controlled solely by the presence
- of "enable-active-high" DT property. This is due to
- compatibility with old DTs.
-- enable-active-high : Polarity of "enable-gpio" GPIO is active HIGH.
- Default is active LOW.
-- gpios-states : On operating systems, that don't support reading back
- gpio values in output mode (most notably linux), this
- array provides the state of GPIO pins set when
- requesting them from the gpio controller. Systems,
- that are capable of preserving state when requesting
- the lines, are free to ignore this property.
- 0: LOW, 1: HIGH. Default is LOW if nothing else
- is specified.
-- startup-delay-us : Startup time in microseconds.
-- regulator-type : Specifies what is being regulated, must be either
- "voltage" or "current", defaults to voltage.
-
-Any property defined as part of the core regulator binding defined in
-regulator.txt can also be used.
-
-Example:
-
- mmciv: gpio-regulator {
- compatible = "regulator-gpio";
-
- regulator-name = "mmci-gpio-supply";
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <2600000>;
- regulator-boot-on;
-
- enable-gpios = <&gpio0 23 0x4>;
- gpios = <&gpio0 24 0x4
- &gpio0 25 0x4>;
- states = <1800000 0x3
- 2200000 0x2
- 2600000 0x1
- 2900000 0x0>;
-
- startup-delay-us = <100000>;
- enable-active-high;
- };
diff --git a/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml b/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml
new file mode 100644
index 000000000000..9d3b28417fb6
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/gpio-regulator.yaml
@@ -0,0 +1,118 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/gpio-regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: GPIO controlled regulators
+
+maintainers:
+ - Liam Girdwood <lgirdwood@gmail.com>
+ - Mark Brown <broonie@kernel.org>
+
+description:
+ Any property defined as part of the core regulator binding, defined in
+ regulator.txt, can also be used.
+
+allOf:
+ - $ref: "regulator.yaml#"
+
+properties:
+ compatible:
+ const: regulator-gpio
+
+ regulator-name: true
+
+ enable-gpios:
+ description: GPIO to use to enable/disable the regulator.
+ Warning, the GPIO phandle flags are ignored and the GPIO polarity is
+ controlled solely by the presence of "enable-active-high" DT property.
+ This is due to compatibility with old DTs.
+ maxItems: 1
+
+ gpios:
+ description: Array of one or more GPIO pins used to select the regulator
+ voltage/current listed in "states".
+ minItems: 1
+ maxItems: 8 # Should be enough...
+
+ gpios-states:
+ description: |
+ On operating systems, that don't support reading back gpio values in
+ output mode (most notably linux), this array provides the state of GPIO
+ pins set when requesting them from the gpio controller. Systems, that are
+ capable of preserving state when requesting the lines, are free to ignore
+ this property.
+ 0: LOW
+ 1: HIGH
+ Default is LOW if nothing else is specified.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32-array
+ - maxItems: 8
+ items:
+ enum: [ 0, 1 ]
+ default: 0
+
+ states:
+ description: Selection of available voltages/currents provided by this
+ regulator and matching GPIO configurations to achieve them. If there are
+ no states in the "states" array, use a fixed regulator instead.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32-matrix
+ - maxItems: 8
+ items:
+ items:
+ - description: Voltage in microvolts
+ - description: GPIO group state value
+
+ startup-delay-us:
+ description: startup time in microseconds
+
+ enable-active-high:
+ description: Polarity of "enable-gpio" GPIO is active HIGH. Default is
+ active LOW.
+ type: boolean
+
+ gpio-open-drain:
+ description:
+ GPIO is open drain type. If this property is missing then default
+ assumption is false.
+ type: boolean
+
+ regulator-type:
+ description: Specifies what is being regulated.
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/string
+ - enum:
+ - voltage
+ - current
+ default: voltage
+
+required:
+ - compatible
+ - regulator-name
+ - gpios
+ - states
+
+examples:
+ - |
+ gpio-regulator {
+ compatible = "regulator-gpio";
+
+ regulator-name = "mmci-gpio-supply";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <2600000>;
+ regulator-boot-on;
+
+ enable-gpios = <&gpio0 23 0x4>;
+ gpios = <&gpio0 24 0x4
+ &gpio0 25 0x4>;
+ states = <1800000 0x3>,
+ <2200000 0x2>,
+ <2600000 0x1>,
+ <2900000 0x0>;
+
+ startup-delay-us = <100000>;
+ enable-active-high;
+ };
+...
diff --git a/Documentation/devicetree/bindings/regulator/max8660.txt b/Documentation/devicetree/bindings/regulator/max8660.txt
deleted file mode 100644
index 8ba994d8a142..000000000000
--- a/Documentation/devicetree/bindings/regulator/max8660.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-Maxim MAX8660 voltage regulator
-
-Required properties:
-- compatible: must be one of "maxim,max8660", "maxim,max8661"
-- reg: I2C slave address, usually 0x34
-- any required generic properties defined in regulator.txt
-
-Example:
-
- i2c_master {
- max8660@34 {
- compatible = "maxim,max8660";
- reg = <0x34>;
-
- regulators {
- regulator@0 {
- regulator-compatible= "V3(DCDC)";
- regulator-min-microvolt = <725000>;
- regulator-max-microvolt = <1800000>;
- };
-
- regulator@1 {
- regulator-compatible= "V4(DCDC)";
- regulator-min-microvolt = <725000>;
- regulator-max-microvolt = <1800000>;
- };
-
- regulator@2 {
- regulator-compatible= "V5(LDO)";
- regulator-min-microvolt = <1700000>;
- regulator-max-microvolt = <2000000>;
- };
-
- regulator@3 {
- regulator-compatible= "V6(LDO)";
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <3300000>;
- };
-
- regulator@4 {
- regulator-compatible= "V7(LDO)";
- regulator-min-microvolt = <1800000>;
- regulator-max-microvolt = <3300000>;
- };
- };
- };
- };
diff --git a/Documentation/devicetree/bindings/regulator/max8660.yaml b/Documentation/devicetree/bindings/regulator/max8660.yaml
new file mode 100644
index 000000000000..9c038698f880
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/max8660.yaml
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/max8660.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Maxim MAX8660 voltage regulator
+
+maintainers:
+ - Daniel Mack <zonque@gmail.com>
+
+properties:
+ $nodename:
+ pattern: "pmic@[0-9a-f]{1,2}"
+ compatible:
+ enum:
+ - maxim,max8660
+ - maxim,max8661
+
+ reg:
+ maxItems: 1
+
+ regulators:
+ type: object
+
+ patternProperties:
+ "regulator-.+":
+ $ref: "regulator.yaml#"
+
+ additionalProperties: false
+
+additionalProperties: false
+
+examples:
+ - |
+ i2c {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ pmic@34 {
+ compatible = "maxim,max8660";
+ reg = <0x34>;
+
+ regulators {
+ regulator-V3 {
+ regulator-compatible= "V3(DCDC)";
+ regulator-min-microvolt = <725000>;
+ regulator-max-microvolt = <1800000>;
+ };
+
+ regulator-V4 {
+ regulator-compatible= "V4(DCDC)";
+ regulator-min-microvolt = <725000>;
+ regulator-max-microvolt = <1800000>;
+ };
+
+ regulator-V5 {
+ regulator-compatible= "V5(LDO)";
+ regulator-min-microvolt = <1700000>;
+ regulator-max-microvolt = <2000000>;
+ };
+
+ regulator-V6 {
+ regulator-compatible= "V6(LDO)";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <3300000>;
+ };
+
+ regulator-V7 {
+ regulator-compatible= "V7(LDO)";
+ regulator-min-microvolt = <1800000>;
+ regulator-max-microvolt = <3300000>;
+ };
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/regulator/pv88060.txt b/Documentation/devicetree/bindings/regulator/pv88060.txt
index 10a6dadc008e..6a7c8a92fdb0 100644
--- a/Documentation/devicetree/bindings/regulator/pv88060.txt
+++ b/Documentation/devicetree/bindings/regulator/pv88060.txt
@@ -121,4 +121,4 @@ Example
regulator-max-microvolt = <5000000>;
};
};
- }; \ No newline at end of file
+ };
diff --git a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt
index 406f2e570c50..430b8622bda1 100644
--- a/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/qcom,spmi-regulator.txt
@@ -4,11 +4,13 @@ Qualcomm SPMI Regulators
Usage: required
Value type: <string>
Definition: must be one of:
+ "qcom,pm8005-regulators"
"qcom,pm8841-regulators"
"qcom,pm8916-regulators"
"qcom,pm8941-regulators"
"qcom,pm8994-regulators"
"qcom,pmi8994-regulators"
+ "qcom,pms405-regulators"
- interrupts:
Usage: optional
@@ -110,6 +112,23 @@ Qualcomm SPMI Regulators
Definition: Reference to regulator supplying the input pin, as
described in the data sheet.
+- vdd_l1_l2-supply:
+- vdd_l3_l8-supply:
+- vdd_l4-supply:
+- vdd_l5_l6-supply:
+- vdd_l10_l11_l12_l13-supply:
+- vdd_l7-supply:
+- vdd_l9-supply:
+- vdd_s1-supply:
+- vdd_s2-supply:
+- vdd_s3-supply:
+- vdd_s4-supply:
+- vdd_s5-supply
+ Usage: optional (pms405 only)
+ Value type: <phandle>
+ Definition: Reference to regulator supplying the input pin, as
+ described in the data sheet.
+
- qcom,saw-reg:
Usage: optional
Value type: <phandle>
@@ -120,6 +139,9 @@ The regulator node houses sub-nodes for each regulator within the device. Each
sub-node is identified using the node's name, with valid values listed for each
of the PMICs below.
+pm8005:
+ s1, s2, s3, s4
+
pm8841:
s1, s2, s3, s4, s5, s6, s7, s8
diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index 0a3f087d5844..487ccd8370b3 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -1,139 +1 @@
-Voltage/Current Regulators
-
-Optional properties:
-- regulator-name: A string used as a descriptive name for regulator outputs
-- regulator-min-microvolt: smallest voltage consumers may set
-- regulator-max-microvolt: largest voltage consumers may set
-- regulator-microvolt-offset: Offset applied to voltages to compensate for voltage drops
-- regulator-min-microamp: smallest current consumers may set
-- regulator-max-microamp: largest current consumers may set
-- regulator-input-current-limit-microamp: maximum input current regulator allows
-- regulator-always-on: boolean, regulator should never be disabled
-- regulator-boot-on: bootloader/firmware enabled regulator
-- regulator-allow-bypass: allow the regulator to go into bypass mode
-- regulator-allow-set-load: allow the regulator performance level to be configured
-- <name>-supply: phandle to the parent supply/regulator node
-- regulator-ramp-delay: ramp delay for regulator(in uV/us)
- For hardware which supports disabling ramp rate, it should be explicitly
- initialised to zero (regulator-ramp-delay = <0>) for disabling ramp delay.
-- regulator-enable-ramp-delay: The time taken, in microseconds, for the supply
- rail to reach the target voltage, plus/minus whatever tolerance the board
- design requires. This property describes the total system ramp time
- required due to the combination of internal ramping of the regulator itself,
- and board design issues such as trace capacitance and load on the supply.
-- regulator-settling-time-us: Settling time, in microseconds, for voltage
- change if regulator have the constant time for any level voltage change.
- This is useful when regulator have exponential voltage change.
-- regulator-settling-time-up-us: Settling time, in microseconds, for voltage
- increase if the regulator needs a constant time to settle after voltage
- increases of any level. This is useful for regulators with exponential
- voltage changes.
-- regulator-settling-time-down-us: Settling time, in microseconds, for voltage
- decrease if the regulator needs a constant time to settle after voltage
- decreases of any level. This is useful for regulators with exponential
- voltage changes.
-- regulator-soft-start: Enable soft start so that voltage ramps slowly
-- regulator-state-standby sub-root node for Standby mode
- : equivalent with standby Linux sleep state, which provides energy savings
- with a relatively quick transition back time.
-- regulator-state-mem sub-root node for Suspend-to-RAM mode
- : suspend to memory, the device goes to sleep, but all data stored in memory,
- only some external interrupt can wake the device.
-- regulator-state-disk sub-root node for Suspend-to-DISK mode
- : suspend to disk, this state operates similarly to Suspend-to-RAM,
- but includes a final step of writing memory contents to disk.
-- regulator-state-[mem/disk/standby] node has following common properties:
- - regulator-on-in-suspend: regulator should be on in suspend state.
- - regulator-off-in-suspend: regulator should be off in suspend state.
- - regulator-suspend-min-microvolt: minimum voltage may be set in
- suspend state.
- - regulator-suspend-max-microvolt: maximum voltage may be set in
- suspend state.
- - regulator-suspend-microvolt: the default voltage which regulator
- would be set in suspend. This property is now deprecated, instead
- setting voltage for suspend mode via the API which regulator
- driver provides is recommended.
- - regulator-changeable-in-suspend: whether the default voltage and
- the regulator on/off in suspend can be changed in runtime.
- - regulator-mode: operating mode in the given suspend state.
- The set of possible operating modes depends on the capabilities of
- every hardware so the valid modes are documented on each regulator
- device tree binding document.
-- regulator-initial-mode: initial operating mode. The set of possible operating
- modes depends on the capabilities of every hardware so each device binding
- documentation explains which values the regulator supports.
-- regulator-allowed-modes: list of operating modes that software is allowed to
- configure for the regulator at run-time. Elements may be specified in any
- order. The set of possible operating modes depends on the capabilities of
- every hardware so each device binding document explains which values the
- regulator supports.
-- regulator-system-load: Load in uA present on regulator that is not captured by
- any consumer request.
-- regulator-pull-down: Enable pull down resistor when the regulator is disabled.
-- regulator-over-current-protection: Enable over current protection.
-- regulator-active-discharge: tristate, enable/disable active discharge of
- regulators. The values are:
- 0: Disable active discharge.
- 1: Enable active discharge.
- Absence of this property will leave configuration to default.
-- regulator-coupled-with: Regulators with which the regulator
- is coupled. The linkage is 2-way - all coupled regulators should be linked
- with each other. A regulator should not be coupled with its supplier.
-- regulator-coupled-max-spread: Array of maximum spread between voltages of
- coupled regulators in microvolts, each value in the array relates to the
- corresponding couple specified by the regulator-coupled-with property.
-- regulator-max-step-microvolt: Maximum difference between current and target
- voltages that can be changed safely in a single step.
-
-Deprecated properties:
-- regulator-compatible: If a regulator chip contains multiple
- regulators, and if the chip's binding contains a child node that
- describes each regulator, then this property indicates which regulator
- this child node is intended to configure. If this property is missing,
- the node's name will be used instead.
-
-Example:
-
- xyzreg: regulator@0 {
- regulator-min-microvolt = <1000000>;
- regulator-max-microvolt = <2500000>;
- regulator-always-on;
- vin-supply = <&vin>;
-
- regulator-state-mem {
- regulator-on-in-suspend;
- };
- };
-
-Regulator Consumers:
-Consumer nodes can reference one or more of its supplies/
-regulators using the below bindings.
-
-- <name>-supply: phandle to the regulator node
-
-These are the same bindings that a regulator in the above
-example used to reference its own supply, in which case
-its just seen as a special case of a regulator being a
-consumer itself.
-
-Example of a consumer device node (mmc) referencing two
-regulators (twl_reg1 and twl_reg2),
-
- twl_reg1: regulator@0 {
- ...
- ...
- ...
- };
-
- twl_reg2: regulator@1 {
- ...
- ...
- ...
- };
-
- mmc: mmc@0 {
- ...
- ...
- vmmc-supply = <&twl_reg1>;
- vmmcaux-supply = <&twl_reg2>;
- };
+This file has moved to regulator.yaml.
diff --git a/Documentation/devicetree/bindings/regulator/regulator.yaml b/Documentation/devicetree/bindings/regulator/regulator.yaml
new file mode 100644
index 000000000000..02c3043ce419
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/regulator.yaml
@@ -0,0 +1,200 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/regulator/regulator.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Voltage/Current Regulators
+
+maintainers:
+ - Liam Girdwood <lgirdwood@gmail.com>
+ - Mark Brown <broonie@kernel.org>
+
+properties:
+ regulator-name:
+ description: A string used as a descriptive name for regulator outputs
+ $ref: "/schemas/types.yaml#/definitions/string"
+
+ regulator-min-microvolt:
+ description: smallest voltage consumers may set
+
+ regulator-max-microvolt:
+ description: largest voltage consumers may set
+
+ regulator-microvolt-offset:
+ description: Offset applied to voltages to compensate for voltage drops
+
+ regulator-min-microamp:
+ description: smallest current consumers may set
+
+ regulator-max-microamp:
+ description: largest current consumers may set
+
+ regulator-input-current-limit-microamp:
+ description: maximum input current regulator allows
+
+ regulator-always-on:
+ description: boolean, regulator should never be disabled
+ type: boolean
+
+ regulator-boot-on:
+ description: bootloader/firmware enabled regulator
+ type: boolean
+
+ regulator-allow-bypass:
+ description: allow the regulator to go into bypass mode
+ type: boolean
+
+ regulator-allow-set-load:
+ description: allow the regulator performance level to be configured
+ type: boolean
+
+ regulator-ramp-delay:
+ description: ramp delay for regulator(in uV/us) For hardware which supports
+ disabling ramp rate, it should be explicitly initialised to zero (regulator-ramp-delay
+ = <0>) for disabling ramp delay.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ regulator-enable-ramp-delay:
+ description: The time taken, in microseconds, for the supply rail to
+ reach the target voltage, plus/minus whatever tolerance the board
+ design requires. This property describes the total system ramp time
+ required due to the combination of internal ramping of the regulator
+ itself, and board design issues such as trace capacitance and load
+ on the supply.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ regulator-settling-time-us:
+ description: Settling time, in microseconds, for voltage change if regulator
+ have the constant time for any level voltage change. This is useful
+ when regulator have exponential voltage change.
+
+ regulator-settling-time-up-us:
+ description: Settling time, in microseconds, for voltage increase if
+ the regulator needs a constant time to settle after voltage increases
+ of any level. This is useful for regulators with exponential voltage
+ changes.
+
+ regulator-settling-time-down-us:
+ description: Settling time, in microseconds, for voltage decrease if
+ the regulator needs a constant time to settle after voltage decreases
+ of any level. This is useful for regulators with exponential voltage
+ changes.
+
+ regulator-soft-start:
+ description: Enable soft start so that voltage ramps slowly
+ type: boolean
+
+ regulator-initial-mode:
+ description: initial operating mode. The set of possible operating modes
+ depends on the capabilities of every hardware so each device binding
+ documentation explains which values the regulator supports.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ regulator-allowed-modes:
+ description: list of operating modes that software is allowed to configure
+ for the regulator at run-time. Elements may be specified in any order.
+ The set of possible operating modes depends on the capabilities of
+ every hardware so each device binding document explains which values
+ the regulator supports.
+ $ref: "/schemas/types.yaml#/definitions/uint32-array"
+
+ regulator-system-load:
+ description: Load in uA present on regulator that is not captured by
+ any consumer request.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ regulator-pull-down:
+ description: Enable pull down resistor when the regulator is disabled.
+ type: boolean
+
+ regulator-over-current-protection:
+ description: Enable over current protection.
+ type: boolean
+
+ regulator-active-discharge:
+ description: |
+ tristate, enable/disable active discharge of regulators. The values are:
+ 0: Disable active discharge.
+ 1: Enable active discharge.
+ Absence of this property will leave configuration to default.
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - enum: [ 0, 1 ]
+
+ regulator-coupled-with:
+ description: Regulators with which the regulator is coupled. The linkage
+ is 2-way - all coupled regulators should be linked with each other.
+ A regulator should not be coupled with its supplier.
+ $ref: "/schemas/types.yaml#/definitions/phandle-array"
+
+ regulator-coupled-max-spread:
+ description: Array of maximum spread between voltages of coupled regulators
+ in microvolts, each value in the array relates to the corresponding
+ couple specified by the regulator-coupled-with property.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ regulator-max-step-microvolt:
+ description: Maximum difference between current and target voltages
+ that can be changed safely in a single step.
+
+patternProperties:
+ ".*-supply$":
+ description: Input supply phandle(s) for this node
+
+ regulator-state-(standby|mem|disk):
+ type: object
+ description:
+ sub-nodes for regulator state in Standby, Suspend-to-RAM, and
+ Suspend-to-DISK modes. Equivalent with standby, mem, and disk Linux
+ sleep states.
+
+ properties:
+ regulator-on-in-suspend:
+ description: regulator should be on in suspend state.
+ type: boolean
+
+ regulator-off-in-suspend:
+ description: regulator should be off in suspend state.
+ type: boolean
+
+ regulator-suspend-min-microvolt:
+ description: minimum voltage may be set in suspend state.
+
+ regulator-suspend-max-microvolt:
+ description: maximum voltage may be set in suspend state.
+
+ regulator-suspend-microvolt:
+ description: the default voltage which regulator would be set in
+ suspend. This property is now deprecated, instead setting voltage
+ for suspend mode via the API which regulator driver provides is
+ recommended.
+
+ regulator-changeable-in-suspend:
+ description: whether the default voltage and the regulator on/off
+ in suspend can be changed in runtime.
+ type: boolean
+
+ regulator-mode:
+ description: operating mode in the given suspend state. The set
+ of possible operating modes depends on the capabilities of every
+ hardware so the valid modes are documented on each regulator device
+ tree binding document.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ additionalProperties: false
+
+examples:
+ - |
+ xyzreg: regulator@0 {
+ regulator-min-microvolt = <1000000>;
+ regulator-max-microvolt = <2500000>;
+ regulator-always-on;
+ vin-supply = <&vin>;
+
+ regulator-state-mem {
+ regulator-on-in-suspend;
+ };
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/regulator/slg51000.txt b/Documentation/devicetree/bindings/regulator/slg51000.txt
new file mode 100644
index 000000000000..aa0733e49b90
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/slg51000.txt
@@ -0,0 +1,88 @@
+* Dialog Semiconductor SLG51000 Voltage Regulator
+
+Required properties:
+- compatible : Should be "dlg,slg51000" for SLG51000
+- reg : Specifies the I2C slave address.
+- xxx-supply: Input voltage supply regulator for ldo3 to ldo7.
+ These entries are required if regulators are enabled for a device.
+ An absence of these properties can cause the regulator registration to fail.
+ If some of input supply is powered through battery or always-on supply then
+ also it is required to have these parameters with proper node handle of always
+ on power supply.
+ vin3-supply: Input supply for ldo3
+ vin4-supply: Input supply for ldo4
+ vin5-supply: Input supply for ldo5
+ vin6-supply: Input supply for ldo6
+ vin7-supply: Input supply for ldo7
+
+Optional properties:
+- interrupt-parent : Specifies the reference to the interrupt controller.
+- interrupts : IRQ line information.
+- dlg,cs-gpios : Specify a valid GPIO for chip select
+
+Sub-nodes:
+- regulators : This node defines the settings for the regulators.
+ The content of the sub-node is defined by the standard binding
+ for regulators; see regulator.txt.
+
+ The SLG51000 regulators are bound using their names listed below:
+ ldo1
+ ldo2
+ ldo3
+ ldo4
+ ldo5
+ ldo6
+ ldo7
+
+Optional properties for regulators:
+- enable-gpios : Specify a valid GPIO for platform control of the regulator.
+
+Example:
+ pmic: slg51000@75 {
+ compatible = "dlg,slg51000";
+ reg = <0x75>;
+
+ regulators {
+ ldo1 {
+ regulator-name = "ldo1";
+ regulator-min-microvolt = <2400000>;
+ regulator-max-microvolt = <3300000>;
+ };
+
+ ldo2 {
+ regulator-name = "ldo2";
+ regulator-min-microvolt = <2400000>;
+ regulator-max-microvolt = <3300000>;
+ };
+
+ ldo3 {
+ regulator-name = "ldo3";
+ regulator-min-microvolt = <1200000>;
+ regulator-max-microvolt = <3750000>;
+ };
+
+ ldo4 {
+ regulator-name = "ldo4";
+ regulator-min-microvolt = <1200000>;
+ regulator-max-microvolt = <3750000>;
+ };
+
+ ldo5 {
+ regulator-name = "ldo5";
+ regulator-min-microvolt = <500000>;
+ regulator-max-microvolt = <1200000>;
+ };
+
+ ldo6 {
+ regulator-name = "ldo6";
+ regulator-min-microvolt = <500000>;
+ regulator-max-microvolt = <1200000>;
+ };
+
+ ldo7 {
+ regulator-name = "ldo7";
+ regulator-min-microvolt = <1200000>;
+ regulator-max-microvolt = <3750000>;
+ };
+ };
+ };
diff --git a/Documentation/devicetree/bindings/regulator/st,stm32-booster.txt b/Documentation/devicetree/bindings/regulator/st,stm32-booster.txt
new file mode 100644
index 000000000000..479ad4c8758e
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/st,stm32-booster.txt
@@ -0,0 +1,18 @@
+STM32 BOOSTER - Booster for ADC analog input switches
+
+Some STM32 devices embed a 3.3V booster supplied by Vdda, that can be used
+to supply ADC analog input switches.
+
+Required properties:
+- compatible: Should be one of:
+ "st,stm32h7-booster"
+ "st,stm32mp1-booster"
+- st,syscfg: Phandle to system configuration controller.
+- vdda-supply: Phandle to the vdda input analog voltage.
+
+Example:
+ booster: regulator-booster {
+ compatible = "st,stm32mp1-booster";
+ st,syscfg = <&syscfg>;
+ vdda-supply = <&vdda>;
+ };
diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt b/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt
deleted file mode 100644
index 66af2c30944f..000000000000
--- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp-pil.txt
+++ /dev/null
@@ -1,125 +0,0 @@
-Qualcomm Technology Inc. ADSP Peripheral Image Loader
-
-This document defines the binding for a component that loads and boots firmware
-on the Qualcomm Technology Inc. ADSP Hexagon core.
-
-- compatible:
- Usage: required
- Value type: <string>
- Definition: must be one of:
- "qcom,sdm845-adsp-pil"
-
-- reg:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: must specify the base address and size of the qdsp6ss register
-
-- interrupts-extended:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: must list the watchdog, fatal IRQs ready, handover and
- stop-ack IRQs
-
-- interrupt-names:
- Usage: required
- Value type: <stringlist>
- Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack"
-
-- clocks:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: List of 8 phandle and clock specifier pairs for the adsp.
-
-- clock-names:
- Usage: required
- Value type: <stringlist>
- Definition: List of clock input name strings sorted in the same
- order as the clocks property. Definition must have
- "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
- "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep"
- and "qdsp6ss_core".
-
-- power-domains:
- Usage: required
- Value type: <phandle>
- Definition: reference to cx power domain node.
-
-- resets:
- Usage: required
- Value type: <phandle>
- Definition: reference to the list of 2 reset-controller for the adsp.
-
-- reset-names:
- Usage: required
- Value type: <stringlist>
- Definition: must be "pdc_sync" and "cc_lpass"
-
-- qcom,halt-regs:
- Usage: required
- Value type: <prop-encoded-array>
- Definition: a phandle reference to a syscon representing TCSR followed
- by the offset within syscon for lpass halt register.
-
-- memory-region:
- Usage: required
- Value type: <phandle>
- Definition: reference to the reserved-memory for the ADSP
-
-- qcom,smem-states:
- Usage: required
- Value type: <phandle>
- Definition: reference to the smem state for requesting the ADSP to
- shut down
-
-- qcom,smem-state-names:
- Usage: required
- Value type: <stringlist>
- Definition: must be "stop"
-
-
-= SUBNODES
-The adsp node may have an subnode named "glink-edge" that describes the
-communication edge, channels and devices related to the ADSP.
-See ../soc/qcom/qcom,glink.txt for details on how to describe these.
-
-= EXAMPLE
-The following example describes the resources needed to boot control the
-ADSP, as it is found on SDM845 boards.
-
- remoteproc@17300000 {
- compatible = "qcom,sdm845-adsp-pil";
- reg = <0x17300000 0x40c>;
-
- interrupts-extended = <&intc GIC_SPI 162 IRQ_TYPE_EDGE_RISING>,
- <&adsp_smp2p_in 0 IRQ_TYPE_EDGE_RISING>,
- <&adsp_smp2p_in 1 IRQ_TYPE_EDGE_RISING>,
- <&adsp_smp2p_in 2 IRQ_TYPE_EDGE_RISING>,
- <&adsp_smp2p_in 3 IRQ_TYPE_EDGE_RISING>;
- interrupt-names = "wdog", "fatal", "ready",
- "handover", "stop-ack";
-
- clocks = <&rpmhcc RPMH_CXO_CLK>,
- <&gcc GCC_LPASS_SWAY_CLK>,
- <&lpasscc LPASS_Q6SS_AHBS_AON_CLK>,
- <&lpasscc LPASS_Q6SS_AHBM_AON_CLK>,
- <&lpasscc LPASS_QDSP6SS_XO_CLK>,
- <&lpasscc LPASS_QDSP6SS_SLEEP_CLK>,
- <&lpasscc LPASS_QDSP6SS_CORE_CLK>;
- clock-names = "xo", "sway_cbcr",
- "lpass_ahbs_aon_cbcr",
- "lpass_ahbm_aon_cbcr", "qdsp6ss_xo",
- "qdsp6ss_sleep", "qdsp6ss_core";
-
- power-domains = <&rpmhpd SDM845_CX>;
-
- resets = <&pdc_reset PDC_AUDIO_SYNC_RESET>,
- <&aoss_reset AOSS_CC_LPASS_RESTART>;
- reset-names = "pdc_sync", "cc_lpass";
-
- qcom,halt-regs = <&tcsr_mutex_regs 0x22000>;
-
- memory-region = <&pil_adsp_mem>;
-
- qcom,smem-states = <&adsp_smp2p_out 0>;
- qcom,smem-state-names = "stop";
- };
diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
new file mode 100644
index 000000000000..1337a3d93d35
--- /dev/null
+++ b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
@@ -0,0 +1,140 @@
+Qualcomm Technology Inc. Hexagon v56 Peripheral Image Loader
+
+This document defines the binding for a component that loads and boots firmware
+on the Qualcomm Technology Inc. Hexagon v56 core.
+
+- compatible:
+ Usage: required
+ Value type: <string>
+ Definition: must be one of:
+ "qcom,qcs404-cdsp-pil",
+ "qcom,sdm845-adsp-pil"
+
+- reg:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: must specify the base address and size of the qdsp6ss register
+
+- interrupts-extended:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: must list the watchdog, fatal IRQs ready, handover and
+ stop-ack IRQs
+
+- interrupt-names:
+ Usage: required
+ Value type: <stringlist>
+ Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack"
+
+- clocks:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: List of phandles and clock specifier pairs for the Hexagon,
+ per clock-names below.
+
+- clock-names:
+ Usage: required for SDM845 ADSP
+ Value type: <stringlist>
+ Definition: List of clock input name strings sorted in the same
+ order as the clocks property. Definition must have
+ "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
+ "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep"
+ and "qdsp6ss_core".
+
+- clock-names:
+ Usage: required for QCS404 CDSP
+ Value type: <stringlist>
+ Definition: List of clock input name strings sorted in the same
+ order as the clocks property. Definition must have
+ "xo", "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave",
+ "q6ss_master", "q6_axim".
+
+- power-domains:
+ Usage: required
+ Value type: <phandle>
+ Definition: reference to cx power domain node.
+
+- resets:
+ Usage: required
+ Value type: <phandle>
+ Definition: reference to the list of resets for the Hexagon.
+
+- reset-names:
+ Usage: required for SDM845 ADSP
+ Value type: <stringlist>
+ Definition: must be "pdc_sync" and "cc_lpass"
+
+- reset-names:
+ Usage: required for QCS404 CDSP
+ Value type: <stringlist>
+ Definition: must be "restart"
+
+- qcom,halt-regs:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: a phandle reference to a syscon representing TCSR followed
+ by the offset within syscon for Hexagon halt register.
+
+- memory-region:
+ Usage: required
+ Value type: <phandle>
+ Definition: reference to the reserved-memory for the firmware
+
+- qcom,smem-states:
+ Usage: required
+ Value type: <phandle>
+ Definition: reference to the smem state for requesting the Hexagon to
+ shut down
+
+- qcom,smem-state-names:
+ Usage: required
+ Value type: <stringlist>
+ Definition: must be "stop"
+
+
+= SUBNODES
+The adsp node may have an subnode named "glink-edge" that describes the
+communication edge, channels and devices related to the Hexagon.
+See ../soc/qcom/qcom,glink.txt for details on how to describe these.
+
+= EXAMPLE
+The following example describes the resources needed to boot control the
+ADSP, as it is found on SDM845 boards.
+
+ remoteproc@17300000 {
+ compatible = "qcom,sdm845-adsp-pil";
+ reg = <0x17300000 0x40c>;
+
+ interrupts-extended = <&intc GIC_SPI 162 IRQ_TYPE_EDGE_RISING>,
+ <&adsp_smp2p_in 0 IRQ_TYPE_EDGE_RISING>,
+ <&adsp_smp2p_in 1 IRQ_TYPE_EDGE_RISING>,
+ <&adsp_smp2p_in 2 IRQ_TYPE_EDGE_RISING>,
+ <&adsp_smp2p_in 3 IRQ_TYPE_EDGE_RISING>;
+ interrupt-names = "wdog", "fatal", "ready",
+ "handover", "stop-ack";
+
+ clocks = <&rpmhcc RPMH_CXO_CLK>,
+ <&gcc GCC_LPASS_SWAY_CLK>,
+ <&lpasscc LPASS_Q6SS_AHBS_AON_CLK>,
+ <&lpasscc LPASS_Q6SS_AHBM_AON_CLK>,
+ <&lpasscc LPASS_QDSP6SS_XO_CLK>,
+ <&lpasscc LPASS_QDSP6SS_SLEEP_CLK>,
+ <&lpasscc LPASS_QDSP6SS_CORE_CLK>;
+ clock-names = "xo", "sway_cbcr",
+ "lpass_ahbs_aon_cbcr",
+ "lpass_ahbm_aon_cbcr", "qdsp6ss_xo",
+ "qdsp6ss_sleep", "qdsp6ss_core";
+
+ power-domains = <&rpmhpd SDM845_CX>;
+
+ resets = <&pdc_reset PDC_AUDIO_SYNC_RESET>,
+ <&aoss_reset AOSS_CC_LPASS_RESTART>;
+ reset-names = "pdc_sync", "cc_lpass";
+
+ qcom,halt-regs = <&tcsr_mutex_regs 0x22000>;
+
+ memory-region = <&pil_adsp_mem>;
+
+ qcom,smem-states = <&adsp_smp2p_out 0>;
+ qcom,smem-state-names = "stop";
+ };
diff --git a/Documentation/devicetree/bindings/remoteproc/stm32-rproc.txt b/Documentation/devicetree/bindings/remoteproc/stm32-rproc.txt
new file mode 100644
index 000000000000..5fa915a4b736
--- /dev/null
+++ b/Documentation/devicetree/bindings/remoteproc/stm32-rproc.txt
@@ -0,0 +1,63 @@
+STMicroelectronics STM32 Remoteproc
+-----------------------------------
+This document defines the binding for the remoteproc component that loads and
+boots firmwares on the ST32MP family chipset.
+
+Required properties:
+- compatible: Must be "st,stm32mp1-m4"
+- reg: Address ranges of the RETRAM and MCU SRAM memories used by the
+ remote processor.
+- resets: Reference to a reset controller asserting the remote processor.
+- st,syscfg-holdboot: Reference to the system configuration which holds the
+ remote processor reset hold boot
+ 1st cell: phandle of syscon block
+ 2nd cell: register offset containing the hold boot setting
+ 3rd cell: register bitmask for the hold boot field
+- st,syscfg-tz: Reference to the system configuration which holds the RCC trust
+ zone mode
+ 1st cell: phandle to syscon block
+ 2nd cell: register offset containing the RCC trust zone mode setting
+ 3rd cell: register bitmask for the RCC trust zone mode bit
+
+Optional properties:
+- interrupts: Should contain the watchdog interrupt
+- mboxes: This property is required only if the rpmsg/virtio functionality
+ is used. List of phandle and mailbox channel specifiers:
+ - a channel (a) used to communicate through virtqueues with the
+ remote proc.
+ Bi-directional channel:
+ - from local to remote = send message
+ - from remote to local = send message ack
+ - a channel (b) working the opposite direction of channel (a)
+ - a channel (c) used by the local proc to notify the remote proc
+ that it is about to be shut down.
+ Unidirectional channel:
+ - from local to remote, where ACK from the remote means
+ that it is ready for shutdown
+- mbox-names: This property is required if the mboxes property is used.
+ - must be "vq0" for channel (a)
+ - must be "vq1" for channel (b)
+ - must be "shutdown" for channel (c)
+- memory-region: List of phandles to the reserved memory regions associated with
+ the remoteproc device. This is variable and describes the
+ memories shared with the remote processor (eg: remoteproc
+ firmware and carveouts, rpmsg vrings, ...).
+ (see ../reserved-memory/reserved-memory.txt)
+- st,syscfg-pdds: Reference to the system configuration which holds the remote
+ processor deep sleep setting
+ 1st cell: phandle to syscon block
+ 2nd cell: register offset containing the deep sleep setting
+ 3rd cell: register bitmask for the deep sleep bit
+- st,auto-boot: If defined, when remoteproc is probed, it loads the default
+ firmware and starts the remote processor.
+
+Example:
+ m4_rproc: m4@10000000 {
+ compatible = "st,stm32mp1-m4";
+ reg = <0x10000000 0x40000>,
+ <0x30000000 0x40000>,
+ <0x38000000 0x10000>;
+ resets = <&rcc MCU_R>;
+ st,syscfg-holdboot = <&rcc 0x10C 0x1>;
+ st,syscfg-tz = <&rcc 0x000 0x1>;
+ };
diff --git a/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt
new file mode 100644
index 000000000000..a6f8455ae6c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/bitmain,bm1880-reset.txt
@@ -0,0 +1,18 @@
+Bitmain BM1880 SoC Reset Controller
+===================================
+
+Please also refer to reset.txt in this directory for common reset
+controller binding usage.
+
+Required properties:
+- compatible: Should be "bitmain,bm1880-reset"
+- reg: Offset and length of reset controller space in SCTRL.
+- #reset-cells: Must be 1.
+
+Example:
+
+ rst: reset-controller@c00 {
+ compatible = "bitmain,bm1880-reset";
+ reg = <0xc00 0x8>;
+ #reset-cells = <1>;
+ };
diff --git a/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
index 2ecf33815d18..13e095182db4 100644
--- a/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
+++ b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
@@ -45,6 +45,6 @@ Example:
};
-For list of all valid reset indicies see
+For list of all valid reset indices see
<dt-bindings/reset/imx7-reset.h> for i.MX7 and
<dt-bindings/reset/imx8mq-reset.h> for i.MX8MQ
diff --git a/Documentation/devicetree/bindings/riscv/cpus.yaml b/Documentation/devicetree/bindings/riscv/cpus.yaml
new file mode 100644
index 000000000000..c899111aa5e3
--- /dev/null
+++ b/Documentation/devicetree/bindings/riscv/cpus.yaml
@@ -0,0 +1,149 @@
+# SPDX-License-Identifier: (GPL-2.0 OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/riscv/cpus.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RISC-V bindings for 'cpus' DT nodes
+
+maintainers:
+ - Paul Walmsley <paul.walmsley@sifive.com>
+ - Palmer Dabbelt <palmer@sifive.com>
+
+properties:
+ compatible:
+ items:
+ - enum:
+ - sifive,rocket0
+ - sifive,e5
+ - sifive,e51
+ - sifive,u54-mc
+ - sifive,u54
+ - sifive,u5
+ - const: riscv
+ description:
+ Identifies that the hart uses the RISC-V instruction set
+ and identifies the type of the hart.
+
+ mmu-type:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/string"
+ - enum:
+ - riscv,sv32
+ - riscv,sv39
+ - riscv,sv48
+ description:
+ Identifies the MMU address translation mode used on this
+ hart. These values originate from the RISC-V Privileged
+ Specification document, available from
+ https://riscv.org/specifications/
+
+ riscv,isa:
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/string"
+ - enum:
+ - rv64imac
+ - rv64imafdc
+ description:
+ Identifies the specific RISC-V instruction set architecture
+ supported by the hart. These are documented in the RISC-V
+ User-Level ISA document, available from
+ https://riscv.org/specifications/
+
+ timebase-frequency:
+ type: integer
+ minimum: 1
+ description:
+ Specifies the clock frequency of the system timer in Hz.
+ This value is common to all harts on a single system image.
+
+ interrupt-controller:
+ type: object
+ description: Describes the CPU's local interrupt controller
+
+ properties:
+ '#interrupt-cells':
+ const: 1
+
+ compatible:
+ const: riscv,cpu-intc
+
+ interrupt-controller: true
+
+ required:
+ - '#interrupt-cells'
+ - compatible
+ - interrupt-controller
+
+required:
+ - riscv,isa
+ - timebase-frequency
+ - interrupt-controller
+
+examples:
+ - |
+ // Example 1: SiFive Freedom U540G Development Kit
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ timebase-frequency = <1000000>;
+ cpu@0 {
+ clock-frequency = <0>;
+ compatible = "sifive,rocket0", "riscv";
+ device_type = "cpu";
+ i-cache-block-size = <64>;
+ i-cache-sets = <128>;
+ i-cache-size = <16384>;
+ reg = <0>;
+ riscv,isa = "rv64imac";
+ cpu_intc0: interrupt-controller {
+ #interrupt-cells = <1>;
+ compatible = "riscv,cpu-intc";
+ interrupt-controller;
+ };
+ };
+ cpu@1 {
+ clock-frequency = <0>;
+ compatible = "sifive,rocket0", "riscv";
+ d-cache-block-size = <64>;
+ d-cache-sets = <64>;
+ d-cache-size = <32768>;
+ d-tlb-sets = <1>;
+ d-tlb-size = <32>;
+ device_type = "cpu";
+ i-cache-block-size = <64>;
+ i-cache-sets = <64>;
+ i-cache-size = <32768>;
+ i-tlb-sets = <1>;
+ i-tlb-size = <32>;
+ mmu-type = "riscv,sv39";
+ reg = <1>;
+ riscv,isa = "rv64imafdc";
+ tlb-split;
+ cpu_intc1: interrupt-controller {
+ #interrupt-cells = <1>;
+ compatible = "riscv,cpu-intc";
+ interrupt-controller;
+ };
+ };
+ };
+
+ - |
+ // Example 2: Spike ISA Simulator with 1 Hart
+ cpus {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ cpu@0 {
+ device_type = "cpu";
+ reg = <0>;
+ compatible = "riscv";
+ riscv,isa = "rv64imafdc";
+ mmu-type = "riscv,sv48";
+ interrupt-controller {
+ #interrupt-cells = <1>;
+ interrupt-controller;
+ compatible = "riscv,cpu-intc";
+ };
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/riscv/sifive.yaml b/Documentation/devicetree/bindings/riscv/sifive.yaml
new file mode 100644
index 000000000000..9d17dc2f3f84
--- /dev/null
+++ b/Documentation/devicetree/bindings/riscv/sifive.yaml
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: (GPL-2.0 OR MIT)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/riscv/sifive.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SiFive SoC-based boards
+
+maintainers:
+ - Paul Walmsley <paul.walmsley@sifive.com>
+ - Palmer Dabbelt <palmer@sifive.com>
+
+description:
+ SiFive SoC-based boards
+
+properties:
+ $nodename:
+ const: '/'
+ compatible:
+ items:
+ - enum:
+ - sifive,freedom-unleashed-a00
+ - const: sifive,fu540-c000
+ - const: sifive,fu540
+...
diff --git a/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt b/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt
index 0014da9145af..c223e54452da 100644
--- a/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt
+++ b/Documentation/devicetree/bindings/rng/brcm,iproc-rng200.txt
@@ -2,6 +2,7 @@ HWRNG support for the iproc-rng200 driver
Required properties:
- compatible : Must be one of:
+ "brcm,bcm7211-rng200"
"brcm,bcm7278-rng200"
"brcm,iproc-rng200"
- reg : base address and size of control register block
diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml
new file mode 100644
index 000000000000..46d69c32b89b
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/allwinner,sun4i-a10-rtc.yaml
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/allwinner,sun4i-a10-rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 RTC Device Tree Bindings
+
+allOf:
+ - $ref: "rtc.yaml#"
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ compatible:
+ enum:
+ - allwinner,sun4i-a10-rtc
+ - allwinner,sun7i-a20-rtc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+additionalProperties: false
+
+examples:
+ - |
+ rtc: rtc@1c20d00 {
+ compatible = "allwinner,sun4i-a10-rtc";
+ reg = <0x01c20d00 0x20>;
+ interrupts = <24>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml
new file mode 100644
index 000000000000..924622f39c44
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/allwinner,sun6i-a31-rtc.yaml
@@ -0,0 +1,134 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/allwinner,sun6i-a31-rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 RTC Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#clock-cells":
+ const: 1
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun6i-a31-rtc
+ - const: allwinner,sun8i-a23-rtc
+ - const: allwinner,sun8i-h3-rtc
+ - const: allwinner,sun8i-r40-rtc
+ - const: allwinner,sun8i-v3-rtc
+ - const: allwinner,sun50i-h5-rtc
+ - items:
+ - const: allwinner,sun50i-a64-rtc
+ - const: allwinner,sun8i-h3-rtc
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ minItems: 1
+ maxItems: 2
+ items:
+ - description: RTC Alarm 0
+ - description: RTC Alarm 1
+
+ clocks:
+ maxItems: 1
+
+ clock-output-names:
+ minItems: 1
+ maxItems: 3
+ description:
+ The RTC provides up to three clocks
+ - the Low Frequency Oscillator or LOSC, at index 0,
+ - the Low Frequency Oscillator External output (X32KFOUT in
+ the datasheet), at index 1,
+ - the Internal Oscillator, at index 2.
+
+allOf:
+ - $ref: "rtc.yaml#"
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun6i-a31-rtc
+
+ then:
+ properties:
+ clock-output-names:
+ minItems: 1
+ maxItems: 1
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun8i-a23-rtc
+ - allwinner,sun8i-r40-rtc
+ - allwinner,sun8i-v3-rtc
+
+ then:
+ properties:
+ clock-output-names:
+ minItems: 2
+ maxItems: 2
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun8i-h3-rtc
+ - allwinner,sun50i-h5-rtc
+
+ then:
+ properties:
+ clock-output-names:
+ minItems: 3
+ maxItems: 3
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun8i-r40-rtc
+
+ then:
+ properties:
+ interrupts:
+ minItems: 1
+ maxItems: 1
+
+ else:
+ properties:
+ interrupts:
+ minItems: 2
+ maxItems: 2
+
+required:
+ - "#clock-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-output-names
+
+additionalProperties: false
+
+examples:
+ - |
+ rtc: rtc@1f00000 {
+ compatible = "allwinner,sun6i-a31-rtc";
+ reg = <0x01f00000 0x400>;
+ interrupts = <0 40 4>, <0 41 4>;
+ clock-output-names = "osc32k";
+ clocks = <&ext_osc32k>;
+ #clock-cells = <1>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/rtc/rtc.txt b/Documentation/devicetree/bindings/rtc/rtc.txt
index a97fc6a9a75e..b8d36fce5e2d 100644
--- a/Documentation/devicetree/bindings/rtc/rtc.txt
+++ b/Documentation/devicetree/bindings/rtc/rtc.txt
@@ -1,72 +1 @@
-Generic device tree bindings for Real Time Clock devices
-========================================================
-
-This document describes generic bindings which can be used to describe Real Time
-Clock devices in a device tree.
-
-Required properties
--------------------
-
-- compatible : name of RTC device following generic names recommended practice.
-
-For other required properties e.g. to describe register sets,
-clocks, etc. check the binding documentation of the specific driver.
-
-Optional properties
--------------------
-
-- start-year : if provided, the default hardware range supported by the RTC is
- shifted so the first usable year is the specified one.
-
-The following properties may not be supported by all drivers. However, if a
-driver wants to support one of the below features, it should adapt the bindings
-below.
-- trickle-resistor-ohms : Selected resistor for trickle charger. Should be given
- if trickle charger should be enabled
-- trickle-diode-disable : Do not use internal trickle charger diode Should be
- given if internal trickle charger diode should be
- disabled
-- wakeup-source : Enables wake up of host system on alarm
-- quartz-load-femtofarads : The capacitive load of the quartz(x-tal),
- expressed in femto Farad (fF).
- The default value shall be listed (if optional),
- and likewise all valid values.
-
-Trivial RTCs
-------------
-
-This is a list of trivial RTC devices that have simple device tree
-bindings, consisting only of a compatible field, an address and
-possibly an interrupt line.
-
-
-Compatible Vendor / Chip
-========== =============
-abracon,abb5zes3 AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface
-abracon,abeoz9 AB-RTCMC-32.768kHz-EOZ9: Real Time Clock/Calendar Module with I2C Interface
-dallas,ds1374 I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output
-dallas,ds1672 Dallas DS1672 Real-time Clock
-dallas,ds3232 Extremely Accurate I²C RTC with Integrated Crystal and SRAM
-epson,rx8010 I2C-BUS INTERFACE REAL TIME CLOCK MODULE
-epson,rx8571 I2C-BUS INTERFACE REAL TIME CLOCK MODULE with Battery Backed RAM
-epson,rx8581 I2C-BUS INTERFACE REAL TIME CLOCK MODULE
-emmicro,em3027 EM Microelectronic EM3027 Real-time Clock
-isil,isl1208 Intersil ISL1208 Low Power RTC with Battery Backed SRAM
-isil,isl1218 Intersil ISL1218 Low Power RTC with Battery Backed SRAM
-isil,isl12022 Intersil ISL12022 Real-time Clock
-microcrystal,rv3028 Real Time Clock Module with I2C-Bus
-microcrystal,rv3029 Real Time Clock Module with I2C-Bus
-microcrystal,rv8523 Real Time Clock
-nxp,pcf2127 Real-time clock
-nxp,pcf2129 Real-time clock
-nxp,pcf8563 Real-time clock/calendar
-pericom,pt7c4338 Real-time Clock Module
-ricoh,r2025sd I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,r2221tl I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rs5c372a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rs5c372b I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rv5c386 I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-ricoh,rv5c387a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-sii,s35390a 2-wire CMOS real-time clock
-whwave,sd3078 I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
-xircom,x1205 Xircom X1205 I2C RTC
+This file has been moved to rtc.yaml.
diff --git a/Documentation/devicetree/bindings/rtc/rtc.yaml b/Documentation/devicetree/bindings/rtc/rtc.yaml
new file mode 100644
index 000000000000..ee237b2ed66a
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/rtc.yaml
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: RTC Generic Binding
+
+maintainers:
+ - Alexandre Belloni <alexandre.belloni@bootlin.com>
+
+description: |
+ This document describes generic bindings which can be used to
+ describe Real Time Clock devices in a device tree.
+
+properties:
+ $nodename:
+ pattern: "^rtc(@.*|-[0-9a-f])*$"
+
+ quartz-load-femtofarads:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ The capacitive load of the quartz(x-tal), expressed in femto
+ Farad (fF). The default value shall be listed (if optional),
+ and likewise all valid values.
+
+ start-year:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ If provided, the default hardware range supported by the RTC is
+ shifted so the first usable year is the specified one.
+
+ trickle-diode-disable:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Do not use internal trickle charger diode. Should be given if
+ internal trickle charger diode should be disabled.
+
+ trickle-resistor-ohms:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Selected resistor for trickle charger. Should be given
+ if trickle charger should be enabled.
+
+ wakeup-source:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Enables wake up of host system on alarm.
+
+...
diff --git a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt b/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
deleted file mode 100644
index 6b732c41392b..000000000000
--- a/Documentation/devicetree/bindings/rtc/sun6i-rtc.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-* sun6i Real Time Clock
-
-RTC controller for the Allwinner A31
-
-Required properties:
-- compatible : Should be one of the following combinations:
- - "allwinner,sun6i-a31-rtc"
- - "allwinner,sun8i-a23-rtc"
- - "allwinner,sun8i-h3-rtc"
- - "allwinner,sun8i-r40-rtc", "allwinner,sun8i-h3-rtc"
- - "allwinner,sun8i-v3-rtc"
- - "allwinner,sun50i-a64-rtc", "allwinner,sun8i-h3-rtc"
- - "allwinner,sun50i-h5-rtc"
-
- Where there are two or more compatible strings, this
- denotes the hardware covered by the most specific one
- is backward-compatible with the latter ones, and the
- implementation for the latter ones can be used, albeit
- with reduced functionality.
-
-- reg : physical base address of the controller and length of
- memory mapped region.
-- interrupts : IRQ lines for the RTC alarm 0 and alarm 1, in that order.
-
-Required properties for new device trees
-- clocks : phandle to the 32kHz external oscillator
-- clock-output-names : names of up to three clock outputs. See below.
-- #clock-cells : must be equal to 1.
-
-The RTC provides the following clocks at the given indices:
-- 0: LOSC
-- 1: LOSC external output, known as X32KFOUT in the datasheet.
- This clock is not available on the A31 and is deprecated for old
- device trees still using the "allwinner,sun6i-a31-rtc" compatible.
-- 2: InternalOSC, or internal RC oscillator (A64/H3/H5 only)
-
-Example:
-
-rtc: rtc@1f00000 {
- compatible = "allwinner,sun6i-a31-rtc";
- reg = <0x01f00000 0x400>;
- interrupts = <0 40 4>, <0 41 4>;
- clock-output-names = "osc32k";
- clocks = <&ext_osc32k>;
- #clock-cells = <1>;
-};
diff --git a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt b/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt
deleted file mode 100644
index 4a8d79c1cf08..000000000000
--- a/Documentation/devicetree/bindings/rtc/sunxi-rtc.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-* sun4i/sun7i Real Time Clock
-
-RTC controller for the Allwinner A10/A20
-
-Required properties:
-- compatible : Should be "allwinner,sun4i-a10-rtc" or "allwinner,sun7i-a20-rtc"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- interrupts: IRQ line for the RTC.
-
-Example:
-
-rtc: rtc@1c20d00 {
- compatible = "allwinner,sun4i-a10-rtc";
- reg = <0x01c20d00 0x20>;
- interrupts = <24>;
-};
diff --git a/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml
new file mode 100644
index 000000000000..0c12ce9a9b45
--- /dev/null
+++ b/Documentation/devicetree/bindings/rtc/trivial-rtc.yaml
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/rtc/trivial-rtc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Trivial RTCs
+
+maintainers:
+ - Alexandre Belloni <alexandre.belloni@bootlin.com>
+
+description: |
+ This is a list of trivial RTC devices that have simple device tree
+ bindings, consisting only of a compatible field, an address and
+ possibly an interrupt line.
+
+allOf:
+ - $ref: "rtc.yaml#"
+
+properties:
+ compatible:
+ enum:
+ # AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface
+ - abracon,abb5zes3
+ # AB-RTCMC-32.768kHz-EOZ9: Real Time Clock/Calendar Module with I2C Interface
+ - abracon,abeoz9
+ # I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output
+ - dallas,ds1374
+ # Dallas DS1672 Real-time Clock
+ - dallas,ds1672
+ # Extremely Accurate I²C RTC with Integrated Crystal and SRAM
+ - dallas,ds3232
+ # I2C-BUS INTERFACE REAL TIME CLOCK MODULE
+ - epson,rx8010
+ # I2C-BUS INTERFACE REAL TIME CLOCK MODULE with Battery Backed RAM
+ - epson,rx8571
+ # I2C-BUS INTERFACE REAL TIME CLOCK MODULE
+ - epson,rx8581
+ # Intersil ISL1208 Low Power RTC with Battery Backed SRAM
+ - isil,isl1208
+ # Intersil ISL1218 Low Power RTC with Battery Backed SRAM
+ - isil,isl1218
+ # Intersil ISL12022 Real-time Clock
+ - isil,isl12022
+ # Real Time Clock Module with I2C-Bus
+ - microcrystal,rv3028
+ # Real Time Clock Module with I2C-Bus
+ - microcrystal,rv3029
+ # Real Time Clock
+ - microcrystal,rv8523
+ # Real-time clock
+ - nxp,pcf2127
+ # Real-time clock
+ - nxp,pcf2129
+ # Real-time clock/calendar
+ - nxp,pcf8563
+ # Real-time Clock Module
+ - pericom,pt7c4338
+ # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+ - ricoh,r2025sd
+ # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+ - ricoh,r2221tl
+ # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+ - ricoh,rs5c372a
+ # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+ - ricoh,rs5c372b
+ # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+ - ricoh,rv5c386
+ # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+ - ricoh,rv5c387a
+ # 2-wire CMOS real-time clock
+ - sii,s35390a
+ # I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC
+ - whwave,sd3078
+ # Xircom X1205 I2C RTC
+ - xircom,x1205
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ start-year: true
+
+required:
+ - compatible
+ - reg
+
+additionalProperties: false
+
+...
diff --git a/Documentation/devicetree/bindings/serial/8250.txt b/Documentation/devicetree/bindings/serial/8250.txt
index 3cba12f855b7..20d351f268ef 100644
--- a/Documentation/devicetree/bindings/serial/8250.txt
+++ b/Documentation/devicetree/bindings/serial/8250.txt
@@ -53,6 +53,9 @@ Optional properties:
programmable TX FIFO thresholds.
- resets : phandle + reset specifier pairs
- overrun-throttle-ms : how long to pause uart rx when input overrun is encountered.
+- {rts,cts,dtr,dsr,rng,dcd}-gpios: specify a GPIO for RTS/CTS/DTR/DSR/RI/DCD
+ line respectively. It will use specified GPIO instead of the peripheral
+ function pin for the UART feature. If unsure, don't specify this property.
Note:
* fsl,ns16550:
@@ -74,3 +77,19 @@ Example:
interrupts = <10>;
reg-shift = <2>;
};
+
+Example for OMAP UART using GPIO-based modem control signals:
+
+ uart4: serial@49042000 {
+ compatible = "ti,omap3-uart";
+ reg = <0x49042000 0x400>;
+ interrupts = <80>;
+ ti,hwmods = "uart4";
+ clock-frequency = <48000000>;
+ cts-gpios = <&gpio3 5 GPIO_ACTIVE_LOW>;
+ rts-gpios = <&gpio3 6 GPIO_ACTIVE_LOW>;
+ dtr-gpios = <&gpio1 12 GPIO_ACTIVE_LOW>;
+ dsr-gpios = <&gpio1 13 GPIO_ACTIVE_LOW>;
+ dcd-gpios = <&gpio1 14 GPIO_ACTIVE_LOW>;
+ rng-gpios = <&gpio1 15 GPIO_ACTIVE_LOW>;
+ };
diff --git a/Documentation/devicetree/bindings/serial/mtk-uart.txt b/Documentation/devicetree/bindings/serial/mtk-uart.txt
index c6b5262eb352..6fdffb735fb9 100644
--- a/Documentation/devicetree/bindings/serial/mtk-uart.txt
+++ b/Documentation/devicetree/bindings/serial/mtk-uart.txt
@@ -23,7 +23,12 @@ Required properties:
- reg: The base address of the UART register bank.
-- interrupts: A single interrupt specifier.
+- interrupts:
+ index 0: an interrupt specifier for the UART controller itself
+ index 1: optional, an interrupt specifier with edge sensitivity on Rx pin to
+ support Rx in-band wake up. If one would like to use this feature,
+ one must create an addtional pinctrl to reconfigure Rx pin to normal
+ GPIO before suspend.
- clocks : Must contain an entry for each entry in clock-names.
See ../clocks/clock-bindings.txt for details.
@@ -39,7 +44,11 @@ Example:
uart0: serial@11006000 {
compatible = "mediatek,mt6589-uart", "mediatek,mt6577-uart";
reg = <0x11006000 0x400>;
- interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_LOW>;
+ interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_LOW>,
+ <GIC_SPI 52 IRQ_TYPE_EDGE_FALLING>;
clocks = <&uart_clk>, <&bus_clk>;
clock-names = "baud", "bus";
+ pinctrl-names = "default", "sleep";
+ pinctrl-0 = <&uart_pin>;
+ pinctrl-1 = <&uart_pin_sleep>;
};
diff --git a/Documentation/devicetree/bindings/serial/omap_serial.txt b/Documentation/devicetree/bindings/serial/omap_serial.txt
index 0a9b5444f4e6..dcba86b0a0d0 100644
--- a/Documentation/devicetree/bindings/serial/omap_serial.txt
+++ b/Documentation/devicetree/bindings/serial/omap_serial.txt
@@ -1,6 +1,7 @@
OMAP UART controller
Required properties:
+- compatible : should be "ti,j721e-uart", "ti,am654-uart" for J721E controllers
- compatible : should be "ti,am654-uart" for AM654 controllers
- compatible : should be "ti,omap2-uart" for OMAP2 controllers
- compatible : should be "ti,omap3-uart" for OMAP3 controllers
diff --git a/Documentation/devicetree/bindings/serial/st,stm32-usart.txt b/Documentation/devicetree/bindings/serial/st,stm32-usart.txt
index 9d3efed55deb..a6b19485c9dc 100644
--- a/Documentation/devicetree/bindings/serial/st,stm32-usart.txt
+++ b/Documentation/devicetree/bindings/serial/st,stm32-usart.txt
@@ -13,6 +13,7 @@ Required properties:
- clocks: The input clock of the USART instance
Optional properties:
+- resets: Must contain the phandle to the reset controller.
- pinctrl: The reference on the pins configuration
- st,hw-flow-ctrl: bool flag to enable hardware flow control.
- rs485-rts-delay, rs485-rx-during-tx, rs485-rts-active-low,
diff --git a/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt
index 436d2106e80d..e876f3ce54f6 100644
--- a/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt
+++ b/Documentation/devicetree/bindings/soc/amlogic/amlogic,canvas.txt
@@ -2,8 +2,8 @@ Amlogic Canvas
================================
A canvas is a collection of metadata that describes a pixel buffer.
-Those metadata include: width, height, phyaddr, wrapping, block mode
-and endianness.
+Those metadata include: width, height, phyaddr, wrapping and block mode.
+Starting with GXBB the endianness can also be described.
Many IPs within Amlogic SoCs rely on canvas indexes to read/write pixel data
rather than use the phy addresses directly. For instance, this is the case for
@@ -18,7 +18,11 @@ Video Lookup Table
--------------------------
Required properties:
-- compatible: "amlogic,canvas"
+- compatible: has to be one of:
+ - "amlogic,meson8-canvas", "amlogic,canvas" on Meson8
+ - "amlogic,meson8b-canvas", "amlogic,canvas" on Meson8b
+ - "amlogic,meson8m2-canvas", "amlogic,canvas" on Meson8m2
+ - "amlogic,canvas" on GXBB and newer
- reg: Base physical address and size of the canvas registers.
Example:
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt
new file mode 100644
index 000000000000..954ffee0a9c4
--- /dev/null
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,aoss-qmp.txt
@@ -0,0 +1,81 @@
+Qualcomm Always-On Subsystem side channel binding
+
+This binding describes the hardware component responsible for side channel
+requests to the always-on subsystem (AOSS), used for certain power management
+requests that is not handled by the standard RPMh interface. Each client in the
+SoC has it's own block of message RAM and IRQ for communication with the AOSS.
+The protocol used to communicate in the message RAM is known as Qualcomm
+Messaging Protocol (QMP)
+
+The AOSS side channel exposes control over a set of resources, used to control
+a set of debug related clocks and to affect the low power state of resources
+related to the secondary subsystems. These resources are exposed as a set of
+power-domains.
+
+- compatible:
+ Usage: required
+ Value type: <string>
+ Definition: must be "qcom,sdm845-aoss-qmp"
+
+- reg:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: the base address and size of the message RAM for this
+ client's communication with the AOSS
+
+- interrupts:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: should specify the AOSS message IRQ for this client
+
+- mboxes:
+ Usage: required
+ Value type: <prop-encoded-array>
+ Definition: reference to the mailbox representing the outgoing doorbell
+ in APCS for this client, as described in mailbox/mailbox.txt
+
+- #clock-cells:
+ Usage: optional
+ Value type: <u32>
+ Definition: must be 0
+ The single clock represents the QDSS clock.
+
+- #power-domain-cells:
+ Usage: optional
+ Value type: <u32>
+ Definition: must be 1
+ The provided power-domains are:
+ CDSP state (0), LPASS state (1), modem state (2), SLPI
+ state (3), SPSS state (4) and Venus state (5).
+
+= SUBNODES
+The AOSS side channel also provides the controls for three cooling devices,
+these are expressed as subnodes of the QMP node. The name of the node is used
+to identify the resource and must therefor be "cx", "mx" or "ebi".
+
+- #cooling-cells:
+ Usage: optional
+ Value type: <u32>
+ Definition: must be 2
+
+= EXAMPLE
+
+The following example represents the AOSS side-channel message RAM and the
+mechanism exposing the power-domains, as found in SDM845.
+
+ aoss_qmp: qmp@c300000 {
+ compatible = "qcom,sdm845-aoss-qmp";
+ reg = <0x0c300000 0x100000>;
+ interrupts = <GIC_SPI 389 IRQ_TYPE_EDGE_RISING>;
+ mboxes = <&apss_shared 0>;
+
+ #power-domain-cells = <1>;
+
+ cx_cdev: cx {
+ #cooling-cells = <2>;
+ };
+
+ mx_cdev: mx {
+ #cooling-cells = <2>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt
index bcc612cc7423..db501269f47b 100644
--- a/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,apr.txt
@@ -9,7 +9,7 @@ used for audio/voice services on the QDSP.
Value type: <stringlist>
Definition: must be "qcom,apr-v<VERSION-NUMBER>", example "qcom,apr-v2"
-- reg
+- qcom,apr-domain
Usage: required
Value type: <u32>
Definition: Destination processor ID.
@@ -49,9 +49,9 @@ by the individual bindings for the specific service
The following example represents a QDSP based sound card on a MSM8996 device
which uses apr as communication between Apps and QDSP.
- apr@4 {
+ apr {
compatible = "qcom,apr-v2";
- reg = <APR_DOMAIN_ADSP>;
+ qcom,apr-domain = <APR_DOMAIN_ADSP>;
q6core@3 {
compatible = "qcom,q6core";
diff --git a/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt b/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt
index cf759e5f9b10..1214192847ac 100644
--- a/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt
+++ b/Documentation/devicetree/bindings/soc/qcom/qcom,glink.txt
@@ -21,6 +21,11 @@ edge.
Definition: should specify the IRQ used by the remote processor to
signal this processor about communication related events
+- qcom,remote-pid:
+ Usage: required for glink-smem
+ Value type: <u32>
+ Definition: specifies the identifier of the remote endpoint of this edge
+
- qcom,rpm-msg-ram:
Usage: required for glink-rpm
Value type: <prop-encoded-array>
diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml
new file mode 100644
index 000000000000..eb3992138eec
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-i2s.yaml
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: (GPL-2.0+ OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/sound/allwinner,sun4i-a10-i2s.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 I2S Controller Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#sound-dai-cells":
+ const: 0
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun4i-a10-i2s
+ - const: allwinner,sun6i-a31-i2s
+ - const: allwinner,sun8i-a83t-i2s
+ - const: allwinner,sun8i-h3-i2s
+ - const: allwinner,sun50i-a64-codec-i2s
+ - items:
+ - const: allwinner,sun50i-a64-i2s
+ - const: allwinner,sun8i-h3-i2s
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+
+ clock-names:
+ items:
+ - const: apb
+ - const: mod
+
+ # Even though it only applies to subschemas under the conditionals,
+ # not listing them here will trigger a warning because of the
+ # additionalsProperties set to false.
+ dmas: true
+ dma-names: true
+ resets:
+ maxItems: 1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun6i-a31-i2s
+ - allwinner,sun8i-a83t-i2s
+ - allwinner,sun8i-h3-i2s
+ - allwinner,sun50i-a64-codec-i2s
+
+ then:
+ required:
+ - resets
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun8i-a83t-i2s
+
+ then:
+ properties:
+ dmas:
+ minItems: 1
+ maxItems: 2
+ items:
+ - description: RX DMA Channel
+ - description: TX DMA Channel
+ description:
+ Some controllers cannot receive but can only transmit
+ data. In such a case, the RX DMA channel is to be omitted.
+
+ dma-names:
+ oneOf:
+ - items:
+ - const: rx
+ - const: tx
+ - const: tx
+ description:
+ Some controllers cannot receive but can only transmit
+ data. In such a case, the RX name is to be omitted.
+
+ else:
+ properties:
+ dmas:
+ items:
+ - description: RX DMA Channel
+ - description: TX DMA Channel
+
+ dma-names:
+ items:
+ - const: rx
+ - const: tx
+
+required:
+ - "#sound-dai-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - dmas
+ - dma-names
+
+additionalProperties: false
+
+examples:
+ - |
+ i2s0: i2s@1c22400 {
+ #sound-dai-cells = <0>;
+ compatible = "allwinner,sun4i-a10-i2s";
+ reg = <0x01c22400 0x400>;
+ interrupts = <0 16 4>;
+ clocks = <&apb0_gates 3>, <&i2s0_clk>;
+ clock-names = "apb", "mod";
+ dmas = <&dma 0 3>, <&dma 0 3>;
+ dma-names = "rx", "tx";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml
new file mode 100644
index 000000000000..e0284d8c3b63
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/allwinner,sun4i-a10-spdif.yaml
@@ -0,0 +1,120 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/sound/allwinner,sun4i-a10-spdif.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 S/PDIF Controller Device Tree Bindings
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Liam Girdwood <lgirdwood@gmail.com>
+ - Mark Brown <broonie@kernel.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#sound-dai-cells":
+ const: 0
+
+ compatible:
+ oneOf:
+ - const: allwinner,sun4i-a10-spdif
+ - const: allwinner,sun6i-a31-spdif
+ - const: allwinner,sun8i-h3-spdif
+ - const: allwinner,sun50i-h6-spdif
+ - items:
+ - const: allwinner,sun8i-a83t-spdif
+ - const: allwinner,sun8i-h3-spdif
+ - items:
+ - const: allwinner,sun50i-a64-spdif
+ - const: allwinner,sun8i-h3-spdif
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+
+ clock-names:
+ items:
+ - const: apb
+ - const: spdif
+
+ # Even though it only applies to subschemas under the conditionals,
+ # not listing them here will trigger a warning because of the
+ # additionalsProperties set to false.
+ dmas: true
+ dma-names: true
+ resets:
+ maxItems: 1
+
+allOf:
+ - if:
+ properties:
+ compatible:
+ contains:
+ enum:
+ - allwinner,sun6i-a31-spdif
+ - allwinner,sun8i-h3-spdif
+
+ then:
+ required:
+ - resets
+
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: allwinner,sun8i-h3-spdif
+
+ then:
+ properties:
+ dmas:
+ description: TX DMA Channel
+
+ dma-names:
+ const: tx
+
+ else:
+ properties:
+ dmas:
+ items:
+ - description: RX DMA Channel
+ - description: TX DMA Channel
+
+ dma-names:
+ items:
+ - const: rx
+ - const: tx
+
+required:
+ - "#sound-dai-cells"
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+ - dmas
+ - dma-names
+
+additionalProperties: false
+
+examples:
+ - |
+ spdif: spdif@1c21000 {
+ #sound-dai-cells = <0>;
+ compatible = "allwinner,sun4i-a10-spdif";
+ reg = <0x01c21000 0x40>;
+ interrupts = <13>;
+ clocks = <&apb0_gates 1>, <&spdif_clk>;
+ clock-names = "apb", "spdif";
+ dmas = <&dma 0 2>, <&dma 0 2>;
+ dma-names = "rx", "tx";
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt b/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt
index 3b94a715a0b9..8835a43edfbb 100644
--- a/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt
+++ b/Documentation/devicetree/bindings/sound/amlogic,axg-tdm-formatters.txt
@@ -15,11 +15,15 @@ Required properties:
* "lrclk" : sample clock
* "lrclk_sel": sample clock input multiplexer
-Example of TDMOUT_A on the A113 SoC:
+Optional property:
+- resets: phandle to the dedicated reset line of the tdm formatter.
+
+Example of TDMOUT_A on the S905X2 SoC:
tdmout_a: audio-controller@500 {
compatible = "amlogic,axg-tdmout";
reg = <0x0 0x500 0x0 0x40>;
+ resets = <&clkc_audio AUD_RESET_TDMOUT_A>;
clocks = <&clkc_audio AUD_CLKID_TDMOUT_A>,
<&clkc_audio AUD_CLKID_TDMOUT_A_SCLK>,
<&clkc_audio AUD_CLKID_TDMOUT_A_SCLK_SEL>,
diff --git a/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt
new file mode 100644
index 000000000000..aa6c35570d31
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/amlogic,g12a-tohdmitx.txt
@@ -0,0 +1,55 @@
+* Amlogic HDMI Tx control glue
+
+Required properties:
+- compatible: "amlogic,g12a-tohdmitx"
+- reg: physical base address of the controller and length of memory
+ mapped region.
+- #sound-dai-cells: should be 1.
+
+Example on the S905X2 SoC:
+
+tohdmitx: audio-controller@744 {
+ compatible = "amlogic,g12a-tohdmitx";
+ reg = <0x0 0x744 0x0 0x4>;
+ #sound-dai-cells = <1>;
+};
+
+Example of an 'amlogic,axg-sound-card':
+
+sound {
+ compatible = "amlogic,axg-sound-card";
+
+[...]
+
+ dai-link-x {
+ sound-dai = <&tdmif_a>;
+ dai-format = "i2s";
+ dai-tdm-slot-tx-mask-0 = <1 1>;
+
+ codec-0 {
+ sound-dai = <&tohdmitx TOHDMITX_I2S_IN_A>;
+ };
+
+ codec-1 {
+ sound-dai = <&external_dac>;
+ };
+ };
+
+ dai-link-y {
+ sound-dai = <&tdmif_c>;
+ dai-format = "i2s";
+ dai-tdm-slot-tx-mask-0 = <1 1>;
+
+ codec {
+ sound-dai = <&tohdmitx TOHDMITX_I2S_IN_C>;
+ };
+ };
+
+ dai-link-z {
+ sound-dai = <&tohdmitx TOHDMITX_I2S_OUT>;
+
+ codec {
+ sound-dai = <&hdmi_tx>;
+ };
+ };
+};
diff --git a/Documentation/devicetree/bindings/sound/cs42l73.txt b/Documentation/devicetree/bindings/sound/cs42l73.txt
index 80ae910dbf6c..47b868b5ab01 100644
--- a/Documentation/devicetree/bindings/sound/cs42l73.txt
+++ b/Documentation/devicetree/bindings/sound/cs42l73.txt
@@ -19,4 +19,4 @@ codec: cs42l73@4a {
reg = <0x4a>;
reset_gpio = <&gpio 10 0>;
chgfreq = <0x05>;
-}; \ No newline at end of file
+};
diff --git a/Documentation/devicetree/bindings/sound/cs42xx8.txt b/Documentation/devicetree/bindings/sound/cs42xx8.txt
index 8619a156d038..bbfe39347c20 100644
--- a/Documentation/devicetree/bindings/sound/cs42xx8.txt
+++ b/Documentation/devicetree/bindings/sound/cs42xx8.txt
@@ -14,6 +14,11 @@ Required properties:
- VA-supply, VD-supply, VLS-supply, VLC-supply: power supplies for the device,
as covered in Documentation/devicetree/bindings/regulator/regulator.txt
+Optional properties:
+
+ - reset-gpios : a GPIO spec to define which pin is connected to the chip's
+ !RESET pin
+
Example:
cs42888: codec@48 {
@@ -25,4 +30,5 @@ cs42888: codec@48 {
VD-supply = <&reg_audio>;
VLS-supply = <&reg_audio>;
VLC-supply = <&reg_audio>;
+ reset-gpios = <&pca9557_b 1 GPIO_ACTIVE_LOW>;
};
diff --git a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt
index a58f79f5345c..c483dcec01f8 100644
--- a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt
+++ b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.txt
@@ -44,6 +44,9 @@ Optional properties:
please refer to pinctrl-bindings.txt
- fck_parent : Should contain a valid clock name which will be used as parent
for the McASP fck
+- auxclk-fs-ratio: When McASP is bus master indicates the ratio between AUCLK
+ and FS rate if applicable:
+ AUCLK rate = auxclk-fs-ratio * FS rate
Optional GPIO support:
If any McASP pin need to be used as GPIO then the McASP node must have:
diff --git a/Documentation/devicetree/bindings/sound/madera.txt b/Documentation/devicetree/bindings/sound/madera.txt
new file mode 100644
index 000000000000..5e669ce552f4
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/madera.txt
@@ -0,0 +1,67 @@
+Cirrus Logic Madera class audio codecs
+
+This describes audio configuration bindings for these codecs.
+
+See also the core bindings for the parent MFD driver:
+See Documentation/devicetree/bindings/mfd/madera.txt
+
+and defines for values used in these bindings:
+include/dt-bindings/sound/madera.h
+
+These properties are all contained in the parent MFD node.
+
+Optional properties:
+ - cirrus,dmic-ref : Indicates how the MICBIAS pins have been externally
+ connected to DMICs on each input, one cell per input.
+ <IN1 IN2 IN3 ...>
+ A value of 0 indicates MICVDD and is the default, other values depend on the
+ codec:
+ For CS47L35 one of the CS47L35_DMIC_REF_xxx values
+ For all other codecs one of the MADERA_DMIC_REF_xxx values
+ Also see the datasheet for a description of the INn_DMIC_SUP field.
+
+ - cirrus,inmode : A list of input mode settings for each input. A maximum of
+ 16 cells, with four cells per input in the order INnAL, INnAR INnBL INnBR.
+ For non-muxed inputs the first two cells for that input set the mode for
+ the left and right channel and the second two cells must be 0.
+ For muxed inputs the first two cells for that input set the mode of the
+ left and right A inputs and the second two cells set the mode of the left
+ and right B inputs.
+ Valid mode values are one of the MADERA_INMODE_xxx. If the array is shorter
+ than the number of inputs the unspecified inputs default to
+ MADERA_INMODE_DIFF.
+
+ - cirrus,out-mono : Mono bit for each output, maximum of six cells if the
+ array is shorter outputs will be set to stereo.
+
+ - cirrus,max-channels-clocked : Maximum number of channels that I2S clocks
+ will be generated for. Useful when clock master for systems where the I2S
+ bus has multiple data lines.
+ One cell for each AIF, use a value of zero for AIFs that should be handled
+ normally.
+
+ - cirrus,pdm-fmt : PDM speaker data format, must contain 2 cells
+ (OUT5 and OUT6). See the PDM_SPKn_FMT field in the datasheet for a
+ description of this value.
+ The second cell is ignored for codecs that do not have OUT6.
+
+ - cirrus,pdm-mute : PDM mute format, must contain 2 cells
+ (OUT5 and OUT6). See the PDM_SPKn_CTRL_1 register in the datasheet for a
+ description of this value.
+ The second cell is ignored for codecs that do not have OUT6.
+
+Example:
+
+cs47l35@0 {
+ compatible = "cirrus,cs47l35";
+
+ cirrus,dmic-ref = <0 0 CS47L35_DMIC_REF_MICBIAS1B 0>;
+ cirrus,inmode = <
+ MADERA_INMODE_DMIC MADERA_INMODE_DMIC /* IN1A digital */
+ MADERA_INMODE_SE MADERA_INMODE_SE /* IN1B single-ended */
+ MADERA_INMODE_DIFF MADERA_INMODE_DIFF /* IN2 differential */
+ 0 0 /* not used on this codec */
+ >;
+ cirrus,out-mono = <0 0 0 0 0 0>;
+ cirrus,max-channels-clocked = <2 0 0>;
+};
diff --git a/Documentation/devicetree/bindings/sound/max98357a.txt b/Documentation/devicetree/bindings/sound/max98357a.txt
index 28645a2ff885..4bce14ce806f 100644
--- a/Documentation/devicetree/bindings/sound/max98357a.txt
+++ b/Documentation/devicetree/bindings/sound/max98357a.txt
@@ -9,6 +9,10 @@ Optional properties:
- sdmode-gpios : GPIO specifier for the chip's SD_MODE pin.
If this option is not specified then driver does not manage
the pin state (e.g. chip is always on).
+- sdmode-delay : specify delay time for SD_MODE pin.
+ If this option is specified, which means it's required i2s clocks
+ ready before SD_MODE is unmuted in order to avoid the speaker pop noise.
+ It's observed that 5ms is sufficient.
Example:
diff --git a/Documentation/devicetree/bindings/sound/rt1011.txt b/Documentation/devicetree/bindings/sound/rt1011.txt
new file mode 100644
index 000000000000..35a23e60d679
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/rt1011.txt
@@ -0,0 +1,32 @@
+RT1011 Mono Class D Audio Amplifier
+
+This device supports I2C only.
+
+Required properties:
+
+- compatible : "realtek,rt1011".
+
+- reg : The I2C address of the device. This I2C address decide by
+ two input pins (ASEL1 and ASEL2).
+ -------------------------------------
+ | ASEL2 | ASEL1 | Address |
+ -------------------------------------
+ | 0 | 0 | 0x38 |
+ -------------------------------------
+ | 0 | 1 | 0x39 |
+ -------------------------------------
+ | 1 | 0 | 0x3a |
+ -------------------------------------
+ | 1 | 1 | 0x3b |
+ -------------------------------------
+
+Pins on the device (for linking into audio routes) for RT1011:
+
+ * SPO
+
+Example:
+
+rt1011: codec@38 {
+ compatible = "realtek,rt1011";
+ reg = <0x38>;
+};
diff --git a/Documentation/devicetree/bindings/sound/rt1308.txt b/Documentation/devicetree/bindings/sound/rt1308.txt
new file mode 100755
index 000000000000..2d46084afce4
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/rt1308.txt
@@ -0,0 +1,17 @@
+RT1308 audio Amplifier
+
+This device supports I2C only.
+
+Required properties:
+
+- compatible : "realtek,rt1308".
+
+- reg : The I2C address of the device.
+
+
+Example:
+
+rt1308: rt1308@10 {
+ compatible = "realtek,rt1308";
+ reg = <0x10>;
+};
diff --git a/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt b/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt
index 58c341300552..cbf24bcd1b8d 100644
--- a/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt
+++ b/Documentation/devicetree/bindings/sound/st,stm32-i2s.txt
@@ -18,7 +18,7 @@ Required properties:
See Documentation/devicetree/bindings/dma/stm32-dma.txt.
- dma-names: Identifier for each DMA request line. Must be "tx" and "rx".
- pinctrl-names: should contain only value "default"
- - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
+ - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
Optional properties:
- resets: Reference to a reset controller asserting the reset controller
diff --git a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt
index 3f4467ff0aa2..944743dd9212 100644
--- a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt
+++ b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt
@@ -41,7 +41,7 @@ SAI subnodes required properties:
"tx": if sai sub-block is configured as playback DAI
"rx": if sai sub-block is configured as capture DAI
- pinctrl-names: should contain only value "default"
- - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt
+ - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.yaml
SAI subnodes Optional properties:
- st,sync: specify synchronization mode.
diff --git a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt b/Documentation/devicetree/bindings/sound/sun4i-i2s.txt
deleted file mode 100644
index 61e71c1729e0..000000000000
--- a/Documentation/devicetree/bindings/sound/sun4i-i2s.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-* Allwinner A10 I2S controller
-
-The I2S bus (Inter-IC sound bus) is a serial link for digital
-audio data transfer between devices in the system.
-
-Required properties:
-
-- compatible: should be one of the following:
- - "allwinner,sun4i-a10-i2s"
- - "allwinner,sun6i-a31-i2s"
- - "allwinner,sun8i-a83t-i2s"
- - "allwinner,sun8i-h3-i2s"
- - "allwinner,sun50i-a64-codec-i2s"
-- reg: physical base address of the controller and length of memory mapped
- region.
-- interrupts: should contain the I2S interrupt.
-- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
- Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: should include "tx" and "rx".
-- clocks: a list of phandle + clock-specifer pairs, one for each entry in clock-names.
-- clock-names: should contain the following:
- - "apb" : clock for the I2S bus interface
- - "mod" : module clock for the I2S controller
-- #sound-dai-cells : Must be equal to 0
-
-Required properties for the following compatibles:
- - "allwinner,sun6i-a31-i2s"
- - "allwinner,sun8i-a83t-i2s"
- - "allwinner,sun8i-h3-i2s"
- - "allwinner,sun50i-a64-codec-i2s"
-- resets: phandle to the reset line for this codec
-
-Example:
-
-i2s0: i2s@1c22400 {
- #sound-dai-cells = <0>;
- compatible = "allwinner,sun4i-a10-i2s";
- reg = <0x01c22400 0x400>;
- interrupts = <GIC_SPI 16 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&apb0_gates 3>, <&i2s0_clk>;
- clock-names = "apb", "mod";
- dmas = <&dma SUN4I_DMA_NORMAL 3>,
- <&dma SUN4I_DMA_NORMAL 3>;
- dma-names = "rx", "tx";
-};
diff --git a/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt b/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt
deleted file mode 100644
index 0c64a209c2e9..000000000000
--- a/Documentation/devicetree/bindings/sound/sunxi,sun4i-spdif.txt
+++ /dev/null
@@ -1,42 +0,0 @@
-Allwinner Sony/Philips Digital Interface Format (S/PDIF) Controller
-
-The Allwinner S/PDIF audio block is a transceiver that allows the
-processor to receive and transmit digital audio via an coaxial cable or
-a fibre cable.
-For now only playback is supported.
-
-Required properties:
-
- - compatible : should be one of the following:
- - "allwinner,sun4i-a10-spdif": for the Allwinner A10 SoC
- - "allwinner,sun6i-a31-spdif": for the Allwinner A31 SoC
- - "allwinner,sun8i-h3-spdif": for the Allwinner H3 SoC
-
- - reg : Offset and length of the register set for the device.
-
- - interrupts : Contains the spdif interrupt.
-
- - dmas : Generic dma devicetree binding as described in
- Documentation/devicetree/bindings/dma/dma.txt.
-
- - dma-names : Two dmas have to be defined, "tx" and "rx".
-
- - clocks : Contains an entry for each entry in clock-names.
-
- - clock-names : Includes the following entries:
- "apb" clock for the spdif bus.
- "spdif" clock for spdif controller.
-
- - resets : reset specifier for the ahb reset (A31 and newer only)
-
-Example:
-
-spdif: spdif@1c21000 {
- compatible = "allwinner,sun4i-a10-spdif";
- reg = <0x01c21000 0x40>;
- interrupts = <13>;
- clocks = <&apb0_gates 1>, <&spdif_clk>;
- clock-names = "apb", "spdif";
- dmas = <&dma 0 2>, <&dma 0 2>;
- dma-names = "rx", "tx";
-};
diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml
new file mode 100644
index 000000000000..6d1329c28170
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/allwinner,sun4i-a10-spi.yaml
@@ -0,0 +1,87 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/allwinner,sun4i-a10-spi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A10 SPI Controller Device Tree Bindings
+
+allOf:
+ - $ref: "spi-controller.yaml"
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#address-cells": true
+ "#size-cells": true
+
+ compatible:
+ const: allwinner,sun4i-a10-spi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+
+ clock-names:
+ items:
+ - const: ahb
+ - const: mod
+
+ dmas:
+ items:
+ - description: RX DMA Channel
+ - description: TX DMA Channel
+
+ dma-names:
+ items:
+ - const: rx
+ - const: tx
+
+ num-cs: true
+
+patternProperties:
+ "^.*@[0-9a-f]+":
+ type: object
+ properties:
+ reg:
+ items:
+ minimum: 0
+ maximum: 4
+
+ spi-rx-bus-width:
+ const: 1
+
+ spi-tx-bus-width:
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ spi1: spi@1c06000 {
+ compatible = "allwinner,sun4i-a10-spi";
+ reg = <0x01c06000 0x1000>;
+ interrupts = <11>;
+ clocks = <&ahb_gates 21>, <&spi1_clk>;
+ clock-names = "ahb", "mod";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml
new file mode 100644
index 000000000000..f36c46d236d7
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/allwinner,sun6i-a31-spi.yaml
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/allwinner,sun6i-a31-spi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Allwinner A31 SPI Controller Device Tree Bindings
+
+allOf:
+ - $ref: "spi-controller.yaml"
+
+maintainers:
+ - Chen-Yu Tsai <wens@csie.org>
+ - Maxime Ripard <maxime.ripard@bootlin.com>
+
+properties:
+ "#address-cells": true
+ "#size-cells": true
+
+ compatible:
+ enum:
+ - allwinner,sun6i-a31-spi
+ - allwinner,sun8i-h3-spi
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ items:
+ - description: Bus Clock
+ - description: Module Clock
+
+ clock-names:
+ items:
+ - const: ahb
+ - const: mod
+
+ resets:
+ maxItems: 1
+
+ dmas:
+ items:
+ - description: RX DMA Channel
+ - description: TX DMA Channel
+
+ dma-names:
+ items:
+ - const: rx
+ - const: tx
+
+ num-cs: true
+
+patternProperties:
+ "^.*@[0-9a-f]+":
+ type: object
+ properties:
+ reg:
+ items:
+ minimum: 0
+ maximum: 4
+
+ spi-rx-bus-width:
+ const: 1
+
+ spi-tx-bus-width:
+ const: 1
+
+required:
+ - compatible
+ - reg
+ - interrupts
+ - clocks
+ - clock-names
+
+additionalProperties: false
+
+examples:
+ - |
+ spi1: spi@1c69000 {
+ compatible = "allwinner,sun6i-a31-spi";
+ reg = <0x01c69000 0x1000>;
+ interrupts = <0 66 4>;
+ clocks = <&ahb1_gates 21>, <&spi1_clk>;
+ clock-names = "ahb", "mod";
+ resets = <&ahb1_rst 21>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+ - |
+ spi0: spi@1c68000 {
+ compatible = "allwinner,sun8i-h3-spi";
+ reg = <0x01c68000 0x1000>;
+ interrupts = <0 65 4>;
+ clocks = <&ccu 30>, <&ccu 82>;
+ clock-names = "ahb", "mod";
+ dmas = <&dma 23>, <&dma 23>;
+ dma-names = "rx", "tx";
+ resets = <&ccu 15>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/spi/spi-bus.txt b/Documentation/devicetree/bindings/spi/spi-bus.txt
index 1f6e86f787ef..e07783505498 100644
--- a/Documentation/devicetree/bindings/spi/spi-bus.txt
+++ b/Documentation/devicetree/bindings/spi/spi-bus.txt
@@ -1,111 +1 @@
-SPI (Serial Peripheral Interface) busses
-
-SPI busses can be described with a node for the SPI controller device
-and a set of child nodes for each SPI slave on the bus. The system's SPI
-controller may be described for use in SPI master mode or in SPI slave mode,
-but not for both at the same time.
-
-The SPI controller node requires the following properties:
-- compatible - Name of SPI bus controller following generic names
- recommended practice.
-
-In master mode, the SPI controller node requires the following additional
-properties:
-- #address-cells - number of cells required to define a chip select
- address on the SPI bus.
-- #size-cells - should be zero.
-
-In slave mode, the SPI controller node requires one additional property:
-- spi-slave - Empty property.
-
-No other properties are required in the SPI bus node. It is assumed
-that a driver for an SPI bus device will understand that it is an SPI bus.
-However, the binding does not attempt to define the specific method for
-assigning chip select numbers. Since SPI chip select configuration is
-flexible and non-standardized, it is left out of this binding with the
-assumption that board specific platform code will be used to manage
-chip selects. Individual drivers can define additional properties to
-support describing the chip select layout.
-
-Optional properties (master mode only):
-- cs-gpios - gpios chip select.
-- num-cs - total number of chipselects.
-
-If cs-gpios is used the number of chip selects will be increased automatically
-with max(cs-gpios > hw cs).
-
-So if for example the controller has 2 CS lines, and the cs-gpios
-property looks like this:
-
-cs-gpios = <&gpio1 0 0>, <0>, <&gpio1 1 0>, <&gpio1 2 0>;
-
-Then it should be configured so that num_chipselect = 4 with the
-following mapping:
-
-cs0 : &gpio1 0 0
-cs1 : native
-cs2 : &gpio1 1 0
-cs3 : &gpio1 2 0
-
-
-SPI slave nodes must be children of the SPI controller node.
-
-In master mode, one or more slave nodes (up to the number of chip selects) can
-be present. Required properties are:
-- compatible - Name of SPI device following generic names recommended
- practice.
-- reg - Chip select address of device.
-- spi-max-frequency - Maximum SPI clocking speed of device in Hz.
-
-In slave mode, the (single) slave node is optional.
-If present, it must be called "slave". Required properties are:
-- compatible - Name of SPI device following generic names recommended
- practice.
-
-All slave nodes can contain the following optional properties:
-- spi-cpol - Empty property indicating device requires inverse clock
- polarity (CPOL) mode.
-- spi-cpha - Empty property indicating device requires shifted clock
- phase (CPHA) mode.
-- spi-cs-high - Empty property indicating device requires chip select
- active high.
-- spi-3wire - Empty property indicating device requires 3-wire mode.
-- spi-lsb-first - Empty property indicating device requires LSB first mode.
-- spi-tx-bus-width - The bus width (number of data wires) that is used for MOSI.
- Defaults to 1 if not present.
-- spi-rx-bus-width - The bus width (number of data wires) that is used for MISO.
- Defaults to 1 if not present.
-- spi-rx-delay-us - Microsecond delay after a read transfer.
-- spi-tx-delay-us - Microsecond delay after a write transfer.
-
-Some SPI controllers and devices support Dual and Quad SPI transfer mode.
-It allows data in the SPI system to be transferred using 2 wires (DUAL) or 4
-wires (QUAD).
-Now the value that spi-tx-bus-width and spi-rx-bus-width can receive is
-only 1 (SINGLE), 2 (DUAL) and 4 (QUAD).
-Dual/Quad mode is not allowed when 3-wire mode is used.
-
-If a gpio chipselect is used for the SPI slave the gpio number will be passed
-via the SPI master node cs-gpios property.
-
-SPI example for an MPC5200 SPI bus:
- spi@f00 {
- #address-cells = <1>;
- #size-cells = <0>;
- compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
- reg = <0xf00 0x20>;
- interrupts = <2 13 0 2 14 0>;
- interrupt-parent = <&mpc5200_pic>;
-
- ethernet-switch@0 {
- compatible = "micrel,ks8995m";
- spi-max-frequency = <1000000>;
- reg = <0>;
- };
-
- codec@1 {
- compatible = "ti,tlv320aic26";
- spi-max-frequency = <100000>;
- reg = <1>;
- };
- };
+This file has moved to spi-controller.yaml.
diff --git a/Documentation/devicetree/bindings/spi/spi-controller.yaml b/Documentation/devicetree/bindings/spi/spi-controller.yaml
new file mode 100644
index 000000000000..876c0623f322
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-controller.yaml
@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/spi-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SPI Controller Generic Binding
+
+maintainers:
+ - Mark Brown <broonie@kernel.org>
+
+description: |
+ SPI busses can be described with a node for the SPI controller device
+ and a set of child nodes for each SPI slave on the bus. The system SPI
+ controller may be described for use in SPI master mode or in SPI slave mode,
+ but not for both at the same time.
+
+properties:
+ $nodename:
+ pattern: "^spi(@.*|-[0-9a-f])*$"
+
+ "#address-cells":
+ const: 1
+
+ "#size-cells":
+ const: 0
+
+ cs-gpios:
+ description: |
+ GPIOs used as chip selects.
+ If that property is used, the number of chip selects will be
+ increased automatically with max(cs-gpios, hardware chip selects).
+
+ So if, for example, the controller has 2 CS lines, and the
+ cs-gpios looks like this
+ cs-gpios = <&gpio1 0 0>, <0>, <&gpio1 1 0>, <&gpio1 2 0>;
+
+ Then it should be configured so that num_chipselect = 4, with
+ the following mapping
+ cs0 : &gpio1 0 0
+ cs1 : native
+ cs2 : &gpio1 1 0
+ cs3 : &gpio1 2 0
+
+ num-cs:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Total number of chip selects.
+
+ spi-slave:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The SPI controller acts as a slave, instead of a master.
+
+patternProperties:
+ "^slave$":
+ type: object
+
+ properties:
+ compatible:
+ description:
+ Compatible of the SPI device.
+
+ required:
+ - compatible
+
+ "^.*@[0-9a-f]+$":
+ type: object
+
+ properties:
+ compatible:
+ description:
+ Compatible of the SPI device.
+
+ reg:
+ maxItems: 1
+ minimum: 0
+ maximum: 256
+ description:
+ Chip select used by the device.
+
+ spi-3wire:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The device requires 3-wire mode.
+
+ spi-cpha:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The device requires shifted clock phase (CPHA) mode.
+
+ spi-cpol:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The device requires inverse clock polarity (CPOL) mode.
+
+ spi-cs-high:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The device requires the chip select active high.
+
+ spi-lsb-first:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ The device requires the LSB first mode.
+
+ spi-max-frequency:
+ $ref: /schemas/types.yaml#/definitions/uint32
+ description:
+ Maximum SPI clocking speed of the device in Hz.
+
+ spi-rx-bus-width:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - enum: [ 1, 2, 4 ]
+ - default: 1
+ description:
+ Bus width to the SPI bus used for MISO.
+
+ spi-rx-delay-us:
+ description:
+ Delay, in microseconds, after a read transfer.
+
+ spi-tx-bus-width:
+ allOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32
+ - enum: [ 1, 2, 4 ]
+ - default: 1
+ description:
+ Bus width to the SPI bus used for MOSI.
+
+ spi-tx-delay-us:
+ description:
+ Delay, in microseconds, after a write transfer.
+
+ required:
+ - compatible
+ - reg
+
+examples:
+ - |
+ spi@f00 {
+ #address-cells = <1>;
+ #size-cells = <0>;
+ compatible = "fsl,mpc5200b-spi","fsl,mpc5200-spi";
+ reg = <0xf00 0x20>;
+ interrupts = <2 13 0 2 14 0>;
+ interrupt-parent = <&mpc5200_pic>;
+
+ ethernet-switch@0 {
+ compatible = "micrel,ks8995m";
+ spi-max-frequency = <1000000>;
+ reg = <0>;
+ };
+
+ codec@1 {
+ compatible = "ti,tlv320aic26";
+ spi-max-frequency = <100000>;
+ reg = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/spi/spi-gpio.txt b/Documentation/devicetree/bindings/spi/spi-gpio.txt
deleted file mode 100644
index 52db562f17a4..000000000000
--- a/Documentation/devicetree/bindings/spi/spi-gpio.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-SPI-GPIO devicetree bindings
-
-This represents a group of 3-n GPIO lines used for bit-banged SPI on dedicated
-GPIO lines.
-
-Required properties:
-
- - compatible: should be set to "spi-gpio"
- - #address-cells: should be set to <0x1>
- - ranges
- - sck-gpios: GPIO spec for the SCK line to use
- - miso-gpios: GPIO spec for the MISO line to use
- - mosi-gpios: GPIO spec for the MOSI line to use
- - cs-gpios: GPIOs to use for chipselect lines.
- Not needed if num-chipselects = <0>.
- - num-chipselects: Number of chipselect lines. Should be <0> if a single device
- with no chip select is connected.
-
-Deprecated bindings:
-
-These legacy GPIO line bindings can alternatively be used to define the
-GPIO lines used, they should not be used in new device trees.
-
- - gpio-sck: GPIO spec for the SCK line to use
- - gpio-miso: GPIO spec for the MISO line to use
- - gpio-mosi: GPIO spec for the MOSI line to use
-
-Example:
-
- spi {
- compatible = "spi-gpio";
- #address-cells = <0x1>;
- ranges;
-
- sck-gpios = <&gpio 95 0>;
- miso-gpios = <&gpio 98 0>;
- mosi-gpios = <&gpio 97 0>;
- cs-gpios = <&gpio 125 0>;
- num-chipselects = <1>;
-
- /* clients */
- };
-
diff --git a/Documentation/devicetree/bindings/spi/spi-gpio.yaml b/Documentation/devicetree/bindings/spi/spi-gpio.yaml
new file mode 100644
index 000000000000..55c4f1705f07
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-gpio.yaml
@@ -0,0 +1,72 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/spi-gpio.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SPI-GPIO devicetree bindings
+
+maintainers:
+ - Rob Herring <robh@kernel.org>
+
+description:
+ This represents a group of 3-n GPIO lines used for bit-banged SPI on
+ dedicated GPIO lines.
+
+allOf:
+ - $ref: "/schemas/spi/spi-controller.yaml#"
+
+properties:
+ compatible:
+ const: spi-gpio
+
+ sck-gpios:
+ description: GPIO spec for the SCK line to use
+ maxItems: 1
+
+ miso-gpios:
+ description: GPIO spec for the MISO line to use
+ maxItems: 1
+
+ mosi-gpios:
+ description: GPIO spec for the MOSI line to use
+ maxItems: 1
+
+ cs-gpios:
+ description: GPIOs to use for chipselect lines.
+ Not needed if num-chipselects = <0>.
+ minItems: 1
+ maxItems: 1024
+
+ num-chipselects:
+ description: Number of chipselect lines. Should be <0> if a single device
+ with no chip select is connected.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ # Deprecated properties
+ gpio-sck: false
+ gpio-miso: false
+ gpio-mosi: false
+
+required:
+ - compatible
+ - num-chipselects
+ - sck-gpios
+
+examples:
+ - |
+ spi {
+ compatible = "spi-gpio";
+ #address-cells = <0x1>;
+ #size-cells = <0x0>;
+
+ sck-gpios = <&gpio 95 0>;
+ miso-gpios = <&gpio 98 0>;
+ mosi-gpios = <&gpio 97 0>;
+ cs-gpios = <&gpio 125 0>;
+ num-chipselects = <1>;
+
+ /* clients */
+ };
+
+...
diff --git a/Documentation/devicetree/bindings/spi/spi-pl022.yaml b/Documentation/devicetree/bindings/spi/spi-pl022.yaml
new file mode 100644
index 000000000000..dfb697c69341
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-pl022.yaml
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: GPL-2.0
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/spi-pl022.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM PL022 SPI controller
+
+maintainers:
+ - Linus Walleij <linus.walleij@linaro.org>
+
+allOf:
+ - $ref: "spi-controller.yaml#"
+
+# We need a select here so we don't match all nodes with 'arm,primecell'
+select:
+ properties:
+ compatible:
+ contains:
+ const: arm,pl022
+ required:
+ - compatible
+
+properties:
+ compatible:
+ items:
+ - const: arm,pl022
+ - const: arm,primecell
+
+ reg:
+ maxItems: 1
+
+ interrupts:
+ maxItems: 1
+
+ clocks:
+ maxItems: 2
+
+ clock-names:
+ items:
+ - enum:
+ - SSPCLK
+ - sspclk
+ - const: apb_pclk
+
+ pl022,autosuspend-delay:
+ description: delay in ms following transfer completion before the
+ runtime power management system suspends the device. A setting of 0
+ indicates no delay and the device will be suspended immediately.
+ $ref: "/schemas/types.yaml#/definitions/uint32"
+
+ pl022,rt:
+ description: indicates the controller should run the message pump with realtime
+ priority to minimise the transfer latency on the bus (boolean)
+ type: boolean
+
+ dmas:
+ description:
+ Two or more DMA channel specifiers following the convention outlined
+ in bindings/dma/dma.txt
+ minItems: 2
+ maxItems: 32
+
+ dma-names:
+ description:
+ There must be at least one channel named "tx" for transmit and named "rx"
+ for receive.
+ minItems: 2
+ maxItems: 32
+ additionalItems: true
+ items:
+ - const: rx
+ - const: tx
+
+patternProperties:
+ "^[a-zA-Z][a-zA-Z0-9,+\\-._]{0,63}@[0-9a-f]+$":
+ type: object
+ # SPI slave nodes must be children of the SPI master node and can
+ # contain the following properties.
+ properties:
+ pl022,interface:
+ description: SPI interface type
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - enum:
+ - 0 # SPI
+ - 1 # Texas Instruments Synchronous Serial Frame Format
+ - 2 # Microwire (Half Duplex)
+
+ pl022,com-mode:
+ description: Specifies the transfer mode
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - enum:
+ - 0 # interrupt mode
+ - 1 # polling mode
+ - 2 # DMA mode
+ default: 1
+
+ pl022,rx-level-trig:
+ description: Rx FIFO watermark level
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 4
+
+ pl022,tx-level-trig:
+ description: Tx FIFO watermark level
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0
+ maximum: 4
+
+ pl022,ctrl-len:
+ description: Microwire interface - Control length
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - minimum: 0x03
+ maximum: 0x1f
+
+ pl022,wait-state:
+ description: Microwire interface - Wait state
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - enum: [ 0, 1 ]
+
+ pl022,duplex:
+ description: Microwire interface - Full/Half duplex
+ allOf:
+ - $ref: "/schemas/types.yaml#/definitions/uint32"
+ - enum: [ 0, 1 ]
+
+required:
+ - compatible
+ - reg
+ - interrupts
+
+examples:
+ - |
+ spi@e0100000 {
+ compatible = "arm,pl022", "arm,primecell";
+ reg = <0xe0100000 0x1000>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ interrupts = <0 31 0x4>;
+ dmas = <&dma_controller 23 1>,
+ <&dma_controller 24 0>;
+ dma-names = "rx", "tx";
+
+ m25p80@1 {
+ compatible = "st,m25p80";
+ reg = <1>;
+ spi-max-frequency = <12000000>;
+ spi-cpol;
+ spi-cpha;
+ pl022,interface = <0>;
+ pl022,com-mode = <0x2>;
+ pl022,rx-level-trig = <0>;
+ pl022,tx-level-trig = <0>;
+ pl022,ctrl-len = <0x11>;
+ pl022,wait-state = <0>;
+ pl022,duplex = <0>;
+ };
+ };
+...
diff --git a/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt b/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt
index adeeb63e84b9..bfc038b9478d 100644
--- a/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt
+++ b/Documentation/devicetree/bindings/spi/spi-stm32-qspi.txt
@@ -19,8 +19,11 @@ Required properties:
- reg: chip-Select number (QSPI controller may connect 2 flashes)
- spi-max-frequency: max frequency of spi bus
-Optional property:
+Optional properties:
- spi-rx-bus-width: see ./spi-bus.txt for the description
+- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
+Documentation/devicetree/bindings/dma/dma.txt.
+- dma-names: DMA request names should include "tx" and "rx" if present.
Example:
diff --git a/Documentation/devicetree/bindings/spi/spi-sun4i.txt b/Documentation/devicetree/bindings/spi/spi-sun4i.txt
deleted file mode 100644
index c75d604a8290..000000000000
--- a/Documentation/devicetree/bindings/spi/spi-sun4i.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-Allwinner A10 SPI controller
-
-Required properties:
-- compatible: Should be "allwinner,sun4-a10-spi".
-- reg: Should contain register location and length.
-- interrupts: Should contain interrupt.
-- clocks: phandle to the clocks feeding the SPI controller. Two are
- needed:
- - "ahb": the gated AHB parent clock
- - "mod": the parent module clock
-- clock-names: Must contain the clock names described just above
-
-Example:
-
-spi1: spi@1c06000 {
- compatible = "allwinner,sun4i-a10-spi";
- reg = <0x01c06000 0x1000>;
- interrupts = <11>;
- clocks = <&ahb_gates 21>, <&spi1_clk>;
- clock-names = "ahb", "mod";
- #address-cells = <1>;
- #size-cells = <0>;
-};
diff --git a/Documentation/devicetree/bindings/spi/spi-sun6i.txt b/Documentation/devicetree/bindings/spi/spi-sun6i.txt
deleted file mode 100644
index 435a8e0731ac..000000000000
--- a/Documentation/devicetree/bindings/spi/spi-sun6i.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-Allwinner A31/H3 SPI controller
-
-Required properties:
-- compatible: Should be "allwinner,sun6i-a31-spi" or "allwinner,sun8i-h3-spi".
-- reg: Should contain register location and length.
-- interrupts: Should contain interrupt.
-- clocks: phandle to the clocks feeding the SPI controller. Two are
- needed:
- - "ahb": the gated AHB parent clock
- - "mod": the parent module clock
-- clock-names: Must contain the clock names described just above
-- resets: phandle to the reset controller asserting this device in
- reset
-
-Optional properties:
-- dmas: DMA specifiers for rx and tx dma. See the DMA client binding,
- Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: DMA request names should include "rx" and "tx" if present.
-
-Example:
-
-spi1: spi@1c69000 {
- compatible = "allwinner,sun6i-a31-spi";
- reg = <0x01c69000 0x1000>;
- interrupts = <0 66 4>;
- clocks = <&ahb1_gates 21>, <&spi1_clk>;
- clock-names = "ahb", "mod";
- resets = <&ahb1_rst 21>;
-};
-
-spi0: spi@1c68000 {
- compatible = "allwinner,sun8i-h3-spi";
- reg = <0x01c68000 0x1000>;
- interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&ccu CLK_BUS_SPI0>, <&ccu CLK_SPI0>;
- clock-names = "ahb", "mod";
- dmas = <&dma 23>, <&dma 23>;
- dma-names = "rx", "tx";
- pinctrl-names = "default";
- pinctrl-0 = <&spi0_pins>;
- resets = <&ccu RST_BUS_SPI0>;
- #address-cells = <1>;
- #size-cells = <0>;
-};
diff --git a/Documentation/devicetree/bindings/spi/spi-synquacer.txt b/Documentation/devicetree/bindings/spi/spi-synquacer.txt
new file mode 100644
index 000000000000..291dfa692d0a
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/spi-synquacer.txt
@@ -0,0 +1,27 @@
+* Socionext Synquacer HS-SPI bindings
+
+Required Properties:
+- compatible: should be "socionext,synquacer-spi"
+- reg: physical base address of the controller and length of memory mapped
+ region.
+- interrupts: should contain the "spi_rx", "spi_tx" and "spi_fault" interrupts.
+- clocks: core clock iHCLK. Optional rate clock iPCLK (default is iHCLK)
+- clock-names: Shall be "iHCLK" and "iPCLK" respectively
+
+Optional Properties:
+- socionext,use-rtm: boolean, if required to use "retimed clock" for RX
+- socionext,set-aces: boolean, if same active clock edges field to be set.
+
+Example:
+
+ spi0: spi@ff110000 {
+ compatible = "socionext,synquacer-spi";
+ reg = <0xff110000 0x1000>;
+ interrupts = <GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 161 IRQ_TYPE_LEVEL_HIGH>,
+ <GIC_SPI 162 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&clk_hsspi>;
+ clock-names = "iHCLK";
+ socionext,use-rtm;
+ socionext,set-aces;
+ };
diff --git a/Documentation/devicetree/bindings/spi/spi_pl022.txt b/Documentation/devicetree/bindings/spi/spi_pl022.txt
deleted file mode 100644
index 7638b4968ddb..000000000000
--- a/Documentation/devicetree/bindings/spi/spi_pl022.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-ARM PL022 SPI controller
-
-Required properties:
-- compatible : "arm,pl022", "arm,primecell"
-- reg : Offset and length of the register set for the device
-- interrupts : Should contain SPI controller interrupt
-- num-cs : total number of chipselects
-
-Optional properties:
-- cs-gpios : should specify GPIOs used for chipselects.
- The gpios will be referred to as reg = <index> in the SPI child nodes.
- If unspecified, a single SPI device without a chip select can be used.
-- pl022,autosuspend-delay : delay in ms following transfer completion before
- the runtime power management system suspends the
- device. A setting of 0 indicates no delay and the
- device will be suspended immediately
-- pl022,rt : indicates the controller should run the message pump with realtime
- priority to minimise the transfer latency on the bus (boolean)
-- dmas : Two or more DMA channel specifiers following the convention outlined
- in bindings/dma/dma.txt
-- dma-names: Names for the dma channels, if present. There must be at
- least one channel named "tx" for transmit and named "rx" for
- receive.
-
-
-SPI slave nodes must be children of the SPI master node and can
-contain the following properties.
-
-- pl022,interface : interface type:
- 0: SPI
- 1: Texas Instruments Synchronous Serial Frame Format
- 2: Microwire (Half Duplex)
-- pl022,com-mode : specifies the transfer mode:
- 0: interrupt mode
- 1: polling mode (default mode if property not present)
- 2: DMA mode
-- pl022,rx-level-trig : Rx FIFO watermark level
-- pl022,tx-level-trig : Tx FIFO watermark level
-- pl022,ctrl-len : Microwire interface: Control length
-- pl022,wait-state : Microwire interface: Wait state
-- pl022,duplex : Microwire interface: Full/Half duplex
-
-
-Example:
-
- spi@e0100000 {
- compatible = "arm,pl022", "arm,primecell";
- reg = <0xe0100000 0x1000>;
- #address-cells = <1>;
- #size-cells = <0>;
- interrupts = <0 31 0x4>;
- dmas = <&dma-controller 23 1>,
- <&dma-controller 24 0>;
- dma-names = "rx", "tx";
-
- m25p80@1 {
- compatible = "st,m25p80";
- reg = <1>;
- spi-max-frequency = <12000000>;
- spi-cpol;
- spi-cpha;
- pl022,interface = <0>;
- pl022,com-mode = <0x2>;
- pl022,rx-level-trig = <0>;
- pl022,tx-level-trig = <0>;
- pl022,ctrl-len = <0x11>;
- pl022,wait-state = <0>;
- pl022,duplex = <0>;
- };
- };
diff --git a/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.txt b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.txt
new file mode 100644
index 000000000000..d57659996d62
--- /dev/null
+++ b/Documentation/devicetree/bindings/timer/nxp,sysctr-timer.txt
@@ -0,0 +1,25 @@
+NXP System Counter Module(sys_ctr)
+
+The system counter(sys_ctr) is a programmable system counter which provides
+a shared time base to Cortex A15, A7, A53, A73, etc. it is intended for use in
+applications where the counter is always powered and support multiple,
+unrelated clocks. The compare frame inside can be used for timer purpose.
+
+Required properties:
+
+- compatible : should be "nxp,sysctr-timer"
+- reg : Specifies the base physical address and size of the comapre
+ frame and the counter control, read & compare.
+- interrupts : should be the first compare frames' interrupt
+- clocks : Specifies the counter clock.
+- clock-names: Specifies the clock's name of this module
+
+Example:
+
+ system_counter: timer@306a0000 {
+ compatible = "nxp,sysctr-timer";
+ reg = <0x306a0000 0x20000>;/* system-counter-rd & compare */
+ clocks = <&clk_8m>;
+ clock-names = "per";
+ interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
+ };
diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.txt b/Documentation/devicetree/bindings/timer/renesas,cmt.txt
index c0594450e9ef..c5220bcd852b 100644
--- a/Documentation/devicetree/bindings/timer/renesas,cmt.txt
+++ b/Documentation/devicetree/bindings/timer/renesas,cmt.txt
@@ -42,12 +42,18 @@ Required Properties:
- "renesas,r8a7793-cmt1" for the 48-bit CMT1 device included in r8a7793.
- "renesas,r8a7794-cmt0" for the 32-bit CMT0 device included in r8a7794.
- "renesas,r8a7794-cmt1" for the 48-bit CMT1 device included in r8a7794.
+ - "renesas,r8a7795-cmt0" for the 32-bit CMT0 device included in r8a7795.
+ - "renesas,r8a7795-cmt1" for the 48-bit CMT1 device included in r8a7795.
- "renesas,r8a7796-cmt0" for the 32-bit CMT0 device included in r8a7796.
- "renesas,r8a7796-cmt1" for the 48-bit CMT1 device included in r8a7796.
+ - "renesas,r8a77965-cmt0" for the 32-bit CMT0 device included in r8a77965.
+ - "renesas,r8a77965-cmt1" for the 48-bit CMT1 device included in r8a77965.
- "renesas,r8a77970-cmt0" for the 32-bit CMT0 device included in r8a77970.
- "renesas,r8a77970-cmt1" for the 48-bit CMT1 device included in r8a77970.
- "renesas,r8a77980-cmt0" for the 32-bit CMT0 device included in r8a77980.
- "renesas,r8a77980-cmt1" for the 48-bit CMT1 device included in r8a77980.
+ - "renesas,r8a77990-cmt0" for the 32-bit CMT0 device included in r8a77990.
+ - "renesas,r8a77990-cmt1" for the 48-bit CMT1 device included in r8a77990.
- "renesas,rcar-gen2-cmt0" for 32-bit CMT0 devices included in R-Car Gen2
and RZ/G1.
diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml
index 747fd3f689dc..2e742d399e87 100644
--- a/Documentation/devicetree/bindings/trivial-devices.yaml
+++ b/Documentation/devicetree/bindings/trivial-devices.yaml
@@ -52,6 +52,10 @@ properties:
- at,24c08
# i2c trusted platform module (TPM)
- atmel,at97sc3204t
+ # i2c h/w symmetric crypto module
+ - atmel,atsha204a
+ # i2c h/w elliptic curve crypto module
+ - atmel,atecc508a
# CM32181: Ambient Light Sensor
- capella,cm32181
# CM3232: Ambient Light Sensor
diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt
index 49eac0dc86b0..aafff3a6904d 100644
--- a/Documentation/devicetree/bindings/usb/dwc2.txt
+++ b/Documentation/devicetree/bindings/usb/dwc2.txt
@@ -42,6 +42,8 @@ Refer to phy/phy-bindings.txt for generic phy consumer properties
- g-rx-fifo-size: size of rx fifo size in gadget mode.
- g-np-tx-fifo-size: size of non-periodic tx fifo size in gadget mode.
- g-tx-fifo-size: size of periodic tx fifo per endpoint (except ep0) in gadget mode.
+- snps,need-phy-for-wake: If present indicates that the phy needs to be left
+ on for remote wakeup during suspend.
- snps,reset-phy-on-wake: If present indicates that we need to reset the PHY when
we detect a wakeup. This is due to a hardware errata.
@@ -58,4 +60,5 @@ Example:
clock-names = "otg";
phys = <&usbphy>;
phy-names = "usb2-phy";
+ snps,need-phy-for-wake;
};
diff --git a/Documentation/devicetree/bindings/usb/dwc3.txt b/Documentation/devicetree/bindings/usb/dwc3.txt
index 8e5265e9f658..66780a47ad85 100644
--- a/Documentation/devicetree/bindings/usb/dwc3.txt
+++ b/Documentation/devicetree/bindings/usb/dwc3.txt
@@ -64,6 +64,8 @@ Optional properties:
- snps,dis_u2_susphy_quirk: when set core will disable USB2 suspend phy.
- snps,dis_enblslpm_quirk: when set clears the enblslpm in GUSB2PHYCFG,
disabling the suspend signal to the PHY.
+ - snps,dis-u1-entry-quirk: set if link entering into U1 needs to be disabled.
+ - snps,dis-u2-entry-quirk: set if link entering into U2 needs to be disabled.
- snps,dis_rxdet_inp3_quirk: when set core will disable receiver detection
in PHY P3 power state.
- snps,dis-u2-freeclk-exists-quirk: when set, clear the u2_freeclk_exists
diff --git a/Documentation/devicetree/bindings/usb/generic-ehci.yaml b/Documentation/devicetree/bindings/usb/generic-ehci.yaml
index d3b4f6415920..059f6ef1ad4a 100644
--- a/Documentation/devicetree/bindings/usb/generic-ehci.yaml
+++ b/Documentation/devicetree/bindings/usb/generic-ehci.yaml
@@ -74,7 +74,7 @@ additionalProperties: false
examples:
- |
- ehci@e0000300 {
+ usb@e0000300 {
compatible = "ibm,usb-ehci-440epx", "generic-ehci";
interrupt-parent = <&UIC0>;
interrupts = <0x1a 4>;
@@ -89,7 +89,6 @@ examples:
interrupts = <39>;
clocks = <&ahb_gates 1>;
phys = <&usbphy 1>;
- phy-names = "usb";
};
...
diff --git a/Documentation/devicetree/bindings/usb/renesas_usb3.txt b/Documentation/devicetree/bindings/usb/renesas,usb3.txt
index 35039e720515..35039e720515 100644
--- a/Documentation/devicetree/bindings/usb/renesas_usb3.txt
+++ b/Documentation/devicetree/bindings/usb/renesas,usb3.txt
diff --git a/Documentation/devicetree/bindings/usb/renesas,usbhs.txt b/Documentation/devicetree/bindings/usb/renesas,usbhs.txt
new file mode 100644
index 000000000000..e39255ea6e4f
--- /dev/null
+++ b/Documentation/devicetree/bindings/usb/renesas,usbhs.txt
@@ -0,0 +1,57 @@
+Renesas Electronics USBHS driver
+
+Required properties:
+ - compatible: Must contain one or more of the following:
+
+ - "renesas,usbhs-r8a7743" for r8a7743 (RZ/G1M) compatible device
+ - "renesas,usbhs-r8a7744" for r8a7744 (RZ/G1N) compatible device
+ - "renesas,usbhs-r8a7745" for r8a7745 (RZ/G1E) compatible device
+ - "renesas,usbhs-r8a77470" for r8a77470 (RZ/G1C) compatible device
+ - "renesas,usbhs-r8a774a1" for r8a774a1 (RZ/G2M) compatible device
+ - "renesas,usbhs-r8a774c0" for r8a774c0 (RZ/G2E) compatible device
+ - "renesas,usbhs-r8a7790" for r8a7790 (R-Car H2) compatible device
+ - "renesas,usbhs-r8a7791" for r8a7791 (R-Car M2-W) compatible device
+ - "renesas,usbhs-r8a7792" for r8a7792 (R-Car V2H) compatible device
+ - "renesas,usbhs-r8a7793" for r8a7793 (R-Car M2-N) compatible device
+ - "renesas,usbhs-r8a7794" for r8a7794 (R-Car E2) compatible device
+ - "renesas,usbhs-r8a7795" for r8a7795 (R-Car H3) compatible device
+ - "renesas,usbhs-r8a7796" for r8a7796 (R-Car M3-W) compatible device
+ - "renesas,usbhs-r8a77965" for r8a77965 (R-Car M3-N) compatible device
+ - "renesas,usbhs-r8a77990" for r8a77990 (R-Car E3) compatible device
+ - "renesas,usbhs-r8a77995" for r8a77995 (R-Car D3) compatible device
+ - "renesas,usbhs-r7s72100" for r7s72100 (RZ/A1) compatible device
+ - "renesas,usbhs-r7s9210" for r7s9210 (RZ/A2) compatible device
+ - "renesas,rcar-gen2-usbhs" for R-Car Gen2 or RZ/G1 compatible devices
+ - "renesas,rcar-gen3-usbhs" for R-Car Gen3 or RZ/G2 compatible devices
+ - "renesas,rza1-usbhs" for RZ/A1 compatible device
+ - "renesas,rza2-usbhs" for RZ/A2 compatible device
+
+ When compatible with the generic version, nodes must list the
+ SoC-specific version corresponding to the platform first followed
+ by the generic version.
+
+ - reg: Base address and length of the register for the USBHS
+ - interrupts: Interrupt specifier for the USBHS
+ - clocks: A list of phandle + clock specifier pairs.
+ - In case of "renesas,rcar-gen3-usbhs", two clocks are required.
+ First clock should be peripheral and second one should be host.
+ - In case of except above, one clock is required. First clock
+ should be peripheral.
+
+Optional properties:
+ - renesas,buswait: Integer to use BUSWAIT register
+ - renesas,enable-gpio: A gpio specifier to check GPIO determining if USB
+ function should be enabled
+ - phys: phandle + phy specifier pair
+ - phy-names: must be "usb"
+ - dmas: Must contain a list of references to DMA specifiers.
+ - dma-names : named "ch%d", where %d is the channel number ranging from zero
+ to the number of channels (DnFIFOs) minus one.
+
+Example:
+ usbhs: usb@e6590000 {
+ compatible = "renesas,usbhs-r8a7790", "renesas,rcar-gen2-usbhs";
+ reg = <0 0xe6590000 0 0x100>;
+ interrupts = <0 107 IRQ_TYPE_LEVEL_HIGH>;
+ clocks = <&mstp7_clks R8A7790_CLK_HSUSB>;
+ };
diff --git a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt b/Documentation/devicetree/bindings/usb/renesas_usbhs.txt
deleted file mode 100644
index b8acc2a994a8..000000000000
--- a/Documentation/devicetree/bindings/usb/renesas_usbhs.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-Renesas Electronics USBHS driver
-
-Required properties:
- - compatible: Must contain one or more of the following:
-
- - "renesas,usbhs-r8a7743" for r8a7743 (RZ/G1M) compatible device
- - "renesas,usbhs-r8a7744" for r8a7744 (RZ/G1N) compatible device
- - "renesas,usbhs-r8a7745" for r8a7745 (RZ/G1E) compatible device
- - "renesas,usbhs-r8a77470" for r8a77470 (RZ/G1C) compatible device
- - "renesas,usbhs-r8a774a1" for r8a774a1 (RZ/G2M) compatible device
- - "renesas,usbhs-r8a774c0" for r8a774c0 (RZ/G2E) compatible device
- - "renesas,usbhs-r8a7790" for r8a7790 (R-Car H2) compatible device
- - "renesas,usbhs-r8a7791" for r8a7791 (R-Car M2-W) compatible device
- - "renesas,usbhs-r8a7792" for r8a7792 (R-Car V2H) compatible device
- - "renesas,usbhs-r8a7793" for r8a7793 (R-Car M2-N) compatible device
- - "renesas,usbhs-r8a7794" for r8a7794 (R-Car E2) compatible device
- - "renesas,usbhs-r8a7795" for r8a7795 (R-Car H3) compatible device
- - "renesas,usbhs-r8a7796" for r8a7796 (R-Car M3-W) compatible device
- - "renesas,usbhs-r8a77965" for r8a77965 (R-Car M3-N) compatible device
- - "renesas,usbhs-r8a77990" for r8a77990 (R-Car E3) compatible device
- - "renesas,usbhs-r8a77995" for r8a77995 (R-Car D3) compatible device
- - "renesas,usbhs-r7s72100" for r7s72100 (RZ/A1) compatible device
- - "renesas,rcar-gen2-usbhs" for R-Car Gen2 or RZ/G1 compatible devices
- - "renesas,rcar-gen3-usbhs" for R-Car Gen3 or RZ/G2 compatible devices
- - "renesas,rza1-usbhs" for RZ/A1 compatible device
-
- When compatible with the generic version, nodes must list the
- SoC-specific version corresponding to the platform first followed
- by the generic version.
-
- - reg: Base address and length of the register for the USBHS
- - interrupts: Interrupt specifier for the USBHS
- - clocks: A list of phandle + clock specifier pairs.
- - In case of "renesas,rcar-gen3-usbhs", two clocks are required.
- First clock should be peripheral and second one should be host.
- - In case of except above, one clock is required. First clock
- should be peripheral.
-
-Optional properties:
- - renesas,buswait: Integer to use BUSWAIT register
- - renesas,enable-gpio: A gpio specifier to check GPIO determining if USB
- function should be enabled
- - phys: phandle + phy specifier pair
- - phy-names: must be "usb"
- - dmas: Must contain a list of references to DMA specifiers.
- - dma-names : named "ch%d", where %d is the channel number ranging from zero
- to the number of channels (DnFIFOs) minus one.
-
-Example:
- usbhs: usb@e6590000 {
- compatible = "renesas,usbhs-r8a7790", "renesas,rcar-gen2-usbhs";
- reg = <0 0xe6590000 0 0x100>;
- interrupts = <0 107 IRQ_TYPE_LEVEL_HIGH>;
- clocks = <&mstp7_clks R8A7790_CLK_HSUSB>;
- };
diff --git a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt
index e45b38ce2986..26c85afd0b53 100644
--- a/Documentation/devicetree/bindings/usb/s3c2410-usb.txt
+++ b/Documentation/devicetree/bindings/usb/s3c2410-usb.txt
@@ -4,7 +4,7 @@ OHCI
Required properties:
- compatible: should be "samsung,s3c2410-ohci" for USB host controller
- - reg: address and lenght of the controller memory mapped region
+ - reg: address and length of the controller memory mapped region
- interrupts: interrupt number for the USB OHCI controller
- clocks: Should reference the bus and host clocks
- clock-names: Should contain two strings
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.yaml b/Documentation/devicetree/bindings/vendor-prefixes.yaml
index 33a65a45e319..6992bbbbffab 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.yaml
+++ b/Documentation/devicetree/bindings/vendor-prefixes.yaml
@@ -49,6 +49,8 @@ patternProperties:
description: Aeroflex Gaisler AB
"^al,.*":
description: Annapurna Labs
+ "^allegro,.*":
+ description: Allegro DVT
"^allo,.*":
description: Allo.com
"^allwinner,.*":
@@ -147,6 +149,8 @@ patternProperties:
description: Broadcom Corporation
"^buffalo,.*":
description: Buffalo, Inc.
+ "^bur,.*":
+ description: B&R Industrial Automation GmbH
"^bticino,.*":
description: Bticino International
"^calxeda,.*":
@@ -175,6 +179,8 @@ patternProperties:
description: Common Hardware Reference Platform
"^chunghwa,.*":
description: Chunghwa Picture Tubes Ltd.
+ "^chuwi,.*":
+ description: Chuwi Innovation Ltd.
"^ciaa,.*":
description: Computadora Industrial Abierta Argentina
"^cirrus,.*":
@@ -185,8 +191,12 @@ patternProperties:
description: Chips&Media, Inc.
"^cnxt,.*":
description: Conexant Systems, Inc.
+ "^colorfly,.*":
+ description: Colorful GRP, Shenzhen Xueyushi Technology Ltd.
"^compulab,.*":
description: CompuLab Ltd.
+ "^corpro,.*":
+ description: Chengdu Corpro Technology Co., Ltd.
"^cortina,.*":
description: Cortina Systems, Inc.
"^cosmic,.*":
@@ -199,6 +209,8 @@ patternProperties:
description: Crystalfontz America, Inc.
"^csky,.*":
description: Hangzhou C-SKY Microsystems Co., Ltd
+ "^csq,.*":
+ description: Shenzen Chuangsiqi Technology Co.,Ltd.
"^cubietech,.*":
description: Cubietech, Ltd.
"^cypress,.*":
@@ -219,6 +231,8 @@ patternProperties:
description: Devantech, Ltd.
"^dh,.*":
description: DH electronics GmbH
+ "^difrnce,.*":
+ description: Shenzhen Yagu Electronic Technology Co., Ltd.
"^digi,.*":
description: Digi International Inc.
"^digilent,.*":
@@ -241,6 +255,8 @@ patternProperties:
description: DPTechnics
"^dragino,.*":
description: Dragino Technology Co., Limited
+ "^dserve,.*":
+ description: dServe Technology B.V.
"^ea,.*":
description: Embedded Artists AB
"^ebs-systart,.*":
@@ -263,6 +279,8 @@ patternProperties:
description: Emlid, Ltd.
"^emmicro,.*":
description: EM Microelectronic
+ "^empire-electronix,.*":
+ description: Empire Electronix
"^emtrion,.*":
description: emtrion GmbH
"^endless,.*":
@@ -277,6 +295,8 @@ patternProperties:
description: Ecole Polytechnique Fédérale de Lausanne
"^epson,.*":
description: Seiko Epson Corp.
+ "^esp,.*":
+ description: Espressif Systems Co. Ltd.
"^est,.*":
description: ESTeem Wireless Modems
"^ettus,.*":
@@ -287,6 +307,8 @@ patternProperties:
description: Everest Semiconductor Co. Ltd.
"^everspin,.*":
description: Everspin Technologies, Inc.
+ "^evervision,.*":
+ description: Evervision Electronics Co. Ltd.
"^exar,.*":
description: Exar Corporation
"^excito,.*":
@@ -327,6 +349,8 @@ patternProperties:
description: GE Fanuc Intelligent Platforms Embedded Systems, Inc.
"^GEFanuc,.*":
description: GE Fanuc Intelligent Platforms Embedded Systems, Inc.
+ "^gemei,.*":
+ description: Gemei Digital Technology Co., Ltd.
"^geniatech,.*":
description: Geniatech, Inc.
"^giantec,.*":
@@ -371,12 +395,20 @@ patternProperties:
description: Holt Integrated Circuits, Inc.
"^honeywell,.*":
description: Honeywell
+ "^hoperun,.*":
+ description: Jiangsu HopeRun Software Co., Ltd.
"^hp,.*":
description: Hewlett Packard
+ "^hsg,.*":
+ description: HannStar Display Co.
"^holtek,.*":
description: Holtek Semiconductor, Inc.
+ "^hugsun,.*":
+ description: Shenzhen Hugsun Technology Co. Ltd.
"^hwacom,.*":
description: HwaCom Systems Inc.
+ "^hyundai,.*":
+ description: Hyundai Technology
"^i2se,.*":
description: I2SE GmbH
"^ibm,.*":
@@ -391,6 +423,10 @@ patternProperties:
description: ILI Technology Corporation (ILITEK)
"^img,.*":
description: Imagination Technologies Ltd.
+ "^incircuit,.*":
+ description: In-Circuit GmbH
+ "^inet-tek,.*":
+ description: Shenzhen iNet Mobile Internet Technology Co., Ltd
"^infineon,.*":
description: Infineon Technologies
"^inforce,.*":
@@ -425,6 +461,8 @@ patternProperties:
description: Japan Display Inc.
"^jedec,.*":
description: JEDEC Solid State Technology Association
+ "^jesurun,.*":
+ description: Shenzhen Jesurun Electronics Business Dept.
"^jianda,.*":
description: Jiandangjing Technology Co., Ltd.
"^karo,.*":
@@ -449,6 +487,8 @@ patternProperties:
description: Rakuten Kobo Inc.
"^koe,.*":
description: Kaohsiung Opto-Electronics Inc.
+ "^kontron,.*":
+ description: Kontron S&T AG
"^kosagi,.*":
description: Sutajio Ko-Usagi PTE Ltd.
"^kyo,.*":
@@ -457,6 +497,8 @@ patternProperties:
description: LaCie
"^laird,.*":
description: Laird PLC
+ "^lamobo,.*":
+ description: Ketai Huajie Technology Co., Ltd.
"^lantiq,.*":
description: Lantiq Semiconductor
"^lattice,.*":
@@ -475,6 +517,8 @@ patternProperties:
description: Lichee Pi
"^linaro,.*":
description: Linaro Limited
+ "^linksprite,.*":
+ description: LinkSprite Technologies, Inc.
"^linksys,.*":
description: Belkin International, Inc. (Linksys)
"^linux,.*":
@@ -491,6 +535,8 @@ patternProperties:
description: Liebherr-Werk Nenzing GmbH
"^macnica,.*":
description: Macnica Americas
+ "^mapleboard,.*":
+ description: Mapleboard.org
"^marvell,.*":
description: Marvell Technology Group Ltd.
"^maxbotix,.*":
@@ -531,6 +577,8 @@ patternProperties:
description: Micron Technology Inc.
"^mikroe,.*":
description: MikroElektronika d.o.o.
+ "^miniand,.*":
+ description: Miniand Tech
"^minix,.*":
description: MINIX Technology Ltd.
"^miramems,.*":
@@ -661,28 +709,38 @@ patternProperties:
description: Picochip Ltd
"^pine64,.*":
description: Pine64
+ "^pineriver,.*":
+ description: Shenzhen PineRiver Designs Co., Ltd.
"^pixcir,.*":
description: PIXCIR MICROELECTRONICS Co., Ltd
"^plantower,.*":
description: Plantower Co., Ltd
"^plathome,.*":
- description: Plat'Home Co., Ltd.
+ description: Plat\'Home Co., Ltd.
"^plda,.*":
description: PLDA
"^plx,.*":
description: Broadcom Corporation (formerly PLX Technology)
"^pni,.*":
description: PNI Sensor Corporation
+ "^polaroid,.*":
+ description: Polaroid Corporation
"^portwell,.*":
description: Portwell Inc.
"^poslab,.*":
description: Poslab Technology Co., Ltd.
+ "^pov,.*":
+ description: Point of View International B.V.
"^powervr,.*":
description: PowerVR (deprecated, use img)
+ "^primux,.*":
+ description: Primux Trading, S.L.
"^probox2,.*":
description: PROBOX2 (by W2COMP Co., Ltd.)
"^pulsedlight,.*":
description: PulsedLight, Inc
+ "^purism,.*":
+ description: Purism, SPC
"^qca,.*":
description: Qualcomm Atheros, Inc.
"^qcom,.*":
@@ -691,6 +749,8 @@ patternProperties:
description: QEMU, a generic and open source machine emulator and virtualizer
"^qi,.*":
description: Qi Hardware
+ "^qihua,.*":
+ description: Chengdu Kaixuan Information Technology Co., Ltd.
"^qiaodian,.*":
description: QiaoDian XianShi Corporation
"^qnap,.*":
@@ -713,6 +773,8 @@ patternProperties:
description: Realtek Semiconductor Corp.
"^renesas,.*":
description: Renesas Electronics Corporation
+ "^rervision,.*":
+ description: Shenzhen Rervision Technology Co., Ltd.
"^richtek,.*":
description: Richtek Technology Corporation
"^ricoh,.*":
@@ -781,8 +843,14 @@ patternProperties:
description: Silergy Corp.
"^siliconmitus,.*":
description: Silicon Mitus, Inc.
- "^simte,.*":
- description: k
+ "^simtek,.*":
+ description: Cypress Semiconductor Corporation (Simtek Corporation)
+ "^sinlinx,.*":
+ description: Sinlinx Electronics Technology Co., LTD
+ "^sinovoip,.*":
+ description: SinoVoip Co., Ltd
+ "^sipeed,.*":
+ description: Shenzhen Sipeed Technology Co., Ltd.
"^sirf,.*":
description: SiRF Technology, Inc.
"^sis,.*":
@@ -795,6 +863,8 @@ patternProperties:
description: Standard Microsystems Corporation
"^snps,.*":
description: Synopsys, Inc.
+ "^sochip,.*":
+ description: Shenzhen SoChip Technology Co., Ltd.
"^socionext,.*":
description: Socionext Inc.
"^solidrun,.*":
@@ -849,6 +919,8 @@ patternProperties:
description: Shenzhen Techstar Electronics Co., Ltd.
"^terasic,.*":
description: Terasic Inc.
+ "^tfc,.*":
+ description: Three Five Corp
"^thine,.*":
description: THine Electronics, Inc.
"^ti,.*":
@@ -901,6 +973,8 @@ patternProperties:
description: United Radiant Technology Corporation
"^usi,.*":
description: Universal Scientific Industrial Co., Ltd.
+ "^utoo,.*":
+ description: Aigo Digital Technology Co., Ltd.
"^v3,.*":
description: V3 Semiconductor
"^vamrs,.*":
@@ -923,6 +997,8 @@ patternProperties:
description: Voipac Technologies s.r.o.
"^vot,.*":
description: Vision Optical Technology Co., Ltd.
+ "^vxt,.*":
+ description: VXT Ltd
"^wd,.*":
description: Western Digital Corp.
"^wetek,.*":
@@ -937,10 +1013,14 @@ patternProperties:
description: Winbond Electronics corp.
"^winstar,.*":
description: Winstar Display Corp.
+ "^wits,.*":
+ description: Shenzhen Merrii Technology Co., Ltd. (WITS)
"^wlf,.*":
description: Wolfson Microelectronics
"^wm,.*":
description: Wondermedia Technologies, Inc.
+ "^wobo,.*":
+ description: Wobo
"^x-powers,.*":
description: X-Powers
"^xes,.*":
@@ -951,6 +1031,8 @@ patternProperties:
description: Xilinx
"^xunlong,.*":
description: Shenzhen Xunlong Software CO.,Limited
+ "^yones-toptech,.*":
+ description: Yones Toptech Co., Ltd.
"^ysoft,.*":
description: Y Soft Corporation a.s.
"^zarlink,.*":
@@ -968,7 +1050,7 @@ patternProperties:
# Normal property name match without a comma
# These should catch all node/property names without a prefix
- "^[a-zA-Z0-9#][a-zA-Z0-9+\\-._@]{0,63}$": true
+ "^[a-zA-Z0-9#_][a-zA-Z0-9+\\-._@]{0,63}$": true
"^[a-zA-Z0-9+\\-._]*@[0-9a-zA-Z,]*$": true
"^#.*": true
diff --git a/Documentation/devicetree/bindings/virtio/iommu.txt b/Documentation/devicetree/bindings/virtio/iommu.txt
new file mode 100644
index 000000000000..2407fea0651c
--- /dev/null
+++ b/Documentation/devicetree/bindings/virtio/iommu.txt
@@ -0,0 +1,66 @@
+* virtio IOMMU PCI device
+
+When virtio-iommu uses the PCI transport, its programming interface is
+discovered dynamically by the PCI probing infrastructure. However the
+device tree statically describes the relation between IOMMU and DMA
+masters. Therefore, the PCI root complex that hosts the virtio-iommu
+contains a child node representing the IOMMU device explicitly.
+
+Required properties:
+
+- compatible: Should be "virtio,pci-iommu"
+- reg: PCI address of the IOMMU. As defined in the PCI Bus
+ Binding reference [1], the reg property is a five-cell
+ address encoded as (phys.hi phys.mid phys.lo size.hi
+ size.lo). phys.hi should contain the device's BDF as
+ 0b00000000 bbbbbbbb dddddfff 00000000. The other cells
+ should be zero.
+- #iommu-cells: Each platform DMA master managed by the IOMMU is assigned
+ an endpoint ID, described by the "iommus" property [2].
+ For virtio-iommu, #iommu-cells must be 1.
+
+Notes:
+
+- DMA from the IOMMU device isn't managed by another IOMMU. Therefore the
+ virtio-iommu node doesn't have an "iommus" property, and is omitted from
+ the iommu-map property of the root complex.
+
+Example:
+
+pcie@10000000 {
+ compatible = "pci-host-ecam-generic";
+ ...
+
+ /* The IOMMU programming interface uses slot 00:01.0 */
+ iommu0: iommu@0008 {
+ compatible = "virtio,pci-iommu";
+ reg = <0x00000800 0 0 0 0>;
+ #iommu-cells = <1>;
+ };
+
+ /*
+ * The IOMMU manages all functions in this PCI domain except
+ * itself. Omit BDF 00:01.0.
+ */
+ iommu-map = <0x0 &iommu0 0x0 0x8>
+ <0x9 &iommu0 0x9 0xfff7>;
+};
+
+pcie@20000000 {
+ compatible = "pci-host-ecam-generic";
+ ...
+ /*
+ * The IOMMU also manages all functions from this domain,
+ * with endpoint IDs 0x10000 - 0x1ffff
+ */
+ iommu-map = <0x0 &iommu0 0x10000 0x10000>;
+};
+
+ethernet@fe001000 {
+ ...
+ /* The IOMMU manages this platform device with endpoint ID 0x20000 */
+ iommus = <&iommu0 0x20000>;
+};
+
+[1] Documentation/devicetree/bindings/pci/pci.txt
+[2] Documentation/devicetree/bindings/iommu/iommu.txt
diff --git a/Documentation/devicetree/bindings/virtio/mmio.txt b/Documentation/devicetree/bindings/virtio/mmio.txt
index 5069c1b8e193..21af30fbb81f 100644
--- a/Documentation/devicetree/bindings/virtio/mmio.txt
+++ b/Documentation/devicetree/bindings/virtio/mmio.txt
@@ -8,10 +8,40 @@ Required properties:
- reg: control registers base address and size including configuration space
- interrupts: interrupt generated by the device
+Required properties for virtio-iommu:
+
+- #iommu-cells: When the node corresponds to a virtio-iommu device, it is
+ linked to DMA masters using the "iommus" or "iommu-map"
+ properties [1][2]. #iommu-cells specifies the size of the
+ "iommus" property. For virtio-iommu #iommu-cells must be
+ 1, each cell describing a single endpoint ID.
+
+Optional properties:
+
+- iommus: If the device accesses memory through an IOMMU, it should
+ have an "iommus" property [1]. Since virtio-iommu itself
+ does not access memory through an IOMMU, the "virtio,mmio"
+ node cannot have both an "#iommu-cells" and an "iommus"
+ property.
+
Example:
virtio_block@3000 {
compatible = "virtio,mmio";
reg = <0x3000 0x100>;
interrupts = <41>;
+
+ /* Device has endpoint ID 23 */
+ iommus = <&viommu 23>
}
+
+ viommu: iommu@3100 {
+ compatible = "virtio,mmio";
+ reg = <0x3100 0x100>;
+ interrupts = <42>;
+
+ #iommu-cells = <1>
+ }
+
+[1] Documentation/devicetree/bindings/iommu/iommu.txt
+[2] Documentation/devicetree/bindings/pci/pci-iommu.txt
diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt
deleted file mode 100644
index 02b87e92ae68..000000000000
--- a/Documentation/devicetree/bindings/watchdog/fsl-imx-sc-wdt.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-* Freescale i.MX System Controller Watchdog
-
-i.MX system controller watchdog is for i.MX SoCs with system controller inside,
-the watchdog is managed by system controller, users can ONLY communicate with
-system controller from secure mode for watchdog operations, so Linux i.MX system
-controller watchdog driver will call ARM SMC API and trap into ARM-Trusted-Firmware
-for watchdog operations, ARM-Trusted-Firmware is running at secure EL3 mode and
-it will request system controller to execute the watchdog operation passed from
-Linux kernel.
-
-Required properties:
-- compatible: Should be :
- "fsl,imx8qxp-sc-wdt"
- followed by "fsl,imx-sc-wdt";
-
-Optional properties:
-- timeout-sec : Contains the watchdog timeout in seconds.
-
-Examples:
-
-watchdog {
- compatible = "fsl,imx8qxp-sc-wdt", "fsl,imx-sc-wdt";
- timeout-sec = <60>;
-};
diff --git a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt b/Documentation/devicetree/bindings/watchdog/renesas,wdt.txt
index 9f365c1a3399..9f365c1a3399 100644
--- a/Documentation/devicetree/bindings/watchdog/renesas-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/renesas,wdt.txt
diff --git a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
index 46055254e8dd..e65198d82a2b 100644
--- a/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/sunxi-wdt.txt
@@ -6,6 +6,7 @@ Required properties:
"allwinner,sun4i-a10-wdt"
"allwinner,sun6i-a31-wdt"
"allwinner,sun50i-a64-wdt","allwinner,sun6i-a31-wdt"
+ "allwinner,sun50i-h6-wdt","allwinner,sun6i-a31-wdt"
"allwinner,suniv-f1c100s-wdt", "allwinner,sun4i-a10-wdt"
- reg : Specifies base physical address and size of the registers.
diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt
index 60f8640f2b2f..4660ccee35a3 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -160,7 +160,7 @@ it with special cases.
of the kernel image. That entry point supports two calling
conventions. A summary of the interface is described here. A full
description of the boot requirements is documented in
- Documentation/arm/Booting
+ Documentation/arm/booting.rst
a) ATAGS interface. Minimal information is passed from firmware
to the kernel with a tagged list of predefined parameters.
@@ -174,7 +174,7 @@ it with special cases.
b) Entry with a flattened device-tree block. Firmware loads the
physical address of the flattened device tree block (dtb) into r2,
r1 is not used, but it is considered good practice to use a valid
- machine number as described in Documentation/arm/Booting.
+ machine number as described in Documentation/arm/booting.rst.
r0 : 0
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 5eba889ea84d..9f4392876099 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -30,6 +30,7 @@
*.lzo
*.mo
*.moc
+*.mod
*.mod.c
*.o
*.o.*
diff --git a/Documentation/driver-api/80211/mac80211-advanced.rst b/Documentation/driver-api/80211/mac80211-advanced.rst
index 70a89b2163c2..9f1c5bb7ac35 100644
--- a/Documentation/driver-api/80211/mac80211-advanced.rst
+++ b/Documentation/driver-api/80211/mac80211-advanced.rst
@@ -226,9 +226,6 @@ TBD
.. kernel-doc:: include/net/mac80211.h
:functions: ieee80211_tx_rate_control
-.. kernel-doc:: include/net/mac80211.h
- :functions: rate_control_send_low
-
TBD
This part of the book describes mac80211 internals.
diff --git a/Documentation/driver-api/backlight/lp855x-driver.rst b/Documentation/driver-api/backlight/lp855x-driver.rst
new file mode 100644
index 000000000000..1e0b224fc397
--- /dev/null
+++ b/Documentation/driver-api/backlight/lp855x-driver.rst
@@ -0,0 +1,81 @@
+====================
+Kernel driver lp855x
+====================
+
+Backlight driver for LP855x ICs
+
+Supported chips:
+
+ Texas Instruments LP8550, LP8551, LP8552, LP8553, LP8555, LP8556 and
+ LP8557
+
+Author: Milo(Woogyom) Kim <milo.kim@ti.com>
+
+Description
+-----------
+
+* Brightness control
+
+ Brightness can be controlled by the pwm input or the i2c command.
+ The lp855x driver supports both cases.
+
+* Device attributes
+
+ 1) bl_ctl_mode
+
+ Backlight control mode.
+
+ Value: pwm based or register based
+
+ 2) chip_id
+
+ The lp855x chip id.
+
+ Value: lp8550/lp8551/lp8552/lp8553/lp8555/lp8556/lp8557
+
+Platform data for lp855x
+------------------------
+
+For supporting platform specific data, the lp855x platform data can be used.
+
+* name:
+ Backlight driver name. If it is not defined, default name is set.
+* device_control:
+ Value of DEVICE CONTROL register.
+* initial_brightness:
+ Initial value of backlight brightness.
+* period_ns:
+ Platform specific PWM period value. unit is nano.
+ Only valid when brightness is pwm input mode.
+* size_program:
+ Total size of lp855x_rom_data.
+* rom_data:
+ List of new eeprom/eprom registers.
+
+Examples
+========
+
+1) lp8552 platform data: i2c register mode with new eeprom data::
+
+ #define EEPROM_A5_ADDR 0xA5
+ #define EEPROM_A5_VAL 0x4f /* EN_VSYNC=0 */
+
+ static struct lp855x_rom_data lp8552_eeprom_arr[] = {
+ {EEPROM_A5_ADDR, EEPROM_A5_VAL},
+ };
+
+ static struct lp855x_platform_data lp8552_pdata = {
+ .name = "lcd-bl",
+ .device_control = I2C_CONFIG(LP8552),
+ .initial_brightness = INITIAL_BRT,
+ .size_program = ARRAY_SIZE(lp8552_eeprom_arr),
+ .rom_data = lp8552_eeprom_arr,
+ };
+
+2) lp8556 platform data: pwm input mode with default rom data::
+
+ static struct lp855x_platform_data lp8556_pdata = {
+ .device_control = PWM_CONFIG(LP8556),
+ .initial_brightness = INITIAL_BRT,
+ .period_ns = 1000000,
+ };
diff --git a/Documentation/bt8xxgpio.txt b/Documentation/driver-api/bt8xxgpio.rst
index a845feb074de..a845feb074de 100644
--- a/Documentation/bt8xxgpio.txt
+++ b/Documentation/driver-api/bt8xxgpio.rst
diff --git a/Documentation/driver-api/connector.rst b/Documentation/driver-api/connector.rst
new file mode 100644
index 000000000000..c100c7482289
--- /dev/null
+++ b/Documentation/driver-api/connector.rst
@@ -0,0 +1,156 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================
+Kernel Connector
+================
+
+Kernel connector - new netlink based userspace <-> kernel space easy
+to use communication module.
+
+The Connector driver makes it easy to connect various agents using a
+netlink based network. One must register a callback and an identifier.
+When the driver receives a special netlink message with the appropriate
+identifier, the appropriate callback will be called.
+
+From the userspace point of view it's quite straightforward:
+
+ - socket();
+ - bind();
+ - send();
+ - recv();
+
+But if kernelspace wants to use the full power of such connections, the
+driver writer must create special sockets, must know about struct sk_buff
+handling, etc... The Connector driver allows any kernelspace agents to use
+netlink based networking for inter-process communication in a significantly
+easier way::
+
+ int cn_add_callback(struct cb_id *id, char *name, void (*callback) (struct cn_msg *, struct netlink_skb_parms *));
+ void cn_netlink_send_multi(struct cn_msg *msg, u16 len, u32 portid, u32 __group, int gfp_mask);
+ void cn_netlink_send(struct cn_msg *msg, u32 portid, u32 __group, int gfp_mask);
+
+ struct cb_id
+ {
+ __u32 idx;
+ __u32 val;
+ };
+
+idx and val are unique identifiers which must be registered in the
+connector.h header for in-kernel usage. `void (*callback) (void *)` is a
+callback function which will be called when a message with above idx.val
+is received by the connector core. The argument for that function must
+be dereferenced to `struct cn_msg *`::
+
+ struct cn_msg
+ {
+ struct cb_id id;
+
+ __u32 seq;
+ __u32 ack;
+
+ __u32 len; /* Length of the following data */
+ __u8 data[0];
+ };
+
+Connector interfaces
+====================
+
+ .. kernel-doc:: include/linux/connector.h
+
+ Note:
+ When registering new callback user, connector core assigns
+ netlink group to the user which is equal to its id.idx.
+
+Protocol description
+====================
+
+The current framework offers a transport layer with fixed headers. The
+recommended protocol which uses such a header is as following:
+
+msg->seq and msg->ack are used to determine message genealogy. When
+someone sends a message, they use a locally unique sequence and random
+acknowledge number. The sequence number may be copied into
+nlmsghdr->nlmsg_seq too.
+
+The sequence number is incremented with each message sent.
+
+If you expect a reply to the message, then the sequence number in the
+received message MUST be the same as in the original message, and the
+acknowledge number MUST be the same + 1.
+
+If we receive a message and its sequence number is not equal to one we
+are expecting, then it is a new message. If we receive a message and
+its sequence number is the same as one we are expecting, but its
+acknowledge is not equal to the sequence number in the original
+message + 1, then it is a new message.
+
+Obviously, the protocol header contains the above id.
+
+The connector allows event notification in the following form: kernel
+driver or userspace process can ask connector to notify it when
+selected ids will be turned on or off (registered or unregistered its
+callback). It is done by sending a special command to the connector
+driver (it also registers itself with id={-1, -1}).
+
+As example of this usage can be found in the cn_test.c module which
+uses the connector to request notification and to send messages.
+
+Reliability
+===========
+
+Netlink itself is not a reliable protocol. That means that messages can
+be lost due to memory pressure or process' receiving queue overflowed,
+so caller is warned that it must be prepared. That is why the struct
+cn_msg [main connector's message header] contains u32 seq and u32 ack
+fields.
+
+Userspace usage
+===============
+
+2.6.14 has a new netlink socket implementation, which by default does not
+allow people to send data to netlink groups other than 1.
+So, if you wish to use a netlink socket (for example using connector)
+with a different group number, the userspace application must subscribe to
+that group first. It can be achieved by the following pseudocode::
+
+ s = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+
+ l_local.nl_family = AF_NETLINK;
+ l_local.nl_groups = 12345;
+ l_local.nl_pid = 0;
+
+ if (bind(s, (struct sockaddr *)&l_local, sizeof(struct sockaddr_nl)) == -1) {
+ perror("bind");
+ close(s);
+ return -1;
+ }
+
+ {
+ int on = l_local.nl_groups;
+ setsockopt(s, 270, 1, &on, sizeof(on));
+ }
+
+Where 270 above is SOL_NETLINK, and 1 is a NETLINK_ADD_MEMBERSHIP socket
+option. To drop a multicast subscription, one should call the above socket
+option with the NETLINK_DROP_MEMBERSHIP parameter which is defined as 0.
+
+2.6.14 netlink code only allows to select a group which is less or equal to
+the maximum group number, which is used at netlink_kernel_create() time.
+In case of connector it is CN_NETLINK_USERS + 0xf, so if you want to use
+group number 12345, you must increment CN_NETLINK_USERS to that number.
+Additional 0xf numbers are allocated to be used by non-in-kernel users.
+
+Due to this limitation, group 0xffffffff does not work now, so one can
+not use add/remove connector's group notifications, but as far as I know,
+only cn_test.c test module used it.
+
+Some work in netlink area is still being done, so things can be changed in
+2.6.15 timeframe, if it will happen, documentation will be updated for that
+kernel.
+
+Code samples
+============
+
+Sample code for a connector test module and user space can be found
+in samples/connector/. To build this code, enable CONFIG_CONNECTOR
+and CONFIG_SAMPLES.
diff --git a/Documentation/driver-api/console.rst b/Documentation/driver-api/console.rst
new file mode 100644
index 000000000000..8394ad7747ac
--- /dev/null
+++ b/Documentation/driver-api/console.rst
@@ -0,0 +1,152 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Console Drivers
+===============
+
+The Linux kernel has 2 general types of console drivers. The first type is
+assigned by the kernel to all the virtual consoles during the boot process.
+This type will be called 'system driver', and only one system driver is allowed
+to exist. The system driver is persistent and it can never be unloaded, though
+it may become inactive.
+
+The second type has to be explicitly loaded and unloaded. This will be called
+'modular driver' by this document. Multiple modular drivers can coexist at
+any time with each driver sharing the console with other drivers including
+the system driver. However, modular drivers cannot take over the console
+that is currently occupied by another modular driver. (Exception: Drivers that
+call do_take_over_console() will succeed in the takeover regardless of the type
+of driver occupying the consoles.) They can only take over the console that is
+occupied by the system driver. In the same token, if the modular driver is
+released by the console, the system driver will take over.
+
+Modular drivers, from the programmer's point of view, have to call::
+
+ do_take_over_console() - load and bind driver to console layer
+ give_up_console() - unload driver; it will only work if driver
+ is fully unbound
+
+In newer kernels, the following are also available::
+
+ do_register_con_driver()
+ do_unregister_con_driver()
+
+If sysfs is enabled, the contents of /sys/class/vtconsole can be
+examined. This shows the console backends currently registered by the
+system which are named vtcon<n> where <n> is an integer from 0 to 15.
+Thus::
+
+ ls /sys/class/vtconsole
+ . .. vtcon0 vtcon1
+
+Each directory in /sys/class/vtconsole has 3 files::
+
+ ls /sys/class/vtconsole/vtcon0
+ . .. bind name uevent
+
+What do these files signify?
+
+ 1. bind - this is a read/write file. It shows the status of the driver if
+ read, or acts to bind or unbind the driver to the virtual consoles
+ when written to. The possible values are:
+
+ 0
+ - means the driver is not bound and if echo'ed, commands the driver
+ to unbind
+
+ 1
+ - means the driver is bound and if echo'ed, commands the driver to
+ bind
+
+ 2. name - read-only file. Shows the name of the driver in this format::
+
+ cat /sys/class/vtconsole/vtcon0/name
+ (S) VGA+
+
+ '(S)' stands for a (S)ystem driver, i.e., it cannot be directly
+ commanded to bind or unbind
+
+ 'VGA+' is the name of the driver
+
+ cat /sys/class/vtconsole/vtcon1/name
+ (M) frame buffer device
+
+ In this case, '(M)' stands for a (M)odular driver, one that can be
+ directly commanded to bind or unbind.
+
+ 3. uevent - ignore this file
+
+When unbinding, the modular driver is detached first, and then the system
+driver takes over the consoles vacated by the driver. Binding, on the other
+hand, will bind the driver to the consoles that are currently occupied by a
+system driver.
+
+NOTE1:
+ Binding and unbinding must be selected in Kconfig. It's under::
+
+ Device Drivers ->
+ Character devices ->
+ Support for binding and unbinding console drivers
+
+NOTE2:
+ If any of the virtual consoles are in KD_GRAPHICS mode, then binding or
+ unbinding will not succeed. An example of an application that sets the
+ console to KD_GRAPHICS is X.
+
+How useful is this feature? This is very useful for console driver
+developers. By unbinding the driver from the console layer, one can unload the
+driver, make changes, recompile, reload and rebind the driver without any need
+for rebooting the kernel. For regular users who may want to switch from
+framebuffer console to VGA console and vice versa, this feature also makes
+this possible. (NOTE NOTE NOTE: Please read fbcon.txt under Documentation/fb
+for more details.)
+
+Notes for developers
+====================
+
+do_take_over_console() is now broken up into::
+
+ do_register_con_driver()
+ do_bind_con_driver() - private function
+
+give_up_console() is a wrapper to do_unregister_con_driver(), and a driver must
+be fully unbound for this call to succeed. con_is_bound() will check if the
+driver is bound or not.
+
+Guidelines for console driver writers
+=====================================
+
+In order for binding to and unbinding from the console to properly work,
+console drivers must follow these guidelines:
+
+1. All drivers, except system drivers, must call either do_register_con_driver()
+ or do_take_over_console(). do_register_con_driver() will just add the driver
+ to the console's internal list. It won't take over the
+ console. do_take_over_console(), as it name implies, will also take over (or
+ bind to) the console.
+
+2. All resources allocated during con->con_init() must be released in
+ con->con_deinit().
+
+3. All resources allocated in con->con_startup() must be released when the
+ driver, which was previously bound, becomes unbound. The console layer
+ does not have a complementary call to con->con_startup() so it's up to the
+ driver to check when it's legal to release these resources. Calling
+ con_is_bound() in con->con_deinit() will help. If the call returned
+ false(), then it's safe to release the resources. This balance has to be
+ ensured because con->con_startup() can be called again when a request to
+ rebind the driver to the console arrives.
+
+4. Upon exit of the driver, ensure that the driver is totally unbound. If the
+ condition is satisfied, then the driver must call do_unregister_con_driver()
+ or give_up_console().
+
+5. do_unregister_con_driver() can also be called on conditions which make it
+ impossible for the driver to service console requests. This can happen
+ with the framebuffer console that suddenly lost all of its drivers.
+
+The current crop of console drivers should still work correctly, but binding
+and unbinding them may cause problems. With minimal fixes, these drivers can
+be made to work correctly.
+
+Antonino Daplas <adaplas@pol.net>
diff --git a/Documentation/dcdbas.txt b/Documentation/driver-api/dcdbas.rst
index 309cc57a7c1c..309cc57a7c1c 100644
--- a/Documentation/dcdbas.txt
+++ b/Documentation/driver-api/dcdbas.rst
diff --git a/Documentation/dell_rbu.txt b/Documentation/driver-api/dell_rbu.rst
index 5d1ce7bcd04d..5d1ce7bcd04d 100644
--- a/Documentation/dell_rbu.txt
+++ b/Documentation/driver-api/dell_rbu.rst
diff --git a/Documentation/driver-api/dmaengine/dmatest.rst b/Documentation/driver-api/dmaengine/dmatest.rst
index e78d070bb468..ee268d445d38 100644
--- a/Documentation/driver-api/dmaengine/dmatest.rst
+++ b/Documentation/driver-api/dmaengine/dmatest.rst
@@ -44,7 +44,8 @@ Example of usage::
dmatest.timeout=2000 dmatest.iterations=1 dmatest.channel=dma0chan0 dmatest.run=1
-Example of multi-channel test usage:
+Example of multi-channel test usage (new in the 5.0 kernel)::
+
% modprobe dmatest
% echo 2000 > /sys/module/dmatest/parameters/timeout
% echo 1 > /sys/module/dmatest/parameters/iterations
@@ -53,15 +54,18 @@ Example of multi-channel test usage:
% echo dma0chan2 > /sys/module/dmatest/parameters/channel
% echo 1 > /sys/module/dmatest/parameters/run
-Note: the channel parameter should always be the last parameter set prior to
-running the test (setting run=1), this is because upon setting the channel
-parameter, that specific channel is requested using the dmaengine and a thread
-is created with the existing parameters. This thread is set as pending
-and will be executed once run is set to 1. Any parameters set after the thread
-is created are not applied.
+.. note::
+ For all tests, starting in the 5.0 kernel, either single- or multi-channel,
+ the channel parameter(s) must be set after all other parameters. It is at
+ that time that the existing parameter values are acquired for use by the
+ thread(s). All other parameters are shared. Therefore, if changes are made
+ to any of the other parameters, and an additional channel specified, the
+ (shared) parameters used for all threads will use the new values.
+ After the channels are specified, each thread is set as pending. All threads
+ begin execution when the run parameter is set to 1.
.. hint::
- available channel list could be extracted by running the following command::
+ A list of available channels can be found by running the following command::
% ls -1 /sys/class/dma/
@@ -204,6 +208,7 @@ Releasing Channels
Channels can be freed by setting run to 0.
Example::
+
% echo dma0chan1 > /sys/module/dmatest/parameters/channel
dmatest: Added 1 threads using dma0chan1
% cat /sys/class/dma/dma0chan1/in_use
diff --git a/Documentation/driver-api/driver-model/binding.rst b/Documentation/driver-api/driver-model/binding.rst
new file mode 100644
index 000000000000..7ea1d7a41e1d
--- /dev/null
+++ b/Documentation/driver-api/driver-model/binding.rst
@@ -0,0 +1,98 @@
+==============
+Driver Binding
+==============
+
+Driver binding is the process of associating a device with a device
+driver that can control it. Bus drivers have typically handled this
+because there have been bus-specific structures to represent the
+devices and the drivers. With generic device and device driver
+structures, most of the binding can take place using common code.
+
+
+Bus
+~~~
+
+The bus type structure contains a list of all devices that are on that bus
+type in the system. When device_register is called for a device, it is
+inserted into the end of this list. The bus object also contains a
+list of all drivers of that bus type. When driver_register is called
+for a driver, it is inserted at the end of this list. These are the
+two events which trigger driver binding.
+
+
+device_register
+~~~~~~~~~~~~~~~
+
+When a new device is added, the bus's list of drivers is iterated over
+to find one that supports it. In order to determine that, the device
+ID of the device must match one of the device IDs that the driver
+supports. The format and semantics for comparing IDs is bus-specific.
+Instead of trying to derive a complex state machine and matching
+algorithm, it is up to the bus driver to provide a callback to compare
+a device against the IDs of a driver. The bus returns 1 if a match was
+found; 0 otherwise.
+
+int match(struct device * dev, struct device_driver * drv);
+
+If a match is found, the device's driver field is set to the driver
+and the driver's probe callback is called. This gives the driver a
+chance to verify that it really does support the hardware, and that
+it's in a working state.
+
+Device Class
+~~~~~~~~~~~~
+
+Upon the successful completion of probe, the device is registered with
+the class to which it belongs. Device drivers belong to one and only one
+class, and that is set in the driver's devclass field.
+devclass_add_device is called to enumerate the device within the class
+and actually register it with the class, which happens with the
+class's register_dev callback.
+
+
+Driver
+~~~~~~
+
+When a driver is attached to a device, the device is inserted into the
+driver's list of devices.
+
+
+sysfs
+~~~~~
+
+A symlink is created in the bus's 'devices' directory that points to
+the device's directory in the physical hierarchy.
+
+A symlink is created in the driver's 'devices' directory that points
+to the device's directory in the physical hierarchy.
+
+A directory for the device is created in the class's directory. A
+symlink is created in that directory that points to the device's
+physical location in the sysfs tree.
+
+A symlink can be created (though this isn't done yet) in the device's
+physical directory to either its class directory, or the class's
+top-level directory. One can also be created to point to its driver's
+directory also.
+
+
+driver_register
+~~~~~~~~~~~~~~~
+
+The process is almost identical for when a new driver is added.
+The bus's list of devices is iterated over to find a match. Devices
+that already have a driver are skipped. All the devices are iterated
+over, to bind as many devices as possible to the driver.
+
+
+Removal
+~~~~~~~
+
+When a device is removed, the reference count for it will eventually
+go to 0. When it does, the remove callback of the driver is called. It
+is removed from the driver's list of devices and the reference count
+of the driver is decremented. All symlinks between the two are removed.
+
+When a driver is removed, the list of devices that it supports is
+iterated over, and the driver's remove callback is called for each
+one. The device is removed from that list and the symlinks removed.
diff --git a/Documentation/driver-api/driver-model/bus.rst b/Documentation/driver-api/driver-model/bus.rst
new file mode 100644
index 000000000000..016b15a6e8ea
--- /dev/null
+++ b/Documentation/driver-api/driver-model/bus.rst
@@ -0,0 +1,146 @@
+=========
+Bus Types
+=========
+
+Definition
+~~~~~~~~~~
+See the kerneldoc for the struct bus_type.
+
+int bus_register(struct bus_type * bus);
+
+
+Declaration
+~~~~~~~~~~~
+
+Each bus type in the kernel (PCI, USB, etc) should declare one static
+object of this type. They must initialize the name field, and may
+optionally initialize the match callback::
+
+ struct bus_type pci_bus_type = {
+ .name = "pci",
+ .match = pci_bus_match,
+ };
+
+The structure should be exported to drivers in a header file:
+
+extern struct bus_type pci_bus_type;
+
+
+Registration
+~~~~~~~~~~~~
+
+When a bus driver is initialized, it calls bus_register. This
+initializes the rest of the fields in the bus object and inserts it
+into a global list of bus types. Once the bus object is registered,
+the fields in it are usable by the bus driver.
+
+
+Callbacks
+~~~~~~~~~
+
+match(): Attaching Drivers to Devices
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The format of device ID structures and the semantics for comparing
+them are inherently bus-specific. Drivers typically declare an array
+of device IDs of devices they support that reside in a bus-specific
+driver structure.
+
+The purpose of the match callback is to give the bus an opportunity to
+determine if a particular driver supports a particular device by
+comparing the device IDs the driver supports with the device ID of a
+particular device, without sacrificing bus-specific functionality or
+type-safety.
+
+When a driver is registered with the bus, the bus's list of devices is
+iterated over, and the match callback is called for each device that
+does not have a driver associated with it.
+
+
+
+Device and Driver Lists
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The lists of devices and drivers are intended to replace the local
+lists that many buses keep. They are lists of struct devices and
+struct device_drivers, respectively. Bus drivers are free to use the
+lists as they please, but conversion to the bus-specific type may be
+necessary.
+
+The LDM core provides helper functions for iterating over each list::
+
+ int bus_for_each_dev(struct bus_type * bus, struct device * start,
+ void * data,
+ int (*fn)(struct device *, void *));
+
+ int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+ void * data, int (*fn)(struct device_driver *, void *));
+
+These helpers iterate over the respective list, and call the callback
+for each device or driver in the list. All list accesses are
+synchronized by taking the bus's lock (read currently). The reference
+count on each object in the list is incremented before the callback is
+called; it is decremented after the next object has been obtained. The
+lock is not held when calling the callback.
+
+
+sysfs
+~~~~~~~~
+There is a top-level directory named 'bus'.
+
+Each bus gets a directory in the bus directory, along with two default
+directories::
+
+ /sys/bus/pci/
+ |-- devices
+ `-- drivers
+
+Drivers registered with the bus get a directory in the bus's drivers
+directory::
+
+ /sys/bus/pci/
+ |-- devices
+ `-- drivers
+ |-- Intel ICH
+ |-- Intel ICH Joystick
+ |-- agpgart
+ `-- e100
+
+Each device that is discovered on a bus of that type gets a symlink in
+the bus's devices directory to the device's directory in the physical
+hierarchy::
+
+ /sys/bus/pci/
+ |-- devices
+ | |-- 00:00.0 -> ../../../root/pci0/00:00.0
+ | |-- 00:01.0 -> ../../../root/pci0/00:01.0
+ | `-- 00:02.0 -> ../../../root/pci0/00:02.0
+ `-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ struct bus_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct bus_type *, char * buf);
+ ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
+ };
+
+Bus drivers can export attributes using the BUS_ATTR_RW macro that works
+similarly to the DEVICE_ATTR_RW macro for devices. For example, a
+definition like this::
+
+ static BUS_ATTR_RW(debug);
+
+is equivalent to declaring::
+
+ static bus_attribute bus_attr_debug;
+
+This can then be used to add and remove the attribute from the bus's
+sysfs directory using::
+
+ int bus_create_file(struct bus_type *, struct bus_attribute *);
+ void bus_remove_file(struct bus_type *, struct bus_attribute *);
diff --git a/Documentation/driver-api/driver-model/class.rst b/Documentation/driver-api/driver-model/class.rst
new file mode 100644
index 000000000000..fff55b80e86a
--- /dev/null
+++ b/Documentation/driver-api/driver-model/class.rst
@@ -0,0 +1,149 @@
+==============
+Device Classes
+==============
+
+Introduction
+~~~~~~~~~~~~
+A device class describes a type of device, like an audio or network
+device. The following device classes have been identified:
+
+<Insert List of Device Classes Here>
+
+
+Each device class defines a set of semantics and a programming interface
+that devices of that class adhere to. Device drivers are the
+implementation of that programming interface for a particular device on
+a particular bus.
+
+Device classes are agnostic with respect to what bus a device resides
+on.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The device class structure looks like::
+
+
+ typedef int (*devclass_add)(struct device *);
+ typedef void (*devclass_remove)(struct device *);
+
+See the kerneldoc for the struct class.
+
+A typical device class definition would look like::
+
+ struct device_class input_devclass = {
+ .name = "input",
+ .add_device = input_add_device,
+ .remove_device = input_remove_device,
+ };
+
+Each device class structure should be exported in a header file so it
+can be used by drivers, extensions and interfaces.
+
+Device classes are registered and unregistered with the core using::
+
+ int devclass_register(struct device_class * cls);
+ void devclass_unregister(struct device_class * cls);
+
+
+Devices
+~~~~~~~
+As devices are bound to drivers, they are added to the device class
+that the driver belongs to. Before the driver model core, this would
+typically happen during the driver's probe() callback, once the device
+has been initialized. It now happens after the probe() callback
+finishes from the core.
+
+The device is enumerated in the class. Each time a device is added to
+the class, the class's devnum field is incremented and assigned to the
+device. The field is never decremented, so if the device is removed
+from the class and re-added, it will receive a different enumerated
+value.
+
+The class is allowed to create a class-specific structure for the
+device and store it in the device's class_data pointer.
+
+There is no list of devices in the device class. Each driver has a
+list of devices that it supports. The device class has a list of
+drivers of that particular class. To access all of the devices in the
+class, iterate over the device lists of each driver in the class.
+
+
+Device Drivers
+~~~~~~~~~~~~~~
+Device drivers are added to device classes when they are registered
+with the core. A driver specifies the class it belongs to by setting
+the struct device_driver::devclass field.
+
+
+sysfs directory structure
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+There is a top-level sysfs directory named 'class'.
+
+Each class gets a directory in the class directory, along with two
+default subdirectories::
+
+ class/
+ `-- input
+ |-- devices
+ `-- drivers
+
+
+Drivers registered with the class get a symlink in the drivers/ directory
+that points to the driver's directory (under its bus directory)::
+
+ class/
+ `-- input
+ |-- devices
+ `-- drivers
+ `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
+
+
+Each device gets a symlink in the devices/ directory that points to the
+device's directory in the physical hierarchy::
+
+ class/
+ `-- input
+ |-- devices
+ | `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
+ `-- drivers
+
+
+Exporting Attributes
+~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ struct devclass_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
+ ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
+ };
+
+Class drivers can export attributes using the DEVCLASS_ATTR macro that works
+similarly to the DEVICE_ATTR macro for devices. For example, a definition
+like this::
+
+ static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
+
+is equivalent to declaring::
+
+ static devclass_attribute devclass_attr_debug;
+
+The bus driver can add and remove the attribute from the class's
+sysfs directory using::
+
+ int devclass_create_file(struct device_class *, struct devclass_attribute *);
+ void devclass_remove_file(struct device_class *, struct devclass_attribute *);
+
+In the example above, the file will be named 'debug' in placed in the
+class's directory in sysfs.
+
+
+Interfaces
+~~~~~~~~~~
+There may exist multiple mechanisms for accessing the same device of a
+particular class type. Device interfaces describe these mechanisms.
+
+When a device is added to a device class, the core attempts to add it
+to every interface that is registered with the device class.
diff --git a/Documentation/driver-api/driver-model/design-patterns.rst b/Documentation/driver-api/driver-model/design-patterns.rst
new file mode 100644
index 000000000000..41eb8f41f7dd
--- /dev/null
+++ b/Documentation/driver-api/driver-model/design-patterns.rst
@@ -0,0 +1,116 @@
+=============================
+Device Driver Design Patterns
+=============================
+
+This document describes a few common design patterns found in device drivers.
+It is likely that subsystem maintainers will ask driver developers to
+conform to these design patterns.
+
+1. State Container
+2. container_of()
+
+
+1. State Container
+~~~~~~~~~~~~~~~~~~
+
+While the kernel contains a few device drivers that assume that they will
+only be probed() once on a certain system (singletons), it is custom to assume
+that the device the driver binds to will appear in several instances. This
+means that the probe() function and all callbacks need to be reentrant.
+
+The most common way to achieve this is to use the state container design
+pattern. It usually has this form::
+
+ struct foo {
+ spinlock_t lock; /* Example member */
+ (...)
+ };
+
+ static int foo_probe(...)
+ {
+ struct foo *foo;
+
+ foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
+ if (!foo)
+ return -ENOMEM;
+ spin_lock_init(&foo->lock);
+ (...)
+ }
+
+This will create an instance of struct foo in memory every time probe() is
+called. This is our state container for this instance of the device driver.
+Of course it is then necessary to always pass this instance of the
+state around to all functions that need access to the state and its members.
+
+For example, if the driver is registering an interrupt handler, you would
+pass around a pointer to struct foo like this::
+
+ static irqreturn_t foo_handler(int irq, void *arg)
+ {
+ struct foo *foo = arg;
+ (...)
+ }
+
+ static int foo_probe(...)
+ {
+ struct foo *foo;
+
+ (...)
+ ret = request_irq(irq, foo_handler, 0, "foo", foo);
+ }
+
+This way you always get a pointer back to the correct instance of foo in
+your interrupt handler.
+
+
+2. container_of()
+~~~~~~~~~~~~~~~~~
+
+Continuing on the above example we add an offloaded work::
+
+ struct foo {
+ spinlock_t lock;
+ struct workqueue_struct *wq;
+ struct work_struct offload;
+ (...)
+ };
+
+ static void foo_work(struct work_struct *work)
+ {
+ struct foo *foo = container_of(work, struct foo, offload);
+
+ (...)
+ }
+
+ static irqreturn_t foo_handler(int irq, void *arg)
+ {
+ struct foo *foo = arg;
+
+ queue_work(foo->wq, &foo->offload);
+ (...)
+ }
+
+ static int foo_probe(...)
+ {
+ struct foo *foo;
+
+ foo->wq = create_singlethread_workqueue("foo-wq");
+ INIT_WORK(&foo->offload, foo_work);
+ (...)
+ }
+
+The design pattern is the same for an hrtimer or something similar that will
+return a single argument which is a pointer to a struct member in the
+callback.
+
+container_of() is a macro defined in <linux/kernel.h>
+
+What container_of() does is to obtain a pointer to the containing struct from
+a pointer to a member by a simple subtraction using the offsetof() macro from
+standard C, which allows something similar to object oriented behaviours.
+Notice that the contained member must not be a pointer, but an actual member
+for this to work.
+
+We can see here that we avoid having global pointers to our struct foo *
+instance this way, while still keeping the number of parameters passed to the
+work function to a single pointer.
diff --git a/Documentation/driver-api/driver-model/device.rst b/Documentation/driver-api/driver-model/device.rst
new file mode 100644
index 000000000000..2b868d49d349
--- /dev/null
+++ b/Documentation/driver-api/driver-model/device.rst
@@ -0,0 +1,109 @@
+==========================
+The Basic Device Structure
+==========================
+
+See the kerneldoc for the struct device.
+
+
+Programming Interface
+~~~~~~~~~~~~~~~~~~~~~
+The bus driver that discovers the device uses this to register the
+device with the core::
+
+ int device_register(struct device * dev);
+
+The bus should initialize the following fields:
+
+ - parent
+ - name
+ - bus_id
+ - bus
+
+A device is removed from the core when its reference count goes to
+0. The reference count can be adjusted using::
+
+ struct device * get_device(struct device * dev);
+ void put_device(struct device * dev);
+
+get_device() will return a pointer to the struct device passed to it
+if the reference is not already 0 (if it's in the process of being
+removed already).
+
+A driver can access the lock in the device structure using::
+
+ void lock_device(struct device * dev);
+ void unlock_device(struct device * dev);
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+ struct device_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device *dev, struct device_attribute *attr,
+ char *buf);
+ ssize_t (*store)(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count);
+ };
+
+Attributes of devices can be exported by a device driver through sysfs.
+
+Please see Documentation/filesystems/sysfs.txt for more information
+on how sysfs works.
+
+As explained in Documentation/kobject.txt, device attributes must be
+created before the KOBJ_ADD uevent is generated. The only way to realize
+that is by defining an attribute group.
+
+Attributes are declared using a macro called DEVICE_ATTR::
+
+ #define DEVICE_ATTR(name,mode,show,store)
+
+Example:::
+
+ static DEVICE_ATTR(type, 0444, show_type, NULL);
+ static DEVICE_ATTR(power, 0644, show_power, store_power);
+
+This declares two structures of type struct device_attribute with respective
+names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
+organized as follows into a group::
+
+ static struct attribute *dev_attrs[] = {
+ &dev_attr_type.attr,
+ &dev_attr_power.attr,
+ NULL,
+ };
+
+ static struct attribute_group dev_attr_group = {
+ .attrs = dev_attrs,
+ };
+
+ static const struct attribute_group *dev_attr_groups[] = {
+ &dev_attr_group,
+ NULL,
+ };
+
+This array of groups can then be associated with a device by setting the
+group pointer in struct device before device_register() is invoked::
+
+ dev->groups = dev_attr_groups;
+ device_register(dev);
+
+The device_register() function will use the 'groups' pointer to create the
+device attributes and the device_unregister() function will use this pointer
+to remove the device attributes.
+
+Word of warning: While the kernel allows device_create_file() and
+device_remove_file() to be called on a device at any time, userspace has
+strict expectations on when attributes get created. When a new device is
+registered in the kernel, a uevent is generated to notify userspace (like
+udev) that a new device is available. If attributes are added after the
+device is registered, then userspace won't get notified and userspace will
+not know about the new attributes.
+
+This is important for device driver that need to publish additional
+attributes for a device at driver probe time. If the device driver simply
+calls device_create_file() on the device structure passed to it, then
+userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
new file mode 100644
index 000000000000..a100bef54952
--- /dev/null
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -0,0 +1,418 @@
+================================
+Devres - Managed Device Resource
+================================
+
+Tejun Heo <teheo@suse.de>
+
+First draft 10 January 2007
+
+.. contents
+
+ 1. Intro : Huh? Devres?
+ 2. Devres : Devres in a nutshell
+ 3. Devres Group : Group devres'es and release them together
+ 4. Details : Life time rules, calling context, ...
+ 5. Overhead : How much do we have to pay for this?
+ 6. List of managed interfaces: Currently implemented managed interfaces
+
+
+1. Intro
+--------
+
+devres came up while trying to convert libata to use iomap. Each
+iomapped address should be kept and unmapped on driver detach. For
+example, a plain SFF ATA controller (that is, good old PCI IDE) in
+native mode makes use of 5 PCI BARs and all of them should be
+maintained.
+
+As with many other device drivers, libata low level drivers have
+sufficient bugs in ->remove and ->probe failure path. Well, yes,
+that's probably because libata low level driver developers are lazy
+bunch, but aren't all low level driver developers? After spending a
+day fiddling with braindamaged hardware with no document or
+braindamaged document, if it's finally working, well, it's working.
+
+For one reason or another, low level drivers don't receive as much
+attention or testing as core code, and bugs on driver detach or
+initialization failure don't happen often enough to be noticeable.
+Init failure path is worse because it's much less travelled while
+needs to handle multiple entry points.
+
+So, many low level drivers end up leaking resources on driver detach
+and having half broken failure path implementation in ->probe() which
+would leak resources or even cause oops when failure occurs. iomap
+adds more to this mix. So do msi and msix.
+
+
+2. Devres
+---------
+
+devres is basically linked list of arbitrarily sized memory areas
+associated with a struct device. Each devres entry is associated with
+a release function. A devres can be released in several ways. No
+matter what, all devres entries are released on driver detach. On
+release, the associated release function is invoked and then the
+devres entry is freed.
+
+Managed interface is created for resources commonly used by device
+drivers using devres. For example, coherent DMA memory is acquired
+using dma_alloc_coherent(). The managed version is called
+dmam_alloc_coherent(). It is identical to dma_alloc_coherent() except
+for the DMA memory allocated using it is managed and will be
+automatically released on driver detach. Implementation looks like
+the following::
+
+ struct dma_devres {
+ size_t size;
+ void *vaddr;
+ dma_addr_t dma_handle;
+ };
+
+ static void dmam_coherent_release(struct device *dev, void *res)
+ {
+ struct dma_devres *this = res;
+
+ dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
+ }
+
+ dmam_alloc_coherent(dev, size, dma_handle, gfp)
+ {
+ struct dma_devres *dr;
+ void *vaddr;
+
+ dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
+ ...
+
+ /* alloc DMA memory as usual */
+ vaddr = dma_alloc_coherent(...);
+ ...
+
+ /* record size, vaddr, dma_handle in dr */
+ dr->vaddr = vaddr;
+ ...
+
+ devres_add(dev, dr);
+
+ return vaddr;
+ }
+
+If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
+freed whether initialization fails half-way or the device gets
+detached. If most resources are acquired using managed interface, a
+driver can have much simpler init and exit code. Init path basically
+looks like the following::
+
+ my_init_one()
+ {
+ struct mydev *d;
+
+ d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->ring = dmam_alloc_coherent(...);
+ if (!d->ring)
+ return -ENOMEM;
+
+ if (check something)
+ return -EINVAL;
+ ...
+
+ return register_to_upper_layer(d);
+ }
+
+And exit path::
+
+ my_remove_one()
+ {
+ unregister_from_upper_layer(d);
+ shutdown_my_hardware();
+ }
+
+As shown above, low level drivers can be simplified a lot by using
+devres. Complexity is shifted from less maintained low level drivers
+to better maintained higher layer. Also, as init failure path is
+shared with exit path, both can get more testing.
+
+Note though that when converting current calls or assignments to
+managed devm_* versions it is up to you to check if internal operations
+like allocating memory, have failed. Managed resources pertains to the
+freeing of these resources *only* - all other checks needed are still
+on you. In some cases this may mean introducing checks that were not
+necessary before moving to the managed devm_* calls.
+
+
+3. Devres group
+---------------
+
+Devres entries can be grouped using devres group. When a group is
+released, all contained normal devres entries and properly nested
+groups are released. One usage is to rollback series of acquired
+resources on failure. For example::
+
+ if (!devres_open_group(dev, NULL, GFP_KERNEL))
+ return -ENOMEM;
+
+ acquire A;
+ if (failed)
+ goto err;
+
+ acquire B;
+ if (failed)
+ goto err;
+ ...
+
+ devres_remove_group(dev, NULL);
+ return 0;
+
+ err:
+ devres_release_group(dev, NULL);
+ return err_code;
+
+As resource acquisition failure usually means probe failure, constructs
+like above are usually useful in midlayer driver (e.g. libata core
+layer) where interface function shouldn't have side effect on failure.
+For LLDs, just returning error code suffices in most cases.
+
+Each group is identified by `void *id`. It can either be explicitly
+specified by @id argument to devres_open_group() or automatically
+created by passing NULL as @id as in the above example. In both
+cases, devres_open_group() returns the group's id. The returned id
+can be passed to other devres functions to select the target group.
+If NULL is given to those functions, the latest open group is
+selected.
+
+For example, you can do something like the following::
+
+ int my_midlayer_create_something()
+ {
+ if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
+ return -ENOMEM;
+
+ ...
+
+ devres_close_group(dev, my_midlayer_create_something);
+ return 0;
+ }
+
+ void my_midlayer_destroy_something()
+ {
+ devres_release_group(dev, my_midlayer_create_something);
+ }
+
+
+4. Details
+----------
+
+Lifetime of a devres entry begins on devres allocation and finishes
+when it is released or destroyed (removed and freed) - no reference
+counting.
+
+devres core guarantees atomicity to all basic devres operations and
+has support for single-instance devres types (atomic
+lookup-and-add-if-not-found). Other than that, synchronizing
+concurrent accesses to allocated devres data is caller's
+responsibility. This is usually non-issue because bus ops and
+resource allocations already do the job.
+
+For an example of single-instance devres type, read pcim_iomap_table()
+in lib/devres.c.
+
+All devres interface functions can be called without context if the
+right gfp mask is given.
+
+
+5. Overhead
+-----------
+
+Each devres bookkeeping info is allocated together with requested data
+area. With debug option turned off, bookkeeping info occupies 16
+bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
+up to ull alignment). If singly linked list is used, it can be
+reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
+
+Each devres group occupies 8 pointers. It can be reduced to 6 if
+singly linked list is used.
+
+Memory space overhead on ahci controller with two ports is between 300
+and 400 bytes on 32bit machine after naive conversion (we can
+certainly invest a bit more effort into libata core layer).
+
+
+6. List of managed interfaces
+-----------------------------
+
+CLOCK
+ devm_clk_get()
+ devm_clk_get_optional()
+ devm_clk_put()
+ devm_clk_bulk_get()
+ devm_clk_bulk_get_all()
+ devm_clk_bulk_get_optional()
+ devm_get_clk_from_childl()
+ devm_clk_hw_register()
+ devm_of_clk_add_hw_provider()
+ devm_clk_hw_register_clkdev()
+
+DMA
+ dmaenginem_async_device_register()
+ dmam_alloc_coherent()
+ dmam_alloc_attrs()
+ dmam_free_coherent()
+ dmam_pool_create()
+ dmam_pool_destroy()
+
+DRM
+ devm_drm_dev_init()
+
+GPIO
+ devm_gpiod_get()
+ devm_gpiod_get_index()
+ devm_gpiod_get_index_optional()
+ devm_gpiod_get_optional()
+ devm_gpiod_put()
+ devm_gpiod_unhinge()
+ devm_gpiochip_add_data()
+ devm_gpio_request()
+ devm_gpio_request_one()
+ devm_gpio_free()
+
+I2C
+ devm_i2c_new_dummy_device()
+
+IIO
+ devm_iio_device_alloc()
+ devm_iio_device_free()
+ devm_iio_device_register()
+ devm_iio_device_unregister()
+ devm_iio_kfifo_allocate()
+ devm_iio_kfifo_free()
+ devm_iio_triggered_buffer_setup()
+ devm_iio_triggered_buffer_cleanup()
+ devm_iio_trigger_alloc()
+ devm_iio_trigger_free()
+ devm_iio_trigger_register()
+ devm_iio_trigger_unregister()
+ devm_iio_channel_get()
+ devm_iio_channel_release()
+ devm_iio_channel_get_all()
+ devm_iio_channel_release_all()
+
+INPUT
+ devm_input_allocate_device()
+
+IO region
+ devm_release_mem_region()
+ devm_release_region()
+ devm_release_resource()
+ devm_request_mem_region()
+ devm_request_region()
+ devm_request_resource()
+
+IOMAP
+ devm_ioport_map()
+ devm_ioport_unmap()
+ devm_ioremap()
+ devm_ioremap_nocache()
+ devm_ioremap_wc()
+ devm_ioremap_resource() : checks resource, requests memory region, ioremaps
+ devm_iounmap()
+ pcim_iomap()
+ pcim_iomap_regions() : do request_region() and iomap() on multiple BARs
+ pcim_iomap_table() : array of mapped addresses indexed by BAR
+ pcim_iounmap()
+
+IRQ
+ devm_free_irq()
+ devm_request_any_context_irq()
+ devm_request_irq()
+ devm_request_threaded_irq()
+ devm_irq_alloc_descs()
+ devm_irq_alloc_desc()
+ devm_irq_alloc_desc_at()
+ devm_irq_alloc_desc_from()
+ devm_irq_alloc_descs_from()
+ devm_irq_alloc_generic_chip()
+ devm_irq_setup_generic_chip()
+ devm_irq_sim_init()
+
+LED
+ devm_led_classdev_register()
+ devm_led_classdev_unregister()
+
+MDIO
+ devm_mdiobus_alloc()
+ devm_mdiobus_alloc_size()
+ devm_mdiobus_free()
+
+MEM
+ devm_free_pages()
+ devm_get_free_pages()
+ devm_kasprintf()
+ devm_kcalloc()
+ devm_kfree()
+ devm_kmalloc()
+ devm_kmalloc_array()
+ devm_kmemdup()
+ devm_kstrdup()
+ devm_kvasprintf()
+ devm_kzalloc()
+
+MFD
+ devm_mfd_add_devices()
+
+MUX
+ devm_mux_chip_alloc()
+ devm_mux_chip_register()
+ devm_mux_control_get()
+
+PER-CPU MEM
+ devm_alloc_percpu()
+ devm_free_percpu()
+
+PCI
+ devm_pci_alloc_host_bridge() : managed PCI host bridge allocation
+ devm_pci_remap_cfgspace() : ioremap PCI configuration space
+ devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource
+ pcim_enable_device() : after success, all PCI ops become managed
+ pcim_pin_device() : keep PCI device enabled after release
+
+PHY
+ devm_usb_get_phy()
+ devm_usb_put_phy()
+
+PINCTRL
+ devm_pinctrl_get()
+ devm_pinctrl_put()
+ devm_pinctrl_register()
+ devm_pinctrl_unregister()
+
+POWER
+ devm_reboot_mode_register()
+ devm_reboot_mode_unregister()
+
+PWM
+ devm_pwm_get()
+ devm_pwm_put()
+
+REGULATOR
+ devm_regulator_bulk_get()
+ devm_regulator_get()
+ devm_regulator_put()
+ devm_regulator_register()
+
+RESET
+ devm_reset_control_get()
+ devm_reset_controller_register()
+
+SERDEV
+ devm_serdev_device_open()
+
+SLAVE DMA ENGINE
+ devm_acpi_dma_controller_register()
+
+SPI
+ devm_spi_register_master()
+
+WATCHDOG
+ devm_watchdog_register_device()
diff --git a/Documentation/driver-api/driver-model/driver.rst b/Documentation/driver-api/driver-model/driver.rst
new file mode 100644
index 000000000000..11d281506a04
--- /dev/null
+++ b/Documentation/driver-api/driver-model/driver.rst
@@ -0,0 +1,223 @@
+==============
+Device Drivers
+==============
+
+See the kerneldoc for the struct device_driver.
+
+
+Allocation
+~~~~~~~~~~
+
+Device drivers are statically allocated structures. Though there may
+be multiple devices in a system that a driver supports, struct
+device_driver represents the driver as a whole (not a particular
+device instance).
+
+Initialization
+~~~~~~~~~~~~~~
+
+The driver must initialize at least the name and bus fields. It should
+also initialize the devclass field (when it arrives), so it may obtain
+the proper linkage internally. It should also initialize as many of
+the callbacks as possible, though each is optional.
+
+Declaration
+~~~~~~~~~~~
+
+As stated above, struct device_driver objects are statically
+allocated. Below is an example declaration of the eepro100
+driver. This declaration is hypothetical only; it relies on the driver
+being converted completely to the new model::
+
+ static struct device_driver eepro100_driver = {
+ .name = "eepro100",
+ .bus = &pci_bus_type,
+
+ .probe = eepro100_probe,
+ .remove = eepro100_remove,
+ .suspend = eepro100_suspend,
+ .resume = eepro100_resume,
+ };
+
+Most drivers will not be able to be converted completely to the new
+model because the bus they belong to has a bus-specific structure with
+bus-specific fields that cannot be generalized.
+
+The most common example of this are device ID structures. A driver
+typically defines an array of device IDs that it supports. The format
+of these structures and the semantics for comparing device IDs are
+completely bus-specific. Defining them as bus-specific entities would
+sacrifice type-safety, so we keep bus-specific structures around.
+
+Bus-specific drivers should include a generic struct device_driver in
+the definition of the bus-specific driver. Like this::
+
+ struct pci_driver {
+ const struct pci_device_id *id_table;
+ struct device_driver driver;
+ };
+
+A definition that included bus-specific fields would look like
+(using the eepro100 driver again)::
+
+ static struct pci_driver eepro100_driver = {
+ .id_table = eepro100_pci_tbl,
+ .driver = {
+ .name = "eepro100",
+ .bus = &pci_bus_type,
+ .probe = eepro100_probe,
+ .remove = eepro100_remove,
+ .suspend = eepro100_suspend,
+ .resume = eepro100_resume,
+ },
+ };
+
+Some may find the syntax of embedded struct initialization awkward or
+even a bit ugly. So far, it's the best way we've found to do what we want...
+
+Registration
+~~~~~~~~~~~~
+
+::
+
+ int driver_register(struct device_driver *drv);
+
+The driver registers the structure on startup. For drivers that have
+no bus-specific fields (i.e. don't have a bus-specific driver
+structure), they would use driver_register and pass a pointer to their
+struct device_driver object.
+
+Most drivers, however, will have a bus-specific structure and will
+need to register with the bus using something like pci_driver_register.
+
+It is important that drivers register their driver structure as early as
+possible. Registration with the core initializes several fields in the
+struct device_driver object, including the reference count and the
+lock. These fields are assumed to be valid at all times and may be
+used by the device model core or the bus driver.
+
+
+Transition Bus Drivers
+~~~~~~~~~~~~~~~~~~~~~~
+
+By defining wrapper functions, the transition to the new model can be
+made easier. Drivers can ignore the generic structure altogether and
+let the bus wrapper fill in the fields. For the callbacks, the bus can
+define generic callbacks that forward the call to the bus-specific
+callbacks of the drivers.
+
+This solution is intended to be only temporary. In order to get class
+information in the driver, the drivers must be modified anyway. Since
+converting drivers to the new model should reduce some infrastructural
+complexity and code size, it is recommended that they are converted as
+class information is added.
+
+Access
+~~~~~~
+
+Once the object has been registered, it may access the common fields of
+the object, like the lock and the list of devices::
+
+ int driver_for_each_dev(struct device_driver *drv, void *data,
+ int (*callback)(struct device *dev, void *data));
+
+The devices field is a list of all the devices that have been bound to
+the driver. The LDM core provides a helper function to operate on all
+the devices a driver controls. This helper locks the driver on each
+node access, and does proper reference counting on each device as it
+accesses it.
+
+
+sysfs
+~~~~~
+
+When a driver is registered, a sysfs directory is created in its
+bus's directory. In this directory, the driver can export an interface
+to userspace to control operation of the driver on a global basis;
+e.g. toggling debugging output in the driver.
+
+A future feature of this directory will be a 'devices' directory. This
+directory will contain symlinks to the directories of devices it
+supports.
+
+
+
+Callbacks
+~~~~~~~~~
+
+::
+
+ int (*probe) (struct device *dev);
+
+The probe() entry is called in task context, with the bus's rwsem locked
+and the driver partially bound to the device. Drivers commonly use
+container_of() to convert "dev" to a bus-specific type, both in probe()
+and other routines. That type often provides device resource data, such
+as pci_dev.resource[] or platform_device.resources, which is used in
+addition to dev->platform_data to initialize the driver.
+
+This callback holds the driver-specific logic to bind the driver to a
+given device. That includes verifying that the device is present, that
+it's a version the driver can handle, that driver data structures can
+be allocated and initialized, and that any hardware can be initialized.
+Drivers often store a pointer to their state with dev_set_drvdata().
+When the driver has successfully bound itself to that device, then probe()
+returns zero and the driver model code will finish its part of binding
+the driver to that device.
+
+A driver's probe() may return a negative errno value to indicate that
+the driver did not bind to this device, in which case it should have
+released all resources it allocated::
+
+ int (*remove) (struct device *dev);
+
+remove is called to unbind a driver from a device. This may be
+called if a device is physically removed from the system, if the
+driver module is being unloaded, during a reboot sequence, or
+in other cases.
+
+It is up to the driver to determine if the device is present or
+not. It should free any resources allocated specifically for the
+device; i.e. anything in the device's driver_data field.
+
+If the device is still present, it should quiesce the device and place
+it into a supported low-power state::
+
+ int (*suspend) (struct device *dev, pm_message_t state);
+
+suspend is called to put the device in a low power state::
+
+ int (*resume) (struct device *dev);
+
+Resume is used to bring a device back from a low power state.
+
+
+Attributes
+~~~~~~~~~~
+
+::
+
+ struct driver_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct device_driver *driver, char *buf);
+ ssize_t (*store)(struct device_driver *, const char *buf, size_t count);
+ };
+
+Device drivers can export attributes via their sysfs directories.
+Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO
+macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
+macros.
+
+Example::
+
+ DRIVER_ATTR_RW(debug);
+
+This is equivalent to declaring::
+
+ struct driver_attribute driver_attr_debug;
+
+This can then be used to add and remove the attribute from the
+driver's directory using::
+
+ int driver_create_file(struct device_driver *, const struct driver_attribute *);
+ void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-api/driver-model/index.rst b/Documentation/driver-api/driver-model/index.rst
new file mode 100644
index 000000000000..755016422269
--- /dev/null
+++ b/Documentation/driver-api/driver-model/index.rst
@@ -0,0 +1,24 @@
+============
+Driver Model
+============
+
+.. toctree::
+ :maxdepth: 1
+
+ binding
+ bus
+ class
+ design-patterns
+ device
+ devres
+ driver
+ overview
+ platform
+ porting
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/driver-api/driver-model/overview.rst b/Documentation/driver-api/driver-model/overview.rst
new file mode 100644
index 000000000000..d4d1e9b40e0c
--- /dev/null
+++ b/Documentation/driver-api/driver-model/overview.rst
@@ -0,0 +1,124 @@
+=============================
+The Linux Kernel Device Model
+=============================
+
+Patrick Mochel <mochel@digitalimplant.org>
+
+Drafted 26 August 2002
+Updated 31 January 2006
+
+
+Overview
+~~~~~~~~
+
+The Linux Kernel Driver Model is a unification of all the disparate driver
+models that were previously used in the kernel. It is intended to augment the
+bus-specific drivers for bridges and devices by consolidating a set of data
+and operations into globally accessible data structures.
+
+Traditional driver models implemented some sort of tree-like structure
+(sometimes just a list) for the devices they control. There wasn't any
+uniformity across the different bus types.
+
+The current driver model provides a common, uniform data model for describing
+a bus and the devices that can appear under the bus. The unified bus
+model includes a set of common attributes which all busses carry, and a set
+of common callbacks, such as device discovery during bus probing, bus
+shutdown, bus power management, etc.
+
+The common device and bridge interface reflects the goals of the modern
+computer: namely the ability to do seamless device "plug and play", power
+management, and hot plug. In particular, the model dictated by Intel and
+Microsoft (namely ACPI) ensures that almost every device on almost any bus
+on an x86-compatible system can work within this paradigm. Of course,
+not every bus is able to support all such operations, although most
+buses support most of those operations.
+
+
+Downstream Access
+~~~~~~~~~~~~~~~~~
+
+Common data fields have been moved out of individual bus layers into a common
+data structure. These fields must still be accessed by the bus layers,
+and sometimes by the device-specific drivers.
+
+Other bus layers are encouraged to do what has been done for the PCI layer.
+struct pci_dev now looks like this::
+
+ struct pci_dev {
+ ...
+
+ struct device dev; /* Generic device interface */
+ ...
+ };
+
+Note first that the struct device dev within the struct pci_dev is
+statically allocated. This means only one allocation on device discovery.
+
+Note also that that struct device dev is not necessarily defined at the
+front of the pci_dev structure. This is to make people think about what
+they're doing when switching between the bus driver and the global driver,
+and to discourage meaningless and incorrect casts between the two.
+
+The PCI bus layer freely accesses the fields of struct device. It knows about
+the structure of struct pci_dev, and it should know the structure of struct
+device. Individual PCI device drivers that have been converted to the current
+driver model generally do not and should not touch the fields of struct device,
+unless there is a compelling reason to do so.
+
+The above abstraction prevents unnecessary pain during transitional phases.
+If it were not done this way, then when a field was renamed or removed, every
+downstream driver would break. On the other hand, if only the bus layer
+(and not the device layer) accesses the struct device, it is only the bus
+layer that needs to change.
+
+
+User Interface
+~~~~~~~~~~~~~~
+
+By virtue of having a complete hierarchical view of all the devices in the
+system, exporting a complete hierarchical view to userspace becomes relatively
+easy. This has been accomplished by implementing a special purpose virtual
+file system named sysfs.
+
+Almost all mainstream Linux distros mount this filesystem automatically; you
+can see some variation of the following in the output of the "mount" command::
+
+ $ mount
+ ...
+ none on /sys type sysfs (rw,noexec,nosuid,nodev)
+ ...
+ $
+
+The auto-mounting of sysfs is typically accomplished by an entry similar to
+the following in the /etc/fstab file::
+
+ none /sys sysfs defaults 0 0
+
+or something similar in the /lib/init/fstab file on Debian-based systems::
+
+ none /sys sysfs nodev,noexec,nosuid 0 0
+
+If sysfs is not automatically mounted, you can always do it manually with::
+
+ # mount -t sysfs sysfs /sys
+
+Whenever a device is inserted into the tree, a directory is created for it.
+This directory may be populated at each layer of discovery - the global layer,
+the bus layer, or the device layer.
+
+The global layer currently creates two files - 'name' and 'power'. The
+former only reports the name of the device. The latter reports the
+current power state of the device. It will also be used to set the current
+power state.
+
+The bus layer may also create files for the devices it finds while probing the
+bus. For example, the PCI layer currently creates 'irq' and 'resource' files
+for each PCI device.
+
+A device-specific driver may also export files in its directory to expose
+device-specific data or tunable interfaces.
+
+More information about the sysfs directory layout can be found in
+the other documents in this directory and in the file
+Documentation/filesystems/sysfs.txt.
diff --git a/Documentation/driver-api/driver-model/platform.rst b/Documentation/driver-api/driver-model/platform.rst
new file mode 100644
index 000000000000..334dd4071ae4
--- /dev/null
+++ b/Documentation/driver-api/driver-model/platform.rst
@@ -0,0 +1,246 @@
+============================
+Platform Devices and Drivers
+============================
+
+See <linux/platform_device.h> for the driver model interface to the
+platform bus: platform_device, and platform_driver. This pseudo-bus
+is used to connect devices on busses with minimal infrastructure,
+like those used to integrate peripherals on many system-on-chip
+processors, or some "legacy" PC interconnects; as opposed to large
+formally specified ones like PCI or USB.
+
+
+Platform devices
+~~~~~~~~~~~~~~~~
+Platform devices are devices that typically appear as autonomous
+entities in the system. This includes legacy port-based devices and
+host bridges to peripheral buses, and most controllers integrated
+into system-on-chip platforms. What they usually have in common
+is direct addressing from a CPU bus. Rarely, a platform_device will
+be connected through a segment of some other kind of bus; but its
+registers will still be directly addressable.
+
+Platform devices are given a name, used in driver binding, and a
+list of resources such as addresses and IRQs::
+
+ struct platform_device {
+ const char *name;
+ u32 id;
+ struct device dev;
+ u32 num_resources;
+ struct resource *resource;
+ };
+
+
+Platform drivers
+~~~~~~~~~~~~~~~~
+Platform drivers follow the standard driver model convention, where
+discovery/enumeration is handled outside the drivers, and drivers
+provide probe() and remove() methods. They support power management
+and shutdown notifications using the standard conventions::
+
+ struct platform_driver {
+ int (*probe)(struct platform_device *);
+ int (*remove)(struct platform_device *);
+ void (*shutdown)(struct platform_device *);
+ int (*suspend)(struct platform_device *, pm_message_t state);
+ int (*suspend_late)(struct platform_device *, pm_message_t state);
+ int (*resume_early)(struct platform_device *);
+ int (*resume)(struct platform_device *);
+ struct device_driver driver;
+ };
+
+Note that probe() should in general verify that the specified device hardware
+actually exists; sometimes platform setup code can't be sure. The probing
+can use device resources, including clocks, and device platform_data.
+
+Platform drivers register themselves the normal way::
+
+ int platform_driver_register(struct platform_driver *drv);
+
+Or, in common situations where the device is known not to be hot-pluggable,
+the probe() routine can live in an init section to reduce the driver's
+runtime memory footprint::
+
+ int platform_driver_probe(struct platform_driver *drv,
+ int (*probe)(struct platform_device *))
+
+Kernel modules can be composed of several platform drivers. The platform core
+provides helpers to register and unregister an array of drivers::
+
+ int __platform_register_drivers(struct platform_driver * const *drivers,
+ unsigned int count, struct module *owner);
+ void platform_unregister_drivers(struct platform_driver * const *drivers,
+ unsigned int count);
+
+If one of the drivers fails to register, all drivers registered up to that
+point will be unregistered in reverse order. Note that there is a convenience
+macro that passes THIS_MODULE as owner parameter::
+
+ #define platform_register_drivers(drivers, count)
+
+
+Device Enumeration
+~~~~~~~~~~~~~~~~~~
+As a rule, platform specific (and often board-specific) setup code will
+register platform devices::
+
+ int platform_device_register(struct platform_device *pdev);
+
+ int platform_add_devices(struct platform_device **pdevs, int ndev);
+
+The general rule is to register only those devices that actually exist,
+but in some cases extra devices might be registered. For example, a kernel
+might be configured to work with an external network adapter that might not
+be populated on all boards, or likewise to work with an integrated controller
+that some boards might not hook up to any peripherals.
+
+In some cases, boot firmware will export tables describing the devices
+that are populated on a given board. Without such tables, often the
+only way for system setup code to set up the correct devices is to build
+a kernel for a specific target board. Such board-specific kernels are
+common with embedded and custom systems development.
+
+In many cases, the memory and IRQ resources associated with the platform
+device are not enough to let the device's driver work. Board setup code
+will often provide additional information using the device's platform_data
+field to hold additional information.
+
+Embedded systems frequently need one or more clocks for platform devices,
+which are normally kept off until they're actively needed (to save power).
+System setup also associates those clocks with the device, so that that
+calls to clk_get(&pdev->dev, clock_name) return them as needed.
+
+
+Legacy Drivers: Device Probing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Some drivers are not fully converted to the driver model, because they take
+on a non-driver role: the driver registers its platform device, rather than
+leaving that for system infrastructure. Such drivers can't be hotplugged
+or coldplugged, since those mechanisms require device creation to be in a
+different system component than the driver.
+
+The only "good" reason for this is to handle older system designs which, like
+original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
+configuration. Newer systems have largely abandoned that model, in favor of
+bus-level support for dynamic configuration (PCI, USB), or device tables
+provided by the boot firmware (e.g. PNPACPI on x86). There are too many
+conflicting options about what might be where, and even educated guesses by
+an operating system will be wrong often enough to make trouble.
+
+This style of driver is discouraged. If you're updating such a driver,
+please try to move the device enumeration to a more appropriate location,
+outside the driver. This will usually be cleanup, since such drivers
+tend to already have "normal" modes, such as ones using device nodes that
+were created by PNP or by platform device setup.
+
+None the less, there are some APIs to support such legacy drivers. Avoid
+using these calls except with such hotplug-deficient drivers::
+
+ struct platform_device *platform_device_alloc(
+ const char *name, int id);
+
+You can use platform_device_alloc() to dynamically allocate a device, which
+you will then initialize with resources and platform_device_register().
+A better solution is usually::
+
+ struct platform_device *platform_device_register_simple(
+ const char *name, int id,
+ struct resource *res, unsigned int nres);
+
+You can use platform_device_register_simple() as a one-step call to allocate
+and register a device.
+
+
+Device Naming and Driver Binding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The platform_device.dev.bus_id is the canonical name for the devices.
+It's built from two components:
+
+ * platform_device.name ... which is also used to for driver matching.
+
+ * platform_device.id ... the device instance number, or else "-1"
+ to indicate there's only one.
+
+These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
+"serial/3" indicates bus_id "serial.3"; both would use the platform_driver
+named "serial". While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
+and use the platform_driver called "my_rtc".
+
+Driver binding is performed automatically by the driver core, invoking
+driver probe() after finding a match between device and driver. If the
+probe() succeeds, the driver and device are bound as usual. There are
+three different ways to find such a match:
+
+ - Whenever a device is registered, the drivers for that bus are
+ checked for matches. Platform devices should be registered very
+ early during system boot.
+
+ - When a driver is registered using platform_driver_register(), all
+ unbound devices on that bus are checked for matches. Drivers
+ usually register later during booting, or by module loading.
+
+ - Registering a driver using platform_driver_probe() works just like
+ using platform_driver_register(), except that the driver won't
+ be probed later if another device registers. (Which is OK, since
+ this interface is only for use with non-hotpluggable devices.)
+
+
+Early Platform Devices and Drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The early platform interfaces provide platform data to platform device
+drivers early on during the system boot. The code is built on top of the
+early_param() command line parsing and can be executed very early on.
+
+Example: "earlyprintk" class early serial console in 6 steps
+
+1. Registering early platform device data
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code registers platform device data using the function
+early_platform_add_devices(). In the case of early serial console this
+should be hardware configuration for the serial port. Devices registered
+at this point will later on be matched against early platform drivers.
+
+2. Parsing kernel command line
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls parse_early_param() to parse the kernel
+command line. This will execute all matching early_param() callbacks.
+User specified early platform devices will be registered at this point.
+For the early serial console case the user can specify port on the
+kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
+the class string, "serial" is the name of the platform driver and
+0 is the platform device id. If the id is -1 then the dot and the
+id can be omitted.
+
+3. Installing early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code may optionally force registration of all early
+platform drivers belonging to a certain class using the function
+early_platform_driver_register_all(). User specified devices from
+step 2 have priority over these. This step is omitted by the serial
+driver example since the early serial driver code should be disabled
+unless the user has specified port on the kernel command line.
+
+4. Early platform driver registration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Compiled-in platform drivers making use of early_platform_init() are
+automatically registered during step 2 or 3. The serial driver example
+should use early_platform_init("earlyprintk", &platform_driver).
+
+5. Probing of early platform drivers belonging to a certain class
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The architecture code calls early_platform_driver_probe() to match
+registered early platform devices associated with a certain class with
+registered early platform drivers. Matched devices will get probed().
+This step can be executed at any point during the early boot. As soon
+as possible may be good for the serial port case.
+
+6. Inside the early platform driver probe()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The driver code needs to take special care during early boot, especially
+when it comes to memory allocation and interrupt registration. The code
+in the probe() function can use is_early_platform_device() to check if
+it is called at early platform device or at the regular platform device
+time. The early serial driver performs register_console() at this point.
+
+For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-api/driver-model/porting.rst b/Documentation/driver-api/driver-model/porting.rst
new file mode 100644
index 000000000000..931ea879af3f
--- /dev/null
+++ b/Documentation/driver-api/driver-model/porting.rst
@@ -0,0 +1,448 @@
+=======================================
+Porting Drivers to the New Driver Model
+=======================================
+
+Patrick Mochel
+
+7 January 2003
+
+
+Overview
+
+Please refer to `Documentation/driver-api/driver-model/*.rst` for definitions of
+various driver types and concepts.
+
+Most of the work of porting devices drivers to the new model happens
+at the bus driver layer. This was intentional, to minimize the
+negative effect on kernel drivers, and to allow a gradual transition
+of bus drivers.
+
+In a nutshell, the driver model consists of a set of objects that can
+be embedded in larger, bus-specific objects. Fields in these generic
+objects can replace fields in the bus-specific objects.
+
+The generic objects must be registered with the driver model core. By
+doing so, they will exported via the sysfs filesystem. sysfs can be
+mounted by doing::
+
+ # mount -t sysfs sysfs /sys
+
+
+
+The Process
+
+Step 0: Read include/linux/device.h for object and function definitions.
+
+Step 1: Registering the bus driver.
+
+
+- Define a struct bus_type for the bus driver::
+
+ struct bus_type pci_bus_type = {
+ .name = "pci",
+ };
+
+
+- Register the bus type.
+
+ This should be done in the initialization function for the bus type,
+ which is usually the module_init(), or equivalent, function::
+
+ static int __init pci_driver_init(void)
+ {
+ return bus_register(&pci_bus_type);
+ }
+
+ subsys_initcall(pci_driver_init);
+
+
+ The bus type may be unregistered (if the bus driver may be compiled
+ as a module) by doing::
+
+ bus_unregister(&pci_bus_type);
+
+
+- Export the bus type for others to use.
+
+ Other code may wish to reference the bus type, so declare it in a
+ shared header file and export the symbol.
+
+From include/linux/pci.h::
+
+ extern struct bus_type pci_bus_type;
+
+
+From file the above code appears in::
+
+ EXPORT_SYMBOL(pci_bus_type);
+
+
+
+- This will cause the bus to show up in /sys/bus/pci/ with two
+ subdirectories: 'devices' and 'drivers'::
+
+ # tree -d /sys/bus/pci/
+ /sys/bus/pci/
+ |-- devices
+ `-- drivers
+
+
+
+Step 2: Registering Devices.
+
+struct device represents a single device. It mainly contains metadata
+describing the relationship the device has to other entities.
+
+
+- Embed a struct device in the bus-specific device type::
+
+
+ struct pci_dev {
+ ...
+ struct device dev; /* Generic device interface */
+ ...
+ };
+
+ It is recommended that the generic device not be the first item in
+ the struct to discourage programmers from doing mindless casts
+ between the object types. Instead macros, or inline functions,
+ should be created to convert from the generic object type::
+
+
+ #define to_pci_dev(n) container_of(n, struct pci_dev, dev)
+
+ or
+
+ static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
+ {
+ return container_of(n, struct pci_dev, dev);
+ }
+
+ This allows the compiler to verify type-safety of the operations
+ that are performed (which is Good).
+
+
+- Initialize the device on registration.
+
+ When devices are discovered or registered with the bus type, the
+ bus driver should initialize the generic device. The most important
+ things to initialize are the bus_id, parent, and bus fields.
+
+ The bus_id is an ASCII string that contains the device's address on
+ the bus. The format of this string is bus-specific. This is
+ necessary for representing devices in sysfs.
+
+ parent is the physical parent of the device. It is important that
+ the bus driver sets this field correctly.
+
+ The driver model maintains an ordered list of devices that it uses
+ for power management. This list must be in order to guarantee that
+ devices are shutdown before their physical parents, and vice versa.
+ The order of this list is determined by the parent of registered
+ devices.
+
+ Also, the location of the device's sysfs directory depends on a
+ device's parent. sysfs exports a directory structure that mirrors
+ the device hierarchy. Accurately setting the parent guarantees that
+ sysfs will accurately represent the hierarchy.
+
+ The device's bus field is a pointer to the bus type the device
+ belongs to. This should be set to the bus_type that was declared
+ and initialized before.
+
+ Optionally, the bus driver may set the device's name and release
+ fields.
+
+ The name field is an ASCII string describing the device, like
+
+ "ATI Technologies Inc Radeon QD"
+
+ The release field is a callback that the driver model core calls
+ when the device has been removed, and all references to it have
+ been released. More on this in a moment.
+
+
+- Register the device.
+
+ Once the generic device has been initialized, it can be registered
+ with the driver model core by doing::
+
+ device_register(&dev->dev);
+
+ It can later be unregistered by doing::
+
+ device_unregister(&dev->dev);
+
+ This should happen on buses that support hotpluggable devices.
+ If a bus driver unregisters a device, it should not immediately free
+ it. It should instead wait for the driver model core to call the
+ device's release method, then free the bus-specific object.
+ (There may be other code that is currently referencing the device
+ structure, and it would be rude to free the device while that is
+ happening).
+
+
+ When the device is registered, a directory in sysfs is created.
+ The PCI tree in sysfs looks like::
+
+ /sys/devices/pci0/
+ |-- 00:00.0
+ |-- 00:01.0
+ | `-- 01:00.0
+ |-- 00:02.0
+ | `-- 02:1f.0
+ | `-- 03:00.0
+ |-- 00:1e.0
+ | `-- 04:04.0
+ |-- 00:1f.0
+ |-- 00:1f.1
+ | |-- ide0
+ | | |-- 0.0
+ | | `-- 0.1
+ | `-- ide1
+ | `-- 1.0
+ |-- 00:1f.2
+ |-- 00:1f.3
+ `-- 00:1f.5
+
+ Also, symlinks are created in the bus's 'devices' directory
+ that point to the device's directory in the physical hierarchy::
+
+ /sys/bus/pci/devices/
+ |-- 00:00.0 -> ../../../devices/pci0/00:00.0
+ |-- 00:01.0 -> ../../../devices/pci0/00:01.0
+ |-- 00:02.0 -> ../../../devices/pci0/00:02.0
+ |-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
+ |-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
+ |-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
+ |-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
+ |-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
+ |-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
+ |-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
+ |-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
+ |-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
+ `-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
+
+
+
+Step 3: Registering Drivers.
+
+struct device_driver is a simple driver structure that contains a set
+of operations that the driver model core may call.
+
+
+- Embed a struct device_driver in the bus-specific driver.
+
+ Just like with devices, do something like::
+
+ struct pci_driver {
+ ...
+ struct device_driver driver;
+ };
+
+
+- Initialize the generic driver structure.
+
+ When the driver registers with the bus (e.g. doing pci_register_driver()),
+ initialize the necessary fields of the driver: the name and bus
+ fields.
+
+
+- Register the driver.
+
+ After the generic driver has been initialized, call::
+
+ driver_register(&drv->driver);
+
+ to register the driver with the core.
+
+ When the driver is unregistered from the bus, unregister it from the
+ core by doing::
+
+ driver_unregister(&drv->driver);
+
+ Note that this will block until all references to the driver have
+ gone away. Normally, there will not be any.
+
+
+- Sysfs representation.
+
+ Drivers are exported via sysfs in their bus's 'driver's directory.
+ For example::
+
+ /sys/bus/pci/drivers/
+ |-- 3c59x
+ |-- Ensoniq AudioPCI
+ |-- agpgart-amdk7
+ |-- e100
+ `-- serial
+
+
+Step 4: Define Generic Methods for Drivers.
+
+struct device_driver defines a set of operations that the driver model
+core calls. Most of these operations are probably similar to
+operations the bus already defines for drivers, but taking different
+parameters.
+
+It would be difficult and tedious to force every driver on a bus to
+simultaneously convert their drivers to generic format. Instead, the
+bus driver should define single instances of the generic methods that
+forward call to the bus-specific drivers. For instance::
+
+
+ static int pci_device_remove(struct device * dev)
+ {
+ struct pci_dev * pci_dev = to_pci_dev(dev);
+ struct pci_driver * drv = pci_dev->driver;
+
+ if (drv) {
+ if (drv->remove)
+ drv->remove(pci_dev);
+ pci_dev->driver = NULL;
+ }
+ return 0;
+ }
+
+
+The generic driver should be initialized with these methods before it
+is registered::
+
+ /* initialize common driver fields */
+ drv->driver.name = drv->name;
+ drv->driver.bus = &pci_bus_type;
+ drv->driver.probe = pci_device_probe;
+ drv->driver.resume = pci_device_resume;
+ drv->driver.suspend = pci_device_suspend;
+ drv->driver.remove = pci_device_remove;
+
+ /* register with core */
+ driver_register(&drv->driver);
+
+
+Ideally, the bus should only initialize the fields if they are not
+already set. This allows the drivers to implement their own generic
+methods.
+
+
+Step 5: Support generic driver binding.
+
+The model assumes that a device or driver can be dynamically
+registered with the bus at any time. When registration happens,
+devices must be bound to a driver, or drivers must be bound to all
+devices that it supports.
+
+A driver typically contains a list of device IDs that it supports. The
+bus driver compares these IDs to the IDs of devices registered with it.
+The format of the device IDs, and the semantics for comparing them are
+bus-specific, so the generic model does attempt to generalize them.
+
+Instead, a bus may supply a method in struct bus_type that does the
+comparison::
+
+ int (*match)(struct device * dev, struct device_driver * drv);
+
+match should return positive value if the driver supports the device,
+and zero otherwise. It may also return error code (for example
+-EPROBE_DEFER) if determining that given driver supports the device is
+not possible.
+
+When a device is registered, the bus's list of drivers is iterated
+over. bus->match() is called for each one until a match is found.
+
+When a driver is registered, the bus's list of devices is iterated
+over. bus->match() is called for each device that is not already
+claimed by a driver.
+
+When a device is successfully bound to a driver, device->driver is
+set, the device is added to a per-driver list of devices, and a
+symlink is created in the driver's sysfs directory that points to the
+device's physical directory::
+
+ /sys/bus/pci/drivers/
+ |-- 3c59x
+ | `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
+ |-- Ensoniq AudioPCI
+ |-- agpgart-amdk7
+ | `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
+ |-- e100
+ | `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
+ `-- serial
+
+
+This driver binding should replace the existing driver binding
+mechanism the bus currently uses.
+
+
+Step 6: Supply a hotplug callback.
+
+Whenever a device is registered with the driver model core, the
+userspace program /sbin/hotplug is called to notify userspace.
+Users can define actions to perform when a device is inserted or
+removed.
+
+The driver model core passes several arguments to userspace via
+environment variables, including
+
+- ACTION: set to 'add' or 'remove'
+- DEVPATH: set to the device's physical path in sysfs.
+
+A bus driver may also supply additional parameters for userspace to
+consume. To do this, a bus must implement the 'hotplug' method in
+struct bus_type::
+
+ int (*hotplug) (struct device *dev, char **envp,
+ int num_envp, char *buffer, int buffer_size);
+
+This is called immediately before /sbin/hotplug is executed.
+
+
+Step 7: Cleaning up the bus driver.
+
+The generic bus, device, and driver structures provide several fields
+that can replace those defined privately to the bus driver.
+
+- Device list.
+
+struct bus_type contains a list of all devices registered with the bus
+type. This includes all devices on all instances of that bus type.
+An internal list that the bus uses may be removed, in favor of using
+this one.
+
+The core provides an iterator to access these devices::
+
+ int bus_for_each_dev(struct bus_type * bus, struct device * start,
+ void * data, int (*fn)(struct device *, void *));
+
+
+- Driver list.
+
+struct bus_type also contains a list of all drivers registered with
+it. An internal list of drivers that the bus driver maintains may
+be removed in favor of using the generic one.
+
+The drivers may be iterated over, like devices::
+
+ int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
+ void * data, int (*fn)(struct device_driver *, void *));
+
+
+Please see drivers/base/bus.c for more information.
+
+
+- rwsem
+
+struct bus_type contains an rwsem that protects all core accesses to
+the device and driver lists. This can be used by the bus driver
+internally, and should be used when accessing the device or driver
+lists the bus maintains.
+
+
+- Device and driver fields.
+
+Some of the fields in struct device and struct device_driver duplicate
+fields in the bus-specific representations of these objects. Feel free
+to remove the bus-specific ones and favor the generic ones. Note
+though, that this will likely mean fixing up all the drivers that
+reference the bus-specific fields (though those should all be 1-line
+changes).
diff --git a/Documentation/driver-api/early-userspace/buffer-format.rst b/Documentation/driver-api/early-userspace/buffer-format.rst
new file mode 100644
index 000000000000..7f74e301fdf3
--- /dev/null
+++ b/Documentation/driver-api/early-userspace/buffer-format.rst
@@ -0,0 +1,119 @@
+=======================
+initramfs buffer format
+=======================
+
+Al Viro, H. Peter Anvin
+
+Last revision: 2002-01-13
+
+Starting with kernel 2.5.x, the old "initial ramdisk" protocol is
+getting {replaced/complemented} with the new "initial ramfs"
+(initramfs) protocol. The initramfs contents is passed using the same
+memory buffer protocol used by the initrd protocol, but the contents
+is different. The initramfs buffer contains an archive which is
+expanded into a ramfs filesystem; this document details the format of
+the initramfs buffer format.
+
+The initramfs buffer format is based around the "newc" or "crc" CPIO
+formats, and can be created with the cpio(1) utility. The cpio
+archive can be compressed using gzip(1). One valid version of an
+initramfs buffer is thus a single .cpio.gz file.
+
+The full format of the initramfs buffer is defined by the following
+grammar, where::
+
+ * is used to indicate "0 or more occurrences of"
+ (|) indicates alternatives
+ + indicates concatenation
+ GZIP() indicates the gzip(1) of the operand
+ ALGN(n) means padding with null bytes to an n-byte boundary
+
+ initramfs := ("\0" | cpio_archive | cpio_gzip_archive)*
+
+ cpio_gzip_archive := GZIP(cpio_archive)
+
+ cpio_archive := cpio_file* + (<nothing> | cpio_trailer)
+
+ cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data
+
+ cpio_trailer := ALGN(4) + cpio_header + "TRAILER!!!\0" + ALGN(4)
+
+
+In human terms, the initramfs buffer contains a collection of
+compressed and/or uncompressed cpio archives (in the "newc" or "crc"
+formats); arbitrary amounts zero bytes (for padding) can be added
+between members.
+
+The cpio "TRAILER!!!" entry (cpio end-of-archive) is optional, but is
+not ignored; see "handling of hard links" below.
+
+The structure of the cpio_header is as follows (all fields contain
+hexadecimal ASCII numbers fully padded with '0' on the left to the
+full width of the field, for example, the integer 4780 is represented
+by the ASCII string "000012ac"):
+
+============= ================== ==============================================
+Field name Field size Meaning
+============= ================== ==============================================
+c_magic 6 bytes The string "070701" or "070702"
+c_ino 8 bytes File inode number
+c_mode 8 bytes File mode and permissions
+c_uid 8 bytes File uid
+c_gid 8 bytes File gid
+c_nlink 8 bytes Number of links
+c_mtime 8 bytes Modification time
+c_filesize 8 bytes Size of data field
+c_maj 8 bytes Major part of file device number
+c_min 8 bytes Minor part of file device number
+c_rmaj 8 bytes Major part of device node reference
+c_rmin 8 bytes Minor part of device node reference
+c_namesize 8 bytes Length of filename, including final \0
+c_chksum 8 bytes Checksum of data field if c_magic is 070702;
+ otherwise zero
+============= ================== ==============================================
+
+The c_mode field matches the contents of st_mode returned by stat(2)
+on Linux, and encodes the file type and file permissions.
+
+The c_filesize should be zero for any file which is not a regular file
+or symlink.
+
+The c_chksum field contains a simple 32-bit unsigned sum of all the
+bytes in the data field. cpio(1) refers to this as "crc", which is
+clearly incorrect (a cyclic redundancy check is a different and
+significantly stronger integrity check), however, this is the
+algorithm used.
+
+If the filename is "TRAILER!!!" this is actually an end-of-archive
+marker; the c_filesize for an end-of-archive marker must be zero.
+
+
+Handling of hard links
+======================
+
+When a nondirectory with c_nlink > 1 is seen, the (c_maj,c_min,c_ino)
+tuple is looked up in a tuple buffer. If not found, it is entered in
+the tuple buffer and the entry is created as usual; if found, a hard
+link rather than a second copy of the file is created. It is not
+necessary (but permitted) to include a second copy of the file
+contents; if the file contents is not included, the c_filesize field
+should be set to zero to indicate no data section follows. If data is
+present, the previous instance of the file is overwritten; this allows
+the data-carrying instance of a file to occur anywhere in the sequence
+(GNU cpio is reported to attach the data to the last instance of a
+file only.)
+
+c_filesize must not be zero for a symlink.
+
+When a "TRAILER!!!" end-of-archive marker is seen, the tuple buffer is
+reset. This permits archives which are generated independently to be
+concatenated.
+
+To combine file data from different sources (without having to
+regenerate the (c_maj,c_min,c_ino) fields), therefore, either one of
+the following techniques can be used:
+
+a) Separate the different file data sources with a "TRAILER!!!"
+ end-of-archive marker, or
+
+b) Make sure c_nlink == 1 for all nondirectory entries.
diff --git a/Documentation/driver-api/early-userspace/early_userspace_support.rst b/Documentation/driver-api/early-userspace/early_userspace_support.rst
new file mode 100644
index 000000000000..3deefb34046b
--- /dev/null
+++ b/Documentation/driver-api/early-userspace/early_userspace_support.rst
@@ -0,0 +1,154 @@
+=======================
+Early userspace support
+=======================
+
+Last update: 2004-12-20 tlh
+
+
+"Early userspace" is a set of libraries and programs that provide
+various pieces of functionality that are important enough to be
+available while a Linux kernel is coming up, but that don't need to be
+run inside the kernel itself.
+
+It consists of several major infrastructure components:
+
+- gen_init_cpio, a program that builds a cpio-format archive
+ containing a root filesystem image. This archive is compressed, and
+ the compressed image is linked into the kernel image.
+- initramfs, a chunk of code that unpacks the compressed cpio image
+ midway through the kernel boot process.
+- klibc, a userspace C library, currently packaged separately, that is
+ optimized for correctness and small size.
+
+The cpio file format used by initramfs is the "newc" (aka "cpio -H newc")
+format, and is documented in the file "buffer-format.txt". There are
+two ways to add an early userspace image: specify an existing cpio
+archive to be used as the image or have the kernel build process build
+the image from specifications.
+
+CPIO ARCHIVE method
+-------------------
+
+You can create a cpio archive that contains the early userspace image.
+Your cpio archive should be specified in CONFIG_INITRAMFS_SOURCE and it
+will be used directly. Only a single cpio file may be specified in
+CONFIG_INITRAMFS_SOURCE and directory and file names are not allowed in
+combination with a cpio archive.
+
+IMAGE BUILDING method
+---------------------
+
+The kernel build process can also build an early userspace image from
+source parts rather than supplying a cpio archive. This method provides
+a way to create images with root-owned files even though the image was
+built by an unprivileged user.
+
+The image is specified as one or more sources in
+CONFIG_INITRAMFS_SOURCE. Sources can be either directories or files -
+cpio archives are *not* allowed when building from sources.
+
+A source directory will have it and all of its contents packaged. The
+specified directory name will be mapped to '/'. When packaging a
+directory, limited user and group ID translation can be performed.
+INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to
+user root (0). INITRAMFS_ROOT_GID can be set to a group ID that needs
+to be mapped to group root (0).
+
+A source file must be directives in the format required by the
+usr/gen_init_cpio utility (run 'usr/gen_init_cpio -h' to get the
+file format). The directives in the file will be passed directly to
+usr/gen_init_cpio.
+
+When a combination of directories and files are specified then the
+initramfs image will be an aggregate of all of them. In this way a user
+can create a 'root-image' directory and install all files into it.
+Because device-special files cannot be created by a unprivileged user,
+special files can be listed in a 'root-files' file. Both 'root-image'
+and 'root-files' can be listed in CONFIG_INITRAMFS_SOURCE and a complete
+early userspace image can be built by an unprivileged user.
+
+As a technical note, when directories and files are specified, the
+entire CONFIG_INITRAMFS_SOURCE is passed to
+usr/gen_initramfs_list.sh. This means that CONFIG_INITRAMFS_SOURCE
+can really be interpreted as any legal argument to
+gen_initramfs_list.sh. If a directory is specified as an argument then
+the contents are scanned, uid/gid translation is performed, and
+usr/gen_init_cpio file directives are output. If a directory is
+specified as an argument to usr/gen_initramfs_list.sh then the
+contents of the file are simply copied to the output. All of the output
+directives from directory scanning and file contents copying are
+processed by usr/gen_init_cpio.
+
+See also 'usr/gen_initramfs_list.sh -h'.
+
+Where's this all leading?
+=========================
+
+The klibc distribution contains some of the necessary software to make
+early userspace useful. The klibc distribution is currently
+maintained separately from the kernel.
+
+You can obtain somewhat infrequent snapshots of klibc from
+https://www.kernel.org/pub/linux/libs/klibc/
+
+For active users, you are better off using the klibc git
+repository, at http://git.kernel.org/?p=libs/klibc/klibc.git
+
+The standalone klibc distribution currently provides three components,
+in addition to the klibc library:
+
+- ipconfig, a program that configures network interfaces. It can
+ configure them statically, or use DHCP to obtain information
+ dynamically (aka "IP autoconfiguration").
+- nfsmount, a program that can mount an NFS filesystem.
+- kinit, the "glue" that uses ipconfig and nfsmount to replace the old
+ support for IP autoconfig, mount a filesystem over NFS, and continue
+ system boot using that filesystem as root.
+
+kinit is built as a single statically linked binary to save space.
+
+Eventually, several more chunks of kernel functionality will hopefully
+move to early userspace:
+
+- Almost all of init/do_mounts* (the beginning of this is already in
+ place)
+- ACPI table parsing
+- Insert unwieldy subsystem that doesn't really need to be in kernel
+ space here
+
+If kinit doesn't meet your current needs and you've got bytes to burn,
+the klibc distribution includes a small Bourne-compatible shell (ash)
+and a number of other utilities, so you can replace kinit and build
+custom initramfs images that meet your needs exactly.
+
+For questions and help, you can sign up for the early userspace
+mailing list at http://www.zytor.com/mailman/listinfo/klibc
+
+How does it work?
+=================
+
+The kernel has currently 3 ways to mount the root filesystem:
+
+a) all required device and filesystem drivers compiled into the kernel, no
+ initrd. init/main.c:init() will call prepare_namespace() to mount the
+ final root filesystem, based on the root= option and optional init= to run
+ some other init binary than listed at the end of init/main.c:init().
+
+b) some device and filesystem drivers built as modules and stored in an
+ initrd. The initrd must contain a binary '/linuxrc' which is supposed to
+ load these driver modules. It is also possible to mount the final root
+ filesystem via linuxrc and use the pivot_root syscall. The initrd is
+ mounted and executed via prepare_namespace().
+
+c) using initramfs. The call to prepare_namespace() must be skipped.
+ This means that a binary must do all the work. Said binary can be stored
+ into initramfs either via modifying usr/gen_init_cpio.c or via the new
+ initrd format, an cpio archive. It must be called "/init". This binary
+ is responsible to do all the things prepare_namespace() would do.
+
+ To maintain backwards compatibility, the /init binary will only run if it
+ comes via an initramfs cpio archive. If this is not the case,
+ init/main.c:init() will run prepare_namespace() to mount the final root
+ and exec one of the predefined init binaries.
+
+Bryan O'Sullivan <bos@serpentine.com>
diff --git a/Documentation/driver-api/early-userspace/index.rst b/Documentation/driver-api/early-userspace/index.rst
new file mode 100644
index 000000000000..149c1822f06d
--- /dev/null
+++ b/Documentation/driver-api/early-userspace/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+Early Userspace
+===============
+
+.. toctree::
+ :maxdepth: 1
+
+ early_userspace_support
+ buffer-format
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/driver-api/edid.rst b/Documentation/driver-api/edid.rst
new file mode 100644
index 000000000000..b1b5acd501ed
--- /dev/null
+++ b/Documentation/driver-api/edid.rst
@@ -0,0 +1,58 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+EDID
+====
+
+In the good old days when graphics parameters were configured explicitly
+in a file called xorg.conf, even broken hardware could be managed.
+
+Today, with the advent of Kernel Mode Setting, a graphics board is
+either correctly working because all components follow the standards -
+or the computer is unusable, because the screen remains dark after
+booting or it displays the wrong area. Cases when this happens are:
+- The graphics board does not recognize the monitor.
+- The graphics board is unable to detect any EDID data.
+- The graphics board incorrectly forwards EDID data to the driver.
+- The monitor sends no or bogus EDID data.
+- A KVM sends its own EDID data instead of querying the connected monitor.
+Adding the kernel parameter "nomodeset" helps in most cases, but causes
+restrictions later on.
+
+As a remedy for such situations, the kernel configuration item
+CONFIG_DRM_LOAD_EDID_FIRMWARE was introduced. It allows to provide an
+individually prepared or corrected EDID data set in the /lib/firmware
+directory from where it is loaded via the firmware interface. The code
+(see drivers/gpu/drm/drm_edid_load.c) contains built-in data sets for
+commonly used screen resolutions (800x600, 1024x768, 1280x1024, 1600x1200,
+1680x1050, 1920x1080) as binary blobs, but the kernel source tree does
+not contain code to create these data. In order to elucidate the origin
+of the built-in binary EDID blobs and to facilitate the creation of
+individual data for a specific misbehaving monitor, commented sources
+and a Makefile environment are given here.
+
+To create binary EDID and C source code files from the existing data
+material, simply type "make".
+
+If you want to create your own EDID file, copy the file 1024x768.S,
+replace the settings with your own data and add a new target to the
+Makefile. Please note that the EDID data structure expects the timing
+values in a different way as compared to the standard X11 format.
+
+X11:
+ HTimings:
+ hdisp hsyncstart hsyncend htotal
+ VTimings:
+ vdisp vsyncstart vsyncend vtotal
+
+EDID::
+
+ #define XPIX hdisp
+ #define XBLANK htotal-hdisp
+ #define XOFFSET hsyncstart-hdisp
+ #define XPULSE hsyncend-hsyncstart
+
+ #define YPIX vdisp
+ #define YBLANK vtotal-vdisp
+ #define YOFFSET vsyncstart-vdisp
+ #define YPULSE vsyncend-vsyncstart
diff --git a/Documentation/driver-api/eisa.rst b/Documentation/driver-api/eisa.rst
new file mode 100644
index 000000000000..c07565ba57da
--- /dev/null
+++ b/Documentation/driver-api/eisa.rst
@@ -0,0 +1,230 @@
+================
+EISA bus support
+================
+
+:Author: Marc Zyngier <maz@wild-wind.fr.eu.org>
+
+This document groups random notes about porting EISA drivers to the
+new EISA/sysfs API.
+
+Starting from version 2.5.59, the EISA bus is almost given the same
+status as other much more mainstream busses such as PCI or USB. This
+has been possible through sysfs, which defines a nice enough set of
+abstractions to manage busses, devices and drivers.
+
+Although the new API is quite simple to use, converting existing
+drivers to the new infrastructure is not an easy task (mostly because
+detection code is generally also used to probe ISA cards). Moreover,
+most EISA drivers are among the oldest Linux drivers so, as you can
+imagine, some dust has settled here over the years.
+
+The EISA infrastructure is made up of three parts:
+
+ - The bus code implements most of the generic code. It is shared
+ among all the architectures that the EISA code runs on. It
+ implements bus probing (detecting EISA cards available on the bus),
+ allocates I/O resources, allows fancy naming through sysfs, and
+ offers interfaces for driver to register.
+
+ - The bus root driver implements the glue between the bus hardware
+ and the generic bus code. It is responsible for discovering the
+ device implementing the bus, and setting it up to be latter probed
+ by the bus code. This can go from something as simple as reserving
+ an I/O region on x86, to the rather more complex, like the hppa
+ EISA code. This is the part to implement in order to have EISA
+ running on an "new" platform.
+
+ - The driver offers the bus a list of devices that it manages, and
+ implements the necessary callbacks to probe and release devices
+ whenever told to.
+
+Every function/structure below lives in <linux/eisa.h>, which depends
+heavily on <linux/device.h>.
+
+Bus root driver
+===============
+
+::
+
+ int eisa_root_register (struct eisa_root_device *root);
+
+The eisa_root_register function is used to declare a device as the
+root of an EISA bus. The eisa_root_device structure holds a reference
+to this device, as well as some parameters for probing purposes::
+
+ struct eisa_root_device {
+ struct device *dev; /* Pointer to bridge device */
+ struct resource *res;
+ unsigned long bus_base_addr;
+ int slots; /* Max slot number */
+ int force_probe; /* Probe even when no slot 0 */
+ u64 dma_mask; /* from bridge device */
+ int bus_nr; /* Set by eisa_root_register */
+ struct resource eisa_root_res; /* ditto */
+ };
+
+============= ======================================================
+node used for eisa_root_register internal purpose
+dev pointer to the root device
+res root device I/O resource
+bus_base_addr slot 0 address on this bus
+slots max slot number to probe
+force_probe Probe even when slot 0 is empty (no EISA mainboard)
+dma_mask Default DMA mask. Usually the bridge device dma_mask.
+bus_nr unique bus id, set by eisa_root_register
+============= ======================================================
+
+Driver
+======
+
+::
+
+ int eisa_driver_register (struct eisa_driver *edrv);
+ void eisa_driver_unregister (struct eisa_driver *edrv);
+
+Clear enough ?
+
+::
+
+ struct eisa_device_id {
+ char sig[EISA_SIG_LEN];
+ unsigned long driver_data;
+ };
+
+ struct eisa_driver {
+ const struct eisa_device_id *id_table;
+ struct device_driver driver;
+ };
+
+=============== ====================================================
+id_table an array of NULL terminated EISA id strings,
+ followed by an empty string. Each string can
+ optionally be paired with a driver-dependent value
+ (driver_data).
+
+driver a generic driver, such as described in
+ Documentation/driver-api/driver-model/driver.rst. Only .name,
+ .probe and .remove members are mandatory.
+=============== ====================================================
+
+An example is the 3c59x driver::
+
+ static struct eisa_device_id vortex_eisa_ids[] = {
+ { "TCM5920", EISA_3C592_OFFSET },
+ { "TCM5970", EISA_3C597_OFFSET },
+ { "" }
+ };
+
+ static struct eisa_driver vortex_eisa_driver = {
+ .id_table = vortex_eisa_ids,
+ .driver = {
+ .name = "3c59x",
+ .probe = vortex_eisa_probe,
+ .remove = vortex_eisa_remove
+ }
+ };
+
+Device
+======
+
+The sysfs framework calls .probe and .remove functions upon device
+discovery and removal (note that the .remove function is only called
+when driver is built as a module).
+
+Both functions are passed a pointer to a 'struct device', which is
+encapsulated in a 'struct eisa_device' described as follows::
+
+ struct eisa_device {
+ struct eisa_device_id id;
+ int slot;
+ int state;
+ unsigned long base_addr;
+ struct resource res[EISA_MAX_RESOURCES];
+ u64 dma_mask;
+ struct device dev; /* generic device */
+ };
+
+======== ============================================================
+id EISA id, as read from device. id.driver_data is set from the
+ matching driver EISA id.
+slot slot number which the device was detected on
+state set of flags indicating the state of the device. Current
+ flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
+res set of four 256 bytes I/O regions allocated to this device
+dma_mask DMA mask set from the parent device.
+dev generic device (see Documentation/driver-api/driver-model/device.rst)
+======== ============================================================
+
+You can get the 'struct eisa_device' from 'struct device' using the
+'to_eisa_device' macro.
+
+Misc stuff
+==========
+
+::
+
+ void eisa_set_drvdata (struct eisa_device *edev, void *data);
+
+Stores data into the device's driver_data area.
+
+::
+
+ void *eisa_get_drvdata (struct eisa_device *edev):
+
+Gets the pointer previously stored into the device's driver_data area.
+
+::
+
+ int eisa_get_region_index (void *addr);
+
+Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given
+address.
+
+Kernel parameters
+=================
+
+eisa_bus.enable_dev
+ A comma-separated list of slots to be enabled, even if the firmware
+ set the card as disabled. The driver must be able to properly
+ initialize the device in such conditions.
+
+eisa_bus.disable_dev
+ A comma-separated list of slots to be enabled, even if the firmware
+ set the card as enabled. The driver won't be called to handle this
+ device.
+
+virtual_root.force_probe
+ Force the probing code to probe EISA slots even when it cannot find an
+ EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
+ (don't force), and set to 1 (force probing) when either
+ CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
+
+Random notes
+============
+
+Converting an EISA driver to the new API mostly involves *deleting*
+code (since probing is now in the core EISA code). Unfortunately, most
+drivers share their probing routine between ISA, and EISA. Special
+care must be taken when ripping out the EISA code, so other busses
+won't suffer from these surgical strikes...
+
+You *must not* expect any EISA device to be detected when returning
+from eisa_driver_register, since the chances are that the bus has not
+yet been probed. In fact, that's what happens most of the time (the
+bus root driver usually kicks in rather late in the boot process).
+Unfortunately, most drivers are doing the probing by themselves, and
+expect to have explored the whole machine when they exit their probe
+routine.
+
+For example, switching your favorite EISA SCSI card to the "hotplug"
+model is "the right thing"(tm).
+
+Thanks
+======
+
+I'd like to thank the following people for their help:
+
+- Xavier Benigni for lending me a wonderful Alpha Jensen,
+- James Bottomley, Jeff Garzik for getting this stuff into the kernel,
+- Andries Brouwer for contributing numerous EISA ids,
+- Catrin Jones for coping with far too many machines at home.
diff --git a/Documentation/driver-api/gpio/consumer.rst b/Documentation/driver-api/gpio/consumer.rst
index fdecb6d711db..423492d125b9 100644
--- a/Documentation/driver-api/gpio/consumer.rst
+++ b/Documentation/driver-api/gpio/consumer.rst
@@ -283,8 +283,6 @@ To summarize::
gpiod_set_value(desc, 1); default (active high) high
gpiod_set_value(desc, 0); active low high
gpiod_set_value(desc, 1); active low low
- gpiod_set_value(desc, 0); default (active high) low
- gpiod_set_value(desc, 1); default (active high) high
gpiod_set_value(desc, 0); open drain low
gpiod_set_value(desc, 1); open drain high impedance
gpiod_set_value(desc, 0); open source high impedance
@@ -366,7 +364,7 @@ accessed sequentially.
The functions take three arguments:
* array_size - the number of array elements
* desc_array - an array of GPIO descriptors
- * array_info - optional information obtained from gpiod_array_get()
+ * array_info - optional information obtained from gpiod_get_array()
* value_bitmap - a bitmap to store the GPIOs' values (get) or
a bitmap of values to assign to the GPIOs (set)
diff --git a/Documentation/driver-api/gpio/driver.rst b/Documentation/driver-api/gpio/driver.rst
index 1ce7fcd0f989..921c71a3d683 100644
--- a/Documentation/driver-api/gpio/driver.rst
+++ b/Documentation/driver-api/gpio/driver.rst
@@ -235,7 +235,7 @@ means that a pull up or pull-down resistor is available on the output of the
GPIO line, and this resistor is software controlled.
In discrete designs, a pull-up or pull-down resistor is simply soldered on
-the circuit board. This is not something we deal or model in software. The
+the circuit board. This is not something we deal with or model in software. The
most you will think about these lines is that they will very likely be
configured as open drain or open source (see the section above).
@@ -292,18 +292,18 @@ We can divide GPIO irqchips in two broad categories:
- HIERARCHICAL INTERRUPT CHIPS: this means that each GPIO line has a dedicated
irq line to a parent interrupt controller one level up. There is no need
- to inquire the GPIO hardware to figure out which line has figured, but it
- may still be necessary to acknowledge the interrupt and set up the
- configuration such as edge sensitivity.
+ to inquire the GPIO hardware to figure out which line has fired, but it
+ may still be necessary to acknowledge the interrupt and set up configuration
+ such as edge sensitivity.
Realtime considerations: a realtime compliant GPIO driver should not use
spinlock_t or any sleepable APIs (like PM runtime) as part of its irqchip
implementation.
-- spinlock_t should be replaced with raw_spinlock_t [1].
+- spinlock_t should be replaced with raw_spinlock_t.[1]
- If sleepable APIs have to be used, these can be done from the .irq_bus_lock()
and .irq_bus_unlock() callbacks, as these are the only slowpath callbacks
- on an irqchip. Create the callbacks if needed [2].
+ on an irqchip. Create the callbacks if needed.[2]
Cascaded GPIO irqchips
@@ -361,7 +361,7 @@ Cascaded GPIO irqchips usually fall in one of three categories:
Realtime considerations: this kind of handlers will be forced threaded on -RT,
and as result the IRQ core will complain that generic_handle_irq() is called
- with IRQ enabled and the same work around as for "CHAINED GPIO irqchips" can
+ with IRQ enabled and the same work-around as for "CHAINED GPIO irqchips" can
be applied.
- NESTED THREADED GPIO IRQCHIPS: these are off-chip GPIO expanders and any
@@ -399,7 +399,7 @@ symbol:
will pass the struct gpio_chip* for the chip to all IRQ callbacks, so the
callbacks need to embed the gpio_chip in its state container and obtain a
pointer to the container using container_of().
- (See Documentation/driver-model/design-patterns.txt)
+ (See Documentation/driver-api/driver-model/design-patterns.rst)
- gpiochip_irqchip_add_nested(): adds a nested cascaded irqchip to a gpiochip,
as discussed above regarding different types of cascaded irqchips. The
@@ -418,7 +418,7 @@ symbol:
If there is a need to exclude certain GPIO lines from the IRQ domain handled by
these helpers, we can set .irq.need_valid_mask of the gpiochip before
-[devm_]gpiochip_add_data() is called. This allocates an .irq.valid_mask with as
+``[devm_]gpiochip_add_data()`` is called. This allocates an .irq.valid_mask with as
many bits set as there are GPIO lines in the chip, each bit representing line
0..n-1. Drivers can exclude GPIO lines by clearing bits from this mask. The mask
must be filled in before gpiochip_irqchip_add() or gpiochip_irqchip_add_nested()
diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst
index d26308af6036..d12a80f386a6 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -14,8 +14,10 @@ available subsections can be seen below.
.. toctree::
:maxdepth: 2
+ driver-model/index
basics
infrastructure
+ early-userspace/index
pm/index
clk
device-io
@@ -34,7 +36,9 @@ available subsections can be seen below.
pci/index
spi
i2c
+ ipmb
i3c/index
+ interconnect
hsi
edac
scsi
@@ -42,8 +46,12 @@ available subsections can be seen below.
target
mtdnand
miscellaneous
+ mei/index
+ mtd/index
+ mmc/index
+ nvdimm/index
w1
- rapidio
+ rapidio/index
s390-drivers
vme
80211/index
@@ -51,13 +59,48 @@ available subsections can be seen below.
firmware/index
pinctl
gpio/index
+ md/index
misc_devices
+ nfc/index
dmaengine/index
slimbus
soundwire/index
fpga/index
acpi/index
+ backlight/lp855x-driver.rst
+ bt8xxgpio
+ connector
+ console
+ dcdbas
+ dell_rbu
+ edid
+ eisa
+ isa
+ isapnp
generic-counter
+ lightnvm-pblk
+ memory-devices/index
+ men-chameleon-bus
+ ntb
+ nvmem
+ parport-lowlevel
+ pps
+ ptp
+ phy/index
+ pti_intel_mid
+ pwm
+ rfkill
+ serial/index
+ sgi-ioc4
+ sm501
+ smsc_ece1099
+ switchtec
+ sync_file
+ vfio-mediated-device
+ vfio
+ xilinx/index
+ xillybus
+ zorro
.. only:: subproject and html
diff --git a/Documentation/driver-api/interconnect.rst b/Documentation/driver-api/interconnect.rst
new file mode 100644
index 000000000000..c3e004893796
--- /dev/null
+++ b/Documentation/driver-api/interconnect.rst
@@ -0,0 +1,93 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================================
+GENERIC SYSTEM INTERCONNECT SUBSYSTEM
+=====================================
+
+Introduction
+------------
+
+This framework is designed to provide a standard kernel interface to control
+the settings of the interconnects on an SoC. These settings can be throughput,
+latency and priority between multiple interconnected devices or functional
+blocks. This can be controlled dynamically in order to save power or provide
+maximum performance.
+
+The interconnect bus is hardware with configurable parameters, which can be
+set on a data path according to the requests received from various drivers.
+An example of interconnect buses are the interconnects between various
+components or functional blocks in chipsets. There can be multiple interconnects
+on an SoC that can be multi-tiered.
+
+Below is a simplified diagram of a real-world SoC interconnect bus topology.
+
+::
+
+ +----------------+ +----------------+
+ | HW Accelerator |--->| M NoC |<---------------+
+ +----------------+ +----------------+ |
+ | | +------------+
+ +-----+ +-------------+ V +------+ | |
+ | DDR | | +--------+ | PCIe | | |
+ +-----+ | | Slaves | +------+ | |
+ ^ ^ | +--------+ | | C NoC |
+ | | V V | |
+ +------------------+ +------------------------+ | | +-----+
+ | |-->| |-->| |-->| CPU |
+ | |-->| |<--| | +-----+
+ | Mem NoC | | S NoC | +------------+
+ | |<--| |---------+ |
+ | |<--| |<------+ | | +--------+
+ +------------------+ +------------------------+ | | +-->| Slaves |
+ ^ ^ ^ ^ ^ | | +--------+
+ | | | | | | V
+ +------+ | +-----+ +-----+ +---------+ +----------------+ +--------+
+ | CPUs | | | GPU | | DSP | | Masters |-->| P NoC |-->| Slaves |
+ +------+ | +-----+ +-----+ +---------+ +----------------+ +--------+
+ |
+ +-------+
+ | Modem |
+ +-------+
+
+Terminology
+-----------
+
+Interconnect provider is the software definition of the interconnect hardware.
+The interconnect providers on the above diagram are M NoC, S NoC, C NoC, P NoC
+and Mem NoC.
+
+Interconnect node is the software definition of the interconnect hardware
+port. Each interconnect provider consists of multiple interconnect nodes,
+which are connected to other SoC components including other interconnect
+providers. The point on the diagram where the CPUs connect to the memory is
+called an interconnect node, which belongs to the Mem NoC interconnect provider.
+
+Interconnect endpoints are the first or the last element of the path. Every
+endpoint is a node, but not every node is an endpoint.
+
+Interconnect path is everything between two endpoints including all the nodes
+that have to be traversed to reach from a source to destination node. It may
+include multiple master-slave pairs across several interconnect providers.
+
+Interconnect consumers are the entities which make use of the data paths exposed
+by the providers. The consumers send requests to providers requesting various
+throughput, latency and priority. Usually the consumers are device drivers, that
+send request based on their needs. An example for a consumer is a video decoder
+that supports various formats and image sizes.
+
+Interconnect providers
+----------------------
+
+Interconnect provider is an entity that implements methods to initialize and
+configure interconnect bus hardware. The interconnect provider drivers should
+be registered with the interconnect provider core.
+
+.. kernel-doc:: include/linux/interconnect-provider.h
+
+Interconnect consumers
+----------------------
+
+Interconnect consumers are the clients which use the interconnect APIs to
+get paths between endpoints and set their bandwidth/latency/QoS requirements
+for these interconnect paths. These interfaces are not currently
+documented.
diff --git a/Documentation/driver-api/ipmb.rst b/Documentation/driver-api/ipmb.rst
new file mode 100644
index 000000000000..7e2265144157
--- /dev/null
+++ b/Documentation/driver-api/ipmb.rst
@@ -0,0 +1,105 @@
+==============================
+IPMB Driver for a Satellite MC
+==============================
+
+The Intelligent Platform Management Bus or IPMB, is an
+I2C bus that provides a standardized interconnection between
+different boards within a chassis. This interconnection is
+between the baseboard management (BMC) and chassis electronics.
+IPMB is also associated with the messaging protocol through the
+IPMB bus.
+
+The devices using the IPMB are usually management
+controllers that perform management functions such as servicing
+the front panel interface, monitoring the baseboard,
+hot-swapping disk drivers in the system chassis, etc...
+
+When an IPMB is implemented in the system, the BMC serves as
+a controller to give system software access to the IPMB. The BMC
+sends IPMI requests to a device (usually a Satellite Management
+Controller or Satellite MC) via IPMB and the device
+sends a response back to the BMC.
+
+For more information on IPMB and the format of an IPMB message,
+refer to the IPMB and IPMI specifications.
+
+IPMB driver for Satellite MC
+----------------------------
+
+ipmb-dev-int - This is the driver needed on a Satellite MC to
+receive IPMB messages from a BMC and send a response back.
+This driver works with the I2C driver and a userspace
+program such as OpenIPMI:
+
+1) It is an I2C slave backend driver. So, it defines a callback
+ function to set the Satellite MC as an I2C slave.
+ This callback function handles the received IPMI requests.
+
+2) It defines the read and write functions to enable a user
+ space program (such as OpenIPMI) to communicate with the kernel.
+
+
+Load the IPMB driver
+--------------------
+
+The driver needs to be loaded at boot time or manually first.
+First, make sure you have the following in your config file:
+CONFIG_IPMB_DEVICE_INTERFACE=y
+
+1) If you want the driver to be loaded at boot time:
+
+a) Add this entry to your ACPI table, under the appropriate SMBus::
+
+ Device (SMB0) // Example SMBus host controller
+ {
+ Name (_HID, "<Vendor-Specific HID>") // Vendor-Specific HID
+ Name (_UID, 0) // Unique ID of particular host controller
+ :
+ :
+ Device (IPMB)
+ {
+ Name (_HID, "IPMB0001") // IPMB device interface
+ Name (_UID, 0) // Unique device identifier
+ }
+ }
+
+b) Example for device tree::
+
+ &i2c2 {
+ status = "okay";
+
+ ipmb@10 {
+ compatible = "ipmb-dev";
+ reg = <0x10>;
+ };
+ };
+
+2) Manually from Linux::
+
+ modprobe ipmb-dev-int
+
+
+Instantiate the device
+----------------------
+
+After loading the driver, you can instantiate the device as
+described in 'Documentation/i2c/instantiating-devices'.
+If you have multiple BMCs, each connected to your Satellite MC via
+a different I2C bus, you can instantiate a device for each of
+those BMCs.
+
+The name of the instantiated device contains the I2C bus number
+associated with it as follows::
+
+ BMC1 ------ IPMB/I2C bus 1 ---------| /dev/ipmb-1
+ Satellite MC
+ BMC1 ------ IPMB/I2C bus 2 ---------| /dev/ipmb-2
+
+For instance, you can instantiate the ipmb-dev-int device from
+user space at the 7 bit address 0x10 on bus 2::
+
+ # echo ipmb-dev 0x1010 > /sys/bus/i2c/devices/i2c-2/new_device
+
+This will create the device file /dev/ipmb-2, which can be accessed
+by the user space program. The device needs to be instantiated
+before running the user space program.
diff --git a/Documentation/isa.txt b/Documentation/driver-api/isa.rst
index def4a7b690b5..def4a7b690b5 100644
--- a/Documentation/isa.txt
+++ b/Documentation/driver-api/isa.rst
diff --git a/Documentation/isapnp.txt b/Documentation/driver-api/isapnp.rst
index 8d0840ac847b..8d0840ac847b 100644
--- a/Documentation/isapnp.txt
+++ b/Documentation/driver-api/isapnp.rst
diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/driver-api/lightnvm-pblk.rst
index 1040ed1cec81..1040ed1cec81 100644
--- a/Documentation/lightnvm/pblk.txt
+++ b/Documentation/driver-api/lightnvm-pblk.rst
diff --git a/Documentation/driver-api/md/index.rst b/Documentation/driver-api/md/index.rst
new file mode 100644
index 000000000000..18f54a7d7d6e
--- /dev/null
+++ b/Documentation/driver-api/md/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+RAID
+====
+
+.. toctree::
+ :maxdepth: 1
+
+ md-cluster
+ raid5-cache
+ raid5-ppl
diff --git a/Documentation/driver-api/md/md-cluster.rst b/Documentation/driver-api/md/md-cluster.rst
new file mode 100644
index 000000000000..96eb52cec7eb
--- /dev/null
+++ b/Documentation/driver-api/md/md-cluster.rst
@@ -0,0 +1,385 @@
+==========
+MD Cluster
+==========
+
+The cluster MD is a shared-device RAID for a cluster, it supports
+two levels: raid1 and raid10 (limited support).
+
+
+1. On-disk format
+=================
+
+Separate write-intent-bitmaps are used for each cluster node.
+The bitmaps record all writes that may have been started on that node,
+and may not yet have finished. The on-disk layout is::
+
+ 0 4k 8k 12k
+ -------------------------------------------------------------------
+ | idle | md super | bm super [0] + bits |
+ | bm bits[0, contd] | bm super[1] + bits | bm bits[1, contd] |
+ | bm super[2] + bits | bm bits [2, contd] | bm super[3] + bits |
+ | bm bits [3, contd] | | |
+
+During "normal" functioning we assume the filesystem ensures that only
+one node writes to any given block at a time, so a write request will
+
+ - set the appropriate bit (if not already set)
+ - commit the write to all mirrors
+ - schedule the bit to be cleared after a timeout.
+
+Reads are just handled normally. It is up to the filesystem to ensure
+one node doesn't read from a location where another node (or the same
+node) is writing.
+
+
+2. DLM Locks for management
+===========================
+
+There are three groups of locks for managing the device:
+
+2.1 Bitmap lock resource (bm_lockres)
+-------------------------------------
+
+ The bm_lockres protects individual node bitmaps. They are named in
+ the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a
+ node joins the cluster, it acquires the lock in PW mode and it stays
+ so during the lifetime the node is part of the cluster. The lock
+ resource number is based on the slot number returned by the DLM
+ subsystem. Since DLM starts node count from one and bitmap slots
+ start from zero, one is subtracted from the DLM slot number to arrive
+ at the bitmap slot number.
+
+ The LVB of the bitmap lock for a particular node records the range
+ of sectors that are being re-synced by that node. No other
+ node may write to those sectors. This is used when a new nodes
+ joins the cluster.
+
+2.2 Message passing locks
+-------------------------
+
+ Each node has to communicate with other nodes when starting or ending
+ resync, and for metadata superblock updates. This communication is
+ managed through three locks: "token", "message", and "ack", together
+ with the Lock Value Block (LVB) of one of the "message" lock.
+
+2.3 new-device management
+-------------------------
+
+ A single lock: "no-new-dev" is used to co-ordinate the addition of
+ new devices - this must be synchronized across the array.
+ Normally all nodes hold a concurrent-read lock on this device.
+
+3. Communication
+================
+
+ Messages can be broadcast to all nodes, and the sender waits for all
+ other nodes to acknowledge the message before proceeding. Only one
+ message can be processed at a time.
+
+3.1 Message Types
+-----------------
+
+ There are six types of messages which are passed:
+
+3.1.1 METADATA_UPDATED
+^^^^^^^^^^^^^^^^^^^^^^
+
+ informs other nodes that the metadata has
+ been updated, and the node must re-read the md superblock. This is
+ performed synchronously. It is primarily used to signal device
+ failure.
+
+3.1.2 RESYNCING
+^^^^^^^^^^^^^^^
+ informs other nodes that a resync is initiated or
+ ended so that each node may suspend or resume the region. Each
+ RESYNCING message identifies a range of the devices that the
+ sending node is about to resync. This overrides any previous
+ notification from that node: only one ranged can be resynced at a
+ time per-node.
+
+3.1.3 NEWDISK
+^^^^^^^^^^^^^
+
+ informs other nodes that a device is being added to
+ the array. Message contains an identifier for that device. See
+ below for further details.
+
+3.1.4 REMOVE
+^^^^^^^^^^^^
+
+ A failed or spare device is being removed from the
+ array. The slot-number of the device is included in the message.
+
+ 3.1.5 RE_ADD:
+
+ A failed device is being re-activated - the assumption
+ is that it has been determined to be working again.
+
+ 3.1.6 BITMAP_NEEDS_SYNC:
+
+ If a node is stopped locally but the bitmap
+ isn't clean, then another node is informed to take the ownership of
+ resync.
+
+3.2 Communication mechanism
+---------------------------
+
+ The DLM LVB is used to communicate within nodes of the cluster. There
+ are three resources used for the purpose:
+
+3.2.1 token
+^^^^^^^^^^^
+ The resource which protects the entire communication
+ system. The node having the token resource is allowed to
+ communicate.
+
+3.2.2 message
+^^^^^^^^^^^^^
+ The lock resource which carries the data to communicate.
+
+3.2.3 ack
+^^^^^^^^^
+
+ The resource, acquiring which means the message has been
+ acknowledged by all nodes in the cluster. The BAST of the resource
+ is used to inform the receiving node that a node wants to
+ communicate.
+
+The algorithm is:
+
+ 1. receive status - all nodes have concurrent-reader lock on "ack"::
+
+ sender receiver receiver
+ "ack":CR "ack":CR "ack":CR
+
+ 2. sender get EX on "token",
+ sender get EX on "message"::
+
+ sender receiver receiver
+ "token":EX "ack":CR "ack":CR
+ "message":EX
+ "ack":CR
+
+ Sender checks that it still needs to send a message. Messages
+ received or other events that happened while waiting for the
+ "token" may have made this message inappropriate or redundant.
+
+ 3. sender writes LVB
+
+ sender down-convert "message" from EX to CW
+
+ sender try to get EX of "ack"
+
+ ::
+
+ [ wait until all receivers have *processed* the "message" ]
+
+ [ triggered by bast of "ack" ]
+ receiver get CR on "message"
+ receiver read LVB
+ receiver processes the message
+ [ wait finish ]
+ receiver releases "ack"
+ receiver tries to get PR on "message"
+
+ sender receiver receiver
+ "token":EX "message":CR "message":CR
+ "message":CW
+ "ack":EX
+
+ 4. triggered by grant of EX on "ack" (indicating all receivers
+ have processed message)
+
+ sender down-converts "ack" from EX to CR
+
+ sender releases "message"
+
+ sender releases "token"
+
+ ::
+
+ receiver upconvert to PR on "message"
+ receiver get CR of "ack"
+ receiver release "message"
+
+ sender receiver receiver
+ "ack":CR "ack":CR "ack":CR
+
+
+4. Handling Failures
+====================
+
+4.1 Node Failure
+----------------
+
+ When a node fails, the DLM informs the cluster with the slot
+ number. The node starts a cluster recovery thread. The cluster
+ recovery thread:
+
+ - acquires the bitmap<number> lock of the failed node
+ - opens the bitmap
+ - reads the bitmap of the failed node
+ - copies the set bitmap to local node
+ - cleans the bitmap of the failed node
+ - releases bitmap<number> lock of the failed node
+ - initiates resync of the bitmap on the current node
+ md_check_recovery is invoked within recover_bitmaps,
+ then md_check_recovery -> metadata_update_start/finish,
+ it will lock the communication by lock_comm.
+ Which means when one node is resyncing it blocks all
+ other nodes from writing anywhere on the array.
+
+ The resync process is the regular md resync. However, in a clustered
+ environment when a resync is performed, it needs to tell other nodes
+ of the areas which are suspended. Before a resync starts, the node
+ send out RESYNCING with the (lo,hi) range of the area which needs to
+ be suspended. Each node maintains a suspend_list, which contains the
+ list of ranges which are currently suspended. On receiving RESYNCING,
+ the node adds the range to the suspend_list. Similarly, when the node
+ performing resync finishes, it sends RESYNCING with an empty range to
+ other nodes and other nodes remove the corresponding entry from the
+ suspend_list.
+
+ A helper function, ->area_resyncing() can be used to check if a
+ particular I/O range should be suspended or not.
+
+4.2 Device Failure
+==================
+
+ Device failures are handled and communicated with the metadata update
+ routine. When a node detects a device failure it does not allow
+ any further writes to that device until the failure has been
+ acknowledged by all other nodes.
+
+5. Adding a new Device
+----------------------
+
+ For adding a new device, it is necessary that all nodes "see" the new
+ device to be added. For this, the following algorithm is used:
+
+ 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
+ ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
+ 2. Node 1 sends a NEWDISK message with uuid and slot number
+ 3. Other nodes issue kobject_uevent_env with uuid and slot number
+ (Steps 4,5 could be a udev rule)
+ 4. In userspace, the node searches for the disk, perhaps
+ using blkid -t SUB_UUID=""
+ 5. Other nodes issue either of the following depending on whether
+ the disk was found:
+ ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
+ disc.number set to slot number)
+ ioctl(CLUSTERED_DISK_NACK)
+ 6. Other nodes drop lock on "no-new-devs" (CR) if device is found
+ 7. Node 1 attempts EX lock on "no-new-dev"
+ 8. If node 1 gets the lock, it sends METADATA_UPDATED after
+ unmarking the disk as SpareLocal
+ 9. If not (get "no-new-dev" lock), it fails the operation and sends
+ METADATA_UPDATED.
+ 10. Other nodes get the information whether a disk is added or not
+ by the following METADATA_UPDATED.
+
+6. Module interface
+===================
+
+ There are 17 call-backs which the md core can make to the cluster
+ module. Understanding these can give a good overview of the whole
+ process.
+
+6.1 join(nodes) and leave()
+---------------------------
+
+ These are called when an array is started with a clustered bitmap,
+ and when the array is stopped. join() ensures the cluster is
+ available and initializes the various resources.
+ Only the first 'nodes' nodes in the cluster can use the array.
+
+6.2 slot_number()
+-----------------
+
+ Reports the slot number advised by the cluster infrastructure.
+ Range is from 0 to nodes-1.
+
+6.3 resync_info_update()
+------------------------
+
+ This updates the resync range that is stored in the bitmap lock.
+ The starting point is updated as the resync progresses. The
+ end point is always the end of the array.
+ It does *not* send a RESYNCING message.
+
+6.4 resync_start(), resync_finish()
+-----------------------------------
+
+ These are called when resync/recovery/reshape starts or stops.
+ They update the resyncing range in the bitmap lock and also
+ send a RESYNCING message. resync_start reports the whole
+ array as resyncing, resync_finish reports none of it.
+
+ resync_finish() also sends a BITMAP_NEEDS_SYNC message which
+ allows some other node to take over.
+
+6.5 metadata_update_start(), metadata_update_finish(), metadata_update_cancel()
+-------------------------------------------------------------------------------
+
+ metadata_update_start is used to get exclusive access to
+ the metadata. If a change is still needed once that access is
+ gained, metadata_update_finish() will send a METADATA_UPDATE
+ message to all other nodes, otherwise metadata_update_cancel()
+ can be used to release the lock.
+
+6.6 area_resyncing()
+--------------------
+
+ This combines two elements of functionality.
+
+ Firstly, it will check if any node is currently resyncing
+ anything in a given range of sectors. If any resync is found,
+ then the caller will avoid writing or read-balancing in that
+ range.
+
+ Secondly, while node recovery is happening it reports that
+ all areas are resyncing for READ requests. This avoids races
+ between the cluster-filesystem and the cluster-RAID handling
+ a node failure.
+
+6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack()
+---------------------------------------------------------------
+
+ These are used to manage the new-disk protocol described above.
+ When a new device is added, add_new_disk_start() is called before
+ it is bound to the array and, if that succeeds, add_new_disk_finish()
+ is called the device is fully added.
+
+ When a device is added in acknowledgement to a previous
+ request, or when the device is declared "unavailable",
+ new_disk_ack() is called.
+
+6.8 remove_disk()
+-----------------
+
+ This is called when a spare or failed device is removed from
+ the array. It causes a REMOVE message to be send to other nodes.
+
+6.9 gather_bitmaps()
+--------------------
+
+ This sends a RE_ADD message to all other nodes and then
+ gathers bitmap information from all bitmaps. This combined
+ bitmap is then used to recovery the re-added device.
+
+6.10 lock_all_bitmaps() and unlock_all_bitmaps()
+------------------------------------------------
+
+ These are called when change bitmap to none. If a node plans
+ to clear the cluster raid's bitmap, it need to make sure no other
+ nodes are using the raid which is achieved by lock all bitmap
+ locks within the cluster, and also those locks are unlocked
+ accordingly.
+
+7. Unsupported features
+=======================
+
+There are somethings which are not supported by cluster MD yet.
+
+- change array_sectors.
diff --git a/Documentation/driver-api/md/raid5-cache.rst b/Documentation/driver-api/md/raid5-cache.rst
new file mode 100644
index 000000000000..d7a15f44a7c3
--- /dev/null
+++ b/Documentation/driver-api/md/raid5-cache.rst
@@ -0,0 +1,111 @@
+================
+RAID 4/5/6 cache
+================
+
+Raid 4/5/6 could include an extra disk for data cache besides normal RAID
+disks. The role of RAID disks isn't changed with the cache disk. The cache disk
+caches data to the RAID disks. The cache can be in write-through (supported
+since 4.4) or write-back mode (supported since 4.10). mdadm (supported since
+3.4) has a new option '--write-journal' to create array with cache. Please
+refer to mdadm manual for details. By default (RAID array starts), the cache is
+in write-through mode. A user can switch it to write-back mode by::
+
+ echo "write-back" > /sys/block/md0/md/journal_mode
+
+And switch it back to write-through mode by::
+
+ echo "write-through" > /sys/block/md0/md/journal_mode
+
+In both modes, all writes to the array will hit cache disk first. This means
+the cache disk must be fast and sustainable.
+
+write-through mode
+==================
+
+This mode mainly fixes the 'write hole' issue. For RAID 4/5/6 array, an unclean
+shutdown can cause data in some stripes to not be in consistent state, eg, data
+and parity don't match. The reason is that a stripe write involves several RAID
+disks and it's possible the writes don't hit all RAID disks yet before the
+unclean shutdown. We call an array degraded if it has inconsistent data. MD
+tries to resync the array to bring it back to normal state. But before the
+resync completes, any system crash will expose the chance of real data
+corruption in the RAID array. This problem is called 'write hole'.
+
+The write-through cache will cache all data on cache disk first. After the data
+is safe on the cache disk, the data will be flushed onto RAID disks. The
+two-step write will guarantee MD can recover correct data after unclean
+shutdown even the array is degraded. Thus the cache can close the 'write hole'.
+
+In write-through mode, MD reports IO completion to upper layer (usually
+filesystems) after the data is safe on RAID disks, so cache disk failure
+doesn't cause data loss. Of course cache disk failure means the array is
+exposed to 'write hole' again.
+
+In write-through mode, the cache disk isn't required to be big. Several
+hundreds megabytes are enough.
+
+write-back mode
+===============
+
+write-back mode fixes the 'write hole' issue too, since all write data is
+cached on cache disk. But the main goal of 'write-back' cache is to speed up
+write. If a write crosses all RAID disks of a stripe, we call it full-stripe
+write. For non-full-stripe writes, MD must read old data before the new parity
+can be calculated. These synchronous reads hurt write throughput. Some writes
+which are sequential but not dispatched in the same time will suffer from this
+overhead too. Write-back cache will aggregate the data and flush the data to
+RAID disks only after the data becomes a full stripe write. This will
+completely avoid the overhead, so it's very helpful for some workloads. A
+typical workload which does sequential write followed by fsync is an example.
+
+In write-back mode, MD reports IO completion to upper layer (usually
+filesystems) right after the data hits cache disk. The data is flushed to raid
+disks later after specific conditions met. So cache disk failure will cause
+data loss.
+
+In write-back mode, MD also caches data in memory. The memory cache includes
+the same data stored on cache disk, so a power loss doesn't cause data loss.
+The memory cache size has performance impact for the array. It's recommended
+the size is big. A user can configure the size by::
+
+ echo "2048" > /sys/block/md0/md/stripe_cache_size
+
+Too small cache disk will make the write aggregation less efficient in this
+mode depending on the workloads. It's recommended to use a cache disk with at
+least several gigabytes size in write-back mode.
+
+The implementation
+==================
+
+The write-through and write-back cache use the same disk format. The cache disk
+is organized as a simple write log. The log consists of 'meta data' and 'data'
+pairs. The meta data describes the data. It also includes checksum and sequence
+ID for recovery identification. Data can be IO data and parity data. Data is
+checksumed too. The checksum is stored in the meta data ahead of the data. The
+checksum is an optimization because MD can write meta and data freely without
+worry about the order. MD superblock has a field pointed to the valid meta data
+of log head.
+
+The log implementation is pretty straightforward. The difficult part is the
+order in which MD writes data to cache disk and RAID disks. Specifically, in
+write-through mode, MD calculates parity for IO data, writes both IO data and
+parity to the log, writes the data and parity to RAID disks after the data and
+parity is settled down in log and finally the IO is finished. Read just reads
+from raid disks as usual.
+
+In write-back mode, MD writes IO data to the log and reports IO completion. The
+data is also fully cached in memory at that time, which means read must query
+memory cache. If some conditions are met, MD will flush the data to RAID disks.
+MD will calculate parity for the data and write parity into the log. After this
+is finished, MD will write both data and parity into RAID disks, then MD can
+release the memory cache. The flush conditions could be stripe becomes a full
+stripe write, free cache disk space is low or free in-kernel memory cache space
+is low.
+
+After an unclean shutdown, MD does recovery. MD reads all meta data and data
+from the log. The sequence ID and checksum will help us detect corrupted meta
+data and data. If MD finds a stripe with data and valid parities (1 parity for
+raid4/5 and 2 for raid6), MD will write the data and parities to RAID disks. If
+parities are incompleted, they are discarded. If part of data is corrupted,
+they are discarded too. MD then loads valid data and writes them to RAID disks
+in normal way.
diff --git a/Documentation/driver-api/md/raid5-ppl.rst b/Documentation/driver-api/md/raid5-ppl.rst
new file mode 100644
index 000000000000..357e5515bc55
--- /dev/null
+++ b/Documentation/driver-api/md/raid5-ppl.rst
@@ -0,0 +1,47 @@
+==================
+Partial Parity Log
+==================
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it is as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the RAID5 Write Hole. Because of
+this, md by default does not allow starting a dirty degraded array.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe. It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
+
+There is a limitation of maximum 64 disks in the array for PPL. It allows to
+keep data structures and implementation simple. RAID5 arrays with so many disks
+are not likely due to high risk of multiple disks failure. Such restriction
+should not be a real life limitation.
diff --git a/Documentation/driver-api/mei/hdcp.rst b/Documentation/driver-api/mei/hdcp.rst
new file mode 100644
index 000000000000..e85a065b1cdc
--- /dev/null
+++ b/Documentation/driver-api/mei/hdcp.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+HDCP:
+=====
+
+ME FW as a security engine provides the capability for setting up
+HDCP2.2 protocol negotiation between the Intel graphics device and
+an HDC2.2 sink.
+
+ME FW prepares HDCP2.2 negotiation parameters, signs and encrypts them
+according the HDCP 2.2 spec. The Intel graphics sends the created blob
+to the HDCP2.2 sink.
+
+Similarly, the HDCP2.2 sink's response is transferred to ME FW
+for decryption and verification.
+
+Once all the steps of HDCP2.2 negotiation are completed,
+upon request ME FW will configure the port as authenticated and supply
+the HDCP encryption keys to Intel graphics hardware.
+
+
+mei_hdcp driver
+---------------
+.. kernel-doc:: drivers/misc/mei/hdcp/mei_hdcp.c
+ :doc: MEI_HDCP Client Driver
+
+mei_hdcp api
+------------
+
+.. kernel-doc:: drivers/misc/mei/hdcp/mei_hdcp.c
+ :functions:
+
diff --git a/Documentation/driver-api/mei/iamt.rst b/Documentation/driver-api/mei/iamt.rst
new file mode 100644
index 000000000000..6ef3e613684b
--- /dev/null
+++ b/Documentation/driver-api/mei/iamt.rst
@@ -0,0 +1,101 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Intel(R) Active Management Technology (Intel AMT)
+=================================================
+
+Prominent usage of the Intel ME Interface is to communicate with Intel(R)
+Active Management Technology (Intel AMT) implemented in firmware running on
+the Intel ME.
+
+Intel AMT provides the ability to manage a host remotely out-of-band (OOB)
+even when the operating system running on the host processor has crashed or
+is in a sleep state.
+
+Some examples of Intel AMT usage are:
+ - Monitoring hardware state and platform components
+ - Remote power off/on (useful for green computing or overnight IT
+ maintenance)
+ - OS updates
+ - Storage of useful platform information such as software assets
+ - Built-in hardware KVM
+ - Selective network isolation of Ethernet and IP protocol flows based
+ on policies set by a remote management console
+ - IDE device redirection from remote management console
+
+Intel AMT (OOB) communication is based on SOAP (deprecated
+starting with Release 6.0) over HTTP/S or WS-Management protocol over
+HTTP/S that are received from a remote management console application.
+
+For more information about Intel AMT:
+https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+
+
+Intel AMT Applications
+----------------------
+
+ 1) Intel Local Management Service (Intel LMS)
+
+ Applications running locally on the platform communicate with Intel AMT Release
+ 2.0 and later releases in the same way that network applications do via SOAP
+ over HTTP (deprecated starting with Release 6.0) or with WS-Management over
+ SOAP over HTTP. This means that some Intel AMT features can be accessed from a
+ local application using the same network interface as a remote application
+ communicating with Intel AMT over the network.
+
+ When a local application sends a message addressed to the local Intel AMT host
+ name, the Intel LMS, which listens for traffic directed to the host name,
+ intercepts the message and routes it to the Intel MEI.
+ For more information:
+ https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+ Under "About Intel AMT" => "Local Access"
+
+ For downloading Intel LMS:
+ https://github.com/intel/lms
+
+ The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS
+ firmware feature using a defined GUID and then communicates with the feature
+ using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol).
+ The protocol is used to maintain multiple sessions with Intel AMT from a
+ single application.
+
+ See the protocol specification in the Intel AMT Software Development Kit (SDK)
+ https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+ Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)"
+ => "Information for Intel(R) vPro(TM) Gateway Developers"
+ => "Description of the Intel AMT Port Forwarding (APF) Protocol"
+
+ 2) Intel AMT Remote configuration using a Local Agent
+
+ A Local Agent enables IT personnel to configure Intel AMT out-of-the-box
+ without requiring installing additional data to enable setup. The remote
+ configuration process may involve an ISV-developed remote configuration
+ agent that runs on the host.
+ For more information:
+ https://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide/default.htm
+ Under "Setup and Configuration of Intel AMT" =>
+ "SDK Tools Supporting Setup and Configuration" =>
+ "Using the Local Agent Sample"
+
+Intel AMT OS Health Watchdog
+----------------------------
+
+The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog.
+Whenever the OS hangs or crashes, Intel AMT will send an event
+to any subscriber to this event. This mechanism means that
+IT knows when a platform crashes even when there is a hard failure on the host.
+
+The Intel AMT Watchdog is composed of two parts:
+ 1) Firmware feature - receives the heartbeats
+ and sends an event when the heartbeats stop.
+ 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature,
+ configures the watchdog and sends the heartbeats.
+
+The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure
+the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the
+watchdog is 120 seconds.
+
+If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate
+on the me client bus and watchdog devices won't be exposed.
+
+---
+linux-mei@linux.intel.com
diff --git a/Documentation/driver-api/mei/index.rst b/Documentation/driver-api/mei/index.rst
new file mode 100644
index 000000000000..3a22b522ee78
--- /dev/null
+++ b/Documentation/driver-api/mei/index.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+.. include:: <isonum.txt>
+
+===================================================
+Intel(R) Management Engine Interface (Intel(R) MEI)
+===================================================
+
+**Copyright** |copy| 2019 Intel Corporation
+
+
+.. only:: html
+
+ .. class:: toc-title
+
+ Table of Contents
+
+.. toctree::
+ :maxdepth: 3
+
+ mei
+ mei-client-bus
+ iamt
diff --git a/Documentation/driver-api/mei/mei-client-bus.rst b/Documentation/driver-api/mei/mei-client-bus.rst
new file mode 100644
index 000000000000..f242b3f8d6aa
--- /dev/null
+++ b/Documentation/driver-api/mei/mei-client-bus.rst
@@ -0,0 +1,168 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================================
+Intel(R) Management Engine (ME) Client bus API
+==============================================
+
+
+Rationale
+=========
+
+The MEI character device is useful for dedicated applications to send and receive
+data to the many FW appliance found in Intel's ME from the user space.
+However, for some of the ME functionalities it makes sense to leverage existing software
+stack and expose them through existing kernel subsystems.
+
+In order to plug seamlessly into the kernel device driver model we add kernel virtual
+bus abstraction on top of the MEI driver. This allows implementing Linux kernel drivers
+for the various MEI features as a stand alone entities found in their respective subsystem.
+Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to
+the existing code.
+
+
+MEI CL bus API
+==============
+
+A driver implementation for an MEI Client is very similar to any other existing bus
+based device drivers. The driver registers itself as an MEI CL bus driver through
+the ``struct mei_cl_driver`` structure defined in :file:`include/linux/mei_cl_bus.c`
+
+.. code-block:: C
+
+ struct mei_cl_driver {
+ struct device_driver driver;
+ const char *name;
+
+ const struct mei_cl_device_id *id_table;
+
+ int (*probe)(struct mei_cl_device *dev, const struct mei_cl_id *id);
+ int (*remove)(struct mei_cl_device *dev);
+ };
+
+
+
+The mei_cl_device_id structure defined in :file:`include/linux/mod_devicetable.h` allows a
+driver to bind itself against a device name.
+
+.. code-block:: C
+
+ struct mei_cl_device_id {
+ char name[MEI_CL_NAME_SIZE];
+ uuid_le uuid;
+ __u8 version;
+ kernel_ulong_t driver_info;
+ };
+
+To actually register a driver on the ME Client bus one must call the :c:func:`mei_cl_add_driver`
+API. This is typically called at module initialization time.
+
+Once the driver is registered and bound to the device, a driver will typically
+try to do some I/O on this bus and this should be done through the :c:func:`mei_cl_send`
+and :c:func:`mei_cl_recv` functions. More detailed information is in :ref:`api` section.
+
+In order for a driver to be notified about pending traffic or event, the driver
+should register a callback via :c:func:`mei_cl_devev_register_rx_cb` and
+:c:func:`mei_cldev_register_notify_cb` function respectively.
+
+.. _api:
+
+API:
+----
+.. kernel-doc:: drivers/misc/mei/bus.c
+ :export: drivers/misc/mei/bus.c
+
+
+
+Example
+=======
+
+As a theoretical example let's pretend the ME comes with a "contact" NFC IP.
+The driver init and exit routines for this device would look like:
+
+.. code-block:: C
+
+ #define CONTACT_DRIVER_NAME "contact"
+
+ static struct mei_cl_device_id contact_mei_cl_tbl[] = {
+ { CONTACT_DRIVER_NAME, },
+
+ /* required last entry */
+ { }
+ };
+ MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl);
+
+ static struct mei_cl_driver contact_driver = {
+ .id_table = contact_mei_tbl,
+ .name = CONTACT_DRIVER_NAME,
+
+ .probe = contact_probe,
+ .remove = contact_remove,
+ };
+
+ static int contact_init(void)
+ {
+ int r;
+
+ r = mei_cl_driver_register(&contact_driver);
+ if (r) {
+ pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n");
+ return r;
+ }
+
+ return 0;
+ }
+
+ static void __exit contact_exit(void)
+ {
+ mei_cl_driver_unregister(&contact_driver);
+ }
+
+ module_init(contact_init);
+ module_exit(contact_exit);
+
+And the driver's simplified probe routine would look like that:
+
+.. code-block:: C
+
+ int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id)
+ {
+ [...]
+ mei_cldev_enable(dev);
+
+ mei_cldev_register_rx_cb(dev, contact_rx_cb);
+
+ return 0;
+ }
+
+In the probe routine the driver first enable the MEI device and then registers
+an rx handler which is as close as it can get to registering a threaded IRQ handler.
+The handler implementation will typically call :c:func:`mei_cldev_recv` and then
+process received data.
+
+.. code-block:: C
+
+ #define MAX_PAYLOAD 128
+ #define HDR_SIZE 4
+ static void conntact_rx_cb(struct mei_cl_device *cldev)
+ {
+ struct contact *c = mei_cldev_get_drvdata(cldev);
+ unsigned char payload[MAX_PAYLOAD];
+ ssize_t payload_sz;
+
+ payload_sz = mei_cldev_recv(cldev, payload, MAX_PAYLOAD)
+ if (reply_size < HDR_SIZE) {
+ return;
+ }
+
+ c->process_rx(payload);
+
+ }
+
+MEI Client Bus Drivers
+======================
+
+.. toctree::
+ :maxdepth: 2
+
+ hdcp
+ nfc
diff --git a/Documentation/driver-api/mei/mei.rst b/Documentation/driver-api/mei/mei.rst
new file mode 100644
index 000000000000..c800d8e5f422
--- /dev/null
+++ b/Documentation/driver-api/mei/mei.rst
@@ -0,0 +1,176 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Introduction
+============
+
+The Intel Management Engine (Intel ME) is an isolated and protected computing
+resource (Co-processor) residing inside certain Intel chipsets. The Intel ME
+provides support for computer/IT management and security features.
+The actual feature set depends on the Intel chipset SKU.
+
+The Intel Management Engine Interface (Intel MEI, previously known as HECI)
+is the interface between the Host and Intel ME. This interface is exposed
+to the host as a PCI device, actually multiple PCI devices might be exposed.
+The Intel MEI Driver is in charge of the communication channel between
+a host application and the Intel ME features.
+
+Each Intel ME feature, or Intel ME Client is addressed by a unique GUID and
+each client has its own protocol. The protocol is message-based with a
+header and payload up to maximal number of bytes advertised by the client,
+upon connection.
+
+Intel MEI Driver
+================
+
+The driver exposes a character device with device nodes /dev/meiX.
+
+An application maintains communication with an Intel ME feature while
+/dev/meiX is open. The binding to a specific feature is performed by calling
+:c:macro:`MEI_CONNECT_CLIENT_IOCTL`, which passes the desired GUID.
+The number of instances of an Intel ME feature that can be opened
+at the same time depends on the Intel ME feature, but most of the
+features allow only a single instance.
+
+The driver is transparent to data that are passed between firmware feature
+and host application.
+
+Because some of the Intel ME features can change the system
+configuration, the driver by default allows only a privileged
+user to access it.
+
+The session is terminated calling :c:func:`close(int fd)`.
+
+A code snippet for an application communicating with Intel AMTHI client:
+
+.. code-block:: C
+
+ struct mei_connect_client_data data;
+ fd = open(MEI_DEVICE);
+
+ data.d.in_client_uuid = AMTHI_GUID;
+
+ ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &data);
+
+ printf("Ver=%d, MaxLen=%ld\n",
+ data.d.in_client_uuid.protocol_version,
+ data.d.in_client_uuid.max_msg_length);
+
+ [...]
+
+ write(fd, amthi_req_data, amthi_req_data_len);
+
+ [...]
+
+ read(fd, &amthi_res_data, amthi_res_data_len);
+
+ [...]
+ close(fd);
+
+
+User space API
+
+IOCTLs:
+=======
+
+The Intel MEI Driver supports the following IOCTL commands:
+
+IOCTL_MEI_CONNECT_CLIENT
+-------------------------
+Connect to firmware Feature/Client.
+
+.. code-block:: none
+
+ Usage:
+
+ struct mei_connect_client_data client_data;
+
+ ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &client_data);
+
+ Inputs:
+
+ struct mei_connect_client_data - contain the following
+ Input field:
+
+ in_client_uuid - GUID of the FW Feature that needs
+ to connect to.
+ Outputs:
+ out_client_properties - Client Properties: MTU and Protocol Version.
+
+ Error returns:
+
+ ENOTTY No such client (i.e. wrong GUID) or connection is not allowed.
+ EINVAL Wrong IOCTL Number
+ ENODEV Device or Connection is not initialized or ready.
+ ENOMEM Unable to allocate memory to client internal data.
+ EFAULT Fatal Error (e.g. Unable to access user input data)
+ EBUSY Connection Already Open
+
+:Note:
+ max_msg_length (MTU) in client properties describes the maximum
+ data that can be sent or received. (e.g. if MTU=2K, can send
+ requests up to bytes 2k and received responses up to 2k bytes).
+
+
+IOCTL_MEI_NOTIFY_SET
+---------------------
+Enable or disable event notifications.
+
+
+.. code-block:: none
+
+ Usage:
+
+ uint32_t enable;
+
+ ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable);
+
+
+ uint32_t enable = 1;
+ or
+ uint32_t enable[disable] = 0;
+
+ Error returns:
+
+
+ EINVAL Wrong IOCTL Number
+ ENODEV Device is not initialized or the client not connected
+ ENOMEM Unable to allocate memory to client internal data.
+ EFAULT Fatal Error (e.g. Unable to access user input data)
+ EOPNOTSUPP if the device doesn't support the feature
+
+:Note:
+ The client must be connected in order to enable notification events
+
+
+IOCTL_MEI_NOTIFY_GET
+--------------------
+Retrieve event
+
+.. code-block:: none
+
+ Usage:
+ uint32_t event;
+ ioctl(fd, IOCTL_MEI_NOTIFY_GET, &event);
+
+ Outputs:
+ 1 - if an event is pending
+ 0 - if there is no even pending
+
+ Error returns:
+ EINVAL Wrong IOCTL Number
+ ENODEV Device is not initialized or the client not connected
+ ENOMEM Unable to allocate memory to client internal data.
+ EFAULT Fatal Error (e.g. Unable to access user input data)
+ EOPNOTSUPP if the device doesn't support the feature
+
+:Note:
+ The client must be connected and event notification has to be enabled
+ in order to receive an event
+
+
+
+Supported Chipsets
+==================
+82X38/X48 Express and newer
+
+linux-mei@linux.intel.com
diff --git a/Documentation/driver-api/mei/nfc.rst b/Documentation/driver-api/mei/nfc.rst
new file mode 100644
index 000000000000..b5b6fc96f85e
--- /dev/null
+++ b/Documentation/driver-api/mei/nfc.rst
@@ -0,0 +1,28 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+MEI NFC
+-------
+
+Some Intel 8 and 9 Serieses chipsets supports NFC devices connected behind
+the Intel Management Engine controller.
+MEI client bus exposes the NFC chips as NFC phy devices and enables
+binding with Microread and NXP PN544 NFC device driver from the Linux NFC
+subsystem.
+
+.. kernel-render:: DOT
+ :alt: MEI NFC digraph
+ :caption: **MEI NFC** Stack
+
+ digraph NFC {
+ cl_nfc -> me_cl_nfc;
+ "drivers/nfc/mei_phy" -> cl_nfc [lhead=bus];
+ "drivers/nfc/microread/mei" -> cl_nfc;
+ "drivers/nfc/microread/mei" -> "drivers/nfc/mei_phy";
+ "drivers/nfc/pn544/mei" -> cl_nfc;
+ "drivers/nfc/pn544/mei" -> "drivers/nfc/mei_phy";
+ "net/nfc" -> "drivers/nfc/microread/mei";
+ "net/nfc" -> "drivers/nfc/pn544/mei";
+ "neard" -> "net/nfc";
+ cl_nfc [label="mei/bus(nfc)"];
+ me_cl_nfc [label="me fw (nfc)"];
+ }
diff --git a/Documentation/driver-api/memory-devices/index.rst b/Documentation/driver-api/memory-devices/index.rst
new file mode 100644
index 000000000000..28101458cda5
--- /dev/null
+++ b/Documentation/driver-api/memory-devices/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================
+Memory Controller drivers
+=========================
+
+.. toctree::
+ :maxdepth: 1
+
+ ti-emif
+ ti-gpmc
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/driver-api/memory-devices/ti-emif.rst b/Documentation/driver-api/memory-devices/ti-emif.rst
new file mode 100644
index 000000000000..dea2ad9bcd7e
--- /dev/null
+++ b/Documentation/driver-api/memory-devices/ti-emif.rst
@@ -0,0 +1,64 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============================
+TI EMIF SDRAM Controller Driver
+===============================
+
+Author
+======
+Aneesh V <aneesh@ti.com>
+
+Location
+========
+driver/memory/emif.c
+
+Supported SoCs:
+===============
+TI OMAP44xx
+TI OMAP54xx
+
+Menuconfig option:
+==================
+Device Drivers
+ Memory devices
+ Texas Instruments EMIF driver
+
+Description
+===========
+This driver is for the EMIF module available in Texas Instruments
+SoCs. EMIF is an SDRAM controller that, based on its revision,
+supports one or more of DDR2, DDR3, and LPDDR2 SDRAM protocols.
+This driver takes care of only LPDDR2 memories presently. The
+functions of the driver includes re-configuring AC timing
+parameters and other settings during frequency, voltage and
+temperature changes
+
+Platform Data (see include/linux/platform_data/emif_plat.h)
+===========================================================
+DDR device details and other board dependent and SoC dependent
+information can be passed through platform data (struct emif_platform_data)
+
+- DDR device details: 'struct ddr_device_info'
+- Device AC timings: 'struct lpddr2_timings' and 'struct lpddr2_min_tck'
+- Custom configurations: customizable policy options through
+ 'struct emif_custom_configs'
+- IP revision
+- PHY type
+
+Interface to the external world
+===============================
+EMIF driver registers notifiers for voltage and frequency changes
+affecting EMIF and takes appropriate actions when these are invoked.
+
+- freq_pre_notify_handling()
+- freq_post_notify_handling()
+- volt_notify_handling()
+
+Debugfs
+=======
+The driver creates two debugfs entries per device.
+
+- regcache_dump : dump of register values calculated and saved for all
+ frequencies used so far.
+- mr4 : last polled value of MR4 register in the LPDDR2 device. MR4
+ indicates the current temperature level of the device.
diff --git a/Documentation/driver-api/memory-devices/ti-gpmc.rst b/Documentation/driver-api/memory-devices/ti-gpmc.rst
new file mode 100644
index 000000000000..33efcb81f080
--- /dev/null
+++ b/Documentation/driver-api/memory-devices/ti-gpmc.rst
@@ -0,0 +1,179 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================================
+GPMC (General Purpose Memory Controller)
+========================================
+
+GPMC is an unified memory controller dedicated to interfacing external
+memory devices like
+
+ * Asynchronous SRAM like memories and application specific integrated
+ circuit devices.
+ * Asynchronous, synchronous, and page mode burst NOR flash devices
+ NAND flash
+ * Pseudo-SRAM devices
+
+GPMC is found on Texas Instruments SoC's (OMAP based)
+IP details: http://www.ti.com/lit/pdf/spruh73 section 7.1
+
+
+GPMC generic timing calculation:
+================================
+
+GPMC has certain timings that has to be programmed for proper
+functioning of the peripheral, while peripheral has another set of
+timings. To have peripheral work with gpmc, peripheral timings has to
+be translated to the form gpmc can understand. The way it has to be
+translated depends on the connected peripheral. Also there is a
+dependency for certain gpmc timings on gpmc clock frequency. Hence a
+generic timing routine was developed to achieve above requirements.
+
+Generic routine provides a generic method to calculate gpmc timings
+from gpmc peripheral timings. struct gpmc_device_timings fields has to
+be updated with timings from the datasheet of the peripheral that is
+connected to gpmc. A few of the peripheral timings can be fed either
+in time or in cycles, provision to handle this scenario has been
+provided (refer struct gpmc_device_timings definition). It may so
+happen that timing as specified by peripheral datasheet is not present
+in timing structure, in this scenario, try to correlate peripheral
+timing to the one available. If that doesn't work, try to add a new
+field as required by peripheral, educate generic timing routine to
+handle it, make sure that it does not break any of the existing.
+Then there may be cases where peripheral datasheet doesn't mention
+certain fields of struct gpmc_device_timings, zero those entries.
+
+Generic timing routine has been verified to work properly on
+multiple onenand's and tusb6010 peripherals.
+
+A word of caution: generic timing routine has been developed based
+on understanding of gpmc timings, peripheral timings, available
+custom timing routines, a kind of reverse engineering without
+most of the datasheets & hardware (to be exact none of those supported
+in mainline having custom timing routine) and by simulation.
+
+gpmc timing dependency on peripheral timings:
+
+[<gpmc_timing>: <peripheral timing1>, <peripheral timing2> ...]
+
+1. common
+
+cs_on:
+ t_ceasu
+adv_on:
+ t_avdasu, t_ceavd
+
+2. sync common
+
+sync_clk:
+ clk
+page_burst_access:
+ t_bacc
+clk_activation:
+ t_ces, t_avds
+
+3. read async muxed
+
+adv_rd_off:
+ t_avdp_r
+oe_on:
+ t_oeasu, t_aavdh
+access:
+ t_iaa, t_oe, t_ce, t_aa
+rd_cycle:
+ t_rd_cycle, t_cez_r, t_oez
+
+4. read async non-muxed
+
+adv_rd_off:
+ t_avdp_r
+oe_on:
+ t_oeasu
+access:
+ t_iaa, t_oe, t_ce, t_aa
+rd_cycle:
+ t_rd_cycle, t_cez_r, t_oez
+
+5. read sync muxed
+
+adv_rd_off:
+ t_avdp_r, t_avdh
+oe_on:
+ t_oeasu, t_ach, cyc_aavdh_oe
+access:
+ t_iaa, cyc_iaa, cyc_oe
+rd_cycle:
+ t_cez_r, t_oez, t_ce_rdyz
+
+6. read sync non-muxed
+
+adv_rd_off:
+ t_avdp_r
+oe_on:
+ t_oeasu
+access:
+ t_iaa, cyc_iaa, cyc_oe
+rd_cycle:
+ t_cez_r, t_oez, t_ce_rdyz
+
+7. write async muxed
+
+adv_wr_off:
+ t_avdp_w
+we_on, wr_data_mux_bus:
+ t_weasu, t_aavdh, cyc_aavhd_we
+we_off:
+ t_wpl
+cs_wr_off:
+ t_wph
+wr_cycle:
+ t_cez_w, t_wr_cycle
+
+8. write async non-muxed
+
+adv_wr_off:
+ t_avdp_w
+we_on, wr_data_mux_bus:
+ t_weasu
+we_off:
+ t_wpl
+cs_wr_off:
+ t_wph
+wr_cycle:
+ t_cez_w, t_wr_cycle
+
+9. write sync muxed
+
+adv_wr_off:
+ t_avdp_w, t_avdh
+we_on, wr_data_mux_bus:
+ t_weasu, t_rdyo, t_aavdh, cyc_aavhd_we
+we_off:
+ t_wpl, cyc_wpl
+cs_wr_off:
+ t_wph
+wr_cycle:
+ t_cez_w, t_ce_rdyz
+
+10. write sync non-muxed
+
+adv_wr_off:
+ t_avdp_w
+we_on, wr_data_mux_bus:
+ t_weasu, t_rdyo
+we_off:
+ t_wpl, cyc_wpl
+cs_wr_off:
+ t_wph
+wr_cycle:
+ t_cez_w, t_ce_rdyz
+
+
+Note:
+ Many of gpmc timings are dependent on other gpmc timings (a few
+ gpmc timings purely dependent on other gpmc timings, a reason that
+ some of the gpmc timings are missing above), and it will result in
+ indirect dependency of peripheral timings to gpmc timings other than
+ mentioned above, refer timing routine for more details. To know what
+ these peripheral timings correspond to, please see explanations in
+ struct gpmc_device_timings definition. And for gpmc timings refer
+ IP details (link above).
diff --git a/Documentation/men-chameleon-bus.txt b/Documentation/driver-api/men-chameleon-bus.rst
index 1b1f048aa748..1b1f048aa748 100644
--- a/Documentation/men-chameleon-bus.txt
+++ b/Documentation/driver-api/men-chameleon-bus.rst
diff --git a/Documentation/driver-api/mmc/index.rst b/Documentation/driver-api/mmc/index.rst
new file mode 100644
index 000000000000..7339736ac774
--- /dev/null
+++ b/Documentation/driver-api/mmc/index.rst
@@ -0,0 +1,13 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+MMC/SD/SDIO card support
+========================
+
+.. toctree::
+ :maxdepth: 1
+
+ mmc-dev-attrs
+ mmc-dev-parts
+ mmc-async-req
+ mmc-tools
diff --git a/Documentation/driver-api/mmc/mmc-async-req.rst b/Documentation/driver-api/mmc/mmc-async-req.rst
new file mode 100644
index 000000000000..0f7197c9c3b5
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-async-req.rst
@@ -0,0 +1,98 @@
+========================
+MMC Asynchronous Request
+========================
+
+Rationale
+=========
+
+How significant is the cache maintenance overhead?
+
+It depends. Fast eMMC and multiple cache levels with speculative cache
+pre-fetch makes the cache overhead relatively significant. If the DMA
+preparations for the next request are done in parallel with the current
+transfer, the DMA preparation overhead would not affect the MMC performance.
+
+The intention of non-blocking (asynchronous) MMC requests is to minimize the
+time between when an MMC request ends and another MMC request begins.
+
+Using mmc_wait_for_req(), the MMC controller is idle while dma_map_sg and
+dma_unmap_sg are processing. Using non-blocking MMC requests makes it
+possible to prepare the caches for next job in parallel with an active
+MMC request.
+
+MMC block driver
+================
+
+The mmc_blk_issue_rw_rq() in the MMC block driver is made non-blocking.
+
+The increase in throughput is proportional to the time it takes to
+prepare (major part of preparations are dma_map_sg() and dma_unmap_sg())
+a request and how fast the memory is. The faster the MMC/SD is the
+more significant the prepare request time becomes. Roughly the expected
+performance gain is 5% for large writes and 10% on large reads on a L2 cache
+platform. In power save mode, when clocks run on a lower frequency, the DMA
+preparation may cost even more. As long as these slower preparations are run
+in parallel with the transfer performance won't be affected.
+
+Details on measurements from IOZone and mmc_test
+================================================
+
+https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
+
+MMC core API extension
+======================
+
+There is one new public function mmc_start_req().
+
+It starts a new MMC command request for a host. The function isn't
+truly non-blocking. If there is an ongoing async request it waits
+for completion of that request and starts the new one and returns. It
+doesn't wait for the new request to complete. If there is no ongoing
+request it starts the new request and returns immediately.
+
+MMC host extensions
+===================
+
+There are two optional members in the mmc_host_ops -- pre_req() and
+post_req() -- that the host driver may implement in order to move work
+to before and after the actual mmc_host_ops.request() function is called.
+
+In the DMA case pre_req() may do dma_map_sg() and prepare the DMA
+descriptor, and post_req() runs the dma_unmap_sg().
+
+Optimize for the first request
+==============================
+
+The first request in a series of requests can't be prepared in parallel
+with the previous transfer, since there is no previous request.
+
+The argument is_first_req in pre_req() indicates that there is no previous
+request. The host driver may optimize for this scenario to minimize
+the performance loss. A way to optimize for this is to split the current
+request in two chunks, prepare the first chunk and start the request,
+and finally prepare the second chunk and start the transfer.
+
+Pseudocode to handle is_first_req scenario with minimal prepare overhead::
+
+ if (is_first_req && req->size > threshold)
+ /* start MMC transfer for the complete transfer size */
+ mmc_start_command(MMC_CMD_TRANSFER_FULL_SIZE);
+
+ /*
+ * Begin to prepare DMA while cmd is being processed by MMC.
+ * The first chunk of the request should take the same time
+ * to prepare as the "MMC process command time".
+ * If prepare time exceeds MMC cmd time
+ * the transfer is delayed, guesstimate max 4k as first chunk size.
+ */
+ prepare_1st_chunk_for_dma(req);
+ /* flush pending desc to the DMAC (dmaengine.h) */
+ dma_issue_pending(req->dma_desc);
+
+ prepare_2nd_chunk_for_dma(req);
+ /*
+ * The second issue_pending should be called before MMC runs out
+ * of the first chunk. If the MMC runs out of the first data chunk
+ * before this call, the transfer is delayed.
+ */
+ dma_issue_pending(req->dma_desc);
diff --git a/Documentation/driver-api/mmc/mmc-dev-attrs.rst b/Documentation/driver-api/mmc/mmc-dev-attrs.rst
new file mode 100644
index 000000000000..4f44b1b730d6
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-dev-attrs.rst
@@ -0,0 +1,91 @@
+==================================
+SD and MMC Block Device Attributes
+==================================
+
+These attributes are defined for the block devices associated with the
+SD or MMC device.
+
+The following attributes are read/write.
+
+ ======== ===============================================
+ force_ro Enforce read-only access even if write protect switch is off.
+ ======== ===============================================
+
+SD and MMC Device Attributes
+============================
+
+All attributes are read-only.
+
+ ====================== ===============================================
+ cid Card Identification Register
+ csd Card Specific Data Register
+ scr SD Card Configuration Register (SD only)
+ date Manufacturing Date (from CID Register)
+ fwrev Firmware/Product Revision (from CID Register)
+ (SD and MMCv1 only)
+ hwrev Hardware/Product Revision (from CID Register)
+ (SD and MMCv1 only)
+ manfid Manufacturer ID (from CID Register)
+ name Product Name (from CID Register)
+ oemid OEM/Application ID (from CID Register)
+ prv Product Revision (from CID Register)
+ (SD and MMCv4 only)
+ serial Product Serial Number (from CID Register)
+ erase_size Erase group size
+ preferred_erase_size Preferred erase size
+ raw_rpmb_size_mult RPMB partition size
+ rel_sectors Reliable write sector count
+ ocr Operation Conditions Register
+ dsr Driver Stage Register
+ cmdq_en Command Queue enabled:
+
+ 1 => enabled, 0 => not enabled
+ ====================== ===============================================
+
+Note on Erase Size and Preferred Erase Size:
+
+ "erase_size" is the minimum size, in bytes, of an erase
+ operation. For MMC, "erase_size" is the erase group size
+ reported by the card. Note that "erase_size" does not apply
+ to trim or secure trim operations where the minimum size is
+ always one 512 byte sector. For SD, "erase_size" is 512
+ if the card is block-addressed, 0 otherwise.
+
+ SD/MMC cards can erase an arbitrarily large area up to and
+ including the whole card. When erasing a large area it may
+ be desirable to do it in smaller chunks for three reasons:
+
+ 1. A single erase command will make all other I/O on
+ the card wait. This is not a problem if the whole card
+ is being erased, but erasing one partition will make
+ I/O for another partition on the same card wait for the
+ duration of the erase - which could be a several
+ minutes.
+ 2. To be able to inform the user of erase progress.
+ 3. The erase timeout becomes too large to be very
+ useful. Because the erase timeout contains a margin
+ which is multiplied by the size of the erase area,
+ the value can end up being several minutes for large
+ areas.
+
+ "erase_size" is not the most efficient unit to erase
+ (especially for SD where it is just one sector),
+ hence "preferred_erase_size" provides a good chunk
+ size for erasing large areas.
+
+ For MMC, "preferred_erase_size" is the high-capacity
+ erase size if a card specifies one, otherwise it is
+ based on the capacity of the card.
+
+ For SD, "preferred_erase_size" is the allocation unit
+ size specified by the card.
+
+ "preferred_erase_size" is in bytes.
+
+Note on raw_rpmb_size_mult:
+
+ "raw_rpmb_size_mult" is a multiple of 128kB block.
+
+ RPMB size in byte is calculated by using the following equation:
+
+ RPMB partition size = 128kB x raw_rpmb_size_mult
diff --git a/Documentation/driver-api/mmc/mmc-dev-parts.rst b/Documentation/driver-api/mmc/mmc-dev-parts.rst
new file mode 100644
index 000000000000..995922f1f744
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-dev-parts.rst
@@ -0,0 +1,41 @@
+============================
+SD and MMC Device Partitions
+============================
+
+Device partitions are additional logical block devices present on the
+SD/MMC device.
+
+As of this writing, MMC boot partitions as supported and exposed as
+/dev/mmcblkXboot0 and /dev/mmcblkXboot1, where X is the index of the
+parent /dev/mmcblkX.
+
+MMC Boot Partitions
+===================
+
+Read and write access is provided to the two MMC boot partitions. Due to
+the sensitive nature of the boot partition contents, which often store
+a bootloader or bootloader configuration tables crucial to booting the
+platform, write access is disabled by default to reduce the chance of
+accidental bricking.
+
+To enable write access to /dev/mmcblkXbootY, disable the forced read-only
+access with::
+
+ echo 0 > /sys/block/mmcblkXbootY/force_ro
+
+To re-enable read-only access::
+
+ echo 1 > /sys/block/mmcblkXbootY/force_ro
+
+The boot partitions can also be locked read only until the next power on,
+with::
+
+ echo 1 > /sys/block/mmcblkXbootY/ro_lock_until_next_power_on
+
+This is a feature of the card and not of the kernel. If the card does
+not support boot partition locking, the file will not exist. If the
+feature has been disabled on the card, the file will be read-only.
+
+The boot partitions can also be locked permanently, but this feature is
+not accessible through sysfs in order to avoid accidental or malicious
+bricking.
diff --git a/Documentation/driver-api/mmc/mmc-tools.rst b/Documentation/driver-api/mmc/mmc-tools.rst
new file mode 100644
index 000000000000..54406093768b
--- /dev/null
+++ b/Documentation/driver-api/mmc/mmc-tools.rst
@@ -0,0 +1,37 @@
+======================
+MMC tools introduction
+======================
+
+There is one MMC test tools called mmc-utils, which is maintained by Chris Ball,
+you can find it at the below public git repository:
+
+ http://git.kernel.org/cgit/linux/kernel/git/cjb/mmc-utils.git/
+
+Functions
+=========
+
+The mmc-utils tools can do the following:
+
+ - Print and parse extcsd data.
+ - Determine the eMMC writeprotect status.
+ - Set the eMMC writeprotect status.
+ - Set the eMMC data sector size to 4KB by disabling emulation.
+ - Create general purpose partition.
+ - Enable the enhanced user area.
+ - Enable write reliability per partition.
+ - Print the response to STATUS_SEND (CMD13).
+ - Enable the boot partition.
+ - Set Boot Bus Conditions.
+ - Enable the eMMC BKOPS feature.
+ - Permanently enable the eMMC H/W Reset feature.
+ - Permanently disable the eMMC H/W Reset feature.
+ - Send Sanitize command.
+ - Program authentication key for the device.
+ - Counter value for the rpmb device will be read to stdout.
+ - Read from rpmb device to output.
+ - Write to rpmb device from data file.
+ - Enable the eMMC cache feature.
+ - Disable the eMMC cache feature.
+ - Print and parse CID data.
+ - Print and parse CSD data.
+ - Print and parse SCR data.
diff --git a/Documentation/driver-api/mtd/index.rst b/Documentation/driver-api/mtd/index.rst
new file mode 100644
index 000000000000..436ba5a851d7
--- /dev/null
+++ b/Documentation/driver-api/mtd/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Memory Technology Device (MTD)
+==============================
+
+.. toctree::
+ :maxdepth: 1
+
+ intel-spi
+ nand_ecc
+ spi-nor
diff --git a/Documentation/driver-api/mtd/intel-spi.rst b/Documentation/driver-api/mtd/intel-spi.rst
new file mode 100644
index 000000000000..0e6d9cd5388d
--- /dev/null
+++ b/Documentation/driver-api/mtd/intel-spi.rst
@@ -0,0 +1,90 @@
+==============================
+Upgrading BIOS using intel-spi
+==============================
+
+Many Intel CPUs like Baytrail and Braswell include SPI serial flash host
+controller which is used to hold BIOS and other platform specific data.
+Since contents of the SPI serial flash is crucial for machine to function,
+it is typically protected by different hardware protection mechanisms to
+avoid accidental (or on purpose) overwrite of the content.
+
+Not all manufacturers protect the SPI serial flash, mainly because it
+allows upgrading the BIOS image directly from an OS.
+
+The intel-spi driver makes it possible to read and write the SPI serial
+flash, if certain protection bits are not set and locked. If it finds
+any of them set, the whole MTD device is made read-only to prevent
+partial overwrites. By default the driver exposes SPI serial flash
+contents as read-only but it can be changed from kernel command line,
+passing "intel-spi.writeable=1".
+
+Please keep in mind that overwriting the BIOS image on SPI serial flash
+might render the machine unbootable and requires special equipment like
+Dediprog to revive. You have been warned!
+
+Below are the steps how to upgrade MinnowBoard MAX BIOS directly from
+Linux.
+
+ 1) Download and extract the latest Minnowboard MAX BIOS SPI image
+ [1]. At the time writing this the latest image is v92.
+
+ 2) Install mtd-utils package [2]. We need this in order to erase the SPI
+ serial flash. Distros like Debian and Fedora have this prepackaged with
+ name "mtd-utils".
+
+ 3) Add "intel-spi.writeable=1" to the kernel command line and reboot
+ the board (you can also reload the driver passing "writeable=1" as
+ module parameter to modprobe).
+
+ 4) Once the board is up and running again, find the right MTD partition
+ (it is named as "BIOS")::
+
+ # cat /proc/mtd
+ dev: size erasesize name
+ mtd0: 00800000 00001000 "BIOS"
+
+ So here it will be /dev/mtd0 but it may vary.
+
+ 5) Make backup of the existing image first::
+
+ # dd if=/dev/mtd0ro of=bios.bak
+ 16384+0 records in
+ 16384+0 records out
+ 8388608 bytes (8.4 MB) copied, 10.0269 s, 837 kB/s
+
+ 6) Verify the backup:
+
+ # sha1sum /dev/mtd0ro bios.bak
+ fdbb011920572ca6c991377c4b418a0502668b73 /dev/mtd0ro
+ fdbb011920572ca6c991377c4b418a0502668b73 bios.bak
+
+ The SHA1 sums must match. Otherwise do not continue any further!
+
+ 7) Erase the SPI serial flash. After this step, do not reboot the
+ board! Otherwise it will not start anymore::
+
+ # flash_erase /dev/mtd0 0 0
+ Erasing 4 Kibyte @ 7ff000 -- 100 % complete
+
+ 8) Once completed without errors you can write the new BIOS image:
+
+ # dd if=MNW2MAX1.X64.0092.R01.1605221712.bin of=/dev/mtd0
+
+ 9) Verify that the new content of the SPI serial flash matches the new
+ BIOS image::
+
+ # sha1sum /dev/mtd0ro MNW2MAX1.X64.0092.R01.1605221712.bin
+ 9b4df9e4be2057fceec3a5529ec3d950836c87a2 /dev/mtd0ro
+ 9b4df9e4be2057fceec3a5529ec3d950836c87a2 MNW2MAX1.X64.0092.R01.1605221712.bin
+
+ The SHA1 sums should match.
+
+ 10) Now you can reboot your board and observe the new BIOS starting up
+ properly.
+
+References
+----------
+
+[1] https://firmware.intel.com/sites/default/files/MinnowBoard%2EMAX_%2EX64%2E92%2ER01%2Ezip
+
+[2] http://www.linux-mtd.infradead.org/
diff --git a/Documentation/driver-api/mtd/nand_ecc.rst b/Documentation/driver-api/mtd/nand_ecc.rst
new file mode 100644
index 000000000000..e8d3c53a5056
--- /dev/null
+++ b/Documentation/driver-api/mtd/nand_ecc.rst
@@ -0,0 +1,763 @@
+==========================
+NAND Error-correction Code
+==========================
+
+Introduction
+============
+
+Having looked at the linux mtd/nand driver and more specific at nand_ecc.c
+I felt there was room for optimisation. I bashed the code for a few hours
+performing tricks like table lookup removing superfluous code etc.
+After that the speed was increased by 35-40%.
+Still I was not too happy as I felt there was additional room for improvement.
+
+Bad! I was hooked.
+I decided to annotate my steps in this file. Perhaps it is useful to someone
+or someone learns something from it.
+
+
+The problem
+===========
+
+NAND flash (at least SLC one) typically has sectors of 256 bytes.
+However NAND flash is not extremely reliable so some error detection
+(and sometimes correction) is needed.
+
+This is done by means of a Hamming code. I'll try to explain it in
+laymans terms (and apologies to all the pro's in the field in case I do
+not use the right terminology, my coding theory class was almost 30
+years ago, and I must admit it was not one of my favourites).
+
+As I said before the ecc calculation is performed on sectors of 256
+bytes. This is done by calculating several parity bits over the rows and
+columns. The parity used is even parity which means that the parity bit = 1
+if the data over which the parity is calculated is 1 and the parity bit = 0
+if the data over which the parity is calculated is 0. So the total
+number of bits over the data over which the parity is calculated + the
+parity bit is even. (see wikipedia if you can't follow this).
+Parity is often calculated by means of an exclusive or operation,
+sometimes also referred to as xor. In C the operator for xor is ^
+
+Back to ecc.
+Let's give a small figure:
+
+========= ==== ==== ==== ==== ==== ==== ==== ==== === === === === ====
+byte 0: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp4 ... rp14
+byte 1: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp2 rp4 ... rp14
+byte 2: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp4 ... rp14
+byte 3: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp4 ... rp14
+byte 4: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp5 ... rp14
+...
+byte 254: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp5 ... rp15
+byte 255: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp5 ... rp15
+ cp1 cp0 cp1 cp0 cp1 cp0 cp1 cp0
+ cp3 cp3 cp2 cp2 cp3 cp3 cp2 cp2
+ cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4
+========= ==== ==== ==== ==== ==== ==== ==== ==== === === === === ====
+
+This figure represents a sector of 256 bytes.
+cp is my abbreviation for column parity, rp for row parity.
+
+Let's start to explain column parity.
+
+- cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
+
+ so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even.
+
+Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7.
+
+- cp2 is the parity over bit0, bit1, bit4 and bit5
+- cp3 is the parity over bit2, bit3, bit6 and bit7.
+- cp4 is the parity over bit0, bit1, bit2 and bit3.
+- cp5 is the parity over bit4, bit5, bit6 and bit7.
+
+Note that each of cp0 .. cp5 is exactly one bit.
+
+Row parity actually works almost the same.
+
+- rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254)
+- rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255)
+- rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ...
+ (so handle two bytes, then skip 2 bytes).
+- rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...)
+- for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc.
+
+ so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...)
+- and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, ..
+
+The story now becomes quite boring. I guess you get the idea.
+
+- rp6 covers 8 bytes then skips 8 etc
+- rp7 skips 8 bytes then covers 8 etc
+- rp8 covers 16 bytes then skips 16 etc
+- rp9 skips 16 bytes then covers 16 etc
+- rp10 covers 32 bytes then skips 32 etc
+- rp11 skips 32 bytes then covers 32 etc
+- rp12 covers 64 bytes then skips 64 etc
+- rp13 skips 64 bytes then covers 64 etc
+- rp14 covers 128 bytes then skips 128
+- rp15 skips 128 bytes then covers 128
+
+In the end the parity bits are grouped together in three bytes as
+follows:
+
+===== ===== ===== ===== ===== ===== ===== ===== =====
+ECC Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0
+===== ===== ===== ===== ===== ===== ===== ===== =====
+ECC 0 rp07 rp06 rp05 rp04 rp03 rp02 rp01 rp00
+ECC 1 rp15 rp14 rp13 rp12 rp11 rp10 rp09 rp08
+ECC 2 cp5 cp4 cp3 cp2 cp1 cp0 1 1
+===== ===== ===== ===== ===== ===== ===== ===== =====
+
+I detected after writing this that ST application note AN1823
+(http://www.st.com/stonline/) gives a much
+nicer picture.(but they use line parity as term where I use row parity)
+Oh well, I'm graphically challenged, so suffer with me for a moment :-)
+
+And I could not reuse the ST picture anyway for copyright reasons.
+
+
+Attempt 0
+=========
+
+Implementing the parity calculation is pretty simple.
+In C pseudocode::
+
+ for (i = 0; i < 256; i++)
+ {
+ if (i & 0x01)
+ rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
+ else
+ rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp0;
+ if (i & 0x02)
+ rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3;
+ else
+ rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2;
+ if (i & 0x04)
+ rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5;
+ else
+ rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4;
+ if (i & 0x08)
+ rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7;
+ else
+ rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6;
+ if (i & 0x10)
+ rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9;
+ else
+ rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8;
+ if (i & 0x20)
+ rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11;
+ else
+ rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10;
+ if (i & 0x40)
+ rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13;
+ else
+ rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12;
+ if (i & 0x80)
+ rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15;
+ else
+ rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14;
+ cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0;
+ cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1;
+ cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2;
+ cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3
+ cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4
+ cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5
+ }
+
+
+Analysis 0
+==========
+
+C does have bitwise operators but not really operators to do the above
+efficiently (and most hardware has no such instructions either).
+Therefore without implementing this it was clear that the code above was
+not going to bring me a Nobel prize :-)
+
+Fortunately the exclusive or operation is commutative, so we can combine
+the values in any order. So instead of calculating all the bits
+individually, let us try to rearrange things.
+For the column parity this is easy. We can just xor the bytes and in the
+end filter out the relevant bits. This is pretty nice as it will bring
+all cp calculation out of the for loop.
+
+Similarly we can first xor the bytes for the various rows.
+This leads to:
+
+
+Attempt 1
+=========
+
+::
+
+ const char parity[256] = {
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
+ 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
+ };
+
+ void ecc1(const unsigned char *buf, unsigned char *code)
+ {
+ int i;
+ const unsigned char *bp = buf;
+ unsigned char cur;
+ unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+ unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+ unsigned char par;
+
+ par = 0;
+ rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+ rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+ rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+ rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+ for (i = 0; i < 256; i++)
+ {
+ cur = *bp++;
+ par ^= cur;
+ if (i & 0x01) rp1 ^= cur; else rp0 ^= cur;
+ if (i & 0x02) rp3 ^= cur; else rp2 ^= cur;
+ if (i & 0x04) rp5 ^= cur; else rp4 ^= cur;
+ if (i & 0x08) rp7 ^= cur; else rp6 ^= cur;
+ if (i & 0x10) rp9 ^= cur; else rp8 ^= cur;
+ if (i & 0x20) rp11 ^= cur; else rp10 ^= cur;
+ if (i & 0x40) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x80) rp15 ^= cur; else rp14 ^= cur;
+ }
+ code[0] =
+ (parity[rp7] << 7) |
+ (parity[rp6] << 6) |
+ (parity[rp5] << 5) |
+ (parity[rp4] << 4) |
+ (parity[rp3] << 3) |
+ (parity[rp2] << 2) |
+ (parity[rp1] << 1) |
+ (parity[rp0]);
+ code[1] =
+ (parity[rp15] << 7) |
+ (parity[rp14] << 6) |
+ (parity[rp13] << 5) |
+ (parity[rp12] << 4) |
+ (parity[rp11] << 3) |
+ (parity[rp10] << 2) |
+ (parity[rp9] << 1) |
+ (parity[rp8]);
+ code[2] =
+ (parity[par & 0xf0] << 7) |
+ (parity[par & 0x0f] << 6) |
+ (parity[par & 0xcc] << 5) |
+ (parity[par & 0x33] << 4) |
+ (parity[par & 0xaa] << 3) |
+ (parity[par & 0x55] << 2);
+ code[0] = ~code[0];
+ code[1] = ~code[1];
+ code[2] = ~code[2];
+ }
+
+Still pretty straightforward. The last three invert statements are there to
+give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash
+all data is 0xff, so the checksum then matches.
+
+I also introduced the parity lookup. I expected this to be the fastest
+way to calculate the parity, but I will investigate alternatives later
+on.
+
+
+Analysis 1
+==========
+
+The code works, but is not terribly efficient. On my system it took
+almost 4 times as much time as the linux driver code. But hey, if it was
+*that* easy this would have been done long before.
+No pain. no gain.
+
+Fortunately there is plenty of room for improvement.
+
+In step 1 we moved from bit-wise calculation to byte-wise calculation.
+However in C we can also use the unsigned long data type and virtually
+every modern microprocessor supports 32 bit operations, so why not try
+to write our code in such a way that we process data in 32 bit chunks.
+
+Of course this means some modification as the row parity is byte by
+byte. A quick analysis:
+for the column parity we use the par variable. When extending to 32 bits
+we can in the end easily calculate rp0 and rp1 from it.
+(because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0
+respectively, from MSB to LSB)
+also rp2 and rp3 can be easily retrieved from par as rp3 covers the
+first two MSBs and rp2 covers the last two LSBs.
+
+Note that of course now the loop is executed only 64 times (256/4).
+And note that care must taken wrt byte ordering. The way bytes are
+ordered in a long is machine dependent, and might affect us.
+Anyway, if there is an issue: this code is developed on x86 (to be
+precise: a DELL PC with a D920 Intel CPU)
+
+And of course the performance might depend on alignment, but I expect
+that the I/O buffers in the nand driver are aligned properly (and
+otherwise that should be fixed to get maximum performance).
+
+Let's give it a try...
+
+
+Attempt 2
+=========
+
+::
+
+ extern const char parity[256];
+
+ void ecc2(const unsigned char *buf, unsigned char *code)
+ {
+ int i;
+ const unsigned long *bp = (unsigned long *)buf;
+ unsigned long cur;
+ unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
+ unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
+ unsigned long par;
+
+ par = 0;
+ rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
+ rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
+ rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
+ rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
+
+ for (i = 0; i < 64; i++)
+ {
+ cur = *bp++;
+ par ^= cur;
+ if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+ if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+ if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+ if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+ if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+ }
+ /*
+ we need to adapt the code generation for the fact that rp vars are now
+ long; also the column parity calculation needs to be changed.
+ we'll bring rp4 to 15 back to single byte entities by shifting and
+ xoring
+ */
+ rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff;
+ rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff;
+ rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff;
+ rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff;
+ rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff;
+ rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff;
+ rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff;
+ rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff;
+ rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff;
+ rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff;
+ rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff;
+ rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff;
+ rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff;
+ rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff;
+ par ^= (par >> 16);
+ rp1 = (par >> 8); rp1 &= 0xff;
+ rp0 = (par & 0xff);
+ par ^= (par >> 8); par &= 0xff;
+
+ code[0] =
+ (parity[rp7] << 7) |
+ (parity[rp6] << 6) |
+ (parity[rp5] << 5) |
+ (parity[rp4] << 4) |
+ (parity[rp3] << 3) |
+ (parity[rp2] << 2) |
+ (parity[rp1] << 1) |
+ (parity[rp0]);
+ code[1] =
+ (parity[rp15] << 7) |
+ (parity[rp14] << 6) |
+ (parity[rp13] << 5) |
+ (parity[rp12] << 4) |
+ (parity[rp11] << 3) |
+ (parity[rp10] << 2) |
+ (parity[rp9] << 1) |
+ (parity[rp8]);
+ code[2] =
+ (parity[par & 0xf0] << 7) |
+ (parity[par & 0x0f] << 6) |
+ (parity[par & 0xcc] << 5) |
+ (parity[par & 0x33] << 4) |
+ (parity[par & 0xaa] << 3) |
+ (parity[par & 0x55] << 2);
+ code[0] = ~code[0];
+ code[1] = ~code[1];
+ code[2] = ~code[2];
+ }
+
+The parity array is not shown any more. Note also that for these
+examples I kinda deviated from my regular programming style by allowing
+multiple statements on a line, not using { } in then and else blocks
+with only a single statement and by using operators like ^=
+
+
+Analysis 2
+==========
+
+The code (of course) works, and hurray: we are a little bit faster than
+the linux driver code (about 15%). But wait, don't cheer too quickly.
+There is more to be gained.
+If we look at e.g. rp14 and rp15 we see that we either xor our data with
+rp14 or with rp15. However we also have par which goes over all data.
+This means there is no need to calculate rp14 as it can be calculated from
+rp15 through rp14 = par ^ rp15, because par = rp14 ^ rp15;
+(or if desired we can avoid calculating rp15 and calculate it from
+rp14). That is why some places refer to inverse parity.
+Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13.
+Effectively this means we can eliminate the else clause from the if
+statements. Also we can optimise the calculation in the end a little bit
+by going from long to byte first. Actually we can even avoid the table
+lookups
+
+Attempt 3
+=========
+
+Odd replaced::
+
+ if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
+ if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
+ if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
+ if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
+ if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
+
+with::
+
+ if (i & 0x01) rp5 ^= cur;
+ if (i & 0x02) rp7 ^= cur;
+ if (i & 0x04) rp9 ^= cur;
+ if (i & 0x08) rp11 ^= cur;
+ if (i & 0x10) rp13 ^= cur;
+ if (i & 0x20) rp15 ^= cur;
+
+and outside the loop added::
+
+ rp4 = par ^ rp5;
+ rp6 = par ^ rp7;
+ rp8 = par ^ rp9;
+ rp10 = par ^ rp11;
+ rp12 = par ^ rp13;
+ rp14 = par ^ rp15;
+
+And after that the code takes about 30% more time, although the number of
+statements is reduced. This is also reflected in the assembly code.
+
+
+Analysis 3
+==========
+
+Very weird. Guess it has to do with caching or instruction parallellism
+or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting
+observation was that this one is only 30% slower (according to time)
+executing the code as my 3Ghz D920 processor.
+
+Well, it was expected not to be easy so maybe instead move to a
+different track: let's move back to the code from attempt2 and do some
+loop unrolling. This will eliminate a few if statements. I'll try
+different amounts of unrolling to see what works best.
+
+
+Attempt 4
+=========
+
+Unrolled the loop 1, 2, 3 and 4 times.
+For 4 the code starts with::
+
+ for (i = 0; i < 4; i++)
+ {
+ cur = *bp++;
+ par ^= cur;
+ rp4 ^= cur;
+ rp6 ^= cur;
+ rp8 ^= cur;
+ rp10 ^= cur;
+ if (i & 0x1) rp13 ^= cur; else rp12 ^= cur;
+ if (i & 0x2) rp15 ^= cur; else rp14 ^= cur;
+ cur = *bp++;
+ par ^= cur;
+ rp5 ^= cur;
+ rp6 ^= cur;
+ ...
+
+
+Analysis 4
+==========
+
+Unrolling once gains about 15%
+
+Unrolling twice keeps the gain at about 15%
+
+Unrolling three times gives a gain of 30% compared to attempt 2.
+
+Unrolling four times gives a marginal improvement compared to unrolling
+three times.
+
+I decided to proceed with a four time unrolled loop anyway. It was my gut
+feeling that in the next steps I would obtain additional gain from it.
+
+The next step was triggered by the fact that par contains the xor of all
+bytes and rp4 and rp5 each contain the xor of half of the bytes.
+So in effect par = rp4 ^ rp5. But as xor is commutative we can also say
+that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can
+eliminate rp5 (or rp4, but I already foresaw another optimisation).
+The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15.
+
+
+Attempt 5
+=========
+
+Effectively so all odd digit rp assignments in the loop were removed.
+This included the else clause of the if statements.
+Of course after the loop we need to correct things by adding code like::
+
+ rp5 = par ^ rp4;
+
+Also the initial assignments (rp5 = 0; etc) could be removed.
+Along the line I also removed the initialisation of rp0/1/2/3.
+
+
+Analysis 5
+==========
+
+Measurements showed this was a good move. The run-time roughly halved
+compared with attempt 4 with 4 times unrolled, and we only require 1/3rd
+of the processor time compared to the current code in the linux kernel.
+
+However, still I thought there was more. I didn't like all the if
+statements. Why not keep a running parity and only keep the last if
+statement. Time for yet another version!
+
+
+Attempt 6
+=========
+
+THe code within the for loop was changed to::
+
+ for (i = 0; i < 4; i++)
+ {
+ cur = *bp++; tmppar = cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp8 ^= cur;
+
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur;
+
+ par ^= tmppar;
+ if ((i & 0x1) == 0) rp12 ^= tmppar;
+ if ((i & 0x2) == 0) rp14 ^= tmppar;
+ }
+
+As you can see tmppar is used to accumulate the parity within a for
+iteration. In the last 3 statements is added to par and, if needed,
+to rp12 and rp14.
+
+While making the changes I also found that I could exploit that tmppar
+contains the running parity for this iteration. So instead of having:
+rp4 ^= cur; rp6 ^= cur;
+I removed the rp6 ^= cur; statement and did rp6 ^= tmppar; on next
+statement. A similar change was done for rp8 and rp10
+
+
+Analysis 6
+==========
+
+Measuring this code again showed big gain. When executing the original
+linux code 1 million times, this took about 1 second on my system.
+(using time to measure the performance). After this iteration I was back
+to 0.075 sec. Actually I had to decide to start measuring over 10
+million iterations in order not to lose too much accuracy. This one
+definitely seemed to be the jackpot!
+
+There is a little bit more room for improvement though. There are three
+places with statements::
+
+ rp4 ^= cur; rp6 ^= cur;
+
+It seems more efficient to also maintain a variable rp4_6 in the while
+loop; This eliminates 3 statements per loop. Of course after the loop we
+need to correct by adding::
+
+ rp4 ^= rp4_6;
+ rp6 ^= rp4_6
+
+Furthermore there are 4 sequential assignments to rp8. This can be
+encoded slightly more efficiently by saving tmppar before those 4 lines
+and later do rp8 = rp8 ^ tmppar ^ notrp8;
+(where notrp8 is the value of rp8 before those 4 lines).
+Again a use of the commutative property of xor.
+Time for a new test!
+
+
+Attempt 7
+=========
+
+The new code now looks like::
+
+ for (i = 0; i < 4; i++)
+ {
+ cur = *bp++; tmppar = cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
+
+ cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
+
+ notrp8 = tmppar;
+ cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur;
+ rp8 = rp8 ^ tmppar ^ notrp8;
+
+ cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp6 ^= cur;
+ cur = *bp++; tmppar ^= cur; rp4 ^= cur;
+ cur = *bp++; tmppar ^= cur;
+
+ par ^= tmppar;
+ if ((i & 0x1) == 0) rp12 ^= tmppar;
+ if ((i & 0x2) == 0) rp14 ^= tmppar;
+ }
+ rp4 ^= rp4_6;
+ rp6 ^= rp4_6;
+
+
+Not a big change, but every penny counts :-)
+
+
+Analysis 7
+==========
+
+Actually this made things worse. Not very much, but I don't want to move
+into the wrong direction. Maybe something to investigate later. Could
+have to do with caching again.
+
+Guess that is what there is to win within the loop. Maybe unrolling one
+more time will help. I'll keep the optimisations from 7 for now.
+
+
+Attempt 8
+=========
+
+Unrolled the loop one more time.
+
+
+Analysis 8
+==========
+
+This makes things worse. Let's stick with attempt 6 and continue from there.
+Although it seems that the code within the loop cannot be optimised
+further there is still room to optimize the generation of the ecc codes.
+We can simply calculate the total parity. If this is 0 then rp4 = rp5
+etc. If the parity is 1, then rp4 = !rp5;
+
+But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
+in the result byte and then do something like::
+
+ code[0] |= (code[0] << 1);
+
+Lets test this.
+
+
+Attempt 9
+=========
+
+Changed the code but again this slightly degrades performance. Tried all
+kind of other things, like having dedicated parity arrays to avoid the
+shift after parity[rp7] << 7; No gain.
+Change the lookup using the parity array by using shift operators (e.g.
+replace parity[rp7] << 7 with::
+
+ rp7 ^= (rp7 << 4);
+ rp7 ^= (rp7 << 2);
+ rp7 ^= (rp7 << 1);
+ rp7 &= 0x80;
+
+No gain.
+
+The only marginal change was inverting the parity bits, so we can remove
+the last three invert statements.
+
+Ah well, pity this does not deliver more. Then again 10 million
+iterations using the linux driver code takes between 13 and 13.5
+seconds, whereas my code now takes about 0.73 seconds for those 10
+million iterations. So basically I've improved the performance by a
+factor 18 on my system. Not that bad. Of course on different hardware
+you will get different results. No warranties!
+
+But of course there is no such thing as a free lunch. The codesize almost
+tripled (from 562 bytes to 1434 bytes). Then again, it is not that much.
+
+
+Correcting errors
+=================
+
+For correcting errors I again used the ST application note as a starter,
+but I also peeked at the existing code.
+
+The algorithm itself is pretty straightforward. Just xor the given and
+the calculated ecc. If all bytes are 0 there is no problem. If 11 bits
+are 1 we have one correctable bit error. If there is 1 bit 1, we have an
+error in the given ecc code.
+
+It proved to be fastest to do some table lookups. Performance gain
+introduced by this is about a factor 2 on my system when a repair had to
+be done, and 1% or so if no repair had to be done.
+
+Code size increased from 330 bytes to 686 bytes for this function.
+(gcc 4.2, -O3)
+
+
+Conclusion
+==========
+
+The gain when calculating the ecc is tremendous. Om my development hardware
+a speedup of a factor of 18 for ecc calculation was achieved. On a test on an
+embedded system with a MIPS core a factor 7 was obtained.
+
+On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor
+5 (big endian mode, gcc 4.1.2, -O3)
+
+For correction not much gain could be obtained (as bitflips are rare). Then
+again there are also much less cycles spent there.
+
+It seems there is not much more gain possible in this, at least when
+programmed in C. Of course it might be possible to squeeze something more
+out of it with an assembler program, but due to pipeline behaviour etc
+this is very tricky (at least for intel hw).
+
+Author: Frans Meulenbroeks
+
+Copyright (C) 2008 Koninklijke Philips Electronics NV.
diff --git a/Documentation/driver-api/mtd/spi-nor.rst b/Documentation/driver-api/mtd/spi-nor.rst
new file mode 100644
index 000000000000..f5333e3bf486
--- /dev/null
+++ b/Documentation/driver-api/mtd/spi-nor.rst
@@ -0,0 +1,66 @@
+=================
+SPI NOR framework
+=================
+
+Part I - Why do we need this framework?
+---------------------------------------
+
+SPI bus controllers (drivers/spi/) only deal with streams of bytes; the bus
+controller operates agnostic of the specific device attached. However, some
+controllers (such as Freescale's QuadSPI controller) cannot easily handle
+arbitrary streams of bytes, but rather are designed specifically for SPI NOR.
+
+In particular, Freescale's QuadSPI controller must know the NOR commands to
+find the right LUT sequence. Unfortunately, the SPI subsystem has no notion of
+opcodes, addresses, or data payloads; a SPI controller simply knows to send or
+receive bytes (Tx and Rx). Therefore, we must define a new layering scheme under
+which the controller driver is aware of the opcodes, addressing, and other
+details of the SPI NOR protocol.
+
+Part II - How does the framework work?
+--------------------------------------
+
+This framework just adds a new layer between the MTD and the SPI bus driver.
+With this new layer, the SPI NOR controller driver does not depend on the
+m25p80 code anymore.
+
+Before this framework, the layer is like::
+
+ MTD
+ ------------------------
+ m25p80
+ ------------------------
+ SPI bus driver
+ ------------------------
+ SPI NOR chip
+
+ After this framework, the layer is like:
+ MTD
+ ------------------------
+ SPI NOR framework
+ ------------------------
+ m25p80
+ ------------------------
+ SPI bus driver
+ ------------------------
+ SPI NOR chip
+
+ With the SPI NOR controller driver (Freescale QuadSPI), it looks like:
+ MTD
+ ------------------------
+ SPI NOR framework
+ ------------------------
+ fsl-quadSPI
+ ------------------------
+ SPI NOR chip
+
+Part III - How can drivers use the framework?
+---------------------------------------------
+
+The main API is spi_nor_scan(). Before you call the hook, a driver should
+initialize the necessary fields for spi_nor{}. Please see
+drivers/mtd/spi-nor/spi-nor.c for detail. Please also refer to fsl-quadspi.c
+when you want to write a new driver for a SPI NOR controller.
+Another API is spi_nor_restore(), this is used to restore the status of SPI
+flash chip such as addressing mode. Call it whenever detach the driver from
+device or reboot the system.
diff --git a/Documentation/driver-api/nfc/index.rst b/Documentation/driver-api/nfc/index.rst
new file mode 100644
index 000000000000..b6e9eedbff29
--- /dev/null
+++ b/Documentation/driver-api/nfc/index.rst
@@ -0,0 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+Near Field Communication
+========================
+
+.. toctree::
+ :maxdepth: 1
+
+ nfc-hci
+ nfc-pn544
diff --git a/Documentation/driver-api/nfc/nfc-hci.rst b/Documentation/driver-api/nfc/nfc-hci.rst
new file mode 100644
index 000000000000..eb8a1a14e919
--- /dev/null
+++ b/Documentation/driver-api/nfc/nfc-hci.rst
@@ -0,0 +1,311 @@
+========================
+HCI backend for NFC Core
+========================
+
+- Author: Eric Lapuyade, Samuel Ortiz
+- Contact: eric.lapuyade@intel.com, samuel.ortiz@intel.com
+
+General
+-------
+
+The HCI layer implements much of the ETSI TS 102 622 V10.2.0 specification. It
+enables easy writing of HCI-based NFC drivers. The HCI layer runs as an NFC Core
+backend, implementing an abstract nfc device and translating NFC Core API
+to HCI commands and events.
+
+HCI
+---
+
+HCI registers as an nfc device with NFC Core. Requests coming from userspace are
+routed through netlink sockets to NFC Core and then to HCI. From this point,
+they are translated in a sequence of HCI commands sent to the HCI layer in the
+host controller (the chip). Commands can be executed synchronously (the sending
+context blocks waiting for response) or asynchronously (the response is returned
+from HCI Rx context).
+HCI events can also be received from the host controller. They will be handled
+and a translation will be forwarded to NFC Core as needed. There are hooks to
+let the HCI driver handle proprietary events or override standard behavior.
+HCI uses 2 execution contexts:
+
+- one for executing commands : nfc_hci_msg_tx_work(). Only one command
+ can be executing at any given moment.
+- one for dispatching received events and commands : nfc_hci_msg_rx_work().
+
+HCI Session initialization
+--------------------------
+
+The Session initialization is an HCI standard which must unfortunately
+support proprietary gates. This is the reason why the driver will pass a list
+of proprietary gates that must be part of the session. HCI will ensure all
+those gates have pipes connected when the hci device is set up.
+In case the chip supports pre-opened gates and pseudo-static pipes, the driver
+can pass that information to HCI core.
+
+HCI Gates and Pipes
+-------------------
+
+A gate defines the 'port' where some service can be found. In order to access
+a service, one must create a pipe to that gate and open it. In this
+implementation, pipes are totally hidden. The public API only knows gates.
+This is consistent with the driver need to send commands to proprietary gates
+without knowing the pipe connected to it.
+
+Driver interface
+----------------
+
+A driver is generally written in two parts : the physical link management and
+the HCI management. This makes it easier to maintain a driver for a chip that
+can be connected using various phy (i2c, spi, ...)
+
+HCI Management
+--------------
+
+A driver would normally register itself with HCI and provide the following
+entry points::
+
+ struct nfc_hci_ops {
+ int (*open)(struct nfc_hci_dev *hdev);
+ void (*close)(struct nfc_hci_dev *hdev);
+ int (*hci_ready) (struct nfc_hci_dev *hdev);
+ int (*xmit) (struct nfc_hci_dev *hdev, struct sk_buff *skb);
+ int (*start_poll) (struct nfc_hci_dev *hdev,
+ u32 im_protocols, u32 tm_protocols);
+ int (*dep_link_up)(struct nfc_hci_dev *hdev, struct nfc_target *target,
+ u8 comm_mode, u8 *gb, size_t gb_len);
+ int (*dep_link_down)(struct nfc_hci_dev *hdev);
+ int (*target_from_gate) (struct nfc_hci_dev *hdev, u8 gate,
+ struct nfc_target *target);
+ int (*complete_target_discovered) (struct nfc_hci_dev *hdev, u8 gate,
+ struct nfc_target *target);
+ int (*im_transceive) (struct nfc_hci_dev *hdev,
+ struct nfc_target *target, struct sk_buff *skb,
+ data_exchange_cb_t cb, void *cb_context);
+ int (*tm_send)(struct nfc_hci_dev *hdev, struct sk_buff *skb);
+ int (*check_presence)(struct nfc_hci_dev *hdev,
+ struct nfc_target *target);
+ int (*event_received)(struct nfc_hci_dev *hdev, u8 gate, u8 event,
+ struct sk_buff *skb);
+ };
+
+- open() and close() shall turn the hardware on and off.
+- hci_ready() is an optional entry point that is called right after the hci
+ session has been set up. The driver can use it to do additional initialization
+ that must be performed using HCI commands.
+- xmit() shall simply write a frame to the physical link.
+- start_poll() is an optional entrypoint that shall set the hardware in polling
+ mode. This must be implemented only if the hardware uses proprietary gates or a
+ mechanism slightly different from the HCI standard.
+- dep_link_up() is called after a p2p target has been detected, to finish
+ the p2p connection setup with hardware parameters that need to be passed back
+ to nfc core.
+- dep_link_down() is called to bring the p2p link down.
+- target_from_gate() is an optional entrypoint to return the nfc protocols
+ corresponding to a proprietary gate.
+- complete_target_discovered() is an optional entry point to let the driver
+ perform additional proprietary processing necessary to auto activate the
+ discovered target.
+- im_transceive() must be implemented by the driver if proprietary HCI commands
+ are required to send data to the tag. Some tag types will require custom
+ commands, others can be written to using the standard HCI commands. The driver
+ can check the tag type and either do proprietary processing, or return 1 to ask
+ for standard processing. The data exchange command itself must be sent
+ asynchronously.
+- tm_send() is called to send data in the case of a p2p connection
+- check_presence() is an optional entry point that will be called regularly
+ by the core to check that an activated tag is still in the field. If this is
+ not implemented, the core will not be able to push tag_lost events to the user
+ space
+- event_received() is called to handle an event coming from the chip. Driver
+ can handle the event or return 1 to let HCI attempt standard processing.
+
+On the rx path, the driver is responsible to push incoming HCP frames to HCI
+using nfc_hci_recv_frame(). HCI will take care of re-aggregation and handling
+This must be done from a context that can sleep.
+
+PHY Management
+--------------
+
+The physical link (i2c, ...) management is defined by the following structure::
+
+ struct nfc_phy_ops {
+ int (*write)(void *dev_id, struct sk_buff *skb);
+ int (*enable)(void *dev_id);
+ void (*disable)(void *dev_id);
+ };
+
+enable():
+ turn the phy on (power on), make it ready to transfer data
+disable():
+ turn the phy off
+write():
+ Send a data frame to the chip. Note that to enable higher
+ layers such as an llc to store the frame for re-emission, this
+ function must not alter the skb. It must also not return a positive
+ result (return 0 for success, negative for failure).
+
+Data coming from the chip shall be sent directly to nfc_hci_recv_frame().
+
+LLC
+---
+
+Communication between the CPU and the chip often requires some link layer
+protocol. Those are isolated as modules managed by the HCI layer. There are
+currently two modules : nop (raw transfert) and shdlc.
+A new llc must implement the following functions::
+
+ struct nfc_llc_ops {
+ void *(*init) (struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
+ rcv_to_hci_t rcv_to_hci, int tx_headroom,
+ int tx_tailroom, int *rx_headroom, int *rx_tailroom,
+ llc_failure_t llc_failure);
+ void (*deinit) (struct nfc_llc *llc);
+ int (*start) (struct nfc_llc *llc);
+ int (*stop) (struct nfc_llc *llc);
+ void (*rcv_from_drv) (struct nfc_llc *llc, struct sk_buff *skb);
+ int (*xmit_from_hci) (struct nfc_llc *llc, struct sk_buff *skb);
+ };
+
+init():
+ allocate and init your private storage
+deinit():
+ cleanup
+start():
+ establish the logical connection
+stop ():
+ terminate the logical connection
+rcv_from_drv():
+ handle data coming from the chip, going to HCI
+xmit_from_hci():
+ handle data sent by HCI, going to the chip
+
+The llc must be registered with nfc before it can be used. Do that by
+calling::
+
+ nfc_llc_register(const char *name, struct nfc_llc_ops *ops);
+
+Again, note that the llc does not handle the physical link. It is thus very
+easy to mix any physical link with any llc for a given chip driver.
+
+Included Drivers
+----------------
+
+An HCI based driver for an NXP PN544, connected through I2C bus, and using
+shdlc is included.
+
+Execution Contexts
+------------------
+
+The execution contexts are the following:
+- IRQ handler (IRQH):
+fast, cannot sleep. sends incoming frames to HCI where they are passed to
+the current llc. In case of shdlc, the frame is queued in shdlc rx queue.
+
+- SHDLC State Machine worker (SMW)
+
+ Only when llc_shdlc is used: handles shdlc rx & tx queues.
+
+ Dispatches HCI cmd responses.
+
+- HCI Tx Cmd worker (MSGTXWQ)
+
+ Serializes execution of HCI commands.
+
+ Completes execution in case of response timeout.
+
+- HCI Rx worker (MSGRXWQ)
+
+ Dispatches incoming HCI commands or events.
+
+- Syscall context from a userspace call (SYSCALL)
+
+ Any entrypoint in HCI called from NFC Core
+
+Workflow executing an HCI command (using shdlc)
+-----------------------------------------------
+
+Executing an HCI command can easily be performed synchronously using the
+following API::
+
+ int nfc_hci_send_cmd (struct nfc_hci_dev *hdev, u8 gate, u8 cmd,
+ const u8 *param, size_t param_len, struct sk_buff **skb)
+
+The API must be invoked from a context that can sleep. Most of the time, this
+will be the syscall context. skb will return the result that was received in
+the response.
+
+Internally, execution is asynchronous. So all this API does is to enqueue the
+HCI command, setup a local wait queue on stack, and wait_event() for completion.
+The wait is not interruptible because it is guaranteed that the command will
+complete after some short timeout anyway.
+
+MSGTXWQ context will then be scheduled and invoke nfc_hci_msg_tx_work().
+This function will dequeue the next pending command and send its HCP fragments
+to the lower layer which happens to be shdlc. It will then start a timer to be
+able to complete the command with a timeout error if no response arrive.
+
+SMW context gets scheduled and invokes nfc_shdlc_sm_work(). This function
+handles shdlc framing in and out. It uses the driver xmit to send frames and
+receives incoming frames in an skb queue filled from the driver IRQ handler.
+SHDLC I(nformation) frames payload are HCP fragments. They are aggregated to
+form complete HCI frames, which can be a response, command, or event.
+
+HCI Responses are dispatched immediately from this context to unblock
+waiting command execution. Response processing involves invoking the completion
+callback that was provided by nfc_hci_msg_tx_work() when it sent the command.
+The completion callback will then wake the syscall context.
+
+It is also possible to execute the command asynchronously using this API::
+
+ static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
+ const u8 *param, size_t param_len,
+ data_exchange_cb_t cb, void *cb_context)
+
+The workflow is the same, except that the API call returns immediately, and
+the callback will be called with the result from the SMW context.
+
+Workflow receiving an HCI event or command
+------------------------------------------
+
+HCI commands or events are not dispatched from SMW context. Instead, they are
+queued to HCI rx_queue and will be dispatched from HCI rx worker
+context (MSGRXWQ). This is done this way to allow a cmd or event handler
+to also execute other commands (for example, handling the
+NFC_HCI_EVT_TARGET_DISCOVERED event from PN544 requires to issue an
+ANY_GET_PARAMETER to the reader A gate to get information on the target
+that was discovered).
+
+Typically, such an event will be propagated to NFC Core from MSGRXWQ context.
+
+Error management
+----------------
+
+Errors that occur synchronously with the execution of an NFC Core request are
+simply returned as the execution result of the request. These are easy.
+
+Errors that occur asynchronously (e.g. in a background protocol handling thread)
+must be reported such that upper layers don't stay ignorant that something
+went wrong below and know that expected events will probably never happen.
+Handling of these errors is done as follows:
+
+- driver (pn544) fails to deliver an incoming frame: it stores the error such
+ that any subsequent call to the driver will result in this error. Then it
+ calls the standard nfc_shdlc_recv_frame() with a NULL argument to report the
+ problem above. shdlc stores a EREMOTEIO sticky status, which will trigger
+ SMW to report above in turn.
+
+- SMW is basically a background thread to handle incoming and outgoing shdlc
+ frames. This thread will also check the shdlc sticky status and report to HCI
+ when it discovers it is not able to run anymore because of an unrecoverable
+ error that happened within shdlc or below. If the problem occurs during shdlc
+ connection, the error is reported through the connect completion.
+
+- HCI: if an internal HCI error happens (frame is lost), or HCI is reported an
+ error from a lower layer, HCI will either complete the currently executing
+ command with that error, or notify NFC Core directly if no command is
+ executing.
+
+- NFC Core: when NFC Core is notified of an error from below and polling is
+ active, it will send a tag discovered event with an empty tag list to the user
+ space to let it know that the poll operation will never be able to detect a
+ tag. If polling is not active and the error was sticky, lower levels will
+ return it at next invocation.
diff --git a/Documentation/driver-api/nfc/nfc-pn544.rst b/Documentation/driver-api/nfc/nfc-pn544.rst
new file mode 100644
index 000000000000..6b2d8aae0c4e
--- /dev/null
+++ b/Documentation/driver-api/nfc/nfc-pn544.rst
@@ -0,0 +1,34 @@
+============================================================================
+Kernel driver for the NXP Semiconductors PN544 Near Field Communication chip
+============================================================================
+
+
+General
+-------
+
+The PN544 is an integrated transmission module for contactless
+communication. The driver goes under drives/nfc/ and is compiled as a
+module named "pn544".
+
+Host Interfaces: I2C, SPI and HSU, this driver supports currently only I2C.
+
+Protocols
+---------
+
+In the normal (HCI) mode and in the firmware update mode read and
+write functions behave a bit differently because the message formats
+or the protocols are different.
+
+In the normal (HCI) mode the protocol used is derived from the ETSI
+HCI specification. The firmware is updated using a specific protocol,
+which is different from HCI.
+
+HCI messages consist of an eight bit header and the message body. The
+header contains the message length. Maximum size for an HCI message is
+33. In HCI mode sent messages are tested for a correct
+checksum. Firmware update messages have the length in the second (MSB)
+and third (LSB) bytes of the message. The maximum FW message length is
+1024 bytes.
+
+For the ETSI HCI specification see
+http://www.etsi.org/WebSite/Technologies/ProtocolSpecification.aspx
diff --git a/Documentation/driver-api/ntb.rst b/Documentation/driver-api/ntb.rst
new file mode 100644
index 000000000000..87d1372da879
--- /dev/null
+++ b/Documentation/driver-api/ntb.rst
@@ -0,0 +1,263 @@
+===========
+NTB Drivers
+===========
+
+NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects
+the separate memory systems of two or more computers to the same PCI-Express
+fabric. Existing NTB hardware supports a common feature set: doorbell
+registers and memory translation windows, as well as non common features like
+scratchpad and message registers. Scratchpad registers are read-and-writable
+registers that are accessible from either side of the device, so that peers can
+exchange a small amount of information at a fixed address. Message registers can
+be utilized for the same purpose. Additionally they are provided with with
+special status bits to make sure the information isn't rewritten by another
+peer. Doorbell registers provide a way for peers to send interrupt events.
+Memory windows allow translated read and write access to the peer memory.
+
+NTB Core Driver (ntb)
+=====================
+
+The NTB core driver defines an api wrapping the common feature set, and allows
+clients interested in NTB features to discover NTB the devices supported by
+hardware drivers. The term "client" is used here to mean an upper layer
+component making use of the NTB api. The term "driver," or "hardware driver,"
+is used here to mean a driver for a specific vendor and model of NTB hardware.
+
+NTB Client Drivers
+==================
+
+NTB client drivers should register with the NTB core driver. After
+registering, the client probe and remove functions will be called appropriately
+as ntb hardware, or hardware drivers, are inserted and removed. The
+registration uses the Linux Device framework, so it should feel familiar to
+anyone who has written a pci driver.
+
+NTB Typical client driver implementation
+----------------------------------------
+
+Primary purpose of NTB is to share some peace of memory between at least two
+systems. So the NTB device features like Scratchpad/Message registers are
+mainly used to perform the proper memory window initialization. Typically
+there are two types of memory window interfaces supported by the NTB API:
+inbound translation configured on the local ntb port and outbound translation
+configured by the peer, on the peer ntb port. The first type is
+depicted on the next figure::
+
+ Inbound translation:
+
+ Memory: Local NTB Port: Peer NTB Port: Peer MMIO:
+ ____________
+ | dma-mapped |-ntb_mw_set_trans(addr) |
+ | memory | _v____________ | ______________
+ | (addr) |<======| MW xlat addr |<====| MW base addr |<== memory-mapped IO
+ |------------| |--------------| | |--------------|
+
+So typical scenario of the first type memory window initialization looks:
+1) allocate a memory region, 2) put translated address to NTB config,
+3) somehow notify a peer device of performed initialization, 4) peer device
+maps corresponding outbound memory window so to have access to the shared
+memory region.
+
+The second type of interface, that implies the shared windows being
+initialized by a peer device, is depicted on the figure::
+
+ Outbound translation:
+
+ Memory: Local NTB Port: Peer NTB Port: Peer MMIO:
+ ____________ ______________
+ | dma-mapped | | | MW base addr |<== memory-mapped IO
+ | memory | | |--------------|
+ | (addr) |<===================| MW xlat addr |<-ntb_peer_mw_set_trans(addr)
+ |------------| | |--------------|
+
+Typical scenario of the second type interface initialization would be:
+1) allocate a memory region, 2) somehow deliver a translated address to a peer
+device, 3) peer puts the translated address to NTB config, 4) peer device maps
+outbound memory window so to have access to the shared memory region.
+
+As one can see the described scenarios can be combined in one portable
+algorithm.
+
+ Local device:
+ 1) Allocate memory for a shared window
+ 2) Initialize memory window by translated address of the allocated region
+ (it may fail if local memory window initialization is unsupported)
+ 3) Send the translated address and memory window index to a peer device
+
+ Peer device:
+ 1) Initialize memory window with retrieved address of the allocated
+ by another device memory region (it may fail if peer memory window
+ initialization is unsupported)
+ 2) Map outbound memory window
+
+In accordance with this scenario, the NTB Memory Window API can be used as
+follows:
+
+ Local device:
+ 1) ntb_mw_count(pidx) - retrieve number of memory ranges, which can
+ be allocated for memory windows between local device and peer device
+ of port with specified index.
+ 2) ntb_get_align(pidx, midx) - retrieve parameters restricting the
+ shared memory region alignment and size. Then memory can be properly
+ allocated.
+ 3) Allocate physically contiguous memory region in compliance with
+ restrictions retrieved in 2).
+ 4) ntb_mw_set_trans(pidx, midx) - try to set translation address of
+ the memory window with specified index for the defined peer device
+ (it may fail if local translated address setting is not supported)
+ 5) Send translated base address (usually together with memory window
+ number) to the peer device using, for instance, scratchpad or message
+ registers.
+
+ Peer device:
+ 1) ntb_peer_mw_set_trans(pidx, midx) - try to set received from other
+ device (related to pidx) translated address for specified memory
+ window. It may fail if retrieved address, for instance, exceeds
+ maximum possible address or isn't properly aligned.
+ 2) ntb_peer_mw_get_addr(widx) - retrieve MMIO address to map the memory
+ window so to have an access to the shared memory.
+
+Also it is worth to note, that method ntb_mw_count(pidx) should return the
+same value as ntb_peer_mw_count() on the peer with port index - pidx.
+
+NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev)
+------------------------------------------------------------------
+
+The primary client for NTB is the Transport client, used in tandem with NTB
+Netdev. These drivers function together to create a logical link to the peer,
+across the ntb, to exchange packets of network data. The Transport client
+establishes a logical link to the peer, and creates queue pairs to exchange
+messages and data. The NTB Netdev then creates an ethernet device using a
+Transport queue pair. Network data is copied between socket buffers and the
+Transport queue pair buffer. The Transport client may be used for other things
+besides Netdev, however no other applications have yet been written.
+
+NTB Ping Pong Test Client (ntb\_pingpong)
+-----------------------------------------
+
+The Ping Pong test client serves as a demonstration to exercise the doorbell
+and scratchpad registers of NTB hardware, and as an example simple NTB client.
+Ping Pong enables the link when started, waits for the NTB link to come up, and
+then proceeds to read and write the doorbell scratchpad registers of the NTB.
+The peers interrupt each other using a bit mask of doorbell bits, which is
+shifted by one in each round, to test the behavior of multiple doorbell bits
+and interrupt vectors. The Ping Pong driver also reads the first local
+scratchpad, and writes the value plus one to the first peer scratchpad, each
+round before writing the peer doorbell register.
+
+Module Parameters:
+
+* unsafe - Some hardware has known issues with scratchpad and doorbell
+ registers. By default, Ping Pong will not attempt to exercise such
+ hardware. You may override this behavior at your own risk by setting
+ unsafe=1.
+* delay\_ms - Specify the delay between receiving a doorbell
+ interrupt event and setting the peer doorbell register for the next
+ round.
+* init\_db - Specify the doorbell bits to start new series of rounds. A new
+ series begins once all the doorbell bits have been shifted out of
+ range.
+* dyndbg - It is suggested to specify dyndbg=+p when loading this module, and
+ then to observe debugging output on the console.
+
+NTB Tool Test Client (ntb\_tool)
+--------------------------------
+
+The Tool test client serves for debugging, primarily, ntb hardware and drivers.
+The Tool provides access through debugfs for reading, setting, and clearing the
+NTB doorbell, and reading and writing scratchpads.
+
+The Tool does not currently have any module parameters.
+
+Debugfs Files:
+
+* *debugfs*/ntb\_tool/*hw*/
+ A directory in debugfs will be created for each
+ NTB device probed by the tool. This directory is shortened to *hw*
+ below.
+* *hw*/db
+ This file is used to read, set, and clear the local doorbell. Not
+ all operations may be supported by all hardware. To read the doorbell,
+ read the file. To set the doorbell, write `s` followed by the bits to
+ set (eg: `echo 's 0x0101' > db`). To clear the doorbell, write `c`
+ followed by the bits to clear.
+* *hw*/mask
+ This file is used to read, set, and clear the local doorbell mask.
+ See *db* for details.
+* *hw*/peer\_db
+ This file is used to read, set, and clear the peer doorbell.
+ See *db* for details.
+* *hw*/peer\_mask
+ This file is used to read, set, and clear the peer doorbell
+ mask. See *db* for details.
+* *hw*/spad
+ This file is used to read and write local scratchpads. To read
+ the values of all scratchpads, read the file. To write values, write a
+ series of pairs of scratchpad number and value
+ (eg: `echo '4 0x123 7 0xabc' > spad`
+ # to set scratchpads `4` and `7` to `0x123` and `0xabc`, respectively).
+* *hw*/peer\_spad
+ This file is used to read and write peer scratchpads. See
+ *spad* for details.
+
+NTB MSI Test Client (ntb\_msi\_test)
+------------------------------------
+
+The MSI test client serves to test and debug the MSI library which
+allows for passing MSI interrupts across NTB memory windows. The
+test client is interacted with through the debugfs filesystem:
+
+* *debugfs*/ntb\_tool/*hw*/
+ A directory in debugfs will be created for each
+ NTB device probed by the tool. This directory is shortened to *hw*
+ below.
+* *hw*/port
+ This file describes the local port number
+* *hw*/irq*_occurrences
+ One occurrences file exists for each interrupt and, when read,
+ returns the number of times the interrupt has been triggered.
+* *hw*/peer*/port
+ This file describes the port number for each peer
+* *hw*/peer*/count
+ This file describes the number of interrupts that can be
+ triggered on each peer
+* *hw*/peer*/trigger
+ Writing an interrupt number (any number less than the value
+ specified in count) will trigger the interrupt on the
+ specified peer. That peer's interrupt's occurrence file
+ should be incremented.
+
+NTB Hardware Drivers
+====================
+
+NTB hardware drivers should register devices with the NTB core driver. After
+registering, clients probe and remove functions will be called.
+
+NTB Intel Hardware Driver (ntb\_hw\_intel)
+------------------------------------------
+
+The Intel hardware driver supports NTB on Xeon and Atom CPUs.
+
+Module Parameters:
+
+* b2b\_mw\_idx
+ If the peer ntb is to be accessed via a memory window, then use
+ this memory window to access the peer ntb. A value of zero or positive
+ starts from the first mw idx, and a negative value starts from the last
+ mw idx. Both sides MUST set the same value here! The default value is
+ `-1`.
+* b2b\_mw\_share
+ If the peer ntb is to be accessed via a memory window, and if
+ the memory window is large enough, still allow the client to use the
+ second half of the memory window for address translation to the peer.
+* xeon\_b2b\_usd\_bar2\_addr64
+ If using B2B topology on Xeon hardware, use
+ this 64 bit address on the bus between the NTB devices for the window
+ at BAR2, on the upstream side of the link.
+* xeon\_b2b\_usd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_usd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_usd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar2\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
+* xeon\_b2b\_dsd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
diff --git a/Documentation/driver-api/nvdimm/btt.rst b/Documentation/driver-api/nvdimm/btt.rst
new file mode 100644
index 000000000000..107395c042ae
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/btt.rst
@@ -0,0 +1,285 @@
+=============================
+BTT - Block Translation Table
+=============================
+
+
+1. Introduction
+===============
+
+Persistent memory based storage is able to perform IO at byte (or more
+accurately, cache line) granularity. However, we often want to expose such
+storage as traditional block devices. The block drivers for persistent memory
+will do exactly this. However, they do not provide any atomicity guarantees.
+Traditional SSDs typically provide protection against torn sectors in hardware,
+using stored energy in capacitors to complete in-flight block writes, or perhaps
+in firmware. We don't have this luxury with persistent memory - if a write is in
+progress, and we experience a power failure, the block will contain a mix of old
+and new data. Applications may not be prepared to handle such a scenario.
+
+The Block Translation Table (BTT) provides atomic sector update semantics for
+persistent memory devices, so that applications that rely on sector writes not
+being torn can continue to do so. The BTT manifests itself as a stacked block
+device, and reserves a portion of the underlying storage for its metadata. At
+the heart of it, is an indirection table that re-maps all the blocks on the
+volume. It can be thought of as an extremely simple file system that only
+provides atomic sector updates.
+
+
+2. Static Layout
+================
+
+The underlying storage on which a BTT can be laid out is not limited in any way.
+The BTT, however, splits the available space into chunks of up to 512 GiB,
+called "Arenas".
+
+Each arena follows the same layout for its metadata, and all references in an
+arena are internal to it (with the exception of one field that points to the
+next arena). The following depicts the "On-disk" metadata layout::
+
+
+ Backing Store +-------> Arena
+ +---------------+ | +------------------+
+ | | | | Arena info block |
+ | Arena 0 +---+ | 4K |
+ | 512G | +------------------+
+ | | | |
+ +---------------+ | |
+ | | | |
+ | Arena 1 | | Data Blocks |
+ | 512G | | |
+ | | | |
+ +---------------+ | |
+ | . | | |
+ | . | | |
+ | . | | |
+ | | | |
+ | | | |
+ +---------------+ +------------------+
+ | |
+ | BTT Map |
+ | |
+ | |
+ +------------------+
+ | |
+ | BTT Flog |
+ | |
+ +------------------+
+ | Info block copy |
+ | 4K |
+ +------------------+
+
+
+3. Theory of Operation
+======================
+
+
+a. The BTT Map
+--------------
+
+The map is a simple lookup/indirection table that maps an LBA to an internal
+block. Each map entry is 32 bits. The two most significant bits are special
+flags, and the remaining form the internal block number.
+
+======== =============================================================
+Bit Description
+======== =============================================================
+31 - 30 Error and Zero flags - Used in the following way::
+
+ == == ====================================================
+ 31 30 Description
+ == == ====================================================
+ 0 0 Initial state. Reads return zeroes; Premap = Postmap
+ 0 1 Zero state: Reads return zeroes
+ 1 0 Error state: Reads fail; Writes clear 'E' bit
+ 1 1 Normal Block – has valid postmap
+ == == ====================================================
+
+29 - 0 Mappings to internal 'postmap' blocks
+======== =============================================================
+
+
+Some of the terminology that will be subsequently used:
+
+============ ================================================================
+External LBA LBA as made visible to upper layers.
+ABA Arena Block Address - Block offset/number within an arena
+Premap ABA The block offset into an arena, which was decided upon by range
+ checking the External LBA
+Postmap ABA The block number in the "Data Blocks" area obtained after
+ indirection from the map
+nfree The number of free blocks that are maintained at any given time.
+ This is the number of concurrent writes that can happen to the
+ arena.
+============ ================================================================
+
+
+For example, after adding a BTT, we surface a disk of 1024G. We get a read for
+the external LBA at 768G. This falls into the second arena, and of the 512G
+worth of blocks that this arena contributes, this block is at 256G. Thus, the
+premap ABA is 256G. We now refer to the map, and find out the mapping for block
+'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64.
+
+
+b. The BTT Flog
+---------------
+
+The BTT provides sector atomicity by making every write an "allocating write",
+i.e. Every write goes to a "free" block. A running list of free blocks is
+maintained in the form of the BTT flog. 'Flog' is a combination of the words
+"free list" and "log". The flog contains 'nfree' entries, and an entry contains:
+
+======== =====================================================================
+lba The premap ABA that is being written to
+old_map The old postmap ABA - after 'this' write completes, this will be a
+ free block.
+new_map The new postmap ABA. The map will up updated to reflect this
+ lba->postmap_aba mapping, but we log it here in case we have to
+ recover.
+seq Sequence number to mark which of the 2 sections of this flog entry is
+ valid/newest. It cycles between 01->10->11->01 (binary) under normal
+ operation, with 00 indicating an uninitialized state.
+lba' alternate lba entry
+old_map' alternate old postmap entry
+new_map' alternate new postmap entry
+seq' alternate sequence number.
+======== =====================================================================
+
+Each of the above fields is 32-bit, making one entry 32 bytes. Entries are also
+padded to 64 bytes to avoid cache line sharing or aliasing. Flog updates are
+done such that for any entry being written, it:
+a. overwrites the 'old' section in the entry based on sequence numbers
+b. writes the 'new' section such that the sequence number is written last.
+
+
+c. The concept of lanes
+-----------------------
+
+While 'nfree' describes the number of concurrent IOs an arena can process
+concurrently, 'nlanes' is the number of IOs the BTT device as a whole can
+process::
+
+ nlanes = min(nfree, num_cpus)
+
+A lane number is obtained at the start of any IO, and is used for indexing into
+all the on-disk and in-memory data structures for the duration of the IO. If
+there are more CPUs than the max number of available lanes, than lanes are
+protected by spinlocks.
+
+
+d. In-memory data structure: Read Tracking Table (RTT)
+------------------------------------------------------
+
+Consider a case where we have two threads, one doing reads and the other,
+writes. We can hit a condition where the writer thread grabs a free block to do
+a new IO, but the (slow) reader thread is still reading from it. In other words,
+the reader consulted a map entry, and started reading the corresponding block. A
+writer started writing to the same external LBA, and finished the write updating
+the map for that external LBA to point to its new postmap ABA. At this point the
+internal, postmap block that the reader is (still) reading has been inserted
+into the list of free blocks. If another write comes in for the same LBA, it can
+grab this free block, and start writing to it, causing the reader to read
+incorrect data. To prevent this, we introduce the RTT.
+
+The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts
+into rtt[lane_number], the postmap ABA it is reading, and clears it after the
+read is complete. Every writer thread, after grabbing a free block, checks the
+RTT for its presence. If the postmap free block is in the RTT, it waits till the
+reader clears the RTT entry, and only then starts writing to it.
+
+
+e. In-memory data structure: map locks
+--------------------------------------
+
+Consider a case where two writer threads are writing to the same LBA. There can
+be a race in the following sequence of steps::
+
+ free[lane] = map[premap_aba]
+ map[premap_aba] = postmap_aba
+
+Both threads can update their respective free[lane] with the same old, freed
+postmap_aba. This has made the layout inconsistent by losing a free entry, and
+at the same time, duplicating another free entry for two lanes.
+
+To solve this, we could have a single map lock (per arena) that has to be taken
+before performing the above sequence, but we feel that could be too contentious.
+Instead we use an array of (nfree) map_locks that is indexed by
+(premap_aba modulo nfree).
+
+
+f. Reconstruction from the Flog
+-------------------------------
+
+On startup, we analyze the BTT flog to create our list of free blocks. We walk
+through all the entries, and for each lane, of the set of two possible
+'sections', we always look at the most recent one only (based on the sequence
+number). The reconstruction rules/steps are simple:
+
+- Read map[log_entry.lba].
+- If log_entry.new matches the map entry, then log_entry.old is free.
+- If log_entry.new does not match the map entry, then log_entry.new is free.
+ (This case can only be caused by power-fails/unsafe shutdowns)
+
+
+g. Summarizing - Read and Write flows
+-------------------------------------
+
+Read:
+
+1. Convert external LBA to arena number + pre-map ABA
+2. Get a lane (and take lane_lock)
+3. Read map to get the entry for this pre-map ABA
+4. Enter post-map ABA into RTT[lane]
+5. If TRIM flag set in map, return zeroes, and end IO (go to step 8)
+6. If ERROR flag set in map, end IO with EIO (go to step 8)
+7. Read data from this block
+8. Remove post-map ABA entry from RTT[lane]
+9. Release lane (and lane_lock)
+
+Write:
+
+1. Convert external LBA to Arena number + pre-map ABA
+2. Get a lane (and take lane_lock)
+3. Use lane to index into in-memory free list and obtain a new block, next flog
+ index, next sequence number
+4. Scan the RTT to check if free block is present, and spin/wait if it is.
+5. Write data to this free block
+6. Read map to get the existing post-map ABA entry for this pre-map ABA
+7. Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num]
+8. Write new post-map ABA into map.
+9. Write old post-map entry into the free list
+10. Calculate next sequence number and write into the free list entry
+11. Release lane (and lane_lock)
+
+
+4. Error Handling
+=================
+
+An arena would be in an error state if any of the metadata is corrupted
+irrecoverably, either due to a bug or a media error. The following conditions
+indicate an error:
+
+- Info block checksum does not match (and recovering from the copy also fails)
+- All internal available blocks are not uniquely and entirely addressed by the
+ sum of mapped blocks and free blocks (from the BTT flog).
+- Rebuilding free list from the flog reveals missing/duplicate/impossible
+ entries
+- A map entry is out of bounds
+
+If any of these error conditions are encountered, the arena is put into a read
+only state using a flag in the info block.
+
+
+5. Usage
+========
+
+The BTT can be set up on any disk (namespace) exposed by the libnvdimm subsystem
+(pmem, or blk mode). The easiest way to set up such a namespace is using the
+'ndctl' utility [1]:
+
+For example, the ndctl command line to setup a btt with a 4k sector size is::
+
+ ndctl create-namespace -f -e namespace0.0 -m sector -l 4k
+
+See ndctl create-namespace --help for more options.
+
+[1]: https://github.com/pmem/ndctl
diff --git a/Documentation/driver-api/nvdimm/index.rst b/Documentation/driver-api/nvdimm/index.rst
new file mode 100644
index 000000000000..a4f8f98aeb94
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================================
+Non-Volatile Memory Device (NVDIMM)
+===================================
+
+.. toctree::
+ :maxdepth: 1
+
+ nvdimm
+ btt
+ security
diff --git a/Documentation/driver-api/nvdimm/nvdimm.rst b/Documentation/driver-api/nvdimm/nvdimm.rst
new file mode 100644
index 000000000000..08f855cbb4e6
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/nvdimm.rst
@@ -0,0 +1,887 @@
+===============================
+LIBNVDIMM: Non-Volatile Devices
+===============================
+
+libnvdimm - kernel / libndctl - userspace helper library
+
+linux-nvdimm@lists.01.org
+
+Version 13
+
+.. contents:
+
+ Glossary
+ Overview
+ Supporting Documents
+ Git Trees
+ LIBNVDIMM PMEM and BLK
+ Why BLK?
+ PMEM vs BLK
+ BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
+ Example NVDIMM Platform
+ LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
+ LIBNDCTL: Context
+ libndctl: instantiate a new library context example
+ LIBNVDIMM/LIBNDCTL: Bus
+ libnvdimm: control class device in /sys/class
+ libnvdimm: bus
+ libndctl: bus enumeration example
+ LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
+ libnvdimm: DIMM (NMEM)
+ libndctl: DIMM enumeration example
+ LIBNVDIMM/LIBNDCTL: Region
+ libnvdimm: region
+ libndctl: region enumeration example
+ Why Not Encode the Region Type into the Region Name?
+ How Do I Determine the Major Type of a Region?
+ LIBNVDIMM/LIBNDCTL: Namespace
+ libnvdimm: namespace
+ libndctl: namespace enumeration example
+ libndctl: namespace creation example
+ Why the Term "namespace"?
+ LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
+ libnvdimm: btt layout
+ libndctl: btt creation example
+ Summary LIBNDCTL Diagram
+
+
+Glossary
+========
+
+PMEM:
+ A system-physical-address range where writes are persistent. A
+ block device composed of PMEM is capable of DAX. A PMEM address range
+ may span an interleave of several DIMMs.
+
+BLK:
+ A set of one or more programmable memory mapped apertures provided
+ by a DIMM to access its media. This indirection precludes the
+ performance benefit of interleaving, but enables DIMM-bounded failure
+ modes.
+
+DPA:
+ DIMM Physical Address, is a DIMM-relative offset. With one DIMM in
+ the system there would be a 1:1 system-physical-address:DPA association.
+ Once more DIMMs are added a memory controller interleave must be
+ decoded to determine the DPA associated with a given
+ system-physical-address. BLK capacity always has a 1:1 relationship
+ with a single-DIMM's DPA range.
+
+DAX:
+ File system extensions to bypass the page cache and block layer to
+ mmap persistent memory, from a PMEM block device, directly into a
+ process address space.
+
+DSM:
+ Device Specific Method: ACPI method to to control specific
+ device - in this case the firmware.
+
+DCR:
+ NVDIMM Control Region Structure defined in ACPI 6 Section 5.2.25.5.
+ It defines a vendor-id, device-id, and interface format for a given DIMM.
+
+BTT:
+ Block Translation Table: Persistent memory is byte addressable.
+ Existing software may have an expectation that the power-fail-atomicity
+ of writes is at least one sector, 512 bytes. The BTT is an indirection
+ table with atomic update semantics to front a PMEM/BLK block device
+ driver and present arbitrary atomic sector sizes.
+
+LABEL:
+ Metadata stored on a DIMM device that partitions and identifies
+ (persistently names) storage between PMEM and BLK. It also partitions
+ BLK storage to host BTTs with different parameters per BLK-partition.
+ Note that traditional partition tables, GPT/MBR, are layered on top of a
+ BLK or PMEM device.
+
+
+Overview
+========
+
+The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely,
+PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM
+and BLK mode access. These three modes of operation are described by
+the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6. While the LIBNVDIMM
+implementation is generic and supports pre-NFIT platforms, it was guided
+by the superset of capabilities need to support this ACPI 6 definition
+for NVDIMM resources. The bulk of the kernel implementation is in place
+to handle the case where DPA accessible via PMEM is aliased with DPA
+accessible via BLK. When that occurs a LABEL is needed to reserve DPA
+for exclusive access via one mode a time.
+
+Supporting Documents
+--------------------
+
+ACPI 6:
+ http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
+NVDIMM Namespace:
+ http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
+DSM Interface Example:
+ http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
+Driver Writer's Guide:
+ http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf
+
+Git Trees
+---------
+
+LIBNVDIMM:
+ https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git
+LIBNDCTL:
+ https://github.com/pmem/ndctl.git
+PMEM:
+ https://github.com/01org/prd
+
+
+LIBNVDIMM PMEM and BLK
+======================
+
+Prior to the arrival of the NFIT, non-volatile memory was described to a
+system in various ad-hoc ways. Usually only the bare minimum was
+provided, namely, a single system-physical-address range where writes
+are expected to be durable after a system power loss. Now, the NFIT
+specification standardizes not only the description of PMEM, but also
+BLK and platform message-passing entry points for control and
+configuration.
+
+For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block
+device driver:
+
+ 1. PMEM (nd_pmem.ko): Drives a system-physical-address range. This
+ range is contiguous in system memory and may be interleaved (hardware
+ memory controller striped) across multiple DIMMs. When interleaved the
+ platform may optionally provide details of which DIMMs are participating
+ in the interleave.
+
+ Note that while LIBNVDIMM describes system-physical-address ranges that may
+ alias with BLK access as ND_NAMESPACE_PMEM ranges and those without
+ alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no
+ distinction. The different device-types are an implementation detail
+ that userspace can exploit to implement policies like "only interface
+ with address ranges from certain DIMMs". It is worth noting that when
+ aliasing is present and a DIMM lacks a label, then no block device can
+ be created by default as userspace needs to do at least one allocation
+ of DPA to the PMEM range. In contrast ND_NAMESPACE_IO ranges, once
+ registered, can be immediately attached to nd_pmem.
+
+ 2. BLK (nd_blk.ko): This driver performs I/O using a set of platform
+ defined apertures. A set of apertures will access just one DIMM.
+ Multiple windows (apertures) allow multiple concurrent accesses, much like
+ tagged-command-queuing, and would likely be used by different threads or
+ different CPUs.
+
+ The NFIT specification defines a standard format for a BLK-aperture, but
+ the spec also allows for vendor specific layouts, and non-NFIT BLK
+ implementations may have other designs for BLK I/O. For this reason
+ "nd_blk" calls back into platform-specific code to perform the I/O.
+
+ One such implementation is defined in the "Driver Writer's Guide" and "DSM
+ Interface Example".
+
+
+Why BLK?
+========
+
+While PMEM provides direct byte-addressable CPU-load/store access to
+NVDIMM storage, it does not provide the best system RAS (recovery,
+availability, and serviceability) model. An access to a corrupted
+system-physical-address address causes a CPU exception while an access
+to a corrupted address through an BLK-aperture causes that block window
+to raise an error status in a register. The latter is more aligned with
+the standard error model that host-bus-adapter attached disks present.
+
+Also, if an administrator ever wants to replace a memory it is easier to
+service a system at DIMM module boundaries. Compare this to PMEM where
+data could be interleaved in an opaque hardware specific manner across
+several DIMMs.
+
+PMEM vs BLK
+-----------
+
+BLK-apertures solve these RAS problems, but their presence is also the
+major contributing factor to the complexity of the ND subsystem. They
+complicate the implementation because PMEM and BLK alias in DPA space.
+Any given DIMM's DPA-range may contribute to one or more
+system-physical-address sets of interleaved DIMMs, *and* may also be
+accessed in its entirety through its BLK-aperture. Accessing a DPA
+through a system-physical-address while simultaneously accessing the
+same DPA through a BLK-aperture has undefined results. For this reason,
+DIMMs with this dual interface configuration include a DSM function to
+store/retrieve a LABEL. The LABEL effectively partitions the DPA-space
+into exclusive system-physical-address and BLK-aperture accessible
+regions. For simplicity a DIMM is allowed a PMEM "region" per each
+interleave set in which it is a member. The remaining DPA space can be
+carved into an arbitrary number of BLK devices with discontiguous
+extents.
+
+BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One of the few
+reasons to allow multiple BLK namespaces per REGION is so that each
+BLK-namespace can be configured with a BTT with unique atomic sector
+sizes. While a PMEM device can host a BTT the LABEL specification does
+not provide for a sector size to be specified for a PMEM namespace.
+
+This is due to the expectation that the primary usage model for PMEM is
+via DAX, and the BTT is incompatible with DAX. However, for the cases
+where an application or filesystem still needs atomic sector update
+guarantees it can register a BTT on a PMEM device or partition. See
+LIBNVDIMM/NDCTL: Block Translation Table "btt"
+
+
+Example NVDIMM Platform
+=======================
+
+For the remainder of this document the following diagram will be
+referenced for any example sysfs layouts::
+
+
+ (a) (b) DIMM BLK-REGION
+ +-------------------+--------+--------+--------+
+ +------+ | pm0.0 | blk2.0 | pm1.0 | blk2.1 | 0 region2
+ | imc0 +--+- - - region0- - - +--------+ +--------+
+ +--+---+ | pm0.0 | blk3.0 | pm1.0 | blk3.1 | 1 region3
+ | +-------------------+--------v v--------+
+ +--+---+ | |
+ | cpu0 | region1
+ +--+---+ | |
+ | +----------------------------^ ^--------+
+ +--+---+ | blk4.0 | pm1.0 | blk4.0 | 2 region4
+ | imc1 +--+----------------------------| +--------+
+ +------+ | blk5.0 | pm1.0 | blk5.0 | 3 region5
+ +----------------------------+--------+--------+
+
+In this platform we have four DIMMs and two memory controllers in one
+socket. Each unique interface (BLK or PMEM) to DPA space is identified
+by a region device with a dynamically assigned id (REGION0 - REGION5).
+
+ 1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A
+ single PMEM namespace is created in the REGION0-SPA-range that spans most
+ of DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that
+ interleaved system-physical-address range is reclaimed as BLK-aperture
+ accessed space starting at DPA-offset (a) into each DIMM. In that
+ reclaimed space we create two BLK-aperture "namespaces" from REGION2 and
+ REGION3 where "blk2.0" and "blk3.0" are just human readable names that
+ could be set to any user-desired name in the LABEL.
+
+ 2. In the last portion of DIMM0 and DIMM1 we have an interleaved
+ system-physical-address range, REGION1, that spans those two DIMMs as
+ well as DIMM2 and DIMM3. Some of REGION1 is allocated to a PMEM namespace
+ named "pm1.0", the rest is reclaimed in 4 BLK-aperture namespaces (for
+ each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
+ "blk5.0".
+
+ 3. The portion of DIMM2 and DIMM3 that do not participate in the REGION1
+ interleaved system-physical-address range (i.e. the DPA address past
+ offset (b) are also included in the "blk4.0" and "blk5.0" namespaces.
+ Note, that this example shows that BLK-aperture namespaces don't need to
+ be contiguous in DPA-space.
+
+ This bus is provided by the kernel under the device
+ /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and
+ the nfit_test.ko module is loaded. This not only test LIBNVDIMM but the
+ acpi_nfit.ko driver as well.
+
+
+LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
+========================================================
+
+What follows is a description of the LIBNVDIMM sysfs layout and a
+corresponding object hierarchy diagram as viewed through the LIBNDCTL
+API. The example sysfs paths and diagrams are relative to the Example
+NVDIMM Platform which is also the LIBNVDIMM bus used in the LIBNDCTL unit
+test.
+
+LIBNDCTL: Context
+-----------------
+
+Every API call in the LIBNDCTL library requires a context that holds the
+logging parameters and other library instance state. The library is
+based on the libabc template:
+
+ https://git.kernel.org/cgit/linux/kernel/git/kay/libabc.git
+
+LIBNDCTL: instantiate a new library context example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+ struct ndctl_ctx *ctx;
+
+ if (ndctl_new(&ctx) == 0)
+ return ctx;
+ else
+ return NULL;
+
+LIBNVDIMM/LIBNDCTL: Bus
+-----------------------
+
+A bus has a 1:1 relationship with an NFIT. The current expectation for
+ACPI based systems is that there is only ever one platform-global NFIT.
+That said, it is trivial to register multiple NFITs, the specification
+does not preclude it. The infrastructure supports multiple busses and
+we use this capability to test multiple NFIT configurations in the unit
+test.
+
+LIBNVDIMM: control class device in /sys/class
+---------------------------------------------
+
+This character device accepts DSM messages to be passed to DIMM
+identified by its NFIT handle::
+
+ /sys/class/nd/ndctl0
+ |-- dev
+ |-- device -> ../../../ndbus0
+ |-- subsystem -> ../../../../../../../class/nd
+
+
+
+LIBNVDIMM: bus
+--------------
+
+::
+
+ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
+ struct nvdimm_bus_descriptor *nfit_desc);
+
+::
+
+ /sys/devices/platform/nfit_test.0/ndbus0
+ |-- commands
+ |-- nd
+ |-- nfit
+ |-- nmem0
+ |-- nmem1
+ |-- nmem2
+ |-- nmem3
+ |-- power
+ |-- provider
+ |-- region0
+ |-- region1
+ |-- region2
+ |-- region3
+ |-- region4
+ |-- region5
+ |-- uevent
+ `-- wait_probe
+
+LIBNDCTL: bus enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Find the bus handle that describes the bus from Example NVDIMM Platform::
+
+ static struct ndctl_bus *get_bus_by_provider(struct ndctl_ctx *ctx,
+ const char *provider)
+ {
+ struct ndctl_bus *bus;
+
+ ndctl_bus_foreach(ctx, bus)
+ if (strcmp(provider, ndctl_bus_get_provider(bus)) == 0)
+ return bus;
+
+ return NULL;
+ }
+
+ bus = get_bus_by_provider(ctx, "nfit_test.0");
+
+
+LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
+-------------------------------
+
+The DIMM device provides a character device for sending commands to
+hardware, and it is a container for LABELs. If the DIMM is defined by
+NFIT then an optional 'nfit' attribute sub-directory is available to add
+NFIT-specifics.
+
+Note that the kernel device name for "DIMMs" is "nmemX". The NFIT
+describes these devices via "Memory Device to System Physical Address
+Range Mapping Structure", and there is no requirement that they actually
+be physical DIMMs, so we use a more generic name.
+
+LIBNVDIMM: DIMM (NMEM)
+^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+ struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
+ const struct attribute_group **groups, unsigned long flags,
+ unsigned long *dsm_mask);
+
+::
+
+ /sys/devices/platform/nfit_test.0/ndbus0
+ |-- nmem0
+ | |-- available_slots
+ | |-- commands
+ | |-- dev
+ | |-- devtype
+ | |-- driver -> ../../../../../bus/nd/drivers/nvdimm
+ | |-- modalias
+ | |-- nfit
+ | | |-- device
+ | | |-- format
+ | | |-- handle
+ | | |-- phys_id
+ | | |-- rev_id
+ | | |-- serial
+ | | `-- vendor
+ | |-- state
+ | |-- subsystem -> ../../../../../bus/nd
+ | `-- uevent
+ |-- nmem1
+ [..]
+
+
+LIBNDCTL: DIMM enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Note, in this example we are assuming NFIT-defined DIMMs which are
+identified by an "nfit_handle" a 32-bit value where:
+
+ - Bit 3:0 DIMM number within the memory channel
+ - Bit 7:4 memory channel number
+ - Bit 11:8 memory controller ID
+ - Bit 15:12 socket ID (within scope of a Node controller if node
+ controller is present)
+ - Bit 27:16 Node Controller ID
+ - Bit 31:28 Reserved
+
+::
+
+ static struct ndctl_dimm *get_dimm_by_handle(struct ndctl_bus *bus,
+ unsigned int handle)
+ {
+ struct ndctl_dimm *dimm;
+
+ ndctl_dimm_foreach(bus, dimm)
+ if (ndctl_dimm_get_handle(dimm) == handle)
+ return dimm;
+
+ return NULL;
+ }
+
+ #define DIMM_HANDLE(n, s, i, c, d) \
+ (((n & 0xfff) << 16) | ((s & 0xf) << 12) | ((i & 0xf) << 8) \
+ | ((c & 0xf) << 4) | (d & 0xf))
+
+ dimm = get_dimm_by_handle(bus, DIMM_HANDLE(0, 0, 0, 0, 0));
+
+LIBNVDIMM/LIBNDCTL: Region
+--------------------------
+
+A generic REGION device is registered for each PMEM range or BLK-aperture
+set. Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture
+sets on the "nfit_test.0" bus. The primary role of regions are to be a
+container of "mappings". A mapping is a tuple of <DIMM,
+DPA-start-offset, length>.
+
+LIBNVDIMM provides a built-in driver for these REGION devices. This driver
+is responsible for reconciling the aliased DPA mappings across all
+regions, parsing the LABEL, if present, and then emitting NAMESPACE
+devices with the resolved/exclusive DPA-boundaries for the nd_pmem or
+nd_blk device driver to consume.
+
+In addition to the generic attributes of "mapping"s, "interleave_ways"
+and "size" the REGION device also exports some convenience attributes.
+"nstype" indicates the integer type of namespace-device this region
+emits, "devtype" duplicates the DEVTYPE variable stored by udev at the
+'add' event, "modalias" duplicates the MODALIAS variable stored by udev
+at the 'add' event, and finally, the optional "spa_index" is provided in
+the case where the region is defined by a SPA.
+
+LIBNVDIMM: region::
+
+ struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc);
+ struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
+ struct nd_region_desc *ndr_desc);
+
+::
+
+ /sys/devices/platform/nfit_test.0/ndbus0
+ |-- region0
+ | |-- available_size
+ | |-- btt0
+ | |-- btt_seed
+ | |-- devtype
+ | |-- driver -> ../../../../../bus/nd/drivers/nd_region
+ | |-- init_namespaces
+ | |-- mapping0
+ | |-- mapping1
+ | |-- mappings
+ | |-- modalias
+ | |-- namespace0.0
+ | |-- namespace_seed
+ | |-- numa_node
+ | |-- nfit
+ | | `-- spa_index
+ | |-- nstype
+ | |-- set_cookie
+ | |-- size
+ | |-- subsystem -> ../../../../../bus/nd
+ | `-- uevent
+ |-- region1
+ [..]
+
+LIBNDCTL: region enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Sample region retrieval routines based on NFIT-unique data like
+"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for
+BLK::
+
+ static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus,
+ unsigned int spa_index)
+ {
+ struct ndctl_region *region;
+
+ ndctl_region_foreach(bus, region) {
+ if (ndctl_region_get_type(region) != ND_DEVICE_REGION_PMEM)
+ continue;
+ if (ndctl_region_get_spa_index(region) == spa_index)
+ return region;
+ }
+ return NULL;
+ }
+
+ static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus,
+ unsigned int handle)
+ {
+ struct ndctl_region *region;
+
+ ndctl_region_foreach(bus, region) {
+ struct ndctl_mapping *map;
+
+ if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK)
+ continue;
+ ndctl_mapping_foreach(region, map) {
+ struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map);
+
+ if (ndctl_dimm_get_handle(dimm) == handle)
+ return region;
+ }
+ }
+ return NULL;
+ }
+
+
+Why Not Encode the Region Type into the Region Name?
+----------------------------------------------------
+
+At first glance it seems since NFIT defines just PMEM and BLK interface
+types that we should simply name REGION devices with something derived
+from those type names. However, the ND subsystem explicitly keeps the
+REGION name generic and expects userspace to always consider the
+region-attributes for four reasons:
+
+ 1. There are already more than two REGION and "namespace" types. For
+ PMEM there are two subtypes. As mentioned previously we have PMEM where
+ the constituent DIMM devices are known and anonymous PMEM. For BLK
+ regions the NFIT specification already anticipates vendor specific
+ implementations. The exact distinction of what a region contains is in
+ the region-attributes not the region-name or the region-devtype.
+
+ 2. A region with zero child-namespaces is a possible configuration. For
+ example, the NFIT allows for a DCR to be published without a
+ corresponding BLK-aperture. This equates to a DIMM that can only accept
+ control/configuration messages, but no i/o through a descendant block
+ device. Again, this "type" is advertised in the attributes ('mappings'
+ == 0) and the name does not tell you much.
+
+ 3. What if a third major interface type arises in the future? Outside
+ of vendor specific implementations, it's not difficult to envision a
+ third class of interface type beyond BLK and PMEM. With a generic name
+ for the REGION level of the device-hierarchy old userspace
+ implementations can still make sense of new kernel advertised
+ region-types. Userspace can always rely on the generic region
+ attributes like "mappings", "size", etc and the expected child devices
+ named "namespace". This generic format of the device-model hierarchy
+ allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and
+ future-proof.
+
+ 4. There are more robust mechanisms for determining the major type of a
+ region than a device name. See the next section, How Do I Determine the
+ Major Type of a Region?
+
+How Do I Determine the Major Type of a Region?
+----------------------------------------------
+
+Outside of the blanket recommendation of "use libndctl", or simply
+looking at the kernel header (/usr/include/linux/ndctl.h) to decode the
+"nstype" integer attribute, here are some other options.
+
+1. module alias lookup
+^^^^^^^^^^^^^^^^^^^^^^
+
+ The whole point of region/namespace device type differentiation is to
+ decide which block-device driver will attach to a given LIBNVDIMM namespace.
+ One can simply use the modalias to lookup the resulting module. It's
+ important to note that this method is robust in the presence of a
+ vendor-specific driver down the road. If a vendor-specific
+ implementation wants to supplant the standard nd_blk driver it can with
+ minimal impact to the rest of LIBNVDIMM.
+
+ In fact, a vendor may also want to have a vendor-specific region-driver
+ (outside of nd_region). For example, if a vendor defined its own LABEL
+ format it would need its own region driver to parse that LABEL and emit
+ the resulting namespaces. The output from module resolution is more
+ accurate than a region-name or region-devtype.
+
+2. udev
+^^^^^^^
+
+ The kernel "devtype" is registered in the udev database::
+
+ # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0
+ P: /devices/platform/nfit_test.0/ndbus0/region0
+ E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0
+ E: DEVTYPE=nd_pmem
+ E: MODALIAS=nd:t2
+ E: SUBSYSTEM=nd
+
+ # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4
+ P: /devices/platform/nfit_test.0/ndbus0/region4
+ E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4
+ E: DEVTYPE=nd_blk
+ E: MODALIAS=nd:t3
+ E: SUBSYSTEM=nd
+
+ ...and is available as a region attribute, but keep in mind that the
+ "devtype" does not indicate sub-type variations and scripts should
+ really be understanding the other attributes.
+
+3. type specific attributes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ As it currently stands a BLK-aperture region will never have a
+ "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region. A
+ BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM
+ that does not allow I/O. A PMEM region with a "mappings" value of zero
+ is a simple system-physical-address range.
+
+
+LIBNVDIMM/LIBNDCTL: Namespace
+-----------------------------
+
+A REGION, after resolving DPA aliasing and LABEL specified boundaries,
+surfaces one or more "namespace" devices. The arrival of a "namespace"
+device currently triggers either the nd_blk or nd_pmem driver to load
+and register a disk/block device.
+
+LIBNVDIMM: namespace
+^^^^^^^^^^^^^^^^^^^^
+
+Here is a sample layout from the three major types of NAMESPACE where
+namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid'
+attribute), namespace2.0 represents a BLK namespace (note it has a
+'sector_size' attribute) that, and namespace6.0 represents an anonymous
+PMEM namespace (note that has no 'uuid' attribute due to not support a
+LABEL)::
+
+ /sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0
+ |-- alt_name
+ |-- devtype
+ |-- dpa_extents
+ |-- force_raw
+ |-- modalias
+ |-- numa_node
+ |-- resource
+ |-- size
+ |-- subsystem -> ../../../../../../bus/nd
+ |-- type
+ |-- uevent
+ `-- uuid
+ /sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0
+ |-- alt_name
+ |-- devtype
+ |-- dpa_extents
+ |-- force_raw
+ |-- modalias
+ |-- numa_node
+ |-- sector_size
+ |-- size
+ |-- subsystem -> ../../../../../../bus/nd
+ |-- type
+ |-- uevent
+ `-- uuid
+ /sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0
+ |-- block
+ | `-- pmem0
+ |-- devtype
+ |-- driver -> ../../../../../../bus/nd/drivers/pmem
+ |-- force_raw
+ |-- modalias
+ |-- numa_node
+ |-- resource
+ |-- size
+ |-- subsystem -> ../../../../../../bus/nd
+ |-- type
+ `-- uevent
+
+LIBNDCTL: namespace enumeration example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Namespaces are indexed relative to their parent region, example below.
+These indexes are mostly static from boot to boot, but subsystem makes
+no guarantees in this regard. For a static namespace identifier use its
+'uuid' attribute.
+
+::
+
+ static struct ndctl_namespace
+ *get_namespace_by_id(struct ndctl_region *region, unsigned int id)
+ {
+ struct ndctl_namespace *ndns;
+
+ ndctl_namespace_foreach(region, ndns)
+ if (ndctl_namespace_get_id(ndns) == id)
+ return ndns;
+
+ return NULL;
+ }
+
+LIBNDCTL: namespace creation example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Idle namespaces are automatically created by the kernel if a given
+region has enough available capacity to create a new namespace.
+Namespace instantiation involves finding an idle namespace and
+configuring it. For the most part the setting of namespace attributes
+can occur in any order, the only constraint is that 'uuid' must be set
+before 'size'. This enables the kernel to track DPA allocations
+internally with a static identifier::
+
+ static int configure_namespace(struct ndctl_region *region,
+ struct ndctl_namespace *ndns,
+ struct namespace_parameters *parameters)
+ {
+ char devname[50];
+
+ snprintf(devname, sizeof(devname), "namespace%d.%d",
+ ndctl_region_get_id(region), paramaters->id);
+
+ ndctl_namespace_set_alt_name(ndns, devname);
+ /* 'uuid' must be set prior to setting size! */
+ ndctl_namespace_set_uuid(ndns, paramaters->uuid);
+ ndctl_namespace_set_size(ndns, paramaters->size);
+ /* unlike pmem namespaces, blk namespaces have a sector size */
+ if (parameters->lbasize)
+ ndctl_namespace_set_sector_size(ndns, parameters->lbasize);
+ ndctl_namespace_enable(ndns);
+ }
+
+
+Why the Term "namespace"?
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ 1. Why not "volume" for instance? "volume" ran the risk of confusing
+ ND (libnvdimm subsystem) to a volume manager like device-mapper.
+
+ 2. The term originated to describe the sub-devices that can be created
+ within a NVME controller (see the nvme specification:
+ http://www.nvmexpress.org/specifications/), and NFIT namespaces are
+ meant to parallel the capabilities and configurability of
+ NVME-namespaces.
+
+
+LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
+-------------------------------------------------
+
+A BTT (design document: http://pmem.io/2014/09/23/btt.html) is a stacked
+block device driver that fronts either the whole block device or a
+partition of a block device emitted by either a PMEM or BLK NAMESPACE.
+
+LIBNVDIMM: btt layout
+^^^^^^^^^^^^^^^^^^^^^
+
+Every region will start out with at least one BTT device which is the
+seed device. To activate it set the "namespace", "uuid", and
+"sector_size" attributes and then bind the device to the nd_pmem or
+nd_blk driver depending on the region type::
+
+ /sys/devices/platform/nfit_test.1/ndbus0/region0/btt0/
+ |-- namespace
+ |-- delete
+ |-- devtype
+ |-- modalias
+ |-- numa_node
+ |-- sector_size
+ |-- subsystem -> ../../../../../bus/nd
+ |-- uevent
+ `-- uuid
+
+LIBNDCTL: btt creation example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Similar to namespaces an idle BTT device is automatically created per
+region. Each time this "seed" btt device is configured and enabled a new
+seed is created. Creating a BTT configuration involves two steps of
+finding and idle BTT and assigning it to consume a PMEM or BLK namespace::
+
+ static struct ndctl_btt *get_idle_btt(struct ndctl_region *region)
+ {
+ struct ndctl_btt *btt;
+
+ ndctl_btt_foreach(region, btt)
+ if (!ndctl_btt_is_enabled(btt)
+ && !ndctl_btt_is_configured(btt))
+ return btt;
+
+ return NULL;
+ }
+
+ static int configure_btt(struct ndctl_region *region,
+ struct btt_parameters *parameters)
+ {
+ btt = get_idle_btt(region);
+
+ ndctl_btt_set_uuid(btt, parameters->uuid);
+ ndctl_btt_set_sector_size(btt, parameters->sector_size);
+ ndctl_btt_set_namespace(btt, parameters->ndns);
+ /* turn off raw mode device */
+ ndctl_namespace_disable(parameters->ndns);
+ /* turn on btt access */
+ ndctl_btt_enable(btt);
+ }
+
+Once instantiated a new inactive btt seed device will appear underneath
+the region.
+
+Once a "namespace" is removed from a BTT that instance of the BTT device
+will be deleted or otherwise reset to default values. This deletion is
+only at the device model level. In order to destroy a BTT the "info
+block" needs to be destroyed. Note, that to destroy a BTT the media
+needs to be written in raw mode. By default, the kernel will autodetect
+the presence of a BTT and disable raw mode. This autodetect behavior
+can be suppressed by enabling raw mode for the namespace via the
+ndctl_namespace_set_raw_mode() API.
+
+
+Summary LIBNDCTL Diagram
+------------------------
+
+For the given example above, here is the view of the objects as seen by the
+LIBNDCTL API::
+
+ +---+
+ |CTX| +---------+ +--------------+ +---------------+
+ +-+-+ +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
+ | | +---------+ +--------------+ +---------------+
+ +-------+ | | +---------+ +--------------+ +---------------+
+ | DIMM0 <-+ | +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" |
+ +-------+ | | | +---------+ +--------------+ +---------------+
+ | DIMM1 <-+ +-v--+ | +---------+ +--------------+ +---------------+
+ +-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6 "blk2.0" |
+ | DIMM2 <-+ +----+ | +---------+ | +--------------+ +----------------------+
+ +-------+ | | +-> NAMESPACE2.1 +--> ND5 "blk2.1" | BTT2 |
+ | DIMM3 <-+ | +--------------+ +----------------------+
+ +-------+ | +---------+ +--------------+ +---------------+
+ +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4 "blk3.0" |
+ | +---------+ | +--------------+ +----------------------+
+ | +-> NAMESPACE3.1 +--> ND3 "blk3.1" | BTT1 |
+ | +--------------+ +----------------------+
+ | +---------+ +--------------+ +---------------+
+ +-> REGION4 +---> NAMESPACE4.0 +--> ND2 "blk4.0" |
+ | +---------+ +--------------+ +---------------+
+ | +---------+ +--------------+ +----------------------+
+ +-> REGION5 +---> NAMESPACE5.0 +--> ND1 "blk5.0" | BTT0 |
+ +---------+ +--------------+ +---------------+------+
diff --git a/Documentation/driver-api/nvdimm/security.rst b/Documentation/driver-api/nvdimm/security.rst
new file mode 100644
index 000000000000..ad9dea099b34
--- /dev/null
+++ b/Documentation/driver-api/nvdimm/security.rst
@@ -0,0 +1,143 @@
+===============
+NVDIMM Security
+===============
+
+1. Introduction
+---------------
+
+With the introduction of Intel Device Specific Methods (DSM) v1.8
+specification [1], security DSMs are introduced. The spec added the following
+security DSMs: "get security state", "set passphrase", "disable passphrase",
+"unlock unit", "freeze lock", "secure erase", and "overwrite". A security_ops
+data structure has been added to struct dimm in order to support the security
+operations and generic APIs are exposed to allow vendor neutral operations.
+
+2. Sysfs Interface
+------------------
+The "security" sysfs attribute is provided in the nvdimm sysfs directory. For
+example:
+/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/security
+
+The "show" attribute of that attribute will display the security state for
+that DIMM. The following states are available: disabled, unlocked, locked,
+frozen, and overwrite. If security is not supported, the sysfs attribute
+will not be visible.
+
+The "store" attribute takes several commands when it is being written to
+in order to support some of the security functionalities:
+update <old_keyid> <new_keyid> - enable or update passphrase.
+disable <keyid> - disable enabled security and remove key.
+freeze - freeze changing of security states.
+erase <keyid> - delete existing user encryption key.
+overwrite <keyid> - wipe the entire nvdimm.
+master_update <keyid> <new_keyid> - enable or update master passphrase.
+master_erase <keyid> - delete existing user encryption key.
+
+3. Key Management
+-----------------
+
+The key is associated to the payload by the DIMM id. For example:
+# cat /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/nfit/id
+8089-a2-1740-00000133
+The DIMM id would be provided along with the key payload (passphrase) to
+the kernel.
+
+The security keys are managed on the basis of a single key per DIMM. The
+key "passphrase" is expected to be 32bytes long. This is similar to the ATA
+security specification [2]. A key is initially acquired via the request_key()
+kernel API call during nvdimm unlock. It is up to the user to make sure that
+all the keys are in the kernel user keyring for unlock.
+
+A nvdimm encrypted-key of format enc32 has the description format of:
+nvdimm:<bus-provider-specific-unique-id>
+
+See file ``Documentation/security/keys/trusted-encrypted.rst`` for creating
+encrypted-keys of enc32 format. TPM usage with a master trusted key is
+preferred for sealing the encrypted-keys.
+
+4. Unlocking
+------------
+When the DIMMs are being enumerated by the kernel, the kernel will attempt to
+retrieve the key from the kernel user keyring. This is the only time
+a locked DIMM can be unlocked. Once unlocked, the DIMM will remain unlocked
+until reboot. Typically an entity (i.e. shell script) will inject all the
+relevant encrypted-keys into the kernel user keyring during the initramfs phase.
+This provides the unlock function access to all the related keys that contain
+the passphrase for the respective nvdimms. It is also recommended that the
+keys are injected before libnvdimm is loaded by modprobe.
+
+5. Update
+---------
+When doing an update, it is expected that the existing key is removed from
+the kernel user keyring and reinjected as different (old) key. It's irrelevant
+what the key description is for the old key since we are only interested in the
+keyid when doing the update operation. It is also expected that the new key
+is injected with the description format described from earlier in this
+document. The update command written to the sysfs attribute will be with
+the format:
+update <old keyid> <new keyid>
+
+If there is no old keyid due to a security enabling, then a 0 should be
+passed in.
+
+6. Freeze
+---------
+The freeze operation does not require any keys. The security config can be
+frozen by a user with root privelege.
+
+7. Disable
+----------
+The security disable command format is:
+disable <keyid>
+
+An key with the current passphrase payload that is tied to the nvdimm should be
+in the kernel user keyring.
+
+8. Secure Erase
+---------------
+The command format for doing a secure erase is:
+erase <keyid>
+
+An key with the current passphrase payload that is tied to the nvdimm should be
+in the kernel user keyring.
+
+9. Overwrite
+------------
+The command format for doing an overwrite is:
+overwrite <keyid>
+
+Overwrite can be done without a key if security is not enabled. A key serial
+of 0 can be passed in to indicate no key.
+
+The sysfs attribute "security" can be polled to wait on overwrite completion.
+Overwrite can last tens of minutes or more depending on nvdimm size.
+
+An encrypted-key with the current user passphrase that is tied to the nvdimm
+should be injected and its keyid should be passed in via sysfs.
+
+10. Master Update
+-----------------
+The command format for doing a master update is:
+update <old keyid> <new keyid>
+
+The operating mechanism for master update is identical to update except the
+master passphrase key is passed to the kernel. The master passphrase key
+is just another encrypted-key.
+
+This command is only available when security is disabled.
+
+11. Master Erase
+----------------
+The command format for doing a master erase is:
+master_erase <current keyid>
+
+This command has the same operating mechanism as erase except the master
+passphrase key is passed to the kernel. The master passphrase key is just
+another encrypted-key.
+
+This command is only available when the master security is enabled, indicated
+by the extended security status.
+
+[1]: http://pmem.io/documents/NVDIMM_DSM_Interface-V1.8.pdf
+
+[2]: http://www.t13.org/documents/UploadedDocuments/docs2006/e05179r4-ACS-SecurityClarifications.pdf
diff --git a/Documentation/driver-api/nvmem.rst b/Documentation/driver-api/nvmem.rst
new file mode 100644
index 000000000000..d9d958d5c824
--- /dev/null
+++ b/Documentation/driver-api/nvmem.rst
@@ -0,0 +1,189 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===============
+NVMEM Subsystem
+===============
+
+ Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
+
+This document explains the NVMEM Framework along with the APIs provided,
+and how to use it.
+
+1. Introduction
+===============
+*NVMEM* is the abbreviation for Non Volatile Memory layer. It is used to
+retrieve configuration of SOC or Device specific data from non volatile
+memories like eeprom, efuses and so on.
+
+Before this framework existed, NVMEM drivers like eeprom were stored in
+drivers/misc, where they all had to duplicate pretty much the same code to
+register a sysfs file, allow in-kernel users to access the content of the
+devices they were driving, etc.
+
+This was also a problem as far as other in-kernel users were involved, since
+the solutions used were pretty much different from one driver to another, there
+was a rather big abstraction leak.
+
+This framework aims at solve these problems. It also introduces DT
+representation for consumer devices to go get the data they require (MAC
+Addresses, SoC/Revision ID, part numbers, and so on) from the NVMEMs. This
+framework is based on regmap, so that most of the abstraction available in
+regmap can be reused, across multiple types of buses.
+
+NVMEM Providers
++++++++++++++++
+
+NVMEM provider refers to an entity that implements methods to initialize, read
+and write the non-volatile memory.
+
+2. Registering/Unregistering the NVMEM provider
+===============================================
+
+A NVMEM provider can register with NVMEM core by supplying relevant
+nvmem configuration to nvmem_register(), on success core would return a valid
+nvmem_device pointer.
+
+nvmem_unregister(nvmem) is used to unregister a previously registered provider.
+
+For example, a simple qfprom case::
+
+ static struct nvmem_config econfig = {
+ .name = "qfprom",
+ .owner = THIS_MODULE,
+ };
+
+ static int qfprom_probe(struct platform_device *pdev)
+ {
+ ...
+ econfig.dev = &pdev->dev;
+ nvmem = nvmem_register(&econfig);
+ ...
+ }
+
+It is mandatory that the NVMEM provider has a regmap associated with its
+struct device. Failure to do would return error code from nvmem_register().
+
+Users of board files can define and register nvmem cells using the
+nvmem_cell_table struct::
+
+ static struct nvmem_cell_info foo_nvmem_cells[] = {
+ {
+ .name = "macaddr",
+ .offset = 0x7f00,
+ .bytes = ETH_ALEN,
+ }
+ };
+
+ static struct nvmem_cell_table foo_nvmem_cell_table = {
+ .nvmem_name = "i2c-eeprom",
+ .cells = foo_nvmem_cells,
+ .ncells = ARRAY_SIZE(foo_nvmem_cells),
+ };
+
+ nvmem_add_cell_table(&foo_nvmem_cell_table);
+
+Additionally it is possible to create nvmem cell lookup entries and register
+them with the nvmem framework from machine code as shown in the example below::
+
+ static struct nvmem_cell_lookup foo_nvmem_lookup = {
+ .nvmem_name = "i2c-eeprom",
+ .cell_name = "macaddr",
+ .dev_id = "foo_mac.0",
+ .con_id = "mac-address",
+ };
+
+ nvmem_add_cell_lookups(&foo_nvmem_lookup, 1);
+
+NVMEM Consumers
++++++++++++++++
+
+NVMEM consumers are the entities which make use of the NVMEM provider to
+read from and to NVMEM.
+
+3. NVMEM cell based consumer APIs
+=================================
+
+NVMEM cells are the data entries/fields in the NVMEM.
+The NVMEM framework provides 3 APIs to read/write NVMEM cells::
+
+ struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *name);
+ struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *name);
+
+ void nvmem_cell_put(struct nvmem_cell *cell);
+ void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
+
+ void *nvmem_cell_read(struct nvmem_cell *cell, ssize_t *len);
+ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, ssize_t len);
+
+`*nvmem_cell_get()` apis will get a reference to nvmem cell for a given id,
+and nvmem_cell_read/write() can then read or write to the cell.
+Once the usage of the cell is finished the consumer should call
+`*nvmem_cell_put()` to free all the allocation memory for the cell.
+
+4. Direct NVMEM device based consumer APIs
+==========================================
+
+In some instances it is necessary to directly read/write the NVMEM.
+To facilitate such consumers NVMEM framework provides below apis::
+
+ struct nvmem_device *nvmem_device_get(struct device *dev, const char *name);
+ struct nvmem_device *devm_nvmem_device_get(struct device *dev,
+ const char *name);
+ void nvmem_device_put(struct nvmem_device *nvmem);
+ int nvmem_device_read(struct nvmem_device *nvmem, unsigned int offset,
+ size_t bytes, void *buf);
+ int nvmem_device_write(struct nvmem_device *nvmem, unsigned int offset,
+ size_t bytes, void *buf);
+ int nvmem_device_cell_read(struct nvmem_device *nvmem,
+ struct nvmem_cell_info *info, void *buf);
+ int nvmem_device_cell_write(struct nvmem_device *nvmem,
+ struct nvmem_cell_info *info, void *buf);
+
+Before the consumers can read/write NVMEM directly, it should get hold
+of nvmem_controller from one of the `*nvmem_device_get()` api.
+
+The difference between these apis and cell based apis is that these apis always
+take nvmem_device as parameter.
+
+5. Releasing a reference to the NVMEM
+=====================================
+
+When a consumer no longer needs the NVMEM, it has to release the reference
+to the NVMEM it has obtained using the APIs mentioned in the above section.
+The NVMEM framework provides 2 APIs to release a reference to the NVMEM::
+
+ void nvmem_cell_put(struct nvmem_cell *cell);
+ void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
+ void nvmem_device_put(struct nvmem_device *nvmem);
+ void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem);
+
+Both these APIs are used to release a reference to the NVMEM and
+devm_nvmem_cell_put and devm_nvmem_device_put destroys the devres associated
+with this NVMEM.
+
+Userspace
++++++++++
+
+6. Userspace binary interface
+==============================
+
+Userspace can read/write the raw NVMEM file located at::
+
+ /sys/bus/nvmem/devices/*/nvmem
+
+ex::
+
+ hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
+
+ 0000000 0000 0000 0000 0000 0000 0000 0000 0000
+ *
+ 00000a0 db10 2240 0000 e000 0c00 0c00 0000 0c00
+ 0000000 0000 0000 0000 0000 0000 0000 0000 0000
+ ...
+ *
+ 0001000
+
+7. DeviceTree Binding
+=====================
+
+See Documentation/devicetree/bindings/nvmem/nvmem.txt
diff --git a/Documentation/parport-lowlevel.txt b/Documentation/driver-api/parport-lowlevel.rst
index 0633d70ffda7..0633d70ffda7 100644
--- a/Documentation/parport-lowlevel.txt
+++ b/Documentation/driver-api/parport-lowlevel.rst
diff --git a/Documentation/driver-api/phy/index.rst b/Documentation/driver-api/phy/index.rst
new file mode 100644
index 000000000000..69ba1216de72
--- /dev/null
+++ b/Documentation/driver-api/phy/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Generic PHY Framework
+=====================
+
+.. toctree::
+
+ phy
+ samsung-usb2
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
+
diff --git a/Documentation/phy.txt b/Documentation/driver-api/phy/phy.rst
index 457c3e0f86d6..457c3e0f86d6 100644
--- a/Documentation/phy.txt
+++ b/Documentation/driver-api/phy/phy.rst
diff --git a/Documentation/driver-api/phy/samsung-usb2.rst b/Documentation/driver-api/phy/samsung-usb2.rst
new file mode 100644
index 000000000000..c48c8b9797b9
--- /dev/null
+++ b/Documentation/driver-api/phy/samsung-usb2.rst
@@ -0,0 +1,137 @@
+====================================
+Samsung USB 2.0 PHY adaptation layer
+====================================
+
+1. Description
+--------------
+
+The architecture of the USB 2.0 PHY module in Samsung SoCs is similar
+among many SoCs. In spite of the similarities it proved difficult to
+create a one driver that would fit all these PHY controllers. Often
+the differences were minor and were found in particular bits of the
+registers of the PHY. In some rare cases the order of register writes or
+the PHY powering up process had to be altered. This adaptation layer is
+a compromise between having separate drivers and having a single driver
+with added support for many special cases.
+
+2. Files description
+--------------------
+
+- phy-samsung-usb2.c
+ This is the main file of the adaptation layer. This file contains
+ the probe function and provides two callbacks to the Generic PHY
+ Framework. This two callbacks are used to power on and power off the
+ phy. They carry out the common work that has to be done on all version
+ of the PHY module. Depending on which SoC was chosen they execute SoC
+ specific callbacks. The specific SoC version is selected by choosing
+ the appropriate compatible string. In addition, this file contains
+ struct of_device_id definitions for particular SoCs.
+
+- phy-samsung-usb2.h
+ This is the include file. It declares the structures used by this
+ driver. In addition it should contain extern declarations for
+ structures that describe particular SoCs.
+
+3. Supporting SoCs
+------------------
+
+To support a new SoC a new file should be added to the drivers/phy
+directory. Each SoC's configuration is stored in an instance of the
+struct samsung_usb2_phy_config::
+
+ struct samsung_usb2_phy_config {
+ const struct samsung_usb2_common_phy *phys;
+ int (*rate_to_clk)(unsigned long, u32 *);
+ unsigned int num_phys;
+ bool has_mode_switch;
+ };
+
+The num_phys is the number of phys handled by the driver. `*phys` is an
+array that contains the configuration for each phy. The has_mode_switch
+property is a boolean flag that determines whether the SoC has USB host
+and device on a single pair of pins. If so, a special register has to
+be modified to change the internal routing of these pins between a USB
+device or host module.
+
+For example the configuration for Exynos 4210 is following::
+
+ const struct samsung_usb2_phy_config exynos4210_usb2_phy_config = {
+ .has_mode_switch = 0,
+ .num_phys = EXYNOS4210_NUM_PHYS,
+ .phys = exynos4210_phys,
+ .rate_to_clk = exynos4210_rate_to_clk,
+ }
+
+- `int (*rate_to_clk)(unsigned long, u32 *)`
+
+ The rate_to_clk callback is to convert the rate of the clock
+ used as the reference clock for the PHY module to the value
+ that should be written in the hardware register.
+
+The exynos4210_phys configuration array is as follows::
+
+ static const struct samsung_usb2_common_phy exynos4210_phys[] = {
+ {
+ .label = "device",
+ .id = EXYNOS4210_DEVICE,
+ .power_on = exynos4210_power_on,
+ .power_off = exynos4210_power_off,
+ },
+ {
+ .label = "host",
+ .id = EXYNOS4210_HOST,
+ .power_on = exynos4210_power_on,
+ .power_off = exynos4210_power_off,
+ },
+ {
+ .label = "hsic0",
+ .id = EXYNOS4210_HSIC0,
+ .power_on = exynos4210_power_on,
+ .power_off = exynos4210_power_off,
+ },
+ {
+ .label = "hsic1",
+ .id = EXYNOS4210_HSIC1,
+ .power_on = exynos4210_power_on,
+ .power_off = exynos4210_power_off,
+ },
+ {},
+ };
+
+- `int (*power_on)(struct samsung_usb2_phy_instance *);`
+ `int (*power_off)(struct samsung_usb2_phy_instance *);`
+
+ These two callbacks are used to power on and power off the phy
+ by modifying appropriate registers.
+
+Final change to the driver is adding appropriate compatible value to the
+phy-samsung-usb2.c file. In case of Exynos 4210 the following lines were
+added to the struct of_device_id samsung_usb2_phy_of_match[] array::
+
+ #ifdef CONFIG_PHY_EXYNOS4210_USB2
+ {
+ .compatible = "samsung,exynos4210-usb2-phy",
+ .data = &exynos4210_usb2_phy_config,
+ },
+ #endif
+
+To add further flexibility to the driver the Kconfig file enables to
+include support for selected SoCs in the compiled driver. The Kconfig
+entry for Exynos 4210 is following::
+
+ config PHY_EXYNOS4210_USB2
+ bool "Support for Exynos 4210"
+ depends on PHY_SAMSUNG_USB2
+ depends on CPU_EXYNOS4210
+ help
+ Enable USB PHY support for Exynos 4210. This option requires that
+ Samsung USB 2.0 PHY driver is enabled and means that support for this
+ particular SoC is compiled in the driver. In case of Exynos 4210 four
+ phys are available - device, host, HSCI0 and HSCI1.
+
+The newly created file that supports the new SoC has to be also added to the
+Makefile. In case of Exynos 4210 the added line is following::
+
+ obj-$(CONFIG_PHY_EXYNOS4210_USB2) += phy-exynos4210-usb2.o
+
+After completing these steps the support for the new SoC should be ready.
diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst
index 30835683616a..f66c7b9126ea 100644
--- a/Documentation/driver-api/pm/devices.rst
+++ b/Documentation/driver-api/pm/devices.rst
@@ -225,7 +225,7 @@ system-wide transition to a sleep state even though its :c:member:`runtime_auto`
flag is clear.
For more information about the runtime power management framework, refer to
-:file:`Documentation/power/runtime_pm.txt`.
+:file:`Documentation/power/runtime_pm.rst`.
Calling Drivers to Enter and Leave System Sleep States
@@ -728,7 +728,7 @@ it into account in any way.
Devices may be defined as IRQ-safe which indicates to the PM core that their
runtime PM callbacks may be invoked with disabled interrupts (see
-:file:`Documentation/power/runtime_pm.txt` for more information). If an
+:file:`Documentation/power/runtime_pm.rst` for more information). If an
IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be
disallowed, unless the domain itself is defined as IRQ-safe. However, it
makes sense to define a PM domain as IRQ-safe only if all the devices in it
@@ -795,7 +795,7 @@ so on) and the final state of the device must reflect the "active" runtime PM
status in that case.
During system-wide resume from a sleep state it's easiest to put devices into
-the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`.
+the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`.
[Refer to that document for more information regarding this particular issue as
well as for information on the device runtime power management framework in
general.]
diff --git a/Documentation/driver-api/pps.rst b/Documentation/driver-api/pps.rst
index 1456d2c32ebd..2d6b99766ee8 100644
--- a/Documentation/driver-api/pps.rst
+++ b/Documentation/driver-api/pps.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
======================
PPS - Pulse Per Second
diff --git a/Documentation/driver-api/pti_intel_mid.rst b/Documentation/driver-api/pti_intel_mid.rst
new file mode 100644
index 000000000000..20f1cff42d5f
--- /dev/null
+++ b/Documentation/driver-api/pti_intel_mid.rst
@@ -0,0 +1,106 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============
+Intel MID PTI
+=============
+
+The Intel MID PTI project is HW implemented in Intel Atom
+system-on-a-chip designs based on the Parallel Trace
+Interface for MIPI P1149.7 cJTAG standard. The kernel solution
+for this platform involves the following files::
+
+ ./include/linux/pti.h
+ ./drivers/.../n_tracesink.h
+ ./drivers/.../n_tracerouter.c
+ ./drivers/.../n_tracesink.c
+ ./drivers/.../pti.c
+
+pti.c is the driver that enables various debugging features
+popular on platforms from certain mobile manufacturers.
+n_tracerouter.c and n_tracesink.c allow extra system information to
+be collected and routed to the pti driver, such as trace
+debugging data from a modem. Although n_tracerouter
+and n_tracesink are a part of the complete PTI solution,
+these two line disciplines can work separately from
+pti.c and route any data stream from one /dev/tty node
+to another /dev/tty node via kernel-space. This provides
+a stable, reliable connection that will not break unless
+the user-space application shuts down (plus avoids
+kernel->user->kernel context switch overheads of routing
+data).
+
+An example debugging usage for this driver system:
+
+ * Hook /dev/ttyPTI0 to syslogd. Opening this port will also start
+ a console device to further capture debugging messages to PTI.
+ * Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW.
+ This is where n_tracerouter and n_tracesink are used.
+ * Hook /dev/pti to a user-level debugging application for writing
+ to PTI HW.
+ * `Use mipi_` Kernel Driver API in other device drivers for
+ debugging to PTI by first requesting a PTI write address via
+ mipi_request_masterchannel(1).
+
+Below is example pseudo-code on how a 'privileged' application
+can hook up n_tracerouter and n_tracesink to any tty on
+a system. 'Privileged' means the application has enough
+privileges to successfully manipulate the ldisc drivers
+but is not just blindly executing as 'root'. Keep in mind
+the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter
+and n_tracesink line discpline drivers but is a generic
+operation for a program to use a line discpline driver
+on a tty port other than the default n_tty::
+
+ /////////// To hook up n_tracerouter and n_tracesink /////////
+
+ // Note that n_tracerouter depends on n_tracesink.
+ #include <errno.h>
+ #define ONE_TTY "/dev/ttyOne"
+ #define TWO_TTY "/dev/ttyTwo"
+
+ // needed global to hand onto ldisc connection
+ static int g_fd_source = -1;
+ static int g_fd_sink = -1;
+
+ // these two vars used to grab LDISC values from loaded ldisc drivers
+ // in OS. Look at /proc/tty/ldiscs to get the right numbers from
+ // the ldiscs loaded in the system.
+ int source_ldisc_num, sink_ldisc_num = -1;
+ int retval;
+
+ g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W
+ g_fd_sink = open(TWO_TTY, O_RDWR); // must be R/W
+
+ if (g_fd_source <= 0) || (g_fd_sink <= 0) {
+ // doubt you'll want to use these exact error lines of code
+ printf("Error on open(). errno: %d\n",errno);
+ return errno;
+ }
+
+ retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num);
+ if (retval < 0) {
+ printf("Error on ioctl(). errno: %d\n", errno);
+ return errno;
+ }
+
+ retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num);
+ if (retval < 0) {
+ printf("Error on ioctl(). errno: %d\n", errno);
+ return errno;
+ }
+
+ /////////// To disconnect n_tracerouter and n_tracesink ////////
+
+ // First make sure data through the ldiscs has stopped.
+
+ // Second, disconnect ldiscs. This provides a
+ // little cleaner shutdown on tty stack.
+ sink_ldisc_num = 0;
+ source_ldisc_num = 0;
+ ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num);
+ ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num);
+
+ // Three, program closes connection, and cleanup:
+ close(g_fd_uart);
+ close(g_fd_gadget);
+ g_fd_uart = g_fd_gadget = NULL;
diff --git a/Documentation/driver-api/ptp.rst b/Documentation/driver-api/ptp.rst
index b6e65d66d37a..a15192e32347 100644
--- a/Documentation/driver-api/ptp.rst
+++ b/Documentation/driver-api/ptp.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
===========================================
PTP hardware clock infrastructure for Linux
diff --git a/Documentation/driver-api/pwm.rst b/Documentation/driver-api/pwm.rst
new file mode 100644
index 000000000000..ab62f1bb0366
--- /dev/null
+++ b/Documentation/driver-api/pwm.rst
@@ -0,0 +1,165 @@
+======================================
+Pulse Width Modulation (PWM) interface
+======================================
+
+This provides an overview about the Linux PWM interface
+
+PWMs are commonly used for controlling LEDs, fans or vibrators in
+cell phones. PWMs with a fixed purpose have no need implementing
+the Linux PWM API (although they could). However, PWMs are often
+found as discrete devices on SoCs which have no fixed purpose. It's
+up to the board designer to connect them to LEDs or fans. To provide
+this kind of flexibility the generic PWM API exists.
+
+Identifying PWMs
+----------------
+
+Users of the legacy PWM API use unique IDs to refer to PWM devices.
+
+Instead of referring to a PWM device via its unique ID, board setup code
+should instead register a static mapping that can be used to match PWM
+consumers to providers, as given in the following example::
+
+ static struct pwm_lookup board_pwm_lookup[] = {
+ PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL,
+ 50000, PWM_POLARITY_NORMAL),
+ };
+
+ static void __init board_init(void)
+ {
+ ...
+ pwm_add_table(board_pwm_lookup, ARRAY_SIZE(board_pwm_lookup));
+ ...
+ }
+
+Using PWMs
+----------
+
+Legacy users can request a PWM device using pwm_request() and free it
+after usage with pwm_free().
+
+New users should use the pwm_get() function and pass to it the consumer
+device or a consumer name. pwm_put() is used to free the PWM device. Managed
+variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist.
+
+After being requested, a PWM has to be configured using::
+
+ int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
+
+This API controls both the PWM period/duty_cycle config and the
+enable/disable state.
+
+The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
+around pwm_apply_state() and should not be used if the user wants to change
+several parameter at once. For example, if you see pwm_config() and
+pwm_{enable,disable}() calls in the same function, this probably means you
+should switch to pwm_apply_state().
+
+The PWM user API also allows one to query the PWM state with pwm_get_state().
+
+In addition to the PWM state, the PWM API also exposes PWM arguments, which
+are the reference PWM config one should use on this PWM.
+PWM arguments are usually platform-specific and allows the PWM user to only
+care about dutycycle relatively to the full period (like, duty = 50% of the
+period). struct pwm_args contains 2 fields (period and polarity) and should
+be used to set the initial PWM config (usually done in the probe function
+of the PWM user). PWM arguments are retrieved with pwm_get_args().
+
+All consumers should really be reconfiguring the PWM upon resume as
+appropriate. This is the only way to ensure that everything is resumed in
+the proper order.
+
+Using PWMs with the sysfs interface
+-----------------------------------
+
+If CONFIG_SYSFS is enabled in your kernel configuration a simple sysfs
+interface is provided to use the PWMs from userspace. It is exposed at
+/sys/class/pwm/. Each probed PWM controller/chip will be exported as
+pwmchipN, where N is the base of the PWM chip. Inside the directory you
+will find:
+
+ npwm
+ The number of PWM channels this chip supports (read-only).
+
+ export
+ Exports a PWM channel for use with sysfs (write-only).
+
+ unexport
+ Unexports a PWM channel from sysfs (write-only).
+
+The PWM channels are numbered using a per-chip index from 0 to npwm-1.
+
+When a PWM channel is exported a pwmX directory will be created in the
+pwmchipN directory it is associated with, where X is the number of the
+channel that was exported. The following properties will then be available:
+
+ period
+ The total period of the PWM signal (read/write).
+ Value is in nanoseconds and is the sum of the active and inactive
+ time of the PWM.
+
+ duty_cycle
+ The active time of the PWM signal (read/write).
+ Value is in nanoseconds and must be less than the period.
+
+ polarity
+ Changes the polarity of the PWM signal (read/write).
+ Writes to this property only work if the PWM chip supports changing
+ the polarity. The polarity can only be changed if the PWM is not
+ enabled. Value is the string "normal" or "inversed".
+
+ enable
+ Enable/disable the PWM signal (read/write).
+
+ - 0 - disabled
+ - 1 - enabled
+
+Implementing a PWM driver
+-------------------------
+
+Currently there are two ways to implement pwm drivers. Traditionally
+there only has been the barebone API meaning that each driver has
+to implement the pwm_*() functions itself. This means that it's impossible
+to have multiple PWM drivers in the system. For this reason it's mandatory
+for new drivers to use the generic PWM framework.
+
+A new PWM controller/chip can be added using pwmchip_add() and removed
+again with pwmchip_remove(). pwmchip_add() takes a filled in struct
+pwm_chip as argument which provides a description of the PWM chip, the
+number of PWM devices provided by the chip and the chip-specific
+implementation of the supported PWM operations to the framework.
+
+When implementing polarity support in a PWM driver, make sure to respect the
+signal conventions in the PWM framework. By definition, normal polarity
+characterizes a signal starts high for the duration of the duty cycle and
+goes low for the remainder of the period. Conversely, a signal with inversed
+polarity starts low for the duration of the duty cycle and goes high for the
+remainder of the period.
+
+Drivers are encouraged to implement ->apply() instead of the legacy
+->enable(), ->disable() and ->config() methods. Doing that should provide
+atomicity in the PWM config workflow, which is required when the PWM controls
+a critical device (like a regulator).
+
+The implementation of ->get_state() (a method used to retrieve initial PWM
+state) is also encouraged for the same reason: letting the PWM user know
+about the current PWM state would allow him to avoid glitches.
+
+Drivers should not implement any power management. In other words,
+consumers should implement it as described in the "Using PWMs" section.
+
+Locking
+-------
+
+The PWM core list manipulations are protected by a mutex, so pwm_request()
+and pwm_free() may not be called from an atomic context. Currently the
+PWM core does not enforce any locking to pwm_enable(), pwm_disable() and
+pwm_config(), so the calling context is currently driver specific. This
+is an issue derived from the former barebone API and should be fixed soon.
+
+Helpers
+-------
+
+Currently a PWM can only be configured with period_ns and duty_ns. For several
+use cases freq_hz and duty_percent might be better. Instead of calculating
+this in your driver please consider adding appropriate helpers to the framework.
diff --git a/Documentation/driver-api/rapidio/index.rst b/Documentation/driver-api/rapidio/index.rst
new file mode 100644
index 000000000000..a41b4242d16f
--- /dev/null
+++ b/Documentation/driver-api/rapidio/index.rst
@@ -0,0 +1,15 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===========================
+The Linux RapidIO Subsystem
+===========================
+
+.. toctree::
+ :maxdepth: 1
+
+ rapidio
+ sysfs
+
+ tsi721
+ mport_cdev
+ rio_cm
diff --git a/Documentation/driver-api/rapidio/mport_cdev.rst b/Documentation/driver-api/rapidio/mport_cdev.rst
new file mode 100644
index 000000000000..df77a7f7be7d
--- /dev/null
+++ b/Documentation/driver-api/rapidio/mport_cdev.rst
@@ -0,0 +1,110 @@
+==================================================================
+RapidIO subsystem mport character device driver (rio_mport_cdev.c)
+==================================================================
+
+1. Overview
+===========
+
+This device driver is the result of collaboration within the RapidIO.org
+Software Task Group (STG) between Texas Instruments, Freescale,
+Prodrive Technologies, Nokia Networks, BAE and IDT. Additional input was
+received from other members of RapidIO.org. The objective was to create a
+character mode driver interface which exposes the capabilities of RapidIO
+devices directly to applications, in a manner that allows the numerous and
+varied RapidIO implementations to interoperate.
+
+This driver (MPORT_CDEV) provides access to basic RapidIO subsystem operations
+for user-space applications. Most of RapidIO operations are supported through
+'ioctl' system calls.
+
+When loaded this device driver creates filesystem nodes named rio_mportX in /dev
+directory for each registered RapidIO mport device. 'X' in the node name matches
+to unique port ID assigned to each local mport device.
+
+Using available set of ioctl commands user-space applications can perform
+following RapidIO bus and subsystem operations:
+
+- Reads and writes from/to configuration registers of mport devices
+ (RIO_MPORT_MAINT_READ_LOCAL/RIO_MPORT_MAINT_WRITE_LOCAL)
+- Reads and writes from/to configuration registers of remote RapidIO devices.
+ This operations are defined as RapidIO Maintenance reads/writes in RIO spec.
+ (RIO_MPORT_MAINT_READ_REMOTE/RIO_MPORT_MAINT_WRITE_REMOTE)
+- Set RapidIO Destination ID for mport devices (RIO_MPORT_MAINT_HDID_SET)
+- Set RapidIO Component Tag for mport devices (RIO_MPORT_MAINT_COMPTAG_SET)
+- Query logical index of mport devices (RIO_MPORT_MAINT_PORT_IDX_GET)
+- Query capabilities and RapidIO link configuration of mport devices
+ (RIO_MPORT_GET_PROPERTIES)
+- Enable/Disable reporting of RapidIO doorbell events to user-space applications
+ (RIO_ENABLE_DOORBELL_RANGE/RIO_DISABLE_DOORBELL_RANGE)
+- Enable/Disable reporting of RIO port-write events to user-space applications
+ (RIO_ENABLE_PORTWRITE_RANGE/RIO_DISABLE_PORTWRITE_RANGE)
+- Query/Control type of events reported through this driver: doorbells,
+ port-writes or both (RIO_SET_EVENT_MASK/RIO_GET_EVENT_MASK)
+- Configure/Map mport's outbound requests window(s) for specific size,
+ RapidIO destination ID, hopcount and request type
+ (RIO_MAP_OUTBOUND/RIO_UNMAP_OUTBOUND)
+- Configure/Map mport's inbound requests window(s) for specific size,
+ RapidIO base address and local memory base address
+ (RIO_MAP_INBOUND/RIO_UNMAP_INBOUND)
+- Allocate/Free contiguous DMA coherent memory buffer for DMA data transfers
+ to/from remote RapidIO devices (RIO_ALLOC_DMA/RIO_FREE_DMA)
+- Initiate DMA data transfers to/from remote RapidIO devices (RIO_TRANSFER).
+ Supports blocking, asynchronous and posted (a.k.a 'fire-and-forget') data
+ transfer modes.
+- Check/Wait for completion of asynchronous DMA data transfer
+ (RIO_WAIT_FOR_ASYNC)
+- Manage device objects supported by RapidIO subsystem (RIO_DEV_ADD/RIO_DEV_DEL).
+ This allows implementation of various RapidIO fabric enumeration algorithms
+ as user-space applications while using remaining functionality provided by
+ kernel RapidIO subsystem.
+
+2. Hardware Compatibility
+=========================
+
+This device driver uses standard interfaces defined by kernel RapidIO subsystem
+and therefore it can be used with any mport device driver registered by RapidIO
+subsystem with limitations set by available mport implementation.
+
+At this moment the most common limitation is availability of RapidIO-specific
+DMA engine framework for specific mport device. Users should verify available
+functionality of their platform when planning to use this driver:
+
+- IDT Tsi721 PCIe-to-RapidIO bridge device and its mport device driver are fully
+ compatible with this driver.
+- Freescale SoCs 'fsl_rio' mport driver does not have implementation for RapidIO
+ specific DMA engine support and therefore DMA data transfers mport_cdev driver
+ are not available.
+
+3. Module parameters
+====================
+
+- 'dma_timeout'
+ - DMA transfer completion timeout (in msec, default value 3000).
+ This parameter set a maximum completion wait time for SYNC mode DMA
+ transfer requests and for RIO_WAIT_FOR_ASYNC ioctl requests.
+
+- 'dbg_level'
+ - This parameter allows to control amount of debug information
+ generated by this device driver. This parameter is formed by set of
+ bit masks that correspond to the specific functional blocks.
+ For mask definitions see 'drivers/rapidio/devices/rio_mport_cdev.c'
+ This parameter can be changed dynamically.
+ Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+4. Known problems
+=================
+
+ None.
+
+5. User-space Applications and API
+==================================
+
+API library and applications that use this device driver are available from
+RapidIO.org.
+
+6. TODO List
+============
+
+- Add support for sending/receiving "raw" RapidIO messaging packets.
+- Add memory mapped DMA data transfers as an option when RapidIO-specific DMA
+ is not available.
diff --git a/Documentation/driver-api/rapidio/rapidio.rst b/Documentation/driver-api/rapidio/rapidio.rst
new file mode 100644
index 000000000000..fb8942d3ba85
--- /dev/null
+++ b/Documentation/driver-api/rapidio/rapidio.rst
@@ -0,0 +1,362 @@
+============
+Introduction
+============
+
+The RapidIO standard is a packet-based fabric interconnect standard designed for
+use in embedded systems. Development of the RapidIO standard is directed by the
+RapidIO Trade Association (RTA). The current version of the RapidIO specification
+is publicly available for download from the RTA web-site [1].
+
+This document describes the basics of the Linux RapidIO subsystem and provides
+information on its major components.
+
+1 Overview
+==========
+
+Because the RapidIO subsystem follows the Linux device model it is integrated
+into the kernel similarly to other buses by defining RapidIO-specific device and
+bus types and registering them within the device model.
+
+The Linux RapidIO subsystem is architecture independent and therefore defines
+architecture-specific interfaces that provide support for common RapidIO
+subsystem operations.
+
+2. Core Components
+==================
+
+A typical RapidIO network is a combination of endpoints and switches.
+Each of these components is represented in the subsystem by an associated data
+structure. The core logical components of the RapidIO subsystem are defined
+in include/linux/rio.h file.
+
+2.1 Master Port
+---------------
+
+A master port (or mport) is a RapidIO interface controller that is local to the
+processor executing the Linux code. A master port generates and receives RapidIO
+packets (transactions). In the RapidIO subsystem each master port is represented
+by a rio_mport data structure. This structure contains master port specific
+resources such as mailboxes and doorbells. The rio_mport also includes a unique
+host device ID that is valid when a master port is configured as an enumerating
+host.
+
+RapidIO master ports are serviced by subsystem specific mport device drivers
+that provide functionality defined for this subsystem. To provide a hardware
+independent interface for RapidIO subsystem operations, rio_mport structure
+includes rio_ops data structure which contains pointers to hardware specific
+implementations of RapidIO functions.
+
+2.2 Device
+----------
+
+A RapidIO device is any endpoint (other than mport) or switch in the network.
+All devices are presented in the RapidIO subsystem by corresponding rio_dev data
+structure. Devices form one global device list and per-network device lists
+(depending on number of available mports and networks).
+
+2.3 Switch
+----------
+
+A RapidIO switch is a special class of device that routes packets between its
+ports towards their final destination. The packet destination port within a
+switch is defined by an internal routing table. A switch is presented in the
+RapidIO subsystem by rio_dev data structure expanded by additional rio_switch
+data structure, which contains switch specific information such as copy of the
+routing table and pointers to switch specific functions.
+
+The RapidIO subsystem defines the format and initialization method for subsystem
+specific switch drivers that are designed to provide hardware-specific
+implementation of common switch management routines.
+
+2.4 Network
+-----------
+
+A RapidIO network is a combination of interconnected endpoint and switch devices.
+Each RapidIO network known to the system is represented by corresponding rio_net
+data structure. This structure includes lists of all devices and local master
+ports that form the same network. It also contains a pointer to the default
+master port that is used to communicate with devices within the network.
+
+2.5 Device Drivers
+------------------
+
+RapidIO device-specific drivers follow Linux Kernel Driver Model and are
+intended to support specific RapidIO devices attached to the RapidIO network.
+
+2.6 Subsystem Interfaces
+------------------------
+
+RapidIO interconnect specification defines features that may be used to provide
+one or more common service layers for all participating RapidIO devices. These
+common services may act separately from device-specific drivers or be used by
+device-specific drivers. Example of such service provider is the RIONET driver
+which implements Ethernet-over-RapidIO interface. Because only one driver can be
+registered for a device, all common RapidIO services have to be registered as
+subsystem interfaces. This allows to have multiple common services attached to
+the same device without blocking attachment of a device-specific driver.
+
+3. Subsystem Initialization
+===========================
+
+In order to initialize the RapidIO subsystem, a platform must initialize and
+register at least one master port within the RapidIO network. To register mport
+within the subsystem controller driver's initialization code calls function
+rio_register_mport() for each available master port.
+
+After all active master ports are registered with a RapidIO subsystem,
+an enumeration and/or discovery routine may be called automatically or
+by user-space command.
+
+RapidIO subsystem can be configured to be built as a statically linked or
+modular component of the kernel (see details below).
+
+4. Enumeration and Discovery
+============================
+
+4.1 Overview
+------------
+
+RapidIO subsystem configuration options allow users to build enumeration and
+discovery methods as statically linked components or loadable modules.
+An enumeration/discovery method implementation and available input parameters
+define how any given method can be attached to available RapidIO mports:
+simply to all available mports OR individually to the specified mport device.
+
+Depending on selected enumeration/discovery build configuration, there are
+several methods to initiate an enumeration and/or discovery process:
+
+ (a) Statically linked enumeration and discovery process can be started
+ automatically during kernel initialization time using corresponding module
+ parameters. This was the original method used since introduction of RapidIO
+ subsystem. Now this method relies on enumerator module parameter which is
+ 'rio-scan.scan' for existing basic enumeration/discovery method.
+ When automatic start of enumeration/discovery is used a user has to ensure
+ that all discovering endpoints are started before the enumerating endpoint
+ and are waiting for enumeration to be completed.
+ Configuration option CONFIG_RAPIDIO_DISC_TIMEOUT defines time that discovering
+ endpoint waits for enumeration to be completed. If the specified timeout
+ expires the discovery process is terminated without obtaining RapidIO network
+ information. NOTE: a timed out discovery process may be restarted later using
+ a user-space command as it is described below (if the given endpoint was
+ enumerated successfully).
+
+ (b) Statically linked enumeration and discovery process can be started by
+ a command from user space. This initiation method provides more flexibility
+ for a system startup compared to the option (a) above. After all participating
+ endpoints have been successfully booted, an enumeration process shall be
+ started first by issuing a user-space command, after an enumeration is
+ completed a discovery process can be started on all remaining endpoints.
+
+ (c) Modular enumeration and discovery process can be started by a command from
+ user space. After an enumeration/discovery module is loaded, a network scan
+ process can be started by issuing a user-space command.
+ Similar to the option (b) above, an enumerator has to be started first.
+
+ (d) Modular enumeration and discovery process can be started by a module
+ initialization routine. In this case an enumerating module shall be loaded
+ first.
+
+When a network scan process is started it calls an enumeration or discovery
+routine depending on the configured role of a master port: host or agent.
+
+Enumeration is performed by a master port if it is configured as a host port by
+assigning a host destination ID greater than or equal to zero. The host
+destination ID can be assigned to a master port using various methods depending
+on RapidIO subsystem build configuration:
+
+ (a) For a statically linked RapidIO subsystem core use command line parameter
+ "rapidio.hdid=" with a list of destination ID assignments in order of mport
+ device registration. For example, in a system with two RapidIO controllers
+ the command line parameter "rapidio.hdid=-1,7" will result in assignment of
+ the host destination ID=7 to the second RapidIO controller, while the first
+ one will be assigned destination ID=-1.
+
+ (b) If the RapidIO subsystem core is built as a loadable module, in addition
+ to the method shown above, the host destination ID(s) can be specified using
+ traditional methods of passing module parameter "hdid=" during its loading:
+
+ - from command line: "modprobe rapidio hdid=-1,7", or
+ - from modprobe configuration file using configuration command "options",
+ like in this example: "options rapidio hdid=-1,7". An example of modprobe
+ configuration file is provided in the section below.
+
+NOTES:
+ (i) if "hdid=" parameter is omitted all available mport will be assigned
+ destination ID = -1;
+
+ (ii) the "hdid=" parameter in systems with multiple mports can have
+ destination ID assignments omitted from the end of list (default = -1).
+
+If the host device ID for a specific master port is set to -1, the discovery
+process will be performed for it.
+
+The enumeration and discovery routines use RapidIO maintenance transactions
+to access the configuration space of devices.
+
+NOTE: If RapidIO switch-specific device drivers are built as loadable modules
+they must be loaded before enumeration/discovery process starts.
+This requirement is cased by the fact that enumeration/discovery methods invoke
+vendor-specific callbacks on early stages.
+
+4.2 Automatic Start of Enumeration and Discovery
+------------------------------------------------
+
+Automatic enumeration/discovery start method is applicable only to built-in
+enumeration/discovery RapidIO configuration selection. To enable automatic
+enumeration/discovery start by existing basic enumerator method set use boot
+command line parameter "rio-scan.scan=1".
+
+This configuration requires synchronized start of all RapidIO endpoints that
+form a network which will be enumerated/discovered. Discovering endpoints have
+to be started before an enumeration starts to ensure that all RapidIO
+controllers have been initialized and are ready to be discovered. Configuration
+parameter CONFIG_RAPIDIO_DISC_TIMEOUT defines time (in seconds) which
+a discovering endpoint will wait for enumeration to be completed.
+
+When automatic enumeration/discovery start is selected, basic method's
+initialization routine calls rio_init_mports() to perform enumeration or
+discovery for all known mport devices.
+
+Depending on RapidIO network size and configuration this automatic
+enumeration/discovery start method may be difficult to use due to the
+requirement for synchronized start of all endpoints.
+
+4.3 User-space Start of Enumeration and Discovery
+-------------------------------------------------
+
+User-space start of enumeration and discovery can be used with built-in and
+modular build configurations. For user-space controlled start RapidIO subsystem
+creates the sysfs write-only attribute file '/sys/bus/rapidio/scan'. To initiate
+an enumeration or discovery process on specific mport device, a user needs to
+write mport_ID (not RapidIO destination ID) into that file. The mport_ID is a
+sequential number (0 ... RIO_MAX_MPORTS) assigned during mport device
+registration. For example for machine with single RapidIO controller, mport_ID
+for that controller always will be 0.
+
+To initiate RapidIO enumeration/discovery on all available mports a user may
+write '-1' (or RIO_MPORT_ANY) into the scan attribute file.
+
+4.4 Basic Enumeration Method
+----------------------------
+
+This is an original enumeration/discovery method which is available since
+first release of RapidIO subsystem code. The enumeration process is
+implemented according to the enumeration algorithm outlined in the RapidIO
+Interconnect Specification: Annex I [1].
+
+This method can be configured as statically linked or loadable module.
+The method's single parameter "scan" allows to trigger the enumeration/discovery
+process from module initialization routine.
+
+This enumeration/discovery method can be started only once and does not support
+unloading if it is built as a module.
+
+The enumeration process traverses the network using a recursive depth-first
+algorithm. When a new device is found, the enumerator takes ownership of that
+device by writing into the Host Device ID Lock CSR. It does this to ensure that
+the enumerator has exclusive right to enumerate the device. If device ownership
+is successfully acquired, the enumerator allocates a new rio_dev structure and
+initializes it according to device capabilities.
+
+If the device is an endpoint, a unique device ID is assigned to it and its value
+is written into the device's Base Device ID CSR.
+
+If the device is a switch, the enumerator allocates an additional rio_switch
+structure to store switch specific information. Then the switch's vendor ID and
+device ID are queried against a table of known RapidIO switches. Each switch
+table entry contains a pointer to a switch-specific initialization routine that
+initializes pointers to the rest of switch specific operations, and performs
+hardware initialization if necessary. A RapidIO switch does not have a unique
+device ID; it relies on hopcount and routing for device ID of an attached
+endpoint if access to its configuration registers is required. If a switch (or
+chain of switches) does not have any endpoint (except enumerator) attached to
+it, a fake device ID will be assigned to configure a route to that switch.
+In the case of a chain of switches without endpoint, one fake device ID is used
+to configure a route through the entire chain and switches are differentiated by
+their hopcount value.
+
+For both endpoints and switches the enumerator writes a unique component tag
+into device's Component Tag CSR. That unique value is used by the error
+management notification mechanism to identify a device that is reporting an
+error management event.
+
+Enumeration beyond a switch is completed by iterating over each active egress
+port of that switch. For each active link, a route to a default device ID
+(0xFF for 8-bit systems and 0xFFFF for 16-bit systems) is temporarily written
+into the routing table. The algorithm recurs by calling itself with hopcount + 1
+and the default device ID in order to access the device on the active port.
+
+After the host has completed enumeration of the entire network it releases
+devices by clearing device ID locks (calls rio_clear_locks()). For each endpoint
+in the system, it sets the Discovered bit in the Port General Control CSR
+to indicate that enumeration is completed and agents are allowed to execute
+passive discovery of the network.
+
+The discovery process is performed by agents and is similar to the enumeration
+process that is described above. However, the discovery process is performed
+without changes to the existing routing because agents only gather information
+about RapidIO network structure and are building an internal map of discovered
+devices. This way each Linux-based component of the RapidIO subsystem has
+a complete view of the network. The discovery process can be performed
+simultaneously by several agents. After initializing its RapidIO master port
+each agent waits for enumeration completion by the host for the configured wait
+time period. If this wait time period expires before enumeration is completed,
+an agent skips RapidIO discovery and continues with remaining kernel
+initialization.
+
+4.5 Adding New Enumeration/Discovery Method
+-------------------------------------------
+
+RapidIO subsystem code organization allows addition of new enumeration/discovery
+methods as new configuration options without significant impact to the core
+RapidIO code.
+
+A new enumeration/discovery method has to be attached to one or more mport
+devices before an enumeration/discovery process can be started. Normally,
+method's module initialization routine calls rio_register_scan() to attach
+an enumerator to a specified mport device (or devices). The basic enumerator
+implementation demonstrates this process.
+
+4.6 Using Loadable RapidIO Switch Drivers
+-----------------------------------------
+
+In the case when RapidIO switch drivers are built as loadable modules a user
+must ensure that they are loaded before the enumeration/discovery starts.
+This process can be automated by specifying pre- or post- dependencies in the
+RapidIO-specific modprobe configuration file as shown in the example below.
+
+File /etc/modprobe.d/rapidio.conf::
+
+ # Configure RapidIO subsystem modules
+
+ # Set enumerator host destination ID (overrides kernel command line option)
+ options rapidio hdid=-1,2
+
+ # Load RapidIO switch drivers immediately after rapidio core module was loaded
+ softdep rapidio post: idt_gen2 idtcps tsi57x
+
+ # OR :
+
+ # Load RapidIO switch drivers just before rio-scan enumerator module is loaded
+ softdep rio-scan pre: idt_gen2 idtcps tsi57x
+
+ --------------------------
+
+NOTE:
+ In the example above, one of "softdep" commands must be removed or
+ commented out to keep required module loading sequence.
+
+5. References
+=============
+
+[1] RapidIO Trade Association. RapidIO Interconnect Specifications.
+ http://www.rapidio.org.
+
+[2] Rapidio TA. Technology Comparisons.
+ http://www.rapidio.org/education/technology_comparisons/
+
+[3] RapidIO support for Linux.
+ http://lwn.net/Articles/139118/
+
+[4] Matt Porter. RapidIO for Linux. Ottawa Linux Symposium, 2005
+ http://www.kernel.org/doc/ols/2005/ols2005v2-pages-43-56.pdf
diff --git a/Documentation/driver-api/rapidio/rio_cm.rst b/Documentation/driver-api/rapidio/rio_cm.rst
new file mode 100644
index 000000000000..5294430a7a74
--- /dev/null
+++ b/Documentation/driver-api/rapidio/rio_cm.rst
@@ -0,0 +1,135 @@
+==========================================================================
+RapidIO subsystem Channelized Messaging character device driver (rio_cm.c)
+==========================================================================
+
+
+1. Overview
+===========
+
+This device driver is the result of collaboration within the RapidIO.org
+Software Task Group (STG) between Texas Instruments, Prodrive Technologies,
+Nokia Networks, BAE and IDT. Additional input was received from other members
+of RapidIO.org.
+
+The objective was to create a character mode driver interface which exposes
+messaging capabilities of RapidIO endpoint devices (mports) directly
+to applications, in a manner that allows the numerous and varied RapidIO
+implementations to interoperate.
+
+This driver (RIO_CM) provides to user-space applications shared access to
+RapidIO mailbox messaging resources.
+
+RapidIO specification (Part 2) defines that endpoint devices may have up to four
+messaging mailboxes in case of multi-packet message (up to 4KB) and
+up to 64 mailboxes if single-packet messages (up to 256 B) are used. In addition
+to protocol definition limitations, a particular hardware implementation can
+have reduced number of messaging mailboxes. RapidIO aware applications must
+therefore share the messaging resources of a RapidIO endpoint.
+
+Main purpose of this device driver is to provide RapidIO mailbox messaging
+capability to large number of user-space processes by introducing socket-like
+operations using a single messaging mailbox. This allows applications to
+use the limited RapidIO messaging hardware resources efficiently.
+
+Most of device driver's operations are supported through 'ioctl' system calls.
+
+When loaded this device driver creates a single file system node named rio_cm
+in /dev directory common for all registered RapidIO mport devices.
+
+Following ioctl commands are available to user-space applications:
+
+- RIO_CM_MPORT_GET_LIST:
+ Returns to caller list of local mport devices that
+ support messaging operations (number of entries up to RIO_MAX_MPORTS).
+ Each list entry is combination of mport's index in the system and RapidIO
+ destination ID assigned to the port.
+- RIO_CM_EP_GET_LIST_SIZE:
+ Returns number of messaging capable remote endpoints
+ in a RapidIO network associated with the specified mport device.
+- RIO_CM_EP_GET_LIST:
+ Returns list of RapidIO destination IDs for messaging
+ capable remote endpoints (peers) available in a RapidIO network associated
+ with the specified mport device.
+- RIO_CM_CHAN_CREATE:
+ Creates RapidIO message exchange channel data structure
+ with channel ID assigned automatically or as requested by a caller.
+- RIO_CM_CHAN_BIND:
+ Binds the specified channel data structure to the specified
+ mport device.
+- RIO_CM_CHAN_LISTEN:
+ Enables listening for connection requests on the specified
+ channel.
+- RIO_CM_CHAN_ACCEPT:
+ Accepts a connection request from peer on the specified
+ channel. If wait timeout for this request is specified by a caller it is
+ a blocking call. If timeout set to 0 this is non-blocking call - ioctl
+ handler checks for a pending connection request and if one is not available
+ exits with -EGAIN error status immediately.
+- RIO_CM_CHAN_CONNECT:
+ Sends a connection request to a remote peer/channel.
+- RIO_CM_CHAN_SEND:
+ Sends a data message through the specified channel.
+ The handler for this request assumes that message buffer specified by
+ a caller includes the reserved space for a packet header required by
+ this driver.
+- RIO_CM_CHAN_RECEIVE:
+ Receives a data message through a connected channel.
+ If the channel does not have an incoming message ready to return this ioctl
+ handler will wait for new message until timeout specified by a caller
+ expires. If timeout value is set to 0, ioctl handler uses a default value
+ defined by MAX_SCHEDULE_TIMEOUT.
+- RIO_CM_CHAN_CLOSE:
+ Closes a specified channel and frees associated buffers.
+ If the specified channel is in the CONNECTED state, sends close notification
+ to the remote peer.
+
+The ioctl command codes and corresponding data structures intended for use by
+user-space applications are defined in 'include/uapi/linux/rio_cm_cdev.h'.
+
+2. Hardware Compatibility
+=========================
+
+This device driver uses standard interfaces defined by kernel RapidIO subsystem
+and therefore it can be used with any mport device driver registered by RapidIO
+subsystem with limitations set by available mport HW implementation of messaging
+mailboxes.
+
+3. Module parameters
+====================
+
+- 'dbg_level'
+ - This parameter allows to control amount of debug information
+ generated by this device driver. This parameter is formed by set of
+ bit masks that correspond to the specific functional block.
+ For mask definitions see 'drivers/rapidio/devices/rio_cm.c'
+ This parameter can be changed dynamically.
+ Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+- 'cmbox'
+ - Number of RapidIO mailbox to use (default value is 1).
+ This parameter allows to set messaging mailbox number that will be used
+ within entire RapidIO network. It can be used when default mailbox is
+ used by other device drivers or is not supported by some nodes in the
+ RapidIO network.
+
+- 'chstart'
+ - Start channel number for dynamic assignment. Default value - 256.
+ Allows to exclude channel numbers below this parameter from dynamic
+ allocation to avoid conflicts with software components that use
+ reserved predefined channel numbers.
+
+4. Known problems
+=================
+
+ None.
+
+5. User-space Applications and API Library
+==========================================
+
+Messaging API library and applications that use this device driver are available
+from RapidIO.org.
+
+6. TODO List
+============
+
+- Add support for system notification messages (reserved channel 0).
diff --git a/Documentation/driver-api/rapidio/sysfs.rst b/Documentation/driver-api/rapidio/sysfs.rst
new file mode 100644
index 000000000000..540f72683496
--- /dev/null
+++ b/Documentation/driver-api/rapidio/sysfs.rst
@@ -0,0 +1,7 @@
+=============
+Sysfs entries
+=============
+
+The RapidIO sysfs files have moved to:
+Documentation/ABI/testing/sysfs-bus-rapidio and
+Documentation/ABI/testing/sysfs-class-rapidio
diff --git a/Documentation/driver-api/rapidio/tsi721.rst b/Documentation/driver-api/rapidio/tsi721.rst
new file mode 100644
index 000000000000..42aea438cd20
--- /dev/null
+++ b/Documentation/driver-api/rapidio/tsi721.rst
@@ -0,0 +1,112 @@
+=========================================================================
+RapidIO subsystem mport driver for IDT Tsi721 PCI Express-to-SRIO bridge.
+=========================================================================
+
+1. Overview
+===========
+
+This driver implements all currently defined RapidIO mport callback functions.
+It supports maintenance read and write operations, inbound and outbound RapidIO
+doorbells, inbound maintenance port-writes and RapidIO messaging.
+
+To generate SRIO maintenance transactions this driver uses one of Tsi721 DMA
+channels. This mechanism provides access to larger range of hop counts and
+destination IDs without need for changes in outbound window translation.
+
+RapidIO messaging support uses dedicated messaging channels for each mailbox.
+For inbound messages this driver uses destination ID matching to forward messages
+into the corresponding message queue. Messaging callbacks are implemented to be
+fully compatible with RIONET driver (Ethernet over RapidIO messaging services).
+
+1. Module parameters:
+
+- 'dbg_level'
+ - This parameter allows to control amount of debug information
+ generated by this device driver. This parameter is formed by set of
+ This parameter can be changed bit masks that correspond to the specific
+ functional block.
+ For mask definitions see 'drivers/rapidio/devices/tsi721.h'
+ This parameter can be changed dynamically.
+ Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+- 'dma_desc_per_channel'
+ - This parameter defines number of hardware buffer
+ descriptors allocated for each registered Tsi721 DMA channel.
+ Its default value is 128.
+
+- 'dma_txqueue_sz'
+ - DMA transactions queue size. Defines number of pending
+ transaction requests that can be accepted by each DMA channel.
+ Default value is 16.
+
+- 'dma_sel'
+ - DMA channel selection mask. Bitmask that defines which hardware
+ DMA channels (0 ... 6) will be registered with DmaEngine core.
+ If bit is set to 1, the corresponding DMA channel will be registered.
+ DMA channels not selected by this mask will not be used by this device
+ driver. Default value is 0x7f (use all channels).
+
+- 'pcie_mrrs'
+ - override value for PCIe Maximum Read Request Size (MRRS).
+ This parameter gives an ability to override MRRS value set during PCIe
+ configuration process. Tsi721 supports read request sizes up to 4096B.
+ Value for this parameter must be set as defined by PCIe specification:
+ 0 = 128B, 1 = 256B, 2 = 512B, 3 = 1024B, 4 = 2048B and 5 = 4096B.
+ Default value is '-1' (= keep platform setting).
+
+- 'mbox_sel'
+ - RIO messaging MBOX selection mask. This is a bitmask that defines
+ messaging MBOXes are managed by this device driver. Mask bits 0 - 3
+ correspond to MBOX0 - MBOX3. MBOX is under driver's control if the
+ corresponding bit is set to '1'. Default value is 0x0f (= all).
+
+2. Known problems
+=================
+
+ None.
+
+3. DMA Engine Support
+=====================
+
+Tsi721 mport driver supports DMA data transfers between local system memory and
+remote RapidIO devices. This functionality is implemented according to SLAVE
+mode API defined by common Linux kernel DMA Engine framework.
+
+Depending on system requirements RapidIO DMA operations can be included/excluded
+by setting CONFIG_RAPIDIO_DMA_ENGINE option. Tsi721 miniport driver uses seven
+out of eight available BDMA channels to support DMA data transfers.
+One BDMA channel is reserved for generation of maintenance read/write requests.
+
+If Tsi721 mport driver have been built with RAPIDIO_DMA_ENGINE support included,
+this driver will accept DMA-specific module parameter:
+
+ "dma_desc_per_channel"
+ - defines number of hardware buffer descriptors used by
+ each BDMA channel of Tsi721 (by default - 128).
+
+4. Version History
+
+ ===== ====================================================================
+ 1.1.0 DMA operations re-worked to support data scatter/gather lists larger
+ than hardware buffer descriptors ring.
+ 1.0.0 Initial driver release.
+ ===== ====================================================================
+
+5. License
+===========
+
+ Copyright(c) 2011 Integrated Device Technology, Inc. All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ more details.
+
+ You should have received a copy of the GNU General Public License along with
+ this program; if not, write to the Free Software Foundation, Inc.,
+ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
diff --git a/Documentation/rfkill.txt b/Documentation/driver-api/rfkill.rst
index 7d3684e81df6..7d3684e81df6 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/driver-api/rfkill.rst
diff --git a/Documentation/driver-api/s390-drivers.rst b/Documentation/driver-api/s390-drivers.rst
index 30e6aa7e160b..5158577bc29b 100644
--- a/Documentation/driver-api/s390-drivers.rst
+++ b/Documentation/driver-api/s390-drivers.rst
@@ -27,7 +27,7 @@ not strictly considered I/O devices. They are considered here as well,
although they are not the focus of this document.
Some additional information can also be found in the kernel source under
-Documentation/s390/driver-model.txt.
+Documentation/s390/driver-model.rst.
The css bus
===========
@@ -38,7 +38,7 @@ into several categories:
* Standard I/O subchannels, for use by the system. They have a child
device on the ccw bus and are described below.
* I/O subchannels bound to the vfio-ccw driver. See
- Documentation/s390/vfio-ccw.txt.
+ Documentation/s390/vfio-ccw.rst.
* Message subchannels. No Linux driver currently exists.
* CHSC subchannels (at most one). The chsc subchannel driver can be used
to send asynchronous chsc commands.
diff --git a/Documentation/serial/cyclades_z.rst b/Documentation/driver-api/serial/cyclades_z.rst
index 532ff67e2f1c..532ff67e2f1c 100644
--- a/Documentation/serial/cyclades_z.rst
+++ b/Documentation/driver-api/serial/cyclades_z.rst
diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst
new file mode 100644
index 000000000000..31bd4e16fb1f
--- /dev/null
+++ b/Documentation/driver-api/serial/driver.rst
@@ -0,0 +1,549 @@
+====================
+Low Level Serial API
+====================
+
+
+This document is meant as a brief overview of some aspects of the new serial
+driver. It is not complete, any questions you have should be directed to
+<rmk@arm.linux.org.uk>
+
+The reference implementation is contained within amba-pl011.c.
+
+
+
+Low Level Serial Hardware Driver
+--------------------------------
+
+The low level serial hardware driver is responsible for supplying port
+information (defined by uart_port) and a set of control methods (defined
+by uart_ops) to the core serial driver. The low level driver is also
+responsible for handling interrupts for the port, and providing any
+console support.
+
+
+Console Support
+---------------
+
+The serial core provides a few helper functions. This includes identifing
+the correct port structure (via uart_get_console) and decoding command line
+arguments (uart_parse_options).
+
+There is also a helper function (uart_console_write) which performs a
+character by character write, translating newlines to CRLF sequences.
+Driver writers are recommended to use this function rather than implementing
+their own version.
+
+
+Locking
+-------
+
+It is the responsibility of the low level hardware driver to perform the
+necessary locking using port->lock. There are some exceptions (which
+are described in the uart_ops listing below.)
+
+There are two locks. A per-port spinlock, and an overall semaphore.
+
+From the core driver perspective, the port->lock locks the following
+data::
+
+ port->mctrl
+ port->icount
+ port->state->xmit.head (circ_buf->head)
+ port->state->xmit.tail (circ_buf->tail)
+
+The low level driver is free to use this lock to provide any additional
+locking.
+
+The port_sem semaphore is used to protect against ports being added/
+removed or reconfigured at inappropriate times. Since v2.6.27, this
+semaphore has been the 'mutex' member of the tty_port struct, and
+commonly referred to as the port mutex.
+
+
+uart_ops
+--------
+
+The uart_ops structure is the main interface between serial_core and the
+hardware specific driver. It contains all the methods to control the
+hardware.
+
+ tx_empty(port)
+ This function tests whether the transmitter fifo and shifter
+ for the port described by 'port' is empty. If it is empty,
+ this function should return TIOCSER_TEMT, otherwise return 0.
+ If the port does not support this operation, then it should
+ return TIOCSER_TEMT.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ This call must not sleep
+
+ set_mctrl(port, mctrl)
+ This function sets the modem control lines for port described
+ by 'port' to the state described by mctrl. The relevant bits
+ of mctrl are:
+
+ - TIOCM_RTS RTS signal.
+ - TIOCM_DTR DTR signal.
+ - TIOCM_OUT1 OUT1 signal.
+ - TIOCM_OUT2 OUT2 signal.
+ - TIOCM_LOOP Set the port into loopback mode.
+
+ If the appropriate bit is set, the signal should be driven
+ active. If the bit is clear, the signal should be driven
+ inactive.
+
+ Locking: port->lock taken.
+
+ Interrupts: locally disabled.
+
+ This call must not sleep
+
+ get_mctrl(port)
+ Returns the current state of modem control inputs. The state
+ of the outputs should not be returned, since the core keeps
+ track of their state. The state information should include:
+
+ - TIOCM_CAR state of DCD signal
+ - TIOCM_CTS state of CTS signal
+ - TIOCM_DSR state of DSR signal
+ - TIOCM_RI state of RI signal
+
+ The bit is set if the signal is currently driven active. If
+ the port does not support CTS, DCD or DSR, the driver should
+ indicate that the signal is permanently active. If RI is
+ not available, the signal should not be indicated as active.
+
+ Locking: port->lock taken.
+
+ Interrupts: locally disabled.
+
+ This call must not sleep
+
+ stop_tx(port)
+ Stop transmitting characters. This might be due to the CTS
+ line becoming inactive or the tty layer indicating we want
+ to stop transmission due to an XOFF character.
+
+ The driver should stop transmitting characters as soon as
+ possible.
+
+ Locking: port->lock taken.
+
+ Interrupts: locally disabled.
+
+ This call must not sleep
+
+ start_tx(port)
+ Start transmitting characters.
+
+ Locking: port->lock taken.
+
+ Interrupts: locally disabled.
+
+ This call must not sleep
+
+ throttle(port)
+ Notify the serial driver that input buffers for the line discipline are
+ close to full, and it should somehow signal that no more characters
+ should be sent to the serial port.
+ This will be called only if hardware assisted flow control is enabled.
+
+ Locking: serialized with .unthrottle() and termios modification by the
+ tty layer.
+
+ unthrottle(port)
+ Notify the serial driver that characters can now be sent to the serial
+ port without fear of overrunning the input buffers of the line
+ disciplines.
+
+ This will be called only if hardware assisted flow control is enabled.
+
+ Locking: serialized with .throttle() and termios modification by the
+ tty layer.
+
+ send_xchar(port,ch)
+ Transmit a high priority character, even if the port is stopped.
+ This is used to implement XON/XOFF flow control and tcflow(). If
+ the serial driver does not implement this function, the tty core
+ will append the character to the circular buffer and then call
+ start_tx() / stop_tx() to flush the data out.
+
+ Do not transmit if ch == '\0' (__DISABLED_CHAR).
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ stop_rx(port)
+ Stop receiving characters; the port is in the process of
+ being closed.
+
+ Locking: port->lock taken.
+
+ Interrupts: locally disabled.
+
+ This call must not sleep
+
+ enable_ms(port)
+ Enable the modem status interrupts.
+
+ This method may be called multiple times. Modem status
+ interrupts should be disabled when the shutdown method is
+ called.
+
+ Locking: port->lock taken.
+
+ Interrupts: locally disabled.
+
+ This call must not sleep
+
+ break_ctl(port,ctl)
+ Control the transmission of a break signal. If ctl is
+ nonzero, the break signal should be transmitted. The signal
+ should be terminated when another call is made with a zero
+ ctl.
+
+ Locking: caller holds tty_port->mutex
+
+ startup(port)
+ Grab any interrupt resources and initialise any low level driver
+ state. Enable the port for reception. It should not activate
+ RTS nor DTR; this will be done via a separate call to set_mctrl.
+
+ This method will only be called when the port is initially opened.
+
+ Locking: port_sem taken.
+
+ Interrupts: globally disabled.
+
+ shutdown(port)
+ Disable the port, disable any break condition that may be in
+ effect, and free any interrupt resources. It should not disable
+ RTS nor DTR; this will have already been done via a separate
+ call to set_mctrl.
+
+ Drivers must not access port->state once this call has completed.
+
+ This method will only be called when there are no more users of
+ this port.
+
+ Locking: port_sem taken.
+
+ Interrupts: caller dependent.
+
+ flush_buffer(port)
+ Flush any write buffers, reset any DMA state and stop any
+ ongoing DMA transfers.
+
+ This will be called whenever the port->state->xmit circular
+ buffer is cleared.
+
+ Locking: port->lock taken.
+
+ Interrupts: locally disabled.
+
+ This call must not sleep
+
+ set_termios(port,termios,oldtermios)
+ Change the port parameters, including word length, parity, stop
+ bits. Update read_status_mask and ignore_status_mask to indicate
+ the types of events we are interested in receiving. Relevant
+ termios->c_cflag bits are:
+
+ CSIZE
+ - word size
+ CSTOPB
+ - 2 stop bits
+ PARENB
+ - parity enable
+ PARODD
+ - odd parity (when PARENB is in force)
+ CREAD
+ - enable reception of characters (if not set,
+ still receive characters from the port, but
+ throw them away.
+ CRTSCTS
+ - if set, enable CTS status change reporting
+ CLOCAL
+ - if not set, enable modem status change
+ reporting.
+
+ Relevant termios->c_iflag bits are:
+
+ INPCK
+ - enable frame and parity error events to be
+ passed to the TTY layer.
+ BRKINT / PARMRK
+ - both of these enable break events to be
+ passed to the TTY layer.
+
+ IGNPAR
+ - ignore parity and framing errors
+ IGNBRK
+ - ignore break errors, If IGNPAR is also
+ set, ignore overrun errors as well.
+
+ The interaction of the iflag bits is as follows (parity error
+ given as an example):
+
+ =============== ======= ====== =============================
+ Parity error INPCK IGNPAR
+ =============== ======= ====== =============================
+ n/a 0 n/a character received, marked as
+ TTY_NORMAL
+ None 1 n/a character received, marked as
+ TTY_NORMAL
+ Yes 1 0 character received, marked as
+ TTY_PARITY
+ Yes 1 1 character discarded
+ =============== ======= ====== =============================
+
+ Other flags may be used (eg, xon/xoff characters) if your
+ hardware supports hardware "soft" flow control.
+
+ Locking: caller holds tty_port->mutex
+
+ Interrupts: caller dependent.
+
+ This call must not sleep
+
+ set_ldisc(port,termios)
+ Notifier for discipline change. See Documentation/driver-api/serial/tty.rst.
+
+ Locking: caller holds tty_port->mutex
+
+ pm(port,state,oldstate)
+ Perform any power management related activities on the specified
+ port. State indicates the new state (defined by
+ enum uart_pm_state), oldstate indicates the previous state.
+
+ This function should not be used to grab any resources.
+
+ This will be called when the port is initially opened and finally
+ closed, except when the port is also the system console. This
+ will occur even if CONFIG_PM is not set.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ type(port)
+ Return a pointer to a string constant describing the specified
+ port, or return NULL, in which case the string 'unknown' is
+ substituted.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ release_port(port)
+ Release any memory and IO region resources currently in use by
+ the port.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ request_port(port)
+ Request any memory and IO region resources required by the port.
+ If any fail, no resources should be registered when this function
+ returns, and it should return -EBUSY on failure.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ config_port(port,type)
+ Perform any autoconfiguration steps required for the port. `type`
+ contains a bit mask of the required configuration. UART_CONFIG_TYPE
+ indicates that the port requires detection and identification.
+ port->type should be set to the type found, or PORT_UNKNOWN if
+ no port was detected.
+
+ UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
+ which should be probed using standard kernel autoprobing techniques.
+ This is not necessary on platforms where ports have interrupts
+ internally hard wired (eg, system on a chip implementations).
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ verify_port(port,serinfo)
+ Verify the new serial port information contained within serinfo is
+ suitable for this port type.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ ioctl(port,cmd,arg)
+ Perform any port specific IOCTLs. IOCTL commands must be defined
+ using the standard numbering system found in <asm/ioctl.h>
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ poll_init(port)
+ Called by kgdb to perform the minimal hardware initialization needed
+ to support poll_put_char() and poll_get_char(). Unlike ->startup()
+ this should not request interrupts.
+
+ Locking: tty_mutex and tty_port->mutex taken.
+
+ Interrupts: n/a.
+
+ poll_put_char(port,ch)
+ Called by kgdb to write a single character directly to the serial
+ port. It can and should block until there is space in the TX FIFO.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ This call must not sleep
+
+ poll_get_char(port)
+ Called by kgdb to read a single character directly from the serial
+ port. If data is available, it should be returned; otherwise
+ the function should return NO_POLL_CHAR immediately.
+
+ Locking: none.
+
+ Interrupts: caller dependent.
+
+ This call must not sleep
+
+Other functions
+---------------
+
+uart_update_timeout(port,cflag,baud)
+ Update the FIFO drain timeout, port->timeout, according to the
+ number of bits, parity, stop bits and baud rate.
+
+ Locking: caller is expected to take port->lock
+
+ Interrupts: n/a
+
+uart_get_baud_rate(port,termios,old,min,max)
+ Return the numeric baud rate for the specified termios, taking
+ account of the special 38400 baud "kludge". The B0 baud rate
+ is mapped to 9600 baud.
+
+ If the baud rate is not within min..max, then if old is non-NULL,
+ the original baud rate will be tried. If that exceeds the
+ min..max constraint, 9600 baud will be returned. termios will
+ be updated to the baud rate in use.
+
+ Note: min..max must always allow 9600 baud to be selected.
+
+ Locking: caller dependent.
+
+ Interrupts: n/a
+
+uart_get_divisor(port,baud)
+ Return the divisor (baud_base / baud) for the specified baud
+ rate, appropriately rounded.
+
+ If 38400 baud and custom divisor is selected, return the
+ custom divisor instead.
+
+ Locking: caller dependent.
+
+ Interrupts: n/a
+
+uart_match_port(port1,port2)
+ This utility function can be used to determine whether two
+ uart_port structures describe the same port.
+
+ Locking: n/a
+
+ Interrupts: n/a
+
+uart_write_wakeup(port)
+ A driver is expected to call this function when the number of
+ characters in the transmit buffer have dropped below a threshold.
+
+ Locking: port->lock should be held.
+
+ Interrupts: n/a
+
+uart_register_driver(drv)
+ Register a uart driver with the core driver. We in turn register
+ with the tty layer, and initialise the core driver per-port state.
+
+ drv->port should be NULL, and the per-port structures should be
+ registered using uart_add_one_port after this call has succeeded.
+
+ Locking: none
+
+ Interrupts: enabled
+
+uart_unregister_driver()
+ Remove all references to a driver from the core driver. The low
+ level driver must have removed all its ports via the
+ uart_remove_one_port() if it registered them with uart_add_one_port().
+
+ Locking: none
+
+ Interrupts: enabled
+
+**uart_suspend_port()**
+
+**uart_resume_port()**
+
+**uart_add_one_port()**
+
+**uart_remove_one_port()**
+
+Other notes
+-----------
+
+It is intended some day to drop the 'unused' entries from uart_port, and
+allow low level drivers to register their own individual uart_port's with
+the core. This will allow drivers to use uart_port as a pointer to a
+structure containing both the uart_port entry with their own extensions,
+thus::
+
+ struct my_port {
+ struct uart_port port;
+ int my_stuff;
+ };
+
+Modem control lines via GPIO
+----------------------------
+
+Some helpers are provided in order to set/get modem control lines via GPIO.
+
+mctrl_gpio_init(port, idx):
+ This will get the {cts,rts,...}-gpios from device tree if they are
+ present and request them, set direction etc, and return an
+ allocated structure. `devm_*` functions are used, so there's no need
+ to call mctrl_gpio_free().
+ As this sets up the irq handling make sure to not handle changes to the
+ gpio input lines in your driver, too.
+
+mctrl_gpio_free(dev, gpios):
+ This will free the requested gpios in mctrl_gpio_init().
+ As `devm_*` functions are used, there's generally no need to call
+ this function.
+
+mctrl_gpio_to_gpiod(gpios, gidx)
+ This returns the gpio_desc structure associated to the modem line
+ index.
+
+mctrl_gpio_set(gpios, mctrl):
+ This will sets the gpios according to the mctrl state.
+
+mctrl_gpio_get(gpios, mctrl):
+ This will update mctrl with the gpios values.
+
+mctrl_gpio_enable_ms(gpios):
+ Enables irqs and handling of changes to the ms lines.
+
+mctrl_gpio_disable_ms(gpios):
+ Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/driver-api/serial/index.rst b/Documentation/driver-api/serial/index.rst
new file mode 100644
index 000000000000..33ad10d05b26
--- /dev/null
+++ b/Documentation/driver-api/serial/index.rst
@@ -0,0 +1,32 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
+Support for Serial devices
+==========================
+
+.. toctree::
+ :maxdepth: 1
+
+
+ driver
+ tty
+
+Serial drivers
+==============
+
+.. toctree::
+ :maxdepth: 1
+
+ cyclades_z
+ moxa-smartio
+ n_gsm
+ rocket
+ serial-iso7816
+ serial-rs485
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/serial/moxa-smartio.rst b/Documentation/driver-api/serial/moxa-smartio.rst
index 156100f17c3f..156100f17c3f 100644
--- a/Documentation/serial/moxa-smartio.rst
+++ b/Documentation/driver-api/serial/moxa-smartio.rst
diff --git a/Documentation/serial/n_gsm.rst b/Documentation/driver-api/serial/n_gsm.rst
index f3ad9fd26408..f3ad9fd26408 100644
--- a/Documentation/serial/n_gsm.rst
+++ b/Documentation/driver-api/serial/n_gsm.rst
diff --git a/Documentation/serial/rocket.rst b/Documentation/driver-api/serial/rocket.rst
index 23761eae4282..23761eae4282 100644
--- a/Documentation/serial/rocket.rst
+++ b/Documentation/driver-api/serial/rocket.rst
diff --git a/Documentation/serial/serial-iso7816.rst b/Documentation/driver-api/serial/serial-iso7816.rst
index d990143de0c6..d990143de0c6 100644
--- a/Documentation/serial/serial-iso7816.rst
+++ b/Documentation/driver-api/serial/serial-iso7816.rst
diff --git a/Documentation/serial/serial-rs485.rst b/Documentation/driver-api/serial/serial-rs485.rst
index 6bc824f948f9..6bc824f948f9 100644
--- a/Documentation/serial/serial-rs485.rst
+++ b/Documentation/driver-api/serial/serial-rs485.rst
diff --git a/Documentation/serial/tty.rst b/Documentation/driver-api/serial/tty.rst
index dd972caacf3e..dd972caacf3e 100644
--- a/Documentation/serial/tty.rst
+++ b/Documentation/driver-api/serial/tty.rst
diff --git a/Documentation/sgi-ioc4.txt b/Documentation/driver-api/sgi-ioc4.rst
index 72709222d3c0..72709222d3c0 100644
--- a/Documentation/sgi-ioc4.txt
+++ b/Documentation/driver-api/sgi-ioc4.rst
diff --git a/Documentation/SM501.txt b/Documentation/driver-api/sm501.rst
index 882507453ba4..882507453ba4 100644
--- a/Documentation/SM501.txt
+++ b/Documentation/driver-api/sm501.rst
diff --git a/Documentation/smsc_ece1099.txt b/Documentation/driver-api/smsc_ece1099.rst
index 079277421eaf..079277421eaf 100644
--- a/Documentation/smsc_ece1099.txt
+++ b/Documentation/driver-api/smsc_ece1099.rst
diff --git a/Documentation/driver-api/soundwire/locking.rst b/Documentation/driver-api/soundwire/locking.rst
index 253f73555255..3a7ffb3d87f3 100644
--- a/Documentation/driver-api/soundwire/locking.rst
+++ b/Documentation/driver-api/soundwire/locking.rst
@@ -44,7 +44,9 @@ Message transfer.
b. Transfer message (Read/Write) to Slave1 or broadcast message on
Bus in case of bank switch.
- c. Release Message lock ::
+ c. Release Message lock
+
+ ::
+----------+ +---------+
| | | |
diff --git a/Documentation/driver-api/switchtec.rst b/Documentation/driver-api/switchtec.rst
new file mode 100644
index 000000000000..7611fdc53e19
--- /dev/null
+++ b/Documentation/driver-api/switchtec.rst
@@ -0,0 +1,102 @@
+========================
+Linux Switchtec Support
+========================
+
+Microsemi's "Switchtec" line of PCI switch devices is already
+supported by the kernel with standard PCI switch drivers. However, the
+Switchtec device advertises a special management endpoint which
+enables some additional functionality. This includes:
+
+* Packet and Byte Counters
+* Firmware Upgrades
+* Event and Error logs
+* Querying port link status
+* Custom user firmware commands
+
+The switchtec kernel module implements this functionality.
+
+
+Interface
+=========
+
+The primary means of communicating with the Switchtec management firmware is
+through the Memory-mapped Remote Procedure Call (MRPC) interface.
+Commands are submitted to the interface with a 4-byte command
+identifier and up to 1KB of command specific data. The firmware will
+respond with a 4-byte return code and up to 1KB of command-specific
+data. The interface only processes a single command at a time.
+
+
+Userspace Interface
+===================
+
+The MRPC interface will be exposed to userspace through a simple char
+device: /dev/switchtec#, one for each management endpoint in the system.
+
+The char device has the following semantics:
+
+* A write must consist of at least 4 bytes and no more than 1028 bytes.
+ The first 4 bytes will be interpreted as the Command ID and the
+ remainder will be used as the input data. A write will send the
+ command to the firmware to begin processing.
+
+* Each write must be followed by exactly one read. Any double write will
+ produce an error and any read that doesn't follow a write will
+ produce an error.
+
+* A read will block until the firmware completes the command and return
+ the 4-byte Command Return Value plus up to 1024 bytes of output
+ data. (The length will be specified by the size parameter of the read
+ call -- reading less than 4 bytes will produce an error.)
+
+* The poll call will also be supported for userspace applications that
+ need to do other things while waiting for the command to complete.
+
+The following IOCTLs are also supported by the device:
+
+* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number
+ of partitions in the device.
+
+* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for
+ any specified partition in flash.
+
+* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps
+ indicating all uncleared events.
+
+* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags
+ for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct
+ with the event_id, index and flags set (index being the partition or PFF
+ number for non-global events). It returns whether the event has
+ occurred, the number of times and any event specific data. The flags
+ can be used to clear the count or enable and disable actions to
+ happen when the event occurs.
+ By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag,
+ you can set an event to trigger a poll command to return with
+ POLLPRI. In this way, userspace can wait for events to occur.
+
+* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert
+ between PCI Function Framework number (used by the event system)
+ and Switchtec Logic Port ID and Partition number (which is more
+ user friendly).
+
+
+Non-Transparent Bridge (NTB) Driver
+===================================
+
+An NTB hardware driver is provided for the Switchtec hardware in
+ntb_hw_switchtec. Currently, it only supports switches configured with
+exactly 2 NT partitions and zero or more non-NT partitions. It also requires
+the following configuration settings:
+
+* Both NT partitions must be able to access each other's GAS spaces.
+ Thus, the bits in the GAS Access Vector under Management Settings
+ must be set to support this.
+* Kernel configuration MUST include support for NTB (CONFIG_NTB needs
+ to be set)
+
+NT EP BAR 2 will be dynamically configured as a Direct Window, and
+the configuration file does not need to configure it explicitly.
+
+Please refer to Documentation/driver-api/ntb.rst in Linux source tree for an overall
+understanding of the Linux NTB stack. ntb_hw_switchtec works as an NTB
+Hardware Driver in this stack.
diff --git a/Documentation/sync_file.txt b/Documentation/driver-api/sync_file.rst
index 496fb2c3b3e6..496fb2c3b3e6 100644
--- a/Documentation/sync_file.txt
+++ b/Documentation/driver-api/sync_file.rst
diff --git a/Documentation/driver-api/uio-howto.rst b/Documentation/driver-api/uio-howto.rst
index 25f50eace28b..8fecfa11d4ff 100644
--- a/Documentation/driver-api/uio-howto.rst
+++ b/Documentation/driver-api/uio-howto.rst
@@ -276,8 +276,8 @@ fields of ``struct uio_mem``:
- ``int memtype``: Required if the mapping is used. Set this to
``UIO_MEM_PHYS`` if you you have physical memory on your card to be
mapped. Use ``UIO_MEM_LOGICAL`` for logical memory (e.g. allocated
- with :c:func:`kmalloc()`). There's also ``UIO_MEM_VIRTUAL`` for
- virtual memory.
+ with :c:func:`__get_free_pages()` but not kmalloc()). There's also
+ ``UIO_MEM_VIRTUAL`` for virtual memory.
- ``phys_addr_t addr``: Required if the mapping is used. Fill in the
address of your memory block. This address is the one that appears in
diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst
index 4a74cf6f2797..2525c3622cae 100644
--- a/Documentation/driver-api/usb/power-management.rst
+++ b/Documentation/driver-api/usb/power-management.rst
@@ -46,7 +46,7 @@ device is turned off while the system as a whole remains running, we
call it a "dynamic suspend" (also known as a "runtime suspend" or
"selective suspend"). This document concentrates mostly on how
dynamic PM is implemented in the USB subsystem, although system PM is
-covered to some extent (see ``Documentation/power/*.txt`` for more
+covered to some extent (see ``Documentation/power/*.rst`` for more
information about system PM).
System PM support is present only if the kernel was built with
diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst
new file mode 100644
index 000000000000..25eb7d5b834b
--- /dev/null
+++ b/Documentation/driver-api/vfio-mediated-device.rst
@@ -0,0 +1,414 @@
+.. include:: <isonum.txt>
+
+=====================
+VFIO Mediated devices
+=====================
+
+:Copyright: |copy| 2016, NVIDIA CORPORATION. All rights reserved.
+:Author: Neo Jia <cjia@nvidia.com>
+:Author: Kirti Wankhede <kwankhede@nvidia.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 2 as
+published by the Free Software Foundation.
+
+
+Virtual Function I/O (VFIO) Mediated devices[1]
+===============================================
+
+The number of use cases for virtualizing DMA devices that do not have built-in
+SR_IOV capability is increasing. Previously, to virtualize such devices,
+developers had to create their own management interfaces and APIs, and then
+integrate them with user space software. To simplify integration with user space
+software, we have identified common requirements and a unified management
+interface for such devices.
+
+The VFIO driver framework provides unified APIs for direct device access. It is
+an IOMMU/device-agnostic framework for exposing direct device access to user
+space in a secure, IOMMU-protected environment. This framework is used for
+multiple devices, such as GPUs, network adapters, and compute accelerators. With
+direct device access, virtual machines or user space applications have direct
+access to the physical device. This framework is reused for mediated devices.
+
+The mediated core driver provides a common interface for mediated device
+management that can be used by drivers of different devices. This module
+provides a generic interface to perform these operations:
+
+* Create and destroy a mediated device
+* Add a mediated device to and remove it from a mediated bus driver
+* Add a mediated device to and remove it from an IOMMU group
+
+The mediated core driver also provides an interface to register a bus driver.
+For example, the mediated VFIO mdev driver is designed for mediated devices and
+supports VFIO APIs. The mediated bus driver adds a mediated device to and
+removes it from a VFIO group.
+
+The following high-level block diagram shows the main components and interfaces
+in the VFIO mediated driver framework. The diagram shows NVIDIA, Intel, and IBM
+devices as examples, as these devices are the first devices to use this module::
+
+ +---------------+
+ | |
+ | +-----------+ | mdev_register_driver() +--------------+
+ | | | +<------------------------+ |
+ | | mdev | | | |
+ | | bus | +------------------------>+ vfio_mdev.ko |<-> VFIO user
+ | | driver | | probe()/remove() | | APIs
+ | | | | +--------------+
+ | +-----------+ |
+ | |
+ | MDEV CORE |
+ | MODULE |
+ | mdev.ko |
+ | +-----------+ | mdev_register_device() +--------------+
+ | | | +<------------------------+ |
+ | | | | | nvidia.ko |<-> physical
+ | | | +------------------------>+ | device
+ | | | | callbacks +--------------+
+ | | Physical | |
+ | | device | | mdev_register_device() +--------------+
+ | | interface | |<------------------------+ |
+ | | | | | i915.ko |<-> physical
+ | | | +------------------------>+ | device
+ | | | | callbacks +--------------+
+ | | | |
+ | | | | mdev_register_device() +--------------+
+ | | | +<------------------------+ |
+ | | | | | ccw_device.ko|<-> physical
+ | | | +------------------------>+ | device
+ | | | | callbacks +--------------+
+ | +-----------+ |
+ +---------------+
+
+
+Registration Interfaces
+=======================
+
+The mediated core driver provides the following types of registration
+interfaces:
+
+* Registration interface for a mediated bus driver
+* Physical device driver interface
+
+Registration Interface for a Mediated Bus Driver
+------------------------------------------------
+
+The registration interface for a mediated bus driver provides the following
+structure to represent a mediated device's driver::
+
+ /*
+ * struct mdev_driver [2] - Mediated device's driver
+ * @name: driver name
+ * @probe: called when new device created
+ * @remove: called when device removed
+ * @driver: device driver structure
+ */
+ struct mdev_driver {
+ const char *name;
+ int (*probe) (struct device *dev);
+ void (*remove) (struct device *dev);
+ struct device_driver driver;
+ };
+
+A mediated bus driver for mdev should use this structure in the function calls
+to register and unregister itself with the core driver:
+
+* Register::
+
+ extern int mdev_register_driver(struct mdev_driver *drv,
+ struct module *owner);
+
+* Unregister::
+
+ extern void mdev_unregister_driver(struct mdev_driver *drv);
+
+The mediated bus driver is responsible for adding mediated devices to the VFIO
+group when devices are bound to the driver and removing mediated devices from
+the VFIO when devices are unbound from the driver.
+
+
+Physical Device Driver Interface
+--------------------------------
+
+The physical device driver interface provides the mdev_parent_ops[3] structure
+to define the APIs to manage work in the mediated core driver that is related
+to the physical device.
+
+The structures in the mdev_parent_ops structure are as follows:
+
+* dev_attr_groups: attributes of the parent device
+* mdev_attr_groups: attributes of the mediated device
+* supported_config: attributes to define supported configurations
+
+The functions in the mdev_parent_ops structure are as follows:
+
+* create: allocate basic resources in a driver for a mediated device
+* remove: free resources in a driver when a mediated device is destroyed
+
+(Note that mdev-core provides no implicit serialization of create/remove
+callbacks per mdev parent device, per mdev type, or any other categorization.
+Vendor drivers are expected to be fully asynchronous in this respect or
+provide their own internal resource protection.)
+
+The callbacks in the mdev_parent_ops structure are as follows:
+
+* open: open callback of mediated device
+* close: close callback of mediated device
+* ioctl: ioctl callback of mediated device
+* read : read emulation callback
+* write: write emulation callback
+* mmap: mmap emulation callback
+
+A driver should use the mdev_parent_ops structure in the function call to
+register itself with the mdev core driver::
+
+ extern int mdev_register_device(struct device *dev,
+ const struct mdev_parent_ops *ops);
+
+However, the mdev_parent_ops structure is not required in the function call
+that a driver should use to unregister itself with the mdev core driver::
+
+ extern void mdev_unregister_device(struct device *dev);
+
+
+Mediated Device Management Interface Through sysfs
+==================================================
+
+The management interface through sysfs enables user space software, such as
+libvirt, to query and configure mediated devices in a hardware-agnostic fashion.
+This management interface provides flexibility to the underlying physical
+device's driver to support features such as:
+
+* Mediated device hot plug
+* Multiple mediated devices in a single virtual machine
+* Multiple mediated devices from different physical devices
+
+Links in the mdev_bus Class Directory
+-------------------------------------
+The /sys/class/mdev_bus/ directory contains links to devices that are registered
+with the mdev core driver.
+
+Directories and files under the sysfs for Each Physical Device
+--------------------------------------------------------------
+
+::
+
+ |- [parent physical device]
+ |--- Vendor-specific-attributes [optional]
+ |--- [mdev_supported_types]
+ | |--- [<type-id>]
+ | | |--- create
+ | | |--- name
+ | | |--- available_instances
+ | | |--- device_api
+ | | |--- description
+ | | |--- [devices]
+ | |--- [<type-id>]
+ | | |--- create
+ | | |--- name
+ | | |--- available_instances
+ | | |--- device_api
+ | | |--- description
+ | | |--- [devices]
+ | |--- [<type-id>]
+ | |--- create
+ | |--- name
+ | |--- available_instances
+ | |--- device_api
+ | |--- description
+ | |--- [devices]
+
+* [mdev_supported_types]
+
+ The list of currently supported mediated device types and their details.
+
+ [<type-id>], device_api, and available_instances are mandatory attributes
+ that should be provided by vendor driver.
+
+* [<type-id>]
+
+ The [<type-id>] name is created by adding the device driver string as a prefix
+ to the string provided by the vendor driver. This format of this name is as
+ follows::
+
+ sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name);
+
+ (or using mdev_parent_dev(mdev) to arrive at the parent device outside
+ of the core mdev code)
+
+* device_api
+
+ This attribute should show which device API is being created, for example,
+ "vfio-pci" for a PCI device.
+
+* available_instances
+
+ This attribute should show the number of devices of type <type-id> that can be
+ created.
+
+* [device]
+
+ This directory contains links to the devices of type <type-id> that have been
+ created.
+
+* name
+
+ This attribute should show human readable name. This is optional attribute.
+
+* description
+
+ This attribute should show brief features/description of the type. This is
+ optional attribute.
+
+Directories and Files Under the sysfs for Each mdev Device
+----------------------------------------------------------
+
+::
+
+ |- [parent phy device]
+ |--- [$MDEV_UUID]
+ |--- remove
+ |--- mdev_type {link to its type}
+ |--- vendor-specific-attributes [optional]
+
+* remove (write only)
+
+Writing '1' to the 'remove' file destroys the mdev device. The vendor driver can
+fail the remove() callback if that device is active and the vendor driver
+doesn't support hot unplug.
+
+Example::
+
+ # echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
+
+Mediated device Hot plug
+------------------------
+
+Mediated devices can be created and assigned at runtime. The procedure to hot
+plug a mediated device is the same as the procedure to hot plug a PCI device.
+
+Translation APIs for Mediated Devices
+=====================================
+
+The following APIs are provided for translating user pfn to host pfn in a VFIO
+driver::
+
+ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
+ int npage, int prot, unsigned long *phys_pfn);
+
+ extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
+ int npage);
+
+These functions call back into the back-end IOMMU module by using the pin_pages
+and unpin_pages callbacks of the struct vfio_iommu_driver_ops[4]. Currently
+these callbacks are supported in the TYPE1 IOMMU module. To enable them for
+other IOMMU backend modules, such as PPC64 sPAPR module, they need to provide
+these two callback functions.
+
+Using the Sample Code
+=====================
+
+mtty.c in samples/vfio-mdev/ directory is a sample driver program to
+demonstrate how to use the mediated device framework.
+
+The sample driver creates an mdev device that simulates a serial port over a PCI
+card.
+
+1. Build and load the mtty.ko module.
+
+ This step creates a dummy device, /sys/devices/virtual/mtty/mtty/
+
+ Files in this device directory in sysfs are similar to the following::
+
+ # tree /sys/devices/virtual/mtty/mtty/
+ /sys/devices/virtual/mtty/mtty/
+ |-- mdev_supported_types
+ | |-- mtty-1
+ | | |-- available_instances
+ | | |-- create
+ | | |-- device_api
+ | | |-- devices
+ | | `-- name
+ | `-- mtty-2
+ | |-- available_instances
+ | |-- create
+ | |-- device_api
+ | |-- devices
+ | `-- name
+ |-- mtty_dev
+ | `-- sample_mtty_dev
+ |-- power
+ | |-- autosuspend_delay_ms
+ | |-- control
+ | |-- runtime_active_time
+ | |-- runtime_status
+ | `-- runtime_suspended_time
+ |-- subsystem -> ../../../../class/mtty
+ `-- uevent
+
+2. Create a mediated device by using the dummy device that you created in the
+ previous step::
+
+ # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \
+ /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create
+
+3. Add parameters to qemu-kvm::
+
+ -device vfio-pci,\
+ sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
+
+4. Boot the VM.
+
+ In the Linux guest VM, with no hardware on the host, the device appears
+ as follows::
+
+ # lspci -s 00:05.0 -xxvv
+ 00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550])
+ Subsystem: Device 4348:3253
+ Physical Slot: 5
+ Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr-
+ Stepping- SERR- FastB2B- DisINTx-
+ Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort-
+ <TAbort- <MAbort- >SERR- <PERR- INTx-
+ Interrupt: pin A routed to IRQ 10
+ Region 0: I/O ports at c150 [size=8]
+ Region 1: I/O ports at c158 [size=8]
+ Kernel driver in use: serial
+ 00: 48 43 53 32 01 00 00 02 10 02 00 07 00 00 00 00
+ 10: 51 c1 00 00 59 c1 00 00 00 00 00 00 00 00 00 00
+ 20: 00 00 00 00 00 00 00 00 00 00 00 00 48 43 53 32
+ 30: 00 00 00 00 00 00 00 00 00 00 00 00 0a 01 00 00
+
+ In the Linux guest VM, dmesg output for the device is as follows:
+
+ serial 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10
+ 0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A
+ 0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A
+
+
+5. In the Linux guest VM, check the serial ports::
+
+ # setserial -g /dev/ttyS*
+ /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4
+ /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10
+ /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10
+
+6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or
+ /dev/ttyS2 with hardware flow control disabled.
+
+7. Type data on the minicom terminal or send data to the terminal emulation
+ program and read the data.
+
+ Data is loop backed from hosts mtty driver.
+
+8. Destroy the mediated device that you created::
+
+ # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove
+
+References
+==========
+
+1. See Documentation/driver-api/vfio.rst for more information on VFIO.
+2. struct mdev_driver in include/linux/mdev.h
+3. struct mdev_parent_ops in include/linux/mdev.h
+4. struct vfio_iommu_driver_ops in include/linux/vfio.h
diff --git a/Documentation/vfio.txt b/Documentation/driver-api/vfio.rst
index f1a4d3c3ba0b..f1a4d3c3ba0b 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/driver-api/vfio.rst
diff --git a/Documentation/xilinx/eemi.rst b/Documentation/driver-api/xilinx/eemi.rst
index 9dcbc6f18d75..9dcbc6f18d75 100644
--- a/Documentation/xilinx/eemi.rst
+++ b/Documentation/driver-api/xilinx/eemi.rst
diff --git a/Documentation/driver-api/xilinx/index.rst b/Documentation/driver-api/xilinx/index.rst
new file mode 100644
index 000000000000..13f7589ed442
--- /dev/null
+++ b/Documentation/driver-api/xilinx/index.rst
@@ -0,0 +1,16 @@
+
+===========
+Xilinx FPGA
+===========
+
+.. toctree::
+ :maxdepth: 1
+
+ eemi
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/xillybus.txt b/Documentation/driver-api/xillybus.rst
index 2446ee303c09..2446ee303c09 100644
--- a/Documentation/xillybus.txt
+++ b/Documentation/driver-api/xillybus.rst
diff --git a/Documentation/zorro.txt b/Documentation/driver-api/zorro.rst
index 664072b017e3..664072b017e3 100644
--- a/Documentation/zorro.txt
+++ b/Documentation/driver-api/zorro.rst
diff --git a/Documentation/driver-model/binding.txt b/Documentation/driver-model/binding.txt
deleted file mode 100644
index abfc8e290d53..000000000000
--- a/Documentation/driver-model/binding.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-
-Driver Binding
-
-Driver binding is the process of associating a device with a device
-driver that can control it. Bus drivers have typically handled this
-because there have been bus-specific structures to represent the
-devices and the drivers. With generic device and device driver
-structures, most of the binding can take place using common code.
-
-
-Bus
-~~~
-
-The bus type structure contains a list of all devices that are on that bus
-type in the system. When device_register is called for a device, it is
-inserted into the end of this list. The bus object also contains a
-list of all drivers of that bus type. When driver_register is called
-for a driver, it is inserted at the end of this list. These are the
-two events which trigger driver binding.
-
-
-device_register
-~~~~~~~~~~~~~~~
-
-When a new device is added, the bus's list of drivers is iterated over
-to find one that supports it. In order to determine that, the device
-ID of the device must match one of the device IDs that the driver
-supports. The format and semantics for comparing IDs is bus-specific.
-Instead of trying to derive a complex state machine and matching
-algorithm, it is up to the bus driver to provide a callback to compare
-a device against the IDs of a driver. The bus returns 1 if a match was
-found; 0 otherwise.
-
-int match(struct device * dev, struct device_driver * drv);
-
-If a match is found, the device's driver field is set to the driver
-and the driver's probe callback is called. This gives the driver a
-chance to verify that it really does support the hardware, and that
-it's in a working state.
-
-Device Class
-~~~~~~~~~~~~
-
-Upon the successful completion of probe, the device is registered with
-the class to which it belongs. Device drivers belong to one and only one
-class, and that is set in the driver's devclass field.
-devclass_add_device is called to enumerate the device within the class
-and actually register it with the class, which happens with the
-class's register_dev callback.
-
-
-Driver
-~~~~~~
-
-When a driver is attached to a device, the device is inserted into the
-driver's list of devices.
-
-
-sysfs
-~~~~~
-
-A symlink is created in the bus's 'devices' directory that points to
-the device's directory in the physical hierarchy.
-
-A symlink is created in the driver's 'devices' directory that points
-to the device's directory in the physical hierarchy.
-
-A directory for the device is created in the class's directory. A
-symlink is created in that directory that points to the device's
-physical location in the sysfs tree.
-
-A symlink can be created (though this isn't done yet) in the device's
-physical directory to either its class directory, or the class's
-top-level directory. One can also be created to point to its driver's
-directory also.
-
-
-driver_register
-~~~~~~~~~~~~~~~
-
-The process is almost identical for when a new driver is added.
-The bus's list of devices is iterated over to find a match. Devices
-that already have a driver are skipped. All the devices are iterated
-over, to bind as many devices as possible to the driver.
-
-
-Removal
-~~~~~~~
-
-When a device is removed, the reference count for it will eventually
-go to 0. When it does, the remove callback of the driver is called. It
-is removed from the driver's list of devices and the reference count
-of the driver is decremented. All symlinks between the two are removed.
-
-When a driver is removed, the list of devices that it supports is
-iterated over, and the driver's remove callback is called for each
-one. The device is removed from that list and the symlinks removed.
-
diff --git a/Documentation/driver-model/bus.txt b/Documentation/driver-model/bus.txt
deleted file mode 100644
index c247b488a567..000000000000
--- a/Documentation/driver-model/bus.txt
+++ /dev/null
@@ -1,143 +0,0 @@
-
-Bus Types
-
-Definition
-~~~~~~~~~~
-See the kerneldoc for the struct bus_type.
-
-int bus_register(struct bus_type * bus);
-
-
-Declaration
-~~~~~~~~~~~
-
-Each bus type in the kernel (PCI, USB, etc) should declare one static
-object of this type. They must initialize the name field, and may
-optionally initialize the match callback.
-
-struct bus_type pci_bus_type = {
- .name = "pci",
- .match = pci_bus_match,
-};
-
-The structure should be exported to drivers in a header file:
-
-extern struct bus_type pci_bus_type;
-
-
-Registration
-~~~~~~~~~~~~
-
-When a bus driver is initialized, it calls bus_register. This
-initializes the rest of the fields in the bus object and inserts it
-into a global list of bus types. Once the bus object is registered,
-the fields in it are usable by the bus driver.
-
-
-Callbacks
-~~~~~~~~~
-
-match(): Attaching Drivers to Devices
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The format of device ID structures and the semantics for comparing
-them are inherently bus-specific. Drivers typically declare an array
-of device IDs of devices they support that reside in a bus-specific
-driver structure.
-
-The purpose of the match callback is to give the bus an opportunity to
-determine if a particular driver supports a particular device by
-comparing the device IDs the driver supports with the device ID of a
-particular device, without sacrificing bus-specific functionality or
-type-safety.
-
-When a driver is registered with the bus, the bus's list of devices is
-iterated over, and the match callback is called for each device that
-does not have a driver associated with it.
-
-
-
-Device and Driver Lists
-~~~~~~~~~~~~~~~~~~~~~~~
-
-The lists of devices and drivers are intended to replace the local
-lists that many buses keep. They are lists of struct devices and
-struct device_drivers, respectively. Bus drivers are free to use the
-lists as they please, but conversion to the bus-specific type may be
-necessary.
-
-The LDM core provides helper functions for iterating over each list.
-
-int bus_for_each_dev(struct bus_type * bus, struct device * start, void * data,
- int (*fn)(struct device *, void *));
-
-int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
- void * data, int (*fn)(struct device_driver *, void *));
-
-These helpers iterate over the respective list, and call the callback
-for each device or driver in the list. All list accesses are
-synchronized by taking the bus's lock (read currently). The reference
-count on each object in the list is incremented before the callback is
-called; it is decremented after the next object has been obtained. The
-lock is not held when calling the callback.
-
-
-sysfs
-~~~~~~~~
-There is a top-level directory named 'bus'.
-
-Each bus gets a directory in the bus directory, along with two default
-directories:
-
- /sys/bus/pci/
- |-- devices
- `-- drivers
-
-Drivers registered with the bus get a directory in the bus's drivers
-directory:
-
- /sys/bus/pci/
- |-- devices
- `-- drivers
- |-- Intel ICH
- |-- Intel ICH Joystick
- |-- agpgart
- `-- e100
-
-Each device that is discovered on a bus of that type gets a symlink in
-the bus's devices directory to the device's directory in the physical
-hierarchy:
-
- /sys/bus/pci/
- |-- devices
- | |-- 00:00.0 -> ../../../root/pci0/00:00.0
- | |-- 00:01.0 -> ../../../root/pci0/00:01.0
- | `-- 00:02.0 -> ../../../root/pci0/00:02.0
- `-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-struct bus_attribute {
- struct attribute attr;
- ssize_t (*show)(struct bus_type *, char * buf);
- ssize_t (*store)(struct bus_type *, const char * buf, size_t count);
-};
-
-Bus drivers can export attributes using the BUS_ATTR_RW macro that works
-similarly to the DEVICE_ATTR_RW macro for devices. For example, a
-definition like this:
-
-static BUS_ATTR_RW(debug);
-
-is equivalent to declaring:
-
-static bus_attribute bus_attr_debug;
-
-This can then be used to add and remove the attribute from the bus's
-sysfs directory using:
-
-int bus_create_file(struct bus_type *, struct bus_attribute *);
-void bus_remove_file(struct bus_type *, struct bus_attribute *);
-
-
diff --git a/Documentation/driver-model/class.txt b/Documentation/driver-model/class.txt
deleted file mode 100644
index 1fefc480a80b..000000000000
--- a/Documentation/driver-model/class.txt
+++ /dev/null
@@ -1,147 +0,0 @@
-
-Device Classes
-
-
-Introduction
-~~~~~~~~~~~~
-A device class describes a type of device, like an audio or network
-device. The following device classes have been identified:
-
-<Insert List of Device Classes Here>
-
-
-Each device class defines a set of semantics and a programming interface
-that devices of that class adhere to. Device drivers are the
-implementation of that programming interface for a particular device on
-a particular bus.
-
-Device classes are agnostic with respect to what bus a device resides
-on.
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The device class structure looks like:
-
-
-typedef int (*devclass_add)(struct device *);
-typedef void (*devclass_remove)(struct device *);
-
-See the kerneldoc for the struct class.
-
-A typical device class definition would look like:
-
-struct device_class input_devclass = {
- .name = "input",
- .add_device = input_add_device,
- .remove_device = input_remove_device,
-};
-
-Each device class structure should be exported in a header file so it
-can be used by drivers, extensions and interfaces.
-
-Device classes are registered and unregistered with the core using:
-
-int devclass_register(struct device_class * cls);
-void devclass_unregister(struct device_class * cls);
-
-
-Devices
-~~~~~~~
-As devices are bound to drivers, they are added to the device class
-that the driver belongs to. Before the driver model core, this would
-typically happen during the driver's probe() callback, once the device
-has been initialized. It now happens after the probe() callback
-finishes from the core.
-
-The device is enumerated in the class. Each time a device is added to
-the class, the class's devnum field is incremented and assigned to the
-device. The field is never decremented, so if the device is removed
-from the class and re-added, it will receive a different enumerated
-value.
-
-The class is allowed to create a class-specific structure for the
-device and store it in the device's class_data pointer.
-
-There is no list of devices in the device class. Each driver has a
-list of devices that it supports. The device class has a list of
-drivers of that particular class. To access all of the devices in the
-class, iterate over the device lists of each driver in the class.
-
-
-Device Drivers
-~~~~~~~~~~~~~~
-Device drivers are added to device classes when they are registered
-with the core. A driver specifies the class it belongs to by setting
-the struct device_driver::devclass field.
-
-
-sysfs directory structure
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-There is a top-level sysfs directory named 'class'.
-
-Each class gets a directory in the class directory, along with two
-default subdirectories:
-
- class/
- `-- input
- |-- devices
- `-- drivers
-
-
-Drivers registered with the class get a symlink in the drivers/ directory
-that points to the driver's directory (under its bus directory):
-
- class/
- `-- input
- |-- devices
- `-- drivers
- `-- usb:usb_mouse -> ../../../bus/drivers/usb_mouse/
-
-
-Each device gets a symlink in the devices/ directory that points to the
-device's directory in the physical hierarchy:
-
- class/
- `-- input
- |-- devices
- | `-- 1 -> ../../../root/pci0/00:1f.0/usb_bus/00:1f.2-1:0/
- `-- drivers
-
-
-Exporting Attributes
-~~~~~~~~~~~~~~~~~~~~
-struct devclass_attribute {
- struct attribute attr;
- ssize_t (*show)(struct device_class *, char * buf, size_t count, loff_t off);
- ssize_t (*store)(struct device_class *, const char * buf, size_t count, loff_t off);
-};
-
-Class drivers can export attributes using the DEVCLASS_ATTR macro that works
-similarly to the DEVICE_ATTR macro for devices. For example, a definition
-like this:
-
-static DEVCLASS_ATTR(debug,0644,show_debug,store_debug);
-
-is equivalent to declaring:
-
-static devclass_attribute devclass_attr_debug;
-
-The bus driver can add and remove the attribute from the class's
-sysfs directory using:
-
-int devclass_create_file(struct device_class *, struct devclass_attribute *);
-void devclass_remove_file(struct device_class *, struct devclass_attribute *);
-
-In the example above, the file will be named 'debug' in placed in the
-class's directory in sysfs.
-
-
-Interfaces
-~~~~~~~~~~
-There may exist multiple mechanisms for accessing the same device of a
-particular class type. Device interfaces describe these mechanisms.
-
-When a device is added to a device class, the core attempts to add it
-to every interface that is registered with the device class.
-
diff --git a/Documentation/driver-model/design-patterns.txt b/Documentation/driver-model/design-patterns.txt
deleted file mode 100644
index ba7b2df64904..000000000000
--- a/Documentation/driver-model/design-patterns.txt
+++ /dev/null
@@ -1,116 +0,0 @@
-
-Device Driver Design Patterns
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This document describes a few common design patterns found in device drivers.
-It is likely that subsystem maintainers will ask driver developers to
-conform to these design patterns.
-
-1. State Container
-2. container_of()
-
-
-1. State Container
-~~~~~~~~~~~~~~~~~~
-
-While the kernel contains a few device drivers that assume that they will
-only be probed() once on a certain system (singletons), it is custom to assume
-that the device the driver binds to will appear in several instances. This
-means that the probe() function and all callbacks need to be reentrant.
-
-The most common way to achieve this is to use the state container design
-pattern. It usually has this form:
-
-struct foo {
- spinlock_t lock; /* Example member */
- (...)
-};
-
-static int foo_probe(...)
-{
- struct foo *foo;
-
- foo = devm_kzalloc(dev, sizeof(*foo), GFP_KERNEL);
- if (!foo)
- return -ENOMEM;
- spin_lock_init(&foo->lock);
- (...)
-}
-
-This will create an instance of struct foo in memory every time probe() is
-called. This is our state container for this instance of the device driver.
-Of course it is then necessary to always pass this instance of the
-state around to all functions that need access to the state and its members.
-
-For example, if the driver is registering an interrupt handler, you would
-pass around a pointer to struct foo like this:
-
-static irqreturn_t foo_handler(int irq, void *arg)
-{
- struct foo *foo = arg;
- (...)
-}
-
-static int foo_probe(...)
-{
- struct foo *foo;
-
- (...)
- ret = request_irq(irq, foo_handler, 0, "foo", foo);
-}
-
-This way you always get a pointer back to the correct instance of foo in
-your interrupt handler.
-
-
-2. container_of()
-~~~~~~~~~~~~~~~~~
-
-Continuing on the above example we add an offloaded work:
-
-struct foo {
- spinlock_t lock;
- struct workqueue_struct *wq;
- struct work_struct offload;
- (...)
-};
-
-static void foo_work(struct work_struct *work)
-{
- struct foo *foo = container_of(work, struct foo, offload);
-
- (...)
-}
-
-static irqreturn_t foo_handler(int irq, void *arg)
-{
- struct foo *foo = arg;
-
- queue_work(foo->wq, &foo->offload);
- (...)
-}
-
-static int foo_probe(...)
-{
- struct foo *foo;
-
- foo->wq = create_singlethread_workqueue("foo-wq");
- INIT_WORK(&foo->offload, foo_work);
- (...)
-}
-
-The design pattern is the same for an hrtimer or something similar that will
-return a single argument which is a pointer to a struct member in the
-callback.
-
-container_of() is a macro defined in <linux/kernel.h>
-
-What container_of() does is to obtain a pointer to the containing struct from
-a pointer to a member by a simple subtraction using the offsetof() macro from
-standard C, which allows something similar to object oriented behaviours.
-Notice that the contained member must not be a pointer, but an actual member
-for this to work.
-
-We can see here that we avoid having global pointers to our struct foo *
-instance this way, while still keeping the number of parameters passed to the
-work function to a single pointer.
diff --git a/Documentation/driver-model/device.txt b/Documentation/driver-model/device.txt
deleted file mode 100644
index 2403eb856187..000000000000
--- a/Documentation/driver-model/device.txt
+++ /dev/null
@@ -1,106 +0,0 @@
-
-The Basic Device Structure
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-See the kerneldoc for the struct device.
-
-
-Programming Interface
-~~~~~~~~~~~~~~~~~~~~~
-The bus driver that discovers the device uses this to register the
-device with the core:
-
-int device_register(struct device * dev);
-
-The bus should initialize the following fields:
-
- - parent
- - name
- - bus_id
- - bus
-
-A device is removed from the core when its reference count goes to
-0. The reference count can be adjusted using:
-
-struct device * get_device(struct device * dev);
-void put_device(struct device * dev);
-
-get_device() will return a pointer to the struct device passed to it
-if the reference is not already 0 (if it's in the process of being
-removed already).
-
-A driver can access the lock in the device structure using:
-
-void lock_device(struct device * dev);
-void unlock_device(struct device * dev);
-
-
-Attributes
-~~~~~~~~~~
-struct device_attribute {
- struct attribute attr;
- ssize_t (*show)(struct device *dev, struct device_attribute *attr,
- char *buf);
- ssize_t (*store)(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count);
-};
-
-Attributes of devices can be exported by a device driver through sysfs.
-
-Please see Documentation/filesystems/sysfs.txt for more information
-on how sysfs works.
-
-As explained in Documentation/kobject.txt, device attributes must be
-created before the KOBJ_ADD uevent is generated. The only way to realize
-that is by defining an attribute group.
-
-Attributes are declared using a macro called DEVICE_ATTR:
-
-#define DEVICE_ATTR(name,mode,show,store)
-
-Example:
-
-static DEVICE_ATTR(type, 0444, show_type, NULL);
-static DEVICE_ATTR(power, 0644, show_power, store_power);
-
-This declares two structures of type struct device_attribute with respective
-names 'dev_attr_type' and 'dev_attr_power'. These two attributes can be
-organized as follows into a group:
-
-static struct attribute *dev_attrs[] = {
- &dev_attr_type.attr,
- &dev_attr_power.attr,
- NULL,
-};
-
-static struct attribute_group dev_attr_group = {
- .attrs = dev_attrs,
-};
-
-static const struct attribute_group *dev_attr_groups[] = {
- &dev_attr_group,
- NULL,
-};
-
-This array of groups can then be associated with a device by setting the
-group pointer in struct device before device_register() is invoked:
-
- dev->groups = dev_attr_groups;
- device_register(dev);
-
-The device_register() function will use the 'groups' pointer to create the
-device attributes and the device_unregister() function will use this pointer
-to remove the device attributes.
-
-Word of warning: While the kernel allows device_create_file() and
-device_remove_file() to be called on a device at any time, userspace has
-strict expectations on when attributes get created. When a new device is
-registered in the kernel, a uevent is generated to notify userspace (like
-udev) that a new device is available. If attributes are added after the
-device is registered, then userspace won't get notified and userspace will
-not know about the new attributes.
-
-This is important for device driver that need to publish additional
-attributes for a device at driver probe time. If the device driver simply
-calls device_create_file() on the device structure passed to it, then
-userspace will never be notified of the new attributes.
diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt
deleted file mode 100644
index 69c7fa7f616c..000000000000
--- a/Documentation/driver-model/devres.txt
+++ /dev/null
@@ -1,412 +0,0 @@
-Devres - Managed Device Resource
-================================
-
-Tejun Heo <teheo@suse.de>
-
-First draft 10 January 2007
-
-
-1. Intro : Huh? Devres?
-2. Devres : Devres in a nutshell
-3. Devres Group : Group devres'es and release them together
-4. Details : Life time rules, calling context, ...
-5. Overhead : How much do we have to pay for this?
-6. List of managed interfaces : Currently implemented managed interfaces
-
-
- 1. Intro
- --------
-
-devres came up while trying to convert libata to use iomap. Each
-iomapped address should be kept and unmapped on driver detach. For
-example, a plain SFF ATA controller (that is, good old PCI IDE) in
-native mode makes use of 5 PCI BARs and all of them should be
-maintained.
-
-As with many other device drivers, libata low level drivers have
-sufficient bugs in ->remove and ->probe failure path. Well, yes,
-that's probably because libata low level driver developers are lazy
-bunch, but aren't all low level driver developers? After spending a
-day fiddling with braindamaged hardware with no document or
-braindamaged document, if it's finally working, well, it's working.
-
-For one reason or another, low level drivers don't receive as much
-attention or testing as core code, and bugs on driver detach or
-initialization failure don't happen often enough to be noticeable.
-Init failure path is worse because it's much less travelled while
-needs to handle multiple entry points.
-
-So, many low level drivers end up leaking resources on driver detach
-and having half broken failure path implementation in ->probe() which
-would leak resources or even cause oops when failure occurs. iomap
-adds more to this mix. So do msi and msix.
-
-
- 2. Devres
- ---------
-
-devres is basically linked list of arbitrarily sized memory areas
-associated with a struct device. Each devres entry is associated with
-a release function. A devres can be released in several ways. No
-matter what, all devres entries are released on driver detach. On
-release, the associated release function is invoked and then the
-devres entry is freed.
-
-Managed interface is created for resources commonly used by device
-drivers using devres. For example, coherent DMA memory is acquired
-using dma_alloc_coherent(). The managed version is called
-dmam_alloc_coherent(). It is identical to dma_alloc_coherent() except
-for the DMA memory allocated using it is managed and will be
-automatically released on driver detach. Implementation looks like
-the following.
-
- struct dma_devres {
- size_t size;
- void *vaddr;
- dma_addr_t dma_handle;
- };
-
- static void dmam_coherent_release(struct device *dev, void *res)
- {
- struct dma_devres *this = res;
-
- dma_free_coherent(dev, this->size, this->vaddr, this->dma_handle);
- }
-
- dmam_alloc_coherent(dev, size, dma_handle, gfp)
- {
- struct dma_devres *dr;
- void *vaddr;
-
- dr = devres_alloc(dmam_coherent_release, sizeof(*dr), gfp);
- ...
-
- /* alloc DMA memory as usual */
- vaddr = dma_alloc_coherent(...);
- ...
-
- /* record size, vaddr, dma_handle in dr */
- dr->vaddr = vaddr;
- ...
-
- devres_add(dev, dr);
-
- return vaddr;
- }
-
-If a driver uses dmam_alloc_coherent(), the area is guaranteed to be
-freed whether initialization fails half-way or the device gets
-detached. If most resources are acquired using managed interface, a
-driver can have much simpler init and exit code. Init path basically
-looks like the following.
-
- my_init_one()
- {
- struct mydev *d;
-
- d = devm_kzalloc(dev, sizeof(*d), GFP_KERNEL);
- if (!d)
- return -ENOMEM;
-
- d->ring = dmam_alloc_coherent(...);
- if (!d->ring)
- return -ENOMEM;
-
- if (check something)
- return -EINVAL;
- ...
-
- return register_to_upper_layer(d);
- }
-
-And exit path,
-
- my_remove_one()
- {
- unregister_from_upper_layer(d);
- shutdown_my_hardware();
- }
-
-As shown above, low level drivers can be simplified a lot by using
-devres. Complexity is shifted from less maintained low level drivers
-to better maintained higher layer. Also, as init failure path is
-shared with exit path, both can get more testing.
-
-Note though that when converting current calls or assignments to
-managed devm_* versions it is up to you to check if internal operations
-like allocating memory, have failed. Managed resources pertains to the
-freeing of these resources *only* - all other checks needed are still
-on you. In some cases this may mean introducing checks that were not
-necessary before moving to the managed devm_* calls.
-
-
- 3. Devres group
- ---------------
-
-Devres entries can be grouped using devres group. When a group is
-released, all contained normal devres entries and properly nested
-groups are released. One usage is to rollback series of acquired
-resources on failure. For example,
-
- if (!devres_open_group(dev, NULL, GFP_KERNEL))
- return -ENOMEM;
-
- acquire A;
- if (failed)
- goto err;
-
- acquire B;
- if (failed)
- goto err;
- ...
-
- devres_remove_group(dev, NULL);
- return 0;
-
- err:
- devres_release_group(dev, NULL);
- return err_code;
-
-As resource acquisition failure usually means probe failure, constructs
-like above are usually useful in midlayer driver (e.g. libata core
-layer) where interface function shouldn't have side effect on failure.
-For LLDs, just returning error code suffices in most cases.
-
-Each group is identified by void *id. It can either be explicitly
-specified by @id argument to devres_open_group() or automatically
-created by passing NULL as @id as in the above example. In both
-cases, devres_open_group() returns the group's id. The returned id
-can be passed to other devres functions to select the target group.
-If NULL is given to those functions, the latest open group is
-selected.
-
-For example, you can do something like the following.
-
- int my_midlayer_create_something()
- {
- if (!devres_open_group(dev, my_midlayer_create_something, GFP_KERNEL))
- return -ENOMEM;
-
- ...
-
- devres_close_group(dev, my_midlayer_create_something);
- return 0;
- }
-
- void my_midlayer_destroy_something()
- {
- devres_release_group(dev, my_midlayer_create_something);
- }
-
-
- 4. Details
- ----------
-
-Lifetime of a devres entry begins on devres allocation and finishes
-when it is released or destroyed (removed and freed) - no reference
-counting.
-
-devres core guarantees atomicity to all basic devres operations and
-has support for single-instance devres types (atomic
-lookup-and-add-if-not-found). Other than that, synchronizing
-concurrent accesses to allocated devres data is caller's
-responsibility. This is usually non-issue because bus ops and
-resource allocations already do the job.
-
-For an example of single-instance devres type, read pcim_iomap_table()
-in lib/devres.c.
-
-All devres interface functions can be called without context if the
-right gfp mask is given.
-
-
- 5. Overhead
- -----------
-
-Each devres bookkeeping info is allocated together with requested data
-area. With debug option turned off, bookkeeping info occupies 16
-bytes on 32bit machines and 24 bytes on 64bit (three pointers rounded
-up to ull alignment). If singly linked list is used, it can be
-reduced to two pointers (8 bytes on 32bit, 16 bytes on 64bit).
-
-Each devres group occupies 8 pointers. It can be reduced to 6 if
-singly linked list is used.
-
-Memory space overhead on ahci controller with two ports is between 300
-and 400 bytes on 32bit machine after naive conversion (we can
-certainly invest a bit more effort into libata core layer).
-
-
- 6. List of managed interfaces
- -----------------------------
-
-CLOCK
- devm_clk_get()
- devm_clk_get_optional()
- devm_clk_put()
- devm_clk_hw_register()
- devm_of_clk_add_hw_provider()
- devm_clk_hw_register_clkdev()
-
-DMA
- dmaenginem_async_device_register()
- dmam_alloc_coherent()
- dmam_alloc_attrs()
- dmam_free_coherent()
- dmam_pool_create()
- dmam_pool_destroy()
-
-DRM
- devm_drm_dev_init()
-
-GPIO
- devm_gpiod_get()
- devm_gpiod_get_index()
- devm_gpiod_get_index_optional()
- devm_gpiod_get_optional()
- devm_gpiod_put()
- devm_gpiod_unhinge()
- devm_gpiochip_add_data()
- devm_gpio_request()
- devm_gpio_request_one()
- devm_gpio_free()
-
-I2C
- devm_i2c_new_dummy_device()
-
-IIO
- devm_iio_device_alloc()
- devm_iio_device_free()
- devm_iio_device_register()
- devm_iio_device_unregister()
- devm_iio_kfifo_allocate()
- devm_iio_kfifo_free()
- devm_iio_triggered_buffer_setup()
- devm_iio_triggered_buffer_cleanup()
- devm_iio_trigger_alloc()
- devm_iio_trigger_free()
- devm_iio_trigger_register()
- devm_iio_trigger_unregister()
- devm_iio_channel_get()
- devm_iio_channel_release()
- devm_iio_channel_get_all()
- devm_iio_channel_release_all()
-
-INPUT
- devm_input_allocate_device()
-
-IO region
- devm_release_mem_region()
- devm_release_region()
- devm_release_resource()
- devm_request_mem_region()
- devm_request_region()
- devm_request_resource()
-
-IOMAP
- devm_ioport_map()
- devm_ioport_unmap()
- devm_ioremap()
- devm_ioremap_nocache()
- devm_ioremap_wc()
- devm_ioremap_resource() : checks resource, requests memory region, ioremaps
- devm_iounmap()
- pcim_iomap()
- pcim_iomap_regions() : do request_region() and iomap() on multiple BARs
- pcim_iomap_table() : array of mapped addresses indexed by BAR
- pcim_iounmap()
-
-IRQ
- devm_free_irq()
- devm_request_any_context_irq()
- devm_request_irq()
- devm_request_threaded_irq()
- devm_irq_alloc_descs()
- devm_irq_alloc_desc()
- devm_irq_alloc_desc_at()
- devm_irq_alloc_desc_from()
- devm_irq_alloc_descs_from()
- devm_irq_alloc_generic_chip()
- devm_irq_setup_generic_chip()
- devm_irq_sim_init()
-
-LED
- devm_led_classdev_register()
- devm_led_classdev_unregister()
-
-MDIO
- devm_mdiobus_alloc()
- devm_mdiobus_alloc_size()
- devm_mdiobus_free()
-
-MEM
- devm_free_pages()
- devm_get_free_pages()
- devm_kasprintf()
- devm_kcalloc()
- devm_kfree()
- devm_kmalloc()
- devm_kmalloc_array()
- devm_kmemdup()
- devm_kstrdup()
- devm_kvasprintf()
- devm_kzalloc()
-
-MFD
- devm_mfd_add_devices()
-
-MUX
- devm_mux_chip_alloc()
- devm_mux_chip_register()
- devm_mux_control_get()
-
-PER-CPU MEM
- devm_alloc_percpu()
- devm_free_percpu()
-
-PCI
- devm_pci_alloc_host_bridge() : managed PCI host bridge allocation
- devm_pci_remap_cfgspace() : ioremap PCI configuration space
- devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource
- pcim_enable_device() : after success, all PCI ops become managed
- pcim_pin_device() : keep PCI device enabled after release
-
-PHY
- devm_usb_get_phy()
- devm_usb_put_phy()
-
-PINCTRL
- devm_pinctrl_get()
- devm_pinctrl_put()
- devm_pinctrl_register()
- devm_pinctrl_unregister()
-
-POWER
- devm_reboot_mode_register()
- devm_reboot_mode_unregister()
-
-PWM
- devm_pwm_get()
- devm_pwm_put()
-
-REGULATOR
- devm_regulator_bulk_get()
- devm_regulator_get()
- devm_regulator_put()
- devm_regulator_register()
-
-RESET
- devm_reset_control_get()
- devm_reset_controller_register()
-
-SERDEV
- devm_serdev_device_open()
-
-SLAVE DMA ENGINE
- devm_acpi_dma_controller_register()
-
-SPI
- devm_spi_register_master()
-
-WATCHDOG
- devm_watchdog_register_device()
diff --git a/Documentation/driver-model/driver.txt b/Documentation/driver-model/driver.txt
deleted file mode 100644
index d661e6f7e6a0..000000000000
--- a/Documentation/driver-model/driver.txt
+++ /dev/null
@@ -1,215 +0,0 @@
-
-Device Drivers
-
-See the kerneldoc for the struct device_driver.
-
-
-Allocation
-~~~~~~~~~~
-
-Device drivers are statically allocated structures. Though there may
-be multiple devices in a system that a driver supports, struct
-device_driver represents the driver as a whole (not a particular
-device instance).
-
-Initialization
-~~~~~~~~~~~~~~
-
-The driver must initialize at least the name and bus fields. It should
-also initialize the devclass field (when it arrives), so it may obtain
-the proper linkage internally. It should also initialize as many of
-the callbacks as possible, though each is optional.
-
-Declaration
-~~~~~~~~~~~
-
-As stated above, struct device_driver objects are statically
-allocated. Below is an example declaration of the eepro100
-driver. This declaration is hypothetical only; it relies on the driver
-being converted completely to the new model.
-
-static struct device_driver eepro100_driver = {
- .name = "eepro100",
- .bus = &pci_bus_type,
-
- .probe = eepro100_probe,
- .remove = eepro100_remove,
- .suspend = eepro100_suspend,
- .resume = eepro100_resume,
-};
-
-Most drivers will not be able to be converted completely to the new
-model because the bus they belong to has a bus-specific structure with
-bus-specific fields that cannot be generalized.
-
-The most common example of this are device ID structures. A driver
-typically defines an array of device IDs that it supports. The format
-of these structures and the semantics for comparing device IDs are
-completely bus-specific. Defining them as bus-specific entities would
-sacrifice type-safety, so we keep bus-specific structures around.
-
-Bus-specific drivers should include a generic struct device_driver in
-the definition of the bus-specific driver. Like this:
-
-struct pci_driver {
- const struct pci_device_id *id_table;
- struct device_driver driver;
-};
-
-A definition that included bus-specific fields would look like
-(using the eepro100 driver again):
-
-static struct pci_driver eepro100_driver = {
- .id_table = eepro100_pci_tbl,
- .driver = {
- .name = "eepro100",
- .bus = &pci_bus_type,
- .probe = eepro100_probe,
- .remove = eepro100_remove,
- .suspend = eepro100_suspend,
- .resume = eepro100_resume,
- },
-};
-
-Some may find the syntax of embedded struct initialization awkward or
-even a bit ugly. So far, it's the best way we've found to do what we want...
-
-Registration
-~~~~~~~~~~~~
-
-int driver_register(struct device_driver * drv);
-
-The driver registers the structure on startup. For drivers that have
-no bus-specific fields (i.e. don't have a bus-specific driver
-structure), they would use driver_register and pass a pointer to their
-struct device_driver object.
-
-Most drivers, however, will have a bus-specific structure and will
-need to register with the bus using something like pci_driver_register.
-
-It is important that drivers register their driver structure as early as
-possible. Registration with the core initializes several fields in the
-struct device_driver object, including the reference count and the
-lock. These fields are assumed to be valid at all times and may be
-used by the device model core or the bus driver.
-
-
-Transition Bus Drivers
-~~~~~~~~~~~~~~~~~~~~~~
-
-By defining wrapper functions, the transition to the new model can be
-made easier. Drivers can ignore the generic structure altogether and
-let the bus wrapper fill in the fields. For the callbacks, the bus can
-define generic callbacks that forward the call to the bus-specific
-callbacks of the drivers.
-
-This solution is intended to be only temporary. In order to get class
-information in the driver, the drivers must be modified anyway. Since
-converting drivers to the new model should reduce some infrastructural
-complexity and code size, it is recommended that they are converted as
-class information is added.
-
-Access
-~~~~~~
-
-Once the object has been registered, it may access the common fields of
-the object, like the lock and the list of devices.
-
-int driver_for_each_dev(struct device_driver * drv, void * data,
- int (*callback)(struct device * dev, void * data));
-
-The devices field is a list of all the devices that have been bound to
-the driver. The LDM core provides a helper function to operate on all
-the devices a driver controls. This helper locks the driver on each
-node access, and does proper reference counting on each device as it
-accesses it.
-
-
-sysfs
-~~~~~
-
-When a driver is registered, a sysfs directory is created in its
-bus's directory. In this directory, the driver can export an interface
-to userspace to control operation of the driver on a global basis;
-e.g. toggling debugging output in the driver.
-
-A future feature of this directory will be a 'devices' directory. This
-directory will contain symlinks to the directories of devices it
-supports.
-
-
-
-Callbacks
-~~~~~~~~~
-
- int (*probe) (struct device * dev);
-
-The probe() entry is called in task context, with the bus's rwsem locked
-and the driver partially bound to the device. Drivers commonly use
-container_of() to convert "dev" to a bus-specific type, both in probe()
-and other routines. That type often provides device resource data, such
-as pci_dev.resource[] or platform_device.resources, which is used in
-addition to dev->platform_data to initialize the driver.
-
-This callback holds the driver-specific logic to bind the driver to a
-given device. That includes verifying that the device is present, that
-it's a version the driver can handle, that driver data structures can
-be allocated and initialized, and that any hardware can be initialized.
-Drivers often store a pointer to their state with dev_set_drvdata().
-When the driver has successfully bound itself to that device, then probe()
-returns zero and the driver model code will finish its part of binding
-the driver to that device.
-
-A driver's probe() may return a negative errno value to indicate that
-the driver did not bind to this device, in which case it should have
-released all resources it allocated.
-
- int (*remove) (struct device * dev);
-
-remove is called to unbind a driver from a device. This may be
-called if a device is physically removed from the system, if the
-driver module is being unloaded, during a reboot sequence, or
-in other cases.
-
-It is up to the driver to determine if the device is present or
-not. It should free any resources allocated specifically for the
-device; i.e. anything in the device's driver_data field.
-
-If the device is still present, it should quiesce the device and place
-it into a supported low-power state.
-
- int (*suspend) (struct device * dev, pm_message_t state);
-
-suspend is called to put the device in a low power state.
-
- int (*resume) (struct device * dev);
-
-Resume is used to bring a device back from a low power state.
-
-
-Attributes
-~~~~~~~~~~
-struct driver_attribute {
- struct attribute attr;
- ssize_t (*show)(struct device_driver *driver, char *buf);
- ssize_t (*store)(struct device_driver *, const char * buf, size_t count);
-};
-
-Device drivers can export attributes via their sysfs directories.
-Drivers can declare attributes using a DRIVER_ATTR_RW and DRIVER_ATTR_RO
-macro that works identically to the DEVICE_ATTR_RW and DEVICE_ATTR_RO
-macros.
-
-Example:
-
-DRIVER_ATTR_RW(debug);
-
-This is equivalent to declaring:
-
-struct driver_attribute driver_attr_debug;
-
-This can then be used to add and remove the attribute from the
-driver's directory using:
-
-int driver_create_file(struct device_driver *, const struct driver_attribute *);
-void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/driver-model/overview.txt b/Documentation/driver-model/overview.txt
deleted file mode 100644
index 6a8f9a8075d8..000000000000
--- a/Documentation/driver-model/overview.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-The Linux Kernel Device Model
-
-Patrick Mochel <mochel@digitalimplant.org>
-
-Drafted 26 August 2002
-Updated 31 January 2006
-
-
-Overview
-~~~~~~~~
-
-The Linux Kernel Driver Model is a unification of all the disparate driver
-models that were previously used in the kernel. It is intended to augment the
-bus-specific drivers for bridges and devices by consolidating a set of data
-and operations into globally accessible data structures.
-
-Traditional driver models implemented some sort of tree-like structure
-(sometimes just a list) for the devices they control. There wasn't any
-uniformity across the different bus types.
-
-The current driver model provides a common, uniform data model for describing
-a bus and the devices that can appear under the bus. The unified bus
-model includes a set of common attributes which all busses carry, and a set
-of common callbacks, such as device discovery during bus probing, bus
-shutdown, bus power management, etc.
-
-The common device and bridge interface reflects the goals of the modern
-computer: namely the ability to do seamless device "plug and play", power
-management, and hot plug. In particular, the model dictated by Intel and
-Microsoft (namely ACPI) ensures that almost every device on almost any bus
-on an x86-compatible system can work within this paradigm. Of course,
-not every bus is able to support all such operations, although most
-buses support most of those operations.
-
-
-Downstream Access
-~~~~~~~~~~~~~~~~~
-
-Common data fields have been moved out of individual bus layers into a common
-data structure. These fields must still be accessed by the bus layers,
-and sometimes by the device-specific drivers.
-
-Other bus layers are encouraged to do what has been done for the PCI layer.
-struct pci_dev now looks like this:
-
-struct pci_dev {
- ...
-
- struct device dev; /* Generic device interface */
- ...
-};
-
-Note first that the struct device dev within the struct pci_dev is
-statically allocated. This means only one allocation on device discovery.
-
-Note also that that struct device dev is not necessarily defined at the
-front of the pci_dev structure. This is to make people think about what
-they're doing when switching between the bus driver and the global driver,
-and to discourage meaningless and incorrect casts between the two.
-
-The PCI bus layer freely accesses the fields of struct device. It knows about
-the structure of struct pci_dev, and it should know the structure of struct
-device. Individual PCI device drivers that have been converted to the current
-driver model generally do not and should not touch the fields of struct device,
-unless there is a compelling reason to do so.
-
-The above abstraction prevents unnecessary pain during transitional phases.
-If it were not done this way, then when a field was renamed or removed, every
-downstream driver would break. On the other hand, if only the bus layer
-(and not the device layer) accesses the struct device, it is only the bus
-layer that needs to change.
-
-
-User Interface
-~~~~~~~~~~~~~~
-
-By virtue of having a complete hierarchical view of all the devices in the
-system, exporting a complete hierarchical view to userspace becomes relatively
-easy. This has been accomplished by implementing a special purpose virtual
-file system named sysfs.
-
-Almost all mainstream Linux distros mount this filesystem automatically; you
-can see some variation of the following in the output of the "mount" command:
-
-$ mount
-...
-none on /sys type sysfs (rw,noexec,nosuid,nodev)
-...
-$
-
-The auto-mounting of sysfs is typically accomplished by an entry similar to
-the following in the /etc/fstab file:
-
-none /sys sysfs defaults 0 0
-
-or something similar in the /lib/init/fstab file on Debian-based systems:
-
-none /sys sysfs nodev,noexec,nosuid 0 0
-
-If sysfs is not automatically mounted, you can always do it manually with:
-
-# mount -t sysfs sysfs /sys
-
-Whenever a device is inserted into the tree, a directory is created for it.
-This directory may be populated at each layer of discovery - the global layer,
-the bus layer, or the device layer.
-
-The global layer currently creates two files - 'name' and 'power'. The
-former only reports the name of the device. The latter reports the
-current power state of the device. It will also be used to set the current
-power state.
-
-The bus layer may also create files for the devices it finds while probing the
-bus. For example, the PCI layer currently creates 'irq' and 'resource' files
-for each PCI device.
-
-A device-specific driver may also export files in its directory to expose
-device-specific data or tunable interfaces.
-
-More information about the sysfs directory layout can be found in
-the other documents in this directory and in the file
-Documentation/filesystems/sysfs.txt.
-
diff --git a/Documentation/driver-model/platform.txt b/Documentation/driver-model/platform.txt
deleted file mode 100644
index 9d9e47dfc013..000000000000
--- a/Documentation/driver-model/platform.txt
+++ /dev/null
@@ -1,244 +0,0 @@
-Platform Devices and Drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-See <linux/platform_device.h> for the driver model interface to the
-platform bus: platform_device, and platform_driver. This pseudo-bus
-is used to connect devices on busses with minimal infrastructure,
-like those used to integrate peripherals on many system-on-chip
-processors, or some "legacy" PC interconnects; as opposed to large
-formally specified ones like PCI or USB.
-
-
-Platform devices
-~~~~~~~~~~~~~~~~
-Platform devices are devices that typically appear as autonomous
-entities in the system. This includes legacy port-based devices and
-host bridges to peripheral buses, and most controllers integrated
-into system-on-chip platforms. What they usually have in common
-is direct addressing from a CPU bus. Rarely, a platform_device will
-be connected through a segment of some other kind of bus; but its
-registers will still be directly addressable.
-
-Platform devices are given a name, used in driver binding, and a
-list of resources such as addresses and IRQs.
-
-struct platform_device {
- const char *name;
- u32 id;
- struct device dev;
- u32 num_resources;
- struct resource *resource;
-};
-
-
-Platform drivers
-~~~~~~~~~~~~~~~~
-Platform drivers follow the standard driver model convention, where
-discovery/enumeration is handled outside the drivers, and drivers
-provide probe() and remove() methods. They support power management
-and shutdown notifications using the standard conventions.
-
-struct platform_driver {
- int (*probe)(struct platform_device *);
- int (*remove)(struct platform_device *);
- void (*shutdown)(struct platform_device *);
- int (*suspend)(struct platform_device *, pm_message_t state);
- int (*suspend_late)(struct platform_device *, pm_message_t state);
- int (*resume_early)(struct platform_device *);
- int (*resume)(struct platform_device *);
- struct device_driver driver;
-};
-
-Note that probe() should in general verify that the specified device hardware
-actually exists; sometimes platform setup code can't be sure. The probing
-can use device resources, including clocks, and device platform_data.
-
-Platform drivers register themselves the normal way:
-
- int platform_driver_register(struct platform_driver *drv);
-
-Or, in common situations where the device is known not to be hot-pluggable,
-the probe() routine can live in an init section to reduce the driver's
-runtime memory footprint:
-
- int platform_driver_probe(struct platform_driver *drv,
- int (*probe)(struct platform_device *))
-
-Kernel modules can be composed of several platform drivers. The platform core
-provides helpers to register and unregister an array of drivers:
-
- int __platform_register_drivers(struct platform_driver * const *drivers,
- unsigned int count, struct module *owner);
- void platform_unregister_drivers(struct platform_driver * const *drivers,
- unsigned int count);
-
-If one of the drivers fails to register, all drivers registered up to that
-point will be unregistered in reverse order. Note that there is a convenience
-macro that passes THIS_MODULE as owner parameter:
-
- #define platform_register_drivers(drivers, count)
-
-
-Device Enumeration
-~~~~~~~~~~~~~~~~~~
-As a rule, platform specific (and often board-specific) setup code will
-register platform devices:
-
- int platform_device_register(struct platform_device *pdev);
-
- int platform_add_devices(struct platform_device **pdevs, int ndev);
-
-The general rule is to register only those devices that actually exist,
-but in some cases extra devices might be registered. For example, a kernel
-might be configured to work with an external network adapter that might not
-be populated on all boards, or likewise to work with an integrated controller
-that some boards might not hook up to any peripherals.
-
-In some cases, boot firmware will export tables describing the devices
-that are populated on a given board. Without such tables, often the
-only way for system setup code to set up the correct devices is to build
-a kernel for a specific target board. Such board-specific kernels are
-common with embedded and custom systems development.
-
-In many cases, the memory and IRQ resources associated with the platform
-device are not enough to let the device's driver work. Board setup code
-will often provide additional information using the device's platform_data
-field to hold additional information.
-
-Embedded systems frequently need one or more clocks for platform devices,
-which are normally kept off until they're actively needed (to save power).
-System setup also associates those clocks with the device, so that that
-calls to clk_get(&pdev->dev, clock_name) return them as needed.
-
-
-Legacy Drivers: Device Probing
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Some drivers are not fully converted to the driver model, because they take
-on a non-driver role: the driver registers its platform device, rather than
-leaving that for system infrastructure. Such drivers can't be hotplugged
-or coldplugged, since those mechanisms require device creation to be in a
-different system component than the driver.
-
-The only "good" reason for this is to handle older system designs which, like
-original IBM PCs, rely on error-prone "probe-the-hardware" models for hardware
-configuration. Newer systems have largely abandoned that model, in favor of
-bus-level support for dynamic configuration (PCI, USB), or device tables
-provided by the boot firmware (e.g. PNPACPI on x86). There are too many
-conflicting options about what might be where, and even educated guesses by
-an operating system will be wrong often enough to make trouble.
-
-This style of driver is discouraged. If you're updating such a driver,
-please try to move the device enumeration to a more appropriate location,
-outside the driver. This will usually be cleanup, since such drivers
-tend to already have "normal" modes, such as ones using device nodes that
-were created by PNP or by platform device setup.
-
-None the less, there are some APIs to support such legacy drivers. Avoid
-using these calls except with such hotplug-deficient drivers.
-
- struct platform_device *platform_device_alloc(
- const char *name, int id);
-
-You can use platform_device_alloc() to dynamically allocate a device, which
-you will then initialize with resources and platform_device_register().
-A better solution is usually:
-
- struct platform_device *platform_device_register_simple(
- const char *name, int id,
- struct resource *res, unsigned int nres);
-
-You can use platform_device_register_simple() as a one-step call to allocate
-and register a device.
-
-
-Device Naming and Driver Binding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The platform_device.dev.bus_id is the canonical name for the devices.
-It's built from two components:
-
- * platform_device.name ... which is also used to for driver matching.
-
- * platform_device.id ... the device instance number, or else "-1"
- to indicate there's only one.
-
-These are concatenated, so name/id "serial"/0 indicates bus_id "serial.0", and
-"serial/3" indicates bus_id "serial.3"; both would use the platform_driver
-named "serial". While "my_rtc"/-1 would be bus_id "my_rtc" (no instance id)
-and use the platform_driver called "my_rtc".
-
-Driver binding is performed automatically by the driver core, invoking
-driver probe() after finding a match between device and driver. If the
-probe() succeeds, the driver and device are bound as usual. There are
-three different ways to find such a match:
-
- - Whenever a device is registered, the drivers for that bus are
- checked for matches. Platform devices should be registered very
- early during system boot.
-
- - When a driver is registered using platform_driver_register(), all
- unbound devices on that bus are checked for matches. Drivers
- usually register later during booting, or by module loading.
-
- - Registering a driver using platform_driver_probe() works just like
- using platform_driver_register(), except that the driver won't
- be probed later if another device registers. (Which is OK, since
- this interface is only for use with non-hotpluggable devices.)
-
-
-Early Platform Devices and Drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The early platform interfaces provide platform data to platform device
-drivers early on during the system boot. The code is built on top of the
-early_param() command line parsing and can be executed very early on.
-
-Example: "earlyprintk" class early serial console in 6 steps
-
-1. Registering early platform device data
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code registers platform device data using the function
-early_platform_add_devices(). In the case of early serial console this
-should be hardware configuration for the serial port. Devices registered
-at this point will later on be matched against early platform drivers.
-
-2. Parsing kernel command line
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls parse_early_param() to parse the kernel
-command line. This will execute all matching early_param() callbacks.
-User specified early platform devices will be registered at this point.
-For the early serial console case the user can specify port on the
-kernel command line as "earlyprintk=serial.0" where "earlyprintk" is
-the class string, "serial" is the name of the platform driver and
-0 is the platform device id. If the id is -1 then the dot and the
-id can be omitted.
-
-3. Installing early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code may optionally force registration of all early
-platform drivers belonging to a certain class using the function
-early_platform_driver_register_all(). User specified devices from
-step 2 have priority over these. This step is omitted by the serial
-driver example since the early serial driver code should be disabled
-unless the user has specified port on the kernel command line.
-
-4. Early platform driver registration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Compiled-in platform drivers making use of early_platform_init() are
-automatically registered during step 2 or 3. The serial driver example
-should use early_platform_init("earlyprintk", &platform_driver).
-
-5. Probing of early platform drivers belonging to a certain class
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The architecture code calls early_platform_driver_probe() to match
-registered early platform devices associated with a certain class with
-registered early platform drivers. Matched devices will get probed().
-This step can be executed at any point during the early boot. As soon
-as possible may be good for the serial port case.
-
-6. Inside the early platform driver probe()
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The driver code needs to take special care during early boot, especially
-when it comes to memory allocation and interrupt registration. The code
-in the probe() function can use is_early_platform_device() to check if
-it is called at early platform device or at the regular platform device
-time. The early serial driver performs register_console() at this point.
-
-For further information, see <linux/platform_device.h>.
diff --git a/Documentation/driver-model/porting.txt b/Documentation/driver-model/porting.txt
deleted file mode 100644
index 453053f1661f..000000000000
--- a/Documentation/driver-model/porting.txt
+++ /dev/null
@@ -1,447 +0,0 @@
-
-Porting Drivers to the New Driver Model
-
-Patrick Mochel
-
-7 January 2003
-
-
-Overview
-
-Please refer to Documentation/driver-model/*.txt for definitions of
-various driver types and concepts.
-
-Most of the work of porting devices drivers to the new model happens
-at the bus driver layer. This was intentional, to minimize the
-negative effect on kernel drivers, and to allow a gradual transition
-of bus drivers.
-
-In a nutshell, the driver model consists of a set of objects that can
-be embedded in larger, bus-specific objects. Fields in these generic
-objects can replace fields in the bus-specific objects.
-
-The generic objects must be registered with the driver model core. By
-doing so, they will exported via the sysfs filesystem. sysfs can be
-mounted by doing
-
- # mount -t sysfs sysfs /sys
-
-
-
-The Process
-
-Step 0: Read include/linux/device.h for object and function definitions.
-
-Step 1: Registering the bus driver.
-
-
-- Define a struct bus_type for the bus driver.
-
-struct bus_type pci_bus_type = {
- .name = "pci",
-};
-
-
-- Register the bus type.
- This should be done in the initialization function for the bus type,
- which is usually the module_init(), or equivalent, function.
-
-static int __init pci_driver_init(void)
-{
- return bus_register(&pci_bus_type);
-}
-
-subsys_initcall(pci_driver_init);
-
-
- The bus type may be unregistered (if the bus driver may be compiled
- as a module) by doing:
-
- bus_unregister(&pci_bus_type);
-
-
-- Export the bus type for others to use.
-
- Other code may wish to reference the bus type, so declare it in a
- shared header file and export the symbol.
-
-From include/linux/pci.h:
-
-extern struct bus_type pci_bus_type;
-
-
-From file the above code appears in:
-
-EXPORT_SYMBOL(pci_bus_type);
-
-
-
-- This will cause the bus to show up in /sys/bus/pci/ with two
- subdirectories: 'devices' and 'drivers'.
-
-# tree -d /sys/bus/pci/
-/sys/bus/pci/
-|-- devices
-`-- drivers
-
-
-
-Step 2: Registering Devices.
-
-struct device represents a single device. It mainly contains metadata
-describing the relationship the device has to other entities.
-
-
-- Embed a struct device in the bus-specific device type.
-
-
-struct pci_dev {
- ...
- struct device dev; /* Generic device interface */
- ...
-};
-
- It is recommended that the generic device not be the first item in
- the struct to discourage programmers from doing mindless casts
- between the object types. Instead macros, or inline functions,
- should be created to convert from the generic object type.
-
-
-#define to_pci_dev(n) container_of(n, struct pci_dev, dev)
-
-or
-
-static inline struct pci_dev * to_pci_dev(struct kobject * kobj)
-{
- return container_of(n, struct pci_dev, dev);
-}
-
- This allows the compiler to verify type-safety of the operations
- that are performed (which is Good).
-
-
-- Initialize the device on registration.
-
- When devices are discovered or registered with the bus type, the
- bus driver should initialize the generic device. The most important
- things to initialize are the bus_id, parent, and bus fields.
-
- The bus_id is an ASCII string that contains the device's address on
- the bus. The format of this string is bus-specific. This is
- necessary for representing devices in sysfs.
-
- parent is the physical parent of the device. It is important that
- the bus driver sets this field correctly.
-
- The driver model maintains an ordered list of devices that it uses
- for power management. This list must be in order to guarantee that
- devices are shutdown before their physical parents, and vice versa.
- The order of this list is determined by the parent of registered
- devices.
-
- Also, the location of the device's sysfs directory depends on a
- device's parent. sysfs exports a directory structure that mirrors
- the device hierarchy. Accurately setting the parent guarantees that
- sysfs will accurately represent the hierarchy.
-
- The device's bus field is a pointer to the bus type the device
- belongs to. This should be set to the bus_type that was declared
- and initialized before.
-
- Optionally, the bus driver may set the device's name and release
- fields.
-
- The name field is an ASCII string describing the device, like
-
- "ATI Technologies Inc Radeon QD"
-
- The release field is a callback that the driver model core calls
- when the device has been removed, and all references to it have
- been released. More on this in a moment.
-
-
-- Register the device.
-
- Once the generic device has been initialized, it can be registered
- with the driver model core by doing:
-
- device_register(&dev->dev);
-
- It can later be unregistered by doing:
-
- device_unregister(&dev->dev);
-
- This should happen on buses that support hotpluggable devices.
- If a bus driver unregisters a device, it should not immediately free
- it. It should instead wait for the driver model core to call the
- device's release method, then free the bus-specific object.
- (There may be other code that is currently referencing the device
- structure, and it would be rude to free the device while that is
- happening).
-
-
- When the device is registered, a directory in sysfs is created.
- The PCI tree in sysfs looks like:
-
-/sys/devices/pci0/
-|-- 00:00.0
-|-- 00:01.0
-| `-- 01:00.0
-|-- 00:02.0
-| `-- 02:1f.0
-| `-- 03:00.0
-|-- 00:1e.0
-| `-- 04:04.0
-|-- 00:1f.0
-|-- 00:1f.1
-| |-- ide0
-| | |-- 0.0
-| | `-- 0.1
-| `-- ide1
-| `-- 1.0
-|-- 00:1f.2
-|-- 00:1f.3
-`-- 00:1f.5
-
- Also, symlinks are created in the bus's 'devices' directory
- that point to the device's directory in the physical hierarchy.
-
-/sys/bus/pci/devices/
-|-- 00:00.0 -> ../../../devices/pci0/00:00.0
-|-- 00:01.0 -> ../../../devices/pci0/00:01.0
-|-- 00:02.0 -> ../../../devices/pci0/00:02.0
-|-- 00:1e.0 -> ../../../devices/pci0/00:1e.0
-|-- 00:1f.0 -> ../../../devices/pci0/00:1f.0
-|-- 00:1f.1 -> ../../../devices/pci0/00:1f.1
-|-- 00:1f.2 -> ../../../devices/pci0/00:1f.2
-|-- 00:1f.3 -> ../../../devices/pci0/00:1f.3
-|-- 00:1f.5 -> ../../../devices/pci0/00:1f.5
-|-- 01:00.0 -> ../../../devices/pci0/00:01.0/01:00.0
-|-- 02:1f.0 -> ../../../devices/pci0/00:02.0/02:1f.0
-|-- 03:00.0 -> ../../../devices/pci0/00:02.0/02:1f.0/03:00.0
-`-- 04:04.0 -> ../../../devices/pci0/00:1e.0/04:04.0
-
-
-
-Step 3: Registering Drivers.
-
-struct device_driver is a simple driver structure that contains a set
-of operations that the driver model core may call.
-
-
-- Embed a struct device_driver in the bus-specific driver.
-
- Just like with devices, do something like:
-
-struct pci_driver {
- ...
- struct device_driver driver;
-};
-
-
-- Initialize the generic driver structure.
-
- When the driver registers with the bus (e.g. doing pci_register_driver()),
- initialize the necessary fields of the driver: the name and bus
- fields.
-
-
-- Register the driver.
-
- After the generic driver has been initialized, call
-
- driver_register(&drv->driver);
-
- to register the driver with the core.
-
- When the driver is unregistered from the bus, unregister it from the
- core by doing:
-
- driver_unregister(&drv->driver);
-
- Note that this will block until all references to the driver have
- gone away. Normally, there will not be any.
-
-
-- Sysfs representation.
-
- Drivers are exported via sysfs in their bus's 'driver's directory.
- For example:
-
-/sys/bus/pci/drivers/
-|-- 3c59x
-|-- Ensoniq AudioPCI
-|-- agpgart-amdk7
-|-- e100
-`-- serial
-
-
-Step 4: Define Generic Methods for Drivers.
-
-struct device_driver defines a set of operations that the driver model
-core calls. Most of these operations are probably similar to
-operations the bus already defines for drivers, but taking different
-parameters.
-
-It would be difficult and tedious to force every driver on a bus to
-simultaneously convert their drivers to generic format. Instead, the
-bus driver should define single instances of the generic methods that
-forward call to the bus-specific drivers. For instance:
-
-
-static int pci_device_remove(struct device * dev)
-{
- struct pci_dev * pci_dev = to_pci_dev(dev);
- struct pci_driver * drv = pci_dev->driver;
-
- if (drv) {
- if (drv->remove)
- drv->remove(pci_dev);
- pci_dev->driver = NULL;
- }
- return 0;
-}
-
-
-The generic driver should be initialized with these methods before it
-is registered.
-
- /* initialize common driver fields */
- drv->driver.name = drv->name;
- drv->driver.bus = &pci_bus_type;
- drv->driver.probe = pci_device_probe;
- drv->driver.resume = pci_device_resume;
- drv->driver.suspend = pci_device_suspend;
- drv->driver.remove = pci_device_remove;
-
- /* register with core */
- driver_register(&drv->driver);
-
-
-Ideally, the bus should only initialize the fields if they are not
-already set. This allows the drivers to implement their own generic
-methods.
-
-
-Step 5: Support generic driver binding.
-
-The model assumes that a device or driver can be dynamically
-registered with the bus at any time. When registration happens,
-devices must be bound to a driver, or drivers must be bound to all
-devices that it supports.
-
-A driver typically contains a list of device IDs that it supports. The
-bus driver compares these IDs to the IDs of devices registered with it.
-The format of the device IDs, and the semantics for comparing them are
-bus-specific, so the generic model does attempt to generalize them.
-
-Instead, a bus may supply a method in struct bus_type that does the
-comparison:
-
- int (*match)(struct device * dev, struct device_driver * drv);
-
-match should return positive value if the driver supports the device,
-and zero otherwise. It may also return error code (for example
--EPROBE_DEFER) if determining that given driver supports the device is
-not possible.
-
-When a device is registered, the bus's list of drivers is iterated
-over. bus->match() is called for each one until a match is found.
-
-When a driver is registered, the bus's list of devices is iterated
-over. bus->match() is called for each device that is not already
-claimed by a driver.
-
-When a device is successfully bound to a driver, device->driver is
-set, the device is added to a per-driver list of devices, and a
-symlink is created in the driver's sysfs directory that points to the
-device's physical directory:
-
-/sys/bus/pci/drivers/
-|-- 3c59x
-| `-- 00:0b.0 -> ../../../../devices/pci0/00:0b.0
-|-- Ensoniq AudioPCI
-|-- agpgart-amdk7
-| `-- 00:00.0 -> ../../../../devices/pci0/00:00.0
-|-- e100
-| `-- 00:0c.0 -> ../../../../devices/pci0/00:0c.0
-`-- serial
-
-
-This driver binding should replace the existing driver binding
-mechanism the bus currently uses.
-
-
-Step 6: Supply a hotplug callback.
-
-Whenever a device is registered with the driver model core, the
-userspace program /sbin/hotplug is called to notify userspace.
-Users can define actions to perform when a device is inserted or
-removed.
-
-The driver model core passes several arguments to userspace via
-environment variables, including
-
-- ACTION: set to 'add' or 'remove'
-- DEVPATH: set to the device's physical path in sysfs.
-
-A bus driver may also supply additional parameters for userspace to
-consume. To do this, a bus must implement the 'hotplug' method in
-struct bus_type:
-
- int (*hotplug) (struct device *dev, char **envp,
- int num_envp, char *buffer, int buffer_size);
-
-This is called immediately before /sbin/hotplug is executed.
-
-
-Step 7: Cleaning up the bus driver.
-
-The generic bus, device, and driver structures provide several fields
-that can replace those defined privately to the bus driver.
-
-- Device list.
-
-struct bus_type contains a list of all devices registered with the bus
-type. This includes all devices on all instances of that bus type.
-An internal list that the bus uses may be removed, in favor of using
-this one.
-
-The core provides an iterator to access these devices.
-
-int bus_for_each_dev(struct bus_type * bus, struct device * start,
- void * data, int (*fn)(struct device *, void *));
-
-
-- Driver list.
-
-struct bus_type also contains a list of all drivers registered with
-it. An internal list of drivers that the bus driver maintains may
-be removed in favor of using the generic one.
-
-The drivers may be iterated over, like devices:
-
-int bus_for_each_drv(struct bus_type * bus, struct device_driver * start,
- void * data, int (*fn)(struct device_driver *, void *));
-
-
-Please see drivers/base/bus.c for more information.
-
-
-- rwsem
-
-struct bus_type contains an rwsem that protects all core accesses to
-the device and driver lists. This can be used by the bus driver
-internally, and should be used when accessing the device or driver
-lists the bus maintains.
-
-
-- Device and driver fields.
-
-Some of the fields in struct device and struct device_driver duplicate
-fields in the bus-specific representations of these objects. Feel free
-to remove the bus-specific ones and favor the generic ones. Note
-though, that this will likely mean fixing up all the drivers that
-reference the bus-specific fields (though those should all be 1-line
-changes).
-
diff --git a/Documentation/early-userspace/README b/Documentation/early-userspace/README
deleted file mode 100644
index 955d667dc87e..000000000000
--- a/Documentation/early-userspace/README
+++ /dev/null
@@ -1,151 +0,0 @@
-Early userspace support
-=======================
-
-Last update: 2004-12-20 tlh
-
-
-"Early userspace" is a set of libraries and programs that provide
-various pieces of functionality that are important enough to be
-available while a Linux kernel is coming up, but that don't need to be
-run inside the kernel itself.
-
-It consists of several major infrastructure components:
-
-- gen_init_cpio, a program that builds a cpio-format archive
- containing a root filesystem image. This archive is compressed, and
- the compressed image is linked into the kernel image.
-- initramfs, a chunk of code that unpacks the compressed cpio image
- midway through the kernel boot process.
-- klibc, a userspace C library, currently packaged separately, that is
- optimized for correctness and small size.
-
-The cpio file format used by initramfs is the "newc" (aka "cpio -H newc")
-format, and is documented in the file "buffer-format.txt". There are
-two ways to add an early userspace image: specify an existing cpio
-archive to be used as the image or have the kernel build process build
-the image from specifications.
-
-CPIO ARCHIVE method
-
-You can create a cpio archive that contains the early userspace image.
-Your cpio archive should be specified in CONFIG_INITRAMFS_SOURCE and it
-will be used directly. Only a single cpio file may be specified in
-CONFIG_INITRAMFS_SOURCE and directory and file names are not allowed in
-combination with a cpio archive.
-
-IMAGE BUILDING method
-
-The kernel build process can also build an early userspace image from
-source parts rather than supplying a cpio archive. This method provides
-a way to create images with root-owned files even though the image was
-built by an unprivileged user.
-
-The image is specified as one or more sources in
-CONFIG_INITRAMFS_SOURCE. Sources can be either directories or files -
-cpio archives are *not* allowed when building from sources.
-
-A source directory will have it and all of its contents packaged. The
-specified directory name will be mapped to '/'. When packaging a
-directory, limited user and group ID translation can be performed.
-INITRAMFS_ROOT_UID can be set to a user ID that needs to be mapped to
-user root (0). INITRAMFS_ROOT_GID can be set to a group ID that needs
-to be mapped to group root (0).
-
-A source file must be directives in the format required by the
-usr/gen_init_cpio utility (run 'usr/gen_init_cpio -h' to get the
-file format). The directives in the file will be passed directly to
-usr/gen_init_cpio.
-
-When a combination of directories and files are specified then the
-initramfs image will be an aggregate of all of them. In this way a user
-can create a 'root-image' directory and install all files into it.
-Because device-special files cannot be created by a unprivileged user,
-special files can be listed in a 'root-files' file. Both 'root-image'
-and 'root-files' can be listed in CONFIG_INITRAMFS_SOURCE and a complete
-early userspace image can be built by an unprivileged user.
-
-As a technical note, when directories and files are specified, the
-entire CONFIG_INITRAMFS_SOURCE is passed to
-usr/gen_initramfs_list.sh. This means that CONFIG_INITRAMFS_SOURCE
-can really be interpreted as any legal argument to
-gen_initramfs_list.sh. If a directory is specified as an argument then
-the contents are scanned, uid/gid translation is performed, and
-usr/gen_init_cpio file directives are output. If a directory is
-specified as an argument to usr/gen_initramfs_list.sh then the
-contents of the file are simply copied to the output. All of the output
-directives from directory scanning and file contents copying are
-processed by usr/gen_init_cpio.
-
-See also 'usr/gen_initramfs_list.sh -h'.
-
-Where's this all leading?
-=========================
-
-The klibc distribution contains some of the necessary software to make
-early userspace useful. The klibc distribution is currently
-maintained separately from the kernel.
-
-You can obtain somewhat infrequent snapshots of klibc from
-https://www.kernel.org/pub/linux/libs/klibc/
-
-For active users, you are better off using the klibc git
-repository, at http://git.kernel.org/?p=libs/klibc/klibc.git
-
-The standalone klibc distribution currently provides three components,
-in addition to the klibc library:
-
-- ipconfig, a program that configures network interfaces. It can
- configure them statically, or use DHCP to obtain information
- dynamically (aka "IP autoconfiguration").
-- nfsmount, a program that can mount an NFS filesystem.
-- kinit, the "glue" that uses ipconfig and nfsmount to replace the old
- support for IP autoconfig, mount a filesystem over NFS, and continue
- system boot using that filesystem as root.
-
-kinit is built as a single statically linked binary to save space.
-
-Eventually, several more chunks of kernel functionality will hopefully
-move to early userspace:
-
-- Almost all of init/do_mounts* (the beginning of this is already in
- place)
-- ACPI table parsing
-- Insert unwieldy subsystem that doesn't really need to be in kernel
- space here
-
-If kinit doesn't meet your current needs and you've got bytes to burn,
-the klibc distribution includes a small Bourne-compatible shell (ash)
-and a number of other utilities, so you can replace kinit and build
-custom initramfs images that meet your needs exactly.
-
-For questions and help, you can sign up for the early userspace
-mailing list at http://www.zytor.com/mailman/listinfo/klibc
-
-How does it work?
-=================
-
-The kernel has currently 3 ways to mount the root filesystem:
-
-a) all required device and filesystem drivers compiled into the kernel, no
- initrd. init/main.c:init() will call prepare_namespace() to mount the
- final root filesystem, based on the root= option and optional init= to run
- some other init binary than listed at the end of init/main.c:init().
-
-b) some device and filesystem drivers built as modules and stored in an
- initrd. The initrd must contain a binary '/linuxrc' which is supposed to
- load these driver modules. It is also possible to mount the final root
- filesystem via linuxrc and use the pivot_root syscall. The initrd is
- mounted and executed via prepare_namespace().
-
-c) using initramfs. The call to prepare_namespace() must be skipped.
- This means that a binary must do all the work. Said binary can be stored
- into initramfs either via modifying usr/gen_init_cpio.c or via the new
- initrd format, an cpio archive. It must be called "/init". This binary
- is responsible to do all the things prepare_namespace() would do.
-
- To maintain backwards compatibility, the /init binary will only run if it
- comes via an initramfs cpio archive. If this is not the case,
- init/main.c:init() will run prepare_namespace() to mount the final root
- and exec one of the predefined init binaries.
-
-Bryan O'Sullivan <bos@serpentine.com>
diff --git a/Documentation/early-userspace/buffer-format.txt b/Documentation/early-userspace/buffer-format.txt
deleted file mode 100644
index e1fd7f9dad16..000000000000
--- a/Documentation/early-userspace/buffer-format.txt
+++ /dev/null
@@ -1,112 +0,0 @@
- initramfs buffer format
- -----------------------
-
- Al Viro, H. Peter Anvin
- Last revision: 2002-01-13
-
-Starting with kernel 2.5.x, the old "initial ramdisk" protocol is
-getting {replaced/complemented} with the new "initial ramfs"
-(initramfs) protocol. The initramfs contents is passed using the same
-memory buffer protocol used by the initrd protocol, but the contents
-is different. The initramfs buffer contains an archive which is
-expanded into a ramfs filesystem; this document details the format of
-the initramfs buffer format.
-
-The initramfs buffer format is based around the "newc" or "crc" CPIO
-formats, and can be created with the cpio(1) utility. The cpio
-archive can be compressed using gzip(1). One valid version of an
-initramfs buffer is thus a single .cpio.gz file.
-
-The full format of the initramfs buffer is defined by the following
-grammar, where:
- * is used to indicate "0 or more occurrences of"
- (|) indicates alternatives
- + indicates concatenation
- GZIP() indicates the gzip(1) of the operand
- ALGN(n) means padding with null bytes to an n-byte boundary
-
- initramfs := ("\0" | cpio_archive | cpio_gzip_archive)*
-
- cpio_gzip_archive := GZIP(cpio_archive)
-
- cpio_archive := cpio_file* + (<nothing> | cpio_trailer)
-
- cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data
-
- cpio_trailer := ALGN(4) + cpio_header + "TRAILER!!!\0" + ALGN(4)
-
-
-In human terms, the initramfs buffer contains a collection of
-compressed and/or uncompressed cpio archives (in the "newc" or "crc"
-formats); arbitrary amounts zero bytes (for padding) can be added
-between members.
-
-The cpio "TRAILER!!!" entry (cpio end-of-archive) is optional, but is
-not ignored; see "handling of hard links" below.
-
-The structure of the cpio_header is as follows (all fields contain
-hexadecimal ASCII numbers fully padded with '0' on the left to the
-full width of the field, for example, the integer 4780 is represented
-by the ASCII string "000012ac"):
-
-Field name Field size Meaning
-c_magic 6 bytes The string "070701" or "070702"
-c_ino 8 bytes File inode number
-c_mode 8 bytes File mode and permissions
-c_uid 8 bytes File uid
-c_gid 8 bytes File gid
-c_nlink 8 bytes Number of links
-c_mtime 8 bytes Modification time
-c_filesize 8 bytes Size of data field
-c_maj 8 bytes Major part of file device number
-c_min 8 bytes Minor part of file device number
-c_rmaj 8 bytes Major part of device node reference
-c_rmin 8 bytes Minor part of device node reference
-c_namesize 8 bytes Length of filename, including final \0
-c_chksum 8 bytes Checksum of data field if c_magic is 070702;
- otherwise zero
-
-The c_mode field matches the contents of st_mode returned by stat(2)
-on Linux, and encodes the file type and file permissions.
-
-The c_filesize should be zero for any file which is not a regular file
-or symlink.
-
-The c_chksum field contains a simple 32-bit unsigned sum of all the
-bytes in the data field. cpio(1) refers to this as "crc", which is
-clearly incorrect (a cyclic redundancy check is a different and
-significantly stronger integrity check), however, this is the
-algorithm used.
-
-If the filename is "TRAILER!!!" this is actually an end-of-archive
-marker; the c_filesize for an end-of-archive marker must be zero.
-
-
-*** Handling of hard links
-
-When a nondirectory with c_nlink > 1 is seen, the (c_maj,c_min,c_ino)
-tuple is looked up in a tuple buffer. If not found, it is entered in
-the tuple buffer and the entry is created as usual; if found, a hard
-link rather than a second copy of the file is created. It is not
-necessary (but permitted) to include a second copy of the file
-contents; if the file contents is not included, the c_filesize field
-should be set to zero to indicate no data section follows. If data is
-present, the previous instance of the file is overwritten; this allows
-the data-carrying instance of a file to occur anywhere in the sequence
-(GNU cpio is reported to attach the data to the last instance of a
-file only.)
-
-c_filesize must not be zero for a symlink.
-
-When a "TRAILER!!!" end-of-archive marker is seen, the tuple buffer is
-reset. This permits archives which are generated independently to be
-concatenated.
-
-To combine file data from different sources (without having to
-regenerate the (c_maj,c_min,c_ino) fields), therefore, either one of
-the following techniques can be used:
-
-a) Separate the different file data sources with a "TRAILER!!!"
- end-of-archive marker, or
-
-b) Make sure c_nlink == 1 for all nondirectory entries.
diff --git a/Documentation/eisa.txt b/Documentation/eisa.txt
deleted file mode 100644
index 2806e5544e43..000000000000
--- a/Documentation/eisa.txt
+++ /dev/null
@@ -1,230 +0,0 @@
-================
-EISA bus support
-================
-
-:Author: Marc Zyngier <maz@wild-wind.fr.eu.org>
-
-This document groups random notes about porting EISA drivers to the
-new EISA/sysfs API.
-
-Starting from version 2.5.59, the EISA bus is almost given the same
-status as other much more mainstream busses such as PCI or USB. This
-has been possible through sysfs, which defines a nice enough set of
-abstractions to manage busses, devices and drivers.
-
-Although the new API is quite simple to use, converting existing
-drivers to the new infrastructure is not an easy task (mostly because
-detection code is generally also used to probe ISA cards). Moreover,
-most EISA drivers are among the oldest Linux drivers so, as you can
-imagine, some dust has settled here over the years.
-
-The EISA infrastructure is made up of three parts:
-
- - The bus code implements most of the generic code. It is shared
- among all the architectures that the EISA code runs on. It
- implements bus probing (detecting EISA cards available on the bus),
- allocates I/O resources, allows fancy naming through sysfs, and
- offers interfaces for driver to register.
-
- - The bus root driver implements the glue between the bus hardware
- and the generic bus code. It is responsible for discovering the
- device implementing the bus, and setting it up to be latter probed
- by the bus code. This can go from something as simple as reserving
- an I/O region on x86, to the rather more complex, like the hppa
- EISA code. This is the part to implement in order to have EISA
- running on an "new" platform.
-
- - The driver offers the bus a list of devices that it manages, and
- implements the necessary callbacks to probe and release devices
- whenever told to.
-
-Every function/structure below lives in <linux/eisa.h>, which depends
-heavily on <linux/device.h>.
-
-Bus root driver
-===============
-
-::
-
- int eisa_root_register (struct eisa_root_device *root);
-
-The eisa_root_register function is used to declare a device as the
-root of an EISA bus. The eisa_root_device structure holds a reference
-to this device, as well as some parameters for probing purposes::
-
- struct eisa_root_device {
- struct device *dev; /* Pointer to bridge device */
- struct resource *res;
- unsigned long bus_base_addr;
- int slots; /* Max slot number */
- int force_probe; /* Probe even when no slot 0 */
- u64 dma_mask; /* from bridge device */
- int bus_nr; /* Set by eisa_root_register */
- struct resource eisa_root_res; /* ditto */
- };
-
-============= ======================================================
-node used for eisa_root_register internal purpose
-dev pointer to the root device
-res root device I/O resource
-bus_base_addr slot 0 address on this bus
-slots max slot number to probe
-force_probe Probe even when slot 0 is empty (no EISA mainboard)
-dma_mask Default DMA mask. Usually the bridge device dma_mask.
-bus_nr unique bus id, set by eisa_root_register
-============= ======================================================
-
-Driver
-======
-
-::
-
- int eisa_driver_register (struct eisa_driver *edrv);
- void eisa_driver_unregister (struct eisa_driver *edrv);
-
-Clear enough ?
-
-::
-
- struct eisa_device_id {
- char sig[EISA_SIG_LEN];
- unsigned long driver_data;
- };
-
- struct eisa_driver {
- const struct eisa_device_id *id_table;
- struct device_driver driver;
- };
-
-=============== ====================================================
-id_table an array of NULL terminated EISA id strings,
- followed by an empty string. Each string can
- optionally be paired with a driver-dependent value
- (driver_data).
-
-driver a generic driver, such as described in
- Documentation/driver-model/driver.txt. Only .name,
- .probe and .remove members are mandatory.
-=============== ====================================================
-
-An example is the 3c59x driver::
-
- static struct eisa_device_id vortex_eisa_ids[] = {
- { "TCM5920", EISA_3C592_OFFSET },
- { "TCM5970", EISA_3C597_OFFSET },
- { "" }
- };
-
- static struct eisa_driver vortex_eisa_driver = {
- .id_table = vortex_eisa_ids,
- .driver = {
- .name = "3c59x",
- .probe = vortex_eisa_probe,
- .remove = vortex_eisa_remove
- }
- };
-
-Device
-======
-
-The sysfs framework calls .probe and .remove functions upon device
-discovery and removal (note that the .remove function is only called
-when driver is built as a module).
-
-Both functions are passed a pointer to a 'struct device', which is
-encapsulated in a 'struct eisa_device' described as follows::
-
- struct eisa_device {
- struct eisa_device_id id;
- int slot;
- int state;
- unsigned long base_addr;
- struct resource res[EISA_MAX_RESOURCES];
- u64 dma_mask;
- struct device dev; /* generic device */
- };
-
-======== ============================================================
-id EISA id, as read from device. id.driver_data is set from the
- matching driver EISA id.
-slot slot number which the device was detected on
-state set of flags indicating the state of the device. Current
- flags are EISA_CONFIG_ENABLED and EISA_CONFIG_FORCED.
-res set of four 256 bytes I/O regions allocated to this device
-dma_mask DMA mask set from the parent device.
-dev generic device (see Documentation/driver-model/device.txt)
-======== ============================================================
-
-You can get the 'struct eisa_device' from 'struct device' using the
-'to_eisa_device' macro.
-
-Misc stuff
-==========
-
-::
-
- void eisa_set_drvdata (struct eisa_device *edev, void *data);
-
-Stores data into the device's driver_data area.
-
-::
-
- void *eisa_get_drvdata (struct eisa_device *edev):
-
-Gets the pointer previously stored into the device's driver_data area.
-
-::
-
- int eisa_get_region_index (void *addr);
-
-Returns the region number (0 <= x < EISA_MAX_RESOURCES) of a given
-address.
-
-Kernel parameters
-=================
-
-eisa_bus.enable_dev
- A comma-separated list of slots to be enabled, even if the firmware
- set the card as disabled. The driver must be able to properly
- initialize the device in such conditions.
-
-eisa_bus.disable_dev
- A comma-separated list of slots to be enabled, even if the firmware
- set the card as enabled. The driver won't be called to handle this
- device.
-
-virtual_root.force_probe
- Force the probing code to probe EISA slots even when it cannot find an
- EISA compliant mainboard (nothing appears on slot 0). Defaults to 0
- (don't force), and set to 1 (force probing) when either
- CONFIG_ALPHA_JENSEN or CONFIG_EISA_VLB_PRIMING are set.
-
-Random notes
-============
-
-Converting an EISA driver to the new API mostly involves *deleting*
-code (since probing is now in the core EISA code). Unfortunately, most
-drivers share their probing routine between ISA, and EISA. Special
-care must be taken when ripping out the EISA code, so other busses
-won't suffer from these surgical strikes...
-
-You *must not* expect any EISA device to be detected when returning
-from eisa_driver_register, since the chances are that the bus has not
-yet been probed. In fact, that's what happens most of the time (the
-bus root driver usually kicks in rather late in the boot process).
-Unfortunately, most drivers are doing the probing by themselves, and
-expect to have explored the whole machine when they exit their probe
-routine.
-
-For example, switching your favorite EISA SCSI card to the "hotplug"
-model is "the right thing"(tm).
-
-Thanks
-======
-
-I'd like to thank the following people for their help:
-
-- Xavier Benigni for lending me a wonderful Alpha Jensen,
-- James Bottomley, Jeff Garzik for getting this stuff into the kernel,
-- Andries Brouwer for contributing numerous EISA ids,
-- Catrin Jones for coping with far too many machines at home.
diff --git a/Documentation/extcon/intel-int3496.txt b/Documentation/extcon/intel-int3496.txt
deleted file mode 100644
index 8155dbc7fad3..000000000000
--- a/Documentation/extcon/intel-int3496.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Intel INT3496 ACPI device extcon driver documentation
------------------------------------------------------
-
-The Intel INT3496 ACPI device extcon driver is a driver for ACPI
-devices with an acpi-id of INT3496, such as found for example on
-Intel Baytrail and Cherrytrail tablets.
-
-This ACPI device describes how the OS can read the id-pin of the devices'
-USB-otg port, as well as how it optionally can enable Vbus output on the
-otg port and how it can optionally control the muxing of the data pins
-between an USB host and an USB peripheral controller.
-
-The ACPI devices exposes this functionality by returning an array with up
-to 3 gpio descriptors from its ACPI _CRS (Current Resource Settings) call:
-
-Index 0: The input gpio for the id-pin, this is always present and valid
-Index 1: The output gpio for enabling Vbus output from the device to the otg
- port, write 1 to enable the Vbus output (this gpio descriptor may
- be absent or invalid)
-Index 2: The output gpio for muxing of the data pins between the USB host and
- the USB peripheral controller, write 1 to mux to the peripheral
- controller
-
-There is a mapping between indices and GPIO connection IDs as follows
- id index 0
- vbus index 1
- mux index 2
diff --git a/Documentation/fault-injection/index.rst b/Documentation/fault-injection/index.rst
index 92b5639ed07a..8408a8a91b34 100644
--- a/Documentation/fault-injection/index.rst
+++ b/Documentation/fault-injection/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
===============
fault-injection
diff --git a/Documentation/fault-injection/nvme-fault-injection.rst b/Documentation/fault-injection/nvme-fault-injection.rst
index bbb1bf3e8650..cdb2e829228e 100644
--- a/Documentation/fault-injection/nvme-fault-injection.rst
+++ b/Documentation/fault-injection/nvme-fault-injection.rst
@@ -118,3 +118,61 @@ Message from dmesg::
cpu_startup_entry+0x6f/0x80
start_secondary+0x187/0x1e0
secondary_startup_64+0xa5/0xb0
+
+Example 3: Inject an error into the 10th admin command
+------------------------------------------------------
+
+::
+
+ echo 100 > /sys/kernel/debug/nvme0/fault_inject/probability
+ echo 10 > /sys/kernel/debug/nvme0/fault_inject/space
+ echo 1 > /sys/kernel/debug/nvme0/fault_inject/times
+ nvme reset /dev/nvme0
+
+Expected Result::
+
+ After NVMe controller reset, the reinitialization may or may not succeed.
+ It depends on which admin command is actually forced to fail.
+
+Message from dmesg::
+
+ nvme nvme0: resetting controller
+ FAULT_INJECTION: forcing a failure.
+ name fault_inject, interval 1, probability 100, space 1, times 1
+ CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0-rc2+ #2
+ Hardware name: MSI MS-7A45/B150M MORTAR ARCTIC (MS-7A45), BIOS 1.50 04/25/2017
+ Call Trace:
+ <IRQ>
+ dump_stack+0x63/0x85
+ should_fail+0x14a/0x170
+ nvme_should_fail+0x38/0x80 [nvme_core]
+ nvme_irq+0x129/0x280 [nvme]
+ ? blk_mq_end_request+0xb3/0x120
+ __handle_irq_event_percpu+0x84/0x1a0
+ handle_irq_event_percpu+0x32/0x80
+ handle_irq_event+0x3b/0x60
+ handle_edge_irq+0x7f/0x1a0
+ handle_irq+0x20/0x30
+ do_IRQ+0x4e/0xe0
+ common_interrupt+0xf/0xf
+ </IRQ>
+ RIP: 0010:cpuidle_enter_state+0xc5/0x460
+ Code: ff e8 8f 5f 86 ff 80 7d c7 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 69 03 00 00 31 ff e8 62 aa 8c ff fb 66 0f 1f 44 00 00 <45> 85 ed 0f 88 37 03 00 00 4c 8b 45 d0 4c 2b 45 b8 48 ba cf f7 53
+ RSP: 0018:ffffffff88c03dd0 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffdc
+ RAX: ffff9dac25a2ac80 RBX: ffffffff88d53760 RCX: 000000000000001f
+ RDX: 0000000000000000 RSI: 000000002d958403 RDI: 0000000000000000
+ RBP: ffffffff88c03e18 R08: fffffff75e35ffb7 R09: 00000a49a56c0b48
+ R10: ffffffff88c03da0 R11: 0000000000001b0c R12: ffff9dac25a34d00
+ R13: 0000000000000006 R14: 0000000000000006 R15: ffffffff88d53760
+ cpuidle_enter+0x2e/0x40
+ call_cpuidle+0x23/0x40
+ do_idle+0x201/0x280
+ cpu_startup_entry+0x1d/0x20
+ rest_init+0xaa/0xb0
+ arch_call_rest_init+0xe/0x1b
+ start_kernel+0x51c/0x53b
+ x86_64_start_reservations+0x24/0x26
+ x86_64_start_kernel+0x74/0x77
+ secondary_startup_64+0xa4/0xb0
+ nvme nvme0: Could not set queue count (16385)
+ nvme nvme0: IO queues not created
diff --git a/Documentation/fb/fbcon.rst b/Documentation/fb/fbcon.rst
index 1da65b9000de..ebca41785abe 100644
--- a/Documentation/fb/fbcon.rst
+++ b/Documentation/fb/fbcon.rst
@@ -187,7 +187,7 @@ the hardware. Thus, in a VGA console::
Assuming the VGA driver can be unloaded, one must first unbind the VGA driver
from the console layer before unloading the driver. The VGA driver cannot be
unloaded if it is still bound to the console layer. (See
-Documentation/console/console.txt for more information).
+Documentation/driver-api/console.rst for more information).
This is more complicated in the case of the framebuffer console (fbcon),
because fbcon is an intermediate layer between the console and the drivers::
@@ -204,7 +204,7 @@ fbcon. Thus, there is no need to explicitly unbind the fbdev drivers from
fbcon.
So, how do we unbind fbcon from the console? Part of the answer is in
-Documentation/console/console.txt. To summarize:
+Documentation/driver-api/console.rst. To summarize:
Echo a value to the bind file that represents the framebuffer console
driver. So assuming vtcon1 represents fbcon, then::
diff --git a/Documentation/fb/index.rst b/Documentation/fb/index.rst
index d47313714635..baf02393d8ee 100644
--- a/Documentation/fb/index.rst
+++ b/Documentation/fb/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
============
Frame Buffer
diff --git a/Documentation/fb/modedb.rst b/Documentation/fb/modedb.rst
index 3c2397293977..9c4e3fd39e6d 100644
--- a/Documentation/fb/modedb.rst
+++ b/Documentation/fb/modedb.rst
@@ -53,6 +53,20 @@ Specifying the option multiple times for different ports is possible, e.g.::
video=LVDS-1:d video=HDMI-1:D
+Options can also be passed after the mode, using commas as separator.
+
+ Sample usage: 720x480,rotate=180 - 720x480 mode, rotated by 180 degrees
+
+Valid options are::
+
+ - margin_top, margin_bottom, margin_left, margin_right (integer):
+ Number of pixels in the margins, typically to deal with overscan on TVs
+ - reflect_x (boolean): Perform an axial symmetry on the X axis
+ - reflect_y (boolean): Perform an axial symmetry on the Y axis
+ - rotate (integer): Rotate the initial framebuffer by x
+ degrees. Valid values are 0, 90, 180 and 270.
+
+
-----------------------------------------------------------------------------
What is the VESA(TM) Coordinated Video Timings (CVT)?
diff --git a/Documentation/fb/vesafb.rst b/Documentation/fb/vesafb.rst
index 2ed0dfb661cf..6821c87b7893 100644
--- a/Documentation/fb/vesafb.rst
+++ b/Documentation/fb/vesafb.rst
@@ -30,7 +30,7 @@ How to use it?
==============
Switching modes is done using the vga=... boot parameter. Read
-Documentation/svga.txt for details.
+Documentation/admin-guide/svga.rst for details.
You should compile in both vgacon (for text mode) and vesafb (for
graphics mode). Which of them takes over the console depends on
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index dac435575384..204dd3ea36bb 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -361,8 +361,6 @@ so fl_release_private called on a lease should not block.
----------------------- lock_manager_operations ---------------------------
prototypes:
- int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
- unsigned long (*lm_owner_key)(struct file_lock *);
void (*lm_notify)(struct file_lock *); /* unblock callback */
int (*lm_grant)(struct file_lock *, struct file_lock *, int);
void (*lm_break)(struct file_lock *); /* break_lease callback */
@@ -371,23 +369,11 @@ prototypes:
locking rules:
inode->i_lock blocked_lock_lock may block
-lm_compare_owner: yes[1] maybe no
-lm_owner_key yes[1] yes no
lm_notify: yes yes no
lm_grant: no no no
lm_break: yes no no
lm_change yes no no
-[1]: ->lm_compare_owner and ->lm_owner_key are generally called with
-*an* inode->i_lock held. It may not be the i_lock of the inode
-associated with either file_lock argument! This is the case with deadlock
-detection, since the code has to chase down the owners of locks that may
-be entirely unrelated to the one on which the lock is being acquired.
-For deadlock detection however, the blocked_lock_lock is also held. The
-fact that these locks are held ensures that the file_locks do not
-disappear out from under you while doing the comparison or generating an
-owner key.
-
--------------------------- buffer_head -----------------------------------
prototypes:
void (*b_end_io)(struct buffer_head *bh, int uptodate);
diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt
index 61311356025d..545262c167c3 100644
--- a/Documentation/filesystems/coda.txt
+++ b/Documentation/filesystems/coda.txt
@@ -481,7 +481,10 @@ kernel support.
-
+ struct coda_timespec {
+ int64_t tv_sec; /* seconds */
+ long tv_nsec; /* nanoseconds */
+ };
struct coda_vattr {
enum coda_vtype va_type; /* vnode type (for create) */
@@ -493,9 +496,9 @@ kernel support.
long va_fileid; /* file id */
u_quad_t va_size; /* file size in bytes */
long va_blocksize; /* blocksize preferred for i/o */
- struct timespec va_atime; /* time of last access */
- struct timespec va_mtime; /* time of last modification */
- struct timespec va_ctime; /* time file changed */
+ struct coda_timespec va_atime; /* time of last access */
+ struct coda_timespec va_mtime; /* time of last modification */
+ struct coda_timespec va_ctime; /* time file changed */
u_long va_gen; /* generation number of file */
u_long va_flags; /* flags defined for file */
dev_t va_rdev; /* device special file represents */
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 6d2c0d340dea..679729442fd2 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -76,7 +76,7 @@ exposure of uninitialized data through mmap.
These filesystems may be used for inspiration:
- ext2: see Documentation/filesystems/ext2.txt
- ext4: see Documentation/filesystems/ext4/
-- xfs: see Documentation/filesystems/xfs.txt
+- xfs: see Documentation/admin-guide/xfs.rst
Handling Media Errors
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 4a0a9c3f4af6..9e27c843d00e 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -169,7 +169,7 @@ byte offsets over a base for the register block.
If you want to dump an u32 array in debugfs, you can create file with:
- struct dentry *debugfs_create_u32_array(const char *name, umode_t mode,
+ void debugfs_create_u32_array(const char *name, umode_t mode,
struct dentry *parent,
u32 *array, u32 elements);
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index a19973a4dd1e..94c2cf0292f5 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -57,7 +57,13 @@ noacl Don't support POSIX ACLs.
nobh Do not attach buffer_heads to file pagecache.
-grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2.
+quota, usrquota Enable user disk quota support
+ (requires CONFIG_QUOTA).
+
+grpquota Enable group disk quota support
+ (requires CONFIG_QUOTA).
+
+noquota option ls silently ignored by ext2.
Specification
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index f7b5e4ff0de3..496fa28b2492 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -214,11 +214,22 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix",
non-atomic files likewise "nobarrier" mount option.
test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt
context. The fake fscrypt context is used by xfstests.
-checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable"
+checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enable"
to reenable checkpointing. Is enabled by default. While
disabled, any unmounting or unexpected shutdowns will cause
the filesystem contents to appear as they did when the
filesystem was mounted with that option.
+ While mounting with checkpoint=disabled, the filesystem must
+ run garbage collection to ensure that all available space can
+ be used. If this takes too much time, the mount may return
+ EAGAIN. You may optionally add a value to indicate how much
+ of the disk you would be willing to temporarily give up to
+ avoid additional garbage collection. This can be given as a
+ number of blocks, or as a percent. For instance, mounting
+ with checkpoint=disable:100% would always succeed, but it may
+ hide up to all remaining free space. The actual space that
+ would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
+ This space is reclaimed once checkpoint=enable.
================================================================================
DEBUGFS ENTRIES
@@ -246,11 +257,14 @@ Files in /sys/fs/f2fs/<devname>
..............................................................................
File Content
- gc_max_sleep_time This tuning parameter controls the maximum sleep
+ gc_urgent_sleep_time This parameter controls sleep time for gc_urgent.
+ 500 ms is set by default. See above gc_urgent.
+
+ gc_min_sleep_time This tuning parameter controls the minimum sleep
time for the garbage collection thread. Time is
in milliseconds.
- gc_min_sleep_time This tuning parameter controls the minimum sleep
+ gc_max_sleep_time This tuning parameter controls the maximum sleep
time for the garbage collection thread. Time is
in milliseconds.
@@ -270,9 +284,6 @@ Files in /sys/fs/f2fs/<devname>
to 1, background thread starts to do GC by given
gc_urgent_sleep_time interval.
- gc_urgent_sleep_time This parameter controls sleep time for gc_urgent.
- 500 ms is set by default. See above gc_urgent.
-
reclaim_segments This parameter controls the number of prefree
segments to be reclaimed. If the number of prefree
segments is larger than the number of segments
@@ -287,7 +298,16 @@ Files in /sys/fs/f2fs/<devname>
checkpoint is triggered, and issued during the
checkpoint. By default, it is disabled with 0.
- trim_sections This parameter controls the number of sections
+ discard_granularity This parameter controls the granularity of discard
+ command size. It will issue discard commands iif
+ the size is larger than given granularity. Its
+ unit size is 4KB, and 4 (=16KB) is set by default.
+ The maximum value is 128 (=512KB).
+
+ reserved_blocks This parameter indicates the number of blocks that
+ f2fs reserves internally for root.
+
+ batched_trim_sections This parameter controls the number of sections
to be trimmed out in batch mode when FITRIM
conducts. 32 sections is set by default.
@@ -309,11 +329,35 @@ Files in /sys/fs/f2fs/<devname>
the number is less than this value, it triggers
in-place-updates.
+ min_seq_blocks This parameter controls the threshold to serialize
+ write IOs issued by multiple threads in parallel.
+
+ min_hot_blocks This parameter controls the threshold to allocate
+ a hot data log for pending data blocks to write.
+
+ min_ssr_sections This parameter adds the threshold when deciding
+ SSR block allocation. If this is large, SSR mode
+ will be enabled early.
+
+ ram_thresh This parameter controls the memory footprint used
+ by free nids and cached nat entries. By default,
+ 10 is set, which indicates 10 MB / 1 GB RAM.
+
+ ra_nid_pages When building free nids, F2FS reads NAT blocks
+ ahead for speed up. Default is 0.
+
+ dirty_nats_ratio Given dirty ratio of cached nat entries, F2FS
+ determines flushing them in background.
+
max_victim_search This parameter controls the number of trials to
find a victim segment when conducting SSR and
cleaning operations. The default value is 4096
which covers 8GB block address range.
+ migration_granularity For large-sized sections, F2FS can stop GC given
+ this granularity instead of reclaiming entire
+ section.
+
dir_level This parameter controls the directory level to
support large directory. If a directory has a
number of files, it can reduce the file lookup
@@ -321,9 +365,53 @@ Files in /sys/fs/f2fs/<devname>
Otherwise, it needs to decrease this value to
reduce the space overhead. The default value is 0.
- ram_thresh This parameter controls the memory footprint used
- by free nids and cached nat entries. By default,
- 10 is set, which indicates 10 MB / 1 GB RAM.
+ cp_interval F2FS tries to do checkpoint periodically, 60 secs
+ by default.
+
+ idle_interval F2FS detects system is idle, if there's no F2FS
+ operations during given interval, 5 secs by
+ default.
+
+ discard_idle_interval F2FS detects the discard thread is idle, given
+ time interval. Default is 5 secs.
+
+ gc_idle_interval F2FS detects the GC thread is idle, given time
+ interval. Default is 5 secs.
+
+ umount_discard_timeout When unmounting the disk, F2FS waits for finishing
+ queued discard commands which can take huge time.
+ This gives time out for it, 5 secs by default.
+
+ iostat_enable This controls to enable/disable iostat in F2FS.
+
+ readdir_ra This enables/disabled readahead of inode blocks
+ in readdir, and default is enabled.
+
+ gc_pin_file_thresh This indicates how many GC can be failed for the
+ pinned file. If it exceeds this, F2FS doesn't
+ guarantee its pinning state. 2048 trials is set
+ by default.
+
+ extension_list This enables to change extension_list for hot/cold
+ files in runtime.
+
+ inject_rate This controls injection rate of arbitrary faults.
+
+ inject_type This controls injection type of arbitrary faults.
+
+ dirty_segments This shows # of dirty segments.
+
+ lifetime_write_kbytes This shows # of data written to the disk.
+
+ features This shows current features enabled on F2FS.
+
+ current_reserved_blocks This shows # of blocks currently reserved.
+
+ unusable If checkpoint=disable, this shows the number of
+ blocks that are unusable.
+ If checkpoint=enable it shows the number of blocks
+ that would be unusable if checkpoint=disable were
+ to be set.
================================================================================
USAGE
@@ -716,3 +804,28 @@ WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET
WRITE_LIFE_NONE " WRITE_LIFE_NONE
WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM
WRITE_LIFE_LONG " WRITE_LIFE_LONG
+
+Fallocate(2) Policy
+-------------------
+
+The default policy follows the below posix rule.
+
+Allocating disk space
+ The default operation (i.e., mode is zero) of fallocate() allocates
+ the disk space within the range specified by offset and len. The
+ file size (as reported by stat(2)) will be changed if offset+len is
+ greater than the file size. Any subregion within the range specified
+ by offset and len that did not contain data before the call will be
+ initialized to zero. This default behavior closely resembles the
+ behavior of the posix_fallocate(3) library function, and is intended
+ as a method of optimally implementing that function.
+
+However, once F2FS receives ioctl(fd, F2FS_IOC_SET_PIN_FILE) in prior to
+fallocate(fd, DEFAULT_MODE), it allocates on-disk blocks addressess having
+zero or random data, which is useful to the below scenario where:
+ 1. create(fd)
+ 2. ioctl(fd, F2FS_IOC_SET_PIN_FILE)
+ 3. fallocate(fd, 0, 0, size)
+ 4. address = fibmap(fd, offset)
+ 5. open(blkdev)
+ 6. write(blkdev, address)
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 08c23b60e016..82efa41b0e6c 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -191,7 +191,9 @@ Currently, the following pairs of encryption modes are supported:
If unsure, you should use the (AES-256-XTS, AES-256-CTS-CBC) pair.
AES-128-CBC was added only for low-powered embedded devices with
-crypto accelerators such as CAAM or CESA that do not support XTS.
+crypto accelerators such as CAAM or CESA that do not support XTS. To
+use AES-128-CBC, CONFIG_CRYPTO_SHA256 (or another SHA-256
+implementation) must be enabled so that ESSIV can be used.
Adiantum is a (primarily) stream cipher-based mode that is fast even
on CPUs without dedicated crypto instructions. It's also a true
@@ -647,3 +649,42 @@ Note that the precise way that filenames are presented to userspace
without the key is subject to change in the future. It is only meant
as a way to temporarily present valid filenames so that commands like
``rm -r`` work as expected on encrypted directories.
+
+Tests
+=====
+
+To test fscrypt, use xfstests, which is Linux's de facto standard
+filesystem test suite. First, run all the tests in the "encrypt"
+group on the relevant filesystem(s). For example, to test ext4 and
+f2fs encryption using `kvm-xfstests
+<https://github.com/tytso/xfstests-bld/blob/master/Documentation/kvm-quickstart.md>`_::
+
+ kvm-xfstests -c ext4,f2fs -g encrypt
+
+UBIFS encryption can also be tested this way, but it should be done in
+a separate command, and it takes some time for kvm-xfstests to set up
+emulated UBI volumes::
+
+ kvm-xfstests -c ubifs -g encrypt
+
+No tests should fail. However, tests that use non-default encryption
+modes (e.g. generic/549 and generic/550) will be skipped if the needed
+algorithms were not built into the kernel's crypto API. Also, tests
+that access the raw block device (e.g. generic/399, generic/548,
+generic/549, generic/550) will be skipped on UBIFS.
+
+Besides running the "encrypt" group tests, for ext4 and f2fs it's also
+possible to run most xfstests with the "test_dummy_encryption" mount
+option. This option causes all new files to be automatically
+encrypted with a dummy key, without having to make any API calls.
+This tests the encrypted I/O paths more thoroughly. To do this with
+kvm-xfstests, use the "encrypt" filesystem configuration::
+
+ kvm-xfstests -c ext4/encrypt,f2fs/encrypt -g auto
+
+Because this runs many more tests than "-g encrypt" does, it takes
+much longer to run; so also consider using `gce-xfstests
+<https://github.com/tytso/xfstests-bld/blob/master/Documentation/gce-xfstests.md>`_
+instead of kvm-xfstests::
+
+ gce-xfstests -c ext4/encrypt,f2fs/encrypt -g auto
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index d2963123eb1c..ae4332464560 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -239,7 +239,7 @@ rdinit=<executable file>
A description of the process of mounting the root file system can be
found in:
- Documentation/early-userspace/README
+ Documentation/driver-api/early-userspace/early_userspace_support.rst
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 2813a19389fe..6b7a41cfcaed 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -428,8 +428,19 @@ release it yourself.
--
[mandatory]
d_alloc_root() is gone, along with a lot of bugs caused by code
-misusing it. Replacement: d_make_root(inode). The difference is,
-d_make_root() drops the reference to inode if dentry allocation fails.
+misusing it. Replacement: d_make_root(inode). On success d_make_root(inode)
+allocates and returns a new dentry instantiated with the passed in inode.
+On failure NULL is returned and the passed in inode is dropped so the reference
+to inode is consumed in all cases and failure handling need not do any cleanup
+for the inode. If d_make_root(inode) is passed a NULL inode it returns NULL
+and also requires no further error handling. Typical usage is:
+
+ inode = foofs_new_inode(....);
+ s->s_root = d_make_root(inode);
+ if (!s->s_root)
+ /* Nothing needed for the inode cleanup */
+ return -ENOMEM;
+ ...
--
[mandatory]
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 66cad5c86171..99ca040e3f90 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -45,6 +45,7 @@ Table of Contents
3.9 /proc/<pid>/map_files - Information about memory mapped files
3.10 /proc/<pid>/timerslack_ns - Task timerslack value
3.11 /proc/<pid>/patch_state - Livepatch patch operation state
+ 3.12 /proc/<pid>/arch_status - Task architecture specific information
4 Configuring procfs
4.1 Mount options
@@ -153,9 +154,11 @@ Table 1-1: Process specific entries in /proc
symbol the task is blocked in - or "0" if not blocked.
pagemap Page table
stack Report full stack trace, enable via CONFIG_STACKTRACE
- smaps an extension based on maps, showing the memory consumption of
+ smaps An extension based on maps, showing the memory consumption of
each mapping and flags associated with it
- numa_maps an extension based on maps, showing the memory locality and
+ smaps_rollup Accumulated smaps stats for all mappings of the process. This
+ can be derived from smaps, but is faster and more convenient
+ numa_maps An extension based on maps, showing the memory locality and
binding policy as well as mem usage (in pages) of each mapping.
..............................................................................
@@ -365,7 +368,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
exit_code the thread's exit_code in the form reported by the waitpid system call
..............................................................................
-The /proc/PID/maps file containing the currently mapped memory regions and
+The /proc/PID/maps file contains the currently mapped memory regions and
their access permissions.
The format is:
@@ -416,11 +419,14 @@ is not associated with a file:
or if empty, the mapping is anonymous.
The /proc/PID/smaps is an extension based on maps, showing the memory
-consumption for each of the process's mappings. For each of mappings there
-is a series of lines such as the following:
+consumption for each of the process's mappings. For each mapping (aka Virtual
+Memory Area, or VMA) there is a series of lines such as the following:
08048000-080bc000 r-xp 00000000 03:02 13130 /bin/bash
+
Size: 1084 kB
+KernelPageSize: 4 kB
+MMUPageSize: 4 kB
Rss: 892 kB
Pss: 374 kB
Shared_Clean: 892 kB
@@ -442,11 +448,14 @@ Locked: 0 kB
THPeligible: 0
VmFlags: rd ex mr mw me dw
-the first of these lines shows the same information as is displayed for the
-mapping in /proc/PID/maps. The remaining lines show the size of the mapping
-(size), the amount of the mapping that is currently resident in RAM (RSS), the
-process' proportional share of this mapping (PSS), the number of clean and
-dirty private pages in the mapping.
+The first of these lines shows the same information as is displayed for the
+mapping in /proc/PID/maps. Following lines show the size of the mapping
+(size); the size of each page allocated when backing a VMA (KernelPageSize),
+which is usually the same as the size in the page table entries; the page size
+used by the MMU when backing a VMA (in most cases, the same as KernelPageSize);
+the amount of the mapping that is currently resident in RAM (RSS); the
+process' proportional share of this mapping (PSS); and the number of clean and
+dirty shared and private pages in the mapping.
The "proportional set size" (PSS) of a process is the count of pages it has
in memory, where each page is divided by the number of processes sharing it.
@@ -477,8 +486,8 @@ replaced by copy-on-write) part of the underlying shmem object out on swap.
"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
does not take into account swapped out page of underlying shmem objects.
"Locked" indicates whether the mapping is locked in memory or not.
-"THPeligible" indicates whether the mapping is eligible for THP pages - 1 if
-true, 0 otherwise.
+"THPeligible" indicates whether the mapping is eligible for allocating THP
+pages - 1 if true, 0 otherwise. It just shows the current status.
"VmFlags" field deserves a separate description. This member represents the kernel
flags associated with the particular virtual memory area in two letter encoded
@@ -531,6 +540,19 @@ guarantees:
2) If there is something at a given vaddr during the entirety of the
life of the smaps/maps walk, there will be some output for it.
+The /proc/PID/smaps_rollup file includes the same fields as /proc/PID/smaps,
+but their values are the sums of the corresponding values for all mappings of
+the process. Additionally, it contains these fields:
+
+Pss_Anon
+Pss_File
+Pss_Shmem
+
+They represent the proportional shares of anonymous, file, and shmem pages, as
+described for smaps above. These fields are omitted in smaps since each
+mapping identifies the type (anon, file, or shmem) of all pages it contains.
+Thus all information in smaps_rollup can be derived from smaps, but at a
+significantly higher cost.
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
bits on both physical and virtual pages associated with a process, and the
@@ -1478,7 +1500,7 @@ review the kernel documentation in the directory /usr/src/linux/Documentation.
This chapter is heavily based on the documentation included in the pre 2.2
kernels, and became part of it in version 2.2.1 of the Linux kernel.
-Please see: Documentation/sysctl/ directory for descriptions of these
+Please see: Documentation/admin-guide/sysctl/ directory for descriptions of these
entries.
------------------------------------------------------------------------------
@@ -1948,6 +1970,45 @@ patched. If the patch is being enabled, then the task has already been
patched. If the patch is being disabled, then the task hasn't been
unpatched yet.
+3.12 /proc/<pid>/arch_status - task architecture specific status
+-------------------------------------------------------------------
+When CONFIG_PROC_PID_ARCH_STATUS is enabled, this file displays the
+architecture specific status of the task.
+
+Example
+-------
+ $ cat /proc/6753/arch_status
+ AVX512_elapsed_ms: 8
+
+Description
+-----------
+
+x86 specific entries:
+---------------------
+ AVX512_elapsed_ms:
+ ------------------
+ If AVX512 is supported on the machine, this entry shows the milliseconds
+ elapsed since the last time AVX512 usage was recorded. The recording
+ happens on a best effort basis when a task is scheduled out. This means
+ that the value depends on two factors:
+
+ 1) The time which the task spent on the CPU without being scheduled
+ out. With CPU isolation and a single runnable task this can take
+ several seconds.
+
+ 2) The time since the task was scheduled out last. Depending on the
+ reason for being scheduled out (time slice exhausted, syscall ...)
+ this can be arbitrary long time.
+
+ As a consequence the value cannot be considered precise and authoritative
+ information. The application which uses this information has to be aware
+ of the overall scenario on the system in order to determine whether a
+ task is a real AVX512 user or not. Precise information can be obtained
+ with performance counters.
+
+ A special value of '-1' indicates that no AVX512 usage was recorded, thus
+ the task is unlikely an AVX512 user, but depends on the workload and the
+ scheduling scenario, it also could be a false negative mentioned above.
------------------------------------------------------------------------------
Configuring procfs
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index 79637d227e85..97d42ccaa92d 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -105,7 +105,7 @@ All this differs from the old initrd in several ways:
- The old initrd file was a gzipped filesystem image (in some file format,
such as ext2, that needed a driver built into the kernel), while the new
initramfs archive is a gzipped cpio archive (like tar only simpler,
- see cpio(1) and Documentation/early-userspace/buffer-format.txt). The
+ see cpio(1) and Documentation/driver-api/early-userspace/buffer-format.rst). The
kernel's cpio extraction code is not only extremely small, it's also
__init text and data that can be discarded during the boot process.
@@ -159,7 +159,7 @@ One advantage of the configuration file is that root access is not required to
set permissions or create device nodes in the new archive. (Note that those
two example "file" entries expect to find files named "init.sh" and "busybox" in
a directory called "initramfs", under the linux-2.6.* directory. See
-Documentation/early-userspace/README for more details.)
+Documentation/driver-api/early-userspace/early_userspace_support.rst for more details.)
The kernel does not depend on external cpio tools. If you specify a
directory instead of a configuration file, the kernel's build infrastructure
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index 5b5311f9358d..ddf15b1b0d5a 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -319,7 +319,7 @@ quick way to lookup the sysfs interface for a device from the result of
a stat(2) operation.
More information can driver-model specific features can be found in
-Documentation/driver-model/.
+Documentation/driver-api/driver-model/.
TODO: Finish this section.
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index d06e9a59a9f4..5ecbc03e6b2f 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -98,7 +98,7 @@ A memory policy with a valid NodeList will be saved, as specified, for
use at file creation time. When a task allocates a file in the file
system, the mount option memory policy will be applied with a NodeList,
if any, modified by the calling task's cpuset constraints
-[See Documentation/cgroup-v1/cpusets.txt] and any optional flags, listed
+[See Documentation/admin-guide/cgroup-v1/cpusets.rst] and any optional flags, listed
below. If the resulting NodeLists is the empty set, the effective memory
policy for the file will revert to "default" policy.
diff --git a/Documentation/filesystems/xfs-self-describing-metadata.txt b/Documentation/filesystems/xfs-self-describing-metadata.txt
index 68604e67a495..8db0121d0980 100644
--- a/Documentation/filesystems/xfs-self-describing-metadata.txt
+++ b/Documentation/filesystems/xfs-self-describing-metadata.txt
@@ -222,7 +222,7 @@ static void
xfs_foo_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
if ((xfs_sb_version_hascrc(&mp->m_sb) &&
!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
@@ -245,7 +245,7 @@ static bool
xfs_foo_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_ondisk_hdr *hdr = bp->b_addr;
if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC))
@@ -272,7 +272,7 @@ static bool
xfs_foo_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_ondisk_hdr *hdr = bp->b_addr;
if (hdr->magic == cpu_to_be32(XFS_FOO_CRC_MAGIC)) {
@@ -297,7 +297,7 @@ static void
xfs_foo_write_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_mount *mp = bp->b_mount;
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_foo_verify(bp)) {
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
deleted file mode 100644
index a5cbb5e0e3db..000000000000
--- a/Documentation/filesystems/xfs.txt
+++ /dev/null
@@ -1,470 +0,0 @@
-
-The SGI XFS Filesystem
-======================
-
-XFS is a high performance journaling filesystem which originated
-on the SGI IRIX platform. It is completely multi-threaded, can
-support large files and large filesystems, extended attributes,
-variable block sizes, is extent based, and makes extensive use of
-Btrees (directories, extents, free space) to aid both performance
-and scalability.
-
-Refer to the documentation at https://xfs.wiki.kernel.org/
-for further details. This implementation is on-disk compatible
-with the IRIX version of XFS.
-
-
-Mount Options
-=============
-
-When mounting an XFS filesystem, the following options are accepted.
-For boolean mount options, the names with the (*) suffix is the
-default behaviour.
-
- allocsize=size
- Sets the buffered I/O end-of-file preallocation size when
- doing delayed allocation writeout (default size is 64KiB).
- Valid values for this option are page size (typically 4KiB)
- through to 1GiB, inclusive, in power-of-2 increments.
-
- The default behaviour is for dynamic end-of-file
- preallocation size, which uses a set of heuristics to
- optimise the preallocation size based on the current
- allocation patterns within the file and the access patterns
- to the file. Specifying a fixed allocsize value turns off
- the dynamic behaviour.
-
- attr2
- noattr2
- The options enable/disable an "opportunistic" improvement to
- be made in the way inline extended attributes are stored
- on-disk. When the new form is used for the first time when
- attr2 is selected (either when setting or removing extended
- attributes) the on-disk superblock feature bit field will be
- updated to reflect this format being in use.
-
- The default behaviour is determined by the on-disk feature
- bit indicating that attr2 behaviour is active. If either
- mount option it set, then that becomes the new default used
- by the filesystem.
-
- CRC enabled filesystems always use the attr2 format, and so
- will reject the noattr2 mount option if it is set.
-
- discard
- nodiscard (*)
- Enable/disable the issuing of commands to let the block
- device reclaim space freed by the filesystem. This is
- useful for SSD devices, thinly provisioned LUNs and virtual
- machine images, but may have a performance impact.
-
- Note: It is currently recommended that you use the fstrim
- application to discard unused blocks rather than the discard
- mount option because the performance impact of this option
- is quite severe.
-
- grpid/bsdgroups
- nogrpid/sysvgroups (*)
- These options define what group ID a newly created file
- gets. When grpid is set, it takes the group ID of the
- directory in which it is created; otherwise it takes the
- fsgid of the current process, unless the directory has the
- setgid bit set, in which case it takes the gid from the
- parent directory, and also gets the setgid bit set if it is
- a directory itself.
-
- filestreams
- Make the data allocator use the filestreams allocation mode
- across the entire filesystem rather than just on directories
- configured to use it.
-
- ikeep
- noikeep (*)
- When ikeep is specified, XFS does not delete empty inode
- clusters and keeps them around on disk. When noikeep is
- specified, empty inode clusters are returned to the free
- space pool.
-
- inode32
- inode64 (*)
- When inode32 is specified, it indicates that XFS limits
- inode creation to locations which will not result in inode
- numbers with more than 32 bits of significance.
-
- When inode64 is specified, it indicates that XFS is allowed
- to create inodes at any location in the filesystem,
- including those which will result in inode numbers occupying
- more than 32 bits of significance.
-
- inode32 is provided for backwards compatibility with older
- systems and applications, since 64 bits inode numbers might
- cause problems for some applications that cannot handle
- large inode numbers. If applications are in use which do
- not handle inode numbers bigger than 32 bits, the inode32
- option should be specified.
-
-
- largeio
- nolargeio (*)
- If "nolargeio" is specified, the optimal I/O reported in
- st_blksize by stat(2) will be as small as possible to allow
- user applications to avoid inefficient read/modify/write
- I/O. This is typically the page size of the machine, as
- this is the granularity of the page cache.
-
- If "largeio" specified, a filesystem that was created with a
- "swidth" specified will return the "swidth" value (in bytes)
- in st_blksize. If the filesystem does not have a "swidth"
- specified but does specify an "allocsize" then "allocsize"
- (in bytes) will be returned instead. Otherwise the behaviour
- is the same as if "nolargeio" was specified.
-
- logbufs=value
- Set the number of in-memory log buffers. Valid numbers
- range from 2-8 inclusive.
-
- The default value is 8 buffers.
-
- If the memory cost of 8 log buffers is too high on small
- systems, then it may be reduced at some cost to performance
- on metadata intensive workloads. The logbsize option below
- controls the size of each buffer and so is also relevant to
- this case.
-
- logbsize=value
- Set the size of each in-memory log buffer. The size may be
- specified in bytes, or in kilobytes with a "k" suffix.
- Valid sizes for version 1 and version 2 logs are 16384 (16k)
- and 32768 (32k). Valid sizes for version 2 logs also
- include 65536 (64k), 131072 (128k) and 262144 (256k). The
- logbsize must be an integer multiple of the log
- stripe unit configured at mkfs time.
-
- The default value for for version 1 logs is 32768, while the
- default value for version 2 logs is MAX(32768, log_sunit).
-
- logdev=device and rtdev=device
- Use an external log (metadata journal) and/or real-time device.
- An XFS filesystem has up to three parts: a data section, a log
- section, and a real-time section. The real-time section is
- optional, and the log section can be separate from the data
- section or contained within it.
-
- noalign
- Data allocations will not be aligned at stripe unit
- boundaries. This is only relevant to filesystems created
- with non-zero data alignment parameters (sunit, swidth) by
- mkfs.
-
- norecovery
- The filesystem will be mounted without running log recovery.
- If the filesystem was not cleanly unmounted, it is likely to
- be inconsistent when mounted in "norecovery" mode.
- Some files or directories may not be accessible because of this.
- Filesystems mounted "norecovery" must be mounted read-only or
- the mount will fail.
-
- nouuid
- Don't check for double mounted file systems using the file
- system uuid. This is useful to mount LVM snapshot volumes,
- and often used in combination with "norecovery" for mounting
- read-only snapshots.
-
- noquota
- Forcibly turns off all quota accounting and enforcement
- within the filesystem.
-
- uquota/usrquota/uqnoenforce/quota
- User disk quota accounting enabled, and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
-
- gquota/grpquota/gqnoenforce
- Group disk quota accounting enabled and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
-
- pquota/prjquota/pqnoenforce
- Project disk quota accounting enabled and limits (optionally)
- enforced. Refer to xfs_quota(8) for further details.
-
- sunit=value and swidth=value
- Used to specify the stripe unit and width for a RAID device
- or a stripe volume. "value" must be specified in 512-byte
- block units. These options are only relevant to filesystems
- that were created with non-zero data alignment parameters.
-
- The sunit and swidth parameters specified must be compatible
- with the existing filesystem alignment characteristics. In
- general, that means the only valid changes to sunit are
- increasing it by a power-of-2 multiple. Valid swidth values
- are any integer multiple of a valid sunit value.
-
- Typically the only time these mount options are necessary if
- after an underlying RAID device has had it's geometry
- modified, such as adding a new disk to a RAID5 lun and
- reshaping it.
-
- swalloc
- Data allocations will be rounded up to stripe width boundaries
- when the current end of file is being extended and the file
- size is larger than the stripe width size.
-
- wsync
- When specified, all filesystem namespace operations are
- executed synchronously. This ensures that when the namespace
- operation (create, unlink, etc) completes, the change to the
- namespace is on stable storage. This is useful in HA setups
- where failover must not result in clients seeing
- inconsistent namespace presentation during or after a
- failover event.
-
-
-Deprecated Mount Options
-========================
-
- Name Removal Schedule
- ---- ----------------
-
-
-Removed Mount Options
-=====================
-
- Name Removed
- ---- -------
- delaylog/nodelaylog v4.0
- ihashsize v4.0
- irixsgid v4.0
- osyncisdsync/osyncisosync v4.0
- barrier v4.19
- nobarrier v4.19
-
-
-sysctls
-=======
-
-The following sysctls are available for the XFS filesystem:
-
- fs.xfs.stats_clear (Min: 0 Default: 0 Max: 1)
- Setting this to "1" clears accumulated XFS statistics
- in /proc/fs/xfs/stat. It then immediately resets to "0".
-
- fs.xfs.xfssyncd_centisecs (Min: 100 Default: 3000 Max: 720000)
- The interval at which the filesystem flushes metadata
- out to disk and runs internal cache cleanup routines.
-
- fs.xfs.filestream_centisecs (Min: 1 Default: 3000 Max: 360000)
- The interval at which the filesystem ages filestreams cache
- references and returns timed-out AGs back to the free stream
- pool.
-
- fs.xfs.speculative_prealloc_lifetime
- (Units: seconds Min: 1 Default: 300 Max: 86400)
- The interval at which the background scanning for inodes
- with unused speculative preallocation runs. The scan
- removes unused preallocation from clean inodes and releases
- the unused space back to the free pool.
-
- fs.xfs.error_level (Min: 0 Default: 3 Max: 11)
- A volume knob for error reporting when internal errors occur.
- This will generate detailed messages & backtraces for filesystem
- shutdowns, for example. Current threshold values are:
-
- XFS_ERRLEVEL_OFF: 0
- XFS_ERRLEVEL_LOW: 1
- XFS_ERRLEVEL_HIGH: 5
-
- fs.xfs.panic_mask (Min: 0 Default: 0 Max: 256)
- Causes certain error conditions to call BUG(). Value is a bitmask;
- OR together the tags which represent errors which should cause panics:
-
- XFS_NO_PTAG 0
- XFS_PTAG_IFLUSH 0x00000001
- XFS_PTAG_LOGRES 0x00000002
- XFS_PTAG_AILDELETE 0x00000004
- XFS_PTAG_ERROR_REPORT 0x00000008
- XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
- XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
- XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
- XFS_PTAG_FSBLOCK_ZERO 0x00000080
- XFS_PTAG_VERIFIER_ERROR 0x00000100
-
- This option is intended for debugging only.
-
- fs.xfs.irix_symlink_mode (Min: 0 Default: 0 Max: 1)
- Controls whether symlinks are created with mode 0777 (default)
- or whether their mode is affected by the umask (irix mode).
-
- fs.xfs.irix_sgid_inherit (Min: 0 Default: 0 Max: 1)
- Controls files created in SGID directories.
- If the group ID of the new file does not match the effective group
- ID or one of the supplementary group IDs of the parent dir, the
- ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
- is set.
-
- fs.xfs.inherit_sync (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "sync" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_nodump (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "nodump" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_noatime (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "noatime" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_nosymlinks (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "nosymlinks" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.inherit_nodefrag (Min: 0 Default: 1 Max: 1)
- Setting this to "1" will cause the "nodefrag" flag set
- by the xfs_io(8) chattr command on a directory to be
- inherited by files in that directory.
-
- fs.xfs.rotorstep (Min: 1 Default: 1 Max: 256)
- In "inode32" allocation mode, this option determines how many
- files the allocator attempts to allocate in the same allocation
- group before moving to the next allocation group. The intent
- is to control the rate at which the allocator moves between
- allocation groups when allocating extents for new files.
-
-Deprecated Sysctls
-==================
-
-None at present.
-
-
-Removed Sysctls
-===============
-
- Name Removed
- ---- -------
- fs.xfs.xfsbufd_centisec v4.0
- fs.xfs.age_buffer_centisecs v4.0
-
-
-Error handling
-==============
-
-XFS can act differently according to the type of error found during its
-operation. The implementation introduces the following concepts to the error
-handler:
-
- -failure speed:
- Defines how fast XFS should propagate an error upwards when a specific
- error is found during the filesystem operation. It can propagate
- immediately, after a defined number of retries, after a set time period,
- or simply retry forever.
-
- -error classes:
- Specifies the subsystem the error configuration will apply to, such as
- metadata IO or memory allocation. Different subsystems will have
- different error handlers for which behaviour can be configured.
-
- -error handlers:
- Defines the behavior for a specific error.
-
-The filesystem behavior during an error can be set via sysfs files. Each
-error handler works independently - the first condition met by an error handler
-for a specific class will cause the error to be propagated rather than reset and
-retried.
-
-The action taken by the filesystem when the error is propagated is context
-dependent - it may cause a shut down in the case of an unrecoverable error,
-it may be reported back to userspace, or it may even be ignored because
-there's nothing useful we can with the error or anyone we can report it to (e.g.
-during unmount).
-
-The configuration files are organized into the following hierarchy for each
-mounted filesystem:
-
- /sys/fs/xfs/<dev>/error/<class>/<error>/
-
-Where:
- <dev>
- The short device name of the mounted filesystem. This is the same device
- name that shows up in XFS kernel error messages as "XFS(<dev>): ..."
-
- <class>
- The subsystem the error configuration belongs to. As of 4.9, the defined
- classes are:
-
- - "metadata": applies metadata buffer write IO
-
- <error>
- The individual error handler configurations.
-
-
-Each filesystem has "global" error configuration options defined in their top
-level directory:
-
- /sys/fs/xfs/<dev>/error/
-
- fail_at_unmount (Min: 0 Default: 1 Max: 1)
- Defines the filesystem error behavior at unmount time.
-
- If set to a value of 1, XFS will override all other error configurations
- during unmount and replace them with "immediate fail" characteristics.
- i.e. no retries, no retry timeout. This will always allow unmount to
- succeed when there are persistent errors present.
-
- If set to 0, the configured retry behaviour will continue until all
- retries and/or timeouts have been exhausted. This will delay unmount
- completion when there are persistent errors, and it may prevent the
- filesystem from ever unmounting fully in the case of "retry forever"
- handler configurations.
-
- Note: there is no guarantee that fail_at_unmount can be set while an
- unmount is in progress. It is possible that the sysfs entries are
- removed by the unmounting filesystem before a "retry forever" error
- handler configuration causes unmount to hang, and hence the filesystem
- must be configured appropriately before unmount begins to prevent
- unmount hangs.
-
-Each filesystem has specific error class handlers that define the error
-propagation behaviour for specific errors. There is also a "default" error
-handler defined, which defines the behaviour for all errors that don't have
-specific handlers defined. Where multiple retry constraints are configuredi for
-a single error, the first retry configuration that expires will cause the error
-to be propagated. The handler configurations are found in the directory:
-
- /sys/fs/xfs/<dev>/error/<class>/<error>/
-
- max_retries (Min: -1 Default: Varies Max: INTMAX)
- Defines the allowed number of retries of a specific error before
- the filesystem will propagate the error. The retry count for a given
- error context (e.g. a specific metadata buffer) is reset every time
- there is a successful completion of the operation.
-
- Setting the value to "-1" will cause XFS to retry forever for this
- specific error.
-
- Setting the value to "0" will cause XFS to fail immediately when the
- specific error is reported.
-
- Setting the value to "N" (where 0 < N < Max) will make XFS retry the
- operation "N" times before propagating the error.
-
- retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day)
- Define the amount of time (in seconds) that the filesystem is
- allowed to retry its operations when the specific error is
- found.
-
- Setting the value to "-1" will allow XFS to retry forever for this
- specific error.
-
- Setting the value to "0" will cause XFS to fail immediately when the
- specific error is reported.
-
- Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the
- operation for up to "N" seconds before propagating the error.
-
-Note: The default behaviour for a specific error handler is dependent on both
-the class and error context. For example, the default values for
-"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults
-to "fail immediately" behaviour. This is done because ENODEV is a fatal,
-unrecoverable error no matter how many times the metadata IO is retried.
diff --git a/Documentation/firmware-guide/acpi/enumeration.rst b/Documentation/firmware-guide/acpi/enumeration.rst
index 1252617b520f..0a72b6321f5f 100644
--- a/Documentation/firmware-guide/acpi/enumeration.rst
+++ b/Documentation/firmware-guide/acpi/enumeration.rst
@@ -316,7 +316,7 @@ specifies the path to the controller. In order to use these GPIOs in Linux
we need to translate them to the corresponding Linux GPIO descriptors.
There is a standard GPIO API for that and is documented in
-Documentation/gpio/.
+Documentation/admin-guide/gpio/.
In the above example we can get the corresponding two GPIO descriptors with
a code like this::
diff --git a/Documentation/firmware-guide/acpi/extcon-intel-int3496.rst b/Documentation/firmware-guide/acpi/extcon-intel-int3496.rst
new file mode 100644
index 000000000000..5137ca834b54
--- /dev/null
+++ b/Documentation/firmware-guide/acpi/extcon-intel-int3496.rst
@@ -0,0 +1,33 @@
+=====================================================
+Intel INT3496 ACPI device extcon driver documentation
+=====================================================
+
+The Intel INT3496 ACPI device extcon driver is a driver for ACPI
+devices with an acpi-id of INT3496, such as found for example on
+Intel Baytrail and Cherrytrail tablets.
+
+This ACPI device describes how the OS can read the id-pin of the devices'
+USB-otg port, as well as how it optionally can enable Vbus output on the
+otg port and how it can optionally control the muxing of the data pins
+between an USB host and an USB peripheral controller.
+
+The ACPI devices exposes this functionality by returning an array with up
+to 3 gpio descriptors from its ACPI _CRS (Current Resource Settings) call:
+
+======= =====================================================================
+Index 0 The input gpio for the id-pin, this is always present and valid
+Index 1 The output gpio for enabling Vbus output from the device to the otg
+ port, write 1 to enable the Vbus output (this gpio descriptor may
+ be absent or invalid)
+Index 2 The output gpio for muxing of the data pins between the USB host and
+ the USB peripheral controller, write 1 to mux to the peripheral
+ controller
+======= =====================================================================
+
+There is a mapping between indices and GPIO connection IDs as follows
+
+ ======= =======
+ id index 0
+ vbus index 1
+ mux index 2
+ ======= =======
diff --git a/Documentation/firmware-guide/acpi/index.rst b/Documentation/firmware-guide/acpi/index.rst
index ae609eec4679..90c90d42d9ad 100644
--- a/Documentation/firmware-guide/acpi/index.rst
+++ b/Documentation/firmware-guide/acpi/index.rst
@@ -24,3 +24,4 @@ ACPI Support
acpi-lid
lpit
video_extension
+ extcon-intel-int3496
diff --git a/Documentation/fmc/API.txt b/Documentation/fmc/API.txt
deleted file mode 100644
index 06b06b92c794..000000000000
--- a/Documentation/fmc/API.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-Functions Exported by fmc.ko
-****************************
-
-The FMC core exports the usual 4 functions that are needed for a bus to
-work, and a few more:
-
- int fmc_driver_register(struct fmc_driver *drv);
- void fmc_driver_unregister(struct fmc_driver *drv);
- int fmc_device_register(struct fmc_device *fmc);
- void fmc_device_unregister(struct fmc_device *fmc);
-
- int fmc_device_register_n(struct fmc_device **fmc, int n);
- void fmc_device_unregister_n(struct fmc_device **fmc, int n);
-
- uint32_t fmc_readl(struct fmc_device *fmc, int offset);
- void fmc_writel(struct fmc_device *fmc, uint32_t val, int off);
- void *fmc_get_drvdata(struct fmc_device *fmc);
- void fmc_set_drvdata(struct fmc_device *fmc, void *data);
-
- int fmc_reprogram(struct fmc_device *f, struct fmc_driver *d, char *gw,
- int sdb_entry);
-
-The data structure that describe a device is detailed in *note FMC
-Device::, the one that describes a driver is detailed in *note FMC
-Driver::. Please note that structures of type fmc_device must be
-allocated by the caller, but must not be released after unregistering.
-The fmc-bus itself takes care of releasing the structure when their use
-count reaches zero - actually, the device model does that in lieu of us.
-
-The functions to register and unregister n devices are meant to be used
-by carriers that host more than one mezzanine. The devices must all be
-registered at the same time because if the FPGA is reprogrammed, all
-devices in the array are affected. Usually, the driver matching the
-first device will reprogram the FPGA, so other devices must know they
-are already driven by a reprogrammed FPGA.
-
-If a carrier hosts slots that are driven by different FPGA devices, it
-should register as a group only mezzanines that are driven by the same
-FPGA, for the reason outlined above.
-
-Finally, the fmc_reprogram function calls the reprogram method (see
-*note The API Offered by Carriers:: and also scans the memory area for
-an SDB tree. You can pass -1 as sdb_entry to disable such scan.
-Otherwise, the function fails if no tree is found at the specified
-entry point. The function is meant to factorize common code, and by
-the time you read this it is already used by the spec-sw and fine-delay
-modules.
diff --git a/Documentation/fmc/FMC-and-SDB.txt b/Documentation/fmc/FMC-and-SDB.txt
deleted file mode 100644
index fa14e0b24521..000000000000
--- a/Documentation/fmc/FMC-and-SDB.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-
-FMC (FPGA Mezzanine Card) is the standard we use for our I/O devices,
-in the context of White Rabbit and related hardware.
-
-In our I/O environments we need to write drivers for each mezzanine
-card, and such drivers must work regardless of the carrier being used.
-To achieve this, we abstract the FMC interface.
-
-We have a carrier for PCI-E called SPEC and one for VME called SVEC,
-but more are planned. Also, we support stand-alone devices (usually
-plugged on a SPEC card), controlled through Etherbone, developed by GSI.
-
-Code and documentation for the FMC bus was born as part of the spec-sw
-project, but now it lives in its own project. Other projects, i.e.
-software support for the various carriers, should include this as a
-submodule.
-
-The most up to date version of code and documentation is always
-available from the repository you can clone from:
-
- git://ohwr.org/fmc-projects/fmc-bus.git (read-only)
- git@ohwr.org:fmc-projects/fmc-bus.git (read-write for developers)
-
-Selected versions of the documentation, as well as complete tar
-archives for selected revisions are placed to the Files section of the
-project: `http://www.ohwr.org/projects/fmc-bus/files'
-
-
-What is FMC
-***********
-
-FMC, as said, stands for "FPGA Mezzanine Card". It is a standard
-developed by the VME consortium called VITA (VMEbus International Trade
-Association and ratified by ANSI, the American National Standard
-Institute. The official documentation is called "ANSI-VITA 57.1".
-
-The FMC card is an almost square PCB, around 70x75 millimeters, that is
-called mezzanine in this document. It usually lives plugged into
-another PCB for power supply and control; such bigger circuit board is
-called carrier from now on, and a single carrier may host more than one
-mezzanine.
-
-In the typical application the mezzanine is mostly analog while the
-carrier is mostly digital, and hosts an FPGA that must be configured to
-match the specific mezzanine and the desired application. Thus, you may
-need to load different FPGA images to drive different instances of the
-same mezzanine.
-
-FMC, as such, is not a bus in the usual meaning of the term, because
-most carriers have only one connector, and carriers with several
-connectors have completely separate electrical connections to them.
-This package, however, implements a bus as a software abstraction.
-
-
-What is SDB
-***********
-
-SDB (Self Describing Bus) is a set of data structures that we use for
-enumerating the internal structure of an FPGA image. We also use it as
-a filesystem inside the FMC EEPROM.
-
-SDB is not mandatory for use of this FMC kernel bus, but if you have SDB
-this package can make good use of it. SDB itself is developed in the
-fpga-config-space OHWR project. The link to the repository is
-`git://ohwr.org/hdl-core-lib/fpga-config-space.git' and what is used in
-this project lives in the sdbfs subdirectory in there.
-
-SDB support for FMC is described in *note FMC Identification:: and
-*note SDB Support::
-
-
-SDB Support
-***********
-
-The fmc.ko bus driver exports a few functions to help drivers taking
-advantage of the SDB information that may be present in your own FPGA
-memory image.
-
-The module exports the following functions, in the special header
-<linux/fmc-sdb.h>. The linux/ prefix in the name is there because we
-plan to submit it upstream in the future, and don't want to force
-changes on our drivers if that happens.
-
- int fmc_scan_sdb_tree(struct fmc_device *fmc, unsigned long address);
- void fmc_show_sdb_tree(struct fmc_device *fmc);
- signed long fmc_find_sdb_device(struct sdb_array *tree, uint64_t vendor,
- uint32_t device, unsigned long *sz);
- int fmc_free_sdb_tree(struct fmc_device *fmc);
diff --git a/Documentation/fmc/carrier.txt b/Documentation/fmc/carrier.txt
deleted file mode 100644
index 5e4f1dd3e98b..000000000000
--- a/Documentation/fmc/carrier.txt
+++ /dev/null
@@ -1,311 +0,0 @@
-FMC Device
-**********
-
-Within the Linux bus framework, the FMC device is created and
-registered by the carrier driver. For example, the PCI driver for the
-SPEC card fills a data structure for each SPEC that it drives, and
-registers an associated FMC device for each card. The SVEC driver can
-do exactly the same for the VME carrier (actually, it should do it
-twice, because the SVEC carries two FMC mezzanines). Similarly, an
-Etherbone driver will be able to register its own FMC devices, offering
-communication primitives through frame exchange.
-
-The contents of the EEPROM within the FMC are used for identification
-purposes, i.e. for matching the device with its own driver. For this
-reason the device structure includes a complete copy of the EEPROM
-(actually, the carrier driver may choose whether or not to return it -
-for example we most likely won't have the whole EEPROM available for
-Etherbone devices.
-
-The following listing shows the current structure defining a device.
-Please note that all the machinery is in place but some details may
-still change in the future. For this reason, there is a version field
-at the beginning of the structure. As usual, the minor number will
-change for compatible changes (like a new flag) and the major number
-will increase when an incompatible change happens (for example, a
-change in layout of some fmc data structures). Device writers should
-just set it to the value FMC_VERSION, and be ready to get back -EINVAL
-at registration time.
-
- struct fmc_device {
- unsigned long version;
- unsigned long flags;
- struct module *owner; /* char device must pin it */
- struct fmc_fru_id id; /* for EEPROM-based match */
- struct fmc_operations *op; /* carrier-provided */
- int irq; /* according to host bus. 0 == none */
- int eeprom_len; /* Usually 8kB, may be less */
- int eeprom_addr; /* 0x50, 0x52 etc */
- uint8_t *eeprom; /* Full contents or leading part */
- char *carrier_name; /* "SPEC" or similar, for special use */
- void *carrier_data; /* "struct spec *" or equivalent */
- __iomem void *fpga_base; /* May be NULL (Etherbone) */
- __iomem void *slot_base; /* Set by the driver */
- struct fmc_device **devarray; /* Allocated by the bus */
- int slot_id; /* Index in the slot array */
- int nr_slots; /* Number of slots in this carrier */
- unsigned long memlen; /* Used for the char device */
- struct device dev; /* For Linux use */
- struct device *hwdev; /* The underlying hardware device */
- unsigned long sdbfs_entry;
- struct sdb_array *sdb;
- uint32_t device_id; /* Filled by the device */
- char *mezzanine_name; /* Defaults to ``fmc'' */
- void *mezzanine_data;
- };
-
-The meaning of most fields is summarized in the code comment above.
-
-The following fields must be filled by the carrier driver before
-registration:
-
- * version: must be set to FMC_VERSION.
-
- * owner: set to MODULE_OWNER.
-
- * op: the operations to act on the device.
-
- * irq: number for the mezzanine; may be zero.
-
- * eeprom_len: length of the following array.
-
- * eeprom_addr: 0x50 for first mezzanine and so on.
-
- * eeprom: the full content of the I2C EEPROM.
-
- * carrier_name.
-
- * carrier_data: a unique pointer for the carrier.
-
- * fpga_base: the I/O memory address (may be NULL).
-
- * slot_id: the index of this slot (starting from zero).
-
- * memlen: if fpga_base is valid, the length of I/O memory.
-
- * hwdev: to be used in some dev_err() calls.
-
- * device_id: a slot-specific unique integer number.
-
-
-Please note that the carrier should read its own EEPROM memory before
-registering the device, as well as fill all other fields listed above.
-
-The following fields should not be assigned, because they are filled
-later by either the bus or the device driver:
-
- * flags.
-
- * fru_id: filled by the bus, parsing the eeprom.
-
- * slot_base: filled and used by the driver, if useful to it.
-
- * devarray: an array og all mezzanines driven by a singe FPGA.
-
- * nr_slots: set by the core at registration time.
-
- * dev: used by Linux.
-
- * sdb: FPGA contents, scanned according to driver's directions.
-
- * sdbfs_entry: SDB entry point in EEPROM: autodetected.
-
- * mezzanine_data: available for the driver.
-
- * mezzanine_name: filled by fmc-bus during identification.
-
-
-Note: mezzanine_data may be redundant, because Linux offers the drvdata
-approach, so the field may be removed in later versions of this bus
-implementation.
-
-As I write this, she SPEC carrier is already completely functional in
-the fmc-bus environment, and is a good reference to look at.
-
-
-The API Offered by Carriers
-===========================
-
-The carrier provides a number of methods by means of the
-`fmc_operations' structure, which currently is defined like this
-(again, it is a moving target, please refer to the header rather than
-this document):
-
- struct fmc_operations {
- uint32_t (*readl)(struct fmc_device *fmc, int offset);
- void (*writel)(struct fmc_device *fmc, uint32_t value, int offset);
- int (*reprogram)(struct fmc_device *f, struct fmc_driver *d, char *gw);
- int (*validate)(struct fmc_device *fmc, struct fmc_driver *drv);
- int (*irq_request)(struct fmc_device *fmc, irq_handler_t h,
- char *name, int flags);
- void (*irq_ack)(struct fmc_device *fmc);
- int (*irq_free)(struct fmc_device *fmc);
- int (*gpio_config)(struct fmc_device *fmc, struct fmc_gpio *gpio,
- int ngpio);
- int (*read_ee)(struct fmc_device *fmc, int pos, void *d, int l);
- int (*write_ee)(struct fmc_device *fmc, int pos, const void *d, int l);
- };
-
-The individual methods perform the following tasks:
-
-`readl'
-`writel'
- These functions access FPGA registers by whatever means the
- carrier offers. They are not expected to fail, and most of the time
- they will just make a memory access to the host bus. If the
- carrier provides a fpga_base pointer, the driver may use direct
- access through that pointer. For this reason the header offers the
- inline functions fmc_readl and fmc_writel that access fpga_base if
- the respective method is NULL. A driver that wants to be portable
- and efficient should use fmc_readl and fmc_writel. For Etherbone,
- or other non-local carriers, error-management is still to be
- defined.
-
-`validate'
- Module parameters are used to manage different applications for
- two or more boards of the same kind. Validation is based on the
- busid module parameter, if provided, and returns the matching
- index in the associated array. See *note Module Parameters:: in in
- doubt. If no match is found, `-ENOENT' is returned; if the user
- didn't pass `busid=', all devices will pass validation. The value
- returned by the validate method can be used as index into other
- parameters (for example, some drivers use the `lm32=' parameter in
- this way). Such "generic parameters" are documented in *note
- Module Parameters::, below. The validate method is used by
- `fmc-trivial.ko', described in *note fmc-trivial::.
-
-`reprogram'
- The carrier enumerates FMC devices by loading a standard (or
- golden) FPGA binary that allows EEPROM access. Each driver, then,
- will need to reprogram the FPGA by calling this function. If the
- name argument is NULL, the carrier should reprogram the golden
- binary. If the gateware name has been overridden through module
- parameters (in a carrier-specific way) the file loaded will match
- the parameters. Per-device gateware names can be specified using
- the `gateware=' parameter, see *note Module Parameters::. Note:
- Clients should call rhe new helper, fmc_reprogram, which both
- calls this method and parse the SDB tree of the FPGA.
-
-`irq_request'
-`irq_ack'
-`irq_free'
- Interrupt management is carrier-specific, so it is abstracted as
- operations. The interrupt number is listed in the device
- structure, and for the mezzanine driver the number is only
- informative. The handler will receive the fmc pointer as dev_id;
- the flags argument is passed to the Linux request_irq function,
- but fmc-specific flags may be added in the future. You'll most
- likely want to pass the `IRQF_SHARED' flag.
-
-`gpio_config'
- The method allows to configure a GPIO pin in the carrier, and read
- its current value if it is configured as input. See *note The GPIO
- Abstraction:: for details.
-
-`read_ee'
-`write_ee'
- Read or write the EEPROM. The functions are expected to be only
- called before reprogramming and the carrier should refuse them
- with `ENODEV' after reprogramming. The offset is expected to be
- within 8kB (the current size), but addresses up to 1MB are
- reserved to fit bigger I2C devices in the future. Carriers may
- offer access to other internal flash memories using these same
- methods: for example the SPEC driver may define that its carrier
- I2C memory is seen at offset 1M and the internal SPI flash is seen
- at offset 16M. This multiplexing of several flash memories in the
- same address space is carrier-specific and should only be used
- by a driver that has verified the `carrier_name' field.
-
-
-
-The GPIO Abstraction
-====================
-
-Support for GPIO pins in the fmc-bus environment is not very
-straightforward and deserves special discussion.
-
-While the general idea of a carrier-independent driver seems to fly,
-configuration of specific signals within the carrier needs at least
-some knowledge of the carrier itself. For this reason, the specific
-driver can request to configure carrier-specific GPIO pins, numbered
-from 0 to at most 4095. Configuration is performed by passing a
-pointer to an array of struct fmc_gpio items, as well as the length of
-the array. This is the data structure:
-
- struct fmc_gpio {
- char *carrier_name;
- int gpio;
- int _gpio; /* internal use by the carrier */
- int mode; /* GPIOF_DIR_OUT etc, from <linux/gpio.h> */
- int irqmode; /* IRQF_TRIGGER_LOW and so on */
- };
-
-By specifying a carrier_name for each pin, the driver may access
-different pins in different carriers. The gpio_config method is
-expected to return the number of pins successfully configured, ignoring
-requests for other carriers. However, if no pin is configured (because
-no structure at all refers to the current carrier_name), the operation
-returns an error so the caller will know that it is running under a
-yet-unsupported carrier.
-
-So, for example, a driver that has been developed and tested on both
-the SPEC and the SVEC may request configuration of two different GPIO
-pins, and expect one such configuration to succeed - if none succeeds
-it most likely means that the current carrier is a still-unknown one.
-
-If, however, your GPIO pin has a specific known role, you can pass a
-special number in the gpio field, using one of the following macros:
-
- #define FMC_GPIO_RAW(x) (x) /* 4096 of them */
- #define FMC_GPIO_IRQ(x) ((x) + 0x1000) /* 256 of them */
- #define FMC_GPIO_LED(x) ((x) + 0x1100) /* 256 of them */
- #define FMC_GPIO_KEY(x) ((x) + 0x1200) /* 256 of them */
- #define FMC_GPIO_TP(x) ((x) + 0x1300) /* 256 of them */
- #define FMC_GPIO_USER(x) ((x) + 0x1400) /* 256 of them */
-
-Use of virtual GPIO numbers (anything but FMC_GPIO_RAW) is allowed
-provided the carrier_name field in the data structure is left
-unspecified (NULL). Each carrier is responsible for providing a mapping
-between virtual and physical GPIO numbers. The carrier may then use the
-_gpio field to cache the result of this mapping.
-
-All carriers must map their I/O lines to the sets above starting from
-zero. The SPEC, for example, maps interrupt pins 0 and 1, and test
-points 0 through 3 (even if the test points on the PCB are called
-5,6,7,8).
-
-If, for example, a driver requires a free LED and a test point (for a
-scope probe to be plugged at some point during development) it may ask
-for FMC_GPIO_LED(0) and FMC_GPIO_TP(0). Each carrier will provide
-suitable GPIO pins. Clearly, the person running the drivers will know
-the order used by the specific carrier driver in assigning leds and
-testpoints, so to make a carrier-dependent use of the diagnostic tools.
-
-In theory, some form of autodetection should be possible: a driver like
-the wr-nic (which uses IRQ(1) on the SPEC card) should configure
-IRQ(0), make a test with software-generated interrupts and configure
-IRQ(1) if the test fails. This probing step should be used because even
-if the wr-nic gateware is known to use IRQ1 on the SPEC, the driver
-should be carrier-independent and thus use IRQ(0) as a first bet -
-actually, the knowledge that IRQ0 may fail is carrier-dependent
-information, but using it doesn't make the driver unsuitable for other
-carriers.
-
-The return value of gpio_config is defined as follows:
-
- * If no pin in the array can be used by the carrier, `-ENODEV'.
-
- * If at least one virtual GPIO number cannot be mapped, `-ENOENT'.
-
- * On success, 0 or positive. The value returned is the number of
- high input bits (if no input is configured, the value for success
- is 0).
-
-While I admit the procedure is not completely straightforward, it
-allows configuration, input and output with a single carrier operation.
-Given the typical use case of FMC devices, GPIO operations are not
-expected to ever by in hot paths, and GPIO access so fare has only been
-used to configure the interrupt pin, mode and polarity. Especially
-reading inputs is not expected to be common. If your device has GPIO
-capabilities in the hot path, you should consider using the kernel's
-GPIO mechanisms.
diff --git a/Documentation/fmc/fmc-chardev.txt b/Documentation/fmc/fmc-chardev.txt
deleted file mode 100644
index d9ccb278e597..000000000000
--- a/Documentation/fmc/fmc-chardev.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-fmc-chardev
-===========
-
-This is a simple generic driver, that allows user access by means of a
-character device (actually, one for each mezzanine it takes hold of).
-
-The char device is created as a misc device. Its name in /dev (as
-created by udev) is the same name as the underlying FMC device. Thus,
-the name can be a silly fmc-0000 look-alike if the device has no
-identifiers nor bus_id, a more specific fmc-0400 if the device has a
-bus-specific address but no associated name, or something like
-fdelay-0400 if the FMC core can rely on both a mezzanine name and a bus
-address.
-
-Currently the driver only supports read and write: you can lseek to the
-desired address and read or write a register.
-
-The driver assumes all registers are 32-bit in size, and only accepts a
-single read or write per system call. However, as a result of Unix read
-and write semantics, users can simply fread or fwrite bigger areas in
-order to dump or store bigger memory areas.
-
-There is currently no support for mmap, user-space interrupt management
-and DMA buffers. They may be added in later versions, if the need
-arises.
-
-The example below shows raw access to a SPEC card programmed with its
-golden FPGA file, that features an SDB structure at offset 256 - i.e.
-64 words. The mezzanine's EEPROM in this case is not programmed, so the
-default name is fmc-<bus><devfn>, and there are two cards in the system:
-
- spusa.root# insmod fmc-chardev.ko
- [ 1073.339332] spec 0000:02:00.0: Driver has no ID: matches all
- [ 1073.345051] spec 0000:02:00.0: Created misc device "fmc-0200"
- [ 1073.350821] spec 0000:04:00.0: Driver has no ID: matches all
- [ 1073.356525] spec 0000:04:00.0: Created misc device "fmc-0400"
- spusa.root# ls -l /dev/fmc*
- crw------- 1 root root 10, 58 Nov 20 19:23 /dev/fmc-0200
- crw------- 1 root root 10, 57 Nov 20 19:23 /dev/fmc-0400
- spusa.root# dd bs=4 skip=64 count=1 if=/dev/fmc-0200 2> /dev/null | od -t x1z
- 0000000 2d 42 44 53 >-BDS<
- 0000004
-
-The simple program tools/fmc-mem in this package can access an FMC char
-device and read or write a word or a whole area. Actually, the program
-is not specific to FMC at all, it just uses lseek, read and write.
-
-Its first argument is the device name, the second the offset, the third
-(if any) the value to write and the optional last argument that must
-begin with "+" is the number of bytes to read or write. In case of
-repeated reading data is written to stdout; repeated writes read from
-stdin and the value argument is ignored.
-
-The following examples show reading the SDB magic number and the first
-SDB record from a SPEC device programmed with its golden image:
-
- spusa.root# ./fmc-mem /dev/fmc-0200 100
- 5344422d
- spusa.root# ./fmc-mem /dev/fmc-0200 100 +40 | od -Ax -t x1z
- 000000 2d 42 44 53 00 01 02 00 00 00 00 00 00 00 00 00 >-BDS............<
- 000010 00 00 00 00 ff 01 00 00 00 00 00 00 51 06 00 00 >............Q...<
- 000020 c9 42 a5 e6 02 00 00 00 11 05 12 20 2d 34 42 57 >.B......... -4BW<
- 000030 73 6f 72 43 72 61 62 73 49 53 47 2d 00 20 20 20 >sorCrabsISG-. <
- 000040
diff --git a/Documentation/fmc/fmc-fakedev.txt b/Documentation/fmc/fmc-fakedev.txt
deleted file mode 100644
index e85b74a4ae30..000000000000
--- a/Documentation/fmc/fmc-fakedev.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-fmc-fakedev
-===========
-
-This package includes a software-only device, called fmc-fakedev, which
-is able to register up to 4 mezzanines (by default it registers one).
-Unlike the SPEC driver, which creates an FMC device for each PCI cards
-it manages, this module creates a single instance of its set of
-mezzanines.
-
-It is meant as the simplest possible example of how a driver should be
-written, and it includes a fake EEPROM image (built using the tools
-described in *note FMC Identification::),, which by default is
-replicated for each fake mezzanine.
-
-You can also use this device to verify the match algorithms, by asking
-it to test your own EEPROM image. You can provide the image by means of
-the eeprom= module parameter: the new EEPROM image is loaded, as usual,
-by means of the firmware loader. This example shows the defaults and a
-custom EEPROM image:
-
- spusa.root# insmod fmc-fakedev.ko
- [ 99.971247] fake-fmc-carrier: mezzanine 0
- [ 99.975393] Manufacturer: fake-vendor
- [ 99.979624] Product name: fake-design-for-testing
- spusa.root# rmmod fmc-fakedev
- spusa.root# insmod fmc-fakedev.ko eeprom=fdelay-eeprom.bin
- [ 121.447464] fake-fmc-carrier: Mezzanine 0: eeprom "fdelay-eeprom.bin"
- [ 121.462725] fake-fmc-carrier: mezzanine 0
- [ 121.466858] Manufacturer: CERN
- [ 121.470477] Product name: FmcDelay1ns4cha
- spusa.root# rmmod fmc-fakedev
-
-After loading the device, you can use the write_ee method do modify its
-own internal fake EEPROM: whenever the image is overwritten starting at
-offset 0, the module will unregister and register again the FMC device.
-This is shown in fmc-write-eeprom.txt
diff --git a/Documentation/fmc/fmc-trivial.txt b/Documentation/fmc/fmc-trivial.txt
deleted file mode 100644
index d1910bc67159..000000000000
--- a/Documentation/fmc/fmc-trivial.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-fmc-trivial
-===========
-
-The simple module fmc-trivial is just a simple client that registers an
-interrupt handler. I used it to verify the basic mechanism of the FMC
-bus and how interrupts worked.
-
-The module implements the generic FMC parameters, so it can program a
-different gateware file in each card. The whole list of parameters it
-accepts are:
-
-`busid='
-`gateware='
- Generic parameters. See mezzanine.txt
-
-
-This driver is worth reading, in my opinion.
diff --git a/Documentation/fmc/fmc-write-eeprom.txt b/Documentation/fmc/fmc-write-eeprom.txt
deleted file mode 100644
index e0a9712156aa..000000000000
--- a/Documentation/fmc/fmc-write-eeprom.txt
+++ /dev/null
@@ -1,98 +0,0 @@
-fmc-write-eeprom
-================
-
-This module is designed to load a binary file from /lib/firmware and to
-write it to the internal EEPROM of the mezzanine card. This driver uses
-the `busid' generic parameter.
-
-Overwriting the EEPROM is not something you should do daily, and it is
-expected to only happen during manufacturing. For this reason, the
-module makes it unlikely for the random user to change a working EEPROM.
-
-However, since the EEPROM may include application-specific information
-other than the identification, later versions of this packages added
-write-support through sysfs. See *note Accessing the EEPROM::.
-
-To avoid damaging the EEPROM content, the module takes the following
-measures:
-
- * It accepts a `file=' argument (within /lib/firmware) and if no
- such argument is received, it doesn't write anything to EEPROM
- (i.e. there is no default file name).
-
- * If the file name ends with `.bin' it is written verbatim starting
- at offset 0.
-
- * If the file name ends with `.tlv' it is interpreted as
- type-length-value (i.e., it allows writev(2)-like operation).
-
- * If the file name doesn't match any of the patterns above, it is
- ignored and no write is performed.
-
- * Only cards listed with `busid=' are written to. If no busid is
- specified, no programming is done (and the probe function of the
- driver will fail).
-
-
-Each TLV tuple is formatted in this way: the header is 5 bytes,
-followed by data. The first byte is `w' for write, the next two bytes
-represent the address, in little-endian byte order, and the next two
-represent the data length, in little-endian order. The length does not
-include the header (it is the actual number of bytes to be written).
-
-This is a real example: that writes 5 bytes at position 0x110:
-
- spusa.root# od -t x1 -Ax /lib/firmware/try.tlv
- 000000 77 10 01 05 00 30 31 32 33 34
- 00000a
- spusa.root# insmod /tmp/fmc-write-eeprom.ko busid=0x0200 file=try.tlv
- [19983.391498] spec 0000:03:00.0: write 5 bytes at 0x0110
- [19983.414615] spec 0000:03:00.0: write_eeprom: success
-
-Please note that you'll most likely want to use SDBFS to build your
-EEPROM image, at least if your mezzanines are being used in the White
-Rabbit environment. For this reason the TLV format is not expected to
-be used much and is not expected to be developed further.
-
-If you want to try reflashing fake EEPROM devices, you can use the
-fmc-fakedev.ko module (see *note fmc-fakedev::). Whenever you change
-the image starting at offset 0, it will deregister and register again
-after two seconds. Please note, however, that if fmc-write-eeprom is
-still loaded, the system will associate it to the new device, which
-will be reprogrammed and thus will be unloaded after two seconds. The
-following example removes the module after it reflashed fakedev the
-first time.
-
- spusa.root# insmod fmc-fakedev.ko
- [ 72.984733] fake-fmc: Manufacturer: fake-vendor
- [ 72.989434] fake-fmc: Product name: fake-design-for-testing
- spusa.root# insmod fmc-write-eeprom.ko busid=0 file=fdelay-eeprom.bin; \
- rmmod fmc-write-eeprom
- [ 130.874098] fake-fmc: Matching a generic driver (no ID)
- [ 130.887845] fake-fmc: programming 6155 bytes
- [ 130.894567] fake-fmc: write_eeprom: success
- [ 132.895794] fake-fmc: Manufacturer: CERN
- [ 132.899872] fake-fmc: Product name: FmcDelay1ns4cha
-
-
-Accessing the EEPROM
-=====================
-
-The bus creates a sysfs binary file called eeprom for each mezzanine it
-knows about:
-
- spusa.root# cd /sys/bus/fmc/devices; ls -l */eeprom
- -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcAdc100m14b4cha-0800/eeprom
- -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDelay1ns4cha-0200/eeprom
- -r--r--r-- 1 root root 8192 Feb 21 12:30 FmcDio5cha-0400/eeprom
-
-Everybody can read the files and the superuser can also modify it, but
-the operation may on the carrier driver, if the carrier is unable to
-access the I2C bus. For example, the spec driver can access the bus
-only with its golden gateware: after a mezzanine driver reprogrammed
-the FPGA with a custom circuit, the carrier is unable to access the
-EEPROM and returns ENOTSUPP.
-
-An alternative way to write the EEPROM is the mezzanine driver
-fmc-write-eeprom (See *note fmc-write-eeprom::), but the procedure is
-more complex.
diff --git a/Documentation/fmc/identifiers.txt b/Documentation/fmc/identifiers.txt
deleted file mode 100644
index 3bb577ff0d52..000000000000
--- a/Documentation/fmc/identifiers.txt
+++ /dev/null
@@ -1,168 +0,0 @@
-FMC Identification
-******************
-
-The FMC standard requires every compliant mezzanine to carry
-identification information in an I2C EEPROM. The information must be
-laid out according to the "IPMI Platform Management FRU Information",
-where IPMI is a lie I'd better not expand, and FRU means "Field
-Replaceable Unit".
-
-The FRU information is an intricate unreadable binary blob that must
-live at offset 0 of the EEPROM, and typically extends for a few hundred
-bytes. The standard allows the application to use all the remaining
-storage area of the EEPROM as it wants.
-
-This chapter explains how to create your own EEPROM image and how to
-write it in your mezzanine, as well as how devices and drivers are
-paired at run time. EEPROM programming uses tools that are part of this
-package and SDB (part of the fpga-config-space package).
-
-The first sections are only interesting for manufacturers who need to
-write the EEPROM. If you are just a software developer writing an FMC
-device or driver, you may jump straight to *note SDB Support::.
-
-
-Building the FRU Structure
-==========================
-
-If you want to know the internals of the FRU structure and despair, you
-can retrieve the document from
-`http://download.intel.com/design/servers/ipmi/FRU1011.pdf' . The
-standard is awful and difficult without reason, so we only support the
-minimum mandatory subset - we create a simple structure and parse it
-back at run time, but we are not able to either generate or parse more
-arcane features like non-english languages and 6-bit text. If you need
-more items of the FRU standard for your boards, please submit patches.
-
-This package includes the Python script that Matthieu Cattin wrote to
-generate the FRU binary blob, based on an helper libipmi by Manohar
-Vanga and Matthieu himself. I changed the test script to receive
-parameters from the command line or from the environment (the command
-line takes precedence)
-
-To make a long story short, in order to build a standard-compliant
-binary file to be burned in your EEPROM, you need the following items:
-
- Environment Opt Official Name Default
----------------------------------------------------------------------
- FRU_VENDOR -v "Board Manufacturer" fmc-example
- FRU_NAME -n "Board Product Name" mezzanine
- FRU_SERIAL -s `Board Serial Number" 0001
- FRU_PART -p "Board Part Number" sample-part
- FRU_OUTPUT -o not applicable /dev/stdout
-
-The "Official Name" above is what you find in the FRU official
-documentation, chapter 11, page 7 ("Board Info Area Format"). The
-output option is used to save the generated binary to a specific file
-name instead of stdout.
-
-You can pass the items to the FRU generator either in the environment
-or on the command line. This package has currently no support for
-specifying power consumption or such stuff, but I plan to add it as
-soon as I find some time for that.
-
-FIXME: consumption etc for FRU are here or in PTS?
-
-The following example creates a binary image for a specific board:
-
- ./tools/fru-generator -v CERN -n FmcAdc100m14b4cha \
- -s HCCFFIA___-CR000003 -p EDA-02063-V5-0 > eeprom.bin
-
-The following example shows a script that builds several binary EEPROM
-images for a series of boards, changing the serial number for each of
-them. The script uses a mix of environment variables and command line
-options, and uses the same string patterns shown above.
-
- #!/bin/sh
-
- export FRU_VENDOR="CERN"
- export FRU_NAME="FmcAdc100m14b4cha"
- export FRU_PART="EDA-02063-V5-0"
-
- serial="HCCFFIA___-CR"
-
- for number in $(seq 1 50); do
- # build number-string "ns"
- ns="$(printf %06d $number)"
- ./fru-generator -s "${serial}${ns}" > eeprom-${ns}.bin
- done
-
-
-Using SDB-FS in the EEPROM
-==========================
-
-If you want to use SDB as a filesystem in the EEPROM device within the
-mezzanine, you should create one such filesystem using gensdbfs, from
-the fpga-config-space package on OHWR.
-
-By using an SBD filesystem you can cluster several files in a single
-EEPROM, so both the host system and a soft-core running in the FPGA (if
-any) can access extra production-time information.
-
-We chose to use SDB as a storage filesystem because the format is very
-simple, and both the host system and the soft-core will likely already
-include support code for such format. The SDB library offered by the
-fpga-config-space is less than 1kB under LM32, so it proves quite up to
-the task.
-
-The SDB entry point (which acts as a directory listing) cannot live at
-offset zero in the flash device, because the FRU information must live
-there. To avoid wasting precious storage space while still allowing
-for more-than-minimal FRU structures, the fmc.ko will look for the SDB
-record at address 256, 512 and 1024.
-
-In order to generate the complete EEPROM image you'll need a
-configuration file for gensdbfs: you tell the program where to place
-the sdb entry point, and you must force the FRU data file to be placed
-at the beginning of the storage device. If needed, you can also place
-other files at a special offset (we sometimes do it for backward
-compatibility with drivers we wrote before implementing SDB for flash
-memory).
-
-The directory tools/sdbfs of this package includes a well-commented
-example that you may want to use as a starting point (the comments are
-in the file called -SDB-CONFIG-). Reading documentation for gensdbfs
-is a suggested first step anyways.
-
-This package (generic FMC bus support) only accesses two files in the
-EEPROM: the FRU information, at offset zero, with a suggested filename
-of IPMI-FRU and the short name for the mezzanine, in a file called
-name. The IPMI-FRU name is not mandatory, but a strongly suggested
-choice; the name filename is mandatory, because this is the preferred
-short name used by the FMC core. For example, a name of "fdelay" may
-supplement a Product Name like "FmcDelay1ns4cha" - exactly as
-demonstrated in `tools/sdbfs'.
-
-Note: SDB access to flash memory is not yet supported, so the short
-name currently in use is just the "Product Name" FRU string.
-
-The example in tools/sdbfs includes an extra file, that is needed by
-the fine-delay driver, and must live at a known address of 0x1800. By
-running gensdbfs on that directory you can output your binary EEPROM
-image (here below spusa$ is the shell prompt):
-
- spusa$ ../fru-generator -v CERN -n FmcDelay1ns4cha -s proto-0 \
- -p EDA-02267-V3 > IPMI-FRU
- spusa$ ls -l
- total 16
- -rw-rw-r-- 1 rubini staff 975 Nov 19 18:08 --SDB-CONFIG--
- -rw-rw-r-- 1 rubini staff 216 Nov 19 18:13 IPMI-FRU
- -rw-rw-r-- 1 rubini staff 11 Nov 19 18:04 fd-calib
- -rw-rw-r-- 1 rubini staff 7 Nov 19 18:04 name
- spusa$ sudo gensdbfs . /lib/firmware/fdelay-eeprom.bin
- spusa$ sdb-read -l -e 0x100 /lib/firmware/fdelay-eeprom.bin
- /home/rubini/wip/sdbfs/userspace/sdb-read: listing format is to be defined
- 46696c6544617461:2e202020 00000100-000018ff .
- 46696c6544617461:6e616d65 00000200-00000206 name
- 46696c6544617461:66642d63 00001800-000018ff fd-calib
- 46696c6544617461:49504d49 00000000-000000d7 IPMI-FRU
- spusa$ ../fru-dump /lib/firmware/fdelay-eeprom.bin
- /lib/firmware/fdelay-eeprom.bin: manufacturer: CERN
- /lib/firmware/fdelay-eeprom.bin: product-name: FmcDelay1ns4cha
- /lib/firmware/fdelay-eeprom.bin: serial-number: proto-0
- /lib/firmware/fdelay-eeprom.bin: part-number: EDA-02267-V3
-
-As expected, the output file is both a proper sdbfs object and an IPMI
-FRU information blob. The fd-calib file lives at offset 0x1800 and is
-over-allocated to 256 bytes, according to the configuration file for
-gensdbfs.
diff --git a/Documentation/fmc/mezzanine.txt b/Documentation/fmc/mezzanine.txt
deleted file mode 100644
index 87910dbfc91e..000000000000
--- a/Documentation/fmc/mezzanine.txt
+++ /dev/null
@@ -1,123 +0,0 @@
-FMC Driver
-**********
-
-An FMC driver is concerned with the specific mezzanine and associated
-gateware. As such, it is expected to be independent of the carrier
-being used: it will perform I/O accesses only by means of
-carrier-provided functions.
-
-The matching between device and driver is based on the content of the
-EEPROM (as mandated by the FMC standard) or by the actual cores
-configured in the FPGA; the latter technique is used when the FPGA is
-already programmed when the device is registered to the bus core.
-
-In some special cases it is possible for a driver to directly access
-FPGA registers, by means of the `fpga_base' field of the device
-structure. This may be needed for high-bandwidth peripherals like fast
-ADC cards. If the device module registered a remote device (for example
-by means of Etherbone), the `fpga_base' pointer will be NULL.
-Therefore, drivers must be ready to deal with NULL base pointers, and
-fail gracefully. Most driver, however, are not expected to access the
-pointer directly but run fmc_readl and fmc_writel instead, which will
-work in any case.
-
-In even more special cases, the driver may access carrier-specific
-functionality: the `carrier_name' string allows the driver to check
-which is the current carrier and make use of the `carrier_data'
-pointer. We chose to use carrier names rather than numeric identifiers
-for greater flexibility, but also to avoid a central registry within
-the `fmc.h' file - we hope other users will exploit our framework with
-their own carriers. An example use of carrier names is in GPIO setup
-(see *note The GPIO Abstraction::), although the name match is not
-expected to be performed by the driver. If you depend on specific
-carriers, please check the carrier name and fail gracefully if your
-driver finds it is running in a yet-unknown-to-it environment.
-
-
-ID Table
-========
-
-Like most other Linux drivers, and FMC driver must list all the devices
-which it is able to drive. This is usually done by means of a device
-table, but in FMC we can match hardware based either on the contents of
-their EEPROM or on the actual FPGA cores that can be enumerated.
-Therefore, we have two tables of identifiers.
-
-Matching of FRU information depends on two names, the manufacturer (or
-vendor) and the device (see *note FMC Identification::); for
-flexibility during production (i.e. before writing to the EEPROM) the
-bus supports a catch-all driver that specifies NULL strings. For this
-reason, the table is specified as pointer-and-length, not a a
-null-terminated array - the entry with NULL names can be a valid entry.
-
-Matching on FPGA cores depends on two numeric fields: the 64-bit vendor
-number and the 32-bit device number. Support for matching based on
-class is not yet implemented. Each device is expected to be uniquely
-identified by an array of cores (it matches if all of the cores are
-instantiated), and for consistency the list is passed as
-pointer-and-length. Several similar devices can be driven by the same
-driver, and thus the driver specifies and array of such arrays.
-
-The complete set of involved data structures is thus the following:
-
- struct fmc_fru_id { char *manufacturer; char *product_name; };
- struct fmc_sdb_one_id { uint64_t vendor; uint32_t device; };
- struct fmc_sdb_id { struct fmc_sdb_one_id *cores; int cores_nr; };
-
- struct fmc_device_id {
- struct fmc_fru_id *fru_id; int fru_id_nr;
- struct fmc_sdb_id *sdb_id; int sdb_id_nr;
- };
-
-A better reference, with full explanation, is the <linux/fmc.h> header.
-
-
-Module Parameters
-=================
-
-Most of the FMC drivers need the same set of kernel parameters. This
-package includes support to implement common parameters by means of
-fields in the `fmc_driver' structure and simple macro definitions.
-
-The parameters are carrier-specific, in that they rely on the busid
-concept, that varies among carriers. For the SPEC, the identifier is a
-PCI bus and devfn number, 16 bits wide in total; drivers for other
-carriers will most likely offer something similar but not identical,
-and some code duplication is unavoidable.
-
-This is the list of parameters that are common to several modules to
-see how they are actually used, please look at spec-trivial.c.
-
-`busid='
- This is an array of integers, listing carrier-specific
- identification numbers. For PIC, for example, `0x0400' represents
- bus 4, slot 0. If any such ID is specified, the driver will only
- accept to drive cards that appear in the list (even if the FMC ID
- matches). This is accomplished by the validate carrier method.
-
-`gateware='
- The argument is an array of strings. If no busid= is specified,
- the first string of gateware= is used for all cards; otherwise the
- identifiers and gateware names are paired one by one, in the order
- specified.
-
-`show_sdb='
- For modules supporting it, this parameter asks to show the SDB
- internal structure by means of kernel messages. It is disabled by
- default because those lines tend to hide more important messages,
- if you look at the system console while loading the drivers.
- Note: the parameter is being obsoleted, because fmc.ko itself now
- supports dump_sdb= that applies to every client driver.
-
-
-For example, if you are using the trivial driver to load two different
-gateware files to two different cards, you can use the following
-parameters to load different binaries to the cards, after looking up
-the PCI identifiers. This has been tested with a SPEC carrier.
-
- insmod fmc-trivial.ko \
- busid=0x0200,0x0400 \
- gateware=fmc/fine-delay.bin,fmc/simple-dio.bin
-
-Please note that not all sub-modules support all of those parameters.
-You can use modinfo to check what is supported by each module.
diff --git a/Documentation/fmc/parameters.txt b/Documentation/fmc/parameters.txt
deleted file mode 100644
index 59edf088e3a4..000000000000
--- a/Documentation/fmc/parameters.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-Module Parameters in fmc.ko
-***************************
-
-The core driver receives two module parameters, meant to help debugging
-client modules. Both parameters can be modified by writing to
-/sys/module/fmc/parameters/, because they are used when client drivers
-are devices are registered, not when fmc.ko is loaded.
-
-`dump_eeprom='
- If not zero, the parameter asks the bus controller to dump the
- EEPROM of any device that is registered, using printk.
-
-`dump_sdb='
- If not zero, the parameter prints the SDB tree of every FPGA it is
- loaded by fmc_reprogram(). If greater than one, it asks to dump
- the binary content of SDB records. This currently only dumps the
- top-level SDB array, though.
-
-
-EEPROM dumping avoids repeating lines, since most of the contents is
-usually empty and all bits are one or zero. This is an example of the
-output:
-
- [ 6625.850480] spec 0000:02:00.0: FPGA programming successful
- [ 6626.139949] spec 0000:02:00.0: Manufacturer: CERN
- [ 6626.144666] spec 0000:02:00.0: Product name: FmcDelay1ns4cha
- [ 6626.150370] FMC: mezzanine 0: 0000:02:00.0 on SPEC
- [ 6626.155179] FMC: dumping eeprom 0x2000 (8192) bytes
- [ 6626.160087] 0000: 01 00 00 01 00 0b 00 f3 01 0a 00 a5 85 87 c4 43
- [ 6626.167069] 0010: 45 52 4e cf 46 6d 63 44 65 6c 61 79 31 6e 73 34
- [ 6626.174019] 0020: 63 68 61 c7 70 72 6f 74 6f 2d 30 cc 45 44 41 2d
- [ 6626.180975] 0030: 30 32 32 36 37 2d 56 33 da 32 30 31 32 2d 31 31
- [...]
- [ 6626.371366] 0200: 66 64 65 6c 61 79 0a 00 00 00 00 00 00 00 00 00
- [ 6626.378359] 0210: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
- [ 6626.385361] [...]
- [ 6626.387308] 1800: 70 6c 61 63 65 68 6f 6c 64 65 72 ff ff ff ff ff
- [ 6626.394259] 1810: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
- [ 6626.401250] [...]
-
-The dump of SDB looks like the following; the example shows the simple
-golden gateware for the SPEC card, removing the leading timestamps to
-fit the page:
-
- spec 0000:02:00.0: SDB: 00000651:e6a542c9 WB4-Crossbar-GSI
- spec 0000:02:00.0: SDB: 0000ce42:ff07fc47 WR-Periph-Syscon (00000000-000000ff)
- FMC: mezzanine 0: 0000:02:00.0 on SPEC
- FMC: poor dump of sdb first level:
- 0000: 53 44 42 2d 00 02 01 00 00 00 00 00 00 00 00 00
- 0010: 00 00 00 00 00 00 01 ff 00 00 00 00 00 00 06 51
- 0020: e6 a5 42 c9 00 00 00 02 20 12 05 11 57 42 34 2d
- 0030: 43 72 6f 73 73 62 61 72 2d 47 53 49 20 20 20 00
- 0040: 00 00 01 01 00 00 00 07 00 00 00 00 00 00 00 00
- 0050: 00 00 00 00 00 00 00 ff 00 00 00 00 00 00 ce 42
- 0060: ff 07 fc 47 00 00 00 01 20 12 03 05 57 52 2d 50
- 0070: 65 72 69 70 68 2d 53 79 73 63 6f 6e 20 20 20 01
diff --git a/Documentation/fpga/index.rst b/Documentation/fpga/index.rst
index 2c87d1ea084f..f80f95667ca2 100644
--- a/Documentation/fpga/index.rst
+++ b/Documentation/fpga/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
====
fpga
diff --git a/Documentation/gpio/index.rst b/Documentation/gpio/index.rst
deleted file mode 100644
index 09a4a553f434..000000000000
--- a/Documentation/gpio/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-:orphan:
-
-====
-gpio
-====
-
-.. toctree::
- :maxdepth: 1
-
- sysfs
-
-.. only:: subproject and html
-
- Indices
- =======
-
- * :ref:`genindex`
diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst
index a740e491dfcc..5acdd1842ea2 100644
--- a/Documentation/gpu/amdgpu.rst
+++ b/Documentation/gpu/amdgpu.rst
@@ -37,10 +37,10 @@ Buffer Objects
PRIME Buffer Sharing
--------------------
-.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
:doc: PRIME Buffer Sharing
-.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_prime.c
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
:internal:
MMU Notifier
@@ -70,6 +70,26 @@ Interrupt Handling
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
:internal:
+AMDGPU XGMI Support
+===================
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+ :doc: AMDGPU XGMI Support
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+ :internal:
+
+AMDGPU RAS debugfs control interface
+====================================
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+ :doc: AMDGPU RAS debugfs control interface
+
+
+.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+ :internal:
+
+
GPU Power/Thermal Controls and Monitoring
=========================================
diff --git a/Documentation/gpu/drivers.rst b/Documentation/gpu/drivers.rst
index 044a7025477c..4bfb7068e9f7 100644
--- a/Documentation/gpu/drivers.rst
+++ b/Documentation/gpu/drivers.rst
@@ -7,6 +7,7 @@ GPU Driver Documentation
amdgpu
amdgpu-dc
i915
+ mcde
meson
pl111
tegra
diff --git a/Documentation/gpu/drm-client.rst b/Documentation/gpu/drm-client.rst
index 7e672063e7eb..58b5a1d1219d 100644
--- a/Documentation/gpu/drm-client.rst
+++ b/Documentation/gpu/drm-client.rst
@@ -10,3 +10,6 @@ Kernel clients
.. kernel-doc:: drivers/gpu/drm/drm_client.c
:export:
+
+.. kernel-doc:: drivers/gpu/drm/drm_client_modeset.c
+ :export:
diff --git a/Documentation/gpu/drm-kms-helpers.rst b/Documentation/gpu/drm-kms-helpers.rst
index 14102ae035dc..b327bbc11182 100644
--- a/Documentation/gpu/drm-kms-helpers.rst
+++ b/Documentation/gpu/drm-kms-helpers.rst
@@ -181,6 +181,21 @@ Panel Helper Reference
.. kernel-doc:: drivers/gpu/drm/drm_panel_orientation_quirks.c
:export:
+Panel Self Refresh Helper Reference
+===================================
+
+.. kernel-doc:: drivers/gpu/drm/drm_self_refresh_helper.c
+ :doc: overview
+
+.. kernel-doc:: drivers/gpu/drm/drm_self_refresh_helper.c
+ :export:
+
+HDCP Helper Functions Reference
+===============================
+
+.. kernel-doc:: drivers/gpu/drm/drm_hdcp.c
+ :export:
+
Display Port Helper Functions Reference
=======================================
diff --git a/Documentation/gpu/drm-mm.rst b/Documentation/gpu/drm-mm.rst
index 54a696d961a7..c8ebd4f66a6a 100644
--- a/Documentation/gpu/drm-mm.rst
+++ b/Documentation/gpu/drm-mm.rst
@@ -79,7 +79,6 @@ count for the TTM, which will call your initialization function.
See the radeon_ttm.c file for an example of usage.
-
The Graphics Execution Manager (GEM)
====================================
@@ -380,6 +379,39 @@ GEM CMA Helper Functions Reference
.. kernel-doc:: drivers/gpu/drm/drm_gem_cma_helper.c
:export:
+VRAM Helper Function Reference
+==============================
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_helper_common.c
+ :doc: overview
+
+.. kernel-doc:: include/drm/drm_gem_vram_helper.h
+ :internal:
+
+GEM VRAM Helper Functions Reference
+-----------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_vram_helper.c
+ :doc: overview
+
+.. kernel-doc:: include/drm/drm_gem_vram_helper.h
+ :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_gem_vram_helper.c
+ :export:
+
+VRAM MM Helper Functions Reference
+----------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_mm_helper.c
+ :doc: overview
+
+.. kernel-doc:: include/drm/drm_vram_mm_helper.h
+ :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_vram_mm_helper.c
+ :export:
+
VMA Offset Manager
==================
diff --git a/Documentation/gpu/drm-uapi.rst b/Documentation/gpu/drm-uapi.rst
index c9fd23efd957..94f90521f58c 100644
--- a/Documentation/gpu/drm-uapi.rst
+++ b/Documentation/gpu/drm-uapi.rst
@@ -85,16 +85,18 @@ leads to a few additional requirements:
- The userspace side must be fully reviewed and tested to the standards of that
userspace project. For e.g. mesa this means piglit testcases and review on the
mailing list. This is again to ensure that the new interface actually gets the
- job done.
+ job done. The userspace-side reviewer should also provide an Acked-by on the
+ kernel uAPI patch indicating that they believe the proposed uAPI is sound and
+ sufficiently documented and validated for userspace's consumption.
- The userspace patches must be against the canonical upstream, not some vendor
fork. This is to make sure that no one cheats on the review and testing
requirements by doing a quick fork.
- The kernel patch can only be merged after all the above requirements are met,
- but it **must** be merged **before** the userspace patches land. uAPI always flows
- from the kernel, doing things the other way round risks divergence of the uAPI
- definitions and header files.
+ but it **must** be merged to either drm-next or drm-misc-next **before** the
+ userspace patches land. uAPI always flows from the kernel, doing things the
+ other way round risks divergence of the uAPI definitions and header files.
These are fairly steep requirements, but have grown out from years of shared
pain and experience with uAPI added hastily, and almost always regretted about
@@ -327,3 +329,12 @@ DRM_IOCTL_MODESET_CTL
mode setting, since on many devices the vertical blank counter is
reset to 0 at some point during modeset. Modern drivers should not
call this any more since with kernel mode setting it is a no-op.
+
+Userspace API Structures
+========================
+
+.. kernel-doc:: include/uapi/drm/drm_mode.h
+ :doc: overview
+
+.. kernel-doc:: include/uapi/drm/drm_mode.h
+ :internal:
diff --git a/Documentation/gpu/i915.rst b/Documentation/gpu/i915.rst
index 055df45596c1..c38ef0dda605 100644
--- a/Documentation/gpu/i915.rst
+++ b/Documentation/gpu/i915.rst
@@ -61,7 +61,7 @@ Intel GVT-g Host Support(vGPU device model)
Workarounds
-----------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_workarounds.c
+.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_workarounds.c
:doc: Hardware workarounds
Display Hardware Handling
@@ -82,13 +82,13 @@ change.
Frontbuffer Tracking
--------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_frontbuffer.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_frontbuffer.c
:doc: frontbuffer tracking
-.. kernel-doc:: drivers/gpu/drm/i915/intel_frontbuffer.h
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_frontbuffer.h
:internal:
-.. kernel-doc:: drivers/gpu/drm/i915/intel_frontbuffer.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_frontbuffer.c
:internal:
.. kernel-doc:: drivers/gpu/drm/i915/i915_gem.c
@@ -97,10 +97,10 @@ Frontbuffer Tracking
Display FIFO Underrun Reporting
-------------------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fifo_underrun.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fifo_underrun.c
:doc: fifo underrun handling
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fifo_underrun.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fifo_underrun.c
:internal:
Plane Configuration
@@ -115,10 +115,10 @@ panel self refresh.
Atomic Plane Helpers
--------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_atomic_plane.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_atomic_plane.c
:doc: atomic plane helpers
-.. kernel-doc:: drivers/gpu/drm/i915/intel_atomic_plane.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_atomic_plane.c
:internal:
Output Probing
@@ -132,19 +132,19 @@ probing, so those sections fully apply.
Hotplug
-------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_hotplug.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_hotplug.c
:doc: Hotplug
-.. kernel-doc:: drivers/gpu/drm/i915/intel_hotplug.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_hotplug.c
:internal:
High Definition Audio
---------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_audio.c
:doc: High Definition Audio over HDMI and Display Port
-.. kernel-doc:: drivers/gpu/drm/i915/intel_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_audio.c
:internal:
.. kernel-doc:: include/drm/i915_component.h
@@ -153,58 +153,58 @@ High Definition Audio
Intel HDMI LPE Audio Support
----------------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lpe_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_lpe_audio.c
:doc: LPE Audio integration for HDMI or DP playback
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lpe_audio.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_lpe_audio.c
:internal:
Panel Self Refresh PSR (PSR/SRD)
--------------------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_psr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_psr.c
:doc: Panel Self Refresh (PSR/SRD)
-.. kernel-doc:: drivers/gpu/drm/i915/intel_psr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_psr.c
:internal:
Frame Buffer Compression (FBC)
------------------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fbc.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fbc.c
:doc: Frame Buffer Compression (FBC)
-.. kernel-doc:: drivers/gpu/drm/i915/intel_fbc.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_fbc.c
:internal:
Display Refresh Rate Switching (DRRS)
-------------------------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
:doc: Display Refresh Rate Switching (DRRS)
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
:functions: intel_dp_set_drrs_state
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
:functions: intel_edp_drrs_enable
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
:functions: intel_edp_drrs_disable
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
:functions: intel_edp_drrs_invalidate
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
:functions: intel_edp_drrs_flush
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dp.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dp.c
:functions: intel_dp_drrs_init
DPIO
----
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpio_phy.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpio_phy.c
:doc: DPIO
CSR firmware support for DMC
@@ -219,34 +219,34 @@ CSR firmware support for DMC
Video BIOS Table (VBT)
----------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_bios.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_bios.c
:doc: Video BIOS Table (VBT)
-.. kernel-doc:: drivers/gpu/drm/i915/intel_bios.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_bios.c
:internal:
-.. kernel-doc:: drivers/gpu/drm/i915/intel_vbt_defs.h
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_vbt_defs.h
:internal:
Display clocks
--------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_cdclk.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_cdclk.c
:doc: CDCLK / RAWCLK
-.. kernel-doc:: drivers/gpu/drm/i915/intel_cdclk.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_cdclk.c
:internal:
Display PLLs
------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpll_mgr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpll_mgr.c
:doc: Display PLLs
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpll_mgr.c
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpll_mgr.c
:internal:
-.. kernel-doc:: drivers/gpu/drm/i915/intel_dpll_mgr.h
+.. kernel-doc:: drivers/gpu/drm/i915/display/intel_dpll_mgr.h
:internal:
Memory Management and Command Submission
@@ -349,7 +349,7 @@ of buffer object caches. Shrinking is used to make main memory
available. Note that this is mostly orthogonal to evicting buffer
objects, which has the goal to make space in gpu virtual address spaces.
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_shrinker.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
:internal:
Batchbuffer Parsing
@@ -373,18 +373,15 @@ Batchbuffer Pools
User Batchbuffer Execution
--------------------------
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_execbuffer.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
:doc: User command execution
Logical Rings, Logical Ring Contexts and Execlists
--------------------------------------------------
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lrc.c
+.. kernel-doc:: drivers/gpu/drm/i915/gt/intel_lrc.c
:doc: Logical Rings, Logical Ring Contexts and Execlists
-.. kernel-doc:: drivers/gpu/drm/i915/intel_lrc.c
- :internal:
-
Global GTT views
----------------
@@ -415,10 +412,10 @@ Hardware Tiling and Swizzling Details
Object Tiling IOCTLs
--------------------
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_tiling.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_tiling.c
:internal:
-.. kernel-doc:: drivers/gpu/drm/i915/i915_gem_tiling.c
+.. kernel-doc:: drivers/gpu/drm/i915/gem/i915_gem_tiling.c
:doc: buffer object tiling
WOPCM
@@ -478,12 +475,6 @@ i915_context_create and i915_context_free
.. kernel-doc:: drivers/gpu/drm/i915/i915_trace.h
:doc: i915_context_create and i915_context_free tracepoints
-switch_mm
----------
-
-.. kernel-doc:: drivers/gpu/drm/i915/i915_trace.h
- :doc: switch_mm tracepoint
-
Perf
====
diff --git a/Documentation/gpu/mcde.rst b/Documentation/gpu/mcde.rst
new file mode 100644
index 000000000000..c69e977defda
--- /dev/null
+++ b/Documentation/gpu/mcde.rst
@@ -0,0 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================================
+ drm/mcde ST-Ericsson MCDE Multi-channel display engine
+=======================================================
+
+.. kernel-doc:: drivers/gpu/drm/mcde/mcde_drv.c
+ :doc: ST-Ericsson MCDE DRM Driver
diff --git a/Documentation/gpu/todo.rst b/Documentation/gpu/todo.rst
index 1528ad2d598b..0a49c5a1d9ce 100644
--- a/Documentation/gpu/todo.rst
+++ b/Documentation/gpu/todo.rst
@@ -10,25 +10,6 @@ graphics subsystem useful as newbie projects. Or for slow rainy days.
Subsystem-wide refactorings
===========================
-De-midlayer drivers
--------------------
-
-With the recent ``drm_bus`` cleanup patches for 3.17 it is no longer required
-to have a ``drm_bus`` structure set up. Drivers can directly set up the
-``drm_device`` structure instead of relying on bus methods in ``drm_usb.c``
-and ``drm_pci.c``. The goal is to get rid of the driver's ``->load`` /
-``->unload`` callbacks and open-code the load/unload sequence properly, using
-the new two-stage ``drm_device`` setup/teardown.
-
-Once all existing drivers are converted we can also remove those bus support
-files for USB and platform devices.
-
-All you need is a GPU for a non-converted driver (currently almost all of
-them, but also all the virtual ones used by KVM, so everyone qualifies).
-
-Contact: Daniel Vetter, Thierry Reding, respective driver maintainers
-
-
Remove custom dumb_map_offset implementations
---------------------------------------------
@@ -247,6 +228,12 @@ struct drm_gem_object_funcs
GEM objects can now have a function table instead of having the callbacks on the
DRM driver struct. This is now the preferred way and drivers can be moved over.
+DRM_GEM_CMA_VMAP_DRIVER_OPS, DRM_GEM_SHMEM_DRIVER_OPS already support this, but
+DRM_GEM_VRAM_DRIVER_PRIME does not yet and needs to be aligned with the previous
+two. We also need a 2nd version of the CMA define that doesn't require the
+vmapping to be present (different hook for prime importing). Plus this needs to
+be rolled out to all drivers using their own implementations, too.
+
Use DRM_MODESET_LOCK_ALL_* helpers instead of boilerplate
---------------------------------------------------------
@@ -300,6 +287,21 @@ it to use drm_mode_hsync() instead.
Contact: Sean Paul
+drm_fb_helper tasks
+-------------------
+
+- drm_fb_helper_restore_fbdev_mode_unlocked() should call restore_fbdev_mode()
+ not the _force variant so it can bail out if there is a master. But first
+ these igt tests need to be fixed: kms_fbcon_fbt@psr and
+ kms_fbcon_fbt@psr-suspend.
+
+- The max connector argument for drm_fb_helper_init() and
+ drm_fb_helper_fbdev_setup() isn't used anymore and can be removed.
+
+- The helper doesn't keep an array of connectors anymore so these can be
+ removed: drm_fb_helper_single_add_all_connectors(),
+ drm_fb_helper_add_one_connector() and drm_fb_helper_remove_one_connector().
+
Core refactorings
=================
@@ -488,5 +490,20 @@ i915
device_link_add to model the dependency between i915 and snd_had. See
https://dri.freedesktop.org/docs/drm/driver-api/device_link.html
+Bootsplash
+==========
+
+There is support in place now for writing internal DRM clients making it
+possible to pick up the bootsplash work that was rejected because it was written
+for fbdev.
+
+- [v6,8/8] drm/client: Hack: Add bootsplash example
+ https://patchwork.freedesktop.org/patch/306579/
+
+- [RFC PATCH v2 00/13] Kernel based bootsplash
+ https://lkml.org/lkml/2017/12/13/764
+
+Contact: Sam Ravnborg
+
Outside DRM
===========
diff --git a/Documentation/hid/hid-alps.rst b/Documentation/hid/hid-alps.rst
new file mode 100644
index 000000000000..e2f4c4c11e3f
--- /dev/null
+++ b/Documentation/hid/hid-alps.rst
@@ -0,0 +1,180 @@
+==========================
+ALPS HID Touchpad Protocol
+==========================
+
+Introduction
+------------
+Currently ALPS HID driver supports U1 Touchpad device.
+
+U1 device basic information.
+
+========== ======
+Vender ID 0x044E
+Product ID 0x120B
+Version ID 0x0121
+========== ======
+
+
+HID Descriptor
+--------------
+
+======= ==================== ===== =======================================
+Byte Field Value Notes
+======= ==================== ===== =======================================
+0 wHIDDescLength 001E Length of HID Descriptor : 30 bytes
+2 bcdVersion 0100 Compliant with Version 1.00
+4 wReportDescLength 00B2 Report Descriptor is 178 Bytes (0x00B2)
+6 wReportDescRegister 0002 Identifier to read Report Descriptor
+8 wInputRegister 0003 Identifier to read Input Report
+10 wMaxInputLength 0053 Input Report is 80 Bytes + 2
+12 wOutputRegister 0000 Identifier to read Output Report
+14 wMaxOutputLength 0000 No Output Reports
+16 wCommandRegister 0005 Identifier for Command Register
+18 wDataRegister 0006 Identifier for Data Register
+20 wVendorID 044E Vendor ID 0x044E
+22 wProductID 120B Product ID 0x120B
+24 wVersionID 0121 Version 01.21
+26 RESERVED 0000 RESERVED
+======= ==================== ===== =======================================
+
+
+Report ID
+---------
+
+========== ================= =========================================
+ReportID-1 (Input Reports) (HIDUsage-Mouse) for TP&SP
+ReportID-2 (Input Reports) (HIDUsage-keyboard) for TP
+ReportID-3 (Input Reports) (Vendor Usage: Max 10 finger data) for TP
+ReportID-4 (Input Reports) (Vendor Usage: ON bit data) for GP
+ReportID-5 (Feature Reports) Feature Reports
+ReportID-6 (Input Reports) (Vendor Usage: StickPointer data) for SP
+ReportID-7 (Feature Reports) Flash update (Bootloader)
+========== ================= =========================================
+
+
+Data pattern
+------------
+
+===== ========== ===== =================
+Case1 ReportID_1 TP/SP Relative/Relative
+Case2 ReportID_3 TP Absolute
+ ReportID_6 SP Absolute
+===== ========== ===== =================
+
+
+Command Read/Write
+------------------
+To read/write to RAM, need to send a commands to the device.
+
+The command format is as below.
+
+DataByte(SET_REPORT)
+
+===== ======================
+Byte1 Command Byte
+Byte2 Address - Byte 0 (LSB)
+Byte3 Address - Byte 1
+Byte4 Address - Byte 2
+Byte5 Address - Byte 3 (MSB)
+Byte6 Value Byte
+Byte7 Checksum
+===== ======================
+
+Command Byte is read=0xD1/write=0xD2 .
+
+Address is read/write RAM address.
+
+Value Byte is writing data when you send the write commands.
+
+When you read RAM, there is no meaning.
+
+DataByte(GET_REPORT)
+
+===== ======================
+Byte1 Response Byte
+Byte2 Address - Byte 0 (LSB)
+Byte3 Address - Byte 1
+Byte4 Address - Byte 2
+Byte5 Address - Byte 3 (MSB)
+Byte6 Value Byte
+Byte7 Checksum
+===== ======================
+
+Read value is stored in Value Byte.
+
+
+Packet Format
+Touchpad data byte
+------------------
+
+
+======= ======= ======= ======= ======= ======= ======= ======= =====
+- b7 b6 b5 b4 b3 b2 b1 b0
+======= ======= ======= ======= ======= ======= ======= ======= =====
+1 0 0 SW6 SW5 SW4 SW3 SW2 SW1
+2 0 0 0 Fcv Fn3 Fn2 Fn1 Fn0
+3 Xa0_7 Xa0_6 Xa0_5 Xa0_4 Xa0_3 Xa0_2 Xa0_1 Xa0_0
+4 Xa0_15 Xa0_14 Xa0_13 Xa0_12 Xa0_11 Xa0_10 Xa0_9 Xa0_8
+5 Ya0_7 Ya0_6 Ya0_5 Ya0_4 Ya0_3 Ya0_2 Ya0_1 Ya0_0
+6 Ya0_15 Ya0_14 Ya0_13 Ya0_12 Ya0_11 Ya0_10 Ya0_9 Ya0_8
+7 LFB0 Zs0_6 Zs0_5 Zs0_4 Zs0_3 Zs0_2 Zs0_1 Zs0_0
+
+8 Xa1_7 Xa1_6 Xa1_5 Xa1_4 Xa1_3 Xa1_2 Xa1_1 Xa1_0
+9 Xa1_15 Xa1_14 Xa1_13 Xa1_12 Xa1_11 Xa1_10 Xa1_9 Xa1_8
+10 Ya1_7 Ya1_6 Ya1_5 Ya1_4 Ya1_3 Ya1_2 Ya1_1 Ya1_0
+11 Ya1_15 Ya1_14 Ya1_13 Ya1_12 Ya1_11 Ya1_10 Ya1_9 Ya1_8
+12 LFB1 Zs1_6 Zs1_5 Zs1_4 Zs1_3 Zs1_2 Zs1_1 Zs1_0
+
+13 Xa2_7 Xa2_6 Xa2_5 Xa2_4 Xa2_3 Xa2_2 Xa2_1 Xa2_0
+14 Xa2_15 Xa2_14 Xa2_13 Xa2_12 Xa2_11 Xa2_10 Xa2_9 Xa2_8
+15 Ya2_7 Ya2_6 Ya2_5 Ya2_4 Ya2_3 Ya2_2 Ya2_1 Ya2_0
+16 Ya2_15 Ya2_14 Ya2_13 Ya2_12 Ya2_11 Ya2_10 Ya2_9 Ya2_8
+17 LFB2 Zs2_6 Zs2_5 Zs2_4 Zs2_3 Zs2_2 Zs2_1 Zs2_0
+
+18 Xa3_7 Xa3_6 Xa3_5 Xa3_4 Xa3_3 Xa3_2 Xa3_1 Xa3_0
+19 Xa3_15 Xa3_14 Xa3_13 Xa3_12 Xa3_11 Xa3_10 Xa3_9 Xa3_8
+20 Ya3_7 Ya3_6 Ya3_5 Ya3_4 Ya3_3 Ya3_2 Ya3_1 Ya3_0
+21 Ya3_15 Ya3_14 Ya3_13 Ya3_12 Ya3_11 Ya3_10 Ya3_9 Ya3_8
+22 LFB3 Zs3_6 Zs3_5 Zs3_4 Zs3_3 Zs3_2 Zs3_1 Zs3_0
+
+23 Xa4_7 Xa4_6 Xa4_5 Xa4_4 Xa4_3 Xa4_2 Xa4_1 Xa4_0
+24 Xa4_15 Xa4_14 Xa4_13 Xa4_12 Xa4_11 Xa4_10 Xa4_9 Xa4_8
+25 Ya4_7 Ya4_6 Ya4_5 Ya4_4 Ya4_3 Ya4_2 Ya4_1 Ya4_0
+26 Ya4_15 Ya4_14 Ya4_13 Ya4_12 Ya4_11 Ya4_10 Ya4_9 Ya4_8
+27 LFB4 Zs4_6 Zs4_5 Zs4_4 Zs4_3 Zs4_2 Zs4_1 Zs4_0
+======= ======= ======= ======= ======= ======= ======= ======= =====
+
+
+SW1-SW6:
+ SW ON/OFF status
+Xan_15-0(16bit):
+ X Absolute data of the "n"th finger
+Yan_15-0(16bit):
+ Y Absolute data of the "n"th finger
+Zsn_6-0(7bit):
+ Operation area of the "n"th finger
+
+
+StickPointer data byte
+----------------------
+
+======= ======= ======= ======= ======= ======= ======= ======= =====
+- b7 b6 b5 b4 b3 b2 b1 b0
+======= ======= ======= ======= ======= ======= ======= ======= =====
+Byte1 1 1 1 0 1 SW3 SW2 SW1
+Byte2 X7 X6 X5 X4 X3 X2 X1 X0
+Byte3 X15 X14 X13 X12 X11 X10 X9 X8
+Byte4 Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
+Byte5 Y15 Y14 Y13 Y12 Y11 Y10 Y9 Y8
+Byte6 Z7 Z6 Z5 Z4 Z3 Z2 Z1 Z0
+Byte7 T&P Z14 Z13 Z12 Z11 Z10 Z9 Z8
+======= ======= ======= ======= ======= ======= ======= ======= =====
+
+SW1-SW3:
+ SW ON/OFF status
+Xn_15-0(16bit):
+ X Absolute data
+Yn_15-0(16bit):
+ Y Absolute data
+Zn_14-0(15bit):
+ Z
diff --git a/Documentation/hid/hid-alps.txt b/Documentation/hid/hid-alps.txt
deleted file mode 100644
index 6b02a2447c77..000000000000
--- a/Documentation/hid/hid-alps.txt
+++ /dev/null
@@ -1,139 +0,0 @@
-ALPS HID Touchpad Protocol
-----------------------
-
-Introduction
-------------
-Currently ALPS HID driver supports U1 Touchpad device.
-
-U1 devuce basic information.
-Vender ID 0x044E
-Product ID 0x120B
-Version ID 0x0121
-
-
-HID Descriptor
-------------
-Byte Field Value Notes
-0 wHIDDescLength 001E Length of HID Descriptor : 30 bytes
-2 bcdVersion 0100 Compliant with Version 1.00
-4 wReportDescLength 00B2 Report Descriptor is 178 Bytes (0x00B2)
-6 wReportDescRegister 0002 Identifier to read Report Descriptor
-8 wInputRegister 0003 Identifier to read Input Report
-10 wMaxInputLength 0053 Input Report is 80 Bytes + 2
-12 wOutputRegister 0000 Identifier to read Output Report
-14 wMaxOutputLength 0000 No Output Reports
-16 wCommandRegister 0005 Identifier for Command Register
-18 wDataRegister 0006 Identifier for Data Register
-20 wVendorID 044E Vendor ID 0x044E
-22 wProductID 120B Product ID 0x120B
-24 wVersionID 0121 Version 01.21
-26 RESERVED 0000 RESERVED
-
-
-Report ID
-------------
-ReportID-1 (Input Reports) (HIDUsage-Mouse) for TP&SP
-ReportID-2 (Input Reports) (HIDUsage-keyboard) for TP
-ReportID-3 (Input Reports) (Vendor Usage: Max 10 finger data) for TP
-ReportID-4 (Input Reports) (Vendor Usage: ON bit data) for GP
-ReportID-5 (Feature Reports) Feature Reports
-ReportID-6 (Input Reports) (Vendor Usage: StickPointer data) for SP
-ReportID-7 (Feature Reports) Flash update (Bootloader)
-
-
-Data pattern
-------------
-Case1 ReportID_1 TP/SP Relative/Relative
-Case2 ReportID_3 TP Absolute
- ReportID_6 SP Absolute
-
-
-Command Read/Write
-------------------
-To read/write to RAM, need to send a commands to the device.
-The command format is as below.
-
-DataByte(SET_REPORT)
-Byte1 Command Byte
-Byte2 Address - Byte 0 (LSB)
-Byte3 Address - Byte 1
-Byte4 Address - Byte 2
-Byte5 Address - Byte 3 (MSB)
-Byte6 Value Byte
-Byte7 Checksum
-
-Command Byte is read=0xD1/write=0xD2 .
-Address is read/write RAM address.
-Value Byte is writing data when you send the write commands.
-When you read RAM, there is no meaning.
-
-DataByte(GET_REPORT)
-Byte1 Response Byte
-Byte2 Address - Byte 0 (LSB)
-Byte3 Address - Byte 1
-Byte4 Address - Byte 2
-Byte5 Address - Byte 3 (MSB)
-Byte6 Value Byte
-Byte7 Checksum
-
-Read value is stored in Value Byte.
-
-
-Packet Format
-Touchpad data byte
-------------------
- b7 b6 b5 b4 b3 b2 b1 b0
-1 0 0 SW6 SW5 SW4 SW3 SW2 SW1
-2 0 0 0 Fcv Fn3 Fn2 Fn1 Fn0
-3 Xa0_7 Xa0_6 Xa0_5 Xa0_4 Xa0_3 Xa0_2 Xa0_1 Xa0_0
-4 Xa0_15 Xa0_14 Xa0_13 Xa0_12 Xa0_11 Xa0_10 Xa0_9 Xa0_8
-5 Ya0_7 Ya0_6 Ya0_5 Ya0_4 Ya0_3 Ya0_2 Ya0_1 Ya0_0
-6 Ya0_15 Ya0_14 Ya0_13 Ya0_12 Ya0_11 Ya0_10 Ya0_9 Ya0_8
-7 LFB0 Zs0_6 Zs0_5 Zs0_4 Zs0_3 Zs0_2 Zs0_1 Zs0_0
-
-8 Xa1_7 Xa1_6 Xa1_5 Xa1_4 Xa1_3 Xa1_2 Xa1_1 Xa1_0
-9 Xa1_15 Xa1_14 Xa1_13 Xa1_12 Xa1_11 Xa1_10 Xa1_9 Xa1_8
-10 Ya1_7 Ya1_6 Ya1_5 Ya1_4 Ya1_3 Ya1_2 Ya1_1 Ya1_0
-11 Ya1_15 Ya1_14 Ya1_13 Ya1_12 Ya1_11 Ya1_10 Ya1_9 Ya1_8
-12 LFB1 Zs1_6 Zs1_5 Zs1_4 Zs1_3 Zs1_2 Zs1_1 Zs1_0
-
-13 Xa2_7 Xa2_6 Xa2_5 Xa2_4 Xa2_3 Xa2_2 Xa2_1 Xa2_0
-14 Xa2_15 Xa2_14 Xa2_13 Xa2_12 Xa2_11 Xa2_10 Xa2_9 Xa2_8
-15 Ya2_7 Ya2_6 Ya2_5 Ya2_4 Ya2_3 Ya2_2 Ya2_1 Ya2_0
-16 Ya2_15 Ya2_14 Ya2_13 Ya2_12 Ya2_11 Ya2_10 Ya2_9 Ya2_8
-17 LFB2 Zs2_6 Zs2_5 Zs2_4 Zs2_3 Zs2_2 Zs2_1 Zs2_0
-
-18 Xa3_7 Xa3_6 Xa3_5 Xa3_4 Xa3_3 Xa3_2 Xa3_1 Xa3_0
-19 Xa3_15 Xa3_14 Xa3_13 Xa3_12 Xa3_11 Xa3_10 Xa3_9 Xa3_8
-20 Ya3_7 Ya3_6 Ya3_5 Ya3_4 Ya3_3 Ya3_2 Ya3_1 Ya3_0
-21 Ya3_15 Ya3_14 Ya3_13 Ya3_12 Ya3_11 Ya3_10 Ya3_9 Ya3_8
-22 LFB3 Zs3_6 Zs3_5 Zs3_4 Zs3_3 Zs3_2 Zs3_1 Zs3_0
-
-23 Xa4_7 Xa4_6 Xa4_5 Xa4_4 Xa4_3 Xa4_2 Xa4_1 Xa4_0
-24 Xa4_15 Xa4_14 Xa4_13 Xa4_12 Xa4_11 Xa4_10 Xa4_9 Xa4_8
-25 Ya4_7 Ya4_6 Ya4_5 Ya4_4 Ya4_3 Ya4_2 Ya4_1 Ya4_0
-26 Ya4_15 Ya4_14 Ya4_13 Ya4_12 Ya4_11 Ya4_10 Ya4_9 Ya4_8
-27 LFB4 Zs4_6 Zs4_5 Zs4_4 Zs4_3 Zs4_2 Zs4_1 Zs4_0
-
-
-SW1-SW6: SW ON/OFF status
-Xan_15-0(16bit):X Absolute data of the "n"th finger
-Yan_15-0(16bit):Y Absolute data of the "n"th finger
-Zsn_6-0(7bit): Operation area of the "n"th finger
-
-
-StickPointer data byte
-------------------
- b7 b6 b5 b4 b3 b2 b1 b0
-Byte1 1 1 1 0 1 SW3 SW2 SW1
-Byte2 X7 X6 X5 X4 X3 X2 X1 X0
-Byte3 X15 X14 X13 X12 X11 X10 X9 X8
-Byte4 Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
-Byte5 Y15 Y14 Y13 Y12 Y11 Y10 Y9 Y8
-Byte6 Z7 Z6 Z5 Z4 Z3 Z2 Z1 Z0
-Byte7 T&P Z14 Z13 Z12 Z11 Z10 Z9 Z8
-
-SW1-SW3: SW ON/OFF status
-Xn_15-0(16bit):X Absolute data
-Yn_15-0(16bit):Y Absolute data
-Zn_14-0(15bit):Z
diff --git a/Documentation/hid/hid-sensor.rst b/Documentation/hid/hid-sensor.rst
new file mode 100644
index 000000000000..758972e34971
--- /dev/null
+++ b/Documentation/hid/hid-sensor.rst
@@ -0,0 +1,242 @@
+=====================
+HID Sensors Framework
+=====================
+HID sensor framework provides necessary interfaces to implement sensor drivers,
+which are connected to a sensor hub. The sensor hub is a HID device and it provides
+a report descriptor conforming to HID 1.12 sensor usage tables.
+
+Description from the HID 1.12 "HID Sensor Usages" specification:
+"Standardization of HID usages for sensors would allow (but not require) sensor
+hardware vendors to provide a consistent Plug And Play interface at the USB boundary,
+thereby enabling some operating systems to incorporate common device drivers that
+could be reused between vendors, alleviating any need for the vendors to provide
+the drivers themselves."
+
+This specification describes many usage IDs, which describe the type of sensor
+and also the individual data fields. Each sensor can have variable number of
+data fields. The length and order is specified in the report descriptor. For
+example a part of report descriptor can look like::
+
+ INPUT(1)[INPUT]
+ ..
+ Field(2)
+ Physical(0020.0073)
+ Usage(1)
+ 0020.045f
+ Logical Minimum(-32767)
+ Logical Maximum(32767)
+ Report Size(8)
+ Report Count(1)
+ Report Offset(16)
+ Flags(Variable Absolute)
+ ..
+ ..
+
+The report is indicating "sensor page (0x20)" contains an accelerometer-3D (0x73).
+This accelerometer-3D has some fields. Here for example field 2 is motion intensity
+(0x045f) with a logical minimum value of -32767 and logical maximum of 32767. The
+order of fields and length of each field is important as the input event raw
+data will use this format.
+
+
+Implementation
+==============
+
+This specification defines many different types of sensors with different sets of
+data fields. It is difficult to have a common input event to user space applications,
+for different sensors. For example an accelerometer can send X,Y and Z data, whereas
+an ambient light sensor can send illumination data.
+So the implementation has two parts:
+
+- Core hid driver
+- Individual sensor processing part (sensor drivers)
+
+Core driver
+-----------
+The core driver registers (hid-sensor-hub) registers as a HID driver. It parses
+report descriptors and identifies all the sensors present. It adds an MFD device
+with name HID-SENSOR-xxxx (where xxxx is usage id from the specification).
+
+For example:
+
+HID-SENSOR-200073 is registered for an Accelerometer 3D driver.
+
+So if any driver with this name is inserted, then the probe routine for that
+function will be called. So an accelerometer processing driver can register
+with this name and will be probed if there is an accelerometer-3D detected.
+
+The core driver provides a set of APIs which can be used by the processing
+drivers to register and get events for that usage id. Also it provides parsing
+functions, which get and set each input/feature/output report.
+
+Individual sensor processing part (sensor drivers)
+--------------------------------------------------
+
+The processing driver will use an interface provided by the core driver to parse
+the report and get the indexes of the fields and also can get events. This driver
+can use IIO interface to use the standard ABI defined for a type of sensor.
+
+
+Core driver Interface
+=====================
+
+Callback structure::
+
+ Each processing driver can use this structure to set some callbacks.
+ int (*suspend)(..): Callback when HID suspend is received
+ int (*resume)(..): Callback when HID resume is received
+ int (*capture_sample)(..): Capture a sample for one of its data fields
+ int (*send_event)(..): One complete event is received which can have
+ multiple data fields.
+
+Registration functions::
+
+ int sensor_hub_register_callback(struct hid_sensor_hub_device *hsdev,
+ u32 usage_id,
+ struct hid_sensor_hub_callbacks *usage_callback):
+
+Registers callbacks for an usage id. The callback functions are not allowed
+to sleep::
+
+
+ int sensor_hub_remove_callback(struct hid_sensor_hub_device *hsdev,
+ u32 usage_id):
+
+Removes callbacks for an usage id.
+
+
+Parsing function::
+
+ int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
+ u8 type,
+ u32 usage_id, u32 attr_usage_id,
+ struct hid_sensor_hub_attribute_info *info);
+
+A processing driver can look for some field of interest and check if it exists
+in a report descriptor. If it exists it will store necessary information
+so that fields can be set or get individually.
+These indexes avoid searching every time and getting field index to get or set.
+
+
+Set Feature report::
+
+ int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
+ u32 field_index, s32 value);
+
+This interface is used to set a value for a field in feature report. For example
+if there is a field report_interval, which is parsed by a call to
+sensor_hub_input_get_attribute_info before, then it can directly set that
+individual field::
+
+
+ int sensor_hub_get_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
+ u32 field_index, s32 *value);
+
+This interface is used to get a value for a field in input report. For example
+if there is a field report_interval, which is parsed by a call to
+sensor_hub_input_get_attribute_info before, then it can directly get that
+individual field value::
+
+
+ int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev,
+ u32 usage_id,
+ u32 attr_usage_id, u32 report_id);
+
+This is used to get a particular field value through input reports. For example
+accelerometer wants to poll X axis value, then it can call this function with
+the usage id of X axis. HID sensors can provide events, so this is not necessary
+to poll for any field. If there is some new sample, the core driver will call
+registered callback function to process the sample.
+
+
+----------
+
+HID Custom and generic Sensors
+------------------------------
+
+
+HID Sensor specification defines two special sensor usage types. Since they
+don't represent a standard sensor, it is not possible to define using Linux IIO
+type interfaces.
+The purpose of these sensors is to extend the functionality or provide a
+way to obfuscate the data being communicated by a sensor. Without knowing the
+mapping between the data and its encapsulated form, it is difficult for
+an application/driver to determine what data is being communicated by the sensor.
+This allows some differentiating use cases, where vendor can provide applications.
+Some common use cases are debug other sensors or to provide some events like
+keyboard attached/detached or lid open/close.
+
+To allow application to utilize these sensors, here they are exported uses sysfs
+attribute groups, attributes and misc device interface.
+
+An example of this representation on sysfs::
+
+ /sys/devices/pci0000:00/INT33C2:00/i2c-0/i2c-INT33D1:00/0018:8086:09FA.0001/HID-SENSOR-2000e1.6.auto$ tree -R
+ .
+ │   ├── enable_sensor
+ │   │   ├── feature-0-200316
+ │   │   │   ├── feature-0-200316-maximum
+ │   │   │   ├── feature-0-200316-minimum
+ │   │   │   ├── feature-0-200316-name
+ │   │   │   ├── feature-0-200316-size
+ │   │   │   ├── feature-0-200316-unit-expo
+ │   │   │   ├── feature-0-200316-units
+ │   │   │   ├── feature-0-200316-value
+ │   │   ├── feature-1-200201
+ │   │   │   ├── feature-1-200201-maximum
+ │   │   │   ├── feature-1-200201-minimum
+ │   │   │   ├── feature-1-200201-name
+ │   │   │   ├── feature-1-200201-size
+ │   │   │   ├── feature-1-200201-unit-expo
+ │   │   │   ├── feature-1-200201-units
+ │   │   │   ├── feature-1-200201-value
+ │   │   ├── input-0-200201
+ │   │   │   ├── input-0-200201-maximum
+ │   │   │   ├── input-0-200201-minimum
+ │   │   │   ├── input-0-200201-name
+ │   │   │   ├── input-0-200201-size
+ │   │   │   ├── input-0-200201-unit-expo
+ │   │   │   ├── input-0-200201-units
+ │   │   │   ├── input-0-200201-value
+ │   │   ├── input-1-200202
+ │   │   │   ├── input-1-200202-maximum
+ │   │   │   ├── input-1-200202-minimum
+ │   │   │   ├── input-1-200202-name
+ │   │   │   ├── input-1-200202-size
+ │   │   │   ├── input-1-200202-unit-expo
+ │   │   │   ├── input-1-200202-units
+ │   │   │   ├── input-1-200202-value
+
+Here there is a custom sensors with four fields, two feature and two inputs.
+Each field is represented by a set of attributes. All fields except the "value"
+are read only. The value field is a RW field.
+
+Example::
+
+ /sys/bus/platform/devices/HID-SENSOR-2000e1.6.auto/feature-0-200316$ grep -r . *
+ feature-0-200316-maximum:6
+ feature-0-200316-minimum:0
+ feature-0-200316-name:property-reporting-state
+ feature-0-200316-size:1
+ feature-0-200316-unit-expo:0
+ feature-0-200316-units:25
+ feature-0-200316-value:1
+
+How to enable such sensor?
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default sensor can be power gated. To enable sysfs attribute "enable" can be
+used::
+
+ $ echo 1 > enable_sensor
+
+Once enabled and powered on, sensor can report value using HID reports.
+These reports are pushed using misc device interface in a FIFO order::
+
+ /dev$ tree | grep HID-SENSOR-2000e1.6.auto
+ │   │   │   ├── 10:53 -> ../HID-SENSOR-2000e1.6.auto
+ │   ├── HID-SENSOR-2000e1.6.auto
+
+Each reports can be of variable length preceded by a header. This header
+consist of a 32 bit usage id, 64 bit time stamp and 32 bit length field of raw
+data.
diff --git a/Documentation/hid/hid-sensor.txt b/Documentation/hid/hid-sensor.txt
deleted file mode 100644
index b287752a31cd..000000000000
--- a/Documentation/hid/hid-sensor.txt
+++ /dev/null
@@ -1,224 +0,0 @@
-
-HID Sensors Framework
-======================
-HID sensor framework provides necessary interfaces to implement sensor drivers,
-which are connected to a sensor hub. The sensor hub is a HID device and it provides
-a report descriptor conforming to HID 1.12 sensor usage tables.
-
-Description from the HID 1.12 "HID Sensor Usages" specification:
-"Standardization of HID usages for sensors would allow (but not require) sensor
-hardware vendors to provide a consistent Plug And Play interface at the USB boundary,
-thereby enabling some operating systems to incorporate common device drivers that
-could be reused between vendors, alleviating any need for the vendors to provide
-the drivers themselves."
-
-This specification describes many usage IDs, which describe the type of sensor
-and also the individual data fields. Each sensor can have variable number of
-data fields. The length and order is specified in the report descriptor. For
-example a part of report descriptor can look like:
-
- INPUT(1)[INPUT]
- ..
- Field(2)
- Physical(0020.0073)
- Usage(1)
- 0020.045f
- Logical Minimum(-32767)
- Logical Maximum(32767)
- Report Size(8)
- Report Count(1)
- Report Offset(16)
- Flags(Variable Absolute)
-..
-..
-
-The report is indicating "sensor page (0x20)" contains an accelerometer-3D (0x73).
-This accelerometer-3D has some fields. Here for example field 2 is motion intensity
-(0x045f) with a logical minimum value of -32767 and logical maximum of 32767. The
-order of fields and length of each field is important as the input event raw
-data will use this format.
-
-
-Implementation
-=================
-
-This specification defines many different types of sensors with different sets of
-data fields. It is difficult to have a common input event to user space applications,
-for different sensors. For example an accelerometer can send X,Y and Z data, whereas
-an ambient light sensor can send illumination data.
-So the implementation has two parts:
-- Core hid driver
-- Individual sensor processing part (sensor drivers)
-
-Core driver
------------
-The core driver registers (hid-sensor-hub) registers as a HID driver. It parses
-report descriptors and identifies all the sensors present. It adds an MFD device
-with name HID-SENSOR-xxxx (where xxxx is usage id from the specification).
-For example
-HID-SENSOR-200073 is registered for an Accelerometer 3D driver.
-So if any driver with this name is inserted, then the probe routine for that
-function will be called. So an accelerometer processing driver can register
-with this name and will be probed if there is an accelerometer-3D detected.
-
-The core driver provides a set of APIs which can be used by the processing
-drivers to register and get events for that usage id. Also it provides parsing
-functions, which get and set each input/feature/output report.
-
-Individual sensor processing part (sensor drivers)
------------
-The processing driver will use an interface provided by the core driver to parse
-the report and get the indexes of the fields and also can get events. This driver
-can use IIO interface to use the standard ABI defined for a type of sensor.
-
-
-Core driver Interface
-=====================
-
-Callback structure:
-Each processing driver can use this structure to set some callbacks.
- int (*suspend)(..): Callback when HID suspend is received
- int (*resume)(..): Callback when HID resume is received
- int (*capture_sample)(..): Capture a sample for one of its data fields
- int (*send_event)(..): One complete event is received which can have
- multiple data fields.
-
-Registration functions:
-int sensor_hub_register_callback(struct hid_sensor_hub_device *hsdev,
- u32 usage_id,
- struct hid_sensor_hub_callbacks *usage_callback):
-
-Registers callbacks for an usage id. The callback functions are not allowed
-to sleep.
-
-
-int sensor_hub_remove_callback(struct hid_sensor_hub_device *hsdev,
- u32 usage_id):
-
-Removes callbacks for an usage id.
-
-
-Parsing function:
-int sensor_hub_input_get_attribute_info(struct hid_sensor_hub_device *hsdev,
- u8 type,
- u32 usage_id, u32 attr_usage_id,
- struct hid_sensor_hub_attribute_info *info);
-
-A processing driver can look for some field of interest and check if it exists
-in a report descriptor. If it exists it will store necessary information
-so that fields can be set or get individually.
-These indexes avoid searching every time and getting field index to get or set.
-
-
-Set Feature report
-int sensor_hub_set_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
- u32 field_index, s32 value);
-
-This interface is used to set a value for a field in feature report. For example
-if there is a field report_interval, which is parsed by a call to
-sensor_hub_input_get_attribute_info before, then it can directly set that individual
-field.
-
-
-int sensor_hub_get_feature(struct hid_sensor_hub_device *hsdev, u32 report_id,
- u32 field_index, s32 *value);
-
-This interface is used to get a value for a field in input report. For example
-if there is a field report_interval, which is parsed by a call to
-sensor_hub_input_get_attribute_info before, then it can directly get that individual
-field value.
-
-
-int sensor_hub_input_attr_get_raw_value(struct hid_sensor_hub_device *hsdev,
- u32 usage_id,
- u32 attr_usage_id, u32 report_id);
-
-This is used to get a particular field value through input reports. For example
-accelerometer wants to poll X axis value, then it can call this function with
-the usage id of X axis. HID sensors can provide events, so this is not necessary
-to poll for any field. If there is some new sample, the core driver will call
-registered callback function to process the sample.
-
-
-----------
-
-HID Custom and generic Sensors
-
-HID Sensor specification defines two special sensor usage types. Since they
-don't represent a standard sensor, it is not possible to define using Linux IIO
-type interfaces.
-The purpose of these sensors is to extend the functionality or provide a
-way to obfuscate the data being communicated by a sensor. Without knowing the
-mapping between the data and its encapsulated form, it is difficult for
-an application/driver to determine what data is being communicated by the sensor.
-This allows some differentiating use cases, where vendor can provide applications.
-Some common use cases are debug other sensors or to provide some events like
-keyboard attached/detached or lid open/close.
-
-To allow application to utilize these sensors, here they are exported uses sysfs
-attribute groups, attributes and misc device interface.
-
-An example of this representation on sysfs:
-/sys/devices/pci0000:00/INT33C2:00/i2c-0/i2c-INT33D1:00/0018:8086:09FA.0001/HID-SENSOR-2000e1.6.auto$ tree -R
-.
-????????? enable_sensor
-????????? feature-0-200316
-??????? ????????? feature-0-200316-maximum
-??????? ????????? feature-0-200316-minimum
-??????? ????????? feature-0-200316-name
-??????? ????????? feature-0-200316-size
-??????? ????????? feature-0-200316-unit-expo
-??????? ????????? feature-0-200316-units
-??????? ????????? feature-0-200316-value
-????????? feature-1-200201
-??????? ????????? feature-1-200201-maximum
-??????? ????????? feature-1-200201-minimum
-??????? ????????? feature-1-200201-name
-??????? ????????? feature-1-200201-size
-??????? ????????? feature-1-200201-unit-expo
-??????? ????????? feature-1-200201-units
-??????? ????????? feature-1-200201-value
-????????? input-0-200201
-??????? ????????? input-0-200201-maximum
-??????? ????????? input-0-200201-minimum
-??????? ????????? input-0-200201-name
-??????? ????????? input-0-200201-size
-??????? ????????? input-0-200201-unit-expo
-??????? ????????? input-0-200201-units
-??????? ????????? input-0-200201-value
-????????? input-1-200202
-??????? ????????? input-1-200202-maximum
-??????? ????????? input-1-200202-minimum
-??????? ????????? input-1-200202-name
-??????? ????????? input-1-200202-size
-??????? ????????? input-1-200202-unit-expo
-??????? ????????? input-1-200202-units
-??????? ????????? input-1-200202-value
-
-Here there is a custom sensors with four fields, two feature and two inputs.
-Each field is represented by a set of attributes. All fields except the "value"
-are read only. The value field is a RW field.
-Example
-/sys/bus/platform/devices/HID-SENSOR-2000e1.6.auto/feature-0-200316$ grep -r . *
-feature-0-200316-maximum:6
-feature-0-200316-minimum:0
-feature-0-200316-name:property-reporting-state
-feature-0-200316-size:1
-feature-0-200316-unit-expo:0
-feature-0-200316-units:25
-feature-0-200316-value:1
-
-How to enable such sensor?
-By default sensor can be power gated. To enable sysfs attribute "enable" can be
-used.
-$ echo 1 > enable_sensor
-
-Once enabled and powered on, sensor can report value using HID reports.
-These reports are pushed using misc device interface in a FIFO order.
-/dev$ tree | grep HID-SENSOR-2000e1.6.auto
-??????? ????????? 10:53 -> ../HID-SENSOR-2000e1.6.auto
-????????? HID-SENSOR-2000e1.6.auto
-
-Each reports can be of variable length preceded by a header. This header
-consist of a 32 bit usage id, 64 bit time stamp and 32 bit length field of raw
-data.
diff --git a/Documentation/hid/hid-transport.rst b/Documentation/hid/hid-transport.rst
new file mode 100644
index 000000000000..0fe526f36db6
--- /dev/null
+++ b/Documentation/hid/hid-transport.rst
@@ -0,0 +1,359 @@
+=========================
+HID I/O Transport Drivers
+=========================
+
+The HID subsystem is independent of the underlying transport driver. Initially,
+only USB was supported, but other specifications adopted the HID design and
+provided new transport drivers. The kernel includes at least support for USB,
+Bluetooth, I2C and user-space I/O drivers.
+
+1) HID Bus
+==========
+
+The HID subsystem is designed as a bus. Any I/O subsystem may provide HID
+devices and register them with the HID bus. HID core then loads generic device
+drivers on top of it. The transport drivers are responsible of raw data
+transport and device setup/management. HID core is responsible of
+report-parsing, report interpretation and the user-space API. Device specifics
+and quirks are handled by all layers depending on the quirk.
+
+::
+
+ +-----------+ +-----------+ +-----------+ +-----------+
+ | Device #1 | | Device #i | | Device #j | | Device #k |
+ +-----------+ +-----------+ +-----------+ +-----------+
+ \\ // \\ //
+ +------------+ +------------+
+ | I/O Driver | | I/O Driver |
+ +------------+ +------------+
+ || ||
+ +------------------+ +------------------+
+ | Transport Driver | | Transport Driver |
+ +------------------+ +------------------+
+ \___ ___/
+ \ /
+ +----------------+
+ | HID Core |
+ +----------------+
+ / | | \
+ / | | \
+ ____________/ | | \_________________
+ / | | \
+ / | | \
+ +----------------+ +-----------+ +------------------+ +------------------+
+ | Generic Driver | | MT Driver | | Custom Driver #1 | | Custom Driver #2 |
+ +----------------+ +-----------+ +------------------+ +------------------+
+
+Example Drivers:
+
+ - I/O: USB, I2C, Bluetooth-l2cap
+ - Transport: USB-HID, I2C-HID, BT-HIDP
+
+Everything below "HID Core" is simplified in this graph as it is only of
+interest to HID device drivers. Transport drivers do not need to know the
+specifics.
+
+1.1) Device Setup
+-----------------
+
+I/O drivers normally provide hotplug detection or device enumeration APIs to the
+transport drivers. Transport drivers use this to find any suitable HID device.
+They allocate HID device objects and register them with HID core. Transport
+drivers are not required to register themselves with HID core. HID core is never
+aware of which transport drivers are available and is not interested in it. It
+is only interested in devices.
+
+Transport drivers attach a constant "struct hid_ll_driver" object with each
+device. Once a device is registered with HID core, the callbacks provided via
+this struct are used by HID core to communicate with the device.
+
+Transport drivers are responsible of detecting device failures and unplugging.
+HID core will operate a device as long as it is registered regardless of any
+device failures. Once transport drivers detect unplug or failure events, they
+must unregister the device from HID core and HID core will stop using the
+provided callbacks.
+
+1.2) Transport Driver Requirements
+----------------------------------
+
+The terms "asynchronous" and "synchronous" in this document describe the
+transmission behavior regarding acknowledgements. An asynchronous channel must
+not perform any synchronous operations like waiting for acknowledgements or
+verifications. Generally, HID calls operating on asynchronous channels must be
+running in atomic-context just fine.
+On the other hand, synchronous channels can be implemented by the transport
+driver in whatever way they like. They might just be the same as asynchronous
+channels, but they can also provide acknowledgement reports, automatic
+retransmission on failure, etc. in a blocking manner. If such functionality is
+required on asynchronous channels, a transport-driver must implement that via
+its own worker threads.
+
+HID core requires transport drivers to follow a given design. A Transport
+driver must provide two bi-directional I/O channels to each HID device. These
+channels must not necessarily be bi-directional in the hardware itself. A
+transport driver might just provide 4 uni-directional channels. Or it might
+multiplex all four on a single physical channel. However, in this document we
+will describe them as two bi-directional channels as they have several
+properties in common.
+
+ - Interrupt Channel (intr): The intr channel is used for asynchronous data
+ reports. No management commands or data acknowledgements are sent on this
+ channel. Any unrequested incoming or outgoing data report must be sent on
+ this channel and is never acknowledged by the remote side. Devices usually
+ send their input events on this channel. Outgoing events are normally
+ not send via intr, except if high throughput is required.
+ - Control Channel (ctrl): The ctrl channel is used for synchronous requests and
+ device management. Unrequested data input events must not be sent on this
+ channel and are normally ignored. Instead, devices only send management
+ events or answers to host requests on this channel.
+ The control-channel is used for direct blocking queries to the device
+ independent of any events on the intr-channel.
+ Outgoing reports are usually sent on the ctrl channel via synchronous
+ SET_REPORT requests.
+
+Communication between devices and HID core is mostly done via HID reports. A
+report can be of one of three types:
+
+ - INPUT Report: Input reports provide data from device to host. This
+ data may include button events, axis events, battery status or more. This
+ data is generated by the device and sent to the host with or without
+ requiring explicit requests. Devices can choose to send data continuously or
+ only on change.
+ - OUTPUT Report: Output reports change device states. They are sent from host
+ to device and may include LED requests, rumble requests or more. Output
+ reports are never sent from device to host, but a host can retrieve their
+ current state.
+ Hosts may choose to send output reports either continuously or only on
+ change.
+ - FEATURE Report: Feature reports are used for specific static device features
+ and never reported spontaneously. A host can read and/or write them to access
+ data like battery-state or device-settings.
+ Feature reports are never sent without requests. A host must explicitly set
+ or retrieve a feature report. This also means, feature reports are never sent
+ on the intr channel as this channel is asynchronous.
+
+INPUT and OUTPUT reports can be sent as pure data reports on the intr channel.
+For INPUT reports this is the usual operational mode. But for OUTPUT reports,
+this is rarely done as OUTPUT reports are normally quite scarce. But devices are
+free to make excessive use of asynchronous OUTPUT reports (for instance, custom
+HID audio speakers make great use of it).
+
+Plain reports must not be sent on the ctrl channel, though. Instead, the ctrl
+channel provides synchronous GET/SET_REPORT requests. Plain reports are only
+allowed on the intr channel and are the only means of data there.
+
+ - GET_REPORT: A GET_REPORT request has a report ID as payload and is sent
+ from host to device. The device must answer with a data report for the
+ requested report ID on the ctrl channel as a synchronous acknowledgement.
+ Only one GET_REPORT request can be pending for each device. This restriction
+ is enforced by HID core as several transport drivers don't allow multiple
+ simultaneous GET_REPORT requests.
+ Note that data reports which are sent as answer to a GET_REPORT request are
+ not handled as generic device events. That is, if a device does not operate
+ in continuous data reporting mode, an answer to GET_REPORT does not replace
+ the raw data report on the intr channel on state change.
+ GET_REPORT is only used by custom HID device drivers to query device state.
+ Normally, HID core caches any device state so this request is not necessary
+ on devices that follow the HID specs except during device initialization to
+ retrieve the current state.
+ GET_REPORT requests can be sent for any of the 3 report types and shall
+ return the current report state of the device. However, OUTPUT reports as
+ payload may be blocked by the underlying transport driver if the
+ specification does not allow them.
+ - SET_REPORT: A SET_REPORT request has a report ID plus data as payload. It is
+ sent from host to device and a device must update it's current report state
+ according to the given data. Any of the 3 report types can be used. However,
+ INPUT reports as payload might be blocked by the underlying transport driver
+ if the specification does not allow them.
+ A device must answer with a synchronous acknowledgement. However, HID core
+ does not require transport drivers to forward this acknowledgement to HID
+ core.
+ Same as for GET_REPORT, only one SET_REPORT can be pending at a time. This
+ restriction is enforced by HID core as some transport drivers do not support
+ multiple synchronous SET_REPORT requests.
+
+Other ctrl-channel requests are supported by USB-HID but are not available
+(or deprecated) in most other transport level specifications:
+
+ - GET/SET_IDLE: Only used by USB-HID and I2C-HID.
+ - GET/SET_PROTOCOL: Not used by HID core.
+ - RESET: Used by I2C-HID, not hooked up in HID core.
+ - SET_POWER: Used by I2C-HID, not hooked up in HID core.
+
+2) HID API
+==========
+
+2.1) Initialization
+-------------------
+
+Transport drivers normally use the following procedure to register a new device
+with HID core::
+
+ struct hid_device *hid;
+ int ret;
+
+ hid = hid_allocate_device();
+ if (IS_ERR(hid)) {
+ ret = PTR_ERR(hid);
+ goto err_<...>;
+ }
+
+ strscpy(hid->name, <device-name-src>, sizeof(hid->name));
+ strscpy(hid->phys, <device-phys-src>, sizeof(hid->phys));
+ strscpy(hid->uniq, <device-uniq-src>, sizeof(hid->uniq));
+
+ hid->ll_driver = &custom_ll_driver;
+ hid->bus = <device-bus>;
+ hid->vendor = <device-vendor>;
+ hid->product = <device-product>;
+ hid->version = <device-version>;
+ hid->country = <device-country>;
+ hid->dev.parent = <pointer-to-parent-device>;
+ hid->driver_data = <transport-driver-data-field>;
+
+ ret = hid_add_device(hid);
+ if (ret)
+ goto err_<...>;
+
+Once hid_add_device() is entered, HID core might use the callbacks provided in
+"custom_ll_driver". Note that fields like "country" can be ignored by underlying
+transport-drivers if not supported.
+
+To unregister a device, use::
+
+ hid_destroy_device(hid);
+
+Once hid_destroy_device() returns, HID core will no longer make use of any
+driver callbacks.
+
+2.2) hid_ll_driver operations
+-----------------------------
+
+The available HID callbacks are:
+
+ ::
+
+ int (*start) (struct hid_device *hdev)
+
+ Called from HID device drivers once they want to use the device. Transport
+ drivers can choose to setup their device in this callback. However, normally
+ devices are already set up before transport drivers register them to HID core
+ so this is mostly only used by USB-HID.
+
+ ::
+
+ void (*stop) (struct hid_device *hdev)
+
+ Called from HID device drivers once they are done with a device. Transport
+ drivers can free any buffers and deinitialize the device. But note that
+ ->start() might be called again if another HID device driver is loaded on the
+ device.
+
+ Transport drivers are free to ignore it and deinitialize devices after they
+ destroyed them via hid_destroy_device().
+
+ ::
+
+ int (*open) (struct hid_device *hdev)
+
+ Called from HID device drivers once they are interested in data reports.
+ Usually, while user-space didn't open any input API/etc., device drivers are
+ not interested in device data and transport drivers can put devices asleep.
+ However, once ->open() is called, transport drivers must be ready for I/O.
+ ->open() calls are nested for each client that opens the HID device.
+
+ ::
+
+ void (*close) (struct hid_device *hdev)
+
+ Called from HID device drivers after ->open() was called but they are no
+ longer interested in device reports. (Usually if user-space closed any input
+ devices of the driver).
+
+ Transport drivers can put devices asleep and terminate any I/O of all
+ ->open() calls have been followed by a ->close() call. However, ->start() may
+ be called again if the device driver is interested in input reports again.
+
+ ::
+
+ int (*parse) (struct hid_device *hdev)
+
+ Called once during device setup after ->start() has been called. Transport
+ drivers must read the HID report-descriptor from the device and tell HID core
+ about it via hid_parse_report().
+
+ ::
+
+ int (*power) (struct hid_device *hdev, int level)
+
+ Called by HID core to give PM hints to transport drivers. Usually this is
+ analogical to the ->open() and ->close() hints and redundant.
+
+ ::
+
+ void (*request) (struct hid_device *hdev, struct hid_report *report,
+ int reqtype)
+
+ Send an HID request on the ctrl channel. "report" contains the report that
+ should be sent and "reqtype" the request type. Request-type can be
+ HID_REQ_SET_REPORT or HID_REQ_GET_REPORT.
+
+ This callback is optional. If not provided, HID core will assemble a raw
+ report following the HID specs and send it via the ->raw_request() callback.
+ The transport driver is free to implement this asynchronously.
+
+ ::
+
+ int (*wait) (struct hid_device *hdev)
+
+ Used by HID core before calling ->request() again. A transport driver can use
+ it to wait for any pending requests to complete if only one request is
+ allowed at a time.
+
+ ::
+
+ int (*raw_request) (struct hid_device *hdev, unsigned char reportnum,
+ __u8 *buf, size_t count, unsigned char rtype,
+ int reqtype)
+
+ Same as ->request() but provides the report as raw buffer. This request shall
+ be synchronous. A transport driver must not use ->wait() to complete such
+ requests. This request is mandatory and hid core will reject the device if
+ it is missing.
+
+ ::
+
+ int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len)
+
+ Send raw output report via intr channel. Used by some HID device drivers
+ which require high throughput for outgoing requests on the intr channel. This
+ must not cause SET_REPORT calls! This must be implemented as asynchronous
+ output report on the intr channel!
+
+ ::
+
+ int (*idle) (struct hid_device *hdev, int report, int idle, int reqtype)
+
+ Perform SET/GET_IDLE request. Only used by USB-HID, do not implement!
+
+2.3) Data Path
+--------------
+
+Transport drivers are responsible of reading data from I/O devices. They must
+handle any I/O-related state-tracking themselves. HID core does not implement
+protocol handshakes or other management commands which can be required by the
+given HID transport specification.
+
+Every raw data packet read from a device must be fed into HID core via
+hid_input_report(). You must specify the channel-type (intr or ctrl) and report
+type (input/output/feature). Under normal conditions, only input reports are
+provided via this API.
+
+Responses to GET_REPORT requests via ->request() must also be provided via this
+API. Responses to ->raw_request() are synchronous and must be intercepted by the
+transport driver and not passed to hid_input_report().
+Acknowledgements to SET_REPORT requests are not of interest to HID core.
+
+----------------------------------------------------
+
+Written 2013, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hid/hid-transport.txt b/Documentation/hid/hid-transport.txt
deleted file mode 100644
index 4f41d67f1b4b..000000000000
--- a/Documentation/hid/hid-transport.txt
+++ /dev/null
@@ -1,317 +0,0 @@
- HID I/O Transport Drivers
- ===========================
-
-The HID subsystem is independent of the underlying transport driver. Initially,
-only USB was supported, but other specifications adopted the HID design and
-provided new transport drivers. The kernel includes at least support for USB,
-Bluetooth, I2C and user-space I/O drivers.
-
-1) HID Bus
-==========
-
-The HID subsystem is designed as a bus. Any I/O subsystem may provide HID
-devices and register them with the HID bus. HID core then loads generic device
-drivers on top of it. The transport drivers are responsible of raw data
-transport and device setup/management. HID core is responsible of
-report-parsing, report interpretation and the user-space API. Device specifics
-and quirks are handled by all layers depending on the quirk.
-
- +-----------+ +-----------+ +-----------+ +-----------+
- | Device #1 | | Device #i | | Device #j | | Device #k |
- +-----------+ +-----------+ +-----------+ +-----------+
- \\ // \\ //
- +------------+ +------------+
- | I/O Driver | | I/O Driver |
- +------------+ +------------+
- || ||
- +------------------+ +------------------+
- | Transport Driver | | Transport Driver |
- +------------------+ +------------------+
- \___ ___/
- \ /
- +----------------+
- | HID Core |
- +----------------+
- / | | \
- / | | \
- ____________/ | | \_________________
- / | | \
- / | | \
- +----------------+ +-----------+ +------------------+ +------------------+
- | Generic Driver | | MT Driver | | Custom Driver #1 | | Custom Driver #2 |
- +----------------+ +-----------+ +------------------+ +------------------+
-
-Example Drivers:
- I/O: USB, I2C, Bluetooth-l2cap
- Transport: USB-HID, I2C-HID, BT-HIDP
-
-Everything below "HID Core" is simplified in this graph as it is only of
-interest to HID device drivers. Transport drivers do not need to know the
-specifics.
-
-1.1) Device Setup
------------------
-
-I/O drivers normally provide hotplug detection or device enumeration APIs to the
-transport drivers. Transport drivers use this to find any suitable HID device.
-They allocate HID device objects and register them with HID core. Transport
-drivers are not required to register themselves with HID core. HID core is never
-aware of which transport drivers are available and is not interested in it. It
-is only interested in devices.
-
-Transport drivers attach a constant "struct hid_ll_driver" object with each
-device. Once a device is registered with HID core, the callbacks provided via
-this struct are used by HID core to communicate with the device.
-
-Transport drivers are responsible of detecting device failures and unplugging.
-HID core will operate a device as long as it is registered regardless of any
-device failures. Once transport drivers detect unplug or failure events, they
-must unregister the device from HID core and HID core will stop using the
-provided callbacks.
-
-1.2) Transport Driver Requirements
-----------------------------------
-
-The terms "asynchronous" and "synchronous" in this document describe the
-transmission behavior regarding acknowledgements. An asynchronous channel must
-not perform any synchronous operations like waiting for acknowledgements or
-verifications. Generally, HID calls operating on asynchronous channels must be
-running in atomic-context just fine.
-On the other hand, synchronous channels can be implemented by the transport
-driver in whatever way they like. They might just be the same as asynchronous
-channels, but they can also provide acknowledgement reports, automatic
-retransmission on failure, etc. in a blocking manner. If such functionality is
-required on asynchronous channels, a transport-driver must implement that via
-its own worker threads.
-
-HID core requires transport drivers to follow a given design. A Transport
-driver must provide two bi-directional I/O channels to each HID device. These
-channels must not necessarily be bi-directional in the hardware itself. A
-transport driver might just provide 4 uni-directional channels. Or it might
-multiplex all four on a single physical channel. However, in this document we
-will describe them as two bi-directional channels as they have several
-properties in common.
-
- - Interrupt Channel (intr): The intr channel is used for asynchronous data
- reports. No management commands or data acknowledgements are sent on this
- channel. Any unrequested incoming or outgoing data report must be sent on
- this channel and is never acknowledged by the remote side. Devices usually
- send their input events on this channel. Outgoing events are normally
- not send via intr, except if high throughput is required.
- - Control Channel (ctrl): The ctrl channel is used for synchronous requests and
- device management. Unrequested data input events must not be sent on this
- channel and are normally ignored. Instead, devices only send management
- events or answers to host requests on this channel.
- The control-channel is used for direct blocking queries to the device
- independent of any events on the intr-channel.
- Outgoing reports are usually sent on the ctrl channel via synchronous
- SET_REPORT requests.
-
-Communication between devices and HID core is mostly done via HID reports. A
-report can be of one of three types:
-
- - INPUT Report: Input reports provide data from device to host. This
- data may include button events, axis events, battery status or more. This
- data is generated by the device and sent to the host with or without
- requiring explicit requests. Devices can choose to send data continuously or
- only on change.
- - OUTPUT Report: Output reports change device states. They are sent from host
- to device and may include LED requests, rumble requests or more. Output
- reports are never sent from device to host, but a host can retrieve their
- current state.
- Hosts may choose to send output reports either continuously or only on
- change.
- - FEATURE Report: Feature reports are used for specific static device features
- and never reported spontaneously. A host can read and/or write them to access
- data like battery-state or device-settings.
- Feature reports are never sent without requests. A host must explicitly set
- or retrieve a feature report. This also means, feature reports are never sent
- on the intr channel as this channel is asynchronous.
-
-INPUT and OUTPUT reports can be sent as pure data reports on the intr channel.
-For INPUT reports this is the usual operational mode. But for OUTPUT reports,
-this is rarely done as OUTPUT reports are normally quite scarce. But devices are
-free to make excessive use of asynchronous OUTPUT reports (for instance, custom
-HID audio speakers make great use of it).
-
-Plain reports must not be sent on the ctrl channel, though. Instead, the ctrl
-channel provides synchronous GET/SET_REPORT requests. Plain reports are only
-allowed on the intr channel and are the only means of data there.
-
- - GET_REPORT: A GET_REPORT request has a report ID as payload and is sent
- from host to device. The device must answer with a data report for the
- requested report ID on the ctrl channel as a synchronous acknowledgement.
- Only one GET_REPORT request can be pending for each device. This restriction
- is enforced by HID core as several transport drivers don't allow multiple
- simultaneous GET_REPORT requests.
- Note that data reports which are sent as answer to a GET_REPORT request are
- not handled as generic device events. That is, if a device does not operate
- in continuous data reporting mode, an answer to GET_REPORT does not replace
- the raw data report on the intr channel on state change.
- GET_REPORT is only used by custom HID device drivers to query device state.
- Normally, HID core caches any device state so this request is not necessary
- on devices that follow the HID specs except during device initialization to
- retrieve the current state.
- GET_REPORT requests can be sent for any of the 3 report types and shall
- return the current report state of the device. However, OUTPUT reports as
- payload may be blocked by the underlying transport driver if the
- specification does not allow them.
- - SET_REPORT: A SET_REPORT request has a report ID plus data as payload. It is
- sent from host to device and a device must update it's current report state
- according to the given data. Any of the 3 report types can be used. However,
- INPUT reports as payload might be blocked by the underlying transport driver
- if the specification does not allow them.
- A device must answer with a synchronous acknowledgement. However, HID core
- does not require transport drivers to forward this acknowledgement to HID
- core.
- Same as for GET_REPORT, only one SET_REPORT can be pending at a time. This
- restriction is enforced by HID core as some transport drivers do not support
- multiple synchronous SET_REPORT requests.
-
-Other ctrl-channel requests are supported by USB-HID but are not available
-(or deprecated) in most other transport level specifications:
-
- - GET/SET_IDLE: Only used by USB-HID and I2C-HID.
- - GET/SET_PROTOCOL: Not used by HID core.
- - RESET: Used by I2C-HID, not hooked up in HID core.
- - SET_POWER: Used by I2C-HID, not hooked up in HID core.
-
-2) HID API
-==========
-
-2.1) Initialization
--------------------
-
-Transport drivers normally use the following procedure to register a new device
-with HID core:
-
- struct hid_device *hid;
- int ret;
-
- hid = hid_allocate_device();
- if (IS_ERR(hid)) {
- ret = PTR_ERR(hid);
- goto err_<...>;
- }
-
- strscpy(hid->name, <device-name-src>, sizeof(hid->name));
- strscpy(hid->phys, <device-phys-src>, sizeof(hid->phys));
- strscpy(hid->uniq, <device-uniq-src>, sizeof(hid->uniq));
-
- hid->ll_driver = &custom_ll_driver;
- hid->bus = <device-bus>;
- hid->vendor = <device-vendor>;
- hid->product = <device-product>;
- hid->version = <device-version>;
- hid->country = <device-country>;
- hid->dev.parent = <pointer-to-parent-device>;
- hid->driver_data = <transport-driver-data-field>;
-
- ret = hid_add_device(hid);
- if (ret)
- goto err_<...>;
-
-Once hid_add_device() is entered, HID core might use the callbacks provided in
-"custom_ll_driver". Note that fields like "country" can be ignored by underlying
-transport-drivers if not supported.
-
-To unregister a device, use:
-
- hid_destroy_device(hid);
-
-Once hid_destroy_device() returns, HID core will no longer make use of any
-driver callbacks.
-
-2.2) hid_ll_driver operations
------------------------------
-
-The available HID callbacks are:
- - int (*start) (struct hid_device *hdev)
- Called from HID device drivers once they want to use the device. Transport
- drivers can choose to setup their device in this callback. However, normally
- devices are already set up before transport drivers register them to HID core
- so this is mostly only used by USB-HID.
-
- - void (*stop) (struct hid_device *hdev)
- Called from HID device drivers once they are done with a device. Transport
- drivers can free any buffers and deinitialize the device. But note that
- ->start() might be called again if another HID device driver is loaded on the
- device.
- Transport drivers are free to ignore it and deinitialize devices after they
- destroyed them via hid_destroy_device().
-
- - int (*open) (struct hid_device *hdev)
- Called from HID device drivers once they are interested in data reports.
- Usually, while user-space didn't open any input API/etc., device drivers are
- not interested in device data and transport drivers can put devices asleep.
- However, once ->open() is called, transport drivers must be ready for I/O.
- ->open() calls are nested for each client that opens the HID device.
-
- - void (*close) (struct hid_device *hdev)
- Called from HID device drivers after ->open() was called but they are no
- longer interested in device reports. (Usually if user-space closed any input
- devices of the driver).
- Transport drivers can put devices asleep and terminate any I/O of all
- ->open() calls have been followed by a ->close() call. However, ->start() may
- be called again if the device driver is interested in input reports again.
-
- - int (*parse) (struct hid_device *hdev)
- Called once during device setup after ->start() has been called. Transport
- drivers must read the HID report-descriptor from the device and tell HID core
- about it via hid_parse_report().
-
- - int (*power) (struct hid_device *hdev, int level)
- Called by HID core to give PM hints to transport drivers. Usually this is
- analogical to the ->open() and ->close() hints and redundant.
-
- - void (*request) (struct hid_device *hdev, struct hid_report *report,
- int reqtype)
- Send an HID request on the ctrl channel. "report" contains the report that
- should be sent and "reqtype" the request type. Request-type can be
- HID_REQ_SET_REPORT or HID_REQ_GET_REPORT.
- This callback is optional. If not provided, HID core will assemble a raw
- report following the HID specs and send it via the ->raw_request() callback.
- The transport driver is free to implement this asynchronously.
-
- - int (*wait) (struct hid_device *hdev)
- Used by HID core before calling ->request() again. A transport driver can use
- it to wait for any pending requests to complete if only one request is
- allowed at a time.
-
- - int (*raw_request) (struct hid_device *hdev, unsigned char reportnum,
- __u8 *buf, size_t count, unsigned char rtype,
- int reqtype)
- Same as ->request() but provides the report as raw buffer. This request shall
- be synchronous. A transport driver must not use ->wait() to complete such
- requests. This request is mandatory and hid core will reject the device if
- it is missing.
-
- - int (*output_report) (struct hid_device *hdev, __u8 *buf, size_t len)
- Send raw output report via intr channel. Used by some HID device drivers
- which require high throughput for outgoing requests on the intr channel. This
- must not cause SET_REPORT calls! This must be implemented as asynchronous
- output report on the intr channel!
-
- - int (*idle) (struct hid_device *hdev, int report, int idle, int reqtype)
- Perform SET/GET_IDLE request. Only used by USB-HID, do not implement!
-
-2.3) Data Path
---------------
-
-Transport drivers are responsible of reading data from I/O devices. They must
-handle any I/O-related state-tracking themselves. HID core does not implement
-protocol handshakes or other management commands which can be required by the
-given HID transport specification.
-
-Every raw data packet read from a device must be fed into HID core via
-hid_input_report(). You must specify the channel-type (intr or ctrl) and report
-type (input/output/feature). Under normal conditions, only input reports are
-provided via this API.
-
-Responses to GET_REPORT requests via ->request() must also be provided via this
-API. Responses to ->raw_request() are synchronous and must be intercepted by the
-transport driver and not passed to hid_input_report().
-Acknowledgements to SET_REPORT requests are not of interest to HID core.
-
-----------------------------------------------------
-Written 2013, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hid/hiddev.rst b/Documentation/hid/hiddev.rst
new file mode 100644
index 000000000000..209e6ba4e019
--- /dev/null
+++ b/Documentation/hid/hiddev.rst
@@ -0,0 +1,251 @@
+================================================
+Care and feeding of your Human Interface Devices
+================================================
+
+Introduction
+============
+
+In addition to the normal input type HID devices, USB also uses the
+human interface device protocols for things that are not really human
+interfaces, but have similar sorts of communication needs. The two big
+examples for this are power devices (especially uninterruptable power
+supplies) and monitor control on higher end monitors.
+
+To support these disparate requirements, the Linux USB system provides
+HID events to two separate interfaces:
+* the input subsystem, which converts HID events into normal input
+device interfaces (such as keyboard, mouse and joystick) and a
+normalised event interface - see Documentation/input/input.rst
+* the hiddev interface, which provides fairly raw HID events
+
+The data flow for a HID event produced by a device is something like
+the following::
+
+ usb.c ---> hid-core.c ----> hid-input.c ----> [keyboard/mouse/joystick/event]
+ |
+ |
+ --> hiddev.c ----> POWER / MONITOR CONTROL
+
+In addition, other subsystems (apart from USB) can potentially feed
+events into the input subsystem, but these have no effect on the hid
+device interface.
+
+Using the HID Device Interface
+==============================
+
+The hiddev interface is a char interface using the normal USB major,
+with the minor numbers starting at 96 and finishing at 111. Therefore,
+you need the following commands::
+
+ mknod /dev/usb/hiddev0 c 180 96
+ mknod /dev/usb/hiddev1 c 180 97
+ mknod /dev/usb/hiddev2 c 180 98
+ mknod /dev/usb/hiddev3 c 180 99
+ mknod /dev/usb/hiddev4 c 180 100
+ mknod /dev/usb/hiddev5 c 180 101
+ mknod /dev/usb/hiddev6 c 180 102
+ mknod /dev/usb/hiddev7 c 180 103
+ mknod /dev/usb/hiddev8 c 180 104
+ mknod /dev/usb/hiddev9 c 180 105
+ mknod /dev/usb/hiddev10 c 180 106
+ mknod /dev/usb/hiddev11 c 180 107
+ mknod /dev/usb/hiddev12 c 180 108
+ mknod /dev/usb/hiddev13 c 180 109
+ mknod /dev/usb/hiddev14 c 180 110
+ mknod /dev/usb/hiddev15 c 180 111
+
+So you point your hiddev compliant user-space program at the correct
+interface for your device, and it all just works.
+
+Assuming that you have a hiddev compliant user-space program, of
+course. If you need to write one, read on.
+
+
+The HIDDEV API
+==============
+
+This description should be read in conjunction with the HID
+specification, freely available from http://www.usb.org, and
+conveniently linked of http://www.linux-usb.org.
+
+The hiddev API uses a read() interface, and a set of ioctl() calls.
+
+HID devices exchange data with the host computer using data
+bundles called "reports". Each report is divided into "fields",
+each of which can have one or more "usages". In the hid-core,
+each one of these usages has a single signed 32 bit value.
+
+read():
+-------
+
+This is the event interface. When the HID device's state changes,
+it performs an interrupt transfer containing a report which contains
+the changed value. The hid-core.c module parses the report, and
+returns to hiddev.c the individual usages that have changed within
+the report. In its basic mode, the hiddev will make these individual
+usage changes available to the reader using a struct hiddev_event::
+
+ struct hiddev_event {
+ unsigned hid;
+ signed int value;
+ };
+
+containing the HID usage identifier for the status that changed, and
+the value that it was changed to. Note that the structure is defined
+within <linux/hiddev.h>, along with some other useful #defines and
+structures. The HID usage identifier is a composite of the HID usage
+page shifted to the 16 high order bits ORed with the usage code. The
+behavior of the read() function can be modified using the HIDIOCSFLAG
+ioctl() described below.
+
+
+ioctl():
+--------
+
+This is the control interface. There are a number of controls:
+
+HIDIOCGVERSION
+ - int (read)
+
+ Gets the version code out of the hiddev driver.
+
+HIDIOCAPPLICATION
+ - (none)
+
+This ioctl call returns the HID application usage associated with the
+hid device. The third argument to ioctl() specifies which application
+index to get. This is useful when the device has more than one
+application collection. If the index is invalid (greater or equal to
+the number of application collections this device has) the ioctl
+returns -1. You can find out beforehand how many application
+collections the device has from the num_applications field from the
+hiddev_devinfo structure.
+
+HIDIOCGCOLLECTIONINFO
+ - struct hiddev_collection_info (read/write)
+
+This returns a superset of the information above, providing not only
+application collections, but all the collections the device has. It
+also returns the level the collection lives in the hierarchy.
+The user passes in a hiddev_collection_info struct with the index
+field set to the index that should be returned. The ioctl fills in
+the other fields. If the index is larger than the last collection
+index, the ioctl returns -1 and sets errno to -EINVAL.
+
+HIDIOCGDEVINFO
+ - struct hiddev_devinfo (read)
+
+Gets a hiddev_devinfo structure which describes the device.
+
+HIDIOCGSTRING
+ - struct hiddev_string_descriptor (read/write)
+
+Gets a string descriptor from the device. The caller must fill in the
+"index" field to indicate which descriptor should be returned.
+
+HIDIOCINITREPORT
+ - (none)
+
+Instructs the kernel to retrieve all input and feature report values
+from the device. At this point, all the usage structures will contain
+current values for the device, and will maintain it as the device
+changes. Note that the use of this ioctl is unnecessary in general,
+since later kernels automatically initialize the reports from the
+device at attach time.
+
+HIDIOCGNAME
+ - string (variable length)
+
+Gets the device name
+
+HIDIOCGREPORT
+ - struct hiddev_report_info (write)
+
+Instructs the kernel to get a feature or input report from the device,
+in order to selectively update the usage structures (in contrast to
+INITREPORT).
+
+HIDIOCSREPORT
+ - struct hiddev_report_info (write)
+
+Instructs the kernel to send a report to the device. This report can
+be filled in by the user through HIDIOCSUSAGE calls (below) to fill in
+individual usage values in the report before sending the report in full
+to the device.
+
+HIDIOCGREPORTINFO
+ - struct hiddev_report_info (read/write)
+
+Fills in a hiddev_report_info structure for the user. The report is
+looked up by type (input, output or feature) and id, so these fields
+must be filled in by the user. The ID can be absolute -- the actual
+report id as reported by the device -- or relative --
+HID_REPORT_ID_FIRST for the first report, and (HID_REPORT_ID_NEXT |
+report_id) for the next report after report_id. Without a-priori
+information about report ids, the right way to use this ioctl is to
+use the relative IDs above to enumerate the valid IDs. The ioctl
+returns non-zero when there is no more next ID. The real report ID is
+filled into the returned hiddev_report_info structure.
+
+HIDIOCGFIELDINFO
+ - struct hiddev_field_info (read/write)
+
+Returns the field information associated with a report in a
+hiddev_field_info structure. The user must fill in report_id and
+report_type in this structure, as above. The field_index should also
+be filled in, which should be a number from 0 and maxfield-1, as
+returned from a previous HIDIOCGREPORTINFO call.
+
+HIDIOCGUCODE
+ - struct hiddev_usage_ref (read/write)
+
+Returns the usage_code in a hiddev_usage_ref structure, given that
+given its report type, report id, field index, and index within the
+field have already been filled into the structure.
+
+HIDIOCGUSAGE
+ - struct hiddev_usage_ref (read/write)
+
+Returns the value of a usage in a hiddev_usage_ref structure. The
+usage to be retrieved can be specified as above, or the user can
+choose to fill in the report_type field and specify the report_id as
+HID_REPORT_ID_UNKNOWN. In this case, the hiddev_usage_ref will be
+filled in with the report and field information associated with this
+usage if it is found.
+
+HIDIOCSUSAGE
+ - struct hiddev_usage_ref (write)
+
+Sets the value of a usage in an output report. The user fills in
+the hiddev_usage_ref structure as above, but additionally fills in
+the value field.
+
+HIDIOGCOLLECTIONINDEX
+ - struct hiddev_usage_ref (write)
+
+Returns the collection index associated with this usage. This
+indicates where in the collection hierarchy this usage sits.
+
+HIDIOCGFLAG
+ - int (read)
+HIDIOCSFLAG
+ - int (write)
+
+These operations respectively inspect and replace the mode flags
+that influence the read() call above. The flags are as follows:
+
+ HIDDEV_FLAG_UREF
+ - read() calls will now return
+ struct hiddev_usage_ref instead of struct hiddev_event.
+ This is a larger structure, but in situations where the
+ device has more than one usage in its reports with the
+ same usage code, this mode serves to resolve such
+ ambiguity.
+
+ HIDDEV_FLAG_REPORT
+ - This flag can only be used in conjunction
+ with HIDDEV_FLAG_UREF. With this flag set, when the device
+ sends a report, a struct hiddev_usage_ref will be returned
+ to read() filled in with the report_type and report_id, but
+ with field_index set to FIELD_INDEX_NONE. This serves as
+ additional notification when the device has sent a report.
diff --git a/Documentation/hid/hiddev.txt b/Documentation/hid/hiddev.txt
deleted file mode 100644
index 638448707aa2..000000000000
--- a/Documentation/hid/hiddev.txt
+++ /dev/null
@@ -1,205 +0,0 @@
-Care and feeding of your Human Interface Devices
-
-INTRODUCTION
-
-In addition to the normal input type HID devices, USB also uses the
-human interface device protocols for things that are not really human
-interfaces, but have similar sorts of communication needs. The two big
-examples for this are power devices (especially uninterruptable power
-supplies) and monitor control on higher end monitors.
-
-To support these disparate requirements, the Linux USB system provides
-HID events to two separate interfaces:
-* the input subsystem, which converts HID events into normal input
-device interfaces (such as keyboard, mouse and joystick) and a
-normalised event interface - see Documentation/input/input.rst
-* the hiddev interface, which provides fairly raw HID events
-
-The data flow for a HID event produced by a device is something like
-the following :
-
- usb.c ---> hid-core.c ----> hid-input.c ----> [keyboard/mouse/joystick/event]
- |
- |
- --> hiddev.c ----> POWER / MONITOR CONTROL
-
-In addition, other subsystems (apart from USB) can potentially feed
-events into the input subsystem, but these have no effect on the hid
-device interface.
-
-USING THE HID DEVICE INTERFACE
-
-The hiddev interface is a char interface using the normal USB major,
-with the minor numbers starting at 96 and finishing at 111. Therefore,
-you need the following commands:
-mknod /dev/usb/hiddev0 c 180 96
-mknod /dev/usb/hiddev1 c 180 97
-mknod /dev/usb/hiddev2 c 180 98
-mknod /dev/usb/hiddev3 c 180 99
-mknod /dev/usb/hiddev4 c 180 100
-mknod /dev/usb/hiddev5 c 180 101
-mknod /dev/usb/hiddev6 c 180 102
-mknod /dev/usb/hiddev7 c 180 103
-mknod /dev/usb/hiddev8 c 180 104
-mknod /dev/usb/hiddev9 c 180 105
-mknod /dev/usb/hiddev10 c 180 106
-mknod /dev/usb/hiddev11 c 180 107
-mknod /dev/usb/hiddev12 c 180 108
-mknod /dev/usb/hiddev13 c 180 109
-mknod /dev/usb/hiddev14 c 180 110
-mknod /dev/usb/hiddev15 c 180 111
-
-So you point your hiddev compliant user-space program at the correct
-interface for your device, and it all just works.
-
-Assuming that you have a hiddev compliant user-space program, of
-course. If you need to write one, read on.
-
-
-THE HIDDEV API
-This description should be read in conjunction with the HID
-specification, freely available from http://www.usb.org, and
-conveniently linked of http://www.linux-usb.org.
-
-The hiddev API uses a read() interface, and a set of ioctl() calls.
-
-HID devices exchange data with the host computer using data
-bundles called "reports". Each report is divided into "fields",
-each of which can have one or more "usages". In the hid-core,
-each one of these usages has a single signed 32 bit value.
-
-read():
-This is the event interface. When the HID device's state changes,
-it performs an interrupt transfer containing a report which contains
-the changed value. The hid-core.c module parses the report, and
-returns to hiddev.c the individual usages that have changed within
-the report. In its basic mode, the hiddev will make these individual
-usage changes available to the reader using a struct hiddev_event:
-
- struct hiddev_event {
- unsigned hid;
- signed int value;
- };
-
-containing the HID usage identifier for the status that changed, and
-the value that it was changed to. Note that the structure is defined
-within <linux/hiddev.h>, along with some other useful #defines and
-structures. The HID usage identifier is a composite of the HID usage
-page shifted to the 16 high order bits ORed with the usage code. The
-behavior of the read() function can be modified using the HIDIOCSFLAG
-ioctl() described below.
-
-
-ioctl():
-This is the control interface. There are a number of controls:
-
-HIDIOCGVERSION - int (read)
-Gets the version code out of the hiddev driver.
-
-HIDIOCAPPLICATION - (none)
-This ioctl call returns the HID application usage associated with the
-hid device. The third argument to ioctl() specifies which application
-index to get. This is useful when the device has more than one
-application collection. If the index is invalid (greater or equal to
-the number of application collections this device has) the ioctl
-returns -1. You can find out beforehand how many application
-collections the device has from the num_applications field from the
-hiddev_devinfo structure.
-
-HIDIOCGCOLLECTIONINFO - struct hiddev_collection_info (read/write)
-This returns a superset of the information above, providing not only
-application collections, but all the collections the device has. It
-also returns the level the collection lives in the hierarchy.
-The user passes in a hiddev_collection_info struct with the index
-field set to the index that should be returned. The ioctl fills in
-the other fields. If the index is larger than the last collection
-index, the ioctl returns -1 and sets errno to -EINVAL.
-
-HIDIOCGDEVINFO - struct hiddev_devinfo (read)
-Gets a hiddev_devinfo structure which describes the device.
-
-HIDIOCGSTRING - struct hiddev_string_descriptor (read/write)
-Gets a string descriptor from the device. The caller must fill in the
-"index" field to indicate which descriptor should be returned.
-
-HIDIOCINITREPORT - (none)
-Instructs the kernel to retrieve all input and feature report values
-from the device. At this point, all the usage structures will contain
-current values for the device, and will maintain it as the device
-changes. Note that the use of this ioctl is unnecessary in general,
-since later kernels automatically initialize the reports from the
-device at attach time.
-
-HIDIOCGNAME - string (variable length)
-Gets the device name
-
-HIDIOCGREPORT - struct hiddev_report_info (write)
-Instructs the kernel to get a feature or input report from the device,
-in order to selectively update the usage structures (in contrast to
-INITREPORT).
-
-HIDIOCSREPORT - struct hiddev_report_info (write)
-Instructs the kernel to send a report to the device. This report can
-be filled in by the user through HIDIOCSUSAGE calls (below) to fill in
-individual usage values in the report before sending the report in full
-to the device.
-
-HIDIOCGREPORTINFO - struct hiddev_report_info (read/write)
-Fills in a hiddev_report_info structure for the user. The report is
-looked up by type (input, output or feature) and id, so these fields
-must be filled in by the user. The ID can be absolute -- the actual
-report id as reported by the device -- or relative --
-HID_REPORT_ID_FIRST for the first report, and (HID_REPORT_ID_NEXT |
-report_id) for the next report after report_id. Without a-priori
-information about report ids, the right way to use this ioctl is to
-use the relative IDs above to enumerate the valid IDs. The ioctl
-returns non-zero when there is no more next ID. The real report ID is
-filled into the returned hiddev_report_info structure.
-
-HIDIOCGFIELDINFO - struct hiddev_field_info (read/write)
-Returns the field information associated with a report in a
-hiddev_field_info structure. The user must fill in report_id and
-report_type in this structure, as above. The field_index should also
-be filled in, which should be a number from 0 and maxfield-1, as
-returned from a previous HIDIOCGREPORTINFO call.
-
-HIDIOCGUCODE - struct hiddev_usage_ref (read/write)
-Returns the usage_code in a hiddev_usage_ref structure, given that
-given its report type, report id, field index, and index within the
-field have already been filled into the structure.
-
-HIDIOCGUSAGE - struct hiddev_usage_ref (read/write)
-Returns the value of a usage in a hiddev_usage_ref structure. The
-usage to be retrieved can be specified as above, or the user can
-choose to fill in the report_type field and specify the report_id as
-HID_REPORT_ID_UNKNOWN. In this case, the hiddev_usage_ref will be
-filled in with the report and field information associated with this
-usage if it is found.
-
-HIDIOCSUSAGE - struct hiddev_usage_ref (write)
-Sets the value of a usage in an output report. The user fills in
-the hiddev_usage_ref structure as above, but additionally fills in
-the value field.
-
-HIDIOGCOLLECTIONINDEX - struct hiddev_usage_ref (write)
-Returns the collection index associated with this usage. This
-indicates where in the collection hierarchy this usage sits.
-
-HIDIOCGFLAG - int (read)
-HIDIOCSFLAG - int (write)
-These operations respectively inspect and replace the mode flags
-that influence the read() call above. The flags are as follows:
-
- HIDDEV_FLAG_UREF - read() calls will now return
- struct hiddev_usage_ref instead of struct hiddev_event.
- This is a larger structure, but in situations where the
- device has more than one usage in its reports with the
- same usage code, this mode serves to resolve such
- ambiguity.
-
- HIDDEV_FLAG_REPORT - This flag can only be used in conjunction
- with HIDDEV_FLAG_UREF. With this flag set, when the device
- sends a report, a struct hiddev_usage_ref will be returned
- to read() filled in with the report_type and report_id, but
- with field_index set to FIELD_INDEX_NONE. This serves as
- additional notification when the device has sent a report.
diff --git a/Documentation/hid/hidraw.rst b/Documentation/hid/hidraw.rst
new file mode 100644
index 000000000000..4a4a0ba1f362
--- /dev/null
+++ b/Documentation/hid/hidraw.rst
@@ -0,0 +1,138 @@
+================================================================
+HIDRAW - Raw Access to USB and Bluetooth Human Interface Devices
+================================================================
+
+The hidraw driver provides a raw interface to USB and Bluetooth Human
+Interface Devices (HIDs). It differs from hiddev in that reports sent and
+received are not parsed by the HID parser, but are sent to and received from
+the device unmodified.
+
+Hidraw should be used if the userspace application knows exactly how to
+communicate with the hardware device, and is able to construct the HID
+reports manually. This is often the case when making userspace drivers for
+custom HID devices.
+
+Hidraw is also useful for communicating with non-conformant HID devices
+which send and receive data in a way that is inconsistent with their report
+descriptors. Because hiddev parses reports which are sent and received
+through it, checking them against the device's report descriptor, such
+communication with these non-conformant devices is impossible using hiddev.
+Hidraw is the only alternative, short of writing a custom kernel driver, for
+these non-conformant devices.
+
+A benefit of hidraw is that its use by userspace applications is independent
+of the underlying hardware type. Currently, Hidraw is implemented for USB
+and Bluetooth. In the future, as new hardware bus types are developed which
+use the HID specification, hidraw will be expanded to add support for these
+new bus types.
+
+Hidraw uses a dynamic major number, meaning that udev should be relied on to
+create hidraw device nodes. Udev will typically create the device nodes
+directly under /dev (eg: /dev/hidraw0). As this location is distribution-
+and udev rule-dependent, applications should use libudev to locate hidraw
+devices attached to the system. There is a tutorial on libudev with a
+working example at:
+
+ http://www.signal11.us/oss/udev/
+
+The HIDRAW API
+---------------
+
+read()
+-------
+read() will read a queued report received from the HID device. On USB
+devices, the reports read using read() are the reports sent from the device
+on the INTERRUPT IN endpoint. By default, read() will block until there is
+a report available to be read. read() can be made non-blocking, by passing
+the O_NONBLOCK flag to open(), or by setting the O_NONBLOCK flag using
+fcntl().
+
+On a device which uses numbered reports, the first byte of the returned data
+will be the report number; the report data follows, beginning in the second
+byte. For devices which do not use numbered reports, the report data
+will begin at the first byte.
+
+write()
+-------
+The write() function will write a report to the device. For USB devices, if
+the device has an INTERRUPT OUT endpoint, the report will be sent on that
+endpoint. If it does not, the report will be sent over the control endpoint,
+using a SET_REPORT transfer.
+
+The first byte of the buffer passed to write() should be set to the report
+number. If the device does not use numbered reports, the first byte should
+be set to 0. The report data itself should begin at the second byte.
+
+ioctl()
+-------
+Hidraw supports the following ioctls:
+
+HIDIOCGRDESCSIZE:
+ Get Report Descriptor Size
+
+This ioctl will get the size of the device's report descriptor.
+
+HIDIOCGRDESC:
+ Get Report Descriptor
+
+This ioctl returns the device's report descriptor using a
+hidraw_report_descriptor struct. Make sure to set the size field of the
+hidraw_report_descriptor struct to the size returned from HIDIOCGRDESCSIZE.
+
+HIDIOCGRAWINFO:
+ Get Raw Info
+
+This ioctl will return a hidraw_devinfo struct containing the bus type, the
+vendor ID (VID), and product ID (PID) of the device. The bus type can be one
+of::
+
+ - BUS_USB
+ - BUS_HIL
+ - BUS_BLUETOOTH
+ - BUS_VIRTUAL
+
+which are defined in uapi/linux/input.h.
+
+HIDIOCGRAWNAME(len):
+ Get Raw Name
+
+This ioctl returns a string containing the vendor and product strings of
+the device. The returned string is Unicode, UTF-8 encoded.
+
+HIDIOCGRAWPHYS(len):
+ Get Physical Address
+
+This ioctl returns a string representing the physical address of the device.
+For USB devices, the string contains the physical path to the device (the
+USB controller, hubs, ports, etc). For Bluetooth devices, the string
+contains the hardware (MAC) address of the device.
+
+HIDIOCSFEATURE(len):
+ Send a Feature Report
+
+This ioctl will send a feature report to the device. Per the HID
+specification, feature reports are always sent using the control endpoint.
+Set the first byte of the supplied buffer to the report number. For devices
+which do not use numbered reports, set the first byte to 0. The report data
+begins in the second byte. Make sure to set len accordingly, to one more
+than the length of the report (to account for the report number).
+
+HIDIOCGFEATURE(len):
+ Get a Feature Report
+
+This ioctl will request a feature report from the device using the control
+endpoint. The first byte of the supplied buffer should be set to the report
+number of the requested report. For devices which do not use numbered
+reports, set the first byte to 0. The report will be returned starting at
+the first byte of the buffer (ie: the report number is not returned).
+
+Example
+-------
+In samples/, find hid-example.c, which shows examples of read(), write(),
+and all the ioctls for hidraw. The code may be used by anyone for any
+purpose, and can serve as a starting point for developing applications using
+hidraw.
+
+Document by:
+
+ Alan Ott <alan@signal11.us>, Signal 11 Software
diff --git a/Documentation/hid/hidraw.txt b/Documentation/hid/hidraw.txt
deleted file mode 100644
index c8436e354f44..000000000000
--- a/Documentation/hid/hidraw.txt
+++ /dev/null
@@ -1,119 +0,0 @@
- HIDRAW - Raw Access to USB and Bluetooth Human Interface Devices
- ==================================================================
-
-The hidraw driver provides a raw interface to USB and Bluetooth Human
-Interface Devices (HIDs). It differs from hiddev in that reports sent and
-received are not parsed by the HID parser, but are sent to and received from
-the device unmodified.
-
-Hidraw should be used if the userspace application knows exactly how to
-communicate with the hardware device, and is able to construct the HID
-reports manually. This is often the case when making userspace drivers for
-custom HID devices.
-
-Hidraw is also useful for communicating with non-conformant HID devices
-which send and receive data in a way that is inconsistent with their report
-descriptors. Because hiddev parses reports which are sent and received
-through it, checking them against the device's report descriptor, such
-communication with these non-conformant devices is impossible using hiddev.
-Hidraw is the only alternative, short of writing a custom kernel driver, for
-these non-conformant devices.
-
-A benefit of hidraw is that its use by userspace applications is independent
-of the underlying hardware type. Currently, Hidraw is implemented for USB
-and Bluetooth. In the future, as new hardware bus types are developed which
-use the HID specification, hidraw will be expanded to add support for these
-new bus types.
-
-Hidraw uses a dynamic major number, meaning that udev should be relied on to
-create hidraw device nodes. Udev will typically create the device nodes
-directly under /dev (eg: /dev/hidraw0). As this location is distribution-
-and udev rule-dependent, applications should use libudev to locate hidraw
-devices attached to the system. There is a tutorial on libudev with a
-working example at:
- http://www.signal11.us/oss/udev/
-
-The HIDRAW API
----------------
-
-read()
--------
-read() will read a queued report received from the HID device. On USB
-devices, the reports read using read() are the reports sent from the device
-on the INTERRUPT IN endpoint. By default, read() will block until there is
-a report available to be read. read() can be made non-blocking, by passing
-the O_NONBLOCK flag to open(), or by setting the O_NONBLOCK flag using
-fcntl().
-
-On a device which uses numbered reports, the first byte of the returned data
-will be the report number; the report data follows, beginning in the second
-byte. For devices which do not use numbered reports, the report data
-will begin at the first byte.
-
-write()
---------
-The write() function will write a report to the device. For USB devices, if
-the device has an INTERRUPT OUT endpoint, the report will be sent on that
-endpoint. If it does not, the report will be sent over the control endpoint,
-using a SET_REPORT transfer.
-
-The first byte of the buffer passed to write() should be set to the report
-number. If the device does not use numbered reports, the first byte should
-be set to 0. The report data itself should begin at the second byte.
-
-ioctl()
---------
-Hidraw supports the following ioctls:
-
-HIDIOCGRDESCSIZE: Get Report Descriptor Size
-This ioctl will get the size of the device's report descriptor.
-
-HIDIOCGRDESC: Get Report Descriptor
-This ioctl returns the device's report descriptor using a
-hidraw_report_descriptor struct. Make sure to set the size field of the
-hidraw_report_descriptor struct to the size returned from HIDIOCGRDESCSIZE.
-
-HIDIOCGRAWINFO: Get Raw Info
-This ioctl will return a hidraw_devinfo struct containing the bus type, the
-vendor ID (VID), and product ID (PID) of the device. The bus type can be one
-of:
- BUS_USB
- BUS_HIL
- BUS_BLUETOOTH
- BUS_VIRTUAL
-which are defined in uapi/linux/input.h.
-
-HIDIOCGRAWNAME(len): Get Raw Name
-This ioctl returns a string containing the vendor and product strings of
-the device. The returned string is Unicode, UTF-8 encoded.
-
-HIDIOCGRAWPHYS(len): Get Physical Address
-This ioctl returns a string representing the physical address of the device.
-For USB devices, the string contains the physical path to the device (the
-USB controller, hubs, ports, etc). For Bluetooth devices, the string
-contains the hardware (MAC) address of the device.
-
-HIDIOCSFEATURE(len): Send a Feature Report
-This ioctl will send a feature report to the device. Per the HID
-specification, feature reports are always sent using the control endpoint.
-Set the first byte of the supplied buffer to the report number. For devices
-which do not use numbered reports, set the first byte to 0. The report data
-begins in the second byte. Make sure to set len accordingly, to one more
-than the length of the report (to account for the report number).
-
-HIDIOCGFEATURE(len): Get a Feature Report
-This ioctl will request a feature report from the device using the control
-endpoint. The first byte of the supplied buffer should be set to the report
-number of the requested report. For devices which do not use numbered
-reports, set the first byte to 0. The report will be returned starting at
-the first byte of the buffer (ie: the report number is not returned).
-
-Example
----------
-In samples/, find hid-example.c, which shows examples of read(), write(),
-and all the ioctls for hidraw. The code may be used by anyone for any
-purpose, and can serve as a starting point for developing applications using
-hidraw.
-
-Document by:
- Alan Ott <alan@signal11.us>, Signal 11 Software
diff --git a/Documentation/hid/index.rst b/Documentation/hid/index.rst
new file mode 100644
index 000000000000..737d66dc16a1
--- /dev/null
+++ b/Documentation/hid/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=============================
+Human Interface Devices (HID)
+=============================
+
+.. toctree::
+ :maxdepth: 1
+
+ hiddev
+ hidraw
+ hid-sensor
+ hid-transport
+
+ uhid
+
+ hid-alps
+ intel-ish-hid
diff --git a/Documentation/hid/intel-ish-hid.rst b/Documentation/hid/intel-ish-hid.rst
new file mode 100644
index 000000000000..cccbf4be17d7
--- /dev/null
+++ b/Documentation/hid/intel-ish-hid.rst
@@ -0,0 +1,485 @@
+=================================
+Intel Integrated Sensor Hub (ISH)
+=================================
+
+A sensor hub enables the ability to offload sensor polling and algorithm
+processing to a dedicated low power co-processor. This allows the core
+processor to go into low power modes more often, resulting in the increased
+battery life.
+
+There are many vendors providing external sensor hubs confirming to HID
+Sensor usage tables, and used in several tablets, 2 in 1 convertible laptops
+and embedded products. Linux had this support since Linux 3.9.
+
+Intel® introduced integrated sensor hubs as a part of the SoC starting from
+Cherry Trail and now supported on multiple generations of CPU packages. There
+are many commercial devices already shipped with Integrated Sensor Hubs (ISH).
+These ISH also comply to HID sensor specification, but the difference is the
+transport protocol used for communication. The current external sensor hubs
+mainly use HID over i2C or USB. But ISH doesn't use either i2c or USB.
+
+1. Overview
+===========
+
+Using a analogy with a usbhid implementation, the ISH follows a similar model
+for a very high speed communication::
+
+ ----------------- ----------------------
+ | USB HID | --> | ISH HID |
+ ----------------- ----------------------
+ ----------------- ----------------------
+ | USB protocol | --> | ISH Transport |
+ ----------------- ----------------------
+ ----------------- ----------------------
+ | EHCI/XHCI | --> | ISH IPC |
+ ----------------- ----------------------
+ PCI PCI
+ ----------------- ----------------------
+ |Host controller| --> | ISH processor |
+ ----------------- ----------------------
+ USB Link
+ ----------------- ----------------------
+ | USB End points| --> | ISH Clients |
+ ----------------- ----------------------
+
+Like USB protocol provides a method for device enumeration, link management
+and user data encapsulation, the ISH also provides similar services. But it is
+very light weight tailored to manage and communicate with ISH client
+applications implemented in the firmware.
+
+The ISH allows multiple sensor management applications executing in the
+firmware. Like USB endpoints the messaging can be to/from a client. As part of
+enumeration process, these clients are identified. These clients can be simple
+HID sensor applications, sensor calibration application or senor firmware
+update application.
+
+The implementation model is similar, like USB bus, ISH transport is also
+implemented as a bus. Each client application executing in the ISH processor
+is registered as a device on this bus. The driver, which binds each device
+(ISH HID driver) identifies the device type and registers with the hid core.
+
+2. ISH Implementation: Block Diagram
+====================================
+
+::
+
+ ---------------------------
+ | User Space Applications |
+ ---------------------------
+
+ ----------------IIO ABI----------------
+ --------------------------
+ | IIO Sensor Drivers |
+ --------------------------
+ --------------------------
+ | IIO core |
+ --------------------------
+ --------------------------
+ | HID Sensor Hub MFD |
+ --------------------------
+ --------------------------
+ | HID Core |
+ --------------------------
+ --------------------------
+ | HID over ISH Client |
+ --------------------------
+ --------------------------
+ | ISH Transport (ISHTP) |
+ --------------------------
+ --------------------------
+ | IPC Drivers |
+ --------------------------
+ OS
+ ---------------- PCI -----------------
+ Hardware + Firmware
+ ----------------------------
+ | ISH Hardware/Firmware(FW) |
+ ----------------------------
+
+3. High level processing in above blocks
+========================================
+
+3.1 Hardware Interface
+----------------------
+
+The ISH is exposed as "Non-VGA unclassified PCI device" to the host. The PCI
+product and vendor IDs are changed from different generations of processors. So
+the source code which enumerate drivers needs to update from generation to
+generation.
+
+3.2 Inter Processor Communication (IPC) driver
+----------------------------------------------
+
+Location: drivers/hid/intel-ish-hid/ipc
+
+The IPC message used memory mapped I/O. The registers are defined in
+hw-ish-regs.h.
+
+3.2.1 IPC/FW message types
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There are two types of messages, one for management of link and other messages
+are to and from transport layers.
+
+TX and RX of Transport messages
+...............................
+
+A set of memory mapped register offers support of multi byte messages TX and
+RX (E.g.IPC_REG_ISH2HOST_MSG, IPC_REG_HOST2ISH_MSG). The IPC layer maintains
+internal queues to sequence messages and send them in order to the FW.
+Optionally the caller can register handler to get notification of completion.
+A door bell mechanism is used in messaging to trigger processing in host and
+client firmware side. When ISH interrupt handler is called, the ISH2HOST
+doorbell register is used by host drivers to determine that the interrupt
+is for ISH.
+
+Each side has 32 32-bit message registers and a 32-bit doorbell. Doorbell
+register has the following format:
+Bits 0..6: fragment length (7 bits are used)
+Bits 10..13: encapsulated protocol
+Bits 16..19: management command (for IPC management protocol)
+Bit 31: doorbell trigger (signal H/W interrupt to the other side)
+Other bits are reserved, should be 0.
+
+3.2.2 Transport layer interface
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To abstract HW level IPC communication, a set of callbacks are registered.
+The transport layer uses them to send and receive messages.
+Refer to struct ishtp_hw_ops for callbacks.
+
+3.3 ISH Transport layer
+-----------------------
+
+Location: drivers/hid/intel-ish-hid/ishtp/
+
+3.3.1 A Generic Transport Layer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The transport layer is a bi-directional protocol, which defines:
+- Set of commands to start, stop, connect, disconnect and flow control
+(ishtp/hbm.h) for details
+- A flow control mechanism to avoid buffer overflows
+
+This protocol resembles bus messages described in the following document:
+http://www.intel.com/content/dam/www/public/us/en/documents/technical-\
+specifications/dcmi-hi-1-0-spec.pdf "Chapter 7: Bus Message Layer"
+
+3.3.2 Connection and Flow Control Mechanism
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Each FW client and a protocol is identified by an UUID. In order to communicate
+to a FW client, a connection must be established using connect request and
+response bus messages. If successful, a pair (host_client_id and fw_client_id)
+will identify the connection.
+
+Once connection is established, peers send each other flow control bus messages
+independently. Every peer may send a message only if it has received a
+flow-control credit before. Once it sent a message, it may not send another one
+before receiving the next flow control credit.
+Either side can send disconnect request bus message to end communication. Also
+the link will be dropped if major FW reset occurs.
+
+3.3.3 Peer to Peer data transfer
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Peer to Peer data transfer can happen with or without using DMA. Depending on
+the sensor bandwidth requirement DMA can be enabled by using module parameter
+ishtp_use_dma under intel_ishtp.
+
+Each side (host and FW) manages its DMA transfer memory independently. When an
+ISHTP client from either host or FW side wants to send something, it decides
+whether to send over IPC or over DMA; for each transfer the decision is
+independent. The sending side sends DMA_XFER message when the message is in
+the respective host buffer (TX when host client sends, RX when FW client
+sends). The recipient of DMA message responds with DMA_XFER_ACK, indicating
+the sender that the memory region for that message may be reused.
+
+DMA initialization is started with host sending DMA_ALLOC_NOTIFY bus message
+(that includes RX buffer) and FW responds with DMA_ALLOC_NOTIFY_ACK.
+Additionally to DMA address communication, this sequence checks capabilities:
+if thw host doesn't support DMA, then it won't send DMA allocation, so FW can't
+send DMA; if FW doesn't support DMA then it won't respond with
+DMA_ALLOC_NOTIFY_ACK, in which case host will not use DMA transfers.
+Here ISH acts as busmaster DMA controller. Hence when host sends DMA_XFER,
+it's request to do host->ISH DMA transfer; when FW sends DMA_XFER, it means
+that it already did DMA and the message resides at host. Thus, DMA_XFER
+and DMA_XFER_ACK act as ownership indicators.
+
+At initial state all outgoing memory belongs to the sender (TX to host, RX to
+FW), DMA_XFER transfers ownership on the region that contains ISHTP message to
+the receiving side, DMA_XFER_ACK returns ownership to the sender. A sender
+needs not wait for previous DMA_XFER to be ack'ed, and may send another message
+as long as remaining continuous memory in its ownership is enough.
+In principle, multiple DMA_XFER and DMA_XFER_ACK messages may be sent at once
+(up to IPC MTU), thus allowing for interrupt throttling.
+Currently, ISH FW decides to send over DMA if ISHTP message is more than 3 IPC
+fragments and via IPC otherwise.
+
+3.3.4 Ring Buffers
+^^^^^^^^^^^^^^^^^^
+
+When a client initiate a connection, a ring or RX and TX buffers are allocated.
+The size of ring can be specified by the client. HID client set 16 and 32 for
+TX and RX buffers respectively. On send request from client, the data to be
+sent is copied to one of the send ring buffer and scheduled to be sent using
+bus message protocol. These buffers are required because the FW may have not
+have processed the last message and may not have enough flow control credits
+to send. Same thing holds true on receive side and flow control is required.
+
+3.3.5 Host Enumeration
+^^^^^^^^^^^^^^^^^^^^^^
+
+The host enumeration bus command allow discovery of clients present in the FW.
+There can be multiple sensor clients and clients for calibration function.
+
+To ease in implantation and allow independent driver handle each client
+this transport layer takes advantage of Linux Bus driver model. Each
+client is registered as device on the the transport bus (ishtp bus).
+
+Enumeration sequence of messages:
+
+- Host sends HOST_START_REQ_CMD, indicating that host ISHTP layer is up.
+- FW responds with HOST_START_RES_CMD
+- Host sends HOST_ENUM_REQ_CMD (enumerate FW clients)
+- FW responds with HOST_ENUM_RES_CMD that includes bitmap of available FW
+ client IDs
+- For each FW ID found in that bitmap host sends
+ HOST_CLIENT_PROPERTIES_REQ_CMD
+- FW responds with HOST_CLIENT_PROPERTIES_RES_CMD. Properties include UUID,
+ max ISHTP message size, etc.
+- Once host received properties for that last discovered client, it considers
+ ISHTP device fully functional (and allocates DMA buffers)
+
+3.4 HID over ISH Client
+-----------------------
+
+Location: drivers/hid/intel-ish-hid
+
+The ISHTP client driver is responsible for:
+
+- enumerate HID devices under FW ISH client
+- Get Report descriptor
+- Register with HID core as a LL driver
+- Process Get/Set feature request
+- Get input reports
+
+3.5 HID Sensor Hub MFD and IIO sensor drivers
+---------------------------------------------
+
+The functionality in these drivers is the same as an external sensor hub.
+Refer to
+Documentation/hid/hid-sensor.rst for HID sensor
+Documentation/ABI/testing/sysfs-bus-iio for IIO ABIs to user space
+
+3.6 End to End HID transport Sequence Diagram
+---------------------------------------------
+
+::
+
+ HID-ISH-CLN ISHTP IPC HW
+ | | | |
+ | | |-----WAKE UP------------------>|
+ | | | |
+ | | |-----HOST READY--------------->|
+ | | | |
+ | | |<----MNG_RESET_NOTIFY_ACK----- |
+ | | | |
+ | |<----ISHTP_START------ | |
+ | | | |
+ | |<-----------------HOST_START_RES_CMD-------------------|
+ | | | |
+ | |------------------QUERY_SUBSCRIBER-------------------->|
+ | | | |
+ | |------------------HOST_ENUM_REQ_CMD------------------->|
+ | | | |
+ | |<-----------------HOST_ENUM_RES_CMD--------------------|
+ | | | |
+ | |------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
+ | | | |
+ | |<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
+ | Create new device on in ishtp bus | |
+ | | | |
+ | |------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
+ | | | |
+ | |<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
+ | Create new device on in ishtp bus | |
+ | | | |
+ | |--Repeat HOST_CLIENT_PROPERTIES_REQ_CMD-till last one--|
+ | | | |
+ probed()
+ |----ishtp_cl_connect--->|----------------- CLIENT_CONNECT_REQ_CMD-------------->|
+ | | | |
+ | |<----------------CLIENT_CONNECT_RES_CMD----------------|
+ | | | |
+ |register event callback | | |
+ | | | |
+ |ishtp_cl_send(
+ HOSTIF_DM_ENUM_DEVICES) |----------fill ishtp_msg_hdr struct write to HW----- >|
+ | | | |
+ | | |<-----IRQ(IPC_PROTOCOL_ISHTP---|
+ | | | |
+ |<--ENUM_DEVICE RSP------| | |
+ | | | |
+ for each enumerated device
+ |ishtp_cl_send(
+ HOSTIF_GET_HID_DESCRIPTOR|----------fill ishtp_msg_hdr struct write to HW----- >|
+ | | | |
+ ...Response
+ | | | |
+ for each enumerated device
+ |ishtp_cl_send(
+ HOSTIF_GET_REPORT_DESCRIPTOR|--------------fill ishtp_msg_hdr struct write to HW-- >|
+ | | | |
+ | | | |
+ hid_allocate_device
+ | | | |
+ hid_add_device | | |
+ | | | |
+
+
+3.7 ISH Debugging
+-----------------
+
+To debug ISH, event tracing mechanism is used. To enable debug logs
+echo 1 > /sys/kernel/debug/tracing/events/intel_ish/enable
+cat sys/kernel/debug/tracing/trace
+
+3.8 ISH IIO sysfs Example on Lenovo thinkpad Yoga 260
+-----------------------------------------------------
+
+::
+
+ root@otcpl-ThinkPad-Yoga-260:~# tree -l /sys/bus/iio/devices/
+ /sys/bus/iio/devices/
+ ├── iio:device0 -> ../../../devices/0044:8086:22D8.0001/HID-SENSOR-200073.9.auto/iio:device0
+ │   ├── buffer
+ │   │   ├── enable
+ │   │   ├── length
+ │   │   └── watermark
+ ...
+ │   ├── in_accel_hysteresis
+ │   ├── in_accel_offset
+ │   ├── in_accel_sampling_frequency
+ │   ├── in_accel_scale
+ │   ├── in_accel_x_raw
+ │   ├── in_accel_y_raw
+ │   ├── in_accel_z_raw
+ │   ├── name
+ │   ├── scan_elements
+ │   │   ├── in_accel_x_en
+ │   │   ├── in_accel_x_index
+ │   │   ├── in_accel_x_type
+ │   │   ├── in_accel_y_en
+ │   │   ├── in_accel_y_index
+ │   │   ├── in_accel_y_type
+ │   │   ├── in_accel_z_en
+ │   │   ├── in_accel_z_index
+ │   │   └── in_accel_z_type
+ ...
+ │   │   ├── devices
+ │   │   │   │   ├── buffer
+ │   │   │   │   │   ├── enable
+ │   │   │   │   │   ├── length
+ │   │   │   │   │   └── watermark
+ │   │   │   │   ├── dev
+ │   │   │   │   ├── in_intensity_both_raw
+ │   │   │   │   ├── in_intensity_hysteresis
+ │   │   │   │   ├── in_intensity_offset
+ │   │   │   │   ├── in_intensity_sampling_frequency
+ │   │   │   │   ├── in_intensity_scale
+ │   │   │   │   ├── name
+ │   │   │   │   ├── scan_elements
+ │   │   │   │   │   ├── in_intensity_both_en
+ │   │   │   │   │   ├── in_intensity_both_index
+ │   │   │   │   │   └── in_intensity_both_type
+ │   │   │   │   ├── trigger
+ │   │   │   │   │   └── current_trigger
+ ...
+ │   │   │   │   ├── buffer
+ │   │   │   │   │   ├── enable
+ │   │   │   │   │   ├── length
+ │   │   │   │   │   └── watermark
+ │   │   │   │   ├── dev
+ │   │   │   │   ├── in_magn_hysteresis
+ │   │   │   │   ├── in_magn_offset
+ │   │   │   │   ├── in_magn_sampling_frequency
+ │   │   │   │   ├── in_magn_scale
+ │   │   │   │   ├── in_magn_x_raw
+ │   │   │   │   ├── in_magn_y_raw
+ │   │   │   │   ├── in_magn_z_raw
+ │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_raw
+ │   │   │   │   ├── in_rot_hysteresis
+ │   │   │   │   ├── in_rot_offset
+ │   │   │   │   ├── in_rot_sampling_frequency
+ │   │   │   │   ├── in_rot_scale
+ │   │   │   │   ├── name
+ ...
+ │   │   │   │   ├── scan_elements
+ │   │   │   │   │   ├── in_magn_x_en
+ │   │   │   │   │   ├── in_magn_x_index
+ │   │   │   │   │   ├── in_magn_x_type
+ │   │   │   │   │   ├── in_magn_y_en
+ │   │   │   │   │   ├── in_magn_y_index
+ │   │   │   │   │   ├── in_magn_y_type
+ │   │   │   │   │   ├── in_magn_z_en
+ │   │   │   │   │   ├── in_magn_z_index
+ │   │   │   │   │   ├── in_magn_z_type
+ │   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_en
+ │   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_index
+ │   │   │   │   │   └── in_rot_from_north_magnetic_tilt_comp_type
+ │   │   │   │   ├── trigger
+ │   │   │   │   │   └── current_trigger
+ ...
+ │   │   │   │   ├── buffer
+ │   │   │   │   │   ├── enable
+ │   │   │   │   │   ├── length
+ │   │   │   │   │   └── watermark
+ │   │   │   │   ├── dev
+ │   │   │   │   ├── in_anglvel_hysteresis
+ │   │   │   │   ├── in_anglvel_offset
+ │   │   │   │   ├── in_anglvel_sampling_frequency
+ │   │   │   │   ├── in_anglvel_scale
+ │   │   │   │   ├── in_anglvel_x_raw
+ │   │   │   │   ├── in_anglvel_y_raw
+ │   │   │   │   ├── in_anglvel_z_raw
+ │   │   │   │   ├── name
+ │   │   │   │   ├── scan_elements
+ │   │   │   │   │   ├── in_anglvel_x_en
+ │   │   │   │   │   ├── in_anglvel_x_index
+ │   │   │   │   │   ├── in_anglvel_x_type
+ │   │   │   │   │   ├── in_anglvel_y_en
+ │   │   │   │   │   ├── in_anglvel_y_index
+ │   │   │   │   │   ├── in_anglvel_y_type
+ │   │   │   │   │   ├── in_anglvel_z_en
+ │   │   │   │   │   ├── in_anglvel_z_index
+ │   │   │   │   │   └── in_anglvel_z_type
+ │   │   │   │   ├── trigger
+ │   │   │   │   │   └── current_trigger
+ ...
+ │   │   │   │   ├── buffer
+ │   │   │   │   │   ├── enable
+ │   │   │   │   │   ├── length
+ │   │   │   │   │   └── watermark
+ │   │   │   │   ├── dev
+ │   │   │   │   ├── in_anglvel_hysteresis
+ │   │   │   │   ├── in_anglvel_offset
+ │   │   │   │   ├── in_anglvel_sampling_frequency
+ │   │   │   │   ├── in_anglvel_scale
+ │   │   │   │   ├── in_anglvel_x_raw
+ │   │   │   │   ├── in_anglvel_y_raw
+ │   │   │   │   ├── in_anglvel_z_raw
+ │   │   │   │   ├── name
+ │   │   │   │   ├── scan_elements
+ │   │   │   │   │   ├── in_anglvel_x_en
+ │   │   │   │   │   ├── in_anglvel_x_index
+ │   │   │   │   │   ├── in_anglvel_x_type
+ │   │   │   │   │   ├── in_anglvel_y_en
+ │   │   │   │   │   ├── in_anglvel_y_index
+ │   │   │   │   │   ├── in_anglvel_y_type
+ │   │   │   │   │   ├── in_anglvel_z_en
+ │   │   │   │   │   ├── in_anglvel_z_index
+ │   │   │   │   │   └── in_anglvel_z_type
+ │   │   │   │   ├── trigger
+ │   │   │   │   │   └── current_trigger
+ ...
diff --git a/Documentation/hid/intel-ish-hid.txt b/Documentation/hid/intel-ish-hid.txt
deleted file mode 100644
index d48b21c71ddd..000000000000
--- a/Documentation/hid/intel-ish-hid.txt
+++ /dev/null
@@ -1,454 +0,0 @@
-Intel Integrated Sensor Hub (ISH)
-===============================
-
-A sensor hub enables the ability to offload sensor polling and algorithm
-processing to a dedicated low power co-processor. This allows the core
-processor to go into low power modes more often, resulting in the increased
-battery life.
-
-There are many vendors providing external sensor hubs confirming to HID
-Sensor usage tables, and used in several tablets, 2 in 1 convertible laptops
-and embedded products. Linux had this support since Linux 3.9.
-
-Intel® introduced integrated sensor hubs as a part of the SoC starting from
-Cherry Trail and now supported on multiple generations of CPU packages. There
-are many commercial devices already shipped with Integrated Sensor Hubs (ISH).
-These ISH also comply to HID sensor specification, but the difference is the
-transport protocol used for communication. The current external sensor hubs
-mainly use HID over i2C or USB. But ISH doesn't use either i2c or USB.
-
-1. Overview
-
-Using a analogy with a usbhid implementation, the ISH follows a similar model
-for a very high speed communication:
-
- ----------------- ----------------------
- | USB HID | --> | ISH HID |
- ----------------- ----------------------
- ----------------- ----------------------
- | USB protocol | --> | ISH Transport |
- ----------------- ----------------------
- ----------------- ----------------------
- | EHCI/XHCI | --> | ISH IPC |
- ----------------- ----------------------
- PCI PCI
- ----------------- ----------------------
- |Host controller| --> | ISH processor |
- ----------------- ----------------------
- USB Link
- ----------------- ----------------------
- | USB End points| --> | ISH Clients |
- ----------------- ----------------------
-
-Like USB protocol provides a method for device enumeration, link management
-and user data encapsulation, the ISH also provides similar services. But it is
-very light weight tailored to manage and communicate with ISH client
-applications implemented in the firmware.
-
-The ISH allows multiple sensor management applications executing in the
-firmware. Like USB endpoints the messaging can be to/from a client. As part of
-enumeration process, these clients are identified. These clients can be simple
-HID sensor applications, sensor calibration application or senor firmware
-update application.
-
-The implementation model is similar, like USB bus, ISH transport is also
-implemented as a bus. Each client application executing in the ISH processor
-is registered as a device on this bus. The driver, which binds each device
-(ISH HID driver) identifies the device type and registers with the hid core.
-
-2. ISH Implementation: Block Diagram
-
- ---------------------------
- | User Space Applications |
- ---------------------------
-
-----------------IIO ABI----------------
- --------------------------
- | IIO Sensor Drivers |
- --------------------------
- --------------------------
- | IIO core |
- --------------------------
- --------------------------
- | HID Sensor Hub MFD |
- --------------------------
- --------------------------
- | HID Core |
- --------------------------
- --------------------------
- | HID over ISH Client |
- --------------------------
- --------------------------
- | ISH Transport (ISHTP) |
- --------------------------
- --------------------------
- | IPC Drivers |
- --------------------------
-OS
----------------- PCI -----------------
-Hardware + Firmware
- ----------------------------
- | ISH Hardware/Firmware(FW) |
- ----------------------------
-
-3. High level processing in above blocks
-
-3.1 Hardware Interface
-
-The ISH is exposed as "Non-VGA unclassified PCI device" to the host. The PCI
-product and vendor IDs are changed from different generations of processors. So
-the source code which enumerate drivers needs to update from generation to
-generation.
-
-3.2 Inter Processor Communication (IPC) driver
-Location: drivers/hid/intel-ish-hid/ipc
-
-The IPC message used memory mapped I/O. The registers are defined in
-hw-ish-regs.h.
-
-3.2.1 IPC/FW message types
-
-There are two types of messages, one for management of link and other messages
-are to and from transport layers.
-
-TX and RX of Transport messages
-
-A set of memory mapped register offers support of multi byte messages TX and
-RX (E.g.IPC_REG_ISH2HOST_MSG, IPC_REG_HOST2ISH_MSG). The IPC layer maintains
-internal queues to sequence messages and send them in order to the FW.
-Optionally the caller can register handler to get notification of completion.
-A door bell mechanism is used in messaging to trigger processing in host and
-client firmware side. When ISH interrupt handler is called, the ISH2HOST
-doorbell register is used by host drivers to determine that the interrupt
-is for ISH.
-
-Each side has 32 32-bit message registers and a 32-bit doorbell. Doorbell
-register has the following format:
-Bits 0..6: fragment length (7 bits are used)
-Bits 10..13: encapsulated protocol
-Bits 16..19: management command (for IPC management protocol)
-Bit 31: doorbell trigger (signal H/W interrupt to the other side)
-Other bits are reserved, should be 0.
-
-3.2.2 Transport layer interface
-
-To abstract HW level IPC communication, a set of callbacks are registered.
-The transport layer uses them to send and receive messages.
-Refer to struct ishtp_hw_ops for callbacks.
-
-3.3 ISH Transport layer
-Location: drivers/hid/intel-ish-hid/ishtp/
-
-3.3.1 A Generic Transport Layer
-
-The transport layer is a bi-directional protocol, which defines:
-- Set of commands to start, stop, connect, disconnect and flow control
-(ishtp/hbm.h) for details
-- A flow control mechanism to avoid buffer overflows
-
-This protocol resembles bus messages described in the following document:
-http://www.intel.com/content/dam/www/public/us/en/documents/technical-\
-specifications/dcmi-hi-1-0-spec.pdf "Chapter 7: Bus Message Layer"
-
-3.3.2 Connection and Flow Control Mechanism
-
-Each FW client and a protocol is identified by an UUID. In order to communicate
-to a FW client, a connection must be established using connect request and
-response bus messages. If successful, a pair (host_client_id and fw_client_id)
-will identify the connection.
-
-Once connection is established, peers send each other flow control bus messages
-independently. Every peer may send a message only if it has received a
-flow-control credit before. Once it sent a message, it may not send another one
-before receiving the next flow control credit.
-Either side can send disconnect request bus message to end communication. Also
-the link will be dropped if major FW reset occurs.
-
-3.3.3 Peer to Peer data transfer
-
-Peer to Peer data transfer can happen with or without using DMA. Depending on
-the sensor bandwidth requirement DMA can be enabled by using module parameter
-ishtp_use_dma under intel_ishtp.
-
-Each side (host and FW) manages its DMA transfer memory independently. When an
-ISHTP client from either host or FW side wants to send something, it decides
-whether to send over IPC or over DMA; for each transfer the decision is
-independent. The sending side sends DMA_XFER message when the message is in
-the respective host buffer (TX when host client sends, RX when FW client
-sends). The recipient of DMA message responds with DMA_XFER_ACK, indicating
-the sender that the memory region for that message may be reused.
-
-DMA initialization is started with host sending DMA_ALLOC_NOTIFY bus message
-(that includes RX buffer) and FW responds with DMA_ALLOC_NOTIFY_ACK.
-Additionally to DMA address communication, this sequence checks capabilities:
-if thw host doesn't support DMA, then it won't send DMA allocation, so FW can't
-send DMA; if FW doesn't support DMA then it won't respond with
-DMA_ALLOC_NOTIFY_ACK, in which case host will not use DMA transfers.
-Here ISH acts as busmaster DMA controller. Hence when host sends DMA_XFER,
-it's request to do host->ISH DMA transfer; when FW sends DMA_XFER, it means
-that it already did DMA and the message resides at host. Thus, DMA_XFER
-and DMA_XFER_ACK act as ownership indicators.
-
-At initial state all outgoing memory belongs to the sender (TX to host, RX to
-FW), DMA_XFER transfers ownership on the region that contains ISHTP message to
-the receiving side, DMA_XFER_ACK returns ownership to the sender. A sender
-needs not wait for previous DMA_XFER to be ack'ed, and may send another message
-as long as remaining continuous memory in its ownership is enough.
-In principle, multiple DMA_XFER and DMA_XFER_ACK messages may be sent at once
-(up to IPC MTU), thus allowing for interrupt throttling.
-Currently, ISH FW decides to send over DMA if ISHTP message is more than 3 IPC
-fragments and via IPC otherwise.
-
-3.3.4 Ring Buffers
-
-When a client initiate a connection, a ring or RX and TX buffers are allocated.
-The size of ring can be specified by the client. HID client set 16 and 32 for
-TX and RX buffers respectively. On send request from client, the data to be
-sent is copied to one of the send ring buffer and scheduled to be sent using
-bus message protocol. These buffers are required because the FW may have not
-have processed the last message and may not have enough flow control credits
-to send. Same thing holds true on receive side and flow control is required.
-
-3.3.5 Host Enumeration
-
-The host enumeration bus command allow discovery of clients present in the FW.
-There can be multiple sensor clients and clients for calibration function.
-
-To ease in implantation and allow independent driver handle each client
-this transport layer takes advantage of Linux Bus driver model. Each
-client is registered as device on the the transport bus (ishtp bus).
-
-Enumeration sequence of messages:
-- Host sends HOST_START_REQ_CMD, indicating that host ISHTP layer is up.
-- FW responds with HOST_START_RES_CMD
-- Host sends HOST_ENUM_REQ_CMD (enumerate FW clients)
-- FW responds with HOST_ENUM_RES_CMD that includes bitmap of available FW
-client IDs
-- For each FW ID found in that bitmap host sends
-HOST_CLIENT_PROPERTIES_REQ_CMD
-- FW responds with HOST_CLIENT_PROPERTIES_RES_CMD. Properties include UUID,
-max ISHTP message size, etc.
-- Once host received properties for that last discovered client, it considers
-ISHTP device fully functional (and allocates DMA buffers)
-
-3.4 HID over ISH Client
-Location: drivers/hid/intel-ish-hid
-
-The ISHTP client driver is responsible for:
-- enumerate HID devices under FW ISH client
-- Get Report descriptor
-- Register with HID core as a LL driver
-- Process Get/Set feature request
-- Get input reports
-
-3.5 HID Sensor Hub MFD and IIO sensor drivers
-
-The functionality in these drivers is the same as an external sensor hub.
-Refer to
-Documentation/hid/hid-sensor.txt for HID sensor
-Documentation/ABI/testing/sysfs-bus-iio for IIO ABIs to user space
-
-3.6 End to End HID transport Sequence Diagram
-
-HID-ISH-CLN ISHTP IPC HW
- | | | |
- | | |-----WAKE UP------------------>|
- | | | |
- | | |-----HOST READY--------------->|
- | | | |
- | | |<----MNG_RESET_NOTIFY_ACK----- |
- | | | |
- | |<----ISHTP_START------ | |
- | | | |
- | |<-----------------HOST_START_RES_CMD-------------------|
- | | | |
- | |------------------QUERY_SUBSCRIBER-------------------->|
- | | | |
- | |------------------HOST_ENUM_REQ_CMD------------------->|
- | | | |
- | |<-----------------HOST_ENUM_RES_CMD--------------------|
- | | | |
- | |------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
- | | | |
- | |<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
- | Create new device on in ishtp bus | |
- | | | |
- | |------------------HOST_CLIENT_PROPERTIES_REQ_CMD------>|
- | | | |
- | |<-----------------HOST_CLIENT_PROPERTIES_RES_CMD-------|
- | Create new device on in ishtp bus | |
- | | | |
- | |--Repeat HOST_CLIENT_PROPERTIES_REQ_CMD-till last one--|
- | | | |
- probed()
- |----ishtp_cl_connect-->|----------------- CLIENT_CONNECT_REQ_CMD-------------->|
- | | | |
- | |<----------------CLIENT_CONNECT_RES_CMD----------------|
- | | | |
- |register event callback| | |
- | | | |
- |ishtp_cl_send(
- HOSTIF_DM_ENUM_DEVICES) |----------fill ishtp_msg_hdr struct write to HW----- >|
- | | | |
- | | |<-----IRQ(IPC_PROTOCOL_ISHTP---|
- | | | |
- |<--ENUM_DEVICE RSP-----| | |
- | | | |
-for each enumerated device
- |ishtp_cl_send(
- HOSTIF_GET_HID_DESCRIPTOR |----------fill ishtp_msg_hdr struct write to HW--- >|
- | | | |
- ...Response
- | | | |
-for each enumerated device
- |ishtp_cl_send(
- HOSTIF_GET_REPORT_DESCRIPTOR |----------fill ishtp_msg_hdr struct write to HW- >|
- | | | |
- | | | |
- hid_allocate_device
- | | | |
- hid_add_device | | |
- | | | |
-
-
-3.7 ISH Debugging
-
-To debug ISH, event tracing mechanism is used. To enable debug logs
-echo 1 > /sys/kernel/debug/tracing/events/intel_ish/enable
-cat sys/kernel/debug/tracing/trace
-
-3.8 ISH IIO sysfs Example on Lenovo thinkpad Yoga 260
-
-root@otcpl-ThinkPad-Yoga-260:~# tree -l /sys/bus/iio/devices/
-/sys/bus/iio/devices/
-├── iio:device0 -> ../../../devices/0044:8086:22D8.0001/HID-SENSOR-200073.9.auto/iio:device0
-│   ├── buffer
-│   │   ├── enable
-│   │   ├── length
-│   │   └── watermark
-...
-│   ├── in_accel_hysteresis
-│   ├── in_accel_offset
-│   ├── in_accel_sampling_frequency
-│   ├── in_accel_scale
-│   ├── in_accel_x_raw
-│   ├── in_accel_y_raw
-│   ├── in_accel_z_raw
-│   ├── name
-│   ├── scan_elements
-│   │   ├── in_accel_x_en
-│   │   ├── in_accel_x_index
-│   │   ├── in_accel_x_type
-│   │   ├── in_accel_y_en
-│   │   ├── in_accel_y_index
-│   │   ├── in_accel_y_type
-│   │   ├── in_accel_z_en
-│   │   ├── in_accel_z_index
-│   │   └── in_accel_z_type
-...
-│   │   ├── devices
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_intensity_both_raw
-│   │   │   │   ├── in_intensity_hysteresis
-│   │   │   │   ├── in_intensity_offset
-│   │   │   │   ├── in_intensity_sampling_frequency
-│   │   │   │   ├── in_intensity_scale
-│   │   │   │   ├── name
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_intensity_both_en
-│   │   │   │   │   ├── in_intensity_both_index
-│   │   │   │   │   └── in_intensity_both_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_magn_hysteresis
-│   │   │   │   ├── in_magn_offset
-│   │   │   │   ├── in_magn_sampling_frequency
-│   │   │   │   ├── in_magn_scale
-│   │   │   │   ├── in_magn_x_raw
-│   │   │   │   ├── in_magn_y_raw
-│   │   │   │   ├── in_magn_z_raw
-│   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_raw
-│   │   │   │   ├── in_rot_hysteresis
-│   │   │   │   ├── in_rot_offset
-│   │   │   │   ├── in_rot_sampling_frequency
-│   │   │   │   ├── in_rot_scale
-│   │   │   │   ├── name
-...
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_magn_x_en
-│   │   │   │   │   ├── in_magn_x_index
-│   │   │   │   │   ├── in_magn_x_type
-│   │   │   │   │   ├── in_magn_y_en
-│   │   │   │   │   ├── in_magn_y_index
-│   │   │   │   │   ├── in_magn_y_type
-│   │   │   │   │   ├── in_magn_z_en
-│   │   │   │   │   ├── in_magn_z_index
-│   │   │   │   │   ├── in_magn_z_type
-│   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_en
-│   │   │   │   │   ├── in_rot_from_north_magnetic_tilt_comp_index
-│   │   │   │   │   └── in_rot_from_north_magnetic_tilt_comp_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_anglvel_hysteresis
-│   │   │   │   ├── in_anglvel_offset
-│   │   │   │   ├── in_anglvel_sampling_frequency
-│   │   │   │   ├── in_anglvel_scale
-│   │   │   │   ├── in_anglvel_x_raw
-│   │   │   │   ├── in_anglvel_y_raw
-│   │   │   │   ├── in_anglvel_z_raw
-│   │   │   │   ├── name
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_anglvel_x_en
-│   │   │   │   │   ├── in_anglvel_x_index
-│   │   │   │   │   ├── in_anglvel_x_type
-│   │   │   │   │   ├── in_anglvel_y_en
-│   │   │   │   │   ├── in_anglvel_y_index
-│   │   │   │   │   ├── in_anglvel_y_type
-│   │   │   │   │   ├── in_anglvel_z_en
-│   │   │   │   │   ├── in_anglvel_z_index
-│   │   │   │   │   └── in_anglvel_z_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
-│   │   │   │   ├── buffer
-│   │   │   │   │   ├── enable
-│   │   │   │   │   ├── length
-│   │   │   │   │   └── watermark
-│   │   │   │   ├── dev
-│   │   │   │   ├── in_anglvel_hysteresis
-│   │   │   │   ├── in_anglvel_offset
-│   │   │   │   ├── in_anglvel_sampling_frequency
-│   │   │   │   ├── in_anglvel_scale
-│   │   │   │   ├── in_anglvel_x_raw
-│   │   │   │   ├── in_anglvel_y_raw
-│   │   │   │   ├── in_anglvel_z_raw
-│   │   │   │   ├── name
-│   │   │   │   ├── scan_elements
-│   │   │   │   │   ├── in_anglvel_x_en
-│   │   │   │   │   ├── in_anglvel_x_index
-│   │   │   │   │   ├── in_anglvel_x_type
-│   │   │   │   │   ├── in_anglvel_y_en
-│   │   │   │   │   ├── in_anglvel_y_index
-│   │   │   │   │   ├── in_anglvel_y_type
-│   │   │   │   │   ├── in_anglvel_z_en
-│   │   │   │   │   ├── in_anglvel_z_index
-│   │   │   │   │   └── in_anglvel_z_type
-│   │   │   │   ├── trigger
-│   │   │   │   │   └── current_trigger
-...
diff --git a/Documentation/hid/uhid.rst b/Documentation/hid/uhid.rst
new file mode 100644
index 000000000000..b18cb96c885f
--- /dev/null
+++ b/Documentation/hid/uhid.rst
@@ -0,0 +1,193 @@
+======================================================
+UHID - User-space I/O driver support for HID subsystem
+======================================================
+
+UHID allows user-space to implement HID transport drivers. Please see
+hid-transport.txt for an introduction into HID transport drivers. This document
+relies heavily on the definitions declared there.
+
+With UHID, a user-space transport driver can create kernel hid-devices for each
+device connected to the user-space controlled bus. The UHID API defines the I/O
+events provided from the kernel to user-space and vice versa.
+
+There is an example user-space application in ./samples/uhid/uhid-example.c
+
+The UHID API
+------------
+
+UHID is accessed through a character misc-device. The minor-number is allocated
+dynamically so you need to rely on udev (or similar) to create the device node.
+This is /dev/uhid by default.
+
+If a new device is detected by your HID I/O Driver and you want to register this
+device with the HID subsystem, then you need to open /dev/uhid once for each
+device you want to register. All further communication is done by read()'ing or
+write()'ing "struct uhid_event" objects. Non-blocking operations are supported
+by setting O_NONBLOCK::
+
+ struct uhid_event {
+ __u32 type;
+ union {
+ struct uhid_create2_req create2;
+ struct uhid_output_req output;
+ struct uhid_input2_req input2;
+ ...
+ } u;
+ };
+
+The "type" field contains the ID of the event. Depending on the ID different
+payloads are sent. You must not split a single event across multiple read()'s or
+multiple write()'s. A single event must always be sent as a whole. Furthermore,
+only a single event can be sent per read() or write(). Pending data is ignored.
+If you want to handle multiple events in a single syscall, then use vectored
+I/O with readv()/writev().
+The "type" field defines the payload. For each type, there is a
+payload-structure available in the union "u" (except for empty payloads). This
+payload contains management and/or device data.
+
+The first thing you should do is sending an UHID_CREATE2 event. This will
+register the device. UHID will respond with an UHID_START event. You can now
+start sending data to and reading data from UHID. However, unless UHID sends the
+UHID_OPEN event, the internally attached HID Device Driver has no user attached.
+That is, you might put your device asleep unless you receive the UHID_OPEN
+event. If you receive the UHID_OPEN event, you should start I/O. If the last
+user closes the HID device, you will receive an UHID_CLOSE event. This may be
+followed by an UHID_OPEN event again and so on. There is no need to perform
+reference-counting in user-space. That is, you will never receive multiple
+UHID_OPEN events without an UHID_CLOSE event. The HID subsystem performs
+ref-counting for you.
+You may decide to ignore UHID_OPEN/UHID_CLOSE, though. I/O is allowed even
+though the device may have no users.
+
+If you want to send data on the interrupt channel to the HID subsystem, you send
+an HID_INPUT2 event with your raw data payload. If the kernel wants to send data
+on the interrupt channel to the device, you will read an UHID_OUTPUT event.
+Data requests on the control channel are currently limited to GET_REPORT and
+SET_REPORT (no other data reports on the control channel are defined so far).
+Those requests are always synchronous. That means, the kernel sends
+UHID_GET_REPORT and UHID_SET_REPORT events and requires you to forward them to
+the device on the control channel. Once the device responds, you must forward
+the response via UHID_GET_REPORT_REPLY and UHID_SET_REPORT_REPLY to the kernel.
+The kernel blocks internal driver-execution during such round-trips (times out
+after a hard-coded period).
+
+If your device disconnects, you should send an UHID_DESTROY event. This will
+unregister the device. You can now send UHID_CREATE2 again to register a new
+device.
+If you close() the fd, the device is automatically unregistered and destroyed
+internally.
+
+write()
+-------
+write() allows you to modify the state of the device and feed input data into
+the kernel. The kernel will parse the event immediately and if the event ID is
+not supported, it will return -EOPNOTSUPP. If the payload is invalid, then
+-EINVAL is returned, otherwise, the amount of data that was read is returned and
+the request was handled successfully. O_NONBLOCK does not affect write() as
+writes are always handled immediately in a non-blocking fashion. Future requests
+might make use of O_NONBLOCK, though.
+
+UHID_CREATE2:
+ This creates the internal HID device. No I/O is possible until you send this
+ event to the kernel. The payload is of type struct uhid_create2_req and
+ contains information about your device. You can start I/O now.
+
+UHID_DESTROY:
+ This destroys the internal HID device. No further I/O will be accepted. There
+ may still be pending messages that you can receive with read() but no further
+ UHID_INPUT events can be sent to the kernel.
+ You can create a new device by sending UHID_CREATE2 again. There is no need to
+ reopen the character device.
+
+UHID_INPUT2:
+ You must send UHID_CREATE2 before sending input to the kernel! This event
+ contains a data-payload. This is the raw data that you read from your device
+ on the interrupt channel. The kernel will parse the HID reports.
+
+UHID_GET_REPORT_REPLY:
+ If you receive a UHID_GET_REPORT request you must answer with this request.
+ You must copy the "id" field from the request into the answer. Set the "err"
+ field to 0 if no error occurred or to EIO if an I/O error occurred.
+ If "err" is 0 then you should fill the buffer of the answer with the results
+ of the GET_REPORT request and set "size" correspondingly.
+
+UHID_SET_REPORT_REPLY:
+ This is the SET_REPORT equivalent of UHID_GET_REPORT_REPLY. Unlike GET_REPORT,
+ SET_REPORT never returns a data buffer, therefore, it's sufficient to set the
+ "id" and "err" fields correctly.
+
+read()
+------
+read() will return a queued output report. No reaction is required to any of
+them but you should handle them according to your needs.
+
+UHID_START:
+ This is sent when the HID device is started. Consider this as an answer to
+ UHID_CREATE2. This is always the first event that is sent. Note that this
+ event might not be available immediately after write(UHID_CREATE2) returns.
+ Device drivers might required delayed setups.
+ This event contains a payload of type uhid_start_req. The "dev_flags" field
+ describes special behaviors of a device. The following flags are defined:
+
+ - UHID_DEV_NUMBERED_FEATURE_REPORTS
+ - UHID_DEV_NUMBERED_OUTPUT_REPORTS
+ - UHID_DEV_NUMBERED_INPUT_REPORTS
+
+ Each of these flags defines whether a given report-type uses numbered
+ reports. If numbered reports are used for a type, all messages from
+ the kernel already have the report-number as prefix. Otherwise, no
+ prefix is added by the kernel.
+ For messages sent by user-space to the kernel, you must adjust the
+ prefixes according to these flags.
+
+UHID_STOP:
+ This is sent when the HID device is stopped. Consider this as an answer to
+ UHID_DESTROY.
+
+ If you didn't destroy your device via UHID_DESTROY, but the kernel sends an
+ UHID_STOP event, this should usually be ignored. It means that the kernel
+ reloaded/changed the device driver loaded on your HID device (or some other
+ maintenance actions happened).
+
+ You can usually ignored any UHID_STOP events safely.
+
+UHID_OPEN:
+ This is sent when the HID device is opened. That is, the data that the HID
+ device provides is read by some other process. You may ignore this event but
+ it is useful for power-management. As long as you haven't received this event
+ there is actually no other process that reads your data so there is no need to
+ send UHID_INPUT2 events to the kernel.
+
+UHID_CLOSE:
+ This is sent when there are no more processes which read the HID data. It is
+ the counterpart of UHID_OPEN and you may as well ignore this event.
+
+UHID_OUTPUT:
+ This is sent if the HID device driver wants to send raw data to the I/O
+ device on the interrupt channel. You should read the payload and forward it to
+ the device. The payload is of type "struct uhid_output_req".
+ This may be received even though you haven't received UHID_OPEN, yet.
+
+UHID_GET_REPORT:
+ This event is sent if the kernel driver wants to perform a GET_REPORT request
+ on the control channeld as described in the HID specs. The report-type and
+ report-number are available in the payload.
+ The kernel serializes GET_REPORT requests so there will never be two in
+ parallel. However, if you fail to respond with a UHID_GET_REPORT_REPLY, the
+ request might silently time out.
+ Once you read a GET_REPORT request, you shall forward it to the hid device and
+ remember the "id" field in the payload. Once your hid device responds to the
+ GET_REPORT (or if it fails), you must send a UHID_GET_REPORT_REPLY to the
+ kernel with the exact same "id" as in the request. If the request already
+ timed out, the kernel will ignore the response silently. The "id" field is
+ never re-used, so conflicts cannot happen.
+
+UHID_SET_REPORT:
+ This is the SET_REPORT equivalent of UHID_GET_REPORT. On receipt, you shall
+ send a SET_REPORT request to your hid device. Once it replies, you must tell
+ the kernel about it via UHID_SET_REPORT_REPLY.
+ The same restrictions as for UHID_GET_REPORT apply.
+
+----------------------------------------------------
+
+Written 2012, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hid/uhid.txt b/Documentation/hid/uhid.txt
deleted file mode 100644
index 958fff945304..000000000000
--- a/Documentation/hid/uhid.txt
+++ /dev/null
@@ -1,187 +0,0 @@
- UHID - User-space I/O driver support for HID subsystem
- ========================================================
-
-UHID allows user-space to implement HID transport drivers. Please see
-hid-transport.txt for an introduction into HID transport drivers. This document
-relies heavily on the definitions declared there.
-
-With UHID, a user-space transport driver can create kernel hid-devices for each
-device connected to the user-space controlled bus. The UHID API defines the I/O
-events provided from the kernel to user-space and vice versa.
-
-There is an example user-space application in ./samples/uhid/uhid-example.c
-
-The UHID API
-------------
-
-UHID is accessed through a character misc-device. The minor-number is allocated
-dynamically so you need to rely on udev (or similar) to create the device node.
-This is /dev/uhid by default.
-
-If a new device is detected by your HID I/O Driver and you want to register this
-device with the HID subsystem, then you need to open /dev/uhid once for each
-device you want to register. All further communication is done by read()'ing or
-write()'ing "struct uhid_event" objects. Non-blocking operations are supported
-by setting O_NONBLOCK.
-
-struct uhid_event {
- __u32 type;
- union {
- struct uhid_create2_req create2;
- struct uhid_output_req output;
- struct uhid_input2_req input2;
- ...
- } u;
-};
-
-The "type" field contains the ID of the event. Depending on the ID different
-payloads are sent. You must not split a single event across multiple read()'s or
-multiple write()'s. A single event must always be sent as a whole. Furthermore,
-only a single event can be sent per read() or write(). Pending data is ignored.
-If you want to handle multiple events in a single syscall, then use vectored
-I/O with readv()/writev().
-The "type" field defines the payload. For each type, there is a
-payload-structure available in the union "u" (except for empty payloads). This
-payload contains management and/or device data.
-
-The first thing you should do is sending an UHID_CREATE2 event. This will
-register the device. UHID will respond with an UHID_START event. You can now
-start sending data to and reading data from UHID. However, unless UHID sends the
-UHID_OPEN event, the internally attached HID Device Driver has no user attached.
-That is, you might put your device asleep unless you receive the UHID_OPEN
-event. If you receive the UHID_OPEN event, you should start I/O. If the last
-user closes the HID device, you will receive an UHID_CLOSE event. This may be
-followed by an UHID_OPEN event again and so on. There is no need to perform
-reference-counting in user-space. That is, you will never receive multiple
-UHID_OPEN events without an UHID_CLOSE event. The HID subsystem performs
-ref-counting for you.
-You may decide to ignore UHID_OPEN/UHID_CLOSE, though. I/O is allowed even
-though the device may have no users.
-
-If you want to send data on the interrupt channel to the HID subsystem, you send
-an HID_INPUT2 event with your raw data payload. If the kernel wants to send data
-on the interrupt channel to the device, you will read an UHID_OUTPUT event.
-Data requests on the control channel are currently limited to GET_REPORT and
-SET_REPORT (no other data reports on the control channel are defined so far).
-Those requests are always synchronous. That means, the kernel sends
-UHID_GET_REPORT and UHID_SET_REPORT events and requires you to forward them to
-the device on the control channel. Once the device responds, you must forward
-the response via UHID_GET_REPORT_REPLY and UHID_SET_REPORT_REPLY to the kernel.
-The kernel blocks internal driver-execution during such round-trips (times out
-after a hard-coded period).
-
-If your device disconnects, you should send an UHID_DESTROY event. This will
-unregister the device. You can now send UHID_CREATE2 again to register a new
-device.
-If you close() the fd, the device is automatically unregistered and destroyed
-internally.
-
-write()
--------
-write() allows you to modify the state of the device and feed input data into
-the kernel. The kernel will parse the event immediately and if the event ID is
-not supported, it will return -EOPNOTSUPP. If the payload is invalid, then
--EINVAL is returned, otherwise, the amount of data that was read is returned and
-the request was handled successfully. O_NONBLOCK does not affect write() as
-writes are always handled immediately in a non-blocking fashion. Future requests
-might make use of O_NONBLOCK, though.
-
- UHID_CREATE2:
- This creates the internal HID device. No I/O is possible until you send this
- event to the kernel. The payload is of type struct uhid_create2_req and
- contains information about your device. You can start I/O now.
-
- UHID_DESTROY:
- This destroys the internal HID device. No further I/O will be accepted. There
- may still be pending messages that you can receive with read() but no further
- UHID_INPUT events can be sent to the kernel.
- You can create a new device by sending UHID_CREATE2 again. There is no need to
- reopen the character device.
-
- UHID_INPUT2:
- You must send UHID_CREATE2 before sending input to the kernel! This event
- contains a data-payload. This is the raw data that you read from your device
- on the interrupt channel. The kernel will parse the HID reports.
-
- UHID_GET_REPORT_REPLY:
- If you receive a UHID_GET_REPORT request you must answer with this request.
- You must copy the "id" field from the request into the answer. Set the "err"
- field to 0 if no error occurred or to EIO if an I/O error occurred.
- If "err" is 0 then you should fill the buffer of the answer with the results
- of the GET_REPORT request and set "size" correspondingly.
-
- UHID_SET_REPORT_REPLY:
- This is the SET_REPORT equivalent of UHID_GET_REPORT_REPLY. Unlike GET_REPORT,
- SET_REPORT never returns a data buffer, therefore, it's sufficient to set the
- "id" and "err" fields correctly.
-
-read()
-------
-read() will return a queued output report. No reaction is required to any of
-them but you should handle them according to your needs.
-
- UHID_START:
- This is sent when the HID device is started. Consider this as an answer to
- UHID_CREATE2. This is always the first event that is sent. Note that this
- event might not be available immediately after write(UHID_CREATE2) returns.
- Device drivers might required delayed setups.
- This event contains a payload of type uhid_start_req. The "dev_flags" field
- describes special behaviors of a device. The following flags are defined:
- UHID_DEV_NUMBERED_FEATURE_REPORTS:
- UHID_DEV_NUMBERED_OUTPUT_REPORTS:
- UHID_DEV_NUMBERED_INPUT_REPORTS:
- Each of these flags defines whether a given report-type uses numbered
- reports. If numbered reports are used for a type, all messages from
- the kernel already have the report-number as prefix. Otherwise, no
- prefix is added by the kernel.
- For messages sent by user-space to the kernel, you must adjust the
- prefixes according to these flags.
-
- UHID_STOP:
- This is sent when the HID device is stopped. Consider this as an answer to
- UHID_DESTROY.
- If you didn't destroy your device via UHID_DESTROY, but the kernel sends an
- UHID_STOP event, this should usually be ignored. It means that the kernel
- reloaded/changed the device driver loaded on your HID device (or some other
- maintenance actions happened).
- You can usually ignored any UHID_STOP events safely.
-
- UHID_OPEN:
- This is sent when the HID device is opened. That is, the data that the HID
- device provides is read by some other process. You may ignore this event but
- it is useful for power-management. As long as you haven't received this event
- there is actually no other process that reads your data so there is no need to
- send UHID_INPUT2 events to the kernel.
-
- UHID_CLOSE:
- This is sent when there are no more processes which read the HID data. It is
- the counterpart of UHID_OPEN and you may as well ignore this event.
-
- UHID_OUTPUT:
- This is sent if the HID device driver wants to send raw data to the I/O
- device on the interrupt channel. You should read the payload and forward it to
- the device. The payload is of type "struct uhid_output_req".
- This may be received even though you haven't received UHID_OPEN, yet.
-
- UHID_GET_REPORT:
- This event is sent if the kernel driver wants to perform a GET_REPORT request
- on the control channeld as described in the HID specs. The report-type and
- report-number are available in the payload.
- The kernel serializes GET_REPORT requests so there will never be two in
- parallel. However, if you fail to respond with a UHID_GET_REPORT_REPLY, the
- request might silently time out.
- Once you read a GET_REPORT request, you shall forward it to the hid device and
- remember the "id" field in the payload. Once your hid device responds to the
- GET_REPORT (or if it fails), you must send a UHID_GET_REPORT_REPLY to the
- kernel with the exact same "id" as in the request. If the request already
- timed out, the kernel will ignore the response silently. The "id" field is
- never re-used, so conflicts cannot happen.
-
- UHID_SET_REPORT:
- This is the SET_REPORT equivalent of UHID_GET_REPORT. On receipt, you shall
- send a SET_REPORT request to your hid device. Once it replies, you must tell
- the kernel about it via UHID_SET_REPORT_REPLY.
- The same restrictions as for UHID_GET_REPORT apply.
-
-----------------------------------------------------
-Written 2012, David Herrmann <dh.herrmann@gmail.com>
diff --git a/Documentation/hwmon/pxe1610 b/Documentation/hwmon/pxe1610
new file mode 100644
index 000000000000..211cedeefb44
--- /dev/null
+++ b/Documentation/hwmon/pxe1610
@@ -0,0 +1,90 @@
+Kernel driver pxe1610
+=====================
+
+Supported chips:
+ * Infineon PXE1610
+ Prefix: 'pxe1610'
+ Addresses scanned: -
+ Datasheet: Datasheet is not publicly available.
+
+ * Infineon PXE1110
+ Prefix: 'pxe1110'
+ Addresses scanned: -
+ Datasheet: Datasheet is not publicly available.
+
+ * Infineon PXM1310
+ Prefix: 'pxm1310'
+ Addresses scanned: -
+ Datasheet: Datasheet is not publicly available.
+
+Author: Vijay Khemka <vijaykhemka@fb.com>
+
+
+Description
+-----------
+
+PXE1610/PXE1110 are Multi-rail/Multiphase Digital Controllers
+and compliant to
+ -- Intel VR13 DC-DC converter specifications.
+ -- Intel SVID protocol.
+Used for Vcore power regulation for Intel VR13 based microprocessors
+ -- Servers, Workstations, and High-end desktops
+
+PXM1310 is a Multi-rail Controller and it is compliant to
+ -- Intel VR13 DC-DC converter specifications.
+ -- Intel SVID protocol.
+Used for DDR3/DDR4 Memory power regulation for Intel VR13 and
+IMVP8 based systems
+
+
+Usage Notes
+-----------
+
+This driver does not probe for PMBus devices. You will have
+to instantiate devices explicitly.
+
+Example: the following commands will load the driver for an PXE1610
+at address 0x70 on I2C bus #4:
+
+# modprobe pxe1610
+# echo pxe1610 0x70 > /sys/bus/i2c/devices/i2c-4/new_device
+
+It can also be instantiated by declaring in device tree
+
+
+Sysfs attributes
+----------------
+
+curr1_label "iin"
+curr1_input Measured input current
+curr1_alarm Current high alarm
+
+curr[2-4]_label "iout[1-3]"
+curr[2-4]_input Measured output current
+curr[2-4]_crit Critical maximum current
+curr[2-4]_crit_alarm Current critical high alarm
+
+in1_label "vin"
+in1_input Measured input voltage
+in1_crit Critical maximum input voltage
+in1_crit_alarm Input voltage critical high alarm
+
+in[2-4]_label "vout[1-3]"
+in[2-4]_input Measured output voltage
+in[2-4]_lcrit Critical minimum output voltage
+in[2-4]_lcrit_alarm Output voltage critical low alarm
+in[2-4]_crit Critical maximum output voltage
+in[2-4]_crit_alarm Output voltage critical high alarm
+
+power1_label "pin"
+power1_input Measured input power
+power1_alarm Input power high alarm
+
+power[2-4]_label "pout[1-3]"
+power[2-4]_input Measured output power
+
+temp[1-3]_input Measured temperature
+temp[1-3]_crit Critical high temperature
+temp[1-3]_crit_alarm Chip temperature critical high alarm
+temp[1-3]_max Maximum temperature
+temp[1-3]_max_alarm Chip temperature high alarm
diff --git a/Documentation/hwmon/submitting-patches.rst b/Documentation/hwmon/submitting-patches.rst
index f9796b9d9db6..452fc28d8e0b 100644
--- a/Documentation/hwmon/submitting-patches.rst
+++ b/Documentation/hwmon/submitting-patches.rst
@@ -89,7 +89,7 @@ increase the chances of your change being accepted.
console. Excessive logging can seriously affect system performance.
* Use devres functions whenever possible to allocate resources. For rationale
- and supported functions, please see Documentation/driver-model/devres.txt.
+ and supported functions, please see Documentation/driver-api/driver-model/devres.rst.
If a function is not supported by devres, consider using devm_add_action().
* If the driver has a detect function, make sure it is silent. Debug messages
diff --git a/Documentation/hwspinlock.txt b/Documentation/hwspinlock.txt
index ed640a278185..6f03713b7003 100644
--- a/Documentation/hwspinlock.txt
+++ b/Documentation/hwspinlock.txt
@@ -136,6 +136,39 @@ The function will never sleep.
::
+ int hwspin_lock_timeout_raw(struct hwspinlock *hwlock, unsigned int timeout);
+
+Lock a previously-assigned hwspinlock with a timeout limit (specified in
+msecs). If the hwspinlock is already taken, the function will busy loop
+waiting for it to be released, but give up when the timeout elapses.
+
+Caution: User must protect the routine of getting hardware lock with mutex
+or spinlock to avoid dead-lock, that will let user can do some time-consuming
+or sleepable operations under the hardware lock.
+
+Returns 0 when successful and an appropriate error code otherwise (most
+notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
+
+The function will never sleep.
+
+::
+
+ int hwspin_lock_timeout_in_atomic(struct hwspinlock *hwlock, unsigned int to);
+
+Lock a previously-assigned hwspinlock with a timeout limit (specified in
+msecs). If the hwspinlock is already taken, the function will busy loop
+waiting for it to be released, but give up when the timeout elapses.
+
+This function shall be called only from an atomic context and the timeout
+value shall not exceed a few msecs.
+
+Returns 0 when successful and an appropriate error code otherwise (most
+notably -ETIMEDOUT if the hwspinlock is still busy after timeout msecs).
+
+The function will never sleep.
+
+::
+
int hwspin_trylock(struct hwspinlock *hwlock);
@@ -186,6 +219,34 @@ The function will never sleep.
::
+ int hwspin_trylock_raw(struct hwspinlock *hwlock);
+
+Attempt to lock a previously-assigned hwspinlock, but immediately fail if
+it is already taken.
+
+Caution: User must protect the routine of getting hardware lock with mutex
+or spinlock to avoid dead-lock, that will let user can do some time-consuming
+or sleepable operations under the hardware lock.
+
+Returns 0 on success and an appropriate error code otherwise (most
+notably -EBUSY if the hwspinlock was already taken).
+The function will never sleep.
+
+::
+
+ int hwspin_trylock_in_atomic(struct hwspinlock *hwlock);
+
+Attempt to lock a previously-assigned hwspinlock, but immediately fail if
+it is already taken.
+
+This function shall be called only from an atomic context.
+
+Returns 0 on success and an appropriate error code otherwise (most
+notably -EBUSY if the hwspinlock was already taken).
+The function will never sleep.
+
+::
+
void hwspin_unlock(struct hwspinlock *hwlock);
Unlock a previously-locked hwspinlock. Always succeed, and can be called
@@ -222,6 +283,26 @@ the given flags. This function will never sleep.
::
+ void hwspin_unlock_raw(struct hwspinlock *hwlock);
+
+Unlock a previously-locked hwspinlock.
+
+The caller should **never** unlock an hwspinlock which is already unlocked.
+Doing so is considered a bug (there is no protection against this).
+This function will never sleep.
+
+::
+
+ void hwspin_unlock_in_atomic(struct hwspinlock *hwlock);
+
+Unlock a previously-locked hwspinlock.
+
+The caller should **never** unlock an hwspinlock which is already unlocked.
+Doing so is considered a bug (there is no protection against this).
+This function will never sleep.
+
+::
+
int hwspin_lock_get_id(struct hwspinlock *hwlock);
Retrieve id number of a given hwspinlock. This is needed when an
diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801
index ee9984f35868..f426c13c63a9 100644
--- a/Documentation/i2c/busses/i2c-i801
+++ b/Documentation/i2c/busses/i2c-i801
@@ -37,6 +37,8 @@ Supported adapters:
* Intel Cedar Fork (PCH)
* Intel Ice Lake (PCH)
* Intel Comet Lake (PCH)
+ * Intel Elkhart Lake (PCH)
+ * Intel Tiger Lake (PCH)
Datasheets: Publicly available at the Intel website
On Intel Patsburg and later chipsets, both the normal host SMBus controller
@@ -58,6 +60,7 @@ question doesn't work as intended for whatever reason. Bit values:
0x02 disable the block buffer
0x08 disable the I2C block read functionality
0x10 don't use interrupts
+ 0x20 disable SMBus Host Notify
Description
@@ -88,7 +91,7 @@ SMBus controller.
Process Call Support
--------------------
-Not supported.
+Block process call is supported on the 82801EB (ICH5) and later chips.
I2C Block Read Support
@@ -118,16 +121,15 @@ BIOS to enable it, it means it has been hidden by the BIOS code. Asus is
well known for first doing this on their P4B motherboard, and many other
boards after that. Some vendor machines are affected as well.
-The first thing to try is the "i2c_ec" ACPI driver. It could be that the
+The first thing to try is the "i2c-scmi" ACPI driver. It could be that the
SMBus was hidden on purpose because it'll be driven by ACPI. If the
-i2c_ec driver works for you, just forget about the i2c-i801 driver and
-don't try to unhide the ICH SMBus. Even if i2c_ec doesn't work, you
+i2c-scmi driver works for you, just forget about the i2c-i801 driver and
+don't try to unhide the ICH SMBus. Even if i2c-scmi doesn't work, you
better make sure that the SMBus isn't used by the ACPI code. Try loading
-the "fan" and "thermal" drivers, and check in /proc/acpi/fan and
-/proc/acpi/thermal_zone. If you find anything there, it's likely that
-the ACPI is accessing the SMBus and it's safer not to unhide it. Only
-once you are certain that ACPI isn't using the SMBus, you can attempt
-to unhide it.
+the "fan" and "thermal" drivers, and check in /sys/class/thermal. If you
+find a thermal zone with type "acpitz", it's likely that the ACPI is
+accessing the SMBus and it's safer not to unhide it. Only once you are
+certain that ACPI isn't using the SMBus, you can attempt to unhide it.
In order to unhide the SMBus, we need to change the value of a PCI
register before the kernel enumerates the PCI devices. This is done in
diff --git a/Documentation/ia64/IRQ-redir.txt b/Documentation/ia64/IRQ-redir.txt
deleted file mode 100644
index f7bd72261283..000000000000
--- a/Documentation/ia64/IRQ-redir.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-IRQ affinity on IA64 platforms
-------------------------------
- 07.01.2002, Erich Focht <efocht@ess.nec.de>
-
-
-By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be
-controlled. The behavior on IA64 platforms is slightly different from
-that described in Documentation/IRQ-affinity.txt for i386 systems.
-
-Because of the usage of SAPIC mode and physical destination mode the
-IRQ target is one particular CPU and cannot be a mask of several
-CPUs. Only the first non-zero bit is taken into account.
-
-
-Usage examples:
-
-The target CPU has to be specified as a hexadecimal CPU mask. The
-first non-zero bit is the selected CPU. This format has been kept for
-compatibility reasons with i386.
-
-Set the delivery mode of interrupt 41 to fixed and route the
-interrupts to CPU #3 (logical CPU number) (2^3=0x08):
- echo "8" >/proc/irq/41/smp_affinity
-
-Set the default route for IRQ number 41 to CPU 6 in lowest priority
-delivery mode (redirectable):
- echo "r 40" >/proc/irq/41/smp_affinity
-
-The output of the command
- cat /proc/irq/IRQ#/smp_affinity
-gives the target CPU mask for the specified interrupt vector. If the CPU
-mask is preceded by the character "r", the interrupt is redirectable
-(i.e. lowest priority mode routing is used), otherwise its route is
-fixed.
-
-
-
-Initialization and default behavior:
-
-If the platform features IRQ redirection (info provided by SAL) all
-IO-SAPIC interrupts are initialized with CPU#0 as their default target
-and the routing is the so called "lowest priority mode" (actually
-fixed SAPIC mode with hint). The XTP chipset registers are used as hints
-for the IRQ routing. Currently in Linux XTP registers can have three
-values:
- - minimal for an idle task,
- - normal if any other task runs,
- - maximal if the CPU is going to be switched off.
-The IRQ is routed to the CPU with lowest XTP register value, the
-search begins at the default CPU. Therefore most of the interrupts
-will be handled by CPU #0.
-
-If the platform doesn't feature interrupt redirection IOSAPIC fixed
-routing is used. The target CPUs are distributed in a round robin
-manner. IRQs will be routed only to the selected target CPUs. Check
-with
- cat /proc/interrupts
-
-
-
-Comments:
-
-On large (multi-node) systems it is recommended to route the IRQs to
-the node to which the corresponding device is connected.
-For systems like the NEC AzusA we get IRQ node-affinity for free. This
-is because usually the chipsets on each node redirect the interrupts
-only to their own CPUs (as they cannot see the XTP registers on the
-other nodes).
-
diff --git a/Documentation/ia64/README b/Documentation/ia64/README
deleted file mode 100644
index aa17f2154cba..000000000000
--- a/Documentation/ia64/README
+++ /dev/null
@@ -1,43 +0,0 @@
- Linux kernel release 2.4.xx for the IA-64 Platform
-
- These are the release notes for Linux version 2.4 for IA-64
- platform. This document provides information specific to IA-64
- ONLY, to get additional information about the Linux kernel also
- read the original Linux README provided with the kernel.
-
-INSTALLING the kernel:
-
- - IA-64 kernel installation is the same as the other platforms, see
- original README for details.
-
-
-SOFTWARE REQUIREMENTS
-
- Compiling and running this kernel requires an IA-64 compliant GCC
- compiler. And various software packages also compiled with an
- IA-64 compliant GCC compiler.
-
-
-CONFIGURING the kernel:
-
- Configuration is the same, see original README for details.
-
-
-COMPILING the kernel:
-
- - Compiling this kernel doesn't differ from other platform so read
- the original README for details BUT make sure you have an IA-64
- compliant GCC compiler.
-
-IA-64 SPECIFICS
-
- - General issues:
-
- o Hardly any performance tuning has been done. Obvious targets
- include the library routines (IP checksum, etc.). Less
- obvious targets include making sure we don't flush the TLB
- needlessly, etc.
-
- o SMP locks cleanup/optimization
-
- o IA32 support. Currently experimental. It mostly works.
diff --git a/Documentation/ia64/aliasing.rst b/Documentation/ia64/aliasing.rst
new file mode 100644
index 000000000000..a08b36aba015
--- /dev/null
+++ b/Documentation/ia64/aliasing.rst
@@ -0,0 +1,246 @@
+==================================
+Memory Attribute Aliasing on IA-64
+==================================
+
+Bjorn Helgaas <bjorn.helgaas@hp.com>
+
+May 4, 2006
+
+
+Memory Attributes
+=================
+
+ Itanium supports several attributes for virtual memory references.
+ The attribute is part of the virtual translation, i.e., it is
+ contained in the TLB entry. The ones of most interest to the Linux
+ kernel are:
+
+ == ======================
+ WB Write-back (cacheable)
+ UC Uncacheable
+ WC Write-coalescing
+ == ======================
+
+ System memory typically uses the WB attribute. The UC attribute is
+ used for memory-mapped I/O devices. The WC attribute is uncacheable
+ like UC is, but writes may be delayed and combined to increase
+ performance for things like frame buffers.
+
+ The Itanium architecture requires that we avoid accessing the same
+ page with both a cacheable mapping and an uncacheable mapping[1].
+
+ The design of the chipset determines which attributes are supported
+ on which regions of the address space. For example, some chipsets
+ support either WB or UC access to main memory, while others support
+ only WB access.
+
+Memory Map
+==========
+
+ Platform firmware describes the physical memory map and the
+ supported attributes for each region. At boot-time, the kernel uses
+ the EFI GetMemoryMap() interface. ACPI can also describe memory
+ devices and the attributes they support, but Linux/ia64 currently
+ doesn't use this information.
+
+ The kernel uses the efi_memmap table returned from GetMemoryMap() to
+ learn the attributes supported by each region of physical address
+ space. Unfortunately, this table does not completely describe the
+ address space because some machines omit some or all of the MMIO
+ regions from the map.
+
+ The kernel maintains another table, kern_memmap, which describes the
+ memory Linux is actually using and the attribute for each region.
+ This contains only system memory; it does not contain MMIO space.
+
+ The kern_memmap table typically contains only a subset of the system
+ memory described by the efi_memmap. Linux/ia64 can't use all memory
+ in the system because of constraints imposed by the identity mapping
+ scheme.
+
+ The efi_memmap table is preserved unmodified because the original
+ boot-time information is required for kexec.
+
+Kernel Identify Mappings
+========================
+
+ Linux/ia64 identity mappings are done with large pages, currently
+ either 16MB or 64MB, referred to as "granules." Cacheable mappings
+ are speculative[2], so the processor can read any location in the
+ page at any time, independent of the programmer's intentions. This
+ means that to avoid attribute aliasing, Linux can create a cacheable
+ identity mapping only when the entire granule supports cacheable
+ access.
+
+ Therefore, kern_memmap contains only full granule-sized regions that
+ can referenced safely by an identity mapping.
+
+ Uncacheable mappings are not speculative, so the processor will
+ generate UC accesses only to locations explicitly referenced by
+ software. This allows UC identity mappings to cover granules that
+ are only partially populated, or populated with a combination of UC
+ and WB regions.
+
+User Mappings
+=============
+
+ User mappings are typically done with 16K or 64K pages. The smaller
+ page size allows more flexibility because only 16K or 64K has to be
+ homogeneous with respect to memory attributes.
+
+Potential Attribute Aliasing Cases
+==================================
+
+ There are several ways the kernel creates new mappings:
+
+mmap of /dev/mem
+----------------
+
+ This uses remap_pfn_range(), which creates user mappings. These
+ mappings may be either WB or UC. If the region being mapped
+ happens to be in kern_memmap, meaning that it may also be mapped
+ by a kernel identity mapping, the user mapping must use the same
+ attribute as the kernel mapping.
+
+ If the region is not in kern_memmap, the user mapping should use
+ an attribute reported as being supported in the EFI memory map.
+
+ Since the EFI memory map does not describe MMIO on some
+ machines, this should use an uncacheable mapping as a fallback.
+
+mmap of /sys/class/pci_bus/.../legacy_mem
+-----------------------------------------
+
+ This is very similar to mmap of /dev/mem, except that legacy_mem
+ only allows mmap of the one megabyte "legacy MMIO" area for a
+ specific PCI bus. Typically this is the first megabyte of
+ physical address space, but it may be different on machines with
+ several VGA devices.
+
+ "X" uses this to access VGA frame buffers. Using legacy_mem
+ rather than /dev/mem allows multiple instances of X to talk to
+ different VGA cards.
+
+ The /dev/mem mmap constraints apply.
+
+mmap of /proc/bus/pci/.../??.?
+------------------------------
+
+ This is an MMIO mmap of PCI functions, which additionally may or
+ may not be requested as using the WC attribute.
+
+ If WC is requested, and the region in kern_memmap is either WC
+ or UC, and the EFI memory map designates the region as WC, then
+ the WC mapping is allowed.
+
+ Otherwise, the user mapping must use the same attribute as the
+ kernel mapping.
+
+read/write of /dev/mem
+----------------------
+
+ This uses copy_from_user(), which implicitly uses a kernel
+ identity mapping. This is obviously safe for things in
+ kern_memmap.
+
+ There may be corner cases of things that are not in kern_memmap,
+ but could be accessed this way. For example, registers in MMIO
+ space are not in kern_memmap, but could be accessed with a UC
+ mapping. This would not cause attribute aliasing. But
+ registers typically can be accessed only with four-byte or
+ eight-byte accesses, and the copy_from_user() path doesn't allow
+ any control over the access size, so this would be dangerous.
+
+ioremap()
+---------
+
+ This returns a mapping for use inside the kernel.
+
+ If the region is in kern_memmap, we should use the attribute
+ specified there.
+
+ If the EFI memory map reports that the entire granule supports
+ WB, we should use that (granules that are partially reserved
+ or occupied by firmware do not appear in kern_memmap).
+
+ If the granule contains non-WB memory, but we can cover the
+ region safely with kernel page table mappings, we can use
+ ioremap_page_range() as most other architectures do.
+
+ Failing all of the above, we have to fall back to a UC mapping.
+
+Past Problem Cases
+==================
+
+mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
+--------------------------------------------------------------------
+
+ The EFI memory map may not report these MMIO regions.
+
+ These must be allowed so that X will work. This means that
+ when the EFI memory map is incomplete, every /dev/mem mmap must
+ succeed. It may create either WB or UC user mappings, depending
+ on whether the region is in kern_memmap or the EFI memory map.
+
+mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
+----------------------------------------------------------------------
+
+ The EFI memory map reports the following attributes:
+
+ =============== ======= ==================
+ 0x00000-0x9FFFF WB only
+ 0xA0000-0xBFFFF UC only (VGA frame buffer)
+ 0xC0000-0xFFFFF WB only
+ =============== ======= ==================
+
+ This mmap is done with user pages, not kernel identity mappings,
+ so it is safe to use WB mappings.
+
+ The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
+ which uses a granule-sized UC mapping. This granule will cover some
+ WB-only memory, but since UC is non-speculative, the processor will
+ never generate an uncacheable reference to the WB-only areas unless
+ the driver explicitly touches them.
+
+mmap of 0x0-0xFFFFF legacy_mem by "X"
+-------------------------------------
+
+ If the EFI memory map reports that the entire range supports the
+ same attributes, we can allow the mmap (and we will prefer WB if
+ supported, as is the case with HP sx[12]000 machines with VGA
+ disabled).
+
+ If EFI reports the range as partly WB and partly UC (as on sx[12]000
+ machines with VGA enabled), we must fail the mmap because there's no
+ safe attribute to use.
+
+ If EFI reports some of the range but not all (as on Intel firmware
+ that doesn't report the VGA frame buffer at all), we should fail the
+ mmap and force the user to map just the specific region of interest.
+
+mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
+------------------------------------------------------------------------
+
+ The EFI memory map reports the following attributes::
+
+ 0x00000-0xFFFFF WB only (no VGA MMIO hole)
+
+ This is a special case of the previous case, and the mmap should
+ fail for the same reason as above.
+
+read of /sys/devices/.../rom
+----------------------------
+
+ For VGA devices, this may cause an ioremap() of 0xC0000. This
+ used to be done with a UC mapping, because the VGA frame buffer
+ at 0xA0000 prevents use of a WB granule. The UC mapping causes
+ an MCA on HP sx[12]000 chipsets.
+
+ We should use WB page table mappings to avoid covering the VGA
+ frame buffer.
+
+Notes
+=====
+
+ [1] SDM rev 2.2, vol 2, sec 4.4.1.
+ [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt
deleted file mode 100644
index 5a4dea6abebd..000000000000
--- a/Documentation/ia64/aliasing.txt
+++ /dev/null
@@ -1,221 +0,0 @@
- MEMORY ATTRIBUTE ALIASING ON IA-64
-
- Bjorn Helgaas
- <bjorn.helgaas@hp.com>
- May 4, 2006
-
-
-MEMORY ATTRIBUTES
-
- Itanium supports several attributes for virtual memory references.
- The attribute is part of the virtual translation, i.e., it is
- contained in the TLB entry. The ones of most interest to the Linux
- kernel are:
-
- WB Write-back (cacheable)
- UC Uncacheable
- WC Write-coalescing
-
- System memory typically uses the WB attribute. The UC attribute is
- used for memory-mapped I/O devices. The WC attribute is uncacheable
- like UC is, but writes may be delayed and combined to increase
- performance for things like frame buffers.
-
- The Itanium architecture requires that we avoid accessing the same
- page with both a cacheable mapping and an uncacheable mapping[1].
-
- The design of the chipset determines which attributes are supported
- on which regions of the address space. For example, some chipsets
- support either WB or UC access to main memory, while others support
- only WB access.
-
-MEMORY MAP
-
- Platform firmware describes the physical memory map and the
- supported attributes for each region. At boot-time, the kernel uses
- the EFI GetMemoryMap() interface. ACPI can also describe memory
- devices and the attributes they support, but Linux/ia64 currently
- doesn't use this information.
-
- The kernel uses the efi_memmap table returned from GetMemoryMap() to
- learn the attributes supported by each region of physical address
- space. Unfortunately, this table does not completely describe the
- address space because some machines omit some or all of the MMIO
- regions from the map.
-
- The kernel maintains another table, kern_memmap, which describes the
- memory Linux is actually using and the attribute for each region.
- This contains only system memory; it does not contain MMIO space.
-
- The kern_memmap table typically contains only a subset of the system
- memory described by the efi_memmap. Linux/ia64 can't use all memory
- in the system because of constraints imposed by the identity mapping
- scheme.
-
- The efi_memmap table is preserved unmodified because the original
- boot-time information is required for kexec.
-
-KERNEL IDENTITY MAPPINGS
-
- Linux/ia64 identity mappings are done with large pages, currently
- either 16MB or 64MB, referred to as "granules." Cacheable mappings
- are speculative[2], so the processor can read any location in the
- page at any time, independent of the programmer's intentions. This
- means that to avoid attribute aliasing, Linux can create a cacheable
- identity mapping only when the entire granule supports cacheable
- access.
-
- Therefore, kern_memmap contains only full granule-sized regions that
- can referenced safely by an identity mapping.
-
- Uncacheable mappings are not speculative, so the processor will
- generate UC accesses only to locations explicitly referenced by
- software. This allows UC identity mappings to cover granules that
- are only partially populated, or populated with a combination of UC
- and WB regions.
-
-USER MAPPINGS
-
- User mappings are typically done with 16K or 64K pages. The smaller
- page size allows more flexibility because only 16K or 64K has to be
- homogeneous with respect to memory attributes.
-
-POTENTIAL ATTRIBUTE ALIASING CASES
-
- There are several ways the kernel creates new mappings:
-
- mmap of /dev/mem
-
- This uses remap_pfn_range(), which creates user mappings. These
- mappings may be either WB or UC. If the region being mapped
- happens to be in kern_memmap, meaning that it may also be mapped
- by a kernel identity mapping, the user mapping must use the same
- attribute as the kernel mapping.
-
- If the region is not in kern_memmap, the user mapping should use
- an attribute reported as being supported in the EFI memory map.
-
- Since the EFI memory map does not describe MMIO on some
- machines, this should use an uncacheable mapping as a fallback.
-
- mmap of /sys/class/pci_bus/.../legacy_mem
-
- This is very similar to mmap of /dev/mem, except that legacy_mem
- only allows mmap of the one megabyte "legacy MMIO" area for a
- specific PCI bus. Typically this is the first megabyte of
- physical address space, but it may be different on machines with
- several VGA devices.
-
- "X" uses this to access VGA frame buffers. Using legacy_mem
- rather than /dev/mem allows multiple instances of X to talk to
- different VGA cards.
-
- The /dev/mem mmap constraints apply.
-
- mmap of /proc/bus/pci/.../??.?
-
- This is an MMIO mmap of PCI functions, which additionally may or
- may not be requested as using the WC attribute.
-
- If WC is requested, and the region in kern_memmap is either WC
- or UC, and the EFI memory map designates the region as WC, then
- the WC mapping is allowed.
-
- Otherwise, the user mapping must use the same attribute as the
- kernel mapping.
-
- read/write of /dev/mem
-
- This uses copy_from_user(), which implicitly uses a kernel
- identity mapping. This is obviously safe for things in
- kern_memmap.
-
- There may be corner cases of things that are not in kern_memmap,
- but could be accessed this way. For example, registers in MMIO
- space are not in kern_memmap, but could be accessed with a UC
- mapping. This would not cause attribute aliasing. But
- registers typically can be accessed only with four-byte or
- eight-byte accesses, and the copy_from_user() path doesn't allow
- any control over the access size, so this would be dangerous.
-
- ioremap()
-
- This returns a mapping for use inside the kernel.
-
- If the region is in kern_memmap, we should use the attribute
- specified there.
-
- If the EFI memory map reports that the entire granule supports
- WB, we should use that (granules that are partially reserved
- or occupied by firmware do not appear in kern_memmap).
-
- If the granule contains non-WB memory, but we can cover the
- region safely with kernel page table mappings, we can use
- ioremap_page_range() as most other architectures do.
-
- Failing all of the above, we have to fall back to a UC mapping.
-
-PAST PROBLEM CASES
-
- mmap of various MMIO regions from /dev/mem by "X" on Intel platforms
-
- The EFI memory map may not report these MMIO regions.
-
- These must be allowed so that X will work. This means that
- when the EFI memory map is incomplete, every /dev/mem mmap must
- succeed. It may create either WB or UC user mappings, depending
- on whether the region is in kern_memmap or the EFI memory map.
-
- mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled
-
- The EFI memory map reports the following attributes:
- 0x00000-0x9FFFF WB only
- 0xA0000-0xBFFFF UC only (VGA frame buffer)
- 0xC0000-0xFFFFF WB only
-
- This mmap is done with user pages, not kernel identity mappings,
- so it is safe to use WB mappings.
-
- The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000,
- which uses a granule-sized UC mapping. This granule will cover some
- WB-only memory, but since UC is non-speculative, the processor will
- never generate an uncacheable reference to the WB-only areas unless
- the driver explicitly touches them.
-
- mmap of 0x0-0xFFFFF legacy_mem by "X"
-
- If the EFI memory map reports that the entire range supports the
- same attributes, we can allow the mmap (and we will prefer WB if
- supported, as is the case with HP sx[12]000 machines with VGA
- disabled).
-
- If EFI reports the range as partly WB and partly UC (as on sx[12]000
- machines with VGA enabled), we must fail the mmap because there's no
- safe attribute to use.
-
- If EFI reports some of the range but not all (as on Intel firmware
- that doesn't report the VGA frame buffer at all), we should fail the
- mmap and force the user to map just the specific region of interest.
-
- mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled
-
- The EFI memory map reports the following attributes:
- 0x00000-0xFFFFF WB only (no VGA MMIO hole)
-
- This is a special case of the previous case, and the mmap should
- fail for the same reason as above.
-
- read of /sys/devices/.../rom
-
- For VGA devices, this may cause an ioremap() of 0xC0000. This
- used to be done with a UC mapping, because the VGA frame buffer
- at 0xA0000 prevents use of a WB granule. The UC mapping causes
- an MCA on HP sx[12]000 chipsets.
-
- We should use WB page table mappings to avoid covering the VGA
- frame buffer.
-
-NOTES
-
- [1] SDM rev 2.2, vol 2, sec 4.4.1.
- [2] SDM rev 2.2, vol 2, sec 4.4.6.
diff --git a/Documentation/ia64/efirtc.rst b/Documentation/ia64/efirtc.rst
new file mode 100644
index 000000000000..2f7ff5026308
--- /dev/null
+++ b/Documentation/ia64/efirtc.rst
@@ -0,0 +1,144 @@
+==========================
+EFI Real Time Clock driver
+==========================
+
+S. Eranian <eranian@hpl.hp.com>
+
+March 2000
+
+1. Introduction
+===============
+
+This document describes the efirtc.c driver has provided for
+the IA-64 platform.
+
+The purpose of this driver is to supply an API for kernel and user applications
+to get access to the Time Service offered by EFI version 0.92.
+
+EFI provides 4 calls one can make once the OS is booted: GetTime(),
+SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this
+driver. We describe those calls as well the design of the driver in the
+following sections.
+
+2. Design Decisions
+===================
+
+The original ideas was to provide a very simple driver to get access to,
+at first, the time of day service. This is required in order to access, in a
+portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock
+to initialize the system view of the time during boot.
+
+Because we wanted to minimize the impact on existing user-level apps using
+the CMOS clock, we decided to expose an API that was very similar to the one
+used today with the legacy RTC driver (driver/char/rtc.c). However, because
+EFI provides a simpler services, not all ioctl() are available. Also
+new ioctl()s have been introduced for things that EFI provides but not the
+legacy.
+
+EFI uses a slightly different way of representing the time, noticeably
+the reference date is different. Year is the using the full 4-digit format.
+The Epoch is January 1st 1998. For backward compatibility reasons we don't
+expose this new way of representing time. Instead we use something very
+similar to the struct tm, i.e. struct rtc_time, as used by hwclock.
+One of the reasons for doing it this way is to allow for EFI to still evolve
+without necessarily impacting any of the user applications. The decoupling
+enables flexibility and permits writing wrapper code is ncase things change.
+
+The driver exposes two interfaces, one via the device file and a set of
+ioctl()s. The other is read-only via the /proc filesystem.
+
+As of today we don't offer a /proc/sys interface.
+
+To allow for a uniform interface between the legacy RTC and EFI time service,
+we have created the include/linux/rtc.h header file to contain only the
+"public" API of the two drivers. The specifics of the legacy RTC are still
+in include/linux/mc146818rtc.h.
+
+
+3. Time of day service
+======================
+
+The part of the driver gives access to the time of day service of EFI.
+Two ioctl()s, compatible with the legacy RTC calls:
+
+ Read the CMOS clock::
+
+ ioctl(d, RTC_RD_TIME, &rtc);
+
+ Write the CMOS clock::
+
+ ioctl(d, RTC_SET_TIME, &rtc);
+
+The rtc is a pointer to a data structure defined in rtc.h which is close
+to a struct tm::
+
+ struct rtc_time {
+ int tm_sec;
+ int tm_min;
+ int tm_hour;
+ int tm_mday;
+ int tm_mon;
+ int tm_year;
+ int tm_wday;
+ int tm_yday;
+ int tm_isdst;
+ };
+
+The driver takes care of converting back an forth between the EFI time and
+this format.
+
+Those two ioctl()s can be exercised with the hwclock command:
+
+For reading::
+
+ # /sbin/hwclock --show
+ Mon Mar 6 15:32:32 2000 -0.910248 seconds
+
+For setting::
+
+ # /sbin/hwclock --systohc
+
+Root privileges are required to be able to set the time of day.
+
+4. Wakeup Alarm service
+=======================
+
+EFI provides an API by which one can program when a machine should wakeup,
+i.e. reboot. This is very different from the alarm provided by the legacy
+RTC which is some kind of interval timer alarm. For this reason we don't use
+the same ioctl()s to get access to the service. Instead we have
+introduced 2 news ioctl()s to the interface of an RTC.
+
+We have added 2 new ioctl()s that are specific to the EFI driver:
+
+ Read the current state of the alarm::
+
+ ioctl(d, RTC_WKLAM_RD, &wkt)
+
+ Set the alarm or change its status::
+
+ ioctl(d, RTC_WKALM_SET, &wkt)
+
+The wkt structure encapsulates a struct rtc_time + 2 extra fields to get
+status information::
+
+ struct rtc_wkalrm {
+
+ unsigned char enabled; /* =1 if alarm is enabled */
+ unsigned char pending; /* =1 if alarm is pending */
+
+ struct rtc_time time;
+ }
+
+As of today, none of the existing user-level apps supports this feature.
+However writing such a program should be hard by simply using those two
+ioctl().
+
+Root privileges are required to be able to set the alarm.
+
+5. References
+=============
+
+Checkout the following Web site for more information on EFI:
+
+http://developer.intel.com/technology/efi/
diff --git a/Documentation/ia64/efirtc.txt b/Documentation/ia64/efirtc.txt
deleted file mode 100644
index 057e6bebda8f..000000000000
--- a/Documentation/ia64/efirtc.txt
+++ /dev/null
@@ -1,128 +0,0 @@
-EFI Real Time Clock driver
--------------------------------
-S. Eranian <eranian@hpl.hp.com>
-March 2000
-
-I/ Introduction
-
-This document describes the efirtc.c driver has provided for
-the IA-64 platform.
-
-The purpose of this driver is to supply an API for kernel and user applications
-to get access to the Time Service offered by EFI version 0.92.
-
-EFI provides 4 calls one can make once the OS is booted: GetTime(),
-SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this
-driver. We describe those calls as well the design of the driver in the
-following sections.
-
-II/ Design Decisions
-
-The original ideas was to provide a very simple driver to get access to,
-at first, the time of day service. This is required in order to access, in a
-portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock
-to initialize the system view of the time during boot.
-
-Because we wanted to minimize the impact on existing user-level apps using
-the CMOS clock, we decided to expose an API that was very similar to the one
-used today with the legacy RTC driver (driver/char/rtc.c). However, because
-EFI provides a simpler services, not all ioctl() are available. Also
-new ioctl()s have been introduced for things that EFI provides but not the
-legacy.
-
-EFI uses a slightly different way of representing the time, noticeably
-the reference date is different. Year is the using the full 4-digit format.
-The Epoch is January 1st 1998. For backward compatibility reasons we don't
-expose this new way of representing time. Instead we use something very
-similar to the struct tm, i.e. struct rtc_time, as used by hwclock.
-One of the reasons for doing it this way is to allow for EFI to still evolve
-without necessarily impacting any of the user applications. The decoupling
-enables flexibility and permits writing wrapper code is ncase things change.
-
-The driver exposes two interfaces, one via the device file and a set of
-ioctl()s. The other is read-only via the /proc filesystem.
-
-As of today we don't offer a /proc/sys interface.
-
-To allow for a uniform interface between the legacy RTC and EFI time service,
-we have created the include/linux/rtc.h header file to contain only the
-"public" API of the two drivers. The specifics of the legacy RTC are still
-in include/linux/mc146818rtc.h.
-
-
-III/ Time of day service
-
-The part of the driver gives access to the time of day service of EFI.
-Two ioctl()s, compatible with the legacy RTC calls:
-
- Read the CMOS clock: ioctl(d, RTC_RD_TIME, &rtc);
-
- Write the CMOS clock: ioctl(d, RTC_SET_TIME, &rtc);
-
-The rtc is a pointer to a data structure defined in rtc.h which is close
-to a struct tm:
-
-struct rtc_time {
- int tm_sec;
- int tm_min;
- int tm_hour;
- int tm_mday;
- int tm_mon;
- int tm_year;
- int tm_wday;
- int tm_yday;
- int tm_isdst;
-};
-
-The driver takes care of converting back an forth between the EFI time and
-this format.
-
-Those two ioctl()s can be exercised with the hwclock command:
-
-For reading:
-# /sbin/hwclock --show
-Mon Mar 6 15:32:32 2000 -0.910248 seconds
-
-For setting:
-# /sbin/hwclock --systohc
-
-Root privileges are required to be able to set the time of day.
-
-IV/ Wakeup Alarm service
-
-EFI provides an API by which one can program when a machine should wakeup,
-i.e. reboot. This is very different from the alarm provided by the legacy
-RTC which is some kind of interval timer alarm. For this reason we don't use
-the same ioctl()s to get access to the service. Instead we have
-introduced 2 news ioctl()s to the interface of an RTC.
-
-We have added 2 new ioctl()s that are specific to the EFI driver:
-
- Read the current state of the alarm
- ioctl(d, RTC_WKLAM_RD, &wkt)
-
- Set the alarm or change its status
- ioctl(d, RTC_WKALM_SET, &wkt)
-
-The wkt structure encapsulates a struct rtc_time + 2 extra fields to get
-status information:
-
-struct rtc_wkalrm {
-
- unsigned char enabled; /* =1 if alarm is enabled */
- unsigned char pending; /* =1 if alarm is pending */
-
- struct rtc_time time;
-}
-
-As of today, none of the existing user-level apps supports this feature.
-However writing such a program should be hard by simply using those two
-ioctl().
-
-Root privileges are required to be able to set the alarm.
-
-V/ References.
-
-Checkout the following Web site for more information on EFI:
-
-http://developer.intel.com/technology/efi/
diff --git a/Documentation/ia64/err_inject.rst b/Documentation/ia64/err_inject.rst
new file mode 100644
index 000000000000..900f71e93a29
--- /dev/null
+++ b/Documentation/ia64/err_inject.rst
@@ -0,0 +1,1067 @@
+========================================
+IPF Machine Check (MC) error inject tool
+========================================
+
+IPF Machine Check (MC) error inject tool is used to inject MC
+errors from Linux. The tool is a test bed for IPF MC work flow including
+hardware correctable error handling, OS recoverable error handling, MC
+event logging, etc.
+
+The tool includes two parts: a kernel driver and a user application
+sample. The driver provides interface to PAL to inject error
+and query error injection capabilities. The driver code is in
+arch/ia64/kernel/err_inject.c. The application sample (shown below)
+provides a combination of various errors and calls the driver's interface
+(sysfs interface) to inject errors or query error injection capabilities.
+
+The tool can be used to test Intel IPF machine MC handling capabilities.
+It's especially useful for people who can not access hardware MC injection
+tool to inject error. It's also very useful to integrate with other
+software test suits to do stressful testing on IPF.
+
+Below is a sample application as part of the whole tool. The sample
+can be used as a working test tool. Or it can be expanded to include
+more features. It also can be a integrated into a library or other user
+application to have more thorough test.
+
+The sample application takes err.conf as error configuration input. GCC
+compiles the code. After you install err_inject driver, you can run
+this sample application to inject errors.
+
+Errata: Itanium 2 Processors Specification Update lists some errata against
+the pal_mc_error_inject PAL procedure. The following err.conf has been tested
+on latest Montecito PAL.
+
+err.conf::
+
+ #This is configuration file for err_inject_tool.
+ #The format of the each line is:
+ #cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
+ #where
+ # cpu: logical cpu number the error will be inject in.
+ # loop: times the error will be injected.
+ # interval: In second. every so often one error is injected.
+ # err_type_info, err_struct_info: PAL parameters.
+ #
+ #Note: All values are hex w/o or w/ 0x prefix.
+
+
+ #On cpu2, inject only total 0x10 errors, interval 5 seconds
+ #corrected, data cache, hier-2, physical addr(assigned by tool code).
+ #working on Montecito latest PAL.
+ 2, 10, 5, 4101, 95
+
+ #On cpu4, inject and consume total 0x10 errors, interval 5 seconds
+ #corrected, data cache, hier-2, physical addr(assigned by tool code).
+ #working on Montecito latest PAL.
+ 4, 10, 5, 4109, 95
+
+ #On cpu15, inject and consume total 0x10 errors, interval 5 seconds
+ #recoverable, DTR0, hier-2.
+ #working on Montecito latest PAL.
+ 0xf, 0x10, 5, 4249, 15
+
+The sample application source code:
+
+err_injection_tool.c::
+
+ /*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Copyright (C) 2006 Intel Co
+ * Fenghua Yu <fenghua.yu@intel.com>
+ *
+ */
+ #include <sys/types.h>
+ #include <sys/stat.h>
+ #include <fcntl.h>
+ #include <stdio.h>
+ #include <sched.h>
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdarg.h>
+ #include <string.h>
+ #include <errno.h>
+ #include <time.h>
+ #include <sys/ipc.h>
+ #include <sys/sem.h>
+ #include <sys/wait.h>
+ #include <sys/mman.h>
+ #include <sys/shm.h>
+
+ #define MAX_FN_SIZE 256
+ #define MAX_BUF_SIZE 256
+ #define DATA_BUF_SIZE 256
+ #define NR_CPUS 512
+ #define MAX_TASK_NUM 2048
+ #define MIN_INTERVAL 5 // seconds
+ #define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte.
+ #define PARA_FIELD_NUM 5
+ #define MASK_SIZE (NR_CPUS/64)
+ #define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
+
+ int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
+
+ int verbose;
+ #define vbprintf if (verbose) printf
+
+ int log_info(int cpu, const char *fmt, ...)
+ {
+ FILE *log;
+ char fn[MAX_FN_SIZE];
+ char buf[MAX_BUF_SIZE];
+ va_list args;
+
+ sprintf(fn, "%d.log", cpu);
+ log=fopen(fn, "a+");
+ if (log==NULL) {
+ perror("Error open:");
+ return -1;
+ }
+
+ va_start(args, fmt);
+ vprintf(fmt, args);
+ memset(buf, 0, MAX_BUF_SIZE);
+ vsprintf(buf, fmt, args);
+ va_end(args);
+
+ fwrite(buf, sizeof(buf), 1, log);
+ fclose(log);
+
+ return 0;
+ }
+
+ typedef unsigned long u64;
+ typedef unsigned int u32;
+
+ typedef union err_type_info_u {
+ struct {
+ u64 mode : 3, /* 0-2 */
+ err_inj : 3, /* 3-5 */
+ err_sev : 2, /* 6-7 */
+ err_struct : 5, /* 8-12 */
+ struct_hier : 3, /* 13-15 */
+ reserved : 48; /* 16-63 */
+ } err_type_info_u;
+ u64 err_type_info;
+ } err_type_info_t;
+
+ typedef union err_struct_info_u {
+ struct {
+ u64 siv : 1, /* 0 */
+ c_t : 2, /* 1-2 */
+ cl_p : 3, /* 3-5 */
+ cl_id : 3, /* 6-8 */
+ cl_dp : 1, /* 9 */
+ reserved1 : 22, /* 10-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_cache;
+ struct {
+ u64 siv : 1, /* 0 */
+ tt : 2, /* 1-2 */
+ tc_tr : 2, /* 3-4 */
+ tr_slot : 8, /* 5-12 */
+ reserved1 : 19, /* 13-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_tlb;
+ struct {
+ u64 siv : 1, /* 0 */
+ regfile_id : 4, /* 1-4 */
+ reg_num : 7, /* 5-11 */
+ reserved1 : 20, /* 12-31 */
+ tiv : 1, /* 32 */
+ trigger : 4, /* 33-36 */
+ trigger_pl : 3, /* 37-39 */
+ reserved2 : 24; /* 40-63 */
+ } err_struct_info_register;
+ struct {
+ u64 reserved;
+ } err_struct_info_bus_processor_interconnect;
+ u64 err_struct_info;
+ } err_struct_info_t;
+
+ typedef union err_data_buffer_u {
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ u64 inj_addr; /* 64-127 */
+ u64 way : 5, /* 128-132 */
+ index : 20, /* 133-152 */
+ : 39; /* 153-191 */
+ } err_data_buffer_cache;
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ u64 inj_addr; /* 64-127 */
+ u64 way : 5, /* 128-132 */
+ index : 20, /* 133-152 */
+ reserved : 39; /* 153-191 */
+ } err_data_buffer_tlb;
+ struct {
+ u64 trigger_addr; /* 0-63 */
+ } err_data_buffer_register;
+ struct {
+ u64 reserved; /* 0-63 */
+ } err_data_buffer_bus_processor_interconnect;
+ u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+ } err_data_buffer_t;
+
+ typedef union capabilities_u {
+ struct {
+ u64 i : 1,
+ d : 1,
+ rv : 1,
+ tag : 1,
+ data : 1,
+ mesi : 1,
+ dp : 1,
+ reserved1 : 3,
+ pa : 1,
+ va : 1,
+ wi : 1,
+ reserved2 : 20,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved3 : 30;
+ } capabilities_cache;
+ struct {
+ u64 d : 1,
+ i : 1,
+ rv : 1,
+ tc : 1,
+ tr : 1,
+ reserved1 : 27,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved2 : 30;
+ } capabilities_tlb;
+ struct {
+ u64 gr_b0 : 1,
+ gr_b1 : 1,
+ fr : 1,
+ br : 1,
+ pr : 1,
+ ar : 1,
+ cr : 1,
+ rr : 1,
+ pkr : 1,
+ dbr : 1,
+ ibr : 1,
+ pmc : 1,
+ pmd : 1,
+ reserved1 : 3,
+ regnum : 1,
+ reserved2 : 15,
+ trigger : 1,
+ trigger_pl : 1,
+ reserved3 : 30;
+ } capabilities_register;
+ struct {
+ u64 reserved;
+ } capabilities_bus_processor_interconnect;
+ } capabilities_t;
+
+ typedef struct resources_s {
+ u64 ibr0 : 1,
+ ibr2 : 1,
+ ibr4 : 1,
+ ibr6 : 1,
+ dbr0 : 1,
+ dbr2 : 1,
+ dbr4 : 1,
+ dbr6 : 1,
+ reserved : 48;
+ } resources_t;
+
+
+ long get_page_size(void)
+ {
+ long page_size=sysconf(_SC_PAGESIZE);
+ return page_size;
+ }
+
+ #define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
+ #define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
+ #define SHM_VA 0x2000000100000000
+
+ int shmid;
+ void *shmaddr;
+
+ int create_shm(void)
+ {
+ key_t key;
+ char fn[MAX_FN_SIZE];
+
+ /* cpu0 is always existing */
+ sprintf(fn, PATH_FORMAT, 0);
+ if ((key = ftok(fn, 's')) == -1) {
+ perror("ftok");
+ return -1;
+ }
+
+ shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
+ if (shmid == -1) {
+ if (errno==EEXIST) {
+ shmid = shmget(key, SHM_SIZE, 0);
+ if (shmid == -1) {
+ perror("shmget");
+ return -1;
+ }
+ }
+ else {
+ perror("shmget");
+ return -1;
+ }
+ }
+ vbprintf("shmid=%d", shmid);
+
+ /* connect to the segment: */
+ shmaddr = shmat(shmid, (void *)SHM_VA, 0);
+ if (shmaddr == (void*)-1) {
+ perror("shmat");
+ return -1;
+ }
+
+ memset(shmaddr, 0, SHM_SIZE);
+ mlock(shmaddr, SHM_SIZE);
+
+ return 0;
+ }
+
+ int free_shm()
+ {
+ munlock(shmaddr, SHM_SIZE);
+ shmdt(shmaddr);
+ semctl(shmid, 0, IPC_RMID);
+
+ return 0;
+ }
+
+ #ifdef _SEM_SEMUN_UNDEFINED
+ union semun
+ {
+ int val;
+ struct semid_ds *buf;
+ unsigned short int *array;
+ struct seminfo *__buf;
+ };
+ #endif
+
+ u32 mode=1; /* 1: physical mode; 2: virtual mode. */
+ int one_lock=1;
+ key_t key[NR_CPUS];
+ int semid[NR_CPUS];
+
+ int create_sem(int cpu)
+ {
+ union semun arg;
+ char fn[MAX_FN_SIZE];
+ int sid;
+
+ sprintf(fn, PATH_FORMAT, cpu);
+ sprintf(fn, "%s/%s", fn, "err_type_info");
+ if ((key[cpu] = ftok(fn, 'e')) == -1) {
+ perror("ftok");
+ return -1;
+ }
+
+ if (semid[cpu]!=0)
+ return 0;
+
+ /* clear old semaphore */
+ if ((sid = semget(key[cpu], 1, 0)) != -1)
+ semctl(sid, 0, IPC_RMID);
+
+ /* get one semaphore */
+ if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
+ perror("semget");
+ printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
+ (u64)key[cpu]);
+ return -1;
+ }
+
+ vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
+ (u64)key[cpu]);
+ /* initialize the semaphore to 1: */
+ arg.val = 1;
+ if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
+ perror("semctl");
+ return -1;
+ }
+
+ return 0;
+ }
+
+ static int lock(int cpu)
+ {
+ struct sembuf lock;
+
+ lock.sem_num = cpu;
+ lock.sem_op = 1;
+ semop(semid[cpu], &lock, 1);
+
+ return 0;
+ }
+
+ static int unlock(int cpu)
+ {
+ struct sembuf unlock;
+
+ unlock.sem_num = cpu;
+ unlock.sem_op = -1;
+ semop(semid[cpu], &unlock, 1);
+
+ return 0;
+ }
+
+ void free_sem(int cpu)
+ {
+ semctl(semid[cpu], 0, IPC_RMID);
+ }
+
+ int wr_multi(char *fn, unsigned long *data, int size)
+ {
+ int fd;
+ char buf[MAX_BUF_SIZE];
+ int ret;
+
+ if (size==1)
+ sprintf(buf, "%lx", *data);
+ else if (size==3)
+ sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
+ else {
+ fprintf(stderr,"write to file with wrong size!\n");
+ return -1;
+ }
+
+ fd=open(fn, O_RDWR);
+ if (!fd) {
+ perror("Error:");
+ return -1;
+ }
+ ret=write(fd, buf, sizeof(buf));
+ close(fd);
+ return ret;
+ }
+
+ int wr(char *fn, unsigned long data)
+ {
+ return wr_multi(fn, &data, 1);
+ }
+
+ int rd(char *fn, unsigned long *data)
+ {
+ int fd;
+ char buf[MAX_BUF_SIZE];
+
+ fd=open(fn, O_RDONLY);
+ if (fd<0) {
+ perror("Error:");
+ return -1;
+ }
+ read(fd, buf, MAX_BUF_SIZE);
+ *data=strtoul(buf, NULL, 16);
+ close(fd);
+ return 0;
+ }
+
+ int rd_status(char *path, int *status)
+ {
+ char fn[MAX_FN_SIZE];
+ sprintf(fn, "%s/status", path);
+ if (rd(fn, (u64*)status)<0) {
+ perror("status reading error.\n");
+ return -1;
+ }
+
+ return 0;
+ }
+
+ int rd_capabilities(char *path, u64 *capabilities)
+ {
+ char fn[MAX_FN_SIZE];
+ sprintf(fn, "%s/capabilities", path);
+ if (rd(fn, capabilities)<0) {
+ perror("capabilities reading error.\n");
+ return -1;
+ }
+
+ return 0;
+ }
+
+ int rd_all(char *path)
+ {
+ unsigned long err_type_info, err_struct_info, err_data_buffer;
+ int status;
+ unsigned long capabilities, resources;
+ char fn[MAX_FN_SIZE];
+
+ sprintf(fn, "%s/err_type_info", path);
+ if (rd(fn, &err_type_info)<0) {
+ perror("err_type_info reading error.\n");
+ return -1;
+ }
+ printf("err_type_info=%lx\n", err_type_info);
+
+ sprintf(fn, "%s/err_struct_info", path);
+ if (rd(fn, &err_struct_info)<0) {
+ perror("err_struct_info reading error.\n");
+ return -1;
+ }
+ printf("err_struct_info=%lx\n", err_struct_info);
+
+ sprintf(fn, "%s/err_data_buffer", path);
+ if (rd(fn, &err_data_buffer)<0) {
+ perror("err_data_buffer reading error.\n");
+ return -1;
+ }
+ printf("err_data_buffer=%lx\n", err_data_buffer);
+
+ sprintf(fn, "%s/status", path);
+ if (rd("status", (u64*)&status)<0) {
+ perror("status reading error.\n");
+ return -1;
+ }
+ printf("status=%d\n", status);
+
+ sprintf(fn, "%s/capabilities", path);
+ if (rd(fn,&capabilities)<0) {
+ perror("capabilities reading error.\n");
+ return -1;
+ }
+ printf("capabilities=%lx\n", capabilities);
+
+ sprintf(fn, "%s/resources", path);
+ if (rd(fn, &resources)<0) {
+ perror("resources reading error.\n");
+ return -1;
+ }
+ printf("resources=%lx\n", resources);
+
+ return 0;
+ }
+
+ int query_capabilities(char *path, err_type_info_t err_type_info,
+ u64 *capabilities)
+ {
+ char fn[MAX_FN_SIZE];
+ err_struct_info_t err_struct_info;
+ err_data_buffer_t err_data_buffer;
+
+ err_struct_info.err_struct_info=0;
+ memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
+
+ sprintf(fn, "%s/err_type_info", path);
+ wr(fn, err_type_info.err_type_info);
+ sprintf(fn, "%s/err_struct_info", path);
+ wr(fn, 0x0);
+ sprintf(fn, "%s/err_data_buffer", path);
+ wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+ // Fire pal_mc_error_inject procedure.
+ sprintf(fn, "%s/call_start", path);
+ wr(fn, mode);
+
+ if (rd_capabilities(path, capabilities)<0)
+ return -1;
+
+ return 0;
+ }
+
+ int query_all_capabilities()
+ {
+ int status;
+ err_type_info_t err_type_info;
+ int err_sev, err_struct, struct_hier;
+ int cap=0;
+ u64 capabilities;
+ char path[MAX_FN_SIZE];
+
+ err_type_info.err_type_info=0; // Initial
+ err_type_info.err_type_info_u.mode=0; // Query mode;
+ err_type_info.err_type_info_u.err_inj=0;
+
+ printf("All capabilities implemented in pal_mc_error_inject:\n");
+ sprintf(path, PATH_FORMAT ,0);
+ for (err_sev=0;err_sev<3;err_sev++)
+ for (err_struct=0;err_struct<5;err_struct++)
+ for (struct_hier=0;struct_hier<5;struct_hier++)
+ {
+ status=-1;
+ capabilities=0;
+ err_type_info.err_type_info_u.err_sev=err_sev;
+ err_type_info.err_type_info_u.err_struct=err_struct;
+ err_type_info.err_type_info_u.struct_hier=struct_hier;
+
+ if (query_capabilities(path, err_type_info, &capabilities)<0)
+ continue;
+
+ if (rd_status(path, &status)<0)
+ continue;
+
+ if (status==0) {
+ cap=1;
+ printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
+ err_sev, err_struct, struct_hier);
+ printf("capabilities 0x%lx\n", capabilities);
+ }
+ }
+ if (!cap) {
+ printf("No capabilities supported.\n");
+ return 0;
+ }
+
+ return 0;
+ }
+
+ int err_inject(int cpu, char *path, err_type_info_t err_type_info,
+ err_struct_info_t err_struct_info,
+ err_data_buffer_t err_data_buffer)
+ {
+ int status;
+ char fn[MAX_FN_SIZE];
+
+ log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
+ err_type_info.err_type_info,
+ err_struct_info.err_struct_info);
+ log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
+ err_data_buffer.err_data_buffer[0],
+ err_data_buffer.err_data_buffer[1],
+ err_data_buffer.err_data_buffer[2]);
+ sprintf(fn, "%s/err_type_info", path);
+ wr(fn, err_type_info.err_type_info);
+ sprintf(fn, "%s/err_struct_info", path);
+ wr(fn, err_struct_info.err_struct_info);
+ sprintf(fn, "%s/err_data_buffer", path);
+ wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
+
+ // Fire pal_mc_error_inject procedure.
+ sprintf(fn, "%s/call_start", path);
+ wr(fn,mode);
+
+ if (rd_status(path, &status)<0) {
+ vbprintf("fail: read status\n");
+ return -100;
+ }
+
+ if (status!=0) {
+ log_info(cpu, "fail: status=%d\n", status);
+ return status;
+ }
+
+ return status;
+ }
+
+ static int construct_data_buf(char *path, err_type_info_t err_type_info,
+ err_struct_info_t err_struct_info,
+ err_data_buffer_t *err_data_buffer,
+ void *va1)
+ {
+ char fn[MAX_FN_SIZE];
+ u64 virt_addr=0, phys_addr=0;
+
+ vbprintf("va1=%lx\n", (u64)va1);
+ memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
+
+ switch (err_type_info.err_type_info_u.err_struct) {
+ case 1: // Cache
+ switch (err_struct_info.err_struct_info_cache.cl_id) {
+ case 1: //Virtual addr
+ err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
+ break;
+ case 2: //Phys addr
+ sprintf(fn, "%s/virtual_to_phys", path);
+ virt_addr=(u64)va1;
+ if (wr(fn,virt_addr)<0)
+ return -1;
+ rd(fn, &phys_addr);
+ err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
+ break;
+ default:
+ printf("Not supported cl_id\n");
+ break;
+ }
+ break;
+ case 2: // TLB
+ break;
+ case 3: // Register file
+ break;
+ case 4: // Bus/system interconnect
+ default:
+ printf("Not supported err_struct\n");
+ break;
+ }
+
+ return 0;
+ }
+
+ typedef struct {
+ u64 cpu;
+ u64 loop;
+ u64 interval;
+ u64 err_type_info;
+ u64 err_struct_info;
+ u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
+ } parameters_t;
+
+ parameters_t line_para;
+ int para;
+
+ static int empty_data_buffer(u64 *err_data_buffer)
+ {
+ int empty=1;
+ int i;
+
+ for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
+ if (err_data_buffer[i]!=-1)
+ empty=0;
+
+ return empty;
+ }
+
+ int err_inj()
+ {
+ err_type_info_t err_type_info;
+ err_struct_info_t err_struct_info;
+ err_data_buffer_t err_data_buffer;
+ int count;
+ FILE *fp;
+ unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
+ u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
+ int num;
+ int i;
+ char path[MAX_FN_SIZE];
+ parameters_t parameters[MAX_TASK_NUM]={};
+ pid_t child_pid[MAX_TASK_NUM];
+ time_t current_time;
+ int status;
+
+ if (!para) {
+ fp=fopen("err.conf", "r");
+ if (fp==NULL) {
+ perror("Error open err.conf");
+ return -1;
+ }
+
+ num=0;
+ while (!feof(fp)) {
+ char buf[256];
+ memset(buf,0,256);
+ fgets(buf, 256, fp);
+ count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+ &cpu, &loop, &interval,&err_type_info_conf,
+ &err_struct_info_conf,
+ &err_data_buffer_conf[0],
+ &err_data_buffer_conf[1],
+ &err_data_buffer_conf[2]);
+ if (count!=PARA_FIELD_NUM+3) {
+ err_data_buffer_conf[0]=-1;
+ err_data_buffer_conf[1]=-1;
+ err_data_buffer_conf[2]=-1;
+ count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
+ &cpu, &loop, &interval,&err_type_info_conf,
+ &err_struct_info_conf);
+ if (count!=PARA_FIELD_NUM)
+ continue;
+ }
+
+ parameters[num].cpu=cpu;
+ parameters[num].loop=loop;
+ parameters[num].interval= interval>MIN_INTERVAL
+ ?interval:MIN_INTERVAL;
+ parameters[num].err_type_info=err_type_info_conf;
+ parameters[num].err_struct_info=err_struct_info_conf;
+ memcpy(parameters[num++].err_data_buffer,
+ err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
+
+ if (num>=MAX_TASK_NUM)
+ break;
+ }
+ }
+ else {
+ parameters[0].cpu=line_para.cpu;
+ parameters[0].loop=line_para.loop;
+ parameters[0].interval= line_para.interval>MIN_INTERVAL
+ ?line_para.interval:MIN_INTERVAL;
+ parameters[0].err_type_info=line_para.err_type_info;
+ parameters[0].err_struct_info=line_para.err_struct_info;
+ memcpy(parameters[0].err_data_buffer,
+ line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
+
+ num=1;
+ }
+
+ /* Create semaphore: If one_lock, one semaphore for all processors.
+ Otherwise, one semaphore for each processor. */
+ if (one_lock) {
+ if (create_sem(0)) {
+ printf("Can not create semaphore...exit\n");
+ free_sem(0);
+ return -1;
+ }
+ }
+ else {
+ for (i=0;i<num;i++) {
+ if (create_sem(parameters[i].cpu)) {
+ printf("Can not create semaphore for cpu%d...exit\n",i);
+ free_sem(parameters[num].cpu);
+ return -1;
+ }
+ }
+ }
+
+ /* Create a shm segment which will be used to inject/consume errors on.*/
+ if (create_shm()==-1) {
+ printf("Error to create shm...exit\n");
+ return -1;
+ }
+
+ for (i=0;i<num;i++) {
+ pid_t pid;
+
+ current_time=time(NULL);
+ log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
+ log_info(parameters[i].cpu, "Configurations:\n");
+ log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
+ parameters[i].cpu,
+ parameters[i].loop,
+ parameters[i].interval);
+ log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
+ parameters[i].err_type_info,
+ parameters[i].err_struct_info);
+
+ sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
+ err_type_info.err_type_info=parameters[i].err_type_info;
+ err_struct_info.err_struct_info=parameters[i].err_struct_info;
+ memcpy(err_data_buffer.err_data_buffer,
+ parameters[i].err_data_buffer,
+ ERR_DATA_BUFFER_SIZE*8);
+
+ pid=fork();
+ if (pid==0) {
+ unsigned long mask[MASK_SIZE];
+ int j, k;
+
+ void *va1, *va2;
+
+ /* Allocate two memory areas va1 and va2 in shm */
+ va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
+ va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
+
+ vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
+ memset(va1, 0x1, PAGE_SIZE);
+ memset(va2, 0x2, PAGE_SIZE);
+
+ if (empty_data_buffer(err_data_buffer.err_data_buffer))
+ /* If not specified yet, construct data buffer
+ * with va1
+ */
+ construct_data_buf(path, err_type_info,
+ err_struct_info, &err_data_buffer,va1);
+
+ for (j=0;j<MASK_SIZE;j++)
+ mask[j]=0;
+
+ cpu=parameters[i].cpu;
+ k = cpu%64;
+ j = cpu/64;
+ mask[j] = 1UL << k;
+
+ if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
+ perror("Error sched_setaffinity:");
+ return -1;
+ }
+
+ for (j=0; j<parameters[i].loop; j++) {
+ log_info(parameters[i].cpu,"Injection ");
+ log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
+
+ parameters[i].cpu,j+1, parameters[i].loop);
+
+ /* Hold the lock */
+ if (one_lock)
+ lock(0);
+ else
+ /* Hold lock on this cpu */
+ lock(parameters[i].cpu);
+
+ if ((status=err_inject(parameters[i].cpu,
+ path, err_type_info,
+ err_struct_info, err_data_buffer))
+ ==0) {
+ /* consume the error for "inject only"*/
+ memcpy(va2, va1, PAGE_SIZE);
+ memcpy(va1, va2, PAGE_SIZE);
+ log_info(parameters[i].cpu,
+ "successful\n");
+ }
+ else {
+ log_info(parameters[i].cpu,"fail:");
+ log_info(parameters[i].cpu,
+ "status=%d\n", status);
+ unlock(parameters[i].cpu);
+ break;
+ }
+ if (one_lock)
+ /* Release the lock */
+ unlock(0);
+ /* Release lock on this cpu */
+ else
+ unlock(parameters[i].cpu);
+
+ if (j < parameters[i].loop-1)
+ sleep(parameters[i].interval);
+ }
+ current_time=time(NULL);
+ log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
+ return 0;
+ }
+ else if (pid<0) {
+ perror("Error fork:");
+ continue;
+ }
+ child_pid[i]=pid;
+ }
+ for (i=0;i<num;i++)
+ waitpid(child_pid[i], NULL, 0);
+
+ if (one_lock)
+ free_sem(0);
+ else
+ for (i=0;i<num;i++)
+ free_sem(parameters[i].cpu);
+
+ printf("All done.\n");
+
+ return 0;
+ }
+
+ void help()
+ {
+ printf("err_inject_tool:\n");
+ printf("\t-q: query all capabilities. default: off\n");
+ printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
+ printf("\t-i: inject errors. default: off\n");
+ printf("\t-l: one lock per cpu. default: one lock for all\n");
+ printf("\t-e: error parameters:\n");
+ printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
+ printf("\t\t cpu: logical cpu number the error will be inject in.\n");
+ printf("\t\t loop: times the error will be injected.\n");
+ printf("\t\t interval: In second. every so often one error is injected.\n");
+ printf("\t\t err_type_info, err_struct_info: PAL parameters.\n");
+ printf("\t\t err_data_buffer: PAL parameter. Optional. If not present,\n");
+ printf("\t\t it's constructed by tool automatically. Be\n");
+ printf("\t\t careful to provide err_data_buffer and make\n");
+ printf("\t\t sure it's working with the environment.\n");
+ printf("\t Note:no space between error parameters.\n");
+ printf("\t default: Take error parameters from err.conf instead of command line.\n");
+ printf("\t-v: verbose. default: off\n");
+ printf("\t-h: help\n\n");
+ printf("The tool will take err.conf file as ");
+ printf("input to inject single or multiple errors ");
+ printf("on one or multiple cpus in parallel.\n");
+ }
+
+ int main(int argc, char **argv)
+ {
+ char c;
+ int do_err_inj=0;
+ int do_query_all=0;
+ int count;
+ u32 m;
+
+ /* Default one lock for all cpu's */
+ one_lock=1;
+ while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
+ switch (c) {
+ case 'm': /* Procedure mode. 1: phys 2: virt */
+ count=sscanf(optarg, "%x", &m);
+ if (count!=1 || (m!=1 && m!=2)) {
+ printf("Wrong mode number.\n");
+ help();
+ return -1;
+ }
+ mode=m;
+ break;
+ case 'i': /* Inject errors */
+ do_err_inj=1;
+ break;
+ case 'q': /* Query */
+ do_query_all=1;
+ break;
+ case 'v': /* Verbose */
+ verbose=1;
+ break;
+ case 'l': /* One lock per cpu */
+ one_lock=0;
+ break;
+ case 'e': /* error arguments */
+ /* Take parameters:
+ * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
+ * err_data_buffer is optional. Recommend not to specify
+ * err_data_buffer. Better to use tool to generate it.
+ */
+ count=sscanf(optarg,
+ "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
+ &line_para.cpu,
+ &line_para.loop,
+ &line_para.interval,
+ &line_para.err_type_info,
+ &line_para.err_struct_info,
+ &line_para.err_data_buffer[0],
+ &line_para.err_data_buffer[1],
+ &line_para.err_data_buffer[2]);
+ if (count!=PARA_FIELD_NUM+3) {
+ line_para.err_data_buffer[0]=-1,
+ line_para.err_data_buffer[1]=-1,
+ line_para.err_data_buffer[2]=-1;
+ count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
+ &line_para.cpu,
+ &line_para.loop,
+ &line_para.interval,
+ &line_para.err_type_info,
+ &line_para.err_struct_info);
+ if (count!=PARA_FIELD_NUM) {
+ printf("Wrong error arguments.\n");
+ help();
+ return -1;
+ }
+ }
+ para=1;
+ break;
+ continue;
+ break;
+ case 'h':
+ help();
+ return 0;
+ default:
+ break;
+ }
+
+ if (do_query_all)
+ query_all_capabilities();
+ if (do_err_inj)
+ err_inj();
+
+ if (!do_query_all && !do_err_inj)
+ help();
+
+ return 0;
+ }
diff --git a/Documentation/ia64/err_inject.txt b/Documentation/ia64/err_inject.txt
deleted file mode 100644
index 9f651c181429..000000000000
--- a/Documentation/ia64/err_inject.txt
+++ /dev/null
@@ -1,1068 +0,0 @@
-
-IPF Machine Check (MC) error inject tool
-========================================
-
-IPF Machine Check (MC) error inject tool is used to inject MC
-errors from Linux. The tool is a test bed for IPF MC work flow including
-hardware correctable error handling, OS recoverable error handling, MC
-event logging, etc.
-
-The tool includes two parts: a kernel driver and a user application
-sample. The driver provides interface to PAL to inject error
-and query error injection capabilities. The driver code is in
-arch/ia64/kernel/err_inject.c. The application sample (shown below)
-provides a combination of various errors and calls the driver's interface
-(sysfs interface) to inject errors or query error injection capabilities.
-
-The tool can be used to test Intel IPF machine MC handling capabilities.
-It's especially useful for people who can not access hardware MC injection
-tool to inject error. It's also very useful to integrate with other
-software test suits to do stressful testing on IPF.
-
-Below is a sample application as part of the whole tool. The sample
-can be used as a working test tool. Or it can be expanded to include
-more features. It also can be a integrated into a library or other user
-application to have more thorough test.
-
-The sample application takes err.conf as error configuration input. GCC
-compiles the code. After you install err_inject driver, you can run
-this sample application to inject errors.
-
-Errata: Itanium 2 Processors Specification Update lists some errata against
-the pal_mc_error_inject PAL procedure. The following err.conf has been tested
-on latest Montecito PAL.
-
-err.conf:
-
-#This is configuration file for err_inject_tool.
-#The format of the each line is:
-#cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer
-#where
-# cpu: logical cpu number the error will be inject in.
-# loop: times the error will be injected.
-# interval: In second. every so often one error is injected.
-# err_type_info, err_struct_info: PAL parameters.
-#
-#Note: All values are hex w/o or w/ 0x prefix.
-
-
-#On cpu2, inject only total 0x10 errors, interval 5 seconds
-#corrected, data cache, hier-2, physical addr(assigned by tool code).
-#working on Montecito latest PAL.
-2, 10, 5, 4101, 95
-
-#On cpu4, inject and consume total 0x10 errors, interval 5 seconds
-#corrected, data cache, hier-2, physical addr(assigned by tool code).
-#working on Montecito latest PAL.
-4, 10, 5, 4109, 95
-
-#On cpu15, inject and consume total 0x10 errors, interval 5 seconds
-#recoverable, DTR0, hier-2.
-#working on Montecito latest PAL.
-0xf, 0x10, 5, 4249, 15
-
-The sample application source code:
-
-err_injection_tool.c:
-
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more
- * details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Copyright (C) 2006 Intel Co
- * Fenghua Yu <fenghua.yu@intel.com>
- *
- */
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <sched.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <time.h>
-#include <sys/ipc.h>
-#include <sys/sem.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/shm.h>
-
-#define MAX_FN_SIZE 256
-#define MAX_BUF_SIZE 256
-#define DATA_BUF_SIZE 256
-#define NR_CPUS 512
-#define MAX_TASK_NUM 2048
-#define MIN_INTERVAL 5 // seconds
-#define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte.
-#define PARA_FIELD_NUM 5
-#define MASK_SIZE (NR_CPUS/64)
-#define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/"
-
-int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask);
-
-int verbose;
-#define vbprintf if (verbose) printf
-
-int log_info(int cpu, const char *fmt, ...)
-{
- FILE *log;
- char fn[MAX_FN_SIZE];
- char buf[MAX_BUF_SIZE];
- va_list args;
-
- sprintf(fn, "%d.log", cpu);
- log=fopen(fn, "a+");
- if (log==NULL) {
- perror("Error open:");
- return -1;
- }
-
- va_start(args, fmt);
- vprintf(fmt, args);
- memset(buf, 0, MAX_BUF_SIZE);
- vsprintf(buf, fmt, args);
- va_end(args);
-
- fwrite(buf, sizeof(buf), 1, log);
- fclose(log);
-
- return 0;
-}
-
-typedef unsigned long u64;
-typedef unsigned int u32;
-
-typedef union err_type_info_u {
- struct {
- u64 mode : 3, /* 0-2 */
- err_inj : 3, /* 3-5 */
- err_sev : 2, /* 6-7 */
- err_struct : 5, /* 8-12 */
- struct_hier : 3, /* 13-15 */
- reserved : 48; /* 16-63 */
- } err_type_info_u;
- u64 err_type_info;
-} err_type_info_t;
-
-typedef union err_struct_info_u {
- struct {
- u64 siv : 1, /* 0 */
- c_t : 2, /* 1-2 */
- cl_p : 3, /* 3-5 */
- cl_id : 3, /* 6-8 */
- cl_dp : 1, /* 9 */
- reserved1 : 22, /* 10-31 */
- tiv : 1, /* 32 */
- trigger : 4, /* 33-36 */
- trigger_pl : 3, /* 37-39 */
- reserved2 : 24; /* 40-63 */
- } err_struct_info_cache;
- struct {
- u64 siv : 1, /* 0 */
- tt : 2, /* 1-2 */
- tc_tr : 2, /* 3-4 */
- tr_slot : 8, /* 5-12 */
- reserved1 : 19, /* 13-31 */
- tiv : 1, /* 32 */
- trigger : 4, /* 33-36 */
- trigger_pl : 3, /* 37-39 */
- reserved2 : 24; /* 40-63 */
- } err_struct_info_tlb;
- struct {
- u64 siv : 1, /* 0 */
- regfile_id : 4, /* 1-4 */
- reg_num : 7, /* 5-11 */
- reserved1 : 20, /* 12-31 */
- tiv : 1, /* 32 */
- trigger : 4, /* 33-36 */
- trigger_pl : 3, /* 37-39 */
- reserved2 : 24; /* 40-63 */
- } err_struct_info_register;
- struct {
- u64 reserved;
- } err_struct_info_bus_processor_interconnect;
- u64 err_struct_info;
-} err_struct_info_t;
-
-typedef union err_data_buffer_u {
- struct {
- u64 trigger_addr; /* 0-63 */
- u64 inj_addr; /* 64-127 */
- u64 way : 5, /* 128-132 */
- index : 20, /* 133-152 */
- : 39; /* 153-191 */
- } err_data_buffer_cache;
- struct {
- u64 trigger_addr; /* 0-63 */
- u64 inj_addr; /* 64-127 */
- u64 way : 5, /* 128-132 */
- index : 20, /* 133-152 */
- reserved : 39; /* 153-191 */
- } err_data_buffer_tlb;
- struct {
- u64 trigger_addr; /* 0-63 */
- } err_data_buffer_register;
- struct {
- u64 reserved; /* 0-63 */
- } err_data_buffer_bus_processor_interconnect;
- u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-} err_data_buffer_t;
-
-typedef union capabilities_u {
- struct {
- u64 i : 1,
- d : 1,
- rv : 1,
- tag : 1,
- data : 1,
- mesi : 1,
- dp : 1,
- reserved1 : 3,
- pa : 1,
- va : 1,
- wi : 1,
- reserved2 : 20,
- trigger : 1,
- trigger_pl : 1,
- reserved3 : 30;
- } capabilities_cache;
- struct {
- u64 d : 1,
- i : 1,
- rv : 1,
- tc : 1,
- tr : 1,
- reserved1 : 27,
- trigger : 1,
- trigger_pl : 1,
- reserved2 : 30;
- } capabilities_tlb;
- struct {
- u64 gr_b0 : 1,
- gr_b1 : 1,
- fr : 1,
- br : 1,
- pr : 1,
- ar : 1,
- cr : 1,
- rr : 1,
- pkr : 1,
- dbr : 1,
- ibr : 1,
- pmc : 1,
- pmd : 1,
- reserved1 : 3,
- regnum : 1,
- reserved2 : 15,
- trigger : 1,
- trigger_pl : 1,
- reserved3 : 30;
- } capabilities_register;
- struct {
- u64 reserved;
- } capabilities_bus_processor_interconnect;
-} capabilities_t;
-
-typedef struct resources_s {
- u64 ibr0 : 1,
- ibr2 : 1,
- ibr4 : 1,
- ibr6 : 1,
- dbr0 : 1,
- dbr2 : 1,
- dbr4 : 1,
- dbr6 : 1,
- reserved : 48;
-} resources_t;
-
-
-long get_page_size(void)
-{
- long page_size=sysconf(_SC_PAGESIZE);
- return page_size;
-}
-
-#define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size())
-#define SHM_SIZE (2*PAGE_SIZE*NR_CPUS)
-#define SHM_VA 0x2000000100000000
-
-int shmid;
-void *shmaddr;
-
-int create_shm(void)
-{
- key_t key;
- char fn[MAX_FN_SIZE];
-
- /* cpu0 is always existing */
- sprintf(fn, PATH_FORMAT, 0);
- if ((key = ftok(fn, 's')) == -1) {
- perror("ftok");
- return -1;
- }
-
- shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT);
- if (shmid == -1) {
- if (errno==EEXIST) {
- shmid = shmget(key, SHM_SIZE, 0);
- if (shmid == -1) {
- perror("shmget");
- return -1;
- }
- }
- else {
- perror("shmget");
- return -1;
- }
- }
- vbprintf("shmid=%d", shmid);
-
- /* connect to the segment: */
- shmaddr = shmat(shmid, (void *)SHM_VA, 0);
- if (shmaddr == (void*)-1) {
- perror("shmat");
- return -1;
- }
-
- memset(shmaddr, 0, SHM_SIZE);
- mlock(shmaddr, SHM_SIZE);
-
- return 0;
-}
-
-int free_shm()
-{
- munlock(shmaddr, SHM_SIZE);
- shmdt(shmaddr);
- semctl(shmid, 0, IPC_RMID);
-
- return 0;
-}
-
-#ifdef _SEM_SEMUN_UNDEFINED
-union semun
-{
- int val;
- struct semid_ds *buf;
- unsigned short int *array;
- struct seminfo *__buf;
-};
-#endif
-
-u32 mode=1; /* 1: physical mode; 2: virtual mode. */
-int one_lock=1;
-key_t key[NR_CPUS];
-int semid[NR_CPUS];
-
-int create_sem(int cpu)
-{
- union semun arg;
- char fn[MAX_FN_SIZE];
- int sid;
-
- sprintf(fn, PATH_FORMAT, cpu);
- sprintf(fn, "%s/%s", fn, "err_type_info");
- if ((key[cpu] = ftok(fn, 'e')) == -1) {
- perror("ftok");
- return -1;
- }
-
- if (semid[cpu]!=0)
- return 0;
-
- /* clear old semaphore */
- if ((sid = semget(key[cpu], 1, 0)) != -1)
- semctl(sid, 0, IPC_RMID);
-
- /* get one semaphore */
- if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) {
- perror("semget");
- printf("Please remove semaphore with key=0x%lx, then run the tool.\n",
- (u64)key[cpu]);
- return -1;
- }
-
- vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu,
- (u64)key[cpu]);
- /* initialize the semaphore to 1: */
- arg.val = 1;
- if (semctl(semid[cpu], 0, SETVAL, arg) == -1) {
- perror("semctl");
- return -1;
- }
-
- return 0;
-}
-
-static int lock(int cpu)
-{
- struct sembuf lock;
-
- lock.sem_num = cpu;
- lock.sem_op = 1;
- semop(semid[cpu], &lock, 1);
-
- return 0;
-}
-
-static int unlock(int cpu)
-{
- struct sembuf unlock;
-
- unlock.sem_num = cpu;
- unlock.sem_op = -1;
- semop(semid[cpu], &unlock, 1);
-
- return 0;
-}
-
-void free_sem(int cpu)
-{
- semctl(semid[cpu], 0, IPC_RMID);
-}
-
-int wr_multi(char *fn, unsigned long *data, int size)
-{
- int fd;
- char buf[MAX_BUF_SIZE];
- int ret;
-
- if (size==1)
- sprintf(buf, "%lx", *data);
- else if (size==3)
- sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]);
- else {
- fprintf(stderr,"write to file with wrong size!\n");
- return -1;
- }
-
- fd=open(fn, O_RDWR);
- if (!fd) {
- perror("Error:");
- return -1;
- }
- ret=write(fd, buf, sizeof(buf));
- close(fd);
- return ret;
-}
-
-int wr(char *fn, unsigned long data)
-{
- return wr_multi(fn, &data, 1);
-}
-
-int rd(char *fn, unsigned long *data)
-{
- int fd;
- char buf[MAX_BUF_SIZE];
-
- fd=open(fn, O_RDONLY);
- if (fd<0) {
- perror("Error:");
- return -1;
- }
- read(fd, buf, MAX_BUF_SIZE);
- *data=strtoul(buf, NULL, 16);
- close(fd);
- return 0;
-}
-
-int rd_status(char *path, int *status)
-{
- char fn[MAX_FN_SIZE];
- sprintf(fn, "%s/status", path);
- if (rd(fn, (u64*)status)<0) {
- perror("status reading error.\n");
- return -1;
- }
-
- return 0;
-}
-
-int rd_capabilities(char *path, u64 *capabilities)
-{
- char fn[MAX_FN_SIZE];
- sprintf(fn, "%s/capabilities", path);
- if (rd(fn, capabilities)<0) {
- perror("capabilities reading error.\n");
- return -1;
- }
-
- return 0;
-}
-
-int rd_all(char *path)
-{
- unsigned long err_type_info, err_struct_info, err_data_buffer;
- int status;
- unsigned long capabilities, resources;
- char fn[MAX_FN_SIZE];
-
- sprintf(fn, "%s/err_type_info", path);
- if (rd(fn, &err_type_info)<0) {
- perror("err_type_info reading error.\n");
- return -1;
- }
- printf("err_type_info=%lx\n", err_type_info);
-
- sprintf(fn, "%s/err_struct_info", path);
- if (rd(fn, &err_struct_info)<0) {
- perror("err_struct_info reading error.\n");
- return -1;
- }
- printf("err_struct_info=%lx\n", err_struct_info);
-
- sprintf(fn, "%s/err_data_buffer", path);
- if (rd(fn, &err_data_buffer)<0) {
- perror("err_data_buffer reading error.\n");
- return -1;
- }
- printf("err_data_buffer=%lx\n", err_data_buffer);
-
- sprintf(fn, "%s/status", path);
- if (rd("status", (u64*)&status)<0) {
- perror("status reading error.\n");
- return -1;
- }
- printf("status=%d\n", status);
-
- sprintf(fn, "%s/capabilities", path);
- if (rd(fn,&capabilities)<0) {
- perror("capabilities reading error.\n");
- return -1;
- }
- printf("capabilities=%lx\n", capabilities);
-
- sprintf(fn, "%s/resources", path);
- if (rd(fn, &resources)<0) {
- perror("resources reading error.\n");
- return -1;
- }
- printf("resources=%lx\n", resources);
-
- return 0;
-}
-
-int query_capabilities(char *path, err_type_info_t err_type_info,
- u64 *capabilities)
-{
- char fn[MAX_FN_SIZE];
- err_struct_info_t err_struct_info;
- err_data_buffer_t err_data_buffer;
-
- err_struct_info.err_struct_info=0;
- memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8);
-
- sprintf(fn, "%s/err_type_info", path);
- wr(fn, err_type_info.err_type_info);
- sprintf(fn, "%s/err_struct_info", path);
- wr(fn, 0x0);
- sprintf(fn, "%s/err_data_buffer", path);
- wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
- // Fire pal_mc_error_inject procedure.
- sprintf(fn, "%s/call_start", path);
- wr(fn, mode);
-
- if (rd_capabilities(path, capabilities)<0)
- return -1;
-
- return 0;
-}
-
-int query_all_capabilities()
-{
- int status;
- err_type_info_t err_type_info;
- int err_sev, err_struct, struct_hier;
- int cap=0;
- u64 capabilities;
- char path[MAX_FN_SIZE];
-
- err_type_info.err_type_info=0; // Initial
- err_type_info.err_type_info_u.mode=0; // Query mode;
- err_type_info.err_type_info_u.err_inj=0;
-
- printf("All capabilities implemented in pal_mc_error_inject:\n");
- sprintf(path, PATH_FORMAT ,0);
- for (err_sev=0;err_sev<3;err_sev++)
- for (err_struct=0;err_struct<5;err_struct++)
- for (struct_hier=0;struct_hier<5;struct_hier++)
- {
- status=-1;
- capabilities=0;
- err_type_info.err_type_info_u.err_sev=err_sev;
- err_type_info.err_type_info_u.err_struct=err_struct;
- err_type_info.err_type_info_u.struct_hier=struct_hier;
-
- if (query_capabilities(path, err_type_info, &capabilities)<0)
- continue;
-
- if (rd_status(path, &status)<0)
- continue;
-
- if (status==0) {
- cap=1;
- printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ",
- err_sev, err_struct, struct_hier);
- printf("capabilities 0x%lx\n", capabilities);
- }
- }
- if (!cap) {
- printf("No capabilities supported.\n");
- return 0;
- }
-
- return 0;
-}
-
-int err_inject(int cpu, char *path, err_type_info_t err_type_info,
- err_struct_info_t err_struct_info,
- err_data_buffer_t err_data_buffer)
-{
- int status;
- char fn[MAX_FN_SIZE];
-
- log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ",
- err_type_info.err_type_info,
- err_struct_info.err_struct_info);
- log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n",
- err_data_buffer.err_data_buffer[0],
- err_data_buffer.err_data_buffer[1],
- err_data_buffer.err_data_buffer[2]);
- sprintf(fn, "%s/err_type_info", path);
- wr(fn, err_type_info.err_type_info);
- sprintf(fn, "%s/err_struct_info", path);
- wr(fn, err_struct_info.err_struct_info);
- sprintf(fn, "%s/err_data_buffer", path);
- wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE);
-
- // Fire pal_mc_error_inject procedure.
- sprintf(fn, "%s/call_start", path);
- wr(fn,mode);
-
- if (rd_status(path, &status)<0) {
- vbprintf("fail: read status\n");
- return -100;
- }
-
- if (status!=0) {
- log_info(cpu, "fail: status=%d\n", status);
- return status;
- }
-
- return status;
-}
-
-static int construct_data_buf(char *path, err_type_info_t err_type_info,
- err_struct_info_t err_struct_info,
- err_data_buffer_t *err_data_buffer,
- void *va1)
-{
- char fn[MAX_FN_SIZE];
- u64 virt_addr=0, phys_addr=0;
-
- vbprintf("va1=%lx\n", (u64)va1);
- memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8);
-
- switch (err_type_info.err_type_info_u.err_struct) {
- case 1: // Cache
- switch (err_struct_info.err_struct_info_cache.cl_id) {
- case 1: //Virtual addr
- err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1;
- break;
- case 2: //Phys addr
- sprintf(fn, "%s/virtual_to_phys", path);
- virt_addr=(u64)va1;
- if (wr(fn,virt_addr)<0)
- return -1;
- rd(fn, &phys_addr);
- err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr;
- break;
- default:
- printf("Not supported cl_id\n");
- break;
- }
- break;
- case 2: // TLB
- break;
- case 3: // Register file
- break;
- case 4: // Bus/system interconnect
- default:
- printf("Not supported err_struct\n");
- break;
- }
-
- return 0;
-}
-
-typedef struct {
- u64 cpu;
- u64 loop;
- u64 interval;
- u64 err_type_info;
- u64 err_struct_info;
- u64 err_data_buffer[ERR_DATA_BUFFER_SIZE];
-} parameters_t;
-
-parameters_t line_para;
-int para;
-
-static int empty_data_buffer(u64 *err_data_buffer)
-{
- int empty=1;
- int i;
-
- for (i=0;i<ERR_DATA_BUFFER_SIZE; i++)
- if (err_data_buffer[i]!=-1)
- empty=0;
-
- return empty;
-}
-
-int err_inj()
-{
- err_type_info_t err_type_info;
- err_struct_info_t err_struct_info;
- err_data_buffer_t err_data_buffer;
- int count;
- FILE *fp;
- unsigned long cpu, loop, interval, err_type_info_conf, err_struct_info_conf;
- u64 err_data_buffer_conf[ERR_DATA_BUFFER_SIZE];
- int num;
- int i;
- char path[MAX_FN_SIZE];
- parameters_t parameters[MAX_TASK_NUM]={};
- pid_t child_pid[MAX_TASK_NUM];
- time_t current_time;
- int status;
-
- if (!para) {
- fp=fopen("err.conf", "r");
- if (fp==NULL) {
- perror("Error open err.conf");
- return -1;
- }
-
- num=0;
- while (!feof(fp)) {
- char buf[256];
- memset(buf,0,256);
- fgets(buf, 256, fp);
- count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
- &cpu, &loop, &interval,&err_type_info_conf,
- &err_struct_info_conf,
- &err_data_buffer_conf[0],
- &err_data_buffer_conf[1],
- &err_data_buffer_conf[2]);
- if (count!=PARA_FIELD_NUM+3) {
- err_data_buffer_conf[0]=-1;
- err_data_buffer_conf[1]=-1;
- err_data_buffer_conf[2]=-1;
- count=sscanf(buf, "%lx, %lx, %lx, %lx, %lx\n",
- &cpu, &loop, &interval,&err_type_info_conf,
- &err_struct_info_conf);
- if (count!=PARA_FIELD_NUM)
- continue;
- }
-
- parameters[num].cpu=cpu;
- parameters[num].loop=loop;
- parameters[num].interval= interval>MIN_INTERVAL
- ?interval:MIN_INTERVAL;
- parameters[num].err_type_info=err_type_info_conf;
- parameters[num].err_struct_info=err_struct_info_conf;
- memcpy(parameters[num++].err_data_buffer,
- err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ;
-
- if (num>=MAX_TASK_NUM)
- break;
- }
- }
- else {
- parameters[0].cpu=line_para.cpu;
- parameters[0].loop=line_para.loop;
- parameters[0].interval= line_para.interval>MIN_INTERVAL
- ?line_para.interval:MIN_INTERVAL;
- parameters[0].err_type_info=line_para.err_type_info;
- parameters[0].err_struct_info=line_para.err_struct_info;
- memcpy(parameters[0].err_data_buffer,
- line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ;
-
- num=1;
- }
-
- /* Create semaphore: If one_lock, one semaphore for all processors.
- Otherwise, one semaphore for each processor. */
- if (one_lock) {
- if (create_sem(0)) {
- printf("Can not create semaphore...exit\n");
- free_sem(0);
- return -1;
- }
- }
- else {
- for (i=0;i<num;i++) {
- if (create_sem(parameters[i].cpu)) {
- printf("Can not create semaphore for cpu%d...exit\n",i);
- free_sem(parameters[num].cpu);
- return -1;
- }
- }
- }
-
- /* Create a shm segment which will be used to inject/consume errors on.*/
- if (create_shm()==-1) {
- printf("Error to create shm...exit\n");
- return -1;
- }
-
- for (i=0;i<num;i++) {
- pid_t pid;
-
- current_time=time(NULL);
- log_info(parameters[i].cpu, "\nBegine at %s", ctime(&current_time));
- log_info(parameters[i].cpu, "Configurations:\n");
- log_info(parameters[i].cpu,"On cpu%ld: loop=%lx, interval=%lx(s)",
- parameters[i].cpu,
- parameters[i].loop,
- parameters[i].interval);
- log_info(parameters[i].cpu," err_type_info=%lx,err_struct_info=%lx\n",
- parameters[i].err_type_info,
- parameters[i].err_struct_info);
-
- sprintf(path, PATH_FORMAT, (int)parameters[i].cpu);
- err_type_info.err_type_info=parameters[i].err_type_info;
- err_struct_info.err_struct_info=parameters[i].err_struct_info;
- memcpy(err_data_buffer.err_data_buffer,
- parameters[i].err_data_buffer,
- ERR_DATA_BUFFER_SIZE*8);
-
- pid=fork();
- if (pid==0) {
- unsigned long mask[MASK_SIZE];
- int j, k;
-
- void *va1, *va2;
-
- /* Allocate two memory areas va1 and va2 in shm */
- va1=shmaddr+parameters[i].cpu*PAGE_SIZE;
- va2=shmaddr+parameters[i].cpu*PAGE_SIZE+PAGE_SIZE;
-
- vbprintf("va1=%lx, va2=%lx\n", (u64)va1, (u64)va2);
- memset(va1, 0x1, PAGE_SIZE);
- memset(va2, 0x2, PAGE_SIZE);
-
- if (empty_data_buffer(err_data_buffer.err_data_buffer))
- /* If not specified yet, construct data buffer
- * with va1
- */
- construct_data_buf(path, err_type_info,
- err_struct_info, &err_data_buffer,va1);
-
- for (j=0;j<MASK_SIZE;j++)
- mask[j]=0;
-
- cpu=parameters[i].cpu;
- k = cpu%64;
- j = cpu/64;
- mask[j] = 1UL << k;
-
- if (sched_setaffinity(0, MASK_SIZE*8, mask)==-1) {
- perror("Error sched_setaffinity:");
- return -1;
- }
-
- for (j=0; j<parameters[i].loop; j++) {
- log_info(parameters[i].cpu,"Injection ");
- log_info(parameters[i].cpu,"on cpu%ld: #%d/%ld ",
-
- parameters[i].cpu,j+1, parameters[i].loop);
-
- /* Hold the lock */
- if (one_lock)
- lock(0);
- else
- /* Hold lock on this cpu */
- lock(parameters[i].cpu);
-
- if ((status=err_inject(parameters[i].cpu,
- path, err_type_info,
- err_struct_info, err_data_buffer))
- ==0) {
- /* consume the error for "inject only"*/
- memcpy(va2, va1, PAGE_SIZE);
- memcpy(va1, va2, PAGE_SIZE);
- log_info(parameters[i].cpu,
- "successful\n");
- }
- else {
- log_info(parameters[i].cpu,"fail:");
- log_info(parameters[i].cpu,
- "status=%d\n", status);
- unlock(parameters[i].cpu);
- break;
- }
- if (one_lock)
- /* Release the lock */
- unlock(0);
- /* Release lock on this cpu */
- else
- unlock(parameters[i].cpu);
-
- if (j < parameters[i].loop-1)
- sleep(parameters[i].interval);
- }
- current_time=time(NULL);
- log_info(parameters[i].cpu, "Done at %s", ctime(&current_time));
- return 0;
- }
- else if (pid<0) {
- perror("Error fork:");
- continue;
- }
- child_pid[i]=pid;
- }
- for (i=0;i<num;i++)
- waitpid(child_pid[i], NULL, 0);
-
- if (one_lock)
- free_sem(0);
- else
- for (i=0;i<num;i++)
- free_sem(parameters[i].cpu);
-
- printf("All done.\n");
-
- return 0;
-}
-
-void help()
-{
- printf("err_inject_tool:\n");
- printf("\t-q: query all capabilities. default: off\n");
- printf("\t-m: procedure mode. 1: physical 2: virtual. default: 1\n");
- printf("\t-i: inject errors. default: off\n");
- printf("\t-l: one lock per cpu. default: one lock for all\n");
- printf("\t-e: error parameters:\n");
- printf("\t\tcpu,loop,interval,err_type_info,err_struct_info[,err_data_buffer[0],err_data_buffer[1],err_data_buffer[2]]\n");
- printf("\t\t cpu: logical cpu number the error will be inject in.\n");
- printf("\t\t loop: times the error will be injected.\n");
- printf("\t\t interval: In second. every so often one error is injected.\n");
- printf("\t\t err_type_info, err_struct_info: PAL parameters.\n");
- printf("\t\t err_data_buffer: PAL parameter. Optional. If not present,\n");
- printf("\t\t it's constructed by tool automatically. Be\n");
- printf("\t\t careful to provide err_data_buffer and make\n");
- printf("\t\t sure it's working with the environment.\n");
- printf("\t Note:no space between error parameters.\n");
- printf("\t default: Take error parameters from err.conf instead of command line.\n");
- printf("\t-v: verbose. default: off\n");
- printf("\t-h: help\n\n");
- printf("The tool will take err.conf file as ");
- printf("input to inject single or multiple errors ");
- printf("on one or multiple cpus in parallel.\n");
-}
-
-int main(int argc, char **argv)
-{
- char c;
- int do_err_inj=0;
- int do_query_all=0;
- int count;
- u32 m;
-
- /* Default one lock for all cpu's */
- one_lock=1;
- while ((c = getopt(argc, argv, "m:iqvhle:")) != EOF)
- switch (c) {
- case 'm': /* Procedure mode. 1: phys 2: virt */
- count=sscanf(optarg, "%x", &m);
- if (count!=1 || (m!=1 && m!=2)) {
- printf("Wrong mode number.\n");
- help();
- return -1;
- }
- mode=m;
- break;
- case 'i': /* Inject errors */
- do_err_inj=1;
- break;
- case 'q': /* Query */
- do_query_all=1;
- break;
- case 'v': /* Verbose */
- verbose=1;
- break;
- case 'l': /* One lock per cpu */
- one_lock=0;
- break;
- case 'e': /* error arguments */
- /* Take parameters:
- * #cpu, loop, interval, err_type_info, err_struct_info[, err_data_buffer]
- * err_data_buffer is optional. Recommend not to specify
- * err_data_buffer. Better to use tool to generate it.
- */
- count=sscanf(optarg,
- "%lx, %lx, %lx, %lx, %lx, %lx, %lx, %lx\n",
- &line_para.cpu,
- &line_para.loop,
- &line_para.interval,
- &line_para.err_type_info,
- &line_para.err_struct_info,
- &line_para.err_data_buffer[0],
- &line_para.err_data_buffer[1],
- &line_para.err_data_buffer[2]);
- if (count!=PARA_FIELD_NUM+3) {
- line_para.err_data_buffer[0]=-1,
- line_para.err_data_buffer[1]=-1,
- line_para.err_data_buffer[2]=-1;
- count=sscanf(optarg, "%lx, %lx, %lx, %lx, %lx\n",
- &line_para.cpu,
- &line_para.loop,
- &line_para.interval,
- &line_para.err_type_info,
- &line_para.err_struct_info);
- if (count!=PARA_FIELD_NUM) {
- printf("Wrong error arguments.\n");
- help();
- return -1;
- }
- }
- para=1;
- break;
- continue;
- break;
- case 'h':
- help();
- return 0;
- default:
- break;
- }
-
- if (do_query_all)
- query_all_capabilities();
- if (do_err_inj)
- err_inj();
-
- if (!do_query_all && !do_err_inj)
- help();
-
- return 0;
-}
-
diff --git a/Documentation/ia64/fsys.rst b/Documentation/ia64/fsys.rst
new file mode 100644
index 000000000000..a702d2cc94b6
--- /dev/null
+++ b/Documentation/ia64/fsys.rst
@@ -0,0 +1,303 @@
+===================================
+Light-weight System Calls for IA-64
+===================================
+
+ Started: 13-Jan-2003
+
+ Last update: 27-Sep-2003
+
+ David Mosberger-Tang
+ <davidm@hpl.hp.com>
+
+Using the "epc" instruction effectively introduces a new mode of
+execution to the ia64 linux kernel. We call this mode the
+"fsys-mode". To recap, the normal states of execution are:
+
+ - kernel mode:
+ Both the register stack and the memory stack have been
+ switched over to kernel memory. The user-level state is saved
+ in a pt-regs structure at the top of the kernel memory stack.
+
+ - user mode:
+ Both the register stack and the kernel stack are in
+ user memory. The user-level state is contained in the
+ CPU registers.
+
+ - bank 0 interruption-handling mode:
+ This is the non-interruptible state which all
+ interruption-handlers start execution in. The user-level
+ state remains in the CPU registers and some kernel state may
+ be stored in bank 0 of registers r16-r31.
+
+In contrast, fsys-mode has the following special properties:
+
+ - execution is at privilege level 0 (most-privileged)
+
+ - CPU registers may contain a mixture of user-level and kernel-level
+ state (it is the responsibility of the kernel to ensure that no
+ security-sensitive kernel-level state is leaked back to
+ user-level)
+
+ - execution is interruptible and preemptible (an fsys-mode handler
+ can disable interrupts and avoid all other interruption-sources
+ to avoid preemption)
+
+ - neither the memory-stack nor the register-stack can be trusted while
+ in fsys-mode (they point to the user-level stacks, which may
+ be invalid, or completely bogus addresses)
+
+In summary, fsys-mode is much more similar to running in user-mode
+than it is to running in kernel-mode. Of course, given that the
+privilege level is at level 0, this means that fsys-mode requires some
+care (see below).
+
+
+How to tell fsys-mode
+=====================
+
+Linux operates in fsys-mode when (a) the privilege level is 0 (most
+privileged) and (b) the stacks have NOT been switched to kernel memory
+yet. For convenience, the header file <asm-ia64/ptrace.h> provides
+three macros::
+
+ user_mode(regs)
+ user_stack(task,regs)
+ fsys_mode(task,regs)
+
+The "regs" argument is a pointer to a pt_regs structure. The "task"
+argument is a pointer to the task structure to which the "regs"
+pointer belongs to. user_mode() returns TRUE if the CPU state pointed
+to by "regs" was executing in user mode (privilege level 3).
+user_stack() returns TRUE if the state pointed to by "regs" was
+executing on the user-level stack(s). Finally, fsys_mode() returns
+TRUE if the CPU state pointed to by "regs" was executing in fsys-mode.
+The fsys_mode() macro is equivalent to the expression::
+
+ !user_mode(regs) && user_stack(task,regs)
+
+How to write an fsyscall handler
+================================
+
+The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers
+(fsyscall_table). This table contains one entry for each system call.
+By default, a system call is handled by fsys_fallback_syscall(). This
+routine takes care of entering (full) kernel mode and calling the
+normal Linux system call handler. For performance-critical system
+calls, it is possible to write a hand-tuned fsyscall_handler. For
+example, fsys.S contains fsys_getpid(), which is a hand-tuned version
+of the getpid() system call.
+
+The entry and exit-state of an fsyscall handler is as follows:
+
+Machine state on entry to fsyscall handler
+------------------------------------------
+
+ ========= ===============================================================
+ r10 0
+ r11 saved ar.pfs (a user-level value)
+ r15 system call number
+ r16 "current" task pointer (in normal kernel-mode, this is in r13)
+ r32-r39 system call arguments
+ b6 return address (a user-level value)
+ ar.pfs previous frame-state (a user-level value)
+ PSR.be cleared to zero (i.e., little-endian byte order is in effect)
+ - all other registers may contain values passed in from user-mode
+ ========= ===============================================================
+
+Required machine state on exit to fsyscall handler
+--------------------------------------------------
+
+ ========= ===========================================================
+ r11 saved ar.pfs (as passed into the fsyscall handler)
+ r15 system call number (as passed into the fsyscall handler)
+ r32-r39 system call arguments (as passed into the fsyscall handler)
+ b6 return address (as passed into the fsyscall handler)
+ ar.pfs previous frame-state (as passed into the fsyscall handler)
+ ========= ===========================================================
+
+Fsyscall handlers can execute with very little overhead, but with that
+speed comes a set of restrictions:
+
+ * Fsyscall-handlers MUST check for any pending work in the flags
+ member of the thread-info structure and if any of the
+ TIF_ALLWORK_MASK flags are set, the handler needs to fall back on
+ doing a full system call (by calling fsys_fallback_syscall).
+
+ * Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11,
+ r15, b6, and ar.pfs) because they will be needed in case of a
+ system call restart. Of course, all "preserved" registers also
+ must be preserved, in accordance to the normal calling conventions.
+
+ * Fsyscall-handlers MUST check argument registers for containing a
+ NaT value before using them in any way that could trigger a
+ NaT-consumption fault. If a system call argument is found to
+ contain a NaT value, an fsyscall-handler may return immediately
+ with r8=EINVAL, r10=-1.
+
+ * Fsyscall-handlers MUST NOT use the "alloc" instruction or perform
+ any other operation that would trigger mandatory RSE
+ (register-stack engine) traffic.
+
+ * Fsyscall-handlers MUST NOT write to any stacked registers because
+ it is not safe to assume that user-level called a handler with the
+ proper number of arguments.
+
+ * Fsyscall-handlers need to be careful when accessing per-CPU variables:
+ unless proper safe-guards are taken (e.g., interruptions are avoided),
+ execution may be pre-empted and resumed on another CPU at any given
+ time.
+
+ * Fsyscall-handlers must be careful not to leak sensitive kernel'
+ information back to user-level. In particular, before returning to
+ user-level, care needs to be taken to clear any scratch registers
+ that could contain sensitive information (note that the current
+ task pointer is not considered sensitive: it's already exposed
+ through ar.k6).
+
+ * Fsyscall-handlers MUST NOT access user-memory without first
+ validating access-permission (this can be done typically via
+ probe.r.fault and/or probe.w.fault) and without guarding against
+ memory access exceptions (this can be done with the EX() macros
+ defined by asmmacro.h).
+
+The above restrictions may seem draconian, but remember that it's
+possible to trade off some of the restrictions by paying a slightly
+higher overhead. For example, if an fsyscall-handler could benefit
+from the shadow register bank, it could temporarily disable PSR.i and
+PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as
+needed. In other words, following the above rules yields extremely
+fast system call execution (while fully preserving system call
+semantics), but there is also a lot of flexibility in handling more
+complicated cases.
+
+Signal handling
+===============
+
+The delivery of (asynchronous) signals must be delayed until fsys-mode
+is exited. This is accomplished with the help of the lower-privilege
+transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user()
+checks whether the interrupted task was in fsys-mode and, if so, sets
+PSR.lp and returns immediately. When fsys-mode is exited via the
+"br.ret" instruction that lowers the privilege level, a trap will
+occur. The trap handler clears PSR.lp again and returns immediately.
+The kernel exit path then checks for and delivers any pending signals.
+
+PSR Handling
+============
+
+The "epc" instruction doesn't change the contents of PSR at all. This
+is in contrast to a regular interruption, which clears almost all
+bits. Because of that, some care needs to be taken to ensure things
+work as expected. The following discussion describes how each PSR bit
+is handled.
+
+======= =======================================================================
+PSR.be Cleared when entering fsys-mode. A srlz.d instruction is used
+ to ensure the CPU is in little-endian mode before the first
+ load/store instruction is executed. PSR.be is normally NOT
+ restored upon return from an fsys-mode handler. In other
+ words, user-level code must not rely on PSR.be being preserved
+ across a system call.
+PSR.up Unchanged.
+PSR.ac Unchanged.
+PSR.mfl Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.mfh Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.ic Unchanged. Note: fsys-mode handlers can clear the bit, if needed.
+PSR.i Unchanged. Note: fsys-mode handlers can clear the bit, if needed.
+PSR.pk Unchanged.
+PSR.dt Unchanged.
+PSR.dfl Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.dfh Unchanged. Note: fsys-mode handlers must not write-registers!
+PSR.sp Unchanged.
+PSR.pp Unchanged.
+PSR.di Unchanged.
+PSR.si Unchanged.
+PSR.db Unchanged. The kernel prevents user-level from setting a hardware
+ breakpoint that triggers at any privilege level other than
+ 3 (user-mode).
+PSR.lp Unchanged.
+PSR.tb Lazy redirect. If a taken-branch trap occurs while in
+ fsys-mode, the trap-handler modifies the saved machine state
+ such that execution resumes in the gate page at
+ syscall_via_break(), with privilege level 3. Note: the
+ taken branch would occur on the branch invoking the
+ fsyscall-handler, at which point, by definition, a syscall
+ restart is still safe. If the system call number is invalid,
+ the fsys-mode handler will return directly to user-level. This
+ return will trigger a taken-branch trap, but since the trap is
+ taken _after_ restoring the privilege level, the CPU has already
+ left fsys-mode, so no special treatment is needed.
+PSR.rt Unchanged.
+PSR.cpl Cleared to 0.
+PSR.is Unchanged (guaranteed to be 0 on entry to the gate page).
+PSR.mc Unchanged.
+PSR.it Unchanged (guaranteed to be 1).
+PSR.id Unchanged. Note: the ia64 linux kernel never sets this bit.
+PSR.da Unchanged. Note: the ia64 linux kernel never sets this bit.
+PSR.dd Unchanged. Note: the ia64 linux kernel never sets this bit.
+PSR.ss Lazy redirect. If set, "epc" will cause a Single Step Trap to
+ be taken. The trap handler then modifies the saved machine
+ state such that execution resumes in the gate page at
+ syscall_via_break(), with privilege level 3.
+PSR.ri Unchanged.
+PSR.ed Unchanged. Note: This bit could only have an effect if an fsys-mode
+ handler performed a speculative load that gets NaTted. If so, this
+ would be the normal & expected behavior, so no special treatment is
+ needed.
+PSR.bn Unchanged. Note: fsys-mode handlers may clear the bit, if needed.
+ Doing so requires clearing PSR.i and PSR.ic as well.
+PSR.ia Unchanged. Note: the ia64 linux kernel never sets this bit.
+======= =======================================================================
+
+Using fast system calls
+=======================
+
+To use fast system calls, userspace applications need simply call
+__kernel_syscall_via_epc(). For example
+
+-- example fgettimeofday() call --
+
+-- fgettimeofday.S --
+
+::
+
+ #include <asm/asmmacro.h>
+
+ GLOBAL_ENTRY(fgettimeofday)
+ .prologue
+ .save ar.pfs, r11
+ mov r11 = ar.pfs
+ .body
+
+ mov r2 = 0xa000000000020660;; // gate address
+ // found by inspection of System.map for the
+ // __kernel_syscall_via_epc() function. See
+ // below for how to do this for real.
+
+ mov b7 = r2
+ mov r15 = 1087 // gettimeofday syscall
+ ;;
+ br.call.sptk.many b6 = b7
+ ;;
+
+ .restore sp
+
+ mov ar.pfs = r11
+ br.ret.sptk.many rp;; // return to caller
+ END(fgettimeofday)
+
+-- end fgettimeofday.S --
+
+In reality, getting the gate address is accomplished by two extra
+values passed via the ELF auxiliary vector (include/asm-ia64/elf.h)
+
+ * AT_SYSINFO : is the address of __kernel_syscall_via_epc()
+ * AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO
+
+The ELF DSO is a pre-linked library that is mapped in by the kernel at
+the gate page. It is a proper ELF shared object so, with a dynamic
+loader that recognises the library, you should be able to make calls to
+the exported functions within it as with any other shared library.
+AT_SYSINFO points into the kernel DSO at the
+__kernel_syscall_via_epc() function for historical reasons (it was
+used before the kernel DSO) and as a convenience.
diff --git a/Documentation/ia64/fsys.txt b/Documentation/ia64/fsys.txt
deleted file mode 100644
index 59dd689d9b86..000000000000
--- a/Documentation/ia64/fsys.txt
+++ /dev/null
@@ -1,286 +0,0 @@
--*-Mode: outline-*-
-
- Light-weight System Calls for IA-64
- -----------------------------------
-
- Started: 13-Jan-2003
- Last update: 27-Sep-2003
-
- David Mosberger-Tang
- <davidm@hpl.hp.com>
-
-Using the "epc" instruction effectively introduces a new mode of
-execution to the ia64 linux kernel. We call this mode the
-"fsys-mode". To recap, the normal states of execution are:
-
- - kernel mode:
- Both the register stack and the memory stack have been
- switched over to kernel memory. The user-level state is saved
- in a pt-regs structure at the top of the kernel memory stack.
-
- - user mode:
- Both the register stack and the kernel stack are in
- user memory. The user-level state is contained in the
- CPU registers.
-
- - bank 0 interruption-handling mode:
- This is the non-interruptible state which all
- interruption-handlers start execution in. The user-level
- state remains in the CPU registers and some kernel state may
- be stored in bank 0 of registers r16-r31.
-
-In contrast, fsys-mode has the following special properties:
-
- - execution is at privilege level 0 (most-privileged)
-
- - CPU registers may contain a mixture of user-level and kernel-level
- state (it is the responsibility of the kernel to ensure that no
- security-sensitive kernel-level state is leaked back to
- user-level)
-
- - execution is interruptible and preemptible (an fsys-mode handler
- can disable interrupts and avoid all other interruption-sources
- to avoid preemption)
-
- - neither the memory-stack nor the register-stack can be trusted while
- in fsys-mode (they point to the user-level stacks, which may
- be invalid, or completely bogus addresses)
-
-In summary, fsys-mode is much more similar to running in user-mode
-than it is to running in kernel-mode. Of course, given that the
-privilege level is at level 0, this means that fsys-mode requires some
-care (see below).
-
-
-* How to tell fsys-mode
-
-Linux operates in fsys-mode when (a) the privilege level is 0 (most
-privileged) and (b) the stacks have NOT been switched to kernel memory
-yet. For convenience, the header file <asm-ia64/ptrace.h> provides
-three macros:
-
- user_mode(regs)
- user_stack(task,regs)
- fsys_mode(task,regs)
-
-The "regs" argument is a pointer to a pt_regs structure. The "task"
-argument is a pointer to the task structure to which the "regs"
-pointer belongs to. user_mode() returns TRUE if the CPU state pointed
-to by "regs" was executing in user mode (privilege level 3).
-user_stack() returns TRUE if the state pointed to by "regs" was
-executing on the user-level stack(s). Finally, fsys_mode() returns
-TRUE if the CPU state pointed to by "regs" was executing in fsys-mode.
-The fsys_mode() macro is equivalent to the expression:
-
- !user_mode(regs) && user_stack(task,regs)
-
-* How to write an fsyscall handler
-
-The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers
-(fsyscall_table). This table contains one entry for each system call.
-By default, a system call is handled by fsys_fallback_syscall(). This
-routine takes care of entering (full) kernel mode and calling the
-normal Linux system call handler. For performance-critical system
-calls, it is possible to write a hand-tuned fsyscall_handler. For
-example, fsys.S contains fsys_getpid(), which is a hand-tuned version
-of the getpid() system call.
-
-The entry and exit-state of an fsyscall handler is as follows:
-
-** Machine state on entry to fsyscall handler:
-
- - r10 = 0
- - r11 = saved ar.pfs (a user-level value)
- - r15 = system call number
- - r16 = "current" task pointer (in normal kernel-mode, this is in r13)
- - r32-r39 = system call arguments
- - b6 = return address (a user-level value)
- - ar.pfs = previous frame-state (a user-level value)
- - PSR.be = cleared to zero (i.e., little-endian byte order is in effect)
- - all other registers may contain values passed in from user-mode
-
-** Required machine state on exit to fsyscall handler:
-
- - r11 = saved ar.pfs (as passed into the fsyscall handler)
- - r15 = system call number (as passed into the fsyscall handler)
- - r32-r39 = system call arguments (as passed into the fsyscall handler)
- - b6 = return address (as passed into the fsyscall handler)
- - ar.pfs = previous frame-state (as passed into the fsyscall handler)
-
-Fsyscall handlers can execute with very little overhead, but with that
-speed comes a set of restrictions:
-
- o Fsyscall-handlers MUST check for any pending work in the flags
- member of the thread-info structure and if any of the
- TIF_ALLWORK_MASK flags are set, the handler needs to fall back on
- doing a full system call (by calling fsys_fallback_syscall).
-
- o Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11,
- r15, b6, and ar.pfs) because they will be needed in case of a
- system call restart. Of course, all "preserved" registers also
- must be preserved, in accordance to the normal calling conventions.
-
- o Fsyscall-handlers MUST check argument registers for containing a
- NaT value before using them in any way that could trigger a
- NaT-consumption fault. If a system call argument is found to
- contain a NaT value, an fsyscall-handler may return immediately
- with r8=EINVAL, r10=-1.
-
- o Fsyscall-handlers MUST NOT use the "alloc" instruction or perform
- any other operation that would trigger mandatory RSE
- (register-stack engine) traffic.
-
- o Fsyscall-handlers MUST NOT write to any stacked registers because
- it is not safe to assume that user-level called a handler with the
- proper number of arguments.
-
- o Fsyscall-handlers need to be careful when accessing per-CPU variables:
- unless proper safe-guards are taken (e.g., interruptions are avoided),
- execution may be pre-empted and resumed on another CPU at any given
- time.
-
- o Fsyscall-handlers must be careful not to leak sensitive kernel'
- information back to user-level. In particular, before returning to
- user-level, care needs to be taken to clear any scratch registers
- that could contain sensitive information (note that the current
- task pointer is not considered sensitive: it's already exposed
- through ar.k6).
-
- o Fsyscall-handlers MUST NOT access user-memory without first
- validating access-permission (this can be done typically via
- probe.r.fault and/or probe.w.fault) and without guarding against
- memory access exceptions (this can be done with the EX() macros
- defined by asmmacro.h).
-
-The above restrictions may seem draconian, but remember that it's
-possible to trade off some of the restrictions by paying a slightly
-higher overhead. For example, if an fsyscall-handler could benefit
-from the shadow register bank, it could temporarily disable PSR.i and
-PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as
-needed. In other words, following the above rules yields extremely
-fast system call execution (while fully preserving system call
-semantics), but there is also a lot of flexibility in handling more
-complicated cases.
-
-* Signal handling
-
-The delivery of (asynchronous) signals must be delayed until fsys-mode
-is exited. This is accomplished with the help of the lower-privilege
-transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user()
-checks whether the interrupted task was in fsys-mode and, if so, sets
-PSR.lp and returns immediately. When fsys-mode is exited via the
-"br.ret" instruction that lowers the privilege level, a trap will
-occur. The trap handler clears PSR.lp again and returns immediately.
-The kernel exit path then checks for and delivers any pending signals.
-
-* PSR Handling
-
-The "epc" instruction doesn't change the contents of PSR at all. This
-is in contrast to a regular interruption, which clears almost all
-bits. Because of that, some care needs to be taken to ensure things
-work as expected. The following discussion describes how each PSR bit
-is handled.
-
-PSR.be Cleared when entering fsys-mode. A srlz.d instruction is used
- to ensure the CPU is in little-endian mode before the first
- load/store instruction is executed. PSR.be is normally NOT
- restored upon return from an fsys-mode handler. In other
- words, user-level code must not rely on PSR.be being preserved
- across a system call.
-PSR.up Unchanged.
-PSR.ac Unchanged.
-PSR.mfl Unchanged. Note: fsys-mode handlers must not write-registers!
-PSR.mfh Unchanged. Note: fsys-mode handlers must not write-registers!
-PSR.ic Unchanged. Note: fsys-mode handlers can clear the bit, if needed.
-PSR.i Unchanged. Note: fsys-mode handlers can clear the bit, if needed.
-PSR.pk Unchanged.
-PSR.dt Unchanged.
-PSR.dfl Unchanged. Note: fsys-mode handlers must not write-registers!
-PSR.dfh Unchanged. Note: fsys-mode handlers must not write-registers!
-PSR.sp Unchanged.
-PSR.pp Unchanged.
-PSR.di Unchanged.
-PSR.si Unchanged.
-PSR.db Unchanged. The kernel prevents user-level from setting a hardware
- breakpoint that triggers at any privilege level other than 3 (user-mode).
-PSR.lp Unchanged.
-PSR.tb Lazy redirect. If a taken-branch trap occurs while in
- fsys-mode, the trap-handler modifies the saved machine state
- such that execution resumes in the gate page at
- syscall_via_break(), with privilege level 3. Note: the
- taken branch would occur on the branch invoking the
- fsyscall-handler, at which point, by definition, a syscall
- restart is still safe. If the system call number is invalid,
- the fsys-mode handler will return directly to user-level. This
- return will trigger a taken-branch trap, but since the trap is
- taken _after_ restoring the privilege level, the CPU has already
- left fsys-mode, so no special treatment is needed.
-PSR.rt Unchanged.
-PSR.cpl Cleared to 0.
-PSR.is Unchanged (guaranteed to be 0 on entry to the gate page).
-PSR.mc Unchanged.
-PSR.it Unchanged (guaranteed to be 1).
-PSR.id Unchanged. Note: the ia64 linux kernel never sets this bit.
-PSR.da Unchanged. Note: the ia64 linux kernel never sets this bit.
-PSR.dd Unchanged. Note: the ia64 linux kernel never sets this bit.
-PSR.ss Lazy redirect. If set, "epc" will cause a Single Step Trap to
- be taken. The trap handler then modifies the saved machine
- state such that execution resumes in the gate page at
- syscall_via_break(), with privilege level 3.
-PSR.ri Unchanged.
-PSR.ed Unchanged. Note: This bit could only have an effect if an fsys-mode
- handler performed a speculative load that gets NaTted. If so, this
- would be the normal & expected behavior, so no special treatment is
- needed.
-PSR.bn Unchanged. Note: fsys-mode handlers may clear the bit, if needed.
- Doing so requires clearing PSR.i and PSR.ic as well.
-PSR.ia Unchanged. Note: the ia64 linux kernel never sets this bit.
-
-* Using fast system calls
-
-To use fast system calls, userspace applications need simply call
-__kernel_syscall_via_epc(). For example
-
--- example fgettimeofday() call --
--- fgettimeofday.S --
-
-#include <asm/asmmacro.h>
-
-GLOBAL_ENTRY(fgettimeofday)
-.prologue
-.save ar.pfs, r11
-mov r11 = ar.pfs
-.body
-
-mov r2 = 0xa000000000020660;; // gate address
- // found by inspection of System.map for the
- // __kernel_syscall_via_epc() function. See
- // below for how to do this for real.
-
-mov b7 = r2
-mov r15 = 1087 // gettimeofday syscall
-;;
-br.call.sptk.many b6 = b7
-;;
-
-.restore sp
-
-mov ar.pfs = r11
-br.ret.sptk.many rp;; // return to caller
-END(fgettimeofday)
-
--- end fgettimeofday.S --
-
-In reality, getting the gate address is accomplished by two extra
-values passed via the ELF auxiliary vector (include/asm-ia64/elf.h)
-
- o AT_SYSINFO : is the address of __kernel_syscall_via_epc()
- o AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO
-
-The ELF DSO is a pre-linked library that is mapped in by the kernel at
-the gate page. It is a proper ELF shared object so, with a dynamic
-loader that recognises the library, you should be able to make calls to
-the exported functions within it as with any other shared library.
-AT_SYSINFO points into the kernel DSO at the
-__kernel_syscall_via_epc() function for historical reasons (it was
-used before the kernel DSO) and as a convenience.
diff --git a/Documentation/ia64/ia64.rst b/Documentation/ia64/ia64.rst
new file mode 100644
index 000000000000..b725019a9492
--- /dev/null
+++ b/Documentation/ia64/ia64.rst
@@ -0,0 +1,49 @@
+===========================================
+Linux kernel release for the IA-64 Platform
+===========================================
+
+ These are the release notes for Linux since version 2.4 for IA-64
+ platform. This document provides information specific to IA-64
+ ONLY, to get additional information about the Linux kernel also
+ read the original Linux README provided with the kernel.
+
+Installing the Kernel
+=====================
+
+ - IA-64 kernel installation is the same as the other platforms, see
+ original README for details.
+
+
+Software Requirements
+=====================
+
+ Compiling and running this kernel requires an IA-64 compliant GCC
+ compiler. And various software packages also compiled with an
+ IA-64 compliant GCC compiler.
+
+
+Configuring the Kernel
+======================
+
+ Configuration is the same, see original README for details.
+
+
+Compiling the Kernel:
+
+ - Compiling this kernel doesn't differ from other platform so read
+ the original README for details BUT make sure you have an IA-64
+ compliant GCC compiler.
+
+IA-64 Specifics
+===============
+
+ - General issues:
+
+ * Hardly any performance tuning has been done. Obvious targets
+ include the library routines (IP checksum, etc.). Less
+ obvious targets include making sure we don't flush the TLB
+ needlessly, etc.
+
+ * SMP locks cleanup/optimization
+
+ * IA32 support. Currently experimental. It mostly works.
diff --git a/Documentation/ia64/index.rst b/Documentation/ia64/index.rst
new file mode 100644
index 000000000000..0436e1034115
--- /dev/null
+++ b/Documentation/ia64/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==================
+IA-64 Architecture
+==================
+
+.. toctree::
+ :maxdepth: 1
+
+ ia64
+ aliasing
+ efirtc
+ err_inject
+ fsys
+ irq-redir
+ mca
+ serial
+ xen
diff --git a/Documentation/ia64/irq-redir.rst b/Documentation/ia64/irq-redir.rst
new file mode 100644
index 000000000000..39bf94484a15
--- /dev/null
+++ b/Documentation/ia64/irq-redir.rst
@@ -0,0 +1,80 @@
+==============================
+IRQ affinity on IA64 platforms
+==============================
+
+07.01.2002, Erich Focht <efocht@ess.nec.de>
+
+
+By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be
+controlled. The behavior on IA64 platforms is slightly different from
+that described in Documentation/IRQ-affinity.txt for i386 systems.
+
+Because of the usage of SAPIC mode and physical destination mode the
+IRQ target is one particular CPU and cannot be a mask of several
+CPUs. Only the first non-zero bit is taken into account.
+
+
+Usage examples
+==============
+
+The target CPU has to be specified as a hexadecimal CPU mask. The
+first non-zero bit is the selected CPU. This format has been kept for
+compatibility reasons with i386.
+
+Set the delivery mode of interrupt 41 to fixed and route the
+interrupts to CPU #3 (logical CPU number) (2^3=0x08)::
+
+ echo "8" >/proc/irq/41/smp_affinity
+
+Set the default route for IRQ number 41 to CPU 6 in lowest priority
+delivery mode (redirectable)::
+
+ echo "r 40" >/proc/irq/41/smp_affinity
+
+The output of the command::
+
+ cat /proc/irq/IRQ#/smp_affinity
+
+gives the target CPU mask for the specified interrupt vector. If the CPU
+mask is preceded by the character "r", the interrupt is redirectable
+(i.e. lowest priority mode routing is used), otherwise its route is
+fixed.
+
+
+
+Initialization and default behavior
+===================================
+
+If the platform features IRQ redirection (info provided by SAL) all
+IO-SAPIC interrupts are initialized with CPU#0 as their default target
+and the routing is the so called "lowest priority mode" (actually
+fixed SAPIC mode with hint). The XTP chipset registers are used as hints
+for the IRQ routing. Currently in Linux XTP registers can have three
+values:
+
+ - minimal for an idle task,
+ - normal if any other task runs,
+ - maximal if the CPU is going to be switched off.
+
+The IRQ is routed to the CPU with lowest XTP register value, the
+search begins at the default CPU. Therefore most of the interrupts
+will be handled by CPU #0.
+
+If the platform doesn't feature interrupt redirection IOSAPIC fixed
+routing is used. The target CPUs are distributed in a round robin
+manner. IRQs will be routed only to the selected target CPUs. Check
+with::
+
+ cat /proc/interrupts
+
+
+
+Comments
+========
+
+On large (multi-node) systems it is recommended to route the IRQs to
+the node to which the corresponding device is connected.
+For systems like the NEC AzusA we get IRQ node-affinity for free. This
+is because usually the chipsets on each node redirect the interrupts
+only to their own CPUs (as they cannot see the XTP registers on the
+other nodes).
diff --git a/Documentation/ia64/mca.rst b/Documentation/ia64/mca.rst
new file mode 100644
index 000000000000..08270bba44a4
--- /dev/null
+++ b/Documentation/ia64/mca.rst
@@ -0,0 +1,198 @@
+=============================================================
+An ad-hoc collection of notes on IA64 MCA and INIT processing
+=============================================================
+
+Feel free to update it with notes about any area that is not clear.
+
+---
+
+MCA/INIT are completely asynchronous. They can occur at any time, when
+the OS is in any state. Including when one of the cpus is already
+holding a spinlock. Trying to get any lock from MCA/INIT state is
+asking for deadlock. Also the state of structures that are protected
+by locks is indeterminate, including linked lists.
+
+---
+
+The complicated ia64 MCA process. All of this is mandated by Intel's
+specification for ia64 SAL, error recovery and unwind, it is not as
+if we have a choice here.
+
+* MCA occurs on one cpu, usually due to a double bit memory error.
+ This is the monarch cpu.
+
+* SAL sends an MCA rendezvous interrupt (which is a normal interrupt)
+ to all the other cpus, the slaves.
+
+* Slave cpus that receive the MCA interrupt call down into SAL, they
+ end up spinning disabled while the MCA is being serviced.
+
+* If any slave cpu was already spinning disabled when the MCA occurred
+ then it cannot service the MCA interrupt. SAL waits ~20 seconds then
+ sends an unmaskable INIT event to the slave cpus that have not
+ already rendezvoused.
+
+* Because MCA/INIT can be delivered at any time, including when the cpu
+ is down in PAL in physical mode, the registers at the time of the
+ event are _completely_ undefined. In particular the MCA/INIT
+ handlers cannot rely on the thread pointer, PAL physical mode can
+ (and does) modify TP. It is allowed to do that as long as it resets
+ TP on return. However MCA/INIT events expose us to these PAL
+ internal TP changes. Hence curr_task().
+
+* If an MCA/INIT event occurs while the kernel was running (not user
+ space) and the kernel has called PAL then the MCA/INIT handler cannot
+ assume that the kernel stack is in a fit state to be used. Mainly
+ because PAL may or may not maintain the stack pointer internally.
+ Because the MCA/INIT handlers cannot trust the kernel stack, they
+ have to use their own, per-cpu stacks. The MCA/INIT stacks are
+ preformatted with just enough task state to let the relevant handlers
+ do their job.
+
+* Unlike most other architectures, the ia64 struct task is embedded in
+ the kernel stack[1]. So switching to a new kernel stack means that
+ we switch to a new task as well. Because various bits of the kernel
+ assume that current points into the struct task, switching to a new
+ stack also means a new value for current.
+
+* Once all slaves have rendezvoused and are spinning disabled, the
+ monarch is entered. The monarch now tries to diagnose the problem
+ and decide if it can recover or not.
+
+* Part of the monarch's job is to look at the state of all the other
+ tasks. The only way to do that on ia64 is to call the unwinder,
+ as mandated by Intel.
+
+* The starting point for the unwind depends on whether a task is
+ running or not. That is, whether it is on a cpu or is blocked. The
+ monarch has to determine whether or not a task is on a cpu before it
+ knows how to start unwinding it. The tasks that received an MCA or
+ INIT event are no longer running, they have been converted to blocked
+ tasks. But (and its a big but), the cpus that received the MCA
+ rendezvous interrupt are still running on their normal kernel stacks!
+
+* To distinguish between these two cases, the monarch must know which
+ tasks are on a cpu and which are not. Hence each slave cpu that
+ switches to an MCA/INIT stack, registers its new stack using
+ set_curr_task(), so the monarch can tell that the _original_ task is
+ no longer running on that cpu. That gives us a decent chance of
+ getting a valid backtrace of the _original_ task.
+
+* MCA/INIT can be nested, to a depth of 2 on any cpu. In the case of a
+ nested error, we want diagnostics on the MCA/INIT handler that
+ failed, not on the task that was originally running. Again this
+ requires set_curr_task() so the MCA/INIT handlers can register their
+ own stack as running on that cpu. Then a recursive error gets a
+ trace of the failing handler's "task".
+
+[1]
+ My (Keith Owens) original design called for ia64 to separate its
+ struct task and the kernel stacks. Then the MCA/INIT data would be
+ chained stacks like i386 interrupt stacks. But that required
+ radical surgery on the rest of ia64, plus extra hard wired TLB
+ entries with its associated performance degradation. David
+ Mosberger vetoed that approach. Which meant that separate kernel
+ stacks meant separate "tasks" for the MCA/INIT handlers.
+
+---
+
+INIT is less complicated than MCA. Pressing the nmi button or using
+the equivalent command on the management console sends INIT to all
+cpus. SAL picks one of the cpus as the monarch and the rest are
+slaves. All the OS INIT handlers are entered at approximately the same
+time. The OS monarch prints the state of all tasks and returns, after
+which the slaves return and the system resumes.
+
+At least that is what is supposed to happen. Alas there are broken
+versions of SAL out there. Some drive all the cpus as monarchs. Some
+drive them all as slaves. Some drive one cpu as monarch, wait for that
+cpu to return from the OS then drive the rest as slaves. Some versions
+of SAL cannot even cope with returning from the OS, they spin inside
+SAL on resume. The OS INIT code has workarounds for some of these
+broken SAL symptoms, but some simply cannot be fixed from the OS side.
+
+---
+
+The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer
+violations. Unfortunately MCA/INIT start off as massive layer
+violations (can occur at _any_ time) and they build from there.
+
+At least ia64 makes an attempt at recovering from hardware errors, but
+it is a difficult problem because of the asynchronous nature of these
+errors. When processing an unmaskable interrupt we sometimes need
+special code to cope with our inability to take any locks.
+
+---
+
+How is ia64 MCA/INIT different from x86 NMI?
+
+* x86 NMI typically gets delivered to one cpu. MCA/INIT gets sent to
+ all cpus.
+
+* x86 NMI cannot be nested. MCA/INIT can be nested, to a depth of 2
+ per cpu.
+
+* x86 has a separate struct task which points to one of multiple kernel
+ stacks. ia64 has the struct task embedded in the single kernel
+ stack, so switching stack means switching task.
+
+* x86 does not call the BIOS so the NMI handler does not have to worry
+ about any registers having changed. MCA/INIT can occur while the cpu
+ is in PAL in physical mode, with undefined registers and an undefined
+ kernel stack.
+
+* i386 backtrace is not very sensitive to whether a process is running
+ or not. ia64 unwind is very, very sensitive to whether a process is
+ running or not.
+
+---
+
+What happens when MCA/INIT is delivered what a cpu is running user
+space code?
+
+The user mode registers are stored in the RSE area of the MCA/INIT on
+entry to the OS and are restored from there on return to SAL, so user
+mode registers are preserved across a recoverable MCA/INIT. Since the
+OS has no idea what unwind data is available for the user space stack,
+MCA/INIT never tries to backtrace user space. Which means that the OS
+does not bother making the user space process look like a blocked task,
+i.e. the OS does not copy pt_regs and switch_stack to the user space
+stack. Also the OS has no idea how big the user space RSE and memory
+stacks are, which makes it too risky to copy the saved state to a user
+mode stack.
+
+---
+
+How do we get a backtrace on the tasks that were running when MCA/INIT
+was delivered?
+
+mca.c:::ia64_mca_modify_original_stack(). That identifies and
+verifies the original kernel stack, copies the dirty registers from
+the MCA/INIT stack's RSE to the original stack's RSE, copies the
+skeleton struct pt_regs and switch_stack to the original stack, fills
+in the skeleton structures from the PAL minstate area and updates the
+original stack's thread.ksp. That makes the original stack look
+exactly like any other blocked task, i.e. it now appears to be
+sleeping. To get a backtrace, just start with thread.ksp for the
+original task and unwind like any other sleeping task.
+
+---
+
+How do we identify the tasks that were running when MCA/INIT was
+delivered?
+
+If the previous task has been verified and converted to a blocked
+state, then sos->prev_task on the MCA/INIT stack is updated to point to
+the previous task. You can look at that field in dumps or debuggers.
+To help distinguish between the handler and the original tasks,
+handlers have _TIF_MCA_INIT set in thread_info.flags.
+
+The sos data is always in the MCA/INIT handler stack, at offset
+MCA_SOS_OFFSET. You can get that value from mca_asm.h or calculate it
+as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct
+ia64_sal_os_state), with 16 byte alignment for all structures.
+
+Also the comm field of the MCA/INIT task is modified to include the pid
+of the original task, for humans to use. For example, a comm field of
+'MCA 12159' means that pid 12159 was running when the MCA was
+delivered.
diff --git a/Documentation/ia64/mca.txt b/Documentation/ia64/mca.txt
deleted file mode 100644
index f097c60cba1b..000000000000
--- a/Documentation/ia64/mca.txt
+++ /dev/null
@@ -1,194 +0,0 @@
-An ad-hoc collection of notes on IA64 MCA and INIT processing. Feel
-free to update it with notes about any area that is not clear.
-
----
-
-MCA/INIT are completely asynchronous. They can occur at any time, when
-the OS is in any state. Including when one of the cpus is already
-holding a spinlock. Trying to get any lock from MCA/INIT state is
-asking for deadlock. Also the state of structures that are protected
-by locks is indeterminate, including linked lists.
-
----
-
-The complicated ia64 MCA process. All of this is mandated by Intel's
-specification for ia64 SAL, error recovery and unwind, it is not as
-if we have a choice here.
-
-* MCA occurs on one cpu, usually due to a double bit memory error.
- This is the monarch cpu.
-
-* SAL sends an MCA rendezvous interrupt (which is a normal interrupt)
- to all the other cpus, the slaves.
-
-* Slave cpus that receive the MCA interrupt call down into SAL, they
- end up spinning disabled while the MCA is being serviced.
-
-* If any slave cpu was already spinning disabled when the MCA occurred
- then it cannot service the MCA interrupt. SAL waits ~20 seconds then
- sends an unmaskable INIT event to the slave cpus that have not
- already rendezvoused.
-
-* Because MCA/INIT can be delivered at any time, including when the cpu
- is down in PAL in physical mode, the registers at the time of the
- event are _completely_ undefined. In particular the MCA/INIT
- handlers cannot rely on the thread pointer, PAL physical mode can
- (and does) modify TP. It is allowed to do that as long as it resets
- TP on return. However MCA/INIT events expose us to these PAL
- internal TP changes. Hence curr_task().
-
-* If an MCA/INIT event occurs while the kernel was running (not user
- space) and the kernel has called PAL then the MCA/INIT handler cannot
- assume that the kernel stack is in a fit state to be used. Mainly
- because PAL may or may not maintain the stack pointer internally.
- Because the MCA/INIT handlers cannot trust the kernel stack, they
- have to use their own, per-cpu stacks. The MCA/INIT stacks are
- preformatted with just enough task state to let the relevant handlers
- do their job.
-
-* Unlike most other architectures, the ia64 struct task is embedded in
- the kernel stack[1]. So switching to a new kernel stack means that
- we switch to a new task as well. Because various bits of the kernel
- assume that current points into the struct task, switching to a new
- stack also means a new value for current.
-
-* Once all slaves have rendezvoused and are spinning disabled, the
- monarch is entered. The monarch now tries to diagnose the problem
- and decide if it can recover or not.
-
-* Part of the monarch's job is to look at the state of all the other
- tasks. The only way to do that on ia64 is to call the unwinder,
- as mandated by Intel.
-
-* The starting point for the unwind depends on whether a task is
- running or not. That is, whether it is on a cpu or is blocked. The
- monarch has to determine whether or not a task is on a cpu before it
- knows how to start unwinding it. The tasks that received an MCA or
- INIT event are no longer running, they have been converted to blocked
- tasks. But (and its a big but), the cpus that received the MCA
- rendezvous interrupt are still running on their normal kernel stacks!
-
-* To distinguish between these two cases, the monarch must know which
- tasks are on a cpu and which are not. Hence each slave cpu that
- switches to an MCA/INIT stack, registers its new stack using
- set_curr_task(), so the monarch can tell that the _original_ task is
- no longer running on that cpu. That gives us a decent chance of
- getting a valid backtrace of the _original_ task.
-
-* MCA/INIT can be nested, to a depth of 2 on any cpu. In the case of a
- nested error, we want diagnostics on the MCA/INIT handler that
- failed, not on the task that was originally running. Again this
- requires set_curr_task() so the MCA/INIT handlers can register their
- own stack as running on that cpu. Then a recursive error gets a
- trace of the failing handler's "task".
-
-[1] My (Keith Owens) original design called for ia64 to separate its
- struct task and the kernel stacks. Then the MCA/INIT data would be
- chained stacks like i386 interrupt stacks. But that required
- radical surgery on the rest of ia64, plus extra hard wired TLB
- entries with its associated performance degradation. David
- Mosberger vetoed that approach. Which meant that separate kernel
- stacks meant separate "tasks" for the MCA/INIT handlers.
-
----
-
-INIT is less complicated than MCA. Pressing the nmi button or using
-the equivalent command on the management console sends INIT to all
-cpus. SAL picks one of the cpus as the monarch and the rest are
-slaves. All the OS INIT handlers are entered at approximately the same
-time. The OS monarch prints the state of all tasks and returns, after
-which the slaves return and the system resumes.
-
-At least that is what is supposed to happen. Alas there are broken
-versions of SAL out there. Some drive all the cpus as monarchs. Some
-drive them all as slaves. Some drive one cpu as monarch, wait for that
-cpu to return from the OS then drive the rest as slaves. Some versions
-of SAL cannot even cope with returning from the OS, they spin inside
-SAL on resume. The OS INIT code has workarounds for some of these
-broken SAL symptoms, but some simply cannot be fixed from the OS side.
-
----
-
-The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer
-violations. Unfortunately MCA/INIT start off as massive layer
-violations (can occur at _any_ time) and they build from there.
-
-At least ia64 makes an attempt at recovering from hardware errors, but
-it is a difficult problem because of the asynchronous nature of these
-errors. When processing an unmaskable interrupt we sometimes need
-special code to cope with our inability to take any locks.
-
----
-
-How is ia64 MCA/INIT different from x86 NMI?
-
-* x86 NMI typically gets delivered to one cpu. MCA/INIT gets sent to
- all cpus.
-
-* x86 NMI cannot be nested. MCA/INIT can be nested, to a depth of 2
- per cpu.
-
-* x86 has a separate struct task which points to one of multiple kernel
- stacks. ia64 has the struct task embedded in the single kernel
- stack, so switching stack means switching task.
-
-* x86 does not call the BIOS so the NMI handler does not have to worry
- about any registers having changed. MCA/INIT can occur while the cpu
- is in PAL in physical mode, with undefined registers and an undefined
- kernel stack.
-
-* i386 backtrace is not very sensitive to whether a process is running
- or not. ia64 unwind is very, very sensitive to whether a process is
- running or not.
-
----
-
-What happens when MCA/INIT is delivered what a cpu is running user
-space code?
-
-The user mode registers are stored in the RSE area of the MCA/INIT on
-entry to the OS and are restored from there on return to SAL, so user
-mode registers are preserved across a recoverable MCA/INIT. Since the
-OS has no idea what unwind data is available for the user space stack,
-MCA/INIT never tries to backtrace user space. Which means that the OS
-does not bother making the user space process look like a blocked task,
-i.e. the OS does not copy pt_regs and switch_stack to the user space
-stack. Also the OS has no idea how big the user space RSE and memory
-stacks are, which makes it too risky to copy the saved state to a user
-mode stack.
-
----
-
-How do we get a backtrace on the tasks that were running when MCA/INIT
-was delivered?
-
-mca.c:::ia64_mca_modify_original_stack(). That identifies and
-verifies the original kernel stack, copies the dirty registers from
-the MCA/INIT stack's RSE to the original stack's RSE, copies the
-skeleton struct pt_regs and switch_stack to the original stack, fills
-in the skeleton structures from the PAL minstate area and updates the
-original stack's thread.ksp. That makes the original stack look
-exactly like any other blocked task, i.e. it now appears to be
-sleeping. To get a backtrace, just start with thread.ksp for the
-original task and unwind like any other sleeping task.
-
----
-
-How do we identify the tasks that were running when MCA/INIT was
-delivered?
-
-If the previous task has been verified and converted to a blocked
-state, then sos->prev_task on the MCA/INIT stack is updated to point to
-the previous task. You can look at that field in dumps or debuggers.
-To help distinguish between the handler and the original tasks,
-handlers have _TIF_MCA_INIT set in thread_info.flags.
-
-The sos data is always in the MCA/INIT handler stack, at offset
-MCA_SOS_OFFSET. You can get that value from mca_asm.h or calculate it
-as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct
-ia64_sal_os_state), with 16 byte alignment for all structures.
-
-Also the comm field of the MCA/INIT task is modified to include the pid
-of the original task, for humans to use. For example, a comm field of
-'MCA 12159' means that pid 12159 was running when the MCA was
-delivered.
diff --git a/Documentation/ia64/serial.rst b/Documentation/ia64/serial.rst
new file mode 100644
index 000000000000..1de70c305a79
--- /dev/null
+++ b/Documentation/ia64/serial.rst
@@ -0,0 +1,165 @@
+==============
+Serial Devices
+==============
+
+Serial Device Naming
+====================
+
+ As of 2.6.10, serial devices on ia64 are named based on the
+ order of ACPI and PCI enumeration. The first device in the
+ ACPI namespace (if any) becomes /dev/ttyS0, the second becomes
+ /dev/ttyS1, etc., and PCI devices are named sequentially
+ starting after the ACPI devices.
+
+ Prior to 2.6.10, there were confusing exceptions to this:
+
+ - Firmware on some machines (mostly from HP) provides an HCDP
+ table[1] that tells the kernel about devices that can be used
+ as a serial console. If the user specified "console=ttyS0"
+ or the EFI ConOut path contained only UART devices, the
+ kernel registered the device described by the HCDP as
+ /dev/ttyS0.
+
+ - If there was no HCDP, we assumed there were UARTs at the
+ legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so
+ the kernel registered those as /dev/ttyS0 and /dev/ttyS1.
+
+ Any additional ACPI or PCI devices were registered sequentially
+ after /dev/ttyS0 as they were discovered.
+
+ With an HCDP, device names changed depending on EFI configuration
+ and "console=" arguments. Without an HCDP, device names didn't
+ change, but we registered devices that might not really exist.
+
+ For example, an HP rx1600 with a single built-in serial port
+ (described in the ACPI namespace) plus an MP[2] (a PCI device) has
+ these ports:
+
+ ========== ========== ============ ============ =======
+ Type MMIO pre-2.6.10 pre-2.6.10 2.6.10+
+ address
+ (EFI console (EFI console
+ on builtin) on MP port)
+ ========== ========== ============ ============ =======
+ builtin 0xff5e0000 ttyS0 ttyS1 ttyS0
+ MP UPS 0xf8031000 ttyS1 ttyS2 ttyS1
+ MP Console 0xf8030000 ttyS2 ttyS0 ttyS2
+ MP 2 0xf8030010 ttyS3 ttyS3 ttyS3
+ MP 3 0xf8030038 ttyS4 ttyS4 ttyS4
+ ========== ========== ============ ============ =======
+
+Console Selection
+=================
+
+ EFI knows what your console devices are, but it doesn't tell the
+ kernel quite enough to actually locate them. The DIG64 HCDP
+ table[1] does tell the kernel where potential serial console
+ devices are, but not all firmware supplies it. Also, EFI supports
+ multiple simultaneous consoles and doesn't tell the kernel which
+ should be the "primary" one.
+
+ So how do you tell Linux which console device to use?
+
+ - If your firmware supplies the HCDP, it is simplest to
+ configure EFI with a single device (either a UART or a VGA
+ card) as the console. Then you don't need to tell Linux
+ anything; the kernel will automatically use the EFI console.
+
+ (This works only in 2.6.6 or later; prior to that you had
+ to specify "console=ttyS0" to get a serial console.)
+
+ - Without an HCDP, Linux defaults to a VGA console unless you
+ specify a "console=" argument.
+
+ NOTE: Don't assume that a serial console device will be /dev/ttyS0.
+ It might be ttyS1, ttyS2, etc. Make sure you have the appropriate
+ entries in /etc/inittab (for getty) and /etc/securetty (to allow
+ root login).
+
+Early Serial Console
+====================
+
+ The kernel can't start using a serial console until it knows where
+ the device lives. Normally this happens when the driver enumerates
+ all the serial devices, which can happen a minute or more after the
+ kernel starts booting.
+
+ 2.6.10 and later kernels have an "early uart" driver that works
+ very early in the boot process. The kernel will automatically use
+ this if the user supplies an argument like "console=uart,io,0x3f8",
+ or if the EFI console path contains only a UART device and the
+ firmware supplies an HCDP.
+
+Troubleshooting Serial Console Problems
+=======================================
+
+ No kernel output after elilo prints "Uncompressing Linux... done":
+
+ - You specified "console=ttyS0" but Linux changed the device
+ to which ttyS0 refers. Configure exactly one EFI console
+ device[3] and remove the "console=" option.
+
+ - The EFI console path contains both a VGA device and a UART.
+ EFI and elilo use both, but Linux defaults to VGA. Remove
+ the VGA device from the EFI console path[3].
+
+ - Multiple UARTs selected as EFI console devices. EFI and
+ elilo use all selected devices, but Linux uses only one.
+ Make sure only one UART is selected in the EFI console
+ path[3].
+
+ - You're connected to an HP MP port[2] but have a non-MP UART
+ selected as EFI console device. EFI uses the MP as a
+ console device even when it isn't explicitly selected.
+ Either move the console cable to the non-MP UART, or change
+ the EFI console path[3] to the MP UART.
+
+ Long pause (60+ seconds) between "Uncompressing Linux... done" and
+ start of kernel output:
+
+ - No early console because you used "console=ttyS<n>". Remove
+ the "console=" option if your firmware supplies an HCDP.
+
+ - If you don't have an HCDP, the kernel doesn't know where
+ your console lives until the driver discovers serial
+ devices. Use "console=uart,io,0x3f8" (or appropriate
+ address for your machine).
+
+ Kernel and init script output works fine, but no "login:" prompt:
+
+ - Add getty entry to /etc/inittab for console tty. Look for
+ the "Adding console on ttyS<n>" message that tells you which
+ device is the console.
+
+ "login:" prompt, but can't login as root:
+
+ - Add entry to /etc/securetty for console tty.
+
+ No ACPI serial devices found in 2.6.17 or later:
+
+ - Turn on CONFIG_PNP and CONFIG_PNPACPI. Prior to 2.6.17, ACPI
+ serial devices were discovered by 8250_acpi. In 2.6.17,
+ 8250_acpi was replaced by the combination of 8250_pnp and
+ CONFIG_PNPACPI.
+
+
+
+[1]
+ http://www.dig64.org/specifications/agreement
+ The table was originally defined as the "HCDP" for "Headless
+ Console/Debug Port." The current version is the "PCDP" for
+ "Primary Console and Debug Port Devices."
+
+[2]
+ The HP MP (management processor) is a PCI device that provides
+ several UARTs. One of the UARTs is often used as a console; the
+ EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart".
+ The external connection is usually a 25-pin connector, and a
+ special dongle converts that to three 9-pin connectors, one of
+ which is labelled "Console."
+
+[3]
+ EFI console devices are configured using the EFI Boot Manager
+ "Boot option maintenance" menu. You may have to interrupt the
+ boot sequence to use this menu, and you will have to reset the
+ box after changing console configuration.
diff --git a/Documentation/ia64/serial.txt b/Documentation/ia64/serial.txt
deleted file mode 100644
index a63d2c54329b..000000000000
--- a/Documentation/ia64/serial.txt
+++ /dev/null
@@ -1,151 +0,0 @@
-SERIAL DEVICE NAMING
-
- As of 2.6.10, serial devices on ia64 are named based on the
- order of ACPI and PCI enumeration. The first device in the
- ACPI namespace (if any) becomes /dev/ttyS0, the second becomes
- /dev/ttyS1, etc., and PCI devices are named sequentially
- starting after the ACPI devices.
-
- Prior to 2.6.10, there were confusing exceptions to this:
-
- - Firmware on some machines (mostly from HP) provides an HCDP
- table[1] that tells the kernel about devices that can be used
- as a serial console. If the user specified "console=ttyS0"
- or the EFI ConOut path contained only UART devices, the
- kernel registered the device described by the HCDP as
- /dev/ttyS0.
-
- - If there was no HCDP, we assumed there were UARTs at the
- legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so
- the kernel registered those as /dev/ttyS0 and /dev/ttyS1.
-
- Any additional ACPI or PCI devices were registered sequentially
- after /dev/ttyS0 as they were discovered.
-
- With an HCDP, device names changed depending on EFI configuration
- and "console=" arguments. Without an HCDP, device names didn't
- change, but we registered devices that might not really exist.
-
- For example, an HP rx1600 with a single built-in serial port
- (described in the ACPI namespace) plus an MP[2] (a PCI device) has
- these ports:
-
- pre-2.6.10 pre-2.6.10
- MMIO (EFI console (EFI console
- address on builtin) on MP port) 2.6.10
- ========== ========== ========== ======
- builtin 0xff5e0000 ttyS0 ttyS1 ttyS0
- MP UPS 0xf8031000 ttyS1 ttyS2 ttyS1
- MP Console 0xf8030000 ttyS2 ttyS0 ttyS2
- MP 2 0xf8030010 ttyS3 ttyS3 ttyS3
- MP 3 0xf8030038 ttyS4 ttyS4 ttyS4
-
-CONSOLE SELECTION
-
- EFI knows what your console devices are, but it doesn't tell the
- kernel quite enough to actually locate them. The DIG64 HCDP
- table[1] does tell the kernel where potential serial console
- devices are, but not all firmware supplies it. Also, EFI supports
- multiple simultaneous consoles and doesn't tell the kernel which
- should be the "primary" one.
-
- So how do you tell Linux which console device to use?
-
- - If your firmware supplies the HCDP, it is simplest to
- configure EFI with a single device (either a UART or a VGA
- card) as the console. Then you don't need to tell Linux
- anything; the kernel will automatically use the EFI console.
-
- (This works only in 2.6.6 or later; prior to that you had
- to specify "console=ttyS0" to get a serial console.)
-
- - Without an HCDP, Linux defaults to a VGA console unless you
- specify a "console=" argument.
-
- NOTE: Don't assume that a serial console device will be /dev/ttyS0.
- It might be ttyS1, ttyS2, etc. Make sure you have the appropriate
- entries in /etc/inittab (for getty) and /etc/securetty (to allow
- root login).
-
-EARLY SERIAL CONSOLE
-
- The kernel can't start using a serial console until it knows where
- the device lives. Normally this happens when the driver enumerates
- all the serial devices, which can happen a minute or more after the
- kernel starts booting.
-
- 2.6.10 and later kernels have an "early uart" driver that works
- very early in the boot process. The kernel will automatically use
- this if the user supplies an argument like "console=uart,io,0x3f8",
- or if the EFI console path contains only a UART device and the
- firmware supplies an HCDP.
-
-TROUBLESHOOTING SERIAL CONSOLE PROBLEMS
-
- No kernel output after elilo prints "Uncompressing Linux... done":
-
- - You specified "console=ttyS0" but Linux changed the device
- to which ttyS0 refers. Configure exactly one EFI console
- device[3] and remove the "console=" option.
-
- - The EFI console path contains both a VGA device and a UART.
- EFI and elilo use both, but Linux defaults to VGA. Remove
- the VGA device from the EFI console path[3].
-
- - Multiple UARTs selected as EFI console devices. EFI and
- elilo use all selected devices, but Linux uses only one.
- Make sure only one UART is selected in the EFI console
- path[3].
-
- - You're connected to an HP MP port[2] but have a non-MP UART
- selected as EFI console device. EFI uses the MP as a
- console device even when it isn't explicitly selected.
- Either move the console cable to the non-MP UART, or change
- the EFI console path[3] to the MP UART.
-
- Long pause (60+ seconds) between "Uncompressing Linux... done" and
- start of kernel output:
-
- - No early console because you used "console=ttyS<n>". Remove
- the "console=" option if your firmware supplies an HCDP.
-
- - If you don't have an HCDP, the kernel doesn't know where
- your console lives until the driver discovers serial
- devices. Use "console=uart,io,0x3f8" (or appropriate
- address for your machine).
-
- Kernel and init script output works fine, but no "login:" prompt:
-
- - Add getty entry to /etc/inittab for console tty. Look for
- the "Adding console on ttyS<n>" message that tells you which
- device is the console.
-
- "login:" prompt, but can't login as root:
-
- - Add entry to /etc/securetty for console tty.
-
- No ACPI serial devices found in 2.6.17 or later:
-
- - Turn on CONFIG_PNP and CONFIG_PNPACPI. Prior to 2.6.17, ACPI
- serial devices were discovered by 8250_acpi. In 2.6.17,
- 8250_acpi was replaced by the combination of 8250_pnp and
- CONFIG_PNPACPI.
-
-
-
-[1] http://www.dig64.org/specifications/agreement
- The table was originally defined as the "HCDP" for "Headless
- Console/Debug Port." The current version is the "PCDP" for
- "Primary Console and Debug Port Devices."
-
-[2] The HP MP (management processor) is a PCI device that provides
- several UARTs. One of the UARTs is often used as a console; the
- EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart".
- The external connection is usually a 25-pin connector, and a
- special dongle converts that to three 9-pin connectors, one of
- which is labelled "Console."
-
-[3] EFI console devices are configured using the EFI Boot Manager
- "Boot option maintenance" menu. You may have to interrupt the
- boot sequence to use this menu, and you will have to reset the
- box after changing console configuration.
diff --git a/Documentation/ia64/xen.rst b/Documentation/ia64/xen.rst
new file mode 100644
index 000000000000..831339c74441
--- /dev/null
+++ b/Documentation/ia64/xen.rst
@@ -0,0 +1,206 @@
+********************************************************
+Recipe for getting/building/running Xen/ia64 with pv_ops
+********************************************************
+This recipe describes how to get xen-ia64 source and build it,
+and run domU with pv_ops.
+
+Requirements
+============
+
+ - python
+ - mercurial
+ it (aka "hg") is an open-source source code
+ management software. See the below.
+ http://www.selenic.com/mercurial/wiki/
+ - git
+ - bridge-utils
+
+Getting and Building Xen and Dom0
+=================================
+
+ My environment is:
+
+ - Machine : Tiger4
+ - Domain0 OS : RHEL5
+ - DomainU OS : RHEL5
+
+ 1. Download source::
+
+ # hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg
+ # cd xen-unstable.hg
+ # hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg
+
+ 2. # make world
+
+ 3. # make install-tools
+
+ 4. copy kernels and xen::
+
+ # cp xen/xen.gz /boot/efi/efi/redhat/
+ # cp build-linux-2.6.18-xen_ia64/vmlinux.gz \
+ /boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen
+
+ 5. make initrd for Dom0/DomU::
+
+ # make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \
+ O=$(pwd)/build-linux-2.6.18-xen_ia64
+ # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \
+ 2.6.18.8-xen --builtin mptspi --builtin mptbase \
+ --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \
+ --builtin ehci-hcd
+
+Making a disk image for guest OS
+================================
+
+ 1. make file::
+
+ # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0
+ # mke2fs -F -j /root/rhel5.img
+ # mount -o loop /root/rhel5.img /mnt
+ # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt
+ # mkdir /mnt/{root,proc,sys,home,tmp}
+
+ Note: You may miss some device files. If so, please create them
+ with mknod. Or you can use tar instead of cp.
+
+ 2. modify DomU's fstab::
+
+ # vi /mnt/etc/fstab
+ /dev/xvda1 / ext3 defaults 1 1
+ none /dev/pts devpts gid=5,mode=620 0 0
+ none /dev/shm tmpfs defaults 0 0
+ none /proc proc defaults 0 0
+ none /sys sysfs defaults 0 0
+
+ 3. modify inittab
+
+ set runlevel to 3 to avoid X trying to start::
+
+ # vi /mnt/etc/inittab
+ id:3:initdefault:
+
+ Start a getty on the hvc0 console::
+
+ X0:2345:respawn:/sbin/mingetty hvc0
+
+ tty1-6 mingetty can be commented out
+
+ 4. add hvc0 into /etc/securetty::
+
+ # vi /mnt/etc/securetty (add hvc0)
+
+ 5. umount::
+
+ # umount /mnt
+
+FYI, virt-manager can also make a disk image for guest OS.
+It's GUI tools and easy to make it.
+
+Boot Xen & Domain0
+==================
+
+ 1. replace elilo
+ elilo of RHEL5 can boot Xen and Dom0.
+ If you use old elilo (e.g RHEL4), please download from the below
+ http://elilo.sourceforge.net/cgi-bin/blosxom
+ and copy into /boot/efi/efi/redhat/::
+
+ # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi
+
+ 2. modify elilo.conf (like the below)::
+
+ # vi /boot/efi/efi/redhat/elilo.conf
+ prompt
+ timeout=20
+ default=xen
+ relocatable
+
+ image=vmlinuz-2.6.18.8-xen
+ label=xen
+ vmm=xen.gz
+ initrd=initrd-2.6.18.8-xen.img
+ read-only
+ append=" -- rhgb root=/dev/sda2"
+
+The append options before "--" are for xen hypervisor,
+the options after "--" are for dom0.
+
+FYI, your machine may need console options like
+"com1=19200,8n1 console=vga,com1". For example,
+append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \
+console=ttyS0 root=/dev/sda2"
+
+Getting and Building domU with pv_ops
+=====================================
+
+ 1. get pv_ops tree::
+
+ # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/
+
+ 2. git branch (if necessary)::
+
+ # cd linux-2.6-xen-ia64/
+ # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19
+
+ Note:
+ The current branch is xen-ia64-domu-minimal-2008may19.
+ But you would find the new branch. You can see with
+ "git branch -r" to get the branch lists.
+
+ http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/
+
+ is also available.
+
+ The tree is based on
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test)
+
+ 3. copy .config for pv_ops of domU::
+
+ # cp arch/ia64/configs/xen_domu_wip_defconfig .config
+
+ 4. make kernel with pv_ops::
+
+ # make oldconfig
+ # make
+
+ 5. install the kernel and initrd::
+
+ # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU
+ # make modules_install
+ # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \
+ 2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \
+ --builtin mptbase --builtin mptscsih --builtin uhci-hcd \
+ --builtin ohci-hcd --builtin ehci-hcd
+
+Boot DomainU with pv_ops
+========================
+
+ 1. make config of DomU::
+
+ # vi /etc/xen/rhel5
+ kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU"
+ ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img"
+ vcpus = 1
+ memory = 512
+ name = "rhel5"
+ disk = [ 'file:/root/rhel5.img,xvda1,w' ]
+ root = "/dev/xvda1 ro"
+ extra= "rhgb console=hvc0"
+
+ 2. After boot xen and dom0, start xend::
+
+ # /etc/init.d/xend start
+
+ ( In the debugging case, `# XEND_DEBUG=1 xend trace_start` )
+
+ 3. start domU::
+
+ # xm create -c rhel5
+
+Reference
+=========
+- Wiki of Xen/IA64 upstream merge
+ http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge
+
+Written by Akio Takebe <takebe_akio@jp.fujitsu.com> on 28 May 2008
diff --git a/Documentation/ia64/xen.txt b/Documentation/ia64/xen.txt
deleted file mode 100644
index a12c74ce2773..000000000000
--- a/Documentation/ia64/xen.txt
+++ /dev/null
@@ -1,183 +0,0 @@
- Recipe for getting/building/running Xen/ia64 with pv_ops
- --------------------------------------------------------
-
-This recipe describes how to get xen-ia64 source and build it,
-and run domU with pv_ops.
-
-============
-Requirements
-============
-
- - python
- - mercurial
- it (aka "hg") is an open-source source code
- management software. See the below.
- http://www.selenic.com/mercurial/wiki/
- - git
- - bridge-utils
-
-=================================
-Getting and Building Xen and Dom0
-=================================
-
- My environment is;
- Machine : Tiger4
- Domain0 OS : RHEL5
- DomainU OS : RHEL5
-
- 1. Download source
- # hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg
- # cd xen-unstable.hg
- # hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg
-
- 2. # make world
-
- 3. # make install-tools
-
- 4. copy kernels and xen
- # cp xen/xen.gz /boot/efi/efi/redhat/
- # cp build-linux-2.6.18-xen_ia64/vmlinux.gz \
- /boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen
-
- 5. make initrd for Dom0/DomU
- # make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \
- O=$(pwd)/build-linux-2.6.18-xen_ia64
- # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \
- 2.6.18.8-xen --builtin mptspi --builtin mptbase \
- --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \
- --builtin ehci-hcd
-
-================================
-Making a disk image for guest OS
-================================
-
- 1. make file
- # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0
- # mke2fs -F -j /root/rhel5.img
- # mount -o loop /root/rhel5.img /mnt
- # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt
- # mkdir /mnt/{root,proc,sys,home,tmp}
-
- Note: You may miss some device files. If so, please create them
- with mknod. Or you can use tar instead of cp.
-
- 2. modify DomU's fstab
- # vi /mnt/etc/fstab
- /dev/xvda1 / ext3 defaults 1 1
- none /dev/pts devpts gid=5,mode=620 0 0
- none /dev/shm tmpfs defaults 0 0
- none /proc proc defaults 0 0
- none /sys sysfs defaults 0 0
-
- 3. modify inittab
- set runlevel to 3 to avoid X trying to start
- # vi /mnt/etc/inittab
- id:3:initdefault:
- Start a getty on the hvc0 console
- X0:2345:respawn:/sbin/mingetty hvc0
- tty1-6 mingetty can be commented out
-
- 4. add hvc0 into /etc/securetty
- # vi /mnt/etc/securetty (add hvc0)
-
- 5. umount
- # umount /mnt
-
-FYI, virt-manager can also make a disk image for guest OS.
-It's GUI tools and easy to make it.
-
-==================
-Boot Xen & Domain0
-==================
-
- 1. replace elilo
- elilo of RHEL5 can boot Xen and Dom0.
- If you use old elilo (e.g RHEL4), please download from the below
- http://elilo.sourceforge.net/cgi-bin/blosxom
- and copy into /boot/efi/efi/redhat/
- # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi
-
- 2. modify elilo.conf (like the below)
- # vi /boot/efi/efi/redhat/elilo.conf
- prompt
- timeout=20
- default=xen
- relocatable
-
- image=vmlinuz-2.6.18.8-xen
- label=xen
- vmm=xen.gz
- initrd=initrd-2.6.18.8-xen.img
- read-only
- append=" -- rhgb root=/dev/sda2"
-
-The append options before "--" are for xen hypervisor,
-the options after "--" are for dom0.
-
-FYI, your machine may need console options like
-"com1=19200,8n1 console=vga,com1". For example,
-append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \
-console=ttyS0 root=/dev/sda2"
-
-=====================================
-Getting and Building domU with pv_ops
-=====================================
-
- 1. get pv_ops tree
- # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/
-
- 2. git branch (if necessary)
- # cd linux-2.6-xen-ia64/
- # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19
- (Note: The current branch is xen-ia64-domu-minimal-2008may19.
- But you would find the new branch. You can see with
- "git branch -r" to get the branch lists.
- http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/
- is also available. The tree is based on
- git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test)
-
-
- 3. copy .config for pv_ops of domU
- # cp arch/ia64/configs/xen_domu_wip_defconfig .config
-
- 4. make kernel with pv_ops
- # make oldconfig
- # make
-
- 5. install the kernel and initrd
- # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU
- # make modules_install
- # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \
- 2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \
- --builtin mptbase --builtin mptscsih --builtin uhci-hcd \
- --builtin ohci-hcd --builtin ehci-hcd
-
-========================
-Boot DomainU with pv_ops
-========================
-
- 1. make config of DomU
- # vi /etc/xen/rhel5
- kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU"
- ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img"
- vcpus = 1
- memory = 512
- name = "rhel5"
- disk = [ 'file:/root/rhel5.img,xvda1,w' ]
- root = "/dev/xvda1 ro"
- extra= "rhgb console=hvc0"
-
- 2. After boot xen and dom0, start xend
- # /etc/init.d/xend start
- ( In the debugging case, # XEND_DEBUG=1 xend trace_start )
-
- 3. start domU
- # xm create -c rhel5
-
-=========
-Reference
-=========
-- Wiki of Xen/IA64 upstream merge
- http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge
-
-Written by Akio Takebe <takebe_akio@jp.fujitsu.com> on 28 May 2008
diff --git a/Documentation/ide/index.rst b/Documentation/ide/index.rst
index 45bc12d3957f..813dfe611a31 100644
--- a/Documentation/ide/index.rst
+++ b/Documentation/ide/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
==================================
Integrated Drive Electronics (IDE)
diff --git a/Documentation/iio/ep93xx_adc.rst b/Documentation/iio/ep93xx_adc.rst
new file mode 100644
index 000000000000..4fd8dea3f6b8
--- /dev/null
+++ b/Documentation/iio/ep93xx_adc.rst
@@ -0,0 +1,40 @@
+==============================
+Cirrus Logic EP93xx ADC driver
+==============================
+
+1. Overview
+===========
+
+The driver is intended to work on both low-end (EP9301, EP9302) devices with
+5-channel ADC and high-end (EP9307, EP9312, EP9315) devices with 10-channel
+touchscreen/ADC module.
+
+2. Channel numbering
+====================
+
+Numbering scheme for channels 0..4 is defined in EP9301 and EP9302 datasheets.
+EP9307, EP9312 and EP9312 have 3 channels more (total 8), but the numbering is
+not defined. So the last three are numbered randomly, let's say.
+
+Assuming ep93xx_adc is IIO device0, you'd find the following entries under
+/sys/bus/iio/devices/iio:device0/:
+
+ +-----------------+---------------+
+ | sysfs entry | ball/pin name |
+ +=================+===============+
+ | in_voltage0_raw | YM |
+ +-----------------+---------------+
+ | in_voltage1_raw | SXP |
+ +-----------------+---------------+
+ | in_voltage2_raw | SXM |
+ +-----------------+---------------+
+ | in_voltage3_raw | SYP |
+ +-----------------+---------------+
+ | in_voltage4_raw | SYM |
+ +-----------------+---------------+
+ | in_voltage5_raw | XP |
+ +-----------------+---------------+
+ | in_voltage6_raw | XM |
+ +-----------------+---------------+
+ | in_voltage7_raw | YP |
+ +-----------------+---------------+
diff --git a/Documentation/iio/ep93xx_adc.txt b/Documentation/iio/ep93xx_adc.txt
deleted file mode 100644
index 23053e7817bd..000000000000
--- a/Documentation/iio/ep93xx_adc.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-Cirrus Logic EP93xx ADC driver.
-
-1. Overview
-
-The driver is intended to work on both low-end (EP9301, EP9302) devices with
-5-channel ADC and high-end (EP9307, EP9312, EP9315) devices with 10-channel
-touchscreen/ADC module.
-
-2. Channel numbering
-
-Numbering scheme for channels 0..4 is defined in EP9301 and EP9302 datasheets.
-EP9307, EP9312 and EP9312 have 3 channels more (total 8), but the numbering is
-not defined. So the last three are numbered randomly, let's say.
-
-Assuming ep93xx_adc is IIO device0, you'd find the following entries under
-/sys/bus/iio/devices/iio:device0/:
-
- +-----------------+---------------+
- | sysfs entry | ball/pin name |
- +-----------------+---------------+
- | in_voltage0_raw | YM |
- | in_voltage1_raw | SXP |
- | in_voltage2_raw | SXM |
- | in_voltage3_raw | SYP |
- | in_voltage4_raw | SYM |
- | in_voltage5_raw | XP |
- | in_voltage6_raw | XM |
- | in_voltage7_raw | YP |
- +-----------------+---------------+
diff --git a/Documentation/iio/iio_configfs.rst b/Documentation/iio/iio_configfs.rst
new file mode 100644
index 000000000000..ecbfdb3afef7
--- /dev/null
+++ b/Documentation/iio/iio_configfs.rst
@@ -0,0 +1,101 @@
+===============================
+Industrial IIO configfs support
+===============================
+
+1. Overview
+===========
+
+Configfs is a filesystem-based manager of kernel objects. IIO uses some
+objects that could be easily configured using configfs (e.g.: devices,
+triggers).
+
+See Documentation/filesystems/configfs/configfs.txt for more information
+about how configfs works.
+
+2. Usage
+========
+
+In order to use configfs support in IIO we need to select it at compile
+time via CONFIG_IIO_CONFIGFS config option.
+
+Then, mount the configfs filesystem (usually under /config directory)::
+
+ $ mkdir /config
+ $ mount -t configfs none /config
+
+At this point, all default IIO groups will be created and can be accessed
+under /config/iio. Next chapters will describe available IIO configuration
+objects.
+
+3. Software triggers
+====================
+
+One of the IIO default configfs groups is the "triggers" group. It is
+automagically accessible when the configfs is mounted and can be found
+under /config/iio/triggers.
+
+IIO software triggers implementation offers support for creating multiple
+trigger types. A new trigger type is usually implemented as a separate
+kernel module following the interface in include/linux/iio/sw_trigger.h::
+
+ /*
+ * drivers/iio/trigger/iio-trig-sample.c
+ * sample kernel module implementing a new trigger type
+ */
+ #include <linux/iio/sw_trigger.h>
+
+
+ static struct iio_sw_trigger *iio_trig_sample_probe(const char *name)
+ {
+ /*
+ * This allocates and registers an IIO trigger plus other
+ * trigger type specific initialization.
+ */
+ }
+
+ static int iio_trig_hrtimer_remove(struct iio_sw_trigger *swt)
+ {
+ /*
+ * This undoes the actions in iio_trig_sample_probe
+ */
+ }
+
+ static const struct iio_sw_trigger_ops iio_trig_sample_ops = {
+ .probe = iio_trig_sample_probe,
+ .remove = iio_trig_sample_remove,
+ };
+
+ static struct iio_sw_trigger_type iio_trig_sample = {
+ .name = "trig-sample",
+ .owner = THIS_MODULE,
+ .ops = &iio_trig_sample_ops,
+ };
+
+module_iio_sw_trigger_driver(iio_trig_sample);
+
+Each trigger type has its own directory under /config/iio/triggers. Loading
+iio-trig-sample module will create 'trig-sample' trigger type directory
+/config/iio/triggers/trig-sample.
+
+We support the following interrupt sources (trigger types):
+
+ * hrtimer, uses high resolution timers as interrupt source
+
+3.1 Hrtimer triggers creation and destruction
+---------------------------------------------
+
+Loading iio-trig-hrtimer module will register hrtimer trigger types allowing
+users to create hrtimer triggers under /config/iio/triggers/hrtimer.
+
+e.g::
+
+ $ mkdir /config/iio/triggers/hrtimer/instance1
+ $ rmdir /config/iio/triggers/hrtimer/instance1
+
+Each trigger can have one or more attributes specific to the trigger type.
+
+3.2 "hrtimer" trigger types attributes
+--------------------------------------
+
+"hrtimer" trigger type doesn't have any configurable attribute from /config dir.
+It does introduce the sampling_frequency attribute to trigger directory.
diff --git a/Documentation/iio/iio_configfs.txt b/Documentation/iio/iio_configfs.txt
deleted file mode 100644
index 4e5f101837a8..000000000000
--- a/Documentation/iio/iio_configfs.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-Industrial IIO configfs support
-
-1. Overview
-
-Configfs is a filesystem-based manager of kernel objects. IIO uses some
-objects that could be easily configured using configfs (e.g.: devices,
-triggers).
-
-See Documentation/filesystems/configfs/configfs.txt for more information
-about how configfs works.
-
-2. Usage
-
-In order to use configfs support in IIO we need to select it at compile
-time via CONFIG_IIO_CONFIGFS config option.
-
-Then, mount the configfs filesystem (usually under /config directory):
-
-$ mkdir /config
-$ mount -t configfs none /config
-
-At this point, all default IIO groups will be created and can be accessed
-under /config/iio. Next chapters will describe available IIO configuration
-objects.
-
-3. Software triggers
-
-One of the IIO default configfs groups is the "triggers" group. It is
-automagically accessible when the configfs is mounted and can be found
-under /config/iio/triggers.
-
-IIO software triggers implementation offers support for creating multiple
-trigger types. A new trigger type is usually implemented as a separate
-kernel module following the interface in include/linux/iio/sw_trigger.h:
-
-/*
- * drivers/iio/trigger/iio-trig-sample.c
- * sample kernel module implementing a new trigger type
- */
-#include <linux/iio/sw_trigger.h>
-
-
-static struct iio_sw_trigger *iio_trig_sample_probe(const char *name)
-{
- /*
- * This allocates and registers an IIO trigger plus other
- * trigger type specific initialization.
- */
-}
-
-static int iio_trig_hrtimer_remove(struct iio_sw_trigger *swt)
-{
- /*
- * This undoes the actions in iio_trig_sample_probe
- */
-}
-
-static const struct iio_sw_trigger_ops iio_trig_sample_ops = {
- .probe = iio_trig_sample_probe,
- .remove = iio_trig_sample_remove,
-};
-
-static struct iio_sw_trigger_type iio_trig_sample = {
- .name = "trig-sample",
- .owner = THIS_MODULE,
- .ops = &iio_trig_sample_ops,
-};
-
-module_iio_sw_trigger_driver(iio_trig_sample);
-
-Each trigger type has its own directory under /config/iio/triggers. Loading
-iio-trig-sample module will create 'trig-sample' trigger type directory
-/config/iio/triggers/trig-sample.
-
-We support the following interrupt sources (trigger types):
- * hrtimer, uses high resolution timers as interrupt source
-
-3.1 Hrtimer triggers creation and destruction
-
-Loading iio-trig-hrtimer module will register hrtimer trigger types allowing
-users to create hrtimer triggers under /config/iio/triggers/hrtimer.
-
-e.g:
-
-$ mkdir /config/iio/triggers/hrtimer/instance1
-$ rmdir /config/iio/triggers/hrtimer/instance1
-
-Each trigger can have one or more attributes specific to the trigger type.
-
-3.2 "hrtimer" trigger types attributes
-
-"hrtimer" trigger type doesn't have any configurable attribute from /config dir.
-It does introduce the sampling_frequency attribute to trigger directory.
diff --git a/Documentation/iio/index.rst b/Documentation/iio/index.rst
new file mode 100644
index 000000000000..58b7a4ebac51
--- /dev/null
+++ b/Documentation/iio/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+Industrial I/O
+==============
+
+.. toctree::
+ :maxdepth: 1
+
+ iio_configfs
+
+ ep93xx_adc
diff --git a/Documentation/index.rst b/Documentation/index.rst
index 781042b4579d..70ae148ec980 100644
--- a/Documentation/index.rst
+++ b/Documentation/index.rst
@@ -1,3 +1,4 @@
+
.. The Linux Kernel documentation master file, created by
sphinx-quickstart on Fri Feb 12 13:51:46 2016.
You can adapt this file completely to your liking, but it should at least
@@ -34,6 +35,7 @@ trying to get it to work optimally on a given system.
:maxdepth: 2
admin-guide/index
+ kbuild/index
Firmware-related documentation
------------------------------
@@ -55,6 +57,7 @@ the kernel interface as seen by application developers.
:maxdepth: 2
userspace-api/index
+ ioctl/index
Introduction to kernel development
@@ -75,6 +78,9 @@ merged much easier.
kernel-hacking/index
trace/index
maintainer/index
+ fault-injection/index
+ livepatch/index
+
Kernel API documentation
------------------------
@@ -90,8 +96,24 @@ needed).
driver-api/index
core-api/index
+ locking/index
+ accounting/index
+ block/index
+ cdrom/index
+ ide/index
+ fb/index
+ fpga/index
+ hid/index
+ iio/index
+ infiniband/index
+ leds/index
media/index
+ netlabel/index
networking/index
+ pcmcia/index
+ target/index
+ timers/index
+ watchdog/index
input/index
hwmon/index
gpu/index
@@ -101,7 +123,11 @@ needed).
filesystems/index
vm/index
bpf/index
+ usb/index
+ PCI/index
misc-devices/index
+ mic/index
+ scheduler/index
Architecture-specific documentation
-----------------------------------
@@ -113,7 +139,16 @@ implementation.
:maxdepth: 2
sh/index
+ arm/index
+ arm64/index
+ ia64/index
+ m68k/index
+ riscv/index
+ s390/index
+ sh/index
+ sparc/index
x86/index
+ xtensa/index
Filesystem Documentation
------------------------
diff --git a/Documentation/infiniband/core_locking.rst b/Documentation/infiniband/core_locking.rst
new file mode 100644
index 000000000000..f34669beb4fe
--- /dev/null
+++ b/Documentation/infiniband/core_locking.rst
@@ -0,0 +1,118 @@
+===========================
+InfiniBand Midlayer Locking
+===========================
+
+ This guide is an attempt to make explicit the locking assumptions
+ made by the InfiniBand midlayer. It describes the requirements on
+ both low-level drivers that sit below the midlayer and upper level
+ protocols that use the midlayer.
+
+Sleeping and interrupt context
+==============================
+
+ With the following exceptions, a low-level driver implementation of
+ all of the methods in struct ib_device may sleep. The exceptions
+ are any methods from the list:
+
+ - create_ah
+ - modify_ah
+ - query_ah
+ - destroy_ah
+ - post_send
+ - post_recv
+ - poll_cq
+ - req_notify_cq
+ - map_phys_fmr
+
+ which may not sleep and must be callable from any context.
+
+ The corresponding functions exported to upper level protocol
+ consumers:
+
+ - ib_create_ah
+ - ib_modify_ah
+ - ib_query_ah
+ - ib_destroy_ah
+ - ib_post_send
+ - ib_post_recv
+ - ib_req_notify_cq
+ - ib_map_phys_fmr
+
+ are therefore safe to call from any context.
+
+ In addition, the function
+
+ - ib_dispatch_event
+
+ used by low-level drivers to dispatch asynchronous events through
+ the midlayer is also safe to call from any context.
+
+Reentrancy
+----------
+
+ All of the methods in struct ib_device exported by a low-level
+ driver must be fully reentrant. The low-level driver is required to
+ perform all synchronization necessary to maintain consistency, even
+ if multiple function calls using the same object are run
+ simultaneously.
+
+ The IB midlayer does not perform any serialization of function calls.
+
+ Because low-level drivers are reentrant, upper level protocol
+ consumers are not required to perform any serialization. However,
+ some serialization may be required to get sensible results. For
+ example, a consumer may safely call ib_poll_cq() on multiple CPUs
+ simultaneously. However, the ordering of the work completion
+ information between different calls of ib_poll_cq() is not defined.
+
+Callbacks
+---------
+
+ A low-level driver must not perform a callback directly from the
+ same callchain as an ib_device method call. For example, it is not
+ allowed for a low-level driver to call a consumer's completion event
+ handler directly from its post_send method. Instead, the low-level
+ driver should defer this callback by, for example, scheduling a
+ tasklet to perform the callback.
+
+ The low-level driver is responsible for ensuring that multiple
+ completion event handlers for the same CQ are not called
+ simultaneously. The driver must guarantee that only one CQ event
+ handler for a given CQ is running at a time. In other words, the
+ following situation is not allowed::
+
+ CPU1 CPU2
+
+ low-level driver ->
+ consumer CQ event callback:
+ /* ... */
+ ib_req_notify_cq(cq, ...);
+ low-level driver ->
+ /* ... */ consumer CQ event callback:
+ /* ... */
+ return from CQ event handler
+
+ The context in which completion event and asynchronous event
+ callbacks run is not defined. Depending on the low-level driver, it
+ may be process context, softirq context, or interrupt context.
+ Upper level protocol consumers may not sleep in a callback.
+
+Hot-plug
+--------
+
+ A low-level driver announces that a device is ready for use by
+ consumers when it calls ib_register_device(), all initialization
+ must be complete before this call. The device must remain usable
+ until the driver's call to ib_unregister_device() has returned.
+
+ A low-level driver must call ib_register_device() and
+ ib_unregister_device() from process context. It must not hold any
+ semaphores that could cause deadlock if a consumer calls back into
+ the driver across these calls.
+
+ An upper level protocol consumer may begin using an IB device as
+ soon as the add method of its struct ib_client is called for that
+ device. A consumer must finish all cleanup and free all resources
+ relating to a device before returning from the remove method.
+
+ A consumer is permitted to sleep in its add and remove methods.
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt
deleted file mode 100644
index 4b1f36b6ada0..000000000000
--- a/Documentation/infiniband/core_locking.txt
+++ /dev/null
@@ -1,112 +0,0 @@
-INFINIBAND MIDLAYER LOCKING
-
- This guide is an attempt to make explicit the locking assumptions
- made by the InfiniBand midlayer. It describes the requirements on
- both low-level drivers that sit below the midlayer and upper level
- protocols that use the midlayer.
-
-Sleeping and interrupt context
-
- With the following exceptions, a low-level driver implementation of
- all of the methods in struct ib_device may sleep. The exceptions
- are any methods from the list:
-
- create_ah
- modify_ah
- query_ah
- destroy_ah
- post_send
- post_recv
- poll_cq
- req_notify_cq
- map_phys_fmr
-
- which may not sleep and must be callable from any context.
-
- The corresponding functions exported to upper level protocol
- consumers:
-
- ib_create_ah
- ib_modify_ah
- ib_query_ah
- ib_destroy_ah
- ib_post_send
- ib_post_recv
- ib_req_notify_cq
- ib_map_phys_fmr
-
- are therefore safe to call from any context.
-
- In addition, the function
-
- ib_dispatch_event
-
- used by low-level drivers to dispatch asynchronous events through
- the midlayer is also safe to call from any context.
-
-Reentrancy
-
- All of the methods in struct ib_device exported by a low-level
- driver must be fully reentrant. The low-level driver is required to
- perform all synchronization necessary to maintain consistency, even
- if multiple function calls using the same object are run
- simultaneously.
-
- The IB midlayer does not perform any serialization of function calls.
-
- Because low-level drivers are reentrant, upper level protocol
- consumers are not required to perform any serialization. However,
- some serialization may be required to get sensible results. For
- example, a consumer may safely call ib_poll_cq() on multiple CPUs
- simultaneously. However, the ordering of the work completion
- information between different calls of ib_poll_cq() is not defined.
-
-Callbacks
-
- A low-level driver must not perform a callback directly from the
- same callchain as an ib_device method call. For example, it is not
- allowed for a low-level driver to call a consumer's completion event
- handler directly from its post_send method. Instead, the low-level
- driver should defer this callback by, for example, scheduling a
- tasklet to perform the callback.
-
- The low-level driver is responsible for ensuring that multiple
- completion event handlers for the same CQ are not called
- simultaneously. The driver must guarantee that only one CQ event
- handler for a given CQ is running at a time. In other words, the
- following situation is not allowed:
-
- CPU1 CPU2
-
- low-level driver ->
- consumer CQ event callback:
- /* ... */
- ib_req_notify_cq(cq, ...);
- low-level driver ->
- /* ... */ consumer CQ event callback:
- /* ... */
- return from CQ event handler
-
- The context in which completion event and asynchronous event
- callbacks run is not defined. Depending on the low-level driver, it
- may be process context, softirq context, or interrupt context.
- Upper level protocol consumers may not sleep in a callback.
-
-Hot-plug
-
- A low-level driver announces that a device is ready for use by
- consumers when it calls ib_register_device(), all initialization
- must be complete before this call. The device must remain usable
- until the driver's call to ib_unregister_device() has returned.
-
- A low-level driver must call ib_register_device() and
- ib_unregister_device() from process context. It must not hold any
- semaphores that could cause deadlock if a consumer calls back into
- the driver across these calls.
-
- An upper level protocol consumer may begin using an IB device as
- soon as the add method of its struct ib_client is called for that
- device. A consumer must finish all cleanup and free all resources
- relating to a device before returning from the remove method.
-
- A consumer is permitted to sleep in its add and remove methods.
diff --git a/Documentation/infiniband/index.rst b/Documentation/infiniband/index.rst
new file mode 100644
index 000000000000..9cd7615438b9
--- /dev/null
+++ b/Documentation/infiniband/index.rst
@@ -0,0 +1,23 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========
+InfiniBand
+==========
+
+.. toctree::
+ :maxdepth: 1
+
+ core_locking
+ ipoib
+ opa_vnic
+ sysfs
+ tag_matching
+ user_mad
+ user_verbs
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/infiniband/ipoib.rst b/Documentation/infiniband/ipoib.rst
new file mode 100644
index 000000000000..0dd36154c0c9
--- /dev/null
+++ b/Documentation/infiniband/ipoib.rst
@@ -0,0 +1,115 @@
+==================
+IP over InfiniBand
+==================
+
+ The ib_ipoib driver is an implementation of the IP over InfiniBand
+ protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
+ working group. It is a "native" implementation in the sense of
+ setting the interface type to ARPHRD_INFINIBAND and the hardware
+ address length to 20 (earlier proprietary implementations
+ masqueraded to the kernel as ethernet interfaces).
+
+Partitions and P_Keys
+=====================
+
+ When the IPoIB driver is loaded, it creates one interface for each
+ port using the P_Key at index 0. To create an interface with a
+ different P_Key, write the desired P_Key into the main interface's
+ /sys/class/net/<intf name>/create_child file. For example::
+
+ echo 0x8001 > /sys/class/net/ib0/create_child
+
+ This will create an interface named ib0.8001 with P_Key 0x8001. To
+ remove a subinterface, use the "delete_child" file::
+
+ echo 0x8001 > /sys/class/net/ib0/delete_child
+
+ The P_Key for any interface is given by the "pkey" file, and the
+ main interface for a subinterface is in "parent."
+
+ Child interface create/delete can also be done using IPoIB's
+ rtnl_link_ops, where children created using either way behave the same.
+
+Datagram vs Connected modes
+===========================
+
+ The IPoIB driver supports two modes of operation: datagram and
+ connected. The mode is set and read through an interface's
+ /sys/class/net/<intf name>/mode file.
+
+ In datagram mode, the IB UD (Unreliable Datagram) transport is used
+ and so the interface MTU has is equal to the IB L2 MTU minus the
+ IPoIB encapsulation header (4 bytes). For example, in a typical IB
+ fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
+
+ In connected mode, the IB RC (Reliable Connected) transport is used.
+ Connected mode takes advantage of the connected nature of the IB
+ transport and allows an MTU up to the maximal IP packet size of 64K,
+ which reduces the number of IP packets needed for handling large UDP
+ datagrams, TCP segments, etc and increases the performance for large
+ messages.
+
+ In connected mode, the interface's UD QP is still used for multicast
+ and communication with peers that don't support connected mode. In
+ this case, RX emulation of ICMP PMTU packets is used to cause the
+ networking stack to use the smaller UD MTU for these neighbours.
+
+Stateless offloads
+==================
+
+ If the IB HW supports IPoIB stateless offloads, IPoIB advertises
+ TCP/IP checksum and/or Large Send (LSO) offloading capability to the
+ network stack.
+
+ Large Receive (LRO) offloading is also implemented and may be turned
+ on/off using ethtool calls. Currently LRO is supported only for
+ checksum offload capable devices.
+
+ Stateless offloads are supported only in datagram mode.
+
+Interrupt moderation
+====================
+
+ If the underlying IB device supports CQ event moderation, one can
+ use ethtool to set interrupt mitigation parameters and thus reduce
+ the overhead incurred by handling interrupts. The main code path of
+ IPoIB doesn't use events for TX completion signaling so only RX
+ moderation is supported.
+
+Debugging Information
+=====================
+
+ By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
+ to 'y', tracing messages are compiled into the driver. They are
+ turned on by setting the module parameters debug_level and
+ mcast_debug_level to 1. These parameters can be controlled at
+ runtime through files in /sys/module/ib_ipoib/.
+
+ CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
+ virtual filesystem. By mounting this filesystem, for example with::
+
+ mount -t debugfs none /sys/kernel/debug
+
+ it is possible to get statistics about multicast groups from the
+ files /sys/kernel/debug/ipoib/ib0_mcg and so on.
+
+ The performance impact of this option is negligible, so it
+ is safe to enable this option with debug_level set to 0 for normal
+ operation.
+
+ CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
+ the data path when data_debug_level is set to 1. However, even with
+ the output disabled, enabling this configuration option will affect
+ performance, because it adds tests to the fast path.
+
+References
+==========
+
+ Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
+ http://ietf.org/rfc/rfc4391.txt
+
+ IP over InfiniBand (IPoIB) Architecture (RFC 4392)
+ http://ietf.org/rfc/rfc4392.txt
+
+ IP over InfiniBand: Connected Mode (RFC 4755)
+ http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/infiniband/ipoib.txt b/Documentation/infiniband/ipoib.txt
deleted file mode 100644
index 47c1dd9818f2..000000000000
--- a/Documentation/infiniband/ipoib.txt
+++ /dev/null
@@ -1,105 +0,0 @@
-IP OVER INFINIBAND
-
- The ib_ipoib driver is an implementation of the IP over InfiniBand
- protocol as specified by RFC 4391 and 4392, issued by the IETF ipoib
- working group. It is a "native" implementation in the sense of
- setting the interface type to ARPHRD_INFINIBAND and the hardware
- address length to 20 (earlier proprietary implementations
- masqueraded to the kernel as ethernet interfaces).
-
-Partitions and P_Keys
-
- When the IPoIB driver is loaded, it creates one interface for each
- port using the P_Key at index 0. To create an interface with a
- different P_Key, write the desired P_Key into the main interface's
- /sys/class/net/<intf name>/create_child file. For example:
-
- echo 0x8001 > /sys/class/net/ib0/create_child
-
- This will create an interface named ib0.8001 with P_Key 0x8001. To
- remove a subinterface, use the "delete_child" file:
-
- echo 0x8001 > /sys/class/net/ib0/delete_child
-
- The P_Key for any interface is given by the "pkey" file, and the
- main interface for a subinterface is in "parent."
-
- Child interface create/delete can also be done using IPoIB's
- rtnl_link_ops, where children created using either way behave the same.
-
-Datagram vs Connected modes
-
- The IPoIB driver supports two modes of operation: datagram and
- connected. The mode is set and read through an interface's
- /sys/class/net/<intf name>/mode file.
-
- In datagram mode, the IB UD (Unreliable Datagram) transport is used
- and so the interface MTU has is equal to the IB L2 MTU minus the
- IPoIB encapsulation header (4 bytes). For example, in a typical IB
- fabric with a 2K MTU, the IPoIB MTU will be 2048 - 4 = 2044 bytes.
-
- In connected mode, the IB RC (Reliable Connected) transport is used.
- Connected mode takes advantage of the connected nature of the IB
- transport and allows an MTU up to the maximal IP packet size of 64K,
- which reduces the number of IP packets needed for handling large UDP
- datagrams, TCP segments, etc and increases the performance for large
- messages.
-
- In connected mode, the interface's UD QP is still used for multicast
- and communication with peers that don't support connected mode. In
- this case, RX emulation of ICMP PMTU packets is used to cause the
- networking stack to use the smaller UD MTU for these neighbours.
-
-Stateless offloads
-
- If the IB HW supports IPoIB stateless offloads, IPoIB advertises
- TCP/IP checksum and/or Large Send (LSO) offloading capability to the
- network stack.
-
- Large Receive (LRO) offloading is also implemented and may be turned
- on/off using ethtool calls. Currently LRO is supported only for
- checksum offload capable devices.
-
- Stateless offloads are supported only in datagram mode.
-
-Interrupt moderation
-
- If the underlying IB device supports CQ event moderation, one can
- use ethtool to set interrupt mitigation parameters and thus reduce
- the overhead incurred by handling interrupts. The main code path of
- IPoIB doesn't use events for TX completion signaling so only RX
- moderation is supported.
-
-Debugging Information
-
- By compiling the IPoIB driver with CONFIG_INFINIBAND_IPOIB_DEBUG set
- to 'y', tracing messages are compiled into the driver. They are
- turned on by setting the module parameters debug_level and
- mcast_debug_level to 1. These parameters can be controlled at
- runtime through files in /sys/module/ib_ipoib/.
-
- CONFIG_INFINIBAND_IPOIB_DEBUG also enables files in the debugfs
- virtual filesystem. By mounting this filesystem, for example with
-
- mount -t debugfs none /sys/kernel/debug
-
- it is possible to get statistics about multicast groups from the
- files /sys/kernel/debug/ipoib/ib0_mcg and so on.
-
- The performance impact of this option is negligible, so it
- is safe to enable this option with debug_level set to 0 for normal
- operation.
-
- CONFIG_INFINIBAND_IPOIB_DEBUG_DATA enables even more debug output in
- the data path when data_debug_level is set to 1. However, even with
- the output disabled, enabling this configuration option will affect
- performance, because it adds tests to the fast path.
-
-References
-
- Transmission of IP over InfiniBand (IPoIB) (RFC 4391)
- http://ietf.org/rfc/rfc4391.txt
- IP over InfiniBand (IPoIB) Architecture (RFC 4392)
- http://ietf.org/rfc/rfc4392.txt
- IP over InfiniBand: Connected Mode (RFC 4755)
- http://ietf.org/rfc/rfc4755.txt
diff --git a/Documentation/infiniband/opa_vnic.rst b/Documentation/infiniband/opa_vnic.rst
new file mode 100644
index 000000000000..2f888d9ffec0
--- /dev/null
+++ b/Documentation/infiniband/opa_vnic.rst
@@ -0,0 +1,159 @@
+=================================================================
+Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC)
+=================================================================
+
+Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature
+supports Ethernet functionality over Omni-Path fabric by encapsulating
+the Ethernet packets between HFI nodes.
+
+Architecture
+=============
+The patterns of exchanges of Omni-Path encapsulated Ethernet packets
+involves one or more virtual Ethernet switches overlaid on the Omni-Path
+fabric topology. A subset of HFI nodes on the Omni-Path fabric are
+permitted to exchange encapsulated Ethernet packets across a particular
+virtual Ethernet switch. The virtual Ethernet switches are logical
+abstractions achieved by configuring the HFI nodes on the fabric for
+header generation and processing. In the simplest configuration all HFI
+nodes across the fabric exchange encapsulated Ethernet packets over a
+single virtual Ethernet switch. A virtual Ethernet switch, is effectively
+an independent Ethernet network. The configuration is performed by an
+Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
+application. HFI nodes can have multiple VNICs each connected to a
+different virtual Ethernet switch. The below diagram presents a case
+of two virtual Ethernet switches with two HFI nodes::
+
+ +-------------------+
+ | Subnet/ |
+ | Ethernet |
+ | Manager |
+ +-------------------+
+ / /
+ / /
+ / /
+ / /
+ +-----------------------------+ +------------------------------+
+ | Virtual Ethernet Switch | | Virtual Ethernet Switch |
+ | +---------+ +---------+ | | +---------+ +---------+ |
+ | | VPORT | | VPORT | | | | VPORT | | VPORT | |
+ +--+---------+----+---------+-+ +-+---------+----+---------+---+
+ | \ / |
+ | \ / |
+ | \/ |
+ | / \ |
+ | / \ |
+ +-----------+------------+ +-----------+------------+
+ | VNIC | VNIC | | VNIC | VNIC |
+ +-----------+------------+ +-----------+------------+
+ | HFI | | HFI |
+ +------------------------+ +------------------------+
+
+
+The Omni-Path encapsulated Ethernet packet format is as described below.
+
+==================== ================================
+Bits Field
+==================== ================================
+Quad Word 0:
+0-19 SLID (lower 20 bits)
+20-30 Length (in Quad Words)
+31 BECN bit
+32-51 DLID (lower 20 bits)
+52-56 SC (Service Class)
+57-59 RC (Routing Control)
+60 FECN bit
+61-62 L2 (=10, 16B format)
+63 LT (=1, Link Transfer Head Flit)
+
+Quad Word 1:
+0-7 L4 type (=0x78 ETHERNET)
+8-11 SLID[23:20]
+12-15 DLID[23:20]
+16-31 PKEY
+32-47 Entropy
+48-63 Reserved
+
+Quad Word 2:
+0-15 Reserved
+16-31 L4 header
+32-63 Ethernet Packet
+
+Quad Words 3 to N-1:
+0-63 Ethernet packet (pad extended)
+
+Quad Word N (last):
+0-23 Ethernet packet (pad extended)
+24-55 ICRC
+56-61 Tail
+62-63 LT (=01, Link Transfer Tail Flit)
+==================== ================================
+
+Ethernet packet is padded on the transmit side to ensure that the VNIC OPA
+packet is quad word aligned. The 'Tail' field contains the number of bytes
+padded. On the receive side the 'Tail' field is read and the padding is
+removed (along with ICRC, Tail and OPA header) before passing packet up
+the network stack.
+
+The L4 header field contains the virtual Ethernet switch id the VNIC port
+belongs to. On the receive side, this field is used to de-multiplex the
+received VNIC packets to different VNIC ports.
+
+Driver Design
+==============
+Intel OPA VNIC software design is presented in the below diagram.
+OPA VNIC functionality has a HW dependent component and a HW
+independent component.
+
+The support has been added for IB device to allocate and free the RDMA
+netdev devices. The RDMA netdev supports interfacing with the network
+stack thus creating standard network interfaces. OPA_VNIC is an RDMA
+netdev device type.
+
+The HW dependent VNIC functionality is part of the HFI1 driver. It
+implements the verbs to allocate and free the OPA_VNIC RDMA netdev.
+It involves HW resource allocation/management for VNIC functionality.
+It interfaces with the network stack and implements the required
+net_device_ops functions. It expects Omni-Path encapsulated Ethernet
+packets in the transmit path and provides HW access to them. It strips
+the Omni-Path header from the received packets before passing them up
+the network stack. It also implements the RDMA netdev control operations.
+
+The OPA VNIC module implements the HW independent VNIC functionality.
+It consists of two parts. The VNIC Ethernet Management Agent (VEMA)
+registers itself with IB core as an IB client and interfaces with the
+IB MAD stack. It exchanges the management information with the Ethernet
+Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees
+the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions
+set by HW dependent VNIC driver where required to accommodate any control
+operation. It also handles the encapsulation of Ethernet packets with an
+Omni-Path header in the transmit path. For each VNIC interface, the
+information required for encapsulation is configured by the EM via VEMA MAD
+interface. It also passes any control information to the HW dependent driver
+by invoking the RDMA netdev control operations::
+
+ +-------------------+ +----------------------+
+ | | | Linux |
+ | IB MAD | | Network |
+ | | | Stack |
+ +-------------------+ +----------------------+
+ | | |
+ | | |
+ +----------------------------+ |
+ | | |
+ | OPA VNIC Module | |
+ | (OPA VNIC RDMA Netdev | |
+ | & EMA functions) | |
+ | | |
+ +----------------------------+ |
+ | |
+ | |
+ +------------------+ |
+ | IB core | |
+ +------------------+ |
+ | |
+ | |
+ +--------------------------------------------+
+ | |
+ | HFI1 Driver with VNIC support |
+ | |
+ +--------------------------------------------+
diff --git a/Documentation/infiniband/opa_vnic.txt b/Documentation/infiniband/opa_vnic.txt
deleted file mode 100644
index 282e17be798a..000000000000
--- a/Documentation/infiniband/opa_vnic.txt
+++ /dev/null
@@ -1,153 +0,0 @@
-Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature
-supports Ethernet functionality over Omni-Path fabric by encapsulating
-the Ethernet packets between HFI nodes.
-
-Architecture
-=============
-The patterns of exchanges of Omni-Path encapsulated Ethernet packets
-involves one or more virtual Ethernet switches overlaid on the Omni-Path
-fabric topology. A subset of HFI nodes on the Omni-Path fabric are
-permitted to exchange encapsulated Ethernet packets across a particular
-virtual Ethernet switch. The virtual Ethernet switches are logical
-abstractions achieved by configuring the HFI nodes on the fabric for
-header generation and processing. In the simplest configuration all HFI
-nodes across the fabric exchange encapsulated Ethernet packets over a
-single virtual Ethernet switch. A virtual Ethernet switch, is effectively
-an independent Ethernet network. The configuration is performed by an
-Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM)
-application. HFI nodes can have multiple VNICs each connected to a
-different virtual Ethernet switch. The below diagram presents a case
-of two virtual Ethernet switches with two HFI nodes.
-
- +-------------------+
- | Subnet/ |
- | Ethernet |
- | Manager |
- +-------------------+
- / /
- / /
- / /
- / /
-+-----------------------------+ +------------------------------+
-| Virtual Ethernet Switch | | Virtual Ethernet Switch |
-| +---------+ +---------+ | | +---------+ +---------+ |
-| | VPORT | | VPORT | | | | VPORT | | VPORT | |
-+--+---------+----+---------+-+ +-+---------+----+---------+---+
- | \ / |
- | \ / |
- | \/ |
- | / \ |
- | / \ |
- +-----------+------------+ +-----------+------------+
- | VNIC | VNIC | | VNIC | VNIC |
- +-----------+------------+ +-----------+------------+
- | HFI | | HFI |
- +------------------------+ +------------------------+
-
-
-The Omni-Path encapsulated Ethernet packet format is as described below.
-
-Bits Field
-------------------------------------
-Quad Word 0:
-0-19 SLID (lower 20 bits)
-20-30 Length (in Quad Words)
-31 BECN bit
-32-51 DLID (lower 20 bits)
-52-56 SC (Service Class)
-57-59 RC (Routing Control)
-60 FECN bit
-61-62 L2 (=10, 16B format)
-63 LT (=1, Link Transfer Head Flit)
-
-Quad Word 1:
-0-7 L4 type (=0x78 ETHERNET)
-8-11 SLID[23:20]
-12-15 DLID[23:20]
-16-31 PKEY
-32-47 Entropy
-48-63 Reserved
-
-Quad Word 2:
-0-15 Reserved
-16-31 L4 header
-32-63 Ethernet Packet
-
-Quad Words 3 to N-1:
-0-63 Ethernet packet (pad extended)
-
-Quad Word N (last):
-0-23 Ethernet packet (pad extended)
-24-55 ICRC
-56-61 Tail
-62-63 LT (=01, Link Transfer Tail Flit)
-
-Ethernet packet is padded on the transmit side to ensure that the VNIC OPA
-packet is quad word aligned. The 'Tail' field contains the number of bytes
-padded. On the receive side the 'Tail' field is read and the padding is
-removed (along with ICRC, Tail and OPA header) before passing packet up
-the network stack.
-
-The L4 header field contains the virtual Ethernet switch id the VNIC port
-belongs to. On the receive side, this field is used to de-multiplex the
-received VNIC packets to different VNIC ports.
-
-Driver Design
-==============
-Intel OPA VNIC software design is presented in the below diagram.
-OPA VNIC functionality has a HW dependent component and a HW
-independent component.
-
-The support has been added for IB device to allocate and free the RDMA
-netdev devices. The RDMA netdev supports interfacing with the network
-stack thus creating standard network interfaces. OPA_VNIC is an RDMA
-netdev device type.
-
-The HW dependent VNIC functionality is part of the HFI1 driver. It
-implements the verbs to allocate and free the OPA_VNIC RDMA netdev.
-It involves HW resource allocation/management for VNIC functionality.
-It interfaces with the network stack and implements the required
-net_device_ops functions. It expects Omni-Path encapsulated Ethernet
-packets in the transmit path and provides HW access to them. It strips
-the Omni-Path header from the received packets before passing them up
-the network stack. It also implements the RDMA netdev control operations.
-
-The OPA VNIC module implements the HW independent VNIC functionality.
-It consists of two parts. The VNIC Ethernet Management Agent (VEMA)
-registers itself with IB core as an IB client and interfaces with the
-IB MAD stack. It exchanges the management information with the Ethernet
-Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees
-the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions
-set by HW dependent VNIC driver where required to accommodate any control
-operation. It also handles the encapsulation of Ethernet packets with an
-Omni-Path header in the transmit path. For each VNIC interface, the
-information required for encapsulation is configured by the EM via VEMA MAD
-interface. It also passes any control information to the HW dependent driver
-by invoking the RDMA netdev control operations.
-
- +-------------------+ +----------------------+
- | | | Linux |
- | IB MAD | | Network |
- | | | Stack |
- +-------------------+ +----------------------+
- | | |
- | | |
- +----------------------------+ |
- | | |
- | OPA VNIC Module | |
- | (OPA VNIC RDMA Netdev | |
- | & EMA functions) | |
- | | |
- +----------------------------+ |
- | |
- | |
- +------------------+ |
- | IB core | |
- +------------------+ |
- | |
- | |
- +--------------------------------------------+
- | |
- | HFI1 Driver with VNIC support |
- | |
- +--------------------------------------------+
diff --git a/Documentation/infiniband/sysfs.rst b/Documentation/infiniband/sysfs.rst
new file mode 100644
index 000000000000..f0abd6fa48f4
--- /dev/null
+++ b/Documentation/infiniband/sysfs.rst
@@ -0,0 +1,6 @@
+===========
+Sysfs files
+===========
+
+The sysfs interface has moved to
+Documentation/ABI/stable/sysfs-class-infiniband.
diff --git a/Documentation/infiniband/sysfs.txt b/Documentation/infiniband/sysfs.txt
deleted file mode 100644
index 9fab5062f84b..000000000000
--- a/Documentation/infiniband/sysfs.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-SYSFS FILES
-
-The sysfs interface has moved to
-Documentation/ABI/stable/sysfs-class-infiniband.
diff --git a/Documentation/infiniband/tag_matching.rst b/Documentation/infiniband/tag_matching.rst
new file mode 100644
index 000000000000..ef56ea585f92
--- /dev/null
+++ b/Documentation/infiniband/tag_matching.rst
@@ -0,0 +1,69 @@
+==================
+Tag matching logic
+==================
+
+The MPI standard defines a set of rules, known as tag-matching, for matching
+source send operations to destination receives. The following parameters must
+match the following source and destination parameters:
+
+* Communicator
+* User tag - wild card may be specified by the receiver
+* Source rank – wild car may be specified by the receiver
+* Destination rank – wild
+
+The ordering rules require that when more than one pair of send and receive
+message envelopes may match, the pair that includes the earliest posted-send
+and the earliest posted-receive is the pair that must be used to satisfy the
+matching operation. However, this doesn’t imply that tags are consumed in
+the order they are created, e.g., a later generated tag may be consumed, if
+earlier tags can’t be used to satisfy the matching rules.
+
+When a message is sent from the sender to the receiver, the communication
+library may attempt to process the operation either after or before the
+corresponding matching receive is posted. If a matching receive is posted,
+this is an expected message, otherwise it is called an unexpected message.
+Implementations frequently use different matching schemes for these two
+different matching instances.
+
+To keep MPI library memory footprint down, MPI implementations typically use
+two different protocols for this purpose:
+
+1. The Eager protocol- the complete message is sent when the send is
+processed by the sender. A completion send is received in the send_cq
+notifying that the buffer can be reused.
+
+2. The Rendezvous Protocol - the sender sends the tag-matching header,
+and perhaps a portion of data when first notifying the receiver. When the
+corresponding buffer is posted, the responder will use the information from
+the header to initiate an RDMA READ operation directly to the matching buffer.
+A fin message needs to be received in order for the buffer to be reused.
+
+Tag matching implementation
+===========================
+
+There are two types of matching objects used, the posted receive list and the
+unexpected message list. The application posts receive buffers through calls
+to the MPI receive routines in the posted receive list and posts send messages
+using the MPI send routines. The head of the posted receive list may be
+maintained by the hardware, with the software expected to shadow this list.
+
+When send is initiated and arrives at the receive side, if there is no
+pre-posted receive for this arriving message, it is passed to the software and
+placed in the unexpected message list. Otherwise the match is processed,
+including rendezvous processing, if appropriate, delivering the data to the
+specified receive buffer. This allows overlapping receive-side MPI tag
+matching with computation.
+
+When a receive-message is posted, the communication library will first check
+the software unexpected message list for a matching receive. If a match is
+found, data is delivered to the user buffer, using a software controlled
+protocol. The UCX implementation uses either an eager or rendezvous protocol,
+depending on data size. If no match is found, the entire pre-posted receive
+list is maintained by the hardware, and there is space to add one more
+pre-posted receive to this list, this receive is passed to the hardware.
+Software is expected to shadow this list, to help with processing MPI cancel
+operations. In addition, because hardware and software are not expected to be
+tightly synchronized with respect to the tag-matching operation, this shadow
+list is used to detect the case that a pre-posted receive is passed to the
+hardware, as the matching unexpected message is being passed from the hardware
+to the software.
diff --git a/Documentation/infiniband/tag_matching.txt b/Documentation/infiniband/tag_matching.txt
deleted file mode 100644
index d2a3bf819226..000000000000
--- a/Documentation/infiniband/tag_matching.txt
+++ /dev/null
@@ -1,64 +0,0 @@
-Tag matching logic
-
-The MPI standard defines a set of rules, known as tag-matching, for matching
-source send operations to destination receives. The following parameters must
-match the following source and destination parameters:
-* Communicator
-* User tag - wild card may be specified by the receiver
-* Source rank – wild car may be specified by the receiver
-* Destination rank – wild
-The ordering rules require that when more than one pair of send and receive
-message envelopes may match, the pair that includes the earliest posted-send
-and the earliest posted-receive is the pair that must be used to satisfy the
-matching operation. However, this doesn’t imply that tags are consumed in
-the order they are created, e.g., a later generated tag may be consumed, if
-earlier tags can’t be used to satisfy the matching rules.
-
-When a message is sent from the sender to the receiver, the communication
-library may attempt to process the operation either after or before the
-corresponding matching receive is posted. If a matching receive is posted,
-this is an expected message, otherwise it is called an unexpected message.
-Implementations frequently use different matching schemes for these two
-different matching instances.
-
-To keep MPI library memory footprint down, MPI implementations typically use
-two different protocols for this purpose:
-
-1. The Eager protocol- the complete message is sent when the send is
-processed by the sender. A completion send is received in the send_cq
-notifying that the buffer can be reused.
-
-2. The Rendezvous Protocol - the sender sends the tag-matching header,
-and perhaps a portion of data when first notifying the receiver. When the
-corresponding buffer is posted, the responder will use the information from
-the header to initiate an RDMA READ operation directly to the matching buffer.
-A fin message needs to be received in order for the buffer to be reused.
-
-Tag matching implementation
-
-There are two types of matching objects used, the posted receive list and the
-unexpected message list. The application posts receive buffers through calls
-to the MPI receive routines in the posted receive list and posts send messages
-using the MPI send routines. The head of the posted receive list may be
-maintained by the hardware, with the software expected to shadow this list.
-
-When send is initiated and arrives at the receive side, if there is no
-pre-posted receive for this arriving message, it is passed to the software and
-placed in the unexpected message list. Otherwise the match is processed,
-including rendezvous processing, if appropriate, delivering the data to the
-specified receive buffer. This allows overlapping receive-side MPI tag
-matching with computation.
-
-When a receive-message is posted, the communication library will first check
-the software unexpected message list for a matching receive. If a match is
-found, data is delivered to the user buffer, using a software controlled
-protocol. The UCX implementation uses either an eager or rendezvous protocol,
-depending on data size. If no match is found, the entire pre-posted receive
-list is maintained by the hardware, and there is space to add one more
-pre-posted receive to this list, this receive is passed to the hardware.
-Software is expected to shadow this list, to help with processing MPI cancel
-operations. In addition, because hardware and software are not expected to be
-tightly synchronized with respect to the tag-matching operation, this shadow
-list is used to detect the case that a pre-posted receive is passed to the
-hardware, as the matching unexpected message is being passed from the hardware
-to the software.
diff --git a/Documentation/infiniband/user_mad.rst b/Documentation/infiniband/user_mad.rst
new file mode 100644
index 000000000000..d88abfc0e370
--- /dev/null
+++ b/Documentation/infiniband/user_mad.rst
@@ -0,0 +1,166 @@
+====================
+Userspace MAD access
+====================
+
+Device files
+============
+
+ Each port of each InfiniBand device has a "umad" device and an
+ "issm" device attached. For example, a two-port HCA will have two
+ umad devices and two issm devices, while a switch will have one
+ device of each type (for switch port 0).
+
+Creating MAD agents
+===================
+
+ A MAD agent can be created by filling in a struct ib_user_mad_reg_req
+ and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
+ descriptor for the appropriate device file. If the registration
+ request succeeds, a 32-bit id will be returned in the structure.
+ For example::
+
+ struct ib_user_mad_reg_req req = { /* ... */ };
+ ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
+ if (!ret)
+ my_agent = req.id;
+ else
+ perror("agent register");
+
+ Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
+ ioctl. Also, all agents registered through a file descriptor will
+ be unregistered when the descriptor is closed.
+
+ 2014
+ a new registration ioctl is now provided which allows additional
+ fields to be provided during registration.
+ Users of this registration call are implicitly setting the use of
+ pkey_index (see below).
+
+Receiving MADs
+==============
+
+ MADs are received using read(). The receive side now supports
+ RMPP. The buffer passed to read() must be at least one
+ struct ib_user_mad + 256 bytes. For example:
+
+ If the buffer passed is not large enough to hold the received
+ MAD (RMPP), the errno is set to ENOSPC and the length of the
+ buffer needed is set in mad.length.
+
+ Example for normal MAD (non RMPP) reads::
+
+ struct ib_user_mad *mad;
+ mad = malloc(sizeof *mad + 256);
+ ret = read(fd, mad, sizeof *mad + 256);
+ if (ret != sizeof mad + 256) {
+ perror("read");
+ free(mad);
+ }
+
+ Example for RMPP reads::
+
+ struct ib_user_mad *mad;
+ mad = malloc(sizeof *mad + 256);
+ ret = read(fd, mad, sizeof *mad + 256);
+ if (ret == -ENOSPC)) {
+ length = mad.length;
+ free(mad);
+ mad = malloc(sizeof *mad + length);
+ ret = read(fd, mad, sizeof *mad + length);
+ }
+ if (ret < 0) {
+ perror("read");
+ free(mad);
+ }
+
+ In addition to the actual MAD contents, the other struct ib_user_mad
+ fields will be filled in with information on the received MAD. For
+ example, the remote LID will be in mad.lid.
+
+ If a send times out, a receive will be generated with mad.status set
+ to ETIMEDOUT. Otherwise when a MAD has been successfully received,
+ mad.status will be 0.
+
+ poll()/select() may be used to wait until a MAD can be read.
+
+Sending MADs
+============
+
+ MADs are sent using write(). The agent ID for sending should be
+ filled into the id field of the MAD, the destination LID should be
+ filled into the lid field, and so on. The send side does support
+ RMPP so arbitrary length MAD can be sent. For example::
+
+ struct ib_user_mad *mad;
+
+ mad = malloc(sizeof *mad + mad_length);
+
+ /* fill in mad->data */
+
+ mad->hdr.id = my_agent; /* req.id from agent registration */
+ mad->hdr.lid = my_dest; /* in network byte order... */
+ /* etc. */
+
+ ret = write(fd, &mad, sizeof *mad + mad_length);
+ if (ret != sizeof *mad + mad_length)
+ perror("write");
+
+Transaction IDs
+===============
+
+ Users of the umad devices can use the lower 32 bits of the
+ transaction ID field (that is, the least significant half of the
+ field in network byte order) in MADs being sent to match
+ request/response pairs. The upper 32 bits are reserved for use by
+ the kernel and will be overwritten before a MAD is sent.
+
+P_Key Index Handling
+====================
+
+ The old ib_umad interface did not allow setting the P_Key index for
+ MADs that are sent and did not provide a way for obtaining the P_Key
+ index of received MADs. A new layout for struct ib_user_mad_hdr
+ with a pkey_index member has been defined; however, to preserve binary
+ compatibility with older applications, this new layout will not be used
+ unless one of IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2 ioctl's
+ are called before a file descriptor is used for anything else.
+
+ In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
+ to 6, the new layout of struct ib_user_mad_hdr will be used by
+ default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
+
+Setting IsSM Capability Bit
+===========================
+
+ To set the IsSM capability bit for a port, simply open the
+ corresponding issm device file. If the IsSM bit is already set,
+ then the open call will block until the bit is cleared (or return
+ immediately with errno set to EAGAIN if the O_NONBLOCK flag is
+ passed to open()). The IsSM bit will be cleared when the issm file
+ is closed. No read, write or other operations can be performed on
+ the issm file.
+
+/dev files
+==========
+
+ To create the appropriate character device files automatically with
+ udev, a rule like::
+
+ KERNEL=="umad*", NAME="infiniband/%k"
+ KERNEL=="issm*", NAME="infiniband/%k"
+
+ can be used. This will create device nodes named::
+
+ /dev/infiniband/umad0
+ /dev/infiniband/issm0
+
+ for the first port, and so on. The InfiniBand device and port
+ associated with these devices can be determined from the files::
+
+ /sys/class/infiniband_mad/umad0/ibdev
+ /sys/class/infiniband_mad/umad0/port
+
+ and::
+
+ /sys/class/infiniband_mad/issm0/ibdev
+ /sys/class/infiniband_mad/issm0/port
diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt
deleted file mode 100644
index 7aca13a54a3a..000000000000
--- a/Documentation/infiniband/user_mad.txt
+++ /dev/null
@@ -1,153 +0,0 @@
-USERSPACE MAD ACCESS
-
-Device files
-
- Each port of each InfiniBand device has a "umad" device and an
- "issm" device attached. For example, a two-port HCA will have two
- umad devices and two issm devices, while a switch will have one
- device of each type (for switch port 0).
-
-Creating MAD agents
-
- A MAD agent can be created by filling in a struct ib_user_mad_reg_req
- and then calling the IB_USER_MAD_REGISTER_AGENT ioctl on a file
- descriptor for the appropriate device file. If the registration
- request succeeds, a 32-bit id will be returned in the structure.
- For example:
-
- struct ib_user_mad_reg_req req = { /* ... */ };
- ret = ioctl(fd, IB_USER_MAD_REGISTER_AGENT, (char *) &req);
- if (!ret)
- my_agent = req.id;
- else
- perror("agent register");
-
- Agents can be unregistered with the IB_USER_MAD_UNREGISTER_AGENT
- ioctl. Also, all agents registered through a file descriptor will
- be unregistered when the descriptor is closed.
-
- 2014 -- a new registration ioctl is now provided which allows additional
- fields to be provided during registration.
- Users of this registration call are implicitly setting the use of
- pkey_index (see below).
-
-Receiving MADs
-
- MADs are received using read(). The receive side now supports
- RMPP. The buffer passed to read() must be at least one
- struct ib_user_mad + 256 bytes. For example:
-
- If the buffer passed is not large enough to hold the received
- MAD (RMPP), the errno is set to ENOSPC and the length of the
- buffer needed is set in mad.length.
-
- Example for normal MAD (non RMPP) reads:
- struct ib_user_mad *mad;
- mad = malloc(sizeof *mad + 256);
- ret = read(fd, mad, sizeof *mad + 256);
- if (ret != sizeof mad + 256) {
- perror("read");
- free(mad);
- }
-
- Example for RMPP reads:
- struct ib_user_mad *mad;
- mad = malloc(sizeof *mad + 256);
- ret = read(fd, mad, sizeof *mad + 256);
- if (ret == -ENOSPC)) {
- length = mad.length;
- free(mad);
- mad = malloc(sizeof *mad + length);
- ret = read(fd, mad, sizeof *mad + length);
- }
- if (ret < 0) {
- perror("read");
- free(mad);
- }
-
- In addition to the actual MAD contents, the other struct ib_user_mad
- fields will be filled in with information on the received MAD. For
- example, the remote LID will be in mad.lid.
-
- If a send times out, a receive will be generated with mad.status set
- to ETIMEDOUT. Otherwise when a MAD has been successfully received,
- mad.status will be 0.
-
- poll()/select() may be used to wait until a MAD can be read.
-
-Sending MADs
-
- MADs are sent using write(). The agent ID for sending should be
- filled into the id field of the MAD, the destination LID should be
- filled into the lid field, and so on. The send side does support
- RMPP so arbitrary length MAD can be sent. For example:
-
- struct ib_user_mad *mad;
-
- mad = malloc(sizeof *mad + mad_length);
-
- /* fill in mad->data */
-
- mad->hdr.id = my_agent; /* req.id from agent registration */
- mad->hdr.lid = my_dest; /* in network byte order... */
- /* etc. */
-
- ret = write(fd, &mad, sizeof *mad + mad_length);
- if (ret != sizeof *mad + mad_length)
- perror("write");
-
-Transaction IDs
-
- Users of the umad devices can use the lower 32 bits of the
- transaction ID field (that is, the least significant half of the
- field in network byte order) in MADs being sent to match
- request/response pairs. The upper 32 bits are reserved for use by
- the kernel and will be overwritten before a MAD is sent.
-
-P_Key Index Handling
-
- The old ib_umad interface did not allow setting the P_Key index for
- MADs that are sent and did not provide a way for obtaining the P_Key
- index of received MADs. A new layout for struct ib_user_mad_hdr
- with a pkey_index member has been defined; however, to preserve binary
- compatibility with older applications, this new layout will not be used
- unless one of IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2 ioctl's
- are called before a file descriptor is used for anything else.
-
- In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented
- to 6, the new layout of struct ib_user_mad_hdr will be used by
- default, and the IB_USER_MAD_ENABLE_PKEY ioctl will be removed.
-
-Setting IsSM Capability Bit
-
- To set the IsSM capability bit for a port, simply open the
- corresponding issm device file. If the IsSM bit is already set,
- then the open call will block until the bit is cleared (or return
- immediately with errno set to EAGAIN if the O_NONBLOCK flag is
- passed to open()). The IsSM bit will be cleared when the issm file
- is closed. No read, write or other operations can be performed on
- the issm file.
-
-/dev files
-
- To create the appropriate character device files automatically with
- udev, a rule like
-
- KERNEL=="umad*", NAME="infiniband/%k"
- KERNEL=="issm*", NAME="infiniband/%k"
-
- can be used. This will create device nodes named
-
- /dev/infiniband/umad0
- /dev/infiniband/issm0
-
- for the first port, and so on. The InfiniBand device and port
- associated with these devices can be determined from the files
-
- /sys/class/infiniband_mad/umad0/ibdev
- /sys/class/infiniband_mad/umad0/port
-
- and
-
- /sys/class/infiniband_mad/issm0/ibdev
- /sys/class/infiniband_mad/issm0/port
diff --git a/Documentation/infiniband/user_verbs.rst b/Documentation/infiniband/user_verbs.rst
new file mode 100644
index 000000000000..8ddc4b1cfef2
--- /dev/null
+++ b/Documentation/infiniband/user_verbs.rst
@@ -0,0 +1,75 @@
+======================
+Userspace verbs access
+======================
+
+ The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
+ enables direct userspace access to IB hardware via "verbs," as
+ described in chapter 11 of the InfiniBand Architecture Specification.
+
+ To use the verbs, the libibverbs library, available from
+ https://github.com/linux-rdma/rdma-core, is required. libibverbs contains a
+ device-independent API for using the ib_uverbs interface.
+ libibverbs also requires appropriate device-dependent kernel and
+ userspace driver for your InfiniBand hardware. For example, to use
+ a Mellanox HCA, you will need the ib_mthca kernel module and the
+ libmthca userspace driver be installed.
+
+User-kernel communication
+=========================
+
+ Userspace communicates with the kernel for slow path, resource
+ management operations via the /dev/infiniband/uverbsN character
+ devices. Fast path operations are typically performed by writing
+ directly to hardware registers mmap()ed into userspace, with no
+ system call or context switch into the kernel.
+
+ Commands are sent to the kernel via write()s on these device files.
+ The ABI is defined in drivers/infiniband/include/ib_user_verbs.h.
+ The structs for commands that require a response from the kernel
+ contain a 64-bit field used to pass a pointer to an output buffer.
+ Status is returned to userspace as the return value of the write()
+ system call.
+
+Resource management
+===================
+
+ Since creation and destruction of all IB resources is done by
+ commands passed through a file descriptor, the kernel can keep track
+ of which resources are attached to a given userspace context. The
+ ib_uverbs module maintains idr tables that are used to translate
+ between kernel pointers and opaque userspace handles, so that kernel
+ pointers are never exposed to userspace and userspace cannot trick
+ the kernel into following a bogus pointer.
+
+ This also allows the kernel to clean up when a process exits and
+ prevent one process from touching another process's resources.
+
+Memory pinning
+==============
+
+ Direct userspace I/O requires that memory regions that are potential
+ I/O targets be kept resident at the same physical address. The
+ ib_uverbs module manages pinning and unpinning memory regions via
+ get_user_pages() and put_page() calls. It also accounts for the
+ amount of memory pinned in the process's pinned_vm, and checks that
+ unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
+
+ Pages that are pinned multiple times are counted each time they are
+ pinned, so the value of pinned_vm may be an overestimate of the
+ number of pages pinned by a process.
+
+/dev files
+==========
+
+ To create the appropriate character device files automatically with
+ udev, a rule like::
+
+ KERNEL=="uverbs*", NAME="infiniband/%k"
+
+ can be used. This will create device nodes named::
+
+ /dev/infiniband/uverbs0
+
+ and so on. Since the InfiniBand userspace verbs should be safe for
+ use by non-privileged processes, it may be useful to add an
+ appropriate MODE or GROUP to the udev rule.
diff --git a/Documentation/infiniband/user_verbs.txt b/Documentation/infiniband/user_verbs.txt
deleted file mode 100644
index 47ebf2f80b2b..000000000000
--- a/Documentation/infiniband/user_verbs.txt
+++ /dev/null
@@ -1,69 +0,0 @@
-USERSPACE VERBS ACCESS
-
- The ib_uverbs module, built by enabling CONFIG_INFINIBAND_USER_VERBS,
- enables direct userspace access to IB hardware via "verbs," as
- described in chapter 11 of the InfiniBand Architecture Specification.
-
- To use the verbs, the libibverbs library, available from
- https://github.com/linux-rdma/rdma-core, is required. libibverbs contains a
- device-independent API for using the ib_uverbs interface.
- libibverbs also requires appropriate device-dependent kernel and
- userspace driver for your InfiniBand hardware. For example, to use
- a Mellanox HCA, you will need the ib_mthca kernel module and the
- libmthca userspace driver be installed.
-
-User-kernel communication
-
- Userspace communicates with the kernel for slow path, resource
- management operations via the /dev/infiniband/uverbsN character
- devices. Fast path operations are typically performed by writing
- directly to hardware registers mmap()ed into userspace, with no
- system call or context switch into the kernel.
-
- Commands are sent to the kernel via write()s on these device files.
- The ABI is defined in drivers/infiniband/include/ib_user_verbs.h.
- The structs for commands that require a response from the kernel
- contain a 64-bit field used to pass a pointer to an output buffer.
- Status is returned to userspace as the return value of the write()
- system call.
-
-Resource management
-
- Since creation and destruction of all IB resources is done by
- commands passed through a file descriptor, the kernel can keep track
- of which resources are attached to a given userspace context. The
- ib_uverbs module maintains idr tables that are used to translate
- between kernel pointers and opaque userspace handles, so that kernel
- pointers are never exposed to userspace and userspace cannot trick
- the kernel into following a bogus pointer.
-
- This also allows the kernel to clean up when a process exits and
- prevent one process from touching another process's resources.
-
-Memory pinning
-
- Direct userspace I/O requires that memory regions that are potential
- I/O targets be kept resident at the same physical address. The
- ib_uverbs module manages pinning and unpinning memory regions via
- get_user_pages() and put_page() calls. It also accounts for the
- amount of memory pinned in the process's pinned_vm, and checks that
- unprivileged processes do not exceed their RLIMIT_MEMLOCK limit.
-
- Pages that are pinned multiple times are counted each time they are
- pinned, so the value of pinned_vm may be an overestimate of the
- number of pages pinned by a process.
-
-/dev files
-
- To create the appropriate character device files automatically with
- udev, a rule like
-
- KERNEL=="uverbs*", NAME="infiniband/%k"
-
- can be used. This will create device nodes named
-
- /dev/infiniband/uverbs0
-
- and so on. Since the InfiniBand userspace verbs should be safe for
- use by non-privileged processes, it may be useful to add an
- appropriate MODE or GROUP to the udev rule.
diff --git a/Documentation/input/input.rst b/Documentation/input/input.rst
index 47f86a4bf16c..0eb61e67a7b7 100644
--- a/Documentation/input/input.rst
+++ b/Documentation/input/input.rst
@@ -188,7 +188,7 @@ LCDs and many other purposes.
The monitor and speaker controls should be easy to add to the hid/input
interface, but for the UPSs and LCDs it doesn't make much sense. For this,
-the hiddev interface was designed. See Documentation/hid/hiddev.txt
+the hiddev interface was designed. See Documentation/hid/hiddev.rst
for more information about it.
The usage of the usbhid module is very simple, it takes no parameters,
diff --git a/Documentation/interconnect/interconnect.rst b/Documentation/interconnect/interconnect.rst
deleted file mode 100644
index 56e331dab70e..000000000000
--- a/Documentation/interconnect/interconnect.rst
+++ /dev/null
@@ -1,95 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-:orphan:
-
-=====================================
-GENERIC SYSTEM INTERCONNECT SUBSYSTEM
-=====================================
-
-Introduction
-------------
-
-This framework is designed to provide a standard kernel interface to control
-the settings of the interconnects on an SoC. These settings can be throughput,
-latency and priority between multiple interconnected devices or functional
-blocks. This can be controlled dynamically in order to save power or provide
-maximum performance.
-
-The interconnect bus is hardware with configurable parameters, which can be
-set on a data path according to the requests received from various drivers.
-An example of interconnect buses are the interconnects between various
-components or functional blocks in chipsets. There can be multiple interconnects
-on an SoC that can be multi-tiered.
-
-Below is a simplified diagram of a real-world SoC interconnect bus topology.
-
-::
-
- +----------------+ +----------------+
- | HW Accelerator |--->| M NoC |<---------------+
- +----------------+ +----------------+ |
- | | +------------+
- +-----+ +-------------+ V +------+ | |
- | DDR | | +--------+ | PCIe | | |
- +-----+ | | Slaves | +------+ | |
- ^ ^ | +--------+ | | C NoC |
- | | V V | |
- +------------------+ +------------------------+ | | +-----+
- | |-->| |-->| |-->| CPU |
- | |-->| |<--| | +-----+
- | Mem NoC | | S NoC | +------------+
- | |<--| |---------+ |
- | |<--| |<------+ | | +--------+
- +------------------+ +------------------------+ | | +-->| Slaves |
- ^ ^ ^ ^ ^ | | +--------+
- | | | | | | V
- +------+ | +-----+ +-----+ +---------+ +----------------+ +--------+
- | CPUs | | | GPU | | DSP | | Masters |-->| P NoC |-->| Slaves |
- +------+ | +-----+ +-----+ +---------+ +----------------+ +--------+
- |
- +-------+
- | Modem |
- +-------+
-
-Terminology
------------
-
-Interconnect provider is the software definition of the interconnect hardware.
-The interconnect providers on the above diagram are M NoC, S NoC, C NoC, P NoC
-and Mem NoC.
-
-Interconnect node is the software definition of the interconnect hardware
-port. Each interconnect provider consists of multiple interconnect nodes,
-which are connected to other SoC components including other interconnect
-providers. The point on the diagram where the CPUs connect to the memory is
-called an interconnect node, which belongs to the Mem NoC interconnect provider.
-
-Interconnect endpoints are the first or the last element of the path. Every
-endpoint is a node, but not every node is an endpoint.
-
-Interconnect path is everything between two endpoints including all the nodes
-that have to be traversed to reach from a source to destination node. It may
-include multiple master-slave pairs across several interconnect providers.
-
-Interconnect consumers are the entities which make use of the data paths exposed
-by the providers. The consumers send requests to providers requesting various
-throughput, latency and priority. Usually the consumers are device drivers, that
-send request based on their needs. An example for a consumer is a video decoder
-that supports various formats and image sizes.
-
-Interconnect providers
-----------------------
-
-Interconnect provider is an entity that implements methods to initialize and
-configure interconnect bus hardware. The interconnect provider drivers should
-be registered with the interconnect provider core.
-
-.. kernel-doc:: include/linux/interconnect-provider.h
-
-Interconnect consumers
-----------------------
-
-Interconnect consumers are the clients which use the interconnect APIs to
-get paths between endpoints and set their bandwidth/latency/QoS requirements
-for these interconnect paths. These interfaces are not currently
-documented.
diff --git a/Documentation/ioctl/botching-up-ioctls.rst b/Documentation/ioctl/botching-up-ioctls.rst
new file mode 100644
index 000000000000..ac697fef3545
--- /dev/null
+++ b/Documentation/ioctl/botching-up-ioctls.rst
@@ -0,0 +1,225 @@
+=================================
+(How to avoid) Botching up ioctls
+=================================
+
+From: http://blog.ffwll.ch/2013/11/botching-up-ioctls.html
+
+By: Daniel Vetter, Copyright © 2013 Intel Corporation
+
+One clear insight kernel graphics hackers gained in the past few years is that
+trying to come up with a unified interface to manage the execution units and
+memory on completely different GPUs is a futile effort. So nowadays every
+driver has its own set of ioctls to allocate memory and submit work to the GPU.
+Which is nice, since there's no more insanity in the form of fake-generic, but
+actually only used once interfaces. But the clear downside is that there's much
+more potential to screw things up.
+
+To avoid repeating all the same mistakes again I've written up some of the
+lessons learned while botching the job for the drm/i915 driver. Most of these
+only cover technicalities and not the big-picture issues like what the command
+submission ioctl exactly should look like. Learning these lessons is probably
+something every GPU driver has to do on its own.
+
+
+Prerequisites
+-------------
+
+First the prerequisites. Without these you have already failed, because you
+will need to add a 32-bit compat layer:
+
+ * Only use fixed sized integers. To avoid conflicts with typedefs in userspace
+ the kernel has special types like __u32, __s64. Use them.
+
+ * Align everything to the natural size and use explicit padding. 32-bit
+ platforms don't necessarily align 64-bit values to 64-bit boundaries, but
+ 64-bit platforms do. So we always need padding to the natural size to get
+ this right.
+
+ * Pad the entire struct to a multiple of 64-bits if the structure contains
+ 64-bit types - the structure size will otherwise differ on 32-bit versus
+ 64-bit. Having a different structure size hurts when passing arrays of
+ structures to the kernel, or if the kernel checks the structure size, which
+ e.g. the drm core does.
+
+ * Pointers are __u64, cast from/to a uintprt_t on the userspace side and
+ from/to a void __user * in the kernel. Try really hard not to delay this
+ conversion or worse, fiddle the raw __u64 through your code since that
+ diminishes the checking tools like sparse can provide. The macro
+ u64_to_user_ptr can be used in the kernel to avoid warnings about integers
+ and pointres of different sizes.
+
+
+Basics
+------
+
+With the joys of writing a compat layer avoided we can take a look at the basic
+fumbles. Neglecting these will make backward and forward compatibility a real
+pain. And since getting things wrong on the first attempt is guaranteed you
+will have a second iteration or at least an extension for any given interface.
+
+ * Have a clear way for userspace to figure out whether your new ioctl or ioctl
+ extension is supported on a given kernel. If you can't rely on old kernels
+ rejecting the new flags/modes or ioctls (since doing that was botched in the
+ past) then you need a driver feature flag or revision number somewhere.
+
+ * Have a plan for extending ioctls with new flags or new fields at the end of
+ the structure. The drm core checks the passed-in size for each ioctl call
+ and zero-extends any mismatches between kernel and userspace. That helps,
+ but isn't a complete solution since newer userspace on older kernels won't
+ notice that the newly added fields at the end get ignored. So this still
+ needs a new driver feature flags.
+
+ * Check all unused fields and flags and all the padding for whether it's 0,
+ and reject the ioctl if that's not the case. Otherwise your nice plan for
+ future extensions is going right down the gutters since someone will submit
+ an ioctl struct with random stack garbage in the yet unused parts. Which
+ then bakes in the ABI that those fields can never be used for anything else
+ but garbage. This is also the reason why you must explicitly pad all
+ structures, even if you never use them in an array - the padding the compiler
+ might insert could contain garbage.
+
+ * Have simple testcases for all of the above.
+
+
+Fun with Error Paths
+--------------------
+
+Nowadays we don't have any excuse left any more for drm drivers being neat
+little root exploits. This means we both need full input validation and solid
+error handling paths - GPUs will die eventually in the oddmost corner cases
+anyway:
+
+ * The ioctl must check for array overflows. Also it needs to check for
+ over/underflows and clamping issues of integer values in general. The usual
+ example is sprite positioning values fed directly into the hardware with the
+ hardware just having 12 bits or so. Works nicely until some odd display
+ server doesn't bother with clamping itself and the cursor wraps around the
+ screen.
+
+ * Have simple testcases for every input validation failure case in your ioctl.
+ Check that the error code matches your expectations. And finally make sure
+ that you only test for one single error path in each subtest by submitting
+ otherwise perfectly valid data. Without this an earlier check might reject
+ the ioctl already and shadow the codepath you actually want to test, hiding
+ bugs and regressions.
+
+ * Make all your ioctls restartable. First X really loves signals and second
+ this will allow you to test 90% of all error handling paths by just
+ interrupting your main test suite constantly with signals. Thanks to X's
+ love for signal you'll get an excellent base coverage of all your error
+ paths pretty much for free for graphics drivers. Also, be consistent with
+ how you handle ioctl restarting - e.g. drm has a tiny drmIoctl helper in its
+ userspace library. The i915 driver botched this with the set_tiling ioctl,
+ now we're stuck forever with some arcane semantics in both the kernel and
+ userspace.
+
+ * If you can't make a given codepath restartable make a stuck task at least
+ killable. GPUs just die and your users won't like you more if you hang their
+ entire box (by means of an unkillable X process). If the state recovery is
+ still too tricky have a timeout or hangcheck safety net as a last-ditch
+ effort in case the hardware has gone bananas.
+
+ * Have testcases for the really tricky corner cases in your error recovery code
+ - it's way too easy to create a deadlock between your hangcheck code and
+ waiters.
+
+
+Time, Waiting and Missing it
+----------------------------
+
+GPUs do most everything asynchronously, so we have a need to time operations and
+wait for outstanding ones. This is really tricky business; at the moment none of
+the ioctls supported by the drm/i915 get this fully right, which means there's
+still tons more lessons to learn here.
+
+ * Use CLOCK_MONOTONIC as your reference time, always. It's what alsa, drm and
+ v4l use by default nowadays. But let userspace know which timestamps are
+ derived from different clock domains like your main system clock (provided
+ by the kernel) or some independent hardware counter somewhere else. Clocks
+ will mismatch if you look close enough, but if performance measuring tools
+ have this information they can at least compensate. If your userspace can
+ get at the raw values of some clocks (e.g. through in-command-stream
+ performance counter sampling instructions) consider exposing those also.
+
+ * Use __s64 seconds plus __u64 nanoseconds to specify time. It's not the most
+ convenient time specification, but it's mostly the standard.
+
+ * Check that input time values are normalized and reject them if not. Note
+ that the kernel native struct ktime has a signed integer for both seconds
+ and nanoseconds, so beware here.
+
+ * For timeouts, use absolute times. If you're a good fellow and made your
+ ioctl restartable relative timeouts tend to be too coarse and can
+ indefinitely extend your wait time due to rounding on each restart.
+ Especially if your reference clock is something really slow like the display
+ frame counter. With a spec lawyer hat on this isn't a bug since timeouts can
+ always be extended - but users will surely hate you if their neat animations
+ starts to stutter due to this.
+
+ * Consider ditching any synchronous wait ioctls with timeouts and just deliver
+ an asynchronous event on a pollable file descriptor. It fits much better
+ into event driven applications' main loop.
+
+ * Have testcases for corner-cases, especially whether the return values for
+ already-completed events, successful waits and timed-out waits are all sane
+ and suiting to your needs.
+
+
+Leaking Resources, Not
+----------------------
+
+A full-blown drm driver essentially implements a little OS, but specialized to
+the given GPU platforms. This means a driver needs to expose tons of handles
+for different objects and other resources to userspace. Doing that right
+entails its own little set of pitfalls:
+
+ * Always attach the lifetime of your dynamically created resources to the
+ lifetime of a file descriptor. Consider using a 1:1 mapping if your resource
+ needs to be shared across processes - fd-passing over unix domain sockets
+ also simplifies lifetime management for userspace.
+
+ * Always have O_CLOEXEC support.
+
+ * Ensure that you have sufficient insulation between different clients. By
+ default pick a private per-fd namespace which forces any sharing to be done
+ explicitly. Only go with a more global per-device namespace if the objects
+ are truly device-unique. One counterexample in the drm modeset interfaces is
+ that the per-device modeset objects like connectors share a namespace with
+ framebuffer objects, which mostly are not shared at all. A separate
+ namespace, private by default, for framebuffers would have been more
+ suitable.
+
+ * Think about uniqueness requirements for userspace handles. E.g. for most drm
+ drivers it's a userspace bug to submit the same object twice in the same
+ command submission ioctl. But then if objects are shareable userspace needs
+ to know whether it has seen an imported object from a different process
+ already or not. I haven't tried this myself yet due to lack of a new class
+ of objects, but consider using inode numbers on your shared file descriptors
+ as unique identifiers - it's how real files are told apart, too.
+ Unfortunately this requires a full-blown virtual filesystem in the kernel.
+
+
+Last, but not Least
+-------------------
+
+Not every problem needs a new ioctl:
+
+ * Think hard whether you really want a driver-private interface. Of course
+ it's much quicker to push a driver-private interface than engaging in
+ lengthy discussions for a more generic solution. And occasionally doing a
+ private interface to spearhead a new concept is what's required. But in the
+ end, once the generic interface comes around you'll end up maintainer two
+ interfaces. Indefinitely.
+
+ * Consider other interfaces than ioctls. A sysfs attribute is much better for
+ per-device settings, or for child objects with fairly static lifetimes (like
+ output connectors in drm with all the detection override attributes). Or
+ maybe only your testsuite needs this interface, and then debugfs with its
+ disclaimer of not having a stable ABI would be better.
+
+Finally, the name of the game is to get it right on the first attempt, since if
+your driver proves popular and your hardware platforms long-lived then you'll
+be stuck with a given ioctl essentially forever. You can try to deprecate
+horrible ioctls on newer iterations of your hardware, but generally it takes
+years to accomplish this. And then again years until the last user able to
+complain about regressions disappears, too.
diff --git a/Documentation/ioctl/botching-up-ioctls.txt b/Documentation/ioctl/botching-up-ioctls.txt
deleted file mode 100644
index 883fb034bd04..000000000000
--- a/Documentation/ioctl/botching-up-ioctls.txt
+++ /dev/null
@@ -1,224 +0,0 @@
-(How to avoid) Botching up ioctls
-=================================
-
-From: http://blog.ffwll.ch/2013/11/botching-up-ioctls.html
-
-By: Daniel Vetter, Copyright © 2013 Intel Corporation
-
-One clear insight kernel graphics hackers gained in the past few years is that
-trying to come up with a unified interface to manage the execution units and
-memory on completely different GPUs is a futile effort. So nowadays every
-driver has its own set of ioctls to allocate memory and submit work to the GPU.
-Which is nice, since there's no more insanity in the form of fake-generic, but
-actually only used once interfaces. But the clear downside is that there's much
-more potential to screw things up.
-
-To avoid repeating all the same mistakes again I've written up some of the
-lessons learned while botching the job for the drm/i915 driver. Most of these
-only cover technicalities and not the big-picture issues like what the command
-submission ioctl exactly should look like. Learning these lessons is probably
-something every GPU driver has to do on its own.
-
-
-Prerequisites
--------------
-
-First the prerequisites. Without these you have already failed, because you
-will need to add a 32-bit compat layer:
-
- * Only use fixed sized integers. To avoid conflicts with typedefs in userspace
- the kernel has special types like __u32, __s64. Use them.
-
- * Align everything to the natural size and use explicit padding. 32-bit
- platforms don't necessarily align 64-bit values to 64-bit boundaries, but
- 64-bit platforms do. So we always need padding to the natural size to get
- this right.
-
- * Pad the entire struct to a multiple of 64-bits if the structure contains
- 64-bit types - the structure size will otherwise differ on 32-bit versus
- 64-bit. Having a different structure size hurts when passing arrays of
- structures to the kernel, or if the kernel checks the structure size, which
- e.g. the drm core does.
-
- * Pointers are __u64, cast from/to a uintprt_t on the userspace side and
- from/to a void __user * in the kernel. Try really hard not to delay this
- conversion or worse, fiddle the raw __u64 through your code since that
- diminishes the checking tools like sparse can provide. The macro
- u64_to_user_ptr can be used in the kernel to avoid warnings about integers
- and pointres of different sizes.
-
-
-Basics
-------
-
-With the joys of writing a compat layer avoided we can take a look at the basic
-fumbles. Neglecting these will make backward and forward compatibility a real
-pain. And since getting things wrong on the first attempt is guaranteed you
-will have a second iteration or at least an extension for any given interface.
-
- * Have a clear way for userspace to figure out whether your new ioctl or ioctl
- extension is supported on a given kernel. If you can't rely on old kernels
- rejecting the new flags/modes or ioctls (since doing that was botched in the
- past) then you need a driver feature flag or revision number somewhere.
-
- * Have a plan for extending ioctls with new flags or new fields at the end of
- the structure. The drm core checks the passed-in size for each ioctl call
- and zero-extends any mismatches between kernel and userspace. That helps,
- but isn't a complete solution since newer userspace on older kernels won't
- notice that the newly added fields at the end get ignored. So this still
- needs a new driver feature flags.
-
- * Check all unused fields and flags and all the padding for whether it's 0,
- and reject the ioctl if that's not the case. Otherwise your nice plan for
- future extensions is going right down the gutters since someone will submit
- an ioctl struct with random stack garbage in the yet unused parts. Which
- then bakes in the ABI that those fields can never be used for anything else
- but garbage. This is also the reason why you must explicitly pad all
- structures, even if you never use them in an array - the padding the compiler
- might insert could contain garbage.
-
- * Have simple testcases for all of the above.
-
-
-Fun with Error Paths
---------------------
-
-Nowadays we don't have any excuse left any more for drm drivers being neat
-little root exploits. This means we both need full input validation and solid
-error handling paths - GPUs will die eventually in the oddmost corner cases
-anyway:
-
- * The ioctl must check for array overflows. Also it needs to check for
- over/underflows and clamping issues of integer values in general. The usual
- example is sprite positioning values fed directly into the hardware with the
- hardware just having 12 bits or so. Works nicely until some odd display
- server doesn't bother with clamping itself and the cursor wraps around the
- screen.
-
- * Have simple testcases for every input validation failure case in your ioctl.
- Check that the error code matches your expectations. And finally make sure
- that you only test for one single error path in each subtest by submitting
- otherwise perfectly valid data. Without this an earlier check might reject
- the ioctl already and shadow the codepath you actually want to test, hiding
- bugs and regressions.
-
- * Make all your ioctls restartable. First X really loves signals and second
- this will allow you to test 90% of all error handling paths by just
- interrupting your main test suite constantly with signals. Thanks to X's
- love for signal you'll get an excellent base coverage of all your error
- paths pretty much for free for graphics drivers. Also, be consistent with
- how you handle ioctl restarting - e.g. drm has a tiny drmIoctl helper in its
- userspace library. The i915 driver botched this with the set_tiling ioctl,
- now we're stuck forever with some arcane semantics in both the kernel and
- userspace.
-
- * If you can't make a given codepath restartable make a stuck task at least
- killable. GPUs just die and your users won't like you more if you hang their
- entire box (by means of an unkillable X process). If the state recovery is
- still too tricky have a timeout or hangcheck safety net as a last-ditch
- effort in case the hardware has gone bananas.
-
- * Have testcases for the really tricky corner cases in your error recovery code
- - it's way too easy to create a deadlock between your hangcheck code and
- waiters.
-
-
-Time, Waiting and Missing it
-----------------------------
-
-GPUs do most everything asynchronously, so we have a need to time operations and
-wait for outstanding ones. This is really tricky business; at the moment none of
-the ioctls supported by the drm/i915 get this fully right, which means there's
-still tons more lessons to learn here.
-
- * Use CLOCK_MONOTONIC as your reference time, always. It's what alsa, drm and
- v4l use by default nowadays. But let userspace know which timestamps are
- derived from different clock domains like your main system clock (provided
- by the kernel) or some independent hardware counter somewhere else. Clocks
- will mismatch if you look close enough, but if performance measuring tools
- have this information they can at least compensate. If your userspace can
- get at the raw values of some clocks (e.g. through in-command-stream
- performance counter sampling instructions) consider exposing those also.
-
- * Use __s64 seconds plus __u64 nanoseconds to specify time. It's not the most
- convenient time specification, but it's mostly the standard.
-
- * Check that input time values are normalized and reject them if not. Note
- that the kernel native struct ktime has a signed integer for both seconds
- and nanoseconds, so beware here.
-
- * For timeouts, use absolute times. If you're a good fellow and made your
- ioctl restartable relative timeouts tend to be too coarse and can
- indefinitely extend your wait time due to rounding on each restart.
- Especially if your reference clock is something really slow like the display
- frame counter. With a spec lawyer hat on this isn't a bug since timeouts can
- always be extended - but users will surely hate you if their neat animations
- starts to stutter due to this.
-
- * Consider ditching any synchronous wait ioctls with timeouts and just deliver
- an asynchronous event on a pollable file descriptor. It fits much better
- into event driven applications' main loop.
-
- * Have testcases for corner-cases, especially whether the return values for
- already-completed events, successful waits and timed-out waits are all sane
- and suiting to your needs.
-
-
-Leaking Resources, Not
-----------------------
-
-A full-blown drm driver essentially implements a little OS, but specialized to
-the given GPU platforms. This means a driver needs to expose tons of handles
-for different objects and other resources to userspace. Doing that right
-entails its own little set of pitfalls:
-
- * Always attach the lifetime of your dynamically created resources to the
- lifetime of a file descriptor. Consider using a 1:1 mapping if your resource
- needs to be shared across processes - fd-passing over unix domain sockets
- also simplifies lifetime management for userspace.
-
- * Always have O_CLOEXEC support.
-
- * Ensure that you have sufficient insulation between different clients. By
- default pick a private per-fd namespace which forces any sharing to be done
- explicitly. Only go with a more global per-device namespace if the objects
- are truly device-unique. One counterexample in the drm modeset interfaces is
- that the per-device modeset objects like connectors share a namespace with
- framebuffer objects, which mostly are not shared at all. A separate
- namespace, private by default, for framebuffers would have been more
- suitable.
-
- * Think about uniqueness requirements for userspace handles. E.g. for most drm
- drivers it's a userspace bug to submit the same object twice in the same
- command submission ioctl. But then if objects are shareable userspace needs
- to know whether it has seen an imported object from a different process
- already or not. I haven't tried this myself yet due to lack of a new class
- of objects, but consider using inode numbers on your shared file descriptors
- as unique identifiers - it's how real files are told apart, too.
- Unfortunately this requires a full-blown virtual filesystem in the kernel.
-
-
-Last, but not Least
--------------------
-
-Not every problem needs a new ioctl:
-
- * Think hard whether you really want a driver-private interface. Of course
- it's much quicker to push a driver-private interface than engaging in
- lengthy discussions for a more generic solution. And occasionally doing a
- private interface to spearhead a new concept is what's required. But in the
- end, once the generic interface comes around you'll end up maintainer two
- interfaces. Indefinitely.
-
- * Consider other interfaces than ioctls. A sysfs attribute is much better for
- per-device settings, or for child objects with fairly static lifetimes (like
- output connectors in drm with all the detection override attributes). Or
- maybe only your testsuite needs this interface, and then debugfs with its
- disclaimer of not having a stable ABI would be better.
-
-Finally, the name of the game is to get it right on the first attempt, since if
-your driver proves popular and your hardware platforms long-lived then you'll
-be stuck with a given ioctl essentially forever. You can try to deprecate
-horrible ioctls on newer iterations of your hardware, but generally it takes
-years to accomplish this. And then again years until the last user able to
-complain about regressions disappears, too.
diff --git a/Documentation/ioctl/cdrom.rst b/Documentation/ioctl/cdrom.rst
new file mode 100644
index 000000000000..3b4c0506de46
--- /dev/null
+++ b/Documentation/ioctl/cdrom.rst
@@ -0,0 +1,1233 @@
+============================
+Summary of CDROM ioctl calls
+============================
+
+- Edward A. Falk <efalk@google.com>
+
+November, 2004
+
+This document attempts to describe the ioctl(2) calls supported by
+the CDROM layer. These are by-and-large implemented (as of Linux 2.6)
+in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c
+
+ioctl values are listed in <linux/cdrom.h>. As of this writing, they
+are as follows:
+
+ ====================== ===============================================
+ CDROMPAUSE Pause Audio Operation
+ CDROMRESUME Resume paused Audio Operation
+ CDROMPLAYMSF Play Audio MSF (struct cdrom_msf)
+ CDROMPLAYTRKIND Play Audio Track/index (struct cdrom_ti)
+ CDROMREADTOCHDR Read TOC header (struct cdrom_tochdr)
+ CDROMREADTOCENTRY Read TOC entry (struct cdrom_tocentry)
+ CDROMSTOP Stop the cdrom drive
+ CDROMSTART Start the cdrom drive
+ CDROMEJECT Ejects the cdrom media
+ CDROMVOLCTRL Control output volume (struct cdrom_volctrl)
+ CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl)
+ CDROMREADMODE2 Read CDROM mode 2 data (2336 Bytes)
+ (struct cdrom_read)
+ CDROMREADMODE1 Read CDROM mode 1 data (2048 Bytes)
+ (struct cdrom_read)
+ CDROMREADAUDIO (struct cdrom_read_audio)
+ CDROMEJECT_SW enable(1)/disable(0) auto-ejecting
+ CDROMMULTISESSION Obtain the start-of-last-session
+ address of multi session disks
+ (struct cdrom_multisession)
+ CDROM_GET_MCN Obtain the "Universal Product Code"
+ if available (struct cdrom_mcn)
+ CDROM_GET_UPC Deprecated, use CDROM_GET_MCN instead.
+ CDROMRESET hard-reset the drive
+ CDROMVOLREAD Get the drive's volume setting
+ (struct cdrom_volctrl)
+ CDROMREADRAW read data in raw mode (2352 Bytes)
+ (struct cdrom_read)
+ CDROMREADCOOKED read data in cooked mode
+ CDROMSEEK seek msf address
+ CDROMPLAYBLK scsi-cd only, (struct cdrom_blk)
+ CDROMREADALL read all 2646 bytes
+ CDROMGETSPINDOWN return 4-bit spindown value
+ CDROMSETSPINDOWN set 4-bit spindown value
+ CDROMCLOSETRAY pendant of CDROMEJECT
+ CDROM_SET_OPTIONS Set behavior options
+ CDROM_CLEAR_OPTIONS Clear behavior options
+ CDROM_SELECT_SPEED Set the CD-ROM speed
+ CDROM_SELECT_DISC Select disc (for juke-boxes)
+ CDROM_MEDIA_CHANGED Check is media changed
+ CDROM_DRIVE_STATUS Get tray position, etc.
+ CDROM_DISC_STATUS Get disc type, etc.
+ CDROM_CHANGER_NSLOTS Get number of slots
+ CDROM_LOCKDOOR lock or unlock door
+ CDROM_DEBUG Turn debug messages on/off
+ CDROM_GET_CAPABILITY get capabilities
+ CDROMAUDIOBUFSIZ set the audio buffer size
+ DVD_READ_STRUCT Read structure
+ DVD_WRITE_STRUCT Write structure
+ DVD_AUTH Authentication
+ CDROM_SEND_PACKET send a packet to the drive
+ CDROM_NEXT_WRITABLE get next writable block
+ CDROM_LAST_WRITTEN get last block written on disc
+ ====================== ===============================================
+
+
+The information that follows was determined from reading kernel source
+code. It is likely that some corrections will be made over time.
+
+------------------------------------------------------------------------------
+
+General:
+
+ Unless otherwise specified, all ioctl calls return 0 on success
+ and -1 with errno set to an appropriate value on error. (Some
+ ioctls return non-negative data values.)
+
+ Unless otherwise specified, all ioctl calls return -1 and set
+ errno to EFAULT on a failed attempt to copy data to or from user
+ address space.
+
+ Individual drivers may return error codes not listed here.
+
+ Unless otherwise specified, all data structures and constants
+ are defined in <linux/cdrom.h>
+
+------------------------------------------------------------------------------
+
+
+CDROMPAUSE
+ Pause Audio Operation
+
+
+ usage::
+
+ ioctl(fd, CDROMPAUSE, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+
+CDROMRESUME
+ Resume paused Audio Operation
+
+
+ usage::
+
+ ioctl(fd, CDROMRESUME, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+
+CDROMPLAYMSF
+ Play Audio MSF
+
+ (struct cdrom_msf)
+
+
+ usage::
+
+ struct cdrom_msf msf;
+
+ ioctl(fd, CDROMPLAYMSF, &msf);
+
+ inputs:
+ cdrom_msf structure, describing a segment of music to play
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+ notes:
+ - MSF stands for minutes-seconds-frames
+ - LBA stands for logical block address
+ - Segment is described as start and end times, where each time
+ is described as minutes:seconds:frames.
+ A frame is 1/75 of a second.
+
+
+CDROMPLAYTRKIND
+ Play Audio Track/index
+
+ (struct cdrom_ti)
+
+
+ usage::
+
+ struct cdrom_ti ti;
+
+ ioctl(fd, CDROMPLAYTRKIND, &ti);
+
+ inputs:
+ cdrom_ti structure, describing a segment of music to play
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+ notes:
+ - Segment is described as start and end times, where each time
+ is described as a track and an index.
+
+
+
+CDROMREADTOCHDR
+ Read TOC header
+
+ (struct cdrom_tochdr)
+
+
+ usage::
+
+ cdrom_tochdr header;
+
+ ioctl(fd, CDROMREADTOCHDR, &header);
+
+ inputs:
+ cdrom_tochdr structure
+
+
+ outputs:
+ cdrom_tochdr structure
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+
+
+CDROMREADTOCENTRY
+ Read TOC entry
+
+ (struct cdrom_tocentry)
+
+
+ usage::
+
+ struct cdrom_tocentry entry;
+
+ ioctl(fd, CDROMREADTOCENTRY, &entry);
+
+ inputs:
+ cdrom_tocentry structure
+
+
+ outputs:
+ cdrom_tocentry structure
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+ - EINVAL entry.cdte_format not CDROM_MSF or CDROM_LBA
+ - EINVAL requested track out of bounds
+ - EIO I/O error reading TOC
+
+ notes:
+ - TOC stands for Table Of Contents
+ - MSF stands for minutes-seconds-frames
+ - LBA stands for logical block address
+
+
+
+CDROMSTOP
+ Stop the cdrom drive
+
+
+ usage::
+
+ ioctl(fd, CDROMSTOP, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+ notes:
+ - Exact interpretation of this ioctl depends on the device,
+ but most seem to spin the drive down.
+
+
+CDROMSTART
+ Start the cdrom drive
+
+
+ usage::
+
+ ioctl(fd, CDROMSTART, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+ notes:
+ - Exact interpretation of this ioctl depends on the device,
+ but most seem to spin the drive up and/or close the tray.
+ Other devices ignore the ioctl completely.
+
+
+CDROMEJECT
+ - Ejects the cdrom media
+
+
+ usage::
+
+ ioctl(fd, CDROMEJECT, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ none
+
+
+ error returns:
+ - ENOSYS cd drive not capable of ejecting
+ - EBUSY other processes are accessing drive, or door is locked
+
+ notes:
+ - See CDROM_LOCKDOOR, below.
+
+
+
+
+CDROMCLOSETRAY
+ pendant of CDROMEJECT
+
+
+ usage::
+
+ ioctl(fd, CDROMCLOSETRAY, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ none
+
+
+ error returns:
+ - ENOSYS cd drive not capable of closing the tray
+ - EBUSY other processes are accessing drive, or door is locked
+
+ notes:
+ - See CDROM_LOCKDOOR, below.
+
+
+
+
+CDROMVOLCTRL
+ Control output volume (struct cdrom_volctrl)
+
+
+ usage::
+
+ struct cdrom_volctrl volume;
+
+ ioctl(fd, CDROMVOLCTRL, &volume);
+
+ inputs:
+ cdrom_volctrl structure containing volumes for up to 4
+ channels.
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+
+
+CDROMVOLREAD
+ Get the drive's volume setting
+
+ (struct cdrom_volctrl)
+
+
+ usage::
+
+ struct cdrom_volctrl volume;
+
+ ioctl(fd, CDROMVOLREAD, &volume);
+
+ inputs:
+ none
+
+
+ outputs:
+ The current volume settings.
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+
+
+
+CDROMSUBCHNL
+ Read subchannel data
+
+ (struct cdrom_subchnl)
+
+
+ usage::
+
+ struct cdrom_subchnl q;
+
+ ioctl(fd, CDROMSUBCHNL, &q);
+
+ inputs:
+ cdrom_subchnl structure
+
+
+ outputs:
+ cdrom_subchnl structure
+
+
+ error return:
+ - ENOSYS cd drive not audio-capable.
+ - EINVAL format not CDROM_MSF or CDROM_LBA
+
+ notes:
+ - Format is converted to CDROM_MSF or CDROM_LBA
+ as per user request on return
+
+
+
+CDROMREADRAW
+ read data in raw mode (2352 Bytes)
+
+ (struct cdrom_read)
+
+ usage::
+
+ union {
+
+ struct cdrom_msf msf; /* input */
+ char buffer[CD_FRAMESIZE_RAW]; /* return */
+ } arg;
+ ioctl(fd, CDROMREADRAW, &arg);
+
+ inputs:
+ cdrom_msf structure indicating an address to read.
+
+ Only the start values are significant.
+
+ outputs:
+ Data written to address provided by user.
+
+
+ error return:
+ - EINVAL address less than 0, or msf less than 0:2:0
+ - ENOMEM out of memory
+
+ notes:
+ - As of 2.6.8.1, comments in <linux/cdrom.h> indicate that this
+ ioctl accepts a cdrom_read structure, but actual source code
+ reads a cdrom_msf structure and writes a buffer of data to
+ the same address.
+
+ - MSF values are converted to LBA values via this formula::
+
+ lba = (((m * CD_SECS) + s) * CD_FRAMES + f) - CD_MSF_OFFSET;
+
+
+
+
+CDROMREADMODE1
+ Read CDROM mode 1 data (2048 Bytes)
+
+ (struct cdrom_read)
+
+ notes:
+ Identical to CDROMREADRAW except that block size is
+ CD_FRAMESIZE (2048) bytes
+
+
+
+CDROMREADMODE2
+ Read CDROM mode 2 data (2336 Bytes)
+
+ (struct cdrom_read)
+
+ notes:
+ Identical to CDROMREADRAW except that block size is
+ CD_FRAMESIZE_RAW0 (2336) bytes
+
+
+
+CDROMREADAUDIO
+ (struct cdrom_read_audio)
+
+ usage::
+
+ struct cdrom_read_audio ra;
+
+ ioctl(fd, CDROMREADAUDIO, &ra);
+
+ inputs:
+ cdrom_read_audio structure containing read start
+ point and length
+
+ outputs:
+ audio data, returned to buffer indicated by ra
+
+
+ error return:
+ - EINVAL format not CDROM_MSF or CDROM_LBA
+ - EINVAL nframes not in range [1 75]
+ - ENXIO drive has no queue (probably means invalid fd)
+ - ENOMEM out of memory
+
+
+CDROMEJECT_SW
+ enable(1)/disable(0) auto-ejecting
+
+
+ usage::
+
+ int val;
+
+ ioctl(fd, CDROMEJECT_SW, val);
+
+ inputs:
+ Flag specifying auto-eject flag.
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS Drive is not capable of ejecting.
+ - EBUSY Door is locked
+
+
+
+
+CDROMMULTISESSION
+ Obtain the start-of-last-session address of multi session disks
+
+ (struct cdrom_multisession)
+
+ usage::
+
+ struct cdrom_multisession ms_info;
+
+ ioctl(fd, CDROMMULTISESSION, &ms_info);
+
+ inputs:
+ cdrom_multisession structure containing desired
+
+ format.
+
+ outputs:
+ cdrom_multisession structure is filled with last_session
+ information.
+
+ error return:
+ - EINVAL format not CDROM_MSF or CDROM_LBA
+
+
+CDROM_GET_MCN
+ Obtain the "Universal Product Code"
+ if available
+
+ (struct cdrom_mcn)
+
+
+ usage::
+
+ struct cdrom_mcn mcn;
+
+ ioctl(fd, CDROM_GET_MCN, &mcn);
+
+ inputs:
+ none
+
+
+ outputs:
+ Universal Product Code
+
+
+ error return:
+ - ENOSYS Drive is not capable of reading MCN data.
+
+ notes:
+ - Source code comments state::
+
+ The following function is implemented, although very few
+ audio discs give Universal Product Code information, which
+ should just be the Medium Catalog Number on the box. Note,
+ that the way the code is written on the CD is /not/ uniform
+ across all discs!
+
+
+
+
+CDROM_GET_UPC
+ CDROM_GET_MCN (deprecated)
+
+
+ Not implemented, as of 2.6.8.1
+
+
+
+CDROMRESET
+ hard-reset the drive
+
+
+ usage::
+
+ ioctl(fd, CDROMRESET, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ none
+
+
+ error return:
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - ENOSYS Drive is not capable of resetting.
+
+
+
+
+CDROMREADCOOKED
+ read data in cooked mode
+
+
+ usage::
+
+ u8 buffer[CD_FRAMESIZE]
+
+ ioctl(fd, CDROMREADCOOKED, buffer);
+
+ inputs:
+ none
+
+
+ outputs:
+ 2048 bytes of data, "cooked" mode.
+
+
+ notes:
+ Not implemented on all drives.
+
+
+
+
+
+CDROMREADALL
+ read all 2646 bytes
+
+
+ Same as CDROMREADCOOKED, but reads 2646 bytes.
+
+
+
+CDROMSEEK
+ seek msf address
+
+
+ usage::
+
+ struct cdrom_msf msf;
+
+ ioctl(fd, CDROMSEEK, &msf);
+
+ inputs:
+ MSF address to seek to.
+
+
+ outputs:
+ none
+
+
+
+
+CDROMPLAYBLK
+ scsi-cd only
+
+ (struct cdrom_blk)
+
+
+ usage::
+
+ struct cdrom_blk blk;
+
+ ioctl(fd, CDROMPLAYBLK, &blk);
+
+ inputs:
+ Region to play
+
+
+ outputs:
+ none
+
+
+
+
+CDROMGETSPINDOWN
+ usage::
+
+ char spindown;
+
+ ioctl(fd, CDROMGETSPINDOWN, &spindown);
+
+ inputs:
+ none
+
+
+ outputs:
+ The value of the current 4-bit spindown value.
+
+
+
+
+
+CDROMSETSPINDOWN
+ usage::
+
+ char spindown
+
+ ioctl(fd, CDROMSETSPINDOWN, &spindown);
+
+ inputs:
+ 4-bit value used to control spindown (TODO: more detail here)
+
+
+ outputs:
+ none
+
+
+
+
+
+
+CDROM_SET_OPTIONS
+ Set behavior options
+
+
+ usage::
+
+ int options;
+
+ ioctl(fd, CDROM_SET_OPTIONS, options);
+
+ inputs:
+ New values for drive options. The logical 'or' of:
+
+ ============== ==================================
+ CDO_AUTO_CLOSE close tray on first open(2)
+ CDO_AUTO_EJECT open tray on last release
+ CDO_USE_FFLAGS use O_NONBLOCK information on open
+ CDO_LOCK lock tray on open files
+ CDO_CHECK_TYPE check type on open for data
+ ============== ==================================
+
+ outputs:
+ Returns the resulting options settings in the
+ ioctl return value. Returns -1 on error.
+
+ error return:
+ - ENOSYS selected option(s) not supported by drive.
+
+
+
+
+CDROM_CLEAR_OPTIONS
+ Clear behavior options
+
+
+ Same as CDROM_SET_OPTIONS, except that selected options are
+ turned off.
+
+
+
+CDROM_SELECT_SPEED
+ Set the CD-ROM speed
+
+
+ usage::
+
+ int speed;
+
+ ioctl(fd, CDROM_SELECT_SPEED, speed);
+
+ inputs:
+ New drive speed.
+
+
+ outputs:
+ none
+
+
+ error return:
+ - ENOSYS speed selection not supported by drive.
+
+
+
+CDROM_SELECT_DISC
+ Select disc (for juke-boxes)
+
+
+ usage::
+
+ int disk;
+
+ ioctl(fd, CDROM_SELECT_DISC, disk);
+
+ inputs:
+ Disk to load into drive.
+
+
+ outputs:
+ none
+
+
+ error return:
+ - EINVAL Disk number beyond capacity of drive
+
+
+
+CDROM_MEDIA_CHANGED
+ Check is media changed
+
+
+ usage::
+
+ int slot;
+
+ ioctl(fd, CDROM_MEDIA_CHANGED, slot);
+
+ inputs:
+ Slot number to be tested, always zero except for jukeboxes.
+
+ May also be special values CDSL_NONE or CDSL_CURRENT
+
+ outputs:
+ Ioctl return value is 0 or 1 depending on whether the media
+
+ has been changed, or -1 on error.
+
+ error returns:
+ - ENOSYS Drive can't detect media change
+ - EINVAL Slot number beyond capacity of drive
+ - ENOMEM Out of memory
+
+
+
+CDROM_DRIVE_STATUS
+ Get tray position, etc.
+
+
+ usage::
+
+ int slot;
+
+ ioctl(fd, CDROM_DRIVE_STATUS, slot);
+
+ inputs:
+ Slot number to be tested, always zero except for jukeboxes.
+
+ May also be special values CDSL_NONE or CDSL_CURRENT
+
+ outputs:
+ Ioctl return value will be one of the following values
+
+ from <linux/cdrom.h>:
+
+ =================== ==========================
+ CDS_NO_INFO Information not available.
+ CDS_NO_DISC
+ CDS_TRAY_OPEN
+ CDS_DRIVE_NOT_READY
+ CDS_DISC_OK
+ -1 error
+ =================== ==========================
+
+ error returns:
+ - ENOSYS Drive can't detect drive status
+ - EINVAL Slot number beyond capacity of drive
+ - ENOMEM Out of memory
+
+
+
+
+CDROM_DISC_STATUS
+ Get disc type, etc.
+
+
+ usage::
+
+ ioctl(fd, CDROM_DISC_STATUS, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ Ioctl return value will be one of the following values
+
+ from <linux/cdrom.h>:
+
+ - CDS_NO_INFO
+ - CDS_AUDIO
+ - CDS_MIXED
+ - CDS_XA_2_2
+ - CDS_XA_2_1
+ - CDS_DATA_1
+
+ error returns:
+ none at present
+
+ notes:
+ - Source code comments state::
+
+
+ Ok, this is where problems start. The current interface for
+ the CDROM_DISC_STATUS ioctl is flawed. It makes the false
+ assumption that CDs are all CDS_DATA_1 or all CDS_AUDIO, etc.
+ Unfortunately, while this is often the case, it is also
+ very common for CDs to have some tracks with data, and some
+ tracks with audio. Just because I feel like it, I declare
+ the following to be the best way to cope. If the CD has
+ ANY data tracks on it, it will be returned as a data CD.
+ If it has any XA tracks, I will return it as that. Now I
+ could simplify this interface by combining these returns with
+ the above, but this more clearly demonstrates the problem
+ with the current interface. Too bad this wasn't designed
+ to use bitmasks... -Erik
+
+ Well, now we have the option CDS_MIXED: a mixed-type CD.
+ User level programmers might feel the ioctl is not very
+ useful.
+ ---david
+
+
+
+
+CDROM_CHANGER_NSLOTS
+ Get number of slots
+
+
+ usage::
+
+ ioctl(fd, CDROM_CHANGER_NSLOTS, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ The ioctl return value will be the number of slots in a
+ CD changer. Typically 1 for non-multi-disk devices.
+
+ error returns:
+ none
+
+
+
+CDROM_LOCKDOOR
+ lock or unlock door
+
+
+ usage::
+
+ int lock;
+
+ ioctl(fd, CDROM_LOCKDOOR, lock);
+
+ inputs:
+ Door lock flag, 1=lock, 0=unlock
+
+
+ outputs:
+ none
+
+
+ error returns:
+ - EDRIVE_CANT_DO_THIS
+
+ Door lock function not supported.
+ - EBUSY
+
+ Attempt to unlock when multiple users
+ have the drive open and not CAP_SYS_ADMIN
+
+ notes:
+ As of 2.6.8.1, the lock flag is a global lock, meaning that
+ all CD drives will be locked or unlocked together. This is
+ probably a bug.
+
+ The EDRIVE_CANT_DO_THIS value is defined in <linux/cdrom.h>
+ and is currently (2.6.8.1) the same as EOPNOTSUPP
+
+
+
+CDROM_DEBUG
+ Turn debug messages on/off
+
+
+ usage::
+
+ int debug;
+
+ ioctl(fd, CDROM_DEBUG, debug);
+
+ inputs:
+ Cdrom debug flag, 0=disable, 1=enable
+
+
+ outputs:
+ The ioctl return value will be the new debug flag.
+
+
+ error return:
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+
+
+
+CDROM_GET_CAPABILITY
+ get capabilities
+
+
+ usage::
+
+ ioctl(fd, CDROM_GET_CAPABILITY, 0);
+
+
+ inputs:
+ none
+
+
+ outputs:
+ The ioctl return value is the current device capability
+ flags. See CDC_CLOSE_TRAY, CDC_OPEN_TRAY, etc.
+
+
+
+CDROMAUDIOBUFSIZ
+ set the audio buffer size
+
+
+ usage::
+
+ int arg;
+
+ ioctl(fd, CDROMAUDIOBUFSIZ, val);
+
+ inputs:
+ New audio buffer size
+
+
+ outputs:
+ The ioctl return value is the new audio buffer size, or -1
+ on error.
+
+ error return:
+ - ENOSYS Not supported by this driver.
+
+ notes:
+ Not supported by all drivers.
+
+
+
+
+DVD_READ_STRUCT Read structure
+
+ usage::
+
+ dvd_struct s;
+
+ ioctl(fd, DVD_READ_STRUCT, &s);
+
+ inputs:
+ dvd_struct structure, containing:
+
+ =================== ==========================================
+ type specifies the information desired, one of
+ DVD_STRUCT_PHYSICAL, DVD_STRUCT_COPYRIGHT,
+ DVD_STRUCT_DISCKEY, DVD_STRUCT_BCA,
+ DVD_STRUCT_MANUFACT
+ physical.layer_num desired layer, indexed from 0
+ copyright.layer_num desired layer, indexed from 0
+ disckey.agid
+ =================== ==========================================
+
+ outputs:
+ dvd_struct structure, containing:
+
+ =================== ================================
+ physical for type == DVD_STRUCT_PHYSICAL
+ copyright for type == DVD_STRUCT_COPYRIGHT
+ disckey.value for type == DVD_STRUCT_DISCKEY
+ bca.{len,value} for type == DVD_STRUCT_BCA
+ manufact.{len,valu} for type == DVD_STRUCT_MANUFACT
+ =================== ================================
+
+ error returns:
+ - EINVAL physical.layer_num exceeds number of layers
+ - EIO Received invalid response from drive
+
+
+
+DVD_WRITE_STRUCT Write structure
+
+ Not implemented, as of 2.6.8.1
+
+
+
+DVD_AUTH Authentication
+
+ usage::
+
+ dvd_authinfo ai;
+
+ ioctl(fd, DVD_AUTH, &ai);
+
+ inputs:
+ dvd_authinfo structure. See <linux/cdrom.h>
+
+
+ outputs:
+ dvd_authinfo structure.
+
+
+ error return:
+ - ENOTTY ai.type not recognized.
+
+
+
+CDROM_SEND_PACKET
+ send a packet to the drive
+
+
+ usage::
+
+ struct cdrom_generic_command cgc;
+
+ ioctl(fd, CDROM_SEND_PACKET, &cgc);
+
+ inputs:
+ cdrom_generic_command structure containing the packet to send.
+
+
+ outputs:
+ none
+
+ cdrom_generic_command structure containing results.
+
+ error return:
+ - EIO
+
+ command failed.
+ - EPERM
+
+ Operation not permitted, either because a
+ write command was attempted on a drive which
+ is opened read-only, or because the command
+ requires CAP_SYS_RAWIO
+ - EINVAL
+
+ cgc.data_direction not set
+
+
+
+CDROM_NEXT_WRITABLE
+ get next writable block
+
+
+ usage::
+
+ long next;
+
+ ioctl(fd, CDROM_NEXT_WRITABLE, &next);
+
+ inputs:
+ none
+
+
+ outputs:
+ The next writable block.
+
+
+ notes:
+ If the device does not support this ioctl directly, the
+
+ ioctl will return CDROM_LAST_WRITTEN + 7.
+
+
+
+CDROM_LAST_WRITTEN
+ get last block written on disc
+
+
+ usage::
+
+ long last;
+
+ ioctl(fd, CDROM_LAST_WRITTEN, &last);
+
+ inputs:
+ none
+
+
+ outputs:
+ The last block written on disc
+
+
+ notes:
+ If the device does not support this ioctl directly, the
+ result is derived from the disc's table of contents. If the
+ table of contents can't be read, this ioctl returns an
+ error.
diff --git a/Documentation/ioctl/cdrom.txt b/Documentation/ioctl/cdrom.txt
deleted file mode 100644
index a4d62a9d6771..000000000000
--- a/Documentation/ioctl/cdrom.txt
+++ /dev/null
@@ -1,967 +0,0 @@
- Summary of CDROM ioctl calls.
- ============================
-
- Edward A. Falk <efalk@google.com>
-
- November, 2004
-
-This document attempts to describe the ioctl(2) calls supported by
-the CDROM layer. These are by-and-large implemented (as of Linux 2.6)
-in drivers/cdrom/cdrom.c and drivers/block/scsi_ioctl.c
-
-ioctl values are listed in <linux/cdrom.h>. As of this writing, they
-are as follows:
-
- CDROMPAUSE Pause Audio Operation
- CDROMRESUME Resume paused Audio Operation
- CDROMPLAYMSF Play Audio MSF (struct cdrom_msf)
- CDROMPLAYTRKIND Play Audio Track/index (struct cdrom_ti)
- CDROMREADTOCHDR Read TOC header (struct cdrom_tochdr)
- CDROMREADTOCENTRY Read TOC entry (struct cdrom_tocentry)
- CDROMSTOP Stop the cdrom drive
- CDROMSTART Start the cdrom drive
- CDROMEJECT Ejects the cdrom media
- CDROMVOLCTRL Control output volume (struct cdrom_volctrl)
- CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl)
- CDROMREADMODE2 Read CDROM mode 2 data (2336 Bytes)
- (struct cdrom_read)
- CDROMREADMODE1 Read CDROM mode 1 data (2048 Bytes)
- (struct cdrom_read)
- CDROMREADAUDIO (struct cdrom_read_audio)
- CDROMEJECT_SW enable(1)/disable(0) auto-ejecting
- CDROMMULTISESSION Obtain the start-of-last-session
- address of multi session disks
- (struct cdrom_multisession)
- CDROM_GET_MCN Obtain the "Universal Product Code"
- if available (struct cdrom_mcn)
- CDROM_GET_UPC Deprecated, use CDROM_GET_MCN instead.
- CDROMRESET hard-reset the drive
- CDROMVOLREAD Get the drive's volume setting
- (struct cdrom_volctrl)
- CDROMREADRAW read data in raw mode (2352 Bytes)
- (struct cdrom_read)
- CDROMREADCOOKED read data in cooked mode
- CDROMSEEK seek msf address
- CDROMPLAYBLK scsi-cd only, (struct cdrom_blk)
- CDROMREADALL read all 2646 bytes
- CDROMGETSPINDOWN return 4-bit spindown value
- CDROMSETSPINDOWN set 4-bit spindown value
- CDROMCLOSETRAY pendant of CDROMEJECT
- CDROM_SET_OPTIONS Set behavior options
- CDROM_CLEAR_OPTIONS Clear behavior options
- CDROM_SELECT_SPEED Set the CD-ROM speed
- CDROM_SELECT_DISC Select disc (for juke-boxes)
- CDROM_MEDIA_CHANGED Check is media changed
- CDROM_DRIVE_STATUS Get tray position, etc.
- CDROM_DISC_STATUS Get disc type, etc.
- CDROM_CHANGER_NSLOTS Get number of slots
- CDROM_LOCKDOOR lock or unlock door
- CDROM_DEBUG Turn debug messages on/off
- CDROM_GET_CAPABILITY get capabilities
- CDROMAUDIOBUFSIZ set the audio buffer size
- DVD_READ_STRUCT Read structure
- DVD_WRITE_STRUCT Write structure
- DVD_AUTH Authentication
- CDROM_SEND_PACKET send a packet to the drive
- CDROM_NEXT_WRITABLE get next writable block
- CDROM_LAST_WRITTEN get last block written on disc
-
-
-The information that follows was determined from reading kernel source
-code. It is likely that some corrections will be made over time.
-
-
-
-
-
-
-
-General:
-
- Unless otherwise specified, all ioctl calls return 0 on success
- and -1 with errno set to an appropriate value on error. (Some
- ioctls return non-negative data values.)
-
- Unless otherwise specified, all ioctl calls return -1 and set
- errno to EFAULT on a failed attempt to copy data to or from user
- address space.
-
- Individual drivers may return error codes not listed here.
-
- Unless otherwise specified, all data structures and constants
- are defined in <linux/cdrom.h>
-
-
-
-
-CDROMPAUSE Pause Audio Operation
-
- usage:
-
- ioctl(fd, CDROMPAUSE, 0);
-
- inputs: none
-
- outputs: none
-
- error return:
- ENOSYS cd drive not audio-capable.
-
-
-CDROMRESUME Resume paused Audio Operation
-
- usage:
-
- ioctl(fd, CDROMRESUME, 0);
-
- inputs: none
-
- outputs: none
-
- error return:
- ENOSYS cd drive not audio-capable.
-
-
-CDROMPLAYMSF Play Audio MSF (struct cdrom_msf)
-
- usage:
-
- struct cdrom_msf msf;
- ioctl(fd, CDROMPLAYMSF, &msf);
-
- inputs:
- cdrom_msf structure, describing a segment of music to play
-
- outputs: none
-
- error return:
- ENOSYS cd drive not audio-capable.
-
- notes:
- MSF stands for minutes-seconds-frames
- LBA stands for logical block address
-
- Segment is described as start and end times, where each time
- is described as minutes:seconds:frames. A frame is 1/75 of
- a second.
-
-
-CDROMPLAYTRKIND Play Audio Track/index (struct cdrom_ti)
-
- usage:
-
- struct cdrom_ti ti;
- ioctl(fd, CDROMPLAYTRKIND, &ti);
-
- inputs:
- cdrom_ti structure, describing a segment of music to play
-
- outputs: none
-
- error return:
- ENOSYS cd drive not audio-capable.
-
- notes:
- Segment is described as start and end times, where each time
- is described as a track and an index.
-
-
-
-CDROMREADTOCHDR Read TOC header (struct cdrom_tochdr)
-
- usage:
-
- cdrom_tochdr header;
- ioctl(fd, CDROMREADTOCHDR, &header);
-
- inputs:
- cdrom_tochdr structure
-
- outputs:
- cdrom_tochdr structure
-
- error return:
- ENOSYS cd drive not audio-capable.
-
-
-
-CDROMREADTOCENTRY Read TOC entry (struct cdrom_tocentry)
-
- usage:
-
- struct cdrom_tocentry entry;
- ioctl(fd, CDROMREADTOCENTRY, &entry);
-
- inputs:
- cdrom_tocentry structure
-
- outputs:
- cdrom_tocentry structure
-
- error return:
- ENOSYS cd drive not audio-capable.
- EINVAL entry.cdte_format not CDROM_MSF or CDROM_LBA
- EINVAL requested track out of bounds
- EIO I/O error reading TOC
-
- notes:
- TOC stands for Table Of Contents
- MSF stands for minutes-seconds-frames
- LBA stands for logical block address
-
-
-
-CDROMSTOP Stop the cdrom drive
-
- usage:
-
- ioctl(fd, CDROMSTOP, 0);
-
- inputs: none
-
- outputs: none
-
- error return:
- ENOSYS cd drive not audio-capable.
-
- notes:
- Exact interpretation of this ioctl depends on the device,
- but most seem to spin the drive down.
-
-
-CDROMSTART Start the cdrom drive
-
- usage:
-
- ioctl(fd, CDROMSTART, 0);
-
- inputs: none
-
- outputs: none
-
- error return:
- ENOSYS cd drive not audio-capable.
-
- notes:
- Exact interpretation of this ioctl depends on the device,
- but most seem to spin the drive up and/or close the tray.
- Other devices ignore the ioctl completely.
-
-
-CDROMEJECT Ejects the cdrom media
-
- usage:
-
- ioctl(fd, CDROMEJECT, 0);
-
- inputs: none
-
- outputs: none
-
- error returns:
- ENOSYS cd drive not capable of ejecting
- EBUSY other processes are accessing drive, or door is locked
-
- notes:
- See CDROM_LOCKDOOR, below.
-
-
-
-CDROMCLOSETRAY pendant of CDROMEJECT
-
- usage:
-
- ioctl(fd, CDROMCLOSETRAY, 0);
-
- inputs: none
-
- outputs: none
-
- error returns:
- ENOSYS cd drive not capable of closing the tray
- EBUSY other processes are accessing drive, or door is locked
-
- notes:
- See CDROM_LOCKDOOR, below.
-
-
-
-CDROMVOLCTRL Control output volume (struct cdrom_volctrl)
-
- usage:
-
- struct cdrom_volctrl volume;
- ioctl(fd, CDROMVOLCTRL, &volume);
-
- inputs:
- cdrom_volctrl structure containing volumes for up to 4
- channels.
-
- outputs: none
-
- error return:
- ENOSYS cd drive not audio-capable.
-
-
-
-CDROMVOLREAD Get the drive's volume setting
- (struct cdrom_volctrl)
-
- usage:
-
- struct cdrom_volctrl volume;
- ioctl(fd, CDROMVOLREAD, &volume);
-
- inputs: none
-
- outputs:
- The current volume settings.
-
- error return:
- ENOSYS cd drive not audio-capable.
-
-
-
-CDROMSUBCHNL Read subchannel data (struct cdrom_subchnl)
-
- usage:
-
- struct cdrom_subchnl q;
- ioctl(fd, CDROMSUBCHNL, &q);
-
- inputs:
- cdrom_subchnl structure
-
- outputs:
- cdrom_subchnl structure
-
- error return:
- ENOSYS cd drive not audio-capable.
- EINVAL format not CDROM_MSF or CDROM_LBA
-
- notes:
- Format is converted to CDROM_MSF or CDROM_LBA
- as per user request on return
-
-
-
-CDROMREADRAW read data in raw mode (2352 Bytes)
- (struct cdrom_read)
-
- usage:
-
- union {
- struct cdrom_msf msf; /* input */
- char buffer[CD_FRAMESIZE_RAW]; /* return */
- } arg;
- ioctl(fd, CDROMREADRAW, &arg);
-
- inputs:
- cdrom_msf structure indicating an address to read.
- Only the start values are significant.
-
- outputs:
- Data written to address provided by user.
-
- error return:
- EINVAL address less than 0, or msf less than 0:2:0
- ENOMEM out of memory
-
- notes:
- As of 2.6.8.1, comments in <linux/cdrom.h> indicate that this
- ioctl accepts a cdrom_read structure, but actual source code
- reads a cdrom_msf structure and writes a buffer of data to
- the same address.
-
- MSF values are converted to LBA values via this formula:
-
- lba = (((m * CD_SECS) + s) * CD_FRAMES + f) - CD_MSF_OFFSET;
-
-
-
-
-CDROMREADMODE1 Read CDROM mode 1 data (2048 Bytes)
- (struct cdrom_read)
-
- notes:
- Identical to CDROMREADRAW except that block size is
- CD_FRAMESIZE (2048) bytes
-
-
-
-CDROMREADMODE2 Read CDROM mode 2 data (2336 Bytes)
- (struct cdrom_read)
-
- notes:
- Identical to CDROMREADRAW except that block size is
- CD_FRAMESIZE_RAW0 (2336) bytes
-
-
-
-CDROMREADAUDIO (struct cdrom_read_audio)
-
- usage:
-
- struct cdrom_read_audio ra;
- ioctl(fd, CDROMREADAUDIO, &ra);
-
- inputs:
- cdrom_read_audio structure containing read start
- point and length
-
- outputs:
- audio data, returned to buffer indicated by ra
-
- error return:
- EINVAL format not CDROM_MSF or CDROM_LBA
- EINVAL nframes not in range [1 75]
- ENXIO drive has no queue (probably means invalid fd)
- ENOMEM out of memory
-
-
-CDROMEJECT_SW enable(1)/disable(0) auto-ejecting
-
- usage:
-
- int val;
- ioctl(fd, CDROMEJECT_SW, val);
-
- inputs:
- Flag specifying auto-eject flag.
-
- outputs: none
-
- error return:
- ENOSYS Drive is not capable of ejecting.
- EBUSY Door is locked
-
-
-
-
-CDROMMULTISESSION Obtain the start-of-last-session
- address of multi session disks
- (struct cdrom_multisession)
- usage:
-
- struct cdrom_multisession ms_info;
- ioctl(fd, CDROMMULTISESSION, &ms_info);
-
- inputs:
- cdrom_multisession structure containing desired
- format.
-
- outputs:
- cdrom_multisession structure is filled with last_session
- information.
-
- error return:
- EINVAL format not CDROM_MSF or CDROM_LBA
-
-
-CDROM_GET_MCN Obtain the "Universal Product Code"
- if available (struct cdrom_mcn)
-
- usage:
-
- struct cdrom_mcn mcn;
- ioctl(fd, CDROM_GET_MCN, &mcn);
-
- inputs: none
-
- outputs:
- Universal Product Code
-
- error return:
- ENOSYS Drive is not capable of reading MCN data.
-
- notes:
- Source code comments state:
-
- The following function is implemented, although very few
- audio discs give Universal Product Code information, which
- should just be the Medium Catalog Number on the box. Note,
- that the way the code is written on the CD is /not/ uniform
- across all discs!
-
-
-
-
-CDROM_GET_UPC CDROM_GET_MCN (deprecated)
-
- Not implemented, as of 2.6.8.1
-
-
-
-CDROMRESET hard-reset the drive
-
- usage:
-
- ioctl(fd, CDROMRESET, 0);
-
- inputs: none
-
- outputs: none
-
- error return:
- EACCES Access denied: requires CAP_SYS_ADMIN
- ENOSYS Drive is not capable of resetting.
-
-
-
-
-CDROMREADCOOKED read data in cooked mode
-
- usage:
-
- u8 buffer[CD_FRAMESIZE]
- ioctl(fd, CDROMREADCOOKED, buffer);
-
- inputs: none
-
- outputs:
- 2048 bytes of data, "cooked" mode.
-
- notes:
- Not implemented on all drives.
-
-
-
-
-CDROMREADALL read all 2646 bytes
-
- Same as CDROMREADCOOKED, but reads 2646 bytes.
-
-
-
-CDROMSEEK seek msf address
-
- usage:
-
- struct cdrom_msf msf;
- ioctl(fd, CDROMSEEK, &msf);
-
- inputs:
- MSF address to seek to.
-
- outputs: none
-
-
-
-CDROMPLAYBLK scsi-cd only, (struct cdrom_blk)
-
- usage:
-
- struct cdrom_blk blk;
- ioctl(fd, CDROMPLAYBLK, &blk);
-
- inputs:
- Region to play
-
- outputs: none
-
-
-
-CDROMGETSPINDOWN
-
- usage:
-
- char spindown;
- ioctl(fd, CDROMGETSPINDOWN, &spindown);
-
- inputs: none
-
- outputs:
- The value of the current 4-bit spindown value.
-
-
-
-
-CDROMSETSPINDOWN
-
- usage:
-
- char spindown
- ioctl(fd, CDROMSETSPINDOWN, &spindown);
-
- inputs:
- 4-bit value used to control spindown (TODO: more detail here)
-
- outputs: none
-
-
-
-
-
-CDROM_SET_OPTIONS Set behavior options
-
- usage:
-
- int options;
- ioctl(fd, CDROM_SET_OPTIONS, options);
-
- inputs:
- New values for drive options. The logical 'or' of:
- CDO_AUTO_CLOSE close tray on first open(2)
- CDO_AUTO_EJECT open tray on last release
- CDO_USE_FFLAGS use O_NONBLOCK information on open
- CDO_LOCK lock tray on open files
- CDO_CHECK_TYPE check type on open for data
-
- outputs:
- Returns the resulting options settings in the
- ioctl return value. Returns -1 on error.
-
- error return:
- ENOSYS selected option(s) not supported by drive.
-
-
-
-
-CDROM_CLEAR_OPTIONS Clear behavior options
-
- Same as CDROM_SET_OPTIONS, except that selected options are
- turned off.
-
-
-
-CDROM_SELECT_SPEED Set the CD-ROM speed
-
- usage:
-
- int speed;
- ioctl(fd, CDROM_SELECT_SPEED, speed);
-
- inputs:
- New drive speed.
-
- outputs: none
-
- error return:
- ENOSYS speed selection not supported by drive.
-
-
-
-CDROM_SELECT_DISC Select disc (for juke-boxes)
-
- usage:
-
- int disk;
- ioctl(fd, CDROM_SELECT_DISC, disk);
-
- inputs:
- Disk to load into drive.
-
- outputs: none
-
- error return:
- EINVAL Disk number beyond capacity of drive
-
-
-
-CDROM_MEDIA_CHANGED Check is media changed
-
- usage:
-
- int slot;
- ioctl(fd, CDROM_MEDIA_CHANGED, slot);
-
- inputs:
- Slot number to be tested, always zero except for jukeboxes.
- May also be special values CDSL_NONE or CDSL_CURRENT
-
- outputs:
- Ioctl return value is 0 or 1 depending on whether the media
- has been changed, or -1 on error.
-
- error returns:
- ENOSYS Drive can't detect media change
- EINVAL Slot number beyond capacity of drive
- ENOMEM Out of memory
-
-
-
-CDROM_DRIVE_STATUS Get tray position, etc.
-
- usage:
-
- int slot;
- ioctl(fd, CDROM_DRIVE_STATUS, slot);
-
- inputs:
- Slot number to be tested, always zero except for jukeboxes.
- May also be special values CDSL_NONE or CDSL_CURRENT
-
- outputs:
- Ioctl return value will be one of the following values
- from <linux/cdrom.h>:
-
- CDS_NO_INFO Information not available.
- CDS_NO_DISC
- CDS_TRAY_OPEN
- CDS_DRIVE_NOT_READY
- CDS_DISC_OK
- -1 error
-
- error returns:
- ENOSYS Drive can't detect drive status
- EINVAL Slot number beyond capacity of drive
- ENOMEM Out of memory
-
-
-
-
-CDROM_DISC_STATUS Get disc type, etc.
-
- usage:
-
- ioctl(fd, CDROM_DISC_STATUS, 0);
-
- inputs: none
-
- outputs:
- Ioctl return value will be one of the following values
- from <linux/cdrom.h>:
- CDS_NO_INFO
- CDS_AUDIO
- CDS_MIXED
- CDS_XA_2_2
- CDS_XA_2_1
- CDS_DATA_1
-
- error returns: none at present
-
- notes:
- Source code comments state:
-
- Ok, this is where problems start. The current interface for
- the CDROM_DISC_STATUS ioctl is flawed. It makes the false
- assumption that CDs are all CDS_DATA_1 or all CDS_AUDIO, etc.
- Unfortunately, while this is often the case, it is also
- very common for CDs to have some tracks with data, and some
- tracks with audio. Just because I feel like it, I declare
- the following to be the best way to cope. If the CD has
- ANY data tracks on it, it will be returned as a data CD.
- If it has any XA tracks, I will return it as that. Now I
- could simplify this interface by combining these returns with
- the above, but this more clearly demonstrates the problem
- with the current interface. Too bad this wasn't designed
- to use bitmasks... -Erik
-
- Well, now we have the option CDS_MIXED: a mixed-type CD.
- User level programmers might feel the ioctl is not very
- useful.
- ---david
-
-
-
-
-CDROM_CHANGER_NSLOTS Get number of slots
-
- usage:
-
- ioctl(fd, CDROM_CHANGER_NSLOTS, 0);
-
- inputs: none
-
- outputs:
- The ioctl return value will be the number of slots in a
- CD changer. Typically 1 for non-multi-disk devices.
-
- error returns: none
-
-
-
-CDROM_LOCKDOOR lock or unlock door
-
- usage:
-
- int lock;
- ioctl(fd, CDROM_LOCKDOOR, lock);
-
- inputs:
- Door lock flag, 1=lock, 0=unlock
-
- outputs: none
-
- error returns:
- EDRIVE_CANT_DO_THIS Door lock function not supported.
- EBUSY Attempt to unlock when multiple users
- have the drive open and not CAP_SYS_ADMIN
-
- notes:
- As of 2.6.8.1, the lock flag is a global lock, meaning that
- all CD drives will be locked or unlocked together. This is
- probably a bug.
-
- The EDRIVE_CANT_DO_THIS value is defined in <linux/cdrom.h>
- and is currently (2.6.8.1) the same as EOPNOTSUPP
-
-
-
-CDROM_DEBUG Turn debug messages on/off
-
- usage:
-
- int debug;
- ioctl(fd, CDROM_DEBUG, debug);
-
- inputs:
- Cdrom debug flag, 0=disable, 1=enable
-
- outputs:
- The ioctl return value will be the new debug flag.
-
- error return:
- EACCES Access denied: requires CAP_SYS_ADMIN
-
-
-
-CDROM_GET_CAPABILITY get capabilities
-
- usage:
-
- ioctl(fd, CDROM_GET_CAPABILITY, 0);
-
- inputs: none
-
- outputs:
- The ioctl return value is the current device capability
- flags. See CDC_CLOSE_TRAY, CDC_OPEN_TRAY, etc.
-
-
-
-CDROMAUDIOBUFSIZ set the audio buffer size
-
- usage:
-
- int arg;
- ioctl(fd, CDROMAUDIOBUFSIZ, val);
-
- inputs:
- New audio buffer size
-
- outputs:
- The ioctl return value is the new audio buffer size, or -1
- on error.
-
- error return:
- ENOSYS Not supported by this driver.
-
- notes:
- Not supported by all drivers.
-
-
-
-DVD_READ_STRUCT Read structure
-
- usage:
-
- dvd_struct s;
- ioctl(fd, DVD_READ_STRUCT, &s);
-
- inputs:
- dvd_struct structure, containing:
- type specifies the information desired, one of
- DVD_STRUCT_PHYSICAL, DVD_STRUCT_COPYRIGHT,
- DVD_STRUCT_DISCKEY, DVD_STRUCT_BCA,
- DVD_STRUCT_MANUFACT
- physical.layer_num desired layer, indexed from 0
- copyright.layer_num desired layer, indexed from 0
- disckey.agid
-
- outputs:
- dvd_struct structure, containing:
- physical for type == DVD_STRUCT_PHYSICAL
- copyright for type == DVD_STRUCT_COPYRIGHT
- disckey.value for type == DVD_STRUCT_DISCKEY
- bca.{len,value} for type == DVD_STRUCT_BCA
- manufact.{len,valu} for type == DVD_STRUCT_MANUFACT
-
- error returns:
- EINVAL physical.layer_num exceeds number of layers
- EIO Received invalid response from drive
-
-
-
-DVD_WRITE_STRUCT Write structure
-
- Not implemented, as of 2.6.8.1
-
-
-
-DVD_AUTH Authentication
-
- usage:
-
- dvd_authinfo ai;
- ioctl(fd, DVD_AUTH, &ai);
-
- inputs:
- dvd_authinfo structure. See <linux/cdrom.h>
-
- outputs:
- dvd_authinfo structure.
-
- error return:
- ENOTTY ai.type not recognized.
-
-
-
-CDROM_SEND_PACKET send a packet to the drive
-
- usage:
-
- struct cdrom_generic_command cgc;
- ioctl(fd, CDROM_SEND_PACKET, &cgc);
-
- inputs:
- cdrom_generic_command structure containing the packet to send.
-
- outputs: none
- cdrom_generic_command structure containing results.
-
- error return:
- EIO command failed.
- EPERM Operation not permitted, either because a
- write command was attempted on a drive which
- is opened read-only, or because the command
- requires CAP_SYS_RAWIO
- EINVAL cgc.data_direction not set
-
-
-
-CDROM_NEXT_WRITABLE get next writable block
-
- usage:
-
- long next;
- ioctl(fd, CDROM_NEXT_WRITABLE, &next);
-
- inputs: none
-
- outputs:
- The next writable block.
-
- notes:
- If the device does not support this ioctl directly, the
- ioctl will return CDROM_LAST_WRITTEN + 7.
-
-
-
-CDROM_LAST_WRITTEN get last block written on disc
-
- usage:
-
- long last;
- ioctl(fd, CDROM_LAST_WRITTEN, &last);
-
- inputs: none
-
- outputs:
- The last block written on disc
-
- notes:
- If the device does not support this ioctl directly, the
- result is derived from the disc's table of contents. If the
- table of contents can't be read, this ioctl returns an
- error.
diff --git a/Documentation/ioctl/hdio.rst b/Documentation/ioctl/hdio.rst
new file mode 100644
index 000000000000..e822e3dff176
--- /dev/null
+++ b/Documentation/ioctl/hdio.rst
@@ -0,0 +1,1342 @@
+==============================
+Summary of `HDIO_` ioctl calls
+==============================
+
+- Edward A. Falk <efalk@google.com>
+
+November, 2004
+
+This document attempts to describe the ioctl(2) calls supported by
+the HD/IDE layer. These are by-and-large implemented (as of Linux 2.6)
+in drivers/ide/ide.c and drivers/block/scsi_ioctl.c
+
+ioctl values are listed in <linux/hdreg.h>. As of this writing, they
+are as follows:
+
+ ioctls that pass argument pointers to user space:
+
+ ======================= =======================================
+ HDIO_GETGEO get device geometry
+ HDIO_GET_UNMASKINTR get current unmask setting
+ HDIO_GET_MULTCOUNT get current IDE blockmode setting
+ HDIO_GET_QDMA get use-qdma flag
+ HDIO_SET_XFER set transfer rate via proc
+ HDIO_OBSOLETE_IDENTITY OBSOLETE, DO NOT USE
+ HDIO_GET_KEEPSETTINGS get keep-settings-on-reset flag
+ HDIO_GET_32BIT get current io_32bit setting
+ HDIO_GET_NOWERR get ignore-write-error flag
+ HDIO_GET_DMA get use-dma flag
+ HDIO_GET_NICE get nice flags
+ HDIO_GET_IDENTITY get IDE identification info
+ HDIO_GET_WCACHE get write cache mode on|off
+ HDIO_GET_ACOUSTIC get acoustic value
+ HDIO_GET_ADDRESS get sector addressing mode
+ HDIO_GET_BUSSTATE get the bus state of the hwif
+ HDIO_TRISTATE_HWIF execute a channel tristate
+ HDIO_DRIVE_RESET execute a device reset
+ HDIO_DRIVE_TASKFILE execute raw taskfile
+ HDIO_DRIVE_TASK execute task and special drive command
+ HDIO_DRIVE_CMD execute a special drive command
+ HDIO_DRIVE_CMD_AEB HDIO_DRIVE_TASK
+ ======================= =======================================
+
+ ioctls that pass non-pointer values:
+
+ ======================= =======================================
+ HDIO_SET_MULTCOUNT change IDE blockmode
+ HDIO_SET_UNMASKINTR permit other irqs during I/O
+ HDIO_SET_KEEPSETTINGS keep ioctl settings on reset
+ HDIO_SET_32BIT change io_32bit flags
+ HDIO_SET_NOWERR change ignore-write-error flag
+ HDIO_SET_DMA change use-dma flag
+ HDIO_SET_PIO_MODE reconfig interface to new speed
+ HDIO_SCAN_HWIF register and (re)scan interface
+ HDIO_SET_NICE set nice flags
+ HDIO_UNREGISTER_HWIF unregister interface
+ HDIO_SET_WCACHE change write cache enable-disable
+ HDIO_SET_ACOUSTIC change acoustic behavior
+ HDIO_SET_BUSSTATE set the bus state of the hwif
+ HDIO_SET_QDMA change use-qdma flag
+ HDIO_SET_ADDRESS change lba addressing modes
+
+ HDIO_SET_IDE_SCSI Set scsi emulation mode on/off
+ HDIO_SET_SCSI_IDE not implemented yet
+ ======================= =======================================
+
+
+The information that follows was determined from reading kernel source
+code. It is likely that some corrections will be made over time.
+
+------------------------------------------------------------------------------
+
+General:
+
+ Unless otherwise specified, all ioctl calls return 0 on success
+ and -1 with errno set to an appropriate value on error.
+
+ Unless otherwise specified, all ioctl calls return -1 and set
+ errno to EFAULT on a failed attempt to copy data to or from user
+ address space.
+
+ Unless otherwise specified, all data structures and constants
+ are defined in <linux/hdreg.h>
+
+------------------------------------------------------------------------------
+
+HDIO_GETGEO
+ get device geometry
+
+
+ usage::
+
+ struct hd_geometry geom;
+
+ ioctl(fd, HDIO_GETGEO, &geom);
+
+
+ inputs:
+ none
+
+
+
+ outputs:
+ hd_geometry structure containing:
+
+
+ ========= ==================================
+ heads number of heads
+ sectors number of sectors/track
+ cylinders number of cylinders, mod 65536
+ start starting sector of this partition.
+ ========= ==================================
+
+
+ error returns:
+ - EINVAL
+
+ if the device is not a disk drive or floppy drive,
+ or if the user passes a null pointer
+
+
+ notes:
+ Not particularly useful with modern disk drives, whose geometry
+ is a polite fiction anyway. Modern drives are addressed
+ purely by sector number nowadays (lba addressing), and the
+ drive geometry is an abstraction which is actually subject
+ to change. Currently (as of Nov 2004), the geometry values
+ are the "bios" values -- presumably the values the drive had
+ when Linux first booted.
+
+ In addition, the cylinders field of the hd_geometry is an
+ unsigned short, meaning that on most architectures, this
+ ioctl will not return a meaningful value on drives with more
+ than 65535 tracks.
+
+ The start field is unsigned long, meaning that it will not
+ contain a meaningful value for disks over 219 Gb in size.
+
+
+
+
+HDIO_GET_UNMASKINTR
+ get current unmask setting
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_UNMASKINTR, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the drive's current unmask setting
+
+
+
+
+
+HDIO_SET_UNMASKINTR
+ permit other irqs during I/O
+
+
+ usage::
+
+ unsigned long val;
+
+ ioctl(fd, HDIO_SET_UNMASKINTR, val);
+
+ inputs:
+ New value for unmask flag
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 1]
+ - EBUSY Controller busy
+
+
+
+
+HDIO_GET_MULTCOUNT
+ get current IDE blockmode setting
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_MULTCOUNT, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current IDE block mode setting. This
+ controls how many sectors the drive will transfer per
+ interrupt.
+
+
+
+HDIO_SET_MULTCOUNT
+ change IDE blockmode
+
+
+ usage::
+
+ int val;
+
+ ioctl(fd, HDIO_SET_MULTCOUNT, val);
+
+ inputs:
+ New value for IDE block mode setting. This controls how many
+ sectors the drive will transfer per interrupt.
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range supported by disk.
+ - EBUSY Controller busy or blockmode already set.
+ - EIO Drive did not accept new block mode.
+
+ notes:
+ Source code comments read::
+
+ This is tightly woven into the driver->do_special cannot
+ touch. DON'T do it again until a total personality rewrite
+ is committed.
+
+ If blockmode has already been set, this ioctl will fail with
+ -EBUSY
+
+
+
+HDIO_GET_QDMA
+ get use-qdma flag
+
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_XFER
+ set transfer rate via proc
+
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_OBSOLETE_IDENTITY
+ OBSOLETE, DO NOT USE
+
+
+ Same as HDIO_GET_IDENTITY (see below), except that it only
+ returns the first 142 bytes of drive identity information.
+
+
+
+HDIO_GET_IDENTITY
+ get IDE identification info
+
+
+ usage::
+
+ unsigned char identity[512];
+
+ ioctl(fd, HDIO_GET_IDENTITY, identity);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ ATA drive identity information. For full description, see
+ the IDENTIFY DEVICE and IDENTIFY PACKET DEVICE commands in
+ the ATA specification.
+
+ error returns:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - ENOMSG IDENTIFY DEVICE information not available
+
+ notes:
+ Returns information that was obtained when the drive was
+ probed. Some of this information is subject to change, and
+ this ioctl does not re-probe the drive to update the
+ information.
+
+ This information is also available from /proc/ide/hdX/identify
+
+
+
+HDIO_GET_KEEPSETTINGS
+ get keep-settings-on-reset flag
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_KEEPSETTINGS, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current "keep settings" flag
+
+
+
+ notes:
+ When set, indicates that kernel should restore settings
+ after a drive reset.
+
+
+
+HDIO_SET_KEEPSETTINGS
+ keep ioctl settings on reset
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_SET_KEEPSETTINGS, val);
+
+ inputs:
+ New value for keep_settings flag
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 1]
+ - EBUSY Controller busy
+
+
+
+HDIO_GET_32BIT
+ get current io_32bit setting
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_32BIT, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current io_32bit setting
+
+
+
+ notes:
+ 0=16-bit, 1=32-bit, 2,3 = 32bit+sync
+
+
+
+
+
+HDIO_GET_NOWERR
+ get ignore-write-error flag
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_NOWERR, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current ignore-write-error flag
+
+
+
+
+
+HDIO_GET_DMA
+ get use-dma flag
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_DMA, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current use-dma flag
+
+
+
+
+
+HDIO_GET_NICE
+ get nice flags
+
+
+ usage::
+
+ long nice;
+
+ ioctl(fd, HDIO_GET_NICE, &nice);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The drive's "nice" values.
+
+
+
+ notes:
+ Per-drive flags which determine when the system will give more
+ bandwidth to other devices sharing the same IDE bus.
+
+ See <linux/hdreg.h>, near symbol IDE_NICE_DSC_OVERLAP.
+
+
+
+
+HDIO_SET_NICE
+ set nice flags
+
+
+ usage::
+
+ unsigned long nice;
+
+ ...
+ ioctl(fd, HDIO_SET_NICE, nice);
+
+ inputs:
+ bitmask of nice flags.
+
+
+
+ outputs:
+ none
+
+
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EPERM Flags other than DSC_OVERLAP and NICE_1 set.
+ - EPERM DSC_OVERLAP specified but not supported by drive
+
+ notes:
+ This ioctl sets the DSC_OVERLAP and NICE_1 flags from values
+ provided by the user.
+
+ Nice flags are listed in <linux/hdreg.h>, starting with
+ IDE_NICE_DSC_OVERLAP. These values represent shifts.
+
+
+
+
+
+HDIO_GET_WCACHE
+ get write cache mode on|off
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_WCACHE, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current write cache mode
+
+
+
+
+
+HDIO_GET_ACOUSTIC
+ get acoustic value
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_GET_ACOUSTIC, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current acoustic settings
+
+
+
+ notes:
+ See HDIO_SET_ACOUSTIC
+
+
+
+
+
+HDIO_GET_ADDRESS
+ usage::
+
+
+ long val;
+
+ ioctl(fd, HDIO_GET_ADDRESS, &val);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ The value of the current addressing mode:
+
+ = ===================
+ 0 28-bit
+ 1 48-bit
+ 2 48-bit doing 28-bit
+ 3 64-bit
+ = ===================
+
+
+
+HDIO_GET_BUSSTATE
+ get the bus state of the hwif
+
+
+ usage::
+
+ long state;
+
+ ioctl(fd, HDIO_SCAN_HWIF, &state);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ Current power state of the IDE bus. One of BUSSTATE_OFF,
+ BUSSTATE_ON, or BUSSTATE_TRISTATE
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+
+
+
+
+HDIO_SET_BUSSTATE
+ set the bus state of the hwif
+
+
+ usage::
+
+ int state;
+
+ ...
+ ioctl(fd, HDIO_SCAN_HWIF, state);
+
+ inputs:
+ Desired IDE power state. One of BUSSTATE_OFF, BUSSTATE_ON,
+ or BUSSTATE_TRISTATE
+
+ outputs:
+ none
+
+
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_RAWIO
+ - EOPNOTSUPP Hardware interface does not support bus power control
+
+
+
+
+HDIO_TRISTATE_HWIF
+ execute a channel tristate
+
+
+ Not implemented, as of 2.6.8.1. See HDIO_SET_BUSSTATE
+
+
+
+HDIO_DRIVE_RESET
+ execute a device reset
+
+
+ usage::
+
+ int args[3]
+
+ ...
+ ioctl(fd, HDIO_DRIVE_RESET, args);
+
+ inputs:
+ none
+
+
+
+ outputs:
+ none
+
+
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - ENXIO No such device: phy dead or ctl_addr == 0
+ - EIO I/O error: reset timed out or hardware error
+
+ notes:
+
+ - Execute a reset on the device as soon as the current IO
+ operation has completed.
+
+ - Executes an ATAPI soft reset if applicable, otherwise
+ executes an ATA soft reset on the controller.
+
+
+
+HDIO_DRIVE_TASKFILE
+ execute raw taskfile
+
+
+ Note:
+ If you don't have a copy of the ANSI ATA specification
+ handy, you should probably ignore this ioctl.
+
+ - Execute an ATA disk command directly by writing the "taskfile"
+ registers of the drive. Requires ADMIN and RAWIO access
+ privileges.
+
+ usage::
+
+ struct {
+
+ ide_task_request_t req_task;
+ u8 outbuf[OUTPUT_SIZE];
+ u8 inbuf[INPUT_SIZE];
+ } task;
+ memset(&task.req_task, 0, sizeof(task.req_task));
+ task.req_task.out_size = sizeof(task.outbuf);
+ task.req_task.in_size = sizeof(task.inbuf);
+ ...
+ ioctl(fd, HDIO_DRIVE_TASKFILE, &task);
+ ...
+
+ inputs:
+
+ (See below for details on memory area passed to ioctl.)
+
+ ============ ===================================================
+ io_ports[8] values to be written to taskfile registers
+ hob_ports[8] high-order bytes, for extended commands.
+ out_flags flags indicating which registers are valid
+ in_flags flags indicating which registers should be returned
+ data_phase see below
+ req_cmd command type to be executed
+ out_size size of output buffer
+ outbuf buffer of data to be transmitted to disk
+ inbuf buffer of data to be received from disk (see [1])
+ ============ ===================================================
+
+ outputs:
+
+ =========== ====================================================
+ io_ports[] values returned in the taskfile registers
+ hob_ports[] high-order bytes, for extended commands.
+ out_flags flags indicating which registers are valid (see [2])
+ in_flags flags indicating which registers should be returned
+ outbuf buffer of data to be transmitted to disk (see [1])
+ inbuf buffer of data to be received from disk
+ =========== ====================================================
+
+ error returns:
+ - EACCES CAP_SYS_ADMIN or CAP_SYS_RAWIO privilege not set.
+ - ENOMSG Device is not a disk drive.
+ - ENOMEM Unable to allocate memory for task
+ - EFAULT req_cmd == TASKFILE_IN_OUT (not implemented as of 2.6.8)
+ - EPERM
+
+ req_cmd == TASKFILE_MULTI_OUT and drive
+ multi-count not yet set.
+ - EIO Drive failed the command.
+
+ notes:
+
+ [1] READ THE FOLLOWING NOTES *CAREFULLY*. THIS IOCTL IS
+ FULL OF GOTCHAS. Extreme caution should be used with using
+ this ioctl. A mistake can easily corrupt data or hang the
+ system.
+
+ [2] Both the input and output buffers are copied from the
+ user and written back to the user, even when not used.
+
+ [3] If one or more bits are set in out_flags and in_flags is
+ zero, the following values are used for in_flags.all and
+ written back into in_flags on completion.
+
+ * IDE_TASKFILE_STD_IN_FLAGS | (IDE_HOB_STD_IN_FLAGS << 8)
+ if LBA48 addressing is enabled for the drive
+ * IDE_TASKFILE_STD_IN_FLAGS
+ if CHS/LBA28
+
+ The association between in_flags.all and each enable
+ bitfield flips depending on endianness; fortunately, TASKFILE
+ only uses inflags.b.data bit and ignores all other bits.
+ The end result is that, on any endian machines, it has no
+ effect other than modifying in_flags on completion.
+
+ [4] The default value of SELECT is (0xa0|DEV_bit|LBA_bit)
+ except for four drives per port chipsets. For four drives
+ per port chipsets, it's (0xa0|DEV_bit|LBA_bit) for the first
+ pair and (0x80|DEV_bit|LBA_bit) for the second pair.
+
+ [5] The argument to the ioctl is a pointer to a region of
+ memory containing a ide_task_request_t structure, followed
+ by an optional buffer of data to be transmitted to the
+ drive, followed by an optional buffer to receive data from
+ the drive.
+
+ Command is passed to the disk drive via the ide_task_request_t
+ structure, which contains these fields:
+
+ ============ ===============================================
+ io_ports[8] values for the taskfile registers
+ hob_ports[8] high-order bytes, for extended commands
+ out_flags flags indicating which entries in the
+ io_ports[] and hob_ports[] arrays
+ contain valid values. Type ide_reg_valid_t.
+ in_flags flags indicating which entries in the
+ io_ports[] and hob_ports[] arrays
+ are expected to contain valid values
+ on return.
+ data_phase See below
+ req_cmd Command type, see below
+ out_size output (user->drive) buffer size, bytes
+ in_size input (drive->user) buffer size, bytes
+ ============ ===============================================
+
+ When out_flags is zero, the following registers are loaded.
+
+ ============ ===============================================
+ HOB_FEATURE If the drive supports LBA48
+ HOB_NSECTOR If the drive supports LBA48
+ HOB_SECTOR If the drive supports LBA48
+ HOB_LCYL If the drive supports LBA48
+ HOB_HCYL If the drive supports LBA48
+ FEATURE
+ NSECTOR
+ SECTOR
+ LCYL
+ HCYL
+ SELECT First, masked with 0xE0 if LBA48, 0xEF
+ otherwise; then, or'ed with the default
+ value of SELECT.
+ ============ ===============================================
+
+ If any bit in out_flags is set, the following registers are loaded.
+
+ ============ ===============================================
+ HOB_DATA If out_flags.b.data is set. HOB_DATA will
+ travel on DD8-DD15 on little endian machines
+ and on DD0-DD7 on big endian machines.
+ DATA If out_flags.b.data is set. DATA will
+ travel on DD0-DD7 on little endian machines
+ and on DD8-DD15 on big endian machines.
+ HOB_NSECTOR If out_flags.b.nsector_hob is set
+ HOB_SECTOR If out_flags.b.sector_hob is set
+ HOB_LCYL If out_flags.b.lcyl_hob is set
+ HOB_HCYL If out_flags.b.hcyl_hob is set
+ FEATURE If out_flags.b.feature is set
+ NSECTOR If out_flags.b.nsector is set
+ SECTOR If out_flags.b.sector is set
+ LCYL If out_flags.b.lcyl is set
+ HCYL If out_flags.b.hcyl is set
+ SELECT Or'ed with the default value of SELECT and
+ loaded regardless of out_flags.b.select.
+ ============ ===============================================
+
+ Taskfile registers are read back from the drive into
+ {io|hob}_ports[] after the command completes iff one of the
+ following conditions is met; otherwise, the original values
+ will be written back, unchanged.
+
+ 1. The drive fails the command (EIO).
+ 2. One or more than one bits are set in out_flags.
+ 3. The requested data_phase is TASKFILE_NO_DATA.
+
+ ============ ===============================================
+ HOB_DATA If in_flags.b.data is set. It will contain
+ DD8-DD15 on little endian machines and
+ DD0-DD7 on big endian machines.
+ DATA If in_flags.b.data is set. It will contain
+ DD0-DD7 on little endian machines and
+ DD8-DD15 on big endian machines.
+ HOB_FEATURE If the drive supports LBA48
+ HOB_NSECTOR If the drive supports LBA48
+ HOB_SECTOR If the drive supports LBA48
+ HOB_LCYL If the drive supports LBA48
+ HOB_HCYL If the drive supports LBA48
+ NSECTOR
+ SECTOR
+ LCYL
+ HCYL
+ ============ ===============================================
+
+ The data_phase field describes the data transfer to be
+ performed. Value is one of:
+
+ =================== ========================================
+ TASKFILE_IN
+ TASKFILE_MULTI_IN
+ TASKFILE_OUT
+ TASKFILE_MULTI_OUT
+ TASKFILE_IN_OUT
+ TASKFILE_IN_DMA
+ TASKFILE_IN_DMAQ == IN_DMA (queueing not supported)
+ TASKFILE_OUT_DMA
+ TASKFILE_OUT_DMAQ == OUT_DMA (queueing not supported)
+ TASKFILE_P_IN unimplemented
+ TASKFILE_P_IN_DMA unimplemented
+ TASKFILE_P_IN_DMAQ unimplemented
+ TASKFILE_P_OUT unimplemented
+ TASKFILE_P_OUT_DMA unimplemented
+ TASKFILE_P_OUT_DMAQ unimplemented
+ =================== ========================================
+
+ The req_cmd field classifies the command type. It may be
+ one of:
+
+ ======================== =======================================
+ IDE_DRIVE_TASK_NO_DATA
+ IDE_DRIVE_TASK_SET_XFER unimplemented
+ IDE_DRIVE_TASK_IN
+ IDE_DRIVE_TASK_OUT unimplemented
+ IDE_DRIVE_TASK_RAW_WRITE
+ ======================== =======================================
+
+ [6] Do not access {in|out}_flags->all except for resetting
+ all the bits. Always access individual bit fields. ->all
+ value will flip depending on endianness. For the same
+ reason, do not use IDE_{TASKFILE|HOB}_STD_{OUT|IN}_FLAGS
+ constants defined in hdreg.h.
+
+
+
+HDIO_DRIVE_CMD
+ execute a special drive command
+
+
+ Note: If you don't have a copy of the ANSI ATA specification
+ handy, you should probably ignore this ioctl.
+
+ usage::
+
+ u8 args[4+XFER_SIZE];
+
+ ...
+ ioctl(fd, HDIO_DRIVE_CMD, args);
+
+ inputs:
+ Commands other than WIN_SMART:
+
+ ======= =======
+ args[0] COMMAND
+ args[1] NSECTOR
+ args[2] FEATURE
+ args[3] NSECTOR
+ ======= =======
+
+ WIN_SMART:
+
+ ======= =======
+ args[0] COMMAND
+ args[1] SECTOR
+ args[2] FEATURE
+ args[3] NSECTOR
+ ======= =======
+
+ outputs:
+ args[] buffer is filled with register values followed by any
+
+
+ data returned by the disk.
+
+ ======== ====================================================
+ args[0] status
+ args[1] error
+ args[2] NSECTOR
+ args[3] undefined
+ args[4+] NSECTOR * 512 bytes of data returned by the command.
+ ======== ====================================================
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_RAWIO
+ - ENOMEM Unable to allocate memory for task
+ - EIO Drive reports error
+
+ notes:
+
+ [1] For commands other than WIN_SMART, args[1] should equal
+ args[3]. SECTOR, LCYL and HCYL are undefined. For
+ WIN_SMART, 0x4f and 0xc2 are loaded into LCYL and HCYL
+ respectively. In both cases SELECT will contain the default
+ value for the drive. Please refer to HDIO_DRIVE_TASKFILE
+ notes for the default value of SELECT.
+
+ [2] If NSECTOR value is greater than zero and the drive sets
+ DRQ when interrupting for the command, NSECTOR * 512 bytes
+ are read from the device into the area following NSECTOR.
+ In the above example, the area would be
+ args[4..4+XFER_SIZE]. 16bit PIO is used regardless of
+ HDIO_SET_32BIT setting.
+
+ [3] If COMMAND == WIN_SETFEATURES && FEATURE == SETFEATURES_XFER
+ && NSECTOR >= XFER_SW_DMA_0 && the drive supports any DMA
+ mode, IDE driver will try to tune the transfer mode of the
+ drive accordingly.
+
+
+
+HDIO_DRIVE_TASK
+ execute task and special drive command
+
+
+ Note: If you don't have a copy of the ANSI ATA specification
+ handy, you should probably ignore this ioctl.
+
+ usage::
+
+ u8 args[7];
+
+ ...
+ ioctl(fd, HDIO_DRIVE_TASK, args);
+
+ inputs:
+ Taskfile register values:
+
+ ======= =======
+ args[0] COMMAND
+ args[1] FEATURE
+ args[2] NSECTOR
+ args[3] SECTOR
+ args[4] LCYL
+ args[5] HCYL
+ args[6] SELECT
+ ======= =======
+
+ outputs:
+ Taskfile register values:
+
+
+ ======= =======
+ args[0] status
+ args[1] error
+ args[2] NSECTOR
+ args[3] SECTOR
+ args[4] LCYL
+ args[5] HCYL
+ args[6] SELECT
+ ======= =======
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_RAWIO
+ - ENOMEM Unable to allocate memory for task
+ - ENOMSG Device is not a disk drive.
+ - EIO Drive failed the command.
+
+ notes:
+
+ [1] DEV bit (0x10) of SELECT register is ignored and the
+ appropriate value for the drive is used. All other bits
+ are used unaltered.
+
+
+
+HDIO_DRIVE_CMD_AEB
+ HDIO_DRIVE_TASK
+
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_32BIT
+ change io_32bit flags
+
+
+ usage::
+
+ int val;
+
+ ioctl(fd, HDIO_SET_32BIT, val);
+
+ inputs:
+ New value for io_32bit flag
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 3]
+ - EBUSY Controller busy
+
+
+
+
+HDIO_SET_NOWERR
+ change ignore-write-error flag
+
+
+ usage::
+
+ int val;
+
+ ioctl(fd, HDIO_SET_NOWERR, val);
+
+ inputs:
+ New value for ignore-write-error flag. Used for ignoring
+
+
+ WRERR_STAT
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 1]
+ - EBUSY Controller busy
+
+
+
+HDIO_SET_DMA
+ change use-dma flag
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_SET_DMA, val);
+
+ inputs:
+ New value for use-dma flag
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 1]
+ - EBUSY Controller busy
+
+
+
+HDIO_SET_PIO_MODE
+ reconfig interface to new speed
+
+
+ usage::
+
+ long val;
+
+ ioctl(fd, HDIO_SET_PIO_MODE, val);
+
+ inputs:
+ New interface speed.
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 255]
+ - EBUSY Controller busy
+
+
+
+HDIO_SCAN_HWIF
+ register and (re)scan interface
+
+
+ usage::
+
+ int args[3]
+
+ ...
+ ioctl(fd, HDIO_SCAN_HWIF, args);
+
+ inputs:
+
+ ======= =========================
+ args[0] io address to probe
+
+
+ args[1] control address to probe
+ args[2] irq number
+ ======= =========================
+
+ outputs:
+ none
+
+
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_RAWIO
+ - EIO Probe failed.
+
+ notes:
+ This ioctl initializes the addresses and irq for a disk
+ controller, probes for drives, and creates /proc/ide
+ interfaces as appropriate.
+
+
+
+HDIO_UNREGISTER_HWIF
+ unregister interface
+
+
+ usage::
+
+ int index;
+
+ ioctl(fd, HDIO_UNREGISTER_HWIF, index);
+
+ inputs:
+ index index of hardware interface to unregister
+
+
+
+ outputs:
+ none
+
+
+
+ error returns:
+ - EACCES Access denied: requires CAP_SYS_RAWIO
+
+ notes:
+ This ioctl removes a hardware interface from the kernel.
+
+ Currently (2.6.8) this ioctl silently fails if any drive on
+ the interface is busy.
+
+
+
+HDIO_SET_WCACHE
+ change write cache enable-disable
+
+
+ usage::
+
+ int val;
+
+ ioctl(fd, HDIO_SET_WCACHE, val);
+
+ inputs:
+ New value for write cache enable
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 1]
+ - EBUSY Controller busy
+
+
+
+HDIO_SET_ACOUSTIC
+ change acoustic behavior
+
+
+ usage::
+
+ int val;
+
+ ioctl(fd, HDIO_SET_ACOUSTIC, val);
+
+ inputs:
+ New value for drive acoustic settings
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 254]
+ - EBUSY Controller busy
+
+
+
+HDIO_SET_QDMA
+ change use-qdma flag
+
+
+ Not implemented, as of 2.6.8.1
+
+
+
+HDIO_SET_ADDRESS
+ change lba addressing modes
+
+
+ usage::
+
+ int val;
+
+ ioctl(fd, HDIO_SET_ADDRESS, val);
+
+ inputs:
+ New value for addressing mode
+
+ = ===================
+ 0 28-bit
+ 1 48-bit
+ 2 48-bit doing 28-bit
+ = ===================
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 2]
+ - EBUSY Controller busy
+ - EIO Drive does not support lba48 mode.
+
+
+HDIO_SET_IDE_SCSI
+ usage::
+
+
+ long val;
+
+ ioctl(fd, HDIO_SET_IDE_SCSI, val);
+
+ inputs:
+ New value for scsi emulation mode (?)
+
+
+
+ outputs:
+ none
+
+
+
+ error return:
+ - EINVAL (bdev != bdev->bd_contains) (not sure what this means)
+ - EACCES Access denied: requires CAP_SYS_ADMIN
+ - EINVAL value out of range [0 1]
+ - EBUSY Controller busy
+
+
+
+HDIO_SET_SCSI_IDE
+ Not implemented, as of 2.6.8.1
diff --git a/Documentation/ioctl/hdio.txt b/Documentation/ioctl/hdio.txt
deleted file mode 100644
index 18eb98c44ffe..000000000000
--- a/Documentation/ioctl/hdio.txt
+++ /dev/null
@@ -1,1071 +0,0 @@
- Summary of HDIO_ ioctl calls.
- ============================
-
- Edward A. Falk <efalk@google.com>
-
- November, 2004
-
-This document attempts to describe the ioctl(2) calls supported by
-the HD/IDE layer. These are by-and-large implemented (as of Linux 2.6)
-in drivers/ide/ide.c and drivers/block/scsi_ioctl.c
-
-ioctl values are listed in <linux/hdreg.h>. As of this writing, they
-are as follows:
-
- ioctls that pass argument pointers to user space:
-
- HDIO_GETGEO get device geometry
- HDIO_GET_UNMASKINTR get current unmask setting
- HDIO_GET_MULTCOUNT get current IDE blockmode setting
- HDIO_GET_QDMA get use-qdma flag
- HDIO_SET_XFER set transfer rate via proc
- HDIO_OBSOLETE_IDENTITY OBSOLETE, DO NOT USE
- HDIO_GET_KEEPSETTINGS get keep-settings-on-reset flag
- HDIO_GET_32BIT get current io_32bit setting
- HDIO_GET_NOWERR get ignore-write-error flag
- HDIO_GET_DMA get use-dma flag
- HDIO_GET_NICE get nice flags
- HDIO_GET_IDENTITY get IDE identification info
- HDIO_GET_WCACHE get write cache mode on|off
- HDIO_GET_ACOUSTIC get acoustic value
- HDIO_GET_ADDRESS get sector addressing mode
- HDIO_GET_BUSSTATE get the bus state of the hwif
- HDIO_TRISTATE_HWIF execute a channel tristate
- HDIO_DRIVE_RESET execute a device reset
- HDIO_DRIVE_TASKFILE execute raw taskfile
- HDIO_DRIVE_TASK execute task and special drive command
- HDIO_DRIVE_CMD execute a special drive command
- HDIO_DRIVE_CMD_AEB HDIO_DRIVE_TASK
-
- ioctls that pass non-pointer values:
-
- HDIO_SET_MULTCOUNT change IDE blockmode
- HDIO_SET_UNMASKINTR permit other irqs during I/O
- HDIO_SET_KEEPSETTINGS keep ioctl settings on reset
- HDIO_SET_32BIT change io_32bit flags
- HDIO_SET_NOWERR change ignore-write-error flag
- HDIO_SET_DMA change use-dma flag
- HDIO_SET_PIO_MODE reconfig interface to new speed
- HDIO_SCAN_HWIF register and (re)scan interface
- HDIO_SET_NICE set nice flags
- HDIO_UNREGISTER_HWIF unregister interface
- HDIO_SET_WCACHE change write cache enable-disable
- HDIO_SET_ACOUSTIC change acoustic behavior
- HDIO_SET_BUSSTATE set the bus state of the hwif
- HDIO_SET_QDMA change use-qdma flag
- HDIO_SET_ADDRESS change lba addressing modes
-
- HDIO_SET_IDE_SCSI Set scsi emulation mode on/off
- HDIO_SET_SCSI_IDE not implemented yet
-
-
-The information that follows was determined from reading kernel source
-code. It is likely that some corrections will be made over time.
-
-
-
-
-
-
-
-General:
-
- Unless otherwise specified, all ioctl calls return 0 on success
- and -1 with errno set to an appropriate value on error.
-
- Unless otherwise specified, all ioctl calls return -1 and set
- errno to EFAULT on a failed attempt to copy data to or from user
- address space.
-
- Unless otherwise specified, all data structures and constants
- are defined in <linux/hdreg.h>
-
-
-
-HDIO_GETGEO get device geometry
-
- usage:
-
- struct hd_geometry geom;
- ioctl(fd, HDIO_GETGEO, &geom);
-
-
- inputs: none
-
- outputs:
-
- hd_geometry structure containing:
-
- heads number of heads
- sectors number of sectors/track
- cylinders number of cylinders, mod 65536
- start starting sector of this partition.
-
-
- error returns:
- EINVAL if the device is not a disk drive or floppy drive,
- or if the user passes a null pointer
-
-
- notes:
-
- Not particularly useful with modern disk drives, whose geometry
- is a polite fiction anyway. Modern drives are addressed
- purely by sector number nowadays (lba addressing), and the
- drive geometry is an abstraction which is actually subject
- to change. Currently (as of Nov 2004), the geometry values
- are the "bios" values -- presumably the values the drive had
- when Linux first booted.
-
- In addition, the cylinders field of the hd_geometry is an
- unsigned short, meaning that on most architectures, this
- ioctl will not return a meaningful value on drives with more
- than 65535 tracks.
-
- The start field is unsigned long, meaning that it will not
- contain a meaningful value for disks over 219 Gb in size.
-
-
-
-
-HDIO_GET_UNMASKINTR get current unmask setting
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_UNMASKINTR, &val);
-
- inputs: none
-
- outputs:
- The value of the drive's current unmask setting
-
-
-
-HDIO_SET_UNMASKINTR permit other irqs during I/O
-
- usage:
-
- unsigned long val;
- ioctl(fd, HDIO_SET_UNMASKINTR, val);
-
- inputs:
- New value for unmask flag
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 1]
- EBUSY Controller busy
-
-
-
-
-HDIO_GET_MULTCOUNT get current IDE blockmode setting
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_MULTCOUNT, &val);
-
- inputs: none
-
- outputs:
- The value of the current IDE block mode setting. This
- controls how many sectors the drive will transfer per
- interrupt.
-
-
-
-HDIO_SET_MULTCOUNT change IDE blockmode
-
- usage:
-
- int val;
- ioctl(fd, HDIO_SET_MULTCOUNT, val);
-
- inputs:
- New value for IDE block mode setting. This controls how many
- sectors the drive will transfer per interrupt.
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range supported by disk.
- EBUSY Controller busy or blockmode already set.
- EIO Drive did not accept new block mode.
-
- notes:
-
- Source code comments read:
-
- This is tightly woven into the driver->do_special cannot
- touch. DON'T do it again until a total personality rewrite
- is committed.
-
- If blockmode has already been set, this ioctl will fail with
- EBUSY
-
-
-
-HDIO_GET_QDMA get use-qdma flag
-
- Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_XFER set transfer rate via proc
-
- Not implemented, as of 2.6.8.1
-
-
-
-HDIO_OBSOLETE_IDENTITY OBSOLETE, DO NOT USE
-
- Same as HDIO_GET_IDENTITY (see below), except that it only
- returns the first 142 bytes of drive identity information.
-
-
-
-HDIO_GET_IDENTITY get IDE identification info
-
- usage:
-
- unsigned char identity[512];
- ioctl(fd, HDIO_GET_IDENTITY, identity);
-
- inputs: none
-
- outputs:
-
- ATA drive identity information. For full description, see
- the IDENTIFY DEVICE and IDENTIFY PACKET DEVICE commands in
- the ATA specification.
-
- error returns:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- ENOMSG IDENTIFY DEVICE information not available
-
- notes:
-
- Returns information that was obtained when the drive was
- probed. Some of this information is subject to change, and
- this ioctl does not re-probe the drive to update the
- information.
-
- This information is also available from /proc/ide/hdX/identify
-
-
-
-HDIO_GET_KEEPSETTINGS get keep-settings-on-reset flag
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_KEEPSETTINGS, &val);
-
- inputs: none
-
- outputs:
- The value of the current "keep settings" flag
-
- notes:
-
- When set, indicates that kernel should restore settings
- after a drive reset.
-
-
-
-HDIO_SET_KEEPSETTINGS keep ioctl settings on reset
-
- usage:
-
- long val;
- ioctl(fd, HDIO_SET_KEEPSETTINGS, val);
-
- inputs:
- New value for keep_settings flag
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 1]
- EBUSY Controller busy
-
-
-
-HDIO_GET_32BIT get current io_32bit setting
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_32BIT, &val);
-
- inputs: none
-
- outputs:
- The value of the current io_32bit setting
-
- notes:
-
- 0=16-bit, 1=32-bit, 2,3 = 32bit+sync
-
-
-
-HDIO_GET_NOWERR get ignore-write-error flag
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_NOWERR, &val);
-
- inputs: none
-
- outputs:
- The value of the current ignore-write-error flag
-
-
-
-HDIO_GET_DMA get use-dma flag
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_DMA, &val);
-
- inputs: none
-
- outputs:
- The value of the current use-dma flag
-
-
-
-HDIO_GET_NICE get nice flags
-
- usage:
-
- long nice;
- ioctl(fd, HDIO_GET_NICE, &nice);
-
- inputs: none
-
- outputs:
-
- The drive's "nice" values.
-
- notes:
-
- Per-drive flags which determine when the system will give more
- bandwidth to other devices sharing the same IDE bus.
- See <linux/hdreg.h>, near symbol IDE_NICE_DSC_OVERLAP.
-
-
-
-
-HDIO_SET_NICE set nice flags
-
- usage:
-
- unsigned long nice;
- ...
- ioctl(fd, HDIO_SET_NICE, nice);
-
- inputs:
- bitmask of nice flags.
-
- outputs: none
-
- error returns:
- EACCES Access denied: requires CAP_SYS_ADMIN
- EPERM Flags other than DSC_OVERLAP and NICE_1 set.
- EPERM DSC_OVERLAP specified but not supported by drive
-
- notes:
-
- This ioctl sets the DSC_OVERLAP and NICE_1 flags from values
- provided by the user.
-
- Nice flags are listed in <linux/hdreg.h>, starting with
- IDE_NICE_DSC_OVERLAP. These values represent shifts.
-
-
-
-
-
-HDIO_GET_WCACHE get write cache mode on|off
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_WCACHE, &val);
-
- inputs: none
-
- outputs:
- The value of the current write cache mode
-
-
-
-HDIO_GET_ACOUSTIC get acoustic value
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_ACOUSTIC, &val);
-
- inputs: none
-
- outputs:
- The value of the current acoustic settings
-
- notes:
-
- See HDIO_SET_ACOUSTIC
-
-
-
-HDIO_GET_ADDRESS
-
- usage:
-
- long val;
- ioctl(fd, HDIO_GET_ADDRESS, &val);
-
- inputs: none
-
- outputs:
- The value of the current addressing mode:
- 0 = 28-bit
- 1 = 48-bit
- 2 = 48-bit doing 28-bit
- 3 = 64-bit
-
-
-
-HDIO_GET_BUSSTATE get the bus state of the hwif
-
- usage:
-
- long state;
- ioctl(fd, HDIO_SCAN_HWIF, &state);
-
- inputs: none
-
- outputs:
- Current power state of the IDE bus. One of BUSSTATE_OFF,
- BUSSTATE_ON, or BUSSTATE_TRISTATE
-
- error returns:
- EACCES Access denied: requires CAP_SYS_ADMIN
-
-
-
-
-HDIO_SET_BUSSTATE set the bus state of the hwif
-
- usage:
-
- int state;
- ...
- ioctl(fd, HDIO_SCAN_HWIF, state);
-
- inputs:
- Desired IDE power state. One of BUSSTATE_OFF, BUSSTATE_ON,
- or BUSSTATE_TRISTATE
-
- outputs: none
-
- error returns:
- EACCES Access denied: requires CAP_SYS_RAWIO
- EOPNOTSUPP Hardware interface does not support bus power control
-
-
-
-
-HDIO_TRISTATE_HWIF execute a channel tristate
-
- Not implemented, as of 2.6.8.1. See HDIO_SET_BUSSTATE
-
-
-
-HDIO_DRIVE_RESET execute a device reset
-
- usage:
-
- int args[3]
- ...
- ioctl(fd, HDIO_DRIVE_RESET, args);
-
- inputs: none
-
- outputs: none
-
- error returns:
- EACCES Access denied: requires CAP_SYS_ADMIN
- ENXIO No such device: phy dead or ctl_addr == 0
- EIO I/O error: reset timed out or hardware error
-
- notes:
-
- Execute a reset on the device as soon as the current IO
- operation has completed.
-
- Executes an ATAPI soft reset if applicable, otherwise
- executes an ATA soft reset on the controller.
-
-
-
-HDIO_DRIVE_TASKFILE execute raw taskfile
-
- Note: If you don't have a copy of the ANSI ATA specification
- handy, you should probably ignore this ioctl.
-
- Execute an ATA disk command directly by writing the "taskfile"
- registers of the drive. Requires ADMIN and RAWIO access
- privileges.
-
- usage:
-
- struct {
- ide_task_request_t req_task;
- u8 outbuf[OUTPUT_SIZE];
- u8 inbuf[INPUT_SIZE];
- } task;
- memset(&task.req_task, 0, sizeof(task.req_task));
- task.req_task.out_size = sizeof(task.outbuf);
- task.req_task.in_size = sizeof(task.inbuf);
- ...
- ioctl(fd, HDIO_DRIVE_TASKFILE, &task);
- ...
-
- inputs:
-
- (See below for details on memory area passed to ioctl.)
-
- io_ports[8] values to be written to taskfile registers
- hob_ports[8] high-order bytes, for extended commands.
- out_flags flags indicating which registers are valid
- in_flags flags indicating which registers should be returned
- data_phase see below
- req_cmd command type to be executed
- out_size size of output buffer
- outbuf buffer of data to be transmitted to disk
- inbuf buffer of data to be received from disk (see [1])
-
- outputs:
-
- io_ports[] values returned in the taskfile registers
- hob_ports[] high-order bytes, for extended commands.
- out_flags flags indicating which registers are valid (see [2])
- in_flags flags indicating which registers should be returned
- outbuf buffer of data to be transmitted to disk (see [1])
- inbuf buffer of data to be received from disk
-
- error returns:
- EACCES CAP_SYS_ADMIN or CAP_SYS_RAWIO privilege not set.
- ENOMSG Device is not a disk drive.
- ENOMEM Unable to allocate memory for task
- EFAULT req_cmd == TASKFILE_IN_OUT (not implemented as of 2.6.8)
- EPERM req_cmd == TASKFILE_MULTI_OUT and drive
- multi-count not yet set.
- EIO Drive failed the command.
-
- notes:
-
- [1] READ THE FOLLOWING NOTES *CAREFULLY*. THIS IOCTL IS
- FULL OF GOTCHAS. Extreme caution should be used with using
- this ioctl. A mistake can easily corrupt data or hang the
- system.
-
- [2] Both the input and output buffers are copied from the
- user and written back to the user, even when not used.
-
- [3] If one or more bits are set in out_flags and in_flags is
- zero, the following values are used for in_flags.all and
- written back into in_flags on completion.
-
- * IDE_TASKFILE_STD_IN_FLAGS | (IDE_HOB_STD_IN_FLAGS << 8)
- if LBA48 addressing is enabled for the drive
- * IDE_TASKFILE_STD_IN_FLAGS
- if CHS/LBA28
-
- The association between in_flags.all and each enable
- bitfield flips depending on endianness; fortunately, TASKFILE
- only uses inflags.b.data bit and ignores all other bits.
- The end result is that, on any endian machines, it has no
- effect other than modifying in_flags on completion.
-
- [4] The default value of SELECT is (0xa0|DEV_bit|LBA_bit)
- except for four drives per port chipsets. For four drives
- per port chipsets, it's (0xa0|DEV_bit|LBA_bit) for the first
- pair and (0x80|DEV_bit|LBA_bit) for the second pair.
-
- [5] The argument to the ioctl is a pointer to a region of
- memory containing a ide_task_request_t structure, followed
- by an optional buffer of data to be transmitted to the
- drive, followed by an optional buffer to receive data from
- the drive.
-
- Command is passed to the disk drive via the ide_task_request_t
- structure, which contains these fields:
-
- io_ports[8] values for the taskfile registers
- hob_ports[8] high-order bytes, for extended commands
- out_flags flags indicating which entries in the
- io_ports[] and hob_ports[] arrays
- contain valid values. Type ide_reg_valid_t.
- in_flags flags indicating which entries in the
- io_ports[] and hob_ports[] arrays
- are expected to contain valid values
- on return.
- data_phase See below
- req_cmd Command type, see below
- out_size output (user->drive) buffer size, bytes
- in_size input (drive->user) buffer size, bytes
-
- When out_flags is zero, the following registers are loaded.
-
- HOB_FEATURE If the drive supports LBA48
- HOB_NSECTOR If the drive supports LBA48
- HOB_SECTOR If the drive supports LBA48
- HOB_LCYL If the drive supports LBA48
- HOB_HCYL If the drive supports LBA48
- FEATURE
- NSECTOR
- SECTOR
- LCYL
- HCYL
- SELECT First, masked with 0xE0 if LBA48, 0xEF
- otherwise; then, or'ed with the default
- value of SELECT.
-
- If any bit in out_flags is set, the following registers are loaded.
-
- HOB_DATA If out_flags.b.data is set. HOB_DATA will
- travel on DD8-DD15 on little endian machines
- and on DD0-DD7 on big endian machines.
- DATA If out_flags.b.data is set. DATA will
- travel on DD0-DD7 on little endian machines
- and on DD8-DD15 on big endian machines.
- HOB_NSECTOR If out_flags.b.nsector_hob is set
- HOB_SECTOR If out_flags.b.sector_hob is set
- HOB_LCYL If out_flags.b.lcyl_hob is set
- HOB_HCYL If out_flags.b.hcyl_hob is set
- FEATURE If out_flags.b.feature is set
- NSECTOR If out_flags.b.nsector is set
- SECTOR If out_flags.b.sector is set
- LCYL If out_flags.b.lcyl is set
- HCYL If out_flags.b.hcyl is set
- SELECT Or'ed with the default value of SELECT and
- loaded regardless of out_flags.b.select.
-
- Taskfile registers are read back from the drive into
- {io|hob}_ports[] after the command completes iff one of the
- following conditions is met; otherwise, the original values
- will be written back, unchanged.
-
- 1. The drive fails the command (EIO).
- 2. One or more than one bits are set in out_flags.
- 3. The requested data_phase is TASKFILE_NO_DATA.
-
- HOB_DATA If in_flags.b.data is set. It will contain
- DD8-DD15 on little endian machines and
- DD0-DD7 on big endian machines.
- DATA If in_flags.b.data is set. It will contain
- DD0-DD7 on little endian machines and
- DD8-DD15 on big endian machines.
- HOB_FEATURE If the drive supports LBA48
- HOB_NSECTOR If the drive supports LBA48
- HOB_SECTOR If the drive supports LBA48
- HOB_LCYL If the drive supports LBA48
- HOB_HCYL If the drive supports LBA48
- NSECTOR
- SECTOR
- LCYL
- HCYL
-
- The data_phase field describes the data transfer to be
- performed. Value is one of:
-
- TASKFILE_IN
- TASKFILE_MULTI_IN
- TASKFILE_OUT
- TASKFILE_MULTI_OUT
- TASKFILE_IN_OUT
- TASKFILE_IN_DMA
- TASKFILE_IN_DMAQ == IN_DMA (queueing not supported)
- TASKFILE_OUT_DMA
- TASKFILE_OUT_DMAQ == OUT_DMA (queueing not supported)
- TASKFILE_P_IN unimplemented
- TASKFILE_P_IN_DMA unimplemented
- TASKFILE_P_IN_DMAQ unimplemented
- TASKFILE_P_OUT unimplemented
- TASKFILE_P_OUT_DMA unimplemented
- TASKFILE_P_OUT_DMAQ unimplemented
-
- The req_cmd field classifies the command type. It may be
- one of:
-
- IDE_DRIVE_TASK_NO_DATA
- IDE_DRIVE_TASK_SET_XFER unimplemented
- IDE_DRIVE_TASK_IN
- IDE_DRIVE_TASK_OUT unimplemented
- IDE_DRIVE_TASK_RAW_WRITE
-
- [6] Do not access {in|out}_flags->all except for resetting
- all the bits. Always access individual bit fields. ->all
- value will flip depending on endianness. For the same
- reason, do not use IDE_{TASKFILE|HOB}_STD_{OUT|IN}_FLAGS
- constants defined in hdreg.h.
-
-
-
-HDIO_DRIVE_CMD execute a special drive command
-
- Note: If you don't have a copy of the ANSI ATA specification
- handy, you should probably ignore this ioctl.
-
- usage:
-
- u8 args[4+XFER_SIZE];
- ...
- ioctl(fd, HDIO_DRIVE_CMD, args);
-
- inputs:
-
- Commands other than WIN_SMART
- args[0] COMMAND
- args[1] NSECTOR
- args[2] FEATURE
- args[3] NSECTOR
-
- WIN_SMART
- args[0] COMMAND
- args[1] SECTOR
- args[2] FEATURE
- args[3] NSECTOR
-
- outputs:
-
- args[] buffer is filled with register values followed by any
- data returned by the disk.
- args[0] status
- args[1] error
- args[2] NSECTOR
- args[3] undefined
- args[4+] NSECTOR * 512 bytes of data returned by the command.
-
- error returns:
- EACCES Access denied: requires CAP_SYS_RAWIO
- ENOMEM Unable to allocate memory for task
- EIO Drive reports error
-
- notes:
-
- [1] For commands other than WIN_SMART, args[1] should equal
- args[3]. SECTOR, LCYL and HCYL are undefined. For
- WIN_SMART, 0x4f and 0xc2 are loaded into LCYL and HCYL
- respectively. In both cases SELECT will contain the default
- value for the drive. Please refer to HDIO_DRIVE_TASKFILE
- notes for the default value of SELECT.
-
- [2] If NSECTOR value is greater than zero and the drive sets
- DRQ when interrupting for the command, NSECTOR * 512 bytes
- are read from the device into the area following NSECTOR.
- In the above example, the area would be
- args[4..4+XFER_SIZE]. 16bit PIO is used regardless of
- HDIO_SET_32BIT setting.
-
- [3] If COMMAND == WIN_SETFEATURES && FEATURE == SETFEATURES_XFER
- && NSECTOR >= XFER_SW_DMA_0 && the drive supports any DMA
- mode, IDE driver will try to tune the transfer mode of the
- drive accordingly.
-
-
-
-HDIO_DRIVE_TASK execute task and special drive command
-
- Note: If you don't have a copy of the ANSI ATA specification
- handy, you should probably ignore this ioctl.
-
- usage:
-
- u8 args[7];
- ...
- ioctl(fd, HDIO_DRIVE_TASK, args);
-
- inputs:
-
- Taskfile register values:
- args[0] COMMAND
- args[1] FEATURE
- args[2] NSECTOR
- args[3] SECTOR
- args[4] LCYL
- args[5] HCYL
- args[6] SELECT
-
- outputs:
-
- Taskfile register values:
- args[0] status
- args[1] error
- args[2] NSECTOR
- args[3] SECTOR
- args[4] LCYL
- args[5] HCYL
- args[6] SELECT
-
- error returns:
- EACCES Access denied: requires CAP_SYS_RAWIO
- ENOMEM Unable to allocate memory for task
- ENOMSG Device is not a disk drive.
- EIO Drive failed the command.
-
- notes:
-
- [1] DEV bit (0x10) of SELECT register is ignored and the
- appropriate value for the drive is used. All other bits
- are used unaltered.
-
-
-
-HDIO_DRIVE_CMD_AEB HDIO_DRIVE_TASK
-
- Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_32BIT change io_32bit flags
-
- usage:
-
- int val;
- ioctl(fd, HDIO_SET_32BIT, val);
-
- inputs:
- New value for io_32bit flag
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 3]
- EBUSY Controller busy
-
-
-
-
-HDIO_SET_NOWERR change ignore-write-error flag
-
- usage:
-
- int val;
- ioctl(fd, HDIO_SET_NOWERR, val);
-
- inputs:
- New value for ignore-write-error flag. Used for ignoring
- WRERR_STAT
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 1]
- EBUSY Controller busy
-
-
-
-HDIO_SET_DMA change use-dma flag
-
- usage:
-
- long val;
- ioctl(fd, HDIO_SET_DMA, val);
-
- inputs:
- New value for use-dma flag
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 1]
- EBUSY Controller busy
-
-
-
-HDIO_SET_PIO_MODE reconfig interface to new speed
-
- usage:
-
- long val;
- ioctl(fd, HDIO_SET_PIO_MODE, val);
-
- inputs:
- New interface speed.
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 255]
- EBUSY Controller busy
-
-
-
-HDIO_SCAN_HWIF register and (re)scan interface
-
- usage:
-
- int args[3]
- ...
- ioctl(fd, HDIO_SCAN_HWIF, args);
-
- inputs:
- args[0] io address to probe
- args[1] control address to probe
- args[2] irq number
-
- outputs: none
-
- error returns:
- EACCES Access denied: requires CAP_SYS_RAWIO
- EIO Probe failed.
-
- notes:
-
- This ioctl initializes the addresses and irq for a disk
- controller, probes for drives, and creates /proc/ide
- interfaces as appropriate.
-
-
-
-HDIO_UNREGISTER_HWIF unregister interface
-
- usage:
-
- int index;
- ioctl(fd, HDIO_UNREGISTER_HWIF, index);
-
- inputs:
- index index of hardware interface to unregister
-
- outputs: none
-
- error returns:
- EACCES Access denied: requires CAP_SYS_RAWIO
-
- notes:
-
- This ioctl removes a hardware interface from the kernel.
-
- Currently (2.6.8) this ioctl silently fails if any drive on
- the interface is busy.
-
-
-
-HDIO_SET_WCACHE change write cache enable-disable
-
- usage:
-
- int val;
- ioctl(fd, HDIO_SET_WCACHE, val);
-
- inputs:
- New value for write cache enable
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 1]
- EBUSY Controller busy
-
-
-
-HDIO_SET_ACOUSTIC change acoustic behavior
-
- usage:
-
- int val;
- ioctl(fd, HDIO_SET_ACOUSTIC, val);
-
- inputs:
- New value for drive acoustic settings
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 254]
- EBUSY Controller busy
-
-
-
-HDIO_SET_QDMA change use-qdma flag
-
- Not implemented, as of 2.6.8.1
-
-
-
-HDIO_SET_ADDRESS change lba addressing modes
-
- usage:
-
- int val;
- ioctl(fd, HDIO_SET_ADDRESS, val);
-
- inputs:
- New value for addressing mode
- 0 = 28-bit
- 1 = 48-bit
- 2 = 48-bit doing 28-bit
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 2]
- EBUSY Controller busy
- EIO Drive does not support lba48 mode.
-
-
-HDIO_SET_IDE_SCSI
-
- usage:
-
- long val;
- ioctl(fd, HDIO_SET_IDE_SCSI, val);
-
- inputs:
- New value for scsi emulation mode (?)
-
- outputs: none
-
- error return:
- EINVAL (bdev != bdev->bd_contains) (not sure what this means)
- EACCES Access denied: requires CAP_SYS_ADMIN
- EINVAL value out of range [0 1]
- EBUSY Controller busy
-
-
-
-HDIO_SET_SCSI_IDE
-
- Not implemented, as of 2.6.8.1
-
-
diff --git a/Documentation/ioctl/index.rst b/Documentation/ioctl/index.rst
new file mode 100644
index 000000000000..0f0a857f6615
--- /dev/null
+++ b/Documentation/ioctl/index.rst
@@ -0,0 +1,16 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+======
+IOCTLs
+======
+
+.. toctree::
+ :maxdepth: 1
+
+ ioctl-number
+
+ botching-up-ioctls
+ ioctl-decoding
+
+ cdrom
+ hdio
diff --git a/Documentation/ioctl/ioctl-decoding.rst b/Documentation/ioctl/ioctl-decoding.rst
new file mode 100644
index 000000000000..380d6bb3e3ea
--- /dev/null
+++ b/Documentation/ioctl/ioctl-decoding.rst
@@ -0,0 +1,31 @@
+==============================
+Decoding an IOCTL Magic Number
+==============================
+
+To decode a hex IOCTL code:
+
+Most architectures use this generic format, but check
+include/ARCH/ioctl.h for specifics, e.g. powerpc
+uses 3 bits to encode read/write and 13 bits for size.
+
+ ====== ==================================
+ bits meaning
+ ====== ==================================
+ 31-30 00 - no parameters: uses _IO macro
+ 10 - read: _IOR
+ 01 - write: _IOW
+ 11 - read/write: _IOWR
+
+ 29-16 size of arguments
+
+ 15-8 ascii character supposedly
+ unique to each driver
+
+ 7-0 function #
+ ====== ==================================
+
+
+So for example 0x82187201 is a read with arg length of 0x218,
+character 'r' function 1. Grepping the source reveals this is::
+
+ #define VFAT_IOCTL_READDIR_BOTH _IOR('r', 1, struct dirent [2])
diff --git a/Documentation/ioctl/ioctl-decoding.txt b/Documentation/ioctl/ioctl-decoding.txt
deleted file mode 100644
index e35efb0cec2e..000000000000
--- a/Documentation/ioctl/ioctl-decoding.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-To decode a hex IOCTL code:
-
-Most architectures use this generic format, but check
-include/ARCH/ioctl.h for specifics, e.g. powerpc
-uses 3 bits to encode read/write and 13 bits for size.
-
- bits meaning
- 31-30 00 - no parameters: uses _IO macro
- 10 - read: _IOR
- 01 - write: _IOW
- 11 - read/write: _IOWR
-
- 29-16 size of arguments
-
- 15-8 ascii character supposedly
- unique to each driver
-
- 7-0 function #
-
-
-So for example 0x82187201 is a read with arg length of 0x218,
-character 'r' function 1. Grepping the source reveals this is:
-
-#define VFAT_IOCTL_READDIR_BOTH _IOR('r', 1, struct dirent [2])
diff --git a/Documentation/ioctl/ioctl-number.rst b/Documentation/ioctl/ioctl-number.rst
new file mode 100644
index 000000000000..7f8dcae7a230
--- /dev/null
+++ b/Documentation/ioctl/ioctl-number.rst
@@ -0,0 +1,361 @@
+=============
+Ioctl Numbers
+=============
+
+19 October 1999
+
+Michael Elizabeth Chastain
+<mec@shout.net>
+
+If you are adding new ioctl's to the kernel, you should use the _IO
+macros defined in <linux/ioctl.h>:
+
+ ====== == ============================================
+ _IO an ioctl with no parameters
+ _IOW an ioctl with write parameters (copy_from_user)
+ _IOR an ioctl with read parameters (copy_to_user)
+ _IOWR an ioctl with both write and read parameters.
+ ====== == ============================================
+
+'Write' and 'read' are from the user's point of view, just like the
+system calls 'write' and 'read'. For example, a SET_FOO ioctl would
+be _IOW, although the kernel would actually read data from user space;
+a GET_FOO ioctl would be _IOR, although the kernel would actually write
+data to user space.
+
+The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter
+or number from the table below. Because of the large number of drivers,
+many drivers share a partial letter with other drivers.
+
+If you are writing a driver for a new device and need a letter, pick an
+unused block with enough room for expansion: 32 to 256 ioctl commands.
+You can register the block by patching this file and submitting the
+patch to Linus Torvalds. Or you can e-mail me at <mec@shout.net> and
+I'll register one for you.
+
+The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number
+to distinguish ioctls from each other. The third argument to _IOW,
+_IOR, or _IOWR is the type of the data going into the kernel or coming
+out of the kernel (e.g. 'int' or 'struct foo'). NOTE! Do NOT use
+sizeof(arg) as the third argument as this results in your ioctl thinking
+it passes an argument of type size_t.
+
+Some devices use their major number as the identifier; this is OK, as
+long as it is unique. Some devices are irregular and don't follow any
+convention at all.
+
+Following this convention is good because:
+
+(1) Keeping the ioctl's globally unique helps error checking:
+ if a program calls an ioctl on the wrong device, it will get an
+ error rather than some unexpected behaviour.
+
+(2) The 'strace' build procedure automatically finds ioctl numbers
+ defined with _IO, _IOW, _IOR, or _IOWR.
+
+(3) 'strace' can decode numbers back into useful names when the
+ numbers are unique.
+
+(4) People looking for ioctls can grep for them more easily when
+ this convention is used to define the ioctl numbers.
+
+(5) When following the convention, the driver code can use generic
+ code to copy the parameters between user and kernel space.
+
+This table lists ioctls visible from user land for Linux/x86. It contains
+most drivers up to 2.6.31, but I know I am missing some. There has been
+no attempt to list non-X86 architectures or ioctls from drivers/staging/.
+
+==== ===== ======================================================= ================================================================
+Code Seq# Include File Comments
+ (hex)
+==== ===== ======================================================= ================================================================
+0x00 00-1F linux/fs.h conflict!
+0x00 00-1F scsi/scsi_ioctl.h conflict!
+0x00 00-1F linux/fb.h conflict!
+0x00 00-1F linux/wavefront.h conflict!
+0x02 all linux/fd.h
+0x03 all linux/hdreg.h
+0x04 D2-DC linux/umsdos_fs.h Dead since 2.6.11, but don't reuse these.
+0x06 all linux/lp.h
+0x09 all linux/raid/md_u.h
+0x10 00-0F drivers/char/s390/vmcp.h
+0x10 10-1F arch/s390/include/uapi/sclp_ctl.h
+0x10 20-2F arch/s390/include/uapi/asm/hypfs.h
+0x12 all linux/fs.h
+ linux/blkpg.h
+0x1b all InfiniBand Subsystem
+ <http://infiniband.sourceforge.net/>
+0x20 all drivers/cdrom/cm206.h
+0x22 all scsi/sg.h
+'!' 00-1F uapi/linux/seccomp.h
+'#' 00-3F IEEE 1394 Subsystem
+ Block for the entire subsystem
+'$' 00-0F linux/perf_counter.h, linux/perf_event.h
+'%' 00-0F include/uapi/linux/stm.h System Trace Module subsystem
+ <mailto:alexander.shishkin@linux.intel.com>
+'&' 00-07 drivers/firewire/nosy-user.h
+'1' 00-1F linux/timepps.h PPS kit from Ulrich Windl
+ <ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
+'2' 01-04 linux/i2o.h
+'3' 00-0F drivers/s390/char/raw3270.h conflict!
+'3' 00-1F linux/suspend_ioctls.h, conflict!
+ kernel/power/user.c
+'8' all SNP8023 advanced NIC card
+ <mailto:mcr@solidum.com>
+';' 64-7F linux/vfio.h
+'@' 00-0F linux/radeonfb.h conflict!
+'@' 00-0F drivers/video/aty/aty128fb.c conflict!
+'A' 00-1F linux/apm_bios.h conflict!
+'A' 00-0F linux/agpgart.h, conflict!
+ drivers/char/agp/compat_ioctl.h
+'A' 00-7F sound/asound.h conflict!
+'B' 00-1F linux/cciss_ioctl.h conflict!
+'B' 00-0F include/linux/pmu.h conflict!
+'B' C0-FF advanced bbus <mailto:maassen@uni-freiburg.de>
+'C' all linux/soundcard.h conflict!
+'C' 01-2F linux/capi.h conflict!
+'C' F0-FF drivers/net/wan/cosa.h conflict!
+'D' all arch/s390/include/asm/dasd.h
+'D' 40-5F drivers/scsi/dpt/dtpi_ioctl.h
+'D' 05 drivers/scsi/pmcraid.h
+'E' all linux/input.h conflict!
+'E' 00-0F xen/evtchn.h conflict!
+'F' all linux/fb.h conflict!
+'F' 01-02 drivers/scsi/pmcraid.h conflict!
+'F' 20 drivers/video/fsl-diu-fb.h conflict!
+'F' 20 drivers/video/intelfb/intelfb.h conflict!
+'F' 20 linux/ivtvfb.h conflict!
+'F' 20 linux/matroxfb.h conflict!
+'F' 20 drivers/video/aty/atyfb_base.c conflict!
+'F' 00-0F video/da8xx-fb.h conflict!
+'F' 80-8F linux/arcfb.h conflict!
+'F' DD video/sstfb.h conflict!
+'G' 00-3F drivers/misc/sgi-gru/grulib.h conflict!
+'G' 00-0F linux/gigaset_dev.h conflict!
+'H' 00-7F linux/hiddev.h conflict!
+'H' 00-0F linux/hidraw.h conflict!
+'H' 01 linux/mei.h conflict!
+'H' 02 linux/mei.h conflict!
+'H' 03 linux/mei.h conflict!
+'H' 00-0F sound/asound.h conflict!
+'H' 20-40 sound/asound_fm.h conflict!
+'H' 80-8F sound/sfnt_info.h conflict!
+'H' 10-8F sound/emu10k1.h conflict!
+'H' 10-1F sound/sb16_csp.h conflict!
+'H' 10-1F sound/hda_hwdep.h conflict!
+'H' 40-4F sound/hdspm.h conflict!
+'H' 40-4F sound/hdsp.h conflict!
+'H' 90 sound/usb/usx2y/usb_stream.h
+'H' A0 uapi/linux/usb/cdc-wdm.h
+'H' C0-F0 net/bluetooth/hci.h conflict!
+'H' C0-DF net/bluetooth/hidp/hidp.h conflict!
+'H' C0-DF net/bluetooth/cmtp/cmtp.h conflict!
+'H' C0-DF net/bluetooth/bnep/bnep.h conflict!
+'H' F1 linux/hid-roccat.h <mailto:erazor_de@users.sourceforge.net>
+'H' F8-FA sound/firewire.h
+'I' all linux/isdn.h conflict!
+'I' 00-0F drivers/isdn/divert/isdn_divert.h conflict!
+'I' 40-4F linux/mISDNif.h conflict!
+'J' 00-1F drivers/scsi/gdth_ioctl.h
+'K' all linux/kd.h
+'L' 00-1F linux/loop.h conflict!
+'L' 10-1F drivers/scsi/mpt3sas/mpt3sas_ctl.h conflict!
+'L' 20-2F linux/lightnvm.h
+'L' E0-FF linux/ppdd.h encrypted disk device driver
+ <http://linux01.gwdg.de/~alatham/ppdd.html>
+'M' all linux/soundcard.h conflict!
+'M' 01-16 mtd/mtd-abi.h conflict!
+ and drivers/mtd/mtdchar.c
+'M' 01-03 drivers/scsi/megaraid/megaraid_sas.h
+'M' 00-0F drivers/video/fsl-diu-fb.h conflict!
+'N' 00-1F drivers/usb/scanner.h
+'N' 40-7F drivers/block/nvme.c
+'O' 00-06 mtd/ubi-user.h UBI
+'P' all linux/soundcard.h conflict!
+'P' 60-6F sound/sscape_ioctl.h conflict!
+'P' 00-0F drivers/usb/class/usblp.c conflict!
+'P' 01-09 drivers/misc/pci_endpoint_test.c conflict!
+'Q' all linux/soundcard.h
+'R' 00-1F linux/random.h conflict!
+'R' 01 linux/rfkill.h conflict!
+'R' C0-DF net/bluetooth/rfcomm.h
+'S' all linux/cdrom.h conflict!
+'S' 80-81 scsi/scsi_ioctl.h conflict!
+'S' 82-FF scsi/scsi.h conflict!
+'S' 00-7F sound/asequencer.h conflict!
+'T' all linux/soundcard.h conflict!
+'T' 00-AF sound/asound.h conflict!
+'T' all arch/x86/include/asm/ioctls.h conflict!
+'T' C0-DF linux/if_tun.h conflict!
+'U' all sound/asound.h conflict!
+'U' 00-CF linux/uinput.h conflict!
+'U' 00-EF linux/usbdevice_fs.h
+'U' C0-CF drivers/bluetooth/hci_uart.h
+'V' all linux/vt.h conflict!
+'V' all linux/videodev2.h conflict!
+'V' C0 linux/ivtvfb.h conflict!
+'V' C0 linux/ivtv.h conflict!
+'V' C0 media/davinci/vpfe_capture.h conflict!
+'V' C0 media/si4713.h conflict!
+'W' 00-1F linux/watchdog.h conflict!
+'W' 00-1F linux/wanrouter.h conflict! (pre 3.9)
+'W' 00-3F sound/asound.h conflict!
+'W' 40-5F drivers/pci/switch/switchtec.c
+'X' all fs/xfs/xfs_fs.h, conflict!
+ fs/xfs/linux-2.6/xfs_ioctl32.h,
+ include/linux/falloc.h,
+ linux/fs.h,
+'X' all fs/ocfs2/ocfs_fs.h conflict!
+'X' 01 linux/pktcdvd.h conflict!
+'Y' all linux/cyclades.h
+'Z' 14-15 drivers/message/fusion/mptctl.h
+'[' 00-3F linux/usb/tmc.h USB Test and Measurement Devices
+ <mailto:gregkh@linuxfoundation.org>
+'a' all linux/atm*.h, linux/sonet.h ATM on linux
+ <http://lrcwww.epfl.ch/>
+'a' 00-0F drivers/crypto/qat/qat_common/adf_cfg_common.h conflict! qat driver
+'b' 00-FF conflict! bit3 vme host bridge
+ <mailto:natalia@nikhefk.nikhef.nl>
+'c' all linux/cm4000_cs.h conflict!
+'c' 00-7F linux/comstats.h conflict!
+'c' 00-7F linux/coda.h conflict!
+'c' 00-1F linux/chio.h conflict!
+'c' 80-9F arch/s390/include/asm/chsc.h conflict!
+'c' A0-AF arch/x86/include/asm/msr.h conflict!
+'d' 00-FF linux/char/drm/drm.h conflict!
+'d' 02-40 pcmcia/ds.h conflict!
+'d' F0-FF linux/digi1.h
+'e' all linux/digi1.h conflict!
+'f' 00-1F linux/ext2_fs.h conflict!
+'f' 00-1F linux/ext3_fs.h conflict!
+'f' 00-0F fs/jfs/jfs_dinode.h conflict!
+'f' 00-0F fs/ext4/ext4.h conflict!
+'f' 00-0F linux/fs.h conflict!
+'f' 00-0F fs/ocfs2/ocfs2_fs.h conflict!
+'g' 00-0F linux/usb/gadgetfs.h
+'g' 20-2F linux/usb/g_printer.h
+'h' 00-7F conflict! Charon filesystem
+ <mailto:zapman@interlan.net>
+'h' 00-1F linux/hpet.h conflict!
+'h' 80-8F fs/hfsplus/ioctl.c
+'i' 00-3F linux/i2o-dev.h conflict!
+'i' 0B-1F linux/ipmi.h conflict!
+'i' 80-8F linux/i8k.h
+'j' 00-3F linux/joystick.h
+'k' 00-0F linux/spi/spidev.h conflict!
+'k' 00-05 video/kyro.h conflict!
+'k' 10-17 linux/hsi/hsi_char.h HSI character device
+'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system
+ <http://web.archive.org/web/%2A/http://mikonos.dia.unisa.it/tcfs>
+'l' 40-7F linux/udf_fs_i.h in development:
+ <http://sourceforge.net/projects/linux-udf/>
+'m' 00-09 linux/mmtimer.h conflict!
+'m' all linux/mtio.h conflict!
+'m' all linux/soundcard.h conflict!
+'m' all linux/synclink.h conflict!
+'m' 00-19 drivers/message/fusion/mptctl.h conflict!
+'m' 00 drivers/scsi/megaraid/megaraid_ioctl.h conflict!
+'n' 00-7F linux/ncp_fs.h and fs/ncpfs/ioctl.c
+'n' 80-8F uapi/linux/nilfs2_api.h NILFS2
+'n' E0-FF linux/matroxfb.h matroxfb
+'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2
+'o' 00-03 mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps)
+'o' 40-41 mtd/ubi-user.h UBI
+'o' 01-A1 `linux/dvb/*.h` DVB
+'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this)
+'p' 00-1F linux/rtc.h conflict!
+'p' 00-3F linux/mc146818rtc.h conflict!
+'p' 40-7F linux/nvram.h
+'p' 80-9F linux/ppdev.h user-space parport
+ <mailto:tim@cyberelk.net>
+'p' A1-A5 linux/pps.h LinuxPPS
+ <mailto:giometti@linux.it>
+'q' 00-1F linux/serio.h
+'q' 80-FF linux/telephony.h Internet PhoneJACK, Internet LineJACK
+ linux/ixjuser.h <http://web.archive.org/web/%2A/http://www.quicknet.net>
+'r' 00-1F linux/msdos_fs.h and fs/fat/dir.c
+'s' all linux/cdk.h
+'t' 00-7F linux/ppp-ioctl.h
+'t' 80-8F linux/isdn_ppp.h
+'t' 90-91 linux/toshiba.h toshiba and toshiba_acpi SMM
+'u' 00-1F linux/smb_fs.h gone
+'u' 20-3F linux/uvcvideo.h USB video class host driver
+'u' 40-4f linux/udmabuf.h userspace dma-buf misc device
+'v' 00-1F linux/ext2_fs.h conflict!
+'v' 00-1F linux/fs.h conflict!
+'v' 00-0F linux/sonypi.h conflict!
+'v' 00-0F media/v4l2-subdev.h conflict!
+'v' C0-FF linux/meye.h conflict!
+'w' all CERN SCI driver
+'y' 00-1F packet based user level communications
+ <mailto:zapman@interlan.net>
+'z' 00-3F CAN bus card conflict!
+ <mailto:hdstich@connectu.ulm.circular.de>
+'z' 40-7F CAN bus card conflict!
+ <mailto:oe@port.de>
+'z' 10-4F drivers/s390/crypto/zcrypt_api.h conflict!
+'|' 00-7F linux/media.h
+0x80 00-1F linux/fb.h
+0x89 00-06 arch/x86/include/asm/sockios.h
+0x89 0B-DF linux/sockios.h
+0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range
+0x89 E0-EF linux/dn.h PROTOPRIVATE range
+0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range
+0x8B all linux/wireless.h
+0x8C 00-3F WiNRADiO driver
+ <http://www.winradio.com.au/>
+0x90 00 drivers/cdrom/sbpcd.h
+0x92 00-0F drivers/usb/mon/mon_bin.c
+0x93 60-7F linux/auto_fs.h
+0x94 all fs/btrfs/ioctl.h Btrfs filesystem
+ and linux/fs.h some lifted to vfs/generic
+0x97 00-7F fs/ceph/ioctl.h Ceph file system
+0x99 00-0F 537-Addinboard driver
+ <mailto:buk@buks.ipn.de>
+0xA0 all linux/sdp/sdp.h Industrial Device Project
+ <mailto:kenji@bitgate.com>
+0xA1 0 linux/vtpm_proxy.h TPM Emulator Proxy Driver
+0xA3 80-8F Port ACL in development:
+ <mailto:tlewis@mindspring.com>
+0xA3 90-9F linux/dtlk.h
+0xA4 00-1F uapi/linux/tee.h Generic TEE subsystem
+0xAA 00-3F linux/uapi/linux/userfaultfd.h
+0xAB 00-1F linux/nbd.h
+0xAC 00-1F linux/raw.h
+0xAD 00 Netfilter device in development:
+ <mailto:rusty@rustcorp.com.au>
+0xAE all linux/kvm.h Kernel-based Virtual Machine
+ <mailto:kvm@vger.kernel.org>
+0xAF 00-1F linux/fsl_hypervisor.h Freescale hypervisor
+0xB0 all RATIO devices in development:
+ <mailto:vgo@ratio.de>
+0xB1 00-1F PPPoX
+ <mailto:mostrows@styx.uwaterloo.ca>
+0xB3 00 linux/mmc/ioctl.h
+0xB4 00-0F linux/gpio.h <mailto:linux-gpio@vger.kernel.org>
+0xB5 00-0F uapi/linux/rpmsg.h <mailto:linux-remoteproc@vger.kernel.org>
+0xB6 all linux/fpga-dfl.h
+0xC0 00-0F linux/usb/iowarrior.h
+0xCA 00-0F uapi/misc/cxl.h
+0xCA 10-2F uapi/misc/ocxl.h
+0xCA 80-BF uapi/scsi/cxlflash_ioctl.h
+0xCB 00-1F CBM serial IEC bus in development:
+ <mailto:michael.klein@puffin.lb.shuttle.de>
+0xCC 00-0F drivers/misc/ibmvmc.h pseries VMC driver
+0xCD 01 linux/reiserfs_fs.h
+0xCF 02 fs/cifs/ioctl.c
+0xDB 00-0F drivers/char/mwave/mwavepub.h
+0xDD 00-3F ZFCP device driver see drivers/s390/scsi/
+ <mailto:aherrman@de.ibm.com>
+0xE5 00-3F linux/fuse.h
+0xEC 00-01 drivers/platform/chrome/cros_ec_dev.h ChromeOS EC driver
+0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development)
+ <mailto:thomas@winischhofer.net>
+0xF4 00-1F video/mbxfb.h mbxfb
+ <mailto:raph@8d.com>
+0xF6 all LTTng Linux Trace Toolkit Next Generation
+ <mailto:mathieu.desnoyers@efficios.com>
+0xFD all linux/dm-ioctl.h
+0xFE all linux/isst_if.h
+==== ===== ======================================================= ================================================================
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
deleted file mode 100644
index c9558146ac58..000000000000
--- a/Documentation/ioctl/ioctl-number.txt
+++ /dev/null
@@ -1,350 +0,0 @@
-Ioctl Numbers
-19 October 1999
-Michael Elizabeth Chastain
-<mec@shout.net>
-
-If you are adding new ioctl's to the kernel, you should use the _IO
-macros defined in <linux/ioctl.h>:
-
- _IO an ioctl with no parameters
- _IOW an ioctl with write parameters (copy_from_user)
- _IOR an ioctl with read parameters (copy_to_user)
- _IOWR an ioctl with both write and read parameters.
-
-'Write' and 'read' are from the user's point of view, just like the
-system calls 'write' and 'read'. For example, a SET_FOO ioctl would
-be _IOW, although the kernel would actually read data from user space;
-a GET_FOO ioctl would be _IOR, although the kernel would actually write
-data to user space.
-
-The first argument to _IO, _IOW, _IOR, or _IOWR is an identifying letter
-or number from the table below. Because of the large number of drivers,
-many drivers share a partial letter with other drivers.
-
-If you are writing a driver for a new device and need a letter, pick an
-unused block with enough room for expansion: 32 to 256 ioctl commands.
-You can register the block by patching this file and submitting the
-patch to Linus Torvalds. Or you can e-mail me at <mec@shout.net> and
-I'll register one for you.
-
-The second argument to _IO, _IOW, _IOR, or _IOWR is a sequence number
-to distinguish ioctls from each other. The third argument to _IOW,
-_IOR, or _IOWR is the type of the data going into the kernel or coming
-out of the kernel (e.g. 'int' or 'struct foo'). NOTE! Do NOT use
-sizeof(arg) as the third argument as this results in your ioctl thinking
-it passes an argument of type size_t.
-
-Some devices use their major number as the identifier; this is OK, as
-long as it is unique. Some devices are irregular and don't follow any
-convention at all.
-
-Following this convention is good because:
-
-(1) Keeping the ioctl's globally unique helps error checking:
- if a program calls an ioctl on the wrong device, it will get an
- error rather than some unexpected behaviour.
-
-(2) The 'strace' build procedure automatically finds ioctl numbers
- defined with _IO, _IOW, _IOR, or _IOWR.
-
-(3) 'strace' can decode numbers back into useful names when the
- numbers are unique.
-
-(4) People looking for ioctls can grep for them more easily when
- this convention is used to define the ioctl numbers.
-
-(5) When following the convention, the driver code can use generic
- code to copy the parameters between user and kernel space.
-
-This table lists ioctls visible from user land for Linux/x86. It contains
-most drivers up to 2.6.31, but I know I am missing some. There has been
-no attempt to list non-X86 architectures or ioctls from drivers/staging/.
-
-Code Seq#(hex) Include File Comments
-========================================================
-0x00 00-1F linux/fs.h conflict!
-0x00 00-1F scsi/scsi_ioctl.h conflict!
-0x00 00-1F linux/fb.h conflict!
-0x00 00-1F linux/wavefront.h conflict!
-0x02 all linux/fd.h
-0x03 all linux/hdreg.h
-0x04 D2-DC linux/umsdos_fs.h Dead since 2.6.11, but don't reuse these.
-0x06 all linux/lp.h
-0x09 all linux/raid/md_u.h
-0x10 00-0F drivers/char/s390/vmcp.h
-0x10 10-1F arch/s390/include/uapi/sclp_ctl.h
-0x10 20-2F arch/s390/include/uapi/asm/hypfs.h
-0x12 all linux/fs.h
- linux/blkpg.h
-0x1b all InfiniBand Subsystem <http://infiniband.sourceforge.net/>
-0x20 all drivers/cdrom/cm206.h
-0x22 all scsi/sg.h
-'!' 00-1F uapi/linux/seccomp.h
-'#' 00-3F IEEE 1394 Subsystem Block for the entire subsystem
-'$' 00-0F linux/perf_counter.h, linux/perf_event.h
-'%' 00-0F include/uapi/linux/stm.h
- System Trace Module subsystem
- <mailto:alexander.shishkin@linux.intel.com>
-'&' 00-07 drivers/firewire/nosy-user.h
-'1' 00-1F <linux/timepps.h> PPS kit from Ulrich Windl
- <ftp://ftp.de.kernel.org/pub/linux/daemons/ntp/PPS/>
-'2' 01-04 linux/i2o.h
-'3' 00-0F drivers/s390/char/raw3270.h conflict!
-'3' 00-1F linux/suspend_ioctls.h conflict!
- and kernel/power/user.c
-'8' all SNP8023 advanced NIC card
- <mailto:mcr@solidum.com>
-';' 64-7F linux/vfio.h
-'@' 00-0F linux/radeonfb.h conflict!
-'@' 00-0F drivers/video/aty/aty128fb.c conflict!
-'A' 00-1F linux/apm_bios.h conflict!
-'A' 00-0F linux/agpgart.h conflict!
- and drivers/char/agp/compat_ioctl.h
-'A' 00-7F sound/asound.h conflict!
-'B' 00-1F linux/cciss_ioctl.h conflict!
-'B' 00-0F include/linux/pmu.h conflict!
-'B' C0-FF advanced bbus
- <mailto:maassen@uni-freiburg.de>
-'C' all linux/soundcard.h conflict!
-'C' 01-2F linux/capi.h conflict!
-'C' F0-FF drivers/net/wan/cosa.h conflict!
-'D' all arch/s390/include/asm/dasd.h
-'D' 40-5F drivers/scsi/dpt/dtpi_ioctl.h
-'D' 05 drivers/scsi/pmcraid.h
-'E' all linux/input.h conflict!
-'E' 00-0F xen/evtchn.h conflict!
-'F' all linux/fb.h conflict!
-'F' 01-02 drivers/scsi/pmcraid.h conflict!
-'F' 20 drivers/video/fsl-diu-fb.h conflict!
-'F' 20 drivers/video/intelfb/intelfb.h conflict!
-'F' 20 linux/ivtvfb.h conflict!
-'F' 20 linux/matroxfb.h conflict!
-'F' 20 drivers/video/aty/atyfb_base.c conflict!
-'F' 00-0F video/da8xx-fb.h conflict!
-'F' 80-8F linux/arcfb.h conflict!
-'F' DD video/sstfb.h conflict!
-'G' 00-3F drivers/misc/sgi-gru/grulib.h conflict!
-'G' 00-0F linux/gigaset_dev.h conflict!
-'H' 00-7F linux/hiddev.h conflict!
-'H' 00-0F linux/hidraw.h conflict!
-'H' 01 linux/mei.h conflict!
-'H' 02 linux/mei.h conflict!
-'H' 03 linux/mei.h conflict!
-'H' 00-0F sound/asound.h conflict!
-'H' 20-40 sound/asound_fm.h conflict!
-'H' 80-8F sound/sfnt_info.h conflict!
-'H' 10-8F sound/emu10k1.h conflict!
-'H' 10-1F sound/sb16_csp.h conflict!
-'H' 10-1F sound/hda_hwdep.h conflict!
-'H' 40-4F sound/hdspm.h conflict!
-'H' 40-4F sound/hdsp.h conflict!
-'H' 90 sound/usb/usx2y/usb_stream.h
-'H' A0 uapi/linux/usb/cdc-wdm.h
-'H' C0-F0 net/bluetooth/hci.h conflict!
-'H' C0-DF net/bluetooth/hidp/hidp.h conflict!
-'H' C0-DF net/bluetooth/cmtp/cmtp.h conflict!
-'H' C0-DF net/bluetooth/bnep/bnep.h conflict!
-'H' F1 linux/hid-roccat.h <mailto:erazor_de@users.sourceforge.net>
-'H' F8-FA sound/firewire.h
-'I' all linux/isdn.h conflict!
-'I' 00-0F drivers/isdn/divert/isdn_divert.h conflict!
-'I' 40-4F linux/mISDNif.h conflict!
-'J' 00-1F drivers/scsi/gdth_ioctl.h
-'K' all linux/kd.h
-'L' 00-1F linux/loop.h conflict!
-'L' 10-1F drivers/scsi/mpt3sas/mpt3sas_ctl.h conflict!
-'L' 20-2F linux/lightnvm.h
-'L' E0-FF linux/ppdd.h encrypted disk device driver
- <http://linux01.gwdg.de/~alatham/ppdd.html>
-'M' all linux/soundcard.h conflict!
-'M' 01-16 mtd/mtd-abi.h conflict!
- and drivers/mtd/mtdchar.c
-'M' 01-03 drivers/scsi/megaraid/megaraid_sas.h
-'M' 00-0F drivers/video/fsl-diu-fb.h conflict!
-'N' 00-1F drivers/usb/scanner.h
-'N' 40-7F drivers/block/nvme.c
-'O' 00-06 mtd/ubi-user.h UBI
-'P' all linux/soundcard.h conflict!
-'P' 60-6F sound/sscape_ioctl.h conflict!
-'P' 00-0F drivers/usb/class/usblp.c conflict!
-'P' 01-09 drivers/misc/pci_endpoint_test.c conflict!
-'Q' all linux/soundcard.h
-'R' 00-1F linux/random.h conflict!
-'R' 01 linux/rfkill.h conflict!
-'R' C0-DF net/bluetooth/rfcomm.h
-'S' all linux/cdrom.h conflict!
-'S' 80-81 scsi/scsi_ioctl.h conflict!
-'S' 82-FF scsi/scsi.h conflict!
-'S' 00-7F sound/asequencer.h conflict!
-'T' all linux/soundcard.h conflict!
-'T' 00-AF sound/asound.h conflict!
-'T' all arch/x86/include/asm/ioctls.h conflict!
-'T' C0-DF linux/if_tun.h conflict!
-'U' all sound/asound.h conflict!
-'U' 00-CF linux/uinput.h conflict!
-'U' 00-EF linux/usbdevice_fs.h
-'U' C0-CF drivers/bluetooth/hci_uart.h
-'V' all linux/vt.h conflict!
-'V' all linux/videodev2.h conflict!
-'V' C0 linux/ivtvfb.h conflict!
-'V' C0 linux/ivtv.h conflict!
-'V' C0 media/davinci/vpfe_capture.h conflict!
-'V' C0 media/si4713.h conflict!
-'W' 00-1F linux/watchdog.h conflict!
-'W' 00-1F linux/wanrouter.h conflict! (pre 3.9)
-'W' 00-3F sound/asound.h conflict!
-'W' 40-5F drivers/pci/switch/switchtec.c
-'X' all fs/xfs/xfs_fs.h conflict!
- and fs/xfs/linux-2.6/xfs_ioctl32.h
- and include/linux/falloc.h
- and linux/fs.h
-'X' all fs/ocfs2/ocfs_fs.h conflict!
-'X' 01 linux/pktcdvd.h conflict!
-'Y' all linux/cyclades.h
-'Z' 14-15 drivers/message/fusion/mptctl.h
-'[' 00-3F linux/usb/tmc.h USB Test and Measurement Devices
- <mailto:gregkh@linuxfoundation.org>
-'a' all linux/atm*.h, linux/sonet.h ATM on linux
- <http://lrcwww.epfl.ch/>
-'a' 00-0F drivers/crypto/qat/qat_common/adf_cfg_common.h conflict! qat driver
-'b' 00-FF conflict! bit3 vme host bridge
- <mailto:natalia@nikhefk.nikhef.nl>
-'c' all linux/cm4000_cs.h conflict!
-'c' 00-7F linux/comstats.h conflict!
-'c' 00-7F linux/coda.h conflict!
-'c' 00-1F linux/chio.h conflict!
-'c' 80-9F arch/s390/include/asm/chsc.h conflict!
-'c' A0-AF arch/x86/include/asm/msr.h conflict!
-'d' 00-FF linux/char/drm/drm.h conflict!
-'d' 02-40 pcmcia/ds.h conflict!
-'d' F0-FF linux/digi1.h
-'e' all linux/digi1.h conflict!
-'f' 00-1F linux/ext2_fs.h conflict!
-'f' 00-1F linux/ext3_fs.h conflict!
-'f' 00-0F fs/jfs/jfs_dinode.h conflict!
-'f' 00-0F fs/ext4/ext4.h conflict!
-'f' 00-0F linux/fs.h conflict!
-'f' 00-0F fs/ocfs2/ocfs2_fs.h conflict!
-'g' 00-0F linux/usb/gadgetfs.h
-'g' 20-2F linux/usb/g_printer.h
-'h' 00-7F conflict! Charon filesystem
- <mailto:zapman@interlan.net>
-'h' 00-1F linux/hpet.h conflict!
-'h' 80-8F fs/hfsplus/ioctl.c
-'i' 00-3F linux/i2o-dev.h conflict!
-'i' 0B-1F linux/ipmi.h conflict!
-'i' 80-8F linux/i8k.h
-'j' 00-3F linux/joystick.h
-'k' 00-0F linux/spi/spidev.h conflict!
-'k' 00-05 video/kyro.h conflict!
-'k' 10-17 linux/hsi/hsi_char.h HSI character device
-'l' 00-3F linux/tcfs_fs.h transparent cryptographic file system
- <http://web.archive.org/web/*/http://mikonos.dia.unisa.it/tcfs>
-'l' 40-7F linux/udf_fs_i.h in development:
- <http://sourceforge.net/projects/linux-udf/>
-'m' 00-09 linux/mmtimer.h conflict!
-'m' all linux/mtio.h conflict!
-'m' all linux/soundcard.h conflict!
-'m' all linux/synclink.h conflict!
-'m' 00-19 drivers/message/fusion/mptctl.h conflict!
-'m' 00 drivers/scsi/megaraid/megaraid_ioctl.h conflict!
-'n' 00-7F linux/ncp_fs.h and fs/ncpfs/ioctl.c
-'n' 80-8F uapi/linux/nilfs2_api.h NILFS2
-'n' E0-FF linux/matroxfb.h matroxfb
-'o' 00-1F fs/ocfs2/ocfs2_fs.h OCFS2
-'o' 00-03 mtd/ubi-user.h conflict! (OCFS2 and UBI overlaps)
-'o' 40-41 mtd/ubi-user.h UBI
-'o' 01-A1 linux/dvb/*.h DVB
-'p' 00-0F linux/phantom.h conflict! (OpenHaptics needs this)
-'p' 00-1F linux/rtc.h conflict!
-'p' 00-3F linux/mc146818rtc.h conflict!
-'p' 40-7F linux/nvram.h
-'p' 80-9F linux/ppdev.h user-space parport
- <mailto:tim@cyberelk.net>
-'p' A1-A5 linux/pps.h LinuxPPS
- <mailto:giometti@linux.it>
-'q' 00-1F linux/serio.h
-'q' 80-FF linux/telephony.h Internet PhoneJACK, Internet LineJACK
- linux/ixjuser.h <http://web.archive.org/web/*/http://www.quicknet.net>
-'r' 00-1F linux/msdos_fs.h and fs/fat/dir.c
-'s' all linux/cdk.h
-'t' 00-7F linux/ppp-ioctl.h
-'t' 80-8F linux/isdn_ppp.h
-'t' 90-91 linux/toshiba.h toshiba and toshiba_acpi SMM
-'u' 00-1F linux/smb_fs.h gone
-'u' 20-3F linux/uvcvideo.h USB video class host driver
-'u' 40-4f linux/udmabuf.h userspace dma-buf misc device
-'v' 00-1F linux/ext2_fs.h conflict!
-'v' 00-1F linux/fs.h conflict!
-'v' 00-0F linux/sonypi.h conflict!
-'v' 00-0F media/v4l2-subdev.h conflict!
-'v' C0-FF linux/meye.h conflict!
-'w' all CERN SCI driver
-'y' 00-1F packet based user level communications
- <mailto:zapman@interlan.net>
-'z' 00-3F CAN bus card conflict!
- <mailto:hdstich@connectu.ulm.circular.de>
-'z' 40-7F CAN bus card conflict!
- <mailto:oe@port.de>
-'z' 10-4F drivers/s390/crypto/zcrypt_api.h conflict!
-'|' 00-7F linux/media.h
-0x80 00-1F linux/fb.h
-0x89 00-06 arch/x86/include/asm/sockios.h
-0x89 0B-DF linux/sockios.h
-0x89 E0-EF linux/sockios.h SIOCPROTOPRIVATE range
-0x89 E0-EF linux/dn.h PROTOPRIVATE range
-0x89 F0-FF linux/sockios.h SIOCDEVPRIVATE range
-0x8B all linux/wireless.h
-0x8C 00-3F WiNRADiO driver
- <http://www.winradio.com.au/>
-0x90 00 drivers/cdrom/sbpcd.h
-0x92 00-0F drivers/usb/mon/mon_bin.c
-0x93 60-7F linux/auto_fs.h
-0x94 all fs/btrfs/ioctl.h Btrfs filesystem
- and linux/fs.h some lifted to vfs/generic
-0x97 00-7F fs/ceph/ioctl.h Ceph file system
-0x99 00-0F 537-Addinboard driver
- <mailto:buk@buks.ipn.de>
-0xA0 all linux/sdp/sdp.h Industrial Device Project
- <mailto:kenji@bitgate.com>
-0xA1 0 linux/vtpm_proxy.h TPM Emulator Proxy Driver
-0xA3 80-8F Port ACL in development:
- <mailto:tlewis@mindspring.com>
-0xA3 90-9F linux/dtlk.h
-0xA4 00-1F uapi/linux/tee.h Generic TEE subsystem
-0xAA 00-3F linux/uapi/linux/userfaultfd.h
-0xAB 00-1F linux/nbd.h
-0xAC 00-1F linux/raw.h
-0xAD 00 Netfilter device in development:
- <mailto:rusty@rustcorp.com.au>
-0xAE all linux/kvm.h Kernel-based Virtual Machine
- <mailto:kvm@vger.kernel.org>
-0xAF 00-1F linux/fsl_hypervisor.h Freescale hypervisor
-0xB0 all RATIO devices in development:
- <mailto:vgo@ratio.de>
-0xB1 00-1F PPPoX <mailto:mostrows@styx.uwaterloo.ca>
-0xB3 00 linux/mmc/ioctl.h
-0xB4 00-0F linux/gpio.h <mailto:linux-gpio@vger.kernel.org>
-0xB5 00-0F uapi/linux/rpmsg.h <mailto:linux-remoteproc@vger.kernel.org>
-0xB6 all linux/fpga-dfl.h
-0xC0 00-0F linux/usb/iowarrior.h
-0xCA 00-0F uapi/misc/cxl.h
-0xCA 10-2F uapi/misc/ocxl.h
-0xCA 80-BF uapi/scsi/cxlflash_ioctl.h
-0xCB 00-1F CBM serial IEC bus in development:
- <mailto:michael.klein@puffin.lb.shuttle.de>
-0xCC 00-0F drivers/misc/ibmvmc.h pseries VMC driver
-0xCD 01 linux/reiserfs_fs.h
-0xCF 02 fs/cifs/ioctl.c
-0xDB 00-0F drivers/char/mwave/mwavepub.h
-0xDD 00-3F ZFCP device driver see drivers/s390/scsi/
- <mailto:aherrman@de.ibm.com>
-0xE5 00-3F linux/fuse.h
-0xEC 00-01 drivers/platform/chrome/cros_ec_dev.h ChromeOS EC driver
-0xF3 00-3F drivers/usb/misc/sisusbvga/sisusb.h sisfb (in development)
- <mailto:thomas@winischhofer.net>
-0xF4 00-1F video/mbxfb.h mbxfb
- <mailto:raph@8d.com>
-0xF6 all LTTng Linux Trace Toolkit Next Generation
- <mailto:mathieu.desnoyers@efficios.com>
-0xFD all linux/dm-ioctl.h
diff --git a/Documentation/isdn/HiSax.cert b/Documentation/isdn/HiSax.cert
deleted file mode 100644
index f2a6fcb8efee..000000000000
--- a/Documentation/isdn/HiSax.cert
+++ /dev/null
@@ -1,96 +0,0 @@
------BEGIN PGP SIGNED MESSAGE-----
-
-First:
-
- HiSax is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-However, if you wish to modify the HiSax sources, please note the following:
-
-HiSax has passed the ITU approval test suite with ELSA Quickstep ISDN cards
-and Eicon Technology Diva 2.01 PCI card.
-The certification is only valid for the combination of the tested software
-version and the tested hardware. Any changes to the HiSax source code may
-therefore affect the certification.
-
-Additional ITU approval tests have been carried out for all generic cards
-using Colognechip single chip solutions HFC-S PCI A for PCI cards as well
-as HFC-S USB based USB ISDN ta adapters.
-These tests included all layers 1-3 and as well all functional tests for
-the layer 1. Because all hardware based on these chips are complete ISDN
-solutions in one chip all cards and USB-TAs using these chips are to be
-regarded as approved for those tests. Some additional electrical tests
-of the layer 1 which are independent of the driver and related to a
-special hardware used will be regarded as approved if at least one
-solution has been tested including those electrical tests. So if cards
-or tas have been completely approved for any other os, the approval
-for those electrical tests is valid for linux, too.
-Please send any questions regarding this drivers or approval abouts to
-werner@isdn-development.de
-Additional information and the type approval documents will be found
-shortly on the Colognechip website www.colognechip.com
-
-If you change the main files of the HiSax ISDN stack, the certification will
-become invalid. Because in most countries it is illegal to connect
-unapproved ISDN equipment to the public network, I have to guarantee that
-changes in HiSax do not affect the certification.
-
-In order to make a valid certification apparent to the user, I have built in
-some validation checks that are made during the make process. The HiSax main
-files are protected by md5 checksums and the md5sum file is pgp signed by
-myself:
-
-KeyID 1024/FF992F6D 1997/01/16 Karsten Keil <kkeil@suse.de>
-Key fingerprint = 92 6B F7 58 EE 86 28 C8 C4 1A E6 DC 39 89 F2 AA
-
-Only if the checksums are OK, and the signature of the file
-"drivers/isdn/hisax/md5sums.asc" match, is the certification valid; a
-message confirming this is then displayed during the hisax init process.
-
-The affected files are:
-
-drivers/isdn/hisax/isac.c
-drivers/isdn/hisax/isdnl1.c
-drivers/isdn/hisax/isdnl2.c
-drivers/isdn/hisax/isdnl3.c
-drivers/isdn/hisax/tei.c
-drivers/isdn/hisax/callc.c
-drivers/isdn/hisax/l3dss1.c
-drivers/isdn/hisax/l3_1tr6.c
-drivers/isdn/hisax/cert.c
-drivers/isdn/hisax/elsa.c
-drivers/isdn/hisax/diva.c
-drivers/isdn/hisax/hfc_pci.c
-
-Please send any changes, bugfixes and patches to me rather than implementing
-them directly into the HiSax sources.
-
-This does not reduce your rights granted by the GNU General Public License.
-If you wish to change the sources, go ahead; but note that then the
-certification is invalid even if you use one of the approved cards.
-
-Here are the certification registration numbers for ELSA Quickstep cards:
-German D133361J CETECOM ICT Services GmbH 0682
-European D133362J CETECOM ICT Services GmbH 0682
-
-
-Karsten Keil
-keil@isdn4linux.de
-
------BEGIN PGP SIGNATURE-----
-Version: 2.6.3i
-Charset: noconv
-
-iQCVAwUBOFAwqTpxHvX/mS9tAQFI2QP9GLDK2iy/KBhwReE3F7LeO+tVhffTVZ3a
-20q5/z/WcIg/pnH0uTkl2UgDXBFXYl45zJyDGNpAposIFmT+Edd14o7Vj1w/BBdn
-Y+5rBmJf+gyBu61da5d6bv0lpymwRa/um+ri+ilYnZ/XPfg5JKhdjGSBCJuJAElM
-d2jFbTrsMYw=
-=LNf9
------END PGP SIGNATURE-----
diff --git a/Documentation/isdn/INTERFACE b/Documentation/isdn/INTERFACE
deleted file mode 100644
index 5df17e5b25c8..000000000000
--- a/Documentation/isdn/INTERFACE
+++ /dev/null
@@ -1,759 +0,0 @@
-$Id: INTERFACE,v 1.15.8.2 2001/03/13 16:17:07 kai Exp $
-
-Description of the Interface between Linklevel and Hardwarelevel
- of isdn4linux:
-
-
- The Communication between Linklevel (LL) and Hardwarelevel (HL)
- is based on the struct isdn_if (defined in isdnif.h).
-
- An HL-driver can register itself at LL by calling the function
- register_isdn() with a pointer to that struct. Prior to that, it has
- to preset some of the fields of isdn_if. The LL sets the rest of
- the fields. All further communication is done via callbacks using
- the function-pointers defined in isdn_if.
-
- Changes/Version numbering:
-
- During development of the ISDN subsystem, several changes have been
- made to the interface. Before it went into kernel, the package
- had a unique version number. The last version, distributed separately
- was 0.7.4. When the subsystem went into kernel, every functional unit
- got a separate version number. These numbers are shown at initialization,
- separated by slashes:
-
- c.c/t.t/n.n/p.p/a.a/v.v
-
- where
-
- c.c is the revision of the common code.
- t.t is the revision of the tty related code.
- n.n is the revision of the network related code.
- p.p is the revision of the ppp related code.
- a.a is the revision of the audio related code.
- v.v is the revision of the V.110 related code.
-
- Changes in this document are marked with '***CHANGEx' where x representing
- the version number. If that number starts with 0, it refers to the old,
- separately distributed package. If it starts with one of the letters
- above, it refers to the revision of the corresponding module.
- ***CHANGEIx refers to the revision number of the isdnif.h
-
-1. Description of the fields of isdn_if:
-
- int channels;
-
- This field has to be set by the HL-driver to the number of channels
- supported prior to calling register_isdn(). Upon return of the call,
- the LL puts an id there, which has to be used by the HL-driver when
- invoking the other callbacks.
-
- int maxbufsize;
-
- ***CHANGE0.6: New since this version.
-
- Also to be preset by the HL-driver. With this value the HL-driver
- tells the LL the maximum size of a data-packet it will accept.
-
- unsigned long features;
-
- To be preset by the HL-driver. Using this field, the HL-driver
- announces the features supported. At the moment this is limited to
- report the supported layer2 and layer3-protocols. For setting this
- field the constants ISDN_FEATURE..., declared in isdnif.h have to be
- used.
-
- ***CHANGE0.7.1: The line type (1TR6, EDSS1) has to be set.
-
- unsigned short hl_hdrlen;
-
- ***CHANGE0.7.4: New field.
-
- To be preset by the HL-driver, if it supports sk_buff's. The driver
- should put here the amount of additional space needed in sk_buff's for
- its internal purposes. Drivers not supporting sk_buff's should
- initialize this field to 0.
-
- void (*rcvcallb_skb)(int, int, struct sk_buff *)
-
- ***CHANGE0.7.4: New field.
-
- This field will be set by LL. The HL-driver delivers received data-
- packets by calling this function. Upon calling, the HL-driver must
- already have its private data pulled off the head of the sk_buff.
-
- Parameter:
- int driver-Id
- int Channel-number locally to the driver. (starting with 0)
- struct sk_buff * Pointer to sk_buff, containing received data.
-
- int (*statcallb)(isdn_ctrl*);
-
- This field will be set by LL. This function has to be called by the
- HL-driver for signaling status-changes or other events to the LL.
-
- Parameter:
- isdn_ctrl*
-
- The struct isdn_ctrl also defined in isdn_if. The exact meanings of its
- fields are described together with the descriptions of the possible
- events. Here is only a short description of the fields:
-
- driver = driver Id.
- command = event-type. (one of the constants ISDN_STAT_...)
- arg = depends on event-type.
- num = depends on event-type.
-
- Returnvalue:
- 0 on success, else -1
-
- int (*command)(isdn_ctrl*);
-
- This field has to be preset by the HL-driver. It points to a function,
- to be called by LL to perform functions like dialing, B-channel
- setup, etc. The exact meaning of the parameters is described with the
- descriptions of the possible commands.
-
- Parameter:
- isdn_ctrl*
- driver = driver-Id
- command = command to perform. (one of the constants ISDN_CMD_...)
- arg = depends on command.
- num = depends on command.
-
- Returnvalue:
- >=0 on success, else error-code (-ENODEV etc.)
-
- int (*writebuf_skb)(int, int, int, struct sk_buff *)
-
- ***CHANGE0.7.4: New field.
- ***CHANGEI.1.21: New field.
-
- This field has to be preset by the HL-driver. The given function will
- be called by the LL for delivering data to be send via B-Channel.
-
-
- Parameter:
- int driver-Id ***CHANGE0.7.4: New parameter.
- int channel-number locally to the HL-driver. (starts with 0)
- int ack ***ChangeI1.21: New parameter
- If this is !0, the driver has to signal the delivery
- by sending an ISDN_STAT_BSENT. If this is 0, the driver
- MUST NOT send an ISDN_STAT_BSENT.
- struct sk_buff * Pointer to sk_buff containing data to be send via
- B-channel.
-
- Returnvalue:
- Length of data accepted on success, else error-code (-EINVAL on
- oversized packets etc.)
-
- int (*writecmd)(u_char*, int, int, int, int);
-
- This field has to be preset by the HL-driver. The given function will be
- called to perform write-requests on /dev/isdnctrl (i.e. sending commands
- to the card) The data-format is hardware-specific. This function is
- intended for debugging only. It is not necessary for normal operation
- and never will be called by the tty-emulation- or network-code. If
- this function is not supported, the driver has to set NULL here.
-
- Parameter:
- u_char* pointer to data.
- int length of data.
- int flag: 0 = call from within kernel-space. (HL-driver must use
- memcpy, may NOT use schedule())
- 1 = call from user-space. (HL-driver must use
- memcpy_fromfs, use of schedule() allowed)
- int driver-Id.
- int channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
- Returnvalue:
- Length of data accepted on success, else error-code (-EINVAL etc.)
-
- int (*readstat)(u_char*, int, int, int, int);
-
- This field has to be preset by the HL-driver. The given function will be
- called to perform read-requests on /dev/isdnctrl (i.e. reading replies
- from the card) The data-format is hardware-specific. This function is
- intended for debugging only. It is not necessary for normal operation
- and never will be called by the tty-emulation- or network-code. If
- this function is not supported, the driver has to set NULL here.
-
- Parameter:
- u_char* pointer to data.
- int length of data.
- int flag: 0 = call from within kernel-space. (HL-driver must use
- memcpy, may NOT use schedule())
- 1 = call from user-space. (HL-driver must use
- memcpy_fromfs, use of schedule() allowed)
- int driver-Id.
- int channel-number locally to the HL-driver. (starts with 0)
-
-***CHANGEI1.14: The driver-Id and channel-number are new since this revision.
-
- Returnvalue:
- Length of data on success, else error-code (-EINVAL etc.)
-
- char id[20];
- ***CHANGE0.7: New since this version.
-
- This string has to be preset by the HL-driver. Its purpose is for
- identification of the driver by the user. Eg.: it is shown in the
- status-info of /dev/isdninfo. Furthermore it is used as Id for binding
- net-interfaces to a specific channel. If a string of length zero is
- given, upon return, isdn4linux will replace it by a generic name. (line0,
- line1 etc.) It is recommended to make this string configurable during
- module-load-time. (copy a global variable to this string.) For doing that,
- modules 1.2.8 or newer are necessary.
-
-2. Description of the commands, a HL-driver has to support:
-
- All commands will be performed by calling the function command() described
- above from within the LL. The field command of the struct-parameter will
- contain the desired command, the field driver is always set to the
- appropriate driver-Id.
-
- Until now, the following commands are defined:
-
-***CHANGEI1.34: The parameter "num" has been replaced by a union "parm" containing
- the old "num" and a new setup_type struct used for ISDN_CMD_DIAL
- and ISDN_STAT_ICALL callback.
-
- ISDN_CMD_IOCTL:
-
- This command is intended for performing ioctl-calls for configuring
- hardware or similar purposes (setting port-addresses, loading firmware
- etc.) For this purpose, in the LL all ioctl-calls with an argument
- >= IIOCDRVCTL (0x100) will be handed transparently to this
- function after subtracting 0x100 and placing the result in arg.
- Example:
- If a userlevel-program calls ioctl(0x101,...) the function gets
- called with the field command set to 1.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_IOCTL
- arg = Original ioctl-cmd - IIOCDRVCTL
- parm.num = first bytes filled with (unsigned long)arg
-
- Returnvalue:
- Depending on driver.
-
-
- ISDN_CMD_DIAL:
-
- This command is used to tell the HL-driver it should dial a given
- number.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_DIAL
- arg = channel-number locally to the driver. (starting with 0)
-
- parm.setup.phone = An ASCII-String containing the number to dial.
- parm.setup.eazmsn = An ASCII-Sting containing the own EAZ or MSN.
- parm.setup.si1 = The Service-Indicator.
- parm.setup.si2 = Additional Service-Indicator.
-
- If the Line has been designed as SPV (a special german
- feature, meaning semi-leased-line) the phone has to
- start with an "S".
- ***CHANGE0.6: In previous versions the EAZ has been given in the
- highbyte of arg.
- ***CHANGE0.7.1: New since this version: ServiceIndicator and AddInfo.
-
- ISDN_CMD_ACCEPTD:
-
- With this command, the HL-driver is told to accept a D-Channel-setup.
- (Response to an incoming call)
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_ACCEPTD
- arg = channel-number locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_CMD_ACCEPTB:
-
- With this command, the HL-driver is told to perform a B-Channel-setup.
- (after establishing D-Channel-Connection)
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_ACCEPTB
- arg = channel-number locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_CMD_HANGUP:
-
- With this command, the HL-driver is told to hangup (B-Channel if
- established first, then D-Channel). This command is also used for
- actively rejecting an incoming call.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_HANGUP
- arg = channel-number locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_CMD_CLREAZ:
-
- With this command, the HL-driver is told not to signal incoming
- calls to the LL.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_CLREAZ
- arg = channel-number locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_CMD_SETEAZ:
-
- With this command, the HL-driver is told to signal incoming calls for
- the given EAZs/MSNs to the LL.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_SETEAZ
- arg = channel-number locally to the driver. (starting with 0)
- parm.num = ASCII-String, containing the desired EAZ's/MSN's
- (comma-separated). If an empty String is given, the
- HL-driver should respond to ALL incoming calls,
- regardless of the destination-address.
- ***CHANGE0.6: New since this version the "empty-string"-feature.
-
- ISDN_CMD_GETEAZ: (currently unused)
-
- With this command, the HL-driver is told to report the current setting
- given with ISDN_CMD_SETEAZ.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_GETEAZ
- arg = channel-number locally to the driver. (starting with 0)
- parm.num = ASCII-String, containing the current EAZ's/MSN's
-
- ISDN_CMD_SETSIL: (currently unused)
-
- With this command, the HL-driver is told to signal only incoming
- calls with the given Service-Indicators.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_SETSIL
- arg = channel-number locally to the driver. (starting with 0)
- parm.num = ASCII-String, containing the desired Service-Indicators.
-
- ISDN_CMD_GETSIL: (currently unused)
-
- With this command, the HL-driver is told to return the current
- Service-Indicators it will respond to.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_SETSIL
- arg = channel-number locally to the driver. (starting with 0)
- parm.num = ASCII-String, containing the current Service-Indicators.
-
- ISDN_CMD_SETL2:
-
- With this command, the HL-driver is told to select the given Layer-2-
- protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
- ISDN_CMD_ACCEPTD.
-
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_SETL2
- arg = channel-number locally to the driver. (starting with 0)
- logical or'ed with (protocol-Id << 8)
- protocol-Id is one of the constants ISDN_PROTO_L2...
- parm = unused.
-
- ISDN_CMD_GETL2: (currently unused)
-
- With this command, the HL-driver is told to return the current
- setting of the Layer-2-protocol.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_GETL2
- arg = channel-number locally to the driver. (starting with 0)
- parm = unused.
- Returnvalue:
- current protocol-Id (one of the constants ISDN_L2_PROTO)
-
- ISDN_CMD_SETL3:
-
- With this command, the HL-driver is told to select the given Layer-3-
- protocol. This command is issued by the LL prior to ISDN_CMD_DIAL or
- ISDN_CMD_ACCEPTD.
-
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_SETL3
- arg = channel-number locally to the driver. (starting with 0)
- logical or'ed with (protocol-Id << 8)
- protocol-Id is one of the constants ISDN_PROTO_L3...
- parm.fax = Pointer to T30_s fax struct. (fax usage only)
-
- ISDN_CMD_GETL2: (currently unused)
-
- With this command, the HL-driver is told to return the current
- setting of the Layer-3-protocol.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_GETL3
- arg = channel-number locally to the driver. (starting with 0)
- parm = unused.
- Returnvalue:
- current protocol-Id (one of the constants ISDN_L3_PROTO)
-
- ISDN_CMD_PROCEED:
-
- With this command, the HL-driver is told to proceed with a incoming call.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_PROCEED
- arg = channel-number locally to the driver. (starting with 0)
- setup.eazmsn= empty string or string send as uus1 in DSS1 with
- PROCEED message
-
- ISDN_CMD_ALERT:
-
- With this command, the HL-driver is told to alert a proceeding call.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_ALERT
- arg = channel-number locally to the driver. (starting with 0)
- setup.eazmsn= empty string or string send as uus1 in DSS1 with
- ALERT message
-
- ISDN_CMD_REDIR:
-
- With this command, the HL-driver is told to redirect a call in proceeding
- or alerting state.
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_REDIR
- arg = channel-number locally to the driver. (starting with 0)
- setup.eazmsn= empty string or string send as uus1 in DSS1 protocol
- setup.screen= screening indicator
- setup.phone = redirected to party number
-
- ISDN_CMD_PROT_IO:
-
- With this call, the LL-driver invokes protocol specific features through
- the LL.
- The call is not implicitely bound to a connection.
-
- Parameter:
- driver = driver-Id
- command = ISDN_CMD_PROT_IO
- arg = The lower 8 Bits define the addressed protocol as defined
- in ISDN_PTYPE..., the upper bits are used to differentiate
- the protocol specific CMD.
-
- para = protocol and function specific. See isdnif.h for detail.
-
-
- ISDN_CMD_FAXCMD:
-
- With this command the HL-driver receives a fax sub-command.
- For details refer to INTERFACE.fax
-
- Parameter:
- driver = driver-Id.
- command = ISDN_CMD_FAXCMD
- arg = channel-number locally to the driver. (starting with 0)
- parm = unused.
-
-
-3. Description of the events to be signaled by the HL-driver to the LL.
-
- All status-changes are signaled via calling the previously described
- function statcallb(). The field command of the struct isdn_cmd has
- to be set by the HL-driver with the appropriate Status-Id (event-number).
- The field arg has to be set to the channel-number (locally to the driver,
- starting with 0) to which this event applies. (Exception: STAVAIL-event)
-
- Until now, the following Status-Ids are defined:
-
- ISDN_STAT_AVAIL:
-
- With this call, the HL-driver signals the availability of new data
- for readstat(). Used only for debugging-purposes, see description
- of readstat().
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_STAVAIL
- arg = length of available data.
- parm = unused.
-
- ISDN_STAT_ICALL:
- ISDN_STAT_ICALLW:
-
- With this call, the HL-driver signals an incoming call to the LL.
- If ICALLW is signalled the incoming call is a waiting call without
- a available B-chan.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_ICALL
- arg = channel-number, locally to the driver. (starting with 0)
- para.setup.phone = Callernumber.
- para.setup.eazmsn = CalledNumber.
- para.setup.si1 = Service Indicator.
- para.setup.si2 = Additional Service Indicator.
- para.setup.plan = octet 3 from Calling party number Information Element.
- para.setup.screen = octet 3a from Calling party number Information Element.
-
- Return:
- 0 = No device matching this call.
- 1 = At least one device matching this call (RING on ttyI).
- HL-driver may send ALERTING on the D-channel in this case.
- 2 = Call will be rejected.
- 3 = Incoming called party number is currently incomplete.
- Additional digits are required.
- Used for signalling with PtP connections.
- 4 = Call will be held in a proceeding state
- (HL driver sends PROCEEDING)
- Used when a user space prog needs time to interpret a call
- para.setup.eazmsn may be filled with an uus1 message of
- 30 octets maximum. Empty string if no uus.
- 5 = Call will be actively deflected to another party
- Only available in DSS1/EURO protocol
- para.setup.phone must be set to destination party number
- para.setup.eazmsn may be filled with an uus1 message of
- 30 octets maximum. Empty string if no uus.
- -1 = An error happened. (Invalid parameters for example.)
- The keypad support now is included in the dial command.
-
-
- ISDN_STAT_RUN:
-
- With this call, the HL-driver signals availability of the ISDN-card.
- (after initializing, loading firmware)
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_RUN
- arg = unused.
- parm = unused.
-
- ISDN_STAT_STOP:
-
- With this call, the HL-driver signals unavailability of the ISDN-card.
- (before unloading, while resetting/reconfiguring the card)
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_STOP
- arg = unused.
- parm = unused.
-
- ISDN_STAT_DCONN:
-
- With this call, the HL-driver signals the successful establishment of
- a D-Channel-connection. (Response to ISDN_CMD_ACCEPTD or ISDN_CMD_DIAL)
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_DCONN
- arg = channel-number, locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_STAT_BCONN:
-
- With this call, the HL-driver signals the successful establishment of
- a B-Channel-connection. (Response to ISDN_CMD_ACCEPTB or because the
- remote-station has initiated establishment)
-
- The HL driver should call this when the logical l2/l3 protocol
- connection on top of the physical B-channel is established.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_BCONN
- arg = channel-number, locally to the driver. (starting with 0)
- parm.num = ASCII-String, containing type of connection (for analog
- modem only). This will be appended to the CONNECT message
- e.g. 14400/V.32bis
-
- ISDN_STAT_DHUP:
-
- With this call, the HL-driver signals the shutdown of a
- D-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
- or caused by a remote-hangup or if the remote-station has actively
- rejected a call.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_DHUP
- arg = channel-number, locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_STAT_BHUP:
-
- With this call, the HL-driver signals the shutdown of a
- B-Channel-connection. This could be a response to a prior ISDN_CMD_HANGUP,
- or caused by a remote-hangup.
-
- The HL driver should call this as soon as the logical l2/l3 protocol
- connection on top of the physical B-channel is released.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_BHUP
- arg = channel-number, locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_STAT_CINF:
-
- With this call, the HL-driver delivers charge-unit information to the
- LL.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_CINF
- arg = channel-number, locally to the driver. (starting with 0)
- parm.num = ASCII string containing charge-units (digits only).
-
- ISDN_STAT_LOAD: (currently unused)
-
- ISDN_STAT_UNLOAD:
-
- With this call, the HL-driver signals that it will be unloaded now. This
- tells the LL to release all corresponding data-structures.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_UNLOAD
- arg = unused.
- parm = unused.
-
- ISDN_STAT_BSENT:
-
- With this call the HL-driver signals the delivery of a data-packet.
- This callback is used by the network-interfaces only, tty-Emulation
- does not need this call.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_BSENT
- arg = channel-number, locally to the driver. (starting with 0)
- parm.length = ***CHANGEI.1.21: New field.
- the driver has to set this to the original length
- of the skb at the time of receiving it from the linklevel.
-
- ISDN_STAT_NODCH:
-
- With this call, the driver has to respond to a prior ISDN_CMD_DIAL, if
- no D-Channel is available.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_NODCH
- arg = channel-number, locally to the driver. (starting with 0)
- parm = unused.
-
- ISDN_STAT_ADDCH:
-
- This call is for HL-drivers, which are unable to check card-type
- or numbers of supported channels before they have loaded any firmware
- using ioctl. Those HL-driver simply set the channel-parameter to a
- minimum channel-number when registering, and later if they know
- the real amount, perform this call, allocating additional channels.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_ADDCH
- arg = number of channels to be added.
- parm = unused.
-
- ISDN_STAT_CAUSE:
-
- With this call, the HL-driver delivers CAUSE-messages to the LL.
- Currently the LL does not use this messages. Their contents is simply
- logged via kernel-messages. Therefore, currently the format of the
- messages is completely free. However they should be printable.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_NODCH
- arg = channel-number, locally to the driver. (starting with 0)
- parm.num = ASCII string containing CAUSE-message.
-
- ISDN_STAT_DISPLAY:
-
- With this call, the HL-driver delivers DISPLAY-messages to the LL.
- Currently the LL does not use this messages.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_DISPLAY
- arg = channel-number, locally to the driver. (starting with 0)
- para.display= string containing DISPLAY-message.
-
- ISDN_STAT_PROT:
-
- With this call, the HL-driver delivers protocol specific infos to the LL.
- The call is not implicitely bound to a connection.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_PROT
- arg = The lower 8 Bits define the addressed protocol as defined
- in ISDN_PTYPE..., the upper bits are used to differentiate
- the protocol specific STAT.
-
- para = protocol and function specific. See isdnif.h for detail.
-
- ISDN_STAT_DISCH:
-
- With this call, the HL-driver signals the LL to disable or enable the
- use of supplied channel and driver.
- The call may be used to reduce the available number of B-channels after
- loading the driver. The LL has to ignore a disabled channel when searching
- for free channels. The HL driver itself never delivers STAT callbacks for
- disabled channels.
- The LL returns a nonzero code if the operation was not successful or the
- selected channel is actually regarded as busy.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_DISCH
- arg = channel-number, locally to the driver. (starting with 0)
- parm.num[0] = 0 if channel shall be disabled, else enabled.
-
- ISDN_STAT_L1ERR:
-
- ***CHANGEI1.21 new status message.
- A signal can be sent to the linklevel if an Layer1-error results in
- packet-loss on receive or send. The field errcode of the cmd.parm
- union describes the error more precisely.
-
- Parameter:
- driver = driver-Id
- command = ISDN_STAT_L1ERR
- arg = channel-number, locally to the driver. (starting with 0)
- parm.errcode= ISDN_STAT_L1ERR_SEND: Packet lost while sending.
- ISDN_STAT_L1ERR_RECV: Packet lost while receiving.
- ISDN_STAT_FAXIND:
-
- With this call the HL-driver signals a fax sub-command to the LL.
- For details refer to INTERFACE.fax
-
- Parameter:
- driver = driver-Id.
- command = ISDN_STAT_FAXIND
- arg = channel-number, locally to the driver. (starting with 0)
- parm = unused.
-
diff --git a/Documentation/isdn/INTERFACE.fax b/Documentation/isdn/INTERFACE.fax
deleted file mode 100644
index 9c8c6d914ec7..000000000000
--- a/Documentation/isdn/INTERFACE.fax
+++ /dev/null
@@ -1,163 +0,0 @@
-$Id: INTERFACE.fax,v 1.2 2000/08/06 09:22:50 armin Exp $
-
-
-Description of the fax-subinterface between linklevel and hardwarelevel of
- isdn4linux.
-
- The communication between linklevel (LL) and hardwarelevel (HL) for fax
- is based on the struct T30_s (defined in isdnif.h).
- This struct is allocated in the LL.
- In order to use fax, the LL provides the pointer to this struct with the
- command ISDN_CMD_SETL3 (parm.fax). This pointer expires in case of hangup
- and when a new channel to a new connection is assigned.
-
-
-Data handling:
- In send-mode the HL-driver has to handle the <DLE> codes and the bit-order
- conversion by itself.
- In receive-mode the LL-driver takes care of the bit-order conversion
- (specified by +FBOR)
-
-Structure T30_s description:
-
- This structure stores the values (set by AT-commands), the remote-
- capability-values and the command-codes between LL and HL.
-
- If the HL-driver receives ISDN_CMD_FAXCMD, all needed information
- is in this struct set by the LL.
- To signal information to the LL, the HL-driver has to set the
- parameters and use ISDN_STAT_FAXIND.
- (Please refer to INTERFACE)
-
-Structure T30_s:
-
- All members are 8-bit unsigned (__u8)
-
- - resolution
- - rate
- - width
- - length
- - compression
- - ecm
- - binary
- - scantime
- - id[]
- Local faxmachine's parameters, set by +FDIS, +FDCS, +FLID, ...
-
- - r_resolution
- - r_rate
- - r_width
- - r_length
- - r_compression
- - r_ecm
- - r_binary
- - r_scantime
- - r_id[]
- Remote faxmachine's parameters. To be set by HL-driver.
-
- - phase
- Defines the actual state of fax connection. Set by HL or LL
- depending on progress and type of connection.
- If the phase changes because of an AT command, the LL driver
- changes this value. Otherwise the HL-driver takes care of it, but
- only necessary on call establishment (from IDLE to PHASE_A).
- (one of the constants ISDN_FAX_PHASE_[IDLE,A,B,C,D,E])
-
- - direction
- Defines outgoing/send or incoming/receive connection.
- (ISDN_TTY_FAX_CONN_[IN,OUT])
-
- - code
- Commands from LL to HL; possible constants :
- ISDN_TTY_FAX_DR signals +FDR command to HL
-
- ISDN_TTY_FAX_DT signals +FDT command to HL
-
- ISDN_TTY_FAX_ET signals +FET command to HL
-
-
- Other than that the "code" is set with the hangup-code value at
- the end of connection for the +FHNG message.
-
- - r_code
- Commands from HL to LL; possible constants :
- ISDN_TTY_FAX_CFR output of +FCFR message.
-
- ISDN_TTY_FAX_RID output of remote ID set in r_id[]
- (+FCSI/+FTSI on send/receive)
-
- ISDN_TTY_FAX_DCS output of +FDCS and CONNECT message,
- switching to phase C.
-
- ISDN_TTY_FAX_ET signals end of data,
- switching to phase D.
-
- ISDN_TTY_FAX_FCON signals the established, outgoing connection,
- switching to phase B.
-
- ISDN_TTY_FAX_FCON_I signals the established, incoming connection,
- switching to phase B.
-
- ISDN_TTY_FAX_DIS output of +FDIS message and values.
-
- ISDN_TTY_FAX_SENT signals that all data has been sent
- and <DLE><ETX> is acknowledged,
- OK message will be sent.
-
- ISDN_TTY_FAX_PTS signals a msg-confirmation (page sent successful),
- depending on fet value:
- 0: output OK message (more pages follow)
- 1: switching to phase B (next document)
-
- ISDN_TTY_FAX_TRAIN_OK output of +FDCS and OK message (for receive mode).
-
- ISDN_TTY_FAX_EOP signals end of data in receive mode,
- switching to phase D.
-
- ISDN_TTY_FAX_HNG output of the +FHNG and value set by code and
- OK message, switching to phase E.
-
-
- - badlin
- Value of +FBADLIN
-
- - badmul
- Value of +FBADMUL
-
- - bor
- Value of +FBOR
-
- - fet
- Value of +FET command in send-mode.
- Set by HL in receive-mode for +FET message.
-
- - pollid[]
- ID-string, set by +FCIG
-
- - cq
- Value of +FCQ
-
- - cr
- Value of +FCR
-
- - ctcrty
- Value of +FCTCRTY
-
- - minsp
- Value of +FMINSP
-
- - phcto
- Value of +FPHCTO
-
- - rel
- Value of +FREL
-
- - nbc
- Value of +FNBC (0,1)
- (+FNBC is not a known class 2 fax command, I added this to change the
- automatic "best capabilities" connection in the eicon HL-driver)
-
-
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README b/Documentation/isdn/README
deleted file mode 100644
index 74bd2bdb455b..000000000000
--- a/Documentation/isdn/README
+++ /dev/null
@@ -1,599 +0,0 @@
-README for the ISDN-subsystem
-
-1. Preface
-
- 1.1 Introduction
-
- This README describes how to set up and how to use the different parts
- of the ISDN-subsystem.
-
- For using the ISDN-subsystem, some additional userlevel programs are
- necessary. Those programs and some contributed utilities are available
- at
-
- ftp.isdn4linux.de
-
- /pub/isdn4linux/isdn4k-utils-<VersionNumber>.tar.gz
-
-
- We also have set up a mailing-list:
-
- The isdn4linux-project originates in Germany, and therefore by historical
- reasons, the mailing-list's primary language is german. However mails
- written in english have been welcome all the time.
-
- to subscribe: write a email to majordomo@listserv.isdn4linux.de,
- Subject irrelevant, in the message body:
- subscribe isdn4linux <your_email_address>
-
- To write to the mailing-list, write to isdn4linux@listserv.isdn4linux.de
-
- This mailinglist is bidirectionally gated to the newsgroup
-
- de.alt.comm.isdn4linux
-
- There is also a well maintained FAQ in English available at
- https://www.mhessler.de/i4lfaq/
- It can be viewed online, or downloaded in sgml/text/html format.
- The FAQ can also be viewed online at
- https://www.isdn4linux.de/faq/i4lfaq.html
- or downloaded from
- ftp://ftp.isdn4linux.de/pub/isdn4linux/FAQ/
-
- 1.1 Technical details
-
- In the following Text, the terms MSN and EAZ are used.
-
- MSN is the abbreviation for (M)ultiple(S)ubscriber(N)umber, and applies
- to Euro(EDSS1)-type lines. Usually it is simply the phone number.
-
- EAZ is the abbreviation of (E)ndgeraete(A)uswahl(Z)iffer and
- applies to German 1TR6-type lines. This is a one-digit string,
- simply appended to the base phone number
-
- The internal handling is nearly identical, so replace the appropriate
- term to that one, which applies to your local ISDN-environment.
-
- When the link-level-module isdn.o is loaded, it supports up to 16
- low-level-modules with up to 64 channels. (The number 64 is arbitrarily
- chosen and can be configured at compile-time --ISDN_MAX in isdn.h).
- A low-level-driver can register itself through an interface (which is
- defined in isdnif.h) and gets assigned a slot.
- The following char-devices are made available for each channel:
-
- A raw-control-device with the following functions:
- write: raw D-channel-messages (format: depends on driver).
- read: raw D-channel-messages (format: depends on driver).
- ioctl: depends on driver, i.e. for the ICN-driver, the base-address of
- the ports and the shared memory on the card can be set and read
- also the boot-code and the protocol software can be loaded into
- the card.
-
- O N L Y !!! for debugging (no locking against other devices):
- One raw-data-device with the following functions:
- write: data to B-channel.
- read: data from B-channel.
-
- In addition the following devices are made available:
-
- 128 tty-devices (64 cuix and 64 ttyIx) with integrated modem-emulator:
- The functionality is almost the same as that of a serial device
- (the line-discs are handled by the kernel), which lets you run
- SLIP, CSLIP and asynchronous PPP through the devices. We have tested
- Seyon, minicom, CSLIP (uri-dip) PPP, mgetty, XCept and Hylafax.
-
- The modem-emulation supports the following:
- 1.3.1 Commands:
-
- ATA Answer incoming call.
- ATD<No.> Dial, the number may contain:
- [0-9] and [,#.*WPT-S]
- the latter are ignored until 'S'.
- The 'S' must precede the number, if
- the line is a SPV (German 1TR6).
- ATE0 Echo off.
- ATE1 Echo on (default).
- ATH Hang-up.
- ATH1 Off hook (ignored).
- ATH0 Hang-up.
- ATI Return "ISDN for Linux...".
- ATI0 "
- ATI1 "
- ATI2 Report of last connection.
- ATO On line (data mode).
- ATQ0 Enable result codes (default).
- ATQ1 Disable result codes (default).
- ATSx=y Set register x to y.
- ATSx? Show contents of register x.
- ATV0 Numeric responses.
- ATV1 English responses (default).
- ATZ Load registers and EAZ/MSN from Profile.
- AT&Bx Set Send-Packet-size to x (max. 4000)
- The real packet-size may be limited by the
- low-level-driver used. e.g. the HiSax-Module-
- limit is 2000. You will get NO Error-Message,
- if you set it to higher values, because at the
- time of giving this command the corresponding
- driver may not be selected (see "Automatic
- Assignment") however the size of outgoing packets
- will be limited correctly.
- AT&D0 Ignore DTR
- AT&D2 DTR-low-edge: Hang up and return to
- command mode (default).
- AT&D3 Same as AT&D2 but also resets all registers.
- AT&Ex Set the EAZ/MSN for this channel to x.
- AT&F Reset all registers and profile to "factory-defaults"
- AT&Lx Set list of phone numbers to listen on. x is a
- list of wildcard patterns separated by semicolon.
- If this is set, it has precedence over the MSN set
- by AT&E.
- AT&Rx Select V.110 bitrate adaption.
- This command enables V.110 protocol with 9600 baud
- (x=9600), 19200 baud (x=19200) or 38400 baud
- (x=38400). A value of x=0 disables V.110 switching
- back to default X.75. This command sets the following
- Registers:
- Reg 14 (Layer-2 protocol):
- x = 0: 0
- x = 9600: 7
- x = 19200: 8
- x = 38400: 9
- Reg 18.2 = 1
- Reg 19 (Additional Service Indicator):
- x = 0: 0
- x = 9600: 197
- x = 19200: 199
- x = 38400: 198
- Note on value in Reg 19:
- There is _NO_ common convention for 38400 baud.
- The value 198 is chosen arbitrarily. Users
- _MUST_ negotiate this value before establishing
- a connection.
- AT&Sx Set window-size (x = 1..8) (not yet implemented)
- AT&V Show all settings.
- AT&W0 Write registers and EAZ/MSN to profile. See also
- iprofd (5.c in this README).
- AT&X0 BTX-mode and T.70-mode off (default)
- AT&X1 BTX-mode on. (S13.1=1, S13.5=0 S14=0, S16=7, S18=7, S19=0)
- AT&X2 T.70-mode on. (S13.1=1, S13.5=1, S14=0, S16=7, S18=7, S19=0)
- AT+Rx Resume a suspended call with CallID x (x = 1,2,3...)
- AT+Sx Suspend a call with CallID x (x = 1,2,3...)
-
- For voice-mode commands refer to README.audio
-
- 1.3.2 Escape sequence:
- During a connection, the emulation reacts just like
- a normal modem to the escape sequence <DELAY>+++<DELAY>.
- (The escape character - default '+' - can be set in the
- register 2).
- The DELAY must at least be 1.5 seconds long and delay
- between the escape characters must not exceed 0.5 seconds.
-
- 1.3.3 Registers:
-
- Nr. Default Description
- 0 0 Answer on ring number.
- (no auto-answer if S0=0).
- 1 0 Count of rings.
- 2 43 Escape character.
- (a value >= 128 disables the escape sequence).
- 3 13 Carriage return character (ASCII).
- 4 10 Line feed character (ASCII).
- 5 8 Backspace character (ASCII).
- 6 3 Delay in seconds before dialing.
- 7 60 Wait for carrier.
- 8 2 Pause time for comma (ignored)
- 9 6 Carrier detect time (ignored)
- 10 7 Carrier loss to disconnect time (ignored).
- 11 70 Touch tone timing (ignored).
- 12 69 Bit coded register:
- Bit 0: 0 = Suppress response messages.
- 1 = Show response messages.
- Bit 1: 0 = English response messages.
- 1 = Numeric response messages.
- Bit 2: 0 = Echo off.
- 1 = Echo on.
- Bit 3 0 = DCD always on.
- 1 = DCD follows carrier.
- Bit 4 0 = CTS follows RTS
- 1 = Ignore RTS, CTS always on.
- Bit 5 0 = return to command mode on DTR low.
- 1 = Same as 0 but also resets all
- registers.
- See also register 13, bit 2
- Bit 6 0 = DSR always on.
- 1 = DSR only on if channel is available.
- Bit 7 0 = Cisco-PPP-flag-hack off (default).
- 1 = Cisco-PPP-flag-hack on.
- 13 0 Bit coded register:
- Bit 0: 0 = Use delayed tty-send-algorithm
- 1 = Direct tty-send.
- Bit 1: 0 = T.70 protocol (Only for BTX!) off
- 1 = T.70 protocol (Only for BTX!) on
- Bit 2: 0 = Don't hangup on DTR low.
- 1 = Hangup on DTR low.
- Bit 3: 0 = Standard response messages
- 1 = Extended response messages
- Bit 4: 0 = CALLER NUMBER before every RING.
- 1 = CALLER NUMBER after first RING.
- Bit 5: 0 = T.70 extended protocol off
- 1 = T.70 extended protocol on
- Bit 6: 0 = Special RUNG Message off
- 1 = Special RUNG Message on
- "RUNG" is delivered on a ttyI, if
- an incoming call happened (RING) and
- the remote party hung up before any
- local ATA was given.
- Bit 7: 0 = Don't show display messages from net
- 1 = Show display messages from net
- (S12 Bit 1 must be 0 too)
- 14 0 Layer-2 protocol:
- 0 = X75/LAPB with I-frames
- 1 = X75/LAPB with UI-frames
- 2 = X75/LAPB with BUI-frames
- 3 = HDLC
- 4 = Transparent (audio)
- 7 = V.110, 9600 baud
- 8 = V.110, 19200 baud
- 9 = V.110, 38400 baud
- 10 = Analog Modem (only if hardware supports this)
- 11 = Fax G3 (only if hardware supports this)
- 15 0 Layer-3 protocol:
- 0 = transparent
- 1 = transparent with audio features (e.g. DSP)
- 2 = Fax G3 Class 2 commands (S14 has to be set to 11)
- 3 = Fax G3 Class 1 commands (S14 has to be set to 11)
- 16 250 Send-Packet-size/16
- 17 8 Window-size (not yet implemented)
- 18 4 Bit coded register, Service-Octet-1 to accept,
- or to be used on dialout:
- Bit 0: Service 1 (audio) when set.
- Bit 1: Service 5 (BTX) when set.
- Bit 2: Service 7 (data) when set.
- Note: It is possible to set more than one
- bit. In this case, on incoming calls
- the selected services are accepted,
- and if the service is "audio", the
- Layer-2-protocol is automatically
- changed to 4 regardless of the setting
- of register 14. On outgoing calls,
- the most significant 1-bit is chosen to
- select the outgoing service octet.
- 19 0 Service-Octet-2
- 20 0 Bit coded register (readonly)
- Service-Octet-1 of last call.
- Bit mapping is the same as register 18
- 21 0 Bit coded register (readonly)
- Set on incoming call (during RING) to
- octet 3 of calling party number IE (Numbering plan)
- See section 4.5.10 of ITU Q.931
- 22 0 Bit coded register (readonly)
- Set on incoming call (during RING) to
- octet 3a of calling party number IE (Screening info)
- See section 4.5.10 of ITU Q.931
- 23 0 Bit coded register:
- Bit 0: 0 = Add CPN to RING message off
- 1 = Add CPN to RING message on
- Bit 1: 0 = Add CPN to FCON message off
- 1 = Add CPN to FCON message on
- Bit 2: 0 = Add CDN to RING/FCON message off
- 1 = Add CDN to RING/FCON message on
-
- Last but not least a (at the moment fairly primitive) device to request
- the line-status (/dev/isdninfo) is made available.
-
- Automatic assignment of devices to lines:
-
- All inactive physical lines are listening to all EAZs for incoming
- calls and are NOT assigned to a specific tty or network interface.
- When an incoming call is detected, the driver looks first for a network
- interface and then for an opened tty which:
-
- 1. is configured for the same EAZ.
- 2. has the same protocol settings for the B-channel.
- 3. (only for network interfaces if the security flag is set)
- contains the caller number in its access list.
- 4. Either the channel is not bound exclusively to another Net-interface, or
- it is bound AND the other checks apply to exactly this interface.
- (For usage of the bind-features, refer to the isdnctrl-man-page)
-
- Only when a matching interface or tty is found is the call accepted
- and the "connection" between the low-level-layer and the link-level-layer
- is established and kept until the end of the connection.
- In all other cases no connection is established. Isdn4linux can be
- configured to either do NOTHING in this case (which is useful, if
- other, external devices with the same EAZ/MSN are connected to the bus)
- or to reject the call actively. (isdnctrl busreject ...)
-
- For an outgoing call, the inactive physical lines are searched.
- The call is placed on the first physical line, which supports the
- requested protocols for the B-channel. If a net-interface, however
- is pre-bound to a channel, this channel is used directly.
-
- This makes it possible to configure several network interfaces and ttys
- for one EAZ, if the network interfaces are set to secure operation.
- If an incoming call matches one network interface, it gets connected to it.
- If another incoming call for the same EAZ arrives, which does not match
- a network interface, the first tty gets a "RING" and so on.
-
-2 System prerequisites:
-
- ATTENTION!
-
- Always use the latest module utilities. The current version is
- named in Documentation/Changes. Some old versions of insmod
- are not capable of setting the driver-Ids correctly.
-
-3. Lowlevel-driver configuration.
-
- Configuration depends on how the drivers are built. See the
- README.<yourDriver> for information on driver-specific setup.
-
-4. Device-inodes
-
- The major and minor numbers and their names are described in
- Documentation/admin-guide/devices.rst. The major numbers are:
-
- 43 for the ISDN-tty's.
- 44 for the ISDN-callout-tty's.
- 45 for control/info/debug devices.
-
-5. Application
-
- a) For some card-types, firmware has to be loaded into the cards, before
- proceeding with device-independent setup. See README.<yourDriver>
- for how to do that.
-
- b) If you only intend to use ttys, you are nearly ready now.
-
- c) If you want to have really permanent "Modem"-settings on disk, you
- can start the daemon iprofd. Give it a path to a file at the command-
- line. It will store the profile-settings in this file every time
- an AT&W0 is performed on any ISDN-tty. If the file already exists,
- all profiles are initialized from this file. If you want to unload
- any of the modules, kill iprofd first.
-
- d) For networking, continue: Create an interface:
- isdnctrl addif isdn0
-
- e) Set the EAZ (or MSN for Euro-ISDN):
- isdnctrl eaz isdn0 2
-
- (For 1TR6 a single digit is allowed, for Euro-ISDN the number is your
- real MSN e.g.: Phone-Number)
-
- f) Set the number for outgoing calls on the interface:
- isdnctrl addphone isdn0 out 1234567
- ... (this can be executed more than once, all assigned numbers are
- tried in order)
- and the number(s) for incoming calls:
- isdnctrl addphone isdn0 in 1234567
-
- g) Set the timeout for hang-up:
- isdnctrl huptimeout isdn0 <timeout_in_seconds>
-
- h) additionally you may activate charge-hang-up (= Hang up before
- next charge-info, this only works, if your isdn-provider transmits
- the charge-info during and after the connection):
- isdnctrl chargehup isdn0 on
-
- i) Set the dial mode of the interface:
- isdnctrl dialmode isdn0 auto
- "off" means that you (or the system) cannot make any connection
- (neither incoming or outgoing connections are possible). Use
- this if you want to be sure that no connections will be made.
- "auto" means that the interface is in auto-dial mode, and will
- attempt to make a connection whenever a network data packet needs
- the interface's link. Note that this can cause unexpected dialouts,
- and lead to a high phone bill! Some daemons or other pc's that use
- this interface can cause this.
- Incoming connections are also possible.
- "manual" is a dial mode created to prevent the unexpected dialouts.
- In this mode, the interface will never make any connections on its
- own. You must explicitly initiate a connection with "isdnctrl dial
- isdn0". However, after an idle time of no traffic as configured for
- the huptimeout value with isdnctrl, the connection _will_ be ended.
- If you don't want any automatic hangup, set the huptimeout value to 0.
- "manual" is the default.
-
- j) Setup the interface with ifconfig as usual, and set a route to it.
-
- k) (optional) If you run X11 and have Tcl/Tk-wish version 4.0, you can use
- the script tools/tcltk/isdnmon. You can add actions for line-status
- changes. See the comments at the beginning of the script for how to
- do that. There are other tty-based tools in the tools-subdirectory
- contributed by Michael Knigge (imon), Volker Götz (imontty) and
- Andreas Kool (isdnmon).
-
- l) For initial testing, you can set the verbose-level to 2 (default: 0).
- Then all incoming calls are logged, even if they are not addressed
- to one of the configured net-interfaces:
- isdnctrl verbose 2
-
- Now you are ready! A ping to the set address should now result in an
- automatic dial-out (look at syslog kernel-messages).
- The phone numbers and EAZs can be assigned at any time with isdnctrl.
- You can add as many interfaces as you like with addif following the
- directions above. Of course, there may be some limitations. But we have
- tested as many as 20 interfaces without any problem. However, if you
- don't give an interface name to addif, the kernel will assign a name
- which starts with "eth". The number of "eth"-interfaces is limited by
- the kernel.
-
-5. Additional options for isdnctrl:
-
- "isdnctrl secure <InterfaceName> on"
- Only incoming calls, for which the caller-id is listed in the access
- list of the interface are accepted. You can add caller-id's With the
- command "isdnctrl addphone <InterfaceName> in <caller-id>"
- Euro-ISDN does not transmit the leading '0' of the caller-id for an
- incoming call, therefore you should configure it accordingly.
- If the real number for the dialout e.g. is "09311234567" the number
- to configure here is "9311234567". The pattern-match function
- works similar to the shell mechanism.
-
- ? one arbitrary digit
- * zero or arbitrary many digits
- [123] one of the digits in the list
- [1-5] one digit between '1' and '5'
- a '^' as the first character in a list inverts the list
-
-
- "isdnctrl secure <InterfaceName> off"
- Switch off secure operation (default).
-
- "isdnctrl ihup <InterfaceName> [on|off]"
- Switch the hang-up-timer for incoming calls on or off.
-
- "isdnctrl eaz <InterfaceName>"
- Returns the EAZ of an interface.
-
- "isdnctrl delphone <InterfaceName> in|out <number>"
- Deletes a number from one of the access-lists of the interface.
-
- "isdnctrl delif <InterfaceName>"
- Removes the interface (and possible slaves) from the kernel.
- (You have to unregister it with "ifconfig <InterfaceName> down" before).
-
- "isdnctrl callback <InterfaceName> [on|off]"
- Switches an interface to callback-mode. In this mode, an incoming call
- will be rejected and after this the remote-station will be called. If
- you test this feature by using ping, some routers will re-dial very
- quickly, so that the callback from isdn4linux may not be recognized.
- In this case use ping with the option -i <sec> to increase the interval
- between echo-packets.
-
- "isdnctrl cbdelay <InterfaceName> [seconds]"
- Sets the delay (default 5 sec) between an incoming call and start of
- dialing when callback is enabled.
-
- "isdnctrl cbhup <InterfaceName> [on|off]"
- This enables (default) or disables an active hangup (reject) when getting an
- incoming call for an interface which is configured for callback.
-
- "isdnctrl encap <InterfaceName> <EncapType>"
- Selects the type of packet-encapsulation. The encapsulation can be changed
- only while an interface is down.
-
- At the moment the following values are supported:
-
- rawip (Default) Selects raw-IP-encapsulation. This means, MAC-headers
- are stripped off.
- ip IP with type-field. Same as IP but the type-field of the MAC-header
- is preserved.
- x25iface X.25 interface encapsulation (first byte semantics as defined in
- ../networking/x25-iface.txt). Use this for running the linux
- X.25 network protocol stack (AF_X25 sockets) on top of isdn.
- cisco-h A special-mode for communicating with a Cisco, which is configured
- to do "hdlc"
- ethernet No stripping. Packets are sent with full MAC-header.
- The Ethernet-address of the interface is faked, from its
- IP-address: fc:fc:i1:i2:i3:i4, where i1-4 are the IP-addr.-values.
- syncppp Synchronous PPP
-
- uihdlc HDLC with UI-frame-header (for use with DOS ISPA, option -h1)
-
-
- NOTE: x25iface encapsulation is currently experimental. Please
- read README.x25 for further details
-
-
- Watching packets, using standard-tcpdump will fail for all encapsulations
- except ethernet because tcpdump does not know how to handle packets
- without MAC-header. A patch for tcpdump is included in the utility-package
- mentioned above.
-
- "isdnctrl l2_prot <InterfaceName> <L2-ProtocolName>"
- Selects a layer-2-protocol.
- (With the ICN-driver and the HiSax-driver, "x75i" and "hdlc" is available.
- With other drivers, "x75ui", "x75bui", "x25dte", "x25dce" may be
- possible too. See README.x25 for x25 related l2 protocols.)
-
- isdnctrl l3_prot <InterfaceName> <L3-ProtocolName>
- The same for layer-3. (At the moment only "trans" is allowed)
-
- "isdnctrl list <InterfaceName>"
- Shows all parameters of an interface and the charge-info.
- Try "all" as the interface name.
-
- "isdnctrl hangup <InterfaceName>"
- Forces hangup of an interface.
-
- "isdnctrl bind <InterfaceName> <DriverId>,<ChannelNumber> [exclusive]"
- If you are using more than one ISDN card, it is sometimes necessary to
- dial out using a specific card or even preserve a specific channel for
- dialout of a specific net-interface. This can be done with the above
- command. Replace <DriverId> by whatever you assigned while loading the
- module. The <ChannelNumber> is counted from zero. The upper limit
- depends on the card used. At the moment no card supports more than
- 2 channels, so the upper limit is one.
-
- "isdnctrl unbind <InterfaceName>"
- unbinds a previously bound interface.
-
- "isdnctrl busreject <DriverId> on|off"
- If switched on, isdn4linux replies a REJECT to incoming calls, it
- cannot match to any configured interface.
- If switched off, nothing happens in this case.
- You normally should NOT enable this feature, if the ISDN adapter is not
- the only device connected to the S0-bus. Otherwise it could happen that
- isdn4linux rejects an incoming call, which belongs to another device on
- the bus.
-
- "isdnctrl addslave <InterfaceName> <SlaveName>
- Creates a slave interface for channel-bundling. Slave interfaces are
- not seen by the kernel, but their ISDN-part can be configured with
- isdnctrl as usual. (Phone numbers, EAZ/MSN, timeouts etc.) If more
- than two channels are to be bundled, feel free to create as many as you
- want. InterfaceName must be a real interface, NOT a slave. Slave interfaces
- start dialing, if the master interface resp. the previous slave interface
- has a load of more than 7000 cps. They hangup if the load goes under 7000
- cps, according to their "huptimeout"-parameter.
-
- "isdnctrl sdelay <InterfaceName> secs."
- This sets the minimum time an Interface has to be fully loaded, until
- it sends a dial-request to its slave.
-
- "isdnctrl dial <InterfaceName>"
- Forces an interface to start dialing even if no packets are to be
- transferred.
-
- "isdnctrl mapping <DriverId> MSN0,MSN1,MSN2,...MSN9"
- This installs a mapping table for EAZ<->MSN-mapping for a single line.
- Missing MSN's have to be given as "-" or can be omitted, if at the end
- of the commandline.
- With this command, it's now possible to have an interface listening to
- mixed 1TR6- and Euro-Type lines. In this case, the interface has to be
- configured to a 1TR6-type EAZ (one digit). The mapping is also valid
- for tty-emulation. Seen from the interface/tty-level the mapping
- CAN be used, however it's possible to use single tty's/interfaces with
- real MSN's (more digits) also, in which case the mapping will be ignored.
- Here is an example:
-
- You have a 1TR6-type line with base-nr. 1234567 and a Euro-line with
- MSN's 987654, 987655 and 987656. The DriverId for the Euro-line is "EURO".
-
- isdnctrl mapping EURO -,987654,987655,987656,-,987655
- ...
- isdnctrl eaz isdn0 1 # listen on 12345671(1tr6) and 987654(euro)
- ...
- isdnctrl eaz isdn1 4 # listen on 12345674(1tr6) only.
- ...
- isdnctrl eaz isdn2 987654 # listen on 987654(euro) only.
-
- Same scheme is used with AT&E... at the tty's.
-
-6. If you want to write a new low-level-driver, you are welcome.
- The interface to the link-level-module is described in the file INTERFACE.
- If the interface should be expanded for any reason, don't do it
- on your own, send me a mail containing the proposed changes and
- some reasoning about them.
- If other drivers will not be affected, I will include the changes
- in the next release.
- For developers only, there is a second mailing-list. Write to me
- (fritz@isdn4linux.de), if you want to join that list.
-
-Have fun!
-
- -Fritz
-
diff --git a/Documentation/isdn/README.FAQ b/Documentation/isdn/README.FAQ
deleted file mode 100644
index e5dd1addacdd..000000000000
--- a/Documentation/isdn/README.FAQ
+++ /dev/null
@@ -1,26 +0,0 @@
-
-The FAQ for isdn4linux
-======================
-
-Please note that there is a big FAQ available in the isdn4k-utils.
-You find it in:
- isdn4k-utils/FAQ/i4lfaq.sgml
-
-In case you just want to see the FAQ online, or download the newest version,
-you can have a look at my website:
-https://www.mhessler.de/i4lfaq/ (view + download)
-or:
-https://www.isdn4linux.de/faq/4lfaq.html (view)
-
-As the extension tells, the FAQ is in SGML format, and you can convert it
-into text/html/... format by using the sgml2txt/sgml2html/... tools.
-Alternatively, you can also do a 'configure; make all' in the FAQ directory.
-
-
-Please have a look at the FAQ before posting anything in the Mailinglist,
-or the newsgroup!
-
-
-Matthias Hessler
-hessler@isdn4linux.de
-
diff --git a/Documentation/isdn/README.HiSax b/Documentation/isdn/README.HiSax
deleted file mode 100644
index b1a573cf4472..000000000000
--- a/Documentation/isdn/README.HiSax
+++ /dev/null
@@ -1,659 +0,0 @@
-HiSax is a Linux hardware-level driver for passive ISDN cards with Siemens
-chipset (ISAC_S 2085/2086/2186, HSCX SAB 82525). It is based on the Teles
-driver from Jan den Ouden.
-It is meant to be used with isdn4linux, an ISDN link-level module for Linux
-written by Fritz Elfert.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-
-Supported cards
----------------
-
-Teles 8.0/16.0/16.3 and compatible ones
-Teles 16.3c
-Teles S0/PCMCIA
-Teles PCI
-Teles S0Box
-Creatix S0Box
-Creatix PnP S0
-Compaq ISDN S0 ISA card
-AVM A1 (Fritz, Teledat 150)
-AVM Fritz PCMCIA
-AVM Fritz PnP
-AVM Fritz PCI
-ELSA Microlink PCC-16, PCF, PCF-Pro, PCC-8
-ELSA Quickstep 1000
-ELSA Quickstep 1000PCI
-ELSA Quickstep 3000 (same settings as QS1000)
-ELSA Quickstep 3000PCI
-ELSA PCMCIA
-ITK ix1-micro Rev.2
-Eicon Diva 2.0 ISA and PCI (S0 and U interface, no PRO version)
-Eicon Diva 2.01 ISA and PCI
-Eicon Diva 2.02 PCI
-Eicon Diva Piccola
-ASUSCOM NETWORK INC. ISDNLink 128K PC adapter (order code I-IN100-ST-D)
-Dynalink IS64PH (OEM version of ASUSCOM NETWORK INC. ISDNLink 128K adapter)
-PCBIT-DP (OEM version of ASUSCOM NETWORK INC. ISDNLink)
-HFC-2BS0 based cards (TeleInt SA1)
-Sedlbauer Speed Card (Speed Win, Teledat 100, PCI, Fax+)
-Sedlbauer Speed Star/Speed Star2 (PCMCIA)
-Sedlbauer ISDN-Controller PC/104
-USR Sportster internal TA (compatible Stollmann tina-pp V3)
-USR internal TA PCI
-ith Kommunikationstechnik GmbH MIC 16 ISA card
-Traverse Technologie NETjet PCI S0 card and NETspider U card
-Ovislink ISDN sc100-p card (NETjet driver)
-Dr. Neuhaus Niccy PnP/PCI
-Siemens I-Surf 1.0
-Siemens I-Surf 2.0 (with IPAC, try type 12 asuscom)
-ACER P10
-HST Saphir
-Berkom Telekom A4T
-Scitel Quadro
-Gazel ISDN cards
-HFC-PCI based cards
-Winbond W6692 based cards
-HFC-S+, HFC-SP/PCMCIA cards
-formula-n enternow
-Gerdes Power ISDN
-
-Note: PCF, PCF-Pro: up to now, only the ISDN part is supported
- PCC-8: not tested yet
- Eicon.Diehl Diva U interface not tested
-
-If you know other passive cards with the Siemens chipset, please let me know.
-You can combine any card, if there is no conflict between the resources
-(io, mem, irq).
-
-
-Configuring the driver
-----------------------
-
-The HiSax driver can either be built directly into the kernel or as a module.
-It can be configured using the command line feature while loading the kernel
-with LILO or LOADLIN or, if built as a module, using insmod/modprobe with
-parameters.
-There is also some config needed before you compile the kernel and/or
-modules. It is included in the normal "make [menu]config" target at the
-kernel. Don't forget it, especially to select the right D-channel protocol.
-
-Please note: In older versions of the HiSax driver, all PnP cards
-needed to be configured with isapnp and worked only with the HiSax
-driver used as a module.
-
-In the current version, HiSax will automatically use the in-kernel
-ISAPnP support, provided you selected it during kernel configuration
-(CONFIG_ISAPNP), if you don't give the io=, irq= command line parameters.
-
-The affected card types are: 4,7,12,14,19,27-30
-
-a) when built as a module
--------------------------
-
-insmod/modprobe hisax.o \
- io=iobase irq=IRQ mem=membase type=card_type \
- protocol=D_channel_protocol id=idstring
-
-or, if several cards are installed:
-
-insmod/modprobe hisax.o \
- io=iobase1,iobase2,... irq=IRQ1,IRQ2,... mem=membase1,membase2,... \
- type=card_type1,card_type2,... \
- protocol=D_channel_protocol1,D_channel_protocol2,... \
- id=idstring1%idstring2 ...
-
-where "iobaseN" represents the I/O base address of the Nth card, "membaseN"
-the memory base address of the Nth card, etc.
-
-The reason for the delimiter "%" being used in the idstrings is that ","
-won't work with the current modules package.
-
-The parameters may be specified in any order. For example, the "io"
-parameter may precede the "irq" parameter, or vice versa. If several
-cards are installed, the ordering within the comma separated parameter
-lists must of course be consistent.
-
-Only parameters applicable to the card type need to be specified. For
-example, the Teles 16.3 card is not memory-mapped, so the "mem"
-parameter may be omitted for this card. Sometimes it may be necessary
-to specify a dummy parameter, however. This is the case when there is
-a card of a different type later in the list that needs a parameter
-which the preceding card does not. For instance, if a Teles 16.0 card
-is listed after a Teles 16.3 card, a dummy memory base parameter of 0
-must be specified for the 16.3. Instead of a dummy value, the parameter
-can also be skipped by simply omitting the value. For example:
-mem=,0xd0000. See example 6 below.
-
-The parameter for the D-Channel protocol may be omitted if you selected the
-correct one during kernel config. Valid values are "1" for German 1TR6,
-"2" for EDSS1 (Euro ISDN), "3" for leased lines (no D-Channel) and "4"
-for US NI1.
-With US NI1 you have to include your SPID into the MSN setting in the form
-<MSN>:<SPID> for example (your phonenumber is 1234 your SPID 5678):
-AT&E1234:5678 on ttyI interfaces
-isdnctrl eaz ippp0 1234:5678 on network devices
-
-The Creatix/Teles PnP cards use io1= and io2= instead of io= for specifying
-the I/O addresses of the ISAC and HSCX chips, respectively.
-
-Card types:
-
- Type Required parameters (in addition to type and protocol)
-
- 1 Teles 16.0 irq, mem, io
- 2 Teles 8.0 irq, mem
- 3 Teles 16.3 (non PnP) irq, io
- 4 Creatix/Teles PnP irq, io0 (ISAC), io1 (HSCX)
- 5 AVM A1 (Fritz) irq, io
- 6 ELSA PCC/PCF cards io or nothing for autodetect (the iobase is
- required only if you have more than one ELSA
- card in your PC)
- 7 ELSA Quickstep 1000 irq, io (from isapnp setup)
- 8 Teles 16.3 PCMCIA irq, io
- 9 ITK ix1-micro Rev.2 irq, io
- 10 ELSA PCMCIA irq, io (set with card manager)
- 11 Eicon.Diehl Diva ISA PnP irq, io
- 11 Eicon.Diehl Diva PCI no parameter
- 12 ASUS COM ISDNLink irq, io (from isapnp setup)
- 13 HFC-2BS0 based cards irq, io
- 14 Teles 16.3c PnP irq, io
- 15 Sedlbauer Speed Card irq, io
- 15 Sedlbauer PC/104 irq, io
- 15 Sedlbauer Speed PCI no parameter
- 16 USR Sportster internal irq, io
- 17 MIC card irq, io
- 18 ELSA Quickstep 1000PCI no parameter
- 19 Compaq ISDN S0 ISA card irq, io0, io1, io (from isapnp setup io=IO2)
- 20 NETjet PCI card no parameter
- 21 Teles PCI no parameter
- 22 Sedlbauer Speed Star (PCMCIA) irq, io (set with card manager)
- 24 Dr. Neuhaus Niccy PnP irq, io0, io1 (from isapnp setup)
- 24 Dr. Neuhaus Niccy PCI no parameter
- 25 Teles S0Box irq, io (of the used lpt port)
- 26 AVM A1 PCMCIA (Fritz!) irq, io (set with card manager)
- 27 AVM PnP (Fritz!PnP) irq, io (from isapnp setup)
- 27 AVM PCI (Fritz!PCI) no parameter
- 28 Sedlbauer Speed Fax+ irq, io (from isapnp setup)
- 29 Siemens I-Surf 1.0 irq, io, memory (from isapnp setup)
- 30 ACER P10 irq, io (from isapnp setup)
- 31 HST Saphir irq, io
- 32 Telekom A4T none
- 33 Scitel Quadro subcontroller (4*S0, subctrl 1...4)
- 34 Gazel ISDN cards (ISA) irq,io
- 34 Gazel ISDN cards (PCI) none
- 35 HFC 2BDS0 PCI none
- 36 W6692 based PCI cards none
- 37 HFC 2BDS0 S+, SP irq,io
- 38 NETspider U PCI card none
- 39 HFC 2BDS0 SP/PCMCIA irq,io (set with cardmgr)
- 40 hotplug interface
- 41 Formula-n enter:now PCI none
-
-At the moment IRQ sharing is only possible with PCI cards. Please make sure
-that your IRQ is free and enabled for ISA use.
-
-
-Examples for module loading
-
-1. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 10
- modprobe hisax type=3 protocol=2 io=0x280 irq=10
-
-2. Teles 16.0, 1TR6 ISDN, I/O base d80 hex, IRQ 5, Memory d0000 hex
- modprobe hisax protocol=1 type=1 io=0xd80 mem=0xd0000 irq=5
-
-3. Fritzcard, Euro ISDN, I/O base 340 hex, IRQ 10 and ELSA PCF, Euro ISDN
- modprobe hisax type=5,6 protocol=2,2 io=0x340 irq=10 id=Fritz%Elsa
-
-4. Any ELSA PCC/PCF card, Euro ISDN
- modprobe hisax type=6 protocol=2
-
-5. Teles 16.3 PnP, Euro ISDN, with isapnp configured
- isapnp config: (INT 0 (IRQ 10 (MODE +E)))
- (IO 0 (BASE 0x0580))
- (IO 1 (BASE 0x0180))
- modprobe hisax type=4 protocol=2 irq=10 io0=0x580 io1=0x180
-
- In the current version of HiSax, you can instead simply use
-
- modprobe hisax type=4 protocol=2
-
- if you configured your kernel for ISAPnP. Don't run isapnp in
- this case!
-
-6. Teles 16.3, Euro ISDN, I/O base 280 hex, IRQ 12 and
- Teles 16.0, 1TR6, IRQ 5, Memory d0000 hex
- modprobe hisax type=3,1 protocol=2,1 io=0x280 mem=0,0xd0000
-
- Please note the dummy 0 memory address for the Teles 16.3, used as a
- placeholder as described above, in the last example.
-
-7. Teles PCMCIA, Euro ISDN, I/O base 180 hex, IRQ 15 (default values)
- modprobe hisax type=8 protocol=2 io=0x180 irq=15
-
-
-b) using LILO/LOADLIN, with the driver compiled directly into the kernel
-------------------------------------------------------------------------
-
-hisax=typ1,dp1,pa_1,pb_1,pc_1[,typ2,dp2,pa_2 ... \
- typn,dpn,pa_n,pb_n,pc_n][,idstring1[,idstring2,...,idstringn]]
-
-where
- typ1 = type of 1st card (default depends on kernel settings)
- dp1 = D-Channel protocol of 1st card. 1=1TR6, 2=EDSS1, 3=leased
- pa_1 = 1st parameter (depending on the type of the card)
- pb_1 = 2nd parameter ( " " " " " " " )
- pc_1 = 3rd parameter ( " " " " " " " )
-
- typ2,dp2,pa_2,pb_2,pc_2 = Parameters of the second card (defaults: none)
- typn,dpn,pa_n,pb_n,pc_n = Parameters of the n'th card (up to 16 cards are
- supported)
-
- idstring = Driver ID for accessing the particular card with utility
- programs and for identification when using a line monitor
- (default: "HiSax")
-
- Note: the ID string must start with an alphabetical character!
-
-Card types:
-
-type
- 1 Teles 16.0 pa=irq pb=membase pc=iobase
- 2 Teles 8.0 pa=irq pb=membase
- 3 Teles 16.3 pa=irq pb=iobase
- 4 Creatix/Teles PNP ONLY WORKS AS A MODULE !
- 5 AVM A1 (Fritz) pa=irq pb=iobase
- 6 ELSA PCC/PCF cards pa=iobase or nothing for autodetect
- 7 ELSA Quickstep 1000 ONLY WORKS AS A MODULE !
- 8 Teles S0 PCMCIA pa=irq pb=iobase
- 9 ITK ix1-micro Rev.2 pa=irq pb=iobase
- 10 ELSA PCMCIA pa=irq, pb=io (set with card manager)
- 11 Eicon.Diehl Diva ISAPnP ONLY WORKS AS A MODULE !
- 11 Eicon.Diehl Diva PCI no parameter
- 12 ASUS COM ISDNLink ONLY WORKS AS A MODULE !
- 13 HFC-2BS0 based cards pa=irq pb=io
- 14 Teles 16.3c PnP ONLY WORKS AS A MODULE !
- 15 Sedlbauer Speed Card pa=irq pb=io (Speed Win only as module !)
- 15 Sedlbauer PC/104 pa=irq pb=io
- 15 Sedlbauer Speed PCI no parameter
- 16 USR Sportster internal pa=irq pb=io
- 17 MIC card pa=irq pb=io
- 18 ELSA Quickstep 1000PCI no parameter
- 19 Compaq ISDN S0 ISA card ONLY WORKS AS A MODULE !
- 20 NETjet PCI card no parameter
- 21 Teles PCI no parameter
- 22 Sedlbauer Speed Star (PCMCIA) pa=irq, pb=io (set with card manager)
- 24 Dr. Neuhaus Niccy PnP ONLY WORKS AS A MODULE !
- 24 Dr. Neuhaus Niccy PCI no parameter
- 25 Teles S0Box pa=irq, pb=io (of the used lpt port)
- 26 AVM A1 PCMCIA (Fritz!) pa=irq, pb=io (set with card manager)
- 27 AVM PnP (Fritz!PnP) ONLY WORKS AS A MODULE !
- 27 AVM PCI (Fritz!PCI) no parameter
- 28 Sedlbauer Speed Fax+ ONLY WORKS AS A MODULE !
- 29 Siemens I-Surf 1.0 ONLY WORKS AS A MODULE !
- 30 ACER P10 ONLY WORKS AS A MODULE !
- 31 HST Saphir pa=irq, pb=io
- 32 Telekom A4T no parameter
- 33 Scitel Quadro subcontroller (4*S0, subctrl 1...4)
- 34 Gazel ISDN cards (ISA) pa=irq, pb=io
- 34 Gazel ISDN cards (PCI) no parameter
- 35 HFC 2BDS0 PCI no parameter
- 36 W6692 based PCI cards none
- 37 HFC 2BDS0 S+,SP/PCMCIA ONLY WORKS AS A MODULE !
- 38 NETspider U PCI card none
- 39 HFC 2BDS0 SP/PCMCIA ONLY WORKS AS A MODULE !
- 40 hotplug interface ONLY WORKS AS A MODULE !
- 41 Formula-n enter:now PCI none
-
-Running the driver
-------------------
-
-When you insmod isdn.o and hisax.o (or with the in-kernel version, during
-boot time), a few lines should appear in your syslog. Look for something like:
-
-Apr 13 21:01:59 kke01 kernel: HiSax: Driver for Siemens chip set ISDN cards
-Apr 13 21:01:59 kke01 kernel: HiSax: Version 2.9
-Apr 13 21:01:59 kke01 kernel: HiSax: Revisions 1.14/1.9/1.10/1.25/1.8
-Apr 13 21:01:59 kke01 kernel: HiSax: Total 1 card defined
-Apr 13 21:01:59 kke01 kernel: HiSax: Card 1 Protocol EDSS1 Id=HiSax1 (0)
-Apr 13 21:01:59 kke01 kernel: HiSax: Elsa driver Rev. 1.13
-...
-Apr 13 21:01:59 kke01 kernel: Elsa: PCF-Pro found at 0x360 Rev.:C IRQ 10
-Apr 13 21:01:59 kke01 kernel: Elsa: timer OK; resetting card
-Apr 13 21:01:59 kke01 kernel: Elsa: HSCX version A: V2.1 B: V2.1
-Apr 13 21:01:59 kke01 kernel: Elsa: ISAC 2086/2186 V1.1
-...
-Apr 13 21:01:59 kke01 kernel: HiSax: DSS1 Rev. 1.14
-Apr 13 21:01:59 kke01 kernel: HiSax: 2 channels added
-
-This means that the card is ready for use.
-Cabling problems or line-downs are not detected, and only some ELSA cards can
-detect the S0 power.
-
-Remember that, according to the new strategy for accessing low-level drivers
-from within isdn4linux, you should also define a driver ID while doing
-insmod: Simply append hisax_id=<SomeString> to the insmod command line. This
-string MUST NOT start with a digit or a small 'x'!
-
-At this point you can run a 'cat /dev/isdnctrl0' and view debugging messages.
-
-At the moment, debugging messages are enabled with the hisaxctrl tool:
-
- hisaxctrl <DriverId> DebugCmd <debugging_flags>
-
-<DriverId> default is HiSax, if you didn't specify one.
-
-DebugCmd is 1 for generic debugging
- 11 for layer 1 development debugging
- 13 for layer 3 development debugging
-
-where <debugging_flags> is the integer sum of the following debugging
-options you wish enabled:
-
-With DebugCmd set to 1:
-
- 0x0001 Link-level <--> hardware-level communication
- 0x0002 Top state machine
- 0x0004 D-Channel Frames for isdnlog
- 0x0008 D-Channel Q.921
- 0x0010 B-Channel X.75
- 0x0020 D-Channel l2
- 0x0040 B-Channel l2
- 0x0080 D-Channel link state debugging
- 0x0100 B-Channel link state debugging
- 0x0200 TEI debug
- 0x0400 LOCK debug in callc.c
- 0x0800 More paranoid debug in callc.c (not for normal use)
- 0x1000 D-Channel l1 state debugging
- 0x2000 B-Channel l1 state debugging
-
-With DebugCmd set to 11:
-
- 0x0001 Warnings (default: on)
- 0x0002 IRQ status
- 0x0004 ISAC
- 0x0008 ISAC FIFO
- 0x0010 HSCX
- 0x0020 HSCX FIFO (attention: full B-Channel output!)
- 0x0040 D-Channel LAPD frame types
- 0x0080 IPAC debug
- 0x0100 HFC receive debug
- 0x0200 ISAC monitor debug
- 0x0400 D-Channel frames for isdnlog (set with 1 0x4 too)
- 0x0800 D-Channel message verbose
-
-With DebugCmd set to 13:
-
- 1 Warnings (default: on)
- 2 l3 protocol descriptor errors
- 4 l3 state machine
- 8 charge info debugging (1TR6)
-
-For example, 'hisaxctrl HiSax 1 0x3ff' enables full generic debugging.
-
-Because of some obscure problems with some switch equipment, the delay
-between the CONNECT message and sending the first data on the B-channel is now
-configurable with
-
-hisaxctrl <DriverId> 2 <delay>
-<delay> in ms Value between 50 and 800 ms is recommended.
-
-Downloading Firmware
---------------------
-At the moment, the Sedlbauer speed fax+ is the only card, which
-needs to download firmware.
-The firmware is downloaded with the hisaxctrl tool:
-
- hisaxctrl <DriverId> 9 <firmware_filename>
-
-<DriverId> default is HiSax, if you didn't specify one,
-
-where <firmware_filename> is the filename of the firmware file.
-
-For example, 'hisaxctrl HiSax 9 ISAR.BIN' downloads the firmware for
-ISAR based cards (like the Sedlbauer speed fax+).
-
-Warning
--------
-HiSax is a work in progress and may crash your machine.
-For certification look at HiSax.cert file.
-
-Limitations
------------
-At this time, HiSax only works on Euro ISDN lines and German 1TR6 lines.
-For leased lines see appendix.
-
-Bugs
-----
-If you find any, please let me know.
-
-
-Thanks
-------
-Special thanks to:
-
- Emil Stephan for the name HiSax which is a mix of HSCX and ISAC.
-
- Fritz Elfert, Jan den Ouden, Michael Hipp, Michael Wein,
- Andreas Kool, Pekka Sarnila, Sim Yskes, Johan Myrre'en,
- Klaus-Peter Nischke (ITK AG), Christof Petig, Werner Fehn (ELSA GmbH),
- Volker Schmidt
- Edgar Toernig and Marcus Niemann for the Sedlbauer driver
- Stephan von Krawczynski
- Juergen Quade for the Leased Line part
- Klaus Lichtenwalder (Klaus.Lichtenwalder@WebForum.DE), for ELSA PCMCIA support
- Enrik Berkhan (enrik@starfleet.inka.de) for S0BOX specific stuff
- Ton van Rosmalen for Teles PCI
- Petr Novak <petr.novak@i.cz> for Winbond W6692 support
- Werner Cornelius <werner@isdn4linux.de> for HFC-PCI, HFC-S(+/P) and supplementary services support
- and more people who are hunting bugs. (If I forgot somebody, please
- send me a mail).
-
- Firma ELSA GmbH
- Firma Eicon.Diehl GmbH
- Firma Dynalink NL
- Firma ASUSCOM NETWORK INC. Taiwan
- Firma S.u.S.E
- Firma ith Kommunikationstechnik GmbH
- Firma Traverse Technologie Australia
- Firma Medusa GmbH (www.medusa.de).
- Firma Quant-X Austria for sponsoring a DEC Alpha board+CPU
- Firma Cologne Chip Designs GmbH
-
- My girl friend and partner in life Ute for her patience with me.
-
-
-Enjoy,
-
-Karsten Keil
-keil@isdn4linux.de
-
-
-Appendix: Teles PCMCIA driver
------------------------------
-
-See
- http://www.linux.no/teles_cs.txt
-for instructions.
-
-Appendix: Linux and ISDN-leased lines
--------------------------------------
-
-Original from Juergen Quade, new version KKe.
-
-Attention NEW VERSION, the old leased line syntax won't work !!!
-
-You can use HiSax to connect your Linux-Box via an ISDN leased line
-to e.g. the Internet:
-
-1. Build a kernel which includes the HiSax driver either as a module
- or as part of the kernel.
- cd /usr/src/linux
- make menuconfig
- <ISDN subsystem - ISDN support -- HiSax>
- make clean; make zImage; make modules; make modules_install
-2. Install the new kernel
- cp /usr/src/linux/arch/x86/boot/zImage /etc/kernel/linux.isdn
- vi /etc/lilo.conf
- <add new kernel in the bootable image section>
- lilo
-3. in case the hisax driver is a "fixed" part of the kernel, configure
- the driver with lilo:
- vi /etc/lilo.conf
- <add HiSax driver parameter in the global section (see below)>
- lilo
- Your lilo.conf _might_ look like the following:
-
- # LILO configuration-file
- # global section
- # teles 16.0 on IRQ=5, MEM=0xd8000, PORT=0xd80
- append="hisax=1,3,5,0xd8000,0xd80,HiSax"
- # teles 16.3 (non pnp) on IRQ=15, PORT=0xd80
- # append="hisax=3,3,5,0xd8000,0xd80,HiSax"
- boot=/dev/sda
- compact # faster, but won't work on all systems.
- linear
- read-only
- prompt
- timeout=100
- vga = normal # force sane state
- # Linux bootable partition config begins
- image = /etc/kernel/linux.isdn
- root = /dev/sda1
- label = linux.isdn
- #
- image = /etc/kernel/linux-2.0.30
- root = /dev/sda1
- label = linux.secure
-
- In the line starting with "append" you have to adapt the parameters
- according to your card (see above in this file)
-
-3. boot the new linux.isdn kernel
-4. start the ISDN subsystem:
- a) load - if necessary - the modules (depends, whether you compiled
- the ISDN driver as module or not)
- According to the type of card you have to specify the necessary
- driver parameter (irq, io, mem, type, protocol).
- For the leased line the protocol is "3". See the table above for
- the parameters, which you have to specify depending on your card.
- b) configure i4l
- /sbin/isdnctrl addif isdn0
- # EAZ 1 -- B1 channel 2 --B2 channel
- /sbin/isdnctrl eaz isdn0 1
- /sbin/isdnctrl secure isdn0 on
- /sbin/isdnctrl huptimeout isdn0 0
- /sbin/isdnctrl l2_prot isdn0 hdlc
- # Attention you must not set an outgoing number !!! This won't work !!!
- # The incoming number is LEASED0 for the first card, LEASED1 for the
- # second and so on.
- /sbin/isdnctrl addphone isdn0 in LEASED0
- # Here is no need to bind the channel.
- c) in case the remote partner is a CISCO:
- /sbin/isdnctrl encap isdn0 cisco-h
- d) configure the interface
- /sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP}
- e) set the routes
- /sbin/route add -host ${REMOTE_IP} isdn0
- /sbin/route add default gw ${REMOTE_IP}
- f) switch the card into leased mode for each used B-channel
- /sbin/hisaxctrl HiSax 5 1
-
-Remarks:
-a) Use state of the art isdn4k-utils
-
-Here an example script:
-#!/bin/sh
-# Start/Stop ISDN leased line connection
-
-I4L_AS_MODULE=yes
-I4L_REMOTE_IS_CISCO=no
-I4L_MODULE_PARAMS="type=16 io=0x268 irq=7 "
-I4L_DEBUG=no
-I4L_LEASED_128K=yes
-LOCAL_IP=192.168.1.1
-REMOTE_IP=192.168.2.1
-
-case "$1" in
- start)
- echo "Starting ISDN ..."
- if [ ${I4L_AS_MODULE} = "yes" ]; then
- echo "loading modules..."
- /sbin/modprobe hisax ${I4L_MODULE_PARAMS}
- fi
- # configure interface
- /sbin/isdnctrl addif isdn0
- /sbin/isdnctrl secure isdn0 on
- if [ ${I4L_DEBUG} = "yes" ]; then
- /sbin/isdnctrl verbose 7
- /sbin/hisaxctrl HiSax 1 0xffff
- /sbin/hisaxctrl HiSax 11 0xff
- cat /dev/isdnctrl >/tmp/lea.log &
- fi
- if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then
- /sbin/isdnctrl encap isdn0 cisco-h
- fi
- /sbin/isdnctrl huptimeout isdn0 0
- # B-CHANNEL 1
- /sbin/isdnctrl eaz isdn0 1
- /sbin/isdnctrl l2_prot isdn0 hdlc
- # 1. card
- /sbin/isdnctrl addphone isdn0 in LEASED0
- if [ ${I4L_LEASED_128K} = "yes" ]; then
- /sbin/isdnctrl addslave isdn0 isdn0s
- /sbin/isdnctrl secure isdn0s on
- /sbin/isdnctrl huptimeout isdn0s 0
- # B-CHANNEL 2
- /sbin/isdnctrl eaz isdn0s 2
- /sbin/isdnctrl l2_prot isdn0s hdlc
- # 1. card
- /sbin/isdnctrl addphone isdn0s in LEASED0
- if [ ${I4L_REMOTE_IS_CISCO} = "yes" ]; then
- /sbin/isdnctrl encap isdn0s cisco-h
- fi
- fi
- /sbin/isdnctrl dialmode isdn0 manual
- # configure tcp/ip
- /sbin/ifconfig isdn0 ${LOCAL_IP} pointopoint ${REMOTE_IP}
- /sbin/route add -host ${REMOTE_IP} isdn0
- /sbin/route add default gw ${REMOTE_IP}
- # switch to leased mode
- # B-CHANNEL 1
- /sbin/hisaxctrl HiSax 5 1
- if [ ${I4L_LEASED_128K} = "yes" ]; then
- # B-CHANNEL 2
- sleep 10; /* Wait for master */
- /sbin/hisaxctrl HiSax 5 2
- fi
- ;;
- stop)
- /sbin/ifconfig isdn0 down
- /sbin/isdnctrl delif isdn0
- if [ ${I4L_DEBUG} = "yes" ]; then
- killall cat
- fi
- if [ ${I4L_AS_MODULE} = "yes" ]; then
- /sbin/rmmod hisax
- /sbin/rmmod isdn
- /sbin/rmmod ppp
- /sbin/rmmod slhc
- fi
- ;;
- *)
- echo "Usage: $0 {start|stop}"
- exit 1
-esac
-exit 0
diff --git a/Documentation/isdn/README.audio b/Documentation/isdn/README.audio
deleted file mode 100644
index 8ebca19290d9..000000000000
--- a/Documentation/isdn/README.audio
+++ /dev/null
@@ -1,138 +0,0 @@
-$Id: README.audio,v 1.8 1999/07/11 17:17:29 armin Exp $
-
-ISDN subsystem for Linux.
- Description of audio mode.
-
-When enabled during kernel configuration, the tty emulator of the ISDN
-subsystem is capable of a reduced set of commands to support audio.
-This document describes the commands supported and the format of
-audio data.
-
-Commands for enabling/disabling audio mode:
-
- AT+FCLASS=8 Enable audio mode.
- This affects the following registers:
- S18: Bits 0 and 2 are set.
- S16: Set to 48 and any further change to
- larger values is blocked.
- AT+FCLASS=0 Disable audio mode.
- Register 18 is set to 4.
- AT+FCLASS=? Show possible modes.
- AT+FCLASS? Report current mode (0 or 8).
-
-Commands supported in audio mode:
-
-All audio mode commands have one of the following forms:
-
- AT+Vxx? Show current setting.
- AT+Vxx=? Show possible settings.
- AT+Vxx=v Set simple parameter.
- AT+Vxx=v,v ... Set complex parameter.
-
-where xx is a two-character code and v are alphanumerical parameters.
-The following commands are supported:
-
- AT+VNH=x Auto hangup setting. NO EFFECT, supported
- for compatibility only.
- AT+VNH? Always reporting "1"
- AT+VNH=? Always reporting "1"
-
- AT+VIP Reset all audio parameters.
-
- AT+VLS=x Line select. x is one of the following:
- 0 = No device.
- 2 = Phone line.
- AT+VLS=? Always reporting "0,2"
- AT+VLS? Show current line.
-
- AT+VRX Start recording. Emulator responds with
- CONNECT and starts sending audio data to
- the application. See below for data format
-
- AT+VSD=x,y Set silence-detection parameters.
- Possible parameters:
- x = 0 ... 31 sensitivity threshold level.
- (default 0 , deactivated)
- y = 0 ... 255 range of interval in units
- of 0.1 second. (default 70)
- AT+VSD=? Report possible parameters.
- AT+VSD? Show current parameters.
-
- AT+VDD=x,y Set DTMF-detection parameters.
- Only possible if online and during this connection.
- Possible parameters:
- x = 0 ... 15 sensitivity threshold level.
- (default 0 , I4L soft-decode)
- (1-15 soft-decode off, hardware on)
- y = 0 ... 255 tone duration in units of 5ms.
- Not for I4L soft decode (default 8, 40ms)
- AT+VDD=? Report possible parameters.
- AT+VDD? Show current parameters.
-
- AT+VSM=x Select audio data format.
- Possible parameters:
- 2 = ADPCM-2
- 3 = ADPCM-3
- 4 = ADPCM-4
- 5 = aLAW
- 6 = uLAW
- AT+VSM=? Show possible audio formats.
-
- AT+VTX Start audio playback. Emulator responds
- with CONNECT and starts sending audio data
- received from the application via phone line.
-General behavior and description of data formats/protocol.
- when a connection is made:
-
- On incoming calls, if the application responds to a RING
- with ATA, depending on the calling service, the emulator
- responds with either CONNECT (data call) or VCON (voice call).
-
- On outgoing voice calls, the emulator responds with VCON
- upon connection setup.
-
- Audio recording.
-
- When receiving audio data, a kind of bisync protocol is used.
- Upon AT+VRX command, the emulator responds with CONNECT, and
- starts sending audio data to the application. There are several
- escape sequences defined, all using DLE (0x10) as Escape char:
-
- <DLE><ETX> End of audio data. (i.e. caused by a
- hangup of the remote side) Emulator stops
- recording, responding with VCON.
- <DLE><DC4> Abort recording, (send by appl.) Emulator
- stops recording, sends DLE,ETX.
- <DLE><DLE> Escape sequence for DLE in data stream.
- <DLE>0 Touchtone "0" received.
- ...
- <DLE>9 Touchtone "9" received.
- <DLE># Touchtone "#" received.
- <DLE>* Touchtone "*" received.
- <DLE>A Touchtone "A" received.
- <DLE>B Touchtone "B" received.
- <DLE>C Touchtone "C" received.
- <DLE>D Touchtone "D" received.
-
- <DLE>q quiet. Silence detected after non-silence.
- <DLE>s silence. Silence detected from the
- start of recording.
-
- Currently unsupported DLE sequences:
-
- <DLE>c FAX calling tone received.
- <DLE>b busy tone received.
-
- Audio playback.
-
- When sending audio data, upon AT+VTX command, emulator responds with
- CONNECT, and starts transferring data from application to the phone line.
- The same DLE sequences apply to this mode.
-
- Full-Duplex-Audio:
-
- When _both_ commands for recording and playback are given in _one_
- AT-command-line (i.e.: "AT+VTX+VRX"), full-duplex-mode is selected.
- In this mode, the only way to stop recording is sending <DLE><DC4>
- and the only way to stop playback is to send <DLE><ETX>.
-
diff --git a/Documentation/isdn/README.concap b/Documentation/isdn/README.concap
deleted file mode 100644
index a76d74845a4c..000000000000
--- a/Documentation/isdn/README.concap
+++ /dev/null
@@ -1,259 +0,0 @@
-Description of the "concap" encapsulation protocol interface
-============================================================
-
-The "concap" interface is intended to be used by network device
-drivers that need to process an encapsulation protocol.
-It is assumed that the protocol interacts with a linux network device by
-- data transmission
-- connection control (establish, release)
-Thus, the mnemonic: "CONnection CONtrolling eNCAPsulation Protocol".
-
-This is currently only used inside the isdn subsystem. But it might
-also be useful to other kinds of network devices. Thus, if you want
-to suggest changes that improve usability or performance of the
-interface, please let me know. I'm willing to include them in future
-releases (even if I needed to adapt the current isdn code to the
-changed interface).
-
-
-Why is this useful?
-===================
-
-The encapsulation protocol used on top of WAN connections or permanent
-point-to-point links are frequently chosen upon bilateral agreement.
-Thus, a device driver for a certain type of hardware must support
-several different encapsulation protocols at once.
-
-The isdn device driver did already support several different
-encapsulation protocols. The encapsulation protocol is configured by a
-user space utility (isdnctrl). The isdn network interface code then
-uses several case statements which select appropriate actions
-depending on the currently configured encapsulation protocol.
-
-In contrast, LAN network interfaces always used a single encapsulation
-protocol which is unique to the hardware type of the interface. The LAN
-encapsulation is usually done by just sticking a header on the data. Thus,
-traditional linux network device drivers used to process the
-encapsulation protocol directly (usually by just providing a hard_header()
-method in the device structure) using some hardware type specific support
-functions. This is simple, direct and efficient. But it doesn't fit all
-the requirements for complex WAN encapsulations.
-
-
- The configurability of the encapsulation protocol to be used
- makes isdn network interfaces more flexible, but also much more
- complex than traditional lan network interfaces.
-
-
-Many Encapsulation protocols used on top of WAN connections will not just
-stick a header on the data. They also might need to set up or release
-the WAN connection. They also might want to send other data for their
-private purpose over the wire, e.g. ppp does a lot of link level
-negotiation before the first piece of user data can be transmitted.
-Such encapsulation protocols for WAN devices are typically more complex
-than encapsulation protocols for lan devices. Thus, network interface
-code for typical WAN devices also tends to be more complex.
-
-
-In order to support Linux' x25 PLP implementation on top of
-isdn network interfaces I could have introduced yet another branch to
-the various case statements inside drivers/isdn/isdn_net.c.
-This eventually made isdn_net.c even more complex. In addition, it made
-isdn_net.c harder to maintain. Thus, by identifying an abstract
-interface between the network interface code and the encapsulation
-protocol, complexity could be reduced and maintainability could be
-increased.
-
-
-Likewise, a similar encapsulation protocol will frequently be needed by
-several different interfaces of even different hardware type, e.g. the
-synchronous ppp implementation used by the isdn driver and the
-asynchronous ppp implementation used by the ppp driver have a lot of
-similar code in them. By cleanly separating the encapsulation protocol
-from the hardware specific interface stuff such code could be shared
-better in future.
-
-
-When operating over dial-up-connections (e.g. telephone lines via modem,
-non-permanent virtual circuits of wide area networks, ISDN) many
-encapsulation protocols will need to control the connection. Therefore,
-some basic connection control primitives are supported. The type and
-semantics of the connection (i.e the ISO layer where connection service
-is provided) is outside our scope and might be different depending on
-the encapsulation protocol used, e.g. for a ppp module using our service
-on top of a modem connection a connect_request will result in dialing
-a (somewhere else configured) remote phone number. For an X25-interface
-module (LAPB semantics, as defined in Documentation/networking/x25-iface.txt)
-a connect_request will ask for establishing a reliable lapb
-datalink connection.
-
-
-The encapsulation protocol currently provides the following
-service primitives to the network device.
-
-- create a new encapsulation protocol instance
-- delete encapsulation protocol instance and free all its resources
-- initialize (open) the encapsulation protocol instance for use.
-- deactivate (close) an encapsulation protocol instance.
-- process (xmit) data handed down by upper protocol layer
-- receive data from lower (hardware) layer
-- process connect indication from lower (hardware) layer
-- process disconnect indication from lower (hardware) layer
-
-
-The network interface driver accesses those primitives via callbacks
-provided by the encapsulation protocol instance within a
-struct concap_proto_ops.
-
-struct concap_proto_ops{
-
- /* create a new encapsulation protocol instance of same type */
- struct concap_proto * (*proto_new) (void);
-
- /* delete encapsulation protocol instance and free all its resources.
- cprot may no longer be referenced after calling this */
- void (*proto_del)(struct concap_proto *cprot);
-
- /* initialize the protocol's data. To be called at interface startup
- or when the device driver resets the interface. All services of the
- encapsulation protocol may be used after this*/
- int (*restart)(struct concap_proto *cprot,
- struct net_device *ndev,
- struct concap_device_ops *dops);
-
- /* deactivate an encapsulation protocol instance. The encapsulation
- protocol may not call any *dops methods after this. */
- int (*close)(struct concap_proto *cprot);
-
- /* process a frame handed down to us by upper layer */
- int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
- /* to be called for each data entity received from lower layer*/
- int (*data_ind)(struct concap_proto *cprot, struct sk_buff *skb);
-
- /* to be called when a connection was set up/down.
- Protocols that don't process these primitives might fill in
- dummy methods here */
- int (*connect_ind)(struct concap_proto *cprot);
- int (*disconn_ind)(struct concap_proto *cprot);
-};
-
-
-The data structures are defined in the header file include/linux/concap.h.
-
-
-A Network interface using encapsulation protocols must also provide
-some service primitives to the encapsulation protocol:
-
-- request data being submitted by lower layer (device hardware)
-- request a connection being set up by lower layer
-- request a connection being released by lower layer
-
-The encapsulation protocol accesses those primitives via callbacks
-provided by the network interface within a struct concap_device_ops.
-
-struct concap_device_ops{
-
- /* to request data be submitted by device */
- int (*data_req)(struct concap_proto *, struct sk_buff *);
-
- /* Control methods must be set to NULL by devices which do not
- support connection control. */
- /* to request a connection be set up */
- int (*connect_req)(struct concap_proto *);
-
- /* to request a connection be released */
- int (*disconn_req)(struct concap_proto *);
-};
-
-The network interface does not explicitly provide a receive service
-because the encapsulation protocol directly calls netif_rx().
-
-
-
-
-An encapsulation protocol itself is actually the
-struct concap_proto{
- struct net_device *net_dev; /* net device using our service */
- struct concap_device_ops *dops; /* callbacks provided by device */
- struct concap_proto_ops *pops; /* callbacks provided by us */
- int flags;
- void *proto_data; /* protocol specific private data, to
- be accessed via *pops methods only*/
- /*
- :
- whatever
- :
- */
-};
-
-Most of this is filled in when the device requests the protocol to
-be reset (opend). The network interface must provide the net_dev and
-dops pointers. Other concap_proto members should be considered private
-data that are only accessed by the pops callback functions. Likewise,
-a concap proto should access the network device's private data
-only by means of the callbacks referred to by the dops pointer.
-
-
-A possible extended device structure which uses the connection controlling
-encapsulation services could look like this:
-
-struct concap_device{
- struct net_device net_dev;
- struct my_priv /* device->local stuff */
- /* the my_priv struct might contain a
- struct concap_device_ops *dops;
- to provide the device specific callbacks
- */
- struct concap_proto *cprot; /* callbacks provided by protocol */
-};
-
-
-
-Misc Thoughts
-=============
-
-The concept of the concap proto might help to reuse protocol code and
-reduce the complexity of certain network interface implementations.
-The trade off is that it introduces yet another procedure call layer
-when processing the protocol. This has of course some impact on
-performance. However, typically the concap interface will be used by
-devices attached to slow lines (like telephone, isdn, leased synchronous
-lines). For such slow lines, the overhead is probably negligible.
-This might no longer hold for certain high speed WAN links (like
-ATM).
-
-
-If general linux network interfaces explicitly supported concap
-protocols (e.g. by a member struct concap_proto* in struct net_device)
-then the interface of the service function could be changed
-by passing a pointer of type (struct net_device*) instead of
-type (struct concap_proto*). Doing so would make many of the service
-functions compatible to network device support functions.
-
-e.g. instead of the concap protocol's service function
-
- int (*encap_and_xmit)(struct concap_proto *cprot, struct sk_buff *skb);
-
-we could have
-
- int (*encap_and_xmit)(struct net_device *ndev, struct sk_buff *skb);
-
-As this is compatible to the dev->hard_start_xmit() method, the device
-driver could directly register the concap protocol's encap_and_xmit()
-function as its hard_start_xmit() method. This would eliminate one
-procedure call layer.
-
-
-The device's data request function could also be defined as
-
- int (*data_req)(struct net_device *ndev, struct sk_buff *skb);
-
-This might even allow for some protocol stacking. And the network
-interface might even register the same data_req() function directly
-as its hard_start_xmit() method when a zero layer encapsulation
-protocol is configured. Thus, eliminating the performance penalty
-of the concap interface when a trivial concap protocol is used.
-Nevertheless, the device remains able to support encapsulation
-protocol configuration.
-
diff --git a/Documentation/isdn/README.diversion b/Documentation/isdn/README.diversion
deleted file mode 100644
index bddcd5fb86ff..000000000000
--- a/Documentation/isdn/README.diversion
+++ /dev/null
@@ -1,127 +0,0 @@
-The isdn diversion services are a supporting module working together with
-the isdn4linux and the HiSax module for passive cards.
-Active cards, TAs and cards using a own or other driver than the HiSax
-module need to be adapted to the HL<->LL interface described in a separate
-document. The diversion services may be used with all cards supported by
-the HiSax driver.
-The diversion kernel interface and controlling tool divertctrl were written
-by Werner Cornelius (werner@isdn4linux.de or werner@titro.de) under the
-GNU General Public License.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-Table of contents
-=================
-
-1. Features of the i4l diversion services
- (Or what can the i4l diversion services do for me)
-
-2. Required hard- and software
-
-3. Compiling, installing and loading/unloading the module
- Tracing calling and diversion information
-
-4. Tracing calling and diversion information
-
-5. Format of the divert device ASCII output
-
-
-1. Features of the i4l diversion services
- (Or what can the i4l diversion services do for me)
-
- The i4l diversion services offers call forwarding and logging normally
- only supported by isdn phones. Incoming calls may be diverted
- unconditionally (CFU), when not reachable (CFNR) or on busy condition
- (CFB).
- The diversions may be invoked statically in the providers exchange
- as normally done by isdn phones. In this case all incoming calls
- with a special (or all) service identifiers are forwarded if the
- forwarding reason is met. Activated static services may also be
- interrogated (queried).
- The i4l diversion services additionally offers a dynamic version of
- call forwarding which is not preprogrammed inside the providers exchange
- but dynamically activated by i4l.
- In this case all incoming calls are checked by rules that may be
- compared to the mechanism of ipfwadm or ipchains. If a given rule matches
- the checking process is finished and the rule matching will be applied
- to the call.
- The rules include primary and secondary service identifiers, called
- number and subaddress, callers number and subaddress and whether the rule
- matches to all filtered calls or only those when all B-channel resources
- are exhausted.
- Actions that may be invoked by a rule are ignore, proceed, reject,
- direct divert or delayed divert of a call.
- All incoming calls matching a rule except the ignore rule a reported and
- logged as ASCII via the proc filesystem (/proc/net/isdn/divert). If proceed
- is selected the call will be held in a proceeding state (without ringing)
- for a certain amount of time to let an external program or client decide
- how to handle the call.
-
-
-2. Required hard- and software
-
- For using the i4l diversion services the isdn line must be of a EURO/DSS1
- type. Additionally the i4l services only work together with the HiSax
- driver for passive isdn cards. All HiSax supported cards may be used for
- the diversion purposes.
- The static diversion services require the provider having static services
- CFU, CFNR, CFB activated on an MSN-line. The static services may not be
- used on a point-to-point connection. Further the static services are only
- available in some countries (for example germany). Countries requiring the
- keypad protocol for activating static diversions (like the netherlands) are
- not supported but may use the tty devices for this purpose.
- The dynamic diversion services may be used in all countries if the provider
- enables the feature CF (call forwarding). This should work on both MSN- and
- point-to-point lines.
- To add and delete rules the additional divertctrl program is needed. This
- program is part of the isdn4kutils package.
-
-3. Compiling, installing and loading/unloading the module
- Tracing calling and diversion information
-
-
- To compile the i4l code with diversion support you need to say yes to the
- DSS1 diversion services when selecting the i4l options in the kernel
- config (menuconfig or config).
- After having properly activated a make modules and make modules_install all
- required modules will be correctly installed in the needed modules dirs.
- As the diversion services are currently not included in the scripts of most
- standard distributions you will have to add a "insmod dss1_divert" after
- having loaded the global isdn module.
- The module can be loaded without any command line parameters.
- If the module is actually loaded and active may be checked with a
- "cat /proc/modules" or "ls /proc/net/isdn/divert". The divert file is
- dynamically created by the diversion module and removed when the module is
- unloaded.
-
-
-4. Tracing calling and diversion information
-
- You also may put a "cat /proc/net/isdn/divert" in the background with the
- output redirected to a file. Then all actions of the module are logged.
- The divert file in the proc system may be opened more than once, so in
- conjunction with inetd and a small remote client on other machines inside
- your network incoming calls and reactions by the module may be shown on
- every listening machine.
- If a call is reported as proceeding an external program or client may
- specify during a certain amount of time (normally 4 to 10 seconds) what
- to do with that call.
- To unload the module all open files to the device in the proc system must
- be closed. Otherwise the module (and isdn.o) may not be unloaded.
-
-5. Format of the divert device ASCII output
-
- To be done later
-
diff --git a/Documentation/isdn/README.fax b/Documentation/isdn/README.fax
deleted file mode 100644
index 5314958a8a6e..000000000000
--- a/Documentation/isdn/README.fax
+++ /dev/null
@@ -1,45 +0,0 @@
-
-Fax with isdn4linux
-===================
-
-When enabled during kernel configuration, the tty emulator
-of the ISDN subsystem is capable of the Fax Class 2 commands.
-
-This only makes sense under the following conditions :
-
-- You need the commands as dummy, because you are using
- hylafax (with patch) for AVM capi.
-- You want to use the fax capabilities of your isdn-card.
- (supported cards are listed below)
-
-
-NOTE: This implementation does *not* support fax with passive
- ISDN-cards (known as softfax). The low-level driver of
- the ISDN-card and/or the card itself must support this.
-
-
-Supported ISDN-Cards
---------------------
-
-Eicon DIVA Server BRI/PCI
- - full support with both B-channels.
-
-Eicon DIVA Server 4BRI/PCI
- - full support with all B-channels.
-
-Eicon DIVA Server PRI/PCI
- - full support on amount of B-channels
- depending on DSPs on board.
-
-
-
-The command set is known as Class 2 (not Class 2.0) and
-can be activated by AT+FCLASS=2
-
-
-The interface between the link-level-module and the hardware-level driver
-is described in the files INTERFACE.fax and INTERFACE.
-
-Armin
-mac@melware.de
-
diff --git a/Documentation/isdn/README.gigaset b/Documentation/isdn/README.gigaset
index 9b1ce277ca3d..f6184b637182 100644
--- a/Documentation/isdn/README.gigaset
+++ b/Documentation/isdn/README.gigaset
@@ -48,9 +48,8 @@ GigaSet 307x Device Driver
1.2. Software
--------
- The driver works with the Kernel CAPI subsystem as well as the old
- ISDN4Linux subsystem, so it can be used with any software which is able
- to use CAPI 2.0 or ISDN4Linux for ISDN connections (voice or data).
+ The driver works with the Kernel CAPI subsystem and can be used with any
+ software which is able to use CAPI 2.0 for ISDN connections (voice or data).
There are some user space tools available at
https://sourceforge.net/projects/gigaset307x/
@@ -92,7 +91,7 @@ GigaSet 307x Device Driver
gigaset debug debug level (see section 3.2.)
startmode initial operation mode (see section 2.5.):
- bas_gigaset ) 1=ISDN4linux/CAPI (default), 0=Unimodem
+ bas_gigaset ) 1=CAPI (default), 0=Unimodem
ser_gigaset )
usb_gigaset ) cidmode initial Call-ID mode setting (see section
2.5.): 1=on (default), 0=off
@@ -154,18 +153,10 @@ GigaSet 307x Device Driver
2.3. CAPI
----
- If the driver is compiled with CAPI support (kernel configuration option
- GIGASET_CAPI) the devices will show up as CAPI controllers as soon as the
- corresponding driver module is loaded, and can then be used with CAPI 2.0
- kernel and user space applications. For user space access, the module
- capi.ko must be loaded.
-
- Legacy ISDN4Linux applications are supported via the capidrv
- compatibility driver. The kernel module capidrv.ko must be loaded
- explicitly with the command
- modprobe capidrv
- if needed, and cannot be unloaded again without unloading the driver
- first. (These are limitations of capidrv.)
+ The devices will show up as CAPI controllers as soon as the
+ corresponding driver module is loaded, and can then be used with
+ CAPI 2.0 kernel and user space applications. For user space access,
+ the module capi.ko must be loaded.
Most distributions handle loading and unloading of the various CAPI
modules automatically via the command capiinit(1) from the capi4k-utils
@@ -173,16 +164,6 @@ GigaSet 307x Device Driver
Gigaset drivers because it doesn't support more than one module per
driver.
-2.4. ISDN4Linux
- ----------
- If the driver is compiled without CAPI support (native ISDN4Linux
- variant), it registers the device with the legacy ISDN4Linux subsystem
- after loading the module. It can then be used with ISDN4Linux
- applications only. Most distributions provide some configuration utility
- for setting up that subsystem. Otherwise you can use some HOWTOs like
- http://www.linuxhaven.de/dlhp/HOWTO/DE-ISDN-HOWTO-5.html
-
-
2.5. Unimodem mode
-------------
In this mode the device works like a modem connected to a serial port
@@ -281,8 +262,7 @@ GigaSet 307x Device Driver
number. Dialing "***" (three asterisks) calls all extensions
simultaneously (global call).
- This holds for both CAPI 2.0 and ISDN4Linux applications. Unimodem mode
- does not support internal calls.
+ Unimodem mode does not support internal calls.
2.8. Unregistered Wireless Devices (M101/M105)
-----------------------------------------
diff --git a/Documentation/isdn/README.hfc-pci b/Documentation/isdn/README.hfc-pci
deleted file mode 100644
index e8a4ef0226e8..000000000000
--- a/Documentation/isdn/README.hfc-pci
+++ /dev/null
@@ -1,41 +0,0 @@
-The driver for the HFC-PCI and HFC-PCI-A chips from CCD may be used
-for many OEM cards using this chips.
-Additionally the driver has a special feature which makes it possible
-to read the echo-channel of the isdn bus. So all frames in both directions
-may be logged.
-When the echo logging feature is used the number of available B-channels
-for a HFC-PCI card is reduced to 1. Of course this is only relevant to
-the card, not to the isdn line.
-To activate the echo mode the following ioctls must be entered:
-
-hisaxctrl <driver/cardname> 10 1
-
-This reduces the available channels to 1. There must not be open connections
-through this card when entering the command.
-And then:
-
-hisaxctrl <driver/cardname> 12 1
-
-This enables the echo mode. If Hex logging is activated the isdnctrlx
-devices show a output with a line beginning of HEX: for the providers
-exchange and ECHO: for isdn devices sending to the provider.
-
-If more than one HFC-PCI cards are installed, a specific card may be selected
-at the hisax module load command line. Supply the load command with the desired
-IO-address of the desired card.
-Example:
-There tree cards installed in your machine at IO-base addresses 0xd000, 0xd400
-and 0xdc00
-If you want to use the card at 0xd400 standalone you should supply the insmod
-or depmod with type=35 io=0xd400.
-If you want to use all three cards, but the order needs to be at 0xdc00,0xd400,
-0xd000 you may give the parameters type=35,35,35 io=0xdc00,0xd400,0xd00
-Then the desired card will be the initialised in the desired order.
-If the io parameter is used the io addresses of all used cards should be
-supplied else the parameter is assumed 0 and a auto search for a free card is
-invoked which may not give the wanted result.
-
-Comments and reports to werner@isdn4linux.de or werner@isdn-development.de
-
-
-
diff --git a/Documentation/isdn/README.syncppp b/Documentation/isdn/README.syncppp
deleted file mode 100644
index 27d260095cce..000000000000
--- a/Documentation/isdn/README.syncppp
+++ /dev/null
@@ -1,58 +0,0 @@
-Some additional information for setting up a syncPPP
-connection using network interfaces.
----------------------------------------------------------------
-
-You need one thing beside the isdn4linux package:
-
- a patched pppd .. (I called it ipppd to show the difference)
-
-Compiling isdn4linux with sync PPP:
------------------------------------
-To compile isdn4linux with the sync PPP part, you have
-to answer the appropriate question when doing a "make config"
-Don't forget to load the slhc.o
-module before the isdn.o module, if VJ-compression support
-is not compiled into your kernel. (e.g if you have no PPP or
-CSLIP in the kernel)
-
-Using isdn4linux with sync PPP:
--------------------------------
-Sync PPP is just another encapsulation for isdn4linux. The
-name to enable sync PPP encapsulation is 'syncppp' .. e.g:
-
- /sbin/isdnctrl encap ippp0 syncppp
-
-The name of the interface is here 'ippp0'. You need
-one interface with the name 'ippp0' to saturate the
-ipppd, which checks the ppp version via this interface.
-Currently, all devices must have the name ipppX where
-'X' is a decimal value.
-
-To set up a PPP connection you need the ipppd .. You must start
-the ipppd once after installing the modules. The ipppd
-communicates with the isdn4linux link-level driver using the
-/dev/ippp0 to /dev/ippp15 devices. One ipppd can handle
-all devices at once. If you want to use two PPP connections
-at the same time, you have to connect the ipppd to two
-devices .. and so on.
-I've implemented one additional option for the ipppd:
- 'useifip' will get (if set to not 0.0.0.0) the IP address
- for the negotiation from the attached network-interface.
-(also: ipppd will try to negotiate pointopoint IP as remote IP)
-You must disable BSD-compression, this implementation can't
-handle compressed packets.
-
-Check the etc/rc.isdn.syncppp in the isdn4kernel-util package
-for an example setup script.
-
-To use the MPPP stuff, you must configure a slave device
-with isdn4linux. Now call the ipppd with the '+mp' option.
-To increase the number of links, you must use the
-'addlink' option of the isdnctrl tool. (rc.isdn.syncppp.MPPP is
-an example script)
-
-enjoy it,
- michael
-
-
-
diff --git a/Documentation/isdn/README.x25 b/Documentation/isdn/README.x25
deleted file mode 100644
index e561a77c4e22..000000000000
--- a/Documentation/isdn/README.x25
+++ /dev/null
@@ -1,184 +0,0 @@
-
-X.25 support within isdn4linux
-==============================
-
-This is alpha/beta test code. Use it completely at your own risk.
-As new versions appear, the stuff described here might suddenly change
-or become invalid without notice.
-
-Keep in mind:
-
-You are using several new parts of the 2.2.x kernel series which
-have not been tested in a large scale. Therefore, you might encounter
-more bugs as usual.
-
-- If you connect to an X.25 neighbour not operated by yourself, ASK the
- other side first. Be prepared that bugs in the protocol implementation
- might result in problems.
-
-- This implementation has never wiped out my whole hard disk yet. But as
- this is experimental code, don't blame me if that happened to you.
- Backing up important data will never harm.
-
-- Monitor your isdn connections while using this software. This should
- prevent you from undesired phone bills in case of driver problems.
-
-
-
-
-How to configure the kernel
-===========================
-
-The ITU-T (former CCITT) X.25 network protocol layer has been implemented
-in the Linux source tree since version 2.1.16. The isdn subsystem might be
-useful to run X.25 on top of ISDN. If you want to try it, select
-
- "CCITT X.25 Packet Layer"
-
-from the networking options as well as
-
- "ISDN Support" and "X.25 PLP on Top of ISDN"
-
-from the ISDN subsystem options when you configure your kernel for
-compilation. You currently also need to enable
-"Prompt for development and/or incomplete code/drivers" from the
-"Code maturity level options" menu. For the x25trace utility to work
-you also need to enable "Packet socket".
-
-For local testing it is also recommended to enable the isdnloop driver
-from the isdn subsystem's configuration menu.
-
-For testing, it is recommended that all isdn drivers and the X.25 PLP
-protocol are compiled as loadable modules. Like this, you can recover
-from certain errors by simply unloading and reloading the modules.
-
-
-
-What's it for? How to use it?
-=============================
-
-X.25 on top of isdn might be useful with two different scenarios:
-
-- You might want to access a public X.25 data network from your Linux box.
- You can use i4l if you were physically connected to the X.25 switch
- by an ISDN B-channel (leased line as well as dial up connection should
- work).
-
- This corresponds to ITU-T recommendation X.31 Case A (circuit-mode
- access to PSPDN [packet switched public data network]).
-
- NOTE: X.31 also covers a Case B (access to PSPDN via virtual
- circuit / packet mode service). The latter mode (which in theory
- also allows using the D-channel) is not supported by isdn4linux.
- It should however be possible to establish such packet mode connections
- with certain active isdn cards provided that the firmware supports X.31
- and the driver exports this functionality to the user. Currently,
- the AVM B1 driver is the only driver which does so. (It should be
- possible to access D-channel X.31 with active AVM cards using the
- CAPI interface of the AVM-B1 driver).
-
-- Or you might want to operate certain ISDN teleservices on your linux
- box. A lot of those teleservices run on top of the ISO-8208
- (DTE-DTE mode) network layer protocol. ISO-8208 is essentially the
- same as ITU-T X.25.
-
- Popular candidates of such teleservices are EUROfile transfer or any
- teleservice applying ITU-T recommendation T.90.
-
-To use the X.25 protocol on top of isdn, just create an isdn network
-interface as usual, configure your own and/or peer's ISDN numbers,
-and choose x25iface encapsulation by
-
- isdnctrl encap <iface-name> x25iface.
-
-Once encap is set like this, the device can be used by the X.25 packet layer.
-
-All the stuff needed for X.25 is implemented inside the isdn link
-level (mainly isdn_net.c and some new source files). Thus, it should
-work with every existing HL driver. I was able to successfully open X.25
-connections on top of the isdnloop driver and the hisax driver.
-"x25iface"-encapsulation bypasses demand dialing. Dialing will be
-initiated when the upper (X.25 packet) layer requests the lapb datalink to
-be established. But hangup timeout is still active. Whenever a hangup
-occurs, all existing X.25 connections on that link will be cleared
-It is recommended to use sufficiently large hangup-timeouts for the
-isdn interfaces.
-
-
-In order to set up a conforming protocol stack you also need to
-specify the proper l2_prot parameter:
-
-To operate in ISO-8208 X.25 DTE-DTE mode, use
-
- isdnctrl l2_prot <iface-name> x75i
-
-To access an X.25 network switch via isdn (your linux box is the DTE), use
-
- isdnctrl l2_prot <iface-name> x25dte
-
-To mimic an X.25 network switch (DCE side of the connection), use
-
- isdnctrl l2_prot <iface-name> x25dce
-
-However, x25dte or x25dce is currently not supported by any real HL
-level driver. The main difference between x75i and x25dte/dce is that
-x25d[tc]e uses fixed lap_b addresses. With x75i, the side which
-initiates the isdn connection uses the DTE's lap_b address while the
-called side used the DCE's lap_b address. Thus, l2_prot x75i might
-probably work if you access a public X.25 network as long as the
-corresponding isdn connection is set up by you. At least one test
-was successful to connect via isdn4linux to an X.25 switch using this
-trick. At the switch side, a terminal adapter X.21 was used to connect
-it to the isdn.
-
-
-How to set up a test installation?
-==================================
-
-To test X.25 on top of isdn, you need to get
-
-- a recent version of the "isdnctrl" program that supports setting the new
- X.25 specific parameters.
-
-- the x25-utils-2.X package from
- ftp://ftp.hes.iki.fi/pub/ham/linux/ax25/x25utils-*
- (don't confuse the x25-utils with the ax25-utils)
-
-- an application program that uses linux PF_X25 sockets (some are
- contained in the x25-util package).
-
-Before compiling the user level utilities make sure that the compiler/
-preprocessor will fetch the proper kernel header files of this kernel
-source tree. Either make /usr/include/linux a symbolic link pointing to
-this kernel's include/linux directory or set the appropriate compiler flags.
-
-When all drivers and interfaces are loaded and configured you need to
-ifconfig the network interfaces up and add X.25-routes to them. Use
-the usual ifconfig tool.
-
-ifconfig <iface-name> up
-
-But a special x25route tool (distributed with the x25-util package)
-is needed to set up X.25 routes. I.e.
-
-x25route add 01 <iface-name>
-
-will cause all x.25 connections to the destination X.25-address
-"01" to be routed to your created isdn network interface.
-
-There are currently no real X.25 applications available. However, for
-tests, the x25-utils package contains a modified version of telnet
-and telnetd that uses X.25 sockets instead of tcp/ip sockets. You can
-use those for your first tests. Furthermore, you might check
-ftp://ftp.hamburg.pop.de/pub/LOCAL/linux/i4l-eft/ which contains some
-alpha-test implementation ("eftp4linux") of the EUROfile transfer
-protocol.
-
-The scripts distributed with the eftp4linux test releases might also
-provide useful examples for setting up X.25 on top of isdn.
-
-The x25-utility package also contains an x25trace tool that can be
-used to monitor X.25 packets received by the network interfaces.
-The /proc/net/x25* files also contain useful information.
-
-- Henner
diff --git a/Documentation/isdn/syncPPP.FAQ b/Documentation/isdn/syncPPP.FAQ
deleted file mode 100644
index 3257a4bc0786..000000000000
--- a/Documentation/isdn/syncPPP.FAQ
+++ /dev/null
@@ -1,224 +0,0 @@
-simple isdn4linux PPP FAQ .. to be continued .. not 'debugged'
--------------------------------------------------------------------
-
-Q01: what's pppd, ipppd, syncPPP, asyncPPP ??
-Q02: error message "this system lacks PPP support"
-Q03: strange information using 'ifconfig'
-Q04: MPPP?? What's that and how can I use it ...
-Q05: I tried MPPP but it doesn't work
-Q06: can I use asynchronous PPP encapsulation with network devices
-Q07: A SunISDN machine can't connect to my i4l system
-Q08: I wanna talk to several machines, which need different configs
-Q09: Starting the ipppd, I get only error messages from i4l
-Q10: I wanna use dynamic IP address assignment
-Q11: I can't connect. How can I check where the problem is.
-Q12: How can I reduce login delay?
-
--------------------------------------------------------------------
-
-Q01: pppd, ipppd, syncPPP, asyncPPP .. what is that ?
- what should I use?
-A: The pppd is for asynchronous PPP .. asynchronous means
- here, the framing is character based. (e.g when
- using ttyI* or tty* devices)
-
- The ipppd handles PPP packets coming in HDLC
- frames (bit based protocol) ... The PPP driver
- in isdn4linux pushes all IP packets direct
- to the network layer and all PPP protocol
- frames to the /dev/ippp* device.
- So, the ipppd is a simple external network
- protocol handler.
-
- If you login into a remote machine using the
- /dev/ttyI* devices and then enable PPP on the
- remote terminal server -> use the 'old' pppd
-
- If your remote side immediately starts to send
- frames ... you probably connect to a
- syncPPP machine .. use the network device part
- of isdn4linux with the 'syncppp' encapsulation
- and make sure, that the ipppd is running and
- connected to at least one /dev/ippp*. Check the
- isdn4linux manual on how to configure a network device.
-
---
-
-Q02: when I start the ipppd .. I only get the
- error message "this system lacks PPP support"
-A: check that at least the device 'ippp0' exists.
- (you can check this e.g with the program 'ifconfig')
- The ipppd NEEDS this device under THIS name ..
- If this device doesn't exists, use:
- isdnctrl addif ippp0
- isdnctrl encap ippp0 syncppp
- ... (see isdn4linux doc for more) ...
-A: Maybe you have compiled the ipppd with another
- kernel source tree than the kernel you currently
- run ...
-
---
-
-Q03: when I list the netdevices with ifconfig I see, that
- my ISDN interface has a HWaddr and IRQ=0 and Base
- address = 0
-A: The device is a fake ethernet device .. ignore IRQ and baseaddr
- You need the HWaddr only for ethernet encapsulation.
-
---
-
-Q04: MPPP?? What's that and how can I use it ...
-
-A: MPPP or MP or MPP (Warning: MP is also an
- acronym for 'Multi Processor') stands for
- Multi Point to Point and means bundling
- of several channels to one logical stream.
- To enable MPPP negotiation you must call the
- ipppd with the '+mp' option.
- You must also configure a slave device for
- every additional channel. (see the i4l manual
- for more)
- To use channel bundling you must first activate
- the 'master' or initial call. Now you can add
- the slave channels with the command:
- isdnctrl addlink <device>
- e.g:
- isdnctrl addlink ippp0
- This is different from other encapsulations of
- isdn4linux! With syncPPP, there is no automatic
- activation of slave devices.
-
---
-
-Q05: I tried MPPP but it doesn't work .. the ipppd
- writes in the debug log something like:
- .. rcvd [0][proto=0x3d] c0 00 00 00 80 fd 01 01 00 0a ...
- .. sent [0][LCP ProtRej id=0x2 00 3d c0 00 00 00 80 fd 01 ...
-
-A: you forgot to compile MPPP/RFC1717 support into the
- ISDN Subsystem. Recompile with this option enabled.
-
---
-
-Q06: can I use asynchronous PPP encapsulation
- over the network interface of isdn4linux ..
-
-A: No .. that's not possible .. Use the standard
- PPP package over the /dev/ttyI* devices. You
- must not use the ipppd for this.
-
---
-
-Q07: A SunISDN machine tries to connect my i4l system,
- which doesn't work.
- Checking the debug log I just saw garbage like:
-!![ ... fill in the line ... ]!!
-
-A: The Sun tries to talk asynchronous PPP ... i4l
- can't understand this ... try to use the ttyI*
- devices with the standard PPP/pppd package
-
-A: (from Alexanter Strauss: )
-!![ ... fill in mail ]!!
-
---
-
-Q08: I wanna talk to remote machines, which need
- a different configuration. The only way
- I found to do this is to kill the ipppd and
- start a new one with another config to connect
- to the second machine.
-
-A: you must bind a network interface explicitly to
- an ippp device, where you can connect a (for this
- interface) individually configured ipppd.
-
---
-
-Q09: When I start the ipppd I only get error messages
- from the i4l driver ..
-
-A: When starting, the ipppd calls functions which may
- trigger a network packet. (e.g gethostbyname()).
- Without the ipppd (at this moment, it is not
- fully started) we can't handle this network request.
- Try to configure hostnames necessary for the ipppd
- in your local /etc/hosts file or in a way, that
- your system can resolve it without using an
- isdn/ippp network-interface.
-
---
-
-Q10: I wanna use dynamic IP address assignment ... How
- must I configure the network device.
-
-A: At least you must have a route which forwards
- a packet to the ippp network-interface to trigger
- the dial-on-demand.
- A default route to the ippp-interface will work.
- Now you must choose a dummy IP address for your
- interface.
- If for some reason you can't set the default
- route to the ippp interface, you may take any
- address of the subnet from which you expect your
- dynamic IP number and set a 'network route' for
- this subnet to the ippp interface.
- To allow overriding of the dummy address you
- must call the ipppd with the 'ipcp-accept-local' option.
-
-A: You must know, how the ipppd gets the addresses it wanna
- configure. If you don't give any option, the ipppd
- tries to negotiate the local host address!
- With the option 'noipdefault' it requests an address
- from the remote machine. With 'useifip' it gets the
- addresses from the net interface. Or you set the address
- on the option line with the <a.b.c.d:e.f.g.h> option.
- Note: the IP address of the remote machine must be configured
- locally or the remote machine must send it in an IPCP request.
- If your side doesn't know the IP address after negotiation, it
- closes the connection!
- You must allow overriding of address with the 'ipcp-accept-*'
- options, if you have set your own or the remote address
- explicitly.
-
-A: Maybe you try these options .. e.g:
-
- /sbin/ipppd :$REMOTE noipdefault /dev/ippp0
-
- where REMOTE must be the address of the remote machine (the
- machine, which gives you your address)
-
---
-
-Q11: I can't connect. How can I check where the problem is.
-
-A: A good help log is the debug output from the ipppd...
- Check whether you can find there:
- - only a few LCP-conf-req SENT messages (less then 10)
- and then a Term-REQ:
- -> check whether your ISDN card is well configured
- it seems, that your machine doesn't dial
- (IRQ,IO,Proto, etc problems)
- Configure your ISDN card to print debug messages and
- check the /dev/isdnctrl output next time. There
- you can see, whether there is activity on the card/line.
- - there are at least a few RECV messages in the log:
- -> fine: your card is dialing and your remote machine
- tries to talk with you. Maybe only a missing
- authentication. Check your ipppd configuration again.
- - the ipppd exits for some reason:
- -> not good ... check /var/adm/syslog and /var/adm/daemon.
- Could be a bug in the ipppd.
-
---
-
-Q12: How can I reduce login delay?
-
-A: Log a login session ('debug' log) and check which options
- your remote side rejects. Next time configure your ipppd
- to not negotiate these options. Another 'side effect' is, that
- this increases redundancy. (e.g your remote side is buggy and
- rejects options in a wrong way).
-
-
-
diff --git a/Documentation/kbuild/headers_install.rst b/Documentation/kbuild/headers_install.rst
index 1ab7294e41ac..f6c6b74a609c 100644
--- a/Documentation/kbuild/headers_install.rst
+++ b/Documentation/kbuild/headers_install.rst
@@ -40,12 +40,5 @@ INSTALL_HDR_PATH indicates where to install the headers. It defaults to
An 'include' directory is automatically created inside INSTALL_HDR_PATH and
headers are installed in 'INSTALL_HDR_PATH/include'.
-The command "make headers_install_all" exports headers for all architectures
-simultaneously. (This is mostly of interest to distribution maintainers,
-who create an architecture-independent tarball from the resulting include
-directory.) You also can use HDR_ARCH_LIST to specify list of architectures.
-Remember to provide the appropriate linux/asm directory via "mv" or "ln -s"
-before building a C library with headers exported this way.
-
The kernel header export infrastructure is maintained by David Woodhouse
<dwmw2@infradead.org>.
diff --git a/Documentation/kbuild/index.rst b/Documentation/kbuild/index.rst
index 42d4cbe4460c..e323a3f2cc81 100644
--- a/Documentation/kbuild/index.rst
+++ b/Documentation/kbuild/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
===================
Kernel Build System
diff --git a/Documentation/kbuild/issues.rst b/Documentation/kbuild/issues.rst
index 9fdded4b681c..bdab01f733f6 100644
--- a/Documentation/kbuild/issues.rst
+++ b/Documentation/kbuild/issues.rst
@@ -1,11 +1,15 @@
-Recursion issue #1
-------------------
+================
+Recursion issues
+================
- .. include:: Kconfig.recursion-issue-01
- :literal:
+issue #1
+--------
-Recursion issue #2
-------------------
+.. literalinclude:: Kconfig.recursion-issue-01
+ :language: kconfig
- .. include:: Kconfig.recursion-issue-02
- :literal:
+issue #2
+--------
+
+.. literalinclude:: Kconfig.recursion-issue-02
+ :language: kconfig
diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst
index e774e760522d..61b2181ed3ea 100644
--- a/Documentation/kbuild/kbuild.rst
+++ b/Documentation/kbuild/kbuild.rst
@@ -18,7 +18,7 @@ This file lists all modules that are built into the kernel. This is used
by modprobe to not fail when trying to load something builtin.
modules.builtin.modinfo
---------------------------------------------------
+-----------------------
This file contains modinfo from all modules that are built into the kernel.
Unlike modinfo of a separate module, all fields are prefixed with module name.
@@ -38,12 +38,11 @@ Additional options to the assembler (for built-in and modules).
AFLAGS_MODULE
-------------
-Additional module specific options to use for $(AS).
+Additional assembler options for modules.
AFLAGS_KERNEL
-------------
-Additional options for $(AS) when used for assembler
-code for code that is compiled as built-in.
+Additional assembler options for built-in.
KCFLAGS
-------
@@ -153,6 +152,7 @@ Install script called when using "make install".
The default name is "installkernel".
The script will be called with the following arguments:
+
- $1 - kernel version
- $2 - kernel image file
- $3 - kernel map file
@@ -200,6 +200,15 @@ The output directory is often set using "O=..." on the commandline.
The value can be overridden in which case the default value is ignored.
+KBUILD_ABS_SRCTREE
+--------------------------------------------------
+Kbuild uses a relative path to point to the tree when possible. For instance,
+when building in the source tree, the source tree path is '.'
+
+Setting this flag requests Kbuild to use absolute path to the source tree.
+There are some useful cases to do so, like when generating tag files with
+absolute path entries etc.
+
KBUILD_SIGN_PIN
---------------
This variable allows a passphrase or PIN to be passed to the sign-file
diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst
index 2bc8a7803365..74bef19f69f0 100644
--- a/Documentation/kbuild/kconfig-language.rst
+++ b/Documentation/kbuild/kconfig-language.rst
@@ -53,6 +53,7 @@ A menu entry can have a number of attributes. Not all of them are
applicable everywhere (see syntax).
- type definition: "bool"/"tristate"/"string"/"hex"/"int"
+
Every config option must have a type. There are only two basic types:
tristate and string; the other types are based on these two. The type
definition optionally accepts an input prompt, so these two examples
@@ -66,11 +67,13 @@ applicable everywhere (see syntax).
prompt "Networking support"
- input prompt: "prompt" <prompt> ["if" <expr>]
+
Every menu entry can have at most one prompt, which is used to display
to the user. Optionally dependencies only for this prompt can be added
with "if".
- default value: "default" <expr> ["if" <expr>]
+
A config option can have any number of default values. If multiple
default values are visible, only the first defined one is active.
Default values are not limited to the menu entry where they are
@@ -112,6 +115,7 @@ applicable everywhere (see syntax).
Optionally dependencies for this default value can be added with "if".
- dependencies: "depends on" <expr>
+
This defines a dependency for this menu entry. If multiple
dependencies are defined, they are connected with '&&'. Dependencies
are applied to all other options within this menu entry (which also
@@ -127,6 +131,7 @@ applicable everywhere (see syntax).
default y
- reverse dependencies: "select" <symbol> ["if" <expr>]
+
While normal dependencies reduce the upper limit of a symbol (see
below), reverse dependencies can be used to force a lower limit of
another symbol. The value of the current menu symbol is used as the
@@ -146,6 +151,7 @@ applicable everywhere (see syntax).
the illegal configurations all over.
- weak reverse dependencies: "imply" <symbol> ["if" <expr>]
+
This is similar to "select" as it enforces a lower limit on another
symbol except that the "implied" symbol's value may still be set to n
from a direct dependency or with a visible prompt.
@@ -176,6 +182,7 @@ applicable everywhere (see syntax).
configure that subsystem out without also having to unset these drivers.
- limiting menu display: "visible if" <expr>
+
This attribute is only applicable to menu blocks, if the condition is
false, the menu block is not displayed to the user (the symbols
contained there can still be selected by other symbols, though). It is
@@ -183,12 +190,14 @@ applicable everywhere (see syntax).
entries. Default value of "visible" is true.
- numerical ranges: "range" <symbol> <symbol> ["if" <expr>]
+
This allows to limit the range of possible input values for int
and hex symbols. The user can only input a value which is larger than
or equal to the first symbol and smaller than or equal to the second
symbol.
- help text: "help" or "---help---"
+
This defines a help text. The end of the help text is determined by
the indentation level, this means it ends at the first line which has
a smaller indentation than the first line of the help text.
@@ -197,6 +206,7 @@ applicable everywhere (see syntax).
the file as an aid to developers.
- misc options: "option" <symbol>[=<value>]
+
Various less common options can be defined via this option syntax,
which can modify the behaviour of the menu entry and its config
symbol. These options are currently possible:
@@ -325,6 +335,7 @@ end a menu entry:
The first five also start the definition of a menu entry.
config::
+
"config" <symbol>
<config options>
@@ -332,6 +343,7 @@ This defines a config symbol <symbol> and accepts any of above
attributes as options.
menuconfig::
+
"menuconfig" <symbol>
<config options>
diff --git a/Documentation/kbuild/kconfig.rst b/Documentation/kbuild/kconfig.rst
index 88129af7e539..a9a855f894b3 100644
--- a/Documentation/kbuild/kconfig.rst
+++ b/Documentation/kbuild/kconfig.rst
@@ -264,6 +264,7 @@ NCONFIG_MODE
This mode shows all sub-menus in one large tree.
Example::
+
make NCONFIG_MODE=single_menu nconfig
----------------------------------------------------------------------
@@ -277,9 +278,12 @@ Searching in xconfig:
names, so you have to know something close to what you are
looking for.
- Example:
+ Example::
+
Ctrl-F hotplug
- or
+
+ or::
+
Menu: File, Search, hotplug
lists all config symbol entries that contain "hotplug" in
diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst
index 9274cdcc9bd2..f4f0f7ffde2b 100644
--- a/Documentation/kbuild/makefiles.rst
+++ b/Documentation/kbuild/makefiles.rst
@@ -328,7 +328,7 @@ more details, with real examples.
variable $(KBUILD_CFLAGS) and uses it for compilation flags for the
entire tree.
- asflags-y specifies options for assembling with $(AS).
+ asflags-y specifies assembler options.
Example::
@@ -384,6 +384,7 @@ more details, with real examples.
-----------------------
Kbuild tracks dependencies on the following:
+
1) All prerequisite files (both `*.c` and `*.h`)
2) `CONFIG_` options used in all prerequisite files
3) Command-line used to compile target
@@ -489,7 +490,7 @@ more details, with real examples.
as-instr checks if the assembler reports a specific instruction
and then outputs either option1 or option2
C escapes are supported in the test instruction
- Note: as-instr-option uses KBUILD_AFLAGS for $(AS) options
+ Note: as-instr-option uses KBUILD_AFLAGS for assembler options
cc-option
cc-option is used to check if $(CC) supports a given option, and if
@@ -905,7 +906,7 @@ When kbuild executes, the following steps are followed (roughly):
vmlinux. The usage of $(call if_changed,xxx) will be described later.
KBUILD_AFLAGS
- $(AS) assembler flags
+ Assembler flags
Default value - see top level Makefile
Append or modify as required per architecture.
@@ -948,16 +949,16 @@ When kbuild executes, the following steps are followed (roughly):
to 'y' when selected.
KBUILD_AFLAGS_KERNEL
- $(AS) options specific for built-in
+ Assembler options specific for built-in
$(KBUILD_AFLAGS_KERNEL) contains extra C compiler flags used to compile
resident kernel code.
KBUILD_AFLAGS_MODULE
- Options for $(AS) when building modules
+ Assembler options specific for modules
$(KBUILD_AFLAGS_MODULE) is used to add arch-specific options that
- are used for $(AS).
+ are used for assembler.
From commandline AFLAGS_MODULE shall be used (see kbuild.txt).
@@ -999,11 +1000,7 @@ When kbuild executes, the following steps are followed (roughly):
------------------------------------
The archheaders: rule is used to generate header files that
- may be installed into user space by "make header_install" or
- "make headers_install_all". In order to support
- "make headers_install_all", this target has to be able to run
- on an unconfigured tree, or a tree configured for another
- architecture.
+ may be installed into user space by "make header_install".
It is run before "make archprepare" when run on the
architecture itself.
@@ -1140,6 +1137,22 @@ When kbuild executes, the following steps are followed (roughly):
In this example, extra-y is used to list object files that
shall be built, but shall not be linked as part of built-in.a.
+ header-test-y
+
+ header-test-y specifies headers (*.h) in the current directory that
+ should be compile tested to ensure they are self-contained,
+ i.e. compilable as standalone units. If CONFIG_HEADER_TEST is enabled,
+ this builds them as part of extra-y.
+
+ header-test-pattern-y
+
+ This works as a weaker version of header-test-y, and accepts wildcard
+ patterns. The typical usage is:
+
+ header-test-pattern-y += *.h
+
+ This specifies all the files that matches to '*.h' in the current
+ directory, but the files in 'header-test-' are excluded.
6.7 Commands useful for building a boot image
---------------------------------------------
diff --git a/Documentation/kdump/index.rst b/Documentation/kdump/index.rst
deleted file mode 100644
index 2b17fcf6867a..000000000000
--- a/Documentation/kdump/index.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-:orphan:
-
-================================================================
-Documentation for Kdump - The kexec-based Crash Dumping Solution
-================================================================
-
-This document includes overview, setup and installation, and analysis
-information.
-
-.. toctree::
- :maxdepth: 1
-
- kdump
- vmcoreinfo
-
-.. only:: subproject and html
-
- Indices
- =======
-
- * :ref:`genindex`
diff --git a/Documentation/kernel-hacking/locking.rst b/Documentation/kernel-hacking/locking.rst
index dc698ea456e0..a8518ac0d31d 100644
--- a/Documentation/kernel-hacking/locking.rst
+++ b/Documentation/kernel-hacking/locking.rst
@@ -1364,7 +1364,7 @@ Futex API reference
Further reading
===============
-- ``Documentation/locking/spinlocks.txt``: Linus Torvalds' spinlocking
+- ``Documentation/locking/spinlocks.rst``: Linus Torvalds' spinlocking
tutorial in the kernel sources.
- Unix Systems for Modern Architectures: Symmetric Multiprocessing and
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
deleted file mode 100644
index 5623b9916411..000000000000
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ /dev/null
@@ -1,356 +0,0 @@
-==========================================
-Reducing OS jitter due to per-cpu kthreads
-==========================================
-
-This document lists per-CPU kthreads in the Linux kernel and presents
-options to control their OS jitter. Note that non-per-CPU kthreads are
-not listed here. To reduce OS jitter from non-per-CPU kthreads, bind
-them to a "housekeeping" CPU dedicated to such work.
-
-References
-==========
-
-- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
-
-- Documentation/cgroup-v1: Using cgroups to bind tasks to sets of CPUs.
-
-- man taskset: Using the taskset command to bind tasks to sets
- of CPUs.
-
-- man sched_setaffinity: Using the sched_setaffinity() system
- call to bind tasks to sets of CPUs.
-
-- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state,
- writing "0" to offline and "1" to online.
-
-- In order to locate kernel-generated OS jitter on CPU N:
-
- cd /sys/kernel/debug/tracing
- echo 1 > max_graph_depth # Increase the "1" for more detail
- echo function_graph > current_tracer
- # run workload
- cat per_cpu/cpuN/trace
-
-kthreads
-========
-
-Name:
- ehca_comp/%u
-
-Purpose:
- Periodically process Infiniband-related work.
-
-To reduce its OS jitter, do any of the following:
-
-1. Don't use eHCA Infiniband hardware, instead choosing hardware
- that does not require per-CPU kthreads. This will prevent these
- kthreads from being created in the first place. (This will
- work for most people, as this hardware, though important, is
- relatively old and is produced in relatively low unit volumes.)
-2. Do all eHCA-Infiniband-related work on other CPUs, including
- interrupts.
-3. Rework the eHCA driver so that its per-CPU kthreads are
- provisioned only on selected CPUs.
-
-
-Name:
- irq/%d-%s
-
-Purpose:
- Handle threaded interrupts.
-
-To reduce its OS jitter, do the following:
-
-1. Use irq affinity to force the irq threads to execute on
- some other CPU.
-
-Name:
- kcmtpd_ctr_%d
-
-Purpose:
- Handle Bluetooth work.
-
-To reduce its OS jitter, do one of the following:
-
-1. Don't use Bluetooth, in which case these kthreads won't be
- created in the first place.
-2. Use irq affinity to force Bluetooth-related interrupts to
- occur on some other CPU and furthermore initiate all
- Bluetooth activity on some other CPU.
-
-Name:
- ksoftirqd/%u
-
-Purpose:
- Execute softirq handlers when threaded or when under heavy load.
-
-To reduce its OS jitter, each softirq vector must be handled
-separately as follows:
-
-TIMER_SOFTIRQ
--------------
-
-Do all of the following:
-
-1. To the extent possible, keep the CPU out of the kernel when it
- is non-idle, for example, by avoiding system calls and by forcing
- both kernel threads and interrupts to execute elsewhere.
-2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force
- the CPU offline, then bring it back online. This forces
- recurring timers to migrate elsewhere. If you are concerned
- with multiple CPUs, force them all offline before bringing the
- first one back online. Once you have onlined the CPUs in question,
- do not offline any other CPUs, because doing so could force the
- timer back onto one of the CPUs in question.
-
-NET_TX_SOFTIRQ and NET_RX_SOFTIRQ
----------------------------------
-
-Do all of the following:
-
-1. Force networking interrupts onto other CPUs.
-2. Initiate any network I/O on other CPUs.
-3. Once your application has started, prevent CPU-hotplug operations
- from being initiated from tasks that might run on the CPU to
- be de-jittered. (It is OK to force this CPU offline and then
- bring it back online before you start your application.)
-
-BLOCK_SOFTIRQ
--------------
-
-Do all of the following:
-
-1. Force block-device interrupts onto some other CPU.
-2. Initiate any block I/O on other CPUs.
-3. Once your application has started, prevent CPU-hotplug operations
- from being initiated from tasks that might run on the CPU to
- be de-jittered. (It is OK to force this CPU offline and then
- bring it back online before you start your application.)
-
-IRQ_POLL_SOFTIRQ
-----------------
-
-Do all of the following:
-
-1. Force block-device interrupts onto some other CPU.
-2. Initiate any block I/O and block-I/O polling on other CPUs.
-3. Once your application has started, prevent CPU-hotplug operations
- from being initiated from tasks that might run on the CPU to
- be de-jittered. (It is OK to force this CPU offline and then
- bring it back online before you start your application.)
-
-TASKLET_SOFTIRQ
----------------
-
-Do one or more of the following:
-
-1. Avoid use of drivers that use tasklets. (Such drivers will contain
- calls to things like tasklet_schedule().)
-2. Convert all drivers that you must use from tasklets to workqueues.
-3. Force interrupts for drivers using tasklets onto other CPUs,
- and also do I/O involving these drivers on other CPUs.
-
-SCHED_SOFTIRQ
--------------
-
-Do all of the following:
-
-1. Avoid sending scheduler IPIs to the CPU to be de-jittered,
- for example, ensure that at most one runnable kthread is present
- on that CPU. If a thread that expects to run on the de-jittered
- CPU awakens, the scheduler will send an IPI that can result in
- a subsequent SCHED_SOFTIRQ.
-2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered
- is marked as an adaptive-ticks CPU using the "nohz_full="
- boot parameter. This reduces the number of scheduler-clock
- interrupts that the de-jittered CPU receives, minimizing its
- chances of being selected to do the load balancing work that
- runs in SCHED_SOFTIRQ context.
-3. To the extent possible, keep the CPU out of the kernel when it
- is non-idle, for example, by avoiding system calls and by
- forcing both kernel threads and interrupts to execute elsewhere.
- This further reduces the number of scheduler-clock interrupts
- received by the de-jittered CPU.
-
-HRTIMER_SOFTIRQ
----------------
-
-Do all of the following:
-
-1. To the extent possible, keep the CPU out of the kernel when it
- is non-idle. For example, avoid system calls and force both
- kernel threads and interrupts to execute elsewhere.
-2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the
- CPU offline, then bring it back online. This forces recurring
- timers to migrate elsewhere. If you are concerned with multiple
- CPUs, force them all offline before bringing the first one
- back online. Once you have onlined the CPUs in question, do not
- offline any other CPUs, because doing so could force the timer
- back onto one of the CPUs in question.
-
-RCU_SOFTIRQ
------------
-
-Do at least one of the following:
-
-1. Offload callbacks and keep the CPU in either dyntick-idle or
- adaptive-ticks state by doing all of the following:
-
- a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be
- de-jittered is marked as an adaptive-ticks CPU using the
- "nohz_full=" boot parameter. Bind the rcuo kthreads to
- housekeeping CPUs, which can tolerate OS jitter.
- b. To the extent possible, keep the CPU out of the kernel
- when it is non-idle, for example, by avoiding system
- calls and by forcing both kernel threads and interrupts
- to execute elsewhere.
-
-2. Enable RCU to do its processing remotely via dyntick-idle by
- doing all of the following:
-
- a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
- b. Ensure that the CPU goes idle frequently, allowing other
- CPUs to detect that it has passed through an RCU quiescent
- state. If the kernel is built with CONFIG_NO_HZ_FULL=y,
- userspace execution also allows other CPUs to detect that
- the CPU in question has passed through a quiescent state.
- c. To the extent possible, keep the CPU out of the kernel
- when it is non-idle, for example, by avoiding system
- calls and by forcing both kernel threads and interrupts
- to execute elsewhere.
-
-Name:
- kworker/%u:%d%s (cpu, id, priority)
-
-Purpose:
- Execute workqueue requests
-
-To reduce its OS jitter, do any of the following:
-
-1. Run your workload at a real-time priority, which will allow
- preempting the kworker daemons.
-2. A given workqueue can be made visible in the sysfs filesystem
- by passing the WQ_SYSFS to that workqueue's alloc_workqueue().
- Such a workqueue can be confined to a given subset of the
- CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs
- files. The set of WQ_SYSFS workqueues can be displayed using
- "ls sys/devices/virtual/workqueue". That said, the workqueues
- maintainer would like to caution people against indiscriminately
- sprinkling WQ_SYSFS across all the workqueues. The reason for
- caution is that it is easy to add WQ_SYSFS, but because sysfs is
- part of the formal user/kernel API, it can be nearly impossible
- to remove it, even if its addition was a mistake.
-3. Do any of the following needed to avoid jitter that your
- application cannot tolerate:
-
- a. Build your kernel with CONFIG_SLUB=y rather than
- CONFIG_SLAB=y, thus avoiding the slab allocator's periodic
- use of each CPU's workqueues to run its cache_reap()
- function.
- b. Avoid using oprofile, thus avoiding OS jitter from
- wq_sync_buffer().
- c. Limit your CPU frequency so that a CPU-frequency
- governor is not required, possibly enlisting the aid of
- special heatsinks or other cooling technologies. If done
- correctly, and if you CPU architecture permits, you should
- be able to build your kernel with CONFIG_CPU_FREQ=n to
- avoid the CPU-frequency governor periodically running
- on each CPU, including cs_dbs_timer() and od_dbs_timer().
-
- WARNING: Please check your CPU specifications to
- make sure that this is safe on your particular system.
- d. As of v3.18, Christoph Lameter's on-demand vmstat workers
- commit prevents OS jitter due to vmstat_update() on
- CONFIG_SMP=y systems. Before v3.18, is not possible
- to entirely get rid of the OS jitter, but you can
- decrease its frequency by writing a large value to
- /proc/sys/vm/stat_interval. The default value is HZ,
- for an interval of one second. Of course, larger values
- will make your virtual-memory statistics update more
- slowly. Of course, you can also run your workload at
- a real-time priority, thus preempting vmstat_update(),
- but if your workload is CPU-bound, this is a bad idea.
- However, there is an RFC patch from Christoph Lameter
- (based on an earlier one from Gilad Ben-Yossef) that
- reduces or even eliminates vmstat overhead for some
- workloads at https://lkml.org/lkml/2013/9/4/379.
- e. Boot with "elevator=noop" to avoid workqueue use by
- the block layer.
- f. If running on high-end powerpc servers, build with
- CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS
- daemon from running on each CPU every second or so.
- (This will require editing Kconfig files and will defeat
- this platform's RAS functionality.) This avoids jitter
- due to the rtas_event_scan() function.
- WARNING: Please check your CPU specifications to
- make sure that this is safe on your particular system.
- g. If running on Cell Processor, build your kernel with
- CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
- spu_gov_work().
- WARNING: Please check your CPU specifications to
- make sure that this is safe on your particular system.
- h. If running on PowerMAC, build your kernel with
- CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
- avoiding OS jitter from rackmeter_do_timer().
-
-Name:
- rcuc/%u
-
-Purpose:
- Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
-
-To reduce its OS jitter, do at least one of the following:
-
-1. Build the kernel with CONFIG_PREEMPT=n. This prevents these
- kthreads from being created in the first place, and also obviates
- the need for RCU priority boosting. This approach is feasible
- for workloads that do not require high degrees of responsiveness.
-2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these
- kthreads from being created in the first place. This approach
- is feasible only if your workload never requires RCU priority
- boosting, for example, if you ensure frequent idle time on all
- CPUs that might execute within the kernel.
-3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs=
- boot parameter offloading RCU callbacks from all CPUs susceptible
- to OS jitter. This approach prevents the rcuc/%u kthreads from
- having any work to do, so that they are never awakened.
-4. Ensure that the CPU never enters the kernel, and, in particular,
- avoid initiating any CPU hotplug operations on this CPU. This is
- another way of preventing any callbacks from being queued on the
- CPU, again preventing the rcuc/%u kthreads from having any work
- to do.
-
-Name:
- rcuop/%d and rcuos/%d
-
-Purpose:
- Offload RCU callbacks from the corresponding CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1. Use affinity, cgroups, or other mechanism to force these kthreads
- to execute on some other CPU.
-2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
- kthreads from being created in the first place. However, please
- note that this will not eliminate OS jitter, but will instead
- shift it to RCU_SOFTIRQ.
-
-Name:
- watchdog/%u
-
-Purpose:
- Detect software lockups on each CPU.
-
-To reduce its OS jitter, do at least one of the following:
-
-1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
- kthreads from being created in the first place.
-2. Boot with "nosoftlockup=0", which will also prevent these kthreads
- from being created. Other related watchdog and softlockup boot
- parameters may be found in Documentation/admin-guide/kernel-parameters.rst
- and Documentation/watchdog/watchdog-parameters.rst.
-3. Echo a zero to /proc/sys/kernel/watchdog to disable the
- watchdog timer.
-4. Echo a large number of /proc/sys/kernel/watchdog_thresh in
- order to reduce the frequency of OS jitter due to the watchdog
- timer down to a level that is acceptable for your workload.
diff --git a/Documentation/laptops/asus-laptop.txt b/Documentation/laptops/asus-laptop.txt
deleted file mode 100644
index 5f2858712aa0..000000000000
--- a/Documentation/laptops/asus-laptop.txt
+++ /dev/null
@@ -1,257 +0,0 @@
-Asus Laptop Extras
-
-Version 0.1
-August 6, 2009
-
-Corentin Chary <corentincj@iksaif.net>
-http://acpi4asus.sf.net/
-
- This driver provides support for extra features of ACPI-compatible ASUS laptops.
- It may also support some MEDION, JVC or VICTOR laptops (such as MEDION 9675 or
- VICTOR XP7210 for example). It makes all the extra buttons generate input
- events (like keyboards).
- On some models adds support for changing the display brightness and output,
- switching the LCD backlight on and off, and most importantly, allows you to
- blink those fancy LEDs intended for reporting mail and wireless status.
-
-This driver supercedes the old asus_acpi driver.
-
-Requirements
-------------
-
- Kernel 2.6.X sources, configured for your computer, with ACPI support.
- You also need CONFIG_INPUT and CONFIG_ACPI.
-
-Status
-------
-
- The features currently supported are the following (see below for
- detailed description):
-
- - Fn key combinations
- - Bluetooth enable and disable
- - Wlan enable and disable
- - GPS enable and disable
- - Video output switching
- - Ambient Light Sensor on and off
- - LED control
- - LED Display control
- - LCD brightness control
- - LCD on and off
-
- A compatibility table by model and feature is maintained on the web
- site, http://acpi4asus.sf.net/.
-
-Usage
------
-
- Try "modprobe asus-laptop". Check your dmesg (simply type dmesg). You should
- see some lines like this :
-
- Asus Laptop Extras version 0.42
- L2D model detected.
-
- If it is not the output you have on your laptop, send it (and the laptop's
- DSDT) to me.
-
- That's all, now, all the events generated by the hotkeys of your laptop
- should be reported via netlink events. You can check with
- "acpi_genl monitor" (part of the acpica project).
-
- Hotkeys are also reported as input keys (like keyboards) you can check
- which key are supported using "xev" under X11.
-
- You can get information on the version of your DSDT table by reading the
- /sys/devices/platform/asus-laptop/infos entry. If you have a question or a
- bug report to do, please include the output of this entry.
-
-LEDs
-----
-
- You can modify LEDs be echoing values to /sys/class/leds/asus::*/brightness :
- echo 1 > /sys/class/leds/asus::mail/brightness
- will switch the mail LED on.
- You can also know if they are on/off by reading their content and use
- kernel triggers like disk-activity or heartbeat.
-
-Backlight
----------
-
- You can control lcd backlight power and brightness with
- /sys/class/backlight/asus-laptop/. Brightness Values are between 0 and 15.
-
-Wireless devices
----------------
-
- You can turn the internal Bluetooth adapter on/off with the bluetooth entry
- (only on models with Bluetooth). This usually controls the associated LED.
- Same for Wlan adapter.
-
-Display switching
------------------
-
- Note: the display switching code is currently considered EXPERIMENTAL.
-
- Switching works for the following models:
- L3800C
- A2500H
- L5800C
- M5200N
- W1000N (albeit with some glitches)
- M6700R
- A6JC
- F3J
-
- Switching doesn't work for the following:
- M3700N
- L2X00D (locks the laptop under certain conditions)
-
- To switch the displays, echo values from 0 to 15 to
- /sys/devices/platform/asus-laptop/display. The significance of those values
- is as follows:
-
- +-------+-----+-----+-----+-----+-----+
- | Bin | Val | DVI | TV | CRT | LCD |
- +-------+-----+-----+-----+-----+-----+
- + 0000 + 0 + + + + +
- +-------+-----+-----+-----+-----+-----+
- + 0001 + 1 + + + + X +
- +-------+-----+-----+-----+-----+-----+
- + 0010 + 2 + + + X + +
- +-------+-----+-----+-----+-----+-----+
- + 0011 + 3 + + + X + X +
- +-------+-----+-----+-----+-----+-----+
- + 0100 + 4 + + X + + +
- +-------+-----+-----+-----+-----+-----+
- + 0101 + 5 + + X + + X +
- +-------+-----+-----+-----+-----+-----+
- + 0110 + 6 + + X + X + +
- +-------+-----+-----+-----+-----+-----+
- + 0111 + 7 + + X + X + X +
- +-------+-----+-----+-----+-----+-----+
- + 1000 + 8 + X + + + +
- +-------+-----+-----+-----+-----+-----+
- + 1001 + 9 + X + + + X +
- +-------+-----+-----+-----+-----+-----+
- + 1010 + 10 + X + + X + +
- +-------+-----+-----+-----+-----+-----+
- + 1011 + 11 + X + + X + X +
- +-------+-----+-----+-----+-----+-----+
- + 1100 + 12 + X + X + + +
- +-------+-----+-----+-----+-----+-----+
- + 1101 + 13 + X + X + + X +
- +-------+-----+-----+-----+-----+-----+
- + 1110 + 14 + X + X + X + +
- +-------+-----+-----+-----+-----+-----+
- + 1111 + 15 + X + X + X + X +
- +-------+-----+-----+-----+-----+-----+
-
- In most cases, the appropriate displays must be plugged in for the above
- combinations to work. TV-Out may need to be initialized at boot time.
-
- Debugging:
- 1) Check whether the Fn+F8 key:
- a) does not lock the laptop (try a boot with noapic / nolapic if it does)
- b) generates events (0x6n, where n is the value corresponding to the
- configuration above)
- c) actually works
- Record the disp value at every configuration.
- 2) Echo values from 0 to 15 to /sys/devices/platform/asus-laptop/display.
- Record its value, note any change. If nothing changes, try a broader range,
- up to 65535.
- 3) Send ANY output (both positive and negative reports are needed, unless your
- machine is already listed above) to the acpi4asus-user mailing list.
-
- Note: on some machines (e.g. L3C), after the module has been loaded, only 0x6n
- events are generated and no actual switching occurs. In such a case, a line
- like:
-
- echo $((10#$arg-60)) > /sys/devices/platform/asus-laptop/display
-
- will usually do the trick ($arg is the 0000006n-like event passed to acpid).
-
- Note: there is currently no reliable way to read display status on xxN
- (Centrino) models.
-
-LED display
------------
-
- Some models like the W1N have a LED display that can be used to display
- several items of information.
-
- LED display works for the following models:
- W1000N
- W1J
-
- To control the LED display, use the following :
-
- echo 0x0T000DDD > /sys/devices/platform/asus-laptop/
-
- where T control the 3 letters display, and DDD the 3 digits display,
- according to the tables below.
-
- DDD (digits)
- 000 to 999 = display digits
- AAA = ---
- BBB to FFF = turn-off
-
- T (type)
- 0 = off
- 1 = dvd
- 2 = vcd
- 3 = mp3
- 4 = cd
- 5 = tv
- 6 = cpu
- 7 = vol
-
- For example "echo 0x01000001 >/sys/devices/platform/asus-laptop/ledd"
- would display "DVD001".
-
-Driver options:
----------------
-
- Options can be passed to the asus-laptop driver using the standard
- module argument syntax (<param>=<value> when passing the option to the
- module or asus-laptop.<param>=<value> on the kernel boot line when
- asus-laptop is statically linked into the kernel).
-
- wapf: WAPF defines the behavior of the Fn+Fx wlan key
- The significance of values is yet to be found, but
- most of the time:
- - 0x0 should do nothing
- - 0x1 should allow to control the device with Fn+Fx key.
- - 0x4 should send an ACPI event (0x88) while pressing the Fn+Fx key
- - 0x5 like 0x1 or 0x4
-
- The default value is 0x1.
-
-Unsupported models
-------------------
-
- These models will never be supported by this module, as they use a completely
- different mechanism to handle LEDs and extra stuff (meaning we have no clue
- how it works):
-
- - ASUS A1300 (A1B), A1370D
- - ASUS L7300G
- - ASUS L8400
-
-Patches, Errors, Questions:
---------------------------
-
- I appreciate any success or failure
- reports, especially if they add to or correct the compatibility table.
- Please include the following information in your report:
-
- - Asus model name
- - a copy of your ACPI tables, using the "acpidump" utility
- - a copy of /sys/devices/platform/asus-laptop/infos
- - which driver features work and which don't
- - the observed behavior of non-working features
-
- Any other comments or patches are also more than welcome.
-
- acpi4asus-user@lists.sourceforge.net
- http://sourceforge.net/projects/acpi4asus
-
diff --git a/Documentation/laptops/disk-shock-protection.txt b/Documentation/laptops/disk-shock-protection.txt
deleted file mode 100644
index 0e6ba2663834..000000000000
--- a/Documentation/laptops/disk-shock-protection.txt
+++ /dev/null
@@ -1,149 +0,0 @@
-Hard disk shock protection
-==========================
-
-Author: Elias Oltmanns <eo@nebensachen.de>
-Last modified: 2008-10-03
-
-
-0. Contents
------------
-
-1. Intro
-2. The interface
-3. References
-4. CREDITS
-
-
-1. Intro
---------
-
-ATA/ATAPI-7 specifies the IDLE IMMEDIATE command with unload feature.
-Issuing this command should cause the drive to switch to idle mode and
-unload disk heads. This feature is being used in modern laptops in
-conjunction with accelerometers and appropriate software to implement
-a shock protection facility. The idea is to stop all I/O operations on
-the internal hard drive and park its heads on the ramp when critical
-situations are anticipated. The desire to have such a feature
-available on GNU/Linux systems has been the original motivation to
-implement a generic disk head parking interface in the Linux kernel.
-Please note, however, that other components have to be set up on your
-system in order to get disk shock protection working (see
-section 3. References below for pointers to more information about
-that).
-
-
-2. The interface
-----------------
-
-For each ATA device, the kernel exports the file
-block/*/device/unload_heads in sysfs (here assumed to be mounted under
-/sys). Access to /sys/block/*/device/unload_heads is denied with
--EOPNOTSUPP if the device does not support the unload feature.
-Otherwise, writing an integer value to this file will take the heads
-of the respective drive off the platter and block all I/O operations
-for the specified number of milliseconds. When the timeout expires and
-no further disk head park request has been issued in the meantime,
-normal operation will be resumed. The maximal value accepted for a
-timeout is 30000 milliseconds. Exceeding this limit will return
--EOVERFLOW, but heads will be parked anyway and the timeout will be
-set to 30 seconds. However, you can always change a timeout to any
-value between 0 and 30000 by issuing a subsequent head park request
-before the timeout of the previous one has expired. In particular, the
-total timeout can exceed 30 seconds and, more importantly, you can
-cancel a previously set timeout and resume normal operation
-immediately by specifying a timeout of 0. Values below -2 are rejected
-with -EINVAL (see below for the special meaning of -1 and -2). If the
-timeout specified for a recent head park request has not yet expired,
-reading from /sys/block/*/device/unload_heads will report the number
-of milliseconds remaining until normal operation will be resumed;
-otherwise, reading the unload_heads attribute will return 0.
-
-For example, do the following in order to park the heads of drive
-/dev/sda and stop all I/O operations for five seconds:
-
-# echo 5000 > /sys/block/sda/device/unload_heads
-
-A simple
-
-# cat /sys/block/sda/device/unload_heads
-
-will show you how many milliseconds are left before normal operation
-will be resumed.
-
-A word of caution: The fact that the interface operates on a basis of
-milliseconds may raise expectations that cannot be satisfied in
-reality. In fact, the ATA specs clearly state that the time for an
-unload operation to complete is vendor specific. The hint in ATA-7
-that this will typically be within 500 milliseconds apparently has
-been dropped in ATA-8.
-
-There is a technical detail of this implementation that may cause some
-confusion and should be discussed here. When a head park request has
-been issued to a device successfully, all I/O operations on the
-controller port this device is attached to will be deferred. That is
-to say, any other device that may be connected to the same port will
-be affected too. The only exception is that a subsequent head unload
-request to that other device will be executed immediately. Further
-operations on that port will be deferred until the timeout specified
-for either device on the port has expired. As far as PATA (old style
-IDE) configurations are concerned, there can only be two devices
-attached to any single port. In SATA world we have port multipliers
-which means that a user-issued head parking request to one device may
-actually result in stopping I/O to a whole bunch of devices. However,
-since this feature is supposed to be used on laptops and does not seem
-to be very useful in any other environment, there will be mostly one
-device per port. Even if the CD/DVD writer happens to be connected to
-the same port as the hard drive, it generally *should* recover just
-fine from the occasional buffer under-run incurred by a head park
-request to the HD. Actually, when you are using an ide driver rather
-than its libata counterpart (i.e. your disk is called /dev/hda
-instead of /dev/sda), then parking the heads of one drive (drive X)
-will generally not affect the mode of operation of another drive
-(drive Y) on the same port as described above. It is only when a port
-reset is required to recover from an exception on drive Y that further
-I/O operations on that drive (and the reset itself) will be delayed
-until drive X is no longer in the parked state.
-
-Finally, there are some hard drives that only comply with an earlier
-version of the ATA standard than ATA-7, but do support the unload
-feature nonetheless. Unfortunately, there is no safe way Linux can
-detect these devices, so you won't be able to write to the
-unload_heads attribute. If you know that your device really does
-support the unload feature (for instance, because the vendor of your
-laptop or the hard drive itself told you so), then you can tell the
-kernel to enable the usage of this feature for that drive by writing
-the special value -1 to the unload_heads attribute:
-
-# echo -1 > /sys/block/sda/device/unload_heads
-
-will enable the feature for /dev/sda, and giving -2 instead of -1 will
-disable it again.
-
-
-3. References
--------------
-
-There are several laptops from different vendors featuring shock
-protection capabilities. As manufacturers have refused to support open
-source development of the required software components so far, Linux
-support for shock protection varies considerably between different
-hardware implementations. Ideally, this section should contain a list
-of pointers at different projects aiming at an implementation of shock
-protection on different systems. Unfortunately, I only know of a
-single project which, although still considered experimental, is fit
-for use. Please feel free to add projects that have been the victims
-of my ignorance.
-
-- http://www.thinkwiki.org/wiki/HDAPS
- See this page for information about Linux support of the hard disk
- active protection system as implemented in IBM/Lenovo Thinkpads.
-
-
-4. CREDITS
-----------
-
-This implementation of disk head parking has been inspired by a patch
-originally published by Jon Escombe <lists@dresco.co.uk>. My efforts
-to develop an implementation of this feature that is fit to be merged
-into mainline have been aided by various kernel developers, in
-particular by Tejun Heo and Bartlomiej Zolnierkiewicz.
diff --git a/Documentation/laptops/laptop-mode.txt b/Documentation/laptops/laptop-mode.txt
deleted file mode 100644
index 1c707fc9b141..000000000000
--- a/Documentation/laptops/laptop-mode.txt
+++ /dev/null
@@ -1,782 +0,0 @@
-How to conserve battery power using laptop-mode
------------------------------------------------
-
-Document Author: Bart Samwel (bart@samwel.tk)
-Date created: January 2, 2004
-Last modified: December 06, 2004
-
-Introduction
-------------
-
-Laptop mode is used to minimize the time that the hard disk needs to be spun up,
-to conserve battery power on laptops. It has been reported to cause significant
-power savings.
-
-Contents
---------
-
-* Introduction
-* Installation
-* Caveats
-* The Details
-* Tips & Tricks
-* Control script
-* ACPI integration
-* Monitoring tool
-
-
-Installation
-------------
-
-To use laptop mode, you don't need to set any kernel configuration options
-or anything. Simply install all the files included in this document, and
-laptop mode will automatically be started when you're on battery. For
-your convenience, a tarball containing an installer can be downloaded at:
-
-http://www.samwel.tk/laptop_mode/laptop_mode/
-
-To configure laptop mode, you need to edit the configuration file, which is
-located in /etc/default/laptop-mode on Debian-based systems, or in
-/etc/sysconfig/laptop-mode on other systems.
-
-Unfortunately, automatic enabling of laptop mode does not work for
-laptops that don't have ACPI. On those laptops, you need to start laptop
-mode manually. To start laptop mode, run "laptop_mode start", and to
-stop it, run "laptop_mode stop". (Note: The laptop mode tools package now
-has experimental support for APM, you might want to try that first.)
-
-
-Caveats
--------
-
-* The downside of laptop mode is that you have a chance of losing up to 10
- minutes of work. If you cannot afford this, don't use it! The supplied ACPI
- scripts automatically turn off laptop mode when the battery almost runs out,
- so that you won't lose any data at the end of your battery life.
-
-* Most desktop hard drives have a very limited lifetime measured in spindown
- cycles, typically about 50.000 times (it's usually listed on the spec sheet).
- Check your drive's rating, and don't wear down your drive's lifetime if you
- don't need to.
-
-* If you mount some of your ext3/reiserfs filesystems with the -n option, then
- the control script will not be able to remount them correctly. You must set
- DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
- wrong options -- or it will fail because it cannot write to /etc/mtab.
-
-* If you have your filesystems listed as type "auto" in fstab, like I did, then
- the control script will not recognize them as filesystems that need remounting.
- You must list the filesystems with their true type instead.
-
-* It has been reported that some versions of the mutt mail client use file access
- times to determine whether a folder contains new mail. If you use mutt and
- experience this, you must disable the noatime remounting by setting the option
- DO_REMOUNT_NOATIME to 0 in the configuration file.
-
-
-The Details
------------
-
-Laptop mode is controlled by the knob /proc/sys/vm/laptop_mode. This knob is
-present for all kernels that have the laptop mode patch, regardless of any
-configuration options. When the knob is set, any physical disk I/O (that might
-have caused the hard disk to spin up) causes Linux to flush all dirty blocks. The
-result of this is that after a disk has spun down, it will not be spun up
-anymore to write dirty blocks, because those blocks had already been written
-immediately after the most recent read operation. The value of the laptop_mode
-knob determines the time between the occurrence of disk I/O and when the flush
-is triggered. A sensible value for the knob is 5 seconds. Setting the knob to
-0 disables laptop mode.
-
-To increase the effectiveness of the laptop_mode strategy, the laptop_mode
-control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
-/proc/sys/vm to about 10 minutes (by default), which means that pages that are
-dirtied are not forced to be written to disk as often. The control script also
-changes the dirty background ratio, so that background writeback of dirty pages
-is not done anymore. Combined with a higher commit value (also 10 minutes) for
-ext3 or ReiserFS filesystems (also done automatically by the control script),
-this results in concentration of disk activity in a small time interval which
-occurs only once every 10 minutes, or whenever the disk is forced to spin up by
-a cache miss. The disk can then be spun down in the periods of inactivity.
-
-If you want to find out which process caused the disk to spin up, you can
-gather information by setting the flag /proc/sys/vm/block_dump. When this flag
-is set, Linux reports all disk read and write operations that take place, and
-all block dirtyings done to files. This makes it possible to debug why a disk
-needs to spin up, and to increase battery life even more. The output of
-block_dump is written to the kernel output, and it can be retrieved using
-"dmesg". When you use block_dump and your kernel logging level also includes
-kernel debugging messages, you probably want to turn off klogd, otherwise
-the output of block_dump will be logged, causing disk activity that is not
-normally there.
-
-
-Configuration
--------------
-
-The laptop mode configuration file is located in /etc/default/laptop-mode on
-Debian-based systems, or in /etc/sysconfig/laptop-mode on other systems. It
-contains the following options:
-
-MAX_AGE:
-
-Maximum time, in seconds, of hard drive spindown time that you are
-comfortable with. Worst case, it's possible that you could lose this
-amount of work if your battery fails while you're in laptop mode.
-
-MINIMUM_BATTERY_MINUTES:
-
-Automatically disable laptop mode if the remaining number of minutes of
-battery power is less than this value. Default is 10 minutes.
-
-AC_HD/BATT_HD:
-
-The idle timeout that should be set on your hard drive when laptop mode
-is active (BATT_HD) and when it is not active (AC_HD). The defaults are
-20 seconds (value 4) for BATT_HD and 2 hours (value 244) for AC_HD. The
-possible values are those listed in the manual page for "hdparm" for the
-"-S" option.
-
-HD:
-
-The devices for which the spindown timeout should be adjusted by laptop mode.
-Default is /dev/hda. If you specify multiple devices, separate them by a space.
-
-READAHEAD:
-
-Disk readahead, in 512-byte sectors, while laptop mode is active. A large
-readahead can prevent disk accesses for things like executable pages (which are
-loaded on demand while the application executes) and sequentially accessed data
-(MP3s).
-
-DO_REMOUNTS:
-
-The control script automatically remounts any mounted journaled filesystems
-with appropriate commit interval options. When this option is set to 0, this
-feature is disabled.
-
-DO_REMOUNT_NOATIME:
-
-When remounting, should the filesystems be remounted with the noatime option?
-Normally, this is set to "1" (enabled), but there may be programs that require
-access time recording.
-
-DIRTY_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-before a writeback is forced, while laptop mode is active. Corresponds to
-the /proc/sys/vm/dirty_ratio sysctl.
-
-DIRTY_BACKGROUND_RATIO:
-
-The percentage of memory that is allowed to contain "dirty" or unsaved data
-after a forced writeback is done due to an exceeding of DIRTY_RATIO. Set
-this nice and low. This corresponds to the /proc/sys/vm/dirty_background_ratio
-sysctl.
-
-Note that the behaviour of dirty_background_ratio is quite different
-when laptop mode is active and when it isn't. When laptop mode is inactive,
-dirty_background_ratio is the threshold percentage at which background writeouts
-start taking place. When laptop mode is active, however, background writeouts
-are disabled, and the dirty_background_ratio only determines how much writeback
-is done when dirty_ratio is reached.
-
-DO_CPU:
-
-Enable CPU frequency scaling when in laptop mode. (Requires CPUFreq to be setup.
-See Documentation/admin-guide/pm/cpufreq.rst for more info. Disabled by default.)
-
-CPU_MAXFREQ:
-
-When on battery, what is the maximum CPU speed that the system should use? Legal
-values are "slowest" for the slowest speed that your CPU is able to operate at,
-or a value listed in /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies.
-
-
-Tips & Tricks
--------------
-
-* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
- of his regular 3 to 3.5 hours) using a spindown time of 5 seconds (BATT_HD=1).
-
-* You can spin down the disk while playing MP3, by setting disk readahead
- to 8MB (READAHEAD=16384). Effectively, the disk will read a complete MP3 at
- once, and will then spin down while the MP3 is playing. (Thanks to Bartek
- Kania.)
-
-* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
- of colours that my display uses it consumes less battery power. I've seen
- this on powerbooks too. I hope that this is a piece of information that
- might be useful to the Laptop Mode patch or its users."
-
-* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
- file after every logging. When you're using laptop-mode and your disk doesn't
- spin down, this is a likely culprit.
-
-* Richard Atterer observed that laptop mode does not work well with noflushd
- (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
- from doing its thing.
-
-* If you're worried about your data, you might want to consider using a USB
- memory stick or something like that as a "working area". (Be aware though
- that flash memory can only handle a limited number of writes, and overuse
- may wear out your memory stick pretty quickly. Do _not_ use journalling
- filesystems on flash memory sticks.)
-
-
-Configuration file for control and ACPI battery scripts
--------------------------------------------------------
-
-This allows the tunables to be changed for the scripts via an external
-configuration file
-
-It should be installed as /etc/default/laptop-mode on Debian, and as
-/etc/sysconfig/laptop-mode on Red Hat, SUSE, Mandrake, and other work-alikes.
-
---------------------CONFIG FILE BEGIN-------------------------------------------
-# Maximum time, in seconds, of hard drive spindown time that you are
-# comfortable with. Worst case, it's possible that you could lose this
-# amount of work if your battery fails you while in laptop mode.
-#MAX_AGE=600
-
-# Automatically disable laptop mode when the number of minutes of battery
-# that you have left goes below this threshold.
-MINIMUM_BATTERY_MINUTES=10
-
-# Read-ahead, in 512-byte sectors. You can spin down the disk while playing MP3/OGG
-# by setting the disk readahead to 8MB (READAHEAD=16384). Effectively, the disk
-# will read a complete MP3 at once, and will then spin down while the MP3/OGG is
-# playing.
-#READAHEAD=4096
-
-# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-#DO_REMOUNTS=1
-
-# And shall we add the "noatime" option to that as well? (1=yes)
-#DO_REMOUNT_NOATIME=1
-
-# Dirty synchronous ratio. At this percentage of dirty pages the process
-# which
-# calls write() does its own writeback
-#DIRTY_RATIO=40
-
-#
-# Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been
-# exceeded, the kernel will wake flusher threads which will then reduce the
-# amount of dirty memory to dirty_background_ratio. Set this nice and low,
-# so once some writeout has commenced, we do a lot of it.
-#
-#DIRTY_BACKGROUND_RATIO=5
-
-# kernel default dirty buffer age
-#DEF_AGE=30
-#DEF_UPDATE=5
-#DEF_DIRTY_BACKGROUND_RATIO=10
-#DEF_DIRTY_RATIO=40
-#DEF_XFS_AGE_BUFFER=15
-#DEF_XFS_SYNC_INTERVAL=30
-#DEF_XFS_BUFD_INTERVAL=1
-
-# This must be adjusted manually to the value of HZ in the running kernel
-# on 2.4, until the XFS people change their 2.4 external interfaces to work in
-# centisecs. This can be automated, but it's a work in progress that still
-# needs# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for
-# external interfaces, and that is currently always set to 100. So you don't
-# need to change this on 2.6.
-#XFS_HZ=100
-
-# Should the maximum CPU frequency be adjusted down while on battery?
-# Requires CPUFreq to be setup.
-# See Documentation/admin-guide/pm/cpufreq.rst for more info
-#DO_CPU=0
-
-# When on battery what is the maximum CPU speed that the system should
-# use? Legal values are "slowest" for the slowest speed that your
-# CPU is able to operate at, or a value listed in:
-# /sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies
-# Only applicable if DO_CPU=1.
-#CPU_MAXFREQ=slowest
-
-# Idle timeout for your hard drive (man hdparm for valid values, -S option)
-# Default is 2 hours on AC (AC_HD=244) and 20 seconds for battery (BATT_HD=4).
-#AC_HD=244
-#BATT_HD=4
-
-# The drives for which to adjust the idle timeout. Separate them by a space,
-# e.g. HD="/dev/hda /dev/hdb".
-#HD="/dev/hda"
-
-# Set the spindown timeout on a hard drive?
-#DO_HD=1
-
---------------------CONFIG FILE END---------------------------------------------
-
-
-Control script
---------------
-
-Please note that this control script works for the Linux 2.4 and 2.6 series (thanks
-to Kiko Piris).
-
---------------------CONTROL SCRIPT BEGIN----------------------------------------
-#!/bin/bash
-
-# start or stop laptop_mode, best run by a power management daemon when
-# ac gets connected/disconnected from a laptop
-#
-# install as /sbin/laptop_mode
-#
-# Contributors to this script: Kiko Piris
-# Bart Samwel
-# Micha Feigin
-# Andrew Morton
-# Herve Eychenne
-# Dax Kelson
-#
-# Original Linux 2.4 version by: Jens Axboe
-
-#############################################################################
-
-# Source config
-if [ -f /etc/default/laptop-mode ] ; then
- # Debian
- . /etc/default/laptop-mode
-elif [ -f /etc/sysconfig/laptop-mode ] ; then
- # Others
- . /etc/sysconfig/laptop-mode
-fi
-
-# Don't raise an error if the config file is incomplete
-# set defaults instead:
-
-# Maximum time, in seconds, of hard drive spindown time that you are
-# comfortable with. Worst case, it's possible that you could lose this
-# amount of work if your battery fails you while in laptop mode.
-MAX_AGE=${MAX_AGE:-'600'}
-
-# Read-ahead, in kilobytes
-READAHEAD=${READAHEAD:-'4096'}
-
-# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
-DO_REMOUNTS=${DO_REMOUNTS:-'1'}
-
-# And shall we add the "noatime" option to that as well? (1=yes)
-DO_REMOUNT_NOATIME=${DO_REMOUNT_NOATIME:-'1'}
-
-# Shall we adjust the idle timeout on a hard drive?
-DO_HD=${DO_HD:-'1'}
-
-# Adjust idle timeout on which hard drive?
-HD="${HD:-'/dev/hda'}"
-
-# spindown time for HD (hdparm -S values)
-AC_HD=${AC_HD:-'244'}
-BATT_HD=${BATT_HD:-'4'}
-
-# Dirty synchronous ratio. At this percentage of dirty pages the process which
-# calls write() does its own writeback
-DIRTY_RATIO=${DIRTY_RATIO:-'40'}
-
-# cpu frequency scaling
-# See Documentation/admin-guide/pm/cpufreq.rst for more info
-DO_CPU=${CPU_MANAGE:-'0'}
-CPU_MAXFREQ=${CPU_MAXFREQ:-'slowest'}
-
-#
-# Allowed dirty background ratio, in percent. Once DIRTY_RATIO has been
-# exceeded, the kernel will wake flusher threads which will then reduce the
-# amount of dirty memory to dirty_background_ratio. Set this nice and low,
-# so once some writeout has commenced, we do a lot of it.
-#
-DIRTY_BACKGROUND_RATIO=${DIRTY_BACKGROUND_RATIO:-'5'}
-
-# kernel default dirty buffer age
-DEF_AGE=${DEF_AGE:-'30'}
-DEF_UPDATE=${DEF_UPDATE:-'5'}
-DEF_DIRTY_BACKGROUND_RATIO=${DEF_DIRTY_BACKGROUND_RATIO:-'10'}
-DEF_DIRTY_RATIO=${DEF_DIRTY_RATIO:-'40'}
-DEF_XFS_AGE_BUFFER=${DEF_XFS_AGE_BUFFER:-'15'}
-DEF_XFS_SYNC_INTERVAL=${DEF_XFS_SYNC_INTERVAL:-'30'}
-DEF_XFS_BUFD_INTERVAL=${DEF_XFS_BUFD_INTERVAL:-'1'}
-
-# This must be adjusted manually to the value of HZ in the running kernel
-# on 2.4, until the XFS people change their 2.4 external interfaces to work in
-# centisecs. This can be automated, but it's a work in progress that still needs
-# some fixes. On 2.6 kernels, XFS uses USER_HZ instead of HZ for external
-# interfaces, and that is currently always set to 100. So you don't need to
-# change this on 2.6.
-XFS_HZ=${XFS_HZ:-'100'}
-
-#############################################################################
-
-KLEVEL="$(uname -r |
- {
- IFS='.' read a b c
- echo $a.$b
- }
-)"
-case "$KLEVEL" in
- "2.4"|"2.6")
- ;;
- *)
- echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')" >&2
- exit 1
- ;;
-esac
-
-if [ ! -e /proc/sys/vm/laptop_mode ] ; then
- echo "Kernel is not patched with laptop_mode patch." >&2
- exit 1
-fi
-
-if [ ! -w /proc/sys/vm/laptop_mode ] ; then
- echo "You do not have enough privileges to enable laptop_mode." >&2
- exit 1
-fi
-
-# Remove an option (the first parameter) of the form option=<number> from
-# a mount options string (the rest of the parameters).
-parse_mount_opts () {
- OPT="$1"
- shift
- echo ",$*," | sed \
- -e 's/,'"$OPT"'=[0-9]*,/,/g' \
- -e 's/,,*/,/g' \
- -e 's/^,//' \
- -e 's/,$//'
-}
-
-# Remove an option (the first parameter) without any arguments from
-# a mount option string (the rest of the parameters).
-parse_nonumber_mount_opts () {
- OPT="$1"
- shift
- echo ",$*," | sed \
- -e 's/,'"$OPT"',/,/g' \
- -e 's/,,*/,/g' \
- -e 's/^,//' \
- -e 's/,$//'
-}
-
-# Find out the state of a yes/no option (e.g. "atime"/"noatime") in
-# fstab for a given filesystem, and use this state to replace the
-# value of the option in another mount options string. The device
-# is the first argument, the option name the second, and the default
-# value the third. The remainder is the mount options string.
-#
-# Example:
-# parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
-#
-# If fstab contains, say, "rw" for this filesystem, then the result
-# will be "defaults,atime".
-parse_yesno_opts_wfstab () {
- L_DEV="$1"
- OPT="$2"
- DEF_OPT="$3"
- shift 3
- L_OPTS="$*"
- PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
- PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
- # Watch for a default atime in fstab
- FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
- if echo "$FSTAB_OPTS" | grep "$OPT" > /dev/null ; then
- # option specified in fstab: extract the value and use it
- if echo "$FSTAB_OPTS" | grep "no$OPT" > /dev/null ; then
- echo "$PARSEDOPTS1,no$OPT"
- else
- # no$OPT not found -- so we must have $OPT.
- echo "$PARSEDOPTS1,$OPT"
- fi
- else
- # option not specified in fstab -- choose the default.
- echo "$PARSEDOPTS1,$DEF_OPT"
- fi
-}
-
-# Find out the state of a numbered option (e.g. "commit=NNN") in
-# fstab for a given filesystem, and use this state to replace the
-# value of the option in another mount options string. The device
-# is the first argument, and the option name the second. The
-# remainder is the mount options string in which the replacement
-# must be done.
-#
-# Example:
-# parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
-#
-# If fstab contains, say, "commit=3,rw" for this filesystem, then the
-# result will be "rw,commit=3".
-parse_mount_opts_wfstab () {
- L_DEV="$1"
- OPT="$2"
- shift 2
- L_OPTS="$*"
- PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
- # Watch for a default commit in fstab
- FSTAB_OPTS="$(awk '$1 == "'$L_DEV'" { print $4 }' /etc/fstab)"
- if echo "$FSTAB_OPTS" | grep "$OPT=" > /dev/null ; then
- # option specified in fstab: extract the value, and use it
- echo -n "$PARSEDOPTS1,$OPT="
- echo ",$FSTAB_OPTS," | sed \
- -e 's/.*,'"$OPT"'=//' \
- -e 's/,.*//'
- else
- # option not specified in fstab: set it to 0
- echo "$PARSEDOPTS1,$OPT=0"
- fi
-}
-
-deduce_fstype () {
- MP="$1"
- # My root filesystem unfortunately has
- # type "unknown" in /etc/mtab. If we encounter
- # "unknown", we try to get the type from fstab.
- cat /etc/fstab |
- grep -v '^#' |
- while read FSTAB_DEV FSTAB_MP FSTAB_FST FSTAB_OPTS FSTAB_DUMP FSTAB_DUMP ; do
- if [ "$FSTAB_MP" = "$MP" ]; then
- echo $FSTAB_FST
- exit 0
- fi
- done
-}
-
-if [ $DO_REMOUNT_NOATIME -eq 1 ] ; then
- NOATIME_OPT=",noatime"
-fi
-
-case "$1" in
- start)
- AGE=$((100*$MAX_AGE))
- XFS_AGE=$(($XFS_HZ*$MAX_AGE))
- echo -n "Starting laptop_mode"
-
- if [ -d /proc/sys/vm/pagebuf ] ; then
- # (For 2.4 and early 2.6.)
- # This only needs to be set, not reset -- it is only used when
- # laptop mode is enabled.
- echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
- echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
- elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
- # (A couple of early 2.6 laptop mode patches had these.)
- # The same goes for these.
- echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
- echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
- elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
- # (2.6.6)
- # But not for these -- they are also used in normal
- # operation.
- echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
- echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
- elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
- # (2.6.7 upwards)
- # And not for these either. These are in centisecs,
- # not USER_HZ, so we have to use $AGE, not $XFS_AGE.
- echo $AGE > /proc/sys/fs/xfs/age_buffer_centisecs
- echo $AGE > /proc/sys/fs/xfs/xfssyncd_centisecs
- echo 3000 > /proc/sys/fs/xfs/xfsbufd_centisecs
- fi
-
- case "$KLEVEL" in
- "2.4")
- echo 1 > /proc/sys/vm/laptop_mode
- echo "30 500 0 0 $AGE $AGE 60 20 0" > /proc/sys/vm/bdflush
- ;;
- "2.6")
- echo 5 > /proc/sys/vm/laptop_mode
- echo "$AGE" > /proc/sys/vm/dirty_writeback_centisecs
- echo "$AGE" > /proc/sys/vm/dirty_expire_centisecs
- echo "$DIRTY_RATIO" > /proc/sys/vm/dirty_ratio
- echo "$DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio
- ;;
- esac
- if [ $DO_REMOUNTS -eq 1 ]; then
- cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
- PARSEDOPTS="$(parse_mount_opts "$OPTS")"
- if [ "$FST" = 'unknown' ]; then
- FST=$(deduce_fstype $MP)
- fi
- case "$FST" in
- "ext3"|"reiserfs")
- PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
- mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE$NOATIME_OPT
- ;;
- "xfs")
- mount $DEV -t $FST $MP -o remount,$OPTS$NOATIME_OPT
- ;;
- esac
- if [ -b $DEV ] ; then
- blockdev --setra $(($READAHEAD * 2)) $DEV
- fi
- done
- fi
- if [ $DO_HD -eq 1 ] ; then
- for THISHD in $HD ; do
- /sbin/hdparm -S $BATT_HD $THISHD > /dev/null 2>&1
- /sbin/hdparm -B 1 $THISHD > /dev/null 2>&1
- done
- fi
- if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
- if [ $CPU_MAXFREQ = 'slowest' ]; then
- CPU_MAXFREQ=`cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq`
- fi
- echo $CPU_MAXFREQ > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
- fi
- echo "."
- ;;
- stop)
- U_AGE=$((100*$DEF_UPDATE))
- B_AGE=$((100*$DEF_AGE))
- echo -n "Stopping laptop_mode"
- echo 0 > /proc/sys/vm/laptop_mode
- if [ -f /proc/sys/fs/xfs/age_buffer -a ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
- # These need to be restored, if there are no lm_*.
- echo $(($XFS_HZ*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer
- echo $(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/sync_interval
- elif [ -f /proc/sys/fs/xfs/age_buffer_centisecs ] ; then
- # These need to be restored as well.
- echo $((100*$DEF_XFS_AGE_BUFFER)) > /proc/sys/fs/xfs/age_buffer_centisecs
- echo $((100*$DEF_XFS_SYNC_INTERVAL)) > /proc/sys/fs/xfs/xfssyncd_centisecs
- echo $((100*$DEF_XFS_BUFD_INTERVAL)) > /proc/sys/fs/xfs/xfsbufd_centisecs
- fi
- case "$KLEVEL" in
- "2.4")
- echo "30 500 0 0 $U_AGE $B_AGE 60 20 0" > /proc/sys/vm/bdflush
- ;;
- "2.6")
- echo "$U_AGE" > /proc/sys/vm/dirty_writeback_centisecs
- echo "$B_AGE" > /proc/sys/vm/dirty_expire_centisecs
- echo "$DEF_DIRTY_RATIO" > /proc/sys/vm/dirty_ratio
- echo "$DEF_DIRTY_BACKGROUND_RATIO" > /proc/sys/vm/dirty_background_ratio
- ;;
- esac
- if [ $DO_REMOUNTS -eq 1 ] ; then
- cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
- # Reset commit and atime options to defaults.
- if [ "$FST" = 'unknown' ]; then
- FST=$(deduce_fstype $MP)
- fi
- case "$FST" in
- "ext3"|"reiserfs")
- PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
- PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
- mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
- ;;
- "xfs")
- PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
- mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
- ;;
- esac
- if [ -b $DEV ] ; then
- blockdev --setra 256 $DEV
- fi
- done
- fi
- if [ $DO_HD -eq 1 ] ; then
- for THISHD in $HD ; do
- /sbin/hdparm -S $AC_HD $THISHD > /dev/null 2>&1
- /sbin/hdparm -B 255 $THISHD > /dev/null 2>&1
- done
- fi
- if [ $DO_CPU -eq 1 -a -e /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_min_freq ]; then
- echo `cat /sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq` > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq
- fi
- echo "."
- ;;
- *)
- echo "Usage: $0 {start|stop}" 2>&1
- exit 1
- ;;
-
-esac
-
-exit 0
---------------------CONTROL SCRIPT END------------------------------------------
-
-
-ACPI integration
-----------------
-
-Dax Kelson submitted this so that the ACPI acpid daemon will
-kick off the laptop_mode script and run hdparm. The part that
-automatically disables laptop mode when the battery is low was
-written by Jan Topinski.
-
------------------/etc/acpi/events/ac_adapter BEGIN------------------------------
-event=ac_adapter
-action=/etc/acpi/actions/ac.sh %e
-----------------/etc/acpi/events/ac_adapter END---------------------------------
-
-
------------------/etc/acpi/events/battery BEGIN---------------------------------
-event=battery.*
-action=/etc/acpi/actions/battery.sh %e
-----------------/etc/acpi/events/battery END------------------------------------
-
-
-----------------/etc/acpi/actions/ac.sh BEGIN-----------------------------------
-#!/bin/bash
-
-# ac on/offline event handler
-
-status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/$2/state`
-
-case $status in
- "on-line")
- /sbin/laptop_mode stop
- exit 0
- ;;
- "off-line")
- /sbin/laptop_mode start
- exit 0
- ;;
-esac
----------------------------/etc/acpi/actions/ac.sh END--------------------------
-
-
----------------------------/etc/acpi/actions/battery.sh BEGIN-------------------
-#! /bin/bash
-
-# Automatically disable laptop mode when the battery almost runs out.
-
-BATT_INFO=/proc/acpi/battery/$2/state
-
-if [[ -f /proc/sys/vm/laptop_mode ]]
-then
- LM=`cat /proc/sys/vm/laptop_mode`
- if [[ $LM -gt 0 ]]
- then
- if [[ -f $BATT_INFO ]]
- then
- # Source the config file only now that we know we need
- if [ -f /etc/default/laptop-mode ] ; then
- # Debian
- . /etc/default/laptop-mode
- elif [ -f /etc/sysconfig/laptop-mode ] ; then
- # Others
- . /etc/sysconfig/laptop-mode
- fi
- MINIMUM_BATTERY_MINUTES=${MINIMUM_BATTERY_MINUTES:-'10'}
-
- ACTION="`cat $BATT_INFO | grep charging | cut -c 26-`"
- if [[ ACTION -eq "discharging" ]]
- then
- PRESENT_RATE=`cat $BATT_INFO | grep "present rate:" | sed "s/.* \([0-9][0-9]* \).*/\1/" `
- REMAINING=`cat $BATT_INFO | grep "remaining capacity:" | sed "s/.* \([0-9][0-9]* \).*/\1/" `
- fi
- if (($REMAINING * 60 / $PRESENT_RATE < $MINIMUM_BATTERY_MINUTES))
- then
- /sbin/laptop_mode stop
- fi
- else
- logger -p daemon.warning "You are using laptop mode and your battery interface $BATT_INFO is missing. This may lead to loss of data when the battery runs out. Check kernel ACPI support and /proc/acpi/battery folder, and edit /etc/acpi/battery.sh to set BATT_INFO to the correct path."
- fi
- fi
-fi
----------------------------/etc/acpi/actions/battery.sh END--------------------
-
-
-Monitoring tool
----------------
-
-Bartek Kania submitted this, it can be used to measure how much time your disk
-spends spun up/down. See tools/laptop/dslm/dslm.c
diff --git a/Documentation/laptops/lg-laptop.rst b/Documentation/laptops/lg-laptop.rst
deleted file mode 100644
index f2c2ffe31101..000000000000
--- a/Documentation/laptops/lg-laptop.rst
+++ /dev/null
@@ -1,85 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0+
-
-:orphan:
-
-LG Gram laptop extra features
-=============================
-
-By Matan Ziv-Av <matan@svgalib.org>
-
-
-Hotkeys
--------
-
-The following FN keys are ignored by the kernel without this driver:
-
-- FN-F1 (LG control panel) - Generates F15
-- FN-F5 (Touchpad toggle) - Generates F13
-- FN-F6 (Airplane mode) - Generates RFKILL
-- FN-F8 (Keyboard backlight) - Generates F16.
- This key also changes keyboard backlight mode.
-- FN-F9 (Reader mode) - Generates F14
-
-The rest of the FN keys work without a need for a special driver.
-
-
-Reader mode
------------
-
-Writing 0/1 to /sys/devices/platform/lg-laptop/reader_mode disables/enables
-reader mode. In this mode the screen colors change (blue color reduced),
-and the reader mode indicator LED (on F9 key) turns on.
-
-
-FN Lock
--------
-
-Writing 0/1 to /sys/devices/platform/lg-laptop/fn_lock disables/enables
-FN lock.
-
-
-Battery care limit
-------------------
-
-Writing 80/100 to /sys/devices/platform/lg-laptop/battery_care_limit
-sets the maximum capacity to charge the battery. Limiting the charge
-reduces battery capacity loss over time.
-
-This value is reset to 100 when the kernel boots.
-
-
-Fan mode
---------
-
-Writing 1/0 to /sys/devices/platform/lg-laptop/fan_mode disables/enables
-the fan silent mode.
-
-
-USB charge
-----------
-
-Writing 0/1 to /sys/devices/platform/lg-laptop/usb_charge disables/enables
-charging another device from the USB port while the device is turned off.
-
-This value is reset to 0 when the kernel boots.
-
-
-LEDs
-~~~~
-
-The are two LED devices supported by the driver:
-
-Keyboard backlight
-------------------
-
-A led device named kbd_led controls the keyboard backlight. There are three
-lighting level: off (0), low (127) and high (255).
-
-The keyboard backlight is also controlled by the key combination FN-F8
-which cycles through those levels.
-
-
-Touchpad indicator LED
-----------------------
-
-On the F5 key. Controlled by led device names tpad_led.
diff --git a/Documentation/laptops/sony-laptop.txt b/Documentation/laptops/sony-laptop.txt
deleted file mode 100644
index 978b1e615155..000000000000
--- a/Documentation/laptops/sony-laptop.txt
+++ /dev/null
@@ -1,144 +0,0 @@
-Sony Notebook Control Driver (SNC) Readme
------------------------------------------
- Copyright (C) 2004- 2005 Stelian Pop <stelian@popies.net>
- Copyright (C) 2007 Mattia Dongili <malattia@linux.it>
-
-This mini-driver drives the SNC and SPIC device present in the ACPI BIOS of the
-Sony Vaio laptops. This driver mixes both devices functions under the same
-(hopefully consistent) interface. This also means that the sonypi driver is
-obsoleted by sony-laptop now.
-
-Fn keys (hotkeys):
-------------------
-Some models report hotkeys through the SNC or SPIC devices, such events are
-reported both through the ACPI subsystem as acpi events and through the INPUT
-subsystem. See the logs of /proc/bus/input/devices to find out what those
-events are and which input devices are created by the driver.
-Additionally, loading the driver with the debug option will report all events
-in the kernel log.
-
-The "scancodes" passed to the input system (that can be remapped with udev)
-are indexes to the table "sony_laptop_input_keycode_map" in the sony-laptop.c
-module. For example the "FN/E" key combination (EJECTCD on some models)
-generates the scancode 20 (0x14).
-
-Backlight control:
-------------------
-If your laptop model supports it, you will find sysfs files in the
-/sys/class/backlight/sony/
-directory. You will be able to query and set the current screen
-brightness:
- brightness get/set screen brightness (an integer
- between 0 and 7)
- actual_brightness reading from this file will query the HW
- to get real brightness value
- max_brightness the maximum brightness value
-
-
-Platform specific:
-------------------
-Loading the sony-laptop module will create a
-/sys/devices/platform/sony-laptop/
-directory populated with some files.
-
-You then read/write integer values from/to those files by using
-standard UNIX tools.
-
-The files are:
- brightness_default screen brightness which will be set
- when the laptop will be rebooted
- cdpower power on/off the internal CD drive
- audiopower power on/off the internal sound card
- lanpower power on/off the internal ethernet card
- (only in debug mode)
- bluetoothpower power on/off the internal bluetooth device
- fanspeed get/set the fan speed
-
-Note that some files may be missing if they are not supported
-by your particular laptop model.
-
-Example usage:
- # echo "1" > /sys/devices/platform/sony-laptop/brightness_default
-sets the lowest screen brightness for the next and later reboots,
- # echo "8" > /sys/devices/platform/sony-laptop/brightness_default
-sets the highest screen brightness for the next and later reboots,
- # cat /sys/devices/platform/sony-laptop/brightness_default
-retrieves the value.
-
- # echo "0" > /sys/devices/platform/sony-laptop/audiopower
-powers off the sound card,
- # echo "1" > /sys/devices/platform/sony-laptop/audiopower
-powers on the sound card.
-
-
-RFkill control:
----------------
-More recent Vaio models expose a consistent set of ACPI methods to
-control radio frequency emitting devices. If you are a lucky owner of
-such a laptop you will find the necessary rfkill devices under
-/sys/class/rfkill. Check those starting with sony-* in
- # grep . /sys/class/rfkill/*/{state,name}
-
-
-Development:
-------------
-
-If you want to help with the development of this driver (and
-you are not afraid of any side effects doing strange things with
-your ACPI BIOS could have on your laptop), load the driver and
-pass the option 'debug=1'.
-
-REPEAT: DON'T DO THIS IF YOU DON'T LIKE RISKY BUSINESS.
-
-In your kernel logs you will find the list of all ACPI methods
-the SNC device has on your laptop.
-
-* For new models you will see a long list of meaningless method names,
-reading the DSDT table source should reveal that:
-(1) the SNC device uses an internal capability lookup table
-(2) SN00 is used to find values in the lookup table
-(3) SN06 and SN07 are used to call into the real methods based on
- offsets you can obtain iterating the table using SN00
-(4) SN02 used to enable events.
-Some values in the capability lookup table are more or less known, see
-the code for all sony_call_snc_handle calls, others are more obscure.
-
-* For old models you can see the GCDP/GCDP methods used to pwer on/off
-the CD drive, but there are others and they are usually different from
-model to model.
-
-I HAVE NO IDEA WHAT THOSE METHODS DO.
-
-The sony-laptop driver creates, for some of those methods (the most
-current ones found on several Vaio models), an entry under
-/sys/devices/platform/sony-laptop, just like the 'cdpower' one.
-You can create other entries corresponding to your own laptop methods by
-further editing the source (see the 'sony_nc_values' table, and add a new
-entry to this table with your get/set method names using the
-SNC_HANDLE_NAMES macro).
-
-Your mission, should you accept it, is to try finding out what
-those entries are for, by reading/writing random values from/to those
-files and find out what is the impact on your laptop.
-
-Should you find anything interesting, please report it back to me,
-I will not disavow all knowledge of your actions :)
-
-See also http://www.linux.it/~malattia/wiki/index.php/Sony_drivers for other
-useful info.
-
-Bugs/Limitations:
------------------
-
-* This driver is not based on official documentation from Sony
- (because there is none), so there is no guarantee this driver
- will work at all, or do the right thing. Although this hasn't
- happened to me, this driver could do very bad things to your
- laptop, including permanent damage.
-
-* The sony-laptop and sonypi drivers do not interact at all. In the
- future, sonypi will be removed and replaced by sony-laptop.
-
-* spicctrl, which is the userspace tool used to communicate with the
- sonypi driver (through /dev/sonypi) is deprecated as well since all
- its features are now available under the sysfs tree via sony-laptop.
diff --git a/Documentation/laptops/sonypi.txt b/Documentation/laptops/sonypi.txt
deleted file mode 100644
index 606bdb9ce036..000000000000
--- a/Documentation/laptops/sonypi.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-Sony Programmable I/O Control Device Driver Readme
---------------------------------------------------
- Copyright (C) 2001-2004 Stelian Pop <stelian@popies.net>
- Copyright (C) 2001-2002 Alcôve <www.alcove.com>
- Copyright (C) 2001 Michael Ashley <m.ashley@unsw.edu.au>
- Copyright (C) 2001 Junichi Morita <jun1m@mars.dti.ne.jp>
- Copyright (C) 2000 Takaya Kinjo <t-kinjo@tc4.so-net.ne.jp>
- Copyright (C) 2000 Andrew Tridgell <tridge@samba.org>
-
-This driver enables access to the Sony Programmable I/O Control Device which
-can be found in many Sony Vaio laptops. Some newer Sony laptops (seems to be
-limited to new FX series laptops, at least the FX501 and the FX702) lack a
-sonypi device and are not supported at all by this driver.
-
-It will give access (through a user space utility) to some events those laptops
-generate, like:
- - jogdial events (the small wheel on the side of Vaios)
- - capture button events (only on Vaio Picturebook series)
- - Fn keys
- - bluetooth button (only on C1VR model)
- - programmable keys, back, help, zoom, thumbphrase buttons, etc.
- (when available)
-
-Those events (see linux/sonypi.h) can be polled using the character device node
-/dev/sonypi (major 10, minor auto allocated or specified as a option).
-A simple daemon which translates the jogdial movements into mouse wheel events
-can be downloaded at: <http://popies.net/sonypi/>
-
-Another option to intercept the events is to get them directly through the
-input layer.
-
-This driver supports also some ioctl commands for setting the LCD screen
-brightness and querying the batteries charge information (some more
-commands may be added in the future).
-
-This driver can also be used to set the camera controls on Picturebook series
-(brightness, contrast etc), and is used by the video4linux driver for the
-Motion Eye camera.
-
-Please note that this driver was created by reverse engineering the Windows
-driver and the ACPI BIOS, because Sony doesn't agree to release any programming
-specs for its laptops. If someone convinces them to do so, drop me a note.
-
-Driver options:
----------------
-
-Several options can be passed to the sonypi driver using the standard
-module argument syntax (<param>=<value> when passing the option to the
-module or sonypi.<param>=<value> on the kernel boot line when sonypi is
-statically linked into the kernel). Those options are:
-
- minor: minor number of the misc device /dev/sonypi,
- default is -1 (automatic allocation, see /proc/misc
- or kernel logs)
-
- camera: if you have a PictureBook series Vaio (with the
- integrated MotionEye camera), set this parameter to 1
- in order to let the driver access to the camera
-
- fnkeyinit: on some Vaios (C1VE, C1VR etc), the Fn key events don't
- get enabled unless you set this parameter to 1.
- Do not use this option unless it's actually necessary,
- some Vaio models don't deal well with this option.
- This option is available only if the kernel is
- compiled without ACPI support (since it conflicts
- with it and it shouldn't be required anyway if
- ACPI is already enabled).
-
- verbose: set to 1 to print unknown events received from the
- sonypi device.
- set to 2 to print all events received from the
- sonypi device.
-
- compat: uses some compatibility code for enabling the sonypi
- events. If the driver worked for you in the past
- (prior to version 1.5) and does not work anymore,
- add this option and report to the author.
-
- mask: event mask telling the driver what events will be
- reported to the user. This parameter is required for
- some Vaio models where the hardware reuses values
- used in other Vaio models (like the FX series who does
- not have a jogdial but reuses the jogdial events for
- programmable keys events). The default event mask is
- set to 0xffffffff, meaning that all possible events
- will be tried. You can use the following bits to
- construct your own event mask (from
- drivers/char/sonypi.h):
- SONYPI_JOGGER_MASK 0x0001
- SONYPI_CAPTURE_MASK 0x0002
- SONYPI_FNKEY_MASK 0x0004
- SONYPI_BLUETOOTH_MASK 0x0008
- SONYPI_PKEY_MASK 0x0010
- SONYPI_BACK_MASK 0x0020
- SONYPI_HELP_MASK 0x0040
- SONYPI_LID_MASK 0x0080
- SONYPI_ZOOM_MASK 0x0100
- SONYPI_THUMBPHRASE_MASK 0x0200
- SONYPI_MEYE_MASK 0x0400
- SONYPI_MEMORYSTICK_MASK 0x0800
- SONYPI_BATTERY_MASK 0x1000
- SONYPI_WIRELESS_MASK 0x2000
-
- useinput: if set (which is the default) two input devices are
- created, one which interprets the jogdial events as
- mouse events, the other one which acts like a
- keyboard reporting the pressing of the special keys.
-
-Module use:
------------
-
-In order to automatically load the sonypi module on use, you can put those
-lines a configuration file in /etc/modprobe.d/:
-
- alias char-major-10-250 sonypi
- options sonypi minor=250
-
-This supposes the use of minor 250 for the sonypi device:
-
- # mknod /dev/sonypi c 10 250
-
-Bugs:
------
-
- - several users reported that this driver disables the BIOS-managed
- Fn-keys which put the laptop in sleeping state, or switch the
- external monitor on/off. There is no workaround yet, since this
- driver disables all APM management for those keys, by enabling the
- ACPI management (and the ACPI core stuff is not complete yet). If
- you have one of those laptops with working Fn keys and want to
- continue to use them, don't use this driver.
-
- - some users reported that the laptop speed is lower (dhrystone
- tested) when using the driver with the fnkeyinit parameter. I cannot
- reproduce it on my laptop and not all users have this problem.
- This happens because the fnkeyinit parameter enables the ACPI
- mode (but without additional ACPI control, like processor
- speed handling etc). Use ACPI instead of APM if it works on your
- laptop.
-
- - sonypi lacks the ability to distinguish between certain key
- events on some models.
-
- - some models with the nvidia card (geforce go 6200 tc) uses a
- different way to adjust the backlighting of the screen. There
- is a userspace utility to adjust the brightness on those models,
- which can be downloaded from
- http://www.acc.umu.se/~erikw/program/smartdimmer-0.1.tar.bz2
-
- - since all development was done by reverse engineering, there is
- _absolutely no guarantee_ that this driver will not crash your
- laptop. Permanently.
diff --git a/Documentation/laptops/thinkpad-acpi.txt b/Documentation/laptops/thinkpad-acpi.txt
deleted file mode 100644
index 6cced88de6da..000000000000
--- a/Documentation/laptops/thinkpad-acpi.txt
+++ /dev/null
@@ -1,1487 +0,0 @@
- ThinkPad ACPI Extras Driver
-
- Version 0.25
- October 16th, 2013
-
- Borislav Deianov <borislav@users.sf.net>
- Henrique de Moraes Holschuh <hmh@hmh.eng.br>
- http://ibm-acpi.sf.net/
-
-
-This is a Linux driver for the IBM and Lenovo ThinkPad laptops. It
-supports various features of these laptops which are accessible
-through the ACPI and ACPI EC framework, but not otherwise fully
-supported by the generic Linux ACPI drivers.
-
-This driver used to be named ibm-acpi until kernel 2.6.21 and release
-0.13-20070314. It used to be in the drivers/acpi tree, but it was
-moved to the drivers/misc tree and renamed to thinkpad-acpi for kernel
-2.6.22, and release 0.14. It was moved to drivers/platform/x86 for
-kernel 2.6.29 and release 0.22.
-
-The driver is named "thinkpad-acpi". In some places, like module
-names and log messages, "thinkpad_acpi" is used because of userspace
-issues.
-
-"tpacpi" is used as a shorthand where "thinkpad-acpi" would be too
-long due to length limitations on some Linux kernel versions.
-
-Status
-------
-
-The features currently supported are the following (see below for
-detailed description):
-
- - Fn key combinations
- - Bluetooth enable and disable
- - video output switching, expansion control
- - ThinkLight on and off
- - CMOS/UCMS control
- - LED control
- - ACPI sounds
- - temperature sensors
- - Experimental: embedded controller register dump
- - LCD brightness control
- - Volume control
- - Fan control and monitoring: fan speed, fan enable/disable
- - WAN enable and disable
- - UWB enable and disable
-
-A compatibility table by model and feature is maintained on the web
-site, http://ibm-acpi.sf.net/. I appreciate any success or failure
-reports, especially if they add to or correct the compatibility table.
-Please include the following information in your report:
-
- - ThinkPad model name
- - a copy of your ACPI tables, using the "acpidump" utility
- - a copy of the output of dmidecode, with serial numbers
- and UUIDs masked off
- - which driver features work and which don't
- - the observed behavior of non-working features
-
-Any other comments or patches are also more than welcome.
-
-
-Installation
-------------
-
-If you are compiling this driver as included in the Linux kernel
-sources, look for the CONFIG_THINKPAD_ACPI Kconfig option.
-It is located on the menu path: "Device Drivers" -> "X86 Platform
-Specific Device Drivers" -> "ThinkPad ACPI Laptop Extras".
-
-
-Features
---------
-
-The driver exports two different interfaces to userspace, which can be
-used to access the features it provides. One is a legacy procfs-based
-interface, which will be removed at some time in the future. The other
-is a new sysfs-based interface which is not complete yet.
-
-The procfs interface creates the /proc/acpi/ibm directory. There is a
-file under that directory for each feature it supports. The procfs
-interface is mostly frozen, and will change very little if at all: it
-will not be extended to add any new functionality in the driver, instead
-all new functionality will be implemented on the sysfs interface.
-
-The sysfs interface tries to blend in the generic Linux sysfs subsystems
-and classes as much as possible. Since some of these subsystems are not
-yet ready or stabilized, it is expected that this interface will change,
-and any and all userspace programs must deal with it.
-
-
-Notes about the sysfs interface:
-
-Unlike what was done with the procfs interface, correctness when talking
-to the sysfs interfaces will be enforced, as will correctness in the
-thinkpad-acpi's implementation of sysfs interfaces.
-
-Also, any bugs in the thinkpad-acpi sysfs driver code or in the
-thinkpad-acpi's implementation of the sysfs interfaces will be fixed for
-maximum correctness, even if that means changing an interface in
-non-compatible ways. As these interfaces mature both in the kernel and
-in thinkpad-acpi, such changes should become quite rare.
-
-Applications interfacing to the thinkpad-acpi sysfs interfaces must
-follow all sysfs guidelines and correctly process all errors (the sysfs
-interface makes extensive use of errors). File descriptors and open /
-close operations to the sysfs inodes must also be properly implemented.
-
-The version of thinkpad-acpi's sysfs interface is exported by the driver
-as a driver attribute (see below).
-
-Sysfs driver attributes are on the driver's sysfs attribute space,
-for 2.6.23+ this is /sys/bus/platform/drivers/thinkpad_acpi/ and
-/sys/bus/platform/drivers/thinkpad_hwmon/
-
-Sysfs device attributes are on the thinkpad_acpi device sysfs attribute
-space, for 2.6.23+ this is /sys/devices/platform/thinkpad_acpi/.
-
-Sysfs device attributes for the sensors and fan are on the
-thinkpad_hwmon device's sysfs attribute space, but you should locate it
-looking for a hwmon device with the name attribute of "thinkpad", or
-better yet, through libsensors. For 4.14+ sysfs attributes were moved to the
-hwmon device (/sys/bus/platform/devices/thinkpad_hwmon/hwmon/hwmon? or
-/sys/class/hwmon/hwmon?).
-
-Driver version
---------------
-
-procfs: /proc/acpi/ibm/driver
-sysfs driver attribute: version
-
-The driver name and version. No commands can be written to this file.
-
-
-Sysfs interface version
------------------------
-
-sysfs driver attribute: interface_version
-
-Version of the thinkpad-acpi sysfs interface, as an unsigned long
-(output in hex format: 0xAAAABBCC), where:
- AAAA - major revision
- BB - minor revision
- CC - bugfix revision
-
-The sysfs interface version changelog for the driver can be found at the
-end of this document. Changes to the sysfs interface done by the kernel
-subsystems are not documented here, nor are they tracked by this
-attribute.
-
-Changes to the thinkpad-acpi sysfs interface are only considered
-non-experimental when they are submitted to Linux mainline, at which
-point the changes in this interface are documented and interface_version
-may be updated. If you are using any thinkpad-acpi features not yet
-sent to mainline for merging, you do so on your own risk: these features
-may disappear, or be implemented in a different and incompatible way by
-the time they are merged in Linux mainline.
-
-Changes that are backwards-compatible by nature (e.g. the addition of
-attributes that do not change the way the other attributes work) do not
-always warrant an update of interface_version. Therefore, one must
-expect that an attribute might not be there, and deal with it properly
-(an attribute not being there *is* a valid way to make it clear that a
-feature is not available in sysfs).
-
-
-Hot keys
---------
-
-procfs: /proc/acpi/ibm/hotkey
-sysfs device attribute: hotkey_*
-
-In a ThinkPad, the ACPI HKEY handler is responsible for communicating
-some important events and also keyboard hot key presses to the operating
-system. Enabling the hotkey functionality of thinkpad-acpi signals the
-firmware that such a driver is present, and modifies how the ThinkPad
-firmware will behave in many situations.
-
-The driver enables the HKEY ("hot key") event reporting automatically
-when loaded, and disables it when it is removed.
-
-The driver will report HKEY events in the following format:
-
- ibm/hotkey HKEY 00000080 0000xxxx
-
-Some of these events refer to hot key presses, but not all of them.
-
-The driver will generate events over the input layer for hot keys and
-radio switches, and over the ACPI netlink layer for other events. The
-input layer support accepts the standard IOCTLs to remap the keycodes
-assigned to each hot key.
-
-The hot key bit mask allows some control over which hot keys generate
-events. If a key is "masked" (bit set to 0 in the mask), the firmware
-will handle it. If it is "unmasked", it signals the firmware that
-thinkpad-acpi would prefer to handle it, if the firmware would be so
-kind to allow it (and it often doesn't!).
-
-Not all bits in the mask can be modified. Not all bits that can be
-modified do anything. Not all hot keys can be individually controlled
-by the mask. Some models do not support the mask at all. The behaviour
-of the mask is, therefore, highly dependent on the ThinkPad model.
-
-The driver will filter out any unmasked hotkeys, so even if the firmware
-doesn't allow disabling an specific hotkey, the driver will not report
-events for unmasked hotkeys.
-
-Note that unmasking some keys prevents their default behavior. For
-example, if Fn+F5 is unmasked, that key will no longer enable/disable
-Bluetooth by itself in firmware.
-
-Note also that not all Fn key combinations are supported through ACPI
-depending on the ThinkPad model and firmware version. On those
-ThinkPads, it is still possible to support some extra hotkeys by
-polling the "CMOS NVRAM" at least 10 times per second. The driver
-attempts to enables this functionality automatically when required.
-
-procfs notes:
-
-The following commands can be written to the /proc/acpi/ibm/hotkey file:
-
- echo 0xffffffff > /proc/acpi/ibm/hotkey -- enable all hot keys
- echo 0 > /proc/acpi/ibm/hotkey -- disable all possible hot keys
- ... any other 8-hex-digit mask ...
- echo reset > /proc/acpi/ibm/hotkey -- restore the recommended mask
-
-The following commands have been deprecated and will cause the kernel
-to log a warning:
-
- echo enable > /proc/acpi/ibm/hotkey -- does nothing
- echo disable > /proc/acpi/ibm/hotkey -- returns an error
-
-The procfs interface does not support NVRAM polling control. So as to
-maintain maximum bug-to-bug compatibility, it does not report any masks,
-nor does it allow one to manipulate the hot key mask when the firmware
-does not support masks at all, even if NVRAM polling is in use.
-
-sysfs notes:
-
- hotkey_bios_enabled:
- DEPRECATED, WILL BE REMOVED SOON.
-
- Returns 0.
-
- hotkey_bios_mask:
- DEPRECATED, DON'T USE, WILL BE REMOVED IN THE FUTURE.
-
- Returns the hot keys mask when thinkpad-acpi was loaded.
- Upon module unload, the hot keys mask will be restored
- to this value. This is always 0x80c, because those are
- the hotkeys that were supported by ancient firmware
- without mask support.
-
- hotkey_enable:
- DEPRECATED, WILL BE REMOVED SOON.
-
- 0: returns -EPERM
- 1: does nothing
-
- hotkey_mask:
- bit mask to enable reporting (and depending on
- the firmware, ACPI event generation) for each hot key
- (see above). Returns the current status of the hot keys
- mask, and allows one to modify it.
-
- hotkey_all_mask:
- bit mask that should enable event reporting for all
- supported hot keys, when echoed to hotkey_mask above.
- Unless you know which events need to be handled
- passively (because the firmware *will* handle them
- anyway), do *not* use hotkey_all_mask. Use
- hotkey_recommended_mask, instead. You have been warned.
-
- hotkey_recommended_mask:
- bit mask that should enable event reporting for all
- supported hot keys, except those which are always
- handled by the firmware anyway. Echo it to
- hotkey_mask above, to use. This is the default mask
- used by the driver.
-
- hotkey_source_mask:
- bit mask that selects which hot keys will the driver
- poll the NVRAM for. This is auto-detected by the driver
- based on the capabilities reported by the ACPI firmware,
- but it can be overridden at runtime.
-
- Hot keys whose bits are set in hotkey_source_mask are
- polled for in NVRAM, and reported as hotkey events if
- enabled in hotkey_mask. Only a few hot keys are
- available through CMOS NVRAM polling.
-
- Warning: when in NVRAM mode, the volume up/down/mute
- keys are synthesized according to changes in the mixer,
- which uses a single volume up or volume down hotkey
- press to unmute, as per the ThinkPad volume mixer user
- interface. When in ACPI event mode, volume up/down/mute
- events are reported by the firmware and can behave
- differently (and that behaviour changes with firmware
- version -- not just with firmware models -- as well as
- OSI(Linux) state).
-
- hotkey_poll_freq:
- frequency in Hz for hot key polling. It must be between
- 0 and 25 Hz. Polling is only carried out when strictly
- needed.
-
- Setting hotkey_poll_freq to zero disables polling, and
- will cause hot key presses that require NVRAM polling
- to never be reported.
-
- Setting hotkey_poll_freq too low may cause repeated
- pressings of the same hot key to be misreported as a
- single key press, or to not even be detected at all.
- The recommended polling frequency is 10Hz.
-
- hotkey_radio_sw:
- If the ThinkPad has a hardware radio switch, this
- attribute will read 0 if the switch is in the "radios
- disabled" position, and 1 if the switch is in the
- "radios enabled" position.
-
- This attribute has poll()/select() support.
-
- hotkey_tablet_mode:
- If the ThinkPad has tablet capabilities, this attribute
- will read 0 if the ThinkPad is in normal mode, and
- 1 if the ThinkPad is in tablet mode.
-
- This attribute has poll()/select() support.
-
- wakeup_reason:
- Set to 1 if the system is waking up because the user
- requested a bay ejection. Set to 2 if the system is
- waking up because the user requested the system to
- undock. Set to zero for normal wake-ups or wake-ups
- due to unknown reasons.
-
- This attribute has poll()/select() support.
-
- wakeup_hotunplug_complete:
- Set to 1 if the system was waken up because of an
- undock or bay ejection request, and that request
- was successfully completed. At this point, it might
- be useful to send the system back to sleep, at the
- user's choice. Refer to HKEY events 0x4003 and
- 0x3003, below.
-
- This attribute has poll()/select() support.
-
-input layer notes:
-
-A Hot key is mapped to a single input layer EV_KEY event, possibly
-followed by an EV_MSC MSC_SCAN event that shall contain that key's scan
-code. An EV_SYN event will always be generated to mark the end of the
-event block.
-
-Do not use the EV_MSC MSC_SCAN events to process keys. They are to be
-used as a helper to remap keys, only. They are particularly useful when
-remapping KEY_UNKNOWN keys.
-
-The events are available in an input device, with the following id:
-
- Bus: BUS_HOST
- vendor: 0x1014 (PCI_VENDOR_ID_IBM) or
- 0x17aa (PCI_VENDOR_ID_LENOVO)
- product: 0x5054 ("TP")
- version: 0x4101
-
-The version will have its LSB incremented if the keymap changes in a
-backwards-compatible way. The MSB shall always be 0x41 for this input
-device. If the MSB is not 0x41, do not use the device as described in
-this section, as it is either something else (e.g. another input device
-exported by a thinkpad driver, such as HDAPS) or its functionality has
-been changed in a non-backwards compatible way.
-
-Adding other event types for other functionalities shall be considered a
-backwards-compatible change for this input device.
-
-Thinkpad-acpi Hot Key event map (version 0x4101):
-
-ACPI Scan
-event code Key Notes
-
-0x1001 0x00 FN+F1 -
-
-0x1002 0x01 FN+F2 IBM: battery (rare)
- Lenovo: Screen lock
-
-0x1003 0x02 FN+F3 Many IBM models always report
- this hot key, even with hot keys
- disabled or with Fn+F3 masked
- off
- IBM: screen lock, often turns
- off the ThinkLight as side-effect
- Lenovo: battery
-
-0x1004 0x03 FN+F4 Sleep button (ACPI sleep button
- semantics, i.e. sleep-to-RAM).
- It always generates some kind
- of event, either the hot key
- event or an ACPI sleep button
- event. The firmware may
- refuse to generate further FN+F4
- key presses until a S3 or S4 ACPI
- sleep cycle is performed or some
- time passes.
-
-0x1005 0x04 FN+F5 Radio. Enables/disables
- the internal Bluetooth hardware
- and W-WAN card if left in control
- of the firmware. Does not affect
- the WLAN card.
- Should be used to turn on/off all
- radios (Bluetooth+W-WAN+WLAN),
- really.
-
-0x1006 0x05 FN+F6 -
-
-0x1007 0x06 FN+F7 Video output cycle.
- Do you feel lucky today?
-
-0x1008 0x07 FN+F8 IBM: toggle screen expand
- Lenovo: configure UltraNav,
- or toggle screen expand
-
-0x1009 0x08 FN+F9 -
- .. .. ..
-0x100B 0x0A FN+F11 -
-
-0x100C 0x0B FN+F12 Sleep to disk. You are always
- supposed to handle it yourself,
- either through the ACPI event,
- or through a hotkey event.
- The firmware may refuse to
- generate further FN+F12 key
- press events until a S3 or S4
- ACPI sleep cycle is performed,
- or some time passes.
-
-0x100D 0x0C FN+BACKSPACE -
-0x100E 0x0D FN+INSERT -
-0x100F 0x0E FN+DELETE -
-
-0x1010 0x0F FN+HOME Brightness up. This key is
- always handled by the firmware
- in IBM ThinkPads, even when
- unmasked. Just leave it alone.
- For Lenovo ThinkPads with a new
- BIOS, it has to be handled either
- by the ACPI OSI, or by userspace.
- The driver does the right thing,
- never mess with this.
-0x1011 0x10 FN+END Brightness down. See brightness
- up for details.
-
-0x1012 0x11 FN+PGUP ThinkLight toggle. This key is
- always handled by the firmware,
- even when unmasked.
-
-0x1013 0x12 FN+PGDOWN -
-
-0x1014 0x13 FN+SPACE Zoom key
-
-0x1015 0x14 VOLUME UP Internal mixer volume up. This
- key is always handled by the
- firmware, even when unmasked.
- NOTE: Lenovo seems to be changing
- this.
-0x1016 0x15 VOLUME DOWN Internal mixer volume up. This
- key is always handled by the
- firmware, even when unmasked.
- NOTE: Lenovo seems to be changing
- this.
-0x1017 0x16 MUTE Mute internal mixer. This
- key is always handled by the
- firmware, even when unmasked.
-
-0x1018 0x17 THINKPAD ThinkPad/Access IBM/Lenovo key
-
-0x1019 0x18 unknown
-.. .. ..
-0x1020 0x1F unknown
-
-The ThinkPad firmware does not allow one to differentiate when most hot
-keys are pressed or released (either that, or we don't know how to, yet).
-For these keys, the driver generates a set of events for a key press and
-immediately issues the same set of events for a key release. It is
-unknown by the driver if the ThinkPad firmware triggered these events on
-hot key press or release, but the firmware will do it for either one, not
-both.
-
-If a key is mapped to KEY_RESERVED, it generates no input events at all.
-If a key is mapped to KEY_UNKNOWN, it generates an input event that
-includes an scan code. If a key is mapped to anything else, it will
-generate input device EV_KEY events.
-
-In addition to the EV_KEY events, thinkpad-acpi may also issue EV_SW
-events for switches:
-
-SW_RFKILL_ALL T60 and later hardware rfkill rocker switch
-SW_TABLET_MODE Tablet ThinkPads HKEY events 0x5009 and 0x500A
-
-Non hotkey ACPI HKEY event map:
--------------------------------
-
-Events that are never propagated by the driver:
-
-0x2304 System is waking up from suspend to undock
-0x2305 System is waking up from suspend to eject bay
-0x2404 System is waking up from hibernation to undock
-0x2405 System is waking up from hibernation to eject bay
-0x5001 Lid closed
-0x5002 Lid opened
-0x5009 Tablet swivel: switched to tablet mode
-0x500A Tablet swivel: switched to normal mode
-0x5010 Brightness level changed/control event
-0x6000 KEYBOARD: Numlock key pressed
-0x6005 KEYBOARD: Fn key pressed (TO BE VERIFIED)
-0x7000 Radio Switch may have changed state
-
-
-Events that are propagated by the driver to userspace:
-
-0x2313 ALARM: System is waking up from suspend because
- the battery is nearly empty
-0x2413 ALARM: System is waking up from hibernation because
- the battery is nearly empty
-0x3003 Bay ejection (see 0x2x05) complete, can sleep again
-0x3006 Bay hotplug request (hint to power up SATA link when
- the optical drive tray is ejected)
-0x4003 Undocked (see 0x2x04), can sleep again
-0x4010 Docked into hotplug port replicator (non-ACPI dock)
-0x4011 Undocked from hotplug port replicator (non-ACPI dock)
-0x500B Tablet pen inserted into its storage bay
-0x500C Tablet pen removed from its storage bay
-0x6011 ALARM: battery is too hot
-0x6012 ALARM: battery is extremely hot
-0x6021 ALARM: a sensor is too hot
-0x6022 ALARM: a sensor is extremely hot
-0x6030 System thermal table changed
-0x6032 Thermal Control command set completion (DYTC, Windows)
-0x6040 Nvidia Optimus/AC adapter related (TO BE VERIFIED)
-0x60C0 X1 Yoga 2016, Tablet mode status changed
-0x60F0 Thermal Transformation changed (GMTS, Windows)
-
-Battery nearly empty alarms are a last resort attempt to get the
-operating system to hibernate or shutdown cleanly (0x2313), or shutdown
-cleanly (0x2413) before power is lost. They must be acted upon, as the
-wake up caused by the firmware will have negated most safety nets...
-
-When any of the "too hot" alarms happen, according to Lenovo the user
-should suspend or hibernate the laptop (and in the case of battery
-alarms, unplug the AC adapter) to let it cool down. These alarms do
-signal that something is wrong, they should never happen on normal
-operating conditions.
-
-The "extremely hot" alarms are emergencies. According to Lenovo, the
-operating system is to force either an immediate suspend or hibernate
-cycle, or a system shutdown. Obviously, something is very wrong if this
-happens.
-
-
-Brightness hotkey notes:
-
-Don't mess with the brightness hotkeys in a Thinkpad. If you want
-notifications for OSD, use the sysfs backlight class event support.
-
-The driver will issue KEY_BRIGHTNESS_UP and KEY_BRIGHTNESS_DOWN events
-automatically for the cases were userspace has to do something to
-implement brightness changes. When you override these events, you will
-either fail to handle properly the ThinkPads that require explicit
-action to change backlight brightness, or the ThinkPads that require
-that no action be taken to work properly.
-
-
-Bluetooth
----------
-
-procfs: /proc/acpi/ibm/bluetooth
-sysfs device attribute: bluetooth_enable (deprecated)
-sysfs rfkill class: switch "tpacpi_bluetooth_sw"
-
-This feature shows the presence and current state of a ThinkPad
-Bluetooth device in the internal ThinkPad CDC slot.
-
-If the ThinkPad supports it, the Bluetooth state is stored in NVRAM,
-so it is kept across reboots and power-off.
-
-Procfs notes:
-
-If Bluetooth is installed, the following commands can be used:
-
- echo enable > /proc/acpi/ibm/bluetooth
- echo disable > /proc/acpi/ibm/bluetooth
-
-Sysfs notes:
-
- If the Bluetooth CDC card is installed, it can be enabled /
- disabled through the "bluetooth_enable" thinkpad-acpi device
- attribute, and its current status can also be queried.
-
- enable:
- 0: disables Bluetooth / Bluetooth is disabled
- 1: enables Bluetooth / Bluetooth is enabled.
-
- Note: this interface has been superseded by the generic rfkill
- class. It has been deprecated, and it will be removed in year
- 2010.
-
- rfkill controller switch "tpacpi_bluetooth_sw": refer to
- Documentation/rfkill.txt for details.
-
-
-Video output control -- /proc/acpi/ibm/video
---------------------------------------------
-
-This feature allows control over the devices used for video output -
-LCD, CRT or DVI (if available). The following commands are available:
-
- echo lcd_enable > /proc/acpi/ibm/video
- echo lcd_disable > /proc/acpi/ibm/video
- echo crt_enable > /proc/acpi/ibm/video
- echo crt_disable > /proc/acpi/ibm/video
- echo dvi_enable > /proc/acpi/ibm/video
- echo dvi_disable > /proc/acpi/ibm/video
- echo auto_enable > /proc/acpi/ibm/video
- echo auto_disable > /proc/acpi/ibm/video
- echo expand_toggle > /proc/acpi/ibm/video
- echo video_switch > /proc/acpi/ibm/video
-
-NOTE: Access to this feature is restricted to processes owning the
-CAP_SYS_ADMIN capability for safety reasons, as it can interact badly
-enough with some versions of X.org to crash it.
-
-Each video output device can be enabled or disabled individually.
-Reading /proc/acpi/ibm/video shows the status of each device.
-
-Automatic video switching can be enabled or disabled. When automatic
-video switching is enabled, certain events (e.g. opening the lid,
-docking or undocking) cause the video output device to change
-automatically. While this can be useful, it also causes flickering
-and, on the X40, video corruption. By disabling automatic switching,
-the flickering or video corruption can be avoided.
-
-The video_switch command cycles through the available video outputs
-(it simulates the behavior of Fn-F7).
-
-Video expansion can be toggled through this feature. This controls
-whether the display is expanded to fill the entire LCD screen when a
-mode with less than full resolution is used. Note that the current
-video expansion status cannot be determined through this feature.
-
-Note that on many models (particularly those using Radeon graphics
-chips) the X driver configures the video card in a way which prevents
-Fn-F7 from working. This also disables the video output switching
-features of this driver, as it uses the same ACPI methods as
-Fn-F7. Video switching on the console should still work.
-
-UPDATE: refer to https://bugs.freedesktop.org/show_bug.cgi?id=2000
-
-
-ThinkLight control
-------------------
-
-procfs: /proc/acpi/ibm/light
-sysfs attributes: as per LED class, for the "tpacpi::thinklight" LED
-
-procfs notes:
-
-The ThinkLight status can be read and set through the procfs interface. A
-few models which do not make the status available will show the ThinkLight
-status as "unknown". The available commands are:
-
- echo on > /proc/acpi/ibm/light
- echo off > /proc/acpi/ibm/light
-
-sysfs notes:
-
-The ThinkLight sysfs interface is documented by the LED class
-documentation, in Documentation/leds/leds-class.txt. The ThinkLight LED name
-is "tpacpi::thinklight".
-
-Due to limitations in the sysfs LED class, if the status of the ThinkLight
-cannot be read or if it is unknown, thinkpad-acpi will report it as "off".
-It is impossible to know if the status returned through sysfs is valid.
-
-
-CMOS/UCMS control
------------------
-
-procfs: /proc/acpi/ibm/cmos
-sysfs device attribute: cmos_command
-
-This feature is mostly used internally by the ACPI firmware to keep the legacy
-CMOS NVRAM bits in sync with the current machine state, and to record this
-state so that the ThinkPad will retain such settings across reboots.
-
-Some of these commands actually perform actions in some ThinkPad models, but
-this is expected to disappear more and more in newer models. As an example, in
-a T43 and in a X40, commands 12 and 13 still control the ThinkLight state for
-real, but commands 0 to 2 don't control the mixer anymore (they have been
-phased out) and just update the NVRAM.
-
-The range of valid cmos command numbers is 0 to 21, but not all have an
-effect and the behavior varies from model to model. Here is the behavior
-on the X40 (tpb is the ThinkPad Buttons utility):
-
- 0 - Related to "Volume down" key press
- 1 - Related to "Volume up" key press
- 2 - Related to "Mute on" key press
- 3 - Related to "Access IBM" key press
- 4 - Related to "LCD brightness up" key press
- 5 - Related to "LCD brightness down" key press
- 11 - Related to "toggle screen expansion" key press/function
- 12 - Related to "ThinkLight on"
- 13 - Related to "ThinkLight off"
- 14 - Related to "ThinkLight" key press (toggle ThinkLight)
-
-The cmos command interface is prone to firmware split-brain problems, as
-in newer ThinkPads it is just a compatibility layer. Do not use it, it is
-exported just as a debug tool.
-
-
-LED control
------------
-
-procfs: /proc/acpi/ibm/led
-sysfs attributes: as per LED class, see below for names
-
-Some of the LED indicators can be controlled through this feature. On
-some older ThinkPad models, it is possible to query the status of the
-LED indicators as well. Newer ThinkPads cannot query the real status
-of the LED indicators.
-
-Because misuse of the LEDs could induce an unaware user to perform
-dangerous actions (like undocking or ejecting a bay device while the
-buses are still active), or mask an important alarm (such as a nearly
-empty battery, or a broken battery), access to most LEDs is
-restricted.
-
-Unrestricted access to all LEDs requires that thinkpad-acpi be
-compiled with the CONFIG_THINKPAD_ACPI_UNSAFE_LEDS option enabled.
-Distributions must never enable this option. Individual users that
-are aware of the consequences are welcome to enabling it.
-
-Audio mute and microphone mute LEDs are supported, but currently not
-visible to userspace. They are used by the snd-hda-intel audio driver.
-
-procfs notes:
-
-The available commands are:
-
- echo '<LED number> on' >/proc/acpi/ibm/led
- echo '<LED number> off' >/proc/acpi/ibm/led
- echo '<LED number> blink' >/proc/acpi/ibm/led
-
-The <LED number> range is 0 to 15. The set of LEDs that can be
-controlled varies from model to model. Here is the common ThinkPad
-mapping:
-
- 0 - power
- 1 - battery (orange)
- 2 - battery (green)
- 3 - UltraBase/dock
- 4 - UltraBay
- 5 - UltraBase battery slot
- 6 - (unknown)
- 7 - standby
- 8 - dock status 1
- 9 - dock status 2
- 10, 11 - (unknown)
- 12 - thinkvantage
- 13, 14, 15 - (unknown)
-
-All of the above can be turned on and off and can be made to blink.
-
-sysfs notes:
-
-The ThinkPad LED sysfs interface is described in detail by the LED class
-documentation, in Documentation/leds/leds-class.txt.
-
-The LEDs are named (in LED ID order, from 0 to 12):
-"tpacpi::power", "tpacpi:orange:batt", "tpacpi:green:batt",
-"tpacpi::dock_active", "tpacpi::bay_active", "tpacpi::dock_batt",
-"tpacpi::unknown_led", "tpacpi::standby", "tpacpi::dock_status1",
-"tpacpi::dock_status2", "tpacpi::unknown_led2", "tpacpi::unknown_led3",
-"tpacpi::thinkvantage".
-
-Due to limitations in the sysfs LED class, if the status of the LED
-indicators cannot be read due to an error, thinkpad-acpi will report it as
-a brightness of zero (same as LED off).
-
-If the thinkpad firmware doesn't support reading the current status,
-trying to read the current LED brightness will just return whatever
-brightness was last written to that attribute.
-
-These LEDs can blink using hardware acceleration. To request that a
-ThinkPad indicator LED should blink in hardware accelerated mode, use the
-"timer" trigger, and leave the delay_on and delay_off parameters set to
-zero (to request hardware acceleration autodetection).
-
-LEDs that are known not to exist in a given ThinkPad model are not
-made available through the sysfs interface. If you have a dock and you
-notice there are LEDs listed for your ThinkPad that do not exist (and
-are not in the dock), or if you notice that there are missing LEDs,
-a report to ibm-acpi-devel@lists.sourceforge.net is appreciated.
-
-
-ACPI sounds -- /proc/acpi/ibm/beep
-----------------------------------
-
-The BEEP method is used internally by the ACPI firmware to provide
-audible alerts in various situations. This feature allows the same
-sounds to be triggered manually.
-
-The commands are non-negative integer numbers:
-
- echo <number> >/proc/acpi/ibm/beep
-
-The valid <number> range is 0 to 17. Not all numbers trigger sounds
-and the sounds vary from model to model. Here is the behavior on the
-X40:
-
- 0 - stop a sound in progress (but use 17 to stop 16)
- 2 - two beeps, pause, third beep ("low battery")
- 3 - single beep
- 4 - high, followed by low-pitched beep ("unable")
- 5 - single beep
- 6 - very high, followed by high-pitched beep ("AC/DC")
- 7 - high-pitched beep
- 9 - three short beeps
- 10 - very long beep
- 12 - low-pitched beep
- 15 - three high-pitched beeps repeating constantly, stop with 0
- 16 - one medium-pitched beep repeating constantly, stop with 17
- 17 - stop 16
-
-
-Temperature sensors
--------------------
-
-procfs: /proc/acpi/ibm/thermal
-sysfs device attributes: (hwmon "thinkpad") temp*_input
-
-Most ThinkPads include six or more separate temperature sensors but only
-expose the CPU temperature through the standard ACPI methods. This
-feature shows readings from up to eight different sensors on older
-ThinkPads, and up to sixteen different sensors on newer ThinkPads.
-
-For example, on the X40, a typical output may be:
-temperatures: 42 42 45 41 36 -128 33 -128
-
-On the T43/p, a typical output may be:
-temperatures: 48 48 36 52 38 -128 31 -128 48 52 48 -128 -128 -128 -128 -128
-
-The mapping of thermal sensors to physical locations varies depending on
-system-board model (and thus, on ThinkPad model).
-
-http://thinkwiki.org/wiki/Thermal_Sensors is a public wiki page that
-tries to track down these locations for various models.
-
-Most (newer?) models seem to follow this pattern:
-
-1: CPU
-2: (depends on model)
-3: (depends on model)
-4: GPU
-5: Main battery: main sensor
-6: Bay battery: main sensor
-7: Main battery: secondary sensor
-8: Bay battery: secondary sensor
-9-15: (depends on model)
-
-For the R51 (source: Thomas Gruber):
-2: Mini-PCI
-3: Internal HDD
-
-For the T43, T43/p (source: Shmidoax/Thinkwiki.org)
-http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_T43.2C_T43p
-2: System board, left side (near PCMCIA slot), reported as HDAPS temp
-3: PCMCIA slot
-9: MCH (northbridge) to DRAM Bus
-10: Clock-generator, mini-pci card and ICH (southbridge), under Mini-PCI
- card, under touchpad
-11: Power regulator, underside of system board, below F2 key
-
-The A31 has a very atypical layout for the thermal sensors
-(source: Milos Popovic, http://thinkwiki.org/wiki/Thermal_Sensors#ThinkPad_A31)
-1: CPU
-2: Main Battery: main sensor
-3: Power Converter
-4: Bay Battery: main sensor
-5: MCH (northbridge)
-6: PCMCIA/ambient
-7: Main Battery: secondary sensor
-8: Bay Battery: secondary sensor
-
-
-Procfs notes:
- Readings from sensors that are not available return -128.
- No commands can be written to this file.
-
-Sysfs notes:
- Sensors that are not available return the ENXIO error. This
- status may change at runtime, as there are hotplug thermal
- sensors, like those inside the batteries and docks.
-
- thinkpad-acpi thermal sensors are reported through the hwmon
- subsystem, and follow all of the hwmon guidelines at
- Documentation/hwmon.
-
-EXPERIMENTAL: Embedded controller register dump
------------------------------------------------
-
-This feature is not included in the thinkpad driver anymore.
-Instead the EC can be accessed through /sys/kernel/debug/ec with
-a userspace tool which can be found here:
-ftp://ftp.suse.com/pub/people/trenn/sources/ec
-
-Use it to determine the register holding the fan
-speed on some models. To do that, do the following:
- - make sure the battery is fully charged
- - make sure the fan is running
- - use above mentioned tool to read out the EC
-
-Often fan and temperature values vary between
-readings. Since temperatures don't change vary fast, you can take
-several quick dumps to eliminate them.
-
-You can use a similar method to figure out the meaning of other
-embedded controller registers - e.g. make sure nothing else changes
-except the charging or discharging battery to determine which
-registers contain the current battery capacity, etc. If you experiment
-with this, do send me your results (including some complete dumps with
-a description of the conditions when they were taken.)
-
-
-LCD brightness control
-----------------------
-
-procfs: /proc/acpi/ibm/brightness
-sysfs backlight device "thinkpad_screen"
-
-This feature allows software control of the LCD brightness on ThinkPad
-models which don't have a hardware brightness slider.
-
-It has some limitations: the LCD backlight cannot be actually turned
-on or off by this interface, it just controls the backlight brightness
-level.
-
-On IBM (and some of the earlier Lenovo) ThinkPads, the backlight control
-has eight brightness levels, ranging from 0 to 7. Some of the levels
-may not be distinct. Later Lenovo models that implement the ACPI
-display backlight brightness control methods have 16 levels, ranging
-from 0 to 15.
-
-For IBM ThinkPads, there are two interfaces to the firmware for direct
-brightness control, EC and UCMS (or CMOS). To select which one should be
-used, use the brightness_mode module parameter: brightness_mode=1 selects
-EC mode, brightness_mode=2 selects UCMS mode, brightness_mode=3 selects EC
-mode with NVRAM backing (so that brightness changes are remembered across
-shutdown/reboot).
-
-The driver tries to select which interface to use from a table of
-defaults for each ThinkPad model. If it makes a wrong choice, please
-report this as a bug, so that we can fix it.
-
-Lenovo ThinkPads only support brightness_mode=2 (UCMS).
-
-When display backlight brightness controls are available through the
-standard ACPI interface, it is best to use it instead of this direct
-ThinkPad-specific interface. The driver will disable its native
-backlight brightness control interface if it detects that the standard
-ACPI interface is available in the ThinkPad.
-
-If you want to use the thinkpad-acpi backlight brightness control
-instead of the generic ACPI video backlight brightness control for some
-reason, you should use the acpi_backlight=vendor kernel parameter.
-
-The brightness_enable module parameter can be used to control whether
-the LCD brightness control feature will be enabled when available.
-brightness_enable=0 forces it to be disabled. brightness_enable=1
-forces it to be enabled when available, even if the standard ACPI
-interface is also available.
-
-Procfs notes:
-
- The available commands are:
-
- echo up >/proc/acpi/ibm/brightness
- echo down >/proc/acpi/ibm/brightness
- echo 'level <level>' >/proc/acpi/ibm/brightness
-
-Sysfs notes:
-
-The interface is implemented through the backlight sysfs class, which is
-poorly documented at this time.
-
-Locate the thinkpad_screen device under /sys/class/backlight, and inside
-it there will be the following attributes:
-
- max_brightness:
- Reads the maximum brightness the hardware can be set to.
- The minimum is always zero.
-
- actual_brightness:
- Reads what brightness the screen is set to at this instant.
-
- brightness:
- Writes request the driver to change brightness to the
- given value. Reads will tell you what brightness the
- driver is trying to set the display to when "power" is set
- to zero and the display has not been dimmed by a kernel
- power management event.
-
- power:
- power management mode, where 0 is "display on", and 1 to 3
- will dim the display backlight to brightness level 0
- because thinkpad-acpi cannot really turn the backlight
- off. Kernel power management events can temporarily
- increase the current power management level, i.e. they can
- dim the display.
-
-
-WARNING:
-
- Whatever you do, do NOT ever call thinkpad-acpi backlight-level change
- interface and the ACPI-based backlight level change interface
- (available on newer BIOSes, and driven by the Linux ACPI video driver)
- at the same time. The two will interact in bad ways, do funny things,
- and maybe reduce the life of the backlight lamps by needlessly kicking
- its level up and down at every change.
-
-
-Volume control (Console Audio control)
---------------------------------------
-
-procfs: /proc/acpi/ibm/volume
-ALSA: "ThinkPad Console Audio Control", default ID: "ThinkPadEC"
-
-NOTE: by default, the volume control interface operates in read-only
-mode, as it is supposed to be used for on-screen-display purposes.
-The read/write mode can be enabled through the use of the
-"volume_control=1" module parameter.
-
-NOTE: distros are urged to not enable volume_control by default, this
-should be done by the local admin only. The ThinkPad UI is for the
-console audio control to be done through the volume keys only, and for
-the desktop environment to just provide on-screen-display feedback.
-Software volume control should be done only in the main AC97/HDA
-mixer.
-
-
-About the ThinkPad Console Audio control:
-
-ThinkPads have a built-in amplifier and muting circuit that drives the
-console headphone and speakers. This circuit is after the main AC97
-or HDA mixer in the audio path, and under exclusive control of the
-firmware.
-
-ThinkPads have three special hotkeys to interact with the console
-audio control: volume up, volume down and mute.
-
-It is worth noting that the normal way the mute function works (on
-ThinkPads that do not have a "mute LED") is:
-
-1. Press mute to mute. It will *always* mute, you can press it as
- many times as you want, and the sound will remain mute.
-
-2. Press either volume key to unmute the ThinkPad (it will _not_
- change the volume, it will just unmute).
-
-This is a very superior design when compared to the cheap software-only
-mute-toggle solution found on normal consumer laptops: you can be
-absolutely sure the ThinkPad will not make noise if you press the mute
-button, no matter the previous state.
-
-The IBM ThinkPads, and the earlier Lenovo ThinkPads have variable-gain
-amplifiers driving the speakers and headphone output, and the firmware
-also handles volume control for the headphone and speakers on these
-ThinkPads without any help from the operating system (this volume
-control stage exists after the main AC97 or HDA mixer in the audio
-path).
-
-The newer Lenovo models only have firmware mute control, and depend on
-the main HDA mixer to do volume control (which is done by the operating
-system). In this case, the volume keys are filtered out for unmute
-key press (there are some firmware bugs in this area) and delivered as
-normal key presses to the operating system (thinkpad-acpi is not
-involved).
-
-
-The ThinkPad-ACPI volume control:
-
-The preferred way to interact with the Console Audio control is the
-ALSA interface.
-
-The legacy procfs interface allows one to read the current state,
-and if volume control is enabled, accepts the following commands:
-
- echo up >/proc/acpi/ibm/volume
- echo down >/proc/acpi/ibm/volume
- echo mute >/proc/acpi/ibm/volume
- echo unmute >/proc/acpi/ibm/volume
- echo 'level <level>' >/proc/acpi/ibm/volume
-
-The <level> number range is 0 to 14 although not all of them may be
-distinct. To unmute the volume after the mute command, use either the
-up or down command (the level command will not unmute the volume), or
-the unmute command.
-
-You can use the volume_capabilities parameter to tell the driver
-whether your thinkpad has volume control or mute-only control:
-volume_capabilities=1 for mixers with mute and volume control,
-volume_capabilities=2 for mixers with only mute control.
-
-If the driver misdetects the capabilities for your ThinkPad model,
-please report this to ibm-acpi-devel@lists.sourceforge.net, so that we
-can update the driver.
-
-There are two strategies for volume control. To select which one
-should be used, use the volume_mode module parameter: volume_mode=1
-selects EC mode, and volume_mode=3 selects EC mode with NVRAM backing
-(so that volume/mute changes are remembered across shutdown/reboot).
-
-The driver will operate in volume_mode=3 by default. If that does not
-work well on your ThinkPad model, please report this to
-ibm-acpi-devel@lists.sourceforge.net.
-
-The driver supports the standard ALSA module parameters. If the ALSA
-mixer is disabled, the driver will disable all volume functionality.
-
-
-Fan control and monitoring: fan speed, fan enable/disable
----------------------------------------------------------
-
-procfs: /proc/acpi/ibm/fan
-sysfs device attributes: (hwmon "thinkpad") fan1_input, pwm1,
- pwm1_enable, fan2_input
-sysfs hwmon driver attributes: fan_watchdog
-
-NOTE NOTE NOTE: fan control operations are disabled by default for
-safety reasons. To enable them, the module parameter "fan_control=1"
-must be given to thinkpad-acpi.
-
-This feature attempts to show the current fan speed, control mode and
-other fan data that might be available. The speed is read directly
-from the hardware registers of the embedded controller. This is known
-to work on later R, T, X and Z series ThinkPads but may show a bogus
-value on other models.
-
-Some Lenovo ThinkPads support a secondary fan. This fan cannot be
-controlled separately, it shares the main fan control.
-
-Fan levels:
-
-Most ThinkPad fans work in "levels" at the firmware interface. Level 0
-stops the fan. The higher the level, the higher the fan speed, although
-adjacent levels often map to the same fan speed. 7 is the highest
-level, where the fan reaches the maximum recommended speed.
-
-Level "auto" means the EC changes the fan level according to some
-internal algorithm, usually based on readings from the thermal sensors.
-
-There is also a "full-speed" level, also known as "disengaged" level.
-In this level, the EC disables the speed-locked closed-loop fan control,
-and drives the fan as fast as it can go, which might exceed hardware
-limits, so use this level with caution.
-
-The fan usually ramps up or down slowly from one speed to another, and
-it is normal for the EC to take several seconds to react to fan
-commands. The full-speed level may take up to two minutes to ramp up to
-maximum speed, and in some ThinkPads, the tachometer readings go stale
-while the EC is transitioning to the full-speed level.
-
-WARNING WARNING WARNING: do not leave the fan disabled unless you are
-monitoring all of the temperature sensor readings and you are ready to
-enable it if necessary to avoid overheating.
-
-An enabled fan in level "auto" may stop spinning if the EC decides the
-ThinkPad is cool enough and doesn't need the extra airflow. This is
-normal, and the EC will spin the fan up if the various thermal readings
-rise too much.
-
-On the X40, this seems to depend on the CPU and HDD temperatures.
-Specifically, the fan is turned on when either the CPU temperature
-climbs to 56 degrees or the HDD temperature climbs to 46 degrees. The
-fan is turned off when the CPU temperature drops to 49 degrees and the
-HDD temperature drops to 41 degrees. These thresholds cannot
-currently be controlled.
-
-The ThinkPad's ACPI DSDT code will reprogram the fan on its own when
-certain conditions are met. It will override any fan programming done
-through thinkpad-acpi.
-
-The thinkpad-acpi kernel driver can be programmed to revert the fan
-level to a safe setting if userspace does not issue one of the procfs
-fan commands: "enable", "disable", "level" or "watchdog", or if there
-are no writes to pwm1_enable (or to pwm1 *if and only if* pwm1_enable is
-set to 1, manual mode) within a configurable amount of time of up to
-120 seconds. This functionality is called fan safety watchdog.
-
-Note that the watchdog timer stops after it enables the fan. It will be
-rearmed again automatically (using the same interval) when one of the
-above mentioned fan commands is received. The fan watchdog is,
-therefore, not suitable to protect against fan mode changes made through
-means other than the "enable", "disable", and "level" procfs fan
-commands, or the hwmon fan control sysfs interface.
-
-Procfs notes:
-
-The fan may be enabled or disabled with the following commands:
-
- echo enable >/proc/acpi/ibm/fan
- echo disable >/proc/acpi/ibm/fan
-
-Placing a fan on level 0 is the same as disabling it. Enabling a fan
-will try to place it in a safe level if it is too slow or disabled.
-
-The fan level can be controlled with the command:
-
- echo 'level <level>' > /proc/acpi/ibm/fan
-
-Where <level> is an integer from 0 to 7, or one of the words "auto" or
-"full-speed" (without the quotes). Not all ThinkPads support the "auto"
-and "full-speed" levels. The driver accepts "disengaged" as an alias for
-"full-speed", and reports it as "disengaged" for backwards
-compatibility.
-
-On the X31 and X40 (and ONLY on those models), the fan speed can be
-controlled to a certain degree. Once the fan is running, it can be
-forced to run faster or slower with the following command:
-
- echo 'speed <speed>' > /proc/acpi/ibm/fan
-
-The sustainable range of fan speeds on the X40 appears to be from about
-3700 to about 7350. Values outside this range either do not have any
-effect or the fan speed eventually settles somewhere in that range. The
-fan cannot be stopped or started with this command. This functionality
-is incomplete, and not available through the sysfs interface.
-
-To program the safety watchdog, use the "watchdog" command.
-
- echo 'watchdog <interval in seconds>' > /proc/acpi/ibm/fan
-
-If you want to disable the watchdog, use 0 as the interval.
-
-Sysfs notes:
-
-The sysfs interface follows the hwmon subsystem guidelines for the most
-part, and the exception is the fan safety watchdog.
-
-Writes to any of the sysfs attributes may return the EINVAL error if
-that operation is not supported in a given ThinkPad or if the parameter
-is out-of-bounds, and EPERM if it is forbidden. They may also return
-EINTR (interrupted system call), and EIO (I/O error while trying to talk
-to the firmware).
-
-Features not yet implemented by the driver return ENOSYS.
-
-hwmon device attribute pwm1_enable:
- 0: PWM offline (fan is set to full-speed mode)
- 1: Manual PWM control (use pwm1 to set fan level)
- 2: Hardware PWM control (EC "auto" mode)
- 3: reserved (Software PWM control, not implemented yet)
-
- Modes 0 and 2 are not supported by all ThinkPads, and the
- driver is not always able to detect this. If it does know a
- mode is unsupported, it will return -EINVAL.
-
-hwmon device attribute pwm1:
- Fan level, scaled from the firmware values of 0-7 to the hwmon
- scale of 0-255. 0 means fan stopped, 255 means highest normal
- speed (level 7).
-
- This attribute only commands the fan if pmw1_enable is set to 1
- (manual PWM control).
-
-hwmon device attribute fan1_input:
- Fan tachometer reading, in RPM. May go stale on certain
- ThinkPads while the EC transitions the PWM to offline mode,
- which can take up to two minutes. May return rubbish on older
- ThinkPads.
-
-hwmon device attribute fan2_input:
- Fan tachometer reading, in RPM, for the secondary fan.
- Available only on some ThinkPads. If the secondary fan is
- not installed, will always read 0.
-
-hwmon driver attribute fan_watchdog:
- Fan safety watchdog timer interval, in seconds. Minimum is
- 1 second, maximum is 120 seconds. 0 disables the watchdog.
-
-To stop the fan: set pwm1 to zero, and pwm1_enable to 1.
-
-To start the fan in a safe mode: set pwm1_enable to 2. If that fails
-with EINVAL, try to set pwm1_enable to 1 and pwm1 to at least 128 (255
-would be the safest choice, though).
-
-
-WAN
----
-
-procfs: /proc/acpi/ibm/wan
-sysfs device attribute: wwan_enable (deprecated)
-sysfs rfkill class: switch "tpacpi_wwan_sw"
-
-This feature shows the presence and current state of the built-in
-Wireless WAN device.
-
-If the ThinkPad supports it, the WWAN state is stored in NVRAM,
-so it is kept across reboots and power-off.
-
-It was tested on a Lenovo ThinkPad X60. It should probably work on other
-ThinkPad models which come with this module installed.
-
-Procfs notes:
-
-If the W-WAN card is installed, the following commands can be used:
-
- echo enable > /proc/acpi/ibm/wan
- echo disable > /proc/acpi/ibm/wan
-
-Sysfs notes:
-
- If the W-WAN card is installed, it can be enabled /
- disabled through the "wwan_enable" thinkpad-acpi device
- attribute, and its current status can also be queried.
-
- enable:
- 0: disables WWAN card / WWAN card is disabled
- 1: enables WWAN card / WWAN card is enabled.
-
- Note: this interface has been superseded by the generic rfkill
- class. It has been deprecated, and it will be removed in year
- 2010.
-
- rfkill controller switch "tpacpi_wwan_sw": refer to
- Documentation/rfkill.txt for details.
-
-
-EXPERIMENTAL: UWB
------------------
-
-This feature is considered EXPERIMENTAL because it has not been extensively
-tested and validated in various ThinkPad models yet. The feature may not
-work as expected. USE WITH CAUTION! To use this feature, you need to supply
-the experimental=1 parameter when loading the module.
-
-sysfs rfkill class: switch "tpacpi_uwb_sw"
-
-This feature exports an rfkill controller for the UWB device, if one is
-present and enabled in the BIOS.
-
-Sysfs notes:
-
- rfkill controller switch "tpacpi_uwb_sw": refer to
- Documentation/rfkill.txt for details.
-
-Adaptive keyboard
------------------
-
-sysfs device attribute: adaptive_kbd_mode
-
-This sysfs attribute controls the keyboard "face" that will be shown on the
-Lenovo X1 Carbon 2nd gen (2014)'s adaptive keyboard. The value can be read
-and set.
-
-1 = Home mode
-2 = Web-browser mode
-3 = Web-conference mode
-4 = Function mode
-5 = Layflat mode
-
-For more details about which buttons will appear depending on the mode, please
-review the laptop's user guide:
-http://www.lenovo.com/shop/americas/content/user_guides/x1carbon_2_ug_en.pdf
-
-Multiple Commands, Module Parameters
-------------------------------------
-
-Multiple commands can be written to the proc files in one shot by
-separating them with commas, for example:
-
- echo enable,0xffff > /proc/acpi/ibm/hotkey
- echo lcd_disable,crt_enable > /proc/acpi/ibm/video
-
-Commands can also be specified when loading the thinkpad-acpi module,
-for example:
-
- modprobe thinkpad_acpi hotkey=enable,0xffff video=auto_disable
-
-
-Enabling debugging output
--------------------------
-
-The module takes a debug parameter which can be used to selectively
-enable various classes of debugging output, for example:
-
- modprobe thinkpad_acpi debug=0xffff
-
-will enable all debugging output classes. It takes a bitmask, so
-to enable more than one output class, just add their values.
-
- Debug bitmask Description
- 0x8000 Disclose PID of userspace programs
- accessing some functions of the driver
- 0x0001 Initialization and probing
- 0x0002 Removal
- 0x0004 RF Transmitter control (RFKILL)
- (bluetooth, WWAN, UWB...)
- 0x0008 HKEY event interface, hotkeys
- 0x0010 Fan control
- 0x0020 Backlight brightness
- 0x0040 Audio mixer/volume control
-
-There is also a kernel build option to enable more debugging
-information, which may be necessary to debug driver problems.
-
-The level of debugging information output by the driver can be changed
-at runtime through sysfs, using the driver attribute debug_level. The
-attribute takes the same bitmask as the debug module parameter above.
-
-
-Force loading of module
------------------------
-
-If thinkpad-acpi refuses to detect your ThinkPad, you can try to specify
-the module parameter force_load=1. Regardless of whether this works or
-not, please contact ibm-acpi-devel@lists.sourceforge.net with a report.
-
-
-Sysfs interface changelog:
-
-0x000100: Initial sysfs support, as a single platform driver and
- device.
-0x000200: Hot key support for 32 hot keys, and radio slider switch
- support.
-0x010000: Hot keys are now handled by default over the input
- layer, the radio switch generates input event EV_RADIO,
- and the driver enables hot key handling by default in
- the firmware.
-
-0x020000: ABI fix: added a separate hwmon platform device and
- driver, which must be located by name (thinkpad)
- and the hwmon class for libsensors4 (lm-sensors 3)
- compatibility. Moved all hwmon attributes to this
- new platform device.
-
-0x020100: Marker for thinkpad-acpi with hot key NVRAM polling
- support. If you must, use it to know you should not
- start a userspace NVRAM poller (allows to detect when
- NVRAM is compiled out by the user because it is
- unneeded/undesired in the first place).
-0x020101: Marker for thinkpad-acpi with hot key NVRAM polling
- and proper hotkey_mask semantics (version 8 of the
- NVRAM polling patch). Some development snapshots of
- 0.18 had an earlier version that did strange things
- to hotkey_mask.
-
-0x020200: Add poll()/select() support to the following attributes:
- hotkey_radio_sw, wakeup_hotunplug_complete, wakeup_reason
-
-0x020300: hotkey enable/disable support removed, attributes
- hotkey_bios_enabled and hotkey_enable deprecated and
- marked for removal.
-
-0x020400: Marker for 16 LEDs support. Also, LEDs that are known
- to not exist in a given model are not registered with
- the LED sysfs class anymore.
-
-0x020500: Updated hotkey driver, hotkey_mask is always available
- and it is always able to disable hot keys. Very old
- thinkpads are properly supported. hotkey_bios_mask
- is deprecated and marked for removal.
-
-0x020600: Marker for backlight change event support.
-
-0x020700: Support for mute-only mixers.
- Volume control in read-only mode by default.
- Marker for ALSA mixer support.
-
-0x030000: Thermal and fan sysfs attributes were moved to the hwmon
- device instead of being attached to the backing platform
- device.
diff --git a/Documentation/laptops/toshiba_haps.txt b/Documentation/laptops/toshiba_haps.txt
deleted file mode 100644
index 0c1d88dedbde..000000000000
--- a/Documentation/laptops/toshiba_haps.txt
+++ /dev/null
@@ -1,76 +0,0 @@
-Kernel driver toshiba_haps
-Toshiba HDD Active Protection Sensor
-====================================
-
-Author: Azael Avalos <coproscefalo@gmail.com>
-
-
-0. Contents
------------
-
-1. Description
-2. Interface
-3. Accelerometer axes
-4. Supported devices
-5. Usage
-
-
-1. Description
---------------
-
-This driver provides support for the accelerometer found in various Toshiba
-laptops, being called "Toshiba HDD Protection - Shock Sensor" officially,
-and detects laptops automatically with this device.
-On Windows, Toshiba provided software monitors this device and provides
-automatic HDD protection (head unload) on sudden moves or harsh vibrations,
-however, this driver only provides a notification via a sysfs file to let
-userspace tools or daemons act accordingly, as well as providing a sysfs
-file to set the desired protection level or sensor sensibility.
-
-
-2. Interface
-------------
-
-This device comes with 3 methods:
-_STA - Checks existence of the device, returning Zero if the device does not
- exists or is not supported.
-PTLV - Sets the desired protection level.
-RSSS - Shuts down the HDD protection interface for a few seconds,
- then restores normal operation.
-
-Note:
-The presence of Solid State Drives (SSD) can make this driver to fail loading,
-given the fact that such drives have no movable parts, and thus, not requiring
-any "protection" as well as failing during the evaluation of the _STA method
-found under this device.
-
-
-3. Accelerometer axes
----------------------
-
-This device does not report any axes, however, to query the sensor position
-a couple HCI (Hardware Configuration Interface) calls (0x6D and 0xA6) are
-provided to query such information, handled by the kernel module toshiba_acpi
-since kernel version 3.15.
-
-
-4. Supported devices
---------------------
-
-This driver binds itself to the ACPI device TOS620A, and any Toshiba laptop
-with this device is supported, given the fact that they have the presence of
-conventional HDD and not only SSD, or a combination of both HDD and SSD.
-
-
-5. Usage
---------
-
-The sysfs files under /sys/devices/LNXSYSTM:00/LNXSYBUS:00/TOS620A:00/ are:
-protection_level - The protection_level is readable and writeable, and
- provides a way to let userspace query the current protection
- level, as well as set the desired protection level, the
- available protection levels are:
- 0 - Disabled | 1 - Low | 2 - Medium | 3 - High
-reset_protection - The reset_protection entry is writeable only, being "1"
- the only parameter it accepts, it is used to trigger
- a reset of the protection interface.
diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst
new file mode 100644
index 000000000000..060f4e485897
--- /dev/null
+++ b/Documentation/leds/index.rst
@@ -0,0 +1,25 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====
+LEDs
+====
+
+.. toctree::
+ :maxdepth: 1
+
+ leds-class
+ leds-class-flash
+ ledtrig-oneshot
+ ledtrig-transient
+ ledtrig-usbport
+
+ uleds
+
+ leds-blinkm
+ leds-lm3556
+ leds-lp3944
+ leds-lp5521
+ leds-lp5523
+ leds-lp5562
+ leds-lp55xx
+ leds-mlxcpld
diff --git a/Documentation/leds/leds-blinkm.rst b/Documentation/leds/leds-blinkm.rst
new file mode 100644
index 000000000000..c74b5bc877b1
--- /dev/null
+++ b/Documentation/leds/leds-blinkm.rst
@@ -0,0 +1,84 @@
+==================
+Leds BlinkM driver
+==================
+
+The leds-blinkm driver supports the devices of the BlinkM family.
+
+They are RGB-LED modules driven by a (AT)tiny microcontroller and
+communicate through I2C. The default address of these modules is
+0x09 but this can be changed through a command. By this you could
+dasy-chain up to 127 BlinkMs on an I2C bus.
+
+The device accepts RGB and HSB color values through separate commands.
+Also you can store blinking sequences as "scripts" in
+the controller and run them. Also fading is an option.
+
+The interface this driver provides is 2-fold:
+
+a) LED class interface for use with triggers
+############################################
+
+The registration follows the scheme::
+
+ blinkm-<i2c-bus-nr>-<i2c-device-nr>-<color>
+
+ $ ls -h /sys/class/leds/blinkm-6-*
+ /sys/class/leds/blinkm-6-9-blue:
+ brightness device max_brightness power subsystem trigger uevent
+
+ /sys/class/leds/blinkm-6-9-green:
+ brightness device max_brightness power subsystem trigger uevent
+
+ /sys/class/leds/blinkm-6-9-red:
+ brightness device max_brightness power subsystem trigger uevent
+
+(same is /sys/bus/i2c/devices/6-0009/leds)
+
+We can control the colors separated into red, green and blue and
+assign triggers on each color.
+
+E.g.::
+
+ $ cat blinkm-6-9-blue/brightness
+ 05
+
+ $ echo 200 > blinkm-6-9-blue/brightness
+ $
+
+ $ modprobe ledtrig-heartbeat
+ $ echo heartbeat > blinkm-6-9-green/trigger
+ $
+
+
+b) Sysfs group to control rgb, fade, hsb, scripts ...
+#####################################################
+
+This extended interface is available as folder blinkm
+in the sysfs folder of the I2C device.
+E.g. below /sys/bus/i2c/devices/6-0009/blinkm
+
+ $ ls -h /sys/bus/i2c/devices/6-0009/blinkm/
+ blue green red test
+
+Currently supported is just setting red, green, blue
+and a test sequence.
+
+E.g.::
+
+ $ cat *
+ 00
+ 00
+ 00
+ #Write into test to start test sequence!#
+
+ $ echo 1 > test
+ $
+
+ $ echo 255 > red
+ $
+
+
+
+as of 6/2012
+
+dl9pf <at> gmx <dot> de
diff --git a/Documentation/leds/leds-blinkm.txt b/Documentation/leds/leds-blinkm.txt
deleted file mode 100644
index 9dd92f4cf4e1..000000000000
--- a/Documentation/leds/leds-blinkm.txt
+++ /dev/null
@@ -1,80 +0,0 @@
-The leds-blinkm driver supports the devices of the BlinkM family.
-
-They are RGB-LED modules driven by a (AT)tiny microcontroller and
-communicate through I2C. The default address of these modules is
-0x09 but this can be changed through a command. By this you could
-dasy-chain up to 127 BlinkMs on an I2C bus.
-
-The device accepts RGB and HSB color values through separate commands.
-Also you can store blinking sequences as "scripts" in
-the controller and run them. Also fading is an option.
-
-The interface this driver provides is 2-fold:
-
-a) LED class interface for use with triggers
-############################################
-
-The registration follows the scheme:
-blinkm-<i2c-bus-nr>-<i2c-device-nr>-<color>
-
-$ ls -h /sys/class/leds/blinkm-6-*
-/sys/class/leds/blinkm-6-9-blue:
-brightness device max_brightness power subsystem trigger uevent
-
-/sys/class/leds/blinkm-6-9-green:
-brightness device max_brightness power subsystem trigger uevent
-
-/sys/class/leds/blinkm-6-9-red:
-brightness device max_brightness power subsystem trigger uevent
-
-(same is /sys/bus/i2c/devices/6-0009/leds)
-
-We can control the colors separated into red, green and blue and
-assign triggers on each color.
-
-E.g.:
-
-$ cat blinkm-6-9-blue/brightness
-05
-
-$ echo 200 > blinkm-6-9-blue/brightness
-$
-
-$ modprobe ledtrig-heartbeat
-$ echo heartbeat > blinkm-6-9-green/trigger
-$
-
-
-b) Sysfs group to control rgb, fade, hsb, scripts ...
-#####################################################
-
-This extended interface is available as folder blinkm
-in the sysfs folder of the I2C device.
-E.g. below /sys/bus/i2c/devices/6-0009/blinkm
-
-$ ls -h /sys/bus/i2c/devices/6-0009/blinkm/
-blue green red test
-
-Currently supported is just setting red, green, blue
-and a test sequence.
-
-E.g.:
-
-$ cat *
-00
-00
-00
-#Write into test to start test sequence!#
-
-$ echo 1 > test
-$
-
-$ echo 255 > red
-$
-
-
-
-as of 6/2012
-
-dl9pf <at> gmx <dot> de
-
diff --git a/Documentation/leds/leds-class-flash.rst b/Documentation/leds/leds-class-flash.rst
new file mode 100644
index 000000000000..6ec12c5a1a0e
--- /dev/null
+++ b/Documentation/leds/leds-class-flash.rst
@@ -0,0 +1,90 @@
+==============================
+Flash LED handling under Linux
+==============================
+
+Some LED devices provide two modes - torch and flash. In the LED subsystem
+those modes are supported by LED class (see Documentation/leds/leds-class.rst)
+and LED Flash class respectively. The torch mode related features are enabled
+by default and the flash ones only if a driver declares it by setting
+LED_DEV_CAP_FLASH flag.
+
+In order to enable the support for flash LEDs CONFIG_LEDS_CLASS_FLASH symbol
+must be defined in the kernel config. A LED Flash class driver must be
+registered in the LED subsystem with led_classdev_flash_register function.
+
+Following sysfs attributes are exposed for controlling flash LED devices:
+(see Documentation/ABI/testing/sysfs-class-led-flash)
+
+ - flash_brightness
+ - max_flash_brightness
+ - flash_timeout
+ - max_flash_timeout
+ - flash_strobe
+ - flash_fault
+
+
+V4L2 flash wrapper for flash LEDs
+=================================
+
+A LED subsystem driver can be controlled also from the level of VideoForLinux2
+subsystem. In order to enable this CONFIG_V4L2_FLASH_LED_CLASS symbol has to
+be defined in the kernel config.
+
+The driver must call the v4l2_flash_init function to get registered in the
+V4L2 subsystem. The function takes six arguments:
+
+- dev:
+ flash device, e.g. an I2C device
+- of_node:
+ of_node of the LED, may be NULL if the same as device's
+- fled_cdev:
+ LED flash class device to wrap
+- iled_cdev:
+ LED flash class device representing indicator LED associated with
+ fled_cdev, may be NULL
+- ops:
+ V4L2 specific ops
+
+ * external_strobe_set
+ defines the source of the flash LED strobe -
+ V4L2_CID_FLASH_STROBE control or external source, typically
+ a sensor, which makes it possible to synchronise the flash
+ strobe start with exposure start,
+ * intensity_to_led_brightness and led_brightness_to_intensity
+ perform
+ enum led_brightness <-> V4L2 intensity conversion in a device
+ specific manner - they can be used for devices with non-linear
+ LED current scale.
+- config:
+ configuration for V4L2 Flash sub-device
+
+ * dev_name
+ the name of the media entity, unique in the system,
+ * flash_faults
+ bitmask of flash faults that the LED flash class
+ device can report; corresponding LED_FAULT* bit definitions are
+ available in <linux/led-class-flash.h>,
+ * torch_intensity
+ constraints for the LED in TORCH mode
+ in microamperes,
+ * indicator_intensity
+ constraints for the indicator LED
+ in microamperes,
+ * has_external_strobe
+ determines whether the flash strobe source
+ can be switched to external,
+
+On remove the v4l2_flash_release function has to be called, which takes one
+argument - struct v4l2_flash pointer returned previously by v4l2_flash_init.
+This function can be safely called with NULL or error pointer argument.
+
+Please refer to drivers/leds/leds-max77693.c for an exemplary usage of the
+v4l2 flash wrapper.
+
+Once the V4L2 sub-device is registered by the driver which created the Media
+controller device, the sub-device node acts just as a node of a native V4L2
+flash API device would. The calls are simply routed to the LED flash API.
+
+Opening the V4L2 flash sub-device makes the LED subsystem sysfs interface
+unavailable. The interface is re-enabled after the V4L2 flash sub-device
+is closed.
diff --git a/Documentation/leds/leds-class-flash.txt b/Documentation/leds/leds-class-flash.txt
deleted file mode 100644
index 8da3c6f4b60b..000000000000
--- a/Documentation/leds/leds-class-flash.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-
-Flash LED handling under Linux
-==============================
-
-Some LED devices provide two modes - torch and flash. In the LED subsystem
-those modes are supported by LED class (see Documentation/leds/leds-class.txt)
-and LED Flash class respectively. The torch mode related features are enabled
-by default and the flash ones only if a driver declares it by setting
-LED_DEV_CAP_FLASH flag.
-
-In order to enable the support for flash LEDs CONFIG_LEDS_CLASS_FLASH symbol
-must be defined in the kernel config. A LED Flash class driver must be
-registered in the LED subsystem with led_classdev_flash_register function.
-
-Following sysfs attributes are exposed for controlling flash LED devices:
-(see Documentation/ABI/testing/sysfs-class-led-flash)
- - flash_brightness
- - max_flash_brightness
- - flash_timeout
- - max_flash_timeout
- - flash_strobe
- - flash_fault
-
-
-V4L2 flash wrapper for flash LEDs
-=================================
-
-A LED subsystem driver can be controlled also from the level of VideoForLinux2
-subsystem. In order to enable this CONFIG_V4L2_FLASH_LED_CLASS symbol has to
-be defined in the kernel config.
-
-The driver must call the v4l2_flash_init function to get registered in the
-V4L2 subsystem. The function takes six arguments:
-- dev : flash device, e.g. an I2C device
-- of_node : of_node of the LED, may be NULL if the same as device's
-- fled_cdev : LED flash class device to wrap
-- iled_cdev : LED flash class device representing indicator LED associated with
- fled_cdev, may be NULL
-- ops : V4L2 specific ops
- * external_strobe_set - defines the source of the flash LED strobe -
- V4L2_CID_FLASH_STROBE control or external source, typically
- a sensor, which makes it possible to synchronise the flash
- strobe start with exposure start,
- * intensity_to_led_brightness and led_brightness_to_intensity - perform
- enum led_brightness <-> V4L2 intensity conversion in a device
- specific manner - they can be used for devices with non-linear
- LED current scale.
-- config : configuration for V4L2 Flash sub-device
- * dev_name - the name of the media entity, unique in the system,
- * flash_faults - bitmask of flash faults that the LED flash class
- device can report; corresponding LED_FAULT* bit definitions are
- available in <linux/led-class-flash.h>,
- * torch_intensity - constraints for the LED in TORCH mode
- in microamperes,
- * indicator_intensity - constraints for the indicator LED
- in microamperes,
- * has_external_strobe - determines whether the flash strobe source
- can be switched to external,
-
-On remove the v4l2_flash_release function has to be called, which takes one
-argument - struct v4l2_flash pointer returned previously by v4l2_flash_init.
-This function can be safely called with NULL or error pointer argument.
-
-Please refer to drivers/leds/leds-max77693.c for an exemplary usage of the
-v4l2 flash wrapper.
-
-Once the V4L2 sub-device is registered by the driver which created the Media
-controller device, the sub-device node acts just as a node of a native V4L2
-flash API device would. The calls are simply routed to the LED flash API.
-
-Opening the V4L2 flash sub-device makes the LED subsystem sysfs interface
-unavailable. The interface is re-enabled after the V4L2 flash sub-device
-is closed.
diff --git a/Documentation/leds/leds-class.rst b/Documentation/leds/leds-class.rst
new file mode 100644
index 000000000000..df0120a1ee3c
--- /dev/null
+++ b/Documentation/leds/leds-class.rst
@@ -0,0 +1,125 @@
+========================
+LED handling under Linux
+========================
+
+In its simplest form, the LED class just allows control of LEDs from
+userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the
+LED is defined in max_brightness file. The brightness file will set the brightness
+of the LED (taking a value 0-max_brightness). Most LEDs don't have hardware
+brightness support so will just be turned on for non-zero brightness settings.
+
+The class also introduces the optional concept of an LED trigger. A trigger
+is a kernel based source of led events. Triggers can either be simple or
+complex. A simple trigger isn't configurable and is designed to slot into
+existing subsystems with minimal additional code. Examples are the disk-activity,
+nand-disk and sharpsl-charge triggers. With led triggers disabled, the code
+optimises away.
+
+Complex triggers while available to all LEDs have LED specific
+parameters and work on a per LED basis. The timer trigger is an example.
+The timer trigger will periodically change the LED brightness between
+LED_OFF and the current brightness setting. The "on" and "off" time can
+be specified via /sys/class/leds/<device>/delay_{on,off} in milliseconds.
+You can change the brightness value of a LED independently of the timer
+trigger. However, if you set the brightness value to LED_OFF it will
+also disable the timer trigger.
+
+You can change triggers in a similar manner to the way an IO scheduler
+is chosen (via /sys/class/leds/<device>/trigger). Trigger specific
+parameters can appear in /sys/class/leds/<device> once a given trigger is
+selected.
+
+
+Design Philosophy
+=================
+
+The underlying design philosophy is simplicity. LEDs are simple devices
+and the aim is to keep a small amount of code giving as much functionality
+as possible. Please keep this in mind when suggesting enhancements.
+
+
+LED Device Naming
+=================
+
+Is currently of the form:
+
+ "devicename:colour:function"
+
+There have been calls for LED properties such as colour to be exported as
+individual led class attributes. As a solution which doesn't incur as much
+overhead, I suggest these become part of the device name. The naming scheme
+above leaves scope for further attributes should they be needed. If sections
+of the name don't apply, just leave that section blank.
+
+
+Brightness setting API
+======================
+
+LED subsystem core exposes following API for setting brightness:
+
+ - led_set_brightness:
+ it is guaranteed not to sleep, passing LED_OFF stops
+ blinking,
+
+ - led_set_brightness_sync:
+ for use cases when immediate effect is desired -
+ it can block the caller for the time required for accessing
+ device registers and can sleep, passing LED_OFF stops hardware
+ blinking, returns -EBUSY if software blink fallback is enabled.
+
+
+LED registration API
+====================
+
+A driver wanting to register a LED classdev for use by other drivers /
+userspace needs to allocate and fill a led_classdev struct and then call
+`[devm_]led_classdev_register`. If the non devm version is used the driver
+must call led_classdev_unregister from its remove function before
+free-ing the led_classdev struct.
+
+If the driver can detect hardware initiated brightness changes and thus
+wants to have a brightness_hw_changed attribute then the LED_BRIGHT_HW_CHANGED
+flag must be set in flags before registering. Calling
+led_classdev_notify_brightness_hw_changed on a classdev not registered with
+the LED_BRIGHT_HW_CHANGED flag is a bug and will trigger a WARN_ON.
+
+Hardware accelerated blink of LEDs
+==================================
+
+Some LEDs can be programmed to blink without any CPU interaction. To
+support this feature, a LED driver can optionally implement the
+blink_set() function (see <linux/leds.h>). To set an LED to blinking,
+however, it is better to use the API function led_blink_set(), as it
+will check and implement software fallback if necessary.
+
+To turn off blinking, use the API function led_brightness_set()
+with brightness value LED_OFF, which should stop any software
+timers that may have been required for blinking.
+
+The blink_set() function should choose a user friendly blinking value
+if it is called with `*delay_on==0` && `*delay_off==0` parameters. In this
+case the driver should give back the chosen value through delay_on and
+delay_off parameters to the leds subsystem.
+
+Setting the brightness to zero with brightness_set() callback function
+should completely turn off the LED and cancel the previously programmed
+hardware blinking function, if any.
+
+
+Known Issues
+============
+
+The LED Trigger core cannot be a module as the simple trigger functions
+would cause nightmare dependency issues. I see this as a minor issue
+compared to the benefits the simple trigger functionality brings. The
+rest of the LED subsystem can be modular.
+
+
+Future Development
+==================
+
+At the moment, a trigger can't be created specifically for a single LED.
+There are a number of cases where a trigger might only be mappable to a
+particular LED (ACPI?). The addition of triggers provided by the LED driver
+should cover this option and be possible to add without breaking the
+current interface.
diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt
deleted file mode 100644
index 8b39cc6b03ee..000000000000
--- a/Documentation/leds/leds-class.txt
+++ /dev/null
@@ -1,122 +0,0 @@
-
-LED handling under Linux
-========================
-
-In its simplest form, the LED class just allows control of LEDs from
-userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the
-LED is defined in max_brightness file. The brightness file will set the brightness
-of the LED (taking a value 0-max_brightness). Most LEDs don't have hardware
-brightness support so will just be turned on for non-zero brightness settings.
-
-The class also introduces the optional concept of an LED trigger. A trigger
-is a kernel based source of led events. Triggers can either be simple or
-complex. A simple trigger isn't configurable and is designed to slot into
-existing subsystems with minimal additional code. Examples are the disk-activity,
-nand-disk and sharpsl-charge triggers. With led triggers disabled, the code
-optimises away.
-
-Complex triggers while available to all LEDs have LED specific
-parameters and work on a per LED basis. The timer trigger is an example.
-The timer trigger will periodically change the LED brightness between
-LED_OFF and the current brightness setting. The "on" and "off" time can
-be specified via /sys/class/leds/<device>/delay_{on,off} in milliseconds.
-You can change the brightness value of a LED independently of the timer
-trigger. However, if you set the brightness value to LED_OFF it will
-also disable the timer trigger.
-
-You can change triggers in a similar manner to the way an IO scheduler
-is chosen (via /sys/class/leds/<device>/trigger). Trigger specific
-parameters can appear in /sys/class/leds/<device> once a given trigger is
-selected.
-
-
-Design Philosophy
-=================
-
-The underlying design philosophy is simplicity. LEDs are simple devices
-and the aim is to keep a small amount of code giving as much functionality
-as possible. Please keep this in mind when suggesting enhancements.
-
-
-LED Device Naming
-=================
-
-Is currently of the form:
-
-"devicename:colour:function"
-
-There have been calls for LED properties such as colour to be exported as
-individual led class attributes. As a solution which doesn't incur as much
-overhead, I suggest these become part of the device name. The naming scheme
-above leaves scope for further attributes should they be needed. If sections
-of the name don't apply, just leave that section blank.
-
-
-Brightness setting API
-======================
-
-LED subsystem core exposes following API for setting brightness:
-
- - led_set_brightness : it is guaranteed not to sleep, passing LED_OFF stops
- blinking,
- - led_set_brightness_sync : for use cases when immediate effect is desired -
- it can block the caller for the time required for accessing
- device registers and can sleep, passing LED_OFF stops hardware
- blinking, returns -EBUSY if software blink fallback is enabled.
-
-
-LED registration API
-====================
-
-A driver wanting to register a LED classdev for use by other drivers /
-userspace needs to allocate and fill a led_classdev struct and then call
-[devm_]led_classdev_register. If the non devm version is used the driver
-must call led_classdev_unregister from its remove function before
-free-ing the led_classdev struct.
-
-If the driver can detect hardware initiated brightness changes and thus
-wants to have a brightness_hw_changed attribute then the LED_BRIGHT_HW_CHANGED
-flag must be set in flags before registering. Calling
-led_classdev_notify_brightness_hw_changed on a classdev not registered with
-the LED_BRIGHT_HW_CHANGED flag is a bug and will trigger a WARN_ON.
-
-Hardware accelerated blink of LEDs
-==================================
-
-Some LEDs can be programmed to blink without any CPU interaction. To
-support this feature, a LED driver can optionally implement the
-blink_set() function (see <linux/leds.h>). To set an LED to blinking,
-however, it is better to use the API function led_blink_set(), as it
-will check and implement software fallback if necessary.
-
-To turn off blinking, use the API function led_brightness_set()
-with brightness value LED_OFF, which should stop any software
-timers that may have been required for blinking.
-
-The blink_set() function should choose a user friendly blinking value
-if it is called with *delay_on==0 && *delay_off==0 parameters. In this
-case the driver should give back the chosen value through delay_on and
-delay_off parameters to the leds subsystem.
-
-Setting the brightness to zero with brightness_set() callback function
-should completely turn off the LED and cancel the previously programmed
-hardware blinking function, if any.
-
-
-Known Issues
-============
-
-The LED Trigger core cannot be a module as the simple trigger functions
-would cause nightmare dependency issues. I see this as a minor issue
-compared to the benefits the simple trigger functionality brings. The
-rest of the LED subsystem can be modular.
-
-
-Future Development
-==================
-
-At the moment, a trigger can't be created specifically for a single LED.
-There are a number of cases where a trigger might only be mappable to a
-particular LED (ACPI?). The addition of triggers provided by the LED driver
-should cover this option and be possible to add without breaking the
-current interface.
diff --git a/Documentation/leds/leds-lm3556.rst b/Documentation/leds/leds-lm3556.rst
new file mode 100644
index 000000000000..1ef17d7d800e
--- /dev/null
+++ b/Documentation/leds/leds-lm3556.rst
@@ -0,0 +1,137 @@
+========================
+Kernel driver for lm3556
+========================
+
+* Texas Instrument:
+ 1.5 A Synchronous Boost LED Flash Driver w/ High-Side Current Source
+* Datasheet: http://www.national.com/ds/LM/LM3556.pdf
+
+Authors:
+ - Daniel Jeong
+
+ Contact:Daniel Jeong(daniel.jeong-at-ti.com, gshark.jeong-at-gmail.com)
+
+Description
+-----------
+There are 3 functions in LM3556, Flash, Torch and Indicator.
+
+Flash Mode
+^^^^^^^^^^
+
+In Flash Mode, the LED current source(LED) provides 16 target current levels
+from 93.75 mA to 1500 mA.The Flash currents are adjusted via the CURRENT
+CONTROL REGISTER(0x09).Flash mode is activated by the ENABLE REGISTER(0x0A),
+or by pulling the STROBE pin HIGH.
+
+LM3556 Flash can be controlled through sys/class/leds/flash/brightness file
+
+* if STROBE pin is enabled, below example control brightness only, and
+ ON / OFF will be controlled by STROBE pin.
+
+Flash Example:
+
+OFF::
+
+ #echo 0 > sys/class/leds/flash/brightness
+
+93.75 mA::
+
+ #echo 1 > sys/class/leds/flash/brightness
+
+...
+
+1500 mA::
+
+ #echo 16 > sys/class/leds/flash/brightness
+
+Torch Mode
+^^^^^^^^^^
+
+In Torch Mode, the current source(LED) is programmed via the CURRENT CONTROL
+REGISTER(0x09).Torch Mode is activated by the ENABLE REGISTER(0x0A) or by the
+hardware TORCH input.
+
+LM3556 torch can be controlled through sys/class/leds/torch/brightness file.
+* if TORCH pin is enabled, below example control brightness only,
+and ON / OFF will be controlled by TORCH pin.
+
+Torch Example:
+
+OFF::
+
+ #echo 0 > sys/class/leds/torch/brightness
+
+46.88 mA::
+
+ #echo 1 > sys/class/leds/torch/brightness
+
+...
+
+375 mA::
+
+ #echo 8 > sys/class/leds/torch/brightness
+
+Indicator Mode
+^^^^^^^^^^^^^^
+
+Indicator pattern can be set through sys/class/leds/indicator/pattern file,
+and 4 patterns are pre-defined in indicator_pattern array.
+
+According to N-lank, Pulse time and N Period values, different pattern wiill
+be generated.If you want new patterns for your own device, change
+indicator_pattern array with your own values and INDIC_PATTERN_SIZE.
+
+Please refer datasheet for more detail about N-Blank, Pulse time and N Period.
+
+Indicator pattern example:
+
+pattern 0::
+
+ #echo 0 > sys/class/leds/indicator/pattern
+
+...
+
+pattern 3::
+
+ #echo 3 > sys/class/leds/indicator/pattern
+
+Indicator brightness can be controlled through
+sys/class/leds/indicator/brightness file.
+
+Example:
+
+OFF::
+
+ #echo 0 > sys/class/leds/indicator/brightness
+
+5.86 mA::
+
+ #echo 1 > sys/class/leds/indicator/brightness
+
+...
+
+46.875mA::
+
+ #echo 8 > sys/class/leds/indicator/brightness
+
+Notes
+-----
+Driver expects it is registered using the i2c_board_info mechanism.
+To register the chip at address 0x63 on specific adapter, set the platform data
+according to include/linux/platform_data/leds-lm3556.h, set the i2c board info
+
+Example::
+
+ static struct i2c_board_info board_i2c_ch4[] __initdata = {
+ {
+ I2C_BOARD_INFO(LM3556_NAME, 0x63),
+ .platform_data = &lm3556_pdata,
+ },
+ };
+
+and register it in the platform init function
+
+Example::
+
+ board_register_i2c_bus(4, 400,
+ board_i2c_ch4, ARRAY_SIZE(board_i2c_ch4));
diff --git a/Documentation/leds/leds-lm3556.txt b/Documentation/leds/leds-lm3556.txt
deleted file mode 100644
index 62278e871b50..000000000000
--- a/Documentation/leds/leds-lm3556.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-Kernel driver for lm3556
-========================
-
-*Texas Instrument:
- 1.5 A Synchronous Boost LED Flash Driver w/ High-Side Current Source
-* Datasheet: http://www.national.com/ds/LM/LM3556.pdf
-
-Authors:
- Daniel Jeong
- Contact:Daniel Jeong(daniel.jeong-at-ti.com, gshark.jeong-at-gmail.com)
-
-Description
------------
-There are 3 functions in LM3556, Flash, Torch and Indicator.
-
-FLASH MODE
-In Flash Mode, the LED current source(LED) provides 16 target current levels
-from 93.75 mA to 1500 mA.The Flash currents are adjusted via the CURRENT
-CONTROL REGISTER(0x09).Flash mode is activated by the ENABLE REGISTER(0x0A),
-or by pulling the STROBE pin HIGH.
-LM3556 Flash can be controlled through sys/class/leds/flash/brightness file
-* if STROBE pin is enabled, below example control brightness only, and
-ON / OFF will be controlled by STROBE pin.
-
-Flash Example:
-OFF : #echo 0 > sys/class/leds/flash/brightness
-93.75 mA: #echo 1 > sys/class/leds/flash/brightness
-... .....
-1500 mA: #echo 16 > sys/class/leds/flash/brightness
-
-TORCH MODE
-In Torch Mode, the current source(LED) is programmed via the CURRENT CONTROL
-REGISTER(0x09).Torch Mode is activated by the ENABLE REGISTER(0x0A) or by the
-hardware TORCH input.
-LM3556 torch can be controlled through sys/class/leds/torch/brightness file.
-* if TORCH pin is enabled, below example control brightness only,
-and ON / OFF will be controlled by TORCH pin.
-
-Torch Example:
-OFF : #echo 0 > sys/class/leds/torch/brightness
-46.88 mA: #echo 1 > sys/class/leds/torch/brightness
-... .....
-375 mA : #echo 8 > sys/class/leds/torch/brightness
-
-INDICATOR MODE
-Indicator pattern can be set through sys/class/leds/indicator/pattern file,
-and 4 patterns are pre-defined in indicator_pattern array.
-According to N-lank, Pulse time and N Period values, different pattern wiill
-be generated.If you want new patterns for your own device, change
-indicator_pattern array with your own values and INDIC_PATTERN_SIZE.
-Please refer datasheet for more detail about N-Blank, Pulse time and N Period.
-
-Indicator pattern example:
-pattern 0: #echo 0 > sys/class/leds/indicator/pattern
-....
-pattern 3: #echo 3 > sys/class/leds/indicator/pattern
-
-Indicator brightness can be controlled through
-sys/class/leds/indicator/brightness file.
-
-Example:
-OFF : #echo 0 > sys/class/leds/indicator/brightness
-5.86 mA : #echo 1 > sys/class/leds/indicator/brightness
-........
-46.875mA : #echo 8 > sys/class/leds/indicator/brightness
-
-Notes
------
-Driver expects it is registered using the i2c_board_info mechanism.
-To register the chip at address 0x63 on specific adapter, set the platform data
-according to include/linux/platform_data/leds-lm3556.h, set the i2c board info
-
-Example:
- static struct i2c_board_info board_i2c_ch4[] __initdata = {
- {
- I2C_BOARD_INFO(LM3556_NAME, 0x63),
- .platform_data = &lm3556_pdata,
- },
- };
-
-and register it in the platform init function
-
-Example:
- board_register_i2c_bus(4, 400,
- board_i2c_ch4, ARRAY_SIZE(board_i2c_ch4));
diff --git a/Documentation/leds/leds-lp3944.rst b/Documentation/leds/leds-lp3944.rst
new file mode 100644
index 000000000000..c2f87dc1a3a9
--- /dev/null
+++ b/Documentation/leds/leds-lp3944.rst
@@ -0,0 +1,59 @@
+====================
+Kernel driver lp3944
+====================
+
+ * National Semiconductor LP3944 Fun-light Chip
+
+ Prefix: 'lp3944'
+
+ Addresses scanned: None (see the Notes section below)
+
+ Datasheet:
+
+ Publicly available at the National Semiconductor website
+ http://www.national.com/pf/LP/LP3944.html
+
+Authors:
+ Antonio Ospite <ospite@studenti.unina.it>
+
+
+Description
+-----------
+The LP3944 is a helper chip that can drive up to 8 leds, with two programmable
+DIM modes; it could even be used as a gpio expander but this driver assumes it
+is used as a led controller.
+
+The DIM modes are used to set _blink_ patterns for leds, the pattern is
+specified supplying two parameters:
+
+ - period:
+ from 0s to 1.6s
+ - duty cycle:
+ percentage of the period the led is on, from 0 to 100
+
+Setting a led in DIM0 or DIM1 mode makes it blink according to the pattern.
+See the datasheet for details.
+
+LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
+leds, the camera flash light and the lcds power.
+
+
+Notes
+-----
+The chip is used mainly in embedded contexts, so this driver expects it is
+registered using the i2c_board_info mechanism.
+
+To register the chip at address 0x60 on adapter 0, set the platform data
+according to include/linux/leds-lp3944.h, set the i2c board info::
+
+ static struct i2c_board_info a910_i2c_board_info[] __initdata = {
+ {
+ I2C_BOARD_INFO("lp3944", 0x60),
+ .platform_data = &a910_lp3944_leds,
+ },
+ };
+
+and register it in the platform init function::
+
+ i2c_register_board_info(0, a910_i2c_board_info,
+ ARRAY_SIZE(a910_i2c_board_info));
diff --git a/Documentation/leds/leds-lp3944.txt b/Documentation/leds/leds-lp3944.txt
deleted file mode 100644
index e88ac3b60c08..000000000000
--- a/Documentation/leds/leds-lp3944.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-Kernel driver lp3944
-====================
-
- * National Semiconductor LP3944 Fun-light Chip
- Prefix: 'lp3944'
- Addresses scanned: None (see the Notes section below)
- Datasheet: Publicly available at the National Semiconductor website
- http://www.national.com/pf/LP/LP3944.html
-
-Authors:
- Antonio Ospite <ospite@studenti.unina.it>
-
-
-Description
------------
-The LP3944 is a helper chip that can drive up to 8 leds, with two programmable
-DIM modes; it could even be used as a gpio expander but this driver assumes it
-is used as a led controller.
-
-The DIM modes are used to set _blink_ patterns for leds, the pattern is
-specified supplying two parameters:
- - period: from 0s to 1.6s
- - duty cycle: percentage of the period the led is on, from 0 to 100
-
-Setting a led in DIM0 or DIM1 mode makes it blink according to the pattern.
-See the datasheet for details.
-
-LP3944 can be found on Motorola A910 smartphone, where it drives the rgb
-leds, the camera flash light and the lcds power.
-
-
-Notes
------
-The chip is used mainly in embedded contexts, so this driver expects it is
-registered using the i2c_board_info mechanism.
-
-To register the chip at address 0x60 on adapter 0, set the platform data
-according to include/linux/leds-lp3944.h, set the i2c board info:
-
- static struct i2c_board_info a910_i2c_board_info[] __initdata = {
- {
- I2C_BOARD_INFO("lp3944", 0x60),
- .platform_data = &a910_lp3944_leds,
- },
- };
-
-and register it in the platform init function
-
- i2c_register_board_info(0, a910_i2c_board_info,
- ARRAY_SIZE(a910_i2c_board_info));
diff --git a/Documentation/leds/leds-lp5521.rst b/Documentation/leds/leds-lp5521.rst
new file mode 100644
index 000000000000..0432615b083d
--- /dev/null
+++ b/Documentation/leds/leds-lp5521.rst
@@ -0,0 +1,115 @@
+========================
+Kernel driver for lp5521
+========================
+
+* National Semiconductor LP5521 led driver chip
+* Datasheet: http://www.national.com/pf/LP/LP5521.html
+
+Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
+
+Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
+
+Description
+-----------
+
+LP5521 can drive up to 3 channels. Leds can be controlled directly via
+the led class control interface. Channels have generic names:
+lp5521:channelx, where x is 0 .. 2
+
+All three channels can be also controlled using the engine micro programs.
+More details of the instructions can be found from the public data sheet.
+
+LP5521 has the internal program memory for running various LED patterns.
+There are two ways to run LED patterns.
+
+1) Legacy interface - enginex_mode and enginex_load
+ Control interface for the engines:
+
+ x is 1 .. 3
+
+ enginex_mode:
+ disabled, load, run
+ enginex_load:
+ store program (visible only in engine load mode)
+
+ Example (start to blink the channel 2 led)::
+
+ cd /sys/class/leds/lp5521:channel2/device
+ echo "load" > engine3_mode
+ echo "037f4d0003ff6000" > engine3_load
+ echo "run" > engine3_mode
+
+ To stop the engine::
+
+ echo "disabled" > engine3_mode
+
+2) Firmware interface - LP55xx common interface
+
+For the details, please refer to 'firmware' section in leds-lp55xx.txt
+
+sysfs contains a selftest entry.
+
+The test communicates with the chip and checks that
+the clock mode is automatically set to the requested one.
+
+Each channel has its own led current settings.
+
+- /sys/class/leds/lp5521:channel0/led_current - RW
+- /sys/class/leds/lp5521:channel0/max_current - RO
+
+Format: 10x mA i.e 10 means 1.0 mA
+
+example platform data::
+
+ static struct lp55xx_led_config lp5521_led_config[] = {
+ {
+ .name = "red",
+ .chan_nr = 0,
+ .led_current = 50,
+ .max_current = 130,
+ }, {
+ .name = "green",
+ .chan_nr = 1,
+ .led_current = 0,
+ .max_current = 130,
+ }, {
+ .name = "blue",
+ .chan_nr = 2,
+ .led_current = 0,
+ .max_current = 130,
+ }
+ };
+
+ static int lp5521_setup(void)
+ {
+ /* setup HW resources */
+ }
+
+ static void lp5521_release(void)
+ {
+ /* Release HW resources */
+ }
+
+ static void lp5521_enable(bool state)
+ {
+ /* Control of chip enable signal */
+ }
+
+ static struct lp55xx_platform_data lp5521_platform_data = {
+ .led_config = lp5521_led_config,
+ .num_channels = ARRAY_SIZE(lp5521_led_config),
+ .clock_mode = LP55XX_CLOCK_EXT,
+ .setup_resources = lp5521_setup,
+ .release_resources = lp5521_release,
+ .enable = lp5521_enable,
+ };
+
+Note:
+ chan_nr can have values between 0 and 2.
+ The name of each channel can be configurable.
+ If the name field is not defined, the default name will be set to 'xxxx:channelN'
+ (XXXX : pdata->label or i2c client name, N : channel number)
+
+
+If the current is set to 0 in the platform data, that channel is
+disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp5521.txt b/Documentation/leds/leds-lp5521.txt
deleted file mode 100644
index d08d8c179f85..000000000000
--- a/Documentation/leds/leds-lp5521.txt
+++ /dev/null
@@ -1,101 +0,0 @@
-Kernel driver for lp5521
-========================
-
-* National Semiconductor LP5521 led driver chip
-* Datasheet: http://www.national.com/pf/LP/LP5521.html
-
-Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
-Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
-
-Description
------------
-
-LP5521 can drive up to 3 channels. Leds can be controlled directly via
-the led class control interface. Channels have generic names:
-lp5521:channelx, where x is 0 .. 2
-
-All three channels can be also controlled using the engine micro programs.
-More details of the instructions can be found from the public data sheet.
-
-LP5521 has the internal program memory for running various LED patterns.
-There are two ways to run LED patterns.
-
-1) Legacy interface - enginex_mode and enginex_load
- Control interface for the engines:
- x is 1 .. 3
- enginex_mode : disabled, load, run
- enginex_load : store program (visible only in engine load mode)
-
- Example (start to blink the channel 2 led):
- cd /sys/class/leds/lp5521:channel2/device
- echo "load" > engine3_mode
- echo "037f4d0003ff6000" > engine3_load
- echo "run" > engine3_mode
-
- To stop the engine:
- echo "disabled" > engine3_mode
-
-2) Firmware interface - LP55xx common interface
- For the details, please refer to 'firmware' section in leds-lp55xx.txt
-
-sysfs contains a selftest entry.
-The test communicates with the chip and checks that
-the clock mode is automatically set to the requested one.
-
-Each channel has its own led current settings.
-/sys/class/leds/lp5521:channel0/led_current - RW
-/sys/class/leds/lp5521:channel0/max_current - RO
-Format: 10x mA i.e 10 means 1.0 mA
-
-example platform data:
-
-Note: chan_nr can have values between 0 and 2.
-The name of each channel can be configurable.
-If the name field is not defined, the default name will be set to 'xxxx:channelN'
-(XXXX : pdata->label or i2c client name, N : channel number)
-
-static struct lp55xx_led_config lp5521_led_config[] = {
- {
- .name = "red",
- .chan_nr = 0,
- .led_current = 50,
- .max_current = 130,
- }, {
- .name = "green",
- .chan_nr = 1,
- .led_current = 0,
- .max_current = 130,
- }, {
- .name = "blue",
- .chan_nr = 2,
- .led_current = 0,
- .max_current = 130,
- }
-};
-
-static int lp5521_setup(void)
-{
- /* setup HW resources */
-}
-
-static void lp5521_release(void)
-{
- /* Release HW resources */
-}
-
-static void lp5521_enable(bool state)
-{
- /* Control of chip enable signal */
-}
-
-static struct lp55xx_platform_data lp5521_platform_data = {
- .led_config = lp5521_led_config,
- .num_channels = ARRAY_SIZE(lp5521_led_config),
- .clock_mode = LP55XX_CLOCK_EXT,
- .setup_resources = lp5521_setup,
- .release_resources = lp5521_release,
- .enable = lp5521_enable,
-};
-
-If the current is set to 0 in the platform data, that channel is
-disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp5523.rst b/Documentation/leds/leds-lp5523.rst
new file mode 100644
index 000000000000..7d7362a1dd57
--- /dev/null
+++ b/Documentation/leds/leds-lp5523.rst
@@ -0,0 +1,147 @@
+========================
+Kernel driver for lp5523
+========================
+
+* National Semiconductor LP5523 led driver chip
+* Datasheet: http://www.national.com/pf/LP/LP5523.html
+
+Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
+Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
+
+Description
+-----------
+LP5523 can drive up to 9 channels. Leds can be controlled directly via
+the led class control interface.
+The name of each channel is configurable in the platform data - name and label.
+There are three options to make the channel name.
+
+a) Define the 'name' in the platform data
+
+To make specific channel name, then use 'name' platform data.
+
+- /sys/class/leds/R1 (name: 'R1')
+- /sys/class/leds/B1 (name: 'B1')
+
+b) Use the 'label' with no 'name' field
+
+For one device name with channel number, then use 'label'.
+- /sys/class/leds/RGB:channelN (label: 'RGB', N: 0 ~ 8)
+
+c) Default
+
+If both fields are NULL, 'lp5523' is used by default.
+- /sys/class/leds/lp5523:channelN (N: 0 ~ 8)
+
+LP5523 has the internal program memory for running various LED patterns.
+There are two ways to run LED patterns.
+
+1) Legacy interface - enginex_mode, enginex_load and enginex_leds
+
+ Control interface for the engines:
+
+ x is 1 .. 3
+
+ enginex_mode:
+ disabled, load, run
+ enginex_load:
+ microcode load
+ enginex_leds:
+ led mux control
+
+ ::
+
+ cd /sys/class/leds/lp5523:channel2/device
+ echo "load" > engine3_mode
+ echo "9d80400004ff05ff437f0000" > engine3_load
+ echo "111111111" > engine3_leds
+ echo "run" > engine3_mode
+
+ To stop the engine::
+
+ echo "disabled" > engine3_mode
+
+2) Firmware interface - LP55xx common interface
+
+For the details, please refer to 'firmware' section in leds-lp55xx.txt
+
+LP5523 has three master faders. If a channel is mapped to one of
+the master faders, its output is dimmed based on the value of the master
+fader.
+
+For example::
+
+ echo "123000123" > master_fader_leds
+
+creates the following channel-fader mappings::
+
+ channel 0,6 to master_fader1
+ channel 1,7 to master_fader2
+ channel 2,8 to master_fader3
+
+Then, to have 25% of the original output on channel 0,6::
+
+ echo 64 > master_fader1
+
+To have 0% of the original output (i.e. no output) channel 1,7::
+
+ echo 0 > master_fader2
+
+To have 100% of the original output (i.e. no dimming) on channel 2,8::
+
+ echo 255 > master_fader3
+
+To clear all master fader controls::
+
+ echo "000000000" > master_fader_leds
+
+Selftest uses always the current from the platform data.
+
+Each channel contains led current settings.
+- /sys/class/leds/lp5523:channel2/led_current - RW
+- /sys/class/leds/lp5523:channel2/max_current - RO
+
+Format: 10x mA i.e 10 means 1.0 mA
+
+Example platform data::
+
+ static struct lp55xx_led_config lp5523_led_config[] = {
+ {
+ .name = "D1",
+ .chan_nr = 0,
+ .led_current = 50,
+ .max_current = 130,
+ },
+ ...
+ {
+ .chan_nr = 8,
+ .led_current = 50,
+ .max_current = 130,
+ }
+ };
+
+ static int lp5523_setup(void)
+ {
+ /* Setup HW resources */
+ }
+
+ static void lp5523_release(void)
+ {
+ /* Release HW resources */
+ }
+
+ static void lp5523_enable(bool state)
+ {
+ /* Control chip enable signal */
+ }
+
+ static struct lp55xx_platform_data lp5523_platform_data = {
+ .led_config = lp5523_led_config,
+ .num_channels = ARRAY_SIZE(lp5523_led_config),
+ .clock_mode = LP55XX_CLOCK_EXT,
+ .setup_resources = lp5523_setup,
+ .release_resources = lp5523_release,
+ .enable = lp5523_enable,
+ };
+
+Note
+ chan_nr can have values between 0 and 8.
diff --git a/Documentation/leds/leds-lp5523.txt b/Documentation/leds/leds-lp5523.txt
deleted file mode 100644
index 0961a060fc4d..000000000000
--- a/Documentation/leds/leds-lp5523.txt
+++ /dev/null
@@ -1,130 +0,0 @@
-Kernel driver for lp5523
-========================
-
-* National Semiconductor LP5523 led driver chip
-* Datasheet: http://www.national.com/pf/LP/LP5523.html
-
-Authors: Mathias Nyman, Yuri Zaporozhets, Samu Onkalo
-Contact: Samu Onkalo (samu.p.onkalo-at-nokia.com)
-
-Description
------------
-LP5523 can drive up to 9 channels. Leds can be controlled directly via
-the led class control interface.
-The name of each channel is configurable in the platform data - name and label.
-There are three options to make the channel name.
-
-a) Define the 'name' in the platform data
-To make specific channel name, then use 'name' platform data.
-/sys/class/leds/R1 (name: 'R1')
-/sys/class/leds/B1 (name: 'B1')
-
-b) Use the 'label' with no 'name' field
-For one device name with channel number, then use 'label'.
-/sys/class/leds/RGB:channelN (label: 'RGB', N: 0 ~ 8)
-
-c) Default
-If both fields are NULL, 'lp5523' is used by default.
-/sys/class/leds/lp5523:channelN (N: 0 ~ 8)
-
-LP5523 has the internal program memory for running various LED patterns.
-There are two ways to run LED patterns.
-
-1) Legacy interface - enginex_mode, enginex_load and enginex_leds
- Control interface for the engines:
- x is 1 .. 3
- enginex_mode : disabled, load, run
- enginex_load : microcode load
- enginex_leds : led mux control
-
- cd /sys/class/leds/lp5523:channel2/device
- echo "load" > engine3_mode
- echo "9d80400004ff05ff437f0000" > engine3_load
- echo "111111111" > engine3_leds
- echo "run" > engine3_mode
-
- To stop the engine:
- echo "disabled" > engine3_mode
-
-2) Firmware interface - LP55xx common interface
- For the details, please refer to 'firmware' section in leds-lp55xx.txt
-
-LP5523 has three master faders. If a channel is mapped to one of
-the master faders, its output is dimmed based on the value of the master
-fader.
-
-For example,
-
- echo "123000123" > master_fader_leds
-
-creates the following channel-fader mappings:
-
- channel 0,6 to master_fader1
- channel 1,7 to master_fader2
- channel 2,8 to master_fader3
-
-Then, to have 25% of the original output on channel 0,6:
-
- echo 64 > master_fader1
-
-To have 0% of the original output (i.e. no output) channel 1,7:
-
- echo 0 > master_fader2
-
-To have 100% of the original output (i.e. no dimming) on channel 2,8:
-
- echo 255 > master_fader3
-
-To clear all master fader controls:
-
- echo "000000000" > master_fader_leds
-
-Selftest uses always the current from the platform data.
-
-Each channel contains led current settings.
-/sys/class/leds/lp5523:channel2/led_current - RW
-/sys/class/leds/lp5523:channel2/max_current - RO
-Format: 10x mA i.e 10 means 1.0 mA
-
-Example platform data:
-
-Note - chan_nr can have values between 0 and 8.
-
-static struct lp55xx_led_config lp5523_led_config[] = {
- {
- .name = "D1",
- .chan_nr = 0,
- .led_current = 50,
- .max_current = 130,
- },
-...
- {
- .chan_nr = 8,
- .led_current = 50,
- .max_current = 130,
- }
-};
-
-static int lp5523_setup(void)
-{
- /* Setup HW resources */
-}
-
-static void lp5523_release(void)
-{
- /* Release HW resources */
-}
-
-static void lp5523_enable(bool state)
-{
- /* Control chip enable signal */
-}
-
-static struct lp55xx_platform_data lp5523_platform_data = {
- .led_config = lp5523_led_config,
- .num_channels = ARRAY_SIZE(lp5523_led_config),
- .clock_mode = LP55XX_CLOCK_EXT,
- .setup_resources = lp5523_setup,
- .release_resources = lp5523_release,
- .enable = lp5523_enable,
-};
diff --git a/Documentation/leds/leds-lp5562.rst b/Documentation/leds/leds-lp5562.rst
new file mode 100644
index 000000000000..79bbb2487ff6
--- /dev/null
+++ b/Documentation/leds/leds-lp5562.rst
@@ -0,0 +1,137 @@
+========================
+Kernel driver for lp5562
+========================
+
+* TI LP5562 LED Driver
+
+Author: Milo(Woogyom) Kim <milo.kim@ti.com>
+
+Description
+===========
+
+ LP5562 can drive up to 4 channels. R/G/B and White.
+ LEDs can be controlled directly via the led class control interface.
+
+ All four channels can be also controlled using the engine micro programs.
+ LP5562 has the internal program memory for running various LED patterns.
+ For the details, please refer to 'firmware' section in leds-lp55xx.txt
+
+Device attribute
+================
+
+engine_mux
+ 3 Engines are allocated in LP5562, but the number of channel is 4.
+ Therefore each channel should be mapped to the engine number.
+
+ Value: RGB or W
+
+ This attribute is used for programming LED data with the firmware interface.
+ Unlike the LP5521/LP5523/55231, LP5562 has unique feature for the engine mux,
+ so additional sysfs is required
+
+ LED Map
+
+ ===== === ===============================
+ Red ... Engine 1 (fixed)
+ Green ... Engine 2 (fixed)
+ Blue ... Engine 3 (fixed)
+ White ... Engine 1 or 2 or 3 (selective)
+ ===== === ===============================
+
+How to load the program data using engine_mux
+=============================================
+
+ Before loading the LP5562 program data, engine_mux should be written between
+ the engine selection and loading the firmware.
+ Engine mux has two different mode, RGB and W.
+ RGB is used for loading RGB program data, W is used for W program data.
+
+ For example, run blinking green channel pattern::
+
+ echo 2 > /sys/bus/i2c/devices/xxxx/select_engine # 2 is for green channel
+ echo "RGB" > /sys/bus/i2c/devices/xxxx/engine_mux # engine mux for RGB
+ echo 1 > /sys/class/firmware/lp5562/loading
+ echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
+ echo 0 > /sys/class/firmware/lp5562/loading
+ echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+ To run a blinking white pattern::
+
+ echo 1 or 2 or 3 > /sys/bus/i2c/devices/xxxx/select_engine
+ echo "W" > /sys/bus/i2c/devices/xxxx/engine_mux
+ echo 1 > /sys/class/firmware/lp5562/loading
+ echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
+ echo 0 > /sys/class/firmware/lp5562/loading
+ echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+How to load the predefined patterns
+===================================
+
+ Please refer to 'leds-lp55xx.txt"
+
+Setting Current of Each Channel
+===============================
+
+ Like LP5521 and LP5523/55231, LP5562 provides LED current settings.
+ The 'led_current' and 'max_current' are used.
+
+Example of Platform data
+========================
+
+::
+
+ static struct lp55xx_led_config lp5562_led_config[] = {
+ {
+ .name = "R",
+ .chan_nr = 0,
+ .led_current = 20,
+ .max_current = 40,
+ },
+ {
+ .name = "G",
+ .chan_nr = 1,
+ .led_current = 20,
+ .max_current = 40,
+ },
+ {
+ .name = "B",
+ .chan_nr = 2,
+ .led_current = 20,
+ .max_current = 40,
+ },
+ {
+ .name = "W",
+ .chan_nr = 3,
+ .led_current = 20,
+ .max_current = 40,
+ },
+ };
+
+ static int lp5562_setup(void)
+ {
+ /* setup HW resources */
+ }
+
+ static void lp5562_release(void)
+ {
+ /* Release HW resources */
+ }
+
+ static void lp5562_enable(bool state)
+ {
+ /* Control of chip enable signal */
+ }
+
+ static struct lp55xx_platform_data lp5562_platform_data = {
+ .led_config = lp5562_led_config,
+ .num_channels = ARRAY_SIZE(lp5562_led_config),
+ .setup_resources = lp5562_setup,
+ .release_resources = lp5562_release,
+ .enable = lp5562_enable,
+ };
+
+To configure the platform specific data, lp55xx_platform_data structure is used
+
+
+If the current is set to 0 in the platform data, that channel is
+disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp5562.txt b/Documentation/leds/leds-lp5562.txt
deleted file mode 100644
index 5a823ff6b393..000000000000
--- a/Documentation/leds/leds-lp5562.txt
+++ /dev/null
@@ -1,120 +0,0 @@
-Kernel driver for LP5562
-========================
-
-* TI LP5562 LED Driver
-
-Author: Milo(Woogyom) Kim <milo.kim@ti.com>
-
-Description
-
- LP5562 can drive up to 4 channels. R/G/B and White.
- LEDs can be controlled directly via the led class control interface.
-
- All four channels can be also controlled using the engine micro programs.
- LP5562 has the internal program memory for running various LED patterns.
- For the details, please refer to 'firmware' section in leds-lp55xx.txt
-
-Device attribute: engine_mux
-
- 3 Engines are allocated in LP5562, but the number of channel is 4.
- Therefore each channel should be mapped to the engine number.
- Value : RGB or W
-
- This attribute is used for programming LED data with the firmware interface.
- Unlike the LP5521/LP5523/55231, LP5562 has unique feature for the engine mux,
- so additional sysfs is required.
-
- LED Map
- Red ... Engine 1 (fixed)
- Green ... Engine 2 (fixed)
- Blue ... Engine 3 (fixed)
- White ... Engine 1 or 2 or 3 (selective)
-
-How to load the program data using engine_mux
-
- Before loading the LP5562 program data, engine_mux should be written between
- the engine selection and loading the firmware.
- Engine mux has two different mode, RGB and W.
- RGB is used for loading RGB program data, W is used for W program data.
-
- For example, run blinking green channel pattern,
- echo 2 > /sys/bus/i2c/devices/xxxx/select_engine # 2 is for green channel
- echo "RGB" > /sys/bus/i2c/devices/xxxx/engine_mux # engine mux for RGB
- echo 1 > /sys/class/firmware/lp5562/loading
- echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
- echo 0 > /sys/class/firmware/lp5562/loading
- echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
- To run a blinking white pattern,
- echo 1 or 2 or 3 > /sys/bus/i2c/devices/xxxx/select_engine
- echo "W" > /sys/bus/i2c/devices/xxxx/engine_mux
- echo 1 > /sys/class/firmware/lp5562/loading
- echo "4000600040FF6000" > /sys/class/firmware/lp5562/data
- echo 0 > /sys/class/firmware/lp5562/loading
- echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-How to load the predefined patterns
-
- Please refer to 'leds-lp55xx.txt"
-
-Setting Current of Each Channel
-
- Like LP5521 and LP5523/55231, LP5562 provides LED current settings.
- The 'led_current' and 'max_current' are used.
-
-(Example of Platform data)
-
-To configure the platform specific data, lp55xx_platform_data structure is used.
-
-static struct lp55xx_led_config lp5562_led_config[] = {
- {
- .name = "R",
- .chan_nr = 0,
- .led_current = 20,
- .max_current = 40,
- },
- {
- .name = "G",
- .chan_nr = 1,
- .led_current = 20,
- .max_current = 40,
- },
- {
- .name = "B",
- .chan_nr = 2,
- .led_current = 20,
- .max_current = 40,
- },
- {
- .name = "W",
- .chan_nr = 3,
- .led_current = 20,
- .max_current = 40,
- },
-};
-
-static int lp5562_setup(void)
-{
- /* setup HW resources */
-}
-
-static void lp5562_release(void)
-{
- /* Release HW resources */
-}
-
-static void lp5562_enable(bool state)
-{
- /* Control of chip enable signal */
-}
-
-static struct lp55xx_platform_data lp5562_platform_data = {
- .led_config = lp5562_led_config,
- .num_channels = ARRAY_SIZE(lp5562_led_config),
- .setup_resources = lp5562_setup,
- .release_resources = lp5562_release,
- .enable = lp5562_enable,
-};
-
-If the current is set to 0 in the platform data, that channel is
-disabled and it is not visible in the sysfs.
diff --git a/Documentation/leds/leds-lp55xx.rst b/Documentation/leds/leds-lp55xx.rst
new file mode 100644
index 000000000000..632e41cec0b5
--- /dev/null
+++ b/Documentation/leds/leds-lp55xx.rst
@@ -0,0 +1,224 @@
+=================================================
+LP5521/LP5523/LP55231/LP5562/LP8501 Common Driver
+=================================================
+
+Authors: Milo(Woogyom) Kim <milo.kim@ti.com>
+
+Description
+-----------
+LP5521, LP5523/55231, LP5562 and LP8501 have common features as below.
+
+ Register access via the I2C
+ Device initialization/deinitialization
+ Create LED class devices for multiple output channels
+ Device attributes for user-space interface
+ Program memory for running LED patterns
+
+The LP55xx common driver provides these features using exported functions.
+
+ lp55xx_init_device() / lp55xx_deinit_device()
+ lp55xx_register_leds() / lp55xx_unregister_leds()
+ lp55xx_regsister_sysfs() / lp55xx_unregister_sysfs()
+
+( Driver Structure Data )
+
+In lp55xx common driver, two different data structure is used.
+
+* lp55xx_led
+ control multi output LED channels such as led current, channel index.
+* lp55xx_chip
+ general chip control such like the I2C and platform data.
+
+For example, LP5521 has maximum 3 LED channels.
+LP5523/55231 has 9 output channels::
+
+ lp55xx_chip for LP5521 ... lp55xx_led #1
+ lp55xx_led #2
+ lp55xx_led #3
+
+ lp55xx_chip for LP5523 ... lp55xx_led #1
+ lp55xx_led #2
+ .
+ .
+ lp55xx_led #9
+
+( Chip Dependent Code )
+
+To support device specific configurations, special structure
+'lpxx_device_config' is used.
+
+ - Maximum number of channels
+ - Reset command, chip enable command
+ - Chip specific initialization
+ - Brightness control register access
+ - Setting LED output current
+ - Program memory address access for running patterns
+ - Additional device specific attributes
+
+( Firmware Interface )
+
+LP55xx family devices have the internal program memory for running
+various LED patterns.
+
+This pattern data is saved as a file in the user-land or
+hex byte string is written into the memory through the I2C.
+
+LP55xx common driver supports the firmware interface.
+
+LP55xx chips have three program engines.
+
+To load and run the pattern, the programming sequence is following.
+
+ (1) Select an engine number (1/2/3)
+ (2) Mode change to load
+ (3) Write pattern data into selected area
+ (4) Mode change to run
+
+The LP55xx common driver provides simple interfaces as below.
+
+select_engine:
+ Select which engine is used for running program
+run_engine:
+ Start program which is loaded via the firmware interface
+firmware:
+ Load program data
+
+In case of LP5523, one more command is required, 'enginex_leds'.
+It is used for selecting LED output(s) at each engine number.
+In more details, please refer to 'leds-lp5523.txt'.
+
+For example, run blinking pattern in engine #1 of LP5521::
+
+ echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
+ echo 1 > /sys/class/firmware/lp5521/loading
+ echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
+ echo 0 > /sys/class/firmware/lp5521/loading
+ echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+For example, run blinking pattern in engine #3 of LP55231
+
+Two LEDs are configured as pattern output channels::
+
+ echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
+ echo 1 > /sys/class/firmware/lp55231/loading
+ echo "9d0740ff7e0040007e00a0010000" > /sys/class/firmware/lp55231/data
+ echo 0 > /sys/class/firmware/lp55231/loading
+ echo "000001100" > /sys/bus/i2c/devices/xxxx/engine3_leds
+ echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+To start blinking patterns in engine #2 and #3 simultaneously::
+
+ for idx in 2 3
+ do
+ echo $idx > /sys/class/leds/red/device/select_engine
+ sleep 0.1
+ echo 1 > /sys/class/firmware/lp5521/loading
+ echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
+ echo 0 > /sys/class/firmware/lp5521/loading
+ done
+ echo 1 > /sys/class/leds/red/device/run_engine
+
+Here is another example for LP5523.
+
+Full LED strings are selected by 'engine2_leds'::
+
+ echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
+ echo 1 > /sys/class/firmware/lp5523/loading
+ echo "9d80400004ff05ff437f0000" > /sys/class/firmware/lp5523/data
+ echo 0 > /sys/class/firmware/lp5523/loading
+ echo "111111111" > /sys/bus/i2c/devices/xxxx/engine2_leds
+ echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+As soon as 'loading' is set to 0, registered callback is called.
+Inside the callback, the selected engine is loaded and memory is updated.
+To run programmed pattern, 'run_engine' attribute should be enabled.
+
+The pattern sequence of LP8501 is similar to LP5523.
+
+However pattern data is specific.
+
+Ex 1) Engine 1 is used::
+
+ echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
+ echo 1 > /sys/class/firmware/lp8501/loading
+ echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
+ echo 0 > /sys/class/firmware/lp8501/loading
+ echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
+
+Ex 2) Engine 2 and 3 are used at the same time::
+
+ echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
+ sleep 1
+ echo 1 > /sys/class/firmware/lp8501/loading
+ echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
+ echo 0 > /sys/class/firmware/lp8501/loading
+ sleep 1
+ echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
+ sleep 1
+ echo 1 > /sys/class/firmware/lp8501/loading
+ echo "9d0340ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
+ echo 0 > /sys/class/firmware/lp8501/loading
+ sleep 1
+ echo 1 > /sys/class/leds/d1/device/run_engine
+
+( 'run_engine' and 'firmware_cb' )
+
+The sequence of running the program data is common.
+
+But each device has own specific register addresses for commands.
+
+To support this, 'run_engine' and 'firmware_cb' are configurable in each driver.
+
+run_engine:
+ Control the selected engine
+firmware_cb:
+ The callback function after loading the firmware is done.
+
+ Chip specific commands for loading and updating program memory.
+
+( Predefined pattern data )
+
+Without the firmware interface, LP55xx driver provides another method for
+loading a LED pattern. That is 'predefined' pattern.
+
+A predefined pattern is defined in the platform data and load it(or them)
+via the sysfs if needed.
+
+To use the predefined pattern concept, 'patterns' and 'num_patterns' should be
+configured.
+
+Example of predefined pattern data::
+
+ /* mode_1: blinking data */
+ static const u8 mode_1[] = {
+ 0x40, 0x00, 0x60, 0x00, 0x40, 0xFF, 0x60, 0x00,
+ };
+
+ /* mode_2: always on */
+ static const u8 mode_2[] = { 0x40, 0xFF, };
+
+ struct lp55xx_predef_pattern board_led_patterns[] = {
+ {
+ .r = mode_1,
+ .size_r = ARRAY_SIZE(mode_1),
+ },
+ {
+ .b = mode_2,
+ .size_b = ARRAY_SIZE(mode_2),
+ },
+ }
+
+ struct lp55xx_platform_data lp5562_pdata = {
+ ...
+ .patterns = board_led_patterns,
+ .num_patterns = ARRAY_SIZE(board_led_patterns),
+ };
+
+Then, mode_1 and mode_2 can be run via through the sysfs::
+
+ echo 1 > /sys/bus/i2c/devices/xxxx/led_pattern # red blinking LED pattern
+ echo 2 > /sys/bus/i2c/devices/xxxx/led_pattern # blue LED always on
+
+To stop running pattern::
+
+ echo 0 > /sys/bus/i2c/devices/xxxx/led_pattern
diff --git a/Documentation/leds/leds-lp55xx.txt b/Documentation/leds/leds-lp55xx.txt
deleted file mode 100644
index e23fa91ea722..000000000000
--- a/Documentation/leds/leds-lp55xx.txt
+++ /dev/null
@@ -1,194 +0,0 @@
-LP5521/LP5523/LP55231/LP5562/LP8501 Common Driver
-=================================================
-
-Authors: Milo(Woogyom) Kim <milo.kim@ti.com>
-
-Description
------------
-LP5521, LP5523/55231, LP5562 and LP8501 have common features as below.
-
- Register access via the I2C
- Device initialization/deinitialization
- Create LED class devices for multiple output channels
- Device attributes for user-space interface
- Program memory for running LED patterns
-
-The LP55xx common driver provides these features using exported functions.
- lp55xx_init_device() / lp55xx_deinit_device()
- lp55xx_register_leds() / lp55xx_unregister_leds()
- lp55xx_regsister_sysfs() / lp55xx_unregister_sysfs()
-
-( Driver Structure Data )
-
-In lp55xx common driver, two different data structure is used.
-
-o lp55xx_led
- control multi output LED channels such as led current, channel index.
-o lp55xx_chip
- general chip control such like the I2C and platform data.
-
-For example, LP5521 has maximum 3 LED channels.
-LP5523/55231 has 9 output channels.
-
-lp55xx_chip for LP5521 ... lp55xx_led #1
- lp55xx_led #2
- lp55xx_led #3
-
-lp55xx_chip for LP5523 ... lp55xx_led #1
- lp55xx_led #2
- .
- .
- lp55xx_led #9
-
-( Chip Dependent Code )
-
-To support device specific configurations, special structure
-'lpxx_device_config' is used.
-
- Maximum number of channels
- Reset command, chip enable command
- Chip specific initialization
- Brightness control register access
- Setting LED output current
- Program memory address access for running patterns
- Additional device specific attributes
-
-( Firmware Interface )
-
-LP55xx family devices have the internal program memory for running
-various LED patterns.
-This pattern data is saved as a file in the user-land or
-hex byte string is written into the memory through the I2C.
-LP55xx common driver supports the firmware interface.
-
-LP55xx chips have three program engines.
-To load and run the pattern, the programming sequence is following.
- (1) Select an engine number (1/2/3)
- (2) Mode change to load
- (3) Write pattern data into selected area
- (4) Mode change to run
-
-The LP55xx common driver provides simple interfaces as below.
-select_engine : Select which engine is used for running program
-run_engine : Start program which is loaded via the firmware interface
-firmware : Load program data
-
-In case of LP5523, one more command is required, 'enginex_leds'.
-It is used for selecting LED output(s) at each engine number.
-In more details, please refer to 'leds-lp5523.txt'.
-
-For example, run blinking pattern in engine #1 of LP5521
-echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp5521/loading
-echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
-echo 0 > /sys/class/firmware/lp5521/loading
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-For example, run blinking pattern in engine #3 of LP55231
-Two LEDs are configured as pattern output channels.
-echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp55231/loading
-echo "9d0740ff7e0040007e00a0010000" > /sys/class/firmware/lp55231/data
-echo 0 > /sys/class/firmware/lp55231/loading
-echo "000001100" > /sys/bus/i2c/devices/xxxx/engine3_leds
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-To start blinking patterns in engine #2 and #3 simultaneously,
-for idx in 2 3
-do
- echo $idx > /sys/class/leds/red/device/select_engine
- sleep 0.1
- echo 1 > /sys/class/firmware/lp5521/loading
- echo "4000600040FF6000" > /sys/class/firmware/lp5521/data
- echo 0 > /sys/class/firmware/lp5521/loading
-done
-echo 1 > /sys/class/leds/red/device/run_engine
-
-Here is another example for LP5523.
-Full LED strings are selected by 'engine2_leds'.
-echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp5523/loading
-echo "9d80400004ff05ff437f0000" > /sys/class/firmware/lp5523/data
-echo 0 > /sys/class/firmware/lp5523/loading
-echo "111111111" > /sys/bus/i2c/devices/xxxx/engine2_leds
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-As soon as 'loading' is set to 0, registered callback is called.
-Inside the callback, the selected engine is loaded and memory is updated.
-To run programmed pattern, 'run_engine' attribute should be enabled.
-
-The pattern sequence of LP8501 is similar to LP5523.
-However pattern data is specific.
-Ex 1) Engine 1 is used
-echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
-echo 1 > /sys/class/firmware/lp8501/loading
-echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
-echo 0 > /sys/class/firmware/lp8501/loading
-echo 1 > /sys/bus/i2c/devices/xxxx/run_engine
-
-Ex 2) Engine 2 and 3 are used at the same time
-echo 2 > /sys/bus/i2c/devices/xxxx/select_engine
-sleep 1
-echo 1 > /sys/class/firmware/lp8501/loading
-echo "9d0140ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
-echo 0 > /sys/class/firmware/lp8501/loading
-sleep 1
-echo 3 > /sys/bus/i2c/devices/xxxx/select_engine
-sleep 1
-echo 1 > /sys/class/firmware/lp8501/loading
-echo "9d0340ff7e0040007e00a001c000" > /sys/class/firmware/lp8501/data
-echo 0 > /sys/class/firmware/lp8501/loading
-sleep 1
-echo 1 > /sys/class/leds/d1/device/run_engine
-
-( 'run_engine' and 'firmware_cb' )
-The sequence of running the program data is common.
-But each device has own specific register addresses for commands.
-To support this, 'run_engine' and 'firmware_cb' are configurable in each driver.
-run_engine : Control the selected engine
-firmware_cb : The callback function after loading the firmware is done.
- Chip specific commands for loading and updating program memory.
-
-( Predefined pattern data )
-
-Without the firmware interface, LP55xx driver provides another method for
-loading a LED pattern. That is 'predefined' pattern.
-A predefined pattern is defined in the platform data and load it(or them)
-via the sysfs if needed.
-To use the predefined pattern concept, 'patterns' and 'num_patterns' should be
-configured.
-
- Example of predefined pattern data:
-
- /* mode_1: blinking data */
- static const u8 mode_1[] = {
- 0x40, 0x00, 0x60, 0x00, 0x40, 0xFF, 0x60, 0x00,
- };
-
- /* mode_2: always on */
- static const u8 mode_2[] = { 0x40, 0xFF, };
-
- struct lp55xx_predef_pattern board_led_patterns[] = {
- {
- .r = mode_1,
- .size_r = ARRAY_SIZE(mode_1),
- },
- {
- .b = mode_2,
- .size_b = ARRAY_SIZE(mode_2),
- },
- }
-
- struct lp55xx_platform_data lp5562_pdata = {
- ...
- .patterns = board_led_patterns,
- .num_patterns = ARRAY_SIZE(board_led_patterns),
- };
-
-Then, mode_1 and mode_2 can be run via through the sysfs.
-
- echo 1 > /sys/bus/i2c/devices/xxxx/led_pattern # red blinking LED pattern
- echo 2 > /sys/bus/i2c/devices/xxxx/led_pattern # blue LED always on
-
-To stop running pattern,
- echo 0 > /sys/bus/i2c/devices/xxxx/led_pattern
diff --git a/Documentation/leds/leds-mlxcpld.rst b/Documentation/leds/leds-mlxcpld.rst
new file mode 100644
index 000000000000..528582429e0b
--- /dev/null
+++ b/Documentation/leds/leds-mlxcpld.rst
@@ -0,0 +1,118 @@
+=======================================
+Kernel driver for Mellanox systems LEDs
+=======================================
+
+Provide system LED support for the nex Mellanox systems:
+"msx6710", "msx6720", "msb7700", "msn2700", "msx1410",
+"msn2410", "msb7800", "msn2740", "msn2100".
+
+Description
+-----------
+Driver provides the following LEDs for the systems "msx6710", "msx6720",
+"msb7700", "msn2700", "msx1410", "msn2410", "msb7800", "msn2740":
+
+ - mlxcpld:fan1:green
+ - mlxcpld:fan1:red
+ - mlxcpld:fan2:green
+ - mlxcpld:fan2:red
+ - mlxcpld:fan3:green
+ - mlxcpld:fan3:red
+ - mlxcpld:fan4:green
+ - mlxcpld:fan4:red
+ - mlxcpld:psu:green
+ - mlxcpld:psu:red
+ - mlxcpld:status:green
+ - mlxcpld:status:red
+
+ "status"
+ - CPLD reg offset: 0x20
+ - Bits [3:0]
+
+ "psu"
+ - CPLD reg offset: 0x20
+ - Bits [7:4]
+
+ "fan1"
+ - CPLD reg offset: 0x21
+ - Bits [3:0]
+
+ "fan2"
+ - CPLD reg offset: 0x21
+ - Bits [7:4]
+
+ "fan3"
+ - CPLD reg offset: 0x22
+ - Bits [3:0]
+
+ "fan4"
+ - CPLD reg offset: 0x22
+ - Bits [7:4]
+
+ Color mask for all the above LEDs:
+
+ [bit3,bit2,bit1,bit0] or
+ [bit7,bit6,bit5,bit4]:
+
+ - [0,0,0,0] = LED OFF
+ - [0,1,0,1] = Red static ON
+ - [1,1,0,1] = Green static ON
+ - [0,1,1,0] = Red blink 3Hz
+ - [1,1,1,0] = Green blink 3Hz
+ - [0,1,1,1] = Red blink 6Hz
+ - [1,1,1,1] = Green blink 6Hz
+
+Driver provides the following LEDs for the system "msn2100":
+
+ - mlxcpld:fan:green
+ - mlxcpld:fan:red
+ - mlxcpld:psu1:green
+ - mlxcpld:psu1:red
+ - mlxcpld:psu2:green
+ - mlxcpld:psu2:red
+ - mlxcpld:status:green
+ - mlxcpld:status:red
+ - mlxcpld:uid:blue
+
+ "status"
+ - CPLD reg offset: 0x20
+ - Bits [3:0]
+
+ "fan"
+ - CPLD reg offset: 0x21
+ - Bits [3:0]
+
+ "psu1"
+ - CPLD reg offset: 0x23
+ - Bits [3:0]
+
+ "psu2"
+ - CPLD reg offset: 0x23
+ - Bits [7:4]
+
+ "uid"
+ - CPLD reg offset: 0x24
+ - Bits [3:0]
+
+ Color mask for all the above LEDs, excepted uid:
+
+ [bit3,bit2,bit1,bit0] or
+ [bit7,bit6,bit5,bit4]:
+
+ - [0,0,0,0] = LED OFF
+ - [0,1,0,1] = Red static ON
+ - [1,1,0,1] = Green static ON
+ - [0,1,1,0] = Red blink 3Hz
+ - [1,1,1,0] = Green blink 3Hz
+ - [0,1,1,1] = Red blink 6Hz
+ - [1,1,1,1] = Green blink 6Hz
+
+ Color mask for uid LED:
+ [bit3,bit2,bit1,bit0]:
+
+ - [0,0,0,0] = LED OFF
+ - [1,1,0,1] = Blue static ON
+ - [1,1,1,0] = Blue blink 3Hz
+ - [1,1,1,1] = Blue blink 6Hz
+
+Driver supports HW blinking at 3Hz and 6Hz frequency (50% duty cycle).
+For 3Hz duty cylce is about 167 msec, for 6Hz is about 83 msec.
diff --git a/Documentation/leds/leds-mlxcpld.txt b/Documentation/leds/leds-mlxcpld.txt
deleted file mode 100644
index a0e8fd457117..000000000000
--- a/Documentation/leds/leds-mlxcpld.txt
+++ /dev/null
@@ -1,110 +0,0 @@
-Kernel driver for Mellanox systems LEDs
-=======================================
-
-Provide system LED support for the nex Mellanox systems:
-"msx6710", "msx6720", "msb7700", "msn2700", "msx1410",
-"msn2410", "msb7800", "msn2740", "msn2100".
-
-Description
------------
-Driver provides the following LEDs for the systems "msx6710", "msx6720",
-"msb7700", "msn2700", "msx1410", "msn2410", "msb7800", "msn2740":
- mlxcpld:fan1:green
- mlxcpld:fan1:red
- mlxcpld:fan2:green
- mlxcpld:fan2:red
- mlxcpld:fan3:green
- mlxcpld:fan3:red
- mlxcpld:fan4:green
- mlxcpld:fan4:red
- mlxcpld:psu:green
- mlxcpld:psu:red
- mlxcpld:status:green
- mlxcpld:status:red
-
- "status"
- CPLD reg offset: 0x20
- Bits [3:0]
-
- "psu"
- CPLD reg offset: 0x20
- Bits [7:4]
-
- "fan1"
- CPLD reg offset: 0x21
- Bits [3:0]
-
- "fan2"
- CPLD reg offset: 0x21
- Bits [7:4]
-
- "fan3"
- CPLD reg offset: 0x22
- Bits [3:0]
-
- "fan4"
- CPLD reg offset: 0x22
- Bits [7:4]
-
- Color mask for all the above LEDs:
- [bit3,bit2,bit1,bit0] or
- [bit7,bit6,bit5,bit4]:
- [0,0,0,0] = LED OFF
- [0,1,0,1] = Red static ON
- [1,1,0,1] = Green static ON
- [0,1,1,0] = Red blink 3Hz
- [1,1,1,0] = Green blink 3Hz
- [0,1,1,1] = Red blink 6Hz
- [1,1,1,1] = Green blink 6Hz
-
-Driver provides the following LEDs for the system "msn2100":
- mlxcpld:fan:green
- mlxcpld:fan:red
- mlxcpld:psu1:green
- mlxcpld:psu1:red
- mlxcpld:psu2:green
- mlxcpld:psu2:red
- mlxcpld:status:green
- mlxcpld:status:red
- mlxcpld:uid:blue
-
- "status"
- CPLD reg offset: 0x20
- Bits [3:0]
-
- "fan"
- CPLD reg offset: 0x21
- Bits [3:0]
-
- "psu1"
- CPLD reg offset: 0x23
- Bits [3:0]
-
- "psu2"
- CPLD reg offset: 0x23
- Bits [7:4]
-
- "uid"
- CPLD reg offset: 0x24
- Bits [3:0]
-
- Color mask for all the above LEDs, excepted uid:
- [bit3,bit2,bit1,bit0] or
- [bit7,bit6,bit5,bit4]:
- [0,0,0,0] = LED OFF
- [0,1,0,1] = Red static ON
- [1,1,0,1] = Green static ON
- [0,1,1,0] = Red blink 3Hz
- [1,1,1,0] = Green blink 3Hz
- [0,1,1,1] = Red blink 6Hz
- [1,1,1,1] = Green blink 6Hz
-
- Color mask for uid LED:
- [bit3,bit2,bit1,bit0]:
- [0,0,0,0] = LED OFF
- [1,1,0,1] = Blue static ON
- [1,1,1,0] = Blue blink 3Hz
- [1,1,1,1] = Blue blink 6Hz
-
-Driver supports HW blinking at 3Hz and 6Hz frequency (50% duty cycle).
-For 3Hz duty cylce is about 167 msec, for 6Hz is about 83 msec.
diff --git a/Documentation/leds/ledtrig-oneshot.rst b/Documentation/leds/ledtrig-oneshot.rst
new file mode 100644
index 000000000000..69fa3ea1d554
--- /dev/null
+++ b/Documentation/leds/ledtrig-oneshot.rst
@@ -0,0 +1,44 @@
+====================
+One-shot LED Trigger
+====================
+
+This is a LED trigger useful for signaling the user of an event where there are
+no clear trap points to put standard led-on and led-off settings. Using this
+trigger, the application needs only to signal the trigger when an event has
+happened, than the trigger turns the LED on and than keeps it off for a
+specified amount of time.
+
+This trigger is meant to be usable both for sporadic and dense events. In the
+first case, the trigger produces a clear single controlled blink for each
+event, while in the latter it keeps blinking at constant rate, as to signal
+that the events are arriving continuously.
+
+A one-shot LED only stays in a constant state when there are no events. An
+additional "invert" property specifies if the LED has to stay off (normal) or
+on (inverted) when not rearmed.
+
+The trigger can be activated from user space on led class devices as shown
+below::
+
+ echo oneshot > trigger
+
+This adds sysfs attributes to the LED that are documented in:
+Documentation/ABI/testing/sysfs-class-led-trigger-oneshot
+
+Example use-case: network devices, initialization::
+
+ echo oneshot > trigger # set trigger for this led
+ echo 33 > delay_on # blink at 1 / (33 + 33) Hz on continuous traffic
+ echo 33 > delay_off
+
+interface goes up::
+
+ echo 1 > invert # set led as normally-on, turn the led on
+
+packet received/transmitted::
+
+ echo 1 > shot # led starts blinking, ignored if already blinking
+
+interface goes down::
+
+ echo 0 > invert # set led as normally-off, turn the led off
diff --git a/Documentation/leds/ledtrig-oneshot.txt b/Documentation/leds/ledtrig-oneshot.txt
deleted file mode 100644
index fe57474a12e2..000000000000
--- a/Documentation/leds/ledtrig-oneshot.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-One-shot LED Trigger
-====================
-
-This is a LED trigger useful for signaling the user of an event where there are
-no clear trap points to put standard led-on and led-off settings. Using this
-trigger, the application needs only to signal the trigger when an event has
-happened, than the trigger turns the LED on and than keeps it off for a
-specified amount of time.
-
-This trigger is meant to be usable both for sporadic and dense events. In the
-first case, the trigger produces a clear single controlled blink for each
-event, while in the latter it keeps blinking at constant rate, as to signal
-that the events are arriving continuously.
-
-A one-shot LED only stays in a constant state when there are no events. An
-additional "invert" property specifies if the LED has to stay off (normal) or
-on (inverted) when not rearmed.
-
-The trigger can be activated from user space on led class devices as shown
-below:
-
- echo oneshot > trigger
-
-This adds sysfs attributes to the LED that are documented in:
-Documentation/ABI/testing/sysfs-class-led-trigger-oneshot
-
-Example use-case: network devices, initialization:
-
- echo oneshot > trigger # set trigger for this led
- echo 33 > delay_on # blink at 1 / (33 + 33) Hz on continuous traffic
- echo 33 > delay_off
-
-interface goes up:
-
- echo 1 > invert # set led as normally-on, turn the led on
-
-packet received/transmitted:
-
- echo 1 > shot # led starts blinking, ignored if already blinking
-
-interface goes down
-
- echo 0 > invert # set led as normally-off, turn the led off
diff --git a/Documentation/leds/ledtrig-transient.rst b/Documentation/leds/ledtrig-transient.rst
new file mode 100644
index 000000000000..d921dc830cd0
--- /dev/null
+++ b/Documentation/leds/ledtrig-transient.rst
@@ -0,0 +1,167 @@
+=====================
+LED Transient Trigger
+=====================
+
+The leds timer trigger does not currently have an interface to activate
+a one shot timer. The current support allows for setting two timers, one for
+specifying how long a state to be on, and the second for how long the state
+to be off. The delay_on value specifies the time period an LED should stay
+in on state, followed by a delay_off value that specifies how long the LED
+should stay in off state. The on and off cycle repeats until the trigger
+gets deactivated. There is no provision for one time activation to implement
+features that require an on or off state to be held just once and then stay in
+the original state forever.
+
+Without one shot timer interface, user space can still use timer trigger to
+set a timer to hold a state, however when user space application crashes or
+goes away without deactivating the timer, the hardware will be left in that
+state permanently.
+
+As a specific example of this use-case, let's look at vibrate feature on
+phones. Vibrate function on phones is implemented using PWM pins on SoC or
+PMIC. There is a need to activate one shot timer to control the vibrate
+feature, to prevent user space crashes leaving the phone in vibrate mode
+permanently causing the battery to drain.
+
+Transient trigger addresses the need for one shot timer activation. The
+transient trigger can be enabled and disabled just like the other leds
+triggers.
+
+When an led class device driver registers itself, it can specify all leds
+triggers it supports and a default trigger. During registration, activation
+routine for the default trigger gets called. During registration of an led
+class device, the LED state does not change.
+
+When the driver unregisters, deactivation routine for the currently active
+trigger will be called, and LED state is changed to LED_OFF.
+
+Driver suspend changes the LED state to LED_OFF and resume doesn't change
+the state. Please note that there is no explicit interaction between the
+suspend and resume actions and the currently enabled trigger. LED state
+changes are suspended while the driver is in suspend state. Any timers
+that are active at the time driver gets suspended, continue to run, without
+being able to actually change the LED state. Once driver is resumed, triggers
+start functioning again.
+
+LED state changes are controlled using brightness which is a common led
+class device property. When brightness is set to 0 from user space via
+echo 0 > brightness, it will result in deactivating the current trigger.
+
+Transient trigger uses standard register and unregister interfaces. During
+trigger registration, for each led class device that specifies this trigger
+as its default trigger, trigger activation routine will get called. During
+registration, the LED state does not change, unless there is another trigger
+active, in which case LED state changes to LED_OFF.
+
+During trigger unregistration, LED state gets changed to LED_OFF.
+
+Transient trigger activation routine doesn't change the LED state. It
+creates its properties and does its initialization. Transient trigger
+deactivation routine, will cancel any timer that is active before it cleans
+up and removes the properties it created. It will restore the LED state to
+non-transient state. When driver gets suspended, irrespective of the transient
+state, the LED state changes to LED_OFF.
+
+Transient trigger can be enabled and disabled from user space on led class
+devices, that support this trigger as shown below::
+
+ echo transient > trigger
+ echo none > trigger
+
+NOTE:
+ Add a new property trigger state to control the state.
+
+This trigger exports three properties, activate, state, and duration. When
+transient trigger is activated these properties are set to default values.
+
+- duration allows setting timer value in msecs. The initial value is 0.
+- activate allows activating and deactivating the timer specified by
+ duration as needed. The initial and default value is 0. This will allow
+ duration to be set after trigger activation.
+- state allows user to specify a transient state to be held for the specified
+ duration.
+
+ activate
+ - one shot timer activate mechanism.
+ 1 when activated, 0 when deactivated.
+ default value is zero when transient trigger is enabled,
+ to allow duration to be set.
+
+ activate state indicates a timer with a value of specified
+ duration running.
+ deactivated state indicates that there is no active timer
+ running.
+
+ duration
+ - one shot timer value. When activate is set, duration value
+ is used to start a timer that runs once. This value doesn't
+ get changed by the trigger unless user does a set via
+ echo new_value > duration
+
+ state
+ - transient state to be held. It has two values 0 or 1. 0 maps
+ to LED_OFF and 1 maps to LED_FULL. The specified state is
+ held for the duration of the one shot timer and then the
+ state gets changed to the non-transient state which is the
+ inverse of transient state.
+ If state = LED_FULL, when the timer runs out the state will
+ go back to LED_OFF.
+ If state = LED_OFF, when the timer runs out the state will
+ go back to LED_FULL.
+ Please note that current LED state is not checked prior to
+ changing the state to the specified state.
+ Driver could map these values to inverted depending on the
+ default states it defines for the LED in its brightness_set()
+ interface which is called from the led brightness_set()
+ interfaces to control the LED state.
+
+When timer expires activate goes back to deactivated state, duration is left
+at the set value to be used when activate is set at a future time. This will
+allow user app to set the time once and activate it to run it once for the
+specified value as needed. When timer expires, state is restored to the
+non-transient state which is the inverse of the transient state:
+
+ ================= ===============================================
+ echo 1 > activate starts timer = duration when duration is not 0.
+ echo 0 > activate cancels currently running timer.
+ echo n > duration stores timer value to be used upon next
+ activate. Currently active timer if
+ any, continues to run for the specified time.
+ echo 0 > duration stores timer value to be used upon next
+ activate. Currently active timer if any,
+ continues to run for the specified time.
+ echo 1 > state stores desired transient state LED_FULL to be
+ held for the specified duration.
+ echo 0 > state stores desired transient state LED_OFF to be
+ held for the specified duration.
+ ================= ===============================================
+
+What is not supported
+=====================
+
+- Timer activation is one shot and extending and/or shortening the timer
+ is not supported.
+
+Examples
+========
+
+use-case 1::
+
+ echo transient > trigger
+ echo n > duration
+ echo 1 > state
+
+repeat the following step as needed::
+
+ echo 1 > activate - start timer = duration to run once
+ echo 1 > activate - start timer = duration to run once
+ echo none > trigger
+
+This trigger is intended to be used for for the following example use cases:
+
+ - Control of vibrate (phones, tablets etc.) hardware by user space app.
+ - Use of LED by user space app as activity indicator.
+ - Use of LED by user space app as a kind of watchdog indicator -- as
+ long as the app is alive, it can keep the LED illuminated, if it dies
+ the LED will be extinguished automatically.
+ - Use by any user space app that needs a transient GPIO output.
diff --git a/Documentation/leds/ledtrig-transient.txt b/Documentation/leds/ledtrig-transient.txt
deleted file mode 100644
index 3bd38b487df1..000000000000
--- a/Documentation/leds/ledtrig-transient.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-LED Transient Trigger
-=====================
-
-The leds timer trigger does not currently have an interface to activate
-a one shot timer. The current support allows for setting two timers, one for
-specifying how long a state to be on, and the second for how long the state
-to be off. The delay_on value specifies the time period an LED should stay
-in on state, followed by a delay_off value that specifies how long the LED
-should stay in off state. The on and off cycle repeats until the trigger
-gets deactivated. There is no provision for one time activation to implement
-features that require an on or off state to be held just once and then stay in
-the original state forever.
-
-Without one shot timer interface, user space can still use timer trigger to
-set a timer to hold a state, however when user space application crashes or
-goes away without deactivating the timer, the hardware will be left in that
-state permanently.
-
-As a specific example of this use-case, let's look at vibrate feature on
-phones. Vibrate function on phones is implemented using PWM pins on SoC or
-PMIC. There is a need to activate one shot timer to control the vibrate
-feature, to prevent user space crashes leaving the phone in vibrate mode
-permanently causing the battery to drain.
-
-Transient trigger addresses the need for one shot timer activation. The
-transient trigger can be enabled and disabled just like the other leds
-triggers.
-
-When an led class device driver registers itself, it can specify all leds
-triggers it supports and a default trigger. During registration, activation
-routine for the default trigger gets called. During registration of an led
-class device, the LED state does not change.
-
-When the driver unregisters, deactivation routine for the currently active
-trigger will be called, and LED state is changed to LED_OFF.
-
-Driver suspend changes the LED state to LED_OFF and resume doesn't change
-the state. Please note that there is no explicit interaction between the
-suspend and resume actions and the currently enabled trigger. LED state
-changes are suspended while the driver is in suspend state. Any timers
-that are active at the time driver gets suspended, continue to run, without
-being able to actually change the LED state. Once driver is resumed, triggers
-start functioning again.
-
-LED state changes are controlled using brightness which is a common led
-class device property. When brightness is set to 0 from user space via
-echo 0 > brightness, it will result in deactivating the current trigger.
-
-Transient trigger uses standard register and unregister interfaces. During
-trigger registration, for each led class device that specifies this trigger
-as its default trigger, trigger activation routine will get called. During
-registration, the LED state does not change, unless there is another trigger
-active, in which case LED state changes to LED_OFF.
-
-During trigger unregistration, LED state gets changed to LED_OFF.
-
-Transient trigger activation routine doesn't change the LED state. It
-creates its properties and does its initialization. Transient trigger
-deactivation routine, will cancel any timer that is active before it cleans
-up and removes the properties it created. It will restore the LED state to
-non-transient state. When driver gets suspended, irrespective of the transient
-state, the LED state changes to LED_OFF.
-
-Transient trigger can be enabled and disabled from user space on led class
-devices, that support this trigger as shown below:
-
-echo transient > trigger
-echo none > trigger
-
-NOTE: Add a new property trigger state to control the state.
-
-This trigger exports three properties, activate, state, and duration. When
-transient trigger is activated these properties are set to default values.
-
-- duration allows setting timer value in msecs. The initial value is 0.
-- activate allows activating and deactivating the timer specified by
- duration as needed. The initial and default value is 0. This will allow
- duration to be set after trigger activation.
-- state allows user to specify a transient state to be held for the specified
- duration.
-
- activate - one shot timer activate mechanism.
- 1 when activated, 0 when deactivated.
- default value is zero when transient trigger is enabled,
- to allow duration to be set.
-
- activate state indicates a timer with a value of specified
- duration running.
- deactivated state indicates that there is no active timer
- running.
-
- duration - one shot timer value. When activate is set, duration value
- is used to start a timer that runs once. This value doesn't
- get changed by the trigger unless user does a set via
- echo new_value > duration
-
- state - transient state to be held. It has two values 0 or 1. 0 maps
- to LED_OFF and 1 maps to LED_FULL. The specified state is
- held for the duration of the one shot timer and then the
- state gets changed to the non-transient state which is the
- inverse of transient state.
- If state = LED_FULL, when the timer runs out the state will
- go back to LED_OFF.
- If state = LED_OFF, when the timer runs out the state will
- go back to LED_FULL.
- Please note that current LED state is not checked prior to
- changing the state to the specified state.
- Driver could map these values to inverted depending on the
- default states it defines for the LED in its brightness_set()
- interface which is called from the led brightness_set()
- interfaces to control the LED state.
-
-When timer expires activate goes back to deactivated state, duration is left
-at the set value to be used when activate is set at a future time. This will
-allow user app to set the time once and activate it to run it once for the
-specified value as needed. When timer expires, state is restored to the
-non-transient state which is the inverse of the transient state.
-
- echo 1 > activate - starts timer = duration when duration is not 0.
- echo 0 > activate - cancels currently running timer.
- echo n > duration - stores timer value to be used upon next
- activate. Currently active timer if
- any, continues to run for the specified time.
- echo 0 > duration - stores timer value to be used upon next
- activate. Currently active timer if any,
- continues to run for the specified time.
- echo 1 > state - stores desired transient state LED_FULL to be
- held for the specified duration.
- echo 0 > state - stores desired transient state LED_OFF to be
- held for the specified duration.
-
-What is not supported:
-======================
-- Timer activation is one shot and extending and/or shortening the timer
- is not supported.
-
-Example use-case 1:
- echo transient > trigger
- echo n > duration
- echo 1 > state
-repeat the following step as needed:
- echo 1 > activate - start timer = duration to run once
- echo 1 > activate - start timer = duration to run once
- echo none > trigger
-
-This trigger is intended to be used for for the following example use cases:
- - Control of vibrate (phones, tablets etc.) hardware by user space app.
- - Use of LED by user space app as activity indicator.
- - Use of LED by user space app as a kind of watchdog indicator -- as
- long as the app is alive, it can keep the LED illuminated, if it dies
- the LED will be extinguished automatically.
- - Use by any user space app that needs a transient GPIO output.
diff --git a/Documentation/leds/ledtrig-usbport.rst b/Documentation/leds/ledtrig-usbport.rst
new file mode 100644
index 000000000000..37c2505bfd57
--- /dev/null
+++ b/Documentation/leds/ledtrig-usbport.rst
@@ -0,0 +1,46 @@
+====================
+USB port LED trigger
+====================
+
+This LED trigger can be used for signalling to the user a presence of USB device
+in a given port. It simply turns on LED when device appears and turns it off
+when it disappears.
+
+It requires selecting USB ports that should be observed. All available ones are
+listed as separated entries in a "ports" subdirectory. Selecting is handled by
+echoing "1" to a chosen port.
+
+Please note that this trigger allows selecting multiple USB ports for a single
+LED.
+
+This can be useful in two cases:
+
+1) Device with single USB LED and few physical ports
+====================================================
+
+In such a case LED will be turned on as long as there is at least one connected
+USB device.
+
+2) Device with a physical port handled by few controllers
+=========================================================
+
+Some devices may have one controller per PHY standard. E.g. USB 3.0 physical
+port may be handled by ohci-platform, ehci-platform and xhci-hcd. If there is
+only one LED user will most likely want to assign ports from all 3 hubs.
+
+
+This trigger can be activated from user space on led class devices as shown
+below::
+
+ echo usbport > trigger
+
+This adds sysfs attributes to the LED that are documented in:
+Documentation/ABI/testing/sysfs-class-led-trigger-usbport
+
+Example use-case::
+
+ echo usbport > trigger
+ echo 1 > ports/usb1-port1
+ echo 1 > ports/usb2-port1
+ cat ports/usb1-port1
+ echo 0 > ports/usb1-port1
diff --git a/Documentation/leds/ledtrig-usbport.txt b/Documentation/leds/ledtrig-usbport.txt
deleted file mode 100644
index 69f54bfb4789..000000000000
--- a/Documentation/leds/ledtrig-usbport.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-USB port LED trigger
-====================
-
-This LED trigger can be used for signalling to the user a presence of USB device
-in a given port. It simply turns on LED when device appears and turns it off
-when it disappears.
-
-It requires selecting USB ports that should be observed. All available ones are
-listed as separated entries in a "ports" subdirectory. Selecting is handled by
-echoing "1" to a chosen port.
-
-Please note that this trigger allows selecting multiple USB ports for a single
-LED. This can be useful in two cases:
-
-1) Device with single USB LED and few physical ports
-
-In such a case LED will be turned on as long as there is at least one connected
-USB device.
-
-2) Device with a physical port handled by few controllers
-
-Some devices may have one controller per PHY standard. E.g. USB 3.0 physical
-port may be handled by ohci-platform, ehci-platform and xhci-hcd. If there is
-only one LED user will most likely want to assign ports from all 3 hubs.
-
-
-This trigger can be activated from user space on led class devices as shown
-below:
-
- echo usbport > trigger
-
-This adds sysfs attributes to the LED that are documented in:
-Documentation/ABI/testing/sysfs-class-led-trigger-usbport
-
-Example use-case:
-
- echo usbport > trigger
- echo 1 > ports/usb1-port1
- echo 1 > ports/usb2-port1
- cat ports/usb1-port1
- echo 0 > ports/usb1-port1
diff --git a/Documentation/leds/uleds.rst b/Documentation/leds/uleds.rst
new file mode 100644
index 000000000000..83221098009c
--- /dev/null
+++ b/Documentation/leds/uleds.rst
@@ -0,0 +1,37 @@
+==============
+Userspace LEDs
+==============
+
+The uleds driver supports userspace LEDs. This can be useful for testing
+triggers and can also be used to implement virtual LEDs.
+
+
+Usage
+=====
+
+When the driver is loaded, a character device is created at /dev/uleds. To
+create a new LED class device, open /dev/uleds and write a uleds_user_dev
+structure to it (found in kernel public header file linux/uleds.h)::
+
+ #define LED_MAX_NAME_SIZE 64
+
+ struct uleds_user_dev {
+ char name[LED_MAX_NAME_SIZE];
+ };
+
+A new LED class device will be created with the name given. The name can be
+any valid sysfs device node name, but consider using the LED class naming
+convention of "devicename:color:function".
+
+The current brightness is found by reading a single byte from the character
+device. Values are unsigned: 0 to 255. Reading will block until the brightness
+changes. The device node can also be polled to notify when the brightness value
+changes.
+
+The LED class device will be removed when the open file handle to /dev/uleds
+is closed.
+
+Multiple LED class devices are created by opening additional file handles to
+/dev/uleds.
+
+See tools/leds/uledmon.c for an example userspace program.
diff --git a/Documentation/leds/uleds.txt b/Documentation/leds/uleds.txt
deleted file mode 100644
index 13e375a580f9..000000000000
--- a/Documentation/leds/uleds.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-Userspace LEDs
-==============
-
-The uleds driver supports userspace LEDs. This can be useful for testing
-triggers and can also be used to implement virtual LEDs.
-
-
-Usage
-=====
-
-When the driver is loaded, a character device is created at /dev/uleds. To
-create a new LED class device, open /dev/uleds and write a uleds_user_dev
-structure to it (found in kernel public header file linux/uleds.h).
-
- #define LED_MAX_NAME_SIZE 64
-
- struct uleds_user_dev {
- char name[LED_MAX_NAME_SIZE];
- };
-
-A new LED class device will be created with the name given. The name can be
-any valid sysfs device node name, but consider using the LED class naming
-convention of "devicename:color:function".
-
-The current brightness is found by reading a single byte from the character
-device. Values are unsigned: 0 to 255. Reading will block until the brightness
-changes. The device node can also be polled to notify when the brightness value
-changes.
-
-The LED class device will be removed when the open file handle to /dev/uleds
-is closed.
-
-Multiple LED class devices are created by opening additional file handles to
-/dev/uleds.
-
-See tools/leds/uledmon.c for an example userspace program.
diff --git a/Documentation/livepatch/index.rst b/Documentation/livepatch/index.rst
index edd291d51847..17674a9e21b2 100644
--- a/Documentation/livepatch/index.rst
+++ b/Documentation/livepatch/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
===================
Kernel Livepatching
diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst
new file mode 100644
index 000000000000..626a463f7e42
--- /dev/null
+++ b/Documentation/locking/index.rst
@@ -0,0 +1,24 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======
+locking
+=======
+
+.. toctree::
+ :maxdepth: 1
+
+ lockdep-design
+ lockstat
+ locktorture
+ mutex-design
+ rt-mutex-design
+ rt-mutex
+ spinlocks
+ ww-mutex-design
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/locking/lockdep-design.rst b/Documentation/locking/lockdep-design.rst
new file mode 100644
index 000000000000..23fcbc4d3fc0
--- /dev/null
+++ b/Documentation/locking/lockdep-design.rst
@@ -0,0 +1,394 @@
+Runtime locking correctness validator
+=====================================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+additions by Arjan van de Ven <arjan@linux.intel.com>
+
+Lock-class
+----------
+
+The basic object the validator operates upon is a 'class' of locks.
+
+A class of locks is a group of locks that are logically the same with
+respect to locking rules, even if the locks may have multiple (possibly
+tens of thousands of) instantiations. For example a lock in the inode
+struct is one class, while each inode has its own instantiation of that
+lock class.
+
+The validator tracks the 'usage state' of lock-classes, and it tracks
+the dependencies between different lock-classes. Lock usage indicates
+how a lock is used with regard to its IRQ contexts, while lock
+dependency can be understood as lock order, where L1 -> L2 suggests that
+a task is attempting to acquire L2 while holding L1. From lockdep's
+perspective, the two locks (L1 and L2) are not necessarily related; that
+dependency just means the order ever happened. The validator maintains a
+continuing effort to prove lock usages and dependencies are correct or
+the validator will shoot a splat if incorrect.
+
+A lock-class's behavior is constructed by its instances collectively:
+when the first instance of a lock-class is used after bootup the class
+gets registered, then all (subsequent) instances will be mapped to the
+class and hence their usages and dependecies will contribute to those of
+the class. A lock-class does not go away when a lock instance does, but
+it can be removed if the memory space of the lock class (static or
+dynamic) is reclaimed, this happens for example when a module is
+unloaded or a workqueue is destroyed.
+
+State
+-----
+
+The validator tracks lock-class usage history and divides the usage into
+(4 usages * n STATEs + 1) categories:
+
+where the 4 usages can be:
+- 'ever held in STATE context'
+- 'ever held as readlock in STATE context'
+- 'ever held with STATE enabled'
+- 'ever held as readlock with STATE enabled'
+
+where the n STATEs are coded in kernel/locking/lockdep_states.h and as of
+now they include:
+- hardirq
+- softirq
+
+where the last 1 category is:
+- 'ever used' [ == !unused ]
+
+When locking rules are violated, these usage bits are presented in the
+locking error messages, inside curlies, with a total of 2 * n STATEs bits.
+A contrived example::
+
+ modprobe/2287 is trying to acquire lock:
+ (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+ but task is already holding lock:
+ (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+
+
+For a given lock, the bit positions from left to right indicate the usage
+of the lock and readlock (if exists), for each of the n STATEs listed
+above respectively, and the character displayed at each bit position
+indicates:
+
+ === ===================================================
+ '.' acquired while irqs disabled and not in irq context
+ '-' acquired in irq context
+ '+' acquired with irqs enabled
+ '?' acquired in irq context with irqs enabled.
+ === ===================================================
+
+The bits are illustrated with an example::
+
+ (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
+ ||||
+ ||| \-> softirq disabled and not in softirq context
+ || \--> acquired in softirq context
+ | \---> hardirq disabled and not in hardirq context
+ \----> acquired in hardirq context
+
+
+For a given STATE, whether the lock is ever acquired in that STATE
+context and whether that STATE is enabled yields four possible cases as
+shown in the table below. The bit character is able to indicate which
+exact case is for the lock as of the reporting time.
+
+ +--------------+-------------+--------------+
+ | | irq enabled | irq disabled |
+ +--------------+-------------+--------------+
+ | ever in irq | ? | - |
+ +--------------+-------------+--------------+
+ | never in irq | + | . |
+ +--------------+-------------+--------------+
+
+The character '-' suggests irq is disabled because if otherwise the
+charactor '?' would have been shown instead. Similar deduction can be
+applied for '+' too.
+
+Unused locks (e.g., mutexes) cannot be part of the cause of an error.
+
+
+Single-lock state rules:
+------------------------
+
+A lock is irq-safe means it was ever used in an irq context, while a lock
+is irq-unsafe means it was ever acquired with irq enabled.
+
+A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
+following states must be exclusive: only one of them is allowed to be set
+for any lock-class based on its usage::
+
+ <hardirq-safe> or <hardirq-unsafe>
+ <softirq-safe> or <softirq-unsafe>
+
+This is because if a lock can be used in irq context (irq-safe) then it
+cannot be ever acquired with irq enabled (irq-unsafe). Otherwise, a
+deadlock may happen. For example, in the scenario that after this lock
+was acquired but before released, if the context is interrupted this
+lock will be attempted to acquire twice, which creates a deadlock,
+referred to as lock recursion deadlock.
+
+The validator detects and reports lock usage that violates these
+single-lock state rules.
+
+Multi-lock dependency rules:
+----------------------------
+
+The same lock-class must not be acquired twice, because this could lead
+to lock recursion deadlocks.
+
+Furthermore, two locks can not be taken in inverse order::
+
+ <L1> -> <L2>
+ <L2> -> <L1>
+
+because this could lead to a deadlock - referred to as lock inversion
+deadlock - as attempts to acquire the two locks form a circle which
+could lead to the two contexts waiting for each other permanently. The
+validator will find such dependency circle in arbitrary complexity,
+i.e., there can be any other locking sequence between the acquire-lock
+operations; the validator will still find whether these locks can be
+acquired in a circular fashion.
+
+Furthermore, the following usage based lock dependencies are not allowed
+between any two lock-classes::
+
+ <hardirq-safe> -> <hardirq-unsafe>
+ <softirq-safe> -> <softirq-unsafe>
+
+The first rule comes from the fact that a hardirq-safe lock could be
+taken by a hardirq context, interrupting a hardirq-unsafe lock - and
+thus could result in a lock inversion deadlock. Likewise, a softirq-safe
+lock could be taken by an softirq context, interrupting a softirq-unsafe
+lock.
+
+The above rules are enforced for any locking sequence that occurs in the
+kernel: when acquiring a new lock, the validator checks whether there is
+any rule violation between the new lock and any of the held locks.
+
+When a lock-class changes its state, the following aspects of the above
+dependency rules are enforced:
+
+- if a new hardirq-safe lock is discovered, we check whether it
+ took any hardirq-unsafe lock in the past.
+
+- if a new softirq-safe lock is discovered, we check whether it took
+ any softirq-unsafe lock in the past.
+
+- if a new hardirq-unsafe lock is discovered, we check whether any
+ hardirq-safe lock took it in the past.
+
+- if a new softirq-unsafe lock is discovered, we check whether any
+ softirq-safe lock took it in the past.
+
+(Again, we do these checks too on the basis that an interrupt context
+could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
+could lead to a lock inversion deadlock - even if that lock scenario did
+not trigger in practice yet.)
+
+Exception: Nested data dependencies leading to nested locking
+-------------------------------------------------------------
+
+There are a few cases where the Linux kernel acquires more than one
+instance of the same lock-class. Such cases typically happen when there
+is some sort of hierarchy within objects of the same type. In these
+cases there is an inherent "natural" ordering between the two objects
+(defined by the properties of the hierarchy), and the kernel grabs the
+locks in this fixed order on each of the objects.
+
+An example of such an object hierarchy that results in "nested locking"
+is that of a "whole disk" block-dev object and a "partition" block-dev
+object; the partition is "part of" the whole device and as long as one
+always takes the whole disk lock as a higher lock than the partition
+lock, the lock ordering is fully correct. The validator does not
+automatically detect this natural ordering, as the locking rule behind
+the ordering is not static.
+
+In order to teach the validator about this correct usage model, new
+versions of the various locking primitives were added that allow you to
+specify a "nesting level". An example call, for the block device mutex,
+looks like this::
+
+ enum bdev_bd_mutex_lock_class
+ {
+ BD_MUTEX_NORMAL,
+ BD_MUTEX_WHOLE,
+ BD_MUTEX_PARTITION
+ };
+
+mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
+
+In this case the locking is done on a bdev object that is known to be a
+partition.
+
+The validator treats a lock that is taken in such a nested fashion as a
+separate (sub)class for the purposes of validation.
+
+Note: When changing code to use the _nested() primitives, be careful and
+check really thoroughly that the hierarchy is correctly mapped; otherwise
+you can get false positives or false negatives.
+
+Annotations
+-----------
+
+Two constructs can be used to annotate and check where and if certain locks
+must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
+
+As the name suggests, lockdep_assert_held* family of macros assert that a
+particular lock is held at a certain time (and generate a WARN() otherwise).
+This annotation is largely used all over the kernel, e.g. kernel/sched/
+core.c::
+
+ void update_rq_clock(struct rq *rq)
+ {
+ s64 delta;
+
+ lockdep_assert_held(&rq->lock);
+ [...]
+ }
+
+where holding rq->lock is required to safely update a rq's clock.
+
+The other family of macros is lockdep_*pin_lock(), which is admittedly only
+used for rq->lock ATM. Despite their limited adoption these annotations
+generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
+out to be especially helpful to debug code with callbacks, where an upper
+layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
+and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
+returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
+that nobody tampered with the lock, e.g. kernel/sched/sched.h::
+
+ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
+ {
+ rf->cookie = lockdep_pin_lock(&rq->lock);
+ [...]
+ }
+
+ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
+ {
+ [...]
+ lockdep_unpin_lock(&rq->lock, rf->cookie);
+ }
+
+While comments about locking requirements might provide useful information,
+the runtime checks performed by annotations are invaluable when debugging
+locking problems and they carry the same level of details when inspecting
+code. Always prefer annotations when in doubt!
+
+Proof of 100% correctness:
+--------------------------
+
+The validator achieves perfect, mathematical 'closure' (proof of locking
+correctness) in the sense that for every simple, standalone single-task
+locking sequence that occurred at least once during the lifetime of the
+kernel, the validator proves it with a 100% certainty that no
+combination and timing of these locking sequences can cause any class of
+lock related deadlock. [1]_
+
+I.e. complex multi-CPU and multi-task locking scenarios do not have to
+occur in practice to prove a deadlock: only the simple 'component'
+locking chains have to occur at least once (anytime, in any
+task/context) for the validator to be able to prove correctness. (For
+example, complex deadlocks that would normally need more than 3 CPUs and
+a very unlikely constellation of tasks, irq-contexts and timings to
+occur, can be detected on a plain, lightly loaded single-CPU system as
+well!)
+
+This radically decreases the complexity of locking related QA of the
+kernel: what has to be done during QA is to trigger as many "simple"
+single-task locking dependencies in the kernel as possible, at least
+once, to prove locking correctness - instead of having to trigger every
+possible combination of locking interaction between CPUs, combined with
+every possible hardirq and softirq nesting scenario (which is impossible
+to do in practice).
+
+.. [1]
+
+ assuming that the validator itself is 100% correct, and no other
+ part of the system corrupts the state of the validator in any way.
+ We also assume that all NMI/SMM paths [which could interrupt
+ even hardirq-disabled codepaths] are correct and do not interfere
+ with the validator. We also assume that the 64-bit 'chain hash'
+ value is unique for every lock-chain in the system. Also, lock
+ recursion must not be higher than 20.
+
+Performance:
+------------
+
+The above rules require **massive** amounts of runtime checking. If we did
+that for every lock taken and for every irqs-enable event, it would
+render the system practically unusably slow. The complexity of checking
+is O(N^2), so even with just a few hundred lock-classes we'd have to do
+tens of thousands of checks for every event.
+
+This problem is solved by checking any given 'locking scenario' (unique
+sequence of locks taken after each other) only once. A simple stack of
+held locks is maintained, and a lightweight 64-bit hash value is
+calculated, which hash is unique for every lock chain. The hash value,
+when the chain is validated for the first time, is then put into a hash
+table, which hash-table can be checked in a lockfree manner. If the
+locking chain occurs again later on, the hash table tells us that we
+don't have to validate the chain again.
+
+Troubleshooting:
+----------------
+
+The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
+Exceeding this number will trigger the following lockdep warning:
+
+ (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+
+By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
+desktop systems have less than 1,000 lock classes, so this warning
+normally results from lock-class leakage or failure to properly
+initialize locks. These two problems are illustrated below:
+
+1. Repeated module loading and unloading while running the validator
+ will result in lock-class leakage. The issue here is that each
+ load of the module will create a new set of lock classes for
+ that module's locks, but module unloading does not remove old
+ classes (see below discussion of reuse of lock classes for why).
+ Therefore, if that module is loaded and unloaded repeatedly,
+ the number of lock classes will eventually reach the maximum.
+
+2. Using structures such as arrays that have large numbers of
+ locks that are not explicitly initialized. For example,
+ a hash table with 8192 buckets where each bucket has its own
+ spinlock_t will consume 8192 lock classes -unless- each spinlock
+ is explicitly initialized at runtime, for example, using the
+ run-time spin_lock_init() as opposed to compile-time initializers
+ such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize
+ the per-bucket spinlocks would guarantee lock-class overflow.
+ In contrast, a loop that called spin_lock_init() on each lock
+ would place all 8192 locks into a single lock class.
+
+ The moral of this story is that you should always explicitly
+ initialize your locks.
+
+One might argue that the validator should be modified to allow
+lock classes to be reused. However, if you are tempted to make this
+argument, first review the code and think through the changes that would
+be required, keeping in mind that the lock classes to be removed are
+likely to be linked into the lock-dependency graph. This turns out to
+be harder to do than to say.
+
+Of course, if you do run out of lock classes, the next thing to do is
+to find the offending lock classes. First, the following command gives
+you the number of lock classes currently in use along with the maximum::
+
+ grep "lock-classes" /proc/lockdep_stats
+
+This command produces the following output on a modest system::
+
+ lock-classes: 748 [max: 8191]
+
+If the number allocated (748 above) increases continually over time,
+then there is likely a leak. The following command can be used to
+identify the leaking lock classes::
+
+ grep "BD" /proc/lockdep
+
+Run the command and save the output, then compare against the output from
+a later run of this command to identify the leakers. This same output
+can also help you find situations where runtime lock initialization has
+been omitted.
diff --git a/Documentation/locking/lockdep-design.txt b/Documentation/locking/lockdep-design.txt
deleted file mode 100644
index 39fae143c9cb..000000000000
--- a/Documentation/locking/lockdep-design.txt
+++ /dev/null
@@ -1,333 +0,0 @@
-Runtime locking correctness validator
-=====================================
-
-started by Ingo Molnar <mingo@redhat.com>
-additions by Arjan van de Ven <arjan@linux.intel.com>
-
-Lock-class
-----------
-
-The basic object the validator operates upon is a 'class' of locks.
-
-A class of locks is a group of locks that are logically the same with
-respect to locking rules, even if the locks may have multiple (possibly
-tens of thousands of) instantiations. For example a lock in the inode
-struct is one class, while each inode has its own instantiation of that
-lock class.
-
-The validator tracks the 'state' of lock-classes, and it tracks
-dependencies between different lock-classes. The validator maintains a
-rolling proof that the state and the dependencies are correct.
-
-Unlike an lock instantiation, the lock-class itself never goes away: when
-a lock-class is used for the first time after bootup it gets registered,
-and all subsequent uses of that lock-class will be attached to this
-lock-class.
-
-State
------
-
-The validator tracks lock-class usage history into 4 * nSTATEs + 1 separate
-state bits:
-
-- 'ever held in STATE context'
-- 'ever held as readlock in STATE context'
-- 'ever held with STATE enabled'
-- 'ever held as readlock with STATE enabled'
-
-Where STATE can be either one of (kernel/locking/lockdep_states.h)
- - hardirq
- - softirq
-
-- 'ever used' [ == !unused ]
-
-When locking rules are violated, these state bits are presented in the
-locking error messages, inside curlies. A contrived example:
-
- modprobe/2287 is trying to acquire lock:
- (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
- but task is already holding lock:
- (&sio_locks[i].lock){-.-.}, at: [<c02867fd>] mutex_lock+0x21/0x24
-
-
-The bit position indicates STATE, STATE-read, for each of the states listed
-above, and the character displayed in each indicates:
-
- '.' acquired while irqs disabled and not in irq context
- '-' acquired in irq context
- '+' acquired with irqs enabled
- '?' acquired in irq context with irqs enabled.
-
-Unused mutexes cannot be part of the cause of an error.
-
-
-Single-lock state rules:
-------------------------
-
-A softirq-unsafe lock-class is automatically hardirq-unsafe as well. The
-following states are exclusive, and only one of them is allowed to be
-set for any lock-class:
-
- <hardirq-safe> and <hardirq-unsafe>
- <softirq-safe> and <softirq-unsafe>
-
-The validator detects and reports lock usage that violate these
-single-lock state rules.
-
-Multi-lock dependency rules:
-----------------------------
-
-The same lock-class must not be acquired twice, because this could lead
-to lock recursion deadlocks.
-
-Furthermore, two locks may not be taken in different order:
-
- <L1> -> <L2>
- <L2> -> <L1>
-
-because this could lead to lock inversion deadlocks. (The validator
-finds such dependencies in arbitrary complexity, i.e. there can be any
-other locking sequence between the acquire-lock operations, the
-validator will still track all dependencies between locks.)
-
-Furthermore, the following usage based lock dependencies are not allowed
-between any two lock-classes:
-
- <hardirq-safe> -> <hardirq-unsafe>
- <softirq-safe> -> <softirq-unsafe>
-
-The first rule comes from the fact that a hardirq-safe lock could be
-taken by a hardirq context, interrupting a hardirq-unsafe lock - and
-thus could result in a lock inversion deadlock. Likewise, a softirq-safe
-lock could be taken by an softirq context, interrupting a softirq-unsafe
-lock.
-
-The above rules are enforced for any locking sequence that occurs in the
-kernel: when acquiring a new lock, the validator checks whether there is
-any rule violation between the new lock and any of the held locks.
-
-When a lock-class changes its state, the following aspects of the above
-dependency rules are enforced:
-
-- if a new hardirq-safe lock is discovered, we check whether it
- took any hardirq-unsafe lock in the past.
-
-- if a new softirq-safe lock is discovered, we check whether it took
- any softirq-unsafe lock in the past.
-
-- if a new hardirq-unsafe lock is discovered, we check whether any
- hardirq-safe lock took it in the past.
-
-- if a new softirq-unsafe lock is discovered, we check whether any
- softirq-safe lock took it in the past.
-
-(Again, we do these checks too on the basis that an interrupt context
-could interrupt _any_ of the irq-unsafe or hardirq-unsafe locks, which
-could lead to a lock inversion deadlock - even if that lock scenario did
-not trigger in practice yet.)
-
-Exception: Nested data dependencies leading to nested locking
--------------------------------------------------------------
-
-There are a few cases where the Linux kernel acquires more than one
-instance of the same lock-class. Such cases typically happen when there
-is some sort of hierarchy within objects of the same type. In these
-cases there is an inherent "natural" ordering between the two objects
-(defined by the properties of the hierarchy), and the kernel grabs the
-locks in this fixed order on each of the objects.
-
-An example of such an object hierarchy that results in "nested locking"
-is that of a "whole disk" block-dev object and a "partition" block-dev
-object; the partition is "part of" the whole device and as long as one
-always takes the whole disk lock as a higher lock than the partition
-lock, the lock ordering is fully correct. The validator does not
-automatically detect this natural ordering, as the locking rule behind
-the ordering is not static.
-
-In order to teach the validator about this correct usage model, new
-versions of the various locking primitives were added that allow you to
-specify a "nesting level". An example call, for the block device mutex,
-looks like this:
-
-enum bdev_bd_mutex_lock_class
-{
- BD_MUTEX_NORMAL,
- BD_MUTEX_WHOLE,
- BD_MUTEX_PARTITION
-};
-
- mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_PARTITION);
-
-In this case the locking is done on a bdev object that is known to be a
-partition.
-
-The validator treats a lock that is taken in such a nested fashion as a
-separate (sub)class for the purposes of validation.
-
-Note: When changing code to use the _nested() primitives, be careful and
-check really thoroughly that the hierarchy is correctly mapped; otherwise
-you can get false positives or false negatives.
-
-Annotations
------------
-
-Two constructs can be used to annotate and check where and if certain locks
-must be held: lockdep_assert_held*(&lock) and lockdep_*pin_lock(&lock).
-
-As the name suggests, lockdep_assert_held* family of macros assert that a
-particular lock is held at a certain time (and generate a WARN() otherwise).
-This annotation is largely used all over the kernel, e.g. kernel/sched/
-core.c
-
- void update_rq_clock(struct rq *rq)
- {
- s64 delta;
-
- lockdep_assert_held(&rq->lock);
- [...]
- }
-
-where holding rq->lock is required to safely update a rq's clock.
-
-The other family of macros is lockdep_*pin_lock(), which is admittedly only
-used for rq->lock ATM. Despite their limited adoption these annotations
-generate a WARN() if the lock of interest is "accidentally" unlocked. This turns
-out to be especially helpful to debug code with callbacks, where an upper
-layer assumes a lock remains taken, but a lower layer thinks it can maybe drop
-and reacquire the lock ("unwittingly" introducing races). lockdep_pin_lock()
-returns a 'struct pin_cookie' that is then used by lockdep_unpin_lock() to check
-that nobody tampered with the lock, e.g. kernel/sched/sched.h
-
- static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
- {
- rf->cookie = lockdep_pin_lock(&rq->lock);
- [...]
- }
-
- static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
- {
- [...]
- lockdep_unpin_lock(&rq->lock, rf->cookie);
- }
-
-While comments about locking requirements might provide useful information,
-the runtime checks performed by annotations are invaluable when debugging
-locking problems and they carry the same level of details when inspecting
-code. Always prefer annotations when in doubt!
-
-Proof of 100% correctness:
---------------------------
-
-The validator achieves perfect, mathematical 'closure' (proof of locking
-correctness) in the sense that for every simple, standalone single-task
-locking sequence that occurred at least once during the lifetime of the
-kernel, the validator proves it with a 100% certainty that no
-combination and timing of these locking sequences can cause any class of
-lock related deadlock. [*]
-
-I.e. complex multi-CPU and multi-task locking scenarios do not have to
-occur in practice to prove a deadlock: only the simple 'component'
-locking chains have to occur at least once (anytime, in any
-task/context) for the validator to be able to prove correctness. (For
-example, complex deadlocks that would normally need more than 3 CPUs and
-a very unlikely constellation of tasks, irq-contexts and timings to
-occur, can be detected on a plain, lightly loaded single-CPU system as
-well!)
-
-This radically decreases the complexity of locking related QA of the
-kernel: what has to be done during QA is to trigger as many "simple"
-single-task locking dependencies in the kernel as possible, at least
-once, to prove locking correctness - instead of having to trigger every
-possible combination of locking interaction between CPUs, combined with
-every possible hardirq and softirq nesting scenario (which is impossible
-to do in practice).
-
-[*] assuming that the validator itself is 100% correct, and no other
- part of the system corrupts the state of the validator in any way.
- We also assume that all NMI/SMM paths [which could interrupt
- even hardirq-disabled codepaths] are correct and do not interfere
- with the validator. We also assume that the 64-bit 'chain hash'
- value is unique for every lock-chain in the system. Also, lock
- recursion must not be higher than 20.
-
-Performance:
-------------
-
-The above rules require _massive_ amounts of runtime checking. If we did
-that for every lock taken and for every irqs-enable event, it would
-render the system practically unusably slow. The complexity of checking
-is O(N^2), so even with just a few hundred lock-classes we'd have to do
-tens of thousands of checks for every event.
-
-This problem is solved by checking any given 'locking scenario' (unique
-sequence of locks taken after each other) only once. A simple stack of
-held locks is maintained, and a lightweight 64-bit hash value is
-calculated, which hash is unique for every lock chain. The hash value,
-when the chain is validated for the first time, is then put into a hash
-table, which hash-table can be checked in a lockfree manner. If the
-locking chain occurs again later on, the hash table tells us that we
-don't have to validate the chain again.
-
-Troubleshooting:
-----------------
-
-The validator tracks a maximum of MAX_LOCKDEP_KEYS number of lock classes.
-Exceeding this number will trigger the following lockdep warning:
-
- (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
-
-By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
-desktop systems have less than 1,000 lock classes, so this warning
-normally results from lock-class leakage or failure to properly
-initialize locks. These two problems are illustrated below:
-
-1. Repeated module loading and unloading while running the validator
- will result in lock-class leakage. The issue here is that each
- load of the module will create a new set of lock classes for
- that module's locks, but module unloading does not remove old
- classes (see below discussion of reuse of lock classes for why).
- Therefore, if that module is loaded and unloaded repeatedly,
- the number of lock classes will eventually reach the maximum.
-
-2. Using structures such as arrays that have large numbers of
- locks that are not explicitly initialized. For example,
- a hash table with 8192 buckets where each bucket has its own
- spinlock_t will consume 8192 lock classes -unless- each spinlock
- is explicitly initialized at runtime, for example, using the
- run-time spin_lock_init() as opposed to compile-time initializers
- such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize
- the per-bucket spinlocks would guarantee lock-class overflow.
- In contrast, a loop that called spin_lock_init() on each lock
- would place all 8192 locks into a single lock class.
-
- The moral of this story is that you should always explicitly
- initialize your locks.
-
-One might argue that the validator should be modified to allow
-lock classes to be reused. However, if you are tempted to make this
-argument, first review the code and think through the changes that would
-be required, keeping in mind that the lock classes to be removed are
-likely to be linked into the lock-dependency graph. This turns out to
-be harder to do than to say.
-
-Of course, if you do run out of lock classes, the next thing to do is
-to find the offending lock classes. First, the following command gives
-you the number of lock classes currently in use along with the maximum:
-
- grep "lock-classes" /proc/lockdep_stats
-
-This command produces the following output on a modest system:
-
- lock-classes: 748 [max: 8191]
-
-If the number allocated (748 above) increases continually over time,
-then there is likely a leak. The following command can be used to
-identify the leaking lock classes:
-
- grep "BD" /proc/lockdep
-
-Run the command and save the output, then compare against the output from
-a later run of this command to identify the leakers. This same output
-can also help you find situations where runtime lock initialization has
-been omitted.
diff --git a/Documentation/locking/lockstat.rst b/Documentation/locking/lockstat.rst
new file mode 100644
index 000000000000..536eab8dbd99
--- /dev/null
+++ b/Documentation/locking/lockstat.rst
@@ -0,0 +1,204 @@
+===============
+Lock Statistics
+===============
+
+What
+====
+
+As the name suggests, it provides statistics on locks.
+
+
+Why
+===
+
+Because things like lock contention can severely impact performance.
+
+How
+===
+
+Lockdep already has hooks in the lock functions and maps lock instances to
+lock classes. We build on that (see Documentation/locking/lockdep-design.rst).
+The graph below shows the relation between the lock functions and the various
+hooks therein::
+
+ __acquire
+ |
+ lock _____
+ | \
+ | __contended
+ | |
+ | <wait>
+ | _______/
+ |/
+ |
+ __acquired
+ |
+ .
+ <hold>
+ .
+ |
+ __release
+ |
+ unlock
+
+ lock, unlock - the regular lock functions
+ __* - the hooks
+ <> - states
+
+With these hooks we provide the following statistics:
+
+ con-bounces
+ - number of lock contention that involved x-cpu data
+ contentions
+ - number of lock acquisitions that had to wait
+ wait time
+ min
+ - shortest (non-0) time we ever had to wait for a lock
+ max
+ - longest time we ever had to wait for a lock
+ total
+ - total time we spend waiting on this lock
+ avg
+ - average time spent waiting on this lock
+ acq-bounces
+ - number of lock acquisitions that involved x-cpu data
+ acquisitions
+ - number of times we took the lock
+ hold time
+ min
+ - shortest (non-0) time we ever held the lock
+ max
+ - longest time we ever held the lock
+ total
+ - total time this lock was held
+ avg
+ - average time this lock was held
+
+These numbers are gathered per lock class, per read/write state (when
+applicable).
+
+It also tracks 4 contention points per class. A contention point is a call site
+that had to wait on lock acquisition.
+
+Configuration
+-------------
+
+Lock statistics are enabled via CONFIG_LOCK_STAT.
+
+Usage
+-----
+
+Enable collection of statistics::
+
+ # echo 1 >/proc/sys/kernel/lock_stat
+
+Disable collection of statistics::
+
+ # echo 0 >/proc/sys/kernel/lock_stat
+
+Look at the current lock statistics::
+
+ ( line numbers not part of actual output, done for clarity in the explanation
+ below )
+
+ # less /proc/lock_stat
+
+ 01 lock_stat version 0.4
+ 02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
+ 04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 05
+ 06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99
+ 07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77
+ 08 ---------------
+ 09 &mm->mmap_sem 1 [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
+ 10 &mm->mmap_sem 96 [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+ 11 &mm->mmap_sem 34 [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+ 12 &mm->mmap_sem 17 [<ffffffff81127e71>] vm_munmap+0x41/0x80
+ 13 ---------------
+ 14 &mm->mmap_sem 1 [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
+ 15 &mm->mmap_sem 60 [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
+ 16 &mm->mmap_sem 41 [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
+ 17 &mm->mmap_sem 68 [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
+ 18
+ 19.............................................................................................................................................................................................................................
+ 20
+ 21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48
+ 22 ---------------
+ 23 unix_table_lock 45 [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+ 24 unix_table_lock 47 [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+ 25 unix_table_lock 15 [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+ 26 unix_table_lock 5 [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+ 27 ---------------
+ 28 unix_table_lock 39 [<ffffffff8150b111>] unix_release_sock+0x31/0x250
+ 29 unix_table_lock 49 [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
+ 30 unix_table_lock 20 [<ffffffff8150ca37>] unix_find_other+0x117/0x230
+ 31 unix_table_lock 4 [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
+
+
+This excerpt shows the first two lock class statistics. Line 01 shows the
+output version - each time the format changes this will be updated. Line 02-04
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
+statistics. These statistics come in two parts; the actual stats separated by a
+short separator (line 08, 13) from the contention points.
+
+Lines 09-12 show the first 4 recorded contention points (the code
+which tries to get the lock) and lines 14-17 show the first 4 recorded
+contended points (the lock holder). It is possible that the max
+con-bounces point is missing in the statistics.
+
+The first lock (05-18) is a read/write lock, and shows two lines above the
+short separator. The contention points don't match the column descriptors,
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
+
+The integer part of the time values is in us.
+
+Dealing with nested locks, subclasses may appear::
+
+ 32...........................................................................................................................................................................................................................
+ 33
+ 34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82
+ 35 ---------
+ 36 &rq->lock 645 [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+ 37 &rq->lock 297 [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+ 38 &rq->lock 360 [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
+ 39 &rq->lock 428 [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
+ 40 ---------
+ 41 &rq->lock 77 [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
+ 42 &rq->lock 174 [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+ 43 &rq->lock 4715 [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+ 44 &rq->lock 893 [<ffffffff81340524>] schedule+0x157/0x7b8
+ 45
+ 46...........................................................................................................................................................................................................................
+ 47
+ 48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84
+ 49 -----------
+ 50 &rq->lock/1 11526 [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+ 51 -----------
+ 52 &rq->lock/1 5645 [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
+ 53 &rq->lock/1 1224 [<ffffffff81340524>] schedule+0x157/0x7b8
+ 54 &rq->lock/1 4336 [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
+ 55 &rq->lock/1 181 [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
+
+Line 48 shows statistics for the second subclass (/1) of &rq->lock class
+(subclass starts from 0), since in this case, as line 50 suggests,
+double_rq_lock actually acquires a nested lock of two spinlocks.
+
+View the top contending locks::
+
+ # grep : /proc/lock_stat | head
+ clockevents_lock: 2926159 2947636 0.15 46882.81 1784540466.34 605.41 3381345 3879161 0.00 2260.97 53178395.68 13.71
+ tick_broadcast_lock: 346460 346717 0.18 2257.43 39364622.71 113.54 3642919 4242696 0.00 2263.79 49173646.60 11.59
+ &mapping->i_mmap_mutex: 203896 203899 3.36 645530.05 31767507988.39 155800.21 3361776 8893984 0.17 2254.15 14110121.02 1.59
+ &rq->lock: 135014 136909 0.18 606.09 842160.68 6.15 1540728 10436146 0.00 728.72 17606683.41 1.69
+ &(&zone->lru_lock)->rlock: 93000 94934 0.16 59.18 188253.78 1.98 1199912 3809894 0.15 391.40 3559518.81 0.93
+ tasklist_lock-W: 40667 41130 0.23 1189.42 428980.51 10.43 270278 510106 0.16 653.51 3939674.91 7.72
+ tasklist_lock-R: 21298 21305 0.20 1310.05 215511.12 10.12 186204 241258 0.14 1162.33 1179779.23 4.89
+ rcu_node_1: 47656 49022 0.16 635.41 193616.41 3.95 844888 1865423 0.00 764.26 1656226.96 0.89
+ &(&dentry->d_lockref.lock)->rlock: 39791 40179 0.15 1302.08 88851.96 2.21 2790851 12527025 0.10 1910.75 3379714.27 0.27
+ rcu_node_0: 29203 30064 0.16 786.55 1555573.00 51.74 88963 244254 0.00 398.87 428872.51 1.76
+
+Clear the statistics::
+
+ # echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/lockstat.txt b/Documentation/locking/lockstat.txt
deleted file mode 100644
index fdbeb0c45ef3..000000000000
--- a/Documentation/locking/lockstat.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-
-LOCK STATISTICS
-
-- WHAT
-
-As the name suggests, it provides statistics on locks.
-
-- WHY
-
-Because things like lock contention can severely impact performance.
-
-- HOW
-
-Lockdep already has hooks in the lock functions and maps lock instances to
-lock classes. We build on that (see Documentation/locking/lockdep-design.txt).
-The graph below shows the relation between the lock functions and the various
-hooks therein.
-
- __acquire
- |
- lock _____
- | \
- | __contended
- | |
- | <wait>
- | _______/
- |/
- |
- __acquired
- |
- .
- <hold>
- .
- |
- __release
- |
- unlock
-
-lock, unlock - the regular lock functions
-__* - the hooks
-<> - states
-
-With these hooks we provide the following statistics:
-
- con-bounces - number of lock contention that involved x-cpu data
- contentions - number of lock acquisitions that had to wait
- wait time min - shortest (non-0) time we ever had to wait for a lock
- max - longest time we ever had to wait for a lock
- total - total time we spend waiting on this lock
- avg - average time spent waiting on this lock
- acq-bounces - number of lock acquisitions that involved x-cpu data
- acquisitions - number of times we took the lock
- hold time min - shortest (non-0) time we ever held the lock
- max - longest time we ever held the lock
- total - total time this lock was held
- avg - average time this lock was held
-
-These numbers are gathered per lock class, per read/write state (when
-applicable).
-
-It also tracks 4 contention points per class. A contention point is a call site
-that had to wait on lock acquisition.
-
- - CONFIGURATION
-
-Lock statistics are enabled via CONFIG_LOCK_STAT.
-
- - USAGE
-
-Enable collection of statistics:
-
-# echo 1 >/proc/sys/kernel/lock_stat
-
-Disable collection of statistics:
-
-# echo 0 >/proc/sys/kernel/lock_stat
-
-Look at the current lock statistics:
-
-( line numbers not part of actual output, done for clarity in the explanation
- below )
-
-# less /proc/lock_stat
-
-01 lock_stat version 0.4
-02-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-03 class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg
-04-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
-05
-06 &mm->mmap_sem-W: 46 84 0.26 939.10 16371.53 194.90 47291 2922365 0.16 2220301.69 17464026916.32 5975.99
-07 &mm->mmap_sem-R: 37 100 1.31 299502.61 325629.52 3256.30 212344 34316685 0.10 7744.91 95016910.20 2.77
-08 ---------------
-09 &mm->mmap_sem 1 [<ffffffff811502a7>] khugepaged_scan_mm_slot+0x57/0x280
-10 &mm->mmap_sem 96 [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-11 &mm->mmap_sem 34 [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-12 &mm->mmap_sem 17 [<ffffffff81127e71>] vm_munmap+0x41/0x80
-13 ---------------
-14 &mm->mmap_sem 1 [<ffffffff81046fda>] dup_mmap+0x2a/0x3f0
-15 &mm->mmap_sem 60 [<ffffffff81129e29>] SyS_mprotect+0xe9/0x250
-16 &mm->mmap_sem 41 [<ffffffff815351c4>] __do_page_fault+0x1d4/0x510
-17 &mm->mmap_sem 68 [<ffffffff81113d77>] vm_mmap_pgoff+0x87/0xd0
-18
-19.............................................................................................................................................................................................................................
-20
-21 unix_table_lock: 110 112 0.21 49.24 163.91 1.46 21094 66312 0.12 624.42 31589.81 0.48
-22 ---------------
-23 unix_table_lock 45 [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-24 unix_table_lock 47 [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-25 unix_table_lock 15 [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-26 unix_table_lock 5 [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-27 ---------------
-28 unix_table_lock 39 [<ffffffff8150b111>] unix_release_sock+0x31/0x250
-29 unix_table_lock 49 [<ffffffff8150ad8e>] unix_create1+0x16e/0x1b0
-30 unix_table_lock 20 [<ffffffff8150ca37>] unix_find_other+0x117/0x230
-31 unix_table_lock 4 [<ffffffff8150a09f>] unix_autobind+0x11f/0x1b0
-
-
-This excerpt shows the first two lock class statistics. Line 01 shows the
-output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-18 and 20-31 show the actual
-statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 13) from the contention points.
-
-Lines 09-12 show the first 4 recorded contention points (the code
-which tries to get the lock) and lines 14-17 show the first 4 recorded
-contended points (the lock holder). It is possible that the max
-con-bounces point is missing in the statistics.
-
-The first lock (05-18) is a read/write lock, and shows two lines above the
-short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol. The second set of contention
-points are the points we're contending with.
-
-The integer part of the time values is in us.
-
-Dealing with nested locks, subclasses may appear:
-
-32...........................................................................................................................................................................................................................
-33
-34 &rq->lock: 13128 13128 0.43 190.53 103881.26 7.91 97454 3453404 0.00 401.11 13224683.11 3.82
-35 ---------
-36 &rq->lock 645 [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-37 &rq->lock 297 [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-38 &rq->lock 360 [<ffffffff8103c4c5>] select_task_rq_fair+0x1f0/0x74a
-39 &rq->lock 428 [<ffffffff81045f98>] scheduler_tick+0x46/0x1fb
-40 ---------
-41 &rq->lock 77 [<ffffffff8103bfc4>] task_rq_lock+0x43/0x75
-42 &rq->lock 174 [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-43 &rq->lock 4715 [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-44 &rq->lock 893 [<ffffffff81340524>] schedule+0x157/0x7b8
-45
-46...........................................................................................................................................................................................................................
-47
-48 &rq->lock/1: 1526 11488 0.33 388.73 136294.31 11.86 21461 38404 0.00 37.93 109388.53 2.84
-49 -----------
-50 &rq->lock/1 11526 [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-51 -----------
-52 &rq->lock/1 5645 [<ffffffff8103ed4b>] double_rq_lock+0x42/0x54
-53 &rq->lock/1 1224 [<ffffffff81340524>] schedule+0x157/0x7b8
-54 &rq->lock/1 4336 [<ffffffff8103ed58>] double_rq_lock+0x4f/0x54
-55 &rq->lock/1 181 [<ffffffff8104ba65>] try_to_wake_up+0x127/0x25a
-
-Line 48 shows statistics for the second subclass (/1) of &rq->lock class
-(subclass starts from 0), since in this case, as line 50 suggests,
-double_rq_lock actually acquires a nested lock of two spinlocks.
-
-View the top contending locks:
-
-# grep : /proc/lock_stat | head
- clockevents_lock: 2926159 2947636 0.15 46882.81 1784540466.34 605.41 3381345 3879161 0.00 2260.97 53178395.68 13.71
- tick_broadcast_lock: 346460 346717 0.18 2257.43 39364622.71 113.54 3642919 4242696 0.00 2263.79 49173646.60 11.59
- &mapping->i_mmap_mutex: 203896 203899 3.36 645530.05 31767507988.39 155800.21 3361776 8893984 0.17 2254.15 14110121.02 1.59
- &rq->lock: 135014 136909 0.18 606.09 842160.68 6.15 1540728 10436146 0.00 728.72 17606683.41 1.69
- &(&zone->lru_lock)->rlock: 93000 94934 0.16 59.18 188253.78 1.98 1199912 3809894 0.15 391.40 3559518.81 0.93
- tasklist_lock-W: 40667 41130 0.23 1189.42 428980.51 10.43 270278 510106 0.16 653.51 3939674.91 7.72
- tasklist_lock-R: 21298 21305 0.20 1310.05 215511.12 10.12 186204 241258 0.14 1162.33 1179779.23 4.89
- rcu_node_1: 47656 49022 0.16 635.41 193616.41 3.95 844888 1865423 0.00 764.26 1656226.96 0.89
- &(&dentry->d_lockref.lock)->rlock: 39791 40179 0.15 1302.08 88851.96 2.21 2790851 12527025 0.10 1910.75 3379714.27 0.27
- rcu_node_0: 29203 30064 0.16 786.55 1555573.00 51.74 88963 244254 0.00 398.87 428872.51 1.76
-
-Clear the statistics:
-
-# echo 0 > /proc/lock_stat
diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst
new file mode 100644
index 000000000000..e79eeeca3ac6
--- /dev/null
+++ b/Documentation/locking/locktorture.rst
@@ -0,0 +1,170 @@
+==================================
+Kernel Lock Torture Test Operation
+==================================
+
+CONFIG_LOCK_TORTURE_TEST
+========================
+
+The CONFIG LOCK_TORTURE_TEST config option provides a kernel module
+that runs torture tests on core kernel locking primitives. The kernel
+module, 'locktorture', may be built after the fact on the running
+kernel to be tested, if desired. The tests periodically output status
+messages via printk(), which can be examined via the dmesg (perhaps
+grepping for "torture"). The test is started when the module is loaded,
+and stops when the module is unloaded. This program is based on how RCU
+is tortured, via rcutorture.
+
+This torture test consists of creating a number of kernel threads which
+acquire the lock and hold it for specific amount of time, thus simulating
+different critical region behaviors. The amount of contention on the lock
+can be simulated by either enlarging this critical region hold time and/or
+creating more kthreads.
+
+
+Module Parameters
+=================
+
+This module has the following parameters:
+
+
+Locktorture-specific
+--------------------
+
+nwriters_stress
+ Number of kernel threads that will stress exclusive lock
+ ownership (writers). The default value is twice the number
+ of online CPUs.
+
+nreaders_stress
+ Number of kernel threads that will stress shared lock
+ ownership (readers). The default is the same amount of writer
+ locks. If the user did not specify nwriters_stress, then
+ both readers and writers be the amount of online CPUs.
+
+torture_type
+ Type of lock to torture. By default, only spinlocks will
+ be tortured. This module can torture the following locks,
+ with string values as follows:
+
+ - "lock_busted":
+ Simulates a buggy lock implementation.
+
+ - "spin_lock":
+ spin_lock() and spin_unlock() pairs.
+
+ - "spin_lock_irq":
+ spin_lock_irq() and spin_unlock_irq() pairs.
+
+ - "rw_lock":
+ read/write lock() and unlock() rwlock pairs.
+
+ - "rw_lock_irq":
+ read/write lock_irq() and unlock_irq()
+ rwlock pairs.
+
+ - "mutex_lock":
+ mutex_lock() and mutex_unlock() pairs.
+
+ - "rtmutex_lock":
+ rtmutex_lock() and rtmutex_unlock() pairs.
+ Kernel must have CONFIG_RT_MUTEX=y.
+
+ - "rwsem_lock":
+ read/write down() and up() semaphore pairs.
+
+
+Torture-framework (RCU + locking)
+---------------------------------
+
+shutdown_secs
+ The number of seconds to run the test before terminating
+ the test and powering off the system. The default is
+ zero, which disables test termination and system shutdown.
+ This capability is useful for automated testing.
+
+onoff_interval
+ The number of seconds between each attempt to execute a
+ randomly selected CPU-hotplug operation. Defaults
+ to zero, which disables CPU hotplugging. In
+ CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
+ refuse to do any CPU-hotplug operations regardless of
+ what value is specified for onoff_interval.
+
+onoff_holdoff
+ The number of seconds to wait until starting CPU-hotplug
+ operations. This would normally only be used when
+ locktorture was built into the kernel and started
+ automatically at boot time, in which case it is useful
+ in order to avoid confusing boot-time code with CPUs
+ coming and going. This parameter is only useful if
+ CONFIG_HOTPLUG_CPU is enabled.
+
+stat_interval
+ Number of seconds between statistics-related printk()s.
+ By default, locktorture will report stats every 60 seconds.
+ Setting the interval to zero causes the statistics to
+ be printed -only- when the module is unloaded, and this
+ is the default.
+
+stutter
+ The length of time to run the test before pausing for this
+ same period of time. Defaults to "stutter=5", so as
+ to run and pause for (roughly) five-second intervals.
+ Specifying "stutter=0" causes the test to run continuously
+ without pausing, which is the old default behavior.
+
+shuffle_interval
+ The number of seconds to keep the test threads affinitied
+ to a particular subset of the CPUs, defaults to 3 seconds.
+ Used in conjunction with test_no_idle_hz.
+
+verbose
+ Enable verbose debugging printing, via printk(). Enabled
+ by default. This extra information is mostly related to
+ high-level errors and reports from the main 'torture'
+ framework.
+
+
+Statistics
+==========
+
+Statistics are printed in the following format::
+
+ spin_lock-torture: Writes: Total: 93746064 Max/Min: 0/0 Fail: 0
+ (A) (B) (C) (D) (E)
+
+ (A): Lock type that is being tortured -- torture_type parameter.
+
+ (B): Number of writer lock acquisitions. If dealing with a read/write
+ primitive a second "Reads" statistics line is printed.
+
+ (C): Number of times the lock was acquired.
+
+ (D): Min and max number of times threads failed to acquire the lock.
+
+ (E): true/false values if there were errors acquiring the lock. This should
+ -only- be positive if there is a bug in the locking primitive's
+ implementation. Otherwise a lock should never fail (i.e., spin_lock()).
+ Of course, the same applies for (C), above. A dummy example of this is
+ the "lock_busted" type.
+
+Usage
+=====
+
+The following script may be used to torture locks::
+
+ #!/bin/sh
+
+ modprobe locktorture
+ sleep 3600
+ rmmod locktorture
+ dmesg | grep torture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors. The "rmmod" command forces a "SUCCESS",
+"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed. The first
+two are self-explanatory, while the last indicates that while there
+were no locking failures, CPU-hotplug problems were detected.
+
+Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/locktorture.txt b/Documentation/locking/locktorture.txt
deleted file mode 100644
index 6a8df4cd19bf..000000000000
--- a/Documentation/locking/locktorture.txt
+++ /dev/null
@@ -1,145 +0,0 @@
-Kernel Lock Torture Test Operation
-
-CONFIG_LOCK_TORTURE_TEST
-
-The CONFIG LOCK_TORTURE_TEST config option provides a kernel module
-that runs torture tests on core kernel locking primitives. The kernel
-module, 'locktorture', may be built after the fact on the running
-kernel to be tested, if desired. The tests periodically output status
-messages via printk(), which can be examined via the dmesg (perhaps
-grepping for "torture"). The test is started when the module is loaded,
-and stops when the module is unloaded. This program is based on how RCU
-is tortured, via rcutorture.
-
-This torture test consists of creating a number of kernel threads which
-acquire the lock and hold it for specific amount of time, thus simulating
-different critical region behaviors. The amount of contention on the lock
-can be simulated by either enlarging this critical region hold time and/or
-creating more kthreads.
-
-
-MODULE PARAMETERS
-
-This module has the following parameters:
-
-
- ** Locktorture-specific **
-
-nwriters_stress Number of kernel threads that will stress exclusive lock
- ownership (writers). The default value is twice the number
- of online CPUs.
-
-nreaders_stress Number of kernel threads that will stress shared lock
- ownership (readers). The default is the same amount of writer
- locks. If the user did not specify nwriters_stress, then
- both readers and writers be the amount of online CPUs.
-
-torture_type Type of lock to torture. By default, only spinlocks will
- be tortured. This module can torture the following locks,
- with string values as follows:
-
- o "lock_busted": Simulates a buggy lock implementation.
-
- o "spin_lock": spin_lock() and spin_unlock() pairs.
-
- o "spin_lock_irq": spin_lock_irq() and spin_unlock_irq()
- pairs.
-
- o "rw_lock": read/write lock() and unlock() rwlock pairs.
-
- o "rw_lock_irq": read/write lock_irq() and unlock_irq()
- rwlock pairs.
-
- o "mutex_lock": mutex_lock() and mutex_unlock() pairs.
-
- o "rtmutex_lock": rtmutex_lock() and rtmutex_unlock()
- pairs. Kernel must have CONFIG_RT_MUTEX=y.
-
- o "rwsem_lock": read/write down() and up() semaphore pairs.
-
-
- ** Torture-framework (RCU + locking) **
-
-shutdown_secs The number of seconds to run the test before terminating
- the test and powering off the system. The default is
- zero, which disables test termination and system shutdown.
- This capability is useful for automated testing.
-
-onoff_interval The number of seconds between each attempt to execute a
- randomly selected CPU-hotplug operation. Defaults
- to zero, which disables CPU hotplugging. In
- CONFIG_HOTPLUG_CPU=n kernels, locktorture will silently
- refuse to do any CPU-hotplug operations regardless of
- what value is specified for onoff_interval.
-
-onoff_holdoff The number of seconds to wait until starting CPU-hotplug
- operations. This would normally only be used when
- locktorture was built into the kernel and started
- automatically at boot time, in which case it is useful
- in order to avoid confusing boot-time code with CPUs
- coming and going. This parameter is only useful if
- CONFIG_HOTPLUG_CPU is enabled.
-
-stat_interval Number of seconds between statistics-related printk()s.
- By default, locktorture will report stats every 60 seconds.
- Setting the interval to zero causes the statistics to
- be printed -only- when the module is unloaded, and this
- is the default.
-
-stutter The length of time to run the test before pausing for this
- same period of time. Defaults to "stutter=5", so as
- to run and pause for (roughly) five-second intervals.
- Specifying "stutter=0" causes the test to run continuously
- without pausing, which is the old default behavior.
-
-shuffle_interval The number of seconds to keep the test threads affinitied
- to a particular subset of the CPUs, defaults to 3 seconds.
- Used in conjunction with test_no_idle_hz.
-
-verbose Enable verbose debugging printing, via printk(). Enabled
- by default. This extra information is mostly related to
- high-level errors and reports from the main 'torture'
- framework.
-
-
-STATISTICS
-
-Statistics are printed in the following format:
-
-spin_lock-torture: Writes: Total: 93746064 Max/Min: 0/0 Fail: 0
- (A) (B) (C) (D) (E)
-
-(A): Lock type that is being tortured -- torture_type parameter.
-
-(B): Number of writer lock acquisitions. If dealing with a read/write primitive
- a second "Reads" statistics line is printed.
-
-(C): Number of times the lock was acquired.
-
-(D): Min and max number of times threads failed to acquire the lock.
-
-(E): true/false values if there were errors acquiring the lock. This should
- -only- be positive if there is a bug in the locking primitive's
- implementation. Otherwise a lock should never fail (i.e., spin_lock()).
- Of course, the same applies for (C), above. A dummy example of this is
- the "lock_busted" type.
-
-USAGE
-
-The following script may be used to torture locks:
-
- #!/bin/sh
-
- modprobe locktorture
- sleep 3600
- rmmod locktorture
- dmesg | grep torture:
-
-The output can be manually inspected for the error flag of "!!!".
-One could of course create a more elaborate script that automatically
-checked for such errors. The "rmmod" command forces a "SUCCESS",
-"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed. The first
-two are self-explanatory, while the last indicates that while there
-were no locking failures, CPU-hotplug problems were detected.
-
-Also see: Documentation/RCU/torture.txt
diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst
new file mode 100644
index 000000000000..4d8236b81fa5
--- /dev/null
+++ b/Documentation/locking/mutex-design.rst
@@ -0,0 +1,152 @@
+=======================
+Generic Mutex Subsystem
+=======================
+
+started by Ingo Molnar <mingo@redhat.com>
+
+updated by Davidlohr Bueso <davidlohr@hp.com>
+
+What are mutexes?
+-----------------
+
+In the Linux kernel, mutexes refer to a particular locking primitive
+that enforces serialization on shared memory systems, and not only to
+the generic term referring to 'mutual exclusion' found in academia
+or similar theoretical text books. Mutexes are sleeping locks which
+behave similarly to binary semaphores, and were introduced in 2006[1]
+as an alternative to these. This new data structure provided a number
+of advantages, including simpler interfaces, and at that time smaller
+code (see Disadvantages).
+
+[1] http://lwn.net/Articles/164802/
+
+Implementation
+--------------
+
+Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
+and implemented in kernel/locking/mutex.c. These locks use an atomic variable
+(->owner) to keep track of the lock state during its lifetime. Field owner
+actually contains `struct task_struct *` to the current lock owner and it is
+therefore NULL if not currently owned. Since task_struct pointers are aligned
+at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
+if waiter list is non-empty). In its most basic form it also includes a
+wait-queue and a spinlock that serializes access to it. Furthermore,
+CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
+below in (ii).
+
+When acquiring a mutex, there are three possible paths that can be
+taken, depending on the state of the lock:
+
+(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
+ the current task. This only works in the uncontended case (cmpxchg() checks
+ against 0UL, so all 3 state bits above have to be 0). If the lock is
+ contended it goes to the next possible path.
+
+(ii) midpath: aka optimistic spinning, tries to spin for acquisition
+ while the lock owner is running and there are no other tasks ready
+ to run that have higher priority (need_resched). The rationale is
+ that if the lock owner is running, it is likely to release the lock
+ soon. The mutex spinners are queued up using MCS lock so that only
+ one spinner can compete for the mutex.
+
+ The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
+ with the desirable properties of being fair and with each cpu trying
+ to acquire the lock spinning on a local variable. It avoids expensive
+ cacheline bouncing that common test-and-set spinlock implementations
+ incur. An MCS-like lock is specially tailored for optimistic spinning
+ for sleeping lock implementation. An important feature of the customized
+ MCS lock is that it has the extra property that spinners are able to exit
+ the MCS spinlock queue when they need to reschedule. This further helps
+ avoid situations where MCS spinners that need to reschedule would continue
+ waiting to spin on mutex owner, only to go directly to slowpath upon
+ obtaining the MCS lock.
+
+
+(iii) slowpath: last resort, if the lock is still unable to be acquired,
+ the task is added to the wait-queue and sleeps until woken up by the
+ unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
+
+While formally kernel mutexes are sleepable locks, it is path (ii) that
+makes them more practically a hybrid type. By simply not interrupting a
+task and busy-waiting for a few cycles instead of immediately sleeping,
+the performance of this lock has been seen to significantly improve a
+number of workloads. Note that this technique is also used for rw-semaphores.
+
+Semantics
+---------
+
+The mutex subsystem checks and enforces the following rules:
+
+ - Only one task can hold the mutex at a time.
+ - Only the owner can unlock the mutex.
+ - Multiple unlocks are not permitted.
+ - Recursive locking/unlocking is not permitted.
+ - A mutex must only be initialized via the API (see below).
+ - A task may not exit with a mutex held.
+ - Memory areas where held locks reside must not be freed.
+ - Held mutexes must not be reinitialized.
+ - Mutexes may not be used in hardware or software interrupt
+ contexts such as tasklets and timers.
+
+These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled.
+In addition, the mutex debugging code also implements a number of other
+features that make lock debugging easier and faster:
+
+ - Uses symbolic names of mutexes, whenever they are printed
+ in debug output.
+ - Point-of-acquire tracking, symbolic lookup of function names,
+ list of all locks held in the system, printout of them.
+ - Owner tracking.
+ - Detects self-recursing locks and prints out all relevant info.
+ - Detects multi-task circular deadlocks and prints out all affected
+ locks and tasks (and only those tasks).
+
+
+Interfaces
+----------
+Statically define the mutex::
+
+ DEFINE_MUTEX(name);
+
+Dynamically initialize the mutex::
+
+ mutex_init(mutex);
+
+Acquire the mutex, uninterruptible::
+
+ void mutex_lock(struct mutex *lock);
+ void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
+ int mutex_trylock(struct mutex *lock);
+
+Acquire the mutex, interruptible::
+
+ int mutex_lock_interruptible_nested(struct mutex *lock,
+ unsigned int subclass);
+ int mutex_lock_interruptible(struct mutex *lock);
+
+Acquire the mutex, interruptible, if dec to 0::
+
+ int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
+
+Unlock the mutex::
+
+ void mutex_unlock(struct mutex *lock);
+
+Test if the mutex is taken::
+
+ int mutex_is_locked(struct mutex *lock);
+
+Disadvantages
+-------------
+
+Unlike its original design and purpose, 'struct mutex' is among the largest
+locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
+is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
+cache and memory footprint.
+
+When to use mutexes
+-------------------
+
+Unless the strict semantics of mutexes are unsuitable and/or the critical
+region prevents the lock from being shared, always prefer them to any other
+locking primitive.
diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt
deleted file mode 100644
index 818aca19612f..000000000000
--- a/Documentation/locking/mutex-design.txt
+++ /dev/null
@@ -1,142 +0,0 @@
-Generic Mutex Subsystem
-
-started by Ingo Molnar <mingo@redhat.com>
-updated by Davidlohr Bueso <davidlohr@hp.com>
-
-What are mutexes?
------------------
-
-In the Linux kernel, mutexes refer to a particular locking primitive
-that enforces serialization on shared memory systems, and not only to
-the generic term referring to 'mutual exclusion' found in academia
-or similar theoretical text books. Mutexes are sleeping locks which
-behave similarly to binary semaphores, and were introduced in 2006[1]
-as an alternative to these. This new data structure provided a number
-of advantages, including simpler interfaces, and at that time smaller
-code (see Disadvantages).
-
-[1] http://lwn.net/Articles/164802/
-
-Implementation
---------------
-
-Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h
-and implemented in kernel/locking/mutex.c. These locks use an atomic variable
-(->owner) to keep track of the lock state during its lifetime. Field owner
-actually contains 'struct task_struct *' to the current lock owner and it is
-therefore NULL if not currently owned. Since task_struct pointers are aligned
-at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g.,
-if waiter list is non-empty). In its most basic form it also includes a
-wait-queue and a spinlock that serializes access to it. Furthermore,
-CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described
-below in (ii).
-
-When acquiring a mutex, there are three possible paths that can be
-taken, depending on the state of the lock:
-
-(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with
- the current task. This only works in the uncontended case (cmpxchg() checks
- against 0UL, so all 3 state bits above have to be 0). If the lock is
- contended it goes to the next possible path.
-
-(ii) midpath: aka optimistic spinning, tries to spin for acquisition
- while the lock owner is running and there are no other tasks ready
- to run that have higher priority (need_resched). The rationale is
- that if the lock owner is running, it is likely to release the lock
- soon. The mutex spinners are queued up using MCS lock so that only
- one spinner can compete for the mutex.
-
- The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spinlock
- with the desirable properties of being fair and with each cpu trying
- to acquire the lock spinning on a local variable. It avoids expensive
- cacheline bouncing that common test-and-set spinlock implementations
- incur. An MCS-like lock is specially tailored for optimistic spinning
- for sleeping lock implementation. An important feature of the customized
- MCS lock is that it has the extra property that spinners are able to exit
- the MCS spinlock queue when they need to reschedule. This further helps
- avoid situations where MCS spinners that need to reschedule would continue
- waiting to spin on mutex owner, only to go directly to slowpath upon
- obtaining the MCS lock.
-
-
-(iii) slowpath: last resort, if the lock is still unable to be acquired,
- the task is added to the wait-queue and sleeps until woken up by the
- unlock path. Under normal circumstances it blocks as TASK_UNINTERRUPTIBLE.
-
-While formally kernel mutexes are sleepable locks, it is path (ii) that
-makes them more practically a hybrid type. By simply not interrupting a
-task and busy-waiting for a few cycles instead of immediately sleeping,
-the performance of this lock has been seen to significantly improve a
-number of workloads. Note that this technique is also used for rw-semaphores.
-
-Semantics
----------
-
-The mutex subsystem checks and enforces the following rules:
-
- - Only one task can hold the mutex at a time.
- - Only the owner can unlock the mutex.
- - Multiple unlocks are not permitted.
- - Recursive locking/unlocking is not permitted.
- - A mutex must only be initialized via the API (see below).
- - A task may not exit with a mutex held.
- - Memory areas where held locks reside must not be freed.
- - Held mutexes must not be reinitialized.
- - Mutexes may not be used in hardware or software interrupt
- contexts such as tasklets and timers.
-
-These semantics are fully enforced when CONFIG DEBUG_MUTEXES is enabled.
-In addition, the mutex debugging code also implements a number of other
-features that make lock debugging easier and faster:
-
- - Uses symbolic names of mutexes, whenever they are printed
- in debug output.
- - Point-of-acquire tracking, symbolic lookup of function names,
- list of all locks held in the system, printout of them.
- - Owner tracking.
- - Detects self-recursing locks and prints out all relevant info.
- - Detects multi-task circular deadlocks and prints out all affected
- locks and tasks (and only those tasks).
-
-
-Interfaces
-----------
-Statically define the mutex:
- DEFINE_MUTEX(name);
-
-Dynamically initialize the mutex:
- mutex_init(mutex);
-
-Acquire the mutex, uninterruptible:
- void mutex_lock(struct mutex *lock);
- void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
- int mutex_trylock(struct mutex *lock);
-
-Acquire the mutex, interruptible:
- int mutex_lock_interruptible_nested(struct mutex *lock,
- unsigned int subclass);
- int mutex_lock_interruptible(struct mutex *lock);
-
-Acquire the mutex, interruptible, if dec to 0:
- int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
-
-Unlock the mutex:
- void mutex_unlock(struct mutex *lock);
-
-Test if the mutex is taken:
- int mutex_is_locked(struct mutex *lock);
-
-Disadvantages
--------------
-
-Unlike its original design and purpose, 'struct mutex' is among the largest
-locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore'
-is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU
-cache and memory footprint.
-
-When to use mutexes
--------------------
-
-Unless the strict semantics of mutexes are unsuitable and/or the critical
-region prevents the lock from being shared, always prefer them to any other
-locking primitive.
diff --git a/Documentation/locking/rt-mutex-design.rst b/Documentation/locking/rt-mutex-design.rst
new file mode 100644
index 000000000000..59c2a64efb21
--- /dev/null
+++ b/Documentation/locking/rt-mutex-design.rst
@@ -0,0 +1,574 @@
+==============================
+RT-mutex implementation design
+==============================
+
+Copyright (c) 2006 Steven Rostedt
+
+Licensed under the GNU Free Documentation License, Version 1.2
+
+
+This document tries to describe the design of the rtmutex.c implementation.
+It doesn't describe the reasons why rtmutex.c exists. For that please see
+Documentation/locking/rt-mutex.rst. Although this document does explain problems
+that happen without this code, but that is in the concept to understand
+what the code actually is doing.
+
+The goal of this document is to help others understand the priority
+inheritance (PI) algorithm that is used, as well as reasons for the
+decisions that were made to implement PI in the manner that was done.
+
+
+Unbounded Priority Inversion
+----------------------------
+
+Priority inversion is when a lower priority process executes while a higher
+priority process wants to run. This happens for several reasons, and
+most of the time it can't be helped. Anytime a high priority process wants
+to use a resource that a lower priority process has (a mutex for example),
+the high priority process must wait until the lower priority process is done
+with the resource. This is a priority inversion. What we want to prevent
+is something called unbounded priority inversion. That is when the high
+priority process is prevented from running by a lower priority process for
+an undetermined amount of time.
+
+The classic example of unbounded priority inversion is where you have three
+processes, let's call them processes A, B, and C, where A is the highest
+priority process, C is the lowest, and B is in between. A tries to grab a lock
+that C owns and must wait and lets C run to release the lock. But in the
+meantime, B executes, and since B is of a higher priority than C, it preempts C,
+but by doing so, it is in fact preempting A which is a higher priority process.
+Now there's no way of knowing how long A will be sleeping waiting for C
+to release the lock, because for all we know, B is a CPU hog and will
+never give C a chance to release the lock. This is called unbounded priority
+inversion.
+
+Here's a little ASCII art to show the problem::
+
+ grab lock L1 (owned by C)
+ |
+ A ---+
+ C preempted by B
+ |
+ C +----+
+
+ B +-------->
+ B now keeps A from running.
+
+
+Priority Inheritance (PI)
+-------------------------
+
+There are several ways to solve this issue, but other ways are out of scope
+for this document. Here we only discuss PI.
+
+PI is where a process inherits the priority of another process if the other
+process blocks on a lock owned by the current process. To make this easier
+to understand, let's use the previous example, with processes A, B, and C again.
+
+This time, when A blocks on the lock owned by C, C would inherit the priority
+of A. So now if B becomes runnable, it would not preempt C, since C now has
+the high priority of A. As soon as C releases the lock, it loses its
+inherited priority, and A then can continue with the resource that C had.
+
+Terminology
+-----------
+
+Here I explain some terminology that is used in this document to help describe
+the design that is used to implement PI.
+
+PI chain
+ - The PI chain is an ordered series of locks and processes that cause
+ processes to inherit priorities from a previous process that is
+ blocked on one of its locks. This is described in more detail
+ later in this document.
+
+mutex
+ - In this document, to differentiate from locks that implement
+ PI and spin locks that are used in the PI code, from now on
+ the PI locks will be called a mutex.
+
+lock
+ - In this document from now on, I will use the term lock when
+ referring to spin locks that are used to protect parts of the PI
+ algorithm. These locks disable preemption for UP (when
+ CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
+ entering critical sections simultaneously.
+
+spin lock
+ - Same as lock above.
+
+waiter
+ - A waiter is a struct that is stored on the stack of a blocked
+ process. Since the scope of the waiter is within the code for
+ a process being blocked on the mutex, it is fine to allocate
+ the waiter on the process's stack (local variable). This
+ structure holds a pointer to the task, as well as the mutex that
+ the task is blocked on. It also has rbtree node structures to
+ place the task in the waiters rbtree of a mutex as well as the
+ pi_waiters rbtree of a mutex owner task (described below).
+
+ waiter is sometimes used in reference to the task that is waiting
+ on a mutex. This is the same as waiter->task.
+
+waiters
+ - A list of processes that are blocked on a mutex.
+
+top waiter
+ - The highest priority process waiting on a specific mutex.
+
+top pi waiter
+ - The highest priority process waiting on one of the mutexes
+ that a specific process owns.
+
+Note:
+ task and process are used interchangeably in this document, mostly to
+ differentiate between two processes that are being described together.
+
+
+PI chain
+--------
+
+The PI chain is a list of processes and mutexes that may cause priority
+inheritance to take place. Multiple chains may converge, but a chain
+would never diverge, since a process can't be blocked on more than one
+mutex at a time.
+
+Example::
+
+ Process: A, B, C, D, E
+ Mutexes: L1, L2, L3, L4
+
+ A owns: L1
+ B blocked on L1
+ B owns L2
+ C blocked on L2
+ C owns L3
+ D blocked on L3
+ D owns L4
+ E blocked on L4
+
+The chain would be::
+
+ E->L4->D->L3->C->L2->B->L1->A
+
+To show where two chains merge, we could add another process F and
+another mutex L5 where B owns L5 and F is blocked on mutex L5.
+
+The chain for F would be::
+
+ F->L5->B->L1->A
+
+Since a process may own more than one mutex, but never be blocked on more than
+one, the chains merge.
+
+Here we show both chains::
+
+ E->L4->D->L3->C->L2-+
+ |
+ +->B->L1->A
+ |
+ F->L5-+
+
+For PI to work, the processes at the right end of these chains (or we may
+also call it the Top of the chain) must be equal to or higher in priority
+than the processes to the left or below in the chain.
+
+Also since a mutex may have more than one process blocked on it, we can
+have multiple chains merge at mutexes. If we add another process G that is
+blocked on mutex L2::
+
+ G->L2->B->L1->A
+
+And once again, to show how this can grow I will show the merging chains
+again::
+
+ E->L4->D->L3->C-+
+ +->L2-+
+ | |
+ G-+ +->B->L1->A
+ |
+ F->L5-+
+
+If process G has the highest priority in the chain, then all the tasks up
+the chain (A and B in this example), must have their priorities increased
+to that of G.
+
+Mutex Waiters Tree
+------------------
+
+Every mutex keeps track of all the waiters that are blocked on itself. The
+mutex has a rbtree to store these waiters by priority. This tree is protected
+by a spin lock that is located in the struct of the mutex. This lock is called
+wait_lock.
+
+
+Task PI Tree
+------------
+
+To keep track of the PI chains, each process has its own PI rbtree. This is
+a tree of all top waiters of the mutexes that are owned by the process.
+Note that this tree only holds the top waiters and not all waiters that are
+blocked on mutexes owned by the process.
+
+The top of the task's PI tree is always the highest priority task that
+is waiting on a mutex that is owned by the task. So if the task has
+inherited a priority, it will always be the priority of the task that is
+at the top of this tree.
+
+This tree is stored in the task structure of a process as a rbtree called
+pi_waiters. It is protected by a spin lock also in the task structure,
+called pi_lock. This lock may also be taken in interrupt context, so when
+locking the pi_lock, interrupts must be disabled.
+
+
+Depth of the PI Chain
+---------------------
+
+The maximum depth of the PI chain is not dynamic, and could actually be
+defined. But is very complex to figure it out, since it depends on all
+the nesting of mutexes. Let's look at the example where we have 3 mutexes,
+L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
+The following shows a locking order of L1->L2->L3, but may not actually
+be directly nested that way::
+
+ void func1(void)
+ {
+ mutex_lock(L1);
+
+ /* do anything */
+
+ mutex_unlock(L1);
+ }
+
+ void func2(void)
+ {
+ mutex_lock(L1);
+ mutex_lock(L2);
+
+ /* do something */
+
+ mutex_unlock(L2);
+ mutex_unlock(L1);
+ }
+
+ void func3(void)
+ {
+ mutex_lock(L2);
+ mutex_lock(L3);
+
+ /* do something else */
+
+ mutex_unlock(L3);
+ mutex_unlock(L2);
+ }
+
+ void func4(void)
+ {
+ mutex_lock(L3);
+
+ /* do something again */
+
+ mutex_unlock(L3);
+ }
+
+Now we add 4 processes that run each of these functions separately.
+Processes A, B, C, and D which run functions func1, func2, func3 and func4
+respectively, and such that D runs first and A last. With D being preempted
+in func4 in the "do something again" area, we have a locking that follows::
+
+ D owns L3
+ C blocked on L3
+ C owns L2
+ B blocked on L2
+ B owns L1
+ A blocked on L1
+
+ And thus we have the chain A->L1->B->L2->C->L3->D.
+
+This gives us a PI depth of 4 (four processes), but looking at any of the
+functions individually, it seems as though they only have at most a locking
+depth of two. So, although the locking depth is defined at compile time,
+it still is very difficult to find the possibilities of that depth.
+
+Now since mutexes can be defined by user-land applications, we don't want a DOS
+type of application that nests large amounts of mutexes to create a large
+PI chain, and have the code holding spin locks while looking at a large
+amount of data. So to prevent this, the implementation not only implements
+a maximum lock depth, but also only holds at most two different locks at a
+time, as it walks the PI chain. More about this below.
+
+
+Mutex owner and flags
+---------------------
+
+The mutex structure contains a pointer to the owner of the mutex. If the
+mutex is not owned, this owner is set to NULL. Since all architectures
+have the task structure on at least a two byte alignment (and if this is
+not true, the rtmutex.c code will be broken!), this allows for the least
+significant bit to be used as a flag. Bit 0 is used as the "Has Waiters"
+flag. It's set whenever there are waiters on a mutex.
+
+See Documentation/locking/rt-mutex.rst for further details.
+
+cmpxchg Tricks
+--------------
+
+Some architectures implement an atomic cmpxchg (Compare and Exchange). This
+is used (when applicable) to keep the fast path of grabbing and releasing
+mutexes short.
+
+cmpxchg is basically the following function performed atomically::
+
+ unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
+ {
+ unsigned long T = *A;
+ if (*A == *B) {
+ *A = *C;
+ }
+ return T;
+ }
+ #define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
+
+This is really nice to have, since it allows you to only update a variable
+if the variable is what you expect it to be. You know if it succeeded if
+the return value (the old value of A) is equal to B.
+
+The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
+the architecture does not support CMPXCHG, then this macro is simply set
+to fail every time. But if CMPXCHG is supported, then this will
+help out extremely to keep the fast path short.
+
+The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
+the system for architectures that support it. This will also be explained
+later in this document.
+
+
+Priority adjustments
+--------------------
+
+The implementation of the PI code in rtmutex.c has several places that a
+process must adjust its priority. With the help of the pi_waiters of a
+process this is rather easy to know what needs to be adjusted.
+
+The functions implementing the task adjustments are rt_mutex_adjust_prio
+and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
+
+rt_mutex_adjust_prio examines the priority of the task, and the highest
+priority process that is waiting any of mutexes owned by the task. Since
+the pi_waiters of a task holds an order by priority of all the top waiters
+of all the mutexes that the task owns, we simply need to compare the top
+pi waiter to its own normal/deadline priority and take the higher one.
+Then rt_mutex_setprio is called to adjust the priority of the task to the
+new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
+to implement the actual change in priority.
+
+Note:
+ For the "prio" field in task_struct, the lower the number, the
+ higher the priority. A "prio" of 5 is of higher priority than a
+ "prio" of 10.
+
+It is interesting to note that rt_mutex_adjust_prio can either increase
+or decrease the priority of the task. In the case that a higher priority
+process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
+would increase/boost the task's priority. But if a higher priority task
+were for some reason to leave the mutex (timeout or signal), this same function
+would decrease/unboost the priority of the task. That is because the pi_waiters
+always contains the highest priority task that is waiting on a mutex owned
+by the task, so we only need to compare the priority of that top pi waiter
+to the normal priority of the given task.
+
+
+High level overview of the PI chain walk
+----------------------------------------
+
+The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
+
+The implementation has gone through several iterations, and has ended up
+with what we believe is the best. It walks the PI chain by only grabbing
+at most two locks at a time, and is very efficient.
+
+The rt_mutex_adjust_prio_chain can be used either to boost or lower process
+priorities.
+
+rt_mutex_adjust_prio_chain is called with a task to be checked for PI
+(de)boosting (the owner of a mutex that a process is blocking on), a flag to
+check for deadlocking, the mutex that the task owns, a pointer to a waiter
+that is the process's waiter struct that is blocked on the mutex (although this
+parameter may be NULL for deboosting), a pointer to the mutex on which the task
+is blocked, and a top_task as the top waiter of the mutex.
+
+For this explanation, I will not mention deadlock detection. This explanation
+will try to stay at a high level.
+
+When this function is called, there are no locks held. That also means
+that the state of the owner and lock can change when entered into this function.
+
+Before this function is called, the task has already had rt_mutex_adjust_prio
+performed on it. This means that the task is set to the priority that it
+should be at, but the rbtree nodes of the task's waiter have not been updated
+with the new priorities, and this task may not be in the proper locations
+in the pi_waiters and waiters trees that the task is blocked on. This function
+solves all that.
+
+The main operation of this function is summarized by Thomas Gleixner in
+rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
+details.
+
+Taking of a mutex (The walk through)
+------------------------------------
+
+OK, now let's take a look at the detailed walk through of what happens when
+taking a mutex.
+
+The first thing that is tried is the fast taking of the mutex. This is
+done when we have CMPXCHG enabled (otherwise the fast taking automatically
+fails). Only when the owner field of the mutex is NULL can the lock be
+taken with the CMPXCHG and nothing else needs to be done.
+
+If there is contention on the lock, we go about the slow path
+(rt_mutex_slowlock).
+
+The slow path function is where the task's waiter structure is created on
+the stack. This is because the waiter structure is only needed for the
+scope of this function. The waiter structure holds the nodes to store
+the task on the waiters tree of the mutex, and if need be, the pi_waiters
+tree of the owner.
+
+The wait_lock of the mutex is taken since the slow path of unlocking the
+mutex also takes this lock.
+
+We then call try_to_take_rt_mutex. This is where the architecture that
+does not implement CMPXCHG would always grab the lock (if there's no
+contention).
+
+try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
+slow path. The first thing that is done here is an atomic setting of
+the "Has Waiters" flag of the mutex's owner field. By setting this flag
+now, the current owner of the mutex being contended for can't release the mutex
+without going into the slow unlock path, and it would then need to grab the
+wait_lock, which this code currently holds. So setting the "Has Waiters" flag
+forces the current owner to synchronize with this code.
+
+The lock is taken if the following are true:
+
+ 1) The lock has no owner
+ 2) The current task is the highest priority against all other
+ waiters of the lock
+
+If the task succeeds to acquire the lock, then the task is set as the
+owner of the lock, and if the lock still has waiters, the top_waiter
+(highest priority task waiting on the lock) is added to this task's
+pi_waiters tree.
+
+If the lock is not taken by try_to_take_rt_mutex(), then the
+task_blocks_on_rt_mutex() function is called. This will add the task to
+the lock's waiter tree and propagate the pi chain of the lock as well
+as the lock's owner's pi_waiters tree. This is described in the next
+section.
+
+Task blocks on mutex
+--------------------
+
+The accounting of a mutex and process is done with the waiter structure of
+the process. The "task" field is set to the process, and the "lock" field
+to the mutex. The rbtree node of waiter are initialized to the processes
+current priority.
+
+Since the wait_lock was taken at the entry of the slow lock, we can safely
+add the waiter to the task waiter tree. If the current process is the
+highest priority process currently waiting on this mutex, then we remove the
+previous top waiter process (if it exists) from the pi_waiters of the owner,
+and add the current process to that tree. Since the pi_waiter of the owner
+has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
+should adjust its priority accordingly.
+
+If the owner is also blocked on a lock, and had its pi_waiters changed
+(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
+and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
+
+Now all locks are released, and if the current process is still blocked on a
+mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
+
+Waking up in the loop
+---------------------
+
+The task can then wake up for a couple of reasons:
+ 1) The previous lock owner released the lock, and the task now is top_waiter
+ 2) we received a signal or timeout
+
+In both cases, the task will try again to acquire the lock. If it
+does, then it will take itself off the waiters tree and set itself back
+to the TASK_RUNNING state.
+
+In first case, if the lock was acquired by another task before this task
+could get the lock, then it will go back to sleep and wait to be woken again.
+
+The second case is only applicable for tasks that are grabbing a mutex
+that can wake up before getting the lock, either due to a signal or
+a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
+take the lock again, if it succeeds, then the task will return with the
+lock held, otherwise it will return with -EINTR if the task was woken
+by a signal, or -ETIMEDOUT if it timed out.
+
+
+Unlocking the Mutex
+-------------------
+
+The unlocking of a mutex also has a fast path for those architectures with
+CMPXCHG. Since the taking of a mutex on contention always sets the
+"Has Waiters" flag of the mutex's owner, we use this to know if we need to
+take the slow path when unlocking the mutex. If the mutex doesn't have any
+waiters, the owner field of the mutex would equal the current process and
+the mutex can be unlocked by just replacing the owner field with NULL.
+
+If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
+the slow unlock path is taken.
+
+The first thing done in the slow unlock path is to take the wait_lock of the
+mutex. This synchronizes the locking and unlocking of the mutex.
+
+A check is made to see if the mutex has waiters or not. On architectures that
+do not have CMPXCHG, this is the location that the owner of the mutex will
+determine if a waiter needs to be awoken or not. On architectures that
+do have CMPXCHG, that check is done in the fast path, but it is still needed
+in the slow path too. If a waiter of a mutex woke up because of a signal
+or timeout between the time the owner failed the fast path CMPXCHG check and
+the grabbing of the wait_lock, the mutex may not have any waiters, thus the
+owner still needs to make this check. If there are no waiters then the mutex
+owner field is set to NULL, the wait_lock is released and nothing more is
+needed.
+
+If there are waiters, then we need to wake one up.
+
+On the wake up code, the pi_lock of the current owner is taken. The top
+waiter of the lock is found and removed from the waiters tree of the mutex
+as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
+marked to prevent lower priority tasks from stealing the lock.
+
+Finally we unlock the pi_lock of the pending owner and wake it up.
+
+
+Contact
+-------
+
+For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
+
+
+Credits
+-------
+
+Author: Steven Rostedt <rostedt@goodmis.org>
+
+Updated: Alex Shi <alex.shi@linaro.org> - 7/6/2017
+
+Original Reviewers:
+ Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
+ Randy Dunlap
+
+Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
+
+Updates
+-------
+
+This document was originally written for 2.6.17-rc3-mm1
+was updated on 4.12
diff --git a/Documentation/locking/rt-mutex-design.txt b/Documentation/locking/rt-mutex-design.txt
deleted file mode 100644
index 3d7b865539cc..000000000000
--- a/Documentation/locking/rt-mutex-design.txt
+++ /dev/null
@@ -1,559 +0,0 @@
-#
-# Copyright (c) 2006 Steven Rostedt
-# Licensed under the GNU Free Documentation License, Version 1.2
-#
-
-RT-mutex implementation design
-------------------------------
-
-This document tries to describe the design of the rtmutex.c implementation.
-It doesn't describe the reasons why rtmutex.c exists. For that please see
-Documentation/locking/rt-mutex.txt. Although this document does explain problems
-that happen without this code, but that is in the concept to understand
-what the code actually is doing.
-
-The goal of this document is to help others understand the priority
-inheritance (PI) algorithm that is used, as well as reasons for the
-decisions that were made to implement PI in the manner that was done.
-
-
-Unbounded Priority Inversion
-----------------------------
-
-Priority inversion is when a lower priority process executes while a higher
-priority process wants to run. This happens for several reasons, and
-most of the time it can't be helped. Anytime a high priority process wants
-to use a resource that a lower priority process has (a mutex for example),
-the high priority process must wait until the lower priority process is done
-with the resource. This is a priority inversion. What we want to prevent
-is something called unbounded priority inversion. That is when the high
-priority process is prevented from running by a lower priority process for
-an undetermined amount of time.
-
-The classic example of unbounded priority inversion is where you have three
-processes, let's call them processes A, B, and C, where A is the highest
-priority process, C is the lowest, and B is in between. A tries to grab a lock
-that C owns and must wait and lets C run to release the lock. But in the
-meantime, B executes, and since B is of a higher priority than C, it preempts C,
-but by doing so, it is in fact preempting A which is a higher priority process.
-Now there's no way of knowing how long A will be sleeping waiting for C
-to release the lock, because for all we know, B is a CPU hog and will
-never give C a chance to release the lock. This is called unbounded priority
-inversion.
-
-Here's a little ASCII art to show the problem.
-
- grab lock L1 (owned by C)
- |
-A ---+
- C preempted by B
- |
-C +----+
-
-B +-------->
- B now keeps A from running.
-
-
-Priority Inheritance (PI)
--------------------------
-
-There are several ways to solve this issue, but other ways are out of scope
-for this document. Here we only discuss PI.
-
-PI is where a process inherits the priority of another process if the other
-process blocks on a lock owned by the current process. To make this easier
-to understand, let's use the previous example, with processes A, B, and C again.
-
-This time, when A blocks on the lock owned by C, C would inherit the priority
-of A. So now if B becomes runnable, it would not preempt C, since C now has
-the high priority of A. As soon as C releases the lock, it loses its
-inherited priority, and A then can continue with the resource that C had.
-
-Terminology
------------
-
-Here I explain some terminology that is used in this document to help describe
-the design that is used to implement PI.
-
-PI chain - The PI chain is an ordered series of locks and processes that cause
- processes to inherit priorities from a previous process that is
- blocked on one of its locks. This is described in more detail
- later in this document.
-
-mutex - In this document, to differentiate from locks that implement
- PI and spin locks that are used in the PI code, from now on
- the PI locks will be called a mutex.
-
-lock - In this document from now on, I will use the term lock when
- referring to spin locks that are used to protect parts of the PI
- algorithm. These locks disable preemption for UP (when
- CONFIG_PREEMPT is enabled) and on SMP prevents multiple CPUs from
- entering critical sections simultaneously.
-
-spin lock - Same as lock above.
-
-waiter - A waiter is a struct that is stored on the stack of a blocked
- process. Since the scope of the waiter is within the code for
- a process being blocked on the mutex, it is fine to allocate
- the waiter on the process's stack (local variable). This
- structure holds a pointer to the task, as well as the mutex that
- the task is blocked on. It also has rbtree node structures to
- place the task in the waiters rbtree of a mutex as well as the
- pi_waiters rbtree of a mutex owner task (described below).
-
- waiter is sometimes used in reference to the task that is waiting
- on a mutex. This is the same as waiter->task.
-
-waiters - A list of processes that are blocked on a mutex.
-
-top waiter - The highest priority process waiting on a specific mutex.
-
-top pi waiter - The highest priority process waiting on one of the mutexes
- that a specific process owns.
-
-Note: task and process are used interchangeably in this document, mostly to
- differentiate between two processes that are being described together.
-
-
-PI chain
---------
-
-The PI chain is a list of processes and mutexes that may cause priority
-inheritance to take place. Multiple chains may converge, but a chain
-would never diverge, since a process can't be blocked on more than one
-mutex at a time.
-
-Example:
-
- Process: A, B, C, D, E
- Mutexes: L1, L2, L3, L4
-
- A owns: L1
- B blocked on L1
- B owns L2
- C blocked on L2
- C owns L3
- D blocked on L3
- D owns L4
- E blocked on L4
-
-The chain would be:
-
- E->L4->D->L3->C->L2->B->L1->A
-
-To show where two chains merge, we could add another process F and
-another mutex L5 where B owns L5 and F is blocked on mutex L5.
-
-The chain for F would be:
-
- F->L5->B->L1->A
-
-Since a process may own more than one mutex, but never be blocked on more than
-one, the chains merge.
-
-Here we show both chains:
-
- E->L4->D->L3->C->L2-+
- |
- +->B->L1->A
- |
- F->L5-+
-
-For PI to work, the processes at the right end of these chains (or we may
-also call it the Top of the chain) must be equal to or higher in priority
-than the processes to the left or below in the chain.
-
-Also since a mutex may have more than one process blocked on it, we can
-have multiple chains merge at mutexes. If we add another process G that is
-blocked on mutex L2:
-
- G->L2->B->L1->A
-
-And once again, to show how this can grow I will show the merging chains
-again.
-
- E->L4->D->L3->C-+
- +->L2-+
- | |
- G-+ +->B->L1->A
- |
- F->L5-+
-
-If process G has the highest priority in the chain, then all the tasks up
-the chain (A and B in this example), must have their priorities increased
-to that of G.
-
-Mutex Waiters Tree
------------------
-
-Every mutex keeps track of all the waiters that are blocked on itself. The
-mutex has a rbtree to store these waiters by priority. This tree is protected
-by a spin lock that is located in the struct of the mutex. This lock is called
-wait_lock.
-
-
-Task PI Tree
-------------
-
-To keep track of the PI chains, each process has its own PI rbtree. This is
-a tree of all top waiters of the mutexes that are owned by the process.
-Note that this tree only holds the top waiters and not all waiters that are
-blocked on mutexes owned by the process.
-
-The top of the task's PI tree is always the highest priority task that
-is waiting on a mutex that is owned by the task. So if the task has
-inherited a priority, it will always be the priority of the task that is
-at the top of this tree.
-
-This tree is stored in the task structure of a process as a rbtree called
-pi_waiters. It is protected by a spin lock also in the task structure,
-called pi_lock. This lock may also be taken in interrupt context, so when
-locking the pi_lock, interrupts must be disabled.
-
-
-Depth of the PI Chain
----------------------
-
-The maximum depth of the PI chain is not dynamic, and could actually be
-defined. But is very complex to figure it out, since it depends on all
-the nesting of mutexes. Let's look at the example where we have 3 mutexes,
-L1, L2, and L3, and four separate functions func1, func2, func3 and func4.
-The following shows a locking order of L1->L2->L3, but may not actually
-be directly nested that way.
-
-void func1(void)
-{
- mutex_lock(L1);
-
- /* do anything */
-
- mutex_unlock(L1);
-}
-
-void func2(void)
-{
- mutex_lock(L1);
- mutex_lock(L2);
-
- /* do something */
-
- mutex_unlock(L2);
- mutex_unlock(L1);
-}
-
-void func3(void)
-{
- mutex_lock(L2);
- mutex_lock(L3);
-
- /* do something else */
-
- mutex_unlock(L3);
- mutex_unlock(L2);
-}
-
-void func4(void)
-{
- mutex_lock(L3);
-
- /* do something again */
-
- mutex_unlock(L3);
-}
-
-Now we add 4 processes that run each of these functions separately.
-Processes A, B, C, and D which run functions func1, func2, func3 and func4
-respectively, and such that D runs first and A last. With D being preempted
-in func4 in the "do something again" area, we have a locking that follows:
-
-D owns L3
- C blocked on L3
- C owns L2
- B blocked on L2
- B owns L1
- A blocked on L1
-
-And thus we have the chain A->L1->B->L2->C->L3->D.
-
-This gives us a PI depth of 4 (four processes), but looking at any of the
-functions individually, it seems as though they only have at most a locking
-depth of two. So, although the locking depth is defined at compile time,
-it still is very difficult to find the possibilities of that depth.
-
-Now since mutexes can be defined by user-land applications, we don't want a DOS
-type of application that nests large amounts of mutexes to create a large
-PI chain, and have the code holding spin locks while looking at a large
-amount of data. So to prevent this, the implementation not only implements
-a maximum lock depth, but also only holds at most two different locks at a
-time, as it walks the PI chain. More about this below.
-
-
-Mutex owner and flags
----------------------
-
-The mutex structure contains a pointer to the owner of the mutex. If the
-mutex is not owned, this owner is set to NULL. Since all architectures
-have the task structure on at least a two byte alignment (and if this is
-not true, the rtmutex.c code will be broken!), this allows for the least
-significant bit to be used as a flag. Bit 0 is used as the "Has Waiters"
-flag. It's set whenever there are waiters on a mutex.
-
-See Documentation/locking/rt-mutex.txt for further details.
-
-cmpxchg Tricks
---------------
-
-Some architectures implement an atomic cmpxchg (Compare and Exchange). This
-is used (when applicable) to keep the fast path of grabbing and releasing
-mutexes short.
-
-cmpxchg is basically the following function performed atomically:
-
-unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
-{
- unsigned long T = *A;
- if (*A == *B) {
- *A = *C;
- }
- return T;
-}
-#define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
-
-This is really nice to have, since it allows you to only update a variable
-if the variable is what you expect it to be. You know if it succeeded if
-the return value (the old value of A) is equal to B.
-
-The macro rt_mutex_cmpxchg is used to try to lock and unlock mutexes. If
-the architecture does not support CMPXCHG, then this macro is simply set
-to fail every time. But if CMPXCHG is supported, then this will
-help out extremely to keep the fast path short.
-
-The use of rt_mutex_cmpxchg with the flags in the owner field help optimize
-the system for architectures that support it. This will also be explained
-later in this document.
-
-
-Priority adjustments
---------------------
-
-The implementation of the PI code in rtmutex.c has several places that a
-process must adjust its priority. With the help of the pi_waiters of a
-process this is rather easy to know what needs to be adjusted.
-
-The functions implementing the task adjustments are rt_mutex_adjust_prio
-and rt_mutex_setprio. rt_mutex_setprio is only used in rt_mutex_adjust_prio.
-
-rt_mutex_adjust_prio examines the priority of the task, and the highest
-priority process that is waiting any of mutexes owned by the task. Since
-the pi_waiters of a task holds an order by priority of all the top waiters
-of all the mutexes that the task owns, we simply need to compare the top
-pi waiter to its own normal/deadline priority and take the higher one.
-Then rt_mutex_setprio is called to adjust the priority of the task to the
-new priority. Note that rt_mutex_setprio is defined in kernel/sched/core.c
-to implement the actual change in priority.
-
-(Note: For the "prio" field in task_struct, the lower the number, the
- higher the priority. A "prio" of 5 is of higher priority than a
- "prio" of 10.)
-
-It is interesting to note that rt_mutex_adjust_prio can either increase
-or decrease the priority of the task. In the case that a higher priority
-process has just blocked on a mutex owned by the task, rt_mutex_adjust_prio
-would increase/boost the task's priority. But if a higher priority task
-were for some reason to leave the mutex (timeout or signal), this same function
-would decrease/unboost the priority of the task. That is because the pi_waiters
-always contains the highest priority task that is waiting on a mutex owned
-by the task, so we only need to compare the priority of that top pi waiter
-to the normal priority of the given task.
-
-
-High level overview of the PI chain walk
-----------------------------------------
-
-The PI chain walk is implemented by the function rt_mutex_adjust_prio_chain.
-
-The implementation has gone through several iterations, and has ended up
-with what we believe is the best. It walks the PI chain by only grabbing
-at most two locks at a time, and is very efficient.
-
-The rt_mutex_adjust_prio_chain can be used either to boost or lower process
-priorities.
-
-rt_mutex_adjust_prio_chain is called with a task to be checked for PI
-(de)boosting (the owner of a mutex that a process is blocking on), a flag to
-check for deadlocking, the mutex that the task owns, a pointer to a waiter
-that is the process's waiter struct that is blocked on the mutex (although this
-parameter may be NULL for deboosting), a pointer to the mutex on which the task
-is blocked, and a top_task as the top waiter of the mutex.
-
-For this explanation, I will not mention deadlock detection. This explanation
-will try to stay at a high level.
-
-When this function is called, there are no locks held. That also means
-that the state of the owner and lock can change when entered into this function.
-
-Before this function is called, the task has already had rt_mutex_adjust_prio
-performed on it. This means that the task is set to the priority that it
-should be at, but the rbtree nodes of the task's waiter have not been updated
-with the new priorities, and this task may not be in the proper locations
-in the pi_waiters and waiters trees that the task is blocked on. This function
-solves all that.
-
-The main operation of this function is summarized by Thomas Gleixner in
-rtmutex.c. See the 'Chain walk basics and protection scope' comment for further
-details.
-
-Taking of a mutex (The walk through)
-------------------------------------
-
-OK, now let's take a look at the detailed walk through of what happens when
-taking a mutex.
-
-The first thing that is tried is the fast taking of the mutex. This is
-done when we have CMPXCHG enabled (otherwise the fast taking automatically
-fails). Only when the owner field of the mutex is NULL can the lock be
-taken with the CMPXCHG and nothing else needs to be done.
-
-If there is contention on the lock, we go about the slow path
-(rt_mutex_slowlock).
-
-The slow path function is where the task's waiter structure is created on
-the stack. This is because the waiter structure is only needed for the
-scope of this function. The waiter structure holds the nodes to store
-the task on the waiters tree of the mutex, and if need be, the pi_waiters
-tree of the owner.
-
-The wait_lock of the mutex is taken since the slow path of unlocking the
-mutex also takes this lock.
-
-We then call try_to_take_rt_mutex. This is where the architecture that
-does not implement CMPXCHG would always grab the lock (if there's no
-contention).
-
-try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
-slow path. The first thing that is done here is an atomic setting of
-the "Has Waiters" flag of the mutex's owner field. By setting this flag
-now, the current owner of the mutex being contended for can't release the mutex
-without going into the slow unlock path, and it would then need to grab the
-wait_lock, which this code currently holds. So setting the "Has Waiters" flag
-forces the current owner to synchronize with this code.
-
-The lock is taken if the following are true:
- 1) The lock has no owner
- 2) The current task is the highest priority against all other
- waiters of the lock
-
-If the task succeeds to acquire the lock, then the task is set as the
-owner of the lock, and if the lock still has waiters, the top_waiter
-(highest priority task waiting on the lock) is added to this task's
-pi_waiters tree.
-
-If the lock is not taken by try_to_take_rt_mutex(), then the
-task_blocks_on_rt_mutex() function is called. This will add the task to
-the lock's waiter tree and propagate the pi chain of the lock as well
-as the lock's owner's pi_waiters tree. This is described in the next
-section.
-
-Task blocks on mutex
---------------------
-
-The accounting of a mutex and process is done with the waiter structure of
-the process. The "task" field is set to the process, and the "lock" field
-to the mutex. The rbtree node of waiter are initialized to the processes
-current priority.
-
-Since the wait_lock was taken at the entry of the slow lock, we can safely
-add the waiter to the task waiter tree. If the current process is the
-highest priority process currently waiting on this mutex, then we remove the
-previous top waiter process (if it exists) from the pi_waiters of the owner,
-and add the current process to that tree. Since the pi_waiter of the owner
-has changed, we call rt_mutex_adjust_prio on the owner to see if the owner
-should adjust its priority accordingly.
-
-If the owner is also blocked on a lock, and had its pi_waiters changed
-(or deadlock checking is on), we unlock the wait_lock of the mutex and go ahead
-and run rt_mutex_adjust_prio_chain on the owner, as described earlier.
-
-Now all locks are released, and if the current process is still blocked on a
-mutex (waiter "task" field is not NULL), then we go to sleep (call schedule).
-
-Waking up in the loop
----------------------
-
-The task can then wake up for a couple of reasons:
- 1) The previous lock owner released the lock, and the task now is top_waiter
- 2) we received a signal or timeout
-
-In both cases, the task will try again to acquire the lock. If it
-does, then it will take itself off the waiters tree and set itself back
-to the TASK_RUNNING state.
-
-In first case, if the lock was acquired by another task before this task
-could get the lock, then it will go back to sleep and wait to be woken again.
-
-The second case is only applicable for tasks that are grabbing a mutex
-that can wake up before getting the lock, either due to a signal or
-a timeout (i.e. rt_mutex_timed_futex_lock()). When woken, it will try to
-take the lock again, if it succeeds, then the task will return with the
-lock held, otherwise it will return with -EINTR if the task was woken
-by a signal, or -ETIMEDOUT if it timed out.
-
-
-Unlocking the Mutex
--------------------
-
-The unlocking of a mutex also has a fast path for those architectures with
-CMPXCHG. Since the taking of a mutex on contention always sets the
-"Has Waiters" flag of the mutex's owner, we use this to know if we need to
-take the slow path when unlocking the mutex. If the mutex doesn't have any
-waiters, the owner field of the mutex would equal the current process and
-the mutex can be unlocked by just replacing the owner field with NULL.
-
-If the owner field has the "Has Waiters" bit set (or CMPXCHG is not available),
-the slow unlock path is taken.
-
-The first thing done in the slow unlock path is to take the wait_lock of the
-mutex. This synchronizes the locking and unlocking of the mutex.
-
-A check is made to see if the mutex has waiters or not. On architectures that
-do not have CMPXCHG, this is the location that the owner of the mutex will
-determine if a waiter needs to be awoken or not. On architectures that
-do have CMPXCHG, that check is done in the fast path, but it is still needed
-in the slow path too. If a waiter of a mutex woke up because of a signal
-or timeout between the time the owner failed the fast path CMPXCHG check and
-the grabbing of the wait_lock, the mutex may not have any waiters, thus the
-owner still needs to make this check. If there are no waiters then the mutex
-owner field is set to NULL, the wait_lock is released and nothing more is
-needed.
-
-If there are waiters, then we need to wake one up.
-
-On the wake up code, the pi_lock of the current owner is taken. The top
-waiter of the lock is found and removed from the waiters tree of the mutex
-as well as the pi_waiters tree of the current owner. The "Has Waiters" bit is
-marked to prevent lower priority tasks from stealing the lock.
-
-Finally we unlock the pi_lock of the pending owner and wake it up.
-
-
-Contact
--------
-
-For updates on this document, please email Steven Rostedt <rostedt@goodmis.org>
-
-
-Credits
--------
-
-Author: Steven Rostedt <rostedt@goodmis.org>
-Updated: Alex Shi <alex.shi@linaro.org> - 7/6/2017
-
-Original Reviewers: Ingo Molnar, Thomas Gleixner, Thomas Duetsch, and
- Randy Dunlap
-Update (7/6/2017) Reviewers: Steven Rostedt and Sebastian Siewior
-
-Updates
--------
-
-This document was originally written for 2.6.17-rc3-mm1
-was updated on 4.12
diff --git a/Documentation/locking/rt-mutex.rst b/Documentation/locking/rt-mutex.rst
new file mode 100644
index 000000000000..c365dc302081
--- /dev/null
+++ b/Documentation/locking/rt-mutex.rst
@@ -0,0 +1,77 @@
+==================================
+RT-mutex subsystem with PI support
+==================================
+
+RT-mutexes with priority inheritance are used to support PI-futexes,
+which enable pthread_mutex_t priority inheritance attributes
+(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
+about PI-futexes.]
+
+This technology was developed in the -rt tree and streamlined for
+pthread_mutex support.
+
+Basic principles:
+-----------------
+
+RT-mutexes extend the semantics of simple mutexes by the priority
+inheritance protocol.
+
+A low priority owner of a rt-mutex inherits the priority of a higher
+priority waiter until the rt-mutex is released. If the temporarily
+boosted owner blocks on a rt-mutex itself it propagates the priority
+boosting to the owner of the other rt_mutex it gets blocked on. The
+priority boosting is immediately removed once the rt_mutex has been
+unlocked.
+
+This approach allows us to shorten the block of high-prio tasks on
+mutexes which protect shared resources. Priority inheritance is not a
+magic bullet for poorly designed applications, but it allows
+well-designed applications to use userspace locks in critical parts of
+an high priority thread, without losing determinism.
+
+The enqueueing of the waiters into the rtmutex waiter tree is done in
+priority order. For same priorities FIFO order is chosen. For each
+rtmutex, only the top priority waiter is enqueued into the owner's
+priority waiters tree. This tree too queues in priority order. Whenever
+the top priority waiter of a task changes (for example it timed out or
+got a signal), the priority of the owner task is readjusted. The
+priority enqueueing is handled by "pi_waiters".
+
+RT-mutexes are optimized for fastpath operations and have no internal
+locking overhead when locking an uncontended mutex or unlocking a mutex
+without waiters. The optimized fastpath operations require cmpxchg
+support. [If that is not available then the rt-mutex internal spinlock
+is used]
+
+The state of the rt-mutex is tracked via the owner field of the rt-mutex
+structure:
+
+lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
+keep track of the "lock has waiters" state:
+
+ ============ ======= ================================================
+ owner bit0 Notes
+ ============ ======= ================================================
+ NULL 0 lock is free (fast acquire possible)
+ NULL 1 lock is free and has waiters and the top waiter
+ is going to take the lock [1]_
+ taskpointer 0 lock is held (fast release possible)
+ taskpointer 1 lock is held and has waiters [2]_
+ ============ ======= ================================================
+
+The fast atomic compare exchange based acquire and release is only
+possible when bit 0 of lock->owner is 0.
+
+.. [1] It also can be a transitional state when grabbing the lock
+ with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+ we need to set the bit0 before looking at the lock, and the owner may
+ be NULL in this small time, hence this can be a transitional state.
+
+.. [2] There is a small time when bit 0 is set but there are no
+ waiters. This can happen when grabbing the lock in the slow path.
+ To prevent a cmpxchg of the owner releasing the lock, we need to
+ set this bit before looking at the lock.
+
+BTW, there is still technically a "Pending Owner", it's just not called
+that anymore. The pending owner happens to be the top_waiter of a lock
+that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/rt-mutex.txt b/Documentation/locking/rt-mutex.txt
deleted file mode 100644
index 35793e003041..000000000000
--- a/Documentation/locking/rt-mutex.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-RT-mutex subsystem with PI support
-----------------------------------
-
-RT-mutexes with priority inheritance are used to support PI-futexes,
-which enable pthread_mutex_t priority inheritance attributes
-(PTHREAD_PRIO_INHERIT). [See Documentation/pi-futex.txt for more details
-about PI-futexes.]
-
-This technology was developed in the -rt tree and streamlined for
-pthread_mutex support.
-
-Basic principles:
------------------
-
-RT-mutexes extend the semantics of simple mutexes by the priority
-inheritance protocol.
-
-A low priority owner of a rt-mutex inherits the priority of a higher
-priority waiter until the rt-mutex is released. If the temporarily
-boosted owner blocks on a rt-mutex itself it propagates the priority
-boosting to the owner of the other rt_mutex it gets blocked on. The
-priority boosting is immediately removed once the rt_mutex has been
-unlocked.
-
-This approach allows us to shorten the block of high-prio tasks on
-mutexes which protect shared resources. Priority inheritance is not a
-magic bullet for poorly designed applications, but it allows
-well-designed applications to use userspace locks in critical parts of
-an high priority thread, without losing determinism.
-
-The enqueueing of the waiters into the rtmutex waiter tree is done in
-priority order. For same priorities FIFO order is chosen. For each
-rtmutex, only the top priority waiter is enqueued into the owner's
-priority waiters tree. This tree too queues in priority order. Whenever
-the top priority waiter of a task changes (for example it timed out or
-got a signal), the priority of the owner task is readjusted. The
-priority enqueueing is handled by "pi_waiters".
-
-RT-mutexes are optimized for fastpath operations and have no internal
-locking overhead when locking an uncontended mutex or unlocking a mutex
-without waiters. The optimized fastpath operations require cmpxchg
-support. [If that is not available then the rt-mutex internal spinlock
-is used]
-
-The state of the rt-mutex is tracked via the owner field of the rt-mutex
-structure:
-
-lock->owner holds the task_struct pointer of the owner. Bit 0 is used to
-keep track of the "lock has waiters" state.
-
- owner bit0
- NULL 0 lock is free (fast acquire possible)
- NULL 1 lock is free and has waiters and the top waiter
- is going to take the lock*
- taskpointer 0 lock is held (fast release possible)
- taskpointer 1 lock is held and has waiters**
-
-The fast atomic compare exchange based acquire and release is only
-possible when bit 0 of lock->owner is 0.
-
-(*) It also can be a transitional state when grabbing the lock
-with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
-we need to set the bit0 before looking at the lock, and the owner may be
-NULL in this small time, hence this can be a transitional state.
-
-(**) There is a small time when bit 0 is set but there are no
-waiters. This can happen when grabbing the lock in the slow path.
-To prevent a cmpxchg of the owner releasing the lock, we need to
-set this bit before looking at the lock.
-
-BTW, there is still technically a "Pending Owner", it's just not called
-that anymore. The pending owner happens to be the top_waiter of a lock
-that has no owner and has been woken up to grab the lock.
diff --git a/Documentation/locking/spinlocks.rst b/Documentation/locking/spinlocks.rst
new file mode 100644
index 000000000000..098107fb7d86
--- /dev/null
+++ b/Documentation/locking/spinlocks.rst
@@ -0,0 +1,177 @@
+===============
+Locking lessons
+===============
+
+Lesson 1: Spin locks
+====================
+
+The most basic primitive for locking is spinlock::
+
+ static DEFINE_SPINLOCK(xxx_lock);
+
+ unsigned long flags;
+
+ spin_lock_irqsave(&xxx_lock, flags);
+ ... critical section here ..
+ spin_unlock_irqrestore(&xxx_lock, flags);
+
+The above is always safe. It will disable interrupts _locally_, but the
+spinlock itself will guarantee the global lock, so it will guarantee that
+there is only one thread-of-control within the region(s) protected by that
+lock. This works well even under UP also, so the code does _not_ need to
+worry about UP vs SMP issues: the spinlocks work correctly under both.
+
+ NOTE! Implications of spin_locks for memory are further described in:
+
+ Documentation/memory-barriers.txt
+
+ (5) LOCK operations.
+
+ (6) UNLOCK operations.
+
+The above is usually pretty simple (you usually need and want only one
+spinlock for most things - using more than one spinlock can make things a
+lot more complex and even slower and is usually worth it only for
+sequences that you **know** need to be split up: avoid it at all cost if you
+aren't sure).
+
+This is really the only really hard part about spinlocks: once you start
+using spinlocks they tend to expand to areas you might not have noticed
+before, because you have to make sure the spinlocks correctly protect the
+shared data structures **everywhere** they are used. The spinlocks are most
+easily added to places that are completely independent of other code (for
+example, internal driver data structures that nobody else ever touches).
+
+ NOTE! The spin-lock is safe only when you **also** use the lock itself
+ to do locking across CPU's, which implies that EVERYTHING that
+ touches a shared variable has to agree about the spinlock they want
+ to use.
+
+----
+
+Lesson 2: reader-writer spinlocks.
+==================================
+
+If your data accesses have a very natural pattern where you usually tend
+to mostly read from the shared variables, the reader-writer locks
+(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
+readers to be in the same critical region at once, but if somebody wants
+to change the variables it has to get an exclusive write lock.
+
+ NOTE! reader-writer locks require more atomic memory operations than
+ simple spinlocks. Unless the reader critical section is long, you
+ are better off just using spinlocks.
+
+The routines look the same as above::
+
+ rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
+
+ unsigned long flags;
+
+ read_lock_irqsave(&xxx_lock, flags);
+ .. critical section that only reads the info ...
+ read_unlock_irqrestore(&xxx_lock, flags);
+
+ write_lock_irqsave(&xxx_lock, flags);
+ .. read and write exclusive access to the info ...
+ write_unlock_irqrestore(&xxx_lock, flags);
+
+The above kind of lock may be useful for complex data structures like
+linked lists, especially searching for entries without changing the list
+itself. The read lock allows many concurrent readers. Anything that
+**changes** the list will have to get the write lock.
+
+ NOTE! RCU is better for list traversal, but requires careful
+ attention to design detail (see Documentation/RCU/listRCU.txt).
+
+Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
+time need to do any changes (even if you don't do it every time), you have
+to get the write-lock at the very beginning.
+
+ NOTE! We are working hard to remove reader-writer spinlocks in most
+ cases, so please don't add a new one without consensus. (Instead, see
+ Documentation/RCU/rcu.txt for complete information.)
+
+----
+
+Lesson 3: spinlocks revisited.
+==============================
+
+The single spin-lock primitives above are by no means the only ones. They
+are the most safe ones, and the ones that work under all circumstances,
+but partly **because** they are safe they are also fairly slow. They are slower
+than they'd need to be, because they do have to disable interrupts
+(which is just a single instruction on a x86, but it's an expensive one -
+and on other architectures it can be worse).
+
+If you have a case where you have to protect a data structure across
+several CPU's and you want to use spinlocks you can potentially use
+cheaper versions of the spinlocks. IFF you know that the spinlocks are
+never used in interrupt handlers, you can use the non-irq versions::
+
+ spin_lock(&lock);
+ ...
+ spin_unlock(&lock);
+
+(and the equivalent read-write versions too, of course). The spinlock will
+guarantee the same kind of exclusive access, and it will be much faster.
+This is useful if you know that the data in question is only ever
+manipulated from a "process context", ie no interrupts involved.
+
+The reasons you mustn't use these versions if you have interrupts that
+play with the spinlock is that you can get deadlocks::
+
+ spin_lock(&lock);
+ ...
+ <- interrupt comes in:
+ spin_lock(&lock);
+
+where an interrupt tries to lock an already locked variable. This is ok if
+the other interrupt happens on another CPU, but it is _not_ ok if the
+interrupt happens on the same CPU that already holds the lock, because the
+lock will obviously never be released (because the interrupt is waiting
+for the lock, and the lock-holder is interrupted by the interrupt and will
+not continue until the interrupt has been processed).
+
+(This is also the reason why the irq-versions of the spinlocks only need
+to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
+on other CPU's, because an interrupt on another CPU doesn't interrupt the
+CPU that holds the lock, so the lock-holder can continue and eventually
+releases the lock).
+
+Note that you can be clever with read-write locks and interrupts. For
+example, if you know that the interrupt only ever gets a read-lock, then
+you can use a non-irq version of read locks everywhere - because they
+don't block on each other (and thus there is no dead-lock wrt interrupts.
+But when you do the write-lock, you have to use the irq-safe version.
+
+For an example of being clever with rw-locks, see the "waitqueue_lock"
+handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
+within an interrupt, they only read the queue in order to know whom to
+wake up. So read-locks are safe (which is good: they are very common
+indeed), while write-locks need to protect themselves against interrupts.
+
+ Linus
+
+----
+
+Reference information:
+======================
+
+For dynamic initialization, use spin_lock_init() or rwlock_init() as
+appropriate::
+
+ spinlock_t xxx_lock;
+ rwlock_t xxx_rw_lock;
+
+ static int __init xxx_init(void)
+ {
+ spin_lock_init(&xxx_lock);
+ rwlock_init(&xxx_rw_lock);
+ ...
+ }
+
+ module_init(xxx_init);
+
+For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
+__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/spinlocks.txt b/Documentation/locking/spinlocks.txt
deleted file mode 100644
index ff35e40bdf5b..000000000000
--- a/Documentation/locking/spinlocks.txt
+++ /dev/null
@@ -1,167 +0,0 @@
-Lesson 1: Spin locks
-
-The most basic primitive for locking is spinlock.
-
-static DEFINE_SPINLOCK(xxx_lock);
-
- unsigned long flags;
-
- spin_lock_irqsave(&xxx_lock, flags);
- ... critical section here ..
- spin_unlock_irqrestore(&xxx_lock, flags);
-
-The above is always safe. It will disable interrupts _locally_, but the
-spinlock itself will guarantee the global lock, so it will guarantee that
-there is only one thread-of-control within the region(s) protected by that
-lock. This works well even under UP also, so the code does _not_ need to
-worry about UP vs SMP issues: the spinlocks work correctly under both.
-
- NOTE! Implications of spin_locks for memory are further described in:
-
- Documentation/memory-barriers.txt
- (5) LOCK operations.
- (6) UNLOCK operations.
-
-The above is usually pretty simple (you usually need and want only one
-spinlock for most things - using more than one spinlock can make things a
-lot more complex and even slower and is usually worth it only for
-sequences that you _know_ need to be split up: avoid it at all cost if you
-aren't sure).
-
-This is really the only really hard part about spinlocks: once you start
-using spinlocks they tend to expand to areas you might not have noticed
-before, because you have to make sure the spinlocks correctly protect the
-shared data structures _everywhere_ they are used. The spinlocks are most
-easily added to places that are completely independent of other code (for
-example, internal driver data structures that nobody else ever touches).
-
- NOTE! The spin-lock is safe only when you _also_ use the lock itself
- to do locking across CPU's, which implies that EVERYTHING that
- touches a shared variable has to agree about the spinlock they want
- to use.
-
-----
-
-Lesson 2: reader-writer spinlocks.
-
-If your data accesses have a very natural pattern where you usually tend
-to mostly read from the shared variables, the reader-writer locks
-(rw_lock) versions of the spinlocks are sometimes useful. They allow multiple
-readers to be in the same critical region at once, but if somebody wants
-to change the variables it has to get an exclusive write lock.
-
- NOTE! reader-writer locks require more atomic memory operations than
- simple spinlocks. Unless the reader critical section is long, you
- are better off just using spinlocks.
-
-The routines look the same as above:
-
- rwlock_t xxx_lock = __RW_LOCK_UNLOCKED(xxx_lock);
-
- unsigned long flags;
-
- read_lock_irqsave(&xxx_lock, flags);
- .. critical section that only reads the info ...
- read_unlock_irqrestore(&xxx_lock, flags);
-
- write_lock_irqsave(&xxx_lock, flags);
- .. read and write exclusive access to the info ...
- write_unlock_irqrestore(&xxx_lock, flags);
-
-The above kind of lock may be useful for complex data structures like
-linked lists, especially searching for entries without changing the list
-itself. The read lock allows many concurrent readers. Anything that
-_changes_ the list will have to get the write lock.
-
- NOTE! RCU is better for list traversal, but requires careful
- attention to design detail (see Documentation/RCU/listRCU.txt).
-
-Also, you cannot "upgrade" a read-lock to a write-lock, so if you at _any_
-time need to do any changes (even if you don't do it every time), you have
-to get the write-lock at the very beginning.
-
- NOTE! We are working hard to remove reader-writer spinlocks in most
- cases, so please don't add a new one without consensus. (Instead, see
- Documentation/RCU/rcu.txt for complete information.)
-
-----
-
-Lesson 3: spinlocks revisited.
-
-The single spin-lock primitives above are by no means the only ones. They
-are the most safe ones, and the ones that work under all circumstances,
-but partly _because_ they are safe they are also fairly slow. They are slower
-than they'd need to be, because they do have to disable interrupts
-(which is just a single instruction on a x86, but it's an expensive one -
-and on other architectures it can be worse).
-
-If you have a case where you have to protect a data structure across
-several CPU's and you want to use spinlocks you can potentially use
-cheaper versions of the spinlocks. IFF you know that the spinlocks are
-never used in interrupt handlers, you can use the non-irq versions:
-
- spin_lock(&lock);
- ...
- spin_unlock(&lock);
-
-(and the equivalent read-write versions too, of course). The spinlock will
-guarantee the same kind of exclusive access, and it will be much faster.
-This is useful if you know that the data in question is only ever
-manipulated from a "process context", ie no interrupts involved.
-
-The reasons you mustn't use these versions if you have interrupts that
-play with the spinlock is that you can get deadlocks:
-
- spin_lock(&lock);
- ...
- <- interrupt comes in:
- spin_lock(&lock);
-
-where an interrupt tries to lock an already locked variable. This is ok if
-the other interrupt happens on another CPU, but it is _not_ ok if the
-interrupt happens on the same CPU that already holds the lock, because the
-lock will obviously never be released (because the interrupt is waiting
-for the lock, and the lock-holder is interrupted by the interrupt and will
-not continue until the interrupt has been processed).
-
-(This is also the reason why the irq-versions of the spinlocks only need
-to disable the _local_ interrupts - it's ok to use spinlocks in interrupts
-on other CPU's, because an interrupt on another CPU doesn't interrupt the
-CPU that holds the lock, so the lock-holder can continue and eventually
-releases the lock).
-
-Note that you can be clever with read-write locks and interrupts. For
-example, if you know that the interrupt only ever gets a read-lock, then
-you can use a non-irq version of read locks everywhere - because they
-don't block on each other (and thus there is no dead-lock wrt interrupts.
-But when you do the write-lock, you have to use the irq-safe version.
-
-For an example of being clever with rw-locks, see the "waitqueue_lock"
-handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
-within an interrupt, they only read the queue in order to know whom to
-wake up. So read-locks are safe (which is good: they are very common
-indeed), while write-locks need to protect themselves against interrupts.
-
- Linus
-
-----
-
-Reference information:
-
-For dynamic initialization, use spin_lock_init() or rwlock_init() as
-appropriate:
-
- spinlock_t xxx_lock;
- rwlock_t xxx_rw_lock;
-
- static int __init xxx_init(void)
- {
- spin_lock_init(&xxx_lock);
- rwlock_init(&xxx_rw_lock);
- ...
- }
-
- module_init(xxx_init);
-
-For static initialization, use DEFINE_SPINLOCK() / DEFINE_RWLOCK() or
-__SPIN_LOCK_UNLOCKED() / __RW_LOCK_UNLOCKED() as appropriate.
diff --git a/Documentation/locking/ww-mutex-design.rst b/Documentation/locking/ww-mutex-design.rst
new file mode 100644
index 000000000000..1846c199da23
--- /dev/null
+++ b/Documentation/locking/ww-mutex-design.rst
@@ -0,0 +1,393 @@
+======================================
+Wound/Wait Deadlock-Proof Mutex Design
+======================================
+
+Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
+
+Motivation for WW-Mutexes
+-------------------------
+
+GPU's do operations that commonly involve many buffers. Those buffers
+can be shared across contexts/processes, exist in different memory
+domains (for example VRAM vs system memory), and so on. And with
+PRIME / dmabuf, they can even be shared across devices. So there are
+a handful of situations where the driver needs to wait for buffers to
+become ready. If you think about this in terms of waiting on a buffer
+mutex for it to become available, this presents a problem because
+there is no way to guarantee that buffers appear in a execbuf/batch in
+the same order in all contexts. That is directly under control of
+userspace, and a result of the sequence of GL calls that an application
+makes. Which results in the potential for deadlock. The problem gets
+more complex when you consider that the kernel may need to migrate the
+buffer(s) into VRAM before the GPU operates on the buffer(s), which
+may in turn require evicting some other buffers (and you don't want to
+evict other buffers which are already queued up to the GPU), but for a
+simplified understanding of the problem you can ignore this.
+
+The algorithm that the TTM graphics subsystem came up with for dealing with
+this problem is quite simple. For each group of buffers (execbuf) that need
+to be locked, the caller would be assigned a unique reservation id/ticket,
+from a global counter. In case of deadlock while locking all the buffers
+associated with a execbuf, the one with the lowest reservation ticket (i.e.
+the oldest task) wins, and the one with the higher reservation id (i.e. the
+younger task) unlocks all of the buffers that it has already locked, and then
+tries again.
+
+In the RDBMS literature, a reservation ticket is associated with a transaction.
+and the deadlock handling approach is called Wait-Die. The name is based on
+the actions of a locking thread when it encounters an already locked mutex.
+If the transaction holding the lock is younger, the locking transaction waits.
+If the transaction holding the lock is older, the locking transaction backs off
+and dies. Hence Wait-Die.
+There is also another algorithm called Wound-Wait:
+If the transaction holding the lock is younger, the locking transaction
+wounds the transaction holding the lock, requesting it to die.
+If the transaction holding the lock is older, it waits for the other
+transaction. Hence Wound-Wait.
+The two algorithms are both fair in that a transaction will eventually succeed.
+However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
+compared to Wait-Die, but is, on the other hand, associated with more work than
+Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
+algorithm in that transactions are wounded by other transactions, and that
+requires a reliable way to pick up up the wounded condition and preempt the
+running transaction. Note that this is not the same as process preemption. A
+Wound-Wait transaction is considered preempted when it dies (returning
+-EDEADLK) following a wound.
+
+Concepts
+--------
+
+Compared to normal mutexes two additional concepts/objects show up in the lock
+interface for w/w mutexes:
+
+Acquire context: To ensure eventual forward progress it is important the a task
+trying to acquire locks doesn't grab a new reservation id, but keeps the one it
+acquired when starting the lock acquisition. This ticket is stored in the
+acquire context. Furthermore the acquire context keeps track of debugging state
+to catch w/w mutex interface abuse. An acquire context is representing a
+transaction.
+
+W/w class: In contrast to normal mutexes the lock class needs to be explicit for
+w/w mutexes, since it is required to initialize the acquire context. The lock
+class also specifies what algorithm to use, Wound-Wait or Wait-Die.
+
+Furthermore there are three different class of w/w lock acquire functions:
+
+* Normal lock acquisition with a context, using ww_mutex_lock.
+
+* Slowpath lock acquisition on the contending lock, used by the task that just
+ killed its transaction after having dropped all already acquired locks.
+ These functions have the _slow postfix.
+
+ From a simple semantics point-of-view the _slow functions are not strictly
+ required, since simply calling the normal ww_mutex_lock functions on the
+ contending lock (after having dropped all other already acquired locks) will
+ work correctly. After all if no other ww mutex has been acquired yet there's
+ no deadlock potential and hence the ww_mutex_lock call will block and not
+ prematurely return -EDEADLK. The advantage of the _slow functions is in
+ interface safety:
+
+ - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
+ has a void return type. Note that since ww mutex code needs loops/retries
+ anyway the __must_check doesn't result in spurious warnings, even though the
+ very first lock operation can never fail.
+ - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
+ ww mutex have been released (preventing deadlocks) and makes sure that we
+ block on the contending lock (preventing spinning through the -EDEADLK
+ slowpath until the contended lock can be acquired).
+
+* Functions to only acquire a single w/w mutex, which results in the exact same
+ semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
+ context.
+
+ Again this is not strictly required. But often you only want to acquire a
+ single lock in which case it's pointless to set up an acquire context (and so
+ better to avoid grabbing a deadlock avoidance ticket).
+
+Of course, all the usual variants for handling wake-ups due to signals are also
+provided.
+
+Usage
+-----
+
+The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
+DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
+As a rough rule of thumb, use Wound-Wait iff you
+expect the number of simultaneous competing transactions to be typically small,
+and you want to reduce the number of rollbacks.
+
+Three different ways to acquire locks within the same w/w class. Common
+definitions for methods #1 and #2::
+
+ static DEFINE_WW_CLASS(ww_class);
+
+ struct obj {
+ struct ww_mutex lock;
+ /* obj data */
+ };
+
+ struct obj_entry {
+ struct list_head head;
+ struct obj *obj;
+ };
+
+Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
+This is useful if a list of required objects is already tracked somewhere.
+Furthermore the lock helper can use propagate the -EALREADY return code back to
+the caller as a signal that an object is twice on the list. This is useful if
+the list is constructed from userspace input and the ABI requires userspace to
+not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl)::
+
+ int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+ {
+ struct obj *res_obj = NULL;
+ struct obj_entry *contended_entry = NULL;
+ struct obj_entry *entry;
+
+ ww_acquire_init(ctx, &ww_class);
+
+ retry:
+ list_for_each_entry (entry, list, head) {
+ if (entry->obj == res_obj) {
+ res_obj = NULL;
+ continue;
+ }
+ ret = ww_mutex_lock(&entry->obj->lock, ctx);
+ if (ret < 0) {
+ contended_entry = entry;
+ goto err;
+ }
+ }
+
+ ww_acquire_done(ctx);
+ return 0;
+
+ err:
+ list_for_each_entry_continue_reverse (entry, list, head)
+ ww_mutex_unlock(&entry->obj->lock);
+
+ if (res_obj)
+ ww_mutex_unlock(&res_obj->lock);
+
+ if (ret == -EDEADLK) {
+ /* we lost out in a seqno race, lock and retry.. */
+ ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
+ res_obj = contended_entry->obj;
+ goto retry;
+ }
+ ww_acquire_fini(ctx);
+
+ return ret;
+ }
+
+Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
+of duplicate entry detection using -EALREADY as method 1 above. But the
+list-reordering allows for a bit more idiomatic code::
+
+ int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+ {
+ struct obj_entry *entry, *entry2;
+
+ ww_acquire_init(ctx, &ww_class);
+
+ list_for_each_entry (entry, list, head) {
+ ret = ww_mutex_lock(&entry->obj->lock, ctx);
+ if (ret < 0) {
+ entry2 = entry;
+
+ list_for_each_entry_continue_reverse (entry2, list, head)
+ ww_mutex_unlock(&entry2->obj->lock);
+
+ if (ret != -EDEADLK) {
+ ww_acquire_fini(ctx);
+ return ret;
+ }
+
+ /* we lost out in a seqno race, lock and retry.. */
+ ww_mutex_lock_slow(&entry->obj->lock, ctx);
+
+ /*
+ * Move buf to head of the list, this will point
+ * buf->next to the first unlocked entry,
+ * restarting the for loop.
+ */
+ list_del(&entry->head);
+ list_add(&entry->head, list);
+ }
+ }
+
+ ww_acquire_done(ctx);
+ return 0;
+ }
+
+Unlocking works the same way for both methods #1 and #2::
+
+ void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+ {
+ struct obj_entry *entry;
+
+ list_for_each_entry (entry, list, head)
+ ww_mutex_unlock(&entry->obj->lock);
+
+ ww_acquire_fini(ctx);
+ }
+
+Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
+e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
+and edges can only be changed when holding the locks of all involved nodes. w/w
+mutexes are a natural fit for such a case for two reasons:
+
+- They can handle lock-acquisition in any order which allows us to start walking
+ a graph from a starting point and then iteratively discovering new edges and
+ locking down the nodes those edges connect to.
+- Due to the -EALREADY return code signalling that a given objects is already
+ held there's no need for additional book-keeping to break cycles in the graph
+ or keep track off which looks are already held (when using more than one node
+ as a starting point).
+
+Note that this approach differs in two important ways from the above methods:
+
+- Since the list of objects is dynamically constructed (and might very well be
+ different when retrying due to hitting the -EDEADLK die condition) there's
+ no need to keep any object on a persistent list when it's not locked. We can
+ therefore move the list_head into the object itself.
+- On the other hand the dynamic object list construction also means that the -EALREADY return
+ code can't be propagated.
+
+Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
+list of starting nodes (passed in from userspace) using one of the above
+methods. And then lock any additional objects affected by the operations using
+method #3 below. The backoff/retry procedure will be a bit more involved, since
+when the dynamic locking step hits -EDEADLK we also need to unlock all the
+objects acquired with the fixed list. But the w/w mutex debug checks will catch
+any interface misuse for these cases.
+
+Also, method 3 can't fail the lock acquisition step since it doesn't return
+-EALREADY. Of course this would be different when using the _interruptible
+variants, but that's outside of the scope of these examples here::
+
+ struct obj {
+ struct ww_mutex ww_mutex;
+ struct list_head locked_list;
+ };
+
+ static DEFINE_WW_CLASS(ww_class);
+
+ void __unlock_objs(struct list_head *list)
+ {
+ struct obj *entry, *temp;
+
+ list_for_each_entry_safe (entry, temp, list, locked_list) {
+ /* need to do that before unlocking, since only the current lock holder is
+ allowed to use object */
+ list_del(&entry->locked_list);
+ ww_mutex_unlock(entry->ww_mutex)
+ }
+ }
+
+ void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+ {
+ struct obj *obj;
+
+ ww_acquire_init(ctx, &ww_class);
+
+ retry:
+ /* re-init loop start state */
+ loop {
+ /* magic code which walks over a graph and decides which objects
+ * to lock */
+
+ ret = ww_mutex_lock(obj->ww_mutex, ctx);
+ if (ret == -EALREADY) {
+ /* we have that one already, get to the next object */
+ continue;
+ }
+ if (ret == -EDEADLK) {
+ __unlock_objs(list);
+
+ ww_mutex_lock_slow(obj, ctx);
+ list_add(&entry->locked_list, list);
+ goto retry;
+ }
+
+ /* locked a new object, add it to the list */
+ list_add_tail(&entry->locked_list, list);
+ }
+
+ ww_acquire_done(ctx);
+ return 0;
+ }
+
+ void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+ {
+ __unlock_objs(list);
+ ww_acquire_fini(ctx);
+ }
+
+Method 4: Only lock one single objects. In that case deadlock detection and
+prevention is obviously overkill, since with grabbing just one lock you can't
+produce a deadlock within just one class. To simplify this case the w/w mutex
+api can be used with a NULL context.
+
+Implementation Details
+----------------------
+
+Design:
+^^^^^^^
+
+ ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
+ normal mutex locks, which are far more common. As such there is only a small
+ increase in code size if wait/wound mutexes are not used.
+
+ We maintain the following invariants for the wait list:
+
+ (1) Waiters with an acquire context are sorted by stamp order; waiters
+ without an acquire context are interspersed in FIFO order.
+ (2) For Wait-Die, among waiters with contexts, only the first one can have
+ other locks acquired already (ctx->acquired > 0). Note that this waiter
+ may come after other waiters without contexts in the list.
+
+ The Wound-Wait preemption is implemented with a lazy-preemption scheme:
+ The wounded status of the transaction is checked only when there is
+ contention for a new lock and hence a true chance of deadlock. In that
+ situation, if the transaction is wounded, it backs off, clears the
+ wounded status and retries. A great benefit of implementing preemption in
+ this way is that the wounded transaction can identify a contending lock to
+ wait for before restarting the transaction. Just blindly restarting the
+ transaction would likely make the transaction end up in a situation where
+ it would have to back off again.
+
+ In general, not much contention is expected. The locks are typically used to
+ serialize access to resources for devices, and optimization focus should
+ therefore be directed towards the uncontended cases.
+
+Lockdep:
+^^^^^^^^
+
+ Special care has been taken to warn for as many cases of api abuse
+ as possible. Some common api abuses will be caught with
+ CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
+
+ Some of the errors which will be warned about:
+ - Forgetting to call ww_acquire_fini or ww_acquire_init.
+ - Attempting to lock more mutexes after ww_acquire_done.
+ - Attempting to lock the wrong mutex after -EDEADLK and
+ unlocking all mutexes.
+ - Attempting to lock the right mutex after -EDEADLK,
+ before unlocking all mutexes.
+
+ - Calling ww_mutex_lock_slow before -EDEADLK was returned.
+
+ - Unlocking mutexes with the wrong unlock function.
+ - Calling one of the ww_acquire_* twice on the same context.
+ - Using a different ww_class for the mutex than for the ww_acquire_ctx.
+ - Normal lockdep errors that can result in deadlocks.
+
+ Some of the lockdep errors that can result in deadlocks:
+ - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
+ having called ww_acquire_fini on the first.
+ - 'normal' deadlocks that can occur.
+
+FIXME:
+ Update this section once we have the TASK_DEADLOCK task state flag magic
+ implemented.
diff --git a/Documentation/locking/ww-mutex-design.txt b/Documentation/locking/ww-mutex-design.txt
deleted file mode 100644
index f0ed7c30e695..000000000000
--- a/Documentation/locking/ww-mutex-design.txt
+++ /dev/null
@@ -1,383 +0,0 @@
-Wound/Wait Deadlock-Proof Mutex Design
-======================================
-
-Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
-
-Motivation for WW-Mutexes
--------------------------
-
-GPU's do operations that commonly involve many buffers. Those buffers
-can be shared across contexts/processes, exist in different memory
-domains (for example VRAM vs system memory), and so on. And with
-PRIME / dmabuf, they can even be shared across devices. So there are
-a handful of situations where the driver needs to wait for buffers to
-become ready. If you think about this in terms of waiting on a buffer
-mutex for it to become available, this presents a problem because
-there is no way to guarantee that buffers appear in a execbuf/batch in
-the same order in all contexts. That is directly under control of
-userspace, and a result of the sequence of GL calls that an application
-makes. Which results in the potential for deadlock. The problem gets
-more complex when you consider that the kernel may need to migrate the
-buffer(s) into VRAM before the GPU operates on the buffer(s), which
-may in turn require evicting some other buffers (and you don't want to
-evict other buffers which are already queued up to the GPU), but for a
-simplified understanding of the problem you can ignore this.
-
-The algorithm that the TTM graphics subsystem came up with for dealing with
-this problem is quite simple. For each group of buffers (execbuf) that need
-to be locked, the caller would be assigned a unique reservation id/ticket,
-from a global counter. In case of deadlock while locking all the buffers
-associated with a execbuf, the one with the lowest reservation ticket (i.e.
-the oldest task) wins, and the one with the higher reservation id (i.e. the
-younger task) unlocks all of the buffers that it has already locked, and then
-tries again.
-
-In the RDBMS literature, a reservation ticket is associated with a transaction.
-and the deadlock handling approach is called Wait-Die. The name is based on
-the actions of a locking thread when it encounters an already locked mutex.
-If the transaction holding the lock is younger, the locking transaction waits.
-If the transaction holding the lock is older, the locking transaction backs off
-and dies. Hence Wait-Die.
-There is also another algorithm called Wound-Wait:
-If the transaction holding the lock is younger, the locking transaction
-wounds the transaction holding the lock, requesting it to die.
-If the transaction holding the lock is older, it waits for the other
-transaction. Hence Wound-Wait.
-The two algorithms are both fair in that a transaction will eventually succeed.
-However, the Wound-Wait algorithm is typically stated to generate fewer backoffs
-compared to Wait-Die, but is, on the other hand, associated with more work than
-Wait-Die when recovering from a backoff. Wound-Wait is also a preemptive
-algorithm in that transactions are wounded by other transactions, and that
-requires a reliable way to pick up up the wounded condition and preempt the
-running transaction. Note that this is not the same as process preemption. A
-Wound-Wait transaction is considered preempted when it dies (returning
--EDEADLK) following a wound.
-
-Concepts
---------
-
-Compared to normal mutexes two additional concepts/objects show up in the lock
-interface for w/w mutexes:
-
-Acquire context: To ensure eventual forward progress it is important the a task
-trying to acquire locks doesn't grab a new reservation id, but keeps the one it
-acquired when starting the lock acquisition. This ticket is stored in the
-acquire context. Furthermore the acquire context keeps track of debugging state
-to catch w/w mutex interface abuse. An acquire context is representing a
-transaction.
-
-W/w class: In contrast to normal mutexes the lock class needs to be explicit for
-w/w mutexes, since it is required to initialize the acquire context. The lock
-class also specifies what algorithm to use, Wound-Wait or Wait-Die.
-
-Furthermore there are three different class of w/w lock acquire functions:
-
-* Normal lock acquisition with a context, using ww_mutex_lock.
-
-* Slowpath lock acquisition on the contending lock, used by the task that just
- killed its transaction after having dropped all already acquired locks.
- These functions have the _slow postfix.
-
- From a simple semantics point-of-view the _slow functions are not strictly
- required, since simply calling the normal ww_mutex_lock functions on the
- contending lock (after having dropped all other already acquired locks) will
- work correctly. After all if no other ww mutex has been acquired yet there's
- no deadlock potential and hence the ww_mutex_lock call will block and not
- prematurely return -EDEADLK. The advantage of the _slow functions is in
- interface safety:
- - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
- has a void return type. Note that since ww mutex code needs loops/retries
- anyway the __must_check doesn't result in spurious warnings, even though the
- very first lock operation can never fail.
- - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
- ww mutex have been released (preventing deadlocks) and makes sure that we
- block on the contending lock (preventing spinning through the -EDEADLK
- slowpath until the contended lock can be acquired).
-
-* Functions to only acquire a single w/w mutex, which results in the exact same
- semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
- context.
-
- Again this is not strictly required. But often you only want to acquire a
- single lock in which case it's pointless to set up an acquire context (and so
- better to avoid grabbing a deadlock avoidance ticket).
-
-Of course, all the usual variants for handling wake-ups due to signals are also
-provided.
-
-Usage
------
-
-The algorithm (Wait-Die vs Wound-Wait) is chosen by using either
-DEFINE_WW_CLASS() (Wound-Wait) or DEFINE_WD_CLASS() (Wait-Die)
-As a rough rule of thumb, use Wound-Wait iff you
-expect the number of simultaneous competing transactions to be typically small,
-and you want to reduce the number of rollbacks.
-
-Three different ways to acquire locks within the same w/w class. Common
-definitions for methods #1 and #2:
-
-static DEFINE_WW_CLASS(ww_class);
-
-struct obj {
- struct ww_mutex lock;
- /* obj data */
-};
-
-struct obj_entry {
- struct list_head head;
- struct obj *obj;
-};
-
-Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
-This is useful if a list of required objects is already tracked somewhere.
-Furthermore the lock helper can use propagate the -EALREADY return code back to
-the caller as a signal that an object is twice on the list. This is useful if
-the list is constructed from userspace input and the ABI requires userspace to
-not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
- struct obj *res_obj = NULL;
- struct obj_entry *contended_entry = NULL;
- struct obj_entry *entry;
-
- ww_acquire_init(ctx, &ww_class);
-
-retry:
- list_for_each_entry (entry, list, head) {
- if (entry->obj == res_obj) {
- res_obj = NULL;
- continue;
- }
- ret = ww_mutex_lock(&entry->obj->lock, ctx);
- if (ret < 0) {
- contended_entry = entry;
- goto err;
- }
- }
-
- ww_acquire_done(ctx);
- return 0;
-
-err:
- list_for_each_entry_continue_reverse (entry, list, head)
- ww_mutex_unlock(&entry->obj->lock);
-
- if (res_obj)
- ww_mutex_unlock(&res_obj->lock);
-
- if (ret == -EDEADLK) {
- /* we lost out in a seqno race, lock and retry.. */
- ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
- res_obj = contended_entry->obj;
- goto retry;
- }
- ww_acquire_fini(ctx);
-
- return ret;
-}
-
-Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
-of duplicate entry detection using -EALREADY as method 1 above. But the
-list-reordering allows for a bit more idiomatic code.
-
-int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
- struct obj_entry *entry, *entry2;
-
- ww_acquire_init(ctx, &ww_class);
-
- list_for_each_entry (entry, list, head) {
- ret = ww_mutex_lock(&entry->obj->lock, ctx);
- if (ret < 0) {
- entry2 = entry;
-
- list_for_each_entry_continue_reverse (entry2, list, head)
- ww_mutex_unlock(&entry2->obj->lock);
-
- if (ret != -EDEADLK) {
- ww_acquire_fini(ctx);
- return ret;
- }
-
- /* we lost out in a seqno race, lock and retry.. */
- ww_mutex_lock_slow(&entry->obj->lock, ctx);
-
- /*
- * Move buf to head of the list, this will point
- * buf->next to the first unlocked entry,
- * restarting the for loop.
- */
- list_del(&entry->head);
- list_add(&entry->head, list);
- }
- }
-
- ww_acquire_done(ctx);
- return 0;
-}
-
-Unlocking works the same way for both methods #1 and #2:
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
- struct obj_entry *entry;
-
- list_for_each_entry (entry, list, head)
- ww_mutex_unlock(&entry->obj->lock);
-
- ww_acquire_fini(ctx);
-}
-
-Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
-e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
-and edges can only be changed when holding the locks of all involved nodes. w/w
-mutexes are a natural fit for such a case for two reasons:
-- They can handle lock-acquisition in any order which allows us to start walking
- a graph from a starting point and then iteratively discovering new edges and
- locking down the nodes those edges connect to.
-- Due to the -EALREADY return code signalling that a given objects is already
- held there's no need for additional book-keeping to break cycles in the graph
- or keep track off which looks are already held (when using more than one node
- as a starting point).
-
-Note that this approach differs in two important ways from the above methods:
-- Since the list of objects is dynamically constructed (and might very well be
- different when retrying due to hitting the -EDEADLK die condition) there's
- no need to keep any object on a persistent list when it's not locked. We can
- therefore move the list_head into the object itself.
-- On the other hand the dynamic object list construction also means that the -EALREADY return
- code can't be propagated.
-
-Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
-list of starting nodes (passed in from userspace) using one of the above
-methods. And then lock any additional objects affected by the operations using
-method #3 below. The backoff/retry procedure will be a bit more involved, since
-when the dynamic locking step hits -EDEADLK we also need to unlock all the
-objects acquired with the fixed list. But the w/w mutex debug checks will catch
-any interface misuse for these cases.
-
-Also, method 3 can't fail the lock acquisition step since it doesn't return
--EALREADY. Of course this would be different when using the _interruptible
-variants, but that's outside of the scope of these examples here.
-
-struct obj {
- struct ww_mutex ww_mutex;
- struct list_head locked_list;
-};
-
-static DEFINE_WW_CLASS(ww_class);
-
-void __unlock_objs(struct list_head *list)
-{
- struct obj *entry, *temp;
-
- list_for_each_entry_safe (entry, temp, list, locked_list) {
- /* need to do that before unlocking, since only the current lock holder is
- allowed to use object */
- list_del(&entry->locked_list);
- ww_mutex_unlock(entry->ww_mutex)
- }
-}
-
-void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
- struct obj *obj;
-
- ww_acquire_init(ctx, &ww_class);
-
-retry:
- /* re-init loop start state */
- loop {
- /* magic code which walks over a graph and decides which objects
- * to lock */
-
- ret = ww_mutex_lock(obj->ww_mutex, ctx);
- if (ret == -EALREADY) {
- /* we have that one already, get to the next object */
- continue;
- }
- if (ret == -EDEADLK) {
- __unlock_objs(list);
-
- ww_mutex_lock_slow(obj, ctx);
- list_add(&entry->locked_list, list);
- goto retry;
- }
-
- /* locked a new object, add it to the list */
- list_add_tail(&entry->locked_list, list);
- }
-
- ww_acquire_done(ctx);
- return 0;
-}
-
-void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
-{
- __unlock_objs(list);
- ww_acquire_fini(ctx);
-}
-
-Method 4: Only lock one single objects. In that case deadlock detection and
-prevention is obviously overkill, since with grabbing just one lock you can't
-produce a deadlock within just one class. To simplify this case the w/w mutex
-api can be used with a NULL context.
-
-Implementation Details
-----------------------
-
-Design:
- ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
- normal mutex locks, which are far more common. As such there is only a small
- increase in code size if wait/wound mutexes are not used.
-
- We maintain the following invariants for the wait list:
- (1) Waiters with an acquire context are sorted by stamp order; waiters
- without an acquire context are interspersed in FIFO order.
- (2) For Wait-Die, among waiters with contexts, only the first one can have
- other locks acquired already (ctx->acquired > 0). Note that this waiter
- may come after other waiters without contexts in the list.
-
- The Wound-Wait preemption is implemented with a lazy-preemption scheme:
- The wounded status of the transaction is checked only when there is
- contention for a new lock and hence a true chance of deadlock. In that
- situation, if the transaction is wounded, it backs off, clears the
- wounded status and retries. A great benefit of implementing preemption in
- this way is that the wounded transaction can identify a contending lock to
- wait for before restarting the transaction. Just blindly restarting the
- transaction would likely make the transaction end up in a situation where
- it would have to back off again.
-
- In general, not much contention is expected. The locks are typically used to
- serialize access to resources for devices, and optimization focus should
- therefore be directed towards the uncontended cases.
-
-Lockdep:
- Special care has been taken to warn for as many cases of api abuse
- as possible. Some common api abuses will be caught with
- CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
-
- Some of the errors which will be warned about:
- - Forgetting to call ww_acquire_fini or ww_acquire_init.
- - Attempting to lock more mutexes after ww_acquire_done.
- - Attempting to lock the wrong mutex after -EDEADLK and
- unlocking all mutexes.
- - Attempting to lock the right mutex after -EDEADLK,
- before unlocking all mutexes.
-
- - Calling ww_mutex_lock_slow before -EDEADLK was returned.
-
- - Unlocking mutexes with the wrong unlock function.
- - Calling one of the ww_acquire_* twice on the same context.
- - Using a different ww_class for the mutex than for the ww_acquire_ctx.
- - Normal lockdep errors that can result in deadlocks.
-
- Some of the lockdep errors that can result in deadlocks:
- - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
- having called ww_acquire_fini on the first.
- - 'normal' deadlocks that can occur.
-
-FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic
-implemented.
diff --git a/Documentation/m68k/index.rst b/Documentation/m68k/index.rst
new file mode 100644
index 000000000000..3a5ba7fe1703
--- /dev/null
+++ b/Documentation/m68k/index.rst
@@ -0,0 +1,17 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+m68k Architecture
+=================
+
+.. toctree::
+ :maxdepth: 2
+
+ kernel-options
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/m68k/kernel-options.rst b/Documentation/m68k/kernel-options.rst
new file mode 100644
index 000000000000..cabd9419740d
--- /dev/null
+++ b/Documentation/m68k/kernel-options.rst
@@ -0,0 +1,911 @@
+===================================
+Command Line Options for Linux/m68k
+===================================
+
+Last Update: 2 May 1999
+
+Linux/m68k version: 2.2.6
+
+Author: Roman.Hodek@informatik.uni-erlangen.de (Roman Hodek)
+
+Update: jds@kom.auc.dk (Jes Sorensen) and faq@linux-m68k.org (Chris Lawrence)
+
+0) Introduction
+===============
+
+Often I've been asked which command line options the Linux/m68k
+kernel understands, or how the exact syntax for the ... option is, or
+... about the option ... . I hope, this document supplies all the
+answers...
+
+Note that some options might be outdated, their descriptions being
+incomplete or missing. Please update the information and send in the
+patches.
+
+
+1) Overview of the Kernel's Option Processing
+=============================================
+
+The kernel knows three kinds of options on its command line:
+
+ 1) kernel options
+ 2) environment settings
+ 3) arguments for init
+
+To which of these classes an argument belongs is determined as
+follows: If the option is known to the kernel itself, i.e. if the name
+(the part before the '=') or, in some cases, the whole argument string
+is known to the kernel, it belongs to class 1. Otherwise, if the
+argument contains an '=', it is of class 2, and the definition is put
+into init's environment. All other arguments are passed to init as
+command line options.
+
+This document describes the valid kernel options for Linux/m68k in
+the version mentioned at the start of this file. Later revisions may
+add new such options, and some may be missing in older versions.
+
+In general, the value (the part after the '=') of an option is a
+list of values separated by commas. The interpretation of these values
+is up to the driver that "owns" the option. This association of
+options with drivers is also the reason that some are further
+subdivided.
+
+
+2) General Kernel Options
+=========================
+
+2.1) root=
+----------
+
+:Syntax: root=/dev/<device>
+:or: root=<hex_number>
+
+This tells the kernel which device it should mount as the root
+filesystem. The device must be a block device with a valid filesystem
+on it.
+
+The first syntax gives the device by name. These names are converted
+into a major/minor number internally in the kernel in an unusual way.
+Normally, this "conversion" is done by the device files in /dev, but
+this isn't possible here, because the root filesystem (with /dev)
+isn't mounted yet... So the kernel parses the name itself, with some
+hardcoded name to number mappings. The name must always be a
+combination of two or three letters, followed by a decimal number.
+Valid names are::
+
+ /dev/ram: -> 0x0100 (initial ramdisk)
+ /dev/hda: -> 0x0300 (first IDE disk)
+ /dev/hdb: -> 0x0340 (second IDE disk)
+ /dev/sda: -> 0x0800 (first SCSI disk)
+ /dev/sdb: -> 0x0810 (second SCSI disk)
+ /dev/sdc: -> 0x0820 (third SCSI disk)
+ /dev/sdd: -> 0x0830 (forth SCSI disk)
+ /dev/sde: -> 0x0840 (fifth SCSI disk)
+ /dev/fd : -> 0x0200 (floppy disk)
+
+The name must be followed by a decimal number, that stands for the
+partition number. Internally, the value of the number is just
+added to the device number mentioned in the table above. The
+exceptions are /dev/ram and /dev/fd, where /dev/ram refers to an
+initial ramdisk loaded by your bootstrap program (please consult the
+instructions for your bootstrap program to find out how to load an
+initial ramdisk). As of kernel version 2.0.18 you must specify
+/dev/ram as the root device if you want to boot from an initial
+ramdisk. For the floppy devices, /dev/fd, the number stands for the
+floppy drive number (there are no partitions on floppy disks). I.e.,
+/dev/fd0 stands for the first drive, /dev/fd1 for the second, and so
+on. Since the number is just added, you can also force the disk format
+by adding a number greater than 3. If you look into your /dev
+directory, use can see the /dev/fd0D720 has major 2 and minor 16. You
+can specify this device for the root FS by writing "root=/dev/fd16" on
+the kernel command line.
+
+[Strange and maybe uninteresting stuff ON]
+
+This unusual translation of device names has some strange
+consequences: If, for example, you have a symbolic link from /dev/fd
+to /dev/fd0D720 as an abbreviation for floppy driver #0 in DD format,
+you cannot use this name for specifying the root device, because the
+kernel cannot see this symlink before mounting the root FS and it
+isn't in the table above. If you use it, the root device will not be
+set at all, without an error message. Another example: You cannot use a
+partition on e.g. the sixth SCSI disk as the root filesystem, if you
+want to specify it by name. This is, because only the devices up to
+/dev/sde are in the table above, but not /dev/sdf. Although, you can
+use the sixth SCSI disk for the root FS, but you have to specify the
+device by number... (see below). Or, even more strange, you can use the
+fact that there is no range checking of the partition number, and your
+knowledge that each disk uses 16 minors, and write "root=/dev/sde17"
+(for /dev/sdf1).
+
+[Strange and maybe uninteresting stuff OFF]
+
+If the device containing your root partition isn't in the table
+above, you can also specify it by major and minor numbers. These are
+written in hex, with no prefix and no separator between. E.g., if you
+have a CD with contents appropriate as a root filesystem in the first
+SCSI CD-ROM drive, you boot from it by "root=0b00". Here, hex "0b" =
+decimal 11 is the major of SCSI CD-ROMs, and the minor 0 stands for
+the first of these. You can find out all valid major numbers by
+looking into include/linux/major.h.
+
+In addition to major and minor numbers, if the device containing your
+root partition uses a partition table format with unique partition
+identifiers, then you may use them. For instance,
+"root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF". It is also
+possible to reference another partition on the same device using a
+known partition UUID as the starting point. For example,
+if partition 5 of the device has the UUID of
+00112233-4455-6677-8899-AABBCCDDEEFF then partition 3 may be found as
+follows:
+
+ PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=-2
+
+Authoritative information can be found in
+"Documentation/admin-guide/kernel-parameters.rst".
+
+
+2.2) ro, rw
+-----------
+
+:Syntax: ro
+:or: rw
+
+These two options tell the kernel whether it should mount the root
+filesystem read-only or read-write. The default is read-only, except
+for ramdisks, which default to read-write.
+
+
+2.3) debug
+----------
+
+:Syntax: debug
+
+This raises the kernel log level to 10 (the default is 7). This is the
+same level as set by the "dmesg" command, just that the maximum level
+selectable by dmesg is 8.
+
+
+2.4) debug=
+-----------
+
+:Syntax: debug=<device>
+
+This option causes certain kernel messages be printed to the selected
+debugging device. This can aid debugging the kernel, since the
+messages can be captured and analyzed on some other machine. Which
+devices are possible depends on the machine type. There are no checks
+for the validity of the device name. If the device isn't implemented,
+nothing happens.
+
+Messages logged this way are in general stack dumps after kernel
+memory faults or bad kernel traps, and kernel panics. To be exact: all
+messages of level 0 (panic messages) and all messages printed while
+the log level is 8 or more (their level doesn't matter). Before stack
+dumps, the kernel sets the log level to 10 automatically. A level of
+at least 8 can also be set by the "debug" command line option (see
+2.3) and at run time with "dmesg -n 8".
+
+Devices possible for Amiga:
+
+ - "ser":
+ built-in serial port; parameters: 9600bps, 8N1
+ - "mem":
+ Save the messages to a reserved area in chip mem. After
+ rebooting, they can be read under AmigaOS with the tool
+ 'dmesg'.
+
+Devices possible for Atari:
+
+ - "ser1":
+ ST-MFP serial port ("Modem1"); parameters: 9600bps, 8N1
+ - "ser2":
+ SCC channel B serial port ("Modem2"); parameters: 9600bps, 8N1
+ - "ser" :
+ default serial port
+ This is "ser2" for a Falcon, and "ser1" for any other machine
+ - "midi":
+ The MIDI port; parameters: 31250bps, 8N1
+ - "par" :
+ parallel port
+
+ The printing routine for this implements a timeout for the
+ case there's no printer connected (else the kernel would
+ lock up). The timeout is not exact, but usually a few
+ seconds.
+
+
+2.6) ramdisk_size=
+------------------
+
+:Syntax: ramdisk_size=<size>
+
+This option instructs the kernel to set up a ramdisk of the given
+size in KBytes. Do not use this option if the ramdisk contents are
+passed by bootstrap! In this case, the size is selected automatically
+and should not be overwritten.
+
+The only application is for root filesystems on floppy disks, that
+should be loaded into memory. To do that, select the corresponding
+size of the disk as ramdisk size, and set the root device to the disk
+drive (with "root=").
+
+
+2.7) swap=
+
+ I can't find any sign of this option in 2.2.6.
+
+2.8) buff=
+-----------
+
+ I can't find any sign of this option in 2.2.6.
+
+
+3) General Device Options (Amiga and Atari)
+===========================================
+
+3.1) ether=
+-----------
+
+:Syntax: ether=[<irq>[,<base_addr>[,<mem_start>[,<mem_end>]]]],<dev-name>
+
+<dev-name> is the name of a net driver, as specified in
+drivers/net/Space.c in the Linux source. Most prominent are eth0, ...
+eth3, sl0, ... sl3, ppp0, ..., ppp3, dummy, and lo.
+
+The non-ethernet drivers (sl, ppp, dummy, lo) obviously ignore the
+settings by this options. Also, the existing ethernet drivers for
+Linux/m68k (ariadne, a2065, hydra) don't use them because Zorro boards
+are really Plug-'n-Play, so the "ether=" option is useless altogether
+for Linux/m68k.
+
+
+3.2) hd=
+--------
+
+:Syntax: hd=<cylinders>,<heads>,<sectors>
+
+This option sets the disk geometry of an IDE disk. The first hd=
+option is for the first IDE disk, the second for the second one.
+(I.e., you can give this option twice.) In most cases, you won't have
+to use this option, since the kernel can obtain the geometry data
+itself. It exists just for the case that this fails for one of your
+disks.
+
+
+3.3) max_scsi_luns=
+-------------------
+
+:Syntax: max_scsi_luns=<n>
+
+Sets the maximum number of LUNs (logical units) of SCSI devices to
+be scanned. Valid values for <n> are between 1 and 8. Default is 8 if
+"Probe all LUNs on each SCSI device" was selected during the kernel
+configuration, else 1.
+
+
+3.4) st=
+--------
+
+:Syntax: st=<buffer_size>,[<write_thres>,[<max_buffers>]]
+
+Sets several parameters of the SCSI tape driver. <buffer_size> is
+the number of 512-byte buffers reserved for tape operations for each
+device. <write_thres> sets the number of blocks which must be filled
+to start an actual write operation to the tape. Maximum value is the
+total number of buffers. <max_buffer> limits the total number of
+buffers allocated for all tape devices.
+
+
+3.5) dmasound=
+--------------
+
+:Syntax: dmasound=[<buffers>,<buffer-size>[,<catch-radius>]]
+
+This option controls some configurations of the Linux/m68k DMA sound
+driver (Amiga and Atari): <buffers> is the number of buffers you want
+to use (minimum 4, default 4), <buffer-size> is the size of each
+buffer in kilobytes (minimum 4, default 32) and <catch-radius> says
+how much percent of error will be tolerated when setting a frequency
+(maximum 10, default 0). For example with 3% you can play 8000Hz
+AU-Files on the Falcon with its hardware frequency of 8195Hz and thus
+don't need to expand the sound.
+
+
+
+4) Options for Atari Only
+=========================
+
+4.1) video=
+-----------
+
+:Syntax: video=<fbname>:<sub-options...>
+
+The <fbname> parameter specifies the name of the frame buffer,
+eg. most atari users will want to specify `atafb` here. The
+<sub-options> is a comma-separated list of the sub-options listed
+below.
+
+NB:
+ Please notice that this option was renamed from `atavideo` to
+ `video` during the development of the 1.3.x kernels, thus you
+ might need to update your boot-scripts if upgrading to 2.x from
+ an 1.2.x kernel.
+
+NBB:
+ The behavior of video= was changed in 2.1.57 so the recommended
+ option is to specify the name of the frame buffer.
+
+4.1.1) Video Mode
+-----------------
+
+This sub-option may be any of the predefined video modes, as listed
+in atari/atafb.c in the Linux/m68k source tree. The kernel will
+activate the given video mode at boot time and make it the default
+mode, if the hardware allows. Currently defined names are:
+
+ - stlow : 320x200x4
+ - stmid, default5 : 640x200x2
+ - sthigh, default4: 640x400x1
+ - ttlow : 320x480x8, TT only
+ - ttmid, default1 : 640x480x4, TT only
+ - tthigh, default2: 1280x960x1, TT only
+ - vga2 : 640x480x1, Falcon only
+ - vga4 : 640x480x2, Falcon only
+ - vga16, default3 : 640x480x4, Falcon only
+ - vga256 : 640x480x8, Falcon only
+ - falh2 : 896x608x1, Falcon only
+ - falh16 : 896x608x4, Falcon only
+
+If no video mode is given on the command line, the kernel tries the
+modes names "default<n>" in turn, until one is possible with the
+hardware in use.
+
+A video mode setting doesn't make sense, if the external driver is
+activated by a "external:" sub-option.
+
+4.1.2) inverse
+--------------
+
+Invert the display. This affects both, text (consoles) and graphics
+(X) display. Usually, the background is chosen to be black. With this
+option, you can make the background white.
+
+4.1.3) font
+-----------
+
+:Syntax: font:<fontname>
+
+Specify the font to use in text modes. Currently you can choose only
+between `VGA8x8`, `VGA8x16` and `PEARL8x8`. `VGA8x8` is default, if the
+vertical size of the display is less than 400 pixel rows. Otherwise, the
+`VGA8x16` font is the default.
+
+4.1.4) `hwscroll_`
+------------------
+
+:Syntax: `hwscroll_<n>`
+
+The number of additional lines of video memory to reserve for
+speeding up the scrolling ("hardware scrolling"). Hardware scrolling
+is possible only if the kernel can set the video base address in steps
+fine enough. This is true for STE, MegaSTE, TT, and Falcon. It is not
+possible with plain STs and graphics cards (The former because the
+base address must be on a 256 byte boundary there, the latter because
+the kernel doesn't know how to set the base address at all.)
+
+By default, <n> is set to the number of visible text lines on the
+display. Thus, the amount of video memory is doubled, compared to no
+hardware scrolling. You can turn off the hardware scrolling altogether
+by setting <n> to 0.
+
+4.1.5) internal:
+----------------
+
+:Syntax: internal:<xres>;<yres>[;<xres_max>;<yres_max>;<offset>]
+
+This option specifies the capabilities of some extended internal video
+hardware, like e.g. OverScan. <xres> and <yres> give the (extended)
+dimensions of the screen.
+
+If your OverScan needs a black border, you have to write the last
+three arguments of the "internal:". <xres_max> is the maximum line
+length the hardware allows, <yres_max> the maximum number of lines.
+<offset> is the offset of the visible part of the screen memory to its
+physical start, in bytes.
+
+Often, extended interval video hardware has to be activated somehow.
+For this, see the "sw_*" options below.
+
+4.1.6) external:
+----------------
+
+:Syntax:
+ external:<xres>;<yres>;<depth>;<org>;<scrmem>[;<scrlen>[;<vgabase>
+ [;<colw>[;<coltype>[;<xres_virtual>]]]]]
+
+.. I had to break this line...
+
+This is probably the most complicated parameter... It specifies that
+you have some external video hardware (a graphics board), and how to
+use it under Linux/m68k. The kernel cannot know more about the hardware
+than you tell it here! The kernel also is unable to set or change any
+video modes, since it doesn't know about any board internal. So, you
+have to switch to that video mode before you start Linux, and cannot
+switch to another mode once Linux has started.
+
+The first 3 parameters of this sub-option should be obvious: <xres>,
+<yres> and <depth> give the dimensions of the screen and the number of
+planes (depth). The depth is the logarithm to base 2 of the number
+of colors possible. (Or, the other way round: The number of colors is
+2^depth).
+
+You have to tell the kernel furthermore how the video memory is
+organized. This is done by a letter as <org> parameter:
+
+ 'n':
+ "normal planes", i.e. one whole plane after another
+ 'i':
+ "interleaved planes", i.e. 16 bit of the first plane, than 16 bit
+ of the next, and so on... This mode is used only with the
+ built-in Atari video modes, I think there is no card that
+ supports this mode.
+ 'p':
+ "packed pixels", i.e. <depth> consecutive bits stand for all
+ planes of one pixel; this is the most common mode for 8 planes
+ (256 colors) on graphic cards
+ 't':
+ "true color" (more or less packed pixels, but without a color
+ lookup table); usually depth is 24
+
+For monochrome modes (i.e., <depth> is 1), the <org> letter has a
+different meaning:
+
+ 'n':
+ normal colors, i.e. 0=white, 1=black
+ 'i':
+ inverted colors, i.e. 0=black, 1=white
+
+The next important information about the video hardware is the base
+address of the video memory. That is given in the <scrmem> parameter,
+as a hexadecimal number with a "0x" prefix. You have to find out this
+address in the documentation of your hardware.
+
+The next parameter, <scrlen>, tells the kernel about the size of the
+video memory. If it's missing, the size is calculated from <xres>,
+<yres>, and <depth>. For now, it is not useful to write a value here.
+It would be used only for hardware scrolling (which isn't possible
+with the external driver, because the kernel cannot set the video base
+address), or for virtual resolutions under X (which the X server
+doesn't support yet). So, it's currently best to leave this field
+empty, either by ending the "external:" after the video address or by
+writing two consecutive semicolons, if you want to give a <vgabase>
+(it is allowed to leave this parameter empty).
+
+The <vgabase> parameter is optional. If it is not given, the kernel
+cannot read or write any color registers of the video hardware, and
+thus you have to set appropriate colors before you start Linux. But if
+your card is somehow VGA compatible, you can tell the kernel the base
+address of the VGA register set, so it can change the color lookup
+table. You have to look up this address in your board's documentation.
+To avoid misunderstandings: <vgabase> is the _base_ address, i.e. a 4k
+aligned address. For read/writing the color registers, the kernel
+uses the addresses vgabase+0x3c7...vgabase+0x3c9. The <vgabase>
+parameter is written in hexadecimal with a "0x" prefix, just as
+<scrmem>.
+
+<colw> is meaningful only if <vgabase> is specified. It tells the
+kernel how wide each of the color register is, i.e. the number of bits
+per single color (red/green/blue). Default is 6, another quite usual
+value is 8.
+
+Also <coltype> is used together with <vgabase>. It tells the kernel
+about the color register model of your gfx board. Currently, the types
+"vga" (which is also the default) and "mv300" (SANG MV300) are
+implemented.
+
+Parameter <xres_virtual> is required for ProMST or ET4000 cards where
+the physical linelength differs from the visible length. With ProMST,
+xres_virtual must be set to 2048. For ET4000, xres_virtual depends on the
+initialisation of the video-card.
+If you're missing a corresponding yres_virtual: the external part is legacy,
+therefore we don't support hardware-dependent functions like hardware-scroll,
+panning or blanking.
+
+4.1.7) eclock:
+--------------
+
+The external pixel clock attached to the Falcon VIDEL shifter. This
+currently works only with the ScreenWonder!
+
+4.1.8) monitorcap:
+-------------------
+
+:Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
+
+This describes the capabilities of a multisync monitor. Don't use it
+with a fixed-frequency monitor! For now, only the Falcon frame buffer
+uses the settings of "monitorcap:".
+
+<vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
+your monitor can work with, in Hz. <hmin> and <hmax> are the same for
+the horizontal frequency, in kHz.
+
+ The defaults are 58;62;31;32 (VGA compatible).
+
+ The defaults for TV/SC1224/SC1435 cover both PAL and NTSC standards.
+
+4.1.9) keep
+------------
+
+If this option is given, the framebuffer device doesn't do any video
+mode calculations and settings on its own. The only Atari fb device
+that does this currently is the Falcon.
+
+What you reach with this: Settings for unknown video extensions
+aren't overridden by the driver, so you can still use the mode found
+when booting, when the driver doesn't know to set this mode itself.
+But this also means, that you can't switch video modes anymore...
+
+An example where you may want to use "keep" is the ScreenBlaster for
+the Falcon.
+
+
+4.2) atamouse=
+--------------
+
+:Syntax: atamouse=<x-threshold>,[<y-threshold>]
+
+With this option, you can set the mouse movement reporting threshold.
+This is the number of pixels of mouse movement that have to accumulate
+before the IKBD sends a new mouse packet to the kernel. Higher values
+reduce the mouse interrupt load and thus reduce the chance of keyboard
+overruns. Lower values give a slightly faster mouse responses and
+slightly better mouse tracking.
+
+You can set the threshold in x and y separately, but usually this is
+of little practical use. If there's just one number in the option, it
+is used for both dimensions. The default value is 2 for both
+thresholds.
+
+
+4.3) ataflop=
+-------------
+
+:Syntax: ataflop=<drive type>[,<trackbuffering>[,<steprateA>[,<steprateB>]]]
+
+ The drive type may be 0, 1, or 2, for DD, HD, and ED, resp. This
+ setting affects how many buffers are reserved and which formats are
+ probed (see also below). The default is 1 (HD). Only one drive type
+ can be selected. If you have two disk drives, select the "better"
+ type.
+
+ The second parameter <trackbuffer> tells the kernel whether to use
+ track buffering (1) or not (0). The default is machine-dependent:
+ no for the Medusa and yes for all others.
+
+ With the two following parameters, you can change the default
+ steprate used for drive A and B, resp.
+
+
+4.4) atascsi=
+-------------
+
+:Syntax: atascsi=<can_queue>[,<cmd_per_lun>[,<scat-gat>[,<host-id>[,<tagged>]]]]
+
+This option sets some parameters for the Atari native SCSI driver.
+Generally, any number of arguments can be omitted from the end. And
+for each of the numbers, a negative value means "use default". The
+defaults depend on whether TT-style or Falcon-style SCSI is used.
+Below, defaults are noted as n/m, where the first value refers to
+TT-SCSI and the latter to Falcon-SCSI. If an illegal value is given
+for one parameter, an error message is printed and that one setting is
+ignored (others aren't affected).
+
+ <can_queue>:
+ This is the maximum number of SCSI commands queued internally to the
+ Atari SCSI driver. A value of 1 effectively turns off the driver
+ internal multitasking (if it causes problems). Legal values are >=
+ 1. <can_queue> can be as high as you like, but values greater than
+ <cmd_per_lun> times the number of SCSI targets (LUNs) you have
+ don't make sense. Default: 16/8.
+
+ <cmd_per_lun>:
+ Maximum number of SCSI commands issued to the driver for one
+ logical unit (LUN, usually one SCSI target). Legal values start
+ from 1. If tagged queuing (see below) is not used, values greater
+ than 2 don't make sense, but waste memory. Otherwise, the maximum
+ is the number of command tags available to the driver (currently
+ 32). Default: 8/1. (Note: Values > 1 seem to cause problems on a
+ Falcon, cause not yet known.)
+
+ The <cmd_per_lun> value at a great part determines the amount of
+ memory SCSI reserves for itself. The formula is rather
+ complicated, but I can give you some hints:
+
+ no scatter-gather:
+ cmd_per_lun * 232 bytes
+ full scatter-gather:
+ cmd_per_lun * approx. 17 Kbytes
+
+ <scat-gat>:
+ Size of the scatter-gather table, i.e. the number of requests
+ consecutive on the disk that can be merged into one SCSI command.
+ Legal values are between 0 and 255. Default: 255/0. Note: This
+ value is forced to 0 on a Falcon, since scatter-gather isn't
+ possible with the ST-DMA. Not using scatter-gather hurts
+ performance significantly.
+
+ <host-id>:
+ The SCSI ID to be used by the initiator (your Atari). This is
+ usually 7, the highest possible ID. Every ID on the SCSI bus must
+ be unique. Default: determined at run time: If the NV-RAM checksum
+ is valid, and bit 7 in byte 30 of the NV-RAM is set, the lower 3
+ bits of this byte are used as the host ID. (This method is defined
+ by Atari and also used by some TOS HD drivers.) If the above
+ isn't given, the default ID is 7. (both, TT and Falcon).
+
+ <tagged>:
+ 0 means turn off tagged queuing support, all other values > 0 mean
+ use tagged queuing for targets that support it. Default: currently
+ off, but this may change when tagged queuing handling has been
+ proved to be reliable.
+
+ Tagged queuing means that more than one command can be issued to
+ one LUN, and the SCSI device itself orders the requests so they
+ can be performed in optimal order. Not all SCSI devices support
+ tagged queuing (:-().
+
+4.5 switches=
+-------------
+
+:Syntax: switches=<list of switches>
+
+With this option you can switch some hardware lines that are often
+used to enable/disable certain hardware extensions. Examples are
+OverScan, overclocking, ...
+
+The <list of switches> is a comma-separated list of the following
+items:
+
+ ikbd:
+ set RTS of the keyboard ACIA high
+ midi:
+ set RTS of the MIDI ACIA high
+ snd6:
+ set bit 6 of the PSG port A
+ snd7:
+ set bit 6 of the PSG port A
+
+It doesn't make sense to mention a switch more than once (no
+difference to only once), but you can give as many switches as you
+want to enable different features. The switch lines are set as early
+as possible during kernel initialization (even before determining the
+present hardware.)
+
+All of the items can also be prefixed with `ov_`, i.e. `ov_ikbd`,
+`ov_midi`, ... These options are meant for switching on an OverScan
+video extension. The difference to the bare option is that the
+switch-on is done after video initialization, and somehow synchronized
+to the HBLANK. A speciality is that ov_ikbd and ov_midi are switched
+off before rebooting, so that OverScan is disabled and TOS boots
+correctly.
+
+If you give an option both, with and without the `ov_` prefix, the
+earlier initialization (`ov_`-less) takes precedence. But the
+switching-off on reset still happens in this case.
+
+5) Options for Amiga Only:
+==========================
+
+5.1) video=
+-----------
+
+:Syntax: video=<fbname>:<sub-options...>
+
+The <fbname> parameter specifies the name of the frame buffer, valid
+options are `amifb`, `cyber`, 'virge', `retz3` and `clgen`, provided
+that the respective frame buffer devices have been compiled into the
+kernel (or compiled as loadable modules). The behavior of the <fbname>
+option was changed in 2.1.57 so it is now recommended to specify this
+option.
+
+The <sub-options> is a comma-separated list of the sub-options listed
+below. This option is organized similar to the Atari version of the
+"video"-option (4.1), but knows fewer sub-options.
+
+5.1.1) video mode
+-----------------
+
+Again, similar to the video mode for the Atari (see 4.1.1). Predefined
+modes depend on the used frame buffer device.
+
+OCS, ECS and AGA machines all use the color frame buffer. The following
+predefined video modes are available:
+
+NTSC modes:
+ - ntsc : 640x200, 15 kHz, 60 Hz
+ - ntsc-lace : 640x400, 15 kHz, 60 Hz interlaced
+
+PAL modes:
+ - pal : 640x256, 15 kHz, 50 Hz
+ - pal-lace : 640x512, 15 kHz, 50 Hz interlaced
+
+ECS modes:
+ - multiscan : 640x480, 29 kHz, 57 Hz
+ - multiscan-lace : 640x960, 29 kHz, 57 Hz interlaced
+ - euro36 : 640x200, 15 kHz, 72 Hz
+ - euro36-lace : 640x400, 15 kHz, 72 Hz interlaced
+ - euro72 : 640x400, 29 kHz, 68 Hz
+ - euro72-lace : 640x800, 29 kHz, 68 Hz interlaced
+ - super72 : 800x300, 23 kHz, 70 Hz
+ - super72-lace : 800x600, 23 kHz, 70 Hz interlaced
+ - dblntsc-ff : 640x400, 27 kHz, 57 Hz
+ - dblntsc-lace : 640x800, 27 kHz, 57 Hz interlaced
+ - dblpal-ff : 640x512, 27 kHz, 47 Hz
+ - dblpal-lace : 640x1024, 27 kHz, 47 Hz interlaced
+ - dblntsc : 640x200, 27 kHz, 57 Hz doublescan
+ - dblpal : 640x256, 27 kHz, 47 Hz doublescan
+
+VGA modes:
+ - vga : 640x480, 31 kHz, 60 Hz
+ - vga70 : 640x400, 31 kHz, 70 Hz
+
+Please notice that the ECS and VGA modes require either an ECS or AGA
+chipset, and that these modes are limited to 2-bit color for the ECS
+chipset and 8-bit color for the AGA chipset.
+
+5.1.2) depth
+------------
+
+:Syntax: depth:<nr. of bit-planes>
+
+Specify the number of bit-planes for the selected video-mode.
+
+5.1.3) inverse
+--------------
+
+Use inverted display (black on white). Functionally the same as the
+"inverse" sub-option for the Atari.
+
+5.1.4) font
+-----------
+
+:Syntax: font:<fontname>
+
+Specify the font to use in text modes. Functionally the same as the
+"font" sub-option for the Atari, except that `PEARL8x8` is used instead
+of `VGA8x8` if the vertical size of the display is less than 400 pixel
+rows.
+
+5.1.5) monitorcap:
+-------------------
+
+:Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
+
+This describes the capabilities of a multisync monitor. For now, only
+the color frame buffer uses the settings of "monitorcap:".
+
+<vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
+your monitor can work with, in Hz. <hmin> and <hmax> are the same for
+the horizontal frequency, in kHz.
+
+The defaults are 50;90;15;38 (Generic Amiga multisync monitor).
+
+
+5.2) fd_def_df0=
+----------------
+
+:Syntax: fd_def_df0=<value>
+
+Sets the df0 value for "silent" floppy drives. The value should be in
+hexadecimal with "0x" prefix.
+
+
+5.3) wd33c93=
+-------------
+
+:Syntax: wd33c93=<sub-options...>
+
+These options affect the A590/A2091, A3000 and GVP Series II SCSI
+controllers.
+
+The <sub-options> is a comma-separated list of the sub-options listed
+below.
+
+5.3.1) nosync
+-------------
+
+:Syntax: nosync:bitmask
+
+bitmask is a byte where the 1st 7 bits correspond with the 7
+possible SCSI devices. Set a bit to prevent sync negotiation on that
+device. To maintain backwards compatibility, a command-line such as
+"wd33c93=255" will be automatically translated to
+"wd33c93=nosync:0xff". The default is to disable sync negotiation for
+all devices, eg. nosync:0xff.
+
+5.3.2) period
+-------------
+
+:Syntax: period:ns
+
+`ns` is the minimum # of nanoseconds in a SCSI data transfer
+period. Default is 500; acceptable values are 250 - 1000.
+
+5.3.3) disconnect
+-----------------
+
+:Syntax: disconnect:x
+
+Specify x = 0 to never allow disconnects, 2 to always allow them.
+x = 1 does 'adaptive' disconnects, which is the default and generally
+the best choice.
+
+5.3.4) debug
+------------
+
+:Syntax: debug:x
+
+If `DEBUGGING_ON` is defined, x is a bit mask that causes various
+types of debug output to printed - see the DB_xxx defines in
+wd33c93.h.
+
+5.3.5) clock
+------------
+
+:Syntax: clock:x
+
+x = clock input in MHz for WD33c93 chip. Normal values would be from
+8 through 20. The default value depends on your hostadapter(s),
+default for the A3000 internal controller is 14, for the A2091 it's 8
+and for the GVP hostadapters it's either 8 or 14, depending on the
+hostadapter and the SCSI-clock jumper present on some GVP
+hostadapters.
+
+5.3.6) next
+-----------
+
+No argument. Used to separate blocks of keywords when there's more
+than one wd33c93-based host adapter in the system.
+
+5.3.7) nodma
+------------
+
+:Syntax: nodma:x
+
+If x is 1 (or if the option is just written as "nodma"), the WD33c93
+controller will not use DMA (= direct memory access) to access the
+Amiga's memory. This is useful for some systems (like A3000's and
+A4000's with the A3640 accelerator, revision 3.0) that have problems
+using DMA to chip memory. The default is 0, i.e. to use DMA if
+possible.
+
+
+5.4) gvp11=
+-----------
+
+:Syntax: gvp11=<addr-mask>
+
+The earlier versions of the GVP driver did not handle DMA
+address-mask settings correctly which made it necessary for some
+people to use this option, in order to get their GVP controller
+running under Linux. These problems have hopefully been solved and the
+use of this option is now highly unrecommended!
+
+Incorrect use can lead to unpredictable behavior, so please only use
+this option if you *know* what you are doing and have a reason to do
+so. In any case if you experience problems and need to use this
+option, please inform us about it by mailing to the Linux/68k kernel
+mailing list.
+
+The address mask set by this option specifies which addresses are
+valid for DMA with the GVP Series II SCSI controller. An address is
+valid, if no bits are set except the bits that are set in the mask,
+too.
+
+Some versions of the GVP can only DMA into a 24 bit address range,
+some can address a 25 bit address range while others can use the whole
+32 bit address range for DMA. The correct setting depends on your
+controller and should be autodetected by the driver. An example is the
+24 bit region which is specified by a mask of 0x00fffffe.
diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt
deleted file mode 100644
index 79d21246c75a..000000000000
--- a/Documentation/m68k/kernel-options.txt
+++ /dev/null
@@ -1,884 +0,0 @@
-
-
- Command Line Options for Linux/m68k
- ===================================
-
-Last Update: 2 May 1999
-Linux/m68k version: 2.2.6
-Author: Roman.Hodek@informatik.uni-erlangen.de (Roman Hodek)
-Update: jds@kom.auc.dk (Jes Sorensen) and faq@linux-m68k.org (Chris Lawrence)
-
-0) Introduction
-===============
-
- Often I've been asked which command line options the Linux/m68k
-kernel understands, or how the exact syntax for the ... option is, or
-... about the option ... . I hope, this document supplies all the
-answers...
-
- Note that some options might be outdated, their descriptions being
-incomplete or missing. Please update the information and send in the
-patches.
-
-
-1) Overview of the Kernel's Option Processing
-=============================================
-
-The kernel knows three kinds of options on its command line:
-
- 1) kernel options
- 2) environment settings
- 3) arguments for init
-
-To which of these classes an argument belongs is determined as
-follows: If the option is known to the kernel itself, i.e. if the name
-(the part before the '=') or, in some cases, the whole argument string
-is known to the kernel, it belongs to class 1. Otherwise, if the
-argument contains an '=', it is of class 2, and the definition is put
-into init's environment. All other arguments are passed to init as
-command line options.
-
- This document describes the valid kernel options for Linux/m68k in
-the version mentioned at the start of this file. Later revisions may
-add new such options, and some may be missing in older versions.
-
- In general, the value (the part after the '=') of an option is a
-list of values separated by commas. The interpretation of these values
-is up to the driver that "owns" the option. This association of
-options with drivers is also the reason that some are further
-subdivided.
-
-
-2) General Kernel Options
-=========================
-
-2.1) root=
-----------
-
-Syntax: root=/dev/<device>
- or: root=<hex_number>
-
-This tells the kernel which device it should mount as the root
-filesystem. The device must be a block device with a valid filesystem
-on it.
-
- The first syntax gives the device by name. These names are converted
-into a major/minor number internally in the kernel in an unusual way.
-Normally, this "conversion" is done by the device files in /dev, but
-this isn't possible here, because the root filesystem (with /dev)
-isn't mounted yet... So the kernel parses the name itself, with some
-hardcoded name to number mappings. The name must always be a
-combination of two or three letters, followed by a decimal number.
-Valid names are:
-
- /dev/ram: -> 0x0100 (initial ramdisk)
- /dev/hda: -> 0x0300 (first IDE disk)
- /dev/hdb: -> 0x0340 (second IDE disk)
- /dev/sda: -> 0x0800 (first SCSI disk)
- /dev/sdb: -> 0x0810 (second SCSI disk)
- /dev/sdc: -> 0x0820 (third SCSI disk)
- /dev/sdd: -> 0x0830 (forth SCSI disk)
- /dev/sde: -> 0x0840 (fifth SCSI disk)
- /dev/fd : -> 0x0200 (floppy disk)
-
- The name must be followed by a decimal number, that stands for the
-partition number. Internally, the value of the number is just
-added to the device number mentioned in the table above. The
-exceptions are /dev/ram and /dev/fd, where /dev/ram refers to an
-initial ramdisk loaded by your bootstrap program (please consult the
-instructions for your bootstrap program to find out how to load an
-initial ramdisk). As of kernel version 2.0.18 you must specify
-/dev/ram as the root device if you want to boot from an initial
-ramdisk. For the floppy devices, /dev/fd, the number stands for the
-floppy drive number (there are no partitions on floppy disks). I.e.,
-/dev/fd0 stands for the first drive, /dev/fd1 for the second, and so
-on. Since the number is just added, you can also force the disk format
-by adding a number greater than 3. If you look into your /dev
-directory, use can see the /dev/fd0D720 has major 2 and minor 16. You
-can specify this device for the root FS by writing "root=/dev/fd16" on
-the kernel command line.
-
-[Strange and maybe uninteresting stuff ON]
-
- This unusual translation of device names has some strange
-consequences: If, for example, you have a symbolic link from /dev/fd
-to /dev/fd0D720 as an abbreviation for floppy driver #0 in DD format,
-you cannot use this name for specifying the root device, because the
-kernel cannot see this symlink before mounting the root FS and it
-isn't in the table above. If you use it, the root device will not be
-set at all, without an error message. Another example: You cannot use a
-partition on e.g. the sixth SCSI disk as the root filesystem, if you
-want to specify it by name. This is, because only the devices up to
-/dev/sde are in the table above, but not /dev/sdf. Although, you can
-use the sixth SCSI disk for the root FS, but you have to specify the
-device by number... (see below). Or, even more strange, you can use the
-fact that there is no range checking of the partition number, and your
-knowledge that each disk uses 16 minors, and write "root=/dev/sde17"
-(for /dev/sdf1).
-
-[Strange and maybe uninteresting stuff OFF]
-
- If the device containing your root partition isn't in the table
-above, you can also specify it by major and minor numbers. These are
-written in hex, with no prefix and no separator between. E.g., if you
-have a CD with contents appropriate as a root filesystem in the first
-SCSI CD-ROM drive, you boot from it by "root=0b00". Here, hex "0b" =
-decimal 11 is the major of SCSI CD-ROMs, and the minor 0 stands for
-the first of these. You can find out all valid major numbers by
-looking into include/linux/major.h.
-
-In addition to major and minor numbers, if the device containing your
-root partition uses a partition table format with unique partition
-identifiers, then you may use them. For instance,
-"root=PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF". It is also
-possible to reference another partition on the same device using a
-known partition UUID as the starting point. For example,
-if partition 5 of the device has the UUID of
-00112233-4455-6677-8899-AABBCCDDEEFF then partition 3 may be found as
-follows:
- PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF/PARTNROFF=-2
-
-Authoritative information can be found in
-"Documentation/admin-guide/kernel-parameters.rst".
-
-
-2.2) ro, rw
------------
-
-Syntax: ro
- or: rw
-
-These two options tell the kernel whether it should mount the root
-filesystem read-only or read-write. The default is read-only, except
-for ramdisks, which default to read-write.
-
-
-2.3) debug
-----------
-
-Syntax: debug
-
-This raises the kernel log level to 10 (the default is 7). This is the
-same level as set by the "dmesg" command, just that the maximum level
-selectable by dmesg is 8.
-
-
-2.4) debug=
------------
-
-Syntax: debug=<device>
-
-This option causes certain kernel messages be printed to the selected
-debugging device. This can aid debugging the kernel, since the
-messages can be captured and analyzed on some other machine. Which
-devices are possible depends on the machine type. There are no checks
-for the validity of the device name. If the device isn't implemented,
-nothing happens.
-
- Messages logged this way are in general stack dumps after kernel
-memory faults or bad kernel traps, and kernel panics. To be exact: all
-messages of level 0 (panic messages) and all messages printed while
-the log level is 8 or more (their level doesn't matter). Before stack
-dumps, the kernel sets the log level to 10 automatically. A level of
-at least 8 can also be set by the "debug" command line option (see
-2.3) and at run time with "dmesg -n 8".
-
-Devices possible for Amiga:
-
- - "ser": built-in serial port; parameters: 9600bps, 8N1
- - "mem": Save the messages to a reserved area in chip mem. After
- rebooting, they can be read under AmigaOS with the tool
- 'dmesg'.
-
-Devices possible for Atari:
-
- - "ser1": ST-MFP serial port ("Modem1"); parameters: 9600bps, 8N1
- - "ser2": SCC channel B serial port ("Modem2"); parameters: 9600bps, 8N1
- - "ser" : default serial port
- This is "ser2" for a Falcon, and "ser1" for any other machine
- - "midi": The MIDI port; parameters: 31250bps, 8N1
- - "par" : parallel port
- The printing routine for this implements a timeout for the
- case there's no printer connected (else the kernel would
- lock up). The timeout is not exact, but usually a few
- seconds.
-
-
-2.6) ramdisk_size=
--------------
-
-Syntax: ramdisk_size=<size>
-
- This option instructs the kernel to set up a ramdisk of the given
-size in KBytes. Do not use this option if the ramdisk contents are
-passed by bootstrap! In this case, the size is selected automatically
-and should not be overwritten.
-
- The only application is for root filesystems on floppy disks, that
-should be loaded into memory. To do that, select the corresponding
-size of the disk as ramdisk size, and set the root device to the disk
-drive (with "root=").
-
-
-2.7) swap=
-2.8) buff=
------------
-
- I can't find any sign of these options in 2.2.6.
-
-
-3) General Device Options (Amiga and Atari)
-===========================================
-
-3.1) ether=
------------
-
-Syntax: ether=[<irq>[,<base_addr>[,<mem_start>[,<mem_end>]]]],<dev-name>
-
- <dev-name> is the name of a net driver, as specified in
-drivers/net/Space.c in the Linux source. Most prominent are eth0, ...
-eth3, sl0, ... sl3, ppp0, ..., ppp3, dummy, and lo.
-
- The non-ethernet drivers (sl, ppp, dummy, lo) obviously ignore the
-settings by this options. Also, the existing ethernet drivers for
-Linux/m68k (ariadne, a2065, hydra) don't use them because Zorro boards
-are really Plug-'n-Play, so the "ether=" option is useless altogether
-for Linux/m68k.
-
-
-3.2) hd=
---------
-
-Syntax: hd=<cylinders>,<heads>,<sectors>
-
- This option sets the disk geometry of an IDE disk. The first hd=
-option is for the first IDE disk, the second for the second one.
-(I.e., you can give this option twice.) In most cases, you won't have
-to use this option, since the kernel can obtain the geometry data
-itself. It exists just for the case that this fails for one of your
-disks.
-
-
-3.3) max_scsi_luns=
--------------------
-
-Syntax: max_scsi_luns=<n>
-
- Sets the maximum number of LUNs (logical units) of SCSI devices to
-be scanned. Valid values for <n> are between 1 and 8. Default is 8 if
-"Probe all LUNs on each SCSI device" was selected during the kernel
-configuration, else 1.
-
-
-3.4) st=
---------
-
-Syntax: st=<buffer_size>,[<write_thres>,[<max_buffers>]]
-
- Sets several parameters of the SCSI tape driver. <buffer_size> is
-the number of 512-byte buffers reserved for tape operations for each
-device. <write_thres> sets the number of blocks which must be filled
-to start an actual write operation to the tape. Maximum value is the
-total number of buffers. <max_buffer> limits the total number of
-buffers allocated for all tape devices.
-
-
-3.5) dmasound=
---------------
-
-Syntax: dmasound=[<buffers>,<buffer-size>[,<catch-radius>]]
-
- This option controls some configurations of the Linux/m68k DMA sound
-driver (Amiga and Atari): <buffers> is the number of buffers you want
-to use (minimum 4, default 4), <buffer-size> is the size of each
-buffer in kilobytes (minimum 4, default 32) and <catch-radius> says
-how much percent of error will be tolerated when setting a frequency
-(maximum 10, default 0). For example with 3% you can play 8000Hz
-AU-Files on the Falcon with its hardware frequency of 8195Hz and thus
-don't need to expand the sound.
-
-
-
-4) Options for Atari Only
-=========================
-
-4.1) video=
------------
-
-Syntax: video=<fbname>:<sub-options...>
-
-The <fbname> parameter specifies the name of the frame buffer,
-eg. most atari users will want to specify `atafb' here. The
-<sub-options> is a comma-separated list of the sub-options listed
-below.
-
-NB: Please notice that this option was renamed from `atavideo' to
- `video' during the development of the 1.3.x kernels, thus you
- might need to update your boot-scripts if upgrading to 2.x from
- an 1.2.x kernel.
-
-NBB: The behavior of video= was changed in 2.1.57 so the recommended
-option is to specify the name of the frame buffer.
-
-4.1.1) Video Mode
------------------
-
-This sub-option may be any of the predefined video modes, as listed
-in atari/atafb.c in the Linux/m68k source tree. The kernel will
-activate the given video mode at boot time and make it the default
-mode, if the hardware allows. Currently defined names are:
-
- - stlow : 320x200x4
- - stmid, default5 : 640x200x2
- - sthigh, default4: 640x400x1
- - ttlow : 320x480x8, TT only
- - ttmid, default1 : 640x480x4, TT only
- - tthigh, default2: 1280x960x1, TT only
- - vga2 : 640x480x1, Falcon only
- - vga4 : 640x480x2, Falcon only
- - vga16, default3 : 640x480x4, Falcon only
- - vga256 : 640x480x8, Falcon only
- - falh2 : 896x608x1, Falcon only
- - falh16 : 896x608x4, Falcon only
-
- If no video mode is given on the command line, the kernel tries the
-modes names "default<n>" in turn, until one is possible with the
-hardware in use.
-
- A video mode setting doesn't make sense, if the external driver is
-activated by a "external:" sub-option.
-
-4.1.2) inverse
---------------
-
-Invert the display. This affects both, text (consoles) and graphics
-(X) display. Usually, the background is chosen to be black. With this
-option, you can make the background white.
-
-4.1.3) font
------------
-
-Syntax: font:<fontname>
-
-Specify the font to use in text modes. Currently you can choose only
-between `VGA8x8', `VGA8x16' and `PEARL8x8'. `VGA8x8' is default, if the
-vertical size of the display is less than 400 pixel rows. Otherwise, the
-`VGA8x16' font is the default.
-
-4.1.4) hwscroll_
-----------------
-
-Syntax: hwscroll_<n>
-
-The number of additional lines of video memory to reserve for
-speeding up the scrolling ("hardware scrolling"). Hardware scrolling
-is possible only if the kernel can set the video base address in steps
-fine enough. This is true for STE, MegaSTE, TT, and Falcon. It is not
-possible with plain STs and graphics cards (The former because the
-base address must be on a 256 byte boundary there, the latter because
-the kernel doesn't know how to set the base address at all.)
-
- By default, <n> is set to the number of visible text lines on the
-display. Thus, the amount of video memory is doubled, compared to no
-hardware scrolling. You can turn off the hardware scrolling altogether
-by setting <n> to 0.
-
-4.1.5) internal:
-----------------
-
-Syntax: internal:<xres>;<yres>[;<xres_max>;<yres_max>;<offset>]
-
-This option specifies the capabilities of some extended internal video
-hardware, like e.g. OverScan. <xres> and <yres> give the (extended)
-dimensions of the screen.
-
- If your OverScan needs a black border, you have to write the last
-three arguments of the "internal:". <xres_max> is the maximum line
-length the hardware allows, <yres_max> the maximum number of lines.
-<offset> is the offset of the visible part of the screen memory to its
-physical start, in bytes.
-
- Often, extended interval video hardware has to be activated somehow.
-For this, see the "sw_*" options below.
-
-4.1.6) external:
-----------------
-
-Syntax:
- external:<xres>;<yres>;<depth>;<org>;<scrmem>[;<scrlen>[;<vgabase>\
- [;<colw>[;<coltype>[;<xres_virtual>]]]]]
-
-[I had to break this line...]
-
- This is probably the most complicated parameter... It specifies that
-you have some external video hardware (a graphics board), and how to
-use it under Linux/m68k. The kernel cannot know more about the hardware
-than you tell it here! The kernel also is unable to set or change any
-video modes, since it doesn't know about any board internal. So, you
-have to switch to that video mode before you start Linux, and cannot
-switch to another mode once Linux has started.
-
- The first 3 parameters of this sub-option should be obvious: <xres>,
-<yres> and <depth> give the dimensions of the screen and the number of
-planes (depth). The depth is the logarithm to base 2 of the number
-of colors possible. (Or, the other way round: The number of colors is
-2^depth).
-
- You have to tell the kernel furthermore how the video memory is
-organized. This is done by a letter as <org> parameter:
-
- 'n': "normal planes", i.e. one whole plane after another
- 'i': "interleaved planes", i.e. 16 bit of the first plane, than 16 bit
- of the next, and so on... This mode is used only with the
- built-in Atari video modes, I think there is no card that
- supports this mode.
- 'p': "packed pixels", i.e. <depth> consecutive bits stand for all
- planes of one pixel; this is the most common mode for 8 planes
- (256 colors) on graphic cards
- 't': "true color" (more or less packed pixels, but without a color
- lookup table); usually depth is 24
-
-For monochrome modes (i.e., <depth> is 1), the <org> letter has a
-different meaning:
-
- 'n': normal colors, i.e. 0=white, 1=black
- 'i': inverted colors, i.e. 0=black, 1=white
-
- The next important information about the video hardware is the base
-address of the video memory. That is given in the <scrmem> parameter,
-as a hexadecimal number with a "0x" prefix. You have to find out this
-address in the documentation of your hardware.
-
- The next parameter, <scrlen>, tells the kernel about the size of the
-video memory. If it's missing, the size is calculated from <xres>,
-<yres>, and <depth>. For now, it is not useful to write a value here.
-It would be used only for hardware scrolling (which isn't possible
-with the external driver, because the kernel cannot set the video base
-address), or for virtual resolutions under X (which the X server
-doesn't support yet). So, it's currently best to leave this field
-empty, either by ending the "external:" after the video address or by
-writing two consecutive semicolons, if you want to give a <vgabase>
-(it is allowed to leave this parameter empty).
-
- The <vgabase> parameter is optional. If it is not given, the kernel
-cannot read or write any color registers of the video hardware, and
-thus you have to set appropriate colors before you start Linux. But if
-your card is somehow VGA compatible, you can tell the kernel the base
-address of the VGA register set, so it can change the color lookup
-table. You have to look up this address in your board's documentation.
-To avoid misunderstandings: <vgabase> is the _base_ address, i.e. a 4k
-aligned address. For read/writing the color registers, the kernel
-uses the addresses vgabase+0x3c7...vgabase+0x3c9. The <vgabase>
-parameter is written in hexadecimal with a "0x" prefix, just as
-<scrmem>.
-
- <colw> is meaningful only if <vgabase> is specified. It tells the
-kernel how wide each of the color register is, i.e. the number of bits
-per single color (red/green/blue). Default is 6, another quite usual
-value is 8.
-
- Also <coltype> is used together with <vgabase>. It tells the kernel
-about the color register model of your gfx board. Currently, the types
-"vga" (which is also the default) and "mv300" (SANG MV300) are
-implemented.
-
- Parameter <xres_virtual> is required for ProMST or ET4000 cards where
-the physical linelength differs from the visible length. With ProMST,
-xres_virtual must be set to 2048. For ET4000, xres_virtual depends on the
-initialisation of the video-card.
-If you're missing a corresponding yres_virtual: the external part is legacy,
-therefore we don't support hardware-dependent functions like hardware-scroll,
-panning or blanking.
-
-4.1.7) eclock:
---------------
-
-The external pixel clock attached to the Falcon VIDEL shifter. This
-currently works only with the ScreenWonder!
-
-4.1.8) monitorcap:
--------------------
-
-Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
-
-This describes the capabilities of a multisync monitor. Don't use it
-with a fixed-frequency monitor! For now, only the Falcon frame buffer
-uses the settings of "monitorcap:".
-
- <vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
-your monitor can work with, in Hz. <hmin> and <hmax> are the same for
-the horizontal frequency, in kHz.
-
- The defaults are 58;62;31;32 (VGA compatible).
-
- The defaults for TV/SC1224/SC1435 cover both PAL and NTSC standards.
-
-4.1.9) keep
-------------
-
-If this option is given, the framebuffer device doesn't do any video
-mode calculations and settings on its own. The only Atari fb device
-that does this currently is the Falcon.
-
- What you reach with this: Settings for unknown video extensions
-aren't overridden by the driver, so you can still use the mode found
-when booting, when the driver doesn't know to set this mode itself.
-But this also means, that you can't switch video modes anymore...
-
- An example where you may want to use "keep" is the ScreenBlaster for
-the Falcon.
-
-
-4.2) atamouse=
---------------
-
-Syntax: atamouse=<x-threshold>,[<y-threshold>]
-
- With this option, you can set the mouse movement reporting threshold.
-This is the number of pixels of mouse movement that have to accumulate
-before the IKBD sends a new mouse packet to the kernel. Higher values
-reduce the mouse interrupt load and thus reduce the chance of keyboard
-overruns. Lower values give a slightly faster mouse responses and
-slightly better mouse tracking.
-
- You can set the threshold in x and y separately, but usually this is
-of little practical use. If there's just one number in the option, it
-is used for both dimensions. The default value is 2 for both
-thresholds.
-
-
-4.3) ataflop=
--------------
-
-Syntax: ataflop=<drive type>[,<trackbuffering>[,<steprateA>[,<steprateB>]]]
-
- The drive type may be 0, 1, or 2, for DD, HD, and ED, resp. This
- setting affects how many buffers are reserved and which formats are
- probed (see also below). The default is 1 (HD). Only one drive type
- can be selected. If you have two disk drives, select the "better"
- type.
-
- The second parameter <trackbuffer> tells the kernel whether to use
- track buffering (1) or not (0). The default is machine-dependent:
- no for the Medusa and yes for all others.
-
- With the two following parameters, you can change the default
- steprate used for drive A and B, resp.
-
-
-4.4) atascsi=
--------------
-
-Syntax: atascsi=<can_queue>[,<cmd_per_lun>[,<scat-gat>[,<host-id>[,<tagged>]]]]
-
- This option sets some parameters for the Atari native SCSI driver.
-Generally, any number of arguments can be omitted from the end. And
-for each of the numbers, a negative value means "use default". The
-defaults depend on whether TT-style or Falcon-style SCSI is used.
-Below, defaults are noted as n/m, where the first value refers to
-TT-SCSI and the latter to Falcon-SCSI. If an illegal value is given
-for one parameter, an error message is printed and that one setting is
-ignored (others aren't affected).
-
- <can_queue>:
- This is the maximum number of SCSI commands queued internally to the
- Atari SCSI driver. A value of 1 effectively turns off the driver
- internal multitasking (if it causes problems). Legal values are >=
- 1. <can_queue> can be as high as you like, but values greater than
- <cmd_per_lun> times the number of SCSI targets (LUNs) you have
- don't make sense. Default: 16/8.
-
- <cmd_per_lun>:
- Maximum number of SCSI commands issued to the driver for one
- logical unit (LUN, usually one SCSI target). Legal values start
- from 1. If tagged queuing (see below) is not used, values greater
- than 2 don't make sense, but waste memory. Otherwise, the maximum
- is the number of command tags available to the driver (currently
- 32). Default: 8/1. (Note: Values > 1 seem to cause problems on a
- Falcon, cause not yet known.)
-
- The <cmd_per_lun> value at a great part determines the amount of
- memory SCSI reserves for itself. The formula is rather
- complicated, but I can give you some hints:
- no scatter-gather : cmd_per_lun * 232 bytes
- full scatter-gather: cmd_per_lun * approx. 17 Kbytes
-
- <scat-gat>:
- Size of the scatter-gather table, i.e. the number of requests
- consecutive on the disk that can be merged into one SCSI command.
- Legal values are between 0 and 255. Default: 255/0. Note: This
- value is forced to 0 on a Falcon, since scatter-gather isn't
- possible with the ST-DMA. Not using scatter-gather hurts
- performance significantly.
-
- <host-id>:
- The SCSI ID to be used by the initiator (your Atari). This is
- usually 7, the highest possible ID. Every ID on the SCSI bus must
- be unique. Default: determined at run time: If the NV-RAM checksum
- is valid, and bit 7 in byte 30 of the NV-RAM is set, the lower 3
- bits of this byte are used as the host ID. (This method is defined
- by Atari and also used by some TOS HD drivers.) If the above
- isn't given, the default ID is 7. (both, TT and Falcon).
-
- <tagged>:
- 0 means turn off tagged queuing support, all other values > 0 mean
- use tagged queuing for targets that support it. Default: currently
- off, but this may change when tagged queuing handling has been
- proved to be reliable.
-
- Tagged queuing means that more than one command can be issued to
- one LUN, and the SCSI device itself orders the requests so they
- can be performed in optimal order. Not all SCSI devices support
- tagged queuing (:-().
-
-4.5 switches=
--------------
-
-Syntax: switches=<list of switches>
-
- With this option you can switch some hardware lines that are often
-used to enable/disable certain hardware extensions. Examples are
-OverScan, overclocking, ...
-
- The <list of switches> is a comma-separated list of the following
-items:
-
- ikbd: set RTS of the keyboard ACIA high
- midi: set RTS of the MIDI ACIA high
- snd6: set bit 6 of the PSG port A
- snd7: set bit 6 of the PSG port A
-
-It doesn't make sense to mention a switch more than once (no
-difference to only once), but you can give as many switches as you
-want to enable different features. The switch lines are set as early
-as possible during kernel initialization (even before determining the
-present hardware.)
-
- All of the items can also be prefixed with "ov_", i.e. "ov_ikbd",
-"ov_midi", ... These options are meant for switching on an OverScan
-video extension. The difference to the bare option is that the
-switch-on is done after video initialization, and somehow synchronized
-to the HBLANK. A speciality is that ov_ikbd and ov_midi are switched
-off before rebooting, so that OverScan is disabled and TOS boots
-correctly.
-
- If you give an option both, with and without the "ov_" prefix, the
-earlier initialization ("ov_"-less) takes precedence. But the
-switching-off on reset still happens in this case.
-
-5) Options for Amiga Only:
-==========================
-
-5.1) video=
------------
-
-Syntax: video=<fbname>:<sub-options...>
-
-The <fbname> parameter specifies the name of the frame buffer, valid
-options are `amifb', `cyber', 'virge', `retz3' and `clgen', provided
-that the respective frame buffer devices have been compiled into the
-kernel (or compiled as loadable modules). The behavior of the <fbname>
-option was changed in 2.1.57 so it is now recommended to specify this
-option.
-
-The <sub-options> is a comma-separated list of the sub-options listed
-below. This option is organized similar to the Atari version of the
-"video"-option (4.1), but knows fewer sub-options.
-
-5.1.1) video mode
------------------
-
-Again, similar to the video mode for the Atari (see 4.1.1). Predefined
-modes depend on the used frame buffer device.
-
-OCS, ECS and AGA machines all use the color frame buffer. The following
-predefined video modes are available:
-
-NTSC modes:
- - ntsc : 640x200, 15 kHz, 60 Hz
- - ntsc-lace : 640x400, 15 kHz, 60 Hz interlaced
-PAL modes:
- - pal : 640x256, 15 kHz, 50 Hz
- - pal-lace : 640x512, 15 kHz, 50 Hz interlaced
-ECS modes:
- - multiscan : 640x480, 29 kHz, 57 Hz
- - multiscan-lace : 640x960, 29 kHz, 57 Hz interlaced
- - euro36 : 640x200, 15 kHz, 72 Hz
- - euro36-lace : 640x400, 15 kHz, 72 Hz interlaced
- - euro72 : 640x400, 29 kHz, 68 Hz
- - euro72-lace : 640x800, 29 kHz, 68 Hz interlaced
- - super72 : 800x300, 23 kHz, 70 Hz
- - super72-lace : 800x600, 23 kHz, 70 Hz interlaced
- - dblntsc-ff : 640x400, 27 kHz, 57 Hz
- - dblntsc-lace : 640x800, 27 kHz, 57 Hz interlaced
- - dblpal-ff : 640x512, 27 kHz, 47 Hz
- - dblpal-lace : 640x1024, 27 kHz, 47 Hz interlaced
- - dblntsc : 640x200, 27 kHz, 57 Hz doublescan
- - dblpal : 640x256, 27 kHz, 47 Hz doublescan
-VGA modes:
- - vga : 640x480, 31 kHz, 60 Hz
- - vga70 : 640x400, 31 kHz, 70 Hz
-
-Please notice that the ECS and VGA modes require either an ECS or AGA
-chipset, and that these modes are limited to 2-bit color for the ECS
-chipset and 8-bit color for the AGA chipset.
-
-5.1.2) depth
-------------
-
-Syntax: depth:<nr. of bit-planes>
-
-Specify the number of bit-planes for the selected video-mode.
-
-5.1.3) inverse
---------------
-
-Use inverted display (black on white). Functionally the same as the
-"inverse" sub-option for the Atari.
-
-5.1.4) font
------------
-
-Syntax: font:<fontname>
-
-Specify the font to use in text modes. Functionally the same as the
-"font" sub-option for the Atari, except that `PEARL8x8' is used instead
-of `VGA8x8' if the vertical size of the display is less than 400 pixel
-rows.
-
-5.1.5) monitorcap:
--------------------
-
-Syntax: monitorcap:<vmin>;<vmax>;<hmin>;<hmax>
-
-This describes the capabilities of a multisync monitor. For now, only
-the color frame buffer uses the settings of "monitorcap:".
-
- <vmin> and <vmax> are the minimum and maximum, resp., vertical frequencies
-your monitor can work with, in Hz. <hmin> and <hmax> are the same for
-the horizontal frequency, in kHz.
-
- The defaults are 50;90;15;38 (Generic Amiga multisync monitor).
-
-
-5.2) fd_def_df0=
-----------------
-
-Syntax: fd_def_df0=<value>
-
-Sets the df0 value for "silent" floppy drives. The value should be in
-hexadecimal with "0x" prefix.
-
-
-5.3) wd33c93=
--------------
-
-Syntax: wd33c93=<sub-options...>
-
-These options affect the A590/A2091, A3000 and GVP Series II SCSI
-controllers.
-
-The <sub-options> is a comma-separated list of the sub-options listed
-below.
-
-5.3.1) nosync
--------------
-
-Syntax: nosync:bitmask
-
- bitmask is a byte where the 1st 7 bits correspond with the 7
-possible SCSI devices. Set a bit to prevent sync negotiation on that
-device. To maintain backwards compatibility, a command-line such as
-"wd33c93=255" will be automatically translated to
-"wd33c93=nosync:0xff". The default is to disable sync negotiation for
-all devices, eg. nosync:0xff.
-
-5.3.2) period
--------------
-
-Syntax: period:ns
-
- `ns' is the minimum # of nanoseconds in a SCSI data transfer
-period. Default is 500; acceptable values are 250 - 1000.
-
-5.3.3) disconnect
------------------
-
-Syntax: disconnect:x
-
- Specify x = 0 to never allow disconnects, 2 to always allow them.
-x = 1 does 'adaptive' disconnects, which is the default and generally
-the best choice.
-
-5.3.4) debug
-------------
-
-Syntax: debug:x
-
- If `DEBUGGING_ON' is defined, x is a bit mask that causes various
-types of debug output to printed - see the DB_xxx defines in
-wd33c93.h.
-
-5.3.5) clock
-------------
-
-Syntax: clock:x
-
- x = clock input in MHz for WD33c93 chip. Normal values would be from
-8 through 20. The default value depends on your hostadapter(s),
-default for the A3000 internal controller is 14, for the A2091 it's 8
-and for the GVP hostadapters it's either 8 or 14, depending on the
-hostadapter and the SCSI-clock jumper present on some GVP
-hostadapters.
-
-5.3.6) next
------------
-
- No argument. Used to separate blocks of keywords when there's more
-than one wd33c93-based host adapter in the system.
-
-5.3.7) nodma
-------------
-
-Syntax: nodma:x
-
- If x is 1 (or if the option is just written as "nodma"), the WD33c93
-controller will not use DMA (= direct memory access) to access the
-Amiga's memory. This is useful for some systems (like A3000's and
-A4000's with the A3640 accelerator, revision 3.0) that have problems
-using DMA to chip memory. The default is 0, i.e. to use DMA if
-possible.
-
-
-5.4) gvp11=
------------
-
-Syntax: gvp11=<addr-mask>
-
- The earlier versions of the GVP driver did not handle DMA
-address-mask settings correctly which made it necessary for some
-people to use this option, in order to get their GVP controller
-running under Linux. These problems have hopefully been solved and the
-use of this option is now highly unrecommended!
-
- Incorrect use can lead to unpredictable behavior, so please only use
-this option if you *know* what you are doing and have a reason to do
-so. In any case if you experience problems and need to use this
-option, please inform us about it by mailing to the Linux/68k kernel
-mailing list.
-
- The address mask set by this option specifies which addresses are
-valid for DMA with the GVP Series II SCSI controller. An address is
-valid, if no bits are set except the bits that are set in the mask,
-too.
-
- Some versions of the GVP can only DMA into a 24 bit address range,
-some can address a 25 bit address range while others can use the whole
-32 bit address range for DMA. The correct setting depends on your
-controller and should be autodetected by the driver. An example is the
-24 bit region which is specified by a mask of 0x00fffffe.
-
-
-/* Local Variables: */
-/* mode: text */
-/* End: */
diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt
deleted file mode 100644
index e1055f105cf5..000000000000
--- a/Documentation/md/md-cluster.txt
+++ /dev/null
@@ -1,325 +0,0 @@
-The cluster MD is a shared-device RAID for a cluster, it supports
-two levels: raid1 and raid10 (limited support).
-
-
-1. On-disk format
-
-Separate write-intent-bitmaps are used for each cluster node.
-The bitmaps record all writes that may have been started on that node,
-and may not yet have finished. The on-disk layout is:
-
-0 4k 8k 12k
--------------------------------------------------------------------
-| idle | md super | bm super [0] + bits |
-| bm bits[0, contd] | bm super[1] + bits | bm bits[1, contd] |
-| bm super[2] + bits | bm bits [2, contd] | bm super[3] + bits |
-| bm bits [3, contd] | | |
-
-During "normal" functioning we assume the filesystem ensures that only
-one node writes to any given block at a time, so a write request will
-
- - set the appropriate bit (if not already set)
- - commit the write to all mirrors
- - schedule the bit to be cleared after a timeout.
-
-Reads are just handled normally. It is up to the filesystem to ensure
-one node doesn't read from a location where another node (or the same
-node) is writing.
-
-
-2. DLM Locks for management
-
-There are three groups of locks for managing the device:
-
-2.1 Bitmap lock resource (bm_lockres)
-
- The bm_lockres protects individual node bitmaps. They are named in
- the form bitmap000 for node 1, bitmap001 for node 2 and so on. When a
- node joins the cluster, it acquires the lock in PW mode and it stays
- so during the lifetime the node is part of the cluster. The lock
- resource number is based on the slot number returned by the DLM
- subsystem. Since DLM starts node count from one and bitmap slots
- start from zero, one is subtracted from the DLM slot number to arrive
- at the bitmap slot number.
-
- The LVB of the bitmap lock for a particular node records the range
- of sectors that are being re-synced by that node. No other
- node may write to those sectors. This is used when a new nodes
- joins the cluster.
-
-2.2 Message passing locks
-
- Each node has to communicate with other nodes when starting or ending
- resync, and for metadata superblock updates. This communication is
- managed through three locks: "token", "message", and "ack", together
- with the Lock Value Block (LVB) of one of the "message" lock.
-
-2.3 new-device management
-
- A single lock: "no-new-dev" is used to co-ordinate the addition of
- new devices - this must be synchronized across the array.
- Normally all nodes hold a concurrent-read lock on this device.
-
-3. Communication
-
- Messages can be broadcast to all nodes, and the sender waits for all
- other nodes to acknowledge the message before proceeding. Only one
- message can be processed at a time.
-
-3.1 Message Types
-
- There are six types of messages which are passed:
-
- 3.1.1 METADATA_UPDATED: informs other nodes that the metadata has
- been updated, and the node must re-read the md superblock. This is
- performed synchronously. It is primarily used to signal device
- failure.
-
- 3.1.2 RESYNCING: informs other nodes that a resync is initiated or
- ended so that each node may suspend or resume the region. Each
- RESYNCING message identifies a range of the devices that the
- sending node is about to resync. This overrides any previous
- notification from that node: only one ranged can be resynced at a
- time per-node.
-
- 3.1.3 NEWDISK: informs other nodes that a device is being added to
- the array. Message contains an identifier for that device. See
- below for further details.
-
- 3.1.4 REMOVE: A failed or spare device is being removed from the
- array. The slot-number of the device is included in the message.
-
- 3.1.5 RE_ADD: A failed device is being re-activated - the assumption
- is that it has been determined to be working again.
-
- 3.1.6 BITMAP_NEEDS_SYNC: if a node is stopped locally but the bitmap
- isn't clean, then another node is informed to take the ownership of
- resync.
-
-3.2 Communication mechanism
-
- The DLM LVB is used to communicate within nodes of the cluster. There
- are three resources used for the purpose:
-
- 3.2.1 token: The resource which protects the entire communication
- system. The node having the token resource is allowed to
- communicate.
-
- 3.2.2 message: The lock resource which carries the data to
- communicate.
-
- 3.2.3 ack: The resource, acquiring which means the message has been
- acknowledged by all nodes in the cluster. The BAST of the resource
- is used to inform the receiving node that a node wants to
- communicate.
-
-The algorithm is:
-
- 1. receive status - all nodes have concurrent-reader lock on "ack".
-
- sender receiver receiver
- "ack":CR "ack":CR "ack":CR
-
- 2. sender get EX on "token"
- sender get EX on "message"
- sender receiver receiver
- "token":EX "ack":CR "ack":CR
- "message":EX
- "ack":CR
-
- Sender checks that it still needs to send a message. Messages
- received or other events that happened while waiting for the
- "token" may have made this message inappropriate or redundant.
-
- 3. sender writes LVB.
- sender down-convert "message" from EX to CW
- sender try to get EX of "ack"
- [ wait until all receivers have *processed* the "message" ]
-
- [ triggered by bast of "ack" ]
- receiver get CR on "message"
- receiver read LVB
- receiver processes the message
- [ wait finish ]
- receiver releases "ack"
- receiver tries to get PR on "message"
-
- sender receiver receiver
- "token":EX "message":CR "message":CR
- "message":CW
- "ack":EX
-
- 4. triggered by grant of EX on "ack" (indicating all receivers
- have processed message)
- sender down-converts "ack" from EX to CR
- sender releases "message"
- sender releases "token"
- receiver upconvert to PR on "message"
- receiver get CR of "ack"
- receiver release "message"
-
- sender receiver receiver
- "ack":CR "ack":CR "ack":CR
-
-
-4. Handling Failures
-
-4.1 Node Failure
-
- When a node fails, the DLM informs the cluster with the slot
- number. The node starts a cluster recovery thread. The cluster
- recovery thread:
-
- - acquires the bitmap<number> lock of the failed node
- - opens the bitmap
- - reads the bitmap of the failed node
- - copies the set bitmap to local node
- - cleans the bitmap of the failed node
- - releases bitmap<number> lock of the failed node
- - initiates resync of the bitmap on the current node
- md_check_recovery is invoked within recover_bitmaps,
- then md_check_recovery -> metadata_update_start/finish,
- it will lock the communication by lock_comm.
- Which means when one node is resyncing it blocks all
- other nodes from writing anywhere on the array.
-
- The resync process is the regular md resync. However, in a clustered
- environment when a resync is performed, it needs to tell other nodes
- of the areas which are suspended. Before a resync starts, the node
- send out RESYNCING with the (lo,hi) range of the area which needs to
- be suspended. Each node maintains a suspend_list, which contains the
- list of ranges which are currently suspended. On receiving RESYNCING,
- the node adds the range to the suspend_list. Similarly, when the node
- performing resync finishes, it sends RESYNCING with an empty range to
- other nodes and other nodes remove the corresponding entry from the
- suspend_list.
-
- A helper function, ->area_resyncing() can be used to check if a
- particular I/O range should be suspended or not.
-
-4.2 Device Failure
-
- Device failures are handled and communicated with the metadata update
- routine. When a node detects a device failure it does not allow
- any further writes to that device until the failure has been
- acknowledged by all other nodes.
-
-5. Adding a new Device
-
- For adding a new device, it is necessary that all nodes "see" the new
- device to be added. For this, the following algorithm is used:
-
- 1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
- ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
- 2. Node 1 sends a NEWDISK message with uuid and slot number
- 3. Other nodes issue kobject_uevent_env with uuid and slot number
- (Steps 4,5 could be a udev rule)
- 4. In userspace, the node searches for the disk, perhaps
- using blkid -t SUB_UUID=""
- 5. Other nodes issue either of the following depending on whether
- the disk was found:
- ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
- disc.number set to slot number)
- ioctl(CLUSTERED_DISK_NACK)
- 6. Other nodes drop lock on "no-new-devs" (CR) if device is found
- 7. Node 1 attempts EX lock on "no-new-dev"
- 8. If node 1 gets the lock, it sends METADATA_UPDATED after
- unmarking the disk as SpareLocal
- 9. If not (get "no-new-dev" lock), it fails the operation and sends
- METADATA_UPDATED.
- 10. Other nodes get the information whether a disk is added or not
- by the following METADATA_UPDATED.
-
-6. Module interface.
-
- There are 17 call-backs which the md core can make to the cluster
- module. Understanding these can give a good overview of the whole
- process.
-
-6.1 join(nodes) and leave()
-
- These are called when an array is started with a clustered bitmap,
- and when the array is stopped. join() ensures the cluster is
- available and initializes the various resources.
- Only the first 'nodes' nodes in the cluster can use the array.
-
-6.2 slot_number()
-
- Reports the slot number advised by the cluster infrastructure.
- Range is from 0 to nodes-1.
-
-6.3 resync_info_update()
-
- This updates the resync range that is stored in the bitmap lock.
- The starting point is updated as the resync progresses. The
- end point is always the end of the array.
- It does *not* send a RESYNCING message.
-
-6.4 resync_start(), resync_finish()
-
- These are called when resync/recovery/reshape starts or stops.
- They update the resyncing range in the bitmap lock and also
- send a RESYNCING message. resync_start reports the whole
- array as resyncing, resync_finish reports none of it.
-
- resync_finish() also sends a BITMAP_NEEDS_SYNC message which
- allows some other node to take over.
-
-6.5 metadata_update_start(), metadata_update_finish(),
- metadata_update_cancel().
-
- metadata_update_start is used to get exclusive access to
- the metadata. If a change is still needed once that access is
- gained, metadata_update_finish() will send a METADATA_UPDATE
- message to all other nodes, otherwise metadata_update_cancel()
- can be used to release the lock.
-
-6.6 area_resyncing()
-
- This combines two elements of functionality.
-
- Firstly, it will check if any node is currently resyncing
- anything in a given range of sectors. If any resync is found,
- then the caller will avoid writing or read-balancing in that
- range.
-
- Secondly, while node recovery is happening it reports that
- all areas are resyncing for READ requests. This avoids races
- between the cluster-filesystem and the cluster-RAID handling
- a node failure.
-
-6.7 add_new_disk_start(), add_new_disk_finish(), new_disk_ack()
-
- These are used to manage the new-disk protocol described above.
- When a new device is added, add_new_disk_start() is called before
- it is bound to the array and, if that succeeds, add_new_disk_finish()
- is called the device is fully added.
-
- When a device is added in acknowledgement to a previous
- request, or when the device is declared "unavailable",
- new_disk_ack() is called.
-
-6.8 remove_disk()
-
- This is called when a spare or failed device is removed from
- the array. It causes a REMOVE message to be send to other nodes.
-
-6.9 gather_bitmaps()
-
- This sends a RE_ADD message to all other nodes and then
- gathers bitmap information from all bitmaps. This combined
- bitmap is then used to recovery the re-added device.
-
-6.10 lock_all_bitmaps() and unlock_all_bitmaps()
-
- These are called when change bitmap to none. If a node plans
- to clear the cluster raid's bitmap, it need to make sure no other
- nodes are using the raid which is achieved by lock all bitmap
- locks within the cluster, and also those locks are unlocked
- accordingly.
-
-7. Unsupported features
-
-There are somethings which are not supported by cluster MD yet.
-
-- change array_sectors.
diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt
deleted file mode 100644
index 2b210f295786..000000000000
--- a/Documentation/md/raid5-cache.txt
+++ /dev/null
@@ -1,109 +0,0 @@
-RAID5 cache
-
-Raid 4/5/6 could include an extra disk for data cache besides normal RAID
-disks. The role of RAID disks isn't changed with the cache disk. The cache disk
-caches data to the RAID disks. The cache can be in write-through (supported
-since 4.4) or write-back mode (supported since 4.10). mdadm (supported since
-3.4) has a new option '--write-journal' to create array with cache. Please
-refer to mdadm manual for details. By default (RAID array starts), the cache is
-in write-through mode. A user can switch it to write-back mode by:
-
-echo "write-back" > /sys/block/md0/md/journal_mode
-
-And switch it back to write-through mode by:
-
-echo "write-through" > /sys/block/md0/md/journal_mode
-
-In both modes, all writes to the array will hit cache disk first. This means
-the cache disk must be fast and sustainable.
-
--------------------------------------
-write-through mode:
-
-This mode mainly fixes the 'write hole' issue. For RAID 4/5/6 array, an unclean
-shutdown can cause data in some stripes to not be in consistent state, eg, data
-and parity don't match. The reason is that a stripe write involves several RAID
-disks and it's possible the writes don't hit all RAID disks yet before the
-unclean shutdown. We call an array degraded if it has inconsistent data. MD
-tries to resync the array to bring it back to normal state. But before the
-resync completes, any system crash will expose the chance of real data
-corruption in the RAID array. This problem is called 'write hole'.
-
-The write-through cache will cache all data on cache disk first. After the data
-is safe on the cache disk, the data will be flushed onto RAID disks. The
-two-step write will guarantee MD can recover correct data after unclean
-shutdown even the array is degraded. Thus the cache can close the 'write hole'.
-
-In write-through mode, MD reports IO completion to upper layer (usually
-filesystems) after the data is safe on RAID disks, so cache disk failure
-doesn't cause data loss. Of course cache disk failure means the array is
-exposed to 'write hole' again.
-
-In write-through mode, the cache disk isn't required to be big. Several
-hundreds megabytes are enough.
-
---------------------------------------
-write-back mode:
-
-write-back mode fixes the 'write hole' issue too, since all write data is
-cached on cache disk. But the main goal of 'write-back' cache is to speed up
-write. If a write crosses all RAID disks of a stripe, we call it full-stripe
-write. For non-full-stripe writes, MD must read old data before the new parity
-can be calculated. These synchronous reads hurt write throughput. Some writes
-which are sequential but not dispatched in the same time will suffer from this
-overhead too. Write-back cache will aggregate the data and flush the data to
-RAID disks only after the data becomes a full stripe write. This will
-completely avoid the overhead, so it's very helpful for some workloads. A
-typical workload which does sequential write followed by fsync is an example.
-
-In write-back mode, MD reports IO completion to upper layer (usually
-filesystems) right after the data hits cache disk. The data is flushed to raid
-disks later after specific conditions met. So cache disk failure will cause
-data loss.
-
-In write-back mode, MD also caches data in memory. The memory cache includes
-the same data stored on cache disk, so a power loss doesn't cause data loss.
-The memory cache size has performance impact for the array. It's recommended
-the size is big. A user can configure the size by:
-
-echo "2048" > /sys/block/md0/md/stripe_cache_size
-
-Too small cache disk will make the write aggregation less efficient in this
-mode depending on the workloads. It's recommended to use a cache disk with at
-least several gigabytes size in write-back mode.
-
---------------------------------------
-The implementation:
-
-The write-through and write-back cache use the same disk format. The cache disk
-is organized as a simple write log. The log consists of 'meta data' and 'data'
-pairs. The meta data describes the data. It also includes checksum and sequence
-ID for recovery identification. Data can be IO data and parity data. Data is
-checksumed too. The checksum is stored in the meta data ahead of the data. The
-checksum is an optimization because MD can write meta and data freely without
-worry about the order. MD superblock has a field pointed to the valid meta data
-of log head.
-
-The log implementation is pretty straightforward. The difficult part is the
-order in which MD writes data to cache disk and RAID disks. Specifically, in
-write-through mode, MD calculates parity for IO data, writes both IO data and
-parity to the log, writes the data and parity to RAID disks after the data and
-parity is settled down in log and finally the IO is finished. Read just reads
-from raid disks as usual.
-
-In write-back mode, MD writes IO data to the log and reports IO completion. The
-data is also fully cached in memory at that time, which means read must query
-memory cache. If some conditions are met, MD will flush the data to RAID disks.
-MD will calculate parity for the data and write parity into the log. After this
-is finished, MD will write both data and parity into RAID disks, then MD can
-release the memory cache. The flush conditions could be stripe becomes a full
-stripe write, free cache disk space is low or free in-kernel memory cache space
-is low.
-
-After an unclean shutdown, MD does recovery. MD reads all meta data and data
-from the log. The sequence ID and checksum will help us detect corrupted meta
-data and data. If MD finds a stripe with data and valid parities (1 parity for
-raid4/5 and 2 for raid6), MD will write the data and parities to RAID disks. If
-parities are incompleted, they are discarded. If part of data is corrupted,
-they are discarded too. MD then loads valid data and writes them to RAID disks
-in normal way.
diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
deleted file mode 100644
index bfa092589e00..000000000000
--- a/Documentation/md/raid5-ppl.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-Partial Parity Log
-
-Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
-addressed by PPL is that after a dirty shutdown, parity of a particular stripe
-may become inconsistent with data on other member disks. If the array is also
-in degraded state, there is no way to recalculate parity, because one of the
-disks is missing. This can lead to silent data corruption when rebuilding the
-array or using it is as degraded - data calculated from parity for array blocks
-that have not been touched by a write request during the unclean shutdown can
-be incorrect. Such condition is known as the RAID5 Write Hole. Because of
-this, md by default does not allow starting a dirty degraded array.
-
-Partial parity for a write operation is the XOR of stripe data chunks not
-modified by this write. It is just enough data needed for recovering from the
-write hole. XORing partial parity with the modified chunks produces parity for
-the stripe, consistent with its state before the write operation, regardless of
-which chunk writes have completed. If one of the not modified data disks of
-this stripe is missing, this updated parity can be used to recover its
-contents. PPL recovery is also performed when starting an array after an
-unclean shutdown and all disks are available, eliminating the need to resync
-the array. Because of this, using write-intent bitmap and PPL together is not
-supported.
-
-When handling a write request PPL writes partial parity before new data and
-parity are dispatched to disks. PPL is a distributed log - it is stored on
-array member drives in the metadata area, on the parity drive of a particular
-stripe. It does not require a dedicated journaling drive. Write performance is
-reduced by up to 30%-40% but it scales with the number of drives in the array
-and the journaling drive does not become a bottleneck or a single point of
-failure.
-
-Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
-not a true journal. It does not protect from losing in-flight data, only from
-silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
-performed for this stripe (parity is not updated). So it is possible to have
-arbitrary data in the written part of a stripe if that disk is lost. In such
-case the behavior is the same as in plain raid5.
-
-PPL is available for md version-1 metadata and external (specifically IMSM)
-metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
-
-There is a limitation of maximum 64 disks in the array for PPL. It allows to
-keep data structures and implementation simple. RAID5 arrays with so many disks
-are not likely due to high risk of multiple disks failure. Such restriction
-should not be a real life limitation.
diff --git a/Documentation/media/kapi/dtv-core.rst b/Documentation/media/kapi/dtv-core.rst
index ac005b46f23e..82c5b85ed9b1 100644
--- a/Documentation/media/kapi/dtv-core.rst
+++ b/Documentation/media/kapi/dtv-core.rst
@@ -11,12 +11,12 @@ Digital TV devices are implemented by several different drivers:
- Frontend drivers that are usually implemented as two separate drivers:
- - A tuner driver that implements the logic with commands the part of the
- hardware with is responsible to tune into a digital TV transponder or
+ - A tuner driver that implements the logic which commands the part of
+ the hardware responsible for tuning into a digital TV transponder or
physical channel. The output of a tuner is usually a baseband or
Intermediate Frequency (IF) signal;
- - A demodulator driver (a.k.a "demod") that implements the logic with
+ - A demodulator driver (a.k.a "demod") that implements the logic which
commands the digital TV decoding hardware. The output of a demod is
a digital stream, with multiple audio, video and data channels typically
multiplexed using MPEG Transport Stream [#f1]_.
diff --git a/Documentation/media/kapi/v4l2-controls.rst b/Documentation/media/kapi/v4l2-controls.rst
index 64ab99abf0b6..ebe2a55908be 100644
--- a/Documentation/media/kapi/v4l2-controls.rst
+++ b/Documentation/media/kapi/v4l2-controls.rst
@@ -26,8 +26,9 @@ The control framework was created in order to implement all the rules of the
V4L2 specification with respect to controls in a central place. And to make
life as easy as possible for the driver developer.
-Note that the control framework relies on the presence of a struct v4l2_device
-for V4L2 drivers and struct v4l2_subdev for sub-device drivers.
+Note that the control framework relies on the presence of a struct
+:c:type:`v4l2_device` for V4L2 drivers and struct :c:type:`v4l2_subdev` for
+sub-device drivers.
Objects in the framework
@@ -35,12 +36,13 @@ Objects in the framework
There are two main objects:
-The v4l2_ctrl object describes the control properties and keeps track of the
-control's value (both the current value and the proposed new value).
+The :c:type:`v4l2_ctrl` object describes the control properties and keeps
+track of the control's value (both the current value and the proposed new
+value).
-v4l2_ctrl_handler is the object that keeps track of controls. It maintains a
-list of v4l2_ctrl objects that it owns and another list of references to
-controls, possibly to controls owned by other handlers.
+:c:type:`v4l2_ctrl_handler` is the object that keeps track of controls. It
+maintains a list of v4l2_ctrl objects that it owns and another list of
+references to controls, possibly to controls owned by other handlers.
Basic usage for V4L2 and sub-device drivers
@@ -48,21 +50,39 @@ Basic usage for V4L2 and sub-device drivers
1) Prepare the driver:
+.. code-block:: c
+
+ #include <media/v4l2-ctrls.h>
+
1.1) Add the handler to your driver's top-level struct:
-.. code-block:: none
+For V4L2 drivers:
+
+.. code-block:: c
struct foo_dev {
...
+ struct v4l2_device v4l2_dev;
+ ...
struct v4l2_ctrl_handler ctrl_handler;
...
};
- struct foo_dev *foo;
+For sub-device drivers:
+
+.. code-block:: c
+
+ struct foo_dev {
+ ...
+ struct v4l2_subdev sd;
+ ...
+ struct v4l2_ctrl_handler ctrl_handler;
+ ...
+ };
1.2) Initialize the handler:
-.. code-block:: none
+.. code-block:: c
v4l2_ctrl_handler_init(&foo->ctrl_handler, nr_of_controls);
@@ -72,72 +92,48 @@ information. It is a hint only.
1.3) Hook the control handler into the driver:
-1.3.1) For V4L2 drivers do this:
+For V4L2 drivers:
-.. code-block:: none
-
- struct foo_dev {
- ...
- struct v4l2_device v4l2_dev;
- ...
- struct v4l2_ctrl_handler ctrl_handler;
- ...
- };
+.. code-block:: c
foo->v4l2_dev.ctrl_handler = &foo->ctrl_handler;
-Where foo->v4l2_dev is of type struct v4l2_device.
-
-Finally, remove all control functions from your v4l2_ioctl_ops (if any):
-vidioc_queryctrl, vidioc_query_ext_ctrl, vidioc_querymenu, vidioc_g_ctrl,
-vidioc_s_ctrl, vidioc_g_ext_ctrls, vidioc_try_ext_ctrls and vidioc_s_ext_ctrls.
-Those are now no longer needed.
-
-1.3.2) For sub-device drivers do this:
-
-.. code-block:: none
+For sub-device drivers:
- struct foo_dev {
- ...
- struct v4l2_subdev sd;
- ...
- struct v4l2_ctrl_handler ctrl_handler;
- ...
- };
+.. code-block:: c
foo->sd.ctrl_handler = &foo->ctrl_handler;
-Where foo->sd is of type struct v4l2_subdev.
-
1.4) Clean up the handler at the end:
-.. code-block:: none
+.. code-block:: c
v4l2_ctrl_handler_free(&foo->ctrl_handler);
2) Add controls:
-You add non-menu controls by calling v4l2_ctrl_new_std:
+You add non-menu controls by calling :c:func:`v4l2_ctrl_new_std`:
-.. code-block:: none
+.. code-block:: c
struct v4l2_ctrl *v4l2_ctrl_new_std(struct v4l2_ctrl_handler *hdl,
const struct v4l2_ctrl_ops *ops,
u32 id, s32 min, s32 max, u32 step, s32 def);
-Menu and integer menu controls are added by calling v4l2_ctrl_new_std_menu:
+Menu and integer menu controls are added by calling
+:c:func:`v4l2_ctrl_new_std_menu`:
-.. code-block:: none
+.. code-block:: c
struct v4l2_ctrl *v4l2_ctrl_new_std_menu(struct v4l2_ctrl_handler *hdl,
const struct v4l2_ctrl_ops *ops,
u32 id, s32 max, s32 skip_mask, s32 def);
Menu controls with a driver specific menu are added by calling
-v4l2_ctrl_new_std_menu_items:
+:c:func:`v4l2_ctrl_new_std_menu_items`:
-.. code-block:: none
+.. code-block:: c
struct v4l2_ctrl *v4l2_ctrl_new_std_menu_items(
struct v4l2_ctrl_handler *hdl,
@@ -145,17 +141,18 @@ v4l2_ctrl_new_std_menu_items:
s32 skip_mask, s32 def, const char * const *qmenu);
Integer menu controls with a driver specific menu can be added by calling
-v4l2_ctrl_new_int_menu:
+:c:func:`v4l2_ctrl_new_int_menu`:
-.. code-block:: none
+.. code-block:: c
struct v4l2_ctrl *v4l2_ctrl_new_int_menu(struct v4l2_ctrl_handler *hdl,
const struct v4l2_ctrl_ops *ops,
u32 id, s32 max, s32 def, const s64 *qmenu_int);
-These functions are typically called right after the v4l2_ctrl_handler_init:
+These functions are typically called right after the
+:c:func:`v4l2_ctrl_handler_init`:
-.. code-block:: none
+.. code-block:: c
static const s64 exp_bias_qmenu[] = {
-2, -1, 0, 1, 2
@@ -192,33 +189,34 @@ These functions are typically called right after the v4l2_ctrl_handler_init:
return err;
}
-The v4l2_ctrl_new_std function returns the v4l2_ctrl pointer to the new
-control, but if you do not need to access the pointer outside the control ops,
-then there is no need to store it.
-
-The v4l2_ctrl_new_std function will fill in most fields based on the control
-ID except for the min, max, step and default values. These are passed in the
-last four arguments. These values are driver specific while control attributes
-like type, name, flags are all global. The control's current value will be set
-to the default value.
-
-The v4l2_ctrl_new_std_menu function is very similar but it is used for menu
-controls. There is no min argument since that is always 0 for menu controls,
-and instead of a step there is a skip_mask argument: if bit X is 1, then menu
-item X is skipped.
-
-The v4l2_ctrl_new_int_menu function creates a new standard integer menu
-control with driver-specific items in the menu. It differs from
-v4l2_ctrl_new_std_menu in that it doesn't have the mask argument and takes
-as the last argument an array of signed 64-bit integers that form an exact
-menu item list.
-
-The v4l2_ctrl_new_std_menu_items function is very similar to
-v4l2_ctrl_new_std_menu but takes an extra parameter qmenu, which is the driver
-specific menu for an otherwise standard menu control. A good example for this
-control is the test pattern control for capture/display/sensors devices that
-have the capability to generate test patterns. These test patterns are hardware
-specific, so the contents of the menu will vary from device to device.
+The :c:func:`v4l2_ctrl_new_std` function returns the v4l2_ctrl pointer to
+the new control, but if you do not need to access the pointer outside the
+control ops, then there is no need to store it.
+
+The :c:func:`v4l2_ctrl_new_std` function will fill in most fields based on
+the control ID except for the min, max, step and default values. These are
+passed in the last four arguments. These values are driver specific while
+control attributes like type, name, flags are all global. The control's
+current value will be set to the default value.
+
+The :c:func:`v4l2_ctrl_new_std_menu` function is very similar but it is
+used for menu controls. There is no min argument since that is always 0 for
+menu controls, and instead of a step there is a skip_mask argument: if bit
+X is 1, then menu item X is skipped.
+
+The :c:func:`v4l2_ctrl_new_int_menu` function creates a new standard
+integer menu control with driver-specific items in the menu. It differs
+from v4l2_ctrl_new_std_menu in that it doesn't have the mask argument and
+takes as the last argument an array of signed 64-bit integers that form an
+exact menu item list.
+
+The :c:func:`v4l2_ctrl_new_std_menu_items` function is very similar to
+v4l2_ctrl_new_std_menu but takes an extra parameter qmenu, which is the
+driver specific menu for an otherwise standard menu control. A good example
+for this control is the test pattern control for capture/display/sensors
+devices that have the capability to generate test patterns. These test
+patterns are hardware specific, so the contents of the menu will vary from
+device to device.
Note that if something fails, the function will return NULL or an error and
set ctrl_handler->error to the error code. If ctrl_handler->error was already
@@ -233,7 +231,7 @@ a bit faster that way.
3) Optionally force initial control setup:
-.. code-block:: none
+.. code-block:: c
v4l2_ctrl_handler_setup(&foo->ctrl_handler);
@@ -242,9 +240,9 @@ initializes the hardware to the default control values. It is recommended
that you do this as this ensures that both the internal data structures and
the hardware are in sync.
-4) Finally: implement the v4l2_ctrl_ops
+4) Finally: implement the :c:type:`v4l2_ctrl_ops`
-.. code-block:: none
+.. code-block:: c
static const struct v4l2_ctrl_ops foo_ctrl_ops = {
.s_ctrl = foo_s_ctrl,
@@ -252,7 +250,7 @@ the hardware are in sync.
Usually all you need is s_ctrl:
-.. code-block:: none
+.. code-block:: c
static int foo_s_ctrl(struct v4l2_ctrl *ctrl)
{
@@ -305,7 +303,7 @@ Accessing Control Values
The following union is used inside the control framework to access control
values:
-.. code-block:: none
+.. code-block:: c
union v4l2_ctrl_ptr {
s32 *p_s32;
@@ -317,7 +315,7 @@ values:
The v4l2_ctrl struct contains these fields that can be used to access both
current and new values:
-.. code-block:: none
+.. code-block:: c
s32 val;
struct {
@@ -330,7 +328,7 @@ current and new values:
If the control has a simple s32 type type, then:
-.. code-block:: none
+.. code-block:: c
&ctrl->val == ctrl->p_new.p_s32
&ctrl->cur.val == ctrl->p_cur.p_s32
@@ -354,7 +352,7 @@ exception is for controls that return a volatile register such as a signal
strength read-out that changes continuously. In that case you will need to
implement g_volatile_ctrl like this:
-.. code-block:: none
+.. code-block:: c
static int foo_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
{
@@ -372,7 +370,7 @@ changes.
To mark a control as volatile you have to set V4L2_CTRL_FLAG_VOLATILE:
-.. code-block:: none
+.. code-block:: c
ctrl = v4l2_ctrl_new_std(&sd->ctrl_handler, ...);
if (ctrl)
@@ -393,7 +391,7 @@ not to introduce deadlocks.
Outside of the control ops you have to go through to helper functions to get
or set a single control value safely in your driver:
-.. code-block:: none
+.. code-block:: c
s32 v4l2_ctrl_g_ctrl(struct v4l2_ctrl *ctrl);
int v4l2_ctrl_s_ctrl(struct v4l2_ctrl *ctrl, s32 val);
@@ -404,7 +402,7 @@ will result in a deadlock since these helpers lock the handler as well.
You can also take the handler lock yourself:
-.. code-block:: none
+.. code-block:: c
mutex_lock(&state->ctrl_handler.lock);
pr_info("String value is '%s'\n", ctrl1->p_cur.p_char);
@@ -417,7 +415,7 @@ Menu Controls
The v4l2_ctrl struct contains this union:
-.. code-block:: none
+.. code-block:: c
union {
u32 step;
@@ -445,7 +443,7 @@ Custom Controls
Driver specific controls can be created using v4l2_ctrl_new_custom():
-.. code-block:: none
+.. code-block:: c
static const struct v4l2_ctrl_config ctrl_filter = {
.ops = &ctrl_custom_ops,
@@ -499,7 +497,7 @@ By default all controls are independent from the others. But in more
complex scenarios you can get dependencies from one control to another.
In that case you need to 'cluster' them:
-.. code-block:: none
+.. code-block:: c
struct foo {
struct v4l2_ctrl_handler ctrl_handler;
@@ -523,7 +521,7 @@ composite control. Similar to how a 'struct' works in C.
So when s_ctrl is called with V4L2_CID_AUDIO_VOLUME as argument, you should set
all two controls belonging to the audio_cluster:
-.. code-block:: none
+.. code-block:: c
static int foo_s_ctrl(struct v4l2_ctrl *ctrl)
{
@@ -545,7 +543,7 @@ all two controls belonging to the audio_cluster:
In the example above the following are equivalent for the VOLUME case:
-.. code-block:: none
+.. code-block:: c
ctrl == ctrl->cluster[AUDIO_CL_VOLUME] == state->audio_cluster[AUDIO_CL_VOLUME]
ctrl->cluster[AUDIO_CL_MUTE] == state->audio_cluster[AUDIO_CL_MUTE]
@@ -553,7 +551,7 @@ In the example above the following are equivalent for the VOLUME case:
In practice using cluster arrays like this becomes very tiresome. So instead
the following equivalent method is used:
-.. code-block:: none
+.. code-block:: c
struct {
/* audio cluster */
@@ -565,7 +563,7 @@ The anonymous struct is used to clearly 'cluster' these two control pointers,
but it serves no other purpose. The effect is the same as creating an
array with two control pointers. So you can just do:
-.. code-block:: none
+.. code-block:: c
state->volume = v4l2_ctrl_new_std(&state->ctrl_handler, ...);
state->mute = v4l2_ctrl_new_std(&state->ctrl_handler, ...);
@@ -621,7 +619,7 @@ changing that control affects the control flags of the manual controls.
In order to simplify this a special variation of v4l2_ctrl_cluster was
introduced:
-.. code-block:: none
+.. code-block:: c
void v4l2_ctrl_auto_cluster(unsigned ncontrols, struct v4l2_ctrl **controls,
u8 manual_val, bool set_volatile);
@@ -676,7 +674,7 @@ of another handler (e.g. for a video device node), then you should first add
the controls to the first handler, add the other controls to the second
handler and finally add the first handler to the second. For example:
-.. code-block:: none
+.. code-block:: c
v4l2_ctrl_new_std(&radio_ctrl_handler, &radio_ops, V4L2_CID_AUDIO_VOLUME, ...);
v4l2_ctrl_new_std(&radio_ctrl_handler, &radio_ops, V4L2_CID_AUDIO_MUTE, ...);
@@ -690,7 +688,7 @@ all controls.
Or you can add specific controls to a handler:
-.. code-block:: none
+.. code-block:: c
volume = v4l2_ctrl_new_std(&video_ctrl_handler, &ops, V4L2_CID_AUDIO_VOLUME, ...);
v4l2_ctrl_new_std(&video_ctrl_handler, &ops, V4L2_CID_BRIGHTNESS, ...);
@@ -699,7 +697,7 @@ Or you can add specific controls to a handler:
What you should not do is make two identical controls for two handlers.
For example:
-.. code-block:: none
+.. code-block:: c
v4l2_ctrl_new_std(&radio_ctrl_handler, &radio_ops, V4L2_CID_AUDIO_MUTE, ...);
v4l2_ctrl_new_std(&video_ctrl_handler, &video_ops, V4L2_CID_AUDIO_MUTE, ...);
@@ -720,7 +718,7 @@ not own. For example, if you have to find a volume control from a subdev.
You can do that by calling v4l2_ctrl_find:
-.. code-block:: none
+.. code-block:: c
struct v4l2_ctrl *volume;
@@ -729,7 +727,7 @@ You can do that by calling v4l2_ctrl_find:
Since v4l2_ctrl_find will lock the handler you have to be careful where you
use it. For example, this is not a good idea:
-.. code-block:: none
+.. code-block:: c
struct v4l2_ctrl_handler ctrl_handler;
@@ -738,7 +736,7 @@ use it. For example, this is not a good idea:
...and in video_ops.s_ctrl:
-.. code-block:: none
+.. code-block:: c
case V4L2_CID_BRIGHTNESS:
contrast = v4l2_find_ctrl(&ctrl_handler, V4L2_CID_CONTRAST);
@@ -760,7 +758,7 @@ not when it is used in consumer-level hardware. In that case you want to keep
those low-level controls local to the subdev. You can do this by simply
setting the 'is_private' flag of the control to 1:
-.. code-block:: none
+.. code-block:: c
static const struct v4l2_ctrl_config ctrl_private = {
.ops = &ctrl_custom_ops,
@@ -797,7 +795,7 @@ Sometimes the platform or bridge driver needs to be notified when a control
from a sub-device driver changes. You can set a notify callback by calling
this function:
-.. code-block:: none
+.. code-block:: c
void v4l2_ctrl_notify(struct v4l2_ctrl *ctrl,
void (*notify)(struct v4l2_ctrl *ctrl, void *priv), void *priv);
diff --git a/Documentation/media/uapi/cec/cec-api.rst b/Documentation/media/uapi/cec/cec-api.rst
index b614bf81aa20..0780ba07995a 100644
--- a/Documentation/media/uapi/cec/cec-api.rst
+++ b/Documentation/media/uapi/cec/cec-api.rst
@@ -39,7 +39,7 @@ Revision and Copyright
**********************
Authors:
-- Verkuil, Hans <hans.verkuil@cisco.com>
+- Verkuil, Hans <hverkuil-cisco@xs4all.nl>
- Initial version.
diff --git a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
index c53bb5f73f0d..d0902f356d65 100644
--- a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
+++ b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst
@@ -294,7 +294,8 @@ EINVAL
The requested mode is invalid.
EPERM
- Monitor mode is requested without having root permissions
+ Monitor mode is requested, but the process does have the ``CAP_NET_ADMIN``
+ capability.
EBUSY
Someone else is already an exclusive follower or initiator.
diff --git a/Documentation/media/uapi/cec/cec-ioc-receive.rst b/Documentation/media/uapi/cec/cec-ioc-receive.rst
index c3a685ff05cb..4137903d672e 100644
--- a/Documentation/media/uapi/cec/cec-ioc-receive.rst
+++ b/Documentation/media/uapi/cec/cec-ioc-receive.rst
@@ -223,6 +223,18 @@ View On' messages from initiator 0xf ('Unregistered') to destination 0 ('TV').
result of the :ref:`ioctl CEC_TRANSMIT <CEC_TRANSMIT>`, and once via
:ref:`ioctl CEC_RECEIVE <CEC_RECEIVE>`.
+ * .. _`CEC-MSG-FL-RAW`:
+
+ - ``CEC_MSG_FL_RAW``
+ - 2
+ - Normally CEC messages are validated before transmitting them. If this
+ flag is set when :ref:`ioctl CEC_TRANSMIT <CEC_TRANSMIT>` is called,
+ then no validation takes place and the message is transmitted as-is.
+ This is useful when debugging CEC issues.
+ This flag is only allowed if the process has the ``CAP_SYS_RAWIO``
+ capability. If that is not set, then the ``EPERM`` error code is
+ returned.
+
.. tabularcolumns:: |p{5.6cm}|p{0.9cm}|p{11.0cm}|
@@ -358,7 +370,8 @@ ENOTTY
EPERM
The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS <CEC_ADAP_S_LOG_ADDRS>`
- has never been called.
+ has never been called, or ``CEC_MSG_FL_RAW`` was used from a process that
+ did not have the ``CAP_SYS_RAWIO`` capability.
ENONET
The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS <CEC_ADAP_S_LOG_ADDRS>`
diff --git a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst
index a982f16e55a4..b827ebc398f8 100644
--- a/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst
+++ b/Documentation/media/uapi/mediactl/media-ioc-enum-links.rst
@@ -84,6 +84,11 @@ returned during the enumeration process.
- Pointer to a links array allocated by the application. Ignored if
NULL.
+ * - __u32
+ - ``reserved[4]``
+ - Reserved for future extensions. Drivers and applications must set
+ the array to zero.
+
.. c:type:: media_pad_desc
@@ -135,7 +140,7 @@ returned during the enumeration process.
- Link flags, see :ref:`media-link-flag` for more details.
* - __u32
- - ``reserved[4]``
+ - ``reserved[2]``
- Reserved for future extensions. Drivers and applications must set
the array to zero.
diff --git a/Documentation/media/uapi/rc/rc-tables.rst b/Documentation/media/uapi/rc/rc-tables.rst
index 177ac44fa0fa..20d7c686922b 100644
--- a/Documentation/media/uapi/rc/rc-tables.rst
+++ b/Documentation/media/uapi/rc/rc-tables.rst
@@ -54,7 +54,7 @@ the remote via /dev/input/event devices.
- .. row 3
- - ``KEY_0``
+ - ``KEY_NUMERIC_0``
- Keyboard digit 0
@@ -62,7 +62,7 @@ the remote via /dev/input/event devices.
- .. row 4
- - ``KEY_1``
+ - ``KEY_NUMERIC_1``
- Keyboard digit 1
@@ -70,7 +70,7 @@ the remote via /dev/input/event devices.
- .. row 5
- - ``KEY_2``
+ - ``KEY_NUMERIC_2``
- Keyboard digit 2
@@ -78,7 +78,7 @@ the remote via /dev/input/event devices.
- .. row 6
- - ``KEY_3``
+ - ``KEY_NUMERIC_3``
- Keyboard digit 3
@@ -86,7 +86,7 @@ the remote via /dev/input/event devices.
- .. row 7
- - ``KEY_4``
+ - ``KEY_NUMERIC_4``
- Keyboard digit 4
@@ -94,7 +94,7 @@ the remote via /dev/input/event devices.
- .. row 8
- - ``KEY_5``
+ - ``KEY_NUMERIC_5``
- Keyboard digit 5
@@ -102,7 +102,7 @@ the remote via /dev/input/event devices.
- .. row 9
- - ``KEY_6``
+ - ``KEY_NUMERIC_6``
- Keyboard digit 6
@@ -110,7 +110,7 @@ the remote via /dev/input/event devices.
- .. row 10
- - ``KEY_7``
+ - ``KEY_NUMERIC_7``
- Keyboard digit 7
@@ -118,7 +118,7 @@ the remote via /dev/input/event devices.
- .. row 11
- - ``KEY_8``
+ - ``KEY_NUMERIC_8``
- Keyboard digit 8
@@ -126,7 +126,7 @@ the remote via /dev/input/event devices.
- .. row 12
- - ``KEY_9``
+ - ``KEY_NUMERIC_9``
- Keyboard digit 9
@@ -196,7 +196,7 @@ the remote via /dev/input/event devices.
- ``KEY_PAUSE``
- - Pause sroweam
+ - Pause stream
- PAUSE / FREEZE
@@ -220,7 +220,7 @@ the remote via /dev/input/event devices.
- ``KEY_STOP``
- - Stop sroweam
+ - Stop stream
- STOP
@@ -228,7 +228,7 @@ the remote via /dev/input/event devices.
- ``KEY_RECORD``
- - Start/stop recording sroweam
+ - Start/stop recording stream
- CAPTURE / REC / RECORD/PAUSE
@@ -577,7 +577,7 @@ the remote via /dev/input/event devices.
- ``KEY_CLEAR``
- - Stop sroweam and return to default input video/audio
+ - Stop stream and return to default input video/audio
- CLEAR / RESET / BOSS KEY
@@ -593,7 +593,7 @@ the remote via /dev/input/event devices.
- ``KEY_FAVORITES``
- - Open the favorites sroweam window
+ - Open the favorites stream window
- TV WALL / Favorites
diff --git a/Documentation/media/uapi/v4l/biblio.rst b/Documentation/media/uapi/v4l/biblio.rst
index ec33768c055e..8f4eb8823d82 100644
--- a/Documentation/media/uapi/v4l/biblio.rst
+++ b/Documentation/media/uapi/v4l/biblio.rst
@@ -122,6 +122,15 @@ ITU BT.1119
:author: International Telecommunication Union (http://www.itu.ch)
+.. _h264:
+
+ITU-T Rec. H.264 Specification (04/2017 Edition)
+================================================
+
+:title: ITU-T Recommendation H.264 "Advanced Video Coding for Generic Audiovisual Services"
+
+:author: International Telecommunication Union (http://www.itu.ch)
+
.. _jfif:
JFIF
diff --git a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
index 4a8446203085..d6ea2ffd65c5 100644
--- a/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
+++ b/Documentation/media/uapi/v4l/ext-ctrls-codec.rst
@@ -759,6 +759,32 @@ enum v4l2_mpeg_video_h264_level -
+.. _v4l2-mpeg-video-mpeg2-level:
+
+``V4L2_CID_MPEG_VIDEO_MPEG2_LEVEL``
+ (enum)
+
+enum v4l2_mpeg_video_mpeg2_level -
+ The level information for the MPEG2 elementary stream. Applicable to
+ MPEG2 codecs. Possible values are:
+
+
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+
+ * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_LOW``
+ - Low Level (LL)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_MAIN``
+ - Main Level (ML)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH_1440``
+ - High-1440 Level (H-14)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_LEVEL_HIGH``
+ - High Level (HL)
+
+
+
.. _v4l2-mpeg-video-mpeg4-level:
``V4L2_CID_MPEG_VIDEO_MPEG4_LEVEL``
@@ -845,6 +871,36 @@ enum v4l2_mpeg_video_h264_profile -
+.. _v4l2-mpeg-video-mpeg2-profile:
+
+``V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE``
+ (enum)
+
+enum v4l2_mpeg_video_mpeg2_profile -
+ The profile information for MPEG2. Applicable to MPEG2 codecs.
+ Possible values are:
+
+
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+
+ * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SIMPLE``
+ - Simple profile (SP)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MAIN``
+ - Main profile (MP)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SNR_SCALABLE``
+ - SNR Scalable profile (SNR)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_SPATIALLY_SCALABLE``
+ - Spatially Scalable profile (Spt)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_HIGH``
+ - High profile (HP)
+ * - ``V4L2_MPEG_VIDEO_MPEG2_PROFILE_MULTIVIEW``
+ - Multi-view profile (MVP)
+
+
+
.. _v4l2-mpeg-video-mpeg4-profile:
``V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE``
@@ -1395,6 +1451,575 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type -
- Layer number
+.. _v4l2-mpeg-h264:
+
+``V4L2_CID_MPEG_VIDEO_H264_SPS (struct)``
+ Specifies the sequence parameter set (as extracted from the
+ bitstream) for the associated H264 slice data. This includes the
+ necessary parameters for configuring a stateless hardware decoding
+ pipeline for H264. The bitstream parameters are defined according
+ to :ref:`h264`, section 7.4.2.1.1 "Sequence Parameter Set Data
+ Semantics". For further documentation, refer to the above
+ specification, unless there is an explicit comment stating
+ otherwise.
+
+ .. note::
+
+ This compound control is not yet part of the public kernel API and
+ it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_sps
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_sps
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - __u8
+ - ``profile_idc``
+ -
+ * - __u8
+ - ``constraint_set_flags``
+ - See :ref:`Sequence Parameter Set Constraints Set Flags <h264_sps_constraints_set_flags>`
+ * - __u8
+ - ``level_idc``
+ -
+ * - __u8
+ - ``seq_parameter_set_id``
+ -
+ * - __u8
+ - ``chroma_format_idc``
+ -
+ * - __u8
+ - ``bit_depth_luma_minus8``
+ -
+ * - __u8
+ - ``bit_depth_chroma_minus8``
+ -
+ * - __u8
+ - ``log2_max_frame_num_minus4``
+ -
+ * - __u8
+ - ``pic_order_cnt_type``
+ -
+ * - __u8
+ - ``log2_max_pic_order_cnt_lsb_minus4``
+ -
+ * - __u8
+ - ``max_num_ref_frames``
+ -
+ * - __u8
+ - ``num_ref_frames_in_pic_order_cnt_cycle``
+ -
+ * - __s32
+ - ``offset_for_ref_frame[255]``
+ -
+ * - __s32
+ - ``offset_for_non_ref_pic``
+ -
+ * - __s32
+ - ``offset_for_top_to_bottom_field``
+ -
+ * - __u16
+ - ``pic_width_in_mbs_minus1``
+ -
+ * - __u16
+ - ``pic_height_in_map_units_minus1``
+ -
+ * - __u32
+ - ``flags``
+ - See :ref:`Sequence Parameter Set Flags <h264_sps_flags>`
+
+.. _h264_sps_constraints_set_flags:
+
+``Sequence Parameter Set Constraints Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - ``V4L2_H264_SPS_CONSTRAINT_SET0_FLAG``
+ - 0x00000001
+ -
+ * - ``V4L2_H264_SPS_CONSTRAINT_SET1_FLAG``
+ - 0x00000002
+ -
+ * - ``V4L2_H264_SPS_CONSTRAINT_SET2_FLAG``
+ - 0x00000004
+ -
+ * - ``V4L2_H264_SPS_CONSTRAINT_SET3_FLAG``
+ - 0x00000008
+ -
+ * - ``V4L2_H264_SPS_CONSTRAINT_SET4_FLAG``
+ - 0x00000010
+ -
+ * - ``V4L2_H264_SPS_CONSTRAINT_SET5_FLAG``
+ - 0x00000020
+ -
+
+.. _h264_sps_flags:
+
+``Sequence Parameter Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - ``V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE``
+ - 0x00000001
+ -
+ * - ``V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS``
+ - 0x00000002
+ -
+ * - ``V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO``
+ - 0x00000004
+ -
+ * - ``V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED``
+ - 0x00000008
+ -
+ * - ``V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY``
+ - 0x00000010
+ -
+ * - ``V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD``
+ - 0x00000020
+ -
+ * - ``V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE``
+ - 0x00000040
+ -
+
+``V4L2_CID_MPEG_VIDEO_H264_PPS (struct)``
+ Specifies the picture parameter set (as extracted from the
+ bitstream) for the associated H264 slice data. This includes the
+ necessary parameters for configuring a stateless hardware decoding
+ pipeline for H264. The bitstream parameters are defined according
+ to :ref:`h264`, section 7.4.2.2 "Picture Parameter Set RBSP
+ Semantics". For further documentation, refer to the above
+ specification, unless there is an explicit comment stating
+ otherwise.
+
+ .. note::
+
+ This compound control is not yet part of the public kernel API and
+ it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_pps
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_pps
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - __u8
+ - ``pic_parameter_set_id``
+ -
+ * - __u8
+ - ``seq_parameter_set_id``
+ -
+ * - __u8
+ - ``num_slice_groups_minus1``
+ -
+ * - __u8
+ - ``num_ref_idx_l0_default_active_minus1``
+ -
+ * - __u8
+ - ``num_ref_idx_l1_default_active_minus1``
+ -
+ * - __u8
+ - ``weighted_bipred_idc``
+ -
+ * - __s8
+ - ``pic_init_qp_minus26``
+ -
+ * - __s8
+ - ``pic_init_qs_minus26``
+ -
+ * - __s8
+ - ``chroma_qp_index_offset``
+ -
+ * - __s8
+ - ``second_chroma_qp_index_offset``
+ -
+ * - __u16
+ - ``flags``
+ - See :ref:`Picture Parameter Set Flags <h264_pps_flags>`
+
+.. _h264_pps_flags:
+
+``Picture Parameter Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - ``V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE``
+ - 0x00000001
+ -
+ * - ``V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT``
+ - 0x00000002
+ -
+ * - ``V4L2_H264_PPS_FLAG_WEIGHTED_PRED``
+ - 0x00000004
+ -
+ * - ``V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT``
+ - 0x00000008
+ -
+ * - ``V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED``
+ - 0x00000010
+ -
+ * - ``V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT``
+ - 0x00000020
+ -
+ * - ``V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE``
+ - 0x00000040
+ -
+ * - ``V4L2_H264_PPS_FLAG_PIC_SCALING_MATRIX_PRESENT``
+ - 0x00000080
+ -
+
+``V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (struct)``
+ Specifies the scaling matrix (as extracted from the bitstream) for
+ the associated H264 slice data. The bitstream parameters are
+ defined according to :ref:`h264`, section 7.4.2.1.1.1 "Scaling
+ List Semantics". For further documentation, refer to the above
+ specification, unless there is an explicit comment stating
+ otherwise.
+
+ .. note::
+
+ This compound control is not yet part of the public kernel API and
+ it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_scaling_matrix
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_scaling_matrix
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - __u8
+ - ``scaling_list_4x4[6][16]``
+ -
+ * - __u8
+ - ``scaling_list_8x8[6][64]``
+ -
+
+``V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (struct)``
+ Specifies the slice parameters (as extracted from the bitstream)
+ for the associated H264 slice data. This includes the necessary
+ parameters for configuring a stateless hardware decoding pipeline
+ for H264. The bitstream parameters are defined according to
+ :ref:`h264`, section 7.4.3 "Slice Header Semantics". For further
+ documentation, refer to the above specification, unless there is
+ an explicit comment stating otherwise.
+
+ .. note::
+
+ This compound control is not yet part of the public kernel API
+ and it is expected to change.
+
+ This structure is expected to be passed as an array, with one
+ entry for each slice included in the bitstream buffer.
+
+.. c:type:: v4l2_ctrl_h264_slice_params
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_slice_params
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - __u32
+ - ``size``
+ -
+ * - __u32
+ - ``header_bit_size``
+ -
+ * - __u16
+ - ``first_mb_in_slice``
+ -
+ * - __u8
+ - ``slice_type``
+ -
+ * - __u8
+ - ``pic_parameter_set_id``
+ -
+ * - __u8
+ - ``colour_plane_id``
+ -
+ * - __u8
+ - ``redundant_pic_cnt``
+ -
+ * - __u16
+ - ``frame_num``
+ -
+ * - __u16
+ - ``idr_pic_id``
+ -
+ * - __u16
+ - ``pic_order_cnt_lsb``
+ -
+ * - __s32
+ - ``delta_pic_order_cnt_bottom``
+ -
+ * - __s32
+ - ``delta_pic_order_cnt0``
+ -
+ * - __s32
+ - ``delta_pic_order_cnt1``
+ -
+ * - struct :c:type:`v4l2_h264_pred_weight_table`
+ - ``pred_weight_table``
+ -
+ * - __u32
+ - ``dec_ref_pic_marking_bit_size``
+ -
+ * - __u32
+ - ``pic_order_cnt_bit_size``
+ -
+ * - __u8
+ - ``cabac_init_idc``
+ -
+ * - __s8
+ - ``slice_qp_delta``
+ -
+ * - __s8
+ - ``slice_qs_delta``
+ -
+ * - __u8
+ - ``disable_deblocking_filter_idc``
+ -
+ * - __s8
+ - ``slice_alpha_c0_offset_div2``
+ -
+ * - __s8
+ - ``slice_beta_offset_div2``
+ -
+ * - __u8
+ - ``num_ref_idx_l0_active_minus1``
+ -
+ * - __u8
+ - ``num_ref_idx_l1_active_minus1``
+ -
+ * - __u32
+ - ``slice_group_change_cycle``
+ -
+ * - __u8
+ - ``ref_pic_list0[32]``
+ - Reference picture list after applying the per-slice modifications
+ * - __u8
+ - ``ref_pic_list1[32]``
+ - Reference picture list after applying the per-slice modifications
+ * - __u32
+ - ``flags``
+ - See :ref:`Slice Parameter Flags <h264_slice_flags>`
+
+.. _h264_slice_flags:
+
+``Slice Parameter Set Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - ``V4L2_H264_SLICE_FLAG_FIELD_PIC``
+ - 0x00000001
+ -
+ * - ``V4L2_H264_SLICE_FLAG_BOTTOM_FIELD``
+ - 0x00000002
+ -
+ * - ``V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED``
+ - 0x00000004
+ -
+ * - ``V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH``
+ - 0x00000008
+ -
+
+``Prediction Weight Table``
+
+ The bitstream parameters are defined according to :ref:`h264`,
+ section 7.4.3.2 "Prediction Weight Table Semantics". For further
+ documentation, refer to the above specification, unless there is
+ an explicit comment stating otherwise.
+
+.. c:type:: v4l2_h264_pred_weight_table
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_h264_pred_weight_table
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - __u16
+ - ``luma_log2_weight_denom``
+ -
+ * - __u16
+ - ``chroma_log2_weight_denom``
+ -
+ * - struct :c:type:`v4l2_h264_weight_factors`
+ - ``weight_factors[2]``
+ - The weight factors at index 0 are the weight factors for the reference
+ list 0, the one at index 1 for the reference list 1.
+
+.. c:type:: v4l2_h264_weight_factors
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_h264_weight_factors
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - __s16
+ - ``luma_weight[32]``
+ -
+ * - __s16
+ - ``luma_offset[32]``
+ -
+ * - __s16
+ - ``chroma_weight[32][2]``
+ -
+ * - __s16
+ - ``chroma_offset[32][2]``
+ -
+
+``V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (struct)``
+ Specifies the decode parameters (as extracted from the bitstream)
+ for the associated H264 slice data. This includes the necessary
+ parameters for configuring a stateless hardware decoding pipeline
+ for H264. The bitstream parameters are defined according to
+ :ref:`h264`. For further documentation, refer to the above
+ specification, unless there is an explicit comment stating
+ otherwise.
+
+ .. note::
+
+ This compound control is not yet part of the public kernel API and
+ it is expected to change.
+
+.. c:type:: v4l2_ctrl_h264_decode_params
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_ctrl_h264_decode_params
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - struct :c:type:`v4l2_h264_dpb_entry`
+ - ``dpb[16]``
+ -
+ * - __u16
+ - ``num_slices``
+ - Number of slices needed to decode the current frame
+ * - __u16
+ - ``nal_ref_idc``
+ - NAL reference ID value coming from the NAL Unit header
+ * - __u8
+ - ``ref_pic_list_p0[32]``
+ - Backward reference list used by P-frames in the original bitstream order
+ * - __u8
+ - ``ref_pic_list_b0[32]``
+ - Backward reference list used by B-frames in the original bitstream order
+ * - __u8
+ - ``ref_pic_list_b1[32]``
+ - Forward reference list used by B-frames in the original bitstream order
+ * - __s32
+ - ``top_field_order_cnt``
+ - Picture Order Count for the coded top field
+ * - __s32
+ - ``bottom_field_order_cnt``
+ - Picture Order Count for the coded bottom field
+ * - __u32
+ - ``flags``
+ - See :ref:`Decode Parameters Flags <h264_decode_params_flags>`
+
+.. _h264_decode_params_flags:
+
+``Decode Parameters Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - ``V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC``
+ - 0x00000001
+ - That picture is an IDR picture
+
+.. c:type:: v4l2_h264_dpb_entry
+
+.. cssclass:: longtable
+
+.. flat-table:: struct v4l2_h264_dpb_entry
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - __u64
+ - ``reference_ts``
+ - Timestamp of the V4L2 capture buffer to use as reference, used
+ with B-coded and P-coded frames. The timestamp refers to the
+ ``timestamp`` field in struct :c:type:`v4l2_buffer`. Use the
+ :c:func:`v4l2_timeval_to_ns()` function to convert the struct
+ :c:type:`timeval` in struct :c:type:`v4l2_buffer` to a __u64.
+ * - __u16
+ - ``frame_num``
+ -
+ * - __u16
+ - ``pic_num``
+ -
+ * - __s32
+ - ``top_field_order_cnt``
+ -
+ * - __s32
+ - ``bottom_field_order_cnt``
+ -
+ * - __u32
+ - ``flags``
+ - See :ref:`DPB Entry Flags <h264_dpb_flags>`
+
+.. _h264_dpb_flags:
+
+``DPB Entries Flags``
+
+.. cssclass:: longtable
+
+.. flat-table::
+ :header-rows: 0
+ :stub-columns: 0
+ :widths: 1 1 2
+
+ * - ``V4L2_H264_DPB_ENTRY_FLAG_VALID``
+ - 0x00000001
+ - The DPB entry is valid and should be considered
+ * - ``V4L2_H264_DPB_ENTRY_FLAG_ACTIVE``
+ - 0x00000002
+ - The DPB entry is currently being used as a reference frame
+ * - ``V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM``
+ - 0x00000004
+ - The DPB entry is a long term reference frame
.. _v4l2-mpeg-mpeg2:
diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst
index 24274b398e63..655362483730 100644
--- a/Documentation/media/uapi/v4l/extended-controls.rst
+++ b/Documentation/media/uapi/v4l/extended-controls.rst
@@ -85,20 +85,17 @@ be able to see such compound controls. In other words, these controls
with compound types should only be used programmatically.
Since such compound controls need to expose more information about
-themselves than is possible with
-:ref:`VIDIOC_QUERYCTRL` the
-:ref:`VIDIOC_QUERY_EXT_CTRL <VIDIOC_QUERYCTRL>` ioctl was added. In
-particular, this ioctl gives the dimensions of the N-dimensional array
-if this control consists of more than one element.
+themselves than is possible with :ref:`VIDIOC_QUERYCTRL <VIDIOC_QUERYCTRL>`
+the :ref:`VIDIOC_QUERY_EXT_CTRL <VIDIOC_QUERYCTRL>` ioctl was added. In
+particular, this ioctl gives the dimensions of the N-dimensional array if
+this control consists of more than one element.
.. note::
#. It is important to realize that due to the flexibility of controls it is
necessary to check whether the control you want to set actually is
supported in the driver and what the valid range of values is. So use
- the :ref:`VIDIOC_QUERYCTRL` (or :ref:`VIDIOC_QUERY_EXT_CTRL
- <VIDIOC_QUERYCTRL>`) and :ref:`VIDIOC_QUERYMENU <VIDIOC_QUERYCTRL>`
- ioctls to check this.
+ :ref:`VIDIOC_QUERYCTRL` to check this.
#. It is possible that some of the menu indices in a control of
type ``V4L2_CTRL_TYPE_MENU`` may not be supported (``VIDIOC_QUERYMENU``
@@ -144,7 +141,7 @@ control class is found:
while (0 == ioctl(fd, VIDIOC_QUERYCTRL, &qctrl)) {
if (V4L2_CTRL_ID2CLASS(qctrl.id) != V4L2_CTRL_CLASS_MPEG)
break;
- /* ... */
+ /* ... */
qctrl.id |= V4L2_CTRL_FLAG_NEXT_CTRL;
}
diff --git a/Documentation/media/uapi/v4l/field-order.rst b/Documentation/media/uapi/v4l/field-order.rst
index d640e922a974..c422bebe4314 100644
--- a/Documentation/media/uapi/v4l/field-order.rst
+++ b/Documentation/media/uapi/v4l/field-order.rst
@@ -51,6 +51,11 @@ determined by the video standard. Hence the distinction between temporal
and spatial order of fields. The diagrams below should make this
clearer.
+In V4L it is assumed that all video cameras transmit fields on the media
+bus in the same order they were captured, so if the top field was
+captured first (is the older field), the top field is also transmitted
+first on the bus.
+
All video capture and output devices must report the current field
order. Some drivers may permit the selection of a different order, to
this end applications initialize the ``field`` field of struct
@@ -101,10 +106,10 @@ enum v4l2_field
* - ``V4L2_FIELD_INTERLACED``
- 4
- Images contain both fields, interleaved line by line. The temporal
- order of the fields (whether the top or bottom field is first
- transmitted) depends on the current video standard. M/NTSC
- transmits the bottom field first, all other standards the top
- field first.
+ order of the fields (whether the top or bottom field is older)
+ depends on the current video standard. In M/NTSC the bottom
+ field is the older field. In all other standards the top field
+ is the older field.
* - ``V4L2_FIELD_SEQ_TB``
- 5
- Images contain both fields, the top field lines are stored first
@@ -135,11 +140,11 @@ enum v4l2_field
* - ``V4L2_FIELD_INTERLACED_TB``
- 8
- Images contain both fields, interleaved line by line, top field
- first. The top field is transmitted first.
+ first. The top field is the older field.
* - ``V4L2_FIELD_INTERLACED_BT``
- 9
- Images contain both fields, interleaved line by line, top field
- first. The bottom field is transmitted first.
+ first. The bottom field is the older field.
diff --git a/Documentation/media/uapi/v4l/pixfmt-compressed.rst b/Documentation/media/uapi/v4l/pixfmt-compressed.rst
index 6c961cfb74da..4b701fc7653e 100644
--- a/Documentation/media/uapi/v4l/pixfmt-compressed.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-compressed.rst
@@ -52,6 +52,31 @@ Compressed Formats
- ``V4L2_PIX_FMT_H264_MVC``
- 'M264'
- H264 MVC video elementary stream.
+ * .. _V4L2-PIX-FMT-H264-SLICE-RAW:
+
+ - ``V4L2_PIX_FMT_H264_SLICE_RAW``
+ - 'S264'
+ - H264 parsed slice data, without the start code and as
+ extracted from the H264 bitstream. This format is adapted for
+ stateless video decoders that implement an H264 pipeline
+ (using the :ref:`mem2mem` and :ref:`media-request-api`).
+ Metadata associated with the frame to decode are required to
+ be passed through the ``V4L2_CID_MPEG_VIDEO_H264_SPS``,
+ ``V4L2_CID_MPEG_VIDEO_H264_PPS``,
+ ``V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX``,
+ ``V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS`` and
+ ``V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS`` controls. See the
+ :ref:`associated Codec Control IDs <v4l2-mpeg-h264>`. Exactly
+ one output and one capture buffer must be provided for use
+ with this pixel format. The output buffer must contain the
+ appropriate number of macroblocks to decode a full
+ corresponding frame to the matching capture buffer.
+
+ .. note::
+
+ This format is not yet part of the public kernel API and it
+ is expected to change.
+
* .. _V4L2-PIX-FMT-H263:
- ``V4L2_PIX_FMT_H263``
diff --git a/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst b/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst
index 5688c816e334..db43dda5aafb 100644
--- a/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-v4l2-mplane.rst
@@ -31,7 +31,20 @@ describing all planes of that format.
* - __u32
- ``sizeimage``
- - Maximum size in bytes required for image data in this plane.
+ - Maximum size in bytes required for image data in this plane,
+ set by the driver. When the image consists of variable length
+ compressed data this is the number of bytes required by the
+ codec to support the worst-case compression scenario.
+
+ The driver will set the value for uncompressed images.
+
+ Clients are allowed to set the sizeimage field for variable length
+ compressed data flagged with ``V4L2_FMT_FLAG_COMPRESSED`` at
+ :ref:`VIDIOC_ENUM_FMT`, but the driver may ignore it and set the
+ value itself, or it may modify the provided value based on
+ alignment requirements or minimum/maximum size requirements.
+ If the client wants to leave this to the driver, then it should
+ set sizeimage to 0.
* - __u32
- ``bytesperline``
- Distance in bytes between the leftmost pixels in two adjacent
diff --git a/Documentation/media/uapi/v4l/pixfmt-v4l2.rst b/Documentation/media/uapi/v4l/pixfmt-v4l2.rst
index 71eebfc6d853..da6da2ef139a 100644
--- a/Documentation/media/uapi/v4l/pixfmt-v4l2.rst
+++ b/Documentation/media/uapi/v4l/pixfmt-v4l2.rst
@@ -89,7 +89,18 @@ Single-planar format structure
- Size in bytes of the buffer to hold a complete image, set by the
driver. Usually this is ``bytesperline`` times ``height``. When
the image consists of variable length compressed data this is the
- maximum number of bytes required to hold an image.
+ number of bytes required by the codec to support the worst-case
+ compression scenario.
+
+ The driver will set the value for uncompressed images.
+
+ Clients are allowed to set the sizeimage field for variable length
+ compressed data flagged with ``V4L2_FMT_FLAG_COMPRESSED`` at
+ :ref:`VIDIOC_ENUM_FMT`, but the driver may ignore it and set the
+ value itself, or it may modify the provided value based on
+ alignment requirements or minimum/maximum size requirements.
+ If the client wants to leave this to the driver, then it should
+ set sizeimage to 0.
* - __u32
- ``colorspace``
- Image colorspace, from enum :c:type:`v4l2_colorspace`.
diff --git a/Documentation/media/uapi/v4l/vidioc-qbuf.rst b/Documentation/media/uapi/v4l/vidioc-qbuf.rst
index dbf7b445a27b..407302d80684 100644
--- a/Documentation/media/uapi/v4l/vidioc-qbuf.rst
+++ b/Documentation/media/uapi/v4l/vidioc-qbuf.rst
@@ -139,6 +139,14 @@ may continue as normal, but should be aware that data in the dequeued
buffer might be corrupted. When using the multi-planar API, the planes
array must be passed in as well.
+If the application sets the ``memory`` field to ``V4L2_MEMORY_DMABUF`` to
+dequeue a :ref:`DMABUF <dmabuf>` buffer, the driver fills the ``m.fd`` field
+with a file descriptor numerically the same as the one given to ``VIDIOC_QBUF``
+when the buffer was enqueued. No new file descriptor is created at dequeue time
+and the value is only for the application convenience. When the multi-planar
+API is used the ``m.fd`` fields of the passed array of struct
+:c:type:`v4l2_plane` are filled instead.
+
By default ``VIDIOC_DQBUF`` blocks when no buffer is in the outgoing
queue. When the ``O_NONBLOCK`` flag was given to the
:ref:`open() <func-open>` function, ``VIDIOC_DQBUF`` returns
diff --git a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst
index f824162d0ea9..dc500632095d 100644
--- a/Documentation/media/uapi/v4l/vidioc-queryctrl.rst
+++ b/Documentation/media/uapi/v4l/vidioc-queryctrl.rst
@@ -443,6 +443,36 @@ See also the examples in :ref:`control`.
- n/a
- A struct :c:type:`v4l2_ctrl_mpeg2_quantization`, containing MPEG-2
quantization matrices for stateless video decoders.
+ * - ``V4L2_CTRL_TYPE_H264_SPS``
+ - n/a
+ - n/a
+ - n/a
+ - A struct :c:type:`v4l2_ctrl_h264_sps`, containing H264
+ sequence parameters for stateless video decoders.
+ * - ``V4L2_CTRL_TYPE_H264_PPS``
+ - n/a
+ - n/a
+ - n/a
+ - A struct :c:type:`v4l2_ctrl_h264_pps`, containing H264
+ picture parameters for stateless video decoders.
+ * - ``V4L2_CTRL_TYPE_H264_SCALING_MATRIX``
+ - n/a
+ - n/a
+ - n/a
+ - A struct :c:type:`v4l2_ctrl_h264_scaling_matrix`, containing H264
+ scaling matrices for stateless video decoders.
+ * - ``V4L2_CTRL_TYPE_H264_SLICE_PARAMS``
+ - n/a
+ - n/a
+ - n/a
+ - A struct :c:type:`v4l2_ctrl_h264_slice_params`, containing H264
+ slice parameters for stateless video decoders.
+ * - ``V4L2_CTRL_TYPE_H264_DECODE_PARAMS``
+ - n/a
+ - n/a
+ - n/a
+ - A struct :c:type:`v4l2_ctrl_h264_decode_params`, containing H264
+ decode parameters for stateless video decoders.
.. tabularcolumns:: |p{6.6cm}|p{2.2cm}|p{8.7cm}|
diff --git a/Documentation/media/v4l-drivers/index.rst b/Documentation/media/v4l-drivers/index.rst
index 33a055907258..c4c78a28654c 100644
--- a/Documentation/media/v4l-drivers/index.rst
+++ b/Documentation/media/v4l-drivers/index.rst
@@ -64,5 +64,6 @@ For more details see the file COPYING in the source distribution of Linux.
si476x
soc-camera
uvcvideo
+ vimc
vivid
zr364xx
diff --git a/Documentation/media/v4l-drivers/vimc.dot b/Documentation/media/v4l-drivers/vimc.dot
new file mode 100644
index 000000000000..57863a13fa39
--- /dev/null
+++ b/Documentation/media/v4l-drivers/vimc.dot
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0
+
+digraph board {
+ rankdir=TB
+ n00000001 [label="{{} | Sensor A\n/dev/v4l-subdev0 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000001:port0 -> n00000005:port0 [style=bold]
+ n00000001:port0 -> n0000000b [style=bold]
+ n00000003 [label="{{} | Sensor B\n/dev/v4l-subdev1 | {<port0> 0}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000003:port0 -> n00000008:port0 [style=bold]
+ n00000003:port0 -> n0000000f [style=bold]
+ n00000005 [label="{{<port0> 0} | Debayer A\n/dev/v4l-subdev2 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000005:port1 -> n00000017:port0
+ n00000008 [label="{{<port0> 0} | Debayer B\n/dev/v4l-subdev3 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000008:port1 -> n00000017:port0 [style=dashed]
+ n0000000b [label="Raw Capture 0\n/dev/video0", shape=box, style=filled, fillcolor=yellow]
+ n0000000f [label="Raw Capture 1\n/dev/video1", shape=box, style=filled, fillcolor=yellow]
+ n00000013 [label="RGB/YUV Input\n/dev/video2", shape=box, style=filled, fillcolor=yellow]
+ n00000013 -> n00000017:port0 [style=dashed]
+ n00000017 [label="{{<port0> 0} | Scaler\n/dev/v4l-subdev4 | {<port1> 1}}", shape=Mrecord, style=filled, fillcolor=green]
+ n00000017:port1 -> n0000001a [style=bold]
+ n0000001a [label="RGB/YUV Capture\n/dev/video3", shape=box, style=filled, fillcolor=yellow]
+}
diff --git a/Documentation/media/v4l-drivers/vimc.rst b/Documentation/media/v4l-drivers/vimc.rst
new file mode 100644
index 000000000000..4628b12d417f
--- /dev/null
+++ b/Documentation/media/v4l-drivers/vimc.rst
@@ -0,0 +1,98 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+The Virtual Media Controller Driver (vimc)
+==========================================
+
+The vimc driver emulates complex video hardware using the V4L2 API and the Media
+API. It has a capture device and three subdevices: sensor, debayer and scaler.
+
+Topology
+--------
+
+The topology is hardcoded, although you could modify it in vimc-core and
+recompile the driver to achieve your own topology. This is the default topology:
+
+.. _vimc_topology_graph:
+
+.. kernel-figure:: vimc.dot
+ :alt: vimc.dot
+ :align: center
+
+ Media pipeline graph on vimc
+
+Configuring the topology
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Each subdevice will come with its default configuration (pixelformat, height,
+width, ...). One needs to configure the topology in order to match the
+configuration on each linked subdevice to stream frames through the pipeline.
+If the configuration doesn't match, the stream will fail. The ``v4l-utils``
+package is a bundle of user-space applications, that comes with ``media-ctl`` and
+``v4l2-ctl`` that can be used to configure the vimc configuration. This sequence
+of commands fits for the default topology:
+
+.. code-block:: bash
+
+ media-ctl -d platform:vimc -V '"Sensor A":0[fmt:SBGGR8_1X8/640x480]'
+ media-ctl -d platform:vimc -V '"Debayer A":0[fmt:SBGGR8_1X8/640x480]'
+ media-ctl -d platform:vimc -V '"Sensor B":0[fmt:SBGGR8_1X8/640x480]'
+ media-ctl -d platform:vimc -V '"Debayer B":0[fmt:SBGGR8_1X8/640x480]'
+ v4l2-ctl -z platform:vimc -d "RGB/YUV Capture" -v width=1920,height=1440
+ v4l2-ctl -z platform:vimc -d "Raw Capture 0" -v pixelformat=BA81
+ v4l2-ctl -z platform:vimc -d "Raw Capture 1" -v pixelformat=BA81
+
+Subdevices
+----------
+
+Subdevices define the behavior of an entity in the topology. Depending on the
+subdevice, the entity can have multiple pads of type source or sink.
+
+vimc-sensor:
+ Generates images in several formats using video test pattern generator.
+ Exposes:
+
+ * 1 Pad source
+
+vimc-debayer:
+ Transforms images in bayer format into a non-bayer format.
+ Exposes:
+
+ * 1 Pad sink
+ * 1 Pad source
+
+vimc-scaler:
+ Scale up the image by a factor of 3. E.g.: a 640x480 image becomes a
+ 1920x1440 image. (this value can be configured, see at
+ `Module options`_).
+ Exposes:
+
+ * 1 Pad sink
+ * 1 Pad source
+
+vimc-capture:
+ Exposes node /dev/videoX to allow userspace to capture the stream.
+ Exposes:
+
+ * 1 Pad sink
+ * 1 Pad source
+
+Module options
+---------------
+
+Vimc has a few module parameters to configure the driver. You should pass
+those arguments to each subdevice, not to the vimc module. For example::
+
+ vimc_subdevice.param=value
+
+* ``vimc_scaler.sca_mult=<unsigned int>``
+
+ Image size multiplier factor to be used to multiply both width and
+ height, so the image size will be ``sca_mult^2`` bigger than the
+ original one. Currently, only supports scaling up (the default value
+ is 3).
+
+* ``vimc_debayer.deb_mean_win_size=<unsigned int>``
+
+ Window size to calculate the mean. Note: the window size needs to be an
+ odd number, as the main pixel stays in the center of the window,
+ otherwise the next odd number is considered (the default value is 3).
diff --git a/Documentation/media/v4l-drivers/vivid.rst b/Documentation/media/v4l-drivers/vivid.rst
index edb6f33e029c..7082fec4075d 100644
--- a/Documentation/media/v4l-drivers/vivid.rst
+++ b/Documentation/media/v4l-drivers/vivid.rst
@@ -941,6 +941,11 @@ Digital Video Controls
affects the reported colorspace since DVI_D outputs will always use
sRGB.
+- Display Present:
+
+ sets the presence of a "display" on the HDMI output. This affects
+ the tx_edid_present, tx_hotplug and tx_rxsense controls.
+
FM Radio Receiver Controls
~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/media/videodev2.h.rst.exceptions b/Documentation/media/videodev2.h.rst.exceptions
index 64d348e67df9..55cbe324b9fc 100644
--- a/Documentation/media/videodev2.h.rst.exceptions
+++ b/Documentation/media/videodev2.h.rst.exceptions
@@ -136,6 +136,11 @@ replace symbol V4L2_CTRL_TYPE_U32 :c:type:`v4l2_ctrl_type`
replace symbol V4L2_CTRL_TYPE_U8 :c:type:`v4l2_ctrl_type`
replace symbol V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS :c:type:`v4l2_ctrl_type`
replace symbol V4L2_CTRL_TYPE_MPEG2_QUANTIZATION :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_SPS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_PPS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_SCALING_MATRIX :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_SLICE_PARAMS :c:type:`v4l2_ctrl_type`
+replace symbol V4L2_CTRL_TYPE_H264_DECODE_PARAMS :c:type:`v4l2_ctrl_type`
# V4L2 capability defines
replace define V4L2_CAP_VIDEO_CAPTURE device-capabilities
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f4170aae1d75..045bb8148fe9 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -3,7 +3,7 @@
============================
By: David Howells <dhowells@redhat.com>
- Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ Paul E. McKenney <paulmck@linux.ibm.com>
Will Deacon <will.deacon@arm.com>
Peter Zijlstra <peterz@infradead.org>
diff --git a/Documentation/memory-devices/ti-emif.txt b/Documentation/memory-devices/ti-emif.txt
deleted file mode 100644
index f4ad9a7d0f4b..000000000000
--- a/Documentation/memory-devices/ti-emif.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-TI EMIF SDRAM Controller Driver:
-
-Author
-========
-Aneesh V <aneesh@ti.com>
-
-Location
-============
-driver/memory/emif.c
-
-Supported SoCs:
-===================
-TI OMAP44xx
-TI OMAP54xx
-
-Menuconfig option:
-==========================
-Device Drivers
- Memory devices
- Texas Instruments EMIF driver
-
-Description
-===========
-This driver is for the EMIF module available in Texas Instruments
-SoCs. EMIF is an SDRAM controller that, based on its revision,
-supports one or more of DDR2, DDR3, and LPDDR2 SDRAM protocols.
-This driver takes care of only LPDDR2 memories presently. The
-functions of the driver includes re-configuring AC timing
-parameters and other settings during frequency, voltage and
-temperature changes
-
-Platform Data (see include/linux/platform_data/emif_plat.h):
-=====================================================================
-DDR device details and other board dependent and SoC dependent
-information can be passed through platform data (struct emif_platform_data)
-- DDR device details: 'struct ddr_device_info'
-- Device AC timings: 'struct lpddr2_timings' and 'struct lpddr2_min_tck'
-- Custom configurations: customizable policy options through
- 'struct emif_custom_configs'
-- IP revision
-- PHY type
-
-Interface to the external world:
-================================
-EMIF driver registers notifiers for voltage and frequency changes
-affecting EMIF and takes appropriate actions when these are invoked.
-- freq_pre_notify_handling()
-- freq_post_notify_handling()
-- volt_notify_handling()
-
-Debugfs
-========
-The driver creates two debugfs entries per device.
-- regcache_dump : dump of register values calculated and saved for all
- frequencies used so far.
-- mr4 : last polled value of MR4 register in the LPDDR2 device. MR4
- indicates the current temperature level of the device.
diff --git a/Documentation/mic/index.rst b/Documentation/mic/index.rst
index 082fa8f6a260..3a8d06367ef1 100644
--- a/Documentation/mic/index.rst
+++ b/Documentation/mic/index.rst
@@ -1,5 +1,3 @@
-:orphan:
-
=============================================
Intel Many Integrated Core (MIC) architecture
=============================================
diff --git a/Documentation/misc-devices/eeprom b/Documentation/misc-devices/eeprom
deleted file mode 100644
index ba692011f221..000000000000
--- a/Documentation/misc-devices/eeprom
+++ /dev/null
@@ -1,96 +0,0 @@
-Kernel driver eeprom
-====================
-
-Supported chips:
- * Any EEPROM chip in the designated address range
- Prefix: 'eeprom'
- Addresses scanned: I2C 0x50 - 0x57
- Datasheets: Publicly available from:
- Atmel (www.atmel.com),
- Catalyst (www.catsemi.com),
- Fairchild (www.fairchildsemi.com),
- Microchip (www.microchip.com),
- Philips (www.semiconductor.philips.com),
- Rohm (www.rohm.com),
- ST (www.st.com),
- Xicor (www.xicor.com),
- and others.
-
- Chip Size (bits) Address
- 24C01 1K 0x50 (shadows at 0x51 - 0x57)
- 24C01A 1K 0x50 - 0x57 (Typical device on DIMMs)
- 24C02 2K 0x50 - 0x57
- 24C04 4K 0x50, 0x52, 0x54, 0x56
- (additional data at 0x51, 0x53, 0x55, 0x57)
- 24C08 8K 0x50, 0x54 (additional data at 0x51, 0x52,
- 0x53, 0x55, 0x56, 0x57)
- 24C16 16K 0x50 (additional data at 0x51 - 0x57)
- Sony 2K 0x57
-
- Atmel 34C02B 2K 0x50 - 0x57, SW write protect at 0x30-37
- Catalyst 34FC02 2K 0x50 - 0x57, SW write protect at 0x30-37
- Catalyst 34RC02 2K 0x50 - 0x57, SW write protect at 0x30-37
- Fairchild 34W02 2K 0x50 - 0x57, SW write protect at 0x30-37
- Microchip 24AA52 2K 0x50 - 0x57, SW write protect at 0x30-37
- ST M34C02 2K 0x50 - 0x57, SW write protect at 0x30-37
-
-
-Authors:
- Frodo Looijaard <frodol@dds.nl>,
- Philip Edelbrock <phil@netroedge.com>,
- Jean Delvare <jdelvare@suse.de>,
- Greg Kroah-Hartman <greg@kroah.com>,
- IBM Corp.
-
-Description
------------
-
-This is a simple EEPROM module meant to enable reading the first 256 bytes
-of an EEPROM (on a SDRAM DIMM for example). However, it will access serial
-EEPROMs on any I2C adapter. The supported devices are generically called
-24Cxx, and are listed above; however the numbering for these
-industry-standard devices may vary by manufacturer.
-
-This module was a programming exercise to get used to the new project
-organization laid out by Frodo, but it should be at least completely
-effective for decoding the contents of EEPROMs on DIMMs.
-
-DIMMS will typically contain a 24C01A or 24C02, or the 34C02 variants.
-The other devices will not be found on a DIMM because they respond to more
-than one address.
-
-DDC Monitors may contain any device. Often a 24C01, which responds to all 8
-addresses, is found.
-
-Recent Sony Vaio laptops have an EEPROM at 0x57. We couldn't get the
-specification, so it is guess work and far from being complete.
-
-The Microchip 24AA52/24LCS52, ST M34C02, and others support an additional
-software write protect register at 0x30 - 0x37 (0x20 less than the memory
-location). The chip responds to "write quick" detection at this address but
-does not respond to byte reads. If this register is present, the lower 128
-bytes of the memory array are not write protected. Any byte data write to
-this address will write protect the memory array permanently, and the
-device will no longer respond at the 0x30-37 address. The eeprom driver
-does not support this register.
-
-Lacking functionality:
-
-* Full support for larger devices (24C04, 24C08, 24C16). These are not
-typically found on a PC. These devices will appear as separate devices at
-multiple addresses.
-
-* Support for really large devices (24C32, 24C64, 24C128, 24C256, 24C512).
-These devices require two-byte address fields and are not supported.
-
-* Enable Writing. Again, no technical reason why not, but making it easy
-to change the contents of the EEPROMs (on DIMMs anyway) also makes it easy
-to disable the DIMMs (potentially preventing the computer from booting)
-until the values are restored somehow.
-
-Use:
-
-After inserting the module (and any other required SMBus/i2c modules), you
-should have some EEPROM directories in /sys/bus/i2c/devices/* of names such
-as "0-0050". Inside each of these is a series of files, the eeprom file
-contains the binary data from EEPROM.
diff --git a/Documentation/misc-devices/eeprom.rst b/Documentation/misc-devices/eeprom.rst
new file mode 100644
index 000000000000..008249675ccc
--- /dev/null
+++ b/Documentation/misc-devices/eeprom.rst
@@ -0,0 +1,107 @@
+====================
+Kernel driver eeprom
+====================
+
+Supported chips:
+
+ * Any EEPROM chip in the designated address range
+
+ Prefix: 'eeprom'
+
+ Addresses scanned: I2C 0x50 - 0x57
+
+ Datasheets: Publicly available from:
+
+ Atmel (www.atmel.com),
+ Catalyst (www.catsemi.com),
+ Fairchild (www.fairchildsemi.com),
+ Microchip (www.microchip.com),
+ Philips (www.semiconductor.philips.com),
+ Rohm (www.rohm.com),
+ ST (www.st.com),
+ Xicor (www.xicor.com),
+ and others.
+
+ ========= ============= ============================================
+ Chip Size (bits) Address
+ ========= ============= ============================================
+ 24C01 1K 0x50 (shadows at 0x51 - 0x57)
+ 24C01A 1K 0x50 - 0x57 (Typical device on DIMMs)
+ 24C02 2K 0x50 - 0x57
+ 24C04 4K 0x50, 0x52, 0x54, 0x56
+ (additional data at 0x51, 0x53, 0x55, 0x57)
+ 24C08 8K 0x50, 0x54 (additional data at 0x51, 0x52,
+ 0x53, 0x55, 0x56, 0x57)
+ 24C16 16K 0x50 (additional data at 0x51 - 0x57)
+ Sony 2K 0x57
+
+ Atmel 34C02B 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Catalyst 34FC02 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Catalyst 34RC02 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Fairchild 34W02 2K 0x50 - 0x57, SW write protect at 0x30-37
+ Microchip 24AA52 2K 0x50 - 0x57, SW write protect at 0x30-37
+ ST M34C02 2K 0x50 - 0x57, SW write protect at 0x30-37
+ ========= ============= ============================================
+
+
+Authors:
+ - Frodo Looijaard <frodol@dds.nl>,
+ - Philip Edelbrock <phil@netroedge.com>,
+ - Jean Delvare <jdelvare@suse.de>,
+ - Greg Kroah-Hartman <greg@kroah.com>,
+ - IBM Corp.
+
+Description
+-----------
+
+This is a simple EEPROM module meant to enable reading the first 256 bytes
+of an EEPROM (on a SDRAM DIMM for example). However, it will access serial
+EEPROMs on any I2C adapter. The supported devices are generically called
+24Cxx, and are listed above; however the numbering for these
+industry-standard devices may vary by manufacturer.
+
+This module was a programming exercise to get used to the new project
+organization laid out by Frodo, but it should be at least completely
+effective for decoding the contents of EEPROMs on DIMMs.
+
+DIMMS will typically contain a 24C01A or 24C02, or the 34C02 variants.
+The other devices will not be found on a DIMM because they respond to more
+than one address.
+
+DDC Monitors may contain any device. Often a 24C01, which responds to all 8
+addresses, is found.
+
+Recent Sony Vaio laptops have an EEPROM at 0x57. We couldn't get the
+specification, so it is guess work and far from being complete.
+
+The Microchip 24AA52/24LCS52, ST M34C02, and others support an additional
+software write protect register at 0x30 - 0x37 (0x20 less than the memory
+location). The chip responds to "write quick" detection at this address but
+does not respond to byte reads. If this register is present, the lower 128
+bytes of the memory array are not write protected. Any byte data write to
+this address will write protect the memory array permanently, and the
+device will no longer respond at the 0x30-37 address. The eeprom driver
+does not support this register.
+
+Lacking functionality
+---------------------
+
+* Full support for larger devices (24C04, 24C08, 24C16). These are not
+ typically found on a PC. These devices will appear as separate devices at
+ multiple addresses.
+
+* Support for really large devices (24C32, 24C64, 24C128, 24C256, 24C512).
+ These devices require two-byte address fields and are not supported.
+
+* Enable Writing. Again, no technical reason why not, but making it easy
+ to change the contents of the EEPROMs (on DIMMs anyway) also makes it easy
+ to disable the DIMMs (potentially preventing the computer from booting)
+ until the values are restored somehow.
+
+Use
+---
+
+After inserting the module (and any other required SMBus/i2c modules), you
+should have some EEPROM directories in ``/sys/bus/i2c/devices/*`` of names such
+as "0-0050". Inside each of these is a series of files, the eeprom file
+contains the binary data from EEPROM.
diff --git a/Documentation/misc-devices/ics932s401 b/Documentation/misc-devices/ics932s401
deleted file mode 100644
index bdac67ff6e3f..000000000000
--- a/Documentation/misc-devices/ics932s401
+++ /dev/null
@@ -1,31 +0,0 @@
-Kernel driver ics932s401
-======================
-
-Supported chips:
- * IDT ICS932S401
- Prefix: 'ics932s401'
- Addresses scanned: I2C 0x69
- Datasheet: Publicly available at the IDT website
-
-Author: Darrick J. Wong
-
-Description
------------
-
-This driver implements support for the IDT ICS932S401 chip family.
-
-This chip has 4 clock outputs--a base clock for the CPU (which is likely
-multiplied to get the real CPU clock), a system clock, a PCI clock, a USB
-clock, and a reference clock. The driver reports selected and actual
-frequency. If spread spectrum mode is enabled, the driver also reports by what
-percent the clock signal is being spread, which should be between 0 and -0.5%.
-All frequencies are reported in KHz.
-
-The ICS932S401 monitors all inputs continuously. The driver will not read
-the registers more often than once every other second.
-
-Special Features
-----------------
-
-The clocks could be reprogrammed to increase system speed. I will not help you
-do this, as you risk damaging your system!
diff --git a/Documentation/misc-devices/ics932s401.rst b/Documentation/misc-devices/ics932s401.rst
new file mode 100644
index 000000000000..613ee54a9c21
--- /dev/null
+++ b/Documentation/misc-devices/ics932s401.rst
@@ -0,0 +1,36 @@
+========================
+Kernel driver ics932s401
+========================
+
+Supported chips:
+
+ * IDT ICS932S401
+
+ Prefix: 'ics932s401'
+
+ Addresses scanned: I2C 0x69
+
+ Datasheet: Publicly available at the IDT website
+
+Author: Darrick J. Wong
+
+Description
+-----------
+
+This driver implements support for the IDT ICS932S401 chip family.
+
+This chip has 4 clock outputs--a base clock for the CPU (which is likely
+multiplied to get the real CPU clock), a system clock, a PCI clock, a USB
+clock, and a reference clock. The driver reports selected and actual
+frequency. If spread spectrum mode is enabled, the driver also reports by what
+percent the clock signal is being spread, which should be between 0 and -0.5%.
+All frequencies are reported in KHz.
+
+The ICS932S401 monitors all inputs continuously. The driver will not read
+the registers more often than once every other second.
+
+Special Features
+----------------
+
+The clocks could be reprogrammed to increase system speed. I will not help you
+do this, as you risk damaging your system!
diff --git a/Documentation/misc-devices/index.rst b/Documentation/misc-devices/index.rst
index dfd1f45a3127..a57f92dfe49a 100644
--- a/Documentation/misc-devices/index.rst
+++ b/Documentation/misc-devices/index.rst
@@ -14,4 +14,9 @@ fit into other categories.
.. toctree::
:maxdepth: 2
+ eeprom
ibmvmc
+ ics932s401
+ isl29003
+ lis3lv02d
+ max6875
diff --git a/Documentation/misc-devices/isl29003 b/Documentation/misc-devices/isl29003
deleted file mode 100644
index 80b952fd32ff..000000000000
--- a/Documentation/misc-devices/isl29003
+++ /dev/null
@@ -1,62 +0,0 @@
-Kernel driver isl29003
-=====================
-
-Supported chips:
-* Intersil ISL29003
-Prefix: 'isl29003'
-Addresses scanned: none
-Datasheet:
-http://www.intersil.com/data/fn/fn7464.pdf
-
-Author: Daniel Mack <daniel@caiaq.de>
-
-
-Description
------------
-The ISL29003 is an integrated light sensor with a 16-bit integrating type
-ADC, I2C user programmable lux range select for optimized counts/lux, and
-I2C multi-function control and monitoring capabilities. The internal ADC
-provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by
-artificial light sources.
-
-The driver allows to set the lux range, the bit resolution, the operational
-mode (see below) and the power state of device and can read the current lux
-value, of course.
-
-
-Detection
----------
-
-The ISL29003 does not have an ID register which could be used to identify
-it, so the detection routine will just try to read from the configured I2C
-address and consider the device to be present as soon as it ACKs the
-transfer.
-
-
-Sysfs entries
--------------
-
-range:
- 0: 0 lux to 1000 lux (default)
- 1: 0 lux to 4000 lux
- 2: 0 lux to 16,000 lux
- 3: 0 lux to 64,000 lux
-
-resolution:
- 0: 2^16 cycles (default)
- 1: 2^12 cycles
- 2: 2^8 cycles
- 3: 2^4 cycles
-
-mode:
- 0: diode1's current (unsigned 16bit) (default)
- 1: diode1's current (unsigned 16bit)
- 2: difference between diodes (l1 - l2, signed 15bit)
-
-power_state:
- 0: device is disabled (default)
- 1: device is enabled
-
-lux (read only):
- returns the value from the last sensor reading
-
diff --git a/Documentation/misc-devices/isl29003.rst b/Documentation/misc-devices/isl29003.rst
new file mode 100644
index 000000000000..0cc38aed6c00
--- /dev/null
+++ b/Documentation/misc-devices/isl29003.rst
@@ -0,0 +1,75 @@
+======================
+Kernel driver isl29003
+======================
+
+Supported chips:
+
+* Intersil ISL29003
+
+Prefix: 'isl29003'
+
+Addresses scanned: none
+
+Datasheet:
+http://www.intersil.com/data/fn/fn7464.pdf
+
+Author: Daniel Mack <daniel@caiaq.de>
+
+
+Description
+-----------
+The ISL29003 is an integrated light sensor with a 16-bit integrating type
+ADC, I2C user programmable lux range select for optimized counts/lux, and
+I2C multi-function control and monitoring capabilities. The internal ADC
+provides 16-bit resolution while rejecting 50Hz and 60Hz flicker caused by
+artificial light sources.
+
+The driver allows to set the lux range, the bit resolution, the operational
+mode (see below) and the power state of device and can read the current lux
+value, of course.
+
+
+Detection
+---------
+
+The ISL29003 does not have an ID register which could be used to identify
+it, so the detection routine will just try to read from the configured I2C
+address and consider the device to be present as soon as it ACKs the
+transfer.
+
+
+Sysfs entries
+-------------
+
+range:
+ == ===========================
+ 0: 0 lux to 1000 lux (default)
+ 1: 0 lux to 4000 lux
+ 2: 0 lux to 16,000 lux
+ 3: 0 lux to 64,000 lux
+ == ===========================
+
+resolution:
+ == =====================
+ 0: 2^16 cycles (default)
+ 1: 2^12 cycles
+ 2: 2^8 cycles
+ 3: 2^4 cycles
+ == =====================
+
+mode:
+ == =================================================
+ 0: diode1's current (unsigned 16bit) (default)
+ 1: diode1's current (unsigned 16bit)
+ 2: difference between diodes (l1 - l2, signed 15bit)
+ == =================================================
+
+power_state:
+ == =================================================
+ 0: device is disabled (default)
+ 1: device is enabled
+ == =================================================
+
+lux (read only):
+ returns the value from the last sensor reading
+
diff --git a/Documentation/misc-devices/lis3lv02d b/Documentation/misc-devices/lis3lv02d
deleted file mode 100644
index f89960a0ff95..000000000000
--- a/Documentation/misc-devices/lis3lv02d
+++ /dev/null
@@ -1,93 +0,0 @@
-Kernel driver lis3lv02d
-=======================
-
-Supported chips:
-
- * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision)
- * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits) and
- LIS331DLH (16 bits)
-
-Authors:
- Yan Burman <burman.yan@gmail.com>
- Eric Piel <eric.piel@tremplin-utc.net>
-
-
-Description
------------
-
-This driver provides support for the accelerometer found in various HP laptops
-sporting the feature officially called "HP Mobile Data Protection System 3D" or
-"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known
-models (full list can be found in drivers/platform/x86/hp_accel.c) will have
-their axis automatically oriented on standard way (eg: you can directly play
-neverball). The accelerometer data is readable via
-/sys/devices/platform/lis3lv02d. Reported values are scaled
-to mg values (1/1000th of earth gravity).
-
-Sysfs attributes under /sys/devices/platform/lis3lv02d/:
-position - 3D position that the accelerometer reports. Format: "(x,y,z)"
-rate - read reports the sampling rate of the accelerometer device in HZ.
- write changes sampling rate of the accelerometer device.
- Only values which are supported by HW are accepted.
-selftest - performs selftest for the chip as specified by chip manufacturer.
-
-This driver also provides an absolute input class device, allowing
-the laptop to act as a pinball machine-esque joystick. Joystick device can be
-calibrated. Joystick device can be in two different modes.
-By default output values are scaled between -32768 .. 32767. In joystick raw
-mode, joystick and sysfs position entry have the same scale. There can be
-small difference due to input system fuzziness feature.
-Events are also available as input event device.
-
-Selftest is meant only for hardware diagnostic purposes. It is not meant to be
-used during normal operations. Position data is not corrupted during selftest
-but interrupt behaviour is not guaranteed to work reliably. In test mode, the
-sensing element is internally moved little bit. Selftest measures difference
-between normal mode and test mode. Chip specifications tell the acceptance
-limit for each type of the chip. Limits are provided via platform data
-to allow adjustment of the limits without a change to the actual driver.
-Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are
-measured difference between modes. Axes are not remapped in selftest mode.
-Measurement values are provided to help HW diagnostic applications to make
-final decision.
-
-On HP laptops, if the led infrastructure is activated, support for a led
-indicating disk protection will be provided as /sys/class/leds/hp::hddprotect.
-
-Another feature of the driver is misc device called "freefall" that
-acts similar to /dev/rtc and reacts on free-fall interrupts received
-from the device. It supports blocking operations, poll/select and
-fasync operation modes. You must read 1 bytes from the device. The
-result is number of free-fall interrupts since the last successful
-read (or 255 if number of interrupts would not fit). See the freefall.c
-file for an example on using the device.
-
-
-Axes orientation
-----------------
-
-For better compatibility between the various laptops. The values reported by
-the accelerometer are converted into a "standard" organisation of the axes
-(aka "can play neverball out of the box"):
- * When the laptop is horizontal the position reported is about 0 for X and Y
- and a positive value for Z
- * If the left side is elevated, X increases (becomes positive)
- * If the front side (where the touchpad is) is elevated, Y decreases
- (becomes negative)
- * If the laptop is put upside-down, Z becomes negative
-
-If your laptop model is not recognized (cf "dmesg"), you can send an
-email to the maintainer to add it to the database. When reporting a new
-laptop, please include the output of "dmidecode" plus the value of
-/sys/devices/platform/lis3lv02d/position in these four cases.
-
-Q&A
----
-
-Q: How do I safely simulate freefall? I have an HP "portable
-workstation" which has about 3.5kg and a plastic case, so letting it
-fall to the ground is out of question...
-
-A: The sensor is pretty sensitive, so your hands can do it. Lift it
-into free space, follow the fall with your hands for like 10
-centimeters. That should be enough to trigger the detection.
diff --git a/Documentation/misc-devices/lis3lv02d.rst b/Documentation/misc-devices/lis3lv02d.rst
new file mode 100644
index 000000000000..959bd2b822cf
--- /dev/null
+++ b/Documentation/misc-devices/lis3lv02d.rst
@@ -0,0 +1,99 @@
+=======================
+Kernel driver lis3lv02d
+=======================
+
+Supported chips:
+
+ * STMicroelectronics LIS3LV02DL, LIS3LV02DQ (12 bits precision)
+ * STMicroelectronics LIS302DL, LIS3L02DQ, LIS331DL (8 bits) and
+ LIS331DLH (16 bits)
+
+Authors:
+ - Yan Burman <burman.yan@gmail.com>
+ - Eric Piel <eric.piel@tremplin-utc.net>
+
+
+Description
+-----------
+
+This driver provides support for the accelerometer found in various HP laptops
+sporting the feature officially called "HP Mobile Data Protection System 3D" or
+"HP 3D DriveGuard". It detects automatically laptops with this sensor. Known
+models (full list can be found in drivers/platform/x86/hp_accel.c) will have
+their axis automatically oriented on standard way (eg: you can directly play
+neverball). The accelerometer data is readable via
+/sys/devices/platform/lis3lv02d. Reported values are scaled
+to mg values (1/1000th of earth gravity).
+
+Sysfs attributes under /sys/devices/platform/lis3lv02d/:
+
+position
+ - 3D position that the accelerometer reports. Format: "(x,y,z)"
+rate
+ - read reports the sampling rate of the accelerometer device in HZ.
+ write changes sampling rate of the accelerometer device.
+ Only values which are supported by HW are accepted.
+selftest
+ - performs selftest for the chip as specified by chip manufacturer.
+
+This driver also provides an absolute input class device, allowing
+the laptop to act as a pinball machine-esque joystick. Joystick device can be
+calibrated. Joystick device can be in two different modes.
+By default output values are scaled between -32768 .. 32767. In joystick raw
+mode, joystick and sysfs position entry have the same scale. There can be
+small difference due to input system fuzziness feature.
+Events are also available as input event device.
+
+Selftest is meant only for hardware diagnostic purposes. It is not meant to be
+used during normal operations. Position data is not corrupted during selftest
+but interrupt behaviour is not guaranteed to work reliably. In test mode, the
+sensing element is internally moved little bit. Selftest measures difference
+between normal mode and test mode. Chip specifications tell the acceptance
+limit for each type of the chip. Limits are provided via platform data
+to allow adjustment of the limits without a change to the actual driver.
+Seltest returns either "OK x y z" or "FAIL x y z" where x, y and z are
+measured difference between modes. Axes are not remapped in selftest mode.
+Measurement values are provided to help HW diagnostic applications to make
+final decision.
+
+On HP laptops, if the led infrastructure is activated, support for a led
+indicating disk protection will be provided as /sys/class/leds/hp::hddprotect.
+
+Another feature of the driver is misc device called "freefall" that
+acts similar to /dev/rtc and reacts on free-fall interrupts received
+from the device. It supports blocking operations, poll/select and
+fasync operation modes. You must read 1 bytes from the device. The
+result is number of free-fall interrupts since the last successful
+read (or 255 if number of interrupts would not fit). See the freefall.c
+file for an example on using the device.
+
+
+Axes orientation
+----------------
+
+For better compatibility between the various laptops. The values reported by
+the accelerometer are converted into a "standard" organisation of the axes
+(aka "can play neverball out of the box"):
+
+ * When the laptop is horizontal the position reported is about 0 for X and Y
+ and a positive value for Z
+ * If the left side is elevated, X increases (becomes positive)
+ * If the front side (where the touchpad is) is elevated, Y decreases
+ (becomes negative)
+ * If the laptop is put upside-down, Z becomes negative
+
+If your laptop model is not recognized (cf "dmesg"), you can send an
+email to the maintainer to add it to the database. When reporting a new
+laptop, please include the output of "dmidecode" plus the value of
+/sys/devices/platform/lis3lv02d/position in these four cases.
+
+Q&A
+---
+
+Q: How do I safely simulate freefall? I have an HP "portable
+workstation" which has about 3.5kg and a plastic case, so letting it
+fall to the ground is out of question...
+
+A: The sensor is pretty sensitive, so your hands can do it. Lift it
+into free space, follow the fall with your hands for like 10
+centimeters. That should be enough to trigger the detection.
diff --git a/Documentation/misc-devices/max6875 b/Documentation/misc-devices/max6875
deleted file mode 100644
index 2f2bd0b17b5d..000000000000
--- a/Documentation/misc-devices/max6875
+++ /dev/null
@@ -1,110 +0,0 @@
-Kernel driver max6875
-=====================
-
-Supported chips:
- * Maxim MAX6874, MAX6875
- Prefix: 'max6875'
- Addresses scanned: None (see below)
- Datasheet:
- http://pdfserv.maxim-ic.com/en/ds/MAX6874-MAX6875.pdf
-
-Author: Ben Gardner <bgardner@wabtec.com>
-
-
-Description
------------
-
-The Maxim MAX6875 is an EEPROM-programmable power-supply sequencer/supervisor.
-It provides timed outputs that can be used as a watchdog, if properly wired.
-It also provides 512 bytes of user EEPROM.
-
-At reset, the MAX6875 reads the configuration EEPROM into its configuration
-registers. The chip then begins to operate according to the values in the
-registers.
-
-The Maxim MAX6874 is a similar, mostly compatible device, with more inputs
-and outputs:
- vin gpi vout
-MAX6874 6 4 8
-MAX6875 4 3 5
-
-See the datasheet for more information.
-
-
-Sysfs entries
--------------
-
-eeprom - 512 bytes of user-defined EEPROM space.
-
-
-General Remarks
----------------
-
-Valid addresses for the MAX6875 are 0x50 and 0x52.
-Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56.
-The driver does not probe any address, so you explicitly instantiate the
-devices.
-
-Example:
-$ modprobe max6875
-$ echo max6875 0x50 > /sys/bus/i2c/devices/i2c-0/new_device
-
-The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple
-addresses. For example, for address 0x50, it also reserves 0x51.
-The even-address instance is called 'max6875', the odd one is 'dummy'.
-
-
-Programming the chip using i2c-dev
-----------------------------------
-
-Use the i2c-dev interface to access and program the chips.
-Reads and writes are performed differently depending on the address range.
-
-The configuration registers are at addresses 0x00 - 0x45.
-Use i2c_smbus_write_byte_data() to write a register and
-i2c_smbus_read_byte_data() to read a register.
-The command is the register number.
-
-Examples:
-To write a 1 to register 0x45:
- i2c_smbus_write_byte_data(fd, 0x45, 1);
-
-To read register 0x45:
- value = i2c_smbus_read_byte_data(fd, 0x45);
-
-
-The configuration EEPROM is at addresses 0x8000 - 0x8045.
-The user EEPROM is at addresses 0x8100 - 0x82ff.
-
-Use i2c_smbus_write_word_data() to write a byte to EEPROM.
-
-The command is the upper byte of the address: 0x80, 0x81, or 0x82.
-The data word is the lower part of the address or'd with data << 8.
- cmd = address >> 8;
- val = (address & 0xff) | (data << 8);
-
-Example:
-To write 0x5a to address 0x8003:
- i2c_smbus_write_word_data(fd, 0x80, 0x5a03);
-
-
-Reading data from the EEPROM is a little more complicated.
-Use i2c_smbus_write_byte_data() to set the read address and then
-i2c_smbus_read_byte() or i2c_smbus_read_i2c_block_data() to read the data.
-
-Example:
-To read data starting at offset 0x8100, first set the address:
- i2c_smbus_write_byte_data(fd, 0x81, 0x00);
-
-And then read the data
- value = i2c_smbus_read_byte(fd);
-
- or
-
- count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer);
-
-The block read should read 16 bytes.
-0x84 is the block read command.
-
-See the datasheet for more details.
-
diff --git a/Documentation/misc-devices/max6875.rst b/Documentation/misc-devices/max6875.rst
new file mode 100644
index 000000000000..ad419ac22a5b
--- /dev/null
+++ b/Documentation/misc-devices/max6875.rst
@@ -0,0 +1,136 @@
+=====================
+Kernel driver max6875
+=====================
+
+Supported chips:
+
+ * Maxim MAX6874, MAX6875
+
+ Prefix: 'max6875'
+
+ Addresses scanned: None (see below)
+
+ Datasheet: http://pdfserv.maxim-ic.com/en/ds/MAX6874-MAX6875.pdf
+
+Author: Ben Gardner <bgardner@wabtec.com>
+
+
+Description
+-----------
+
+The Maxim MAX6875 is an EEPROM-programmable power-supply sequencer/supervisor.
+It provides timed outputs that can be used as a watchdog, if properly wired.
+It also provides 512 bytes of user EEPROM.
+
+At reset, the MAX6875 reads the configuration EEPROM into its configuration
+registers. The chip then begins to operate according to the values in the
+registers.
+
+The Maxim MAX6874 is a similar, mostly compatible device, with more inputs
+and outputs:
+
+=========== === === ====
+- vin gpi vout
+=========== === === ====
+MAX6874 6 4 8
+MAX6875 4 3 5
+=========== === === ====
+
+See the datasheet for more information.
+
+
+Sysfs entries
+-------------
+
+eeprom - 512 bytes of user-defined EEPROM space.
+
+
+General Remarks
+---------------
+
+Valid addresses for the MAX6875 are 0x50 and 0x52.
+
+Valid addresses for the MAX6874 are 0x50, 0x52, 0x54 and 0x56.
+
+The driver does not probe any address, so you explicitly instantiate the
+devices.
+
+Example::
+
+ $ modprobe max6875
+ $ echo max6875 0x50 > /sys/bus/i2c/devices/i2c-0/new_device
+
+The MAX6874/MAX6875 ignores address bit 0, so this driver attaches to multiple
+addresses. For example, for address 0x50, it also reserves 0x51.
+The even-address instance is called 'max6875', the odd one is 'dummy'.
+
+
+Programming the chip using i2c-dev
+----------------------------------
+
+Use the i2c-dev interface to access and program the chips.
+
+Reads and writes are performed differently depending on the address range.
+
+The configuration registers are at addresses 0x00 - 0x45.
+
+Use i2c_smbus_write_byte_data() to write a register and
+i2c_smbus_read_byte_data() to read a register.
+
+The command is the register number.
+
+Examples:
+
+To write a 1 to register 0x45::
+
+ i2c_smbus_write_byte_data(fd, 0x45, 1);
+
+To read register 0x45::
+
+ value = i2c_smbus_read_byte_data(fd, 0x45);
+
+
+The configuration EEPROM is at addresses 0x8000 - 0x8045.
+
+The user EEPROM is at addresses 0x8100 - 0x82ff.
+
+Use i2c_smbus_write_word_data() to write a byte to EEPROM.
+
+The command is the upper byte of the address: 0x80, 0x81, or 0x82.
+The data word is the lower part of the address or'd with data << 8::
+
+ cmd = address >> 8;
+ val = (address & 0xff) | (data << 8);
+
+Example:
+
+To write 0x5a to address 0x8003::
+
+ i2c_smbus_write_word_data(fd, 0x80, 0x5a03);
+
+
+Reading data from the EEPROM is a little more complicated.
+
+Use i2c_smbus_write_byte_data() to set the read address and then
+i2c_smbus_read_byte() or i2c_smbus_read_i2c_block_data() to read the data.
+
+Example:
+
+To read data starting at offset 0x8100, first set the address::
+
+ i2c_smbus_write_byte_data(fd, 0x81, 0x00);
+
+And then read the data::
+
+ value = i2c_smbus_read_byte(fd);
+
+or::
+
+ count = i2c_smbus_read_i2c_block_data(fd, 0x84, 16, buffer);
+
+The block read should read 16 bytes.
+
+0x84 is the block read command.
+
+See the datasheet for more details.
+
diff --git a/Documentation/misc-devices/mei/mei-client-bus.txt b/Documentation/misc-devices/mei/mei-client-bus.txt
deleted file mode 100644
index 743be4ec8989..000000000000
--- a/Documentation/misc-devices/mei/mei-client-bus.txt
+++ /dev/null
@@ -1,141 +0,0 @@
-Intel(R) Management Engine (ME) Client bus API
-==============================================
-
-
-Rationale
-=========
-
-MEI misc character device is useful for dedicated applications to send and receive
-data to the many FW appliance found in Intel's ME from the user space.
-However for some of the ME functionalities it make sense to leverage existing software
-stack and expose them through existing kernel subsystems.
-
-In order to plug seamlessly into the kernel device driver model we add kernel virtual
-bus abstraction on top of the MEI driver. This allows implementing linux kernel drivers
-for the various MEI features as a stand alone entities found in their respective subsystem.
-Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to
-the existing code.
-
-
-MEI CL bus API
-==============
-
-A driver implementation for an MEI Client is very similar to existing bus
-based device drivers. The driver registers itself as an MEI CL bus driver through
-the mei_cl_driver structure:
-
-struct mei_cl_driver {
- struct device_driver driver;
- const char *name;
-
- const struct mei_cl_device_id *id_table;
-
- int (*probe)(struct mei_cl_device *dev, const struct mei_cl_id *id);
- int (*remove)(struct mei_cl_device *dev);
-};
-
-struct mei_cl_id {
- char name[MEI_NAME_SIZE];
- kernel_ulong_t driver_info;
-};
-
-The mei_cl_id structure allows the driver to bind itself against a device name.
-
-To actually register a driver on the ME Client bus one must call the mei_cl_add_driver()
-API. This is typically called at module init time.
-
-Once registered on the ME Client bus, a driver will typically try to do some I/O on
-this bus and this should be done through the mei_cl_send() and mei_cl_recv()
-routines. The latter is synchronous (blocks and sleeps until data shows up).
-In order for drivers to be notified of pending events waiting for them (e.g.
-an Rx event) they can register an event handler through the
-mei_cl_register_event_cb() routine. Currently only the MEI_EVENT_RX event
-will trigger an event handler call and the driver implementation is supposed
-to call mei_recv() from the event handler in order to fetch the pending
-received buffers.
-
-
-Example
-=======
-
-As a theoretical example let's pretend the ME comes with a "contact" NFC IP.
-The driver init and exit routines for this device would look like:
-
-#define CONTACT_DRIVER_NAME "contact"
-
-static struct mei_cl_device_id contact_mei_cl_tbl[] = {
- { CONTACT_DRIVER_NAME, },
-
- /* required last entry */
- { }
-};
-MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl);
-
-static struct mei_cl_driver contact_driver = {
- .id_table = contact_mei_tbl,
- .name = CONTACT_DRIVER_NAME,
-
- .probe = contact_probe,
- .remove = contact_remove,
-};
-
-static int contact_init(void)
-{
- int r;
-
- r = mei_cl_driver_register(&contact_driver);
- if (r) {
- pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n");
- return r;
- }
-
- return 0;
-}
-
-static void __exit contact_exit(void)
-{
- mei_cl_driver_unregister(&contact_driver);
-}
-
-module_init(contact_init);
-module_exit(contact_exit);
-
-And the driver's simplified probe routine would look like that:
-
-int contact_probe(struct mei_cl_device *dev, struct mei_cl_device_id *id)
-{
- struct contact_driver *contact;
-
- [...]
- mei_cl_enable_device(dev);
-
- mei_cl_register_event_cb(dev, contact_event_cb, contact);
-
- return 0;
-}
-
-In the probe routine the driver first enable the MEI device and then registers
-an ME bus event handler which is as close as it can get to registering a
-threaded IRQ handler.
-The handler implementation will typically call some I/O routine depending on
-the pending events:
-
-#define MAX_NFC_PAYLOAD 128
-
-static void contact_event_cb(struct mei_cl_device *dev, u32 events,
- void *context)
-{
- struct contact_driver *contact = context;
-
- if (events & BIT(MEI_EVENT_RX)) {
- u8 payload[MAX_NFC_PAYLOAD];
- int payload_size;
-
- payload_size = mei_recv(dev, payload, MAX_NFC_PAYLOAD);
- if (payload_size <= 0)
- return;
-
- /* Hook to the NFC subsystem */
- nfc_hci_recv_frame(contact->hdev, payload, payload_size);
- }
-}
diff --git a/Documentation/misc-devices/mei/mei.txt b/Documentation/misc-devices/mei/mei.txt
deleted file mode 100644
index 2b80a0cd621f..000000000000
--- a/Documentation/misc-devices/mei/mei.txt
+++ /dev/null
@@ -1,266 +0,0 @@
-Intel(R) Management Engine Interface (Intel(R) MEI)
-===================================================
-
-Introduction
-============
-
-The Intel Management Engine (Intel ME) is an isolated and protected computing
-resource (Co-processor) residing inside certain Intel chipsets. The Intel ME
-provides support for computer/IT management features. The feature set
-depends on the Intel chipset SKU.
-
-The Intel Management Engine Interface (Intel MEI, previously known as HECI)
-is the interface between the Host and Intel ME. This interface is exposed
-to the host as a PCI device. The Intel MEI Driver is in charge of the
-communication channel between a host application and the Intel ME feature.
-
-Each Intel ME feature (Intel ME Client) is addressed by a GUID/UUID and
-each client has its own protocol. The protocol is message-based with a
-header and payload up to 512 bytes.
-
-Prominent usage of the Intel ME Interface is to communicate with Intel(R)
-Active Management Technology (Intel AMT) implemented in firmware running on
-the Intel ME.
-
-Intel AMT provides the ability to manage a host remotely out-of-band (OOB)
-even when the operating system running on the host processor has crashed or
-is in a sleep state.
-
-Some examples of Intel AMT usage are:
- - Monitoring hardware state and platform components
- - Remote power off/on (useful for green computing or overnight IT
- maintenance)
- - OS updates
- - Storage of useful platform information such as software assets
- - Built-in hardware KVM
- - Selective network isolation of Ethernet and IP protocol flows based
- on policies set by a remote management console
- - IDE device redirection from remote management console
-
-Intel AMT (OOB) communication is based on SOAP (deprecated
-starting with Release 6.0) over HTTP/S or WS-Management protocol over
-HTTP/S that are received from a remote management console application.
-
-For more information about Intel AMT:
-http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
-
-
-Intel MEI Driver
-================
-
-The driver exposes a misc device called /dev/mei.
-
-An application maintains communication with an Intel ME feature while
-/dev/mei is open. The binding to a specific feature is performed by calling
-MEI_CONNECT_CLIENT_IOCTL, which passes the desired UUID.
-The number of instances of an Intel ME feature that can be opened
-at the same time depends on the Intel ME feature, but most of the
-features allow only a single instance.
-
-The Intel AMT Host Interface (Intel AMTHI) feature supports multiple
-simultaneous user connected applications. The Intel MEI driver
-handles this internally by maintaining request queues for the applications.
-
-The driver is transparent to data that are passed between firmware feature
-and host application.
-
-Because some of the Intel ME features can change the system
-configuration, the driver by default allows only a privileged
-user to access it.
-
-A code snippet for an application communicating with Intel AMTHI client:
-
- struct mei_connect_client_data data;
- fd = open(MEI_DEVICE);
-
- data.d.in_client_uuid = AMTHI_UUID;
-
- ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &data);
-
- printf("Ver=%d, MaxLen=%ld\n",
- data.d.in_client_uuid.protocol_version,
- data.d.in_client_uuid.max_msg_length);
-
- [...]
-
- write(fd, amthi_req_data, amthi_req_data_len);
-
- [...]
-
- read(fd, &amthi_res_data, amthi_res_data_len);
-
- [...]
- close(fd);
-
-
-IOCTL
-=====
-
-The Intel MEI Driver supports the following IOCTL commands:
- IOCTL_MEI_CONNECT_CLIENT Connect to firmware Feature (client).
-
- usage:
- struct mei_connect_client_data clientData;
- ioctl(fd, IOCTL_MEI_CONNECT_CLIENT, &clientData);
-
- inputs:
- mei_connect_client_data struct contain the following
- input field:
-
- in_client_uuid - UUID of the FW Feature that needs
- to connect to.
- outputs:
- out_client_properties - Client Properties: MTU and Protocol Version.
-
- error returns:
- EINVAL Wrong IOCTL Number
- ENODEV Device or Connection is not initialized or ready.
- (e.g. Wrong UUID)
- ENOMEM Unable to allocate memory to client internal data.
- EFAULT Fatal Error (e.g. Unable to access user input data)
- EBUSY Connection Already Open
-
- Notes:
- max_msg_length (MTU) in client properties describes the maximum
- data that can be sent or received. (e.g. if MTU=2K, can send
- requests up to bytes 2k and received responses up to 2k bytes).
-
- IOCTL_MEI_NOTIFY_SET: enable or disable event notifications
-
- Usage:
- uint32_t enable;
- ioctl(fd, IOCTL_MEI_NOTIFY_SET, &enable);
-
- Inputs:
- uint32_t enable = 1;
- or
- uint32_t enable[disable] = 0;
-
- Error returns:
- EINVAL Wrong IOCTL Number
- ENODEV Device is not initialized or the client not connected
- ENOMEM Unable to allocate memory to client internal data.
- EFAULT Fatal Error (e.g. Unable to access user input data)
- EOPNOTSUPP if the device doesn't support the feature
-
- Notes:
- The client must be connected in order to enable notification events
-
-
- IOCTL_MEI_NOTIFY_GET : retrieve event
-
- Usage:
- uint32_t event;
- ioctl(fd, IOCTL_MEI_NOTIFY_GET, &event);
-
- Outputs:
- 1 - if an event is pending
- 0 - if there is no even pending
-
- Error returns:
- EINVAL Wrong IOCTL Number
- ENODEV Device is not initialized or the client not connected
- ENOMEM Unable to allocate memory to client internal data.
- EFAULT Fatal Error (e.g. Unable to access user input data)
- EOPNOTSUPP if the device doesn't support the feature
-
- Notes:
- The client must be connected and event notification has to be enabled
- in order to receive an event
-
-
-Intel ME Applications
-=====================
-
- 1) Intel Local Management Service (Intel LMS)
-
- Applications running locally on the platform communicate with Intel AMT Release
- 2.0 and later releases in the same way that network applications do via SOAP
- over HTTP (deprecated starting with Release 6.0) or with WS-Management over
- SOAP over HTTP. This means that some Intel AMT features can be accessed from a
- local application using the same network interface as a remote application
- communicating with Intel AMT over the network.
-
- When a local application sends a message addressed to the local Intel AMT host
- name, the Intel LMS, which listens for traffic directed to the host name,
- intercepts the message and routes it to the Intel MEI.
- For more information:
- http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
- Under "About Intel AMT" => "Local Access"
-
- For downloading Intel LMS:
- http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/
-
- The Intel LMS opens a connection using the Intel MEI driver to the Intel LMS
- firmware feature using a defined UUID and then communicates with the feature
- using a protocol called Intel AMT Port Forwarding Protocol (Intel APF protocol).
- The protocol is used to maintain multiple sessions with Intel AMT from a
- single application.
-
- See the protocol specification in the Intel AMT Software Development Kit (SDK)
- http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
- Under "SDK Resources" => "Intel(R) vPro(TM) Gateway (MPS)"
- => "Information for Intel(R) vPro(TM) Gateway Developers"
- => "Description of the Intel AMT Port Forwarding (APF) Protocol"
-
- 2) Intel AMT Remote configuration using a Local Agent
-
- A Local Agent enables IT personnel to configure Intel AMT out-of-the-box
- without requiring installing additional data to enable setup. The remote
- configuration process may involve an ISV-developed remote configuration
- agent that runs on the host.
- For more information:
- http://software.intel.com/sites/manageability/AMT_Implementation_and_Reference_Guide
- Under "Setup and Configuration of Intel AMT" =>
- "SDK Tools Supporting Setup and Configuration" =>
- "Using the Local Agent Sample"
-
- An open source Intel AMT configuration utility, implementing a local agent
- that accesses the Intel MEI driver, can be found here:
- http://software.intel.com/en-us/articles/download-the-latest-intel-amt-open-source-drivers/
-
-
-Intel AMT OS Health Watchdog
-============================
-
-The Intel AMT Watchdog is an OS Health (Hang/Crash) watchdog.
-Whenever the OS hangs or crashes, Intel AMT will send an event
-to any subscriber to this event. This mechanism means that
-IT knows when a platform crashes even when there is a hard failure on the host.
-
-The Intel AMT Watchdog is composed of two parts:
- 1) Firmware feature - receives the heartbeats
- and sends an event when the heartbeats stop.
- 2) Intel MEI iAMT watchdog driver - connects to the watchdog feature,
- configures the watchdog and sends the heartbeats.
-
-The Intel iAMT watchdog MEI driver uses the kernel watchdog API to configure
-the Intel AMT Watchdog and to send heartbeats to it. The default timeout of the
-watchdog is 120 seconds.
-
-If the Intel AMT is not enabled in the firmware then the watchdog client won't enumerate
-on the me client bus and watchdog devices won't be exposed.
-
-
-Supported Chipsets
-==================
-
-7 Series Chipset Family
-6 Series Chipset Family
-5 Series Chipset Family
-4 Series Chipset Family
-Mobile 4 Series Chipset Family
-ICH9
-82946GZ/GL
-82G35 Express
-82Q963/Q965
-82P965/G965
-Mobile PM965/GM965
-Mobile GME965/GLE960
-82Q35 Express
-82G33/G31/P35/P31 Express
-82Q33 Express
-82X38/X48 Express
-
----
-linux-mei@linux.intel.com
diff --git a/Documentation/mmc/mmc-async-req.txt b/Documentation/mmc/mmc-async-req.txt
deleted file mode 100644
index ae1907b10e4a..000000000000
--- a/Documentation/mmc/mmc-async-req.txt
+++ /dev/null
@@ -1,87 +0,0 @@
-Rationale
-=========
-
-How significant is the cache maintenance overhead?
-It depends. Fast eMMC and multiple cache levels with speculative cache
-pre-fetch makes the cache overhead relatively significant. If the DMA
-preparations for the next request are done in parallel with the current
-transfer, the DMA preparation overhead would not affect the MMC performance.
-The intention of non-blocking (asynchronous) MMC requests is to minimize the
-time between when an MMC request ends and another MMC request begins.
-Using mmc_wait_for_req(), the MMC controller is idle while dma_map_sg and
-dma_unmap_sg are processing. Using non-blocking MMC requests makes it
-possible to prepare the caches for next job in parallel with an active
-MMC request.
-
-MMC block driver
-================
-
-The mmc_blk_issue_rw_rq() in the MMC block driver is made non-blocking.
-The increase in throughput is proportional to the time it takes to
-prepare (major part of preparations are dma_map_sg() and dma_unmap_sg())
-a request and how fast the memory is. The faster the MMC/SD is the
-more significant the prepare request time becomes. Roughly the expected
-performance gain is 5% for large writes and 10% on large reads on a L2 cache
-platform. In power save mode, when clocks run on a lower frequency, the DMA
-preparation may cost even more. As long as these slower preparations are run
-in parallel with the transfer performance won't be affected.
-
-Details on measurements from IOZone and mmc_test
-================================================
-
-https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
-
-MMC core API extension
-======================
-
-There is one new public function mmc_start_req().
-It starts a new MMC command request for a host. The function isn't
-truly non-blocking. If there is an ongoing async request it waits
-for completion of that request and starts the new one and returns. It
-doesn't wait for the new request to complete. If there is no ongoing
-request it starts the new request and returns immediately.
-
-MMC host extensions
-===================
-
-There are two optional members in the mmc_host_ops -- pre_req() and
-post_req() -- that the host driver may implement in order to move work
-to before and after the actual mmc_host_ops.request() function is called.
-In the DMA case pre_req() may do dma_map_sg() and prepare the DMA
-descriptor, and post_req() runs the dma_unmap_sg().
-
-Optimize for the first request
-==============================
-
-The first request in a series of requests can't be prepared in parallel
-with the previous transfer, since there is no previous request.
-The argument is_first_req in pre_req() indicates that there is no previous
-request. The host driver may optimize for this scenario to minimize
-the performance loss. A way to optimize for this is to split the current
-request in two chunks, prepare the first chunk and start the request,
-and finally prepare the second chunk and start the transfer.
-
-Pseudocode to handle is_first_req scenario with minimal prepare overhead:
-
-if (is_first_req && req->size > threshold)
- /* start MMC transfer for the complete transfer size */
- mmc_start_command(MMC_CMD_TRANSFER_FULL_SIZE);
-
- /*
- * Begin to prepare DMA while cmd is being processed by MMC.
- * The first chunk of the request should take the same time
- * to prepare as the "MMC process command time".
- * If prepare time exceeds MMC cmd time
- * the transfer is delayed, guesstimate max 4k as first chunk size.
- */
- prepare_1st_chunk_for_dma(req);
- /* flush pending desc to the DMAC (dmaengine.h) */
- dma_issue_pending(req->dma_desc);
-
- prepare_2nd_chunk_for_dma(req);
- /*
- * The second issue_pending should be called before MMC runs out
- * of the first chunk. If the MMC runs out of the first data chunk
- * before this call, the transfer is delayed.
- */
- dma_issue_pending(req->dma_desc);
diff --git a/Documentation/mmc/mmc-dev-attrs.txt b/Documentation/mmc/mmc-dev-attrs.txt
deleted file mode 100644
index 4ad0bb17f343..000000000000
--- a/Documentation/mmc/mmc-dev-attrs.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-SD and MMC Block Device Attributes
-==================================
-
-These attributes are defined for the block devices associated with the
-SD or MMC device.
-
-The following attributes are read/write.
-
- force_ro Enforce read-only access even if write protect switch is off.
-
-SD and MMC Device Attributes
-============================
-
-All attributes are read-only.
-
- cid Card Identification Register
- csd Card Specific Data Register
- scr SD Card Configuration Register (SD only)
- date Manufacturing Date (from CID Register)
- fwrev Firmware/Product Revision (from CID Register) (SD and MMCv1 only)
- hwrev Hardware/Product Revision (from CID Register) (SD and MMCv1 only)
- manfid Manufacturer ID (from CID Register)
- name Product Name (from CID Register)
- oemid OEM/Application ID (from CID Register)
- prv Product Revision (from CID Register) (SD and MMCv4 only)
- serial Product Serial Number (from CID Register)
- erase_size Erase group size
- preferred_erase_size Preferred erase size
- raw_rpmb_size_mult RPMB partition size
- rel_sectors Reliable write sector count
- ocr Operation Conditions Register
- dsr Driver Stage Register
- cmdq_en Command Queue enabled: 1 => enabled, 0 => not enabled
-
-Note on Erase Size and Preferred Erase Size:
-
- "erase_size" is the minimum size, in bytes, of an erase
- operation. For MMC, "erase_size" is the erase group size
- reported by the card. Note that "erase_size" does not apply
- to trim or secure trim operations where the minimum size is
- always one 512 byte sector. For SD, "erase_size" is 512
- if the card is block-addressed, 0 otherwise.
-
- SD/MMC cards can erase an arbitrarily large area up to and
- including the whole card. When erasing a large area it may
- be desirable to do it in smaller chunks for three reasons:
- 1. A single erase command will make all other I/O on
- the card wait. This is not a problem if the whole card
- is being erased, but erasing one partition will make
- I/O for another partition on the same card wait for the
- duration of the erase - which could be a several
- minutes.
- 2. To be able to inform the user of erase progress.
- 3. The erase timeout becomes too large to be very
- useful. Because the erase timeout contains a margin
- which is multiplied by the size of the erase area,
- the value can end up being several minutes for large
- areas.
-
- "erase_size" is not the most efficient unit to erase
- (especially for SD where it is just one sector),
- hence "preferred_erase_size" provides a good chunk
- size for erasing large areas.
-
- For MMC, "preferred_erase_size" is the high-capacity
- erase size if a card specifies one, otherwise it is
- based on the capacity of the card.
-
- For SD, "preferred_erase_size" is the allocation unit
- size specified by the card.
-
- "preferred_erase_size" is in bytes.
-
-Note on raw_rpmb_size_mult:
- "raw_rpmb_size_mult" is a multiple of 128kB block.
- RPMB size in byte is calculated by using the following equation:
- RPMB partition size = 128kB x raw_rpmb_size_mult
diff --git a/Documentation/mmc/mmc-dev-parts.txt b/Documentation/mmc/mmc-dev-parts.txt
deleted file mode 100644
index f08d078d43cf..000000000000
--- a/Documentation/mmc/mmc-dev-parts.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-SD and MMC Device Partitions
-============================
-
-Device partitions are additional logical block devices present on the
-SD/MMC device.
-
-As of this writing, MMC boot partitions as supported and exposed as
-/dev/mmcblkXboot0 and /dev/mmcblkXboot1, where X is the index of the
-parent /dev/mmcblkX.
-
-MMC Boot Partitions
-===================
-
-Read and write access is provided to the two MMC boot partitions. Due to
-the sensitive nature of the boot partition contents, which often store
-a bootloader or bootloader configuration tables crucial to booting the
-platform, write access is disabled by default to reduce the chance of
-accidental bricking.
-
-To enable write access to /dev/mmcblkXbootY, disable the forced read-only
-access with:
-
-echo 0 > /sys/block/mmcblkXbootY/force_ro
-
-To re-enable read-only access:
-
-echo 1 > /sys/block/mmcblkXbootY/force_ro
-
-The boot partitions can also be locked read only until the next power on,
-with:
-
-echo 1 > /sys/block/mmcblkXbootY/ro_lock_until_next_power_on
-
-This is a feature of the card and not of the kernel. If the card does
-not support boot partition locking, the file will not exist. If the
-feature has been disabled on the card, the file will be read-only.
-
-The boot partitions can also be locked permanently, but this feature is
-not accessible through sysfs in order to avoid accidental or malicious
-bricking.
diff --git a/Documentation/mmc/mmc-tools.txt b/Documentation/mmc/mmc-tools.txt
deleted file mode 100644
index 735509c165d5..000000000000
--- a/Documentation/mmc/mmc-tools.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-MMC tools introduction
-======================
-
-There is one MMC test tools called mmc-utils, which is maintained by Chris Ball,
-you can find it at the below public git repository:
-http://git.kernel.org/cgit/linux/kernel/git/cjb/mmc-utils.git/
-
-Functions
-=========
-
-The mmc-utils tools can do the following:
- - Print and parse extcsd data.
- - Determine the eMMC writeprotect status.
- - Set the eMMC writeprotect status.
- - Set the eMMC data sector size to 4KB by disabling emulation.
- - Create general purpose partition.
- - Enable the enhanced user area.
- - Enable write reliability per partition.
- - Print the response to STATUS_SEND (CMD13).
- - Enable the boot partition.
- - Set Boot Bus Conditions.
- - Enable the eMMC BKOPS feature.
- - Permanently enable the eMMC H/W Reset feature.
- - Permanently disable the eMMC H/W Reset feature.
- - Send Sanitize command.
- - Program authentication key for the device.
- - Counter value for the rpmb device will be read to stdout.
- - Read from rpmb device to output.
- - Write to rpmb device from data file.
- - Enable the eMMC cache feature.
- - Disable the eMMC cache feature.
- - Print and parse CID data.
- - Print and parse CSD data.
- - Print and parse SCR data.
diff --git a/Documentation/mtd/intel-spi.txt b/Documentation/mtd/intel-spi.txt
deleted file mode 100644
index bc357729c2cb..000000000000
--- a/Documentation/mtd/intel-spi.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-Upgrading BIOS using intel-spi
-------------------------------
-
-Many Intel CPUs like Baytrail and Braswell include SPI serial flash host
-controller which is used to hold BIOS and other platform specific data.
-Since contents of the SPI serial flash is crucial for machine to function,
-it is typically protected by different hardware protection mechanisms to
-avoid accidental (or on purpose) overwrite of the content.
-
-Not all manufacturers protect the SPI serial flash, mainly because it
-allows upgrading the BIOS image directly from an OS.
-
-The intel-spi driver makes it possible to read and write the SPI serial
-flash, if certain protection bits are not set and locked. If it finds
-any of them set, the whole MTD device is made read-only to prevent
-partial overwrites. By default the driver exposes SPI serial flash
-contents as read-only but it can be changed from kernel command line,
-passing "intel-spi.writeable=1".
-
-Please keep in mind that overwriting the BIOS image on SPI serial flash
-might render the machine unbootable and requires special equipment like
-Dediprog to revive. You have been warned!
-
-Below are the steps how to upgrade MinnowBoard MAX BIOS directly from
-Linux.
-
- 1) Download and extract the latest Minnowboard MAX BIOS SPI image
- [1]. At the time writing this the latest image is v92.
-
- 2) Install mtd-utils package [2]. We need this in order to erase the SPI
- serial flash. Distros like Debian and Fedora have this prepackaged with
- name "mtd-utils".
-
- 3) Add "intel-spi.writeable=1" to the kernel command line and reboot
- the board (you can also reload the driver passing "writeable=1" as
- module parameter to modprobe).
-
- 4) Once the board is up and running again, find the right MTD partition
- (it is named as "BIOS"):
-
- # cat /proc/mtd
- dev: size erasesize name
- mtd0: 00800000 00001000 "BIOS"
-
- So here it will be /dev/mtd0 but it may vary.
-
- 5) Make backup of the existing image first:
-
- # dd if=/dev/mtd0ro of=bios.bak
- 16384+0 records in
- 16384+0 records out
- 8388608 bytes (8.4 MB) copied, 10.0269 s, 837 kB/s
-
- 6) Verify the backup
-
- # sha1sum /dev/mtd0ro bios.bak
- fdbb011920572ca6c991377c4b418a0502668b73 /dev/mtd0ro
- fdbb011920572ca6c991377c4b418a0502668b73 bios.bak
-
- The SHA1 sums must match. Otherwise do not continue any further!
-
- 7) Erase the SPI serial flash. After this step, do not reboot the
- board! Otherwise it will not start anymore.
-
- # flash_erase /dev/mtd0 0 0
- Erasing 4 Kibyte @ 7ff000 -- 100 % complete
-
- 8) Once completed without errors you can write the new BIOS image:
-
- # dd if=MNW2MAX1.X64.0092.R01.1605221712.bin of=/dev/mtd0
-
- 9) Verify that the new content of the SPI serial flash matches the new
- BIOS image:
-
- # sha1sum /dev/mtd0ro MNW2MAX1.X64.0092.R01.1605221712.bin
- 9b4df9e4be2057fceec3a5529ec3d950836c87a2 /dev/mtd0ro
- 9b4df9e4be2057fceec3a5529ec3d950836c87a2 MNW2MAX1.X64.0092.R01.1605221712.bin
-
- The SHA1 sums should match.
-
- 10) Now you can reboot your board and observe the new BIOS starting up
- properly.
-
-References
-----------
-
-[1] https://firmware.intel.com/sites/default/files/MinnowBoard.MAX_.X64.92.R01.zip
-[2] http://www.linux-mtd.infradead.org/
diff --git a/Documentation/mtd/nand_ecc.txt b/Documentation/mtd/nand_ecc.txt
deleted file mode 100644
index f8c3284bf6a7..000000000000
--- a/Documentation/mtd/nand_ecc.txt
+++ /dev/null
@@ -1,714 +0,0 @@
-Introduction
-============
-
-Having looked at the linux mtd/nand driver and more specific at nand_ecc.c
-I felt there was room for optimisation. I bashed the code for a few hours
-performing tricks like table lookup removing superfluous code etc.
-After that the speed was increased by 35-40%.
-Still I was not too happy as I felt there was additional room for improvement.
-
-Bad! I was hooked.
-I decided to annotate my steps in this file. Perhaps it is useful to someone
-or someone learns something from it.
-
-
-The problem
-===========
-
-NAND flash (at least SLC one) typically has sectors of 256 bytes.
-However NAND flash is not extremely reliable so some error detection
-(and sometimes correction) is needed.
-
-This is done by means of a Hamming code. I'll try to explain it in
-laymans terms (and apologies to all the pro's in the field in case I do
-not use the right terminology, my coding theory class was almost 30
-years ago, and I must admit it was not one of my favourites).
-
-As I said before the ecc calculation is performed on sectors of 256
-bytes. This is done by calculating several parity bits over the rows and
-columns. The parity used is even parity which means that the parity bit = 1
-if the data over which the parity is calculated is 1 and the parity bit = 0
-if the data over which the parity is calculated is 0. So the total
-number of bits over the data over which the parity is calculated + the
-parity bit is even. (see wikipedia if you can't follow this).
-Parity is often calculated by means of an exclusive or operation,
-sometimes also referred to as xor. In C the operator for xor is ^
-
-Back to ecc.
-Let's give a small figure:
-
-byte 0: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp4 ... rp14
-byte 1: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp2 rp4 ... rp14
-byte 2: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp4 ... rp14
-byte 3: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp4 ... rp14
-byte 4: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp2 rp5 ... rp14
-....
-byte 254: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp0 rp3 rp5 ... rp15
-byte 255: bit7 bit6 bit5 bit4 bit3 bit2 bit1 bit0 rp1 rp3 rp5 ... rp15
- cp1 cp0 cp1 cp0 cp1 cp0 cp1 cp0
- cp3 cp3 cp2 cp2 cp3 cp3 cp2 cp2
- cp5 cp5 cp5 cp5 cp4 cp4 cp4 cp4
-
-This figure represents a sector of 256 bytes.
-cp is my abbreviation for column parity, rp for row parity.
-
-Let's start to explain column parity.
-cp0 is the parity that belongs to all bit0, bit2, bit4, bit6.
-so the sum of all bit0, bit2, bit4 and bit6 values + cp0 itself is even.
-Similarly cp1 is the sum of all bit1, bit3, bit5 and bit7.
-cp2 is the parity over bit0, bit1, bit4 and bit5
-cp3 is the parity over bit2, bit3, bit6 and bit7.
-cp4 is the parity over bit0, bit1, bit2 and bit3.
-cp5 is the parity over bit4, bit5, bit6 and bit7.
-Note that each of cp0 .. cp5 is exactly one bit.
-
-Row parity actually works almost the same.
-rp0 is the parity of all even bytes (0, 2, 4, 6, ... 252, 254)
-rp1 is the parity of all odd bytes (1, 3, 5, 7, ..., 253, 255)
-rp2 is the parity of all bytes 0, 1, 4, 5, 8, 9, ...
-(so handle two bytes, then skip 2 bytes).
-rp3 is covers the half rp2 does not cover (bytes 2, 3, 6, 7, 10, 11, ...)
-for rp4 the rule is cover 4 bytes, skip 4 bytes, cover 4 bytes, skip 4 etc.
-so rp4 calculates parity over bytes 0, 1, 2, 3, 8, 9, 10, 11, 16, ...)
-and rp5 covers the other half, so bytes 4, 5, 6, 7, 12, 13, 14, 15, 20, ..
-The story now becomes quite boring. I guess you get the idea.
-rp6 covers 8 bytes then skips 8 etc
-rp7 skips 8 bytes then covers 8 etc
-rp8 covers 16 bytes then skips 16 etc
-rp9 skips 16 bytes then covers 16 etc
-rp10 covers 32 bytes then skips 32 etc
-rp11 skips 32 bytes then covers 32 etc
-rp12 covers 64 bytes then skips 64 etc
-rp13 skips 64 bytes then covers 64 etc
-rp14 covers 128 bytes then skips 128
-rp15 skips 128 bytes then covers 128
-
-In the end the parity bits are grouped together in three bytes as
-follows:
-ECC Bit 7 Bit 6 Bit 5 Bit 4 Bit 3 Bit 2 Bit 1 Bit 0
-ECC 0 rp07 rp06 rp05 rp04 rp03 rp02 rp01 rp00
-ECC 1 rp15 rp14 rp13 rp12 rp11 rp10 rp09 rp08
-ECC 2 cp5 cp4 cp3 cp2 cp1 cp0 1 1
-
-I detected after writing this that ST application note AN1823
-(http://www.st.com/stonline/) gives a much
-nicer picture.(but they use line parity as term where I use row parity)
-Oh well, I'm graphically challenged, so suffer with me for a moment :-)
-And I could not reuse the ST picture anyway for copyright reasons.
-
-
-Attempt 0
-=========
-
-Implementing the parity calculation is pretty simple.
-In C pseudocode:
-for (i = 0; i < 256; i++)
-{
- if (i & 0x01)
- rp1 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp1;
- else
- rp0 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp0;
- if (i & 0x02)
- rp3 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp3;
- else
- rp2 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp2;
- if (i & 0x04)
- rp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp5;
- else
- rp4 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp4;
- if (i & 0x08)
- rp7 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp7;
- else
- rp6 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp6;
- if (i & 0x10)
- rp9 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp9;
- else
- rp8 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp8;
- if (i & 0x20)
- rp11 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp11;
- else
- rp10 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp10;
- if (i & 0x40)
- rp13 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp13;
- else
- rp12 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp12;
- if (i & 0x80)
- rp15 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp15;
- else
- rp14 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ bit3 ^ bit2 ^ bit1 ^ bit0 ^ rp14;
- cp0 = bit6 ^ bit4 ^ bit2 ^ bit0 ^ cp0;
- cp1 = bit7 ^ bit5 ^ bit3 ^ bit1 ^ cp1;
- cp2 = bit5 ^ bit4 ^ bit1 ^ bit0 ^ cp2;
- cp3 = bit7 ^ bit6 ^ bit3 ^ bit2 ^ cp3
- cp4 = bit3 ^ bit2 ^ bit1 ^ bit0 ^ cp4
- cp5 = bit7 ^ bit6 ^ bit5 ^ bit4 ^ cp5
-}
-
-
-Analysis 0
-==========
-
-C does have bitwise operators but not really operators to do the above
-efficiently (and most hardware has no such instructions either).
-Therefore without implementing this it was clear that the code above was
-not going to bring me a Nobel prize :-)
-
-Fortunately the exclusive or operation is commutative, so we can combine
-the values in any order. So instead of calculating all the bits
-individually, let us try to rearrange things.
-For the column parity this is easy. We can just xor the bytes and in the
-end filter out the relevant bits. This is pretty nice as it will bring
-all cp calculation out of the for loop.
-
-Similarly we can first xor the bytes for the various rows.
-This leads to:
-
-
-Attempt 1
-=========
-
-const char parity[256] = {
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
- 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0
-};
-
-void ecc1(const unsigned char *buf, unsigned char *code)
-{
- int i;
- const unsigned char *bp = buf;
- unsigned char cur;
- unsigned char rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
- unsigned char rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
- unsigned char par;
-
- par = 0;
- rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
- rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
- rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
- rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
-
- for (i = 0; i < 256; i++)
- {
- cur = *bp++;
- par ^= cur;
- if (i & 0x01) rp1 ^= cur; else rp0 ^= cur;
- if (i & 0x02) rp3 ^= cur; else rp2 ^= cur;
- if (i & 0x04) rp5 ^= cur; else rp4 ^= cur;
- if (i & 0x08) rp7 ^= cur; else rp6 ^= cur;
- if (i & 0x10) rp9 ^= cur; else rp8 ^= cur;
- if (i & 0x20) rp11 ^= cur; else rp10 ^= cur;
- if (i & 0x40) rp13 ^= cur; else rp12 ^= cur;
- if (i & 0x80) rp15 ^= cur; else rp14 ^= cur;
- }
- code[0] =
- (parity[rp7] << 7) |
- (parity[rp6] << 6) |
- (parity[rp5] << 5) |
- (parity[rp4] << 4) |
- (parity[rp3] << 3) |
- (parity[rp2] << 2) |
- (parity[rp1] << 1) |
- (parity[rp0]);
- code[1] =
- (parity[rp15] << 7) |
- (parity[rp14] << 6) |
- (parity[rp13] << 5) |
- (parity[rp12] << 4) |
- (parity[rp11] << 3) |
- (parity[rp10] << 2) |
- (parity[rp9] << 1) |
- (parity[rp8]);
- code[2] =
- (parity[par & 0xf0] << 7) |
- (parity[par & 0x0f] << 6) |
- (parity[par & 0xcc] << 5) |
- (parity[par & 0x33] << 4) |
- (parity[par & 0xaa] << 3) |
- (parity[par & 0x55] << 2);
- code[0] = ~code[0];
- code[1] = ~code[1];
- code[2] = ~code[2];
-}
-
-Still pretty straightforward. The last three invert statements are there to
-give a checksum of 0xff 0xff 0xff for an empty flash. In an empty flash
-all data is 0xff, so the checksum then matches.
-
-I also introduced the parity lookup. I expected this to be the fastest
-way to calculate the parity, but I will investigate alternatives later
-on.
-
-
-Analysis 1
-==========
-
-The code works, but is not terribly efficient. On my system it took
-almost 4 times as much time as the linux driver code. But hey, if it was
-*that* easy this would have been done long before.
-No pain. no gain.
-
-Fortunately there is plenty of room for improvement.
-
-In step 1 we moved from bit-wise calculation to byte-wise calculation.
-However in C we can also use the unsigned long data type and virtually
-every modern microprocessor supports 32 bit operations, so why not try
-to write our code in such a way that we process data in 32 bit chunks.
-
-Of course this means some modification as the row parity is byte by
-byte. A quick analysis:
-for the column parity we use the par variable. When extending to 32 bits
-we can in the end easily calculate rp0 and rp1 from it.
-(because par now consists of 4 bytes, contributing to rp1, rp0, rp1, rp0
-respectively, from MSB to LSB)
-also rp2 and rp3 can be easily retrieved from par as rp3 covers the
-first two MSBs and rp2 covers the last two LSBs.
-
-Note that of course now the loop is executed only 64 times (256/4).
-And note that care must taken wrt byte ordering. The way bytes are
-ordered in a long is machine dependent, and might affect us.
-Anyway, if there is an issue: this code is developed on x86 (to be
-precise: a DELL PC with a D920 Intel CPU)
-
-And of course the performance might depend on alignment, but I expect
-that the I/O buffers in the nand driver are aligned properly (and
-otherwise that should be fixed to get maximum performance).
-
-Let's give it a try...
-
-
-Attempt 2
-=========
-
-extern const char parity[256];
-
-void ecc2(const unsigned char *buf, unsigned char *code)
-{
- int i;
- const unsigned long *bp = (unsigned long *)buf;
- unsigned long cur;
- unsigned long rp0, rp1, rp2, rp3, rp4, rp5, rp6, rp7;
- unsigned long rp8, rp9, rp10, rp11, rp12, rp13, rp14, rp15;
- unsigned long par;
-
- par = 0;
- rp0 = 0; rp1 = 0; rp2 = 0; rp3 = 0;
- rp4 = 0; rp5 = 0; rp6 = 0; rp7 = 0;
- rp8 = 0; rp9 = 0; rp10 = 0; rp11 = 0;
- rp12 = 0; rp13 = 0; rp14 = 0; rp15 = 0;
-
- for (i = 0; i < 64; i++)
- {
- cur = *bp++;
- par ^= cur;
- if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
- if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
- if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
- if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
- if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
- if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
- }
- /*
- we need to adapt the code generation for the fact that rp vars are now
- long; also the column parity calculation needs to be changed.
- we'll bring rp4 to 15 back to single byte entities by shifting and
- xoring
- */
- rp4 ^= (rp4 >> 16); rp4 ^= (rp4 >> 8); rp4 &= 0xff;
- rp5 ^= (rp5 >> 16); rp5 ^= (rp5 >> 8); rp5 &= 0xff;
- rp6 ^= (rp6 >> 16); rp6 ^= (rp6 >> 8); rp6 &= 0xff;
- rp7 ^= (rp7 >> 16); rp7 ^= (rp7 >> 8); rp7 &= 0xff;
- rp8 ^= (rp8 >> 16); rp8 ^= (rp8 >> 8); rp8 &= 0xff;
- rp9 ^= (rp9 >> 16); rp9 ^= (rp9 >> 8); rp9 &= 0xff;
- rp10 ^= (rp10 >> 16); rp10 ^= (rp10 >> 8); rp10 &= 0xff;
- rp11 ^= (rp11 >> 16); rp11 ^= (rp11 >> 8); rp11 &= 0xff;
- rp12 ^= (rp12 >> 16); rp12 ^= (rp12 >> 8); rp12 &= 0xff;
- rp13 ^= (rp13 >> 16); rp13 ^= (rp13 >> 8); rp13 &= 0xff;
- rp14 ^= (rp14 >> 16); rp14 ^= (rp14 >> 8); rp14 &= 0xff;
- rp15 ^= (rp15 >> 16); rp15 ^= (rp15 >> 8); rp15 &= 0xff;
- rp3 = (par >> 16); rp3 ^= (rp3 >> 8); rp3 &= 0xff;
- rp2 = par & 0xffff; rp2 ^= (rp2 >> 8); rp2 &= 0xff;
- par ^= (par >> 16);
- rp1 = (par >> 8); rp1 &= 0xff;
- rp0 = (par & 0xff);
- par ^= (par >> 8); par &= 0xff;
-
- code[0] =
- (parity[rp7] << 7) |
- (parity[rp6] << 6) |
- (parity[rp5] << 5) |
- (parity[rp4] << 4) |
- (parity[rp3] << 3) |
- (parity[rp2] << 2) |
- (parity[rp1] << 1) |
- (parity[rp0]);
- code[1] =
- (parity[rp15] << 7) |
- (parity[rp14] << 6) |
- (parity[rp13] << 5) |
- (parity[rp12] << 4) |
- (parity[rp11] << 3) |
- (parity[rp10] << 2) |
- (parity[rp9] << 1) |
- (parity[rp8]);
- code[2] =
- (parity[par & 0xf0] << 7) |
- (parity[par & 0x0f] << 6) |
- (parity[par & 0xcc] << 5) |
- (parity[par & 0x33] << 4) |
- (parity[par & 0xaa] << 3) |
- (parity[par & 0x55] << 2);
- code[0] = ~code[0];
- code[1] = ~code[1];
- code[2] = ~code[2];
-}
-
-The parity array is not shown any more. Note also that for these
-examples I kinda deviated from my regular programming style by allowing
-multiple statements on a line, not using { } in then and else blocks
-with only a single statement and by using operators like ^=
-
-
-Analysis 2
-==========
-
-The code (of course) works, and hurray: we are a little bit faster than
-the linux driver code (about 15%). But wait, don't cheer too quickly.
-There is more to be gained.
-If we look at e.g. rp14 and rp15 we see that we either xor our data with
-rp14 or with rp15. However we also have par which goes over all data.
-This means there is no need to calculate rp14 as it can be calculated from
-rp15 through rp14 = par ^ rp15, because par = rp14 ^ rp15;
-(or if desired we can avoid calculating rp15 and calculate it from
-rp14). That is why some places refer to inverse parity.
-Of course the same thing holds for rp4/5, rp6/7, rp8/9, rp10/11 and rp12/13.
-Effectively this means we can eliminate the else clause from the if
-statements. Also we can optimise the calculation in the end a little bit
-by going from long to byte first. Actually we can even avoid the table
-lookups
-
-Attempt 3
-=========
-
-Odd replaced:
- if (i & 0x01) rp5 ^= cur; else rp4 ^= cur;
- if (i & 0x02) rp7 ^= cur; else rp6 ^= cur;
- if (i & 0x04) rp9 ^= cur; else rp8 ^= cur;
- if (i & 0x08) rp11 ^= cur; else rp10 ^= cur;
- if (i & 0x10) rp13 ^= cur; else rp12 ^= cur;
- if (i & 0x20) rp15 ^= cur; else rp14 ^= cur;
-with
- if (i & 0x01) rp5 ^= cur;
- if (i & 0x02) rp7 ^= cur;
- if (i & 0x04) rp9 ^= cur;
- if (i & 0x08) rp11 ^= cur;
- if (i & 0x10) rp13 ^= cur;
- if (i & 0x20) rp15 ^= cur;
-
- and outside the loop added:
- rp4 = par ^ rp5;
- rp6 = par ^ rp7;
- rp8 = par ^ rp9;
- rp10 = par ^ rp11;
- rp12 = par ^ rp13;
- rp14 = par ^ rp15;
-
-And after that the code takes about 30% more time, although the number of
-statements is reduced. This is also reflected in the assembly code.
-
-
-Analysis 3
-==========
-
-Very weird. Guess it has to do with caching or instruction parallellism
-or so. I also tried on an eeePC (Celeron, clocked at 900 Mhz). Interesting
-observation was that this one is only 30% slower (according to time)
-executing the code as my 3Ghz D920 processor.
-
-Well, it was expected not to be easy so maybe instead move to a
-different track: let's move back to the code from attempt2 and do some
-loop unrolling. This will eliminate a few if statements. I'll try
-different amounts of unrolling to see what works best.
-
-
-Attempt 4
-=========
-
-Unrolled the loop 1, 2, 3 and 4 times.
-For 4 the code starts with:
-
- for (i = 0; i < 4; i++)
- {
- cur = *bp++;
- par ^= cur;
- rp4 ^= cur;
- rp6 ^= cur;
- rp8 ^= cur;
- rp10 ^= cur;
- if (i & 0x1) rp13 ^= cur; else rp12 ^= cur;
- if (i & 0x2) rp15 ^= cur; else rp14 ^= cur;
- cur = *bp++;
- par ^= cur;
- rp5 ^= cur;
- rp6 ^= cur;
- ...
-
-
-Analysis 4
-==========
-
-Unrolling once gains about 15%
-Unrolling twice keeps the gain at about 15%
-Unrolling three times gives a gain of 30% compared to attempt 2.
-Unrolling four times gives a marginal improvement compared to unrolling
-three times.
-
-I decided to proceed with a four time unrolled loop anyway. It was my gut
-feeling that in the next steps I would obtain additional gain from it.
-
-The next step was triggered by the fact that par contains the xor of all
-bytes and rp4 and rp5 each contain the xor of half of the bytes.
-So in effect par = rp4 ^ rp5. But as xor is commutative we can also say
-that rp5 = par ^ rp4. So no need to keep both rp4 and rp5 around. We can
-eliminate rp5 (or rp4, but I already foresaw another optimisation).
-The same holds for rp6/7, rp8/9, rp10/11 rp12/13 and rp14/15.
-
-
-Attempt 5
-=========
-
-Effectively so all odd digit rp assignments in the loop were removed.
-This included the else clause of the if statements.
-Of course after the loop we need to correct things by adding code like:
- rp5 = par ^ rp4;
-Also the initial assignments (rp5 = 0; etc) could be removed.
-Along the line I also removed the initialisation of rp0/1/2/3.
-
-
-Analysis 5
-==========
-
-Measurements showed this was a good move. The run-time roughly halved
-compared with attempt 4 with 4 times unrolled, and we only require 1/3rd
-of the processor time compared to the current code in the linux kernel.
-
-However, still I thought there was more. I didn't like all the if
-statements. Why not keep a running parity and only keep the last if
-statement. Time for yet another version!
-
-
-Attempt 6
-=========
-
-THe code within the for loop was changed to:
-
- for (i = 0; i < 4; i++)
- {
- cur = *bp++; tmppar = cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
-
- cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
-
- cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur; rp8 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= cur; rp8 ^= cur;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp8 ^= cur;
- cur = *bp++; tmppar ^= cur; rp8 ^= cur;
-
- cur = *bp++; tmppar ^= cur; rp4 ^= cur; rp6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur;
-
- par ^= tmppar;
- if ((i & 0x1) == 0) rp12 ^= tmppar;
- if ((i & 0x2) == 0) rp14 ^= tmppar;
- }
-
-As you can see tmppar is used to accumulate the parity within a for
-iteration. In the last 3 statements is added to par and, if needed,
-to rp12 and rp14.
-
-While making the changes I also found that I could exploit that tmppar
-contains the running parity for this iteration. So instead of having:
-rp4 ^= cur; rp6 ^= cur;
-I removed the rp6 ^= cur; statement and did rp6 ^= tmppar; on next
-statement. A similar change was done for rp8 and rp10
-
-
-Analysis 6
-==========
-
-Measuring this code again showed big gain. When executing the original
-linux code 1 million times, this took about 1 second on my system.
-(using time to measure the performance). After this iteration I was back
-to 0.075 sec. Actually I had to decide to start measuring over 10
-million iterations in order not to lose too much accuracy. This one
-definitely seemed to be the jackpot!
-
-There is a little bit more room for improvement though. There are three
-places with statements:
-rp4 ^= cur; rp6 ^= cur;
-It seems more efficient to also maintain a variable rp4_6 in the while
-loop; This eliminates 3 statements per loop. Of course after the loop we
-need to correct by adding:
- rp4 ^= rp4_6;
- rp6 ^= rp4_6
-Furthermore there are 4 sequential assignments to rp8. This can be
-encoded slightly more efficiently by saving tmppar before those 4 lines
-and later do rp8 = rp8 ^ tmppar ^ notrp8;
-(where notrp8 is the value of rp8 before those 4 lines).
-Again a use of the commutative property of xor.
-Time for a new test!
-
-
-Attempt 7
-=========
-
-The new code now looks like:
-
- for (i = 0; i < 4; i++)
- {
- cur = *bp++; tmppar = cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= tmppar;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur; rp8 ^= tmppar;
-
- cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur; rp10 ^= tmppar;
-
- notrp8 = tmppar;
- cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur;
- rp8 = rp8 ^ tmppar ^ notrp8;
-
- cur = *bp++; tmppar ^= cur; rp4_6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp6 ^= cur;
- cur = *bp++; tmppar ^= cur; rp4 ^= cur;
- cur = *bp++; tmppar ^= cur;
-
- par ^= tmppar;
- if ((i & 0x1) == 0) rp12 ^= tmppar;
- if ((i & 0x2) == 0) rp14 ^= tmppar;
- }
- rp4 ^= rp4_6;
- rp6 ^= rp4_6;
-
-
-Not a big change, but every penny counts :-)
-
-
-Analysis 7
-==========
-
-Actually this made things worse. Not very much, but I don't want to move
-into the wrong direction. Maybe something to investigate later. Could
-have to do with caching again.
-
-Guess that is what there is to win within the loop. Maybe unrolling one
-more time will help. I'll keep the optimisations from 7 for now.
-
-
-Attempt 8
-=========
-
-Unrolled the loop one more time.
-
-
-Analysis 8
-==========
-
-This makes things worse. Let's stick with attempt 6 and continue from there.
-Although it seems that the code within the loop cannot be optimised
-further there is still room to optimize the generation of the ecc codes.
-We can simply calculate the total parity. If this is 0 then rp4 = rp5
-etc. If the parity is 1, then rp4 = !rp5;
-But if rp4 = rp5 we do not need rp5 etc. We can just write the even bits
-in the result byte and then do something like
- code[0] |= (code[0] << 1);
-Lets test this.
-
-
-Attempt 9
-=========
-
-Changed the code but again this slightly degrades performance. Tried all
-kind of other things, like having dedicated parity arrays to avoid the
-shift after parity[rp7] << 7; No gain.
-Change the lookup using the parity array by using shift operators (e.g.
-replace parity[rp7] << 7 with:
-rp7 ^= (rp7 << 4);
-rp7 ^= (rp7 << 2);
-rp7 ^= (rp7 << 1);
-rp7 &= 0x80;
-No gain.
-
-The only marginal change was inverting the parity bits, so we can remove
-the last three invert statements.
-
-Ah well, pity this does not deliver more. Then again 10 million
-iterations using the linux driver code takes between 13 and 13.5
-seconds, whereas my code now takes about 0.73 seconds for those 10
-million iterations. So basically I've improved the performance by a
-factor 18 on my system. Not that bad. Of course on different hardware
-you will get different results. No warranties!
-
-But of course there is no such thing as a free lunch. The codesize almost
-tripled (from 562 bytes to 1434 bytes). Then again, it is not that much.
-
-
-Correcting errors
-=================
-
-For correcting errors I again used the ST application note as a starter,
-but I also peeked at the existing code.
-The algorithm itself is pretty straightforward. Just xor the given and
-the calculated ecc. If all bytes are 0 there is no problem. If 11 bits
-are 1 we have one correctable bit error. If there is 1 bit 1, we have an
-error in the given ecc code.
-It proved to be fastest to do some table lookups. Performance gain
-introduced by this is about a factor 2 on my system when a repair had to
-be done, and 1% or so if no repair had to be done.
-Code size increased from 330 bytes to 686 bytes for this function.
-(gcc 4.2, -O3)
-
-
-Conclusion
-==========
-
-The gain when calculating the ecc is tremendous. Om my development hardware
-a speedup of a factor of 18 for ecc calculation was achieved. On a test on an
-embedded system with a MIPS core a factor 7 was obtained.
-On a test with a Linksys NSLU2 (ARMv5TE processor) the speedup was a factor
-5 (big endian mode, gcc 4.1.2, -O3)
-For correction not much gain could be obtained (as bitflips are rare). Then
-again there are also much less cycles spent there.
-
-It seems there is not much more gain possible in this, at least when
-programmed in C. Of course it might be possible to squeeze something more
-out of it with an assembler program, but due to pipeline behaviour etc
-this is very tricky (at least for intel hw).
-
-Author: Frans Meulenbroeks
-Copyright (C) 2008 Koninklijke Philips Electronics NV.
diff --git a/Documentation/mtd/spi-nor.txt b/Documentation/mtd/spi-nor.txt
deleted file mode 100644
index da1fbff5a24c..000000000000
--- a/Documentation/mtd/spi-nor.txt
+++ /dev/null
@@ -1,65 +0,0 @@
- SPI NOR framework
- ============================================
-
-Part I - Why do we need this framework?
----------------------------------------
-
-SPI bus controllers (drivers/spi/) only deal with streams of bytes; the bus
-controller operates agnostic of the specific device attached. However, some
-controllers (such as Freescale's QuadSPI controller) cannot easily handle
-arbitrary streams of bytes, but rather are designed specifically for SPI NOR.
-
-In particular, Freescale's QuadSPI controller must know the NOR commands to
-find the right LUT sequence. Unfortunately, the SPI subsystem has no notion of
-opcodes, addresses, or data payloads; a SPI controller simply knows to send or
-receive bytes (Tx and Rx). Therefore, we must define a new layering scheme under
-which the controller driver is aware of the opcodes, addressing, and other
-details of the SPI NOR protocol.
-
-Part II - How does the framework work?
---------------------------------------
-
-This framework just adds a new layer between the MTD and the SPI bus driver.
-With this new layer, the SPI NOR controller driver does not depend on the
-m25p80 code anymore.
-
- Before this framework, the layer is like:
-
- MTD
- ------------------------
- m25p80
- ------------------------
- SPI bus driver
- ------------------------
- SPI NOR chip
-
- After this framework, the layer is like:
- MTD
- ------------------------
- SPI NOR framework
- ------------------------
- m25p80
- ------------------------
- SPI bus driver
- ------------------------
- SPI NOR chip
-
- With the SPI NOR controller driver (Freescale QuadSPI), it looks like:
- MTD
- ------------------------
- SPI NOR framework
- ------------------------
- fsl-quadSPI
- ------------------------
- SPI NOR chip
-
-Part III - How can drivers use the framework?
----------------------------------------------
-
-The main API is spi_nor_scan(). Before you call the hook, a driver should
-initialize the necessary fields for spi_nor{}. Please see
-drivers/mtd/spi-nor/spi-nor.c for detail. Please also refer to fsl-quadspi.c
-when you want to write a new driver for a SPI NOR controller.
-Another API is spi_nor_restore(), this is used to restore the status of SPI
-flash chip such as addressing mode. Call it whenever detach the driver from
-device or reboot the system.
diff --git a/Documentation/namespaces/compatibility-list.txt b/Documentation/namespaces/compatibility-list.txt
deleted file mode 100644
index defc5589bfcd..000000000000
--- a/Documentation/namespaces/compatibility-list.txt
+++ /dev/null
@@ -1,39 +0,0 @@
- Namespaces compatibility list
-
-This document contains the information about the problems user
-may have when creating tasks living in different namespaces.
-
-Here's the summary. This matrix shows the known problems, that
-occur when tasks share some namespace (the columns) while living
-in different other namespaces (the rows):
-
- UTS IPC VFS PID User Net
-UTS X
-IPC X 1
-VFS X
-PID 1 1 X
-User 2 2 X
-Net X
-
-1. Both the IPC and the PID namespaces provide IDs to address
- object inside the kernel. E.g. semaphore with IPCID or
- process group with pid.
-
- In both cases, tasks shouldn't try exposing this ID to some
- other task living in a different namespace via a shared filesystem
- or IPC shmem/message. The fact is that this ID is only valid
- within the namespace it was obtained in and may refer to some
- other object in another namespace.
-
-2. Intentionally, two equal user IDs in different user namespaces
- should not be equal from the VFS point of view. In other
- words, user 10 in one user namespace shouldn't have the same
- access permissions to files, belonging to user 10 in another
- namespace.
-
- The same is true for the IPC namespaces being shared - two users
- from different user namespaces should not access the same IPC objects
- even having equal UIDs.
-
- But currently this is not so.
-
diff --git a/Documentation/namespaces/resource-control.txt b/Documentation/namespaces/resource-control.txt
deleted file mode 100644
index abc13c394738..000000000000
--- a/Documentation/namespaces/resource-control.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-There are a lot of kinds of objects in the kernel that don't have
-individual limits or that have limits that are ineffective when a set
-of processes is allowed to switch user ids. With user namespaces
-enabled in a kernel for people who don't trust their users or their
-users programs to play nice this problems becomes more acute.
-
-Therefore it is recommended that memory control groups be enabled in
-kernels that enable user namespaces, and it is further recommended
-that userspace configure memory control groups to limit how much
-memory user's they don't trust to play nice can use.
-
-Memory control groups can be configured by installing the libcgroup
-package present on most distros editing /etc/cgrules.conf,
-/etc/cgconfig.conf and setting up libpam-cgroup.
diff --git a/Documentation/netlabel/index.rst b/Documentation/netlabel/index.rst
index 47f1e0e5acd1..984e1b191b12 100644
--- a/Documentation/netlabel/index.rst
+++ b/Documentation/netlabel/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
========
NetLabel
diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst
index e14d7d40fc75..eeedc2e826aa 100644
--- a/Documentation/networking/af_xdp.rst
+++ b/Documentation/networking/af_xdp.rst
@@ -220,7 +220,21 @@ Usage
In order to use AF_XDP sockets there are two parts needed. The
user-space application and the XDP program. For a complete setup and
usage example, please refer to the sample application. The user-space
-side is xdpsock_user.c and the XDP side xdpsock_kern.c.
+side is xdpsock_user.c and the XDP side is part of libbpf.
+
+The XDP code sample included in tools/lib/bpf/xsk.c is the following::
+
+ SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
+ {
+ int index = ctx->rx_queue_index;
+
+ // A set entry here means that the correspnding queue_id
+ // has an active AF_XDP socket bound to it.
+ if (bpf_map_lookup_elem(&xsks_map, &index))
+ return bpf_redirect_map(&xsks_map, index, 0);
+
+ return XDP_PASS;
+ }
Naive ring dequeue and enqueue could look like this::
@@ -316,16 +330,16 @@ A: When a netdev of a physical NIC is initialized, Linux usually
all the traffic, you can force the netdev to only have 1 queue, queue
id 0, and then bind to queue 0. You can use ethtool to do this::
- sudo ethtool -L <interface> combined 1
+ sudo ethtool -L <interface> combined 1
If you want to only see part of the traffic, you can program the
NIC through ethtool to filter out your traffic to a single queue id
that you can bind your XDP socket to. Here is one example in which
UDP traffic to and from port 4242 are sent to queue 2::
- sudo ethtool -N <interface> rx-flow-hash udp4 fn
- sudo ethtool -N <interface> flow-type udp4 src-port 4242 dst-port \
- 4242 action 2
+ sudo ethtool -N <interface> rx-flow-hash udp4 fn
+ sudo ethtool -N <interface> flow-type udp4 src-port 4242 dst-port \
+ 4242 action 2
A number of other ways are possible all up to the capabilitites of
the NIC you have.
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index d3e5dd26db12..e3abfbd32f71 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -706,9 +706,9 @@ num_unsol_na
unsolicited IPv6 Neighbor Advertisements) to be issued after a
failover event. As soon as the link is up on the new slave
(possibly immediately) a peer notification is sent on the
- bonding device and each VLAN sub-device. This is repeated at
- each link monitor interval (arp_interval or miimon, whichever
- is active) if the number is greater than 1.
+ bonding device and each VLAN sub-device. This is repeated at
+ the rate specified by peer_notif_delay if the number is
+ greater than 1.
The valid range is 0 - 255; the default value is 1. These options
affect only the active-backup mode. These options were added for
@@ -727,6 +727,16 @@ packets_per_slave
The valid range is 0 - 65535; the default value is 1. This option
has effect only in balance-rr mode.
+peer_notif_delay
+
+ Specify the delay, in milliseconds, between each peer
+ notification (gratuitous ARP and unsolicited IPv6 Neighbor
+ Advertisement) when they are issued after a failover event.
+ This delay should be a multiple of the link monitor interval
+ (arp_interval or miimon, whichever is active). The default
+ value is 0 which means to match the value of the link monitor
+ interval.
+
primary
A string (eth0, eth2, etc) specifying which slave is the
diff --git a/Documentation/networking/device_drivers/amazon/ena.txt b/Documentation/networking/device_drivers/amazon/ena.txt
index 2b4b6f57e549..1bb55c7b604c 100644
--- a/Documentation/networking/device_drivers/amazon/ena.txt
+++ b/Documentation/networking/device_drivers/amazon/ena.txt
@@ -73,7 +73,7 @@ operation.
AQ is used for submitting management commands, and the
results/responses are reported asynchronously through ACQ.
-ENA introduces a very small set of management commands with room for
+ENA introduces a small set of management commands with room for
vendor-specific extensions. Most of the management operations are
framed in a generic Get/Set feature command.
@@ -202,11 +202,14 @@ delay value to each level.
The user can enable/disable adaptive moderation, modify the interrupt
delay table and restore its default values through sysfs.
+RX copybreak:
+=============
The rx_copybreak is initialized by default to ENA_DEFAULT_RX_COPYBREAK
and can be configured by the ETHTOOL_STUNABLE command of the
SIOCETHTOOL ioctl.
SKB:
+====
The driver-allocated SKB for frames received from Rx handling using
NAPI context. The allocation method depends on the size of the packet.
If the frame length is larger than rx_copybreak, napi_get_frags()
diff --git a/Documentation/networking/device_drivers/aquantia/atlantic.txt b/Documentation/networking/device_drivers/aquantia/atlantic.txt
new file mode 100644
index 000000000000..d235cbaeccc6
--- /dev/null
+++ b/Documentation/networking/device_drivers/aquantia/atlantic.txt
@@ -0,0 +1,439 @@
+aQuantia AQtion Driver for the aQuantia Multi-Gigabit PCI Express Family of
+Ethernet Adapters
+=============================================================================
+
+Contents
+========
+
+- Identifying Your Adapter
+- Configuration
+- Supported ethtool options
+- Command Line Parameters
+- Config file parameters
+- Support
+- License
+
+Identifying Your Adapter
+========================
+
+The driver in this release is compatible with AQC-100, AQC-107, AQC-108 based ethernet adapters.
+
+
+SFP+ Devices (for AQC-100 based adapters)
+----------------------------------
+
+This release tested with passive Direct Attach Cables (DAC) and SFP+/LC Optical Transceiver.
+
+Configuration
+=========================
+ Viewing Link Messages
+ ---------------------
+ Link messages will not be displayed to the console if the distribution is
+ restricting system messages. In order to see network driver link messages on
+ your console, set dmesg to eight by entering the following:
+
+ dmesg -n 8
+
+ NOTE: This setting is not saved across reboots.
+
+ Jumbo Frames
+ ------------
+ The driver supports Jumbo Frames for all adapters. Jumbo Frames support is
+ enabled by changing the MTU to a value larger than the default of 1500.
+ The maximum value for the MTU is 16000. Use the `ip` command to
+ increase the MTU size. For example:
+
+ ip link set mtu 16000 dev enp1s0
+
+ ethtool
+ -------
+ The driver utilizes the ethtool interface for driver configuration and
+ diagnostics, as well as displaying statistical information. The latest
+ ethtool version is required for this functionality.
+
+ NAPI
+ ----
+ NAPI (Rx polling mode) is supported in the atlantic driver.
+
+Supported ethtool options
+============================
+ Viewing adapter settings
+ ---------------------
+ ethtool <ethX>
+
+ Output example:
+
+ Settings for enp1s0:
+ Supported ports: [ TP ]
+ Supported link modes: 100baseT/Full
+ 1000baseT/Full
+ 10000baseT/Full
+ 2500baseT/Full
+ 5000baseT/Full
+ Supported pause frame use: Symmetric
+ Supports auto-negotiation: Yes
+ Supported FEC modes: Not reported
+ Advertised link modes: 100baseT/Full
+ 1000baseT/Full
+ 10000baseT/Full
+ 2500baseT/Full
+ 5000baseT/Full
+ Advertised pause frame use: Symmetric
+ Advertised auto-negotiation: Yes
+ Advertised FEC modes: Not reported
+ Speed: 10000Mb/s
+ Duplex: Full
+ Port: Twisted Pair
+ PHYAD: 0
+ Transceiver: internal
+ Auto-negotiation: on
+ MDI-X: Unknown
+ Supports Wake-on: g
+ Wake-on: d
+ Link detected: yes
+
+ ---
+ Note: AQrate speeds (2.5/5 Gb/s) will be displayed only with linux kernels > 4.10.
+ But you can still use these speeds:
+ ethtool -s eth0 autoneg off speed 2500
+
+ Viewing adapter information
+ ---------------------
+ ethtool -i <ethX>
+
+ Output example:
+
+ driver: atlantic
+ version: 5.2.0-050200rc5-generic-kern
+ firmware-version: 3.1.78
+ expansion-rom-version:
+ bus-info: 0000:01:00.0
+ supports-statistics: yes
+ supports-test: no
+ supports-eeprom-access: no
+ supports-register-dump: yes
+ supports-priv-flags: no
+
+
+ Viewing Ethernet adapter statistics:
+ ---------------------
+ ethtool -S <ethX>
+
+ Output example:
+ NIC statistics:
+ InPackets: 13238607
+ InUCast: 13293852
+ InMCast: 52
+ InBCast: 3
+ InErrors: 0
+ OutPackets: 23703019
+ OutUCast: 23704941
+ OutMCast: 67
+ OutBCast: 11
+ InUCastOctects: 213182760
+ OutUCastOctects: 22698443
+ InMCastOctects: 6600
+ OutMCastOctects: 8776
+ InBCastOctects: 192
+ OutBCastOctects: 704
+ InOctects: 2131839552
+ OutOctects: 226938073
+ InPacketsDma: 95532300
+ OutPacketsDma: 59503397
+ InOctetsDma: 1137102462
+ OutOctetsDma: 2394339518
+ InDroppedDma: 0
+ Queue[0] InPackets: 23567131
+ Queue[0] OutPackets: 20070028
+ Queue[0] InJumboPackets: 0
+ Queue[0] InLroPackets: 0
+ Queue[0] InErrors: 0
+ Queue[1] InPackets: 45428967
+ Queue[1] OutPackets: 11306178
+ Queue[1] InJumboPackets: 0
+ Queue[1] InLroPackets: 0
+ Queue[1] InErrors: 0
+ Queue[2] InPackets: 3187011
+ Queue[2] OutPackets: 13080381
+ Queue[2] InJumboPackets: 0
+ Queue[2] InLroPackets: 0
+ Queue[2] InErrors: 0
+ Queue[3] InPackets: 23349136
+ Queue[3] OutPackets: 15046810
+ Queue[3] InJumboPackets: 0
+ Queue[3] InLroPackets: 0
+ Queue[3] InErrors: 0
+
+ Interrupt coalescing support
+ ---------------------------------
+ ITR mode, TX/RX coalescing timings could be viewed with:
+
+ ethtool -c <ethX>
+
+ and changed with:
+
+ ethtool -C <ethX> tx-usecs <usecs> rx-usecs <usecs>
+
+ To disable coalescing:
+
+ ethtool -C <ethX> tx-usecs 0 rx-usecs 0 tx-max-frames 1 tx-max-frames 1
+
+ Wake on LAN support
+ ---------------------------------
+
+ WOL support by magic packet:
+
+ ethtool -s <ethX> wol g
+
+ To disable WOL:
+
+ ethtool -s <ethX> wol d
+
+ Set and check the driver message level
+ ---------------------------------
+
+ Set message level
+
+ ethtool -s <ethX> msglvl <level>
+
+ Level values:
+
+ 0x0001 - general driver status.
+ 0x0002 - hardware probing.
+ 0x0004 - link state.
+ 0x0008 - periodic status check.
+ 0x0010 - interface being brought down.
+ 0x0020 - interface being brought up.
+ 0x0040 - receive error.
+ 0x0080 - transmit error.
+ 0x0200 - interrupt handling.
+ 0x0400 - transmit completion.
+ 0x0800 - receive completion.
+ 0x1000 - packet contents.
+ 0x2000 - hardware status.
+ 0x4000 - Wake-on-LAN status.
+
+ By default, the level of debugging messages is set 0x0001(general driver status).
+
+ Check message level
+
+ ethtool <ethX> | grep "Current message level"
+
+ If you want to disable the output of messages
+
+ ethtool -s <ethX> msglvl 0
+
+ RX flow rules (ntuple filters)
+ ---------------------------------
+ There are separate rules supported, that applies in that order:
+ 1. 16 VLAN ID rules
+ 2. 16 L2 EtherType rules
+ 3. 8 L3/L4 5-Tuple rules
+
+
+ The driver utilizes the ethtool interface for configuring ntuple filters,
+ via "ethtool -N <device> <filter>".
+
+ To enable or disable the RX flow rules:
+
+ ethtool -K ethX ntuple <on|off>
+
+ When disabling ntuple filters, all the user programed filters are
+ flushed from the driver cache and hardware. All needed filters must
+ be re-added when ntuple is re-enabled.
+
+ Because of the fixed order of the rules, the location of filters is also fixed:
+ - Locations 0 - 15 for VLAN ID filters
+ - Locations 16 - 31 for L2 EtherType filters
+ - Locations 32 - 39 for L3/L4 5-tuple filters (locations 32, 36 for IPv6)
+
+ The L3/L4 5-tuple (protocol, source and destination IP address, source and
+ destination TCP/UDP/SCTP port) is compared against 8 filters. For IPv4, up to
+ 8 source and destination addresses can be matched. For IPv6, up to 2 pairs of
+ addresses can be supported. Source and destination ports are only compared for
+ TCP/UDP/SCTP packets.
+
+ To add a filter that directs packet to queue 5, use <-N|-U|--config-nfc|--config-ntuple> switch:
+
+ ethtool -N <ethX> flow-type udp4 src-ip 10.0.0.1 dst-ip 10.0.0.2 src-port 2000 dst-port 2001 action 5 <loc 32>
+
+ - action is the queue number.
+ - loc is the rule number.
+
+ For "flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6" you must set the loc
+ number within 32 - 39.
+ For "flow-type ip4|udp4|tcp4|sctp4|ip6|udp6|tcp6|sctp6" you can set 8 rules
+ for traffic IPv4 or you can set 2 rules for traffic IPv6. Loc number traffic
+ IPv6 is 32 and 36.
+ At the moment you can not use IPv4 and IPv6 filters at the same time.
+
+ Example filter for IPv6 filter traffic:
+
+ sudo ethtool -N <ethX> flow-type tcp6 src-ip 2001:db8:0:f101::1 dst-ip 2001:db8:0:f101::2 action 1 loc 32
+ sudo ethtool -N <ethX> flow-type ip6 src-ip 2001:db8:0:f101::2 dst-ip 2001:db8:0:f101::5 action -1 loc 36
+
+ Example filter for IPv4 filter traffic:
+
+ sudo ethtool -N <ethX> flow-type udp4 src-ip 10.0.0.4 dst-ip 10.0.0.7 src-port 2000 dst-port 2001 loc 32
+ sudo ethtool -N <ethX> flow-type tcp4 src-ip 10.0.0.3 dst-ip 10.0.0.9 src-port 2000 dst-port 2001 loc 33
+ sudo ethtool -N <ethX> flow-type ip4 src-ip 10.0.0.6 dst-ip 10.0.0.4 loc 34
+
+ If you set action -1, then all traffic corresponding to the filter will be discarded.
+ The maximum value action is 31.
+
+
+ The VLAN filter (VLAN id) is compared against 16 filters.
+ VLAN id must be accompanied by mask 0xF000. That is to distinguish VLAN filter
+ from L2 Ethertype filter with UserPriority since both User Priority and VLAN ID
+ are passed in the same 'vlan' parameter.
+
+ To add a filter that directs packets from VLAN 2001 to queue 5:
+ ethtool -N <ethX> flow-type ip4 vlan 2001 m 0xF000 action 1 loc 0
+
+
+ L2 EtherType filters allows filter packet by EtherType field or both EtherType
+ and User Priority (PCP) field of 802.1Q.
+ UserPriority (vlan) parameter must be accompanied by mask 0x1FFF. That is to
+ distinguish VLAN filter from L2 Ethertype filter with UserPriority since both
+ User Priority and VLAN ID are passed in the same 'vlan' parameter.
+
+ To add a filter that directs IP4 packess of priority 3 to queue 3:
+ ethtool -N <ethX> flow-type ether proto 0x800 vlan 0x600 m 0x1FFF action 3 loc 16
+
+
+ To see the list of filters currently present:
+
+ ethtool <-u|-n|--show-nfc|--show-ntuple> <ethX>
+
+ Rules may be deleted from the table itself. This is done using:
+
+ sudo ethtool <-N|-U|--config-nfc|--config-ntuple> <ethX> delete <loc>
+
+ - loc is the rule number to be deleted.
+
+ Rx filters is an interface to load the filter table that funnels all flow
+ into queue 0 unless an alternative queue is specified using "action". In that
+ case, any flow that matches the filter criteria will be directed to the
+ appropriate queue. RX filters is supported on all kernels 2.6.30 and later.
+
+ RSS for UDP
+ ---------------------------------
+ Currently, NIC does not support RSS for fragmented IP packets, which leads to
+ incorrect working of RSS for fragmented UDP traffic. To disable RSS for UDP the
+ RX Flow L3/L4 rule may be used.
+
+ Example:
+ ethtool -N eth0 flow-type udp4 action 0 loc 32
+
+Command Line Parameters
+=======================
+The following command line parameters are available on atlantic driver:
+
+aq_itr -Interrupt throttling mode
+----------------------------------------
+Accepted values: 0, 1, 0xFFFF
+Default value: 0xFFFF
+0 - Disable interrupt throttling.
+1 - Enable interrupt throttling and use specified tx and rx rates.
+0xFFFF - Auto throttling mode. Driver will choose the best RX and TX
+ interrupt throtting settings based on link speed.
+
+aq_itr_tx - TX interrupt throttle rate
+----------------------------------------
+Accepted values: 0 - 0x1FF
+Default value: 0
+TX side throttling in microseconds. Adapter will setup maximum interrupt delay
+to this value. Minimum interrupt delay will be a half of this value
+
+aq_itr_rx - RX interrupt throttle rate
+----------------------------------------
+Accepted values: 0 - 0x1FF
+Default value: 0
+RX side throttling in microseconds. Adapter will setup maximum interrupt delay
+to this value. Minimum interrupt delay will be a half of this value
+
+Note: ITR settings could be changed in runtime by ethtool -c means (see below)
+
+Config file parameters
+=======================
+For some fine tuning and performance optimizations,
+some parameters can be changed in the {source_dir}/aq_cfg.h file.
+
+AQ_CFG_RX_PAGEORDER
+----------------------------------------
+Default value: 0
+RX page order override. Thats a power of 2 number of RX pages allocated for
+each descriptor. Received descriptor size is still limited by AQ_CFG_RX_FRAME_MAX.
+Increasing pageorder makes page reuse better (actual on iommu enabled systems).
+
+AQ_CFG_RX_REFILL_THRES
+----------------------------------------
+Default value: 32
+RX refill threshold. RX path will not refill freed descriptors until the
+specified number of free descriptors is observed. Larger values may help
+better page reuse but may lead to packet drops as well.
+
+AQ_CFG_VECS_DEF
+------------------------------------------------------------
+Number of queues
+Valid Range: 0 - 8 (up to AQ_CFG_VECS_MAX)
+Default value: 8
+Notice this value will be capped by the number of cores available on the system.
+
+AQ_CFG_IS_RSS_DEF
+------------------------------------------------------------
+Enable/disable Receive Side Scaling
+
+This feature allows the adapter to distribute receive processing
+across multiple CPU-cores and to prevent from overloading a single CPU core.
+
+Valid values
+0 - disabled
+1 - enabled
+
+Default value: 1
+
+AQ_CFG_NUM_RSS_QUEUES_DEF
+------------------------------------------------------------
+Number of queues for Receive Side Scaling
+Valid Range: 0 - 8 (up to AQ_CFG_VECS_DEF)
+
+Default value: AQ_CFG_VECS_DEF
+
+AQ_CFG_IS_LRO_DEF
+------------------------------------------------------------
+Enable/disable Large Receive Offload
+
+This offload enables the adapter to coalesce multiple TCP segments and indicate
+them as a single coalesced unit to the OS networking subsystem.
+The system consumes less energy but it also introduces more latency in packets processing.
+
+Valid values
+0 - disabled
+1 - enabled
+
+Default value: 1
+
+AQ_CFG_TX_CLEAN_BUDGET
+----------------------------------------
+Maximum descriptors to cleanup on TX at once.
+Default value: 256
+
+After the aq_cfg.h file changed the driver must be rebuilt to take effect.
+
+Support
+=======
+
+If an issue is identified with the released source code on the supported
+kernel with a supported adapter, email the specific information related
+to the issue to support@aquantia.com
+
+License
+=======
+
+aQuantia Corporation Network Driver
+Copyright(c) 2014 - 2019 aQuantia Corporation.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms and conditions of the GNU General Public License,
+version 2, as published by the Free Software Foundation.
diff --git a/Documentation/networking/device_drivers/google/gve.rst b/Documentation/networking/device_drivers/google/gve.rst
new file mode 100644
index 000000000000..793693cef6e3
--- /dev/null
+++ b/Documentation/networking/device_drivers/google/gve.rst
@@ -0,0 +1,123 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+==============================================================
+Linux kernel driver for Compute Engine Virtual Ethernet (gve):
+==============================================================
+
+Supported Hardware
+===================
+The GVE driver binds to a single PCI device id used by the virtual
+Ethernet device found in some Compute Engine VMs.
+
++--------------+----------+---------+
+|Field | Value | Comments|
++==============+==========+=========+
+|Vendor ID | `0x1AE0` | Google |
++--------------+----------+---------+
+|Device ID | `0x0042` | |
++--------------+----------+---------+
+|Sub-vendor ID | `0x1AE0` | Google |
++--------------+----------+---------+
+|Sub-device ID | `0x0058` | |
++--------------+----------+---------+
+|Revision ID | `0x0` | |
++--------------+----------+---------+
+|Device Class | `0x200` | Ethernet|
++--------------+----------+---------+
+
+PCI Bars
+========
+The gVNIC PCI device exposes three 32-bit memory BARS:
+- Bar0 - Device configuration and status registers.
+- Bar1 - MSI-X vector table
+- Bar2 - IRQ, RX and TX doorbells
+
+Device Interactions
+===================
+The driver interacts with the device in the following ways:
+ - Registers
+ - A block of MMIO registers
+ - See gve_register.h for more detail
+ - Admin Queue
+ - See description below
+ - Reset
+ - At any time the device can be reset
+ - Interrupts
+ - See supported interrupts below
+ - Transmit and Receive Queues
+ - See description below
+
+Registers
+---------
+All registers are MMIO and big endian.
+
+The registers are used for initializing and configuring the device as well as
+querying device status in response to management interrupts.
+
+Admin Queue (AQ)
+----------------
+The Admin Queue is a PAGE_SIZE memory block, treated as an array of AQ
+commands, used by the driver to issue commands to the device and set up
+resources.The driver and the device maintain a count of how many commands
+have been submitted and executed. To issue AQ commands, the driver must do
+the following (with proper locking):
+
+1) Copy new commands into next available slots in the AQ array
+2) Increment its counter by he number of new commands
+3) Write the counter into the GVE_ADMIN_QUEUE_DOORBELL register
+4) Poll the ADMIN_QUEUE_EVENT_COUNTER register until it equals
+ the value written to the doorbell, or until a timeout.
+
+The device will update the status field in each AQ command reported as
+executed through the ADMIN_QUEUE_EVENT_COUNTER register.
+
+Device Resets
+-------------
+A device reset is triggered by writing 0x0 to the AQ PFN register.
+This causes the device to release all resources allocated by the
+driver, including the AQ itself.
+
+Interrupts
+----------
+The following interrupts are supported by the driver:
+
+Management Interrupt
+~~~~~~~~~~~~~~~~~~~~
+The management interrupt is used by the device to tell the driver to
+look at the GVE_DEVICE_STATUS register.
+
+The handler for the management irq simply queues the service task in
+the workqueue to check the register and acks the irq.
+
+Notification Block Interrupts
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The notification block interrupts are used to tell the driver to poll
+the queues associated with that interrupt.
+
+The handler for these irqs schedule the napi for that block to run
+and poll the queues.
+
+Traffic Queues
+--------------
+gVNIC's queues are composed of a descriptor ring and a buffer and are
+assigned to a notification block.
+
+The descriptor rings are power-of-two-sized ring buffers consisting of
+fixed-size descriptors. They advance their head pointer using a __be32
+doorbell located in Bar2. The tail pointers are advanced by consuming
+descriptors in-order and updating a __be32 counter. Both the doorbell
+and the counter overflow to zero.
+
+Each queue's buffers must be registered in advance with the device as a
+queue page list, and packet data can only be put in those pages.
+
+Transmit
+~~~~~~~~
+gve maps the buffers for transmit rings into a FIFO and copies the packets
+into the FIFO before sending them to the NIC.
+
+Receive
+~~~~~~~
+The buffers for receive rings are put into a data ring that is the same
+length as the descriptor ring and the head and tail pointers advance over
+the rings together.
diff --git a/Documentation/networking/device_drivers/index.rst b/Documentation/networking/device_drivers/index.rst
index 75fa537763a4..2b7fefe72351 100644
--- a/Documentation/networking/device_drivers/index.rst
+++ b/Documentation/networking/device_drivers/index.rst
@@ -21,6 +21,8 @@ Contents:
intel/i40e
intel/iavf
intel/ice
+ google/gve
+ mellanox/mlx5
.. only:: subproject
diff --git a/Documentation/networking/device_drivers/mellanox/mlx5.rst b/Documentation/networking/device_drivers/mellanox/mlx5.rst
new file mode 100644
index 000000000000..214325897732
--- /dev/null
+++ b/Documentation/networking/device_drivers/mellanox/mlx5.rst
@@ -0,0 +1,192 @@
+.. SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+
+=================================================
+Mellanox ConnectX(R) mlx5 core VPI Network Driver
+=================================================
+
+Copyright (c) 2019, Mellanox Technologies LTD.
+
+Contents
+========
+
+- `Enabling the driver and kconfig options`_
+- `Devlink info`_
+- `Devlink health reporters`_
+
+Enabling the driver and kconfig options
+================================================
+
+| mlx5 core is modular and most of the major mlx5 core driver features can be selected (compiled in/out)
+| at build time via kernel Kconfig flags.
+| Basic features, ethernet net device rx/tx offloads and XDP, are available with the most basic flags
+| CONFIG_MLX5_CORE=y/m and CONFIG_MLX5_CORE_EN=y.
+| For the list of advanced features please see below.
+
+**CONFIG_MLX5_CORE=(y/m/n)** (module mlx5_core.ko)
+
+| The driver can be enabled by choosing CONFIG_MLX5_CORE=y/m in kernel config.
+| This will provide mlx5 core driver for mlx5 ulps to interface with (mlx5e, mlx5_ib).
+
+
+**CONFIG_MLX5_CORE_EN=(y/n)**
+
+| Choosing this option will allow basic ethernet netdevice support with all of the standard rx/tx offloads.
+| mlx5e is the mlx5 ulp driver which provides netdevice kernel interface, when chosen, mlx5e will be
+| built-in into mlx5_core.ko.
+
+
+**CONFIG_MLX5_EN_ARFS=(y/n)**
+
+| Enables Hardware-accelerated receive flow steering (arfs) support, and ntuple filtering.
+| https://community.mellanox.com/s/article/howto-configure-arfs-on-connectx-4
+
+
+**CONFIG_MLX5_EN_RXNFC=(y/n)**
+
+| Enables ethtool receive network flow classification, which allows user defined
+| flow rules to direct traffic into arbitrary rx queue via ethtool set/get_rxnfc API.
+
+
+**CONFIG_MLX5_CORE_EN_DCB=(y/n)**:
+
+| Enables `Data Center Bridging (DCB) Support <https://community.mellanox.com/s/article/howto-auto-config-pfc-and-ets-on-connectx-4-via-lldp-dcbx>`_.
+
+
+**CONFIG_MLX5_MPFS=(y/n)**
+
+| Ethernet Multi-Physical Function Switch (MPFS) support in ConnectX NIC.
+| MPFs is required for when `Multi-Host <http://www.mellanox.com/page/multihost>`_ configuration is enabled to allow passing
+| user configured unicast MAC addresses to the requesting PF.
+
+
+**CONFIG_MLX5_ESWITCH=(y/n)**
+
+| Ethernet SRIOV E-Switch support in ConnectX NIC. E-Switch provides internal SRIOV packet steering
+| and switching for the enabled VFs and PF in two available modes:
+| 1) `Legacy SRIOV mode (L2 mac vlan steering based) <https://community.mellanox.com/s/article/howto-configure-sr-iov-for-connectx-4-connectx-5-with-kvm--ethernet-x>`_.
+| 2) `Switchdev mode (eswitch offloads) <https://www.mellanox.com/related-docs/prod_software/ASAP2_Hardware_Offloading_for_vSwitches_User_Manual_v4.4.pdf>`_.
+
+
+**CONFIG_MLX5_CORE_IPOIB=(y/n)**
+
+| IPoIB offloads & acceleration support.
+| Requires CONFIG_MLX5_CORE_EN to provide an accelerated interface for the rdma
+| IPoIB ulp netdevice.
+
+
+**CONFIG_MLX5_FPGA=(y/n)**
+
+| Build support for the Innova family of network cards by Mellanox Technologies.
+| Innova network cards are comprised of a ConnectX chip and an FPGA chip on one board.
+| If you select this option, the mlx5_core driver will include the Innova FPGA core and allow
+| building sandbox-specific client drivers.
+
+
+**CONFIG_MLX5_EN_IPSEC=(y/n)**
+
+| Enables `IPSec XFRM cryptography-offload accelaration <http://www.mellanox.com/related-docs/prod_software/Mellanox_Innova_IPsec_Ethernet_Adapter_Card_User_Manual.pdf>`_.
+
+**CONFIG_MLX5_EN_TLS=(y/n)**
+
+| TLS cryptography-offload accelaration.
+
+
+**CONFIG_MLX5_INFINIBAND=(y/n/m)** (module mlx5_ib.ko)
+
+| Provides low-level InfiniBand/RDMA and `RoCE <https://community.mellanox.com/s/article/recommended-network-configuration-examples-for-roce-deployment>`_ support.
+
+
+**External options** ( Choose if the corresponding mlx5 feature is required )
+
+- CONFIG_PTP_1588_CLOCK: When chosen, mlx5 ptp support will be enabled
+- CONFIG_VXLAN: When chosen, mlx5 vxaln support will be enabled.
+- CONFIG_MLXFW: When chosen, mlx5 firmware flashing support will be enabled (via devlink and ethtool).
+
+Devlink info
+============
+
+The devlink info reports the running and stored firmware versions on device.
+It also prints the device PSID which represents the HCA board type ID.
+
+User command example::
+
+ $ devlink dev info pci/0000:00:06.0
+ pci/0000:00:06.0:
+ driver mlx5_core
+ versions:
+ fixed:
+ fw.psid MT_0000000009
+ running:
+ fw.version 16.26.0100
+ stored:
+ fw.version 16.26.0100
+
+Devlink health reporters
+========================
+
+tx reporter
+-----------
+The tx reporter is responsible of two error scenarios:
+
+- TX timeout
+ Report on kernel tx timeout detection.
+ Recover by searching lost interrupts.
+- TX error completion
+ Report on error tx completion.
+ Recover by flushing the TX queue and reset it.
+
+TX reporter also support Diagnose callback, on which it provides
+real time information of its send queues status.
+
+User commands examples:
+
+- Diagnose send queues status::
+
+ $ devlink health diagnose pci/0000:82:00.0 reporter tx
+
+- Show number of tx errors indicated, number of recover flows ended successfully,
+ is autorecover enabled and graceful period from last recover::
+
+ $ devlink health show pci/0000:82:00.0 reporter tx
+
+fw reporter
+-----------
+The fw reporter implements diagnose and dump callbacks.
+It follows symptoms of fw error such as fw syndrome by triggering
+fw core dump and storing it into the dump buffer.
+The fw reporter diagnose command can be triggered any time by the user to check
+current fw status.
+
+User commands examples:
+
+- Check fw heath status::
+
+ $ devlink health diagnose pci/0000:82:00.0 reporter fw
+
+- Read FW core dump if already stored or trigger new one::
+
+ $ devlink health dump show pci/0000:82:00.0 reporter fw
+
+NOTE: This command can run only on the PF which has fw tracer ownership,
+running it on other PF or any VF will return "Operation not permitted".
+
+fw fatal reporter
+-----------------
+The fw fatal reporter implements dump and recover callbacks.
+It follows fatal errors indications by CR-space dump and recover flow.
+The CR-space dump uses vsc interface which is valid even if the FW command
+interface is not functional, which is the case in most FW fatal errors.
+The recover function runs recover flow which reloads the driver and triggers fw
+reset if needed.
+
+User commands examples:
+
+- Run fw recover flow manually::
+
+ $ devlink health recover pci/0000:82:00.0 reporter fw_fatal
+
+- Read FW CR-space dump if already strored or trigger new one::
+
+ $ devlink health dump show pci/0000:82:00.1 reporter fw_fatal
+
+NOTE: This command can run only on PF.
diff --git a/Documentation/networking/dsa/b53.rst b/Documentation/networking/dsa/b53.rst
new file mode 100644
index 000000000000..b41637cdb82b
--- /dev/null
+++ b/Documentation/networking/dsa/b53.rst
@@ -0,0 +1,183 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================================
+Broadcom RoboSwitch Ethernet switch driver
+==========================================
+
+The Broadcom RoboSwitch Ethernet switch family is used in quite a range of
+xDSL router, cable modems and other multimedia devices.
+
+The actual implementation supports the devices BCM5325E, BCM5365, BCM539x,
+BCM53115 and BCM53125 as well as BCM63XX.
+
+Implementation details
+======================
+
+The driver is located in ``drivers/net/dsa/b53/`` and is implemented as a
+DSA driver; see ``Documentation/networking/dsa/dsa.rst`` for details on the
+subsystem and what it provides.
+
+The switch is, if possible, configured to enable a Broadcom specific 4-bytes
+switch tag which gets inserted by the switch for every packet forwarded to the
+CPU interface, conversely, the CPU network interface should insert a similar
+tag for packets entering the CPU port. The tag format is described in
+``net/dsa/tag_brcm.c``.
+
+The configuration of the device depends on whether or not tagging is
+supported.
+
+The interface names and example network configuration are used according the
+configuration described in the :ref:`dsa-config-showcases`.
+
+Configuration with tagging support
+----------------------------------
+
+The tagging based configuration is desired. It is not specific to the b53
+DSA driver and will work like all DSA drivers which supports tagging.
+
+See :ref:`dsa-tagged-configuration`.
+
+Configuration without tagging support
+-------------------------------------
+
+Older models (5325, 5365) support a different tag format that is not supported
+yet. 539x and 531x5 require managed mode and some special handling, which is
+also not yet supported. The tagging support is disabled in these cases and the
+switch need a different configuration.
+
+The configuration slightly differ from the :ref:`dsa-vlan-configuration`.
+
+The b53 tags the CPU port in all VLANs, since otherwise any PVID untagged
+VLAN programming would basically change the CPU port's default PVID and make
+it untagged, undesirable.
+
+In difference to the configuration described in :ref:`dsa-vlan-configuration`
+the default VLAN 1 has to be removed from the slave interface configuration in
+single port and gateway configuration, while there is no need to add an extra
+VLAN configuration in the bridge showcase.
+
+single port
+~~~~~~~~~~~
+The configuration can only be set up via VLAN tagging and bridge setup.
+By default packages are tagged with vid 1:
+
+.. code-block:: sh
+
+ # tag traffic on CPU port
+ ip link add link eth0 name eth0.1 type vlan id 1
+ ip link add link eth0 name eth0.2 type vlan id 2
+ ip link add link eth0 name eth0.3 type vlan id 3
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+ ip link set eth0.1 up
+ ip link set eth0.2 up
+ ip link set eth0.3 up
+
+ # bring up the slave interfaces
+ ip link set wan up
+ ip link set lan1 up
+ ip link set lan2 up
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # activate VLAN filtering
+ ip link set dev br0 type bridge vlan_filtering 1
+
+ # add ports to bridges
+ ip link set dev wan master br0
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+
+ # tag traffic on ports
+ bridge vlan add dev lan1 vid 2 pvid untagged
+ bridge vlan del dev lan1 vid 1
+ bridge vlan add dev lan2 vid 3 pvid untagged
+ bridge vlan del dev lan2 vid 1
+
+ # configure the VLANs
+ ip addr add 192.0.2.1/30 dev eth0.1
+ ip addr add 192.0.2.5/30 dev eth0.2
+ ip addr add 192.0.2.9/30 dev eth0.3
+
+ # bring up the bridge devices
+ ip link set br0 up
+
+
+bridge
+~~~~~~
+
+.. code-block:: sh
+
+ # tag traffic on CPU port
+ ip link add link eth0 name eth0.1 type vlan id 1
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+ ip link set eth0.1 up
+
+ # bring up the slave interfaces
+ ip link set wan up
+ ip link set lan1 up
+ ip link set lan2 up
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # activate VLAN filtering
+ ip link set dev br0 type bridge vlan_filtering 1
+
+ # add ports to bridge
+ ip link set dev wan master br0
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+ ip link set eth0.1 master br0
+
+ # configure the bridge
+ ip addr add 192.0.2.129/25 dev br0
+
+ # bring up the bridge
+ ip link set dev br0 up
+
+gateway
+~~~~~~~
+
+.. code-block:: sh
+
+ # tag traffic on CPU port
+ ip link add link eth0 name eth0.1 type vlan id 1
+ ip link add link eth0 name eth0.2 type vlan id 2
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+ ip link set eth0.1 up
+ ip link set eth0.2 up
+
+ # bring up the slave interfaces
+ ip link set wan up
+ ip link set lan1 up
+ ip link set lan2 up
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # activate VLAN filtering
+ ip link set dev br0 type bridge vlan_filtering 1
+
+ # add ports to bridges
+ ip link set dev wan master br0
+ ip link set eth0.1 master br0
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+
+ # tag traffic on ports
+ bridge vlan add dev wan vid 2 pvid untagged
+ bridge vlan del dev wan vid 1
+
+ # configure the VLANs
+ ip addr add 192.0.2.1/30 dev eth0.2
+ ip addr add 192.0.2.129/25 dev br0
+
+ # bring up the bridge devices
+ ip link set br0 up
diff --git a/Documentation/networking/dsa/configuration.rst b/Documentation/networking/dsa/configuration.rst
new file mode 100644
index 000000000000..af029b3ca2ab
--- /dev/null
+++ b/Documentation/networking/dsa/configuration.rst
@@ -0,0 +1,292 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=======================================
+DSA switch configuration from userspace
+=======================================
+
+The DSA switch configuration is not integrated into the main userspace
+network configuration suites by now and has to be performed manualy.
+
+.. _dsa-config-showcases:
+
+Configuration showcases
+-----------------------
+
+To configure a DSA switch a couple of commands need to be executed. In this
+documentation some common configuration scenarios are handled as showcases:
+
+*single port*
+ Every switch port acts as a different configurable Ethernet port
+
+*bridge*
+ Every switch port is part of one configurable Ethernet bridge
+
+*gateway*
+ Every switch port except one upstream port is part of a configurable
+ Ethernet bridge.
+ The upstream port acts as different configurable Ethernet port.
+
+All configurations are performed with tools from iproute2, which is available
+at https://www.kernel.org/pub/linux/utils/net/iproute2/
+
+Through DSA every port of a switch is handled like a normal linux Ethernet
+interface. The CPU port is the switch port connected to an Ethernet MAC chip.
+The corresponding linux Ethernet interface is called the master interface.
+All other corresponding linux interfaces are called slave interfaces.
+
+The slave interfaces depend on the master interface. They can only brought up,
+when the master interface is up.
+
+In this documentation the following Ethernet interfaces are used:
+
+*eth0*
+ the master interface
+
+*lan1*
+ a slave interface
+
+*lan2*
+ another slave interface
+
+*lan3*
+ a third slave interface
+
+*wan*
+ A slave interface dedicated for upstream traffic
+
+Further Ethernet interfaces can be configured similar.
+The configured IPs and networks are:
+
+*single port*
+ * lan1: 192.0.2.1/30 (192.0.2.0 - 192.0.2.3)
+ * lan2: 192.0.2.5/30 (192.0.2.4 - 192.0.2.7)
+ * lan3: 192.0.2.9/30 (192.0.2.8 - 192.0.2.11)
+
+*bridge*
+ * br0: 192.0.2.129/25 (192.0.2.128 - 192.0.2.255)
+
+*gateway*
+ * br0: 192.0.2.129/25 (192.0.2.128 - 192.0.2.255)
+ * wan: 192.0.2.1/30 (192.0.2.0 - 192.0.2.3)
+
+.. _dsa-tagged-configuration:
+
+Configuration with tagging support
+----------------------------------
+
+The tagging based configuration is desired and supported by the majority of
+DSA switches. These switches are capable to tag incoming and outgoing traffic
+without using a VLAN based configuration.
+
+single port
+~~~~~~~~~~~
+
+.. code-block:: sh
+
+ # configure each interface
+ ip addr add 192.0.2.1/30 dev lan1
+ ip addr add 192.0.2.5/30 dev lan2
+ ip addr add 192.0.2.9/30 dev lan3
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+
+ # bring up the slave interfaces
+ ip link set lan1 up
+ ip link set lan2 up
+ ip link set lan3 up
+
+bridge
+~~~~~~
+
+.. code-block:: sh
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+
+ # bring up the slave interfaces
+ ip link set lan1 up
+ ip link set lan2 up
+ ip link set lan3 up
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # add ports to bridge
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+ ip link set dev lan3 master br0
+
+ # configure the bridge
+ ip addr add 192.0.2.129/25 dev br0
+
+ # bring up the bridge
+ ip link set dev br0 up
+
+gateway
+~~~~~~~
+
+.. code-block:: sh
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+
+ # bring up the slave interfaces
+ ip link set wan up
+ ip link set lan1 up
+ ip link set lan2 up
+
+ # configure the upstream port
+ ip addr add 192.0.2.1/30 dev wan
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # add ports to bridge
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+
+ # configure the bridge
+ ip addr add 192.0.2.129/25 dev br0
+
+ # bring up the bridge
+ ip link set dev br0 up
+
+.. _dsa-vlan-configuration:
+
+Configuration without tagging support
+-------------------------------------
+
+A minority of switches are not capable to use a taging protocol
+(DSA_TAG_PROTO_NONE). These switches can be configured by a VLAN based
+configuration.
+
+single port
+~~~~~~~~~~~
+The configuration can only be set up via VLAN tagging and bridge setup.
+
+.. code-block:: sh
+
+ # tag traffic on CPU port
+ ip link add link eth0 name eth0.1 type vlan id 1
+ ip link add link eth0 name eth0.2 type vlan id 2
+ ip link add link eth0 name eth0.3 type vlan id 3
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+ ip link set eth0.1 up
+ ip link set eth0.2 up
+ ip link set eth0.3 up
+
+ # bring up the slave interfaces
+ ip link set lan1 up
+ ip link set lan1 up
+ ip link set lan3 up
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # activate VLAN filtering
+ ip link set dev br0 type bridge vlan_filtering 1
+
+ # add ports to bridges
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+ ip link set dev lan3 master br0
+
+ # tag traffic on ports
+ bridge vlan add dev lan1 vid 1 pvid untagged
+ bridge vlan add dev lan2 vid 2 pvid untagged
+ bridge vlan add dev lan3 vid 3 pvid untagged
+
+ # configure the VLANs
+ ip addr add 192.0.2.1/30 dev eth0.1
+ ip addr add 192.0.2.5/30 dev eth0.2
+ ip addr add 192.0.2.9/30 dev eth0.3
+
+ # bring up the bridge devices
+ ip link set br0 up
+
+
+bridge
+~~~~~~
+
+.. code-block:: sh
+
+ # tag traffic on CPU port
+ ip link add link eth0 name eth0.1 type vlan id 1
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+ ip link set eth0.1 up
+
+ # bring up the slave interfaces
+ ip link set lan1 up
+ ip link set lan2 up
+ ip link set lan3 up
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # activate VLAN filtering
+ ip link set dev br0 type bridge vlan_filtering 1
+
+ # add ports to bridge
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+ ip link set dev lan3 master br0
+ ip link set eth0.1 master br0
+
+ # tag traffic on ports
+ bridge vlan add dev lan1 vid 1 pvid untagged
+ bridge vlan add dev lan2 vid 1 pvid untagged
+ bridge vlan add dev lan3 vid 1 pvid untagged
+
+ # configure the bridge
+ ip addr add 192.0.2.129/25 dev br0
+
+ # bring up the bridge
+ ip link set dev br0 up
+
+gateway
+~~~~~~~
+
+.. code-block:: sh
+
+ # tag traffic on CPU port
+ ip link add link eth0 name eth0.1 type vlan id 1
+ ip link add link eth0 name eth0.2 type vlan id 2
+
+ # The master interface needs to be brought up before the slave ports.
+ ip link set eth0 up
+ ip link set eth0.1 up
+ ip link set eth0.2 up
+
+ # bring up the slave interfaces
+ ip link set wan up
+ ip link set lan1 up
+ ip link set lan2 up
+
+ # create bridge
+ ip link add name br0 type bridge
+
+ # activate VLAN filtering
+ ip link set dev br0 type bridge vlan_filtering 1
+
+ # add ports to bridges
+ ip link set dev wan master br0
+ ip link set eth0.1 master br0
+ ip link set dev lan1 master br0
+ ip link set dev lan2 master br0
+
+ # tag traffic on ports
+ bridge vlan add dev lan1 vid 1 pvid untagged
+ bridge vlan add dev lan2 vid 1 pvid untagged
+ bridge vlan add dev wan vid 2 pvid untagged
+
+ # configure the VLANs
+ ip addr add 192.0.2.1/30 dev eth0.2
+ ip addr add 192.0.2.129/25 dev br0
+
+ # bring up the bridge devices
+ ip link set br0 up
diff --git a/Documentation/networking/dsa/index.rst b/Documentation/networking/dsa/index.rst
index 0e5b7a9be406..ee631e2d646f 100644
--- a/Documentation/networking/dsa/index.rst
+++ b/Documentation/networking/dsa/index.rst
@@ -6,6 +6,8 @@ Distributed Switch Architecture
:maxdepth: 1
dsa
+ b53
bcm_sf2
lan9303
sja1105
+ configuration
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 14fe93049d28..df33674799b5 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -80,6 +80,7 @@ fib_multipath_hash_policy - INTEGER
Possible values:
0 - Layer 3
1 - Layer 4
+ 2 - Layer 3 or inner Layer 3 if present
fib_sync_mem - UNSIGNED INTEGER
Amount of dirty memory from fib entries that can be backlogged before
@@ -255,6 +256,14 @@ tcp_base_mss - INTEGER
Path MTU discovery (MTU probing). If MTU probing is enabled,
this is the initial MSS used by the connection.
+tcp_min_snd_mss - INTEGER
+ TCP SYN and SYNACK messages usually advertise an ADVMSS option,
+ as described in RFC 1122 and RFC 6691.
+ If this ADVMSS option is smaller than tcp_min_snd_mss,
+ it is silently capped to tcp_min_snd_mss.
+
+ Default : 48 (at least 8 bytes of payload per segment)
+
tcp_congestion_control - STRING
Set the congestion control algorithm to be used for new
connections. The algorithm "reno" is always available, but
@@ -648,6 +657,26 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER
0 to disable the blackhole detection.
By default, it is set to 1hr.
+tcp_fastopen_key - list of comma separated 32-digit hexadecimal INTEGERs
+ The list consists of a primary key and an optional backup key. The
+ primary key is used for both creating and validating cookies, while the
+ optional backup key is only used for validating cookies. The purpose of
+ the backup key is to maximize TFO validation when keys are rotated.
+
+ A randomly chosen primary key may be configured by the kernel if
+ the tcp_fastopen sysctl is set to 0x400 (see above), or if the
+ TCP_FASTOPEN setsockopt() optname is set and a key has not been
+ previously configured via sysctl. If keys are configured via
+ setsockopt() by using the TCP_FASTOPEN_KEY optname, then those
+ per-socket keys will be used instead of any keys that are specified via
+ sysctl.
+
+ A key is specified as 4 8-digit hexadecimal integers which are separated
+ by a '-' as: xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxxxx. Leading zeros may be
+ omitted. A primary and a backup key may be specified by separating them
+ by a comma. If only one key is specified, it becomes the primary key and
+ any previously configured backup keys are removed.
+
tcp_syn_retries - INTEGER
Number of times initial SYNs for an active TCP connection attempt
will be retransmitted. Should not be higher than 127. Default value
@@ -772,6 +801,14 @@ tcp_challenge_ack_limit - INTEGER
in RFC 5961 (Improving TCP's Robustness to Blind In-Window Attacks)
Default: 100
+tcp_rx_skb_cache - BOOLEAN
+ Controls a per TCP socket cache of one skb, that might help
+ performance of some workloads. This might be dangerous
+ on systems with a lot of TCP sockets, since it increases
+ memory usage.
+
+ Default: 0 (disabled)
+
UDP variables:
udp_l3mdev_accept - BOOLEAN
@@ -1409,14 +1446,26 @@ flowlabel_state_ranges - BOOLEAN
FALSE: disabled
Default: true
-flowlabel_reflect - BOOLEAN
- Automatically reflect the flow label. Needed for Path MTU
+flowlabel_reflect - INTEGER
+ Control flow label reflection. Needed for Path MTU
Discovery to work with Equal Cost Multipath Routing in anycast
environments. See RFC 7690 and:
https://tools.ietf.org/html/draft-wang-6man-flow-label-reflection-01
- TRUE: enabled
- FALSE: disabled
- Default: FALSE
+
+ This is a bitmask.
+ 1: enabled for established flows
+
+ Note that this prevents automatic flowlabel changes, as done
+ in "tcp: change IPv6 flow-label upon receiving spurious retransmission"
+ and "tcp: Change txhash on every SYN and RTO retransmit"
+
+ 2: enabled for TCP RESET packets (no active listener)
+ If set, a RST packet sent in response to a SYN packet on a closed
+ port will reflect the incoming flow label.
+
+ 4: enabled for ICMPv6 echo reply messages.
+
+ Default: 0
fib_multipath_hash_policy - INTEGER
Controls which hash policy to use for multipath routes.
@@ -1424,6 +1473,7 @@ fib_multipath_hash_policy - INTEGER
Possible values:
0 - Layer 3 (source and destination addresses plus flow label)
1 - Layer 4 (standard 5-tuple)
+ 2 - Layer 3 or inner Layer 3 if present
anycast_src_echo_reply - BOOLEAN
Controls the use of anycast addresses as source addresses for ICMPv6
@@ -2237,7 +2287,7 @@ addr_scope_policy - INTEGER
/proc/sys/net/core/*
- Please see: Documentation/sysctl/net.txt for descriptions of these entries.
+ Please see: Documentation/admin-guide/sysctl/net.rst for descriptions of these entries.
/proc/sys/net/unix/*
diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt
index 2f24a1912a48..025cc9b96992 100644
--- a/Documentation/networking/mpls-sysctl.txt
+++ b/Documentation/networking/mpls-sysctl.txt
@@ -30,7 +30,7 @@ ip_ttl_propagate - BOOL
0 - disabled / RFC 3443 [Short] Pipe Model
1 - enabled / RFC 3443 Uniform Model (default)
-default_ttl - BOOL
+default_ttl - INTEGER
Default TTL value to use for MPLS packets where it cannot be
propagated from an IP header, either because one isn't present
or ip_ttl_propagate has been disabled.
diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst
index 0dd90d7df5ec..a689966bc4be 100644
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -202,7 +202,8 @@ the PHY/controller, of which the PHY needs to be aware.
*interface* is a u32 which specifies the connection type used
between the controller and the PHY. Examples are GMII, MII,
-RGMII, and SGMII. For a full list, see include/linux/phy.h
+RGMII, and SGMII. See "PHY interface mode" below. For a full
+list, see include/linux/phy.h
Now just make sure that phydev->supported and phydev->advertising have any
values pruned from them which don't make sense for your controller (a 10/100
@@ -225,6 +226,48 @@ When you want to disconnect from the network (even if just briefly), you call
phy_stop(phydev). This function also stops the phylib state machine and
disables PHY interrupts.
+PHY interface modes
+===================
+
+The PHY interface mode supplied in the phy_connect() family of functions
+defines the initial operating mode of the PHY interface. This is not
+guaranteed to remain constant; there are PHYs which dynamically change
+their interface mode without software interaction depending on the
+negotiation results.
+
+Some of the interface modes are described below:
+
+``PHY_INTERFACE_MODE_1000BASEX``
+ This defines the 1000BASE-X single-lane serdes link as defined by the
+ 802.3 standard section 36. The link operates at a fixed bit rate of
+ 1.25Gbaud using a 10B/8B encoding scheme, resulting in an underlying
+ data rate of 1Gbps. Embedded in the data stream is a 16-bit control
+ word which is used to negotiate the duplex and pause modes with the
+ remote end. This does not include "up-clocked" variants such as 2.5Gbps
+ speeds (see below.)
+
+``PHY_INTERFACE_MODE_2500BASEX``
+ This defines a variant of 1000BASE-X which is clocked 2.5 times faster,
+ than the 802.3 standard giving a fixed bit rate of 3.125Gbaud.
+
+``PHY_INTERFACE_MODE_SGMII``
+ This is used for Cisco SGMII, which is a modification of 1000BASE-X
+ as defined by the 802.3 standard. The SGMII link consists of a single
+ serdes lane running at a fixed bit rate of 1.25Gbaud with 10B/8B
+ encoding. The underlying data rate is 1Gbps, with the slower speeds of
+ 100Mbps and 10Mbps being achieved through replication of each data symbol.
+ The 802.3 control word is re-purposed to send the negotiated speed and
+ duplex information from to the MAC, and for the MAC to acknowledge
+ receipt. This does not include "up-clocked" variants such as 2.5Gbps
+ speeds.
+
+ Note: mismatched SGMII vs 1000BASE-X configuration on a link can
+ successfully pass data in some circumstances, but the 16-bit control
+ word will not be correctly interpreted, which may cause mismatches in
+ duplex, pause or other settings. This is dependent on the MAC and/or
+ PHY behaviour.
+
+
Pause frames / flow control
===========================
diff --git a/Documentation/networking/rds.txt b/Documentation/networking/rds.txt
index 0235ae69af2a..f2a0147c933d 100644
--- a/Documentation/networking/rds.txt
+++ b/Documentation/networking/rds.txt
@@ -389,7 +389,7 @@ Multipath RDS (mprds)
a common (to all paths) part, and a per-path struct rds_conn_path. All
I/O workqs and reconnect threads are driven from the rds_conn_path.
Transports such as TCP that are multipath capable may then set up a
- TPC socket per rds_conn_path, and this is managed by the transport via
+ TCP socket per rds_conn_path, and this is managed by the transport via
the transport privatee cp_transport_data pointer.
Transports announce themselves as multipath capable by setting the
diff --git a/Documentation/networking/sfp-phylink.rst b/Documentation/networking/sfp-phylink.rst
index 5bd26cb07244..91446b431b70 100644
--- a/Documentation/networking/sfp-phylink.rst
+++ b/Documentation/networking/sfp-phylink.rst
@@ -98,6 +98,7 @@ this documentation.
4. Add::
struct phylink *phylink;
+ struct phylink_config phylink_config;
to the driver's private data structure. We shall refer to the
driver's private data pointer as ``priv`` below, and the driver's
@@ -223,8 +224,10 @@ this documentation.
.. code-block:: c
struct phylink *phylink;
+ priv->phylink_config.dev = &dev.dev;
+ priv->phylink_config.type = PHYLINK_NETDEV;
- phylink = phylink_create(dev, node, phy_mode, &phylink_ops);
+ phylink = phylink_create(&priv->phylink_config, node, phy_mode, &phylink_ops);
if (IS_ERR(phylink)) {
err = PTR_ERR(phylink);
fail probe;
diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst
index cb85af559dff..048e5ca44824 100644
--- a/Documentation/networking/tls-offload.rst
+++ b/Documentation/networking/tls-offload.rst
@@ -206,7 +206,11 @@ TX
Segments transmitted from an offloaded socket can get out of sync
in similar ways to the receive side-retransmissions - local drops
-are possible, though network reorders are not.
+are possible, though network reorders are not. There are currently
+two mechanisms for dealing with out of order segments.
+
+Crypto state rebuilding
+~~~~~~~~~~~~~~~~~~~~~~~
Whenever an out of order segment is transmitted the driver provides
the device with enough information to perform cryptographic operations.
@@ -225,6 +229,35 @@ was just a retransmission. The former is simpler, and does not require
retransmission detection therefore it is the recommended method until
such time it is proven inefficient.
+Next record sync
+~~~~~~~~~~~~~~~~
+
+Whenever an out of order segment is detected the driver requests
+that the ``ktls`` software fallback code encrypt it. If the segment's
+sequence number is lower than expected the driver assumes retransmission
+and doesn't change device state. If the segment is in the future, it
+may imply a local drop, the driver asks the stack to sync the device
+to the next record state and falls back to software.
+
+Resync request is indicated with:
+
+.. code-block:: c
+
+ void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq)
+
+Until resync is complete driver should not access its expected TCP
+sequence number (as it will be updated from a different context).
+Following helper should be used to test if resync is complete:
+
+.. code-block:: c
+
+ bool tls_offload_tx_resync_pending(struct sock *sk)
+
+Next time ``ktls`` pushes a record it will first send its TCP sequence number
+and TLS record number to the driver. Stack will also make sure that
+the new record will start on a segment boundary (like it does when
+the connection is initially added).
+
RX
--
@@ -268,6 +301,9 @@ Device can only detect that segment 4 also contains a TLS header
if it knows the length of the previous record from segment 2. In this case
the device will lose synchronization with the stream.
+Stream scan resynchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
When the device gets out of sync and the stream reaches TCP sequence
numbers more than a max size record past the expected TCP sequence number,
the device starts scanning for a known header pattern. For example
@@ -298,6 +334,22 @@ Special care has to be taken if the confirmation request is passed
asynchronously to the packet stream and record may get processed
by the kernel before the confirmation request.
+Stack-driven resynchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The driver may also request the stack to perform resynchronization
+whenever it sees the records are no longer getting decrypted.
+If the connection is configured in this mode the stack automatically
+schedules resynchronization after it has received two completely encrypted
+records.
+
+The stack waits for the socket to drain and informs the device about
+the next expected record number and its TCP sequence number. If the
+records continue to be received fully encrypted stack retries the
+synchronization with an exponential back off (first after 2 encrypted
+records, then after 4 records, after 8, after 16... up until every
+128 records).
+
Error handling
==============
@@ -379,7 +431,6 @@ by the driver:
but did not arrive in the expected order
* ``tx_tls_drop_no_sync_data`` - number of TX packets dropped because
they arrived out of order and associated record could not be found
- (see also :ref:`pre_tls_data`)
Notable corner cases, exceptions and additional requirements
============================================================
@@ -462,21 +513,3 @@ Redirects leak clear text
In the RX direction, if segment has already been decrypted by the device
and it gets redirected or mirrored - clear text will be transmitted out.
-
-.. _pre_tls_data:
-
-Transmission of pre-TLS data
-----------------------------
-
-User can enqueue some already encrypted and framed records before enabling
-``ktls`` on the socket. Those records have to get sent as they are. This is
-perfectly easy to handle in the software case - such data will be waiting
-in the TCP layer, TLS ULP won't see it. In the offloaded case when pre-queued
-segment reaches transmission point it appears to be out of order (before the
-expected TCP sequence number) and the stack does not have a record information
-associated.
-
-All segments without record information cannot, however, be assumed to be
-pre-queued data, because a race condition exists between TCP stack queuing
-a retransmission, the driver seeing the retransmission and TCP ACK arriving
-for the retransmitted data.
diff --git a/Documentation/nfc/nfc-hci.txt b/Documentation/nfc/nfc-hci.txt
deleted file mode 100644
index 0dc078cab972..000000000000
--- a/Documentation/nfc/nfc-hci.txt
+++ /dev/null
@@ -1,290 +0,0 @@
-HCI backend for NFC Core
-
-Author: Eric Lapuyade, Samuel Ortiz
-Contact: eric.lapuyade@intel.com, samuel.ortiz@intel.com
-
-General
--------
-
-The HCI layer implements much of the ETSI TS 102 622 V10.2.0 specification. It
-enables easy writing of HCI-based NFC drivers. The HCI layer runs as an NFC Core
-backend, implementing an abstract nfc device and translating NFC Core API
-to HCI commands and events.
-
-HCI
----
-
-HCI registers as an nfc device with NFC Core. Requests coming from userspace are
-routed through netlink sockets to NFC Core and then to HCI. From this point,
-they are translated in a sequence of HCI commands sent to the HCI layer in the
-host controller (the chip). Commands can be executed synchronously (the sending
-context blocks waiting for response) or asynchronously (the response is returned
-from HCI Rx context).
-HCI events can also be received from the host controller. They will be handled
-and a translation will be forwarded to NFC Core as needed. There are hooks to
-let the HCI driver handle proprietary events or override standard behavior.
-HCI uses 2 execution contexts:
-- one for executing commands : nfc_hci_msg_tx_work(). Only one command
-can be executing at any given moment.
-- one for dispatching received events and commands : nfc_hci_msg_rx_work().
-
-HCI Session initialization:
----------------------------
-
-The Session initialization is an HCI standard which must unfortunately
-support proprietary gates. This is the reason why the driver will pass a list
-of proprietary gates that must be part of the session. HCI will ensure all
-those gates have pipes connected when the hci device is set up.
-In case the chip supports pre-opened gates and pseudo-static pipes, the driver
-can pass that information to HCI core.
-
-HCI Gates and Pipes
--------------------
-
-A gate defines the 'port' where some service can be found. In order to access
-a service, one must create a pipe to that gate and open it. In this
-implementation, pipes are totally hidden. The public API only knows gates.
-This is consistent with the driver need to send commands to proprietary gates
-without knowing the pipe connected to it.
-
-Driver interface
-----------------
-
-A driver is generally written in two parts : the physical link management and
-the HCI management. This makes it easier to maintain a driver for a chip that
-can be connected using various phy (i2c, spi, ...)
-
-HCI Management
---------------
-
-A driver would normally register itself with HCI and provide the following
-entry points:
-
-struct nfc_hci_ops {
- int (*open)(struct nfc_hci_dev *hdev);
- void (*close)(struct nfc_hci_dev *hdev);
- int (*hci_ready) (struct nfc_hci_dev *hdev);
- int (*xmit) (struct nfc_hci_dev *hdev, struct sk_buff *skb);
- int (*start_poll) (struct nfc_hci_dev *hdev,
- u32 im_protocols, u32 tm_protocols);
- int (*dep_link_up)(struct nfc_hci_dev *hdev, struct nfc_target *target,
- u8 comm_mode, u8 *gb, size_t gb_len);
- int (*dep_link_down)(struct nfc_hci_dev *hdev);
- int (*target_from_gate) (struct nfc_hci_dev *hdev, u8 gate,
- struct nfc_target *target);
- int (*complete_target_discovered) (struct nfc_hci_dev *hdev, u8 gate,
- struct nfc_target *target);
- int (*im_transceive) (struct nfc_hci_dev *hdev,
- struct nfc_target *target, struct sk_buff *skb,
- data_exchange_cb_t cb, void *cb_context);
- int (*tm_send)(struct nfc_hci_dev *hdev, struct sk_buff *skb);
- int (*check_presence)(struct nfc_hci_dev *hdev,
- struct nfc_target *target);
- int (*event_received)(struct nfc_hci_dev *hdev, u8 gate, u8 event,
- struct sk_buff *skb);
-};
-
-- open() and close() shall turn the hardware on and off.
-- hci_ready() is an optional entry point that is called right after the hci
-session has been set up. The driver can use it to do additional initialization
-that must be performed using HCI commands.
-- xmit() shall simply write a frame to the physical link.
-- start_poll() is an optional entrypoint that shall set the hardware in polling
-mode. This must be implemented only if the hardware uses proprietary gates or a
-mechanism slightly different from the HCI standard.
-- dep_link_up() is called after a p2p target has been detected, to finish
-the p2p connection setup with hardware parameters that need to be passed back
-to nfc core.
-- dep_link_down() is called to bring the p2p link down.
-- target_from_gate() is an optional entrypoint to return the nfc protocols
-corresponding to a proprietary gate.
-- complete_target_discovered() is an optional entry point to let the driver
-perform additional proprietary processing necessary to auto activate the
-discovered target.
-- im_transceive() must be implemented by the driver if proprietary HCI commands
-are required to send data to the tag. Some tag types will require custom
-commands, others can be written to using the standard HCI commands. The driver
-can check the tag type and either do proprietary processing, or return 1 to ask
-for standard processing. The data exchange command itself must be sent
-asynchronously.
-- tm_send() is called to send data in the case of a p2p connection
-- check_presence() is an optional entry point that will be called regularly
-by the core to check that an activated tag is still in the field. If this is
-not implemented, the core will not be able to push tag_lost events to the user
-space
-- event_received() is called to handle an event coming from the chip. Driver
-can handle the event or return 1 to let HCI attempt standard processing.
-
-On the rx path, the driver is responsible to push incoming HCP frames to HCI
-using nfc_hci_recv_frame(). HCI will take care of re-aggregation and handling
-This must be done from a context that can sleep.
-
-PHY Management
---------------
-
-The physical link (i2c, ...) management is defined by the following structure:
-
-struct nfc_phy_ops {
- int (*write)(void *dev_id, struct sk_buff *skb);
- int (*enable)(void *dev_id);
- void (*disable)(void *dev_id);
-};
-
-enable(): turn the phy on (power on), make it ready to transfer data
-disable(): turn the phy off
-write(): Send a data frame to the chip. Note that to enable higher
-layers such as an llc to store the frame for re-emission, this function must
-not alter the skb. It must also not return a positive result (return 0 for
-success, negative for failure).
-
-Data coming from the chip shall be sent directly to nfc_hci_recv_frame().
-
-LLC
----
-
-Communication between the CPU and the chip often requires some link layer
-protocol. Those are isolated as modules managed by the HCI layer. There are
-currently two modules : nop (raw transfert) and shdlc.
-A new llc must implement the following functions:
-
-struct nfc_llc_ops {
- void *(*init) (struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv,
- rcv_to_hci_t rcv_to_hci, int tx_headroom,
- int tx_tailroom, int *rx_headroom, int *rx_tailroom,
- llc_failure_t llc_failure);
- void (*deinit) (struct nfc_llc *llc);
- int (*start) (struct nfc_llc *llc);
- int (*stop) (struct nfc_llc *llc);
- void (*rcv_from_drv) (struct nfc_llc *llc, struct sk_buff *skb);
- int (*xmit_from_hci) (struct nfc_llc *llc, struct sk_buff *skb);
-};
-
-- init() : allocate and init your private storage
-- deinit() : cleanup
-- start() : establish the logical connection
-- stop () : terminate the logical connection
-- rcv_from_drv() : handle data coming from the chip, going to HCI
-- xmit_from_hci() : handle data sent by HCI, going to the chip
-
-The llc must be registered with nfc before it can be used. Do that by
-calling nfc_llc_register(const char *name, struct nfc_llc_ops *ops);
-
-Again, note that the llc does not handle the physical link. It is thus very
-easy to mix any physical link with any llc for a given chip driver.
-
-Included Drivers
-----------------
-
-An HCI based driver for an NXP PN544, connected through I2C bus, and using
-shdlc is included.
-
-Execution Contexts
-------------------
-
-The execution contexts are the following:
-- IRQ handler (IRQH):
-fast, cannot sleep. sends incoming frames to HCI where they are passed to
-the current llc. In case of shdlc, the frame is queued in shdlc rx queue.
-
-- SHDLC State Machine worker (SMW)
-Only when llc_shdlc is used: handles shdlc rx & tx queues.
-Dispatches HCI cmd responses.
-
-- HCI Tx Cmd worker (MSGTXWQ)
-Serializes execution of HCI commands. Completes execution in case of response
-timeout.
-
-- HCI Rx worker (MSGRXWQ)
-Dispatches incoming HCI commands or events.
-
-- Syscall context from a userspace call (SYSCALL)
-Any entrypoint in HCI called from NFC Core
-
-Workflow executing an HCI command (using shdlc)
------------------------------------------------
-
-Executing an HCI command can easily be performed synchronously using the
-following API:
-
-int nfc_hci_send_cmd (struct nfc_hci_dev *hdev, u8 gate, u8 cmd,
- const u8 *param, size_t param_len, struct sk_buff **skb)
-
-The API must be invoked from a context that can sleep. Most of the time, this
-will be the syscall context. skb will return the result that was received in
-the response.
-
-Internally, execution is asynchronous. So all this API does is to enqueue the
-HCI command, setup a local wait queue on stack, and wait_event() for completion.
-The wait is not interruptible because it is guaranteed that the command will
-complete after some short timeout anyway.
-
-MSGTXWQ context will then be scheduled and invoke nfc_hci_msg_tx_work().
-This function will dequeue the next pending command and send its HCP fragments
-to the lower layer which happens to be shdlc. It will then start a timer to be
-able to complete the command with a timeout error if no response arrive.
-
-SMW context gets scheduled and invokes nfc_shdlc_sm_work(). This function
-handles shdlc framing in and out. It uses the driver xmit to send frames and
-receives incoming frames in an skb queue filled from the driver IRQ handler.
-SHDLC I(nformation) frames payload are HCP fragments. They are aggregated to
-form complete HCI frames, which can be a response, command, or event.
-
-HCI Responses are dispatched immediately from this context to unblock
-waiting command execution. Response processing involves invoking the completion
-callback that was provided by nfc_hci_msg_tx_work() when it sent the command.
-The completion callback will then wake the syscall context.
-
-It is also possible to execute the command asynchronously using this API:
-
-static int nfc_hci_execute_cmd_async(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
- const u8 *param, size_t param_len,
- data_exchange_cb_t cb, void *cb_context)
-
-The workflow is the same, except that the API call returns immediately, and
-the callback will be called with the result from the SMW context.
-
-Workflow receiving an HCI event or command
-------------------------------------------
-
-HCI commands or events are not dispatched from SMW context. Instead, they are
-queued to HCI rx_queue and will be dispatched from HCI rx worker
-context (MSGRXWQ). This is done this way to allow a cmd or event handler
-to also execute other commands (for example, handling the
-NFC_HCI_EVT_TARGET_DISCOVERED event from PN544 requires to issue an
-ANY_GET_PARAMETER to the reader A gate to get information on the target
-that was discovered).
-
-Typically, such an event will be propagated to NFC Core from MSGRXWQ context.
-
-Error management
-----------------
-
-Errors that occur synchronously with the execution of an NFC Core request are
-simply returned as the execution result of the request. These are easy.
-
-Errors that occur asynchronously (e.g. in a background protocol handling thread)
-must be reported such that upper layers don't stay ignorant that something
-went wrong below and know that expected events will probably never happen.
-Handling of these errors is done as follows:
-
-- driver (pn544) fails to deliver an incoming frame: it stores the error such
-that any subsequent call to the driver will result in this error. Then it calls
-the standard nfc_shdlc_recv_frame() with a NULL argument to report the problem
-above. shdlc stores a EREMOTEIO sticky status, which will trigger SMW to
-report above in turn.
-
-- SMW is basically a background thread to handle incoming and outgoing shdlc
-frames. This thread will also check the shdlc sticky status and report to HCI
-when it discovers it is not able to run anymore because of an unrecoverable
-error that happened within shdlc or below. If the problem occurs during shdlc
-connection, the error is reported through the connect completion.
-
-- HCI: if an internal HCI error happens (frame is lost), or HCI is reported an
-error from a lower layer, HCI will either complete the currently executing
-command with that error, or notify NFC Core directly if no command is executing.
-
-- NFC Core: when NFC Core is notified of an error from below and polling is
-active, it will send a tag discovered event with an empty tag list to the user
-space to let it know that the poll operation will never be able to detect a tag.
-If polling is not active and the error was sticky, lower levels will return it
-at next invocation.
diff --git a/Documentation/nfc/nfc-pn544.txt b/Documentation/nfc/nfc-pn544.txt
deleted file mode 100644
index b36ca14ca2d6..000000000000
--- a/Documentation/nfc/nfc-pn544.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Kernel driver for the NXP Semiconductors PN544 Near Field
-Communication chip
-
-General
--------
-
-The PN544 is an integrated transmission module for contactless
-communication. The driver goes under drives/nfc/ and is compiled as a
-module named "pn544".
-
-Host Interfaces: I2C, SPI and HSU, this driver supports currently only I2C.
-
-Protocols
----------
-
-In the normal (HCI) mode and in the firmware update mode read and
-write functions behave a bit differently because the message formats
-or the protocols are different.
-
-In the normal (HCI) mode the protocol used is derived from the ETSI
-HCI specification. The firmware is updated using a specific protocol,
-which is different from HCI.
-
-HCI messages consist of an eight bit header and the message body. The
-header contains the message length. Maximum size for an HCI message is
-33. In HCI mode sent messages are tested for a correct
-checksum. Firmware update messages have the length in the second (MSB)
-and third (LSB) bytes of the message. The maximum FW message length is
-1024 bytes.
-
-For the ETSI HCI specification see
-http://www.etsi.org/WebSite/Technologies/ProtocolSpecification.aspx
diff --git a/Documentation/ntb.txt b/Documentation/ntb.txt
deleted file mode 100644
index 074a423c853c..000000000000
--- a/Documentation/ntb.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-===========
-NTB Drivers
-===========
-
-NTB (Non-Transparent Bridge) is a type of PCI-Express bridge chip that connects
-the separate memory systems of two or more computers to the same PCI-Express
-fabric. Existing NTB hardware supports a common feature set: doorbell
-registers and memory translation windows, as well as non common features like
-scratchpad and message registers. Scratchpad registers are read-and-writable
-registers that are accessible from either side of the device, so that peers can
-exchange a small amount of information at a fixed address. Message registers can
-be utilized for the same purpose. Additionally they are provided with with
-special status bits to make sure the information isn't rewritten by another
-peer. Doorbell registers provide a way for peers to send interrupt events.
-Memory windows allow translated read and write access to the peer memory.
-
-NTB Core Driver (ntb)
-=====================
-
-The NTB core driver defines an api wrapping the common feature set, and allows
-clients interested in NTB features to discover NTB the devices supported by
-hardware drivers. The term "client" is used here to mean an upper layer
-component making use of the NTB api. The term "driver," or "hardware driver,"
-is used here to mean a driver for a specific vendor and model of NTB hardware.
-
-NTB Client Drivers
-==================
-
-NTB client drivers should register with the NTB core driver. After
-registering, the client probe and remove functions will be called appropriately
-as ntb hardware, or hardware drivers, are inserted and removed. The
-registration uses the Linux Device framework, so it should feel familiar to
-anyone who has written a pci driver.
-
-NTB Typical client driver implementation
-----------------------------------------
-
-Primary purpose of NTB is to share some peace of memory between at least two
-systems. So the NTB device features like Scratchpad/Message registers are
-mainly used to perform the proper memory window initialization. Typically
-there are two types of memory window interfaces supported by the NTB API:
-inbound translation configured on the local ntb port and outbound translation
-configured by the peer, on the peer ntb port. The first type is
-depicted on the next figure::
-
- Inbound translation:
-
- Memory: Local NTB Port: Peer NTB Port: Peer MMIO:
- ____________
- | dma-mapped |-ntb_mw_set_trans(addr) |
- | memory | _v____________ | ______________
- | (addr) |<======| MW xlat addr |<====| MW base addr |<== memory-mapped IO
- |------------| |--------------| | |--------------|
-
-So typical scenario of the first type memory window initialization looks:
-1) allocate a memory region, 2) put translated address to NTB config,
-3) somehow notify a peer device of performed initialization, 4) peer device
-maps corresponding outbound memory window so to have access to the shared
-memory region.
-
-The second type of interface, that implies the shared windows being
-initialized by a peer device, is depicted on the figure::
-
- Outbound translation:
-
- Memory: Local NTB Port: Peer NTB Port: Peer MMIO:
- ____________ ______________
- | dma-mapped | | | MW base addr |<== memory-mapped IO
- | memory | | |--------------|
- | (addr) |<===================| MW xlat addr |<-ntb_peer_mw_set_trans(addr)
- |------------| | |--------------|
-
-Typical scenario of the second type interface initialization would be:
-1) allocate a memory region, 2) somehow deliver a translated address to a peer
-device, 3) peer puts the translated address to NTB config, 4) peer device maps
-outbound memory window so to have access to the shared memory region.
-
-As one can see the described scenarios can be combined in one portable
-algorithm.
-
- Local device:
- 1) Allocate memory for a shared window
- 2) Initialize memory window by translated address of the allocated region
- (it may fail if local memory window initialization is unsupported)
- 3) Send the translated address and memory window index to a peer device
-
- Peer device:
- 1) Initialize memory window with retrieved address of the allocated
- by another device memory region (it may fail if peer memory window
- initialization is unsupported)
- 2) Map outbound memory window
-
-In accordance with this scenario, the NTB Memory Window API can be used as
-follows:
-
- Local device:
- 1) ntb_mw_count(pidx) - retrieve number of memory ranges, which can
- be allocated for memory windows between local device and peer device
- of port with specified index.
- 2) ntb_get_align(pidx, midx) - retrieve parameters restricting the
- shared memory region alignment and size. Then memory can be properly
- allocated.
- 3) Allocate physically contiguous memory region in compliance with
- restrictions retrieved in 2).
- 4) ntb_mw_set_trans(pidx, midx) - try to set translation address of
- the memory window with specified index for the defined peer device
- (it may fail if local translated address setting is not supported)
- 5) Send translated base address (usually together with memory window
- number) to the peer device using, for instance, scratchpad or message
- registers.
-
- Peer device:
- 1) ntb_peer_mw_set_trans(pidx, midx) - try to set received from other
- device (related to pidx) translated address for specified memory
- window. It may fail if retrieved address, for instance, exceeds
- maximum possible address or isn't properly aligned.
- 2) ntb_peer_mw_get_addr(widx) - retrieve MMIO address to map the memory
- window so to have an access to the shared memory.
-
-Also it is worth to note, that method ntb_mw_count(pidx) should return the
-same value as ntb_peer_mw_count() on the peer with port index - pidx.
-
-NTB Transport Client (ntb\_transport) and NTB Netdev (ntb\_netdev)
-------------------------------------------------------------------
-
-The primary client for NTB is the Transport client, used in tandem with NTB
-Netdev. These drivers function together to create a logical link to the peer,
-across the ntb, to exchange packets of network data. The Transport client
-establishes a logical link to the peer, and creates queue pairs to exchange
-messages and data. The NTB Netdev then creates an ethernet device using a
-Transport queue pair. Network data is copied between socket buffers and the
-Transport queue pair buffer. The Transport client may be used for other things
-besides Netdev, however no other applications have yet been written.
-
-NTB Ping Pong Test Client (ntb\_pingpong)
------------------------------------------
-
-The Ping Pong test client serves as a demonstration to exercise the doorbell
-and scratchpad registers of NTB hardware, and as an example simple NTB client.
-Ping Pong enables the link when started, waits for the NTB link to come up, and
-then proceeds to read and write the doorbell scratchpad registers of the NTB.
-The peers interrupt each other using a bit mask of doorbell bits, which is
-shifted by one in each round, to test the behavior of multiple doorbell bits
-and interrupt vectors. The Ping Pong driver also reads the first local
-scratchpad, and writes the value plus one to the first peer scratchpad, each
-round before writing the peer doorbell register.
-
-Module Parameters:
-
-* unsafe - Some hardware has known issues with scratchpad and doorbell
- registers. By default, Ping Pong will not attempt to exercise such
- hardware. You may override this behavior at your own risk by setting
- unsafe=1.
-* delay\_ms - Specify the delay between receiving a doorbell
- interrupt event and setting the peer doorbell register for the next
- round.
-* init\_db - Specify the doorbell bits to start new series of rounds. A new
- series begins once all the doorbell bits have been shifted out of
- range.
-* dyndbg - It is suggested to specify dyndbg=+p when loading this module, and
- then to observe debugging output on the console.
-
-NTB Tool Test Client (ntb\_tool)
---------------------------------
-
-The Tool test client serves for debugging, primarily, ntb hardware and drivers.
-The Tool provides access through debugfs for reading, setting, and clearing the
-NTB doorbell, and reading and writing scratchpads.
-
-The Tool does not currently have any module parameters.
-
-Debugfs Files:
-
-* *debugfs*/ntb\_tool/*hw*/
- A directory in debugfs will be created for each
- NTB device probed by the tool. This directory is shortened to *hw*
- below.
-* *hw*/db
- This file is used to read, set, and clear the local doorbell. Not
- all operations may be supported by all hardware. To read the doorbell,
- read the file. To set the doorbell, write `s` followed by the bits to
- set (eg: `echo 's 0x0101' > db`). To clear the doorbell, write `c`
- followed by the bits to clear.
-* *hw*/mask
- This file is used to read, set, and clear the local doorbell mask.
- See *db* for details.
-* *hw*/peer\_db
- This file is used to read, set, and clear the peer doorbell.
- See *db* for details.
-* *hw*/peer\_mask
- This file is used to read, set, and clear the peer doorbell
- mask. See *db* for details.
-* *hw*/spad
- This file is used to read and write local scratchpads. To read
- the values of all scratchpads, read the file. To write values, write a
- series of pairs of scratchpad number and value
- (eg: `echo '4 0x123 7 0xabc' > spad`
- # to set scratchpads `4` and `7` to `0x123` and `0xabc`, respectively).
-* *hw*/peer\_spad
- This file is used to read and write peer scratchpads. See
- *spad* for details.
-
-NTB Hardware Drivers
-====================
-
-NTB hardware drivers should register devices with the NTB core driver. After
-registering, clients probe and remove functions will be called.
-
-NTB Intel Hardware Driver (ntb\_hw\_intel)
-------------------------------------------
-
-The Intel hardware driver supports NTB on Xeon and Atom CPUs.
-
-Module Parameters:
-
-* b2b\_mw\_idx
- If the peer ntb is to be accessed via a memory window, then use
- this memory window to access the peer ntb. A value of zero or positive
- starts from the first mw idx, and a negative value starts from the last
- mw idx. Both sides MUST set the same value here! The default value is
- `-1`.
-* b2b\_mw\_share
- If the peer ntb is to be accessed via a memory window, and if
- the memory window is large enough, still allow the client to use the
- second half of the memory window for address translation to the peer.
-* xeon\_b2b\_usd\_bar2\_addr64
- If using B2B topology on Xeon hardware, use
- this 64 bit address on the bus between the NTB devices for the window
- at BAR2, on the upstream side of the link.
-* xeon\_b2b\_usd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_usd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_usd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar2\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar4\_addr64 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar4\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
-* xeon\_b2b\_dsd\_bar5\_addr32 - See *xeon\_b2b\_bar2\_addr64*.
diff --git a/Documentation/nvdimm/btt.txt b/Documentation/nvdimm/btt.txt
deleted file mode 100644
index e293fb664924..000000000000
--- a/Documentation/nvdimm/btt.txt
+++ /dev/null
@@ -1,273 +0,0 @@
-BTT - Block Translation Table
-=============================
-
-
-1. Introduction
----------------
-
-Persistent memory based storage is able to perform IO at byte (or more
-accurately, cache line) granularity. However, we often want to expose such
-storage as traditional block devices. The block drivers for persistent memory
-will do exactly this. However, they do not provide any atomicity guarantees.
-Traditional SSDs typically provide protection against torn sectors in hardware,
-using stored energy in capacitors to complete in-flight block writes, or perhaps
-in firmware. We don't have this luxury with persistent memory - if a write is in
-progress, and we experience a power failure, the block will contain a mix of old
-and new data. Applications may not be prepared to handle such a scenario.
-
-The Block Translation Table (BTT) provides atomic sector update semantics for
-persistent memory devices, so that applications that rely on sector writes not
-being torn can continue to do so. The BTT manifests itself as a stacked block
-device, and reserves a portion of the underlying storage for its metadata. At
-the heart of it, is an indirection table that re-maps all the blocks on the
-volume. It can be thought of as an extremely simple file system that only
-provides atomic sector updates.
-
-
-2. Static Layout
-----------------
-
-The underlying storage on which a BTT can be laid out is not limited in any way.
-The BTT, however, splits the available space into chunks of up to 512 GiB,
-called "Arenas".
-
-Each arena follows the same layout for its metadata, and all references in an
-arena are internal to it (with the exception of one field that points to the
-next arena). The following depicts the "On-disk" metadata layout:
-
-
- Backing Store +-------> Arena
-+---------------+ | +------------------+
-| | | | Arena info block |
-| Arena 0 +---+ | 4K |
-| 512G | +------------------+
-| | | |
-+---------------+ | |
-| | | |
-| Arena 1 | | Data Blocks |
-| 512G | | |
-| | | |
-+---------------+ | |
-| . | | |
-| . | | |
-| . | | |
-| | | |
-| | | |
-+---------------+ +------------------+
- | |
- | BTT Map |
- | |
- | |
- +------------------+
- | |
- | BTT Flog |
- | |
- +------------------+
- | Info block copy |
- | 4K |
- +------------------+
-
-
-3. Theory of Operation
-----------------------
-
-
-a. The BTT Map
---------------
-
-The map is a simple lookup/indirection table that maps an LBA to an internal
-block. Each map entry is 32 bits. The two most significant bits are special
-flags, and the remaining form the internal block number.
-
-Bit Description
-31 - 30 : Error and Zero flags - Used in the following way:
- Bit Description
- 31 30
- -----------------------------------------------------------------------
- 00 Initial state. Reads return zeroes; Premap = Postmap
- 01 Zero state: Reads return zeroes
- 10 Error state: Reads fail; Writes clear 'E' bit
- 11 Normal Block – has valid postmap
-
-
-29 - 0 : Mappings to internal 'postmap' blocks
-
-
-Some of the terminology that will be subsequently used:
-
-External LBA : LBA as made visible to upper layers.
-ABA : Arena Block Address - Block offset/number within an arena
-Premap ABA : The block offset into an arena, which was decided upon by range
- checking the External LBA
-Postmap ABA : The block number in the "Data Blocks" area obtained after
- indirection from the map
-nfree : The number of free blocks that are maintained at any given time.
- This is the number of concurrent writes that can happen to the
- arena.
-
-
-For example, after adding a BTT, we surface a disk of 1024G. We get a read for
-the external LBA at 768G. This falls into the second arena, and of the 512G
-worth of blocks that this arena contributes, this block is at 256G. Thus, the
-premap ABA is 256G. We now refer to the map, and find out the mapping for block
-'X' (256G) points to block 'Y', say '64'. Thus the postmap ABA is 64.
-
-
-b. The BTT Flog
----------------
-
-The BTT provides sector atomicity by making every write an "allocating write",
-i.e. Every write goes to a "free" block. A running list of free blocks is
-maintained in the form of the BTT flog. 'Flog' is a combination of the words
-"free list" and "log". The flog contains 'nfree' entries, and an entry contains:
-
-lba : The premap ABA that is being written to
-old_map : The old postmap ABA - after 'this' write completes, this will be a
- free block.
-new_map : The new postmap ABA. The map will up updated to reflect this
- lba->postmap_aba mapping, but we log it here in case we have to
- recover.
-seq : Sequence number to mark which of the 2 sections of this flog entry is
- valid/newest. It cycles between 01->10->11->01 (binary) under normal
- operation, with 00 indicating an uninitialized state.
-lba' : alternate lba entry
-old_map': alternate old postmap entry
-new_map': alternate new postmap entry
-seq' : alternate sequence number.
-
-Each of the above fields is 32-bit, making one entry 32 bytes. Entries are also
-padded to 64 bytes to avoid cache line sharing or aliasing. Flog updates are
-done such that for any entry being written, it:
-a. overwrites the 'old' section in the entry based on sequence numbers
-b. writes the 'new' section such that the sequence number is written last.
-
-
-c. The concept of lanes
------------------------
-
-While 'nfree' describes the number of concurrent IOs an arena can process
-concurrently, 'nlanes' is the number of IOs the BTT device as a whole can
-process.
- nlanes = min(nfree, num_cpus)
-A lane number is obtained at the start of any IO, and is used for indexing into
-all the on-disk and in-memory data structures for the duration of the IO. If
-there are more CPUs than the max number of available lanes, than lanes are
-protected by spinlocks.
-
-
-d. In-memory data structure: Read Tracking Table (RTT)
-------------------------------------------------------
-
-Consider a case where we have two threads, one doing reads and the other,
-writes. We can hit a condition where the writer thread grabs a free block to do
-a new IO, but the (slow) reader thread is still reading from it. In other words,
-the reader consulted a map entry, and started reading the corresponding block. A
-writer started writing to the same external LBA, and finished the write updating
-the map for that external LBA to point to its new postmap ABA. At this point the
-internal, postmap block that the reader is (still) reading has been inserted
-into the list of free blocks. If another write comes in for the same LBA, it can
-grab this free block, and start writing to it, causing the reader to read
-incorrect data. To prevent this, we introduce the RTT.
-
-The RTT is a simple, per arena table with 'nfree' entries. Every reader inserts
-into rtt[lane_number], the postmap ABA it is reading, and clears it after the
-read is complete. Every writer thread, after grabbing a free block, checks the
-RTT for its presence. If the postmap free block is in the RTT, it waits till the
-reader clears the RTT entry, and only then starts writing to it.
-
-
-e. In-memory data structure: map locks
---------------------------------------
-
-Consider a case where two writer threads are writing to the same LBA. There can
-be a race in the following sequence of steps:
-
-free[lane] = map[premap_aba]
-map[premap_aba] = postmap_aba
-
-Both threads can update their respective free[lane] with the same old, freed
-postmap_aba. This has made the layout inconsistent by losing a free entry, and
-at the same time, duplicating another free entry for two lanes.
-
-To solve this, we could have a single map lock (per arena) that has to be taken
-before performing the above sequence, but we feel that could be too contentious.
-Instead we use an array of (nfree) map_locks that is indexed by
-(premap_aba modulo nfree).
-
-
-f. Reconstruction from the Flog
--------------------------------
-
-On startup, we analyze the BTT flog to create our list of free blocks. We walk
-through all the entries, and for each lane, of the set of two possible
-'sections', we always look at the most recent one only (based on the sequence
-number). The reconstruction rules/steps are simple:
-- Read map[log_entry.lba].
-- If log_entry.new matches the map entry, then log_entry.old is free.
-- If log_entry.new does not match the map entry, then log_entry.new is free.
- (This case can only be caused by power-fails/unsafe shutdowns)
-
-
-g. Summarizing - Read and Write flows
--------------------------------------
-
-Read:
-
-1. Convert external LBA to arena number + pre-map ABA
-2. Get a lane (and take lane_lock)
-3. Read map to get the entry for this pre-map ABA
-4. Enter post-map ABA into RTT[lane]
-5. If TRIM flag set in map, return zeroes, and end IO (go to step 8)
-6. If ERROR flag set in map, end IO with EIO (go to step 8)
-7. Read data from this block
-8. Remove post-map ABA entry from RTT[lane]
-9. Release lane (and lane_lock)
-
-Write:
-
-1. Convert external LBA to Arena number + pre-map ABA
-2. Get a lane (and take lane_lock)
-3. Use lane to index into in-memory free list and obtain a new block, next flog
- index, next sequence number
-4. Scan the RTT to check if free block is present, and spin/wait if it is.
-5. Write data to this free block
-6. Read map to get the existing post-map ABA entry for this pre-map ABA
-7. Write flog entry: [premap_aba / old postmap_aba / new postmap_aba / seq_num]
-8. Write new post-map ABA into map.
-9. Write old post-map entry into the free list
-10. Calculate next sequence number and write into the free list entry
-11. Release lane (and lane_lock)
-
-
-4. Error Handling
-=================
-
-An arena would be in an error state if any of the metadata is corrupted
-irrecoverably, either due to a bug or a media error. The following conditions
-indicate an error:
-- Info block checksum does not match (and recovering from the copy also fails)
-- All internal available blocks are not uniquely and entirely addressed by the
- sum of mapped blocks and free blocks (from the BTT flog).
-- Rebuilding free list from the flog reveals missing/duplicate/impossible
- entries
-- A map entry is out of bounds
-
-If any of these error conditions are encountered, the arena is put into a read
-only state using a flag in the info block.
-
-
-5. Usage
-========
-
-The BTT can be set up on any disk (namespace) exposed by the libnvdimm subsystem
-(pmem, or blk mode). The easiest way to set up such a namespace is using the
-'ndctl' utility [1]:
-
-For example, the ndctl command line to setup a btt with a 4k sector size is:
-
- ndctl create-namespace -f -e namespace0.0 -m sector -l 4k
-
-See ndctl create-namespace --help for more options.
-
-[1]: https://github.com/pmem/ndctl
-
diff --git a/Documentation/nvdimm/nvdimm.txt b/Documentation/nvdimm/nvdimm.txt
deleted file mode 100644
index 1669f626b037..000000000000
--- a/Documentation/nvdimm/nvdimm.txt
+++ /dev/null
@@ -1,815 +0,0 @@
- LIBNVDIMM: Non-Volatile Devices
- libnvdimm - kernel / libndctl - userspace helper library
- linux-nvdimm@lists.01.org
- v13
-
-
- Glossary
- Overview
- Supporting Documents
- Git Trees
- LIBNVDIMM PMEM and BLK
- Why BLK?
- PMEM vs BLK
- BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
- Example NVDIMM Platform
- LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
- LIBNDCTL: Context
- libndctl: instantiate a new library context example
- LIBNVDIMM/LIBNDCTL: Bus
- libnvdimm: control class device in /sys/class
- libnvdimm: bus
- libndctl: bus enumeration example
- LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
- libnvdimm: DIMM (NMEM)
- libndctl: DIMM enumeration example
- LIBNVDIMM/LIBNDCTL: Region
- libnvdimm: region
- libndctl: region enumeration example
- Why Not Encode the Region Type into the Region Name?
- How Do I Determine the Major Type of a Region?
- LIBNVDIMM/LIBNDCTL: Namespace
- libnvdimm: namespace
- libndctl: namespace enumeration example
- libndctl: namespace creation example
- Why the Term "namespace"?
- LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
- libnvdimm: btt layout
- libndctl: btt creation example
- Summary LIBNDCTL Diagram
-
-
-Glossary
---------
-
-PMEM: A system-physical-address range where writes are persistent. A
-block device composed of PMEM is capable of DAX. A PMEM address range
-may span an interleave of several DIMMs.
-
-BLK: A set of one or more programmable memory mapped apertures provided
-by a DIMM to access its media. This indirection precludes the
-performance benefit of interleaving, but enables DIMM-bounded failure
-modes.
-
-DPA: DIMM Physical Address, is a DIMM-relative offset. With one DIMM in
-the system there would be a 1:1 system-physical-address:DPA association.
-Once more DIMMs are added a memory controller interleave must be
-decoded to determine the DPA associated with a given
-system-physical-address. BLK capacity always has a 1:1 relationship
-with a single-DIMM's DPA range.
-
-DAX: File system extensions to bypass the page cache and block layer to
-mmap persistent memory, from a PMEM block device, directly into a
-process address space.
-
-DSM: Device Specific Method: ACPI method to to control specific
-device - in this case the firmware.
-
-DCR: NVDIMM Control Region Structure defined in ACPI 6 Section 5.2.25.5.
-It defines a vendor-id, device-id, and interface format for a given DIMM.
-
-BTT: Block Translation Table: Persistent memory is byte addressable.
-Existing software may have an expectation that the power-fail-atomicity
-of writes is at least one sector, 512 bytes. The BTT is an indirection
-table with atomic update semantics to front a PMEM/BLK block device
-driver and present arbitrary atomic sector sizes.
-
-LABEL: Metadata stored on a DIMM device that partitions and identifies
-(persistently names) storage between PMEM and BLK. It also partitions
-BLK storage to host BTTs with different parameters per BLK-partition.
-Note that traditional partition tables, GPT/MBR, are layered on top of a
-BLK or PMEM device.
-
-
-Overview
---------
-
-The LIBNVDIMM subsystem provides support for three types of NVDIMMs, namely,
-PMEM, BLK, and NVDIMM devices that can simultaneously support both PMEM
-and BLK mode access. These three modes of operation are described by
-the "NVDIMM Firmware Interface Table" (NFIT) in ACPI 6. While the LIBNVDIMM
-implementation is generic and supports pre-NFIT platforms, it was guided
-by the superset of capabilities need to support this ACPI 6 definition
-for NVDIMM resources. The bulk of the kernel implementation is in place
-to handle the case where DPA accessible via PMEM is aliased with DPA
-accessible via BLK. When that occurs a LABEL is needed to reserve DPA
-for exclusive access via one mode a time.
-
-Supporting Documents
-ACPI 6: http://www.uefi.org/sites/default/files/resources/ACPI_6.0.pdf
-NVDIMM Namespace: http://pmem.io/documents/NVDIMM_Namespace_Spec.pdf
-DSM Interface Example: http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf
-Driver Writer's Guide: http://pmem.io/documents/NVDIMM_Driver_Writers_Guide.pdf
-
-Git Trees
-LIBNVDIMM: https://git.kernel.org/cgit/linux/kernel/git/djbw/nvdimm.git
-LIBNDCTL: https://github.com/pmem/ndctl.git
-PMEM: https://github.com/01org/prd
-
-
-LIBNVDIMM PMEM and BLK
-------------------
-
-Prior to the arrival of the NFIT, non-volatile memory was described to a
-system in various ad-hoc ways. Usually only the bare minimum was
-provided, namely, a single system-physical-address range where writes
-are expected to be durable after a system power loss. Now, the NFIT
-specification standardizes not only the description of PMEM, but also
-BLK and platform message-passing entry points for control and
-configuration.
-
-For each NVDIMM access method (PMEM, BLK), LIBNVDIMM provides a block
-device driver:
-
- 1. PMEM (nd_pmem.ko): Drives a system-physical-address range. This
- range is contiguous in system memory and may be interleaved (hardware
- memory controller striped) across multiple DIMMs. When interleaved the
- platform may optionally provide details of which DIMMs are participating
- in the interleave.
-
- Note that while LIBNVDIMM describes system-physical-address ranges that may
- alias with BLK access as ND_NAMESPACE_PMEM ranges and those without
- alias as ND_NAMESPACE_IO ranges, to the nd_pmem driver there is no
- distinction. The different device-types are an implementation detail
- that userspace can exploit to implement policies like "only interface
- with address ranges from certain DIMMs". It is worth noting that when
- aliasing is present and a DIMM lacks a label, then no block device can
- be created by default as userspace needs to do at least one allocation
- of DPA to the PMEM range. In contrast ND_NAMESPACE_IO ranges, once
- registered, can be immediately attached to nd_pmem.
-
- 2. BLK (nd_blk.ko): This driver performs I/O using a set of platform
- defined apertures. A set of apertures will access just one DIMM.
- Multiple windows (apertures) allow multiple concurrent accesses, much like
- tagged-command-queuing, and would likely be used by different threads or
- different CPUs.
-
- The NFIT specification defines a standard format for a BLK-aperture, but
- the spec also allows for vendor specific layouts, and non-NFIT BLK
- implementations may have other designs for BLK I/O. For this reason
- "nd_blk" calls back into platform-specific code to perform the I/O.
- One such implementation is defined in the "Driver Writer's Guide" and "DSM
- Interface Example".
-
-
-Why BLK?
---------
-
-While PMEM provides direct byte-addressable CPU-load/store access to
-NVDIMM storage, it does not provide the best system RAS (recovery,
-availability, and serviceability) model. An access to a corrupted
-system-physical-address address causes a CPU exception while an access
-to a corrupted address through an BLK-aperture causes that block window
-to raise an error status in a register. The latter is more aligned with
-the standard error model that host-bus-adapter attached disks present.
-Also, if an administrator ever wants to replace a memory it is easier to
-service a system at DIMM module boundaries. Compare this to PMEM where
-data could be interleaved in an opaque hardware specific manner across
-several DIMMs.
-
-PMEM vs BLK
-BLK-apertures solve these RAS problems, but their presence is also the
-major contributing factor to the complexity of the ND subsystem. They
-complicate the implementation because PMEM and BLK alias in DPA space.
-Any given DIMM's DPA-range may contribute to one or more
-system-physical-address sets of interleaved DIMMs, *and* may also be
-accessed in its entirety through its BLK-aperture. Accessing a DPA
-through a system-physical-address while simultaneously accessing the
-same DPA through a BLK-aperture has undefined results. For this reason,
-DIMMs with this dual interface configuration include a DSM function to
-store/retrieve a LABEL. The LABEL effectively partitions the DPA-space
-into exclusive system-physical-address and BLK-aperture accessible
-regions. For simplicity a DIMM is allowed a PMEM "region" per each
-interleave set in which it is a member. The remaining DPA space can be
-carved into an arbitrary number of BLK devices with discontiguous
-extents.
-
-BLK-REGIONs, PMEM-REGIONs, Atomic Sectors, and DAX
---------------------------------------------------
-
-One of the few
-reasons to allow multiple BLK namespaces per REGION is so that each
-BLK-namespace can be configured with a BTT with unique atomic sector
-sizes. While a PMEM device can host a BTT the LABEL specification does
-not provide for a sector size to be specified for a PMEM namespace.
-This is due to the expectation that the primary usage model for PMEM is
-via DAX, and the BTT is incompatible with DAX. However, for the cases
-where an application or filesystem still needs atomic sector update
-guarantees it can register a BTT on a PMEM device or partition. See
-LIBNVDIMM/NDCTL: Block Translation Table "btt"
-
-
-Example NVDIMM Platform
------------------------
-
-For the remainder of this document the following diagram will be
-referenced for any example sysfs layouts.
-
-
- (a) (b) DIMM BLK-REGION
- +-------------------+--------+--------+--------+
-+------+ | pm0.0 | blk2.0 | pm1.0 | blk2.1 | 0 region2
-| imc0 +--+- - - region0- - - +--------+ +--------+
-+--+---+ | pm0.0 | blk3.0 | pm1.0 | blk3.1 | 1 region3
- | +-------------------+--------v v--------+
-+--+---+ | |
-| cpu0 | region1
-+--+---+ | |
- | +----------------------------^ ^--------+
-+--+---+ | blk4.0 | pm1.0 | blk4.0 | 2 region4
-| imc1 +--+----------------------------| +--------+
-+------+ | blk5.0 | pm1.0 | blk5.0 | 3 region5
- +----------------------------+--------+--------+
-
-In this platform we have four DIMMs and two memory controllers in one
-socket. Each unique interface (BLK or PMEM) to DPA space is identified
-by a region device with a dynamically assigned id (REGION0 - REGION5).
-
- 1. The first portion of DIMM0 and DIMM1 are interleaved as REGION0. A
- single PMEM namespace is created in the REGION0-SPA-range that spans most
- of DIMM0 and DIMM1 with a user-specified name of "pm0.0". Some of that
- interleaved system-physical-address range is reclaimed as BLK-aperture
- accessed space starting at DPA-offset (a) into each DIMM. In that
- reclaimed space we create two BLK-aperture "namespaces" from REGION2 and
- REGION3 where "blk2.0" and "blk3.0" are just human readable names that
- could be set to any user-desired name in the LABEL.
-
- 2. In the last portion of DIMM0 and DIMM1 we have an interleaved
- system-physical-address range, REGION1, that spans those two DIMMs as
- well as DIMM2 and DIMM3. Some of REGION1 is allocated to a PMEM namespace
- named "pm1.0", the rest is reclaimed in 4 BLK-aperture namespaces (for
- each DIMM in the interleave set), "blk2.1", "blk3.1", "blk4.0", and
- "blk5.0".
-
- 3. The portion of DIMM2 and DIMM3 that do not participate in the REGION1
- interleaved system-physical-address range (i.e. the DPA address past
- offset (b) are also included in the "blk4.0" and "blk5.0" namespaces.
- Note, that this example shows that BLK-aperture namespaces don't need to
- be contiguous in DPA-space.
-
- This bus is provided by the kernel under the device
- /sys/devices/platform/nfit_test.0 when CONFIG_NFIT_TEST is enabled and
- the nfit_test.ko module is loaded. This not only test LIBNVDIMM but the
- acpi_nfit.ko driver as well.
-
-
-LIBNVDIMM Kernel Device Model and LIBNDCTL Userspace API
-----------------------------------------------------
-
-What follows is a description of the LIBNVDIMM sysfs layout and a
-corresponding object hierarchy diagram as viewed through the LIBNDCTL
-API. The example sysfs paths and diagrams are relative to the Example
-NVDIMM Platform which is also the LIBNVDIMM bus used in the LIBNDCTL unit
-test.
-
-LIBNDCTL: Context
-Every API call in the LIBNDCTL library requires a context that holds the
-logging parameters and other library instance state. The library is
-based on the libabc template:
-https://git.kernel.org/cgit/linux/kernel/git/kay/libabc.git
-
-LIBNDCTL: instantiate a new library context example
-
- struct ndctl_ctx *ctx;
-
- if (ndctl_new(&ctx) == 0)
- return ctx;
- else
- return NULL;
-
-LIBNVDIMM/LIBNDCTL: Bus
--------------------
-
-A bus has a 1:1 relationship with an NFIT. The current expectation for
-ACPI based systems is that there is only ever one platform-global NFIT.
-That said, it is trivial to register multiple NFITs, the specification
-does not preclude it. The infrastructure supports multiple busses and
-we use this capability to test multiple NFIT configurations in the unit
-test.
-
-LIBNVDIMM: control class device in /sys/class
-
-This character device accepts DSM messages to be passed to DIMM
-identified by its NFIT handle.
-
- /sys/class/nd/ndctl0
- |-- dev
- |-- device -> ../../../ndbus0
- |-- subsystem -> ../../../../../../../class/nd
-
-
-
-LIBNVDIMM: bus
-
- struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
- struct nvdimm_bus_descriptor *nfit_desc);
-
- /sys/devices/platform/nfit_test.0/ndbus0
- |-- commands
- |-- nd
- |-- nfit
- |-- nmem0
- |-- nmem1
- |-- nmem2
- |-- nmem3
- |-- power
- |-- provider
- |-- region0
- |-- region1
- |-- region2
- |-- region3
- |-- region4
- |-- region5
- |-- uevent
- `-- wait_probe
-
-LIBNDCTL: bus enumeration example
-Find the bus handle that describes the bus from Example NVDIMM Platform
-
- static struct ndctl_bus *get_bus_by_provider(struct ndctl_ctx *ctx,
- const char *provider)
- {
- struct ndctl_bus *bus;
-
- ndctl_bus_foreach(ctx, bus)
- if (strcmp(provider, ndctl_bus_get_provider(bus)) == 0)
- return bus;
-
- return NULL;
- }
-
- bus = get_bus_by_provider(ctx, "nfit_test.0");
-
-
-LIBNVDIMM/LIBNDCTL: DIMM (NMEM)
----------------------------
-
-The DIMM device provides a character device for sending commands to
-hardware, and it is a container for LABELs. If the DIMM is defined by
-NFIT then an optional 'nfit' attribute sub-directory is available to add
-NFIT-specifics.
-
-Note that the kernel device name for "DIMMs" is "nmemX". The NFIT
-describes these devices via "Memory Device to System Physical Address
-Range Mapping Structure", and there is no requirement that they actually
-be physical DIMMs, so we use a more generic name.
-
-LIBNVDIMM: DIMM (NMEM)
-
- struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data,
- const struct attribute_group **groups, unsigned long flags,
- unsigned long *dsm_mask);
-
- /sys/devices/platform/nfit_test.0/ndbus0
- |-- nmem0
- | |-- available_slots
- | |-- commands
- | |-- dev
- | |-- devtype
- | |-- driver -> ../../../../../bus/nd/drivers/nvdimm
- | |-- modalias
- | |-- nfit
- | | |-- device
- | | |-- format
- | | |-- handle
- | | |-- phys_id
- | | |-- rev_id
- | | |-- serial
- | | `-- vendor
- | |-- state
- | |-- subsystem -> ../../../../../bus/nd
- | `-- uevent
- |-- nmem1
- [..]
-
-
-LIBNDCTL: DIMM enumeration example
-
-Note, in this example we are assuming NFIT-defined DIMMs which are
-identified by an "nfit_handle" a 32-bit value where:
-Bit 3:0 DIMM number within the memory channel
-Bit 7:4 memory channel number
-Bit 11:8 memory controller ID
-Bit 15:12 socket ID (within scope of a Node controller if node controller is present)
-Bit 27:16 Node Controller ID
-Bit 31:28 Reserved
-
- static struct ndctl_dimm *get_dimm_by_handle(struct ndctl_bus *bus,
- unsigned int handle)
- {
- struct ndctl_dimm *dimm;
-
- ndctl_dimm_foreach(bus, dimm)
- if (ndctl_dimm_get_handle(dimm) == handle)
- return dimm;
-
- return NULL;
- }
-
- #define DIMM_HANDLE(n, s, i, c, d) \
- (((n & 0xfff) << 16) | ((s & 0xf) << 12) | ((i & 0xf) << 8) \
- | ((c & 0xf) << 4) | (d & 0xf))
-
- dimm = get_dimm_by_handle(bus, DIMM_HANDLE(0, 0, 0, 0, 0));
-
-LIBNVDIMM/LIBNDCTL: Region
-----------------------
-
-A generic REGION device is registered for each PMEM range or BLK-aperture
-set. Per the example there are 6 regions: 2 PMEM and 4 BLK-aperture
-sets on the "nfit_test.0" bus. The primary role of regions are to be a
-container of "mappings". A mapping is a tuple of <DIMM,
-DPA-start-offset, length>.
-
-LIBNVDIMM provides a built-in driver for these REGION devices. This driver
-is responsible for reconciling the aliased DPA mappings across all
-regions, parsing the LABEL, if present, and then emitting NAMESPACE
-devices with the resolved/exclusive DPA-boundaries for the nd_pmem or
-nd_blk device driver to consume.
-
-In addition to the generic attributes of "mapping"s, "interleave_ways"
-and "size" the REGION device also exports some convenience attributes.
-"nstype" indicates the integer type of namespace-device this region
-emits, "devtype" duplicates the DEVTYPE variable stored by udev at the
-'add' event, "modalias" duplicates the MODALIAS variable stored by udev
-at the 'add' event, and finally, the optional "spa_index" is provided in
-the case where the region is defined by a SPA.
-
-LIBNVDIMM: region
-
- struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus,
- struct nd_region_desc *ndr_desc);
- struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus,
- struct nd_region_desc *ndr_desc);
-
- /sys/devices/platform/nfit_test.0/ndbus0
- |-- region0
- | |-- available_size
- | |-- btt0
- | |-- btt_seed
- | |-- devtype
- | |-- driver -> ../../../../../bus/nd/drivers/nd_region
- | |-- init_namespaces
- | |-- mapping0
- | |-- mapping1
- | |-- mappings
- | |-- modalias
- | |-- namespace0.0
- | |-- namespace_seed
- | |-- numa_node
- | |-- nfit
- | | `-- spa_index
- | |-- nstype
- | |-- set_cookie
- | |-- size
- | |-- subsystem -> ../../../../../bus/nd
- | `-- uevent
- |-- region1
- [..]
-
-LIBNDCTL: region enumeration example
-
-Sample region retrieval routines based on NFIT-unique data like
-"spa_index" (interleave set id) for PMEM and "nfit_handle" (dimm id) for
-BLK.
-
- static struct ndctl_region *get_pmem_region_by_spa_index(struct ndctl_bus *bus,
- unsigned int spa_index)
- {
- struct ndctl_region *region;
-
- ndctl_region_foreach(bus, region) {
- if (ndctl_region_get_type(region) != ND_DEVICE_REGION_PMEM)
- continue;
- if (ndctl_region_get_spa_index(region) == spa_index)
- return region;
- }
- return NULL;
- }
-
- static struct ndctl_region *get_blk_region_by_dimm_handle(struct ndctl_bus *bus,
- unsigned int handle)
- {
- struct ndctl_region *region;
-
- ndctl_region_foreach(bus, region) {
- struct ndctl_mapping *map;
-
- if (ndctl_region_get_type(region) != ND_DEVICE_REGION_BLOCK)
- continue;
- ndctl_mapping_foreach(region, map) {
- struct ndctl_dimm *dimm = ndctl_mapping_get_dimm(map);
-
- if (ndctl_dimm_get_handle(dimm) == handle)
- return region;
- }
- }
- return NULL;
- }
-
-
-Why Not Encode the Region Type into the Region Name?
-----------------------------------------------------
-
-At first glance it seems since NFIT defines just PMEM and BLK interface
-types that we should simply name REGION devices with something derived
-from those type names. However, the ND subsystem explicitly keeps the
-REGION name generic and expects userspace to always consider the
-region-attributes for four reasons:
-
- 1. There are already more than two REGION and "namespace" types. For
- PMEM there are two subtypes. As mentioned previously we have PMEM where
- the constituent DIMM devices are known and anonymous PMEM. For BLK
- regions the NFIT specification already anticipates vendor specific
- implementations. The exact distinction of what a region contains is in
- the region-attributes not the region-name or the region-devtype.
-
- 2. A region with zero child-namespaces is a possible configuration. For
- example, the NFIT allows for a DCR to be published without a
- corresponding BLK-aperture. This equates to a DIMM that can only accept
- control/configuration messages, but no i/o through a descendant block
- device. Again, this "type" is advertised in the attributes ('mappings'
- == 0) and the name does not tell you much.
-
- 3. What if a third major interface type arises in the future? Outside
- of vendor specific implementations, it's not difficult to envision a
- third class of interface type beyond BLK and PMEM. With a generic name
- for the REGION level of the device-hierarchy old userspace
- implementations can still make sense of new kernel advertised
- region-types. Userspace can always rely on the generic region
- attributes like "mappings", "size", etc and the expected child devices
- named "namespace". This generic format of the device-model hierarchy
- allows the LIBNVDIMM and LIBNDCTL implementations to be more uniform and
- future-proof.
-
- 4. There are more robust mechanisms for determining the major type of a
- region than a device name. See the next section, How Do I Determine the
- Major Type of a Region?
-
-How Do I Determine the Major Type of a Region?
-----------------------------------------------
-
-Outside of the blanket recommendation of "use libndctl", or simply
-looking at the kernel header (/usr/include/linux/ndctl.h) to decode the
-"nstype" integer attribute, here are some other options.
-
- 1. module alias lookup:
-
- The whole point of region/namespace device type differentiation is to
- decide which block-device driver will attach to a given LIBNVDIMM namespace.
- One can simply use the modalias to lookup the resulting module. It's
- important to note that this method is robust in the presence of a
- vendor-specific driver down the road. If a vendor-specific
- implementation wants to supplant the standard nd_blk driver it can with
- minimal impact to the rest of LIBNVDIMM.
-
- In fact, a vendor may also want to have a vendor-specific region-driver
- (outside of nd_region). For example, if a vendor defined its own LABEL
- format it would need its own region driver to parse that LABEL and emit
- the resulting namespaces. The output from module resolution is more
- accurate than a region-name or region-devtype.
-
- 2. udev:
-
- The kernel "devtype" is registered in the udev database
- # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region0
- P: /devices/platform/nfit_test.0/ndbus0/region0
- E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region0
- E: DEVTYPE=nd_pmem
- E: MODALIAS=nd:t2
- E: SUBSYSTEM=nd
-
- # udevadm info --path=/devices/platform/nfit_test.0/ndbus0/region4
- P: /devices/platform/nfit_test.0/ndbus0/region4
- E: DEVPATH=/devices/platform/nfit_test.0/ndbus0/region4
- E: DEVTYPE=nd_blk
- E: MODALIAS=nd:t3
- E: SUBSYSTEM=nd
-
- ...and is available as a region attribute, but keep in mind that the
- "devtype" does not indicate sub-type variations and scripts should
- really be understanding the other attributes.
-
- 3. type specific attributes:
-
- As it currently stands a BLK-aperture region will never have a
- "nfit/spa_index" attribute, but neither will a non-NFIT PMEM region. A
- BLK region with a "mappings" value of 0 is, as mentioned above, a DIMM
- that does not allow I/O. A PMEM region with a "mappings" value of zero
- is a simple system-physical-address range.
-
-
-LIBNVDIMM/LIBNDCTL: Namespace
--------------------------
-
-A REGION, after resolving DPA aliasing and LABEL specified boundaries,
-surfaces one or more "namespace" devices. The arrival of a "namespace"
-device currently triggers either the nd_blk or nd_pmem driver to load
-and register a disk/block device.
-
-LIBNVDIMM: namespace
-Here is a sample layout from the three major types of NAMESPACE where
-namespace0.0 represents DIMM-info-backed PMEM (note that it has a 'uuid'
-attribute), namespace2.0 represents a BLK namespace (note it has a
-'sector_size' attribute) that, and namespace6.0 represents an anonymous
-PMEM namespace (note that has no 'uuid' attribute due to not support a
-LABEL).
-
- /sys/devices/platform/nfit_test.0/ndbus0/region0/namespace0.0
- |-- alt_name
- |-- devtype
- |-- dpa_extents
- |-- force_raw
- |-- modalias
- |-- numa_node
- |-- resource
- |-- size
- |-- subsystem -> ../../../../../../bus/nd
- |-- type
- |-- uevent
- `-- uuid
- /sys/devices/platform/nfit_test.0/ndbus0/region2/namespace2.0
- |-- alt_name
- |-- devtype
- |-- dpa_extents
- |-- force_raw
- |-- modalias
- |-- numa_node
- |-- sector_size
- |-- size
- |-- subsystem -> ../../../../../../bus/nd
- |-- type
- |-- uevent
- `-- uuid
- /sys/devices/platform/nfit_test.1/ndbus1/region6/namespace6.0
- |-- block
- | `-- pmem0
- |-- devtype
- |-- driver -> ../../../../../../bus/nd/drivers/pmem
- |-- force_raw
- |-- modalias
- |-- numa_node
- |-- resource
- |-- size
- |-- subsystem -> ../../../../../../bus/nd
- |-- type
- `-- uevent
-
-LIBNDCTL: namespace enumeration example
-Namespaces are indexed relative to their parent region, example below.
-These indexes are mostly static from boot to boot, but subsystem makes
-no guarantees in this regard. For a static namespace identifier use its
-'uuid' attribute.
-
-static struct ndctl_namespace *get_namespace_by_id(struct ndctl_region *region,
- unsigned int id)
-{
- struct ndctl_namespace *ndns;
-
- ndctl_namespace_foreach(region, ndns)
- if (ndctl_namespace_get_id(ndns) == id)
- return ndns;
-
- return NULL;
-}
-
-LIBNDCTL: namespace creation example
-Idle namespaces are automatically created by the kernel if a given
-region has enough available capacity to create a new namespace.
-Namespace instantiation involves finding an idle namespace and
-configuring it. For the most part the setting of namespace attributes
-can occur in any order, the only constraint is that 'uuid' must be set
-before 'size'. This enables the kernel to track DPA allocations
-internally with a static identifier.
-
-static int configure_namespace(struct ndctl_region *region,
- struct ndctl_namespace *ndns,
- struct namespace_parameters *parameters)
-{
- char devname[50];
-
- snprintf(devname, sizeof(devname), "namespace%d.%d",
- ndctl_region_get_id(region), paramaters->id);
-
- ndctl_namespace_set_alt_name(ndns, devname);
- /* 'uuid' must be set prior to setting size! */
- ndctl_namespace_set_uuid(ndns, paramaters->uuid);
- ndctl_namespace_set_size(ndns, paramaters->size);
- /* unlike pmem namespaces, blk namespaces have a sector size */
- if (parameters->lbasize)
- ndctl_namespace_set_sector_size(ndns, parameters->lbasize);
- ndctl_namespace_enable(ndns);
-}
-
-
-Why the Term "namespace"?
-
- 1. Why not "volume" for instance? "volume" ran the risk of confusing
- ND (libnvdimm subsystem) to a volume manager like device-mapper.
-
- 2. The term originated to describe the sub-devices that can be created
- within a NVME controller (see the nvme specification:
- http://www.nvmexpress.org/specifications/), and NFIT namespaces are
- meant to parallel the capabilities and configurability of
- NVME-namespaces.
-
-
-LIBNVDIMM/LIBNDCTL: Block Translation Table "btt"
----------------------------------------------
-
-A BTT (design document: http://pmem.io/2014/09/23/btt.html) is a stacked
-block device driver that fronts either the whole block device or a
-partition of a block device emitted by either a PMEM or BLK NAMESPACE.
-
-LIBNVDIMM: btt layout
-Every region will start out with at least one BTT device which is the
-seed device. To activate it set the "namespace", "uuid", and
-"sector_size" attributes and then bind the device to the nd_pmem or
-nd_blk driver depending on the region type.
-
- /sys/devices/platform/nfit_test.1/ndbus0/region0/btt0/
- |-- namespace
- |-- delete
- |-- devtype
- |-- modalias
- |-- numa_node
- |-- sector_size
- |-- subsystem -> ../../../../../bus/nd
- |-- uevent
- `-- uuid
-
-LIBNDCTL: btt creation example
-Similar to namespaces an idle BTT device is automatically created per
-region. Each time this "seed" btt device is configured and enabled a new
-seed is created. Creating a BTT configuration involves two steps of
-finding and idle BTT and assigning it to consume a PMEM or BLK namespace.
-
- static struct ndctl_btt *get_idle_btt(struct ndctl_region *region)
- {
- struct ndctl_btt *btt;
-
- ndctl_btt_foreach(region, btt)
- if (!ndctl_btt_is_enabled(btt)
- && !ndctl_btt_is_configured(btt))
- return btt;
-
- return NULL;
- }
-
- static int configure_btt(struct ndctl_region *region,
- struct btt_parameters *parameters)
- {
- btt = get_idle_btt(region);
-
- ndctl_btt_set_uuid(btt, parameters->uuid);
- ndctl_btt_set_sector_size(btt, parameters->sector_size);
- ndctl_btt_set_namespace(btt, parameters->ndns);
- /* turn off raw mode device */
- ndctl_namespace_disable(parameters->ndns);
- /* turn on btt access */
- ndctl_btt_enable(btt);
- }
-
-Once instantiated a new inactive btt seed device will appear underneath
-the region.
-
-Once a "namespace" is removed from a BTT that instance of the BTT device
-will be deleted or otherwise reset to default values. This deletion is
-only at the device model level. In order to destroy a BTT the "info
-block" needs to be destroyed. Note, that to destroy a BTT the media
-needs to be written in raw mode. By default, the kernel will autodetect
-the presence of a BTT and disable raw mode. This autodetect behavior
-can be suppressed by enabling raw mode for the namespace via the
-ndctl_namespace_set_raw_mode() API.
-
-
-Summary LIBNDCTL Diagram
-------------------------
-
-For the given example above, here is the view of the objects as seen by the
-LIBNDCTL API:
- +---+
- |CTX| +---------+ +--------------+ +---------------+
- +-+-+ +-> REGION0 +---> NAMESPACE0.0 +--> PMEM8 "pm0.0" |
- | | +---------+ +--------------+ +---------------+
-+-------+ | | +---------+ +--------------+ +---------------+
-| DIMM0 <-+ | +-> REGION1 +---> NAMESPACE1.0 +--> PMEM6 "pm1.0" |
-+-------+ | | | +---------+ +--------------+ +---------------+
-| DIMM1 <-+ +-v--+ | +---------+ +--------------+ +---------------+
-+-------+ +-+BUS0+---> REGION2 +-+-> NAMESPACE2.0 +--> ND6 "blk2.0" |
-| DIMM2 <-+ +----+ | +---------+ | +--------------+ +----------------------+
-+-------+ | | +-> NAMESPACE2.1 +--> ND5 "blk2.1" | BTT2 |
-| DIMM3 <-+ | +--------------+ +----------------------+
-+-------+ | +---------+ +--------------+ +---------------+
- +-> REGION3 +-+-> NAMESPACE3.0 +--> ND4 "blk3.0" |
- | +---------+ | +--------------+ +----------------------+
- | +-> NAMESPACE3.1 +--> ND3 "blk3.1" | BTT1 |
- | +--------------+ +----------------------+
- | +---------+ +--------------+ +---------------+
- +-> REGION4 +---> NAMESPACE4.0 +--> ND2 "blk4.0" |
- | +---------+ +--------------+ +---------------+
- | +---------+ +--------------+ +----------------------+
- +-> REGION5 +---> NAMESPACE5.0 +--> ND1 "blk5.0" | BTT0 |
- +---------+ +--------------+ +---------------+------+
-
-
diff --git a/Documentation/nvdimm/security.txt b/Documentation/nvdimm/security.txt
deleted file mode 100644
index 4c36c05ca98e..000000000000
--- a/Documentation/nvdimm/security.txt
+++ /dev/null
@@ -1,141 +0,0 @@
-NVDIMM SECURITY
-===============
-
-1. Introduction
----------------
-
-With the introduction of Intel Device Specific Methods (DSM) v1.8
-specification [1], security DSMs are introduced. The spec added the following
-security DSMs: "get security state", "set passphrase", "disable passphrase",
-"unlock unit", "freeze lock", "secure erase", and "overwrite". A security_ops
-data structure has been added to struct dimm in order to support the security
-operations and generic APIs are exposed to allow vendor neutral operations.
-
-2. Sysfs Interface
-------------------
-The "security" sysfs attribute is provided in the nvdimm sysfs directory. For
-example:
-/sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/security
-
-The "show" attribute of that attribute will display the security state for
-that DIMM. The following states are available: disabled, unlocked, locked,
-frozen, and overwrite. If security is not supported, the sysfs attribute
-will not be visible.
-
-The "store" attribute takes several commands when it is being written to
-in order to support some of the security functionalities:
-update <old_keyid> <new_keyid> - enable or update passphrase.
-disable <keyid> - disable enabled security and remove key.
-freeze - freeze changing of security states.
-erase <keyid> - delete existing user encryption key.
-overwrite <keyid> - wipe the entire nvdimm.
-master_update <keyid> <new_keyid> - enable or update master passphrase.
-master_erase <keyid> - delete existing user encryption key.
-
-3. Key Management
------------------
-
-The key is associated to the payload by the DIMM id. For example:
-# cat /sys/devices/LNXSYSTM:00/LNXSYBUS:00/ACPI0012:00/ndbus0/nmem0/nfit/id
-8089-a2-1740-00000133
-The DIMM id would be provided along with the key payload (passphrase) to
-the kernel.
-
-The security keys are managed on the basis of a single key per DIMM. The
-key "passphrase" is expected to be 32bytes long. This is similar to the ATA
-security specification [2]. A key is initially acquired via the request_key()
-kernel API call during nvdimm unlock. It is up to the user to make sure that
-all the keys are in the kernel user keyring for unlock.
-
-A nvdimm encrypted-key of format enc32 has the description format of:
-nvdimm:<bus-provider-specific-unique-id>
-
-See file ``Documentation/security/keys/trusted-encrypted.rst`` for creating
-encrypted-keys of enc32 format. TPM usage with a master trusted key is
-preferred for sealing the encrypted-keys.
-
-4. Unlocking
-------------
-When the DIMMs are being enumerated by the kernel, the kernel will attempt to
-retrieve the key from the kernel user keyring. This is the only time
-a locked DIMM can be unlocked. Once unlocked, the DIMM will remain unlocked
-until reboot. Typically an entity (i.e. shell script) will inject all the
-relevant encrypted-keys into the kernel user keyring during the initramfs phase.
-This provides the unlock function access to all the related keys that contain
-the passphrase for the respective nvdimms. It is also recommended that the
-keys are injected before libnvdimm is loaded by modprobe.
-
-5. Update
----------
-When doing an update, it is expected that the existing key is removed from
-the kernel user keyring and reinjected as different (old) key. It's irrelevant
-what the key description is for the old key since we are only interested in the
-keyid when doing the update operation. It is also expected that the new key
-is injected with the description format described from earlier in this
-document. The update command written to the sysfs attribute will be with
-the format:
-update <old keyid> <new keyid>
-
-If there is no old keyid due to a security enabling, then a 0 should be
-passed in.
-
-6. Freeze
----------
-The freeze operation does not require any keys. The security config can be
-frozen by a user with root privelege.
-
-7. Disable
-----------
-The security disable command format is:
-disable <keyid>
-
-An key with the current passphrase payload that is tied to the nvdimm should be
-in the kernel user keyring.
-
-8. Secure Erase
----------------
-The command format for doing a secure erase is:
-erase <keyid>
-
-An key with the current passphrase payload that is tied to the nvdimm should be
-in the kernel user keyring.
-
-9. Overwrite
-------------
-The command format for doing an overwrite is:
-overwrite <keyid>
-
-Overwrite can be done without a key if security is not enabled. A key serial
-of 0 can be passed in to indicate no key.
-
-The sysfs attribute "security" can be polled to wait on overwrite completion.
-Overwrite can last tens of minutes or more depending on nvdimm size.
-
-An encrypted-key with the current user passphrase that is tied to the nvdimm
-should be injected and its keyid should be passed in via sysfs.
-
-10. Master Update
------------------
-The command format for doing a master update is:
-update <old keyid> <new keyid>
-
-The operating mechanism for master update is identical to update except the
-master passphrase key is passed to the kernel. The master passphrase key
-is just another encrypted-key.
-
-This command is only available when security is disabled.
-
-11. Master Erase
-----------------
-The command format for doing a master erase is:
-master_erase <current keyid>
-
-This command has the same operating mechanism as erase except the master
-passphrase key is passed to the kernel. The master passphrase key is just
-another encrypted-key.
-
-This command is only available when the master security is enabled, indicated
-by the extended security status.
-
-[1]: http://pmem.io/documents/NVDIMM_DSM_Interface-V1.8.pdf
-[2]: http://www.t13.org/documents/UploadedDocuments/docs2006/e05179r4-ACS-SecurityClarifications.pdf
diff --git a/Documentation/nvmem/nvmem.txt b/Documentation/nvmem/nvmem.txt
deleted file mode 100644
index fc2fe4b18655..000000000000
--- a/Documentation/nvmem/nvmem.txt
+++ /dev/null
@@ -1,183 +0,0 @@
- NVMEM SUBSYSTEM
- Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
-
-This document explains the NVMEM Framework along with the APIs provided,
-and how to use it.
-
-1. Introduction
-===============
-*NVMEM* is the abbreviation for Non Volatile Memory layer. It is used to
-retrieve configuration of SOC or Device specific data from non volatile
-memories like eeprom, efuses and so on.
-
-Before this framework existed, NVMEM drivers like eeprom were stored in
-drivers/misc, where they all had to duplicate pretty much the same code to
-register a sysfs file, allow in-kernel users to access the content of the
-devices they were driving, etc.
-
-This was also a problem as far as other in-kernel users were involved, since
-the solutions used were pretty much different from one driver to another, there
-was a rather big abstraction leak.
-
-This framework aims at solve these problems. It also introduces DT
-representation for consumer devices to go get the data they require (MAC
-Addresses, SoC/Revision ID, part numbers, and so on) from the NVMEMs. This
-framework is based on regmap, so that most of the abstraction available in
-regmap can be reused, across multiple types of buses.
-
-NVMEM Providers
-+++++++++++++++
-
-NVMEM provider refers to an entity that implements methods to initialize, read
-and write the non-volatile memory.
-
-2. Registering/Unregistering the NVMEM provider
-===============================================
-
-A NVMEM provider can register with NVMEM core by supplying relevant
-nvmem configuration to nvmem_register(), on success core would return a valid
-nvmem_device pointer.
-
-nvmem_unregister(nvmem) is used to unregister a previously registered provider.
-
-For example, a simple qfprom case:
-
-static struct nvmem_config econfig = {
- .name = "qfprom",
- .owner = THIS_MODULE,
-};
-
-static int qfprom_probe(struct platform_device *pdev)
-{
- ...
- econfig.dev = &pdev->dev;
- nvmem = nvmem_register(&econfig);
- ...
-}
-
-It is mandatory that the NVMEM provider has a regmap associated with its
-struct device. Failure to do would return error code from nvmem_register().
-
-Users of board files can define and register nvmem cells using the
-nvmem_cell_table struct:
-
-static struct nvmem_cell_info foo_nvmem_cells[] = {
- {
- .name = "macaddr",
- .offset = 0x7f00,
- .bytes = ETH_ALEN,
- }
-};
-
-static struct nvmem_cell_table foo_nvmem_cell_table = {
- .nvmem_name = "i2c-eeprom",
- .cells = foo_nvmem_cells,
- .ncells = ARRAY_SIZE(foo_nvmem_cells),
-};
-
-nvmem_add_cell_table(&foo_nvmem_cell_table);
-
-Additionally it is possible to create nvmem cell lookup entries and register
-them with the nvmem framework from machine code as shown in the example below:
-
-static struct nvmem_cell_lookup foo_nvmem_lookup = {
- .nvmem_name = "i2c-eeprom",
- .cell_name = "macaddr",
- .dev_id = "foo_mac.0",
- .con_id = "mac-address",
-};
-
-nvmem_add_cell_lookups(&foo_nvmem_lookup, 1);
-
-NVMEM Consumers
-+++++++++++++++
-
-NVMEM consumers are the entities which make use of the NVMEM provider to
-read from and to NVMEM.
-
-3. NVMEM cell based consumer APIs
-=================================
-
-NVMEM cells are the data entries/fields in the NVMEM.
-The NVMEM framework provides 3 APIs to read/write NVMEM cells.
-
-struct nvmem_cell *nvmem_cell_get(struct device *dev, const char *name);
-struct nvmem_cell *devm_nvmem_cell_get(struct device *dev, const char *name);
-
-void nvmem_cell_put(struct nvmem_cell *cell);
-void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
-
-void *nvmem_cell_read(struct nvmem_cell *cell, ssize_t *len);
-int nvmem_cell_write(struct nvmem_cell *cell, void *buf, ssize_t len);
-
-*nvmem_cell_get() apis will get a reference to nvmem cell for a given id,
-and nvmem_cell_read/write() can then read or write to the cell.
-Once the usage of the cell is finished the consumer should call *nvmem_cell_put()
-to free all the allocation memory for the cell.
-
-4. Direct NVMEM device based consumer APIs
-==========================================
-
-In some instances it is necessary to directly read/write the NVMEM.
-To facilitate such consumers NVMEM framework provides below apis.
-
-struct nvmem_device *nvmem_device_get(struct device *dev, const char *name);
-struct nvmem_device *devm_nvmem_device_get(struct device *dev,
- const char *name);
-void nvmem_device_put(struct nvmem_device *nvmem);
-int nvmem_device_read(struct nvmem_device *nvmem, unsigned int offset,
- size_t bytes, void *buf);
-int nvmem_device_write(struct nvmem_device *nvmem, unsigned int offset,
- size_t bytes, void *buf);
-int nvmem_device_cell_read(struct nvmem_device *nvmem,
- struct nvmem_cell_info *info, void *buf);
-int nvmem_device_cell_write(struct nvmem_device *nvmem,
- struct nvmem_cell_info *info, void *buf);
-
-Before the consumers can read/write NVMEM directly, it should get hold
-of nvmem_controller from one of the *nvmem_device_get() api.
-
-The difference between these apis and cell based apis is that these apis always
-take nvmem_device as parameter.
-
-5. Releasing a reference to the NVMEM
-=====================================
-
-When a consumer no longer needs the NVMEM, it has to release the reference
-to the NVMEM it has obtained using the APIs mentioned in the above section.
-The NVMEM framework provides 2 APIs to release a reference to the NVMEM.
-
-void nvmem_cell_put(struct nvmem_cell *cell);
-void devm_nvmem_cell_put(struct device *dev, struct nvmem_cell *cell);
-void nvmem_device_put(struct nvmem_device *nvmem);
-void devm_nvmem_device_put(struct device *dev, struct nvmem_device *nvmem);
-
-Both these APIs are used to release a reference to the NVMEM and
-devm_nvmem_cell_put and devm_nvmem_device_put destroys the devres associated
-with this NVMEM.
-
-Userspace
-+++++++++
-
-6. Userspace binary interface
-==============================
-
-Userspace can read/write the raw NVMEM file located at
-/sys/bus/nvmem/devices/*/nvmem
-
-ex:
-
-hexdump /sys/bus/nvmem/devices/qfprom0/nvmem
-
-0000000 0000 0000 0000 0000 0000 0000 0000 0000
-*
-00000a0 db10 2240 0000 e000 0c00 0c00 0000 0c00
-0000000 0000 0000 0000 0000 0000 0000 0000 0000
-...
-*
-0001000
-
-7. DeviceTree Binding
-=====================
-
-See Documentation/devicetree/bindings/nvmem/nvmem.txt
diff --git a/Documentation/pcmcia/index.rst b/Documentation/pcmcia/index.rst
index 779c8527109e..7ae1f62fca14 100644
--- a/Documentation/pcmcia/index.rst
+++ b/Documentation/pcmcia/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
======
pcmcia
diff --git a/Documentation/perf/arm-ccn.txt b/Documentation/perf/arm-ccn.txt
deleted file mode 100644
index 15cdb7bc57c3..000000000000
--- a/Documentation/perf/arm-ccn.txt
+++ /dev/null
@@ -1,59 +0,0 @@
-ARM Cache Coherent Network
-==========================
-
-CCN-504 is a ring-bus interconnect consisting of 11 crosspoints
-(XPs), with each crosspoint supporting up to two device ports,
-so nodes (devices) 0 and 1 are connected to crosspoint 0,
-nodes 2 and 3 to crosspoint 1 etc.
-
-PMU (perf) driver
------------------
-
-The CCN driver registers a perf PMU driver, which provides
-description of available events and configuration options
-in sysfs, see /sys/bus/event_source/devices/ccn*.
-
-The "format" directory describes format of the config, config1
-and config2 fields of the perf_event_attr structure. The "events"
-directory provides configuration templates for all documented
-events, that can be used with perf tool. For example "xp_valid_flit"
-is an equivalent of "type=0x8,event=0x4". Other parameters must be
-explicitly specified.
-
-For events originating from device, "node" defines its index.
-
-Crosspoint PMU events require "xp" (index), "bus" (bus number)
-and "vc" (virtual channel ID).
-
-Crosspoint watchpoint-based events (special "event" value 0xfe)
-require "xp" and "vc" as as above plus "port" (device port index),
-"dir" (transmit/receive direction), comparator values ("cmp_l"
-and "cmp_h") and "mask", being index of the comparator mask.
-Masks are defined separately from the event description
-(due to limited number of the config values) in the "cmp_mask"
-directory, with first 8 configurable by user and additional
-4 hardcoded for the most frequent use cases.
-
-Cycle counter is described by a "type" value 0xff and does
-not require any other settings.
-
-The driver also provides a "cpumask" sysfs attribute, which contains
-a single CPU ID, of the processor which will be used to handle all
-the CCN PMU events. It is recommended that the user space tools
-request the events on this processor (if not, the perf_event->cpu value
-will be overwritten anyway). In case of this processor being offlined,
-the events are migrated to another one and the attribute is updated.
-
-Example of perf tool use:
-
-/ # perf list | grep ccn
- ccn/cycles/ [Kernel PMU event]
-<...>
- ccn/xp_valid_flit,xp=?,port=?,vc=?,dir=?/ [Kernel PMU event]
-<...>
-
-/ # perf stat -a -e ccn/cycles/,ccn/xp_valid_flit,xp=1,port=0,vc=1,dir=1/ \
- sleep 1
-
-The driver does not support sampling, therefore "perf record" will
-not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/perf/arm_dsu_pmu.txt b/Documentation/perf/arm_dsu_pmu.txt
deleted file mode 100644
index d611e15f5add..000000000000
--- a/Documentation/perf/arm_dsu_pmu.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-ARM DynamIQ Shared Unit (DSU) PMU
-==================================
-
-ARM DynamIQ Shared Unit integrates one or more cores with an L3 memory system,
-control logic and external interfaces to form a multicore cluster. The PMU
-allows counting the various events related to the L3 cache, Snoop Control Unit
-etc, using 32bit independent counters. It also provides a 64bit cycle counter.
-
-The PMU can only be accessed via CPU system registers and are common to the
-cores connected to the same DSU. Like most of the other uncore PMUs, DSU
-PMU doesn't support process specific events and cannot be used in sampling mode.
-
-The DSU provides a bitmap for a subset of implemented events via hardware
-registers. There is no way for the driver to determine if the other events
-are available or not. Hence the driver exposes only those events advertised
-by the DSU, in "events" directory under :
-
- /sys/bus/event_sources/devices/arm_dsu_<N>/
-
-The user should refer to the TRM of the product to figure out the supported events
-and use the raw event code for the unlisted events.
-
-The driver also exposes the CPUs connected to the DSU instance in "associated_cpus".
-
-
-e.g usage :
-
- perf stat -a -e arm_dsu_0/cycles/
diff --git a/Documentation/perf/hisi-pmu.txt b/Documentation/perf/hisi-pmu.txt
deleted file mode 100644
index 267a028b2741..000000000000
--- a/Documentation/perf/hisi-pmu.txt
+++ /dev/null
@@ -1,53 +0,0 @@
-HiSilicon SoC uncore Performance Monitoring Unit (PMU)
-======================================================
-The HiSilicon SoC chip includes various independent system device PMUs
-such as L3 cache (L3C), Hydra Home Agent (HHA) and DDRC. These PMUs are
-independent and have hardware logic to gather statistics and performance
-information.
-
-The HiSilicon SoC encapsulates multiple CPU and IO dies. Each CPU cluster
-(CCL) is made up of 4 cpu cores sharing one L3 cache; each CPU die is
-called Super CPU cluster (SCCL) and is made up of 6 CCLs. Each SCCL has
-two HHAs (0 - 1) and four DDRCs (0 - 3), respectively.
-
-HiSilicon SoC uncore PMU driver
----------------------------------------
-Each device PMU has separate registers for event counting, control and
-interrupt, and the PMU driver shall register perf PMU drivers like L3C,
-HHA and DDRC etc. The available events and configuration options shall
-be described in the sysfs, see :
-/sys/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>/, or
-/sys/bus/event_source/devices/hisi_sccl{X}_<l3c{Y}/hha{Y}/ddrc{Y}>.
-The "perf list" command shall list the available events from sysfs.
-
-Each L3C, HHA and DDRC is registered as a separate PMU with perf. The PMU
-name will appear in event listing as hisi_sccl<sccl-id>_module<index-id>.
-where "sccl-id" is the identifier of the SCCL and "index-id" is the index of
-module.
-e.g. hisi_sccl3_l3c0/rd_hit_cpipe is READ_HIT_CPIPE event of L3C index #0 in
-SCCL ID #3.
-e.g. hisi_sccl1_hha0/rx_operations is RX_OPERATIONS event of HHA index #0 in
-SCCL ID #1.
-
-The driver also provides a "cpumask" sysfs attribute, which shows the CPU core
-ID used to count the uncore PMU event.
-
-Example usage of perf:
-$# perf list
-hisi_sccl3_l3c0/rd_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-hisi_sccl3_l3c0/wr_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-hisi_sccl1_l3c0/rd_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-hisi_sccl1_l3c0/wr_hit_cpipe/ [kernel PMU event]
-------------------------------------------
-
-$# perf stat -a -e hisi_sccl3_l3c0/rd_hit_cpipe/ sleep 5
-$# perf stat -a -e hisi_sccl3_l3c0/config=0x02/ sleep 5
-
-The current driver does not support sampling. So "perf record" is unsupported.
-Also attach to a task is unsupported as the events are all uncore.
-
-Note: Please contact the maintainer for a complete list of events supported for
-the PMU devices in the SoC and its information if needed.
diff --git a/Documentation/perf/qcom_l2_pmu.txt b/Documentation/perf/qcom_l2_pmu.txt
deleted file mode 100644
index b25b97659ab9..000000000000
--- a/Documentation/perf/qcom_l2_pmu.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-Qualcomm Technologies Level-2 Cache Performance Monitoring Unit (PMU)
-=====================================================================
-
-This driver supports the L2 cache clusters found in Qualcomm Technologies
-Centriq SoCs. There are multiple physical L2 cache clusters, each with their
-own PMU. Each cluster has one or more CPUs associated with it.
-
-There is one logical L2 PMU exposed, which aggregates the results from
-the physical PMUs.
-
-The driver provides a description of its available events and configuration
-options in sysfs, see /sys/devices/l2cache_0.
-
-The "format" directory describes the format of the events.
-
-Events can be envisioned as a 2-dimensional array. Each column represents
-a group of events. There are 8 groups. Only one entry from each
-group can be in use at a time. If multiple events from the same group
-are specified, the conflicting events cannot be counted at the same time.
-
-Events are specified as 0xCCG, where CC is 2 hex digits specifying
-the code (array row) and G specifies the group (column) 0-7.
-
-In addition there is a cycle counter event specified by the value 0xFE
-which is outside the above scheme.
-
-The driver provides a "cpumask" sysfs attribute which contains a mask
-consisting of one CPU per cluster which will be used to handle all the PMU
-events on that cluster.
-
-Examples for use with perf:
-
- perf stat -e l2cache_0/config=0x001/,l2cache_0/config=0x042/ -a sleep 1
-
- perf stat -e l2cache_0/config=0xfe/ -C 2 sleep 1
-
-The driver does not support sampling, therefore "perf record" will
-not work. Per-task perf sessions are not supported.
diff --git a/Documentation/perf/qcom_l3_pmu.txt b/Documentation/perf/qcom_l3_pmu.txt
deleted file mode 100644
index 96b3a9444a0d..000000000000
--- a/Documentation/perf/qcom_l3_pmu.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-Qualcomm Datacenter Technologies L3 Cache Performance Monitoring Unit (PMU)
-===========================================================================
-
-This driver supports the L3 cache PMUs found in Qualcomm Datacenter Technologies
-Centriq SoCs. The L3 cache on these SOCs is composed of multiple slices, shared
-by all cores within a socket. Each slice is exposed as a separate uncore perf
-PMU with device name l3cache_<socket>_<instance>. User space is responsible
-for aggregating across slices.
-
-The driver provides a description of its available events and configuration
-options in sysfs, see /sys/devices/l3cache*. Given that these are uncore PMUs
-the driver also exposes a "cpumask" sysfs attribute which contains a mask
-consisting of one CPU per socket which will be used to handle all the PMU
-events on that socket.
-
-The hardware implements 32bit event counters and has a flat 8bit event space
-exposed via the "event" format attribute. In addition to the 32bit physical
-counters the driver supports virtual 64bit hardware counters by using hardware
-counter chaining. This feature is exposed via the "lc" (long counter) format
-flag. E.g.:
-
- perf stat -e l3cache_0_0/read-miss,lc/
-
-Given that these are uncore PMUs the driver does not support sampling, therefore
-"perf record" will not work. Per-task perf sessions are not supported.
diff --git a/Documentation/perf/thunderx2-pmu.txt b/Documentation/perf/thunderx2-pmu.txt
deleted file mode 100644
index dffc57143736..000000000000
--- a/Documentation/perf/thunderx2-pmu.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-Cavium ThunderX2 SoC Performance Monitoring Unit (PMU UNCORE)
-=============================================================
-
-The ThunderX2 SoC PMU consists of independent, system-wide, per-socket
-PMUs such as the Level 3 Cache (L3C) and DDR4 Memory Controller (DMC).
-
-The DMC has 8 interleaved channels and the L3C has 16 interleaved tiles.
-Events are counted for the default channel (i.e. channel 0) and prorated
-to the total number of channels/tiles.
-
-The DMC and L3C support up to 4 counters. Counters are independently
-programmable and can be started and stopped individually. Each counter
-can be set to a different event. Counters are 32-bit and do not support
-an overflow interrupt; they are read every 2 seconds.
-
-PMU UNCORE (perf) driver:
-
-The thunderx2_pmu driver registers per-socket perf PMUs for the DMC and
-L3C devices. Each PMU can be used to count up to 4 events
-simultaneously. The PMUs provide a description of their available events
-and configuration options under sysfs, see
-/sys/devices/uncore_<l3c_S/dmc_S/>; S is the socket id.
-
-The driver does not support sampling, therefore "perf record" will not
-work. Per-task perf sessions are also not supported.
-
-Examples:
-
-# perf stat -a -e uncore_dmc_0/cnt_cycles/ sleep 1
-
-# perf stat -a -e \
-uncore_dmc_0/cnt_cycles/,\
-uncore_dmc_0/data_transfers/,\
-uncore_dmc_0/read_txns/,\
-uncore_dmc_0/write_txns/ sleep 1
-
-# perf stat -a -e \
-uncore_l3c_0/read_request/,\
-uncore_l3c_0/read_hit/,\
-uncore_l3c_0/inv_request/,\
-uncore_l3c_0/inv_hit/ sleep 1
diff --git a/Documentation/perf/xgene-pmu.txt b/Documentation/perf/xgene-pmu.txt
deleted file mode 100644
index d7cff4454e5b..000000000000
--- a/Documentation/perf/xgene-pmu.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-APM X-Gene SoC Performance Monitoring Unit (PMU)
-================================================
-
-X-Gene SoC PMU consists of various independent system device PMUs such as
-L3 cache(s), I/O bridge(s), memory controller bridge(s) and memory
-controller(s). These PMU devices are loosely architected to follow the
-same model as the PMU for ARM cores. The PMUs share the same top level
-interrupt and status CSR region.
-
-PMU (perf) driver
------------------
-
-The xgene-pmu driver registers several perf PMU drivers. Each of the perf
-driver provides description of its available events and configuration options
-in sysfs, see /sys/devices/<l3cX/iobX/mcbX/mcX>/.
-
-The "format" directory describes format of the config (event ID),
-config1 (agent ID) fields of the perf_event_attr structure. The "events"
-directory provides configuration templates for all supported event types that
-can be used with perf tool. For example, "l3c0/bank-fifo-full/" is an
-equivalent of "l3c0/config=0x0b/".
-
-Most of the SoC PMU has a specific list of agent ID used for monitoring
-performance of a specific datapath. For example, agents of a L3 cache can be
-a specific CPU or an I/O bridge. Each PMU has a set of 2 registers capable of
-masking the agents from which the request come from. If the bit with
-the bit number corresponding to the agent is set, the event is counted only if
-it is caused by a request from that agent. Each agent ID bit is inversely mapped
-to a corresponding bit in "config1" field. By default, the event will be
-counted for all agent requests (config1 = 0x0). For all the supported agents of
-each PMU, please refer to APM X-Gene User Manual.
-
-Each perf driver also provides a "cpumask" sysfs attribute, which contains a
-single CPU ID of the processor which will be used to handle all the PMU events.
-
-Example for perf tool use:
-
- / # perf list | grep -e l3c -e iob -e mcb -e mc
- l3c0/ackq-full/ [Kernel PMU event]
- <...>
- mcb1/mcb-csw-stall/ [Kernel PMU event]
-
- / # perf stat -a -e l3c0/read-miss/,mcb1/csw-write-request/ sleep 1
-
- / # perf stat -a -e l3c0/read-miss,config1=0xfffffffffffffffe/ sleep 1
-
-The driver does not support sampling, therefore "perf record" will
-not work. Per-task (without "-a") perf sessions are not supported.
diff --git a/Documentation/phy/samsung-usb2.txt b/Documentation/phy/samsung-usb2.txt
deleted file mode 100644
index ed12d437189d..000000000000
--- a/Documentation/phy/samsung-usb2.txt
+++ /dev/null
@@ -1,135 +0,0 @@
-.------------------------------------------------------------------------------+
-| Samsung USB 2.0 PHY adaptation layer |
-+-----------------------------------------------------------------------------+'
-
-| 1. Description
-+----------------
-
-The architecture of the USB 2.0 PHY module in Samsung SoCs is similar
-among many SoCs. In spite of the similarities it proved difficult to
-create a one driver that would fit all these PHY controllers. Often
-the differences were minor and were found in particular bits of the
-registers of the PHY. In some rare cases the order of register writes or
-the PHY powering up process had to be altered. This adaptation layer is
-a compromise between having separate drivers and having a single driver
-with added support for many special cases.
-
-| 2. Files description
-+----------------------
-
-- phy-samsung-usb2.c
- This is the main file of the adaptation layer. This file contains
- the probe function and provides two callbacks to the Generic PHY
- Framework. This two callbacks are used to power on and power off the
- phy. They carry out the common work that has to be done on all version
- of the PHY module. Depending on which SoC was chosen they execute SoC
- specific callbacks. The specific SoC version is selected by choosing
- the appropriate compatible string. In addition, this file contains
- struct of_device_id definitions for particular SoCs.
-
-- phy-samsung-usb2.h
- This is the include file. It declares the structures used by this
- driver. In addition it should contain extern declarations for
- structures that describe particular SoCs.
-
-| 3. Supporting SoCs
-+--------------------
-
-To support a new SoC a new file should be added to the drivers/phy
-directory. Each SoC's configuration is stored in an instance of the
-struct samsung_usb2_phy_config.
-
-struct samsung_usb2_phy_config {
- const struct samsung_usb2_common_phy *phys;
- int (*rate_to_clk)(unsigned long, u32 *);
- unsigned int num_phys;
- bool has_mode_switch;
-};
-
-The num_phys is the number of phys handled by the driver. *phys is an
-array that contains the configuration for each phy. The has_mode_switch
-property is a boolean flag that determines whether the SoC has USB host
-and device on a single pair of pins. If so, a special register has to
-be modified to change the internal routing of these pins between a USB
-device or host module.
-
-For example the configuration for Exynos 4210 is following:
-
-const struct samsung_usb2_phy_config exynos4210_usb2_phy_config = {
- .has_mode_switch = 0,
- .num_phys = EXYNOS4210_NUM_PHYS,
- .phys = exynos4210_phys,
- .rate_to_clk = exynos4210_rate_to_clk,
-}
-
-- int (*rate_to_clk)(unsigned long, u32 *)
- The rate_to_clk callback is to convert the rate of the clock
- used as the reference clock for the PHY module to the value
- that should be written in the hardware register.
-
-The exynos4210_phys configuration array is as follows:
-
-static const struct samsung_usb2_common_phy exynos4210_phys[] = {
- {
- .label = "device",
- .id = EXYNOS4210_DEVICE,
- .power_on = exynos4210_power_on,
- .power_off = exynos4210_power_off,
- },
- {
- .label = "host",
- .id = EXYNOS4210_HOST,
- .power_on = exynos4210_power_on,
- .power_off = exynos4210_power_off,
- },
- {
- .label = "hsic0",
- .id = EXYNOS4210_HSIC0,
- .power_on = exynos4210_power_on,
- .power_off = exynos4210_power_off,
- },
- {
- .label = "hsic1",
- .id = EXYNOS4210_HSIC1,
- .power_on = exynos4210_power_on,
- .power_off = exynos4210_power_off,
- },
- {},
-};
-
-- int (*power_on)(struct samsung_usb2_phy_instance *);
-- int (*power_off)(struct samsung_usb2_phy_instance *);
- These two callbacks are used to power on and power off the phy
- by modifying appropriate registers.
-
-Final change to the driver is adding appropriate compatible value to the
-phy-samsung-usb2.c file. In case of Exynos 4210 the following lines were
-added to the struct of_device_id samsung_usb2_phy_of_match[] array:
-
-#ifdef CONFIG_PHY_EXYNOS4210_USB2
- {
- .compatible = "samsung,exynos4210-usb2-phy",
- .data = &exynos4210_usb2_phy_config,
- },
-#endif
-
-To add further flexibility to the driver the Kconfig file enables to
-include support for selected SoCs in the compiled driver. The Kconfig
-entry for Exynos 4210 is following:
-
-config PHY_EXYNOS4210_USB2
- bool "Support for Exynos 4210"
- depends on PHY_SAMSUNG_USB2
- depends on CPU_EXYNOS4210
- help
- Enable USB PHY support for Exynos 4210. This option requires that
- Samsung USB 2.0 PHY driver is enabled and means that support for this
- particular SoC is compiled in the driver. In case of Exynos 4210 four
- phys are available - device, host, HSCI0 and HSCI1.
-
-The newly created file that supports the new SoC has to be also added to the
-Makefile. In case of Exynos 4210 the added line is following:
-
-obj-$(CONFIG_PHY_EXYNOS4210_USB2) += phy-exynos4210-usb2.o
-
-After completing these steps the support for the new SoC should be ready.
diff --git a/Documentation/pi-futex.txt b/Documentation/pi-futex.txt
index b154f6c0c36e..c33ba2befbf8 100644
--- a/Documentation/pi-futex.txt
+++ b/Documentation/pi-futex.txt
@@ -119,4 +119,4 @@ properties of futexes, and all four combinations are possible: futex,
robust-futex, PI-futex, robust+PI-futex.
More details about priority inheritance can be found in
-Documentation/locking/rt-mutex.txt.
+Documentation/locking/rt-mutex.rst.
diff --git a/Documentation/power/apm-acpi.rst b/Documentation/power/apm-acpi.rst
new file mode 100644
index 000000000000..5b90d947126d
--- /dev/null
+++ b/Documentation/power/apm-acpi.rst
@@ -0,0 +1,36 @@
+============
+APM or ACPI?
+============
+
+If you have a relatively recent x86 mobile, desktop, or server system,
+odds are it supports either Advanced Power Management (APM) or
+Advanced Configuration and Power Interface (ACPI). ACPI is the newer
+of the two technologies and puts power management in the hands of the
+operating system, allowing for more intelligent power management than
+is possible with BIOS controlled APM.
+
+The best way to determine which, if either, your system supports is to
+build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
+enabled by default). If a working ACPI implementation is found, the
+ACPI driver will override and disable APM, otherwise the APM driver
+will be used.
+
+No, sorry, you cannot have both ACPI and APM enabled and running at
+once. Some people with broken ACPI or broken APM implementations
+would like to use both to get a full set of working features, but you
+simply cannot mix and match the two. Only one power management
+interface can be in control of the machine at once. Think about it..
+
+User-space Daemons
+------------------
+Both APM and ACPI rely on user-space daemons, apmd and acpid
+respectively, to be completely functional. Obtain both of these
+daemons from your Linux distribution or from the Internet (see below)
+and be sure that they are started sometime in the system boot process.
+Go ahead and start both. If ACPI or APM is not available on your
+system the associated daemon will exit gracefully.
+
+ ===== =======================================
+ apmd http://ftp.debian.org/pool/main/a/apmd/
+ acpid http://acpid.sf.net/
+ ===== =======================================
diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt
deleted file mode 100644
index 6cc423d3662e..000000000000
--- a/Documentation/power/apm-acpi.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-APM or ACPI?
-------------
-If you have a relatively recent x86 mobile, desktop, or server system,
-odds are it supports either Advanced Power Management (APM) or
-Advanced Configuration and Power Interface (ACPI). ACPI is the newer
-of the two technologies and puts power management in the hands of the
-operating system, allowing for more intelligent power management than
-is possible with BIOS controlled APM.
-
-The best way to determine which, if either, your system supports is to
-build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is
-enabled by default). If a working ACPI implementation is found, the
-ACPI driver will override and disable APM, otherwise the APM driver
-will be used.
-
-No, sorry, you cannot have both ACPI and APM enabled and running at
-once. Some people with broken ACPI or broken APM implementations
-would like to use both to get a full set of working features, but you
-simply cannot mix and match the two. Only one power management
-interface can be in control of the machine at once. Think about it..
-
-User-space Daemons
-------------------
-Both APM and ACPI rely on user-space daemons, apmd and acpid
-respectively, to be completely functional. Obtain both of these
-daemons from your Linux distribution or from the Internet (see below)
-and be sure that they are started sometime in the system boot process.
-Go ahead and start both. If ACPI or APM is not available on your
-system the associated daemon will exit gracefully.
-
- apmd: http://ftp.debian.org/pool/main/a/apmd/
- acpid: http://acpid.sf.net/
diff --git a/Documentation/power/basic-pm-debugging.rst b/Documentation/power/basic-pm-debugging.rst
new file mode 100644
index 000000000000..69862e759c30
--- /dev/null
+++ b/Documentation/power/basic-pm-debugging.rst
@@ -0,0 +1,269 @@
+=================================
+Debugging hibernation and suspend
+=================================
+
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Testing hibernation (aka suspend to disk or STD)
+===================================================
+
+To check if hibernation works, you can try to hibernate in the "reboot" mode::
+
+ # echo reboot > /sys/power/disk
+ # echo disk > /sys/power/state
+
+and the system should create a hibernation image, reboot, resume and get back to
+the command prompt where you have started the transition. If that happens,
+hibernation is most likely to work correctly. Still, you need to repeat the
+test at least a couple of times in a row for confidence. [This is necessary,
+because some problems only show up on a second attempt at suspending and
+resuming the system.] Moreover, hibernating in the "reboot" and "shutdown"
+modes causes the PM core to skip some platform-related callbacks which on ACPI
+systems might be necessary to make hibernation work. Thus, if your machine
+fails to hibernate or resume in the "reboot" mode, you should try the
+"platform" mode::
+
+ # echo platform > /sys/power/disk
+ # echo disk > /sys/power/state
+
+which is the default and recommended mode of hibernation.
+
+Unfortunately, the "platform" mode of hibernation does not work on some systems
+with broken BIOSes. In such cases the "shutdown" mode of hibernation might
+work::
+
+ # echo shutdown > /sys/power/disk
+ # echo disk > /sys/power/state
+
+(it is similar to the "reboot" mode, but it requires you to press the power
+button to make the system resume).
+
+If neither "platform" nor "shutdown" hibernation mode works, you will need to
+identify what goes wrong.
+
+a) Test modes of hibernation
+----------------------------
+
+To find out why hibernation fails on your system, you can use a special testing
+facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then,
+there is the file /sys/power/pm_test that can be used to make the hibernation
+core run in a test mode. There are 5 test modes available:
+
+freezer
+ - test the freezing of processes
+
+devices
+ - test the freezing of processes and suspending of devices
+
+platform
+ - test the freezing of processes, suspending of devices and platform
+ global control methods [1]_
+
+processors
+ - test the freezing of processes, suspending of devices, platform
+ global control methods [1]_ and the disabling of nonboot CPUs
+
+core
+ - test the freezing of processes, suspending of devices, platform global
+ control methods\ [1]_, the disabling of nonboot CPUs and suspending
+ of platform/system devices
+
+.. [1]
+
+ the platform global control methods are only available on ACPI systems
+ and are only tested if the hibernation mode is set to "platform"
+
+To use one of them it is necessary to write the corresponding string to
+/sys/power/pm_test (eg. "devices" to test the freezing of processes and
+suspending devices) and issue the standard hibernation commands. For example,
+to use the "devices" test mode along with the "platform" mode of hibernation,
+you should do the following::
+
+ # echo devices > /sys/power/pm_test
+ # echo platform > /sys/power/disk
+ # echo disk > /sys/power/state
+
+Then, the kernel will try to freeze processes, suspend devices, wait a few
+seconds (5 by default, but configurable by the suspend.pm_test_delay module
+parameter), resume devices and thaw processes. If "platform" is written to
+/sys/power/pm_test , then after suspending devices the kernel will additionally
+invoke the global control methods (eg. ACPI global control methods) used to
+prepare the platform firmware for hibernation. Next, it will wait a
+configurable number of seconds and invoke the platform (eg. ACPI) global
+methods used to cancel hibernation etc.
+
+Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
+hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test
+contains a space-separated list of all available tests (including "none" that
+represents the normal functionality) in which the current test level is
+indicated by square brackets.
+
+Generally, as you can see, each test level is more "invasive" than the previous
+one and the "core" level tests the hardware and drivers as deeply as possible
+without creating a hibernation image. Obviously, if the "devices" test fails,
+the "platform" test will fail as well and so on. Thus, as a rule of thumb, you
+should try the test modes starting from "freezer", through "devices", "platform"
+and "processors" up to "core" (repeat the test on each level a couple of times
+to make sure that any random factors are avoided).
+
+If the "freezer" test fails, there is a task that cannot be frozen (in that case
+it usually is possible to identify the offending task by analysing the output of
+dmesg obtained after the failing test). Failure at this level usually means
+that there is a problem with the tasks freezer subsystem that should be
+reported.
+
+If the "devices" test fails, most likely there is a driver that cannot suspend
+or resume its device (in the latter case the system may hang or become unstable
+after the test, so please take that into consideration). To find this driver,
+you can carry out a binary search according to the rules:
+
+- if the test fails, unload a half of the drivers currently loaded and repeat
+ (that would probably involve rebooting the system, so always note what drivers
+ have been loaded before the test),
+- if the test succeeds, load a half of the drivers you have unloaded most
+ recently and repeat.
+
+Once you have found the failing driver (there can be more than just one of
+them), you have to unload it every time before hibernation. In that case please
+make sure to report the problem with the driver.
+
+It is also possible that the "devices" test will still fail after you have
+unloaded all modules. In that case, you may want to look in your kernel
+configuration for the drivers that can be compiled as modules (and test again
+with these drivers compiled as modules). You may also try to use some special
+kernel command line options such as "noapic", "noacpi" or even "acpi=off".
+
+If the "platform" test fails, there is a problem with the handling of the
+platform (eg. ACPI) firmware on your system. In that case the "platform" mode
+of hibernation is not likely to work. You can try the "shutdown" mode, but that
+is rather a poor man's workaround.
+
+If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
+work (of course, this only may be an issue on SMP systems) and the problem
+should be reported. In that case you can also try to switch the nonboot CPUs
+off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
+see if that works.
+
+If the "core" test fails, which means that suspending of the system/platform
+devices has failed (these devices are suspended on one CPU with interrupts off),
+the problem is most probably hardware-related and serious, so it should be
+reported.
+
+A failure of any of the "platform", "processors" or "core" tests may cause your
+system to hang or become unstable, so please beware. Such a failure usually
+indicates a serious problem that very well may be related to the hardware, but
+please report it anyway.
+
+b) Testing minimal configuration
+--------------------------------
+
+If all of the hibernation test modes work, you can boot the system with the
+"init=/bin/bash" command line parameter and attempt to hibernate in the
+"reboot", "shutdown" and "platform" modes. If that does not work, there
+probably is a problem with a driver statically compiled into the kernel and you
+can try to compile more drivers as modules, so that they can be tested
+individually. Otherwise, there is a problem with a modular driver and you can
+find it by loading a half of the modules you normally use and binary searching
+in accordance with the algorithm:
+- if there are n modules loaded and the attempt to suspend and resume fails,
+unload n/2 of the modules and try again (that would probably involve rebooting
+the system),
+- if there are n modules loaded and the attempt to suspend and resume succeeds,
+load n/2 modules more and try again.
+
+Again, if you find the offending module(s), it(they) must be unloaded every time
+before hibernation, and please report the problem with it(them).
+
+c) Using the "test_resume" hibernation option
+---------------------------------------------
+
+/sys/power/disk generally tells the kernel what to do after creating a
+hibernation image. One of the available options is "test_resume" which
+causes the just created image to be used for immediate restoration. Namely,
+after doing::
+
+ # echo test_resume > /sys/power/disk
+ # echo disk > /sys/power/state
+
+a hibernation image will be created and a resume from it will be triggered
+immediately without involving the platform firmware in any way.
+
+That test can be used to check if failures to resume from hibernation are
+related to bad interactions with the platform firmware. That is, if the above
+works every time, but resume from actual hibernation does not work or is
+unreliable, the platform firmware may be responsible for the failures.
+
+On architectures and platforms that support using different kernels to restore
+hibernation images (that is, the kernel used to read the image from storage and
+load it into memory is different from the one included in the image) or support
+kernel address space randomization, it also can be used to check if failures
+to resume may be related to the differences between the restore and image
+kernels.
+
+d) Advanced debugging
+---------------------
+
+In case that hibernation does not work on your system even in the minimal
+configuration and compiling more drivers as modules is not practical or some
+modules cannot be unloaded, you can use one of the more advanced debugging
+techniques to find the problem. First, if there is a serial port in your box,
+you can boot the kernel with the 'no_console_suspend' parameter and try to log
+kernel messages using the serial console. This may provide you with some
+information about the reasons of the suspend (resume) failure. Alternatively,
+it may be possible to use a FireWire port for debugging with firescope
+(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to
+use the PM_TRACE mechanism documented in Documentation/power/s2ram.rst .
+
+2. Testing suspend to RAM (STR)
+===============================
+
+To verify that the STR works, it is generally more convenient to use the s2ram
+tool available from http://suspend.sf.net and documented at
+http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
+
+Namely, after writing "freezer", "devices", "platform", "processors", or "core"
+into /sys/power/pm_test (available if the kernel is compiled with
+CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
+to given string. The STR test modes are defined in the same way as for
+hibernation, so please refer to Section 1 for more information about them. In
+particular, the "core" test allows you to test everything except for the actual
+invocation of the platform firmware in order to put the system into the sleep
+state.
+
+Among other things, the testing with the help of /sys/power/pm_test may allow
+you to identify drivers that fail to suspend or resume their devices. They
+should be unloaded every time before an STR transition.
+
+Next, you can follow the instructions at S2RAM_LINK to test the system, but if
+it does not work "out of the box", you may need to boot it with
+"init=/bin/bash" and test s2ram in the minimal configuration. In that case,
+you may be able to search for failing drivers by following the procedure
+analogous to the one described in section 1. If you find some failing drivers,
+you will have to unload them every time before an STR transition (ie. before
+you run s2ram), and please report the problems with them.
+
+There is a debugfs entry which shows the suspend to RAM statistics. Here is an
+example of its output::
+
+ # mount -t debugfs none /sys/kernel/debug
+ # cat /sys/kernel/debug/suspend_stats
+ success: 20
+ fail: 5
+ failed_freeze: 0
+ failed_prepare: 0
+ failed_suspend: 5
+ failed_suspend_noirq: 0
+ failed_resume: 0
+ failed_resume_noirq: 0
+ failures:
+ last_failed_dev: alarm
+ adc
+ last_failed_errno: -16
+ -16
+ last_failed_step: suspend
+ suspend
+
+Field success means the success number of suspend to RAM, and field fail means
+the failure number. Others are the failure number of different steps of suspend
+to RAM. suspend_stats just lists the last 2 failed devices, error number and
+failed step of suspend.
diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt
deleted file mode 100644
index 708f87f78a75..000000000000
--- a/Documentation/power/basic-pm-debugging.txt
+++ /dev/null
@@ -1,254 +0,0 @@
-Debugging hibernation and suspend
- (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Testing hibernation (aka suspend to disk or STD)
-
-To check if hibernation works, you can try to hibernate in the "reboot" mode:
-
-# echo reboot > /sys/power/disk
-# echo disk > /sys/power/state
-
-and the system should create a hibernation image, reboot, resume and get back to
-the command prompt where you have started the transition. If that happens,
-hibernation is most likely to work correctly. Still, you need to repeat the
-test at least a couple of times in a row for confidence. [This is necessary,
-because some problems only show up on a second attempt at suspending and
-resuming the system.] Moreover, hibernating in the "reboot" and "shutdown"
-modes causes the PM core to skip some platform-related callbacks which on ACPI
-systems might be necessary to make hibernation work. Thus, if your machine fails
-to hibernate or resume in the "reboot" mode, you should try the "platform" mode:
-
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-which is the default and recommended mode of hibernation.
-
-Unfortunately, the "platform" mode of hibernation does not work on some systems
-with broken BIOSes. In such cases the "shutdown" mode of hibernation might
-work:
-
-# echo shutdown > /sys/power/disk
-# echo disk > /sys/power/state
-
-(it is similar to the "reboot" mode, but it requires you to press the power
-button to make the system resume).
-
-If neither "platform" nor "shutdown" hibernation mode works, you will need to
-identify what goes wrong.
-
-a) Test modes of hibernation
-
-To find out why hibernation fails on your system, you can use a special testing
-facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then,
-there is the file /sys/power/pm_test that can be used to make the hibernation
-core run in a test mode. There are 5 test modes available:
-
-freezer
-- test the freezing of processes
-
-devices
-- test the freezing of processes and suspending of devices
-
-platform
-- test the freezing of processes, suspending of devices and platform
- global control methods(*)
-
-processors
-- test the freezing of processes, suspending of devices, platform
- global control methods(*) and the disabling of nonboot CPUs
-
-core
-- test the freezing of processes, suspending of devices, platform global
- control methods(*), the disabling of nonboot CPUs and suspending of
- platform/system devices
-
-(*) the platform global control methods are only available on ACPI systems
- and are only tested if the hibernation mode is set to "platform"
-
-To use one of them it is necessary to write the corresponding string to
-/sys/power/pm_test (eg. "devices" to test the freezing of processes and
-suspending devices) and issue the standard hibernation commands. For example,
-to use the "devices" test mode along with the "platform" mode of hibernation,
-you should do the following:
-
-# echo devices > /sys/power/pm_test
-# echo platform > /sys/power/disk
-# echo disk > /sys/power/state
-
-Then, the kernel will try to freeze processes, suspend devices, wait a few
-seconds (5 by default, but configurable by the suspend.pm_test_delay module
-parameter), resume devices and thaw processes. If "platform" is written to
-/sys/power/pm_test , then after suspending devices the kernel will additionally
-invoke the global control methods (eg. ACPI global control methods) used to
-prepare the platform firmware for hibernation. Next, it will wait a
-configurable number of seconds and invoke the platform (eg. ACPI) global
-methods used to cancel hibernation etc.
-
-Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal
-hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test
-contains a space-separated list of all available tests (including "none" that
-represents the normal functionality) in which the current test level is
-indicated by square brackets.
-
-Generally, as you can see, each test level is more "invasive" than the previous
-one and the "core" level tests the hardware and drivers as deeply as possible
-without creating a hibernation image. Obviously, if the "devices" test fails,
-the "platform" test will fail as well and so on. Thus, as a rule of thumb, you
-should try the test modes starting from "freezer", through "devices", "platform"
-and "processors" up to "core" (repeat the test on each level a couple of times
-to make sure that any random factors are avoided).
-
-If the "freezer" test fails, there is a task that cannot be frozen (in that case
-it usually is possible to identify the offending task by analysing the output of
-dmesg obtained after the failing test). Failure at this level usually means
-that there is a problem with the tasks freezer subsystem that should be
-reported.
-
-If the "devices" test fails, most likely there is a driver that cannot suspend
-or resume its device (in the latter case the system may hang or become unstable
-after the test, so please take that into consideration). To find this driver,
-you can carry out a binary search according to the rules:
-- if the test fails, unload a half of the drivers currently loaded and repeat
-(that would probably involve rebooting the system, so always note what drivers
-have been loaded before the test),
-- if the test succeeds, load a half of the drivers you have unloaded most
-recently and repeat.
-
-Once you have found the failing driver (there can be more than just one of
-them), you have to unload it every time before hibernation. In that case please
-make sure to report the problem with the driver.
-
-It is also possible that the "devices" test will still fail after you have
-unloaded all modules. In that case, you may want to look in your kernel
-configuration for the drivers that can be compiled as modules (and test again
-with these drivers compiled as modules). You may also try to use some special
-kernel command line options such as "noapic", "noacpi" or even "acpi=off".
-
-If the "platform" test fails, there is a problem with the handling of the
-platform (eg. ACPI) firmware on your system. In that case the "platform" mode
-of hibernation is not likely to work. You can try the "shutdown" mode, but that
-is rather a poor man's workaround.
-
-If the "processors" test fails, the disabling/enabling of nonboot CPUs does not
-work (of course, this only may be an issue on SMP systems) and the problem
-should be reported. In that case you can also try to switch the nonboot CPUs
-off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and
-see if that works.
-
-If the "core" test fails, which means that suspending of the system/platform
-devices has failed (these devices are suspended on one CPU with interrupts off),
-the problem is most probably hardware-related and serious, so it should be
-reported.
-
-A failure of any of the "platform", "processors" or "core" tests may cause your
-system to hang or become unstable, so please beware. Such a failure usually
-indicates a serious problem that very well may be related to the hardware, but
-please report it anyway.
-
-b) Testing minimal configuration
-
-If all of the hibernation test modes work, you can boot the system with the
-"init=/bin/bash" command line parameter and attempt to hibernate in the
-"reboot", "shutdown" and "platform" modes. If that does not work, there
-probably is a problem with a driver statically compiled into the kernel and you
-can try to compile more drivers as modules, so that they can be tested
-individually. Otherwise, there is a problem with a modular driver and you can
-find it by loading a half of the modules you normally use and binary searching
-in accordance with the algorithm:
-- if there are n modules loaded and the attempt to suspend and resume fails,
-unload n/2 of the modules and try again (that would probably involve rebooting
-the system),
-- if there are n modules loaded and the attempt to suspend and resume succeeds,
-load n/2 modules more and try again.
-
-Again, if you find the offending module(s), it(they) must be unloaded every time
-before hibernation, and please report the problem with it(them).
-
-c) Using the "test_resume" hibernation option
-
-/sys/power/disk generally tells the kernel what to do after creating a
-hibernation image. One of the available options is "test_resume" which
-causes the just created image to be used for immediate restoration. Namely,
-after doing:
-
-# echo test_resume > /sys/power/disk
-# echo disk > /sys/power/state
-
-a hibernation image will be created and a resume from it will be triggered
-immediately without involving the platform firmware in any way.
-
-That test can be used to check if failures to resume from hibernation are
-related to bad interactions with the platform firmware. That is, if the above
-works every time, but resume from actual hibernation does not work or is
-unreliable, the platform firmware may be responsible for the failures.
-
-On architectures and platforms that support using different kernels to restore
-hibernation images (that is, the kernel used to read the image from storage and
-load it into memory is different from the one included in the image) or support
-kernel address space randomization, it also can be used to check if failures
-to resume may be related to the differences between the restore and image
-kernels.
-
-d) Advanced debugging
-
-In case that hibernation does not work on your system even in the minimal
-configuration and compiling more drivers as modules is not practical or some
-modules cannot be unloaded, you can use one of the more advanced debugging
-techniques to find the problem. First, if there is a serial port in your box,
-you can boot the kernel with the 'no_console_suspend' parameter and try to log
-kernel messages using the serial console. This may provide you with some
-information about the reasons of the suspend (resume) failure. Alternatively,
-it may be possible to use a FireWire port for debugging with firescope
-(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to
-use the PM_TRACE mechanism documented in Documentation/power/s2ram.txt .
-
-2. Testing suspend to RAM (STR)
-
-To verify that the STR works, it is generally more convenient to use the s2ram
-tool available from http://suspend.sf.net and documented at
-http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK).
-
-Namely, after writing "freezer", "devices", "platform", "processors", or "core"
-into /sys/power/pm_test (available if the kernel is compiled with
-CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding
-to given string. The STR test modes are defined in the same way as for
-hibernation, so please refer to Section 1 for more information about them. In
-particular, the "core" test allows you to test everything except for the actual
-invocation of the platform firmware in order to put the system into the sleep
-state.
-
-Among other things, the testing with the help of /sys/power/pm_test may allow
-you to identify drivers that fail to suspend or resume their devices. They
-should be unloaded every time before an STR transition.
-
-Next, you can follow the instructions at S2RAM_LINK to test the system, but if
-it does not work "out of the box", you may need to boot it with
-"init=/bin/bash" and test s2ram in the minimal configuration. In that case,
-you may be able to search for failing drivers by following the procedure
-analogous to the one described in section 1. If you find some failing drivers,
-you will have to unload them every time before an STR transition (ie. before
-you run s2ram), and please report the problems with them.
-
-There is a debugfs entry which shows the suspend to RAM statistics. Here is an
-example of its output.
- # mount -t debugfs none /sys/kernel/debug
- # cat /sys/kernel/debug/suspend_stats
- success: 20
- fail: 5
- failed_freeze: 0
- failed_prepare: 0
- failed_suspend: 5
- failed_suspend_noirq: 0
- failed_resume: 0
- failed_resume_noirq: 0
- failures:
- last_failed_dev: alarm
- adc
- last_failed_errno: -16
- -16
- last_failed_step: suspend
- suspend
-Field success means the success number of suspend to RAM, and field fail means
-the failure number. Others are the failure number of different steps of suspend
-to RAM. suspend_stats just lists the last 2 failed devices, error number and
-failed step of suspend.
diff --git a/Documentation/power/charger-manager.rst b/Documentation/power/charger-manager.rst
new file mode 100644
index 000000000000..84fab9376792
--- /dev/null
+++ b/Documentation/power/charger-manager.rst
@@ -0,0 +1,205 @@
+===============
+Charger Manager
+===============
+
+ (C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
+
+Charger Manager provides in-kernel battery charger management that
+requires temperature monitoring during suspend-to-RAM state
+and where each battery may have multiple chargers attached and the userland
+wants to look at the aggregated information of the multiple chargers.
+
+Charger Manager is a platform_driver with power-supply-class entries.
+An instance of Charger Manager (a platform-device created with Charger-Manager)
+represents an independent battery with chargers. If there are multiple
+batteries with their own chargers acting independently in a system,
+the system may need multiple instances of Charger Manager.
+
+1. Introduction
+===============
+
+Charger Manager supports the following:
+
+* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
+ A system may have multiple chargers (or power sources) and some of
+ they may be activated at the same time. Each charger may have its
+ own power-supply-class and each power-supply-class can provide
+ different information about the battery status. This framework
+ aggregates charger-related information from multiple sources and
+ shows combined information as a single power-supply-class.
+
+* Support for in suspend-to-RAM polling (with suspend_again callback)
+ While the battery is being charged and the system is in suspend-to-RAM,
+ we may need to monitor the battery health by looking at the ambient or
+ battery temperature. We can accomplish this by waking up the system
+ periodically. However, such a method wakes up devices unnecessarily for
+ monitoring the battery health and tasks, and user processes that are
+ supposed to be kept suspended. That, in turn, incurs unnecessary power
+ consumption and slow down charging process. Or even, such peak power
+ consumption can stop chargers in the middle of charging
+ (external power input < device power consumption), which not
+ only affects the charging time, but the lifespan of the battery.
+
+ Charger Manager provides a function "cm_suspend_again" that can be
+ used as suspend_again callback of platform_suspend_ops. If the platform
+ requires tasks other than cm_suspend_again, it may implement its own
+ suspend_again callback that calls cm_suspend_again in the middle.
+ Normally, the platform will need to resume and suspend some devices
+ that are used by Charger Manager.
+
+* Support for premature full-battery event handling
+ If the battery voltage drops by "fullbatt_vchkdrop_uV" after
+ "fullbatt_vchkdrop_ms" from the full-battery event, the framework
+ restarts charging. This check is also performed while suspended by
+ setting wakeup time accordingly and using suspend_again.
+
+* Support for uevent-notify
+ With the charger-related events, the device sends
+ notification to users with UEVENT.
+
+2. Global Charger-Manager Data related with suspend_again
+=========================================================
+In order to setup Charger Manager with suspend-again feature
+(in-suspend monitoring), the user should provide charger_global_desc
+with setup_charger_manager(`struct charger_global_desc *`).
+This charger_global_desc data for in-suspend monitoring is global
+as the name suggests. Thus, the user needs to provide only once even
+if there are multiple batteries. If there are multiple batteries, the
+multiple instances of Charger Manager share the same charger_global_desc
+and it will manage in-suspend monitoring for all instances of Charger Manager.
+
+The user needs to provide all the three entries to `struct charger_global_desc`
+properly in order to activate in-suspend monitoring:
+
+`char *rtc_name;`
+ The name of rtc (e.g., "rtc0") used to wakeup the system from
+ suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
+ should be able to wake up the system from suspend. Charger Manager
+ saves and restores the alarm value and use the previously-defined
+ alarm if it is going to go off earlier than Charger Manager so that
+ Charger Manager does not interfere with previously-defined alarms.
+
+`bool (*rtc_only_wakeup)(void);`
+ This callback should let CM know whether
+ the wakeup-from-suspend is caused only by the alarm of "rtc" in the
+ same struct. If there is any other wakeup source triggered the
+ wakeup, it should return false. If the "rtc" is the only wakeup
+ reason, it should return true.
+
+`bool assume_timer_stops_in_suspend;`
+ if true, Charger Manager assumes that
+ the timer (CM uses jiffies as timer) stops during suspend. Then, CM
+ assumes that the suspend-duration is same as the alarm length.
+
+
+3. How to setup suspend_again
+=============================
+Charger Manager provides a function "extern bool cm_suspend_again(void)".
+When cm_suspend_again is called, it monitors every battery. The suspend_ops
+callback of the system's platform_suspend_ops can call cm_suspend_again
+function to know whether Charger Manager wants to suspend again or not.
+If there are no other devices or tasks that want to use suspend_again
+feature, the platform_suspend_ops may directly refer to cm_suspend_again
+for its suspend_again callback.
+
+The cm_suspend_again() returns true (meaning "I want to suspend again")
+if the system was woken up by Charger Manager and the polling
+(in-suspend monitoring) results in "normal".
+
+4. Charger-Manager Data (struct charger_desc)
+=============================================
+For each battery charged independently from other batteries (if a series of
+batteries are charged by a single charger, they are counted as one independent
+battery), an instance of Charger Manager is attached to it. The following
+
+struct charger_desc elements:
+
+`char *psy_name;`
+ The power-supply-class name of the battery. Default is
+ "battery" if psy_name is NULL. Users can access the psy entries
+ at "/sys/class/power_supply/[psy_name]/".
+
+`enum polling_modes polling_mode;`
+ CM_POLL_DISABLE:
+ do not poll this battery.
+ CM_POLL_ALWAYS:
+ always poll this battery.
+ CM_POLL_EXTERNAL_POWER_ONLY:
+ poll this battery if and only if an external power
+ source is attached.
+ CM_POLL_CHARGING_ONLY:
+ poll this battery if and only if the battery is being charged.
+
+`unsigned int fullbatt_vchkdrop_ms; / unsigned int fullbatt_vchkdrop_uV;`
+ If both have non-zero values, Charger Manager will check the
+ battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
+ charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
+ Manager will try to recharge the battery by disabling and enabling
+ chargers. Recharge with voltage drop condition only (without delay
+ condition) is needed to be implemented with hardware interrupts from
+ fuel gauges or charger devices/chips.
+
+`unsigned int fullbatt_uV;`
+ If specified with a non-zero value, Charger Manager assumes
+ that the battery is full (capacity = 100) if the battery is not being
+ charged and the battery voltage is equal to or greater than
+ fullbatt_uV.
+
+`unsigned int polling_interval_ms;`
+ Required polling interval in ms. Charger Manager will poll
+ this battery every polling_interval_ms or more frequently.
+
+`enum data_source battery_present;`
+ CM_BATTERY_PRESENT:
+ assume that the battery exists.
+ CM_NO_BATTERY:
+ assume that the battery does not exists.
+ CM_FUEL_GAUGE:
+ get battery presence information from fuel gauge.
+ CM_CHARGER_STAT:
+ get battery presence from chargers.
+
+`char **psy_charger_stat;`
+ An array ending with NULL that has power-supply-class names of
+ chargers. Each power-supply-class should provide "PRESENT" (if
+ battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
+ external power source is attached or not), and "STATUS" (shows whether
+ the battery is {"FULL" or not FULL} or {"FULL", "Charging",
+ "Discharging", "NotCharging"}).
+
+`int num_charger_regulators; / struct regulator_bulk_data *charger_regulators;`
+ Regulators representing the chargers in the form for
+ regulator framework's bulk functions.
+
+`char *psy_fuel_gauge;`
+ Power-supply-class name of the fuel gauge.
+
+`int (*temperature_out_of_range)(int *mC); / bool measure_battery_temp;`
+ This callback returns 0 if the temperature is safe for charging,
+ a positive number if it is too hot to charge, and a negative number
+ if it is too cold to charge. With the variable mC, the callback returns
+ the temperature in 1/1000 of centigrade.
+ The source of temperature can be battery or ambient one according to
+ the value of measure_battery_temp.
+
+
+5. Notify Charger-Manager of charger events: cm_notify_event()
+==============================================================
+If there is an charger event is required to notify
+Charger Manager, a charger device driver that triggers the event can call
+cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
+In the function, psy is the charger driver's power_supply pointer, which is
+associated with Charger-Manager. The parameter "type"
+is the same as irq's type (enum cm_event_types). The event message "msg" is
+optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
+
+6. Other Considerations
+=======================
+
+At the charger/battery-related events such as battery-pulled-out,
+charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
+and others critical to chargers, the system should be configured to wake up.
+At least the following should wake up the system from a suspend:
+a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
+
+It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/charger-manager.txt b/Documentation/power/charger-manager.txt
deleted file mode 100644
index 9ff1105e58d6..000000000000
--- a/Documentation/power/charger-manager.txt
+++ /dev/null
@@ -1,200 +0,0 @@
-Charger Manager
- (C) 2011 MyungJoo Ham <myungjoo.ham@samsung.com>, GPL
-
-Charger Manager provides in-kernel battery charger management that
-requires temperature monitoring during suspend-to-RAM state
-and where each battery may have multiple chargers attached and the userland
-wants to look at the aggregated information of the multiple chargers.
-
-Charger Manager is a platform_driver with power-supply-class entries.
-An instance of Charger Manager (a platform-device created with Charger-Manager)
-represents an independent battery with chargers. If there are multiple
-batteries with their own chargers acting independently in a system,
-the system may need multiple instances of Charger Manager.
-
-1. Introduction
-===============
-
-Charger Manager supports the following:
-
-* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
- A system may have multiple chargers (or power sources) and some of
- they may be activated at the same time. Each charger may have its
- own power-supply-class and each power-supply-class can provide
- different information about the battery status. This framework
- aggregates charger-related information from multiple sources and
- shows combined information as a single power-supply-class.
-
-* Support for in suspend-to-RAM polling (with suspend_again callback)
- While the battery is being charged and the system is in suspend-to-RAM,
- we may need to monitor the battery health by looking at the ambient or
- battery temperature. We can accomplish this by waking up the system
- periodically. However, such a method wakes up devices unnecessarily for
- monitoring the battery health and tasks, and user processes that are
- supposed to be kept suspended. That, in turn, incurs unnecessary power
- consumption and slow down charging process. Or even, such peak power
- consumption can stop chargers in the middle of charging
- (external power input < device power consumption), which not
- only affects the charging time, but the lifespan of the battery.
-
- Charger Manager provides a function "cm_suspend_again" that can be
- used as suspend_again callback of platform_suspend_ops. If the platform
- requires tasks other than cm_suspend_again, it may implement its own
- suspend_again callback that calls cm_suspend_again in the middle.
- Normally, the platform will need to resume and suspend some devices
- that are used by Charger Manager.
-
-* Support for premature full-battery event handling
- If the battery voltage drops by "fullbatt_vchkdrop_uV" after
- "fullbatt_vchkdrop_ms" from the full-battery event, the framework
- restarts charging. This check is also performed while suspended by
- setting wakeup time accordingly and using suspend_again.
-
-* Support for uevent-notify
- With the charger-related events, the device sends
- notification to users with UEVENT.
-
-2. Global Charger-Manager Data related with suspend_again
-========================================================
-In order to setup Charger Manager with suspend-again feature
-(in-suspend monitoring), the user should provide charger_global_desc
-with setup_charger_manager(struct charger_global_desc *).
-This charger_global_desc data for in-suspend monitoring is global
-as the name suggests. Thus, the user needs to provide only once even
-if there are multiple batteries. If there are multiple batteries, the
-multiple instances of Charger Manager share the same charger_global_desc
-and it will manage in-suspend monitoring for all instances of Charger Manager.
-
-The user needs to provide all the three entries properly in order to activate
-in-suspend monitoring:
-
-struct charger_global_desc {
-
-char *rtc_name;
- : The name of rtc (e.g., "rtc0") used to wakeup the system from
- suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
- should be able to wake up the system from suspend. Charger Manager
- saves and restores the alarm value and use the previously-defined
- alarm if it is going to go off earlier than Charger Manager so that
- Charger Manager does not interfere with previously-defined alarms.
-
-bool (*rtc_only_wakeup)(void);
- : This callback should let CM know whether
- the wakeup-from-suspend is caused only by the alarm of "rtc" in the
- same struct. If there is any other wakeup source triggered the
- wakeup, it should return false. If the "rtc" is the only wakeup
- reason, it should return true.
-
-bool assume_timer_stops_in_suspend;
- : if true, Charger Manager assumes that
- the timer (CM uses jiffies as timer) stops during suspend. Then, CM
- assumes that the suspend-duration is same as the alarm length.
-};
-
-3. How to setup suspend_again
-=============================
-Charger Manager provides a function "extern bool cm_suspend_again(void)".
-When cm_suspend_again is called, it monitors every battery. The suspend_ops
-callback of the system's platform_suspend_ops can call cm_suspend_again
-function to know whether Charger Manager wants to suspend again or not.
-If there are no other devices or tasks that want to use suspend_again
-feature, the platform_suspend_ops may directly refer to cm_suspend_again
-for its suspend_again callback.
-
-The cm_suspend_again() returns true (meaning "I want to suspend again")
-if the system was woken up by Charger Manager and the polling
-(in-suspend monitoring) results in "normal".
-
-4. Charger-Manager Data (struct charger_desc)
-=============================================
-For each battery charged independently from other batteries (if a series of
-batteries are charged by a single charger, they are counted as one independent
-battery), an instance of Charger Manager is attached to it.
-
-struct charger_desc {
-
-char *psy_name;
- : The power-supply-class name of the battery. Default is
- "battery" if psy_name is NULL. Users can access the psy entries
- at "/sys/class/power_supply/[psy_name]/".
-
-enum polling_modes polling_mode;
- : CM_POLL_DISABLE: do not poll this battery.
- CM_POLL_ALWAYS: always poll this battery.
- CM_POLL_EXTERNAL_POWER_ONLY: poll this battery if and only if
- an external power source is attached.
- CM_POLL_CHARGING_ONLY: poll this battery if and only if the
- battery is being charged.
-
-unsigned int fullbatt_vchkdrop_ms;
-unsigned int fullbatt_vchkdrop_uV;
- : If both have non-zero values, Charger Manager will check the
- battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
- charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
- Manager will try to recharge the battery by disabling and enabling
- chargers. Recharge with voltage drop condition only (without delay
- condition) is needed to be implemented with hardware interrupts from
- fuel gauges or charger devices/chips.
-
-unsigned int fullbatt_uV;
- : If specified with a non-zero value, Charger Manager assumes
- that the battery is full (capacity = 100) if the battery is not being
- charged and the battery voltage is equal to or greater than
- fullbatt_uV.
-
-unsigned int polling_interval_ms;
- : Required polling interval in ms. Charger Manager will poll
- this battery every polling_interval_ms or more frequently.
-
-enum data_source battery_present;
- : CM_BATTERY_PRESENT: assume that the battery exists.
- CM_NO_BATTERY: assume that the battery does not exists.
- CM_FUEL_GAUGE: get battery presence information from fuel gauge.
- CM_CHARGER_STAT: get battery presence from chargers.
-
-char **psy_charger_stat;
- : An array ending with NULL that has power-supply-class names of
- chargers. Each power-supply-class should provide "PRESENT" (if
- battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
- external power source is attached or not), and "STATUS" (shows whether
- the battery is {"FULL" or not FULL} or {"FULL", "Charging",
- "Discharging", "NotCharging"}).
-
-int num_charger_regulators;
-struct regulator_bulk_data *charger_regulators;
- : Regulators representing the chargers in the form for
- regulator framework's bulk functions.
-
-char *psy_fuel_gauge;
- : Power-supply-class name of the fuel gauge.
-
-int (*temperature_out_of_range)(int *mC);
-bool measure_battery_temp;
- : This callback returns 0 if the temperature is safe for charging,
- a positive number if it is too hot to charge, and a negative number
- if it is too cold to charge. With the variable mC, the callback returns
- the temperature in 1/1000 of centigrade.
- The source of temperature can be battery or ambient one according to
- the value of measure_battery_temp.
-};
-
-5. Notify Charger-Manager of charger events: cm_notify_event()
-=========================================================
-If there is an charger event is required to notify
-Charger Manager, a charger device driver that triggers the event can call
-cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
-In the function, psy is the charger driver's power_supply pointer, which is
-associated with Charger-Manager. The parameter "type"
-is the same as irq's type (enum cm_event_types). The event message "msg" is
-optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
-
-6. Other Considerations
-=======================
-
-At the charger/battery-related events such as battery-pulled-out,
-charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped,
-and others critical to chargers, the system should be configured to wake up.
-At least the following should wake up the system from a suspend:
-a) charger-on/off b) external-power-in/out c) battery-in/out (while charging)
-
-It is usually accomplished by configuring the PMIC as a wakeup source.
diff --git a/Documentation/power/drivers-testing.rst b/Documentation/power/drivers-testing.rst
new file mode 100644
index 000000000000..e53f1999fc39
--- /dev/null
+++ b/Documentation/power/drivers-testing.rst
@@ -0,0 +1,51 @@
+====================================================
+Testing suspend and resume support in device drivers
+====================================================
+
+ (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+1. Preparing the test system
+============================
+
+Unfortunately, to effectively test the support for the system-wide suspend and
+resume transitions in a driver, it is necessary to suspend and resume a fully
+functional system with this driver loaded. Moreover, that should be done
+several times, preferably several times in a row, and separately for hibernation
+(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
+cases involves slightly different operations and different interactions with
+the machine's BIOS.
+
+Of course, for this purpose the test system has to be known to suspend and
+resume without the driver being tested. Thus, if possible, you should first
+resolve all suspend/resume-related problems in the test system before you start
+testing the new driver. Please see Documentation/power/basic-pm-debugging.rst
+for more information about the debugging of suspend/resume functionality.
+
+2. Testing the driver
+=====================
+
+Once you have resolved the suspend/resume-related problems with your test system
+without the new driver, you are ready to test it:
+
+a) Build the driver as a module, load it and try the test modes of hibernation
+ (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
+ "platform" modes (see: Documentation/power/basic-pm-debugging.rst, 1).
+
+c) Compile the driver directly into the kernel and try the test modes of
+ hibernation.
+
+d) Attempt to hibernate with the driver compiled directly into the kernel
+ in the "reboot", "shutdown" and "platform" modes.
+
+e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.rst,
+ 2). [As far as the STR tests are concerned, it should not matter whether or
+ not the driver is built as a module.]
+
+f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
+ (see: Documentation/power/basic-pm-debugging.rst, 2).
+
+Each of the above tests should be repeated several times and the STD tests
+should be mixed with the STR tests. If any of them fails, the driver cannot be
+regarded as suspend/resume-safe.
diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt
deleted file mode 100644
index 638afdf4d6b8..000000000000
--- a/Documentation/power/drivers-testing.txt
+++ /dev/null
@@ -1,46 +0,0 @@
-Testing suspend and resume support in device drivers
- (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-1. Preparing the test system
-
-Unfortunately, to effectively test the support for the system-wide suspend and
-resume transitions in a driver, it is necessary to suspend and resume a fully
-functional system with this driver loaded. Moreover, that should be done
-several times, preferably several times in a row, and separately for hibernation
-(aka suspend to disk or STD) and suspend to RAM (STR), because each of these
-cases involves slightly different operations and different interactions with
-the machine's BIOS.
-
-Of course, for this purpose the test system has to be known to suspend and
-resume without the driver being tested. Thus, if possible, you should first
-resolve all suspend/resume-related problems in the test system before you start
-testing the new driver. Please see Documentation/power/basic-pm-debugging.txt
-for more information about the debugging of suspend/resume functionality.
-
-2. Testing the driver
-
-Once you have resolved the suspend/resume-related problems with your test system
-without the new driver, you are ready to test it:
-
-a) Build the driver as a module, load it and try the test modes of hibernation
- (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
- "platform" modes (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-c) Compile the driver directly into the kernel and try the test modes of
- hibernation.
-
-d) Attempt to hibernate with the driver compiled directly into the kernel
- in the "reboot", "shutdown" and "platform" modes.
-
-e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.txt,
- 2). [As far as the STR tests are concerned, it should not matter whether or
- not the driver is built as a module.]
-
-f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
- (see: Documentation/power/basic-pm-debugging.txt, 2).
-
-Each of the above tests should be repeated several times and the STD tests
-should be mixed with the STR tests. If any of them fails, the driver cannot be
-regarded as suspend/resume-safe.
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
new file mode 100644
index 000000000000..90a345d57ae9
--- /dev/null
+++ b/Documentation/power/energy-model.rst
@@ -0,0 +1,147 @@
+====================
+Energy Model of CPUs
+====================
+
+1. Overview
+-----------
+
+The Energy Model (EM) framework serves as an interface between drivers knowing
+the power consumed by CPUs at various performance levels, and the kernel
+subsystems willing to use that information to make energy-aware decisions.
+
+The source of the information about the power consumed by CPUs can vary greatly
+from one platform to another. These power costs can be estimated using
+devicetree data in some cases. In others, the firmware will know better.
+Alternatively, userspace might be best positioned. And so on. In order to avoid
+each and every client subsystem to re-implement support for each and every
+possible source of information on its own, the EM framework intervenes as an
+abstraction layer which standardizes the format of power cost tables in the
+kernel, hence enabling to avoid redundant work.
+
+The figure below depicts an example of drivers (Arm-specific here, but the
+approach is applicable to any architecture) providing power costs to the EM
+framework, and interested clients reading the data from it::
+
+ +---------------+ +-----------------+ +---------------+
+ | Thermal (IPA) | | Scheduler (EAS) | | Other |
+ +---------------+ +-----------------+ +---------------+
+ | | em_pd_energy() |
+ | | em_cpu_get() |
+ +---------+ | +---------+
+ | | |
+ v v v
+ +---------------------+
+ | Energy Model |
+ | Framework |
+ +---------------------+
+ ^ ^ ^
+ | | | em_register_perf_domain()
+ +----------+ | +---------+
+ | | |
+ +---------------+ +---------------+ +--------------+
+ | cpufreq-dt | | arm_scmi | | Other |
+ +---------------+ +---------------+ +--------------+
+ ^ ^ ^
+ | | |
+ +--------------+ +---------------+ +--------------+
+ | Device Tree | | Firmware | | ? |
+ +--------------+ +---------------+ +--------------+
+
+The EM framework manages power cost tables per 'performance domain' in the
+system. A performance domain is a group of CPUs whose performance is scaled
+together. Performance domains generally have a 1-to-1 mapping with CPUFreq
+policies. All CPUs in a performance domain are required to have the same
+micro-architecture. CPUs in different performance domains can have different
+micro-architectures.
+
+
+2. Core APIs
+------------
+
+2.1 Config options
+^^^^^^^^^^^^^^^^^^
+
+CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
+
+
+2.2 Registration of performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Drivers are expected to register performance domains into the EM framework by
+calling the following API::
+
+ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+ struct em_data_callback *cb);
+
+Drivers must specify the CPUs of the performance domains using the cpumask
+argument, and provide a callback function returning <frequency, power> tuples
+for each capacity state. The callback function provided by the driver is free
+to fetch data from any relevant location (DT, firmware, ...), and by any mean
+deemed necessary. See Section 3. for an example of driver implementing this
+callback, and kernel/power/energy_model.c for further documentation on this
+API.
+
+
+2.3 Accessing performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Subsystems interested in the energy model of a CPU can retrieve it using the
+em_cpu_get() API. The energy model tables are allocated once upon creation of
+the performance domains, and kept in memory untouched.
+
+The energy consumed by a performance domain can be estimated using the
+em_pd_energy() API. The estimation is performed assuming that the schedutil
+CPUfreq governor is in use.
+
+More details about the above APIs can be found in include/linux/energy_model.h.
+
+
+3. Example driver
+-----------------
+
+This section provides a simple example of a CPUFreq driver registering a
+performance domain in the Energy Model framework using the (fake) 'foo'
+protocol. The driver implements an est_power() function to be provided to the
+EM framework::
+
+ -> drivers/cpufreq/foo_cpufreq.c
+
+ 01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
+ 02 {
+ 03 long freq, power;
+ 04
+ 05 /* Use the 'foo' protocol to ceil the frequency */
+ 06 freq = foo_get_freq_ceil(cpu, *KHz);
+ 07 if (freq < 0);
+ 08 return freq;
+ 09
+ 10 /* Estimate the power cost for the CPU at the relevant freq. */
+ 11 power = foo_estimate_power(cpu, freq);
+ 12 if (power < 0);
+ 13 return power;
+ 14
+ 15 /* Return the values to the EM framework */
+ 16 *mW = power;
+ 17 *KHz = freq;
+ 18
+ 19 return 0;
+ 20 }
+ 21
+ 22 static int foo_cpufreq_init(struct cpufreq_policy *policy)
+ 23 {
+ 24 struct em_data_callback em_cb = EM_DATA_CB(est_power);
+ 25 int nr_opp, ret;
+ 26
+ 27 /* Do the actual CPUFreq init work ... */
+ 28 ret = do_foo_cpufreq_init(policy);
+ 29 if (ret)
+ 30 return ret;
+ 31
+ 32 /* Find the number of OPPs for this policy */
+ 33 nr_opp = foo_get_nr_opp(policy);
+ 34
+ 35 /* And register the new performance domain */
+ 36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+ 37
+ 38 return 0;
+ 39 }
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt
deleted file mode 100644
index a2b0ae4c76bd..000000000000
--- a/Documentation/power/energy-model.txt
+++ /dev/null
@@ -1,144 +0,0 @@
- ====================
- Energy Model of CPUs
- ====================
-
-1. Overview
------------
-
-The Energy Model (EM) framework serves as an interface between drivers knowing
-the power consumed by CPUs at various performance levels, and the kernel
-subsystems willing to use that information to make energy-aware decisions.
-
-The source of the information about the power consumed by CPUs can vary greatly
-from one platform to another. These power costs can be estimated using
-devicetree data in some cases. In others, the firmware will know better.
-Alternatively, userspace might be best positioned. And so on. In order to avoid
-each and every client subsystem to re-implement support for each and every
-possible source of information on its own, the EM framework intervenes as an
-abstraction layer which standardizes the format of power cost tables in the
-kernel, hence enabling to avoid redundant work.
-
-The figure below depicts an example of drivers (Arm-specific here, but the
-approach is applicable to any architecture) providing power costs to the EM
-framework, and interested clients reading the data from it.
-
- +---------------+ +-----------------+ +---------------+
- | Thermal (IPA) | | Scheduler (EAS) | | Other |
- +---------------+ +-----------------+ +---------------+
- | | em_pd_energy() |
- | | em_cpu_get() |
- +---------+ | +---------+
- | | |
- v v v
- +---------------------+
- | Energy Model |
- | Framework |
- +---------------------+
- ^ ^ ^
- | | | em_register_perf_domain()
- +----------+ | +---------+
- | | |
- +---------------+ +---------------+ +--------------+
- | cpufreq-dt | | arm_scmi | | Other |
- +---------------+ +---------------+ +--------------+
- ^ ^ ^
- | | |
- +--------------+ +---------------+ +--------------+
- | Device Tree | | Firmware | | ? |
- +--------------+ +---------------+ +--------------+
-
-The EM framework manages power cost tables per 'performance domain' in the
-system. A performance domain is a group of CPUs whose performance is scaled
-together. Performance domains generally have a 1-to-1 mapping with CPUFreq
-policies. All CPUs in a performance domain are required to have the same
-micro-architecture. CPUs in different performance domains can have different
-micro-architectures.
-
-
-2. Core APIs
-------------
-
- 2.1 Config options
-
-CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
-
-
- 2.2 Registration of performance domains
-
-Drivers are expected to register performance domains into the EM framework by
-calling the following API:
-
- int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
- struct em_data_callback *cb);
-
-Drivers must specify the CPUs of the performance domains using the cpumask
-argument, and provide a callback function returning <frequency, power> tuples
-for each capacity state. The callback function provided by the driver is free
-to fetch data from any relevant location (DT, firmware, ...), and by any mean
-deemed necessary. See Section 3. for an example of driver implementing this
-callback, and kernel/power/energy_model.c for further documentation on this
-API.
-
-
- 2.3 Accessing performance domains
-
-Subsystems interested in the energy model of a CPU can retrieve it using the
-em_cpu_get() API. The energy model tables are allocated once upon creation of
-the performance domains, and kept in memory untouched.
-
-The energy consumed by a performance domain can be estimated using the
-em_pd_energy() API. The estimation is performed assuming that the schedutil
-CPUfreq governor is in use.
-
-More details about the above APIs can be found in include/linux/energy_model.h.
-
-
-3. Example driver
------------------
-
-This section provides a simple example of a CPUFreq driver registering a
-performance domain in the Energy Model framework using the (fake) 'foo'
-protocol. The driver implements an est_power() function to be provided to the
-EM framework.
-
- -> drivers/cpufreq/foo_cpufreq.c
-
-01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
-02 {
-03 long freq, power;
-04
-05 /* Use the 'foo' protocol to ceil the frequency */
-06 freq = foo_get_freq_ceil(cpu, *KHz);
-07 if (freq < 0);
-08 return freq;
-09
-10 /* Estimate the power cost for the CPU at the relevant freq. */
-11 power = foo_estimate_power(cpu, freq);
-12 if (power < 0);
-13 return power;
-14
-15 /* Return the values to the EM framework */
-16 *mW = power;
-17 *KHz = freq;
-18
-19 return 0;
-20 }
-21
-22 static int foo_cpufreq_init(struct cpufreq_policy *policy)
-23 {
-24 struct em_data_callback em_cb = EM_DATA_CB(est_power);
-25 int nr_opp, ret;
-26
-27 /* Do the actual CPUFreq init work ... */
-28 ret = do_foo_cpufreq_init(policy);
-29 if (ret)
-30 return ret;
-31
-32 /* Find the number of OPPs for this policy */
-33 nr_opp = foo_get_nr_opp(policy);
-34
-35 /* And register the new performance domain */
-36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
-37
-38 return 0;
-39 }
diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst
new file mode 100644
index 000000000000..ef110fe55e82
--- /dev/null
+++ b/Documentation/power/freezing-of-tasks.rst
@@ -0,0 +1,244 @@
+=================
+Freezing of tasks
+=================
+
+(C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
+
+I. What is the freezing of tasks?
+=================================
+
+The freezing of tasks is a mechanism by which user space processes and some
+kernel threads are controlled during hibernation or system-wide suspend (on some
+architectures).
+
+II. How does it work?
+=====================
+
+There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
+and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have
+PF_NOFREEZE unset (all user space processes and some kernel threads) are
+regarded as 'freezable' and treated in a special way before the system enters a
+suspend state as well as before a hibernation image is created (in what follows
+we only consider hibernation, but the description also applies to suspend).
+
+Namely, as the first step of the hibernation procedure the function
+freeze_processes() (defined in kernel/power/process.c) is called. A system-wide
+variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
+whether the system is to undergo a freezing operation. And freeze_processes()
+sets this variable. After this, it executes try_to_freeze_tasks() that sends a
+fake signal to all user space processes, and wakes up all the kernel threads.
+All freezable tasks must react to that by calling try_to_freeze(), which
+results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
+the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
+it loop until PF_FROZEN is cleared for it. Then, we say that the task is
+'frozen' and therefore the set of functions handling this mechanism is referred
+to as 'the freezer' (these functions are defined in kernel/power/process.c,
+kernel/freezer.c & include/linux/freezer.h). User space processes are generally
+frozen before kernel threads.
+
+__refrigerator() must not be called directly. Instead, use the
+try_to_freeze() function (defined in include/linux/freezer.h), that checks
+if the task is to be frozen and makes the task enter __refrigerator().
+
+For user space processes try_to_freeze() is called automatically from the
+signal-handling code, but the freezable kernel threads need to call it
+explicitly in suitable places or use the wait_event_freezable() or
+wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
+that combine interruptible sleep with checking if the task is to be frozen and
+calling try_to_freeze(). The main loop of a freezable kernel thread may look
+like the following one::
+
+ set_freezable();
+ do {
+ hub_events();
+ wait_event_freezable(khubd_wait,
+ !list_empty(&hub_event_list) ||
+ kthread_should_stop());
+ } while (!kthread_should_stop() || !list_empty(&hub_event_list));
+
+(from drivers/usb/core/hub.c::hub_thread()).
+
+If a freezable kernel thread fails to call try_to_freeze() after the freezer has
+initiated a freezing operation, the freezing of tasks will fail and the entire
+hibernation operation will be cancelled. For this reason, freezable kernel
+threads must call try_to_freeze() somewhere or use one of the
+wait_event_freezable() and wait_event_freezable_timeout() macros.
+
+After the system memory state has been restored from a hibernation image and
+devices have been reinitialized, the function thaw_processes() is called in
+order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that
+have been frozen leave __refrigerator() and continue running.
+
+
+Rationale behind the functions dealing with freezing and thawing of tasks
+-------------------------------------------------------------------------
+
+freeze_processes():
+ - freezes only userspace tasks
+
+freeze_kernel_threads():
+ - freezes all tasks (including kernel threads) because we can't freeze
+ kernel threads without freezing userspace tasks
+
+thaw_kernel_threads():
+ - thaws only kernel threads; this is particularly useful if we need to do
+ anything special in between thawing of kernel threads and thawing of
+ userspace tasks, or if we want to postpone the thawing of userspace tasks
+
+thaw_processes():
+ - thaws all tasks (including kernel threads) because we can't thaw userspace
+ tasks without thawing kernel threads
+
+
+III. Which kernel threads are freezable?
+========================================
+
+Kernel threads are not freezable by default. However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is not allowed). From this point it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+======================
+
+Generally speaking, there is a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+ hibernation. At the moment we have no simple means of checkpointing
+ filesystems, so if there are any modifications made to filesystem data and/or
+ metadata on disks, we cannot bring them back to the state from before the
+ modifications. At the same time each hibernation image contains some
+ filesystem-related information that must be consistent with the state of the
+ on-disk data and metadata after the system memory state has been restored
+ from the image (otherwise the filesystems will be damaged in a nasty way,
+ usually making them almost impossible to repair). We therefore freeze
+ tasks that might cause the on-disk filesystems' data and metadata to be
+ modified after the hibernation image has been created and before the
+ system is finally powered off. The majority of these are user space
+ processes, but if any of the kernel threads may cause something like this
+ to happen, they have to be freezable.
+
+2. Next, to create the hibernation image we need to free a sufficient amount of
+ memory (approximately 50% of available RAM) and we need to do that before
+ devices are deactivated, because we generally need them for swapping out.
+ Then, after the memory for the image has been freed, we don't want tasks
+ to allocate additional memory and we prevent them from doing that by
+ freezing them earlier. [Of course, this also means that device drivers
+ should not allocate substantial amounts of memory from their .suspend()
+ callbacks before hibernation, but this is a separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
+ from interfering with the suspending and resuming of devices. A user space
+ process running on a second CPU while we are suspending devices may, for
+ example, be troublesome and without the freezing of tasks we would need some
+ safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I **do** realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA. So we want to be able to
+avoid *that*, there's no question about that. And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable. For example, if
+a kernel thread that belongs to a device driver accesses the device directly, it
+in principle needs to know when the device is suspended, so that it doesn't try
+to access it at that time. However, if the kernel thread is freezable, it will
+be frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+4. Another reason for freezing tasks is to prevent user space processes from
+ realizing that hibernation (or suspend) operation takes place. Ideally, user
+ space processes should not notice that such a system-wide operation has
+ occurred and should continue running without any problems after the restore
+ (or resume from suspend). Unfortunately, in the most general case this
+ is quite difficult to achieve without the freezing of tasks. Consider,
+ for example, a process that depends on all CPUs being online while it's
+ running. Since we need to disable nonboot CPUs during the hibernation,
+ if this process is not frozen, it may notice that the number of CPUs has
+ changed and may start to work incorrectly because of that.
+
+V. Are there any problems related to the freezing of tasks?
+===========================================================
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend one
+on another. For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable. That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+ userspace, it gets even more complicated because some userspace processes are
+ now doing the sorts of things that kernel threads do
+ (https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+The problem 1. seems to be fixable, although it hasn't been fixed so far. The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+avoid the realization by the user space processes that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it. For example, if request_firmware() is
+called from a device driver's .resume() routine, it will timeout and eventually
+fail, because the user land process that should respond to the request is frozen
+at this point. So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet. In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used. Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.
+
+A driver must have all firmwares it may need in RAM before suspend() is called.
+If keeping them is not practical, for example due to their size, they must be
+requested early enough using the suspend notifier API described in
+Documentation/driver-api/pm/notifiers.rst.
+
+VI. Are there any precautions to be taken to prevent freezing failures?
+=======================================================================
+
+Yes, there are.
+
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
+from system-wide sleep such as suspend/hibernation is not encouraged.
+If possible, that piece of code must instead hook onto the suspend/hibernation
+notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
+(kernel/cpu.c) for an example.
+
+However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
+it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
+that could lead to freezing failures, because if the suspend/hibernate code
+successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
+to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
+state. As a consequence, the freezer would not be able to freeze that task,
+leading to freezing failure.
+
+However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
+since they ask the freezer to skip freezing this task, since it is anyway
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
+only after the entire suspend/hibernation sequence is complete.
+So, to summarize, use [un]lock_system_sleep() instead of directly using
+mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
+
+V. Miscellaneous
+================
+
+/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
+all user space processes or all freezable kernel threads, in unit of millisecond.
+The default value is 20000, with range of unsigned integer.
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
deleted file mode 100644
index cd283190855a..000000000000
--- a/Documentation/power/freezing-of-tasks.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Freezing of tasks
- (C) 2007 Rafael J. Wysocki <rjw@sisk.pl>, GPL
-
-I. What is the freezing of tasks?
-
-The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).
-
-II. How does it work?
-
-There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
-and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have
-PF_NOFREEZE unset (all user space processes and some kernel threads) are
-regarded as 'freezable' and treated in a special way before the system enters a
-suspend state as well as before a hibernation image is created (in what follows
-we only consider hibernation, but the description also applies to suspend).
-
-Namely, as the first step of the hibernation procedure the function
-freeze_processes() (defined in kernel/power/process.c) is called. A system-wide
-variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate
-whether the system is to undergo a freezing operation. And freeze_processes()
-sets this variable. After this, it executes try_to_freeze_tasks() that sends a
-fake signal to all user space processes, and wakes up all the kernel threads.
-All freezable tasks must react to that by calling try_to_freeze(), which
-results in a call to __refrigerator() (defined in kernel/freezer.c), which sets
-the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes
-it loop until PF_FROZEN is cleared for it. Then, we say that the task is
-'frozen' and therefore the set of functions handling this mechanism is referred
-to as 'the freezer' (these functions are defined in kernel/power/process.c,
-kernel/freezer.c & include/linux/freezer.h). User space processes are generally
-frozen before kernel threads.
-
-__refrigerator() must not be called directly. Instead, use the
-try_to_freeze() function (defined in include/linux/freezer.h), that checks
-if the task is to be frozen and makes the task enter __refrigerator().
-
-For user space processes try_to_freeze() is called automatically from the
-signal-handling code, but the freezable kernel threads need to call it
-explicitly in suitable places or use the wait_event_freezable() or
-wait_event_freezable_timeout() macros (defined in include/linux/freezer.h)
-that combine interruptible sleep with checking if the task is to be frozen and
-calling try_to_freeze(). The main loop of a freezable kernel thread may look
-like the following one:
-
- set_freezable();
- do {
- hub_events();
- wait_event_freezable(khubd_wait,
- !list_empty(&hub_event_list) ||
- kthread_should_stop());
- } while (!kthread_should_stop() || !list_empty(&hub_event_list));
-
-(from drivers/usb/core/hub.c::hub_thread()).
-
-If a freezable kernel thread fails to call try_to_freeze() after the freezer has
-initiated a freezing operation, the freezing of tasks will fail and the entire
-hibernation operation will be cancelled. For this reason, freezable kernel
-threads must call try_to_freeze() somewhere or use one of the
-wait_event_freezable() and wait_event_freezable_timeout() macros.
-
-After the system memory state has been restored from a hibernation image and
-devices have been reinitialized, the function thaw_processes() is called in
-order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that
-have been frozen leave __refrigerator() and continue running.
-
-
-Rationale behind the functions dealing with freezing and thawing of tasks:
--------------------------------------------------------------------------
-
-freeze_processes():
- - freezes only userspace tasks
-
-freeze_kernel_threads():
- - freezes all tasks (including kernel threads) because we can't freeze
- kernel threads without freezing userspace tasks
-
-thaw_kernel_threads():
- - thaws only kernel threads; this is particularly useful if we need to do
- anything special in between thawing of kernel threads and thawing of
- userspace tasks, or if we want to postpone the thawing of userspace tasks
-
-thaw_processes():
- - thaws all tasks (including kernel threads) because we can't thaw userspace
- tasks without thawing kernel threads
-
-
-III. Which kernel threads are freezable?
-
-Kernel threads are not freezable by default. However, a kernel thread may clear
-PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
-directly is not allowed). From this point it is regarded as freezable
-and must call try_to_freeze() in a suitable place.
-
-IV. Why do we do that?
-
-Generally speaking, there is a couple of reasons to use the freezing of tasks:
-
-1. The principal reason is to prevent filesystems from being damaged after
-hibernation. At the moment we have no simple means of checkpointing
-filesystems, so if there are any modifications made to filesystem data and/or
-metadata on disks, we cannot bring them back to the state from before the
-modifications. At the same time each hibernation image contains some
-filesystem-related information that must be consistent with the state of the
-on-disk data and metadata after the system memory state has been restored from
-the image (otherwise the filesystems will be damaged in a nasty way, usually
-making them almost impossible to repair). We therefore freeze tasks that might
-cause the on-disk filesystems' data and metadata to be modified after the
-hibernation image has been created and before the system is finally powered off.
-The majority of these are user space processes, but if any of the kernel threads
-may cause something like this to happen, they have to be freezable.
-
-2. Next, to create the hibernation image we need to free a sufficient amount of
-memory (approximately 50% of available RAM) and we need to do that before
-devices are deactivated, because we generally need them for swapping out. Then,
-after the memory for the image has been freed, we don't want tasks to allocate
-additional memory and we prevent them from doing that by freezing them earlier.
-[Of course, this also means that device drivers should not allocate substantial
-amounts of memory from their .suspend() callbacks before hibernation, but this
-is a separate issue.]
-
-3. The third reason is to prevent user space processes and some kernel threads
-from interfering with the suspending and resuming of devices. A user space
-process running on a second CPU while we are suspending devices may, for
-example, be troublesome and without the freezing of tasks we would need some
-safeguards against race conditions that might occur in such a case.
-
-Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
-of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
-
-"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
-
-Linus: In many ways, 'at all'.
-
-I _do_ realize the IO request queue issues, and that we cannot actually do
-s2ram with some devices in the middle of a DMA. So we want to be able to
-avoid *that*, there's no question about that. And I suspect that stopping
-user threads and then waiting for a sync is practically one of the easier
-ways to do so.
-
-So in practice, the 'at all' may become a 'why freeze kernel threads?' and
-freezing user threads I don't find really objectionable."
-
-Still, there are kernel threads that may want to be freezable. For example, if
-a kernel thread that belongs to a device driver accesses the device directly, it
-in principle needs to know when the device is suspended, so that it doesn't try
-to access it at that time. However, if the kernel thread is freezable, it will
-be frozen before the driver's .suspend() callback is executed and it will be
-thawed after the driver's .resume() callback has run, so it won't be accessing
-the device while it's suspended.
-
-4. Another reason for freezing tasks is to prevent user space processes from
-realizing that hibernation (or suspend) operation takes place. Ideally, user
-space processes should not notice that such a system-wide operation has occurred
-and should continue running without any problems after the restore (or resume
-from suspend). Unfortunately, in the most general case this is quite difficult
-to achieve without the freezing of tasks. Consider, for example, a process
-that depends on all CPUs being online while it's running. Since we need to
-disable nonboot CPUs during the hibernation, if this process is not frozen, it
-may notice that the number of CPUs has changed and may start to work incorrectly
-because of that.
-
-V. Are there any problems related to the freezing of tasks?
-
-Yes, there are.
-
-First of all, the freezing of kernel threads may be tricky if they depend one
-on another. For example, if kernel thread A waits for a completion (in the
-TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
-and B is frozen in the meantime, then A will be blocked until B is thawed, which
-may be undesirable. That's why kernel threads are not freezable by default.
-
-Second, there are the following two problems related to the freezing of user
-space processes:
-1. Putting processes into an uninterruptible sleep distorts the load average.
-2. Now that we have FUSE, plus the framework for doing device drivers in
-userspace, it gets even more complicated because some userspace processes are
-now doing the sorts of things that kernel threads do
-(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
-
-The problem 1. seems to be fixable, although it hasn't been fixed so far. The
-other one is more serious, but it seems that we can work around it by using
-hibernation (and suspend) notifiers (in that case, though, we won't be able to
-avoid the realization by the user space processes that the hibernation is taking
-place).
-
-There are also problems that the freezing of tasks tends to expose, although
-they are not directly related to it. For example, if request_firmware() is
-called from a device driver's .resume() routine, it will timeout and eventually
-fail, because the user land process that should respond to the request is frozen
-at this point. So, seemingly, the failure is due to the freezing of tasks.
-Suppose, however, that the firmware file is located on a filesystem accessible
-only through another device that hasn't been resumed yet. In that case,
-request_firmware() will fail regardless of whether or not the freezing of tasks
-is used. Consequently, the problem is not really related to the freezing of
-tasks, since it generally exists anyway.
-
-A driver must have all firmwares it may need in RAM before suspend() is called.
-If keeping them is not practical, for example due to their size, they must be
-requested early enough using the suspend notifier API described in
-Documentation/driver-api/pm/notifiers.rst.
-
-VI. Are there any precautions to be taken to prevent freezing failures?
-
-Yes, there are.
-
-First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code
-from system-wide sleep such as suspend/hibernation is not encouraged.
-If possible, that piece of code must instead hook onto the suspend/hibernation
-notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code
-(kernel/cpu.c) for an example.
-
-However, if that is not feasible, and grabbing 'system_transition_mutex' is deemed necessary,
-it is strongly discouraged to directly call mutex_[un]lock(&system_transition_mutex) since
-that could lead to freezing failures, because if the suspend/hibernate code
-successfully acquired the 'system_transition_mutex' lock, and hence that other entity failed
-to acquire the lock, then that task would get blocked in TASK_UNINTERRUPTIBLE
-state. As a consequence, the freezer would not be able to freeze that task,
-leading to freezing failure.
-
-However, the [un]lock_system_sleep() APIs are safe to use in this scenario,
-since they ask the freezer to skip freezing this task, since it is anyway
-"frozen enough" as it is blocked on 'system_transition_mutex', which will be released
-only after the entire suspend/hibernation sequence is complete.
-So, to summarize, use [un]lock_system_sleep() instead of directly using
-mutex_[un]lock(&system_transition_mutex). That would prevent freezing failures.
-
-V. Miscellaneous
-/sys/power/pm_freeze_timeout controls how long it will cost at most to freeze
-all user space processes or all freezable kernel threads, in unit of millisecond.
-The default value is 20000, with range of unsigned integer.
diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst
new file mode 100644
index 000000000000..20415f21e48a
--- /dev/null
+++ b/Documentation/power/index.rst
@@ -0,0 +1,46 @@
+:orphan:
+
+================
+Power Management
+================
+
+.. toctree::
+ :maxdepth: 1
+
+ apm-acpi
+ basic-pm-debugging
+ charger-manager
+ drivers-testing
+ energy-model
+ freezing-of-tasks
+ interface
+ opp
+ pci
+ pm_qos_interface
+ power_supply_class
+ runtime_pm
+ s2ram
+ suspend-and-cpuhotplug
+ suspend-and-interrupts
+ swsusp-and-swap-files
+ swsusp-dmcrypt
+ swsusp
+ video
+ tricks
+
+ userland-swsusp
+
+ powercap/powercap
+
+ regulator/consumer
+ regulator/design
+ regulator/machine
+ regulator/overview
+ regulator/regulator
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/power/interface.rst b/Documentation/power/interface.rst
new file mode 100644
index 000000000000..8d270ed27228
--- /dev/null
+++ b/Documentation/power/interface.rst
@@ -0,0 +1,79 @@
+===========================================
+Power Management Interface for System Sleep
+===========================================
+
+Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+The power management subsystem provides userspace with a unified sysfs interface
+for system sleep regardless of the underlying system architecture or platform.
+The interface is located in the /sys/power/ directory (assuming that sysfs is
+mounted at /sys).
+
+/sys/power/state is the system sleep state control file.
+
+Reading from it returns a list of supported sleep states, encoded as:
+
+- 'freeze' (Suspend-to-Idle)
+- 'standby' (Power-On Suspend)
+- 'mem' (Suspend-to-RAM)
+- 'disk' (Suspend-to-Disk)
+
+Suspend-to-Idle is always supported. Suspend-to-Disk is always supported
+too as long the kernel has been configured to support hibernation at all
+(ie. CONFIG_HIBERNATION is set in the kernel configuration file). Support
+for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
+platform.
+
+If one of the strings listed in /sys/power/state is written to it, the system
+will attempt to transition into the corresponding sleep state. Refer to
+Documentation/admin-guide/pm/sleep-states.rst for a description of each of
+those states.
+
+/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
+Specifically, it tells the kernel what to do after creating a hibernation image.
+
+Reading from it returns a list of supported options encoded as:
+
+- 'platform' (put the system into sleep using a platform-provided method)
+- 'shutdown' (shut the system down)
+- 'reboot' (reboot the system)
+- 'suspend' (trigger a Suspend-to-RAM transition)
+- 'test_resume' (resume-after-hibernation test mode)
+
+The currently selected option is printed in square brackets.
+
+The 'platform' option is only available if the platform provides a special
+mechanism to put the system to sleep after creating a hibernation image (ACPI
+does that, for example). The 'suspend' option is available if Suspend-to-RAM
+is supported. Refer to Documentation/power/basic-pm-debugging.rst for the
+description of the 'test_resume' option.
+
+To select an option, write the string representing it to /sys/power/disk.
+
+/sys/power/image_size controls the size of hibernation images.
+
+It can be written a string representing a non-negative integer that will be
+used as a best-effort upper limit of the image size, in bytes. The hibernation
+core will do its best to ensure that the image size will not exceed that number.
+However, if that turns out to be impossible to achieve, a hibernation image will
+still be created and its size will be as small as possible. In particular,
+writing '0' to this file will enforce hibernation images to be as small as
+possible.
+
+Reading from this file returns the current image size limit, which is set to
+around 2/5 of available RAM by default.
+
+/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
+or resume event point in the RTC across reboots.
+
+It helps to debug hard lockups or reboots due to device driver failures that
+occur during system suspend or resume (which is more common) more effectively.
+
+If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
+event point in turn will be stored in the RTC memory (overwriting the actual
+RTC information), so it will survive a system crash if one occurs right after
+storing it and it can be used later to identify the driver that caused the crash
+to happen (see Documentation/power/s2ram.rst for more information).
+
+Initially it contains '0' which may be changed to '1' by writing a string
+representing a nonzero integer into it.
diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
deleted file mode 100644
index 27df7f98668a..000000000000
--- a/Documentation/power/interface.txt
+++ /dev/null
@@ -1,77 +0,0 @@
-Power Management Interface for System Sleep
-
-Copyright (c) 2016 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-The power management subsystem provides userspace with a unified sysfs interface
-for system sleep regardless of the underlying system architecture or platform.
-The interface is located in the /sys/power/ directory (assuming that sysfs is
-mounted at /sys).
-
-/sys/power/state is the system sleep state control file.
-
-Reading from it returns a list of supported sleep states, encoded as:
-
-'freeze' (Suspend-to-Idle)
-'standby' (Power-On Suspend)
-'mem' (Suspend-to-RAM)
-'disk' (Suspend-to-Disk)
-
-Suspend-to-Idle is always supported. Suspend-to-Disk is always supported
-too as long the kernel has been configured to support hibernation at all
-(ie. CONFIG_HIBERNATION is set in the kernel configuration file). Support
-for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
-platform.
-
-If one of the strings listed in /sys/power/state is written to it, the system
-will attempt to transition into the corresponding sleep state. Refer to
-Documentation/admin-guide/pm/sleep-states.rst for a description of each of
-those states.
-
-/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk).
-Specifically, it tells the kernel what to do after creating a hibernation image.
-
-Reading from it returns a list of supported options encoded as:
-
-'platform' (put the system into sleep using a platform-provided method)
-'shutdown' (shut the system down)
-'reboot' (reboot the system)
-'suspend' (trigger a Suspend-to-RAM transition)
-'test_resume' (resume-after-hibernation test mode)
-
-The currently selected option is printed in square brackets.
-
-The 'platform' option is only available if the platform provides a special
-mechanism to put the system to sleep after creating a hibernation image (ACPI
-does that, for example). The 'suspend' option is available if Suspend-to-RAM
-is supported. Refer to Documentation/power/basic-pm-debugging.txt for the
-description of the 'test_resume' option.
-
-To select an option, write the string representing it to /sys/power/disk.
-
-/sys/power/image_size controls the size of hibernation images.
-
-It can be written a string representing a non-negative integer that will be
-used as a best-effort upper limit of the image size, in bytes. The hibernation
-core will do its best to ensure that the image size will not exceed that number.
-However, if that turns out to be impossible to achieve, a hibernation image will
-still be created and its size will be as small as possible. In particular,
-writing '0' to this file will enforce hibernation images to be as small as
-possible.
-
-Reading from this file returns the current image size limit, which is set to
-around 2/5 of available RAM by default.
-
-/sys/power/pm_trace controls the PM trace mechanism saving the last suspend
-or resume event point in the RTC across reboots.
-
-It helps to debug hard lockups or reboots due to device driver failures that
-occur during system suspend or resume (which is more common) more effectively.
-
-If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume
-event point in turn will be stored in the RTC memory (overwriting the actual
-RTC information), so it will survive a system crash if one occurs right after
-storing it and it can be used later to identify the driver that caused the crash
-to happen (see Documentation/power/s2ram.txt for more information).
-
-Initially it contains '0' which may be changed to '1' by writing a string
-representing a nonzero integer into it.
diff --git a/Documentation/power/opp.rst b/Documentation/power/opp.rst
new file mode 100644
index 000000000000..b3cf1def9dee
--- /dev/null
+++ b/Documentation/power/opp.rst
@@ -0,0 +1,379 @@
+==========================================
+Operating Performance Points (OPP) Library
+==========================================
+
+(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
+
+.. Contents
+
+ 1. Introduction
+ 2. Initial OPP List Registration
+ 3. OPP Search Functions
+ 4. OPP Availability Control Functions
+ 5. OPP Data Retrieval Functions
+ 6. Data Structures
+
+1. Introduction
+===============
+
+1.1 What is an Operating Performance Point (OPP)?
+-------------------------------------------------
+
+Complex SoCs of today consists of a multiple sub-modules working in conjunction.
+In an operational system executing varied use cases, not all modules in the SoC
+need to function at their highest performing frequency all the time. To
+facilitate this, sub-modules in a SoC are grouped into domains, allowing some
+domains to run at lower voltage and frequency while other domains run at
+voltage/frequency pairs that are higher.
+
+The set of discrete tuples consisting of frequency and voltage pairs that
+the device will support per domain are called Operating Performance Points or
+OPPs.
+
+As an example:
+
+Let us consider an MPU device which supports the following:
+{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
+{1GHz at minimum voltage of 1.3V}
+
+We can represent these as three OPPs as the following {Hz, uV} tuples:
+
+- {300000000, 1000000}
+- {800000000, 1200000}
+- {1000000000, 1300000}
+
+1.2 Operating Performance Points Library
+----------------------------------------
+
+OPP library provides a set of helper functions to organize and query the OPP
+information. The library is located in drivers/base/power/opp.c and the header
+is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
+CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
+CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to
+optionally boot at a certain OPP without needing cpufreq.
+
+Typical usage of the OPP library is as follows::
+
+ (users) -> registers a set of default OPPs -> (library)
+ SoC framework -> modifies on required cases certain OPPs -> OPP layer
+ -> queries to search/retrieve information ->
+
+OPP layer expects each domain to be represented by a unique device pointer. SoC
+framework registers a set of initial OPPs per device with the OPP layer. This
+list is expected to be an optimally small number typically around 5 per device.
+This initial list contains a set of OPPs that the framework expects to be safely
+enabled by default in the system.
+
+Note on OPP Availability
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As the system proceeds to operate, SoC framework may choose to make certain
+OPPs available or not available on each device based on various external
+factors. Example usage: Thermal management or other exceptional situations where
+SoC framework might choose to disable a higher frequency OPP to safely continue
+operations until that OPP could be re-enabled if possible.
+
+OPP library facilitates this concept in it's implementation. The following
+operational functions operate only on available opps:
+opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
+
+dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
+be used for dev_pm_opp_enable/disable functions to make an opp available as required.
+
+WARNING: Users of OPP library should refresh their availability count using
+get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the
+exact mechanism to trigger these or the notification mechanism to other
+dependent subsystems such as cpufreq are left to the discretion of the SoC
+specific framework which uses the OPP library. Similar care needs to be taken
+care to refresh the cpufreq table in cases of these operations.
+
+2. Initial OPP List Registration
+================================
+The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
+device. It is expected that the SoC framework will register the OPP entries
+optimally- typical numbers range to be less than 5. The list generated by
+registering the OPPs is maintained by OPP library throughout the device
+operation. The SoC framework can subsequently control the availability of the
+OPPs dynamically using the dev_pm_opp_enable / disable functions.
+
+dev_pm_opp_add
+ Add a new OPP for a specific domain represented by the device pointer.
+ The OPP is defined using the frequency and voltage. Once added, the OPP
+ is assumed to be available and control of it's availability can be done
+ with the dev_pm_opp_enable/disable functions. OPP library internally stores
+ and manages this information in the opp struct. This function may be
+ used by SoC framework to define a optimal list as per the demands of
+ SoC usage environment.
+
+ WARNING:
+ Do not use this function in interrupt context.
+
+ Example::
+
+ soc_pm_init()
+ {
+ /* Do things */
+ r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
+ if (!r) {
+ pr_err("%s: unable to register mpu opp(%d)\n", r);
+ goto no_cpufreq;
+ }
+ /* Do cpufreq things */
+ no_cpufreq:
+ /* Do remaining things */
+ }
+
+3. OPP Search Functions
+=======================
+High level framework such as cpufreq operates on frequencies. To map the
+frequency back to the corresponding OPP, OPP library provides handy functions
+to search the OPP list that OPP library internally manages. These search
+functions return the matching pointer representing the opp if a match is
+found, else returns error. These errors are expected to be handled by standard
+error checks such as IS_ERR() and appropriate actions taken by the caller.
+
+Callers of these functions shall call dev_pm_opp_put() after they have used the
+OPP. Otherwise the memory for the OPP will never get freed and result in
+memleak.
+
+dev_pm_opp_find_freq_exact
+ Search for an OPP based on an *exact* frequency and
+ availability. This function is especially useful to enable an OPP which
+ is not available by default.
+ Example: In a case when SoC framework detects a situation where a
+ higher frequency could be made available, it can use this function to
+ find the OPP prior to call the dev_pm_opp_enable to actually make
+ it available::
+
+ opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+ dev_pm_opp_put(opp);
+ /* dont operate on the pointer.. just do a sanity check.. */
+ if (IS_ERR(opp)) {
+ pr_err("frequency not disabled!\n");
+ /* trigger appropriate actions.. */
+ } else {
+ dev_pm_opp_enable(dev,1000000000);
+ }
+
+ NOTE:
+ This is the only search function that operates on OPPs which are
+ not available.
+
+dev_pm_opp_find_freq_floor
+ Search for an available OPP which is *at most* the
+ provided frequency. This function is useful while searching for a lesser
+ match OR operating on OPP information in the order of decreasing
+ frequency.
+ Example: To find the highest opp for a device::
+
+ freq = ULONG_MAX;
+ opp = dev_pm_opp_find_freq_floor(dev, &freq);
+ dev_pm_opp_put(opp);
+
+dev_pm_opp_find_freq_ceil
+ Search for an available OPP which is *at least* the
+ provided frequency. This function is useful while searching for a
+ higher match OR operating on OPP information in the order of increasing
+ frequency.
+ Example 1: To find the lowest opp for a device::
+
+ freq = 0;
+ opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+ dev_pm_opp_put(opp);
+
+ Example 2: A simplified implementation of a SoC cpufreq_driver->target::
+
+ soc_cpufreq_target(..)
+ {
+ /* Do stuff like policy checks etc. */
+ /* Find the best frequency match for the req */
+ opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+ dev_pm_opp_put(opp);
+ if (!IS_ERR(opp))
+ soc_switch_to_freq_voltage(freq);
+ else
+ /* do something when we can't satisfy the req */
+ /* do other stuff */
+ }
+
+4. OPP Availability Control Functions
+=====================================
+A default OPP list registered with the OPP library may not cater to all possible
+situation. The OPP library provides a set of functions to modify the
+availability of a OPP within the OPP list. This allows SoC frameworks to have
+fine grained dynamic control of which sets of OPPs are operationally available.
+These functions are intended to *temporarily* remove an OPP in conditions such
+as thermal considerations (e.g. don't use OPPx until the temperature drops).
+
+WARNING:
+ Do not use these functions in interrupt context.
+
+dev_pm_opp_enable
+ Make a OPP available for operation.
+ Example: Lets say that 1GHz OPP is to be made available only if the
+ SoC temperature is lower than a certain threshold. The SoC framework
+ implementation might choose to do something as follows::
+
+ if (cur_temp < temp_low_thresh) {
+ /* Enable 1GHz if it was disabled */
+ opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+ dev_pm_opp_put(opp);
+ /* just error check */
+ if (!IS_ERR(opp))
+ ret = dev_pm_opp_enable(dev, 1000000000);
+ else
+ goto try_something_else;
+ }
+
+dev_pm_opp_disable
+ Make an OPP to be not available for operation
+ Example: Lets say that 1GHz OPP is to be disabled if the temperature
+ exceeds a threshold value. The SoC framework implementation might
+ choose to do something as follows::
+
+ if (cur_temp > temp_high_thresh) {
+ /* Disable 1GHz if it was enabled */
+ opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
+ dev_pm_opp_put(opp);
+ /* just error check */
+ if (!IS_ERR(opp))
+ ret = dev_pm_opp_disable(dev, 1000000000);
+ else
+ goto try_something_else;
+ }
+
+5. OPP Data Retrieval Functions
+===============================
+Since OPP library abstracts away the OPP information, a set of functions to pull
+information from the OPP structure is necessary. Once an OPP pointer is
+retrieved using the search functions, the following functions can be used by SoC
+framework to retrieve the information represented inside the OPP layer.
+
+dev_pm_opp_get_voltage
+ Retrieve the voltage represented by the opp pointer.
+ Example: At a cpufreq transition to a different frequency, SoC
+ framework requires to set the voltage represented by the OPP using
+ the regulator framework to the Power Management chip providing the
+ voltage::
+
+ soc_switch_to_freq_voltage(freq)
+ {
+ /* do things */
+ opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+ v = dev_pm_opp_get_voltage(opp);
+ dev_pm_opp_put(opp);
+ if (v)
+ regulator_set_voltage(.., v);
+ /* do other things */
+ }
+
+dev_pm_opp_get_freq
+ Retrieve the freq represented by the opp pointer.
+ Example: Lets say the SoC framework uses a couple of helper functions
+ we could pass opp pointers instead of doing additional parameters to
+ handle quiet a bit of data parameters::
+
+ soc_cpufreq_target(..)
+ {
+ /* do things.. */
+ max_freq = ULONG_MAX;
+ max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
+ requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
+ if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
+ r = soc_test_validity(max_opp, requested_opp);
+ dev_pm_opp_put(max_opp);
+ dev_pm_opp_put(requested_opp);
+ /* do other things */
+ }
+ soc_test_validity(..)
+ {
+ if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
+ return -EINVAL;
+ if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
+ return -EINVAL;
+ /* do things.. */
+ }
+
+dev_pm_opp_get_opp_count
+ Retrieve the number of available opps for a device
+ Example: Lets say a co-processor in the SoC needs to know the available
+ frequencies in a table, the main processor can notify as following::
+
+ soc_notify_coproc_available_frequencies()
+ {
+ /* Do things */
+ num_available = dev_pm_opp_get_opp_count(dev);
+ speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
+ /* populate the table in increasing order */
+ freq = 0;
+ while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
+ speeds[i] = freq;
+ freq++;
+ i++;
+ dev_pm_opp_put(opp);
+ }
+
+ soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
+ /* Do other things */
+ }
+
+6. Data Structures
+==================
+Typically an SoC contains multiple voltage domains which are variable. Each
+domain is represented by a device pointer. The relationship to OPP can be
+represented as follows::
+
+ SoC
+ |- device 1
+ | |- opp 1 (availability, freq, voltage)
+ | |- opp 2 ..
+ ... ...
+ | `- opp n ..
+ |- device 2
+ ...
+ `- device m
+
+OPP library maintains a internal list that the SoC framework populates and
+accessed by various functions as described above. However, the structures
+representing the actual OPPs and domains are internal to the OPP library itself
+to allow for suitable abstraction reusable across systems.
+
+struct dev_pm_opp
+ The internal data structure of OPP library which is used to
+ represent an OPP. In addition to the freq, voltage, availability
+ information, it also contains internal book keeping information required
+ for the OPP library to operate on. Pointer to this structure is
+ provided back to the users such as SoC framework to be used as a
+ identifier for OPP in the interactions with OPP layer.
+
+ WARNING:
+ The struct dev_pm_opp pointer should not be parsed or modified by the
+ users. The defaults of for an instance is populated by
+ dev_pm_opp_add, but the availability of the OPP can be modified
+ by dev_pm_opp_enable/disable functions.
+
+struct device
+ This is used to identify a domain to the OPP layer. The
+ nature of the device and it's implementation is left to the user of
+ OPP library such as the SoC framework.
+
+Overall, in a simplistic view, the data structure operations is represented as
+following::
+
+ Initialization / modification:
+ +-----+ /- dev_pm_opp_enable
+ dev_pm_opp_add --> | opp | <-------
+ | +-----+ \- dev_pm_opp_disable
+ \-------> domain_info(device)
+
+ Search functions:
+ /-- dev_pm_opp_find_freq_ceil ---\ +-----+
+ domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
+ \-- dev_pm_opp_find_freq_floor ---/ +-----+
+
+ Retrieval functions:
+ +-----+ /- dev_pm_opp_get_voltage
+ | opp | <---
+ +-----+ \- dev_pm_opp_get_freq
+
+ domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt
deleted file mode 100644
index 0c007e250cd1..000000000000
--- a/Documentation/power/opp.txt
+++ /dev/null
@@ -1,342 +0,0 @@
-Operating Performance Points (OPP) Library
-==========================================
-
-(C) 2009-2010 Nishanth Menon <nm@ti.com>, Texas Instruments Incorporated
-
-Contents
---------
-1. Introduction
-2. Initial OPP List Registration
-3. OPP Search Functions
-4. OPP Availability Control Functions
-5. OPP Data Retrieval Functions
-6. Data Structures
-
-1. Introduction
-===============
-1.1 What is an Operating Performance Point (OPP)?
-
-Complex SoCs of today consists of a multiple sub-modules working in conjunction.
-In an operational system executing varied use cases, not all modules in the SoC
-need to function at their highest performing frequency all the time. To
-facilitate this, sub-modules in a SoC are grouped into domains, allowing some
-domains to run at lower voltage and frequency while other domains run at
-voltage/frequency pairs that are higher.
-
-The set of discrete tuples consisting of frequency and voltage pairs that
-the device will support per domain are called Operating Performance Points or
-OPPs.
-
-As an example:
-Let us consider an MPU device which supports the following:
-{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
-{1GHz at minimum voltage of 1.3V}
-
-We can represent these as three OPPs as the following {Hz, uV} tuples:
-{300000000, 1000000}
-{800000000, 1200000}
-{1000000000, 1300000}
-
-1.2 Operating Performance Points Library
-
-OPP library provides a set of helper functions to organize and query the OPP
-information. The library is located in drivers/base/power/opp.c and the header
-is located in include/linux/pm_opp.h. OPP library can be enabled by enabling
-CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on
-CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to
-optionally boot at a certain OPP without needing cpufreq.
-
-Typical usage of the OPP library is as follows:
-(users) -> registers a set of default OPPs -> (library)
-SoC framework -> modifies on required cases certain OPPs -> OPP layer
- -> queries to search/retrieve information ->
-
-OPP layer expects each domain to be represented by a unique device pointer. SoC
-framework registers a set of initial OPPs per device with the OPP layer. This
-list is expected to be an optimally small number typically around 5 per device.
-This initial list contains a set of OPPs that the framework expects to be safely
-enabled by default in the system.
-
-Note on OPP Availability:
-------------------------
-As the system proceeds to operate, SoC framework may choose to make certain
-OPPs available or not available on each device based on various external
-factors. Example usage: Thermal management or other exceptional situations where
-SoC framework might choose to disable a higher frequency OPP to safely continue
-operations until that OPP could be re-enabled if possible.
-
-OPP library facilitates this concept in it's implementation. The following
-operational functions operate only on available opps:
-opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
-
-dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then
-be used for dev_pm_opp_enable/disable functions to make an opp available as required.
-
-WARNING: Users of OPP library should refresh their availability count using
-get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the
-exact mechanism to trigger these or the notification mechanism to other
-dependent subsystems such as cpufreq are left to the discretion of the SoC
-specific framework which uses the OPP library. Similar care needs to be taken
-care to refresh the cpufreq table in cases of these operations.
-
-2. Initial OPP List Registration
-================================
-The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per
-device. It is expected that the SoC framework will register the OPP entries
-optimally- typical numbers range to be less than 5. The list generated by
-registering the OPPs is maintained by OPP library throughout the device
-operation. The SoC framework can subsequently control the availability of the
-OPPs dynamically using the dev_pm_opp_enable / disable functions.
-
-dev_pm_opp_add - Add a new OPP for a specific domain represented by the device pointer.
- The OPP is defined using the frequency and voltage. Once added, the OPP
- is assumed to be available and control of it's availability can be done
- with the dev_pm_opp_enable/disable functions. OPP library internally stores
- and manages this information in the opp struct. This function may be
- used by SoC framework to define a optimal list as per the demands of
- SoC usage environment.
-
- WARNING: Do not use this function in interrupt context.
-
- Example:
- soc_pm_init()
- {
- /* Do things */
- r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
- if (!r) {
- pr_err("%s: unable to register mpu opp(%d)\n", r);
- goto no_cpufreq;
- }
- /* Do cpufreq things */
- no_cpufreq:
- /* Do remaining things */
- }
-
-3. OPP Search Functions
-=======================
-High level framework such as cpufreq operates on frequencies. To map the
-frequency back to the corresponding OPP, OPP library provides handy functions
-to search the OPP list that OPP library internally manages. These search
-functions return the matching pointer representing the opp if a match is
-found, else returns error. These errors are expected to be handled by standard
-error checks such as IS_ERR() and appropriate actions taken by the caller.
-
-Callers of these functions shall call dev_pm_opp_put() after they have used the
-OPP. Otherwise the memory for the OPP will never get freed and result in
-memleak.
-
-dev_pm_opp_find_freq_exact - Search for an OPP based on an *exact* frequency and
- availability. This function is especially useful to enable an OPP which
- is not available by default.
- Example: In a case when SoC framework detects a situation where a
- higher frequency could be made available, it can use this function to
- find the OPP prior to call the dev_pm_opp_enable to actually make it available.
- opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
- dev_pm_opp_put(opp);
- /* dont operate on the pointer.. just do a sanity check.. */
- if (IS_ERR(opp)) {
- pr_err("frequency not disabled!\n");
- /* trigger appropriate actions.. */
- } else {
- dev_pm_opp_enable(dev,1000000000);
- }
-
- NOTE: This is the only search function that operates on OPPs which are
- not available.
-
-dev_pm_opp_find_freq_floor - Search for an available OPP which is *at most* the
- provided frequency. This function is useful while searching for a lesser
- match OR operating on OPP information in the order of decreasing
- frequency.
- Example: To find the highest opp for a device:
- freq = ULONG_MAX;
- opp = dev_pm_opp_find_freq_floor(dev, &freq);
- dev_pm_opp_put(opp);
-
-dev_pm_opp_find_freq_ceil - Search for an available OPP which is *at least* the
- provided frequency. This function is useful while searching for a
- higher match OR operating on OPP information in the order of increasing
- frequency.
- Example 1: To find the lowest opp for a device:
- freq = 0;
- opp = dev_pm_opp_find_freq_ceil(dev, &freq);
- dev_pm_opp_put(opp);
- Example 2: A simplified implementation of a SoC cpufreq_driver->target:
- soc_cpufreq_target(..)
- {
- /* Do stuff like policy checks etc. */
- /* Find the best frequency match for the req */
- opp = dev_pm_opp_find_freq_ceil(dev, &freq);
- dev_pm_opp_put(opp);
- if (!IS_ERR(opp))
- soc_switch_to_freq_voltage(freq);
- else
- /* do something when we can't satisfy the req */
- /* do other stuff */
- }
-
-4. OPP Availability Control Functions
-=====================================
-A default OPP list registered with the OPP library may not cater to all possible
-situation. The OPP library provides a set of functions to modify the
-availability of a OPP within the OPP list. This allows SoC frameworks to have
-fine grained dynamic control of which sets of OPPs are operationally available.
-These functions are intended to *temporarily* remove an OPP in conditions such
-as thermal considerations (e.g. don't use OPPx until the temperature drops).
-
-WARNING: Do not use these functions in interrupt context.
-
-dev_pm_opp_enable - Make a OPP available for operation.
- Example: Lets say that 1GHz OPP is to be made available only if the
- SoC temperature is lower than a certain threshold. The SoC framework
- implementation might choose to do something as follows:
- if (cur_temp < temp_low_thresh) {
- /* Enable 1GHz if it was disabled */
- opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
- dev_pm_opp_put(opp);
- /* just error check */
- if (!IS_ERR(opp))
- ret = dev_pm_opp_enable(dev, 1000000000);
- else
- goto try_something_else;
- }
-
-dev_pm_opp_disable - Make an OPP to be not available for operation
- Example: Lets say that 1GHz OPP is to be disabled if the temperature
- exceeds a threshold value. The SoC framework implementation might
- choose to do something as follows:
- if (cur_temp > temp_high_thresh) {
- /* Disable 1GHz if it was enabled */
- opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
- dev_pm_opp_put(opp);
- /* just error check */
- if (!IS_ERR(opp))
- ret = dev_pm_opp_disable(dev, 1000000000);
- else
- goto try_something_else;
- }
-
-5. OPP Data Retrieval Functions
-===============================
-Since OPP library abstracts away the OPP information, a set of functions to pull
-information from the OPP structure is necessary. Once an OPP pointer is
-retrieved using the search functions, the following functions can be used by SoC
-framework to retrieve the information represented inside the OPP layer.
-
-dev_pm_opp_get_voltage - Retrieve the voltage represented by the opp pointer.
- Example: At a cpufreq transition to a different frequency, SoC
- framework requires to set the voltage represented by the OPP using
- the regulator framework to the Power Management chip providing the
- voltage.
- soc_switch_to_freq_voltage(freq)
- {
- /* do things */
- opp = dev_pm_opp_find_freq_ceil(dev, &freq);
- v = dev_pm_opp_get_voltage(opp);
- dev_pm_opp_put(opp);
- if (v)
- regulator_set_voltage(.., v);
- /* do other things */
- }
-
-dev_pm_opp_get_freq - Retrieve the freq represented by the opp pointer.
- Example: Lets say the SoC framework uses a couple of helper functions
- we could pass opp pointers instead of doing additional parameters to
- handle quiet a bit of data parameters.
- soc_cpufreq_target(..)
- {
- /* do things.. */
- max_freq = ULONG_MAX;
- max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq);
- requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq);
- if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
- r = soc_test_validity(max_opp, requested_opp);
- dev_pm_opp_put(max_opp);
- dev_pm_opp_put(requested_opp);
- /* do other things */
- }
- soc_test_validity(..)
- {
- if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
- return -EINVAL;
- if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
- return -EINVAL;
- /* do things.. */
- }
-
-dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device
- Example: Lets say a co-processor in the SoC needs to know the available
- frequencies in a table, the main processor can notify as following:
- soc_notify_coproc_available_frequencies()
- {
- /* Do things */
- num_available = dev_pm_opp_get_opp_count(dev);
- speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
- /* populate the table in increasing order */
- freq = 0;
- while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
- speeds[i] = freq;
- freq++;
- i++;
- dev_pm_opp_put(opp);
- }
-
- soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
- /* Do other things */
- }
-
-6. Data Structures
-==================
-Typically an SoC contains multiple voltage domains which are variable. Each
-domain is represented by a device pointer. The relationship to OPP can be
-represented as follows:
-SoC
- |- device 1
- | |- opp 1 (availability, freq, voltage)
- | |- opp 2 ..
- ... ...
- | `- opp n ..
- |- device 2
- ...
- `- device m
-
-OPP library maintains a internal list that the SoC framework populates and
-accessed by various functions as described above. However, the structures
-representing the actual OPPs and domains are internal to the OPP library itself
-to allow for suitable abstraction reusable across systems.
-
-struct dev_pm_opp - The internal data structure of OPP library which is used to
- represent an OPP. In addition to the freq, voltage, availability
- information, it also contains internal book keeping information required
- for the OPP library to operate on. Pointer to this structure is
- provided back to the users such as SoC framework to be used as a
- identifier for OPP in the interactions with OPP layer.
-
- WARNING: The struct dev_pm_opp pointer should not be parsed or modified by the
- users. The defaults of for an instance is populated by dev_pm_opp_add, but the
- availability of the OPP can be modified by dev_pm_opp_enable/disable functions.
-
-struct device - This is used to identify a domain to the OPP layer. The
- nature of the device and it's implementation is left to the user of
- OPP library such as the SoC framework.
-
-Overall, in a simplistic view, the data structure operations is represented as
-following:
-
-Initialization / modification:
- +-----+ /- dev_pm_opp_enable
-dev_pm_opp_add --> | opp | <-------
- | +-----+ \- dev_pm_opp_disable
- \-------> domain_info(device)
-
-Search functions:
- /-- dev_pm_opp_find_freq_ceil ---\ +-----+
-domain_info<---- dev_pm_opp_find_freq_exact -----> | opp |
- \-- dev_pm_opp_find_freq_floor ---/ +-----+
-
-Retrieval functions:
-+-----+ /- dev_pm_opp_get_voltage
-| opp | <---
-+-----+ \- dev_pm_opp_get_freq
-
-domain_info <- dev_pm_opp_get_opp_count
diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst
new file mode 100644
index 000000000000..0e2ef7429304
--- /dev/null
+++ b/Documentation/power/pci.rst
@@ -0,0 +1,1135 @@
+====================
+PCI Power Management
+====================
+
+Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+An overview of concepts and the Linux kernel's interfaces related to PCI power
+management. Based on previous work by Patrick Mochel <mochel@transmeta.com>
+(and others).
+
+This document only covers the aspects of power management specific to PCI
+devices. For general description of the kernel's interfaces related to device
+power management refer to Documentation/driver-api/pm/devices.rst and
+Documentation/power/runtime_pm.rst.
+
+.. contents:
+
+ 1. Hardware and Platform Support for PCI Power Management
+ 2. PCI Subsystem and Device Power Management
+ 3. PCI Device Drivers and Power Management
+ 4. Resources
+
+
+1. Hardware and Platform Support for PCI Power Management
+=========================================================
+
+1.1. Native and Platform-Based Power Management
+-----------------------------------------------
+
+In general, power management is a feature allowing one to save energy by putting
+devices into states in which they draw less power (low-power states) at the
+price of reduced functionality or performance.
+
+Usually, a device is put into a low-power state when it is underutilized or
+completely inactive. However, when it is necessary to use the device once
+again, it has to be put back into the "fully functional" state (full-power
+state). This may happen when there are some data for the device to handle or
+as a result of an external event requiring the device to be active, which may
+be signaled by the device itself.
+
+PCI devices may be put into low-power states in two ways, by using the device
+capabilities introduced by the PCI Bus Power Management Interface Specification,
+or with the help of platform firmware, such as an ACPI BIOS. In the first
+approach, that is referred to as the native PCI power management (native PCI PM)
+in what follows, the device power state is changed as a result of writing a
+specific value into one of its standard configuration registers. The second
+approach requires the platform firmware to provide special methods that may be
+used by the kernel to change the device's power state.
+
+Devices supporting the native PCI PM usually can generate wakeup signals called
+Power Management Events (PMEs) to let the kernel know about external events
+requiring the device to be active. After receiving a PME the kernel is supposed
+to put the device that sent it into the full-power state. However, the PCI Bus
+Power Management Interface Specification doesn't define any standard method of
+delivering the PME from the device to the CPU and the operating system kernel.
+It is assumed that the platform firmware will perform this task and therefore,
+even though a PCI device is set up to generate PMEs, it also may be necessary to
+prepare the platform firmware for notifying the CPU of the PMEs coming from the
+device (e.g. by generating interrupts).
+
+In turn, if the methods provided by the platform firmware are used for changing
+the power state of a device, usually the platform also provides a method for
+preparing the device to generate wakeup signals. In that case, however, it
+often also is necessary to prepare the device for generating PMEs using the
+native PCI PM mechanism, because the method provided by the platform depends on
+that.
+
+Thus in many situations both the native and the platform-based power management
+mechanisms have to be used simultaneously to obtain the desired result.
+
+1.2. Native PCI Power Management
+--------------------------------
+
+The PCI Bus Power Management Interface Specification (PCI PM Spec) was
+introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a
+standard interface for performing various operations related to power
+management.
+
+The implementation of the PCI PM Spec is optional for conventional PCI devices,
+but it is mandatory for PCI Express devices. If a device supports the PCI PM
+Spec, it has an 8 byte power management capability field in its PCI
+configuration space. This field is used to describe and control the standard
+features related to the native PCI power management.
+
+The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
+(B0-B3). The higher the number, the less power is drawn by the device or bus
+in that state. However, the higher the number, the longer the latency for
+the device or bus to return to the full-power state (D0 or B0, respectively).
+
+There are two variants of the D3 state defined by the specification. The first
+one is D3hot, referred to as the software accessible D3, because devices can be
+programmed to go into it. The second one, D3cold, is the state that PCI devices
+are in when the supply voltage (Vcc) is removed from them. It is not possible
+to program a PCI device to go into D3cold, although there may be a programmable
+interface for putting the bus the device is on into a state in which Vcc is
+removed from all devices on the bus.
+
+PCI bus power management, however, is not supported by the Linux kernel at the
+time of this writing and therefore it is not covered by this document.
+
+Note that every PCI device can be in the full-power state (D0) or in D3cold,
+regardless of whether or not it implements the PCI PM Spec. In addition to
+that, if the PCI PM Spec is implemented by the device, it must support D3hot
+as well as D0. The support for the D1 and D2 power states is optional.
+
+PCI devices supporting the PCI PM Spec can be programmed to go to any of the
+supported low-power states (except for D3cold). While in D1-D3hot the
+standard configuration registers of the device must be accessible to software
+(i.e. the device is required to respond to PCI configuration accesses), although
+its I/O and memory spaces are then disabled. This allows the device to be
+programmatically put into D0. Thus the kernel can switch the device back and
+forth between D0 and the supported low-power states (except for D3cold) and the
+possible power state transitions the device can undergo are the following:
+
++----------------------------+
+| Current State | New State |
++----------------------------+
+| D0 | D1, D2, D3 |
++----------------------------+
+| D1 | D2, D3 |
++----------------------------+
+| D2 | D3 |
++----------------------------+
+| D1, D2, D3 | D0 |
++----------------------------+
+
+The transition from D3cold to D0 occurs when the supply voltage is provided to
+the device (i.e. power is restored). In that case the device returns to D0 with
+a full power-on reset sequence and the power-on defaults are restored to the
+device by hardware just as at initial power up.
+
+PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
+while in a low-power state (D1-D3), but they are not required to be capable
+of generating PMEs from all supported low-power states. In particular, the
+capability of generating PMEs from D3cold is optional and depends on the
+presence of additional voltage (3.3Vaux) allowing the device to remain
+sufficiently active to generate a wakeup signal.
+
+1.3. ACPI Device Power Management
+---------------------------------
+
+The platform firmware support for the power management of PCI devices is
+system-specific. However, if the system in question is compliant with the
+Advanced Configuration and Power Interface (ACPI) Specification, like the
+majority of x86-based systems, it is supposed to implement device power
+management interfaces defined by the ACPI standard.
+
+For this purpose the ACPI BIOS provides special functions called "control
+methods" that may be executed by the kernel to perform specific tasks, such as
+putting a device into a low-power state. These control methods are encoded
+using special byte-code language called the ACPI Machine Language (AML) and
+stored in the machine's BIOS. The kernel loads them from the BIOS and executes
+them as needed using an AML interpreter that translates the AML byte code into
+computations and memory or I/O space accesses. This way, in theory, a BIOS
+writer can provide the kernel with a means to perform actions depending
+on the system design in a system-specific fashion.
+
+ACPI control methods may be divided into global control methods, that are not
+associated with any particular devices, and device control methods, that have
+to be defined separately for each device supposed to be handled with the help of
+the platform. This means, in particular, that ACPI device control methods can
+only be used to handle devices that the BIOS writer knew about in advance. The
+ACPI methods used for device power management fall into that category.
+
+The ACPI specification assumes that devices can be in one of four power states
+labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
+D0-D3 states (although the difference between D3hot and D3cold is not taken
+into account by ACPI). Moreover, for each power state of a device there is a
+set of power resources that have to be enabled for the device to be put into
+that state. These power resources are controlled (i.e. enabled or disabled)
+with the help of their own control methods, _ON and _OFF, that have to be
+defined individually for each of them.
+
+To put a device into the ACPI power state Dx (where x is a number between 0 and
+3 inclusive) the kernel is supposed to (1) enable the power resources required
+by the device in this state using their _ON control methods and (2) execute the
+_PSx control method defined for the device. In addition to that, if the device
+is going to be put into a low-power state (D1-D3) and is supposed to generate
+wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
+3.0) control method defined for it has to be executed before _PSx. Power
+resources that are not required by the device in the target power state and are
+not required any more by any other device should be disabled (by executing their
+_OFF control methods). If the current power state of the device is D3, it can
+only be put into D0 this way.
+
+However, quite often the power states of devices are changed during a
+system-wide transition into a sleep state or back into the working state. ACPI
+defines four system sleep states, S1, S2, S3, and S4, and denotes the system
+working state as S0. In general, the target system sleep (or working) state
+determines the highest power (lowest number) state the device can be put
+into and the kernel is supposed to obtain this information by executing the
+device's _SxD control method (where x is a number between 0 and 4 inclusive).
+If the device is required to wake up the system from the target sleep state, the
+lowest power (highest number) state it can be put into is also determined by the
+target state of the system. The kernel is then supposed to use the device's
+_SxW control method to obtain the number of that state. It also is supposed to
+use the device's _PRW control method to learn which power resources need to be
+enabled for the device to be able to generate wakeup signals.
+
+1.4. Wakeup Signaling
+---------------------
+
+Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
+a result of the execution of the _DSW (or _PSW) ACPI control method before
+putting the device into a low-power state, have to be caught and handled as
+appropriate. If they are sent while the system is in the working state
+(ACPI S0), they should be translated into interrupts so that the kernel can
+put the devices generating them into the full-power state and take care of the
+events that triggered them. In turn, if they are sent while the system is
+sleeping, they should cause the system's core logic to trigger wakeup.
+
+On ACPI-based systems wakeup signals sent by conventional PCI devices are
+converted into ACPI General-Purpose Events (GPEs) which are hardware signals
+from the system core logic generated in response to various events that need to
+be acted upon. Every GPE is associated with one or more sources of potentially
+interesting events. In particular, a GPE may be associated with a PCI device
+capable of signaling wakeup. The information on the connections between GPEs
+and event sources is recorded in the system's ACPI BIOS from where it can be
+read by the kernel.
+
+If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
+associated with it (if there is one) is triggered. The GPEs associated with PCI
+bridges may also be triggered in response to a wakeup signal from one of the
+devices below the bridge (this also is the case for root bridges) and, for
+example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
+handled this way.
+
+A GPE may be triggered when the system is sleeping (i.e. when it is in one of
+the ACPI S1-S4 states), in which case system wakeup is started by its core logic
+(the device that was the source of the signal causing the system wakeup to occur
+may be identified later). The GPEs used in such situations are referred to as
+wakeup GPEs.
+
+Usually, however, GPEs are also triggered when the system is in the working
+state (ACPI S0) and in that case the system's core logic generates a System
+Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI
+handler identifies the GPE that caused the interrupt to be generated which,
+in turn, allows the kernel to identify the source of the event (that may be
+a PCI device signaling wakeup). The GPEs used for notifying the kernel of
+events occurring while the system is in the working state are referred to as
+runtime GPEs.
+
+Unfortunately, there is no standard way of handling wakeup signals sent by
+conventional PCI devices on systems that are not ACPI-based, but there is one
+for PCI Express devices. Namely, the PCI Express Base Specification introduced
+a native mechanism for converting native PCI PMEs into interrupts generated by
+root ports. For conventional PCI devices native PMEs are out-of-band, so they
+are routed separately and they need not pass through bridges (in principle they
+may be routed directly to the system's core logic), but for PCI Express devices
+they are in-band messages that have to pass through the PCI Express hierarchy,
+including the root port on the path from the device to the Root Complex. Thus
+it was possible to introduce a mechanism by which a root port generates an
+interrupt whenever it receives a PME message from one of the devices below it.
+The PCI Express Requester ID of the device that sent the PME message is then
+recorded in one of the root port's configuration registers from where it may be
+read by the interrupt handler allowing the device to be identified. [PME
+messages sent by PCI Express endpoints integrated with the Root Complex don't
+pass through root ports, but instead they cause a Root Complex Event Collector
+(if there is one) to generate interrupts.]
+
+In principle the native PCI Express PME signaling may also be used on ACPI-based
+systems along with the GPEs, but to use it the kernel has to ask the system's
+ACPI BIOS to release control of root port configuration registers. The ACPI
+BIOS, however, is not required to allow the kernel to control these registers
+and if it doesn't do that, the kernel must not modify their contents. Of course
+the native PCI Express PME signaling cannot be used by the kernel in that case.
+
+
+2. PCI Subsystem and Device Power Management
+============================================
+
+2.1. Device Power Management Callbacks
+--------------------------------------
+
+The PCI Subsystem participates in the power management of PCI devices in a
+number of ways. First of all, it provides an intermediate code layer between
+the device power management core (PM core) and PCI device drivers.
+Specifically, the pm field of the PCI subsystem's struct bus_type object,
+pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
+pointers to several device power management callbacks::
+
+ const struct dev_pm_ops pci_dev_pm_ops = {
+ .prepare = pci_pm_prepare,
+ .complete = pci_pm_complete,
+ .suspend = pci_pm_suspend,
+ .resume = pci_pm_resume,
+ .freeze = pci_pm_freeze,
+ .thaw = pci_pm_thaw,
+ .poweroff = pci_pm_poweroff,
+ .restore = pci_pm_restore,
+ .suspend_noirq = pci_pm_suspend_noirq,
+ .resume_noirq = pci_pm_resume_noirq,
+ .freeze_noirq = pci_pm_freeze_noirq,
+ .thaw_noirq = pci_pm_thaw_noirq,
+ .poweroff_noirq = pci_pm_poweroff_noirq,
+ .restore_noirq = pci_pm_restore_noirq,
+ .runtime_suspend = pci_pm_runtime_suspend,
+ .runtime_resume = pci_pm_runtime_resume,
+ .runtime_idle = pci_pm_runtime_idle,
+ };
+
+These callbacks are executed by the PM core in various situations related to
+device power management and they, in turn, execute power management callbacks
+provided by PCI device drivers. They also perform power management operations
+involving some standard configuration registers of PCI devices that device
+drivers need not know or care about.
+
+The structure representing a PCI device, struct pci_dev, contains several fields
+that these callbacks operate on::
+
+ struct pci_dev {
+ ...
+ pci_power_t current_state; /* Current operating state. */
+ int pm_cap; /* PM capability offset in the
+ configuration space */
+ unsigned int pme_support:5; /* Bitmask of states from which PME#
+ can be generated */
+ unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */
+ unsigned int d1_support:1; /* Low power state D1 is supported */
+ unsigned int d2_support:1; /* Low power state D2 is supported */
+ unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
+ unsigned int wakeup_prepared:1; /* Device prepared for wake up */
+ unsigned int d3_delay; /* D3->D0 transition time in ms */
+ ...
+ };
+
+They also indirectly use some fields of the struct device that is embedded in
+struct pci_dev.
+
+2.2. Device Initialization
+--------------------------
+
+The PCI subsystem's first task related to device power management is to
+prepare the device for power management and initialize the fields of struct
+pci_dev used for this purpose. This happens in two functions defined in
+drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
+
+The first of these functions checks if the device supports native PCI PM
+and if that's the case the offset of its power management capability structure
+in the configuration space is stored in the pm_cap field of the device's struct
+pci_dev object. Next, the function checks which PCI low-power states are
+supported by the device and from which low-power states the device can generate
+native PCI PMEs. The power management fields of the device's struct pci_dev and
+the struct device embedded in it are updated accordingly and the generation of
+PMEs by the device is disabled.
+
+The second function checks if the device can be prepared to signal wakeup with
+the help of the platform firmware, such as the ACPI BIOS. If that is the case,
+the function updates the wakeup fields in struct device embedded in the
+device's struct pci_dev and uses the firmware-provided method to prevent the
+device from signaling wakeup.
+
+At this point the device is ready for power management. For driverless devices,
+however, this functionality is limited to a few basic operations carried out
+during system-wide transitions to a sleep state and back to the working state.
+
+2.3. Runtime Device Power Management
+------------------------------------
+
+The PCI subsystem plays a vital role in the runtime power management of PCI
+devices. For this purpose it uses the general runtime power management
+(runtime PM) framework described in Documentation/power/runtime_pm.rst.
+Namely, it provides subsystem-level callbacks::
+
+ pci_pm_runtime_suspend()
+ pci_pm_runtime_resume()
+ pci_pm_runtime_idle()
+
+that are executed by the core runtime PM routines. It also implements the
+entire mechanics necessary for handling runtime wakeup signals from PCI devices
+in low-power states, which at the time of this writing works for both the native
+PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
+Section 1.
+
+First, a PCI device is put into a low-power state, or suspended, with the help
+of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
+pci_pm_runtime_suspend() to do the actual job. For this to work, the device's
+driver has to provide a pm->runtime_suspend() callback (see below), which is
+run by pci_pm_runtime_suspend() as the first action. If the driver's callback
+returns successfully, the device's standard configuration registers are saved,
+the device is prepared to generate wakeup signals and, finally, it is put into
+the target low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup. The exact method of signaling wakeup is
+system-dependent and is determined by the PCI subsystem on the basis of the
+reported capabilities of the device and the platform firmware. To prepare the
+device for signaling wakeup and put it into the selected low-power state, the
+PCI subsystem can use the platform firmware as well as the device's native PCI
+PM capabilities, if supported.
+
+It is expected that the device driver's pm->runtime_suspend() callback will
+not attempt to prepare the device for signaling wakeup or to put it into a
+low-power state. The driver ought to leave these tasks to the PCI subsystem
+that has all of the information necessary to perform them.
+
+A suspended device is brought back into the "active" state, or resumed,
+with the help of pm_request_resume() or pm_runtime_resume() which both call
+pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's
+driver provides a pm->runtime_resume() callback (see below). However, before
+the driver's callback is executed, pci_pm_runtime_resume() brings the device
+back into the full-power state, prevents it from signaling wakeup while in that
+state and restores its standard configuration registers. Thus the driver's
+callback need not worry about the PCI-specific aspects of the device resume.
+
+Note that generally pci_pm_runtime_resume() may be called in two different
+situations. First, it may be called at the request of the device's driver, for
+example if there are some data for it to process. Second, it may be called
+as a result of a wakeup signal from the device itself (this sometimes is
+referred to as "remote wakeup"). Of course, for this purpose the wakeup signal
+is handled in one of the ways described in Section 1 and finally converted into
+a notification for the PCI subsystem after the source device has been
+identified.
+
+The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
+and pm_request_idle(), executes the device driver's pm->runtime_idle()
+callback, if defined, and if that callback doesn't return error code (or is not
+present at all), suspends the device with the help of pm_runtime_suspend().
+Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
+example, it is called right after the device has just been resumed), in which
+cases it is expected to suspend the device if that makes sense. Usually,
+however, the PCI subsystem doesn't really know if the device really can be
+suspended, so it lets the device's driver decide by running its
+pm->runtime_idle() callback.
+
+2.4. System-Wide Power Transitions
+----------------------------------
+There are a few different types of system-wide power transitions, described in
+Documentation/driver-api/pm/devices.rst. Each of them requires devices to be handled
+in a specific way and the PM core executes subsystem-level power management
+callbacks for this purpose. They are executed in phases such that each phase
+involves executing the same subsystem-level callback for every device belonging
+to the given subsystem before the next phase begins. These phases always run
+after tasks have been frozen.
+
+2.4.1. System Suspend
+^^^^^^^^^^^^^^^^^^^^^
+
+When the system is going into a sleep state in which the contents of memory will
+be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
+
+ prepare, suspend, suspend_noirq.
+
+The following PCI bus type's callbacks, respectively, are used in these phases::
+
+ pci_pm_prepare()
+ pci_pm_suspend()
+ pci_pm_suspend_noirq()
+
+The pci_pm_prepare() routine first puts the device into the "fully functional"
+state with the help of pm_runtime_resume(). Then, it executes the device
+driver's pm->prepare() callback if defined (i.e. if the driver's struct
+dev_pm_ops object is present and the prepare pointer in that object is valid).
+
+The pci_pm_suspend() routine first checks if the device's driver implements
+legacy PCI suspend routines (see Section 3), in which case the driver's legacy
+suspend callback is executed, if present, and its result is returned. Next, if
+the device's driver doesn't provide a struct dev_pm_ops object (containing
+pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
+simply turns off the device's bus master capability and runs
+pcibios_disable_device() to disable it, unless the device is a bridge (PCI
+bridges are ignored by this routine). Next, the device driver's pm->suspend()
+callback is executed, if defined, and its result is returned if it fails.
+Finally, pci_fixup_device() is called to apply hardware suspend quirks related
+to the device if necessary.
+
+Note that the suspend phase is carried out asynchronously for PCI devices, so
+the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
+devices that don't depend on each other in a known way (i.e. none of the paths
+in the device tree from the root bridge to a leaf device contains both of them).
+
+The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
+been called, which means that the device driver's interrupt handler won't be
+invoked while this routine is running. It first checks if the device's driver
+implements legacy PCI suspends routines (Section 3), in which case the legacy
+late suspend routine is called and its result is returned (the standard
+configuration registers of the device are saved if the driver's callback hasn't
+done that). Second, if the device driver's struct dev_pm_ops object is not
+present, the device's standard configuration registers are saved and the routine
+returns success. Otherwise the device driver's pm->suspend_noirq() callback is
+executed, if present, and its result is returned if it fails. Next, if the
+device's standard configuration registers haven't been saved yet (one of the
+device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
+saves them, prepares the device to signal wakeup (if necessary) and puts it into
+a low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup while the system is in the target sleep
+state. Just like in the runtime PM case described above, the mechanism of
+signaling wakeup is system-dependent and determined by the PCI subsystem, which
+is also responsible for preparing the device to signal wakeup from the system's
+target sleep state as appropriate.
+
+PCI device drivers (that don't implement legacy power management callbacks) are
+generally not expected to prepare devices for signaling wakeup or to put them
+into low-power states. However, if one of the driver's suspend callbacks
+(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
+registers, pci_pm_suspend_noirq() will assume that the device has been prepared
+to signal wakeup and put into a low-power state by the driver (the driver is
+then assumed to have used the helper functions provided by the PCI subsystem for
+this purpose). PCI device drivers are not encouraged to do that, but in some
+rare cases doing that in the driver may be the optimum approach.
+
+2.4.2. System Resume
+^^^^^^^^^^^^^^^^^^^^
+
+When the system is undergoing a transition from a sleep state in which the
+contents of memory have been preserved, such as one of the ACPI sleep states
+S1-S3, into the working state (ACPI S0), the phases are:
+
+ resume_noirq, resume, complete.
+
+The following PCI bus type's callbacks, respectively, are executed in these
+phases::
+
+ pci_pm_resume_noirq()
+ pci_pm_resume()
+ pci_pm_complete()
+
+The pci_pm_resume_noirq() routine first puts the device into the full-power
+state, restores its standard configuration registers and applies early resume
+hardware quirks related to the device, if necessary. This is done
+unconditionally, regardless of whether or not the device's driver implements
+legacy PCI power management callbacks (this way all PCI devices are in the
+full-power state and their standard configuration registers have been restored
+when their interrupt handlers are invoked for the first time during resume,
+which allows the kernel to avoid problems with the handling of shared interrupts
+by drivers whose devices are still suspended). If legacy PCI power management
+callbacks (see Section 3) are implemented by the device's driver, the legacy
+early resume callback is executed and its result is returned. Otherwise, the
+device driver's pm->resume_noirq() callback is executed, if defined, and its
+result is returned.
+
+The pci_pm_resume() routine first checks if the device's standard configuration
+registers have been restored and restores them if that's not the case (this
+only is necessary in the error path during a failing suspend). Next, resume
+hardware quirks related to the device are applied, if necessary, and if the
+device's driver implements legacy PCI power management callbacks (see
+Section 3), the driver's legacy resume callback is executed and its result is
+returned. Otherwise, the device's wakeup signaling mechanisms are blocked and
+its driver's pm->resume() callback is executed, if defined (the callback's
+result is then returned).
+
+The resume phase is carried out asynchronously for PCI devices, like the
+suspend phase described above, which means that if two PCI devices don't depend
+on each other in a known way, the pci_pm_resume() routine may be executed for
+the both of them in parallel.
+
+The pci_pm_complete() routine only executes the device driver's pm->complete()
+callback, if defined.
+
+2.4.3. System Hibernation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+System hibernation is more complicated than system suspend, because it requires
+a system image to be created and written into a persistent storage medium. The
+image is created atomically and all devices are quiesced, or frozen, before that
+happens.
+
+The freezing of devices is carried out after enough memory has been freed (at
+the time of this writing the image creation requires at least 50% of system RAM
+to be free) in the following three phases:
+
+ prepare, freeze, freeze_noirq
+
+that correspond to the PCI bus type's callbacks::
+
+ pci_pm_prepare()
+ pci_pm_freeze()
+ pci_pm_freeze_noirq()
+
+This means that the prepare phase is exactly the same as for system suspend.
+The other two phases, however, are different.
+
+The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
+the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
+and it doesn't apply the suspend-related hardware quirks. It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The pci_pm_freeze_noirq() routine, in turn, is similar to
+pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
+routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the
+device for signaling wakeup and put it into a low-power state. Still, it saves
+the device's standard configuration registers if they haven't been saved by one
+of the driver's callbacks.
+
+Once the image has been created, it has to be saved. However, at this point all
+devices are frozen and they cannot handle I/O, while their ability to handle
+I/O is obviously necessary for the image saving. Thus they have to be brought
+back to the fully functional state and this is done in the following phases:
+
+ thaw_noirq, thaw, complete
+
+using the following PCI bus type's callbacks::
+
+ pci_pm_thaw_noirq()
+ pci_pm_thaw()
+ pci_pm_complete()
+
+respectively.
+
+The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
+but it doesn't put the device into the full power state and doesn't attempt to
+restore its standard configuration registers. It also executes the device
+driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
+
+The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
+driver's pm->thaw() callback instead of pm->resume(). It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The complete phase it the same as for system resume.
+
+After saving the image, devices need to be powered down before the system can
+enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in
+three phases:
+
+ prepare, poweroff, poweroff_noirq
+
+where the prepare phase is exactly the same as for system suspend. The other
+two phases are analogous to the suspend and suspend_noirq phases, respectively.
+The PCI subsystem-level callbacks they correspond to::
+
+ pci_pm_poweroff()
+ pci_pm_poweroff_noirq()
+
+work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
+although they don't attempt to save the device's standard configuration
+registers.
+
+2.4.4. System Restore
+^^^^^^^^^^^^^^^^^^^^^
+
+System restore requires a hibernation image to be loaded into memory and the
+pre-hibernation memory contents to be restored before the pre-hibernation system
+activity can be resumed.
+
+As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
+into memory by a fresh instance of the kernel, called the boot kernel, which in
+turn is loaded and run by a boot loader in the usual way. After the boot kernel
+has loaded the image, it needs to replace its own code and data with the code
+and data of the "hibernated" kernel stored within the image, called the image
+kernel. For this purpose all devices are frozen just like before creating
+the image during hibernation, in the
+
+ prepare, freeze, freeze_noirq
+
+phases described above. However, the devices affected by these phases are only
+those having drivers in the boot kernel; other devices will still be in whatever
+state the boot loader left them.
+
+Should the restoration of the pre-hibernation memory contents fail, the boot
+kernel would go through the "thawing" procedure described above, using the
+thaw_noirq, thaw, and complete phases (that will only affect the devices having
+drivers in the boot kernel), and then continue running normally.
+
+If the pre-hibernation memory contents are restored successfully, which is the
+usual situation, control is passed to the image kernel, which then becomes
+responsible for bringing the system back to the working state. To achieve this,
+it must restore the devices' pre-hibernation functionality, which is done much
+like waking up from the memory sleep state, although it involves different
+phases:
+
+ restore_noirq, restore, complete
+
+The first two of these are analogous to the resume_noirq and resume phases
+described above, respectively, and correspond to the following PCI subsystem
+callbacks::
+
+ pci_pm_restore_noirq()
+ pci_pm_restore()
+
+These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
+respectively, but they execute the device driver's pm->restore_noirq() and
+pm->restore() callbacks, if available.
+
+The complete phase is carried out in exactly the same way as during system
+resume.
+
+
+3. PCI Device Drivers and Power Management
+==========================================
+
+3.1. Power Management Callbacks
+-------------------------------
+
+PCI device drivers participate in power management by providing callbacks to be
+executed by the PCI subsystem's power management routines described above and by
+controlling the runtime power management of their devices.
+
+At the time of this writing there are two ways to define power management
+callbacks for a PCI device driver, the recommended one, based on using a
+dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
+"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
+.resume() callbacks from struct pci_driver are used. The legacy approach,
+however, doesn't allow one to define runtime power management callbacks and is
+not really suitable for any new drivers. Therefore it is not covered by this
+document (refer to the source code to learn more about it).
+
+It is recommended that all PCI device drivers define a struct dev_pm_ops object
+containing pointers to power management (PM) callbacks that will be executed by
+the PCI subsystem's PM routines in various circumstances. A pointer to the
+driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
+its struct pci_driver object. Once that has happened, the "legacy" PM callbacks
+in struct pci_driver are ignored (even if they are not NULL).
+
+The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
+defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
+subsystem will handle the device in a simplified default manner. If they are
+defined, though, they are expected to behave as described in the following
+subsections.
+
+3.1.1. prepare()
+^^^^^^^^^^^^^^^^
+
+The prepare() callback is executed during system suspend, during hibernation
+(when a hibernation image is about to be created), during power-off after
+saving a hibernation image and during system restore, when a hibernation image
+has just been loaded into memory.
+
+This callback is only necessary if the driver's device has children that in
+general may be registered at any time. In that case the role of the prepare()
+callback is to prevent new children of the device from being registered until
+one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
+
+In addition to that the prepare() callback may carry out some operations
+preparing the device to be suspended, although it should not allocate memory
+(if additional memory is required to suspend the device, it has to be
+preallocated earlier, for example in a suspend/hibernate notifier as described
+in Documentation/driver-api/pm/notifiers.rst).
+
+3.1.2. suspend()
+^^^^^^^^^^^^^^^^
+
+The suspend() callback is only executed during system suspend, after prepare()
+callbacks have been executed for all devices in the system.
+
+This callback is expected to quiesce the device and prepare it to be put into a
+low-power state by the PCI subsystem. It is not required (in fact it even is
+not recommended) that a PCI driver's suspend() callback save the standard
+configuration registers of the device, prepare it for waking up the system, or
+put it into a low-power state. All of these operations can very well be taken
+care of by the PCI subsystem, without the driver's participation.
+
+However, in some rare case it is convenient to carry out these operations in
+a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and
+pci_set_power_state() should be used to save the device's standard configuration
+registers, to prepare it for system wakeup (if necessary), and to put it into a
+low-power state, respectively. Moreover, if the driver calls pci_save_state(),
+the PCI subsystem will not execute either pci_prepare_to_sleep(), or
+pci_set_power_state() for its device, so the driver is then responsible for
+handling the device as appropriate.
+
+While the suspend() callback is being executed, the driver's interrupt handler
+can be invoked to handle an interrupt from the device, so all suspend-related
+operations relying on the driver's ability to handle interrupts should be
+carried out in this callback.
+
+3.1.3. suspend_noirq()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The suspend_noirq() callback is only executed during system suspend, after
+suspend() callbacks have been executed for all devices in the system and
+after device interrupts have been disabled by the PM core.
+
+The difference between suspend_noirq() and suspend() is that the driver's
+interrupt handler will not be invoked while suspend_noirq() is running. Thus
+suspend_noirq() can carry out operations that would cause race conditions to
+arise if they were performed in suspend().
+
+3.1.4. freeze()
+^^^^^^^^^^^^^^^
+
+The freeze() callback is hibernation-specific and is executed in two situations,
+during hibernation, after prepare() callbacks have been executed for all devices
+in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory from persistent storage and the
+prepare() callbacks have been executed for all devices.
+
+The role of this callback is analogous to the role of the suspend() callback
+described above. In fact, they only need to be different in the rare cases when
+the driver takes the responsibility for putting the device into a low-power
+state.
+
+In that cases the freeze() callback should not prepare the device system wakeup
+or put it into a low-power state. Still, either it or freeze_noirq() should
+save the device's standard configuration registers using pci_save_state().
+
+3.1.5. freeze_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The freeze_noirq() callback is hibernation-specific. It is executed during
+hibernation, after prepare() and freeze() callbacks have been executed for all
+devices in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory and after prepare() and
+freeze() callbacks have been executed for all devices. It is always executed
+after device interrupts have been disabled by the PM core.
+
+The role of this callback is analogous to the role of the suspend_noirq()
+callback described above and it very rarely is necessary to define
+freeze_noirq().
+
+The difference between freeze_noirq() and freeze() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.6. poweroff()
+^^^^^^^^^^^^^^^^^
+
+The poweroff() callback is hibernation-specific. It is executed when the system
+is about to be powered off after saving a hibernation image to a persistent
+storage. prepare() callbacks are executed for all devices before poweroff() is
+called.
+
+The role of this callback is analogous to the role of the suspend() and freeze()
+callbacks described above, although it does not need to save the contents of
+the device's registers. In particular, if the driver wants to put the device
+into a low-power state itself instead of allowing the PCI subsystem to do that,
+the poweroff() callback should use pci_prepare_to_sleep() and
+pci_set_power_state() to prepare the device for system wakeup and to put it
+into a low-power state, respectively, but it need not save the device's standard
+configuration registers.
+
+3.1.7. poweroff_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The poweroff_noirq() callback is hibernation-specific. It is executed after
+poweroff() callbacks have been executed for all devices in the system.
+
+The role of this callback is analogous to the role of the suspend_noirq() and
+freeze_noirq() callbacks described above, but it does not need to save the
+contents of the device's registers.
+
+The difference between poweroff_noirq() and poweroff() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.8. resume_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The resume_noirq() callback is only executed during system resume, after the
+PM core has enabled the non-boot CPUs. The driver's interrupt handler will not
+be invoked while resume_noirq() is running, so this callback can carry out
+operations that might race with the interrupt handler.
+
+Since the PCI subsystem unconditionally puts all devices into the full power
+state in the resume_noirq phase of system resume and restores their standard
+configuration registers, resume_noirq() is usually not necessary. In general
+it should only be used for performing operations that would lead to race
+conditions if carried out by resume().
+
+3.1.9. resume()
+^^^^^^^^^^^^^^^
+
+The resume() callback is only executed during system resume, after
+resume_noirq() callbacks have been executed for all devices in the system and
+device interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-suspend configuration of the
+device and bringing it back to the fully functional state. The device should be
+able to process I/O in a usual way after resume() has returned.
+
+3.1.10. thaw_noirq()
+^^^^^^^^^^^^^^^^^^^^
+
+The thaw_noirq() callback is hibernation-specific. It is executed after a
+system image has been created and the non-boot CPUs have been enabled by the PM
+core, in the thaw_noirq phase of hibernation. It also may be executed if the
+loading of a hibernation image fails during system restore (it is then executed
+after enabling the non-boot CPUs). The driver's interrupt handler will not be
+invoked while thaw_noirq() is running.
+
+The role of this callback is analogous to the role of resume_noirq(). The
+difference between these two callbacks is that thaw_noirq() is executed after
+freeze() and freeze_noirq(), so in general it does not need to modify the
+contents of the device's registers.
+
+3.1.11. thaw()
+^^^^^^^^^^^^^^
+
+The thaw() callback is hibernation-specific. It is executed after thaw_noirq()
+callbacks have been executed for all devices in the system and after device
+interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-freeze configuration of
+the device, so that it will work in a usual way after thaw() has returned.
+
+3.1.12. restore_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The restore_noirq() callback is hibernation-specific. It is executed in the
+restore_noirq phase of hibernation, when the boot kernel has passed control to
+the image kernel and the non-boot CPUs have been enabled by the image kernel's
+PM core.
+
+This callback is analogous to resume_noirq() with the exception that it cannot
+make any assumption on the previous state of the device, even if the BIOS (or
+generally the platform firmware) is known to preserve that state over a
+suspend-resume cycle.
+
+For the vast majority of PCI device drivers there is no difference between
+resume_noirq() and restore_noirq().
+
+3.1.13. restore()
+^^^^^^^^^^^^^^^^^
+
+The restore() callback is hibernation-specific. It is executed after
+restore_noirq() callbacks have been executed for all devices in the system and
+after the PM core has enabled device drivers' interrupt handlers to be invoked.
+
+This callback is analogous to resume(), just like restore_noirq() is analogous
+to resume_noirq(). Consequently, the difference between restore_noirq() and
+restore() is analogous to the difference between resume_noirq() and resume().
+
+For the vast majority of PCI device drivers there is no difference between
+resume() and restore().
+
+3.1.14. complete()
+^^^^^^^^^^^^^^^^^^
+
+The complete() callback is executed in the following situations:
+
+ - during system resume, after resume() callbacks have been executed for all
+ devices,
+ - during hibernation, before saving the system image, after thaw() callbacks
+ have been executed for all devices,
+ - during system restore, when the system is going back to its pre-hibernation
+ state, after restore() callbacks have been executed for all devices.
+
+It also may be executed if the loading of a hibernation image into memory fails
+(in that case it is run after thaw() callbacks have been executed for all
+devices that have drivers in the boot kernel).
+
+This callback is entirely optional, although it may be necessary if the
+prepare() callback performs operations that need to be reversed.
+
+3.1.15. runtime_suspend()
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_suspend() callback is specific to device runtime power management
+(runtime PM). It is executed by the PM core's runtime PM framework when the
+device is about to be suspended (i.e. quiesced and put into a low-power state)
+at run time.
+
+This callback is responsible for freezing the device and preparing it to be
+put into a low-power state, but it must allow the PCI subsystem to perform all
+of the PCI-specific actions necessary for suspending the device.
+
+3.1.16. runtime_resume()
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_resume() callback is specific to device runtime PM. It is executed
+by the PM core's runtime PM framework when the device is about to be resumed
+(i.e. put into the full-power state and programmed to process I/O normally) at
+run time.
+
+This callback is responsible for restoring the normal functionality of the
+device after it has been put into the full-power state by the PCI subsystem.
+The device is expected to be able to process I/O in the usual way after
+runtime_resume() has returned.
+
+3.1.17. runtime_idle()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The runtime_idle() callback is specific to device runtime PM. It is executed
+by the PM core's runtime PM framework whenever it may be desirable to suspend
+the device according to the PM core's information. In particular, it is
+automatically executed right after runtime_resume() has returned in case the
+resume of the device has happened as a result of a spurious event.
+
+This callback is optional, but if it is not implemented or if it returns 0, the
+PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
+cause the driver's runtime_suspend() callback to be executed.
+
+3.1.18. Pointing Multiple Callback Pointers to One Routine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although in principle each of the callbacks described in the previous
+subsections can be defined as a separate function, it often is convenient to
+point two or more members of struct dev_pm_ops to the same routine. There are
+a few convenience macros that can be used for this purpose.
+
+The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
+suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
+members and one resume routine pointed to by the .resume(), .thaw(), and
+.restore() members. The other function pointers in this struct dev_pm_ops are
+unset.
+
+The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
+additionally sets the .runtime_resume() pointer to the same value as
+.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
+the same value as .suspend() (and .freeze() and .poweroff()).
+
+The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
+dev_pm_ops to indicate that one suspend routine is to be pointed to by the
+.suspend(), .freeze(), and .poweroff() members and one resume routine is to
+be pointed to by the .resume(), .thaw(), and .restore() members.
+
+3.1.19. Driver Flags for Power Management
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PM core allows device drivers to set flags that influence the handling of
+power management for the devices by the core itself and by middle layer code
+including the PCI bus type. The flags should be set once at the driver probe
+time with the help of the dev_pm_set_driver_flags() function and they should not
+be updated directly afterwards.
+
+The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
+mechanism allowing device suspend/resume callbacks to be skipped if the device
+is in runtime suspend when the system suspend starts. That also affects all of
+the ancestors of the device, so this flag should only be used if absolutely
+necessary.
+
+The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
+positive value from pci_pm_prepare() if the ->prepare callback provided by the
+driver of the device returns a positive value. That allows the driver to opt
+out from using the direct-complete mechanism dynamically.
+
+The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
+perspective the device can be safely left in runtime suspend during system
+suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
+to skip resuming the device from runtime suspend unless there are PCI-specific
+reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(),
+pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
+if the device remains in runtime suspend in the beginning of the "late" phase
+of the system-wide transition under way. Moreover, if the device is in
+runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
+power management status will be changed to "active" (as it is going to be put
+into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
+the function will set the power.direct_complete flag for it (to make the PM core
+skip the subsequent "thaw" callbacks for it) and return.
+
+Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
+device to be left in suspend after system-wide transitions to the working state.
+This flag is checked by the PM core, but the PCI bus type informs the PM core
+which devices may be left in suspend from its perspective (that happens during
+the "noirq" phase of system-wide suspend and analogous transitions) and next it
+uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
+pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
+callbacks for the device during the transition under way and will set its
+runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
+it.
+
+3.2. Device Runtime Power Management
+------------------------------------
+
+In addition to providing device power management callbacks PCI device drivers
+are responsible for controlling the runtime power management (runtime PM) of
+their devices.
+
+The PCI device runtime PM is optional, but it is recommended that PCI device
+drivers implement it at least in the cases where there is a reliable way of
+verifying that the device is not used (like when the network cable is detached
+from an Ethernet adapter or there are no devices attached to a USB controller).
+
+To support the PCI runtime PM the driver first needs to implement the
+runtime_suspend() and runtime_resume() callbacks. It also may need to implement
+the runtime_idle() callback to prevent the device from being suspended again
+every time right after the runtime_resume() callback has returned
+(alternatively, the runtime_suspend() callback will have to check if the
+device should really be suspended and return -EAGAIN if that is not the case).
+
+The runtime PM of PCI devices is enabled by default by the PCI core. PCI
+device drivers do not need to enable it and should not attempt to do so.
+However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
+helper function. In addition to that, the runtime PM usage counter of
+each PCI device is incremented by local_pci_probe() before executing the
+probe callback provided by the device's driver.
+
+If a PCI driver implements the runtime PM callbacks and intends to use the
+runtime PM framework provided by the PM core and the PCI subsystem, it needs
+to decrement the device's runtime PM usage counter in its probe callback
+function. If it doesn't do that, the counter will always be different from
+zero for the device and it will never be runtime-suspended. The simplest
+way to do that is by calling pm_runtime_put_noidle(), but if the driver
+wants to schedule an autosuspend right away, for example, it may call
+pm_runtime_put_autosuspend() instead for this purpose. Generally, it
+just needs to call a function that decrements the devices usage counter
+from its probe routine to make runtime PM work for the device.
+
+It is important to remember that the driver's runtime_suspend() callback
+may be executed right after the usage counter has been decremented, because
+user space may already have caused the pm_runtime_allow() helper function
+unblocking the runtime PM of the device to run via sysfs, so the driver must
+be prepared to cope with that.
+
+The driver itself should not call pm_runtime_allow(), though. Instead, it
+should let user space or some platform-specific code do that (user space can
+do it via sysfs as stated above), but it must be prepared to handle the
+runtime PM of the device correctly as soon as pm_runtime_allow() is called
+(which may happen at any time, even before the driver is loaded).
+
+When the driver's remove callback runs, it has to balance the decrementation
+of the device's runtime PM usage counter at the probe time. For this reason,
+if it has decremented the counter in its probe callback, it must run
+pm_runtime_get_noresume() in its remove callback. [Since the core carries
+out a runtime resume of the device and bumps up the device's usage counter
+before running the driver's remove callback, the runtime PM of the device
+is effectively disabled for the duration of the remove execution and all
+runtime PM helper functions incrementing the device's usage counter are
+then effectively equivalent to pm_runtime_get_noresume().]
+
+The runtime PM framework works by processing requests to suspend or resume
+devices, or to check if they are idle (in which cases it is reasonable to
+subsequently request that they be suspended). These requests are represented
+by work items put into the power management workqueue, pm_wq. Although there
+are a few situations in which power management requests are automatically
+queued by the PM core (for example, after processing a request to resume a
+device the PM core automatically queues a request to check if the device is
+idle), device drivers are generally responsible for queuing power management
+requests for their devices. For this purpose they should use the runtime PM
+helper functions provided by the PM core, discussed in
+Documentation/power/runtime_pm.rst.
+
+Devices can also be suspended and resumed synchronously, without placing a
+request into pm_wq. In the majority of cases this also is done by their
+drivers that use helper functions provided by the PM core for this purpose.
+
+For more information on the runtime PM of devices refer to
+Documentation/power/runtime_pm.rst.
+
+
+4. Resources
+============
+
+PCI Local Bus Specification, Rev. 3.0
+
+PCI Bus Power Management Interface Specification, Rev. 1.2
+
+Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
+
+PCI Express Base Specification, Rev. 2.0
+
+Documentation/driver-api/pm/devices.rst
+
+Documentation/power/runtime_pm.rst
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
deleted file mode 100644
index 8eaf9ee24d43..000000000000
--- a/Documentation/power/pci.txt
+++ /dev/null
@@ -1,1094 +0,0 @@
-PCI Power Management
-
-Copyright (c) 2010 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-
-An overview of concepts and the Linux kernel's interfaces related to PCI power
-management. Based on previous work by Patrick Mochel <mochel@transmeta.com>
-(and others).
-
-This document only covers the aspects of power management specific to PCI
-devices. For general description of the kernel's interfaces related to device
-power management refer to Documentation/driver-api/pm/devices.rst and
-Documentation/power/runtime_pm.txt.
-
----------------------------------------------------------------------------
-
-1. Hardware and Platform Support for PCI Power Management
-2. PCI Subsystem and Device Power Management
-3. PCI Device Drivers and Power Management
-4. Resources
-
-
-1. Hardware and Platform Support for PCI Power Management
-=========================================================
-
-1.1. Native and Platform-Based Power Management
------------------------------------------------
-In general, power management is a feature allowing one to save energy by putting
-devices into states in which they draw less power (low-power states) at the
-price of reduced functionality or performance.
-
-Usually, a device is put into a low-power state when it is underutilized or
-completely inactive. However, when it is necessary to use the device once
-again, it has to be put back into the "fully functional" state (full-power
-state). This may happen when there are some data for the device to handle or
-as a result of an external event requiring the device to be active, which may
-be signaled by the device itself.
-
-PCI devices may be put into low-power states in two ways, by using the device
-capabilities introduced by the PCI Bus Power Management Interface Specification,
-or with the help of platform firmware, such as an ACPI BIOS. In the first
-approach, that is referred to as the native PCI power management (native PCI PM)
-in what follows, the device power state is changed as a result of writing a
-specific value into one of its standard configuration registers. The second
-approach requires the platform firmware to provide special methods that may be
-used by the kernel to change the device's power state.
-
-Devices supporting the native PCI PM usually can generate wakeup signals called
-Power Management Events (PMEs) to let the kernel know about external events
-requiring the device to be active. After receiving a PME the kernel is supposed
-to put the device that sent it into the full-power state. However, the PCI Bus
-Power Management Interface Specification doesn't define any standard method of
-delivering the PME from the device to the CPU and the operating system kernel.
-It is assumed that the platform firmware will perform this task and therefore,
-even though a PCI device is set up to generate PMEs, it also may be necessary to
-prepare the platform firmware for notifying the CPU of the PMEs coming from the
-device (e.g. by generating interrupts).
-
-In turn, if the methods provided by the platform firmware are used for changing
-the power state of a device, usually the platform also provides a method for
-preparing the device to generate wakeup signals. In that case, however, it
-often also is necessary to prepare the device for generating PMEs using the
-native PCI PM mechanism, because the method provided by the platform depends on
-that.
-
-Thus in many situations both the native and the platform-based power management
-mechanisms have to be used simultaneously to obtain the desired result.
-
-1.2. Native PCI Power Management
---------------------------------
-The PCI Bus Power Management Interface Specification (PCI PM Spec) was
-introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a
-standard interface for performing various operations related to power
-management.
-
-The implementation of the PCI PM Spec is optional for conventional PCI devices,
-but it is mandatory for PCI Express devices. If a device supports the PCI PM
-Spec, it has an 8 byte power management capability field in its PCI
-configuration space. This field is used to describe and control the standard
-features related to the native PCI power management.
-
-The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses
-(B0-B3). The higher the number, the less power is drawn by the device or bus
-in that state. However, the higher the number, the longer the latency for
-the device or bus to return to the full-power state (D0 or B0, respectively).
-
-There are two variants of the D3 state defined by the specification. The first
-one is D3hot, referred to as the software accessible D3, because devices can be
-programmed to go into it. The second one, D3cold, is the state that PCI devices
-are in when the supply voltage (Vcc) is removed from them. It is not possible
-to program a PCI device to go into D3cold, although there may be a programmable
-interface for putting the bus the device is on into a state in which Vcc is
-removed from all devices on the bus.
-
-PCI bus power management, however, is not supported by the Linux kernel at the
-time of this writing and therefore it is not covered by this document.
-
-Note that every PCI device can be in the full-power state (D0) or in D3cold,
-regardless of whether or not it implements the PCI PM Spec. In addition to
-that, if the PCI PM Spec is implemented by the device, it must support D3hot
-as well as D0. The support for the D1 and D2 power states is optional.
-
-PCI devices supporting the PCI PM Spec can be programmed to go to any of the
-supported low-power states (except for D3cold). While in D1-D3hot the
-standard configuration registers of the device must be accessible to software
-(i.e. the device is required to respond to PCI configuration accesses), although
-its I/O and memory spaces are then disabled. This allows the device to be
-programmatically put into D0. Thus the kernel can switch the device back and
-forth between D0 and the supported low-power states (except for D3cold) and the
-possible power state transitions the device can undergo are the following:
-
-+----------------------------+
-| Current State | New State |
-+----------------------------+
-| D0 | D1, D2, D3 |
-+----------------------------+
-| D1 | D2, D3 |
-+----------------------------+
-| D2 | D3 |
-+----------------------------+
-| D1, D2, D3 | D0 |
-+----------------------------+
-
-The transition from D3cold to D0 occurs when the supply voltage is provided to
-the device (i.e. power is restored). In that case the device returns to D0 with
-a full power-on reset sequence and the power-on defaults are restored to the
-device by hardware just as at initial power up.
-
-PCI devices supporting the PCI PM Spec can be programmed to generate PMEs
-while in a low-power state (D1-D3), but they are not required to be capable
-of generating PMEs from all supported low-power states. In particular, the
-capability of generating PMEs from D3cold is optional and depends on the
-presence of additional voltage (3.3Vaux) allowing the device to remain
-sufficiently active to generate a wakeup signal.
-
-1.3. ACPI Device Power Management
----------------------------------
-The platform firmware support for the power management of PCI devices is
-system-specific. However, if the system in question is compliant with the
-Advanced Configuration and Power Interface (ACPI) Specification, like the
-majority of x86-based systems, it is supposed to implement device power
-management interfaces defined by the ACPI standard.
-
-For this purpose the ACPI BIOS provides special functions called "control
-methods" that may be executed by the kernel to perform specific tasks, such as
-putting a device into a low-power state. These control methods are encoded
-using special byte-code language called the ACPI Machine Language (AML) and
-stored in the machine's BIOS. The kernel loads them from the BIOS and executes
-them as needed using an AML interpreter that translates the AML byte code into
-computations and memory or I/O space accesses. This way, in theory, a BIOS
-writer can provide the kernel with a means to perform actions depending
-on the system design in a system-specific fashion.
-
-ACPI control methods may be divided into global control methods, that are not
-associated with any particular devices, and device control methods, that have
-to be defined separately for each device supposed to be handled with the help of
-the platform. This means, in particular, that ACPI device control methods can
-only be used to handle devices that the BIOS writer knew about in advance. The
-ACPI methods used for device power management fall into that category.
-
-The ACPI specification assumes that devices can be in one of four power states
-labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM
-D0-D3 states (although the difference between D3hot and D3cold is not taken
-into account by ACPI). Moreover, for each power state of a device there is a
-set of power resources that have to be enabled for the device to be put into
-that state. These power resources are controlled (i.e. enabled or disabled)
-with the help of their own control methods, _ON and _OFF, that have to be
-defined individually for each of them.
-
-To put a device into the ACPI power state Dx (where x is a number between 0 and
-3 inclusive) the kernel is supposed to (1) enable the power resources required
-by the device in this state using their _ON control methods and (2) execute the
-_PSx control method defined for the device. In addition to that, if the device
-is going to be put into a low-power state (D1-D3) and is supposed to generate
-wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI
-3.0) control method defined for it has to be executed before _PSx. Power
-resources that are not required by the device in the target power state and are
-not required any more by any other device should be disabled (by executing their
-_OFF control methods). If the current power state of the device is D3, it can
-only be put into D0 this way.
-
-However, quite often the power states of devices are changed during a
-system-wide transition into a sleep state or back into the working state. ACPI
-defines four system sleep states, S1, S2, S3, and S4, and denotes the system
-working state as S0. In general, the target system sleep (or working) state
-determines the highest power (lowest number) state the device can be put
-into and the kernel is supposed to obtain this information by executing the
-device's _SxD control method (where x is a number between 0 and 4 inclusive).
-If the device is required to wake up the system from the target sleep state, the
-lowest power (highest number) state it can be put into is also determined by the
-target state of the system. The kernel is then supposed to use the device's
-_SxW control method to obtain the number of that state. It also is supposed to
-use the device's _PRW control method to learn which power resources need to be
-enabled for the device to be able to generate wakeup signals.
-
-1.4. Wakeup Signaling
----------------------
-Wakeup signals generated by PCI devices, either as native PCI PMEs, or as
-a result of the execution of the _DSW (or _PSW) ACPI control method before
-putting the device into a low-power state, have to be caught and handled as
-appropriate. If they are sent while the system is in the working state
-(ACPI S0), they should be translated into interrupts so that the kernel can
-put the devices generating them into the full-power state and take care of the
-events that triggered them. In turn, if they are sent while the system is
-sleeping, they should cause the system's core logic to trigger wakeup.
-
-On ACPI-based systems wakeup signals sent by conventional PCI devices are
-converted into ACPI General-Purpose Events (GPEs) which are hardware signals
-from the system core logic generated in response to various events that need to
-be acted upon. Every GPE is associated with one or more sources of potentially
-interesting events. In particular, a GPE may be associated with a PCI device
-capable of signaling wakeup. The information on the connections between GPEs
-and event sources is recorded in the system's ACPI BIOS from where it can be
-read by the kernel.
-
-If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE
-associated with it (if there is one) is triggered. The GPEs associated with PCI
-bridges may also be triggered in response to a wakeup signal from one of the
-devices below the bridge (this also is the case for root bridges) and, for
-example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be
-handled this way.
-
-A GPE may be triggered when the system is sleeping (i.e. when it is in one of
-the ACPI S1-S4 states), in which case system wakeup is started by its core logic
-(the device that was the source of the signal causing the system wakeup to occur
-may be identified later). The GPEs used in such situations are referred to as
-wakeup GPEs.
-
-Usually, however, GPEs are also triggered when the system is in the working
-state (ACPI S0) and in that case the system's core logic generates a System
-Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI
-handler identifies the GPE that caused the interrupt to be generated which,
-in turn, allows the kernel to identify the source of the event (that may be
-a PCI device signaling wakeup). The GPEs used for notifying the kernel of
-events occurring while the system is in the working state are referred to as
-runtime GPEs.
-
-Unfortunately, there is no standard way of handling wakeup signals sent by
-conventional PCI devices on systems that are not ACPI-based, but there is one
-for PCI Express devices. Namely, the PCI Express Base Specification introduced
-a native mechanism for converting native PCI PMEs into interrupts generated by
-root ports. For conventional PCI devices native PMEs are out-of-band, so they
-are routed separately and they need not pass through bridges (in principle they
-may be routed directly to the system's core logic), but for PCI Express devices
-they are in-band messages that have to pass through the PCI Express hierarchy,
-including the root port on the path from the device to the Root Complex. Thus
-it was possible to introduce a mechanism by which a root port generates an
-interrupt whenever it receives a PME message from one of the devices below it.
-The PCI Express Requester ID of the device that sent the PME message is then
-recorded in one of the root port's configuration registers from where it may be
-read by the interrupt handler allowing the device to be identified. [PME
-messages sent by PCI Express endpoints integrated with the Root Complex don't
-pass through root ports, but instead they cause a Root Complex Event Collector
-(if there is one) to generate interrupts.]
-
-In principle the native PCI Express PME signaling may also be used on ACPI-based
-systems along with the GPEs, but to use it the kernel has to ask the system's
-ACPI BIOS to release control of root port configuration registers. The ACPI
-BIOS, however, is not required to allow the kernel to control these registers
-and if it doesn't do that, the kernel must not modify their contents. Of course
-the native PCI Express PME signaling cannot be used by the kernel in that case.
-
-
-2. PCI Subsystem and Device Power Management
-============================================
-
-2.1. Device Power Management Callbacks
---------------------------------------
-The PCI Subsystem participates in the power management of PCI devices in a
-number of ways. First of all, it provides an intermediate code layer between
-the device power management core (PM core) and PCI device drivers.
-Specifically, the pm field of the PCI subsystem's struct bus_type object,
-pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing
-pointers to several device power management callbacks:
-
-const struct dev_pm_ops pci_dev_pm_ops = {
- .prepare = pci_pm_prepare,
- .complete = pci_pm_complete,
- .suspend = pci_pm_suspend,
- .resume = pci_pm_resume,
- .freeze = pci_pm_freeze,
- .thaw = pci_pm_thaw,
- .poweroff = pci_pm_poweroff,
- .restore = pci_pm_restore,
- .suspend_noirq = pci_pm_suspend_noirq,
- .resume_noirq = pci_pm_resume_noirq,
- .freeze_noirq = pci_pm_freeze_noirq,
- .thaw_noirq = pci_pm_thaw_noirq,
- .poweroff_noirq = pci_pm_poweroff_noirq,
- .restore_noirq = pci_pm_restore_noirq,
- .runtime_suspend = pci_pm_runtime_suspend,
- .runtime_resume = pci_pm_runtime_resume,
- .runtime_idle = pci_pm_runtime_idle,
-};
-
-These callbacks are executed by the PM core in various situations related to
-device power management and they, in turn, execute power management callbacks
-provided by PCI device drivers. They also perform power management operations
-involving some standard configuration registers of PCI devices that device
-drivers need not know or care about.
-
-The structure representing a PCI device, struct pci_dev, contains several fields
-that these callbacks operate on:
-
-struct pci_dev {
- ...
- pci_power_t current_state; /* Current operating state. */
- int pm_cap; /* PM capability offset in the
- configuration space */
- unsigned int pme_support:5; /* Bitmask of states from which PME#
- can be generated */
- unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */
- unsigned int d1_support:1; /* Low power state D1 is supported */
- unsigned int d2_support:1; /* Low power state D2 is supported */
- unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
- unsigned int wakeup_prepared:1; /* Device prepared for wake up */
- unsigned int d3_delay; /* D3->D0 transition time in ms */
- ...
-};
-
-They also indirectly use some fields of the struct device that is embedded in
-struct pci_dev.
-
-2.2. Device Initialization
---------------------------
-The PCI subsystem's first task related to device power management is to
-prepare the device for power management and initialize the fields of struct
-pci_dev used for this purpose. This happens in two functions defined in
-drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init().
-
-The first of these functions checks if the device supports native PCI PM
-and if that's the case the offset of its power management capability structure
-in the configuration space is stored in the pm_cap field of the device's struct
-pci_dev object. Next, the function checks which PCI low-power states are
-supported by the device and from which low-power states the device can generate
-native PCI PMEs. The power management fields of the device's struct pci_dev and
-the struct device embedded in it are updated accordingly and the generation of
-PMEs by the device is disabled.
-
-The second function checks if the device can be prepared to signal wakeup with
-the help of the platform firmware, such as the ACPI BIOS. If that is the case,
-the function updates the wakeup fields in struct device embedded in the
-device's struct pci_dev and uses the firmware-provided method to prevent the
-device from signaling wakeup.
-
-At this point the device is ready for power management. For driverless devices,
-however, this functionality is limited to a few basic operations carried out
-during system-wide transitions to a sleep state and back to the working state.
-
-2.3. Runtime Device Power Management
-------------------------------------
-The PCI subsystem plays a vital role in the runtime power management of PCI
-devices. For this purpose it uses the general runtime power management
-(runtime PM) framework described in Documentation/power/runtime_pm.txt.
-Namely, it provides subsystem-level callbacks:
-
- pci_pm_runtime_suspend()
- pci_pm_runtime_resume()
- pci_pm_runtime_idle()
-
-that are executed by the core runtime PM routines. It also implements the
-entire mechanics necessary for handling runtime wakeup signals from PCI devices
-in low-power states, which at the time of this writing works for both the native
-PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in
-Section 1.
-
-First, a PCI device is put into a low-power state, or suspended, with the help
-of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call
-pci_pm_runtime_suspend() to do the actual job. For this to work, the device's
-driver has to provide a pm->runtime_suspend() callback (see below), which is
-run by pci_pm_runtime_suspend() as the first action. If the driver's callback
-returns successfully, the device's standard configuration registers are saved,
-the device is prepared to generate wakeup signals and, finally, it is put into
-the target low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup. The exact method of signaling wakeup is
-system-dependent and is determined by the PCI subsystem on the basis of the
-reported capabilities of the device and the platform firmware. To prepare the
-device for signaling wakeup and put it into the selected low-power state, the
-PCI subsystem can use the platform firmware as well as the device's native PCI
-PM capabilities, if supported.
-
-It is expected that the device driver's pm->runtime_suspend() callback will
-not attempt to prepare the device for signaling wakeup or to put it into a
-low-power state. The driver ought to leave these tasks to the PCI subsystem
-that has all of the information necessary to perform them.
-
-A suspended device is brought back into the "active" state, or resumed,
-with the help of pm_request_resume() or pm_runtime_resume() which both call
-pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's
-driver provides a pm->runtime_resume() callback (see below). However, before
-the driver's callback is executed, pci_pm_runtime_resume() brings the device
-back into the full-power state, prevents it from signaling wakeup while in that
-state and restores its standard configuration registers. Thus the driver's
-callback need not worry about the PCI-specific aspects of the device resume.
-
-Note that generally pci_pm_runtime_resume() may be called in two different
-situations. First, it may be called at the request of the device's driver, for
-example if there are some data for it to process. Second, it may be called
-as a result of a wakeup signal from the device itself (this sometimes is
-referred to as "remote wakeup"). Of course, for this purpose the wakeup signal
-is handled in one of the ways described in Section 1 and finally converted into
-a notification for the PCI subsystem after the source device has been
-identified.
-
-The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
-and pm_request_idle(), executes the device driver's pm->runtime_idle()
-callback, if defined, and if that callback doesn't return error code (or is not
-present at all), suspends the device with the help of pm_runtime_suspend().
-Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
-example, it is called right after the device has just been resumed), in which
-cases it is expected to suspend the device if that makes sense. Usually,
-however, the PCI subsystem doesn't really know if the device really can be
-suspended, so it lets the device's driver decide by running its
-pm->runtime_idle() callback.
-
-2.4. System-Wide Power Transitions
-----------------------------------
-There are a few different types of system-wide power transitions, described in
-Documentation/driver-api/pm/devices.rst. Each of them requires devices to be handled
-in a specific way and the PM core executes subsystem-level power management
-callbacks for this purpose. They are executed in phases such that each phase
-involves executing the same subsystem-level callback for every device belonging
-to the given subsystem before the next phase begins. These phases always run
-after tasks have been frozen.
-
-2.4.1. System Suspend
-
-When the system is going into a sleep state in which the contents of memory will
-be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
-
- prepare, suspend, suspend_noirq.
-
-The following PCI bus type's callbacks, respectively, are used in these phases:
-
- pci_pm_prepare()
- pci_pm_suspend()
- pci_pm_suspend_noirq()
-
-The pci_pm_prepare() routine first puts the device into the "fully functional"
-state with the help of pm_runtime_resume(). Then, it executes the device
-driver's pm->prepare() callback if defined (i.e. if the driver's struct
-dev_pm_ops object is present and the prepare pointer in that object is valid).
-
-The pci_pm_suspend() routine first checks if the device's driver implements
-legacy PCI suspend routines (see Section 3), in which case the driver's legacy
-suspend callback is executed, if present, and its result is returned. Next, if
-the device's driver doesn't provide a struct dev_pm_ops object (containing
-pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
-simply turns off the device's bus master capability and runs
-pcibios_disable_device() to disable it, unless the device is a bridge (PCI
-bridges are ignored by this routine). Next, the device driver's pm->suspend()
-callback is executed, if defined, and its result is returned if it fails.
-Finally, pci_fixup_device() is called to apply hardware suspend quirks related
-to the device if necessary.
-
-Note that the suspend phase is carried out asynchronously for PCI devices, so
-the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
-devices that don't depend on each other in a known way (i.e. none of the paths
-in the device tree from the root bridge to a leaf device contains both of them).
-
-The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
-been called, which means that the device driver's interrupt handler won't be
-invoked while this routine is running. It first checks if the device's driver
-implements legacy PCI suspends routines (Section 3), in which case the legacy
-late suspend routine is called and its result is returned (the standard
-configuration registers of the device are saved if the driver's callback hasn't
-done that). Second, if the device driver's struct dev_pm_ops object is not
-present, the device's standard configuration registers are saved and the routine
-returns success. Otherwise the device driver's pm->suspend_noirq() callback is
-executed, if present, and its result is returned if it fails. Next, if the
-device's standard configuration registers haven't been saved yet (one of the
-device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
-saves them, prepares the device to signal wakeup (if necessary) and puts it into
-a low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup while the system is in the target sleep
-state. Just like in the runtime PM case described above, the mechanism of
-signaling wakeup is system-dependent and determined by the PCI subsystem, which
-is also responsible for preparing the device to signal wakeup from the system's
-target sleep state as appropriate.
-
-PCI device drivers (that don't implement legacy power management callbacks) are
-generally not expected to prepare devices for signaling wakeup or to put them
-into low-power states. However, if one of the driver's suspend callbacks
-(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
-registers, pci_pm_suspend_noirq() will assume that the device has been prepared
-to signal wakeup and put into a low-power state by the driver (the driver is
-then assumed to have used the helper functions provided by the PCI subsystem for
-this purpose). PCI device drivers are not encouraged to do that, but in some
-rare cases doing that in the driver may be the optimum approach.
-
-2.4.2. System Resume
-
-When the system is undergoing a transition from a sleep state in which the
-contents of memory have been preserved, such as one of the ACPI sleep states
-S1-S3, into the working state (ACPI S0), the phases are:
-
- resume_noirq, resume, complete.
-
-The following PCI bus type's callbacks, respectively, are executed in these
-phases:
-
- pci_pm_resume_noirq()
- pci_pm_resume()
- pci_pm_complete()
-
-The pci_pm_resume_noirq() routine first puts the device into the full-power
-state, restores its standard configuration registers and applies early resume
-hardware quirks related to the device, if necessary. This is done
-unconditionally, regardless of whether or not the device's driver implements
-legacy PCI power management callbacks (this way all PCI devices are in the
-full-power state and their standard configuration registers have been restored
-when their interrupt handlers are invoked for the first time during resume,
-which allows the kernel to avoid problems with the handling of shared interrupts
-by drivers whose devices are still suspended). If legacy PCI power management
-callbacks (see Section 3) are implemented by the device's driver, the legacy
-early resume callback is executed and its result is returned. Otherwise, the
-device driver's pm->resume_noirq() callback is executed, if defined, and its
-result is returned.
-
-The pci_pm_resume() routine first checks if the device's standard configuration
-registers have been restored and restores them if that's not the case (this
-only is necessary in the error path during a failing suspend). Next, resume
-hardware quirks related to the device are applied, if necessary, and if the
-device's driver implements legacy PCI power management callbacks (see
-Section 3), the driver's legacy resume callback is executed and its result is
-returned. Otherwise, the device's wakeup signaling mechanisms are blocked and
-its driver's pm->resume() callback is executed, if defined (the callback's
-result is then returned).
-
-The resume phase is carried out asynchronously for PCI devices, like the
-suspend phase described above, which means that if two PCI devices don't depend
-on each other in a known way, the pci_pm_resume() routine may be executed for
-the both of them in parallel.
-
-The pci_pm_complete() routine only executes the device driver's pm->complete()
-callback, if defined.
-
-2.4.3. System Hibernation
-
-System hibernation is more complicated than system suspend, because it requires
-a system image to be created and written into a persistent storage medium. The
-image is created atomically and all devices are quiesced, or frozen, before that
-happens.
-
-The freezing of devices is carried out after enough memory has been freed (at
-the time of this writing the image creation requires at least 50% of system RAM
-to be free) in the following three phases:
-
- prepare, freeze, freeze_noirq
-
-that correspond to the PCI bus type's callbacks:
-
- pci_pm_prepare()
- pci_pm_freeze()
- pci_pm_freeze_noirq()
-
-This means that the prepare phase is exactly the same as for system suspend.
-The other two phases, however, are different.
-
-The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
-the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
-and it doesn't apply the suspend-related hardware quirks. It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The pci_pm_freeze_noirq() routine, in turn, is similar to
-pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
-routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the
-device for signaling wakeup and put it into a low-power state. Still, it saves
-the device's standard configuration registers if they haven't been saved by one
-of the driver's callbacks.
-
-Once the image has been created, it has to be saved. However, at this point all
-devices are frozen and they cannot handle I/O, while their ability to handle
-I/O is obviously necessary for the image saving. Thus they have to be brought
-back to the fully functional state and this is done in the following phases:
-
- thaw_noirq, thaw, complete
-
-using the following PCI bus type's callbacks:
-
- pci_pm_thaw_noirq()
- pci_pm_thaw()
- pci_pm_complete()
-
-respectively.
-
-The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
-but it doesn't put the device into the full power state and doesn't attempt to
-restore its standard configuration registers. It also executes the device
-driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
-
-The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
-driver's pm->thaw() callback instead of pm->resume(). It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The complete phase it the same as for system resume.
-
-After saving the image, devices need to be powered down before the system can
-enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in
-three phases:
-
- prepare, poweroff, poweroff_noirq
-
-where the prepare phase is exactly the same as for system suspend. The other
-two phases are analogous to the suspend and suspend_noirq phases, respectively.
-The PCI subsystem-level callbacks they correspond to
-
- pci_pm_poweroff()
- pci_pm_poweroff_noirq()
-
-work in analogy with pci_pm_suspend() and pci_pm_poweroff_noirq(), respectively,
-although they don't attempt to save the device's standard configuration
-registers.
-
-2.4.4. System Restore
-
-System restore requires a hibernation image to be loaded into memory and the
-pre-hibernation memory contents to be restored before the pre-hibernation system
-activity can be resumed.
-
-As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
-into memory by a fresh instance of the kernel, called the boot kernel, which in
-turn is loaded and run by a boot loader in the usual way. After the boot kernel
-has loaded the image, it needs to replace its own code and data with the code
-and data of the "hibernated" kernel stored within the image, called the image
-kernel. For this purpose all devices are frozen just like before creating
-the image during hibernation, in the
-
- prepare, freeze, freeze_noirq
-
-phases described above. However, the devices affected by these phases are only
-those having drivers in the boot kernel; other devices will still be in whatever
-state the boot loader left them.
-
-Should the restoration of the pre-hibernation memory contents fail, the boot
-kernel would go through the "thawing" procedure described above, using the
-thaw_noirq, thaw, and complete phases (that will only affect the devices having
-drivers in the boot kernel), and then continue running normally.
-
-If the pre-hibernation memory contents are restored successfully, which is the
-usual situation, control is passed to the image kernel, which then becomes
-responsible for bringing the system back to the working state. To achieve this,
-it must restore the devices' pre-hibernation functionality, which is done much
-like waking up from the memory sleep state, although it involves different
-phases:
-
- restore_noirq, restore, complete
-
-The first two of these are analogous to the resume_noirq and resume phases
-described above, respectively, and correspond to the following PCI subsystem
-callbacks:
-
- pci_pm_restore_noirq()
- pci_pm_restore()
-
-These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
-respectively, but they execute the device driver's pm->restore_noirq() and
-pm->restore() callbacks, if available.
-
-The complete phase is carried out in exactly the same way as during system
-resume.
-
-
-3. PCI Device Drivers and Power Management
-==========================================
-
-3.1. Power Management Callbacks
--------------------------------
-PCI device drivers participate in power management by providing callbacks to be
-executed by the PCI subsystem's power management routines described above and by
-controlling the runtime power management of their devices.
-
-At the time of this writing there are two ways to define power management
-callbacks for a PCI device driver, the recommended one, based on using a
-dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the
-"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and
-.resume() callbacks from struct pci_driver are used. The legacy approach,
-however, doesn't allow one to define runtime power management callbacks and is
-not really suitable for any new drivers. Therefore it is not covered by this
-document (refer to the source code to learn more about it).
-
-It is recommended that all PCI device drivers define a struct dev_pm_ops object
-containing pointers to power management (PM) callbacks that will be executed by
-the PCI subsystem's PM routines in various circumstances. A pointer to the
-driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
-its struct pci_driver object. Once that has happened, the "legacy" PM callbacks
-in struct pci_driver are ignored (even if they are not NULL).
-
-The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
-defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
-subsystem will handle the device in a simplified default manner. If they are
-defined, though, they are expected to behave as described in the following
-subsections.
-
-3.1.1. prepare()
-
-The prepare() callback is executed during system suspend, during hibernation
-(when a hibernation image is about to be created), during power-off after
-saving a hibernation image and during system restore, when a hibernation image
-has just been loaded into memory.
-
-This callback is only necessary if the driver's device has children that in
-general may be registered at any time. In that case the role of the prepare()
-callback is to prevent new children of the device from being registered until
-one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
-
-In addition to that the prepare() callback may carry out some operations
-preparing the device to be suspended, although it should not allocate memory
-(if additional memory is required to suspend the device, it has to be
-preallocated earlier, for example in a suspend/hibernate notifier as described
-in Documentation/driver-api/pm/notifiers.rst).
-
-3.1.2. suspend()
-
-The suspend() callback is only executed during system suspend, after prepare()
-callbacks have been executed for all devices in the system.
-
-This callback is expected to quiesce the device and prepare it to be put into a
-low-power state by the PCI subsystem. It is not required (in fact it even is
-not recommended) that a PCI driver's suspend() callback save the standard
-configuration registers of the device, prepare it for waking up the system, or
-put it into a low-power state. All of these operations can very well be taken
-care of by the PCI subsystem, without the driver's participation.
-
-However, in some rare case it is convenient to carry out these operations in
-a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and
-pci_set_power_state() should be used to save the device's standard configuration
-registers, to prepare it for system wakeup (if necessary), and to put it into a
-low-power state, respectively. Moreover, if the driver calls pci_save_state(),
-the PCI subsystem will not execute either pci_prepare_to_sleep(), or
-pci_set_power_state() for its device, so the driver is then responsible for
-handling the device as appropriate.
-
-While the suspend() callback is being executed, the driver's interrupt handler
-can be invoked to handle an interrupt from the device, so all suspend-related
-operations relying on the driver's ability to handle interrupts should be
-carried out in this callback.
-
-3.1.3. suspend_noirq()
-
-The suspend_noirq() callback is only executed during system suspend, after
-suspend() callbacks have been executed for all devices in the system and
-after device interrupts have been disabled by the PM core.
-
-The difference between suspend_noirq() and suspend() is that the driver's
-interrupt handler will not be invoked while suspend_noirq() is running. Thus
-suspend_noirq() can carry out operations that would cause race conditions to
-arise if they were performed in suspend().
-
-3.1.4. freeze()
-
-The freeze() callback is hibernation-specific and is executed in two situations,
-during hibernation, after prepare() callbacks have been executed for all devices
-in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory from persistent storage and the
-prepare() callbacks have been executed for all devices.
-
-The role of this callback is analogous to the role of the suspend() callback
-described above. In fact, they only need to be different in the rare cases when
-the driver takes the responsibility for putting the device into a low-power
-state.
-
-In that cases the freeze() callback should not prepare the device system wakeup
-or put it into a low-power state. Still, either it or freeze_noirq() should
-save the device's standard configuration registers using pci_save_state().
-
-3.1.5. freeze_noirq()
-
-The freeze_noirq() callback is hibernation-specific. It is executed during
-hibernation, after prepare() and freeze() callbacks have been executed for all
-devices in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory and after prepare() and
-freeze() callbacks have been executed for all devices. It is always executed
-after device interrupts have been disabled by the PM core.
-
-The role of this callback is analogous to the role of the suspend_noirq()
-callback described above and it very rarely is necessary to define
-freeze_noirq().
-
-The difference between freeze_noirq() and freeze() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.6. poweroff()
-
-The poweroff() callback is hibernation-specific. It is executed when the system
-is about to be powered off after saving a hibernation image to a persistent
-storage. prepare() callbacks are executed for all devices before poweroff() is
-called.
-
-The role of this callback is analogous to the role of the suspend() and freeze()
-callbacks described above, although it does not need to save the contents of
-the device's registers. In particular, if the driver wants to put the device
-into a low-power state itself instead of allowing the PCI subsystem to do that,
-the poweroff() callback should use pci_prepare_to_sleep() and
-pci_set_power_state() to prepare the device for system wakeup and to put it
-into a low-power state, respectively, but it need not save the device's standard
-configuration registers.
-
-3.1.7. poweroff_noirq()
-
-The poweroff_noirq() callback is hibernation-specific. It is executed after
-poweroff() callbacks have been executed for all devices in the system.
-
-The role of this callback is analogous to the role of the suspend_noirq() and
-freeze_noirq() callbacks described above, but it does not need to save the
-contents of the device's registers.
-
-The difference between poweroff_noirq() and poweroff() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.8. resume_noirq()
-
-The resume_noirq() callback is only executed during system resume, after the
-PM core has enabled the non-boot CPUs. The driver's interrupt handler will not
-be invoked while resume_noirq() is running, so this callback can carry out
-operations that might race with the interrupt handler.
-
-Since the PCI subsystem unconditionally puts all devices into the full power
-state in the resume_noirq phase of system resume and restores their standard
-configuration registers, resume_noirq() is usually not necessary. In general
-it should only be used for performing operations that would lead to race
-conditions if carried out by resume().
-
-3.1.9. resume()
-
-The resume() callback is only executed during system resume, after
-resume_noirq() callbacks have been executed for all devices in the system and
-device interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-suspend configuration of the
-device and bringing it back to the fully functional state. The device should be
-able to process I/O in a usual way after resume() has returned.
-
-3.1.10. thaw_noirq()
-
-The thaw_noirq() callback is hibernation-specific. It is executed after a
-system image has been created and the non-boot CPUs have been enabled by the PM
-core, in the thaw_noirq phase of hibernation. It also may be executed if the
-loading of a hibernation image fails during system restore (it is then executed
-after enabling the non-boot CPUs). The driver's interrupt handler will not be
-invoked while thaw_noirq() is running.
-
-The role of this callback is analogous to the role of resume_noirq(). The
-difference between these two callbacks is that thaw_noirq() is executed after
-freeze() and freeze_noirq(), so in general it does not need to modify the
-contents of the device's registers.
-
-3.1.11. thaw()
-
-The thaw() callback is hibernation-specific. It is executed after thaw_noirq()
-callbacks have been executed for all devices in the system and after device
-interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-freeze configuration of
-the device, so that it will work in a usual way after thaw() has returned.
-
-3.1.12. restore_noirq()
-
-The restore_noirq() callback is hibernation-specific. It is executed in the
-restore_noirq phase of hibernation, when the boot kernel has passed control to
-the image kernel and the non-boot CPUs have been enabled by the image kernel's
-PM core.
-
-This callback is analogous to resume_noirq() with the exception that it cannot
-make any assumption on the previous state of the device, even if the BIOS (or
-generally the platform firmware) is known to preserve that state over a
-suspend-resume cycle.
-
-For the vast majority of PCI device drivers there is no difference between
-resume_noirq() and restore_noirq().
-
-3.1.13. restore()
-
-The restore() callback is hibernation-specific. It is executed after
-restore_noirq() callbacks have been executed for all devices in the system and
-after the PM core has enabled device drivers' interrupt handlers to be invoked.
-
-This callback is analogous to resume(), just like restore_noirq() is analogous
-to resume_noirq(). Consequently, the difference between restore_noirq() and
-restore() is analogous to the difference between resume_noirq() and resume().
-
-For the vast majority of PCI device drivers there is no difference between
-resume() and restore().
-
-3.1.14. complete()
-
-The complete() callback is executed in the following situations:
- - during system resume, after resume() callbacks have been executed for all
- devices,
- - during hibernation, before saving the system image, after thaw() callbacks
- have been executed for all devices,
- - during system restore, when the system is going back to its pre-hibernation
- state, after restore() callbacks have been executed for all devices.
-It also may be executed if the loading of a hibernation image into memory fails
-(in that case it is run after thaw() callbacks have been executed for all
-devices that have drivers in the boot kernel).
-
-This callback is entirely optional, although it may be necessary if the
-prepare() callback performs operations that need to be reversed.
-
-3.1.15. runtime_suspend()
-
-The runtime_suspend() callback is specific to device runtime power management
-(runtime PM). It is executed by the PM core's runtime PM framework when the
-device is about to be suspended (i.e. quiesced and put into a low-power state)
-at run time.
-
-This callback is responsible for freezing the device and preparing it to be
-put into a low-power state, but it must allow the PCI subsystem to perform all
-of the PCI-specific actions necessary for suspending the device.
-
-3.1.16. runtime_resume()
-
-The runtime_resume() callback is specific to device runtime PM. It is executed
-by the PM core's runtime PM framework when the device is about to be resumed
-(i.e. put into the full-power state and programmed to process I/O normally) at
-run time.
-
-This callback is responsible for restoring the normal functionality of the
-device after it has been put into the full-power state by the PCI subsystem.
-The device is expected to be able to process I/O in the usual way after
-runtime_resume() has returned.
-
-3.1.17. runtime_idle()
-
-The runtime_idle() callback is specific to device runtime PM. It is executed
-by the PM core's runtime PM framework whenever it may be desirable to suspend
-the device according to the PM core's information. In particular, it is
-automatically executed right after runtime_resume() has returned in case the
-resume of the device has happened as a result of a spurious event.
-
-This callback is optional, but if it is not implemented or if it returns 0, the
-PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
-cause the driver's runtime_suspend() callback to be executed.
-
-3.1.18. Pointing Multiple Callback Pointers to One Routine
-
-Although in principle each of the callbacks described in the previous
-subsections can be defined as a separate function, it often is convenient to
-point two or more members of struct dev_pm_ops to the same routine. There are
-a few convenience macros that can be used for this purpose.
-
-The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
-suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
-members and one resume routine pointed to by the .resume(), .thaw(), and
-.restore() members. The other function pointers in this struct dev_pm_ops are
-unset.
-
-The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
-additionally sets the .runtime_resume() pointer to the same value as
-.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
-the same value as .suspend() (and .freeze() and .poweroff()).
-
-The SET_SYSTEM_SLEEP_PM_OPS can be used inside of a declaration of struct
-dev_pm_ops to indicate that one suspend routine is to be pointed to by the
-.suspend(), .freeze(), and .poweroff() members and one resume routine is to
-be pointed to by the .resume(), .thaw(), and .restore() members.
-
-3.1.19. Driver Flags for Power Management
-
-The PM core allows device drivers to set flags that influence the handling of
-power management for the devices by the core itself and by middle layer code
-including the PCI bus type. The flags should be set once at the driver probe
-time with the help of the dev_pm_set_driver_flags() function and they should not
-be updated directly afterwards.
-
-The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
-mechanism allowing device suspend/resume callbacks to be skipped if the device
-is in runtime suspend when the system suspend starts. That also affects all of
-the ancestors of the device, so this flag should only be used if absolutely
-necessary.
-
-The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
-positive value from pci_pm_prepare() if the ->prepare callback provided by the
-driver of the device returns a positive value. That allows the driver to opt
-out from using the direct-complete mechanism dynamically.
-
-The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
-perspective the device can be safely left in runtime suspend during system
-suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
-to skip resuming the device from runtime suspend unless there are PCI-specific
-reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(),
-pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
-if the device remains in runtime suspend in the beginning of the "late" phase
-of the system-wide transition under way. Moreover, if the device is in
-runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
-power management status will be changed to "active" (as it is going to be put
-into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
-the function will set the power.direct_complete flag for it (to make the PM core
-skip the subsequent "thaw" callbacks for it) and return.
-
-Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
-device to be left in suspend after system-wide transitions to the working state.
-This flag is checked by the PM core, but the PCI bus type informs the PM core
-which devices may be left in suspend from its perspective (that happens during
-the "noirq" phase of system-wide suspend and analogous transitions) and next it
-uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
-pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
-callbacks for the device during the transition under way and will set its
-runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
-it.
-
-3.2. Device Runtime Power Management
-------------------------------------
-In addition to providing device power management callbacks PCI device drivers
-are responsible for controlling the runtime power management (runtime PM) of
-their devices.
-
-The PCI device runtime PM is optional, but it is recommended that PCI device
-drivers implement it at least in the cases where there is a reliable way of
-verifying that the device is not used (like when the network cable is detached
-from an Ethernet adapter or there are no devices attached to a USB controller).
-
-To support the PCI runtime PM the driver first needs to implement the
-runtime_suspend() and runtime_resume() callbacks. It also may need to implement
-the runtime_idle() callback to prevent the device from being suspended again
-every time right after the runtime_resume() callback has returned
-(alternatively, the runtime_suspend() callback will have to check if the
-device should really be suspended and return -EAGAIN if that is not the case).
-
-The runtime PM of PCI devices is enabled by default by the PCI core. PCI
-device drivers do not need to enable it and should not attempt to do so.
-However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
-helper function. In addition to that, the runtime PM usage counter of
-each PCI device is incremented by local_pci_probe() before executing the
-probe callback provided by the device's driver.
-
-If a PCI driver implements the runtime PM callbacks and intends to use the
-runtime PM framework provided by the PM core and the PCI subsystem, it needs
-to decrement the device's runtime PM usage counter in its probe callback
-function. If it doesn't do that, the counter will always be different from
-zero for the device and it will never be runtime-suspended. The simplest
-way to do that is by calling pm_runtime_put_noidle(), but if the driver
-wants to schedule an autosuspend right away, for example, it may call
-pm_runtime_put_autosuspend() instead for this purpose. Generally, it
-just needs to call a function that decrements the devices usage counter
-from its probe routine to make runtime PM work for the device.
-
-It is important to remember that the driver's runtime_suspend() callback
-may be executed right after the usage counter has been decremented, because
-user space may already have caused the pm_runtime_allow() helper function
-unblocking the runtime PM of the device to run via sysfs, so the driver must
-be prepared to cope with that.
-
-The driver itself should not call pm_runtime_allow(), though. Instead, it
-should let user space or some platform-specific code do that (user space can
-do it via sysfs as stated above), but it must be prepared to handle the
-runtime PM of the device correctly as soon as pm_runtime_allow() is called
-(which may happen at any time, even before the driver is loaded).
-
-When the driver's remove callback runs, it has to balance the decrementation
-of the device's runtime PM usage counter at the probe time. For this reason,
-if it has decremented the counter in its probe callback, it must run
-pm_runtime_get_noresume() in its remove callback. [Since the core carries
-out a runtime resume of the device and bumps up the device's usage counter
-before running the driver's remove callback, the runtime PM of the device
-is effectively disabled for the duration of the remove execution and all
-runtime PM helper functions incrementing the device's usage counter are
-then effectively equivalent to pm_runtime_get_noresume().]
-
-The runtime PM framework works by processing requests to suspend or resume
-devices, or to check if they are idle (in which cases it is reasonable to
-subsequently request that they be suspended). These requests are represented
-by work items put into the power management workqueue, pm_wq. Although there
-are a few situations in which power management requests are automatically
-queued by the PM core (for example, after processing a request to resume a
-device the PM core automatically queues a request to check if the device is
-idle), device drivers are generally responsible for queuing power management
-requests for their devices. For this purpose they should use the runtime PM
-helper functions provided by the PM core, discussed in
-Documentation/power/runtime_pm.txt.
-
-Devices can also be suspended and resumed synchronously, without placing a
-request into pm_wq. In the majority of cases this also is done by their
-drivers that use helper functions provided by the PM core for this purpose.
-
-For more information on the runtime PM of devices refer to
-Documentation/power/runtime_pm.txt.
-
-
-4. Resources
-============
-
-PCI Local Bus Specification, Rev. 3.0
-PCI Bus Power Management Interface Specification, Rev. 1.2
-Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
-PCI Express Base Specification, Rev. 2.0
-Documentation/driver-api/pm/devices.rst
-Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
new file mode 100644
index 000000000000..69921f072ce1
--- /dev/null
+++ b/Documentation/power/pm_qos_interface.rst
@@ -0,0 +1,227 @@
+===============================
+PM Quality Of Service Interface
+===============================
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Two different PM QoS frameworks are available:
+1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
+memory_bandwidth.
+2. the per-device PM QoS framework provides the API to manage the per-device latency
+constraints and PM QoS flags.
+
+Each parameters have defined units:
+
+ * latency: usec
+ * timeout: usec
+ * throughput: kbs (kilo bit / sec)
+ * memory bandwidth: mbs (mega bit / sec)
+
+
+1. PM QoS framework
+===================
+
+The infrastructure exposes multiple misc device nodes one per implemented
+parameter. The set of parameters implement is defined by pm_qos_power_init()
+and pm_qos_params.h. This is done because having the available parameters
+being runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requests is maintained along with
+an aggregated target value. The aggregated target value is updated with
+changes to the request list or elements of the list. Typically the
+aggregated target value is simply the max or min of the request values held
+in the parameter list elements.
+Note: the aggregated target value is implemented as an atomic variable so that
+reading the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is simple:
+
+void pm_qos_add_request(handle, param_class, target_value):
+ Will insert an element into the list for that identified PM QoS class with the
+ target value. Upon change to this list the new target is recomputed and any
+ registered notifiers are called only if the target value is now different.
+ Clients of pm_qos need to save the returned handle for future use in other
+ pm_qos API functions.
+
+void pm_qos_update_request(handle, new_target_value):
+ Will update the list element pointed to by the handle with the new target value
+ and recompute the new aggregated target, calling the notification tree if the
+ target is changed.
+
+void pm_qos_remove_request(handle):
+ Will remove the element. After removal it will update the aggregate target and
+ call the notification tree if the target was changed as a result of removing
+ the request.
+
+int pm_qos_request(param_class):
+ Returns the aggregated value for a given PM QoS class.
+
+int pm_qos_request_active(handle):
+ Returns if the request is still active, i.e. it has not been removed from a
+ PM QoS class constraints list.
+
+int pm_qos_add_notifier(param_class, notifier):
+ Adds a notification callback function to the PM QoS class. The callback is
+ called when the aggregated value for the PM QoS class is changed.
+
+int pm_qos_remove_notifier(int param_class, notifier):
+ Removes the notification callback function for the PM QoS class.
+
+
+From user mode:
+
+Only processes can register a pm_qos request. To provide for automatic
+cleanup of a process, the interface requires the process to register its
+parameter requests in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+request on the parameter.
+
+To change the requested target value the process needs to write an s32 value to
+the open device node. Alternatively the user mode program could write a hex
+string for the value using 10 char long format e.g. "0x12345678". This
+translates to a pm_qos_update_request call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
+
+2. PM QoS per-device latency and flags framework
+================================================
+
+For each device, there are three lists of PM QoS requests. Two of them are
+maintained along with the aggregated targets of resume latency and active
+state latency tolerance (in microseconds) and the third one is for PM QoS flags.
+Values are updated in response to changes of the request list.
+
+The target values of resume latency and active state latency tolerance are
+simply the minimum of the request values held in the parameter list elements.
+The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
+values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
+
+Note: The aggregated target values are implemented in such a way that reading
+the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is the following:
+
+int dev_pm_qos_add_request(device, handle, type, value):
+ Will insert an element into the list for that identified device with the
+ target value. Upon change to this list the new target is recomputed and any
+ registered notifiers are called only if the target value is now different.
+ Clients of dev_pm_qos need to save the handle for future use in other
+ dev_pm_qos API functions.
+
+int dev_pm_qos_update_request(handle, new_value):
+ Will update the list element pointed to by the handle with the new target
+ value and recompute the new aggregated target, calling the notification
+ trees if the target is changed.
+
+int dev_pm_qos_remove_request(handle):
+ Will remove the element. After removal it will update the aggregate target
+ and call the notification trees if the target was changed as a result of
+ removing the request.
+
+s32 dev_pm_qos_read_value(device, type):
+ Returns the aggregated value for a given device's constraints list.
+
+enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
+ Check PM QoS flags of the given device against the given mask of flags.
+ The meaning of the return values is as follows:
+
+ PM_QOS_FLAGS_ALL:
+ All flags from the mask are set
+ PM_QOS_FLAGS_SOME:
+ Some flags from the mask are set
+ PM_QOS_FLAGS_NONE:
+ No flags from the mask are set
+ PM_QOS_FLAGS_UNDEFINED:
+ The device's PM QoS structure has not been initialized
+ or the list of requests is empty.
+
+int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
+ Add a PM QoS request for the first direct ancestor of the given device whose
+ power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
+ or whose power.set_latency_tolerance callback pointer is not NULL (for
+ DEV_PM_QOS_LATENCY_TOLERANCE requests).
+
+int dev_pm_qos_expose_latency_limit(device, value)
+ Add a request to the device's PM QoS list of resume latency constraints and
+ create a sysfs attribute pm_qos_resume_latency_us under the device's power
+ directory allowing user space to manipulate that request.
+
+void dev_pm_qos_hide_latency_limit(device)
+ Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
+ PM QoS list of resume latency constraints and remove sysfs attribute
+ pm_qos_resume_latency_us from the device's power directory.
+
+int dev_pm_qos_expose_flags(device, value)
+ Add a request to the device's PM QoS list of flags and create sysfs attribute
+ pm_qos_no_power_off under the device's power directory allowing user space to
+ change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
+
+void dev_pm_qos_hide_flags(device)
+ Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
+ of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
+ directory.
+
+Notification mechanisms:
+
+The per-device PM QoS framework has a per-device notification tree.
+
+int dev_pm_qos_add_notifier(device, notifier, type):
+ Adds a notification callback function for the device for a particular request
+ type.
+
+ The callback is called when the aggregated value of the device constraints list
+ is changed.
+
+int dev_pm_qos_remove_notifier(device, notifier, type):
+ Removes the notification callback function for the device.
+
+
+Active state latency tolerance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This device PM QoS type is used to support systems in which hardware may switch
+to energy-saving operation modes on the fly. In those systems, if the operation
+mode chosen by the hardware attempts to save energy in an overly aggressive way,
+it may cause excess latencies to be visible to software, causing it to miss
+certain protocol requirements or target frame or sample rates etc.
+
+If there is a latency tolerance control mechanism for a given device available
+to software, the .set_latency_tolerance callback in that device's dev_pm_info
+structure should be populated. The routine pointed to by it is should implement
+whatever is necessary to transfer the effective requirement value to the
+hardware.
+
+Whenever the effective latency tolerance changes for the device, its
+.set_latency_tolerance() callback will be executed and the effective value will
+be passed to it. If that value is negative, which means that the list of
+latency tolerance requirements for the device is empty, the callback is expected
+to switch the underlying hardware latency tolerance control mechanism to an
+autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and
+the hardware supports a special "no requirement" setting, the callback is
+expected to use it. That allows software to prevent the hardware from
+automatically updating the device's latency tolerance in response to its power
+state changes (e.g. during transitions from D3cold to D0), which generally may
+be done in the autonomous latency tolerance control mode.
+
+If .set_latency_tolerance() is present for the device, sysfs attribute
+pm_qos_latency_tolerance_us will be present in the devivce's power directory.
+Then, user space can use that attribute to specify its latency tolerance
+requirement for the device, if any. Writing "any" to it means "no requirement,
+but do not let the hardware control latency tolerance" and writing "auto" to it
+allows the hardware to be switched to the autonomous mode if there are no other
+requirements from the kernel side in the device's list.
+
+Kernel code can use the functions described above along with the
+DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
+latency tolerance requirements for devices.
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
deleted file mode 100644
index 19c5f7b1a7ba..000000000000
--- a/Documentation/power/pm_qos_interface.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-PM Quality Of Service Interface.
-
-This interface provides a kernel and user mode interface for registering
-performance expectations by drivers, subsystems and user space applications on
-one of the parameters.
-
-Two different PM QoS frameworks are available:
-1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
-memory_bandwidth.
-2. the per-device PM QoS framework provides the API to manage the per-device latency
-constraints and PM QoS flags.
-
-Each parameters have defined units:
- * latency: usec
- * timeout: usec
- * throughput: kbs (kilo bit / sec)
- * memory bandwidth: mbs (mega bit / sec)
-
-
-1. PM QoS framework
-
-The infrastructure exposes multiple misc device nodes one per implemented
-parameter. The set of parameters implement is defined by pm_qos_power_init()
-and pm_qos_params.h. This is done because having the available parameters
-being runtime configurable or changeable from a driver was seen as too easy to
-abuse.
-
-For each parameter a list of performance requests is maintained along with
-an aggregated target value. The aggregated target value is updated with
-changes to the request list or elements of the list. Typically the
-aggregated target value is simply the max or min of the request values held
-in the parameter list elements.
-Note: the aggregated target value is implemented as an atomic variable so that
-reading the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is simple:
-
-void pm_qos_add_request(handle, param_class, target_value):
-Will insert an element into the list for that identified PM QoS class with the
-target value. Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of pm_qos need to save the returned handle for future use in other
-pm_qos API functions.
-
-void pm_qos_update_request(handle, new_target_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification tree if the
-target is changed.
-
-void pm_qos_remove_request(handle):
-Will remove the element. After removal it will update the aggregate target and
-call the notification tree if the target was changed as a result of removing
-the request.
-
-int pm_qos_request(param_class):
-Returns the aggregated value for a given PM QoS class.
-
-int pm_qos_request_active(handle):
-Returns if the request is still active, i.e. it has not been removed from a
-PM QoS class constraints list.
-
-int pm_qos_add_notifier(param_class, notifier):
-Adds a notification callback function to the PM QoS class. The callback is
-called when the aggregated value for the PM QoS class is changed.
-
-int pm_qos_remove_notifier(int param_class, notifier):
-Removes the notification callback function for the PM QoS class.
-
-
-From user mode:
-Only processes can register a pm_qos request. To provide for automatic
-cleanup of a process, the interface requires the process to register its
-parameter requests in the following way:
-
-To register the default pm_qos target for the specific parameter, the process
-must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
-
-As long as the device node is held open that process has a registered
-request on the parameter.
-
-To change the requested target value the process needs to write an s32 value to
-the open device node. Alternatively the user mode program could write a hex
-string for the value using 10 char long format e.g. "0x12345678". This
-translates to a pm_qos_update_request call.
-
-To remove the user mode request for a target value simply close the device
-node.
-
-
-2. PM QoS per-device latency and flags framework
-
-For each device, there are three lists of PM QoS requests. Two of them are
-maintained along with the aggregated targets of resume latency and active
-state latency tolerance (in microseconds) and the third one is for PM QoS flags.
-Values are updated in response to changes of the request list.
-
-The target values of resume latency and active state latency tolerance are
-simply the minimum of the request values held in the parameter list elements.
-The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements'
-values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
-
-Note: The aggregated target values are implemented in such a way that reading
-the aggregated value does not require any locking mechanism.
-
-
-From kernel mode the use of this interface is the following:
-
-int dev_pm_qos_add_request(device, handle, type, value):
-Will insert an element into the list for that identified device with the
-target value. Upon change to this list the new target is recomputed and any
-registered notifiers are called only if the target value is now different.
-Clients of dev_pm_qos need to save the handle for future use in other
-dev_pm_qos API functions.
-
-int dev_pm_qos_update_request(handle, new_value):
-Will update the list element pointed to by the handle with the new target value
-and recompute the new aggregated target, calling the notification trees if the
-target is changed.
-
-int dev_pm_qos_remove_request(handle):
-Will remove the element. After removal it will update the aggregate target and
-call the notification trees if the target was changed as a result of removing
-the request.
-
-s32 dev_pm_qos_read_value(device):
-Returns the aggregated value for a given device's constraints list.
-
-enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
-Check PM QoS flags of the given device against the given mask of flags.
-The meaning of the return values is as follows:
- PM_QOS_FLAGS_ALL: All flags from the mask are set
- PM_QOS_FLAGS_SOME: Some flags from the mask are set
- PM_QOS_FLAGS_NONE: No flags from the mask are set
- PM_QOS_FLAGS_UNDEFINED: The device's PM QoS structure has not been
- initialized or the list of requests is empty.
-
-int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
-Add a PM QoS request for the first direct ancestor of the given device whose
-power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
-or whose power.set_latency_tolerance callback pointer is not NULL (for
-DEV_PM_QOS_LATENCY_TOLERANCE requests).
-
-int dev_pm_qos_expose_latency_limit(device, value)
-Add a request to the device's PM QoS list of resume latency constraints and
-create a sysfs attribute pm_qos_resume_latency_us under the device's power
-directory allowing user space to manipulate that request.
-
-void dev_pm_qos_hide_latency_limit(device)
-Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
-PM QoS list of resume latency constraints and remove sysfs attribute
-pm_qos_resume_latency_us from the device's power directory.
-
-int dev_pm_qos_expose_flags(device, value)
-Add a request to the device's PM QoS list of flags and create sysfs attribute
-pm_qos_no_power_off under the device's power directory allowing user space to
-change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
-
-void dev_pm_qos_hide_flags(device)
-Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
-of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
-directory.
-
-Notification mechanisms:
-The per-device PM QoS framework has a per-device notification tree.
-
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
-The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
-
-int dev_pm_qos_remove_notifier(device, notifier):
-Removes the notification callback function for the device.
-
-
-Active state latency tolerance
-
-This device PM QoS type is used to support systems in which hardware may switch
-to energy-saving operation modes on the fly. In those systems, if the operation
-mode chosen by the hardware attempts to save energy in an overly aggressive way,
-it may cause excess latencies to be visible to software, causing it to miss
-certain protocol requirements or target frame or sample rates etc.
-
-If there is a latency tolerance control mechanism for a given device available
-to software, the .set_latency_tolerance callback in that device's dev_pm_info
-structure should be populated. The routine pointed to by it is should implement
-whatever is necessary to transfer the effective requirement value to the
-hardware.
-
-Whenever the effective latency tolerance changes for the device, its
-.set_latency_tolerance() callback will be executed and the effective value will
-be passed to it. If that value is negative, which means that the list of
-latency tolerance requirements for the device is empty, the callback is expected
-to switch the underlying hardware latency tolerance control mechanism to an
-autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and
-the hardware supports a special "no requirement" setting, the callback is
-expected to use it. That allows software to prevent the hardware from
-automatically updating the device's latency tolerance in response to its power
-state changes (e.g. during transitions from D3cold to D0), which generally may
-be done in the autonomous latency tolerance control mode.
-
-If .set_latency_tolerance() is present for the device, sysfs attribute
-pm_qos_latency_tolerance_us will be present in the devivce's power directory.
-Then, user space can use that attribute to specify its latency tolerance
-requirement for the device, if any. Writing "any" to it means "no requirement,
-but do not let the hardware control latency tolerance" and writing "auto" to it
-allows the hardware to be switched to the autonomous mode if there are no other
-requirements from the kernel side in the device's list.
-
-Kernel code can use the functions described above along with the
-DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
-latency tolerance requirements for devices.
diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst
new file mode 100644
index 000000000000..7b8c42f8b1de
--- /dev/null
+++ b/Documentation/power/power_supply_class.rst
@@ -0,0 +1,288 @@
+========================
+Linux power supply class
+========================
+
+Synopsis
+~~~~~~~~
+Power supply class used to represent battery, UPS, AC or DC power supply
+properties to user-space.
+
+It defines core set of attributes, which should be applicable to (almost)
+every power supply out there. Attributes are available via sysfs and uevent
+interfaces.
+
+Each attribute has well defined meaning, up to unit of measure used. While
+the attributes provided are believed to be universally applicable to any
+power supply, specific monitoring hardware may not be able to provide them
+all, so any of them may be skipped.
+
+Power supply class is extensible, and allows to define drivers own attributes.
+The core attribute set is subject to the standard Linux evolution (i.e.
+if it will be found that some attribute is applicable to many power supply
+types or their drivers, it can be added to the core set).
+
+It also integrates with LED framework, for the purpose of providing
+typically expected feedback of battery charging/fully charged status and
+AC/USB power supply online status. (Note that specific details of the
+indication (including whether to use it at all) are fully controllable by
+user and/or specific machine defaults, per design principles of LED
+framework).
+
+
+Attributes/properties
+~~~~~~~~~~~~~~~~~~~~~
+Power supply class has predefined set of attributes, this eliminates code
+duplication across drivers. Power supply class insist on reusing its
+predefined attributes *and* their units.
+
+So, userspace gets predictable set of attributes and their units for any
+kind of power supply, and can process/present them to a user in consistent
+manner. Results for different power supplies and machines are also directly
+comparable.
+
+See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
+for the example how to declare and handle attributes.
+
+
+Units
+~~~~~
+Quoting include/linux/power_supply.h:
+
+ All voltages, currents, charges, energies, time and temperatures in µV,
+ µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
+ stated. It's driver's job to convert its raw values to units in which
+ this class operates.
+
+
+Attributes/properties detailed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------------------------------------------------------+
+| **Charge/Energy/Capacity - how to not confuse** |
++--------------------------------------------------------------------------+
+| **Because both "charge" (µAh) and "energy" (µWh) represents "capacity" |
+| of battery, this class distinguish these terms. Don't mix them!** |
+| |
+| - `CHARGE_*` |
+| attributes represents capacity in µAh only. |
+| - `ENERGY_*` |
+| attributes represents capacity in µWh only. |
+| - `CAPACITY` |
+| attribute represents capacity in *percents*, from 0 to 100. |
++--------------------------------------------------------------------------+
+
+Postfixes:
+
+_AVG
+ *hardware* averaged value, use it if your hardware is really able to
+ report averaged values.
+_NOW
+ momentary/instantaneous values.
+
+STATUS
+ this attribute represents operating status (charging, full,
+ discharging (i.e. powering a load), etc.). This corresponds to
+ `BATTERY_STATUS_*` values, as defined in battery.h.
+
+CHARGE_TYPE
+ batteries can typically charge at different rates.
+ This defines trickle and fast charges. For batteries that
+ are already charged or discharging, 'n/a' can be displayed (or
+ 'unknown', if the status is not known).
+
+AUTHENTIC
+ indicates the power supply (battery or charger) connected
+ to the platform is authentic(1) or non authentic(0).
+
+HEALTH
+ represents health of the battery, values corresponds to
+ POWER_SUPPLY_HEALTH_*, defined in battery.h.
+
+VOLTAGE_OCV
+ open circuit voltage of the battery.
+
+VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN
+ design values for maximal and minimal power supply voltages.
+ Maximal/minimal means values of voltages when battery considered
+ "full"/"empty" at normal conditions. Yes, there is no direct relation
+ between voltage and battery capacity, but some dumb
+ batteries use voltage for very approximated calculation of capacity.
+ Battery driver also can use this attribute just to inform userspace
+ about maximal and minimal voltage thresholds of a given battery.
+
+VOLTAGE_MAX, VOLTAGE_MIN
+ same as _DESIGN voltage values except that these ones should be used
+ if hardware could only guess (measure and retain) the thresholds of a
+ given power supply.
+
+VOLTAGE_BOOT
+ Reports the voltage measured during boot
+
+CURRENT_BOOT
+ Reports the current measured during boot
+
+CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN
+ design charge values, when battery considered full/empty.
+
+ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN
+ same as above but for energy.
+
+CHARGE_FULL, CHARGE_EMPTY
+ These attributes means "last remembered value of charge when battery
+ became full/empty". It also could mean "value of charge when battery
+ considered full/empty at given conditions (temperature, age)".
+ I.e. these attributes represents real thresholds, not design values.
+
+ENERGY_FULL, ENERGY_EMPTY
+ same as above but for energy.
+
+CHARGE_COUNTER
+ the current charge counter (in µAh). This could easily
+ be negative; there is no empty or full value. It is only useful for
+ relative, time-based measurements.
+
+PRECHARGE_CURRENT
+ the maximum charge current during precharge phase of charge cycle
+ (typically 20% of battery capacity).
+
+CHARGE_TERM_CURRENT
+ Charge termination current. The charge cycle terminates when battery
+ voltage is above recharge threshold, and charge current is below
+ this setting (typically 10% of battery capacity).
+
+CONSTANT_CHARGE_CURRENT
+ constant charge current programmed by charger.
+
+
+CONSTANT_CHARGE_CURRENT_MAX
+ maximum charge current supported by the power supply object.
+
+CONSTANT_CHARGE_VOLTAGE
+ constant charge voltage programmed by charger.
+CONSTANT_CHARGE_VOLTAGE_MAX
+ maximum charge voltage supported by the power supply object.
+
+INPUT_CURRENT_LIMIT
+ input current limit programmed by charger. Indicates
+ the current drawn from a charging source.
+INPUT_VOLTAGE_LIMIT
+ input voltage limit programmed by charger. Indicates
+ the voltage limit from a charging source.
+INPUT_POWER_LIMIT
+ input power limit programmed by charger. Indicates
+ the power limit from a charging source.
+
+CHARGE_CONTROL_LIMIT
+ current charge control limit setting
+CHARGE_CONTROL_LIMIT_MAX
+ maximum charge control limit setting
+
+CALIBRATE
+ battery or coulomb counter calibration status
+
+CAPACITY
+ capacity in percents.
+CAPACITY_ALERT_MIN
+ minimum capacity alert value in percents.
+CAPACITY_ALERT_MAX
+ maximum capacity alert value in percents.
+CAPACITY_LEVEL
+ capacity level. This corresponds to POWER_SUPPLY_CAPACITY_LEVEL_*.
+
+TEMP
+ temperature of the power supply.
+TEMP_ALERT_MIN
+ minimum battery temperature alert.
+TEMP_ALERT_MAX
+ maximum battery temperature alert.
+TEMP_AMBIENT
+ ambient temperature.
+TEMP_AMBIENT_ALERT_MIN
+ minimum ambient temperature alert.
+TEMP_AMBIENT_ALERT_MAX
+ maximum ambient temperature alert.
+TEMP_MIN
+ minimum operatable temperature
+TEMP_MAX
+ maximum operatable temperature
+
+TIME_TO_EMPTY
+ seconds left for battery to be considered empty
+ (i.e. while battery powers a load)
+TIME_TO_FULL
+ seconds left for battery to be considered full
+ (i.e. while battery is charging)
+
+
+Battery <-> external power supply interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Often power supplies are acting as supplies and supplicants at the same
+time. Batteries are good example. So, batteries usually care if they're
+externally powered or not.
+
+For that case, power supply class implements notification mechanism for
+batteries.
+
+External power supply (AC) lists supplicants (batteries) names in
+"supplied_to" struct member, and each power_supply_changed() call
+issued by external power supply will notify supplicants via
+external_power_changed callback.
+
+
+Devicetree battery characteristics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Drivers should call power_supply_get_battery_info() to obtain battery
+characteristics from a devicetree battery node, defined in
+Documentation/devicetree/bindings/power/supply/battery.txt. This is
+implemented in drivers/power/supply/bq27xxx_battery.c.
+
+Properties in struct power_supply_battery_info and their counterparts in the
+battery node have names corresponding to elements in enum power_supply_property,
+for naming consistency between sysfs attributes and battery node properties.
+
+
+QA
+~~
+
+Q:
+ Where is POWER_SUPPLY_PROP_XYZ attribute?
+A:
+ If you cannot find attribute suitable for your driver needs, feel free
+ to add it and send patch along with your driver.
+
+ The attributes available currently are the ones currently provided by the
+ drivers written.
+
+ Good candidates to add in future: model/part#, cycle_time, manufacturer,
+ etc.
+
+
+Q:
+ I have some very specific attribute (e.g. battery color), should I add
+ this attribute to standard ones?
+A:
+ Most likely, no. Such attribute can be placed in the driver itself, if
+ it is useful. Of course, if the attribute in question applicable to
+ large set of batteries, provided by many drivers, and/or comes from
+ some general battery specification/standard, it may be a candidate to
+ be added to the core attribute set.
+
+
+Q:
+ Suppose, my battery monitoring chip/firmware does not provides capacity
+ in percents, but provides charge_{now,full,empty}. Should I calculate
+ percentage capacity manually, inside the driver, and register CAPACITY
+ attribute? The same question about time_to_empty/time_to_full.
+A:
+ Most likely, no. This class is designed to export properties which are
+ directly measurable by the specific hardware available.
+
+ Inferring not available properties using some heuristics or mathematical
+ model is not subject of work for a battery driver. Such functionality
+ should be factored out, and in fact, apm_power, the driver to serve
+ legacy APM API on top of power supply class, uses a simple heuristic of
+ approximating remaining battery capacity based on its charge, current,
+ voltage and so on. But full-fledged battery model is likely not subject
+ for kernel at all, as it would require floating point calculation to deal
+ with things like differential equations and Kalman filters. This is
+ better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
deleted file mode 100644
index 300d37896e51..000000000000
--- a/Documentation/power/power_supply_class.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Linux power supply class
-========================
-
-Synopsis
-~~~~~~~~
-Power supply class used to represent battery, UPS, AC or DC power supply
-properties to user-space.
-
-It defines core set of attributes, which should be applicable to (almost)
-every power supply out there. Attributes are available via sysfs and uevent
-interfaces.
-
-Each attribute has well defined meaning, up to unit of measure used. While
-the attributes provided are believed to be universally applicable to any
-power supply, specific monitoring hardware may not be able to provide them
-all, so any of them may be skipped.
-
-Power supply class is extensible, and allows to define drivers own attributes.
-The core attribute set is subject to the standard Linux evolution (i.e.
-if it will be found that some attribute is applicable to many power supply
-types or their drivers, it can be added to the core set).
-
-It also integrates with LED framework, for the purpose of providing
-typically expected feedback of battery charging/fully charged status and
-AC/USB power supply online status. (Note that specific details of the
-indication (including whether to use it at all) are fully controllable by
-user and/or specific machine defaults, per design principles of LED
-framework).
-
-
-Attributes/properties
-~~~~~~~~~~~~~~~~~~~~~
-Power supply class has predefined set of attributes, this eliminates code
-duplication across drivers. Power supply class insist on reusing its
-predefined attributes *and* their units.
-
-So, userspace gets predictable set of attributes and their units for any
-kind of power supply, and can process/present them to a user in consistent
-manner. Results for different power supplies and machines are also directly
-comparable.
-
-See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
-for the example how to declare and handle attributes.
-
-
-Units
-~~~~~
-Quoting include/linux/power_supply.h:
-
- All voltages, currents, charges, energies, time and temperatures in µV,
- µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
- stated. It's driver's job to convert its raw values to units in which
- this class operates.
-
-
-Attributes/properties detailed
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-~ ~ ~ ~ ~ ~ ~ Charge/Energy/Capacity - how to not confuse ~ ~ ~ ~ ~ ~ ~
-~ ~
-~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity" ~
-~ of battery, this class distinguish these terms. Don't mix them! ~
-~ ~
-~ CHARGE_* attributes represents capacity in µAh only. ~
-~ ENERGY_* attributes represents capacity in µWh only. ~
-~ CAPACITY attribute represents capacity in *percents*, from 0 to 100. ~
-~ ~
-~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
-
-Postfixes:
-_AVG - *hardware* averaged value, use it if your hardware is really able to
-report averaged values.
-_NOW - momentary/instantaneous values.
-
-STATUS - this attribute represents operating status (charging, full,
-discharging (i.e. powering a load), etc.). This corresponds to
-BATTERY_STATUS_* values, as defined in battery.h.
-
-CHARGE_TYPE - batteries can typically charge at different rates.
-This defines trickle and fast charges. For batteries that
-are already charged or discharging, 'n/a' can be displayed (or
-'unknown', if the status is not known).
-
-AUTHENTIC - indicates the power supply (battery or charger) connected
-to the platform is authentic(1) or non authentic(0).
-
-HEALTH - represents health of the battery, values corresponds to
-POWER_SUPPLY_HEALTH_*, defined in battery.h.
-
-VOLTAGE_OCV - open circuit voltage of the battery.
-
-VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and
-minimal power supply voltages. Maximal/minimal means values of voltages
-when battery considered "full"/"empty" at normal conditions. Yes, there is
-no direct relation between voltage and battery capacity, but some dumb
-batteries use voltage for very approximated calculation of capacity.
-Battery driver also can use this attribute just to inform userspace
-about maximal and minimal voltage thresholds of a given battery.
-
-VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that
-these ones should be used if hardware could only guess (measure and
-retain) the thresholds of a given power supply.
-
-VOLTAGE_BOOT - Reports the voltage measured during boot
-
-CURRENT_BOOT - Reports the current measured during boot
-
-CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when
-battery considered full/empty.
-
-ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy.
-
-CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value
-of charge when battery became full/empty". It also could mean "value of
-charge when battery considered full/empty at given conditions (temperature,
-age)". I.e. these attributes represents real thresholds, not design values.
-
-ENERGY_FULL, ENERGY_EMPTY - same as above but for energy.
-
-CHARGE_COUNTER - the current charge counter (in µAh). This could easily
-be negative; there is no empty or full value. It is only useful for
-relative, time-based measurements.
-
-PRECHARGE_CURRENT - the maximum charge current during precharge phase
-of charge cycle (typically 20% of battery capacity).
-CHARGE_TERM_CURRENT - Charge termination current. The charge cycle
-terminates when battery voltage is above recharge threshold, and charge
-current is below this setting (typically 10% of battery capacity).
-
-CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger.
-CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the
-power supply object.
-
-CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger.
-CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the
-power supply object.
-
-INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates
-the current drawn from a charging source.
-
-CHARGE_CONTROL_LIMIT - current charge control limit setting
-CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting
-
-CALIBRATE - battery or coulomb counter calibration status
-
-CAPACITY - capacity in percents.
-CAPACITY_ALERT_MIN - minimum capacity alert value in percents.
-CAPACITY_ALERT_MAX - maximum capacity alert value in percents.
-CAPACITY_LEVEL - capacity level. This corresponds to
-POWER_SUPPLY_CAPACITY_LEVEL_*.
-
-TEMP - temperature of the power supply.
-TEMP_ALERT_MIN - minimum battery temperature alert.
-TEMP_ALERT_MAX - maximum battery temperature alert.
-TEMP_AMBIENT - ambient temperature.
-TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert.
-TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert.
-TEMP_MIN - minimum operatable temperature
-TEMP_MAX - maximum operatable temperature
-
-TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e.
-while battery powers a load)
-TIME_TO_FULL - seconds left for battery to be considered full (i.e.
-while battery is charging)
-
-
-Battery <-> external power supply interaction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Often power supplies are acting as supplies and supplicants at the same
-time. Batteries are good example. So, batteries usually care if they're
-externally powered or not.
-
-For that case, power supply class implements notification mechanism for
-batteries.
-
-External power supply (AC) lists supplicants (batteries) names in
-"supplied_to" struct member, and each power_supply_changed() call
-issued by external power supply will notify supplicants via
-external_power_changed callback.
-
-
-Devicetree battery characteristics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Drivers should call power_supply_get_battery_info() to obtain battery
-characteristics from a devicetree battery node, defined in
-Documentation/devicetree/bindings/power/supply/battery.txt. This is
-implemented in drivers/power/supply/bq27xxx_battery.c.
-
-Properties in struct power_supply_battery_info and their counterparts in the
-battery node have names corresponding to elements in enum power_supply_property,
-for naming consistency between sysfs attributes and battery node properties.
-
-
-QA
-~~
-Q: Where is POWER_SUPPLY_PROP_XYZ attribute?
-A: If you cannot find attribute suitable for your driver needs, feel free
- to add it and send patch along with your driver.
-
- The attributes available currently are the ones currently provided by the
- drivers written.
-
- Good candidates to add in future: model/part#, cycle_time, manufacturer,
- etc.
-
-
-Q: I have some very specific attribute (e.g. battery color), should I add
- this attribute to standard ones?
-A: Most likely, no. Such attribute can be placed in the driver itself, if
- it is useful. Of course, if the attribute in question applicable to
- large set of batteries, provided by many drivers, and/or comes from
- some general battery specification/standard, it may be a candidate to
- be added to the core attribute set.
-
-
-Q: Suppose, my battery monitoring chip/firmware does not provides capacity
- in percents, but provides charge_{now,full,empty}. Should I calculate
- percentage capacity manually, inside the driver, and register CAPACITY
- attribute? The same question about time_to_empty/time_to_full.
-A: Most likely, no. This class is designed to export properties which are
- directly measurable by the specific hardware available.
-
- Inferring not available properties using some heuristics or mathematical
- model is not subject of work for a battery driver. Such functionality
- should be factored out, and in fact, apm_power, the driver to serve
- legacy APM API on top of power supply class, uses a simple heuristic of
- approximating remaining battery capacity based on its charge, current,
- voltage and so on. But full-fledged battery model is likely not subject
- for kernel at all, as it would require floating point calculation to deal
- with things like differential equations and Kalman filters. This is
- better be handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/powercap/powercap.rst b/Documentation/power/powercap/powercap.rst
new file mode 100644
index 000000000000..7ae3b44c7624
--- /dev/null
+++ b/Documentation/power/powercap/powercap.rst
@@ -0,0 +1,257 @@
+=======================
+Power Capping Framework
+=======================
+
+The power capping framework provides a consistent interface between the kernel
+and the user space that allows power capping drivers to expose the settings to
+user space in a uniform way.
+
+Terminology
+===========
+
+The framework exposes power capping devices to user space via sysfs in the
+form of a tree of objects. The objects at the root level of the tree represent
+'control types', which correspond to different methods of power capping. For
+example, the intel-rapl control type represents the Intel "Running Average
+Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
+corresponds to the use of idle injection for controlling power.
+
+Power zones represent different parts of the system, which can be controlled and
+monitored using the power capping method determined by the control type the
+given zone belongs to. They each contain attributes for monitoring power, as
+well as controls represented in the form of power constraints. If the parts of
+the system represented by different power zones are hierarchical (that is, one
+bigger part consists of multiple smaller parts that each have their own power
+controls), those power zones may also be organized in a hierarchy with one
+parent power zone containing multiple subzones and so on to reflect the power
+control topology of the system. In that case, it is possible to apply power
+capping to a set of devices together using the parent power zone and if more
+fine grained control is required, it can be applied through the subzones.
+
+
+Example sysfs interface tree::
+
+ /sys/devices/virtual/powercap
+ └──intel-rapl
+ ├──intel-rapl:0
+ │   ├──constraint_0_name
+ │   ├──constraint_0_power_limit_uw
+ │   ├──constraint_0_time_window_us
+ │   ├──constraint_1_name
+ │   ├──constraint_1_power_limit_uw
+ │   ├──constraint_1_time_window_us
+ │   ├──device -> ../../intel-rapl
+ │   ├──energy_uj
+ │   ├──intel-rapl:0:0
+ │   │   ├──constraint_0_name
+ │   │   ├──constraint_0_power_limit_uw
+ │   │   ├──constraint_0_time_window_us
+ │   │   ├──constraint_1_name
+ │   │   ├──constraint_1_power_limit_uw
+ │   │   ├──constraint_1_time_window_us
+ │   │   ├──device -> ../../intel-rapl:0
+ │   │   ├──energy_uj
+ │   │   ├──max_energy_range_uj
+ │   │   ├──name
+ │   │   ├──enabled
+ │   │   ├──power
+ │   │   │   ├──async
+ │   │   │   []
+ │   │   ├──subsystem -> ../../../../../../class/power_cap
+ │   │   └──uevent
+ │   ├──intel-rapl:0:1
+ │   │   ├──constraint_0_name
+ │   │   ├──constraint_0_power_limit_uw
+ │   │   ├──constraint_0_time_window_us
+ │   │   ├──constraint_1_name
+ │   │   ├──constraint_1_power_limit_uw
+ │   │   ├──constraint_1_time_window_us
+ │   │   ├──device -> ../../intel-rapl:0
+ │   │   ├──energy_uj
+ │   │   ├──max_energy_range_uj
+ │   │   ├──name
+ │   │   ├──enabled
+ │   │   ├──power
+ │   │   │   ├──async
+ │   │   │   []
+ │   │   ├──subsystem -> ../../../../../../class/power_cap
+ │   │   └──uevent
+ │   ├──max_energy_range_uj
+ │   ├──max_power_range_uw
+ │   ├──name
+ │   ├──enabled
+ │   ├──power
+ │   │   ├──async
+ │   │   []
+ │   ├──subsystem -> ../../../../../class/power_cap
+ │   ├──enabled
+ │   ├──uevent
+ ├──intel-rapl:1
+ │   ├──constraint_0_name
+ │   ├──constraint_0_power_limit_uw
+ │   ├──constraint_0_time_window_us
+ │   ├──constraint_1_name
+ │   ├──constraint_1_power_limit_uw
+ │   ├──constraint_1_time_window_us
+ │   ├──device -> ../../intel-rapl
+ │   ├──energy_uj
+ │   ├──intel-rapl:1:0
+ │   │   ├──constraint_0_name
+ │   │   ├──constraint_0_power_limit_uw
+ │   │   ├──constraint_0_time_window_us
+ │   │   ├──constraint_1_name
+ │   │   ├──constraint_1_power_limit_uw
+ │   │   ├──constraint_1_time_window_us
+ │   │   ├──device -> ../../intel-rapl:1
+ │   │   ├──energy_uj
+ │   │   ├──max_energy_range_uj
+ │   │   ├──name
+ │   │   ├──enabled
+ │   │   ├──power
+ │   │   │   ├──async
+ │   │   │   []
+ │   │   ├──subsystem -> ../../../../../../class/power_cap
+ │   │   └──uevent
+ │   ├──intel-rapl:1:1
+ │   │   ├──constraint_0_name
+ │   │   ├──constraint_0_power_limit_uw
+ │   │   ├──constraint_0_time_window_us
+ │   │   ├──constraint_1_name
+ │   │   ├──constraint_1_power_limit_uw
+ │   │   ├──constraint_1_time_window_us
+ │   │   ├──device -> ../../intel-rapl:1
+ │   │   ├──energy_uj
+ │   │   ├──max_energy_range_uj
+ │   │   ├──name
+ │   │   ├──enabled
+ │   │   ├──power
+ │   │   │   ├──async
+ │   │   │   []
+ │   │   ├──subsystem -> ../../../../../../class/power_cap
+ │   │   └──uevent
+ │   ├──max_energy_range_uj
+ │   ├──max_power_range_uw
+ │   ├──name
+ │   ├──enabled
+ │   ├──power
+ │   │   ├──async
+ │   │   []
+ │   ├──subsystem -> ../../../../../class/power_cap
+ │   ├──uevent
+ ├──power
+ │   ├──async
+ │   []
+ ├──subsystem -> ../../../../class/power_cap
+ ├──enabled
+ └──uevent
+
+The above example illustrates a case in which the Intel RAPL technology,
+available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
+control type called intel-rapl which contains two power zones, intel-rapl:0 and
+intel-rapl:1, representing CPU packages. Each of these power zones contains
+two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
+"core" and the "uncore" parts of the given CPU package, respectively. All of
+the zones and subzones contain energy monitoring attributes (energy_uj,
+max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
+to be applied (the constraints in the 'package' power zones apply to the whole
+CPU packages and the subzone constraints only apply to the respective parts of
+the given package individually). Since Intel RAPL doesn't provide instantaneous
+power value, there is no power_uw attribute.
+
+In addition to that, each power zone contains a name attribute, allowing the
+part of the system represented by that zone to be identified.
+For example::
+
+ cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
+
+package-0
+---------
+
+The Intel RAPL technology allows two constraints, short term and long term,
+with two different time windows to be applied to each power zone. Thus for
+each zone there are 2 attributes representing the constraint names, 2 power
+limits and 2 attributes representing the sizes of the time windows. Such that,
+constraint_j_* attributes correspond to the jth constraint (j = 0,1).
+
+For example::
+
+ constraint_0_name
+ constraint_0_power_limit_uw
+ constraint_0_time_window_us
+ constraint_1_name
+ constraint_1_power_limit_uw
+ constraint_1_time_window_us
+
+Power Zone Attributes
+=====================
+
+Monitoring attributes
+---------------------
+
+energy_uj (rw)
+ Current energy counter in micro joules. Write "0" to reset.
+ If the counter can not be reset, then this attribute is read only.
+
+max_energy_range_uj (ro)
+ Range of the above energy counter in micro-joules.
+
+power_uw (ro)
+ Current power in micro watts.
+
+max_power_range_uw (ro)
+ Range of the above power value in micro-watts.
+
+name (ro)
+ Name of this power zone.
+
+It is possible that some domains have both power ranges and energy counter ranges;
+however, only one is mandatory.
+
+Constraints
+-----------
+
+constraint_X_power_limit_uw (rw)
+ Power limit in micro watts, which should be applicable for the
+ time window specified by "constraint_X_time_window_us".
+
+constraint_X_time_window_us (rw)
+ Time window in micro seconds.
+
+constraint_X_name (ro)
+ An optional name of the constraint
+
+constraint_X_max_power_uw(ro)
+ Maximum allowed power in micro watts.
+
+constraint_X_min_power_uw(ro)
+ Minimum allowed power in micro watts.
+
+constraint_X_max_time_window_us(ro)
+ Maximum allowed time window in micro seconds.
+
+constraint_X_min_time_window_us(ro)
+ Minimum allowed time window in micro seconds.
+
+Except power_limit_uw and time_window_us other fields are optional.
+
+Common zone and control type attributes
+---------------------------------------
+
+enabled (rw): Enable/Disable controls at zone level or for all zones using
+a control type.
+
+Power Cap Client Driver Interface
+=================================
+
+The API summary:
+
+Call powercap_register_control_type() to register control type object.
+Call powercap_register_zone() to register a power zone (under a given
+control type), either as a top-level power zone or as a subzone of another
+power zone registered earlier.
+The number of constraints in a power zone and the corresponding callbacks have
+to be defined prior to calling powercap_register_zone() to register that zone.
+
+To Free a power zone call powercap_unregister_zone().
+To free a control type object call powercap_unregister_control_type().
+Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt
deleted file mode 100644
index 1e6ef164e07a..000000000000
--- a/Documentation/power/powercap/powercap.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-Power Capping Framework
-==================================
-
-The power capping framework provides a consistent interface between the kernel
-and the user space that allows power capping drivers to expose the settings to
-user space in a uniform way.
-
-Terminology
-=========================
-The framework exposes power capping devices to user space via sysfs in the
-form of a tree of objects. The objects at the root level of the tree represent
-'control types', which correspond to different methods of power capping. For
-example, the intel-rapl control type represents the Intel "Running Average
-Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
-corresponds to the use of idle injection for controlling power.
-
-Power zones represent different parts of the system, which can be controlled and
-monitored using the power capping method determined by the control type the
-given zone belongs to. They each contain attributes for monitoring power, as
-well as controls represented in the form of power constraints. If the parts of
-the system represented by different power zones are hierarchical (that is, one
-bigger part consists of multiple smaller parts that each have their own power
-controls), those power zones may also be organized in a hierarchy with one
-parent power zone containing multiple subzones and so on to reflect the power
-control topology of the system. In that case, it is possible to apply power
-capping to a set of devices together using the parent power zone and if more
-fine grained control is required, it can be applied through the subzones.
-
-
-Example sysfs interface tree:
-
-/sys/devices/virtual/powercap
-??? intel-rapl
- ??? intel-rapl:0
- ?   ??? constraint_0_name
- ?   ??? constraint_0_power_limit_uw
- ?   ??? constraint_0_time_window_us
- ?   ??? constraint_1_name
- ?   ??? constraint_1_power_limit_uw
- ?   ??? constraint_1_time_window_us
- ?   ??? device -> ../../intel-rapl
- ?   ??? energy_uj
- ?   ??? intel-rapl:0:0
- ?   ?   ??? constraint_0_name
- ?   ?   ??? constraint_0_power_limit_uw
- ?   ?   ??? constraint_0_time_window_us
- ?   ?   ??? constraint_1_name
- ?   ?   ??? constraint_1_power_limit_uw
- ?   ?   ??? constraint_1_time_window_us
- ?   ?   ??? device -> ../../intel-rapl:0
- ?   ?   ??? energy_uj
- ?   ?   ??? max_energy_range_uj
- ?   ?   ??? name
- ?   ?   ??? enabled
- ?   ?   ??? power
- ?   ?   ?   ??? async
- ?   ?   ?   []
- ?   ?   ??? subsystem -> ../../../../../../class/power_cap
- ?   ?   ??? uevent
- ?   ??? intel-rapl:0:1
- ?   ?   ??? constraint_0_name
- ?   ?   ??? constraint_0_power_limit_uw
- ?   ?   ??? constraint_0_time_window_us
- ?   ?   ??? constraint_1_name
- ?   ?   ??? constraint_1_power_limit_uw
- ?   ?   ??? constraint_1_time_window_us
- ?   ?   ??? device -> ../../intel-rapl:0
- ?   ?   ??? energy_uj
- ?   ?   ??? max_energy_range_uj
- ?   ?   ??? name
- ?   ?   ??? enabled
- ?   ?   ??? power
- ?   ?   ?   ??? async
- ?   ?   ?   []
- ?   ?   ??? subsystem -> ../../../../../../class/power_cap
- ?   ?   ??? uevent
- ?   ??? max_energy_range_uj
- ?   ??? max_power_range_uw
- ?   ??? name
- ?   ??? enabled
- ?   ??? power
- ?   ?   ??? async
- ?   ?   []
- ?   ??? subsystem -> ../../../../../class/power_cap
- ?   ??? enabled
- ?   ??? uevent
- ??? intel-rapl:1
- ?   ??? constraint_0_name
- ?   ??? constraint_0_power_limit_uw
- ?   ??? constraint_0_time_window_us
- ?   ??? constraint_1_name
- ?   ??? constraint_1_power_limit_uw
- ?   ??? constraint_1_time_window_us
- ?   ??? device -> ../../intel-rapl
- ?   ??? energy_uj
- ?   ??? intel-rapl:1:0
- ?   ?   ??? constraint_0_name
- ?   ?   ??? constraint_0_power_limit_uw
- ?   ?   ??? constraint_0_time_window_us
- ?   ?   ??? constraint_1_name
- ?   ?   ??? constraint_1_power_limit_uw
- ?   ?   ??? constraint_1_time_window_us
- ?   ?   ??? device -> ../../intel-rapl:1
- ?   ?   ??? energy_uj
- ?   ?   ??? max_energy_range_uj
- ?   ?   ??? name
- ?   ?   ??? enabled
- ?   ?   ??? power
- ?   ?   ?   ??? async
- ?   ?   ?   []
- ?   ?   ??? subsystem -> ../../../../../../class/power_cap
- ?   ?   ??? uevent
- ?   ??? intel-rapl:1:1
- ?   ?   ??? constraint_0_name
- ?   ?   ??? constraint_0_power_limit_uw
- ?   ?   ??? constraint_0_time_window_us
- ?   ?   ??? constraint_1_name
- ?   ?   ??? constraint_1_power_limit_uw
- ?   ?   ??? constraint_1_time_window_us
- ?   ?   ??? device -> ../../intel-rapl:1
- ?   ?   ??? energy_uj
- ?   ?   ??? max_energy_range_uj
- ?   ?   ??? name
- ?   ?   ??? enabled
- ?   ?   ??? power
- ?   ?   ?   ??? async
- ?   ?   ?   []
- ?   ?   ??? subsystem -> ../../../../../../class/power_cap
- ?   ?   ??? uevent
- ?   ??? max_energy_range_uj
- ?   ??? max_power_range_uw
- ?   ??? name
- ?   ??? enabled
- ?   ??? power
- ?   ?   ??? async
- ?   ?   []
- ?   ??? subsystem -> ../../../../../class/power_cap
- ?   ??? uevent
- ??? power
- ?   ??? async
- ?   []
- ??? subsystem -> ../../../../class/power_cap
- ??? enabled
- ??? uevent
-
-The above example illustrates a case in which the Intel RAPL technology,
-available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one
-control type called intel-rapl which contains two power zones, intel-rapl:0 and
-intel-rapl:1, representing CPU packages. Each of these power zones contains
-two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
-"core" and the "uncore" parts of the given CPU package, respectively. All of
-the zones and subzones contain energy monitoring attributes (energy_uj,
-max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
-to be applied (the constraints in the 'package' power zones apply to the whole
-CPU packages and the subzone constraints only apply to the respective parts of
-the given package individually). Since Intel RAPL doesn't provide instantaneous
-power value, there is no power_uw attribute.
-
-In addition to that, each power zone contains a name attribute, allowing the
-part of the system represented by that zone to be identified.
-For example:
-
-cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
-package-0
-
-The Intel RAPL technology allows two constraints, short term and long term,
-with two different time windows to be applied to each power zone. Thus for
-each zone there are 2 attributes representing the constraint names, 2 power
-limits and 2 attributes representing the sizes of the time windows. Such that,
-constraint_j_* attributes correspond to the jth constraint (j = 0,1).
-
-For example:
- constraint_0_name
- constraint_0_power_limit_uw
- constraint_0_time_window_us
- constraint_1_name
- constraint_1_power_limit_uw
- constraint_1_time_window_us
-
-Power Zone Attributes
-=================================
-Monitoring attributes
-----------------------
-
-energy_uj (rw): Current energy counter in micro joules. Write "0" to reset.
-If the counter can not be reset, then this attribute is read only.
-
-max_energy_range_uj (ro): Range of the above energy counter in micro-joules.
-
-power_uw (ro): Current power in micro watts.
-
-max_power_range_uw (ro): Range of the above power value in micro-watts.
-
-name (ro): Name of this power zone.
-
-It is possible that some domains have both power ranges and energy counter ranges;
-however, only one is mandatory.
-
-Constraints
-----------------
-constraint_X_power_limit_uw (rw): Power limit in micro watts, which should be
-applicable for the time window specified by "constraint_X_time_window_us".
-
-constraint_X_time_window_us (rw): Time window in micro seconds.
-
-constraint_X_name (ro): An optional name of the constraint
-
-constraint_X_max_power_uw(ro): Maximum allowed power in micro watts.
-
-constraint_X_min_power_uw(ro): Minimum allowed power in micro watts.
-
-constraint_X_max_time_window_us(ro): Maximum allowed time window in micro seconds.
-
-constraint_X_min_time_window_us(ro): Minimum allowed time window in micro seconds.
-
-Except power_limit_uw and time_window_us other fields are optional.
-
-Common zone and control type attributes
-----------------------------------------
-enabled (rw): Enable/Disable controls at zone level or for all zones using
-a control type.
-
-Power Cap Client Driver Interface
-==================================
-The API summary:
-
-Call powercap_register_control_type() to register control type object.
-Call powercap_register_zone() to register a power zone (under a given
-control type), either as a top-level power zone or as a subzone of another
-power zone registered earlier.
-The number of constraints in a power zone and the corresponding callbacks have
-to be defined prior to calling powercap_register_zone() to register that zone.
-
-To Free a power zone call powercap_unregister_zone().
-To free a control type object call powercap_unregister_control_type().
-Detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst
new file mode 100644
index 000000000000..0cd8cc1275a7
--- /dev/null
+++ b/Documentation/power/regulator/consumer.rst
@@ -0,0 +1,229 @@
+===================================
+Regulator Consumer Driver Interface
+===================================
+
+This text describes the regulator interface for consumer device drivers.
+Please see overview.txt for a description of the terms used in this text.
+
+
+1. Consumer Regulator Access (static & dynamic drivers)
+=======================================================
+
+A consumer driver can get access to its supply regulator by calling ::
+
+ regulator = regulator_get(dev, "Vcc");
+
+The consumer passes in its struct device pointer and power supply ID. The core
+then finds the correct regulator by consulting a machine specific lookup table.
+If the lookup is successful then this call will return a pointer to the struct
+regulator that supplies this consumer.
+
+To release the regulator the consumer driver should call ::
+
+ regulator_put(regulator);
+
+Consumers can be supplied by more than one regulator e.g. codec consumer with
+analog and digital supplies ::
+
+ digital = regulator_get(dev, "Vcc"); /* digital core */
+ analog = regulator_get(dev, "Avdd"); /* analog */
+
+The regulator access functions regulator_get() and regulator_put() will
+usually be called in your device drivers probe() and remove() respectively.
+
+
+2. Regulator Output Enable & Disable (static & dynamic drivers)
+===============================================================
+
+
+A consumer can enable its power supply by calling::
+
+ int regulator_enable(regulator);
+
+NOTE:
+ The supply may already be enabled before regulator_enabled() is called.
+ This may happen if the consumer shares the regulator or the regulator has been
+ previously enabled by bootloader or kernel board initialization code.
+
+A consumer can determine if a regulator is enabled by calling::
+
+ int regulator_is_enabled(regulator);
+
+This will return > zero when the regulator is enabled.
+
+
+A consumer can disable its supply when no longer needed by calling::
+
+ int regulator_disable(regulator);
+
+NOTE:
+ This may not disable the supply if it's shared with other consumers. The
+ regulator will only be disabled when the enabled reference count is zero.
+
+Finally, a regulator can be forcefully disabled in the case of an emergency::
+
+ int regulator_force_disable(regulator);
+
+NOTE:
+ this will immediately and forcefully shutdown the regulator output. All
+ consumers will be powered off.
+
+
+3. Regulator Voltage Control & Status (dynamic drivers)
+=======================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+voltage to match system operating points. e.g. CPUfreq drivers can scale
+voltage along with frequency to save power, SD drivers may need to select the
+correct card voltage, etc.
+
+Consumers can control their supply voltage by calling::
+
+ int regulator_set_voltage(regulator, min_uV, max_uV);
+
+Where min_uV and max_uV are the minimum and maximum acceptable voltages in
+microvolts.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the voltage changes instantly, otherwise the voltage
+configuration changes and the voltage is physically set when the regulator is
+next enabled.
+
+The regulators configured voltage output can be found by calling::
+
+ int regulator_get_voltage(regulator);
+
+NOTE:
+ get_voltage() will return the configured output voltage whether the
+ regulator is enabled or disabled and should NOT be used to determine regulator
+ output state. However this can be used in conjunction with is_enabled() to
+ determine the regulator physical output voltage.
+
+
+4. Regulator Current Limit Control & Status (dynamic drivers)
+=============================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+current limit to match system operating points. e.g. LCD backlight driver can
+change the current limit to vary the backlight brightness, USB drivers may want
+to set the limit to 500mA when supplying power.
+
+Consumers can control their supply current limit by calling::
+
+ int regulator_set_current_limit(regulator, min_uA, max_uA);
+
+Where min_uA and max_uA are the minimum and maximum acceptable current limit in
+microamps.
+
+NOTE:
+ this can be called when the regulator is enabled or disabled. If called
+ when enabled, then the current limit changes instantly, otherwise the current
+ limit configuration changes and the current limit is physically set when the
+ regulator is next enabled.
+
+A regulators current limit can be found by calling::
+
+ int regulator_get_current_limit(regulator);
+
+NOTE:
+ get_current_limit() will return the current limit whether the regulator
+ is enabled or disabled and should not be used to determine regulator current
+ load.
+
+
+5. Regulator Operating Mode Control & Status (dynamic drivers)
+==============================================================
+
+Some consumers can further save system power by changing the operating mode of
+their supply regulator to be more efficient when the consumers operating state
+changes. e.g. consumer driver is idle and subsequently draws less current
+
+Regulator operating mode can be changed indirectly or directly.
+
+Indirect operating mode control.
+--------------------------------
+Consumer drivers can request a change in their supply regulator operating mode
+by calling::
+
+ int regulator_set_load(struct regulator *regulator, int load_uA);
+
+This will cause the core to recalculate the total load on the regulator (based
+on all its consumers) and change operating mode (if necessary and permitted)
+to best match the current operating load.
+
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
+
+Most consumers will use indirect operating mode control since they have no
+knowledge of the regulator or whether the regulator is shared with other
+consumers.
+
+Direct operating mode control.
+------------------------------
+
+Bespoke or tightly coupled drivers may want to directly control regulator
+operating mode depending on their operating point. This can be achieved by
+calling::
+
+ int regulator_set_mode(struct regulator *regulator, unsigned int mode);
+ unsigned int regulator_get_mode(struct regulator *regulator);
+
+Direct mode will only be used by consumers that *know* about the regulator and
+are not sharing the regulator with other consumers.
+
+
+6. Regulator Events
+===================
+
+Regulators can notify consumers of external events. Events could be received by
+consumers under regulator stress or failure conditions.
+
+Consumers can register interest in regulator events by calling::
+
+ int regulator_register_notifier(struct regulator *regulator,
+ struct notifier_block *nb);
+
+Consumers can unregister interest by calling::
+
+ int regulator_unregister_notifier(struct regulator *regulator,
+ struct notifier_block *nb);
+
+Regulators use the kernel notifier framework to send event to their interested
+consumers.
+
+7. Regulator Direct Register Access
+===================================
+
+Some kinds of power management hardware or firmware are designed such that
+they need to do low-level hardware access to regulators, with no involvement
+from the kernel. Examples of such devices are:
+
+- clocksource with a voltage-controlled oscillator and control logic to change
+ the supply voltage over I2C to achieve a desired output clock rate
+- thermal management firmware that can issue an arbitrary I2C transaction to
+ perform system poweroff during overtemperature conditions
+
+To set up such a device/firmware, various parameters like I2C address of the
+regulator, addresses of various regulator registers etc. need to be configured
+to it. The regulator framework provides the following helpers for querying
+these details.
+
+Bus-specific details, like I2C addresses or transfer rates are handled by the
+regmap framework. To get the regulator's regmap (if supported), use::
+
+ struct regmap *regulator_get_regmap(struct regulator *regulator);
+
+To obtain the hardware register offset and bitmask for the regulator's voltage
+selector register, use::
+
+ int regulator_get_hardware_vsel_register(struct regulator *regulator,
+ unsigned *vsel_reg,
+ unsigned *vsel_mask);
+
+To convert a regulator framework voltage selector code (used by
+regulator_list_voltage) to a hardware-specific voltage selector that can be
+directly written to the voltage selector register, use::
+
+ int regulator_list_hardware_vsel(struct regulator *regulator,
+ unsigned selector);
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
deleted file mode 100644
index e51564c1a140..000000000000
--- a/Documentation/power/regulator/consumer.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-Regulator Consumer Driver Interface
-===================================
-
-This text describes the regulator interface for consumer device drivers.
-Please see overview.txt for a description of the terms used in this text.
-
-
-1. Consumer Regulator Access (static & dynamic drivers)
-=======================================================
-
-A consumer driver can get access to its supply regulator by calling :-
-
-regulator = regulator_get(dev, "Vcc");
-
-The consumer passes in its struct device pointer and power supply ID. The core
-then finds the correct regulator by consulting a machine specific lookup table.
-If the lookup is successful then this call will return a pointer to the struct
-regulator that supplies this consumer.
-
-To release the regulator the consumer driver should call :-
-
-regulator_put(regulator);
-
-Consumers can be supplied by more than one regulator e.g. codec consumer with
-analog and digital supplies :-
-
-digital = regulator_get(dev, "Vcc"); /* digital core */
-analog = regulator_get(dev, "Avdd"); /* analog */
-
-The regulator access functions regulator_get() and regulator_put() will
-usually be called in your device drivers probe() and remove() respectively.
-
-
-2. Regulator Output Enable & Disable (static & dynamic drivers)
-====================================================================
-
-A consumer can enable its power supply by calling:-
-
-int regulator_enable(regulator);
-
-NOTE: The supply may already be enabled before regulator_enabled() is called.
-This may happen if the consumer shares the regulator or the regulator has been
-previously enabled by bootloader or kernel board initialization code.
-
-A consumer can determine if a regulator is enabled by calling :-
-
-int regulator_is_enabled(regulator);
-
-This will return > zero when the regulator is enabled.
-
-
-A consumer can disable its supply when no longer needed by calling :-
-
-int regulator_disable(regulator);
-
-NOTE: This may not disable the supply if it's shared with other consumers. The
-regulator will only be disabled when the enabled reference count is zero.
-
-Finally, a regulator can be forcefully disabled in the case of an emergency :-
-
-int regulator_force_disable(regulator);
-
-NOTE: this will immediately and forcefully shutdown the regulator output. All
-consumers will be powered off.
-
-
-3. Regulator Voltage Control & Status (dynamic drivers)
-======================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-voltage to match system operating points. e.g. CPUfreq drivers can scale
-voltage along with frequency to save power, SD drivers may need to select the
-correct card voltage, etc.
-
-Consumers can control their supply voltage by calling :-
-
-int regulator_set_voltage(regulator, min_uV, max_uV);
-
-Where min_uV and max_uV are the minimum and maximum acceptable voltages in
-microvolts.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the voltage changes instantly, otherwise the voltage
-configuration changes and the voltage is physically set when the regulator is
-next enabled.
-
-The regulators configured voltage output can be found by calling :-
-
-int regulator_get_voltage(regulator);
-
-NOTE: get_voltage() will return the configured output voltage whether the
-regulator is enabled or disabled and should NOT be used to determine regulator
-output state. However this can be used in conjunction with is_enabled() to
-determine the regulator physical output voltage.
-
-
-4. Regulator Current Limit Control & Status (dynamic drivers)
-===========================================================
-
-Some consumer drivers need to be able to dynamically change their supply
-current limit to match system operating points. e.g. LCD backlight driver can
-change the current limit to vary the backlight brightness, USB drivers may want
-to set the limit to 500mA when supplying power.
-
-Consumers can control their supply current limit by calling :-
-
-int regulator_set_current_limit(regulator, min_uA, max_uA);
-
-Where min_uA and max_uA are the minimum and maximum acceptable current limit in
-microamps.
-
-NOTE: this can be called when the regulator is enabled or disabled. If called
-when enabled, then the current limit changes instantly, otherwise the current
-limit configuration changes and the current limit is physically set when the
-regulator is next enabled.
-
-A regulators current limit can be found by calling :-
-
-int regulator_get_current_limit(regulator);
-
-NOTE: get_current_limit() will return the current limit whether the regulator
-is enabled or disabled and should not be used to determine regulator current
-load.
-
-
-5. Regulator Operating Mode Control & Status (dynamic drivers)
-=============================================================
-
-Some consumers can further save system power by changing the operating mode of
-their supply regulator to be more efficient when the consumers operating state
-changes. e.g. consumer driver is idle and subsequently draws less current
-
-Regulator operating mode can be changed indirectly or directly.
-
-Indirect operating mode control.
---------------------------------
-Consumer drivers can request a change in their supply regulator operating mode
-by calling :-
-
-int regulator_set_load(struct regulator *regulator, int load_uA);
-
-This will cause the core to recalculate the total load on the regulator (based
-on all its consumers) and change operating mode (if necessary and permitted)
-to best match the current operating load.
-
-The load_uA value can be determined from the consumer's datasheet. e.g. most
-datasheets have tables showing the maximum current consumed in certain
-situations.
-
-Most consumers will use indirect operating mode control since they have no
-knowledge of the regulator or whether the regulator is shared with other
-consumers.
-
-Direct operating mode control.
-------------------------------
-Bespoke or tightly coupled drivers may want to directly control regulator
-operating mode depending on their operating point. This can be achieved by
-calling :-
-
-int regulator_set_mode(struct regulator *regulator, unsigned int mode);
-unsigned int regulator_get_mode(struct regulator *regulator);
-
-Direct mode will only be used by consumers that *know* about the regulator and
-are not sharing the regulator with other consumers.
-
-
-6. Regulator Events
-===================
-Regulators can notify consumers of external events. Events could be received by
-consumers under regulator stress or failure conditions.
-
-Consumers can register interest in regulator events by calling :-
-
-int regulator_register_notifier(struct regulator *regulator,
- struct notifier_block *nb);
-
-Consumers can unregister interest by calling :-
-
-int regulator_unregister_notifier(struct regulator *regulator,
- struct notifier_block *nb);
-
-Regulators use the kernel notifier framework to send event to their interested
-consumers.
-
-7. Regulator Direct Register Access
-===================================
-Some kinds of power management hardware or firmware are designed such that
-they need to do low-level hardware access to regulators, with no involvement
-from the kernel. Examples of such devices are:
-
-- clocksource with a voltage-controlled oscillator and control logic to change
- the supply voltage over I2C to achieve a desired output clock rate
-- thermal management firmware that can issue an arbitrary I2C transaction to
- perform system poweroff during overtemperature conditions
-
-To set up such a device/firmware, various parameters like I2C address of the
-regulator, addresses of various regulator registers etc. need to be configured
-to it. The regulator framework provides the following helpers for querying
-these details.
-
-Bus-specific details, like I2C addresses or transfer rates are handled by the
-regmap framework. To get the regulator's regmap (if supported), use :-
-
-struct regmap *regulator_get_regmap(struct regulator *regulator);
-
-To obtain the hardware register offset and bitmask for the regulator's voltage
-selector register, use :-
-
-int regulator_get_hardware_vsel_register(struct regulator *regulator,
- unsigned *vsel_reg,
- unsigned *vsel_mask);
-
-To convert a regulator framework voltage selector code (used by
-regulator_list_voltage) to a hardware-specific voltage selector that can be
-directly written to the voltage selector register, use :-
-
-int regulator_list_hardware_vsel(struct regulator *regulator,
- unsigned selector);
diff --git a/Documentation/power/regulator/design.rst b/Documentation/power/regulator/design.rst
new file mode 100644
index 000000000000..3b09c6841dc4
--- /dev/null
+++ b/Documentation/power/regulator/design.rst
@@ -0,0 +1,38 @@
+==========================
+Regulator API design notes
+==========================
+
+This document provides a brief, partially structured, overview of some
+of the design considerations which impact the regulator API design.
+
+Safety
+------
+
+ - Errors in regulator configuration can have very serious consequences
+ for the system, potentially including lasting hardware damage.
+ - It is not possible to automatically determine the power configuration
+ of the system - software-equivalent variants of the same chip may
+ have different power requirements, and not all components with power
+ requirements are visible to software.
+
+.. note::
+
+ The API should make no changes to the hardware state unless it has
+ specific knowledge that these changes are safe to perform on this
+ particular system.
+
+Consumer use cases
+------------------
+
+ - The overwhelming majority of devices in a system will have no
+ requirement to do any runtime configuration of their power beyond
+ being able to turn it on or off.
+
+ - Many of the power supplies in the system will be shared between many
+ different consumers.
+
+.. note::
+
+ The consumer API should be structured so that these use cases are
+ very easy to handle and so that consumers will work with shared
+ supplies without any additional effort.
diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt
deleted file mode 100644
index fdd919b96830..000000000000
--- a/Documentation/power/regulator/design.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-Regulator API design notes
-==========================
-
-This document provides a brief, partially structured, overview of some
-of the design considerations which impact the regulator API design.
-
-Safety
-------
-
- - Errors in regulator configuration can have very serious consequences
- for the system, potentially including lasting hardware damage.
- - It is not possible to automatically determine the power configuration
- of the system - software-equivalent variants of the same chip may
- have different power requirements, and not all components with power
- requirements are visible to software.
-
- => The API should make no changes to the hardware state unless it has
- specific knowledge that these changes are safe to perform on this
- particular system.
-
-Consumer use cases
-------------------
-
- - The overwhelming majority of devices in a system will have no
- requirement to do any runtime configuration of their power beyond
- being able to turn it on or off.
-
- - Many of the power supplies in the system will be shared between many
- different consumers.
-
- => The consumer API should be structured so that these use cases are
- very easy to handle and so that consumers will work with shared
- supplies without any additional effort.
diff --git a/Documentation/power/regulator/machine.rst b/Documentation/power/regulator/machine.rst
new file mode 100644
index 000000000000..22fffefaa3ad
--- /dev/null
+++ b/Documentation/power/regulator/machine.rst
@@ -0,0 +1,97 @@
+==================================
+Regulator Machine Driver Interface
+==================================
+
+The regulator machine driver interface is intended for board/machine specific
+initialisation code to configure the regulator subsystem.
+
+Consider the following machine::
+
+ Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
+ |
+ +-> [Consumer B @ 3.3V]
+
+The drivers for consumers A & B must be mapped to the correct regulator in
+order to control their power supplies. This mapping can be achieved in machine
+initialisation code by creating a struct regulator_consumer_supply for
+each regulator::
+
+ struct regulator_consumer_supply {
+ const char *dev_name; /* consumer dev_name() */
+ const char *supply; /* consumer supply - e.g. "vcc" */
+ };
+
+e.g. for the machine above::
+
+ static struct regulator_consumer_supply regulator1_consumers[] = {
+ REGULATOR_SUPPLY("Vcc", "consumer B"),
+ };
+
+ static struct regulator_consumer_supply regulator2_consumers[] = {
+ REGULATOR_SUPPLY("Vcc", "consumer A"),
+ };
+
+This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
+to the 'Vcc' supply for Consumer A.
+
+Constraints can now be registered by defining a struct regulator_init_data
+for each regulator power domain. This structure also maps the consumers
+to their supply regulators::
+
+ static struct regulator_init_data regulator1_data = {
+ .constraints = {
+ .name = "Regulator-1",
+ .min_uV = 3300000,
+ .max_uV = 3300000,
+ .valid_modes_mask = REGULATOR_MODE_NORMAL,
+ },
+ .num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
+ .consumer_supplies = regulator1_consumers,
+ };
+
+The name field should be set to something that is usefully descriptive
+for the board for configuration of supplies for other regulators and
+for use in logging and other diagnostic output. Normally the name
+used for the supply rail in the schematic is a good choice. If no
+name is provided then the subsystem will choose one.
+
+Regulator-1 supplies power to Regulator-2. This relationship must be registered
+with the core so that Regulator-1 is also enabled when Consumer A enables its
+supply (Regulator-2). The supply regulator is set by the supply_regulator
+field below and co::
+
+ static struct regulator_init_data regulator2_data = {
+ .supply_regulator = "Regulator-1",
+ .constraints = {
+ .min_uV = 1800000,
+ .max_uV = 2000000,
+ .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
+ .valid_modes_mask = REGULATOR_MODE_NORMAL,
+ },
+ .num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
+ .consumer_supplies = regulator2_consumers,
+ };
+
+Finally the regulator devices must be registered in the usual manner::
+
+ static struct platform_device regulator_devices[] = {
+ {
+ .name = "regulator",
+ .id = DCDC_1,
+ .dev = {
+ .platform_data = &regulator1_data,
+ },
+ },
+ {
+ .name = "regulator",
+ .id = DCDC_2,
+ .dev = {
+ .platform_data = &regulator2_data,
+ },
+ },
+ };
+ /* register regulator 1 device */
+ platform_device_register(&regulator_devices[0]);
+
+ /* register regulator 2 device */
+ platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
deleted file mode 100644
index eff4dcaaa252..000000000000
--- a/Documentation/power/regulator/machine.txt
+++ /dev/null
@@ -1,96 +0,0 @@
-Regulator Machine Driver Interface
-===================================
-
-The regulator machine driver interface is intended for board/machine specific
-initialisation code to configure the regulator subsystem.
-
-Consider the following machine :-
-
- Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
- |
- +-> [Consumer B @ 3.3V]
-
-The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supplies. This mapping can be achieved in machine
-initialisation code by creating a struct regulator_consumer_supply for
-each regulator.
-
-struct regulator_consumer_supply {
- const char *dev_name; /* consumer dev_name() */
- const char *supply; /* consumer supply - e.g. "vcc" */
-};
-
-e.g. for the machine above
-
-static struct regulator_consumer_supply regulator1_consumers[] = {
- REGULATOR_SUPPLY("Vcc", "consumer B"),
-};
-
-static struct regulator_consumer_supply regulator2_consumers[] = {
- REGULATOR_SUPPLY("Vcc", "consumer A"),
-};
-
-This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
-to the 'Vcc' supply for Consumer A.
-
-Constraints can now be registered by defining a struct regulator_init_data
-for each regulator power domain. This structure also maps the consumers
-to their supply regulators :-
-
-static struct regulator_init_data regulator1_data = {
- .constraints = {
- .name = "Regulator-1",
- .min_uV = 3300000,
- .max_uV = 3300000,
- .valid_modes_mask = REGULATOR_MODE_NORMAL,
- },
- .num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
- .consumer_supplies = regulator1_consumers,
-};
-
-The name field should be set to something that is usefully descriptive
-for the board for configuration of supplies for other regulators and
-for use in logging and other diagnostic output. Normally the name
-used for the supply rail in the schematic is a good choice. If no
-name is provided then the subsystem will choose one.
-
-Regulator-1 supplies power to Regulator-2. This relationship must be registered
-with the core so that Regulator-1 is also enabled when Consumer A enables its
-supply (Regulator-2). The supply regulator is set by the supply_regulator
-field below and co:-
-
-static struct regulator_init_data regulator2_data = {
- .supply_regulator = "Regulator-1",
- .constraints = {
- .min_uV = 1800000,
- .max_uV = 2000000,
- .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
- .valid_modes_mask = REGULATOR_MODE_NORMAL,
- },
- .num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
- .consumer_supplies = regulator2_consumers,
-};
-
-Finally the regulator devices must be registered in the usual manner.
-
-static struct platform_device regulator_devices[] = {
- {
- .name = "regulator",
- .id = DCDC_1,
- .dev = {
- .platform_data = &regulator1_data,
- },
- },
- {
- .name = "regulator",
- .id = DCDC_2,
- .dev = {
- .platform_data = &regulator2_data,
- },
- },
-};
-/* register regulator 1 device */
-platform_device_register(&regulator_devices[0]);
-
-/* register regulator 2 device */
-platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/overview.rst b/Documentation/power/regulator/overview.rst
new file mode 100644
index 000000000000..ee494c70a7c4
--- /dev/null
+++ b/Documentation/power/regulator/overview.rst
@@ -0,0 +1,178 @@
+=============================================
+Linux voltage and current regulator framework
+=============================================
+
+About
+=====
+
+This framework is designed to provide a standard kernel interface to control
+voltage and current regulators.
+
+The intention is to allow systems to dynamically control regulator power output
+in order to save power and prolong battery life. This applies to both voltage
+regulators (where voltage output is controllable) and current sinks (where
+current limit is controllable).
+
+(C) 2008 Wolfson Microelectronics PLC.
+
+Author: Liam Girdwood <lrg@slimlogic.co.uk>
+
+
+Nomenclature
+============
+
+Some terms used in this document:
+
+ - Regulator
+ - Electronic device that supplies power to other devices.
+ Most regulators can enable and disable their output while
+ some can control their output voltage and or current.
+
+ Input Voltage -> Regulator -> Output Voltage
+
+
+ - PMIC
+ - Power Management IC. An IC that contains numerous
+ regulators and often contains other subsystems.
+
+
+ - Consumer
+ - Electronic device that is supplied power by a regulator.
+ Consumers can be classified into two types:-
+
+ Static: consumer does not change its supply voltage or
+ current limit. It only needs to enable or disable its
+ power supply. Its supply voltage is set by the hardware,
+ bootloader, firmware or kernel board initialisation code.
+
+ Dynamic: consumer needs to change its supply voltage or
+ current limit to meet operation demands.
+
+
+ - Power Domain
+ - Electronic circuit that is supplied its input power by the
+ output power of a regulator, switch or by another power
+ domain.
+
+ The supply regulator may be behind a switch(s). i.e.::
+
+ Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
+ | |
+ | +-> [Consumer B], [Consumer C]
+ |
+ +-> [Consumer D], [Consumer E]
+
+ That is one regulator and three power domains:
+
+ - Domain 1: Switch-1, Consumers D & E.
+ - Domain 2: Switch-2, Consumers B & C.
+ - Domain 3: Consumer A.
+
+ and this represents a "supplies" relationship:
+
+ Domain-1 --> Domain-2 --> Domain-3.
+
+ A power domain may have regulators that are supplied power
+ by other regulators. i.e.::
+
+ Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
+ |
+ +-> [Consumer B]
+
+ This gives us two regulators and two power domains:
+
+ - Domain 1: Regulator-2, Consumer B.
+ - Domain 2: Consumer A.
+
+ and a "supplies" relationship:
+
+ Domain-1 --> Domain-2
+
+
+ - Constraints
+ - Constraints are used to define power levels for performance
+ and hardware protection. Constraints exist at three levels:
+
+ Regulator Level: This is defined by the regulator hardware
+ operating parameters and is specified in the regulator
+ datasheet. i.e.
+
+ - voltage output is in the range 800mV -> 3500mV.
+ - regulator current output limit is 20mA @ 5V but is
+ 10mA @ 10V.
+
+ Power Domain Level: This is defined in software by kernel
+ level board initialisation code. It is used to constrain a
+ power domain to a particular power range. i.e.
+
+ - Domain-1 voltage is 3300mV
+ - Domain-2 voltage is 1400mV -> 1600mV
+ - Domain-3 current limit is 0mA -> 20mA.
+
+ Consumer Level: This is defined by consumer drivers
+ dynamically setting voltage or current limit levels.
+
+ e.g. a consumer backlight driver asks for a current increase
+ from 5mA to 10mA to increase LCD illumination. This passes
+ to through the levels as follows :-
+
+ Consumer: need to increase LCD brightness. Lookup and
+ request next current mA value in brightness table (the
+ consumer driver could be used on several different
+ personalities based upon the same reference device).
+
+ Power Domain: is the new current limit within the domain
+ operating limits for this domain and system state (e.g.
+ battery power, USB power)
+
+ Regulator Domains: is the new current limit within the
+ regulator operating parameters for input/output voltage.
+
+ If the regulator request passes all the constraint tests
+ then the new regulator value is applied.
+
+
+Design
+======
+
+The framework is designed and targeted at SoC based devices but may also be
+relevant to non SoC devices and is split into the following four interfaces:-
+
+
+ 1. Consumer driver interface.
+
+ This uses a similar API to the kernel clock interface in that consumer
+ drivers can get and put a regulator (like they can with clocks atm) and
+ get/set voltage, current limit, mode, enable and disable. This should
+ allow consumers complete control over their supply voltage and current
+ limit. This also compiles out if not in use so drivers can be reused in
+ systems with no regulator based power control.
+
+ See Documentation/power/regulator/consumer.rst
+
+ 2. Regulator driver interface.
+
+ This allows regulator drivers to register their regulators and provide
+ operations to the core. It also has a notifier call chain for propagating
+ regulator events to clients.
+
+ See Documentation/power/regulator/regulator.rst
+
+ 3. Machine interface.
+
+ This interface is for machine specific code and allows the creation of
+ voltage/current domains (with constraints) for each regulator. It can
+ provide regulator constraints that will prevent device damage through
+ overvoltage or overcurrent caused by buggy client drivers. It also
+ allows the creation of a regulator tree whereby some regulators are
+ supplied by others (similar to a clock tree).
+
+ See Documentation/power/regulator/machine.rst
+
+ 4. Userspace ABI.
+
+ The framework also exports a lot of useful voltage/current/opmode data to
+ userspace via sysfs. This could be used to help monitor device power
+ consumption and status.
+
+ See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
deleted file mode 100644
index 721b4739ec32..000000000000
--- a/Documentation/power/regulator/overview.txt
+++ /dev/null
@@ -1,171 +0,0 @@
-Linux voltage and current regulator framework
-=============================================
-
-About
-=====
-
-This framework is designed to provide a standard kernel interface to control
-voltage and current regulators.
-
-The intention is to allow systems to dynamically control regulator power output
-in order to save power and prolong battery life. This applies to both voltage
-regulators (where voltage output is controllable) and current sinks (where
-current limit is controllable).
-
-(C) 2008 Wolfson Microelectronics PLC.
-Author: Liam Girdwood <lrg@slimlogic.co.uk>
-
-
-Nomenclature
-============
-
-Some terms used in this document:-
-
- o Regulator - Electronic device that supplies power to other devices.
- Most regulators can enable and disable their output while
- some can control their output voltage and or current.
-
- Input Voltage -> Regulator -> Output Voltage
-
-
- o PMIC - Power Management IC. An IC that contains numerous regulators
- and often contains other subsystems.
-
-
- o Consumer - Electronic device that is supplied power by a regulator.
- Consumers can be classified into two types:-
-
- Static: consumer does not change its supply voltage or
- current limit. It only needs to enable or disable its
- power supply. Its supply voltage is set by the hardware,
- bootloader, firmware or kernel board initialisation code.
-
- Dynamic: consumer needs to change its supply voltage or
- current limit to meet operation demands.
-
-
- o Power Domain - Electronic circuit that is supplied its input power by the
- output power of a regulator, switch or by another power
- domain.
-
- The supply regulator may be behind a switch(s). i.e.
-
- Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
- | |
- | +-> [Consumer B], [Consumer C]
- |
- +-> [Consumer D], [Consumer E]
-
- That is one regulator and three power domains:
-
- Domain 1: Switch-1, Consumers D & E.
- Domain 2: Switch-2, Consumers B & C.
- Domain 3: Consumer A.
-
- and this represents a "supplies" relationship:
-
- Domain-1 --> Domain-2 --> Domain-3.
-
- A power domain may have regulators that are supplied power
- by other regulators. i.e.
-
- Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
- |
- +-> [Consumer B]
-
- This gives us two regulators and two power domains:
-
- Domain 1: Regulator-2, Consumer B.
- Domain 2: Consumer A.
-
- and a "supplies" relationship:
-
- Domain-1 --> Domain-2
-
-
- o Constraints - Constraints are used to define power levels for performance
- and hardware protection. Constraints exist at three levels:
-
- Regulator Level: This is defined by the regulator hardware
- operating parameters and is specified in the regulator
- datasheet. i.e.
-
- - voltage output is in the range 800mV -> 3500mV.
- - regulator current output limit is 20mA @ 5V but is
- 10mA @ 10V.
-
- Power Domain Level: This is defined in software by kernel
- level board initialisation code. It is used to constrain a
- power domain to a particular power range. i.e.
-
- - Domain-1 voltage is 3300mV
- - Domain-2 voltage is 1400mV -> 1600mV
- - Domain-3 current limit is 0mA -> 20mA.
-
- Consumer Level: This is defined by consumer drivers
- dynamically setting voltage or current limit levels.
-
- e.g. a consumer backlight driver asks for a current increase
- from 5mA to 10mA to increase LCD illumination. This passes
- to through the levels as follows :-
-
- Consumer: need to increase LCD brightness. Lookup and
- request next current mA value in brightness table (the
- consumer driver could be used on several different
- personalities based upon the same reference device).
-
- Power Domain: is the new current limit within the domain
- operating limits for this domain and system state (e.g.
- battery power, USB power)
-
- Regulator Domains: is the new current limit within the
- regulator operating parameters for input/output voltage.
-
- If the regulator request passes all the constraint tests
- then the new regulator value is applied.
-
-
-Design
-======
-
-The framework is designed and targeted at SoC based devices but may also be
-relevant to non SoC devices and is split into the following four interfaces:-
-
-
- 1. Consumer driver interface.
-
- This uses a similar API to the kernel clock interface in that consumer
- drivers can get and put a regulator (like they can with clocks atm) and
- get/set voltage, current limit, mode, enable and disable. This should
- allow consumers complete control over their supply voltage and current
- limit. This also compiles out if not in use so drivers can be reused in
- systems with no regulator based power control.
-
- See Documentation/power/regulator/consumer.txt
-
- 2. Regulator driver interface.
-
- This allows regulator drivers to register their regulators and provide
- operations to the core. It also has a notifier call chain for propagating
- regulator events to clients.
-
- See Documentation/power/regulator/regulator.txt
-
- 3. Machine interface.
-
- This interface is for machine specific code and allows the creation of
- voltage/current domains (with constraints) for each regulator. It can
- provide regulator constraints that will prevent device damage through
- overvoltage or overcurrent caused by buggy client drivers. It also
- allows the creation of a regulator tree whereby some regulators are
- supplied by others (similar to a clock tree).
-
- See Documentation/power/regulator/machine.txt
-
- 4. Userspace ABI.
-
- The framework also exports a lot of useful voltage/current/opmode data to
- userspace via sysfs. This could be used to help monitor device power
- consumption and status.
-
- See Documentation/ABI/testing/sysfs-class-regulator
diff --git a/Documentation/power/regulator/regulator.rst b/Documentation/power/regulator/regulator.rst
new file mode 100644
index 000000000000..794b3256fbb9
--- /dev/null
+++ b/Documentation/power/regulator/regulator.rst
@@ -0,0 +1,32 @@
+==========================
+Regulator Driver Interface
+==========================
+
+The regulator driver interface is relatively simple and designed to allow
+regulator drivers to register their services with the core framework.
+
+
+Registration
+============
+
+Drivers can register a regulator by calling::
+
+ struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
+ const struct regulator_config *config);
+
+This will register the regulator's capabilities and operations to the regulator
+core.
+
+Regulators can be unregistered by calling::
+
+ void regulator_unregister(struct regulator_dev *rdev);
+
+
+Regulator Events
+================
+
+Regulators can send events (e.g. overtemperature, undervoltage, etc) to
+consumer drivers by calling::
+
+ int regulator_notifier_call_chain(struct regulator_dev *rdev,
+ unsigned long event, void *data);
diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt
deleted file mode 100644
index b17e5833ce21..000000000000
--- a/Documentation/power/regulator/regulator.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-Regulator Driver Interface
-==========================
-
-The regulator driver interface is relatively simple and designed to allow
-regulator drivers to register their services with the core framework.
-
-
-Registration
-============
-
-Drivers can register a regulator by calling :-
-
-struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc,
- const struct regulator_config *config);
-
-This will register the regulator's capabilities and operations to the regulator
-core.
-
-Regulators can be unregistered by calling :-
-
-void regulator_unregister(struct regulator_dev *rdev);
-
-
-Regulator Events
-================
-Regulators can send events (e.g. overtemperature, undervoltage, etc) to
-consumer drivers by calling :-
-
-int regulator_notifier_call_chain(struct regulator_dev *rdev,
- unsigned long event, void *data);
diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst
new file mode 100644
index 000000000000..2c2ec99b5088
--- /dev/null
+++ b/Documentation/power/runtime_pm.rst
@@ -0,0 +1,940 @@
+==================================================
+Runtime Power Management Framework for I/O Devices
+==================================================
+
+(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+
+(C) 2010 Alan Stern <stern@rowland.harvard.edu>
+
+(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+1. Introduction
+===============
+
+Support for runtime power management (runtime PM) of I/O devices is provided
+at the power management core (PM core) level by means of:
+
+* The power management workqueue pm_wq in which bus types and device drivers can
+ put their PM-related work items. It is strongly recommended that pm_wq be
+ used for queuing all work items related to runtime PM, because this allows
+ them to be synchronized with system-wide power transitions (suspend to RAM,
+ hibernation and resume from system sleep states). pm_wq is declared in
+ include/linux/pm_runtime.h and defined in kernel/power/main.c.
+
+* A number of runtime PM fields in the 'power' member of 'struct device' (which
+ is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
+ be used for synchronizing runtime PM operations with one another.
+
+* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
+ include/linux/pm.h).
+
+* A set of helper functions defined in drivers/base/power/runtime.c that can be
+ used for carrying out runtime PM operations in such a way that the
+ synchronization between them is taken care of by the PM core. Bus types and
+ device drivers are encouraged to use these functions.
+
+The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
+fields of 'struct dev_pm_info' and the core helper functions provided for
+runtime PM are described below.
+
+2. Device Runtime PM Callbacks
+==============================
+
+There are three device runtime PM callbacks defined in 'struct dev_pm_ops'::
+
+ struct dev_pm_ops {
+ ...
+ int (*runtime_suspend)(struct device *dev);
+ int (*runtime_resume)(struct device *dev);
+ int (*runtime_idle)(struct device *dev);
+ ...
+ };
+
+The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
+are executed by the PM core for the device's subsystem that may be either of
+the following:
+
+ 1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
+ is present.
+
+ 2. Device type of the device, if both dev->type and dev->type->pm are present.
+
+ 3. Device class of the device, if both dev->class and dev->class->pm are
+ present.
+
+ 4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
+
+If the subsystem chosen by applying the above rules doesn't provide the relevant
+callback, the PM core will invoke the corresponding driver callback stored in
+dev->driver->pm directly (if present).
+
+The PM core always checks which callback to use in the order given above, so the
+priority order of callbacks from high to low is: PM domain, device type, class
+and bus type. Moreover, the high-priority one will always take precedence over
+a low-priority one. The PM domain, bus type, device type and class callbacks
+are referred to as subsystem-level callbacks in what follows.
+
+By default, the callbacks are always invoked in process context with interrupts
+enabled. However, the pm_runtime_irq_safe() helper function can be used to tell
+the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
+and ->runtime_idle() callbacks for the given device in atomic context with
+interrupts disabled. This implies that the callback routines in question must
+not block or sleep, but it also means that the synchronous helper functions
+listed at the end of Section 4 may be used for that device within an interrupt
+handler or generally in an atomic context.
+
+The subsystem-level suspend callback, if present, is _entirely_ _responsible_
+for handling the suspend of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_suspend() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_suspend()
+callback in a device driver as long as the subsystem-level suspend callback
+knows what to do to handle the device).
+
+ * Once the subsystem-level suspend callback (or the driver suspend callback,
+ if invoked directly) has completed successfully for the given device, the PM
+ core regards the device as suspended, which need not mean that it has been
+ put into a low power state. It is supposed to mean, however, that the
+ device will not process data and will not communicate with the CPU(s) and
+ RAM until the appropriate resume callback is executed for it. The runtime
+ PM status of a device after successful execution of the suspend callback is
+ 'suspended'.
+
+ * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
+ status remains 'active', which means that the device _must_ be fully
+ operational afterwards.
+
+ * If the suspend callback returns an error code different from -EBUSY and
+ -EAGAIN, the PM core regards this as a fatal error and will refuse to run
+ the helper functions described in Section 4 for the device until its status
+ is directly set to either 'active', or 'suspended' (the PM core provides
+ special helper functions for this purpose).
+
+In particular, if the driver requires remote wakeup capability (i.e. hardware
+mechanism allowing the device to request a change of its power state, such as
+PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
+device, then ->runtime_suspend() should return -EBUSY. On the other hand, if
+device_can_wakeup() returns 'true' for the device and the device is put into a
+low-power state during the execution of the suspend callback, it is expected
+that remote wakeup will be enabled for the device. Generally, remote wakeup
+should be enabled for all input devices put into low-power states at run time.
+
+The subsystem-level resume callback, if present, is **entirely responsible** for
+handling the resume of the device as appropriate, which may, but need not
+include executing the device driver's own ->runtime_resume() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_resume()
+callback in a device driver as long as the subsystem-level resume callback knows
+what to do to handle the device).
+
+ * Once the subsystem-level resume callback (or the driver resume callback, if
+ invoked directly) has completed successfully, the PM core regards the device
+ as fully operational, which means that the device _must_ be able to complete
+ I/O operations as needed. The runtime PM status of the device is then
+ 'active'.
+
+ * If the resume callback returns an error code, the PM core regards this as a
+ fatal error and will refuse to run the helper functions described in Section
+ 4 for the device, until its status is directly set to either 'active', or
+ 'suspended' (by means of special helper functions provided by the PM core
+ for this purpose).
+
+The idle callback (a subsystem-level one, if present, or the driver one) is
+executed by the PM core whenever the device appears to be idle, which is
+indicated to the PM core by two counters, the device's usage counter and the
+counter of 'active' children of the device.
+
+ * If any of these counters is decreased using a helper function provided by
+ the PM core and it turns out to be equal to zero, the other counter is
+ checked. If that counter also is equal to zero, the PM core executes the
+ idle callback with the device as its argument.
+
+The action performed by the idle callback is totally dependent on the subsystem
+(or driver) in question, but the expected and recommended action is to check
+if the device can be suspended (i.e. if all of the conditions necessary for
+suspending the device are satisfied) and to queue up a suspend request for the
+device in that case. If there is no idle callback, or if the callback returns
+0, then the PM core will attempt to carry out a runtime suspend of the device,
+also respecting devices configured for autosuspend. In essence this means a
+call to pm_runtime_autosuspend() (do note that drivers needs to update the
+device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
+this circumstance). To prevent this (for example, if the callback routine has
+started a delayed suspend), the routine must return a non-zero value. Negative
+error return codes are ignored by the PM core.
+
+The helper functions provided by the PM core, described in Section 4, guarantee
+that the following constraints are met with respect to runtime PM callbacks for
+one device:
+
+(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
+ ->runtime_suspend() in parallel with ->runtime_resume() or with another
+ instance of ->runtime_suspend() for the same device) with the exception that
+ ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
+ ->runtime_idle() (although ->runtime_idle() will not be started while any
+ of the other callbacks is being executed for the same device).
+
+(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
+ devices (i.e. the PM core will only execute ->runtime_idle() or
+ ->runtime_suspend() for the devices the runtime PM status of which is
+ 'active').
+
+(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
+ the usage counter of which is equal to zero _and_ either the counter of
+ 'active' children of which is equal to zero, or the 'power.ignore_children'
+ flag of which is set.
+
+(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the
+ PM core will only execute ->runtime_resume() for the devices the runtime
+ PM status of which is 'suspended').
+
+Additionally, the helper functions provided by the PM core obey the following
+rules:
+
+ * If ->runtime_suspend() is about to be executed or there's a pending request
+ to execute it, ->runtime_idle() will not be executed for the same device.
+
+ * A request to execute or to schedule the execution of ->runtime_suspend()
+ will cancel any pending requests to execute ->runtime_idle() for the same
+ device.
+
+ * If ->runtime_resume() is about to be executed or there's a pending request
+ to execute it, the other callbacks will not be executed for the same device.
+
+ * A request to execute ->runtime_resume() will cancel any pending or
+ scheduled requests to execute the other callbacks for the same device,
+ except for scheduled autosuspends.
+
+3. Runtime PM Device Fields
+===========================
+
+The following device runtime PM fields are present in 'struct dev_pm_info', as
+defined in include/linux/pm.h:
+
+ `struct timer_list suspend_timer;`
+ - timer used for scheduling (delayed) suspend and autosuspend requests
+
+ `unsigned long timer_expires;`
+ - timer expiration time, in jiffies (if this is different from zero, the
+ timer is running and will expire at that time, otherwise the timer is not
+ running)
+
+ `struct work_struct work;`
+ - work structure used for queuing up requests (i.e. work items in pm_wq)
+
+ `wait_queue_head_t wait_queue;`
+ - wait queue used if any of the helper functions needs to wait for another
+ one to complete
+
+ `spinlock_t lock;`
+ - lock used for synchronization
+
+ `atomic_t usage_count;`
+ - the usage counter of the device
+
+ `atomic_t child_count;`
+ - the count of 'active' children of the device
+
+ `unsigned int ignore_children;`
+ - if set, the value of child_count is ignored (but still updated)
+
+ `unsigned int disable_depth;`
+ - used for disabling the helper functions (they work normally if this is
+ equal to zero); the initial value of it is 1 (i.e. runtime PM is
+ initially disabled for all devices)
+
+ `int runtime_error;`
+ - if set, there was a fatal error (one of the callbacks returned error code
+ as described in Section 2), so the helper functions will not work until
+ this flag is cleared; this is the error code returned by the failing
+ callback
+
+ `unsigned int idle_notification;`
+ - if set, ->runtime_idle() is being executed
+
+ `unsigned int request_pending;`
+ - if set, there's a pending request (i.e. a work item queued up into pm_wq)
+
+ `enum rpm_request request;`
+ - type of request that's pending (valid if request_pending is set)
+
+ `unsigned int deferred_resume;`
+ - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
+ being executed for that device and it is not practical to wait for the
+ suspend to complete; means "start a resume as soon as you've suspended"
+
+ `enum rpm_status runtime_status;`
+ - the runtime PM status of the device; this field's initial value is
+ RPM_SUSPENDED, which means that each device is initially regarded by the
+ PM core as 'suspended', regardless of its real hardware status
+
+ `unsigned int runtime_auto;`
+ - if set, indicates that the user space has allowed the device driver to
+ power manage the device at run time via the /sys/devices/.../power/control
+ `interface;` it may only be modified with the help of the pm_runtime_allow()
+ and pm_runtime_forbid() helper functions
+
+ `unsigned int no_callbacks;`
+ - indicates that the device does not use the runtime PM callbacks (see
+ Section 8); it may be modified only by the pm_runtime_no_callbacks()
+ helper function
+
+ `unsigned int irq_safe;`
+ - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
+ will be invoked with the spinlock held and interrupts disabled
+
+ `unsigned int use_autosuspend;`
+ - indicates that the device's driver supports delayed autosuspend (see
+ Section 9); it may be modified only by the
+ pm_runtime{_dont}_use_autosuspend() helper functions
+
+ `unsigned int timer_autosuspends;`
+ - indicates that the PM core should attempt to carry out an autosuspend
+ when the timer expires rather than a normal suspend
+
+ `int autosuspend_delay;`
+ - the delay time (in milliseconds) to be used for autosuspend
+
+ `unsigned long last_busy;`
+ - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
+ function was last called for this device; used in calculating inactivity
+ periods for autosuspend
+
+All of the above fields are members of the 'power' member of 'struct device'.
+
+4. Runtime PM Device Helper Functions
+=====================================
+
+The following runtime PM helper functions are defined in
+drivers/base/power/runtime.c and include/linux/pm_runtime.h:
+
+ `void pm_runtime_init(struct device *dev);`
+ - initialize the device runtime PM fields in 'struct dev_pm_info'
+
+ `void pm_runtime_remove(struct device *dev);`
+ - make sure that the runtime PM of the device will be disabled after
+ removing the device from device hierarchy
+
+ `int pm_runtime_idle(struct device *dev);`
+ - execute the subsystem-level idle callback for the device; returns an
+ error code on failure, where -EINPROGRESS means that ->runtime_idle() is
+ already being executed; if there is no callback or the callback returns 0
+ then run pm_runtime_autosuspend(dev) and return its result
+
+ `int pm_runtime_suspend(struct device *dev);`
+ - execute the subsystem-level suspend callback for the device; returns 0 on
+ success, 1 if the device's runtime PM status was already 'suspended', or
+ error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
+ to suspend the device again in future and -EACCES means that
+ 'power.disable_depth' is different from 0
+
+ `int pm_runtime_autosuspend(struct device *dev);`
+ - same as pm_runtime_suspend() except that the autosuspend delay is taken
+ `into account;` if pm_runtime_autosuspend_expiration() says the delay has
+ not yet expired then an autosuspend is scheduled for the appropriate time
+ and 0 is returned
+
+ `int pm_runtime_resume(struct device *dev);`
+ - execute the subsystem-level resume callback for the device; returns 0 on
+ success, 1 if the device's runtime PM status was already 'active' or
+ error code on failure, where -EAGAIN means it may be safe to attempt to
+ resume the device again in future, but 'power.runtime_error' should be
+ checked additionally, and -EACCES means that 'power.disable_depth' is
+ different from 0
+
+ `int pm_request_idle(struct device *dev);`
+ - submit a request to execute the subsystem-level idle callback for the
+ device (the request is represented by a work item in pm_wq); returns 0 on
+ success or error code if the request has not been queued up
+
+ `int pm_request_autosuspend(struct device *dev);`
+ - schedule the execution of the subsystem-level suspend callback for the
+ device when the autosuspend delay has expired; if the delay has already
+ expired then the work item is queued up immediately
+
+ `int pm_schedule_suspend(struct device *dev, unsigned int delay);`
+ - schedule the execution of the subsystem-level suspend callback for the
+ device in future, where 'delay' is the time to wait before queuing up a
+ suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
+ item is queued up immediately); returns 0 on success, 1 if the device's PM
+ runtime status was already 'suspended', or error code if the request
+ hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
+ ->runtime_suspend() is already scheduled and not yet expired, the new
+ value of 'delay' will be used as the time to wait
+
+ `int pm_request_resume(struct device *dev);`
+ - submit a request to execute the subsystem-level resume callback for the
+ device (the request is represented by a work item in pm_wq); returns 0 on
+ success, 1 if the device's runtime PM status was already 'active', or
+ error code if the request hasn't been queued up
+
+ `void pm_runtime_get_noresume(struct device *dev);`
+ - increment the device's usage counter
+
+ `int pm_runtime_get(struct device *dev);`
+ - increment the device's usage counter, run pm_request_resume(dev) and
+ return its result
+
+ `int pm_runtime_get_sync(struct device *dev);`
+ - increment the device's usage counter, run pm_runtime_resume(dev) and
+ return its result
+
+ `int pm_runtime_get_if_in_use(struct device *dev);`
+ - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
+ runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
+ nonzero, increment the counter and return 1; otherwise return 0 without
+ changing the counter
+
+ `void pm_runtime_put_noidle(struct device *dev);`
+ - decrement the device's usage counter
+
+ `int pm_runtime_put(struct device *dev);`
+ - decrement the device's usage counter; if the result is 0 then run
+ pm_request_idle(dev) and return its result
+
+ `int pm_runtime_put_autosuspend(struct device *dev);`
+ - decrement the device's usage counter; if the result is 0 then run
+ pm_request_autosuspend(dev) and return its result
+
+ `int pm_runtime_put_sync(struct device *dev);`
+ - decrement the device's usage counter; if the result is 0 then run
+ pm_runtime_idle(dev) and return its result
+
+ `int pm_runtime_put_sync_suspend(struct device *dev);`
+ - decrement the device's usage counter; if the result is 0 then run
+ pm_runtime_suspend(dev) and return its result
+
+ `int pm_runtime_put_sync_autosuspend(struct device *dev);`
+ - decrement the device's usage counter; if the result is 0 then run
+ pm_runtime_autosuspend(dev) and return its result
+
+ `void pm_runtime_enable(struct device *dev);`
+ - decrement the device's 'power.disable_depth' field; if that field is equal
+ to zero, the runtime PM helper functions can execute subsystem-level
+ callbacks described in Section 2 for the device
+
+ `int pm_runtime_disable(struct device *dev);`
+ - increment the device's 'power.disable_depth' field (if the value of that
+ field was previously zero, this prevents subsystem-level runtime PM
+ callbacks from being run for the device), make sure that all of the
+ pending runtime PM operations on the device are either completed or
+ canceled; returns 1 if there was a resume request pending and it was
+ necessary to execute the subsystem-level resume callback for the device
+ to satisfy that request, otherwise 0 is returned
+
+ `int pm_runtime_barrier(struct device *dev);`
+ - check if there's a resume request pending for the device and resume it
+ (synchronously) in that case, cancel any other pending runtime PM requests
+ regarding it and wait for all runtime PM operations on it in progress to
+ complete; returns 1 if there was a resume request pending and it was
+ necessary to execute the subsystem-level resume callback for the device to
+ satisfy that request, otherwise 0 is returned
+
+ `void pm_suspend_ignore_children(struct device *dev, bool enable);`
+ - set/unset the power.ignore_children flag of the device
+
+ `int pm_runtime_set_active(struct device *dev);`
+ - clear the device's 'power.runtime_error' flag, set the device's runtime
+ PM status to 'active' and update its parent's counter of 'active'
+ children as appropriate (it is only valid to use this function if
+ 'power.runtime_error' is set or 'power.disable_depth' is greater than
+ zero); it will fail and return error code if the device has a parent
+ which is not active and the 'power.ignore_children' flag of which is unset
+
+ `void pm_runtime_set_suspended(struct device *dev);`
+ - clear the device's 'power.runtime_error' flag, set the device's runtime
+ PM status to 'suspended' and update its parent's counter of 'active'
+ children as appropriate (it is only valid to use this function if
+ 'power.runtime_error' is set or 'power.disable_depth' is greater than
+ zero)
+
+ `bool pm_runtime_active(struct device *dev);`
+ - return true if the device's runtime PM status is 'active' or its
+ 'power.disable_depth' field is not equal to zero, or false otherwise
+
+ `bool pm_runtime_suspended(struct device *dev);`
+ - return true if the device's runtime PM status is 'suspended' and its
+ 'power.disable_depth' field is equal to zero, or false otherwise
+
+ `bool pm_runtime_status_suspended(struct device *dev);`
+ - return true if the device's runtime PM status is 'suspended'
+
+ `void pm_runtime_allow(struct device *dev);`
+ - set the power.runtime_auto flag for the device and decrease its usage
+ counter (used by the /sys/devices/.../power/control interface to
+ effectively allow the device to be power managed at run time)
+
+ `void pm_runtime_forbid(struct device *dev);`
+ - unset the power.runtime_auto flag for the device and increase its usage
+ counter (used by the /sys/devices/.../power/control interface to
+ effectively prevent the device from being power managed at run time)
+
+ `void pm_runtime_no_callbacks(struct device *dev);`
+ - set the power.no_callbacks flag for the device and remove the runtime
+ PM attributes from /sys/devices/.../power (or prevent them from being
+ added when the device is registered)
+
+ `void pm_runtime_irq_safe(struct device *dev);`
+ - set the power.irq_safe flag for the device, causing the runtime-PM
+ callbacks to be invoked with interrupts off
+
+ `bool pm_runtime_is_irq_safe(struct device *dev);`
+ - return true if power.irq_safe flag was set for the device, causing
+ the runtime-PM callbacks to be invoked with interrupts off
+
+ `void pm_runtime_mark_last_busy(struct device *dev);`
+ - set the power.last_busy field to the current time
+
+ `void pm_runtime_use_autosuspend(struct device *dev);`
+ - set the power.use_autosuspend flag, enabling autosuspend delays; call
+ pm_runtime_get_sync if the flag was previously cleared and
+ power.autosuspend_delay is negative
+
+ `void pm_runtime_dont_use_autosuspend(struct device *dev);`
+ - clear the power.use_autosuspend flag, disabling autosuspend delays;
+ decrement the device's usage counter if the flag was previously set and
+ power.autosuspend_delay is negative; call pm_runtime_idle
+
+ `void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);`
+ - set the power.autosuspend_delay value to 'delay' (expressed in
+ milliseconds); if 'delay' is negative then runtime suspends are
+ prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
+ called or the device's usage counter may be decremented and
+ pm_runtime_idle called depending on if power.autosuspend_delay is
+ changed to or from a negative value; if power.use_autosuspend is clear,
+ pm_runtime_idle is called
+
+ `unsigned long pm_runtime_autosuspend_expiration(struct device *dev);`
+ - calculate the time when the current autosuspend delay period will expire,
+ based on power.last_busy and power.autosuspend_delay; if the delay time
+ is 1000 ms or larger then the expiration time is rounded up to the
+ nearest second; returns 0 if the delay period has already expired or
+ power.use_autosuspend isn't set, otherwise returns the expiration time
+ in jiffies
+
+It is safe to execute the following helper functions from interrupt context:
+
+- pm_request_idle()
+- pm_request_autosuspend()
+- pm_schedule_suspend()
+- pm_request_resume()
+- pm_runtime_get_noresume()
+- pm_runtime_get()
+- pm_runtime_put_noidle()
+- pm_runtime_put()
+- pm_runtime_put_autosuspend()
+- pm_runtime_enable()
+- pm_suspend_ignore_children()
+- pm_runtime_set_active()
+- pm_runtime_set_suspended()
+- pm_runtime_suspended()
+- pm_runtime_mark_last_busy()
+- pm_runtime_autosuspend_expiration()
+
+If pm_runtime_irq_safe() has been called for a device then the following helper
+functions may also be used in interrupt context:
+
+- pm_runtime_idle()
+- pm_runtime_suspend()
+- pm_runtime_autosuspend()
+- pm_runtime_resume()
+- pm_runtime_get_sync()
+- pm_runtime_put_sync()
+- pm_runtime_put_sync_suspend()
+- pm_runtime_put_sync_autosuspend()
+
+5. Runtime PM Initialization, Device Probing and Removal
+========================================================
+
+Initially, the runtime PM is disabled for all devices, which means that the
+majority of the runtime PM helper functions described in Section 4 will return
+-EAGAIN until pm_runtime_enable() is called for the device.
+
+In addition to that, the initial runtime PM status of all devices is
+'suspended', but it need not reflect the actual physical state of the device.
+Thus, if the device is initially active (i.e. it is able to process I/O), its
+runtime PM status must be changed to 'active', with the help of
+pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
+
+However, if the device has a parent and the parent's runtime PM is enabled,
+calling pm_runtime_set_active() for the device will affect the parent, unless
+the parent's 'power.ignore_children' flag is set. Namely, in that case the
+parent won't be able to suspend at run time, using the PM core's helper
+functions, as long as the child's status is 'active', even if the child's
+runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
+the child yet or pm_runtime_disable() has been called for it). For this reason,
+once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
+should be called for it too as soon as reasonably possible or its runtime PM
+status should be changed back to 'suspended' with the help of
+pm_runtime_set_suspended().
+
+If the default initial runtime PM status of the device (i.e. 'suspended')
+reflects the actual state of the device, its bus type's or its driver's
+->probe() callback will likely need to wake it up using one of the PM core's
+helper functions described in Section 4. In that case, pm_runtime_resume()
+should be used. Of course, for this purpose the device's runtime PM has to be
+enabled earlier by calling pm_runtime_enable().
+
+Note, if the device may execute pm_runtime calls during the probe (such as
+if it is registers with a subsystem that may call back in) then the
+pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
+appropriate to ensure that the device is not put back to sleep during the
+probe. This can happen with systems such as the network device layer.
+
+It may be desirable to suspend the device once ->probe() has finished.
+Therefore the driver core uses the asynchronous pm_request_idle() to submit a
+request to execute the subsystem-level idle callback for the device at that
+time. A driver that makes use of the runtime autosuspend feature, may want to
+update the last busy mark before returning from ->probe().
+
+Moreover, the driver core prevents runtime PM callbacks from racing with the bus
+notifier callback in __device_release_driver(), which is necessary, because the
+notifier is used by some subsystems to carry out operations affecting the
+runtime PM functionality. It does so by calling pm_runtime_get_sync() before
+driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This
+resumes the device if it's in the suspended state and prevents it from
+being suspended again while those routines are being executed.
+
+To allow bus types and drivers to put devices into the suspended state by
+calling pm_runtime_suspend() from their ->remove() routines, the driver core
+executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
+notifications in __device_release_driver(). This requires bus types and
+drivers to make their ->remove() callbacks avoid races with runtime PM directly,
+but also it allows of more flexibility in the handling of devices during the
+removal of their drivers.
+
+Drivers in ->remove() callback should undo the runtime PM changes done
+in ->probe(). Usually this means calling pm_runtime_disable(),
+pm_runtime_dont_use_autosuspend() etc.
+
+The user space can effectively disallow the driver of the device to power manage
+it at run time by changing the value of its /sys/devices/.../power/control
+attribute to "on", which causes pm_runtime_forbid() to be called. In principle,
+this mechanism may also be used by the driver to effectively turn off the
+runtime power management of the device until the user space turns it on.
+Namely, during the initialization the driver can make sure that the runtime PM
+status of the device is 'active' and call pm_runtime_forbid(). It should be
+noted, however, that if the user space has already intentionally changed the
+value of /sys/devices/.../power/control to "auto" to allow the driver to power
+manage the device at run time, the driver may confuse it by using
+pm_runtime_forbid() this way.
+
+6. Runtime PM and System Sleep
+==============================
+
+Runtime PM and system sleep (i.e., system suspend and hibernation, also known
+as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
+ways. If a device is active when a system sleep starts, everything is
+straightforward. But what should happen if the device is already suspended?
+
+The device may have different wake-up settings for runtime PM and system sleep.
+For example, remote wake-up may be enabled for runtime suspend but disallowed
+for system sleep (device_may_wakeup(dev) returns 'false'). When this happens,
+the subsystem-level system suspend callback is responsible for changing the
+device's wake-up setting (it may leave that to the device driver's system
+suspend routine). It may be necessary to resume the device and suspend it again
+in order to do so. The same is true if the driver uses different power levels
+or other settings for runtime suspend and system sleep.
+
+During system resume, the simplest approach is to bring all devices back to full
+power, even if they had been suspended before the system suspend began. There
+are several reasons for this, including:
+
+ * The device might need to switch power levels, wake-up settings, etc.
+
+ * Remote wake-up events might have been lost by the firmware.
+
+ * The device's children may need the device to be at full power in order
+ to resume themselves.
+
+ * The driver's idea of the device state may not agree with the device's
+ physical state. This can happen during resume from hibernation.
+
+ * The device might need to be reset.
+
+ * Even though the device was suspended, if its usage counter was > 0 then most
+ likely it would need a runtime resume in the near future anyway.
+
+If the device had been suspended before the system suspend began and it's
+brought back to full power during resume, then its runtime PM status will have
+to be updated to reflect the actual post-system sleep status. The way to do
+this is:
+
+ - pm_runtime_disable(dev);
+ - pm_runtime_set_active(dev);
+ - pm_runtime_enable(dev);
+
+The PM core always increments the runtime usage counter before calling the
+->suspend() callback and decrements it after calling the ->resume() callback.
+Hence disabling runtime PM temporarily like this will not cause any runtime
+suspend attempts to be permanently lost. If the usage count goes to zero
+following the return of the ->resume() callback, the ->runtime_idle() callback
+will be invoked as usual.
+
+On some systems, however, system sleep is not entered through a global firmware
+or hardware operation. Instead, all hardware components are put into low-power
+states directly by the kernel in a coordinated way. Then, the system sleep
+state effectively follows from the states the hardware components end up in
+and the system is woken up from that state by a hardware interrupt or a similar
+mechanism entirely under the kernel's control. As a result, the kernel never
+gives control away and the states of all devices during resume are precisely
+known to it. If that is the case and none of the situations listed above takes
+place (in particular, if the system is not waking up from hibernation), it may
+be more efficient to leave the devices that had been suspended before the system
+suspend began in the suspended state.
+
+To this end, the PM core provides a mechanism allowing some coordination between
+different levels of device hierarchy. Namely, if a system suspend .prepare()
+callback returns a positive number for a device, that indicates to the PM core
+that the device appears to be runtime-suspended and its state is fine, so it
+may be left in runtime suspend provided that all of its descendants are also
+left in runtime suspend. If that happens, the PM core will not execute any
+system suspend and resume callbacks for all of those devices, except for the
+complete callback, which is then entirely responsible for handling the device
+as appropriate. This only applies to system suspend transitions that are not
+related to hibernation (see Documentation/driver-api/pm/devices.rst for more
+information).
+
+The PM core does its best to reduce the probability of race conditions between
+the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
+out the following operations:
+
+ * During system suspend pm_runtime_get_noresume() is called for every device
+ right before executing the subsystem-level .prepare() callback for it and
+ pm_runtime_barrier() is called for every device right before executing the
+ subsystem-level .suspend() callback for it. In addition to that the PM core
+ calls __pm_runtime_disable() with 'false' as the second argument for every
+ device right before executing the subsystem-level .suspend_late() callback
+ for it.
+
+ * During system resume pm_runtime_enable() and pm_runtime_put() are called for
+ every device right after executing the subsystem-level .resume_early()
+ callback and right after executing the subsystem-level .complete() callback
+ for it, respectively.
+
+7. Generic subsystem callbacks
+
+Subsystems may wish to conserve code space by using the set of generic power
+management callbacks provided by the PM core, defined in
+driver/base/power/generic_ops.c:
+
+ `int pm_generic_runtime_suspend(struct device *dev);`
+ - invoke the ->runtime_suspend() callback provided by the driver of this
+ device and return its result, or return 0 if not defined
+
+ `int pm_generic_runtime_resume(struct device *dev);`
+ - invoke the ->runtime_resume() callback provided by the driver of this
+ device and return its result, or return 0 if not defined
+
+ `int pm_generic_suspend(struct device *dev);`
+ - if the device has not been suspended at run time, invoke the ->suspend()
+ callback provided by its driver and return its result, or return 0 if not
+ defined
+
+ `int pm_generic_suspend_noirq(struct device *dev);`
+ - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
+ callback provided by the device's driver and return its result, or return
+ 0 if not defined
+
+ `int pm_generic_resume(struct device *dev);`
+ - invoke the ->resume() callback provided by the driver of this device and,
+ if successful, change the device's runtime PM status to 'active'
+
+ `int pm_generic_resume_noirq(struct device *dev);`
+ - invoke the ->resume_noirq() callback provided by the driver of this device
+
+ `int pm_generic_freeze(struct device *dev);`
+ - if the device has not been suspended at run time, invoke the ->freeze()
+ callback provided by its driver and return its result, or return 0 if not
+ defined
+
+ `int pm_generic_freeze_noirq(struct device *dev);`
+ - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
+ callback provided by the device's driver and return its result, or return
+ 0 if not defined
+
+ `int pm_generic_thaw(struct device *dev);`
+ - if the device has not been suspended at run time, invoke the ->thaw()
+ callback provided by its driver and return its result, or return 0 if not
+ defined
+
+ `int pm_generic_thaw_noirq(struct device *dev);`
+ - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
+ callback provided by the device's driver and return its result, or return
+ 0 if not defined
+
+ `int pm_generic_poweroff(struct device *dev);`
+ - if the device has not been suspended at run time, invoke the ->poweroff()
+ callback provided by its driver and return its result, or return 0 if not
+ defined
+
+ `int pm_generic_poweroff_noirq(struct device *dev);`
+ - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
+ callback provided by the device's driver and return its result, or return
+ 0 if not defined
+
+ `int pm_generic_restore(struct device *dev);`
+ - invoke the ->restore() callback provided by the driver of this device and,
+ if successful, change the device's runtime PM status to 'active'
+
+ `int pm_generic_restore_noirq(struct device *dev);`
+ - invoke the ->restore_noirq() callback provided by the device's driver
+
+These functions are the defaults used by the PM core, if a subsystem doesn't
+provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
+->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
+->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
+->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
+subsystem-level dev_pm_ops structure.
+
+Device drivers that wish to use the same function as a system suspend, freeze,
+poweroff and runtime suspend callback, and similarly for system resume, thaw,
+restore, and runtime resume, can achieve this with the help of the
+UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
+last argument to NULL).
+
+8. "No-Callback" Devices
+========================
+
+Some "devices" are only logical sub-devices of their parent and cannot be
+power-managed on their own. (The prototype example is a USB interface. Entire
+USB devices can go into low-power mode or send wake-up requests, but neither is
+possible for individual interfaces.) The drivers for these devices have no
+need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
+and ->runtime_resume() would always return 0 without doing anything else and
+->runtime_idle() would always call pm_runtime_suspend().
+
+Subsystems can tell the PM core about these devices by calling
+pm_runtime_no_callbacks(). This should be done after the device structure is
+initialized and before it is registered (although after device registration is
+also okay). The routine will set the device's power.no_callbacks flag and
+prevent the non-debugging runtime PM sysfs attributes from being created.
+
+When power.no_callbacks is set, the PM core will not invoke the
+->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
+Instead it will assume that suspends and resumes always succeed and that idle
+devices should be suspended.
+
+As a consequence, the PM core will never directly inform the device's subsystem
+or driver about runtime power changes. Instead, the driver for the device's
+parent must take responsibility for telling the device's driver when the
+parent's power state changes.
+
+9. Autosuspend, or automatically-delayed suspends
+=================================================
+
+Changing a device's power state isn't free; it requires both time and energy.
+A device should be put in a low-power state only when there's some reason to
+think it will remain in that state for a substantial time. A common heuristic
+says that a device which hasn't been used for a while is liable to remain
+unused; following this advice, drivers should not allow devices to be suspended
+at runtime until they have been inactive for some minimum period. Even when
+the heuristic ends up being non-optimal, it will still prevent devices from
+"bouncing" too rapidly between low-power and full-power states.
+
+The term "autosuspend" is an historical remnant. It doesn't mean that the
+device is automatically suspended (the subsystem or driver still has to call
+the appropriate PM routines); rather it means that runtime suspends will
+automatically be delayed until the desired period of inactivity has elapsed.
+
+Inactivity is determined based on the power.last_busy field. Drivers should
+call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
+typically just before calling pm_runtime_put_autosuspend(). The desired length
+of the inactivity period is a matter of policy. Subsystems can set this length
+initially by calling pm_runtime_set_autosuspend_delay(), but after device
+registration the length should be controlled by user space, using the
+/sys/devices/.../power/autosuspend_delay_ms attribute.
+
+In order to use autosuspend, subsystems or drivers must call
+pm_runtime_use_autosuspend() (preferably before registering the device), and
+thereafter they should use the various `*_autosuspend()` helper functions
+instead of the non-autosuspend counterparts::
+
+ Instead of: pm_runtime_suspend use: pm_runtime_autosuspend;
+ Instead of: pm_schedule_suspend use: pm_request_autosuspend;
+ Instead of: pm_runtime_put use: pm_runtime_put_autosuspend;
+ Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend.
+
+Drivers may also continue to use the non-autosuspend helper functions; they
+will behave normally, which means sometimes taking the autosuspend delay into
+account (see pm_runtime_idle).
+
+Under some circumstances a driver or subsystem may want to prevent a device
+from autosuspending immediately, even though the usage counter is zero and the
+autosuspend delay time has expired. If the ->runtime_suspend() callback
+returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
+in the future (as it normally would be if the callback invoked
+pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
+autosuspend. The ->runtime_suspend() callback can't do this rescheduling
+itself because no suspend requests of any kind are accepted while the device is
+suspending (i.e., while the callback is running).
+
+The implementation is well suited for asynchronous use in interrupt contexts.
+However such use inevitably involves races, because the PM core can't
+synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
+This synchronization must be handled by the driver, using its private lock.
+Here is a schematic pseudo-code example::
+
+ foo_read_or_write(struct foo_priv *foo, void *data)
+ {
+ lock(&foo->private_lock);
+ add_request_to_io_queue(foo, data);
+ if (foo->num_pending_requests++ == 0)
+ pm_runtime_get(&foo->dev);
+ if (!foo->is_suspended)
+ foo_process_next_request(foo);
+ unlock(&foo->private_lock);
+ }
+
+ foo_io_completion(struct foo_priv *foo, void *req)
+ {
+ lock(&foo->private_lock);
+ if (--foo->num_pending_requests == 0) {
+ pm_runtime_mark_last_busy(&foo->dev);
+ pm_runtime_put_autosuspend(&foo->dev);
+ } else {
+ foo_process_next_request(foo);
+ }
+ unlock(&foo->private_lock);
+ /* Send req result back to the user ... */
+ }
+
+ int foo_runtime_suspend(struct device *dev)
+ {
+ struct foo_priv foo = container_of(dev, ...);
+ int ret = 0;
+
+ lock(&foo->private_lock);
+ if (foo->num_pending_requests > 0) {
+ ret = -EBUSY;
+ } else {
+ /* ... suspend the device ... */
+ foo->is_suspended = 1;
+ }
+ unlock(&foo->private_lock);
+ return ret;
+ }
+
+ int foo_runtime_resume(struct device *dev)
+ {
+ struct foo_priv foo = container_of(dev, ...);
+
+ lock(&foo->private_lock);
+ /* ... resume the device ... */
+ foo->is_suspended = 0;
+ pm_runtime_mark_last_busy(&foo->dev);
+ if (foo->num_pending_requests > 0)
+ foo_process_next_request(foo);
+ unlock(&foo->private_lock);
+ return 0;
+ }
+
+The important point is that after foo_io_completion() asks for an autosuspend,
+the foo_runtime_suspend() callback may race with foo_read_or_write().
+Therefore foo_runtime_suspend() has to check whether there are any pending I/O
+requests (while holding the private lock) before allowing the suspend to
+proceed.
+
+In addition, the power.autosuspend_delay field can be changed by user space at
+any time. If a driver cares about this, it can call
+pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
+callback while holding its private lock. If the function returns a nonzero
+value then the delay has not yet expired and the callback should return
+-EAGAIN.
diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
deleted file mode 100644
index 937e33c46211..000000000000
--- a/Documentation/power/runtime_pm.txt
+++ /dev/null
@@ -1,928 +0,0 @@
-Runtime Power Management Framework for I/O Devices
-
-(C) 2009-2011 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
-(C) 2010 Alan Stern <stern@rowland.harvard.edu>
-(C) 2014 Intel Corp., Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-1. Introduction
-
-Support for runtime power management (runtime PM) of I/O devices is provided
-at the power management core (PM core) level by means of:
-
-* The power management workqueue pm_wq in which bus types and device drivers can
- put their PM-related work items. It is strongly recommended that pm_wq be
- used for queuing all work items related to runtime PM, because this allows
- them to be synchronized with system-wide power transitions (suspend to RAM,
- hibernation and resume from system sleep states). pm_wq is declared in
- include/linux/pm_runtime.h and defined in kernel/power/main.c.
-
-* A number of runtime PM fields in the 'power' member of 'struct device' (which
- is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can
- be used for synchronizing runtime PM operations with one another.
-
-* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in
- include/linux/pm.h).
-
-* A set of helper functions defined in drivers/base/power/runtime.c that can be
- used for carrying out runtime PM operations in such a way that the
- synchronization between them is taken care of by the PM core. Bus types and
- device drivers are encouraged to use these functions.
-
-The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM
-fields of 'struct dev_pm_info' and the core helper functions provided for
-runtime PM are described below.
-
-2. Device Runtime PM Callbacks
-
-There are three device runtime PM callbacks defined in 'struct dev_pm_ops':
-
-struct dev_pm_ops {
- ...
- int (*runtime_suspend)(struct device *dev);
- int (*runtime_resume)(struct device *dev);
- int (*runtime_idle)(struct device *dev);
- ...
-};
-
-The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
-are executed by the PM core for the device's subsystem that may be either of
-the following:
-
- 1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
- is present.
-
- 2. Device type of the device, if both dev->type and dev->type->pm are present.
-
- 3. Device class of the device, if both dev->class and dev->class->pm are
- present.
-
- 4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
-
-If the subsystem chosen by applying the above rules doesn't provide the relevant
-callback, the PM core will invoke the corresponding driver callback stored in
-dev->driver->pm directly (if present).
-
-The PM core always checks which callback to use in the order given above, so the
-priority order of callbacks from high to low is: PM domain, device type, class
-and bus type. Moreover, the high-priority one will always take precedence over
-a low-priority one. The PM domain, bus type, device type and class callbacks
-are referred to as subsystem-level callbacks in what follows.
-
-By default, the callbacks are always invoked in process context with interrupts
-enabled. However, the pm_runtime_irq_safe() helper function can be used to tell
-the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
-and ->runtime_idle() callbacks for the given device in atomic context with
-interrupts disabled. This implies that the callback routines in question must
-not block or sleep, but it also means that the synchronous helper functions
-listed at the end of Section 4 may be used for that device within an interrupt
-handler or generally in an atomic context.
-
-The subsystem-level suspend callback, if present, is _entirely_ _responsible_
-for handling the suspend of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_suspend() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_suspend()
-callback in a device driver as long as the subsystem-level suspend callback
-knows what to do to handle the device).
-
- * Once the subsystem-level suspend callback (or the driver suspend callback,
- if invoked directly) has completed successfully for the given device, the PM
- core regards the device as suspended, which need not mean that it has been
- put into a low power state. It is supposed to mean, however, that the
- device will not process data and will not communicate with the CPU(s) and
- RAM until the appropriate resume callback is executed for it. The runtime
- PM status of a device after successful execution of the suspend callback is
- 'suspended'.
-
- * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
- status remains 'active', which means that the device _must_ be fully
- operational afterwards.
-
- * If the suspend callback returns an error code different from -EBUSY and
- -EAGAIN, the PM core regards this as a fatal error and will refuse to run
- the helper functions described in Section 4 for the device until its status
- is directly set to either 'active', or 'suspended' (the PM core provides
- special helper functions for this purpose).
-
-In particular, if the driver requires remote wakeup capability (i.e. hardware
-mechanism allowing the device to request a change of its power state, such as
-PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
-device, then ->runtime_suspend() should return -EBUSY. On the other hand, if
-device_can_wakeup() returns 'true' for the device and the device is put into a
-low-power state during the execution of the suspend callback, it is expected
-that remote wakeup will be enabled for the device. Generally, remote wakeup
-should be enabled for all input devices put into low-power states at run time.
-
-The subsystem-level resume callback, if present, is _entirely_ _responsible_ for
-handling the resume of the device as appropriate, which may, but need not
-include executing the device driver's own ->runtime_resume() callback (from the
-PM core's point of view it is not necessary to implement a ->runtime_resume()
-callback in a device driver as long as the subsystem-level resume callback knows
-what to do to handle the device).
-
- * Once the subsystem-level resume callback (or the driver resume callback, if
- invoked directly) has completed successfully, the PM core regards the device
- as fully operational, which means that the device _must_ be able to complete
- I/O operations as needed. The runtime PM status of the device is then
- 'active'.
-
- * If the resume callback returns an error code, the PM core regards this as a
- fatal error and will refuse to run the helper functions described in Section
- 4 for the device, until its status is directly set to either 'active', or
- 'suspended' (by means of special helper functions provided by the PM core
- for this purpose).
-
-The idle callback (a subsystem-level one, if present, or the driver one) is
-executed by the PM core whenever the device appears to be idle, which is
-indicated to the PM core by two counters, the device's usage counter and the
-counter of 'active' children of the device.
-
- * If any of these counters is decreased using a helper function provided by
- the PM core and it turns out to be equal to zero, the other counter is
- checked. If that counter also is equal to zero, the PM core executes the
- idle callback with the device as its argument.
-
-The action performed by the idle callback is totally dependent on the subsystem
-(or driver) in question, but the expected and recommended action is to check
-if the device can be suspended (i.e. if all of the conditions necessary for
-suspending the device are satisfied) and to queue up a suspend request for the
-device in that case. If there is no idle callback, or if the callback returns
-0, then the PM core will attempt to carry out a runtime suspend of the device,
-also respecting devices configured for autosuspend. In essence this means a
-call to pm_runtime_autosuspend() (do note that drivers needs to update the
-device last busy mark, pm_runtime_mark_last_busy(), to control the delay under
-this circumstance). To prevent this (for example, if the callback routine has
-started a delayed suspend), the routine must return a non-zero value. Negative
-error return codes are ignored by the PM core.
-
-The helper functions provided by the PM core, described in Section 4, guarantee
-that the following constraints are met with respect to runtime PM callbacks for
-one device:
-
-(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
- ->runtime_suspend() in parallel with ->runtime_resume() or with another
- instance of ->runtime_suspend() for the same device) with the exception that
- ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
- ->runtime_idle() (although ->runtime_idle() will not be started while any
- of the other callbacks is being executed for the same device).
-
-(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active'
- devices (i.e. the PM core will only execute ->runtime_idle() or
- ->runtime_suspend() for the devices the runtime PM status of which is
- 'active').
-
-(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device
- the usage counter of which is equal to zero _and_ either the counter of
- 'active' children of which is equal to zero, or the 'power.ignore_children'
- flag of which is set.
-
-(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the
- PM core will only execute ->runtime_resume() for the devices the runtime
- PM status of which is 'suspended').
-
-Additionally, the helper functions provided by the PM core obey the following
-rules:
-
- * If ->runtime_suspend() is about to be executed or there's a pending request
- to execute it, ->runtime_idle() will not be executed for the same device.
-
- * A request to execute or to schedule the execution of ->runtime_suspend()
- will cancel any pending requests to execute ->runtime_idle() for the same
- device.
-
- * If ->runtime_resume() is about to be executed or there's a pending request
- to execute it, the other callbacks will not be executed for the same device.
-
- * A request to execute ->runtime_resume() will cancel any pending or
- scheduled requests to execute the other callbacks for the same device,
- except for scheduled autosuspends.
-
-3. Runtime PM Device Fields
-
-The following device runtime PM fields are present in 'struct dev_pm_info', as
-defined in include/linux/pm.h:
-
- struct timer_list suspend_timer;
- - timer used for scheduling (delayed) suspend and autosuspend requests
-
- unsigned long timer_expires;
- - timer expiration time, in jiffies (if this is different from zero, the
- timer is running and will expire at that time, otherwise the timer is not
- running)
-
- struct work_struct work;
- - work structure used for queuing up requests (i.e. work items in pm_wq)
-
- wait_queue_head_t wait_queue;
- - wait queue used if any of the helper functions needs to wait for another
- one to complete
-
- spinlock_t lock;
- - lock used for synchronization
-
- atomic_t usage_count;
- - the usage counter of the device
-
- atomic_t child_count;
- - the count of 'active' children of the device
-
- unsigned int ignore_children;
- - if set, the value of child_count is ignored (but still updated)
-
- unsigned int disable_depth;
- - used for disabling the helper functions (they work normally if this is
- equal to zero); the initial value of it is 1 (i.e. runtime PM is
- initially disabled for all devices)
-
- int runtime_error;
- - if set, there was a fatal error (one of the callbacks returned error code
- as described in Section 2), so the helper functions will not work until
- this flag is cleared; this is the error code returned by the failing
- callback
-
- unsigned int idle_notification;
- - if set, ->runtime_idle() is being executed
-
- unsigned int request_pending;
- - if set, there's a pending request (i.e. a work item queued up into pm_wq)
-
- enum rpm_request request;
- - type of request that's pending (valid if request_pending is set)
-
- unsigned int deferred_resume;
- - set if ->runtime_resume() is about to be run while ->runtime_suspend() is
- being executed for that device and it is not practical to wait for the
- suspend to complete; means "start a resume as soon as you've suspended"
-
- enum rpm_status runtime_status;
- - the runtime PM status of the device; this field's initial value is
- RPM_SUSPENDED, which means that each device is initially regarded by the
- PM core as 'suspended', regardless of its real hardware status
-
- unsigned int runtime_auto;
- - if set, indicates that the user space has allowed the device driver to
- power manage the device at run time via the /sys/devices/.../power/control
- interface; it may only be modified with the help of the pm_runtime_allow()
- and pm_runtime_forbid() helper functions
-
- unsigned int no_callbacks;
- - indicates that the device does not use the runtime PM callbacks (see
- Section 8); it may be modified only by the pm_runtime_no_callbacks()
- helper function
-
- unsigned int irq_safe;
- - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
- will be invoked with the spinlock held and interrupts disabled
-
- unsigned int use_autosuspend;
- - indicates that the device's driver supports delayed autosuspend (see
- Section 9); it may be modified only by the
- pm_runtime{_dont}_use_autosuspend() helper functions
-
- unsigned int timer_autosuspends;
- - indicates that the PM core should attempt to carry out an autosuspend
- when the timer expires rather than a normal suspend
-
- int autosuspend_delay;
- - the delay time (in milliseconds) to be used for autosuspend
-
- unsigned long last_busy;
- - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
- function was last called for this device; used in calculating inactivity
- periods for autosuspend
-
-All of the above fields are members of the 'power' member of 'struct device'.
-
-4. Runtime PM Device Helper Functions
-
-The following runtime PM helper functions are defined in
-drivers/base/power/runtime.c and include/linux/pm_runtime.h:
-
- void pm_runtime_init(struct device *dev);
- - initialize the device runtime PM fields in 'struct dev_pm_info'
-
- void pm_runtime_remove(struct device *dev);
- - make sure that the runtime PM of the device will be disabled after
- removing the device from device hierarchy
-
- int pm_runtime_idle(struct device *dev);
- - execute the subsystem-level idle callback for the device; returns an
- error code on failure, where -EINPROGRESS means that ->runtime_idle() is
- already being executed; if there is no callback or the callback returns 0
- then run pm_runtime_autosuspend(dev) and return its result
-
- int pm_runtime_suspend(struct device *dev);
- - execute the subsystem-level suspend callback for the device; returns 0 on
- success, 1 if the device's runtime PM status was already 'suspended', or
- error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
- to suspend the device again in future and -EACCES means that
- 'power.disable_depth' is different from 0
-
- int pm_runtime_autosuspend(struct device *dev);
- - same as pm_runtime_suspend() except that the autosuspend delay is taken
- into account; if pm_runtime_autosuspend_expiration() says the delay has
- not yet expired then an autosuspend is scheduled for the appropriate time
- and 0 is returned
-
- int pm_runtime_resume(struct device *dev);
- - execute the subsystem-level resume callback for the device; returns 0 on
- success, 1 if the device's runtime PM status was already 'active' or
- error code on failure, where -EAGAIN means it may be safe to attempt to
- resume the device again in future, but 'power.runtime_error' should be
- checked additionally, and -EACCES means that 'power.disable_depth' is
- different from 0
-
- int pm_request_idle(struct device *dev);
- - submit a request to execute the subsystem-level idle callback for the
- device (the request is represented by a work item in pm_wq); returns 0 on
- success or error code if the request has not been queued up
-
- int pm_request_autosuspend(struct device *dev);
- - schedule the execution of the subsystem-level suspend callback for the
- device when the autosuspend delay has expired; if the delay has already
- expired then the work item is queued up immediately
-
- int pm_schedule_suspend(struct device *dev, unsigned int delay);
- - schedule the execution of the subsystem-level suspend callback for the
- device in future, where 'delay' is the time to wait before queuing up a
- suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
- item is queued up immediately); returns 0 on success, 1 if the device's PM
- runtime status was already 'suspended', or error code if the request
- hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
- ->runtime_suspend() is already scheduled and not yet expired, the new
- value of 'delay' will be used as the time to wait
-
- int pm_request_resume(struct device *dev);
- - submit a request to execute the subsystem-level resume callback for the
- device (the request is represented by a work item in pm_wq); returns 0 on
- success, 1 if the device's runtime PM status was already 'active', or
- error code if the request hasn't been queued up
-
- void pm_runtime_get_noresume(struct device *dev);
- - increment the device's usage counter
-
- int pm_runtime_get(struct device *dev);
- - increment the device's usage counter, run pm_request_resume(dev) and
- return its result
-
- int pm_runtime_get_sync(struct device *dev);
- - increment the device's usage counter, run pm_runtime_resume(dev) and
- return its result
-
- int pm_runtime_get_if_in_use(struct device *dev);
- - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the
- runtime PM status is RPM_ACTIVE and the runtime PM usage counter is
- nonzero, increment the counter and return 1; otherwise return 0 without
- changing the counter
-
- void pm_runtime_put_noidle(struct device *dev);
- - decrement the device's usage counter
-
- int pm_runtime_put(struct device *dev);
- - decrement the device's usage counter; if the result is 0 then run
- pm_request_idle(dev) and return its result
-
- int pm_runtime_put_autosuspend(struct device *dev);
- - decrement the device's usage counter; if the result is 0 then run
- pm_request_autosuspend(dev) and return its result
-
- int pm_runtime_put_sync(struct device *dev);
- - decrement the device's usage counter; if the result is 0 then run
- pm_runtime_idle(dev) and return its result
-
- int pm_runtime_put_sync_suspend(struct device *dev);
- - decrement the device's usage counter; if the result is 0 then run
- pm_runtime_suspend(dev) and return its result
-
- int pm_runtime_put_sync_autosuspend(struct device *dev);
- - decrement the device's usage counter; if the result is 0 then run
- pm_runtime_autosuspend(dev) and return its result
-
- void pm_runtime_enable(struct device *dev);
- - decrement the device's 'power.disable_depth' field; if that field is equal
- to zero, the runtime PM helper functions can execute subsystem-level
- callbacks described in Section 2 for the device
-
- int pm_runtime_disable(struct device *dev);
- - increment the device's 'power.disable_depth' field (if the value of that
- field was previously zero, this prevents subsystem-level runtime PM
- callbacks from being run for the device), make sure that all of the
- pending runtime PM operations on the device are either completed or
- canceled; returns 1 if there was a resume request pending and it was
- necessary to execute the subsystem-level resume callback for the device
- to satisfy that request, otherwise 0 is returned
-
- int pm_runtime_barrier(struct device *dev);
- - check if there's a resume request pending for the device and resume it
- (synchronously) in that case, cancel any other pending runtime PM requests
- regarding it and wait for all runtime PM operations on it in progress to
- complete; returns 1 if there was a resume request pending and it was
- necessary to execute the subsystem-level resume callback for the device to
- satisfy that request, otherwise 0 is returned
-
- void pm_suspend_ignore_children(struct device *dev, bool enable);
- - set/unset the power.ignore_children flag of the device
-
- int pm_runtime_set_active(struct device *dev);
- - clear the device's 'power.runtime_error' flag, set the device's runtime
- PM status to 'active' and update its parent's counter of 'active'
- children as appropriate (it is only valid to use this function if
- 'power.runtime_error' is set or 'power.disable_depth' is greater than
- zero); it will fail and return error code if the device has a parent
- which is not active and the 'power.ignore_children' flag of which is unset
-
- void pm_runtime_set_suspended(struct device *dev);
- - clear the device's 'power.runtime_error' flag, set the device's runtime
- PM status to 'suspended' and update its parent's counter of 'active'
- children as appropriate (it is only valid to use this function if
- 'power.runtime_error' is set or 'power.disable_depth' is greater than
- zero)
-
- bool pm_runtime_active(struct device *dev);
- - return true if the device's runtime PM status is 'active' or its
- 'power.disable_depth' field is not equal to zero, or false otherwise
-
- bool pm_runtime_suspended(struct device *dev);
- - return true if the device's runtime PM status is 'suspended' and its
- 'power.disable_depth' field is equal to zero, or false otherwise
-
- bool pm_runtime_status_suspended(struct device *dev);
- - return true if the device's runtime PM status is 'suspended'
-
- void pm_runtime_allow(struct device *dev);
- - set the power.runtime_auto flag for the device and decrease its usage
- counter (used by the /sys/devices/.../power/control interface to
- effectively allow the device to be power managed at run time)
-
- void pm_runtime_forbid(struct device *dev);
- - unset the power.runtime_auto flag for the device and increase its usage
- counter (used by the /sys/devices/.../power/control interface to
- effectively prevent the device from being power managed at run time)
-
- void pm_runtime_no_callbacks(struct device *dev);
- - set the power.no_callbacks flag for the device and remove the runtime
- PM attributes from /sys/devices/.../power (or prevent them from being
- added when the device is registered)
-
- void pm_runtime_irq_safe(struct device *dev);
- - set the power.irq_safe flag for the device, causing the runtime-PM
- callbacks to be invoked with interrupts off
-
- bool pm_runtime_is_irq_safe(struct device *dev);
- - return true if power.irq_safe flag was set for the device, causing
- the runtime-PM callbacks to be invoked with interrupts off
-
- void pm_runtime_mark_last_busy(struct device *dev);
- - set the power.last_busy field to the current time
-
- void pm_runtime_use_autosuspend(struct device *dev);
- - set the power.use_autosuspend flag, enabling autosuspend delays; call
- pm_runtime_get_sync if the flag was previously cleared and
- power.autosuspend_delay is negative
-
- void pm_runtime_dont_use_autosuspend(struct device *dev);
- - clear the power.use_autosuspend flag, disabling autosuspend delays;
- decrement the device's usage counter if the flag was previously set and
- power.autosuspend_delay is negative; call pm_runtime_idle
-
- void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);
- - set the power.autosuspend_delay value to 'delay' (expressed in
- milliseconds); if 'delay' is negative then runtime suspends are
- prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be
- called or the device's usage counter may be decremented and
- pm_runtime_idle called depending on if power.autosuspend_delay is
- changed to or from a negative value; if power.use_autosuspend is clear,
- pm_runtime_idle is called
-
- unsigned long pm_runtime_autosuspend_expiration(struct device *dev);
- - calculate the time when the current autosuspend delay period will expire,
- based on power.last_busy and power.autosuspend_delay; if the delay time
- is 1000 ms or larger then the expiration time is rounded up to the
- nearest second; returns 0 if the delay period has already expired or
- power.use_autosuspend isn't set, otherwise returns the expiration time
- in jiffies
-
-It is safe to execute the following helper functions from interrupt context:
-
-pm_request_idle()
-pm_request_autosuspend()
-pm_schedule_suspend()
-pm_request_resume()
-pm_runtime_get_noresume()
-pm_runtime_get()
-pm_runtime_put_noidle()
-pm_runtime_put()
-pm_runtime_put_autosuspend()
-pm_runtime_enable()
-pm_suspend_ignore_children()
-pm_runtime_set_active()
-pm_runtime_set_suspended()
-pm_runtime_suspended()
-pm_runtime_mark_last_busy()
-pm_runtime_autosuspend_expiration()
-
-If pm_runtime_irq_safe() has been called for a device then the following helper
-functions may also be used in interrupt context:
-
-pm_runtime_idle()
-pm_runtime_suspend()
-pm_runtime_autosuspend()
-pm_runtime_resume()
-pm_runtime_get_sync()
-pm_runtime_put_sync()
-pm_runtime_put_sync_suspend()
-pm_runtime_put_sync_autosuspend()
-
-5. Runtime PM Initialization, Device Probing and Removal
-
-Initially, the runtime PM is disabled for all devices, which means that the
-majority of the runtime PM helper functions described in Section 4 will return
--EAGAIN until pm_runtime_enable() is called for the device.
-
-In addition to that, the initial runtime PM status of all devices is
-'suspended', but it need not reflect the actual physical state of the device.
-Thus, if the device is initially active (i.e. it is able to process I/O), its
-runtime PM status must be changed to 'active', with the help of
-pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
-
-However, if the device has a parent and the parent's runtime PM is enabled,
-calling pm_runtime_set_active() for the device will affect the parent, unless
-the parent's 'power.ignore_children' flag is set. Namely, in that case the
-parent won't be able to suspend at run time, using the PM core's helper
-functions, as long as the child's status is 'active', even if the child's
-runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
-the child yet or pm_runtime_disable() has been called for it). For this reason,
-once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
-should be called for it too as soon as reasonably possible or its runtime PM
-status should be changed back to 'suspended' with the help of
-pm_runtime_set_suspended().
-
-If the default initial runtime PM status of the device (i.e. 'suspended')
-reflects the actual state of the device, its bus type's or its driver's
-->probe() callback will likely need to wake it up using one of the PM core's
-helper functions described in Section 4. In that case, pm_runtime_resume()
-should be used. Of course, for this purpose the device's runtime PM has to be
-enabled earlier by calling pm_runtime_enable().
-
-Note, if the device may execute pm_runtime calls during the probe (such as
-if it is registers with a subsystem that may call back in) then the
-pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
-appropriate to ensure that the device is not put back to sleep during the
-probe. This can happen with systems such as the network device layer.
-
-It may be desirable to suspend the device once ->probe() has finished.
-Therefore the driver core uses the asynchronous pm_request_idle() to submit a
-request to execute the subsystem-level idle callback for the device at that
-time. A driver that makes use of the runtime autosuspend feature, may want to
-update the last busy mark before returning from ->probe().
-
-Moreover, the driver core prevents runtime PM callbacks from racing with the bus
-notifier callback in __device_release_driver(), which is necessary, because the
-notifier is used by some subsystems to carry out operations affecting the
-runtime PM functionality. It does so by calling pm_runtime_get_sync() before
-driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This
-resumes the device if it's in the suspended state and prevents it from
-being suspended again while those routines are being executed.
-
-To allow bus types and drivers to put devices into the suspended state by
-calling pm_runtime_suspend() from their ->remove() routines, the driver core
-executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
-notifications in __device_release_driver(). This requires bus types and
-drivers to make their ->remove() callbacks avoid races with runtime PM directly,
-but also it allows of more flexibility in the handling of devices during the
-removal of their drivers.
-
-Drivers in ->remove() callback should undo the runtime PM changes done
-in ->probe(). Usually this means calling pm_runtime_disable(),
-pm_runtime_dont_use_autosuspend() etc.
-
-The user space can effectively disallow the driver of the device to power manage
-it at run time by changing the value of its /sys/devices/.../power/control
-attribute to "on", which causes pm_runtime_forbid() to be called. In principle,
-this mechanism may also be used by the driver to effectively turn off the
-runtime power management of the device until the user space turns it on.
-Namely, during the initialization the driver can make sure that the runtime PM
-status of the device is 'active' and call pm_runtime_forbid(). It should be
-noted, however, that if the user space has already intentionally changed the
-value of /sys/devices/.../power/control to "auto" to allow the driver to power
-manage the device at run time, the driver may confuse it by using
-pm_runtime_forbid() this way.
-
-6. Runtime PM and System Sleep
-
-Runtime PM and system sleep (i.e., system suspend and hibernation, also known
-as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
-ways. If a device is active when a system sleep starts, everything is
-straightforward. But what should happen if the device is already suspended?
-
-The device may have different wake-up settings for runtime PM and system sleep.
-For example, remote wake-up may be enabled for runtime suspend but disallowed
-for system sleep (device_may_wakeup(dev) returns 'false'). When this happens,
-the subsystem-level system suspend callback is responsible for changing the
-device's wake-up setting (it may leave that to the device driver's system
-suspend routine). It may be necessary to resume the device and suspend it again
-in order to do so. The same is true if the driver uses different power levels
-or other settings for runtime suspend and system sleep.
-
-During system resume, the simplest approach is to bring all devices back to full
-power, even if they had been suspended before the system suspend began. There
-are several reasons for this, including:
-
- * The device might need to switch power levels, wake-up settings, etc.
-
- * Remote wake-up events might have been lost by the firmware.
-
- * The device's children may need the device to be at full power in order
- to resume themselves.
-
- * The driver's idea of the device state may not agree with the device's
- physical state. This can happen during resume from hibernation.
-
- * The device might need to be reset.
-
- * Even though the device was suspended, if its usage counter was > 0 then most
- likely it would need a runtime resume in the near future anyway.
-
-If the device had been suspended before the system suspend began and it's
-brought back to full power during resume, then its runtime PM status will have
-to be updated to reflect the actual post-system sleep status. The way to do
-this is:
-
- pm_runtime_disable(dev);
- pm_runtime_set_active(dev);
- pm_runtime_enable(dev);
-
-The PM core always increments the runtime usage counter before calling the
-->suspend() callback and decrements it after calling the ->resume() callback.
-Hence disabling runtime PM temporarily like this will not cause any runtime
-suspend attempts to be permanently lost. If the usage count goes to zero
-following the return of the ->resume() callback, the ->runtime_idle() callback
-will be invoked as usual.
-
-On some systems, however, system sleep is not entered through a global firmware
-or hardware operation. Instead, all hardware components are put into low-power
-states directly by the kernel in a coordinated way. Then, the system sleep
-state effectively follows from the states the hardware components end up in
-and the system is woken up from that state by a hardware interrupt or a similar
-mechanism entirely under the kernel's control. As a result, the kernel never
-gives control away and the states of all devices during resume are precisely
-known to it. If that is the case and none of the situations listed above takes
-place (in particular, if the system is not waking up from hibernation), it may
-be more efficient to leave the devices that had been suspended before the system
-suspend began in the suspended state.
-
-To this end, the PM core provides a mechanism allowing some coordination between
-different levels of device hierarchy. Namely, if a system suspend .prepare()
-callback returns a positive number for a device, that indicates to the PM core
-that the device appears to be runtime-suspended and its state is fine, so it
-may be left in runtime suspend provided that all of its descendants are also
-left in runtime suspend. If that happens, the PM core will not execute any
-system suspend and resume callbacks for all of those devices, except for the
-complete callback, which is then entirely responsible for handling the device
-as appropriate. This only applies to system suspend transitions that are not
-related to hibernation (see Documentation/driver-api/pm/devices.rst for more
-information).
-
-The PM core does its best to reduce the probability of race conditions between
-the runtime PM and system suspend/resume (and hibernation) callbacks by carrying
-out the following operations:
-
- * During system suspend pm_runtime_get_noresume() is called for every device
- right before executing the subsystem-level .prepare() callback for it and
- pm_runtime_barrier() is called for every device right before executing the
- subsystem-level .suspend() callback for it. In addition to that the PM core
- calls __pm_runtime_disable() with 'false' as the second argument for every
- device right before executing the subsystem-level .suspend_late() callback
- for it.
-
- * During system resume pm_runtime_enable() and pm_runtime_put() are called for
- every device right after executing the subsystem-level .resume_early()
- callback and right after executing the subsystem-level .complete() callback
- for it, respectively.
-
-7. Generic subsystem callbacks
-
-Subsystems may wish to conserve code space by using the set of generic power
-management callbacks provided by the PM core, defined in
-driver/base/power/generic_ops.c:
-
- int pm_generic_runtime_suspend(struct device *dev);
- - invoke the ->runtime_suspend() callback provided by the driver of this
- device and return its result, or return 0 if not defined
-
- int pm_generic_runtime_resume(struct device *dev);
- - invoke the ->runtime_resume() callback provided by the driver of this
- device and return its result, or return 0 if not defined
-
- int pm_generic_suspend(struct device *dev);
- - if the device has not been suspended at run time, invoke the ->suspend()
- callback provided by its driver and return its result, or return 0 if not
- defined
-
- int pm_generic_suspend_noirq(struct device *dev);
- - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq()
- callback provided by the device's driver and return its result, or return
- 0 if not defined
-
- int pm_generic_resume(struct device *dev);
- - invoke the ->resume() callback provided by the driver of this device and,
- if successful, change the device's runtime PM status to 'active'
-
- int pm_generic_resume_noirq(struct device *dev);
- - invoke the ->resume_noirq() callback provided by the driver of this device
-
- int pm_generic_freeze(struct device *dev);
- - if the device has not been suspended at run time, invoke the ->freeze()
- callback provided by its driver and return its result, or return 0 if not
- defined
-
- int pm_generic_freeze_noirq(struct device *dev);
- - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq()
- callback provided by the device's driver and return its result, or return
- 0 if not defined
-
- int pm_generic_thaw(struct device *dev);
- - if the device has not been suspended at run time, invoke the ->thaw()
- callback provided by its driver and return its result, or return 0 if not
- defined
-
- int pm_generic_thaw_noirq(struct device *dev);
- - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq()
- callback provided by the device's driver and return its result, or return
- 0 if not defined
-
- int pm_generic_poweroff(struct device *dev);
- - if the device has not been suspended at run time, invoke the ->poweroff()
- callback provided by its driver and return its result, or return 0 if not
- defined
-
- int pm_generic_poweroff_noirq(struct device *dev);
- - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq()
- callback provided by the device's driver and return its result, or return
- 0 if not defined
-
- int pm_generic_restore(struct device *dev);
- - invoke the ->restore() callback provided by the driver of this device and,
- if successful, change the device's runtime PM status to 'active'
-
- int pm_generic_restore_noirq(struct device *dev);
- - invoke the ->restore_noirq() callback provided by the device's driver
-
-These functions are the defaults used by the PM core, if a subsystem doesn't
-provide its own callbacks for ->runtime_idle(), ->runtime_suspend(),
-->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(),
-->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(),
-->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the
-subsystem-level dev_pm_ops structure.
-
-Device drivers that wish to use the same function as a system suspend, freeze,
-poweroff and runtime suspend callback, and similarly for system resume, thaw,
-restore, and runtime resume, can achieve this with the help of the
-UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its
-last argument to NULL).
-
-8. "No-Callback" Devices
-
-Some "devices" are only logical sub-devices of their parent and cannot be
-power-managed on their own. (The prototype example is a USB interface. Entire
-USB devices can go into low-power mode or send wake-up requests, but neither is
-possible for individual interfaces.) The drivers for these devices have no
-need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend()
-and ->runtime_resume() would always return 0 without doing anything else and
-->runtime_idle() would always call pm_runtime_suspend().
-
-Subsystems can tell the PM core about these devices by calling
-pm_runtime_no_callbacks(). This should be done after the device structure is
-initialized and before it is registered (although after device registration is
-also okay). The routine will set the device's power.no_callbacks flag and
-prevent the non-debugging runtime PM sysfs attributes from being created.
-
-When power.no_callbacks is set, the PM core will not invoke the
-->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks.
-Instead it will assume that suspends and resumes always succeed and that idle
-devices should be suspended.
-
-As a consequence, the PM core will never directly inform the device's subsystem
-or driver about runtime power changes. Instead, the driver for the device's
-parent must take responsibility for telling the device's driver when the
-parent's power state changes.
-
-9. Autosuspend, or automatically-delayed suspends
-
-Changing a device's power state isn't free; it requires both time and energy.
-A device should be put in a low-power state only when there's some reason to
-think it will remain in that state for a substantial time. A common heuristic
-says that a device which hasn't been used for a while is liable to remain
-unused; following this advice, drivers should not allow devices to be suspended
-at runtime until they have been inactive for some minimum period. Even when
-the heuristic ends up being non-optimal, it will still prevent devices from
-"bouncing" too rapidly between low-power and full-power states.
-
-The term "autosuspend" is an historical remnant. It doesn't mean that the
-device is automatically suspended (the subsystem or driver still has to call
-the appropriate PM routines); rather it means that runtime suspends will
-automatically be delayed until the desired period of inactivity has elapsed.
-
-Inactivity is determined based on the power.last_busy field. Drivers should
-call pm_runtime_mark_last_busy() to update this field after carrying out I/O,
-typically just before calling pm_runtime_put_autosuspend(). The desired length
-of the inactivity period is a matter of policy. Subsystems can set this length
-initially by calling pm_runtime_set_autosuspend_delay(), but after device
-registration the length should be controlled by user space, using the
-/sys/devices/.../power/autosuspend_delay_ms attribute.
-
-In order to use autosuspend, subsystems or drivers must call
-pm_runtime_use_autosuspend() (preferably before registering the device), and
-thereafter they should use the various *_autosuspend() helper functions instead
-of the non-autosuspend counterparts:
-
- Instead of: pm_runtime_suspend use: pm_runtime_autosuspend;
- Instead of: pm_schedule_suspend use: pm_request_autosuspend;
- Instead of: pm_runtime_put use: pm_runtime_put_autosuspend;
- Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend.
-
-Drivers may also continue to use the non-autosuspend helper functions; they
-will behave normally, which means sometimes taking the autosuspend delay into
-account (see pm_runtime_idle).
-
-Under some circumstances a driver or subsystem may want to prevent a device
-from autosuspending immediately, even though the usage counter is zero and the
-autosuspend delay time has expired. If the ->runtime_suspend() callback
-returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is
-in the future (as it normally would be if the callback invoked
-pm_runtime_mark_last_busy()), the PM core will automatically reschedule the
-autosuspend. The ->runtime_suspend() callback can't do this rescheduling
-itself because no suspend requests of any kind are accepted while the device is
-suspending (i.e., while the callback is running).
-
-The implementation is well suited for asynchronous use in interrupt contexts.
-However such use inevitably involves races, because the PM core can't
-synchronize ->runtime_suspend() callbacks with the arrival of I/O requests.
-This synchronization must be handled by the driver, using its private lock.
-Here is a schematic pseudo-code example:
-
- foo_read_or_write(struct foo_priv *foo, void *data)
- {
- lock(&foo->private_lock);
- add_request_to_io_queue(foo, data);
- if (foo->num_pending_requests++ == 0)
- pm_runtime_get(&foo->dev);
- if (!foo->is_suspended)
- foo_process_next_request(foo);
- unlock(&foo->private_lock);
- }
-
- foo_io_completion(struct foo_priv *foo, void *req)
- {
- lock(&foo->private_lock);
- if (--foo->num_pending_requests == 0) {
- pm_runtime_mark_last_busy(&foo->dev);
- pm_runtime_put_autosuspend(&foo->dev);
- } else {
- foo_process_next_request(foo);
- }
- unlock(&foo->private_lock);
- /* Send req result back to the user ... */
- }
-
- int foo_runtime_suspend(struct device *dev)
- {
- struct foo_priv foo = container_of(dev, ...);
- int ret = 0;
-
- lock(&foo->private_lock);
- if (foo->num_pending_requests > 0) {
- ret = -EBUSY;
- } else {
- /* ... suspend the device ... */
- foo->is_suspended = 1;
- }
- unlock(&foo->private_lock);
- return ret;
- }
-
- int foo_runtime_resume(struct device *dev)
- {
- struct foo_priv foo = container_of(dev, ...);
-
- lock(&foo->private_lock);
- /* ... resume the device ... */
- foo->is_suspended = 0;
- pm_runtime_mark_last_busy(&foo->dev);
- if (foo->num_pending_requests > 0)
- foo_process_next_request(foo);
- unlock(&foo->private_lock);
- return 0;
- }
-
-The important point is that after foo_io_completion() asks for an autosuspend,
-the foo_runtime_suspend() callback may race with foo_read_or_write().
-Therefore foo_runtime_suspend() has to check whether there are any pending I/O
-requests (while holding the private lock) before allowing the suspend to
-proceed.
-
-In addition, the power.autosuspend_delay field can be changed by user space at
-any time. If a driver cares about this, it can call
-pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
-callback while holding its private lock. If the function returns a nonzero
-value then the delay has not yet expired and the callback should return
--EAGAIN.
diff --git a/Documentation/power/s2ram.rst b/Documentation/power/s2ram.rst
new file mode 100644
index 000000000000..d739aa7c742c
--- /dev/null
+++ b/Documentation/power/s2ram.rst
@@ -0,0 +1,87 @@
+========================
+How to get s2ram working
+========================
+
+2006 Linus Torvalds
+2006 Pavel Machek
+
+1) Check suspend.sf.net, program s2ram there has long whitelist of
+ "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+ video.txt. Perhaps problem is as simple as broken module, and
+ simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+Using TRACE_RESUME
+~~~~~~~~~~~~~~~~~~
+
+I've been working at making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this::
+
+ #!/bin/sh
+ sync
+ echo 1 > /sys/power/pm_trace
+ echo mem > /sys/power/state
+
+ to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+ holding the power button down, and look at the dmesg output for things
+ like::
+
+ Magic number: 4:156:725
+ hash matches drivers/base/power/resume.c:28
+ hash matches device 0000:01:00.0
+
+ which means that the last trace event was just before trying to resume
+ device 0000:01:00.0. Then figure out what driver is controlling that
+ device (lspci and /sys/devices/pci* is your friend), and see if you can
+ fix it, disable it, or trace into its resume function.
+
+ If no device matches the hash (or any matches appear to be false positives),
+ the culprit may be a device from a loadable kernel module that is not loaded
+ until after the hash is checked. You can check the hash against the current
+ devices again after more modules are loaded using sysfs::
+
+ cat /sys/power/pm_trace_dev_match
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
+
+NOTE
+====
+pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
+Reason for this is that the RTC is the only reliably available piece of
+hardware during resume operations where a value can be set that will
+survive a reboot.
+
+pm_trace is not compatible with asynchronous suspend, so it turns
+asynchronous suspend off (which may work around timing or
+ordering-sensitive bugs).
+
+Consequence is that after a resume (even if it is successful) your system
+clock will have a value corresponding to the magic number instead of the
+correct date/time! It is therefore advisable to use a program like ntp-date
+or rdate to reset the correct date/time from an external time source when
+using this trace option.
+
+As the clock keeps ticking it is also essential that the reboot is done
+quickly after the resume failure. The trace option does not use the seconds
+or the low order bits of the minutes of the RTC, but a too long delay will
+corrupt the magic value.
diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt
deleted file mode 100644
index 4685aee197fd..000000000000
--- a/Documentation/power/s2ram.txt
+++ /dev/null
@@ -1,85 +0,0 @@
- How to get s2ram working
- ~~~~~~~~~~~~~~~~~~~~~~~~
- 2006 Linus Torvalds
- 2006 Pavel Machek
-
-1) Check suspend.sf.net, program s2ram there has long whitelist of
- "known ok" machines, along with tricks to use on each one.
-
-2) If that does not help, try reading tricks.txt and
- video.txt. Perhaps problem is as simple as broken module, and
- simple module unload can fix it.
-
-3) You can use Linus' TRACE_RESUME infrastructure, described below.
-
- Using TRACE_RESUME
- ~~~~~~~~~~~~~~~~~~
-
-I've been working at making the machines I have able to STR, and almost
-always it's a driver that is buggy. Thank God for the suspend/resume
-debugging - the thing that Chuck tried to disable. That's often the _only_
-way to debug these things, and it's actually pretty powerful (but
-time-consuming - having to insert TRACE_RESUME() markers into the device
-driver that doesn't resume and recompile and reboot).
-
-Anyway, the way to debug this for people who are interested (have a
-machine that doesn't boot) is:
-
- - enable PM_DEBUG, and PM_TRACE
-
- - use a script like this:
-
- #!/bin/sh
- sync
- echo 1 > /sys/power/pm_trace
- echo mem > /sys/power/state
-
- to suspend
-
- - if it doesn't come back up (which is usually the problem), reboot by
- holding the power button down, and look at the dmesg output for things
- like
-
- Magic number: 4:156:725
- hash matches drivers/base/power/resume.c:28
- hash matches device 0000:01:00.0
-
- which means that the last trace event was just before trying to resume
- device 0000:01:00.0. Then figure out what driver is controlling that
- device (lspci and /sys/devices/pci* is your friend), and see if you can
- fix it, disable it, or trace into its resume function.
-
- If no device matches the hash (or any matches appear to be false positives),
- the culprit may be a device from a loadable kernel module that is not loaded
- until after the hash is checked. You can check the hash against the current
- devices again after more modules are loaded using sysfs:
-
- cat /sys/power/pm_trace_dev_match
-
-For example, the above happens to be the VGA device on my EVO, which I
-used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
-that "radeonfb" simply cannot resume that device - it tries to set the
-PLL's, and it just _hangs_. Using the regular VGA console and letting X
-resume it instead works fine.
-
-NOTE
-====
-pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
-Reason for this is that the RTC is the only reliably available piece of
-hardware during resume operations where a value can be set that will
-survive a reboot.
-
-pm_trace is not compatible with asynchronous suspend, so it turns
-asynchronous suspend off (which may work around timing or
-ordering-sensitive bugs).
-
-Consequence is that after a resume (even if it is successful) your system
-clock will have a value corresponding to the magic number instead of the
-correct date/time! It is therefore advisable to use a program like ntp-date
-or rdate to reset the correct date/time from an external time source when
-using this trace option.
-
-As the clock keeps ticking it is also essential that the reboot is done
-quickly after the resume failure. The trace option does not use the seconds
-or the low order bits of the minutes of the RTC, but a too long delay will
-corrupt the magic value.
diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst
new file mode 100644
index 000000000000..7ac8e1f549f4
--- /dev/null
+++ b/Documentation/power/suspend-and-cpuhotplug.rst
@@ -0,0 +1,286 @@
+====================================================================
+Interaction of Suspend code (S3) with the CPU hotplug infrastructure
+====================================================================
+
+(C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
+
+
+I. Differences between CPU hotplug and Suspend-to-RAM
+======================================================
+
+How does the regular CPU hotplug code differ from how the Suspend-to-RAM
+infrastructure uses it internally? And where do they share common code?
+
+Well, a picture is worth a thousand words... So ASCII art follows :-)
+
+[This depicts the current design in the kernel, and focusses only on the
+interactions involving the freezer and CPU hotplug and also tries to explain
+the locking involved. It outlines the notifications involved as well.
+But please note that here, only the call paths are illustrated, with the aim
+of describing where they take different paths and where they share code.
+What happens when regular CPU hotplug and Suspend-to-RAM race with each other
+is not depicted here.]
+
+On a high level, the suspend-resume cycle goes like this::
+
+ |Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
+ |tasks | | cpus | | | | cpus | |tasks|
+
+
+More details follow::
+
+ Suspend call path
+ -----------------
+
+ Write 'mem' to
+ /sys/power/state
+ sysfs file
+ |
+ v
+ Acquire system_transition_mutex lock
+ |
+ v
+ Send PM_SUSPEND_PREPARE
+ notifications
+ |
+ v
+ Freeze tasks
+ |
+ |
+ v
+ disable_nonboot_cpus()
+ /* start */
+ |
+ v
+ Acquire cpu_add_remove_lock
+ |
+ v
+ Iterate over CURRENTLY
+ online CPUs
+ |
+ |
+ | ----------
+ v | L
+ ======> _cpu_down() |
+ | [This takes cpuhotplug.lock |
+ Common | before taking down the CPU |
+ code | and releases it when done] | O
+ | While it is at it, notifications |
+ | are sent when notable events occur, |
+ ======> by running all registered callbacks. |
+ | | O
+ | |
+ | |
+ v |
+ Note down these cpus in | P
+ frozen_cpus mask ----------
+ |
+ v
+ Disable regular cpu hotplug
+ by increasing cpu_hotplug_disabled
+ |
+ v
+ Release cpu_add_remove_lock
+ |
+ v
+ /* disable_nonboot_cpus() complete */
+ |
+ v
+ Do suspend
+
+
+
+Resuming back is likewise, with the counterparts being (in the order of
+execution during resume):
+
+* enable_nonboot_cpus() which involves::
+
+ | Acquire cpu_add_remove_lock
+ | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
+ | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
+ | Release cpu_add_remove_lock
+ v
+
+* thaw tasks
+* send PM_POST_SUSPEND notifications
+* Release system_transition_mutex lock.
+
+
+It is to be noted here that the system_transition_mutex lock is acquired at the very
+beginning, when we are just starting out to suspend, and then released only
+after the entire cycle is complete (i.e., suspend + resume).
+
+::
+
+
+
+ Regular CPU hotplug call path
+ -----------------------------
+
+ Write 0 (or 1) to
+ /sys/devices/system/cpu/cpu*/online
+ sysfs file
+ |
+ |
+ v
+ cpu_down()
+ |
+ v
+ Acquire cpu_add_remove_lock
+ |
+ v
+ If cpu_hotplug_disabled > 0
+ return gracefully
+ |
+ |
+ v
+ ======> _cpu_down()
+ | [This takes cpuhotplug.lock
+ Common | before taking down the CPU
+ code | and releases it when done]
+ | While it is at it, notifications
+ | are sent when notable events occur,
+ ======> by running all registered callbacks.
+ |
+ |
+ v
+ Release cpu_add_remove_lock
+ [That's it!, for
+ regular CPU hotplug]
+
+
+
+So, as can be seen from the two diagrams (the parts marked as "Common code"),
+regular CPU hotplug and the suspend code path converge at the _cpu_down() and
+_cpu_up() functions. They differ in the arguments passed to these functions,
+in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
+argument. But during suspend, since the tasks are already frozen by the time
+the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
+with the 'tasks_frozen' argument set to 1.
+[See below for some known issues regarding this.]
+
+
+Important files and functions/entry points:
+-------------------------------------------
+
+- kernel/power/process.c : freeze_processes(), thaw_processes()
+- kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
+- kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
+
+
+
+II. What are the issues involved in CPU hotplug?
+------------------------------------------------
+
+There are some interesting situations involving CPU hotplug and microcode
+update on the CPUs, as discussed below:
+
+[Please bear in mind that the kernel requests the microcode images from
+userspace, using the request_firmware() function defined in
+drivers/base/firmware_loader/main.c]
+
+
+a. When all the CPUs are identical:
+
+ This is the most common situation and it is quite straightforward: we want
+ to apply the same microcode revision to each of the CPUs.
+ To give an example of x86, the collect_cpu_info() function defined in
+ arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
+ and thereby in applying the correct microcode revision to it.
+ But note that the kernel does not maintain a common microcode image for the
+ all CPUs, in order to handle case 'b' described below.
+
+
+b. When some of the CPUs are different than the rest:
+
+ In this case since we probably need to apply different microcode revisions
+ to different CPUs, the kernel maintains a copy of the correct microcode
+ image for each CPU (after appropriate CPU type/model discovery using
+ functions such as collect_cpu_info()).
+
+
+c. When a CPU is physically hot-unplugged and a new (and possibly different
+ type of) CPU is hot-plugged into the system:
+
+ In the current design of the kernel, whenever a CPU is taken offline during
+ a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
+ (which is sent by the CPU hotplug code), the microcode update driver's
+ callback for that event reacts by freeing the kernel's copy of the
+ microcode image for that CPU.
+
+ Hence, when a new CPU is brought online, since the kernel finds that it
+ doesn't have the microcode image, it does the CPU type/model discovery
+ afresh and then requests the userspace for the appropriate microcode image
+ for that CPU, which is subsequently applied.
+
+ For example, in x86, the mc_cpu_callback() function (which is the microcode
+ update driver's callback registered for CPU hotplug events) calls
+ microcode_update_cpu() which would call microcode_init_cpu() in this case,
+ instead of microcode_resume_cpu() when it finds that the kernel doesn't
+ have a valid microcode image. This ensures that the CPU type/model
+ discovery is performed and the right microcode is applied to the CPU after
+ getting it from userspace.
+
+
+d. Handling microcode update during suspend/hibernate:
+
+ Strictly speaking, during a CPU hotplug operation which does not involve
+ physically removing or inserting CPUs, the CPUs are not actually powered
+ off during a CPU offline. They are just put to the lowest C-states possible.
+ Hence, in such a case, it is not really necessary to re-apply microcode
+ when the CPUs are brought back online, since they wouldn't have lost the
+ image during the CPU offline operation.
+
+ This is the usual scenario encountered during a resume after a suspend.
+ However, in the case of hibernation, since all the CPUs are completely
+ powered off, during restore it becomes necessary to apply the microcode
+ images to all the CPUs.
+
+ [Note that we don't expect someone to physically pull out nodes and insert
+ nodes with a different type of CPUs in-between a suspend-resume or a
+ hibernate/restore cycle.]
+
+ In the current design of the kernel however, during a CPU offline operation
+ as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
+ the existing copy of microcode image in the kernel is not freed up.
+ And during the CPU online operations (during resume/restore), since the
+ kernel finds that it already has copies of the microcode images for all the
+ CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
+ type/model and the need for validating whether the microcode revisions are
+ right for the CPUs or not (due to the above assumption that physical CPU
+ hotplug will not be done in-between suspend/resume or hibernate/restore
+ cycles).
+
+
+III. Known problems
+===================
+
+Are there any known problems when regular CPU hotplug and suspend race
+with each other?
+
+Yes, they are listed below:
+
+1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
+ the _cpu_down() and _cpu_up() functions is *always* 0.
+ This might not reflect the true current state of the system, since the
+ tasks could have been frozen by an out-of-band event such as a suspend
+ operation in progress. Hence, the cpuhp_tasks_frozen variable will not
+ reflect the frozen state and the CPU hotplug callbacks which evaluate
+ that variable might execute the wrong code path.
+
+2. If a regular CPU hotplug stress test happens to race with the freezer due
+ to a suspend operation in progress at the same time, then we could hit the
+ situation described below:
+
+ * A regular cpu online operation continues its journey from userspace
+ into the kernel, since the freezing has not yet begun.
+ * Then freezer gets to work and freezes userspace.
+ * If cpu online has not yet completed the microcode update stuff by now,
+ it will now start waiting on the frozen userspace in the
+ TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
+ * Now the freezer continues and tries to freeze the remaining tasks. But
+ due to this wait mentioned above, the freezer won't be able to freeze
+ the cpu online hotplug task and hence freezing of tasks fails.
+
+ As a result of this task freezing failure, the suspend operation gets
+ aborted.
diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt
deleted file mode 100644
index a8751b8df10e..000000000000
--- a/Documentation/power/suspend-and-cpuhotplug.txt
+++ /dev/null
@@ -1,274 +0,0 @@
-Interaction of Suspend code (S3) with the CPU hotplug infrastructure
-
- (C) 2011 - 2014 Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
-
-
-I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM
- infrastructure uses it internally? And where do they share common code?
-
-Well, a picture is worth a thousand words... So ASCII art follows :-)
-
-[This depicts the current design in the kernel, and focusses only on the
-interactions involving the freezer and CPU hotplug and also tries to explain
-the locking involved. It outlines the notifications involved as well.
-But please note that here, only the call paths are illustrated, with the aim
-of describing where they take different paths and where they share code.
-What happens when regular CPU hotplug and Suspend-to-RAM race with each other
-is not depicted here.]
-
-On a high level, the suspend-resume cycle goes like this:
-
-|Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw |
-|tasks | | cpus | | | | cpus | |tasks|
-
-
-More details follow:
-
- Suspend call path
- -----------------
-
- Write 'mem' to
- /sys/power/state
- sysfs file
- |
- v
- Acquire system_transition_mutex lock
- |
- v
- Send PM_SUSPEND_PREPARE
- notifications
- |
- v
- Freeze tasks
- |
- |
- v
- disable_nonboot_cpus()
- /* start */
- |
- v
- Acquire cpu_add_remove_lock
- |
- v
- Iterate over CURRENTLY
- online CPUs
- |
- |
- | ----------
- v | L
- ======> _cpu_down() |
- | [This takes cpuhotplug.lock |
- Common | before taking down the CPU |
- code | and releases it when done] | O
- | While it is at it, notifications |
- | are sent when notable events occur, |
- ======> by running all registered callbacks. |
- | | O
- | |
- | |
- v |
- Note down these cpus in | P
- frozen_cpus mask ----------
- |
- v
- Disable regular cpu hotplug
- by increasing cpu_hotplug_disabled
- |
- v
- Release cpu_add_remove_lock
- |
- v
- /* disable_nonboot_cpus() complete */
- |
- v
- Do suspend
-
-
-
-Resuming back is likewise, with the counterparts being (in the order of
-execution during resume):
-* enable_nonboot_cpus() which involves:
- | Acquire cpu_add_remove_lock
- | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug
- | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop]
- | Release cpu_add_remove_lock
- v
-
-* thaw tasks
-* send PM_POST_SUSPEND notifications
-* Release system_transition_mutex lock.
-
-
-It is to be noted here that the system_transition_mutex lock is acquired at the very
-beginning, when we are just starting out to suspend, and then released only
-after the entire cycle is complete (i.e., suspend + resume).
-
-
-
- Regular CPU hotplug call path
- -----------------------------
-
- Write 0 (or 1) to
- /sys/devices/system/cpu/cpu*/online
- sysfs file
- |
- |
- v
- cpu_down()
- |
- v
- Acquire cpu_add_remove_lock
- |
- v
- If cpu_hotplug_disabled > 0
- return gracefully
- |
- |
- v
- ======> _cpu_down()
- | [This takes cpuhotplug.lock
- Common | before taking down the CPU
- code | and releases it when done]
- | While it is at it, notifications
- | are sent when notable events occur,
- ======> by running all registered callbacks.
- |
- |
- v
- Release cpu_add_remove_lock
- [That's it!, for
- regular CPU hotplug]
-
-
-
-So, as can be seen from the two diagrams (the parts marked as "Common code"),
-regular CPU hotplug and the suspend code path converge at the _cpu_down() and
-_cpu_up() functions. They differ in the arguments passed to these functions,
-in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
-argument. But during suspend, since the tasks are already frozen by the time
-the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
-with the 'tasks_frozen' argument set to 1.
-[See below for some known issues regarding this.]
-
-
-Important files and functions/entry points:
-------------------------------------------
-
-kernel/power/process.c : freeze_processes(), thaw_processes()
-kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
-kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
-
-
-
-II. What are the issues involved in CPU hotplug?
- -------------------------------------------
-
-There are some interesting situations involving CPU hotplug and microcode
-update on the CPUs, as discussed below:
-
-[Please bear in mind that the kernel requests the microcode images from
-userspace, using the request_firmware() function defined in
-drivers/base/firmware_loader/main.c]
-
-
-a. When all the CPUs are identical:
-
- This is the most common situation and it is quite straightforward: we want
- to apply the same microcode revision to each of the CPUs.
- To give an example of x86, the collect_cpu_info() function defined in
- arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
- and thereby in applying the correct microcode revision to it.
- But note that the kernel does not maintain a common microcode image for the
- all CPUs, in order to handle case 'b' described below.
-
-
-b. When some of the CPUs are different than the rest:
-
- In this case since we probably need to apply different microcode revisions
- to different CPUs, the kernel maintains a copy of the correct microcode
- image for each CPU (after appropriate CPU type/model discovery using
- functions such as collect_cpu_info()).
-
-
-c. When a CPU is physically hot-unplugged and a new (and possibly different
- type of) CPU is hot-plugged into the system:
-
- In the current design of the kernel, whenever a CPU is taken offline during
- a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
- (which is sent by the CPU hotplug code), the microcode update driver's
- callback for that event reacts by freeing the kernel's copy of the
- microcode image for that CPU.
-
- Hence, when a new CPU is brought online, since the kernel finds that it
- doesn't have the microcode image, it does the CPU type/model discovery
- afresh and then requests the userspace for the appropriate microcode image
- for that CPU, which is subsequently applied.
-
- For example, in x86, the mc_cpu_callback() function (which is the microcode
- update driver's callback registered for CPU hotplug events) calls
- microcode_update_cpu() which would call microcode_init_cpu() in this case,
- instead of microcode_resume_cpu() when it finds that the kernel doesn't
- have a valid microcode image. This ensures that the CPU type/model
- discovery is performed and the right microcode is applied to the CPU after
- getting it from userspace.
-
-
-d. Handling microcode update during suspend/hibernate:
-
- Strictly speaking, during a CPU hotplug operation which does not involve
- physically removing or inserting CPUs, the CPUs are not actually powered
- off during a CPU offline. They are just put to the lowest C-states possible.
- Hence, in such a case, it is not really necessary to re-apply microcode
- when the CPUs are brought back online, since they wouldn't have lost the
- image during the CPU offline operation.
-
- This is the usual scenario encountered during a resume after a suspend.
- However, in the case of hibernation, since all the CPUs are completely
- powered off, during restore it becomes necessary to apply the microcode
- images to all the CPUs.
-
- [Note that we don't expect someone to physically pull out nodes and insert
- nodes with a different type of CPUs in-between a suspend-resume or a
- hibernate/restore cycle.]
-
- In the current design of the kernel however, during a CPU offline operation
- as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
- the existing copy of microcode image in the kernel is not freed up.
- And during the CPU online operations (during resume/restore), since the
- kernel finds that it already has copies of the microcode images for all the
- CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
- type/model and the need for validating whether the microcode revisions are
- right for the CPUs or not (due to the above assumption that physical CPU
- hotplug will not be done in-between suspend/resume or hibernate/restore
- cycles).
-
-
-III. Are there any known problems when regular CPU hotplug and suspend race
- with each other?
-
-Yes, they are listed below:
-
-1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
- the _cpu_down() and _cpu_up() functions is *always* 0.
- This might not reflect the true current state of the system, since the
- tasks could have been frozen by an out-of-band event such as a suspend
- operation in progress. Hence, the cpuhp_tasks_frozen variable will not
- reflect the frozen state and the CPU hotplug callbacks which evaluate
- that variable might execute the wrong code path.
-
-2. If a regular CPU hotplug stress test happens to race with the freezer due
- to a suspend operation in progress at the same time, then we could hit the
- situation described below:
-
- * A regular cpu online operation continues its journey from userspace
- into the kernel, since the freezing has not yet begun.
- * Then freezer gets to work and freezes userspace.
- * If cpu online has not yet completed the microcode update stuff by now,
- it will now start waiting on the frozen userspace in the
- TASK_UNINTERRUPTIBLE state, in order to get the microcode image.
- * Now the freezer continues and tries to freeze the remaining tasks. But
- due to this wait mentioned above, the freezer won't be able to freeze
- the cpu online hotplug task and hence freezing of tasks fails.
-
- As a result of this task freezing failure, the suspend operation gets
- aborted.
diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst
new file mode 100644
index 000000000000..4cda6617709a
--- /dev/null
+++ b/Documentation/power/suspend-and-interrupts.rst
@@ -0,0 +1,137 @@
+====================================
+System Suspend and Device Interrupts
+====================================
+
+Copyright (C) 2014 Intel Corp.
+Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+Suspending and Resuming Device IRQs
+-----------------------------------
+
+Device interrupt request lines (IRQs) are generally disabled during system
+suspend after the "late" phase of suspending devices (that is, after all of the
+->prepare, ->suspend and ->suspend_late callbacks have been executed for all
+devices). That is done by suspend_device_irqs().
+
+The rationale for doing so is that after the "late" phase of device suspend
+there is no legitimate reason why any interrupts from suspended devices should
+trigger and if any devices have not been suspended properly yet, it is better to
+block interrupts from them anyway. Also, in the past we had problems with
+interrupt handlers for shared IRQs that device drivers implementing them were
+not prepared for interrupts triggering after their devices had been suspended.
+In some cases they would attempt to access, for example, memory address spaces
+of suspended devices and cause unpredictable behavior to ensue as a result.
+Unfortunately, such problems are very difficult to debug and the introduction
+of suspend_device_irqs(), along with the "noirq" phase of device suspend and
+resume, was the only practical way to mitigate them.
+
+Device IRQs are re-enabled during system resume, right before the "early" phase
+of resuming devices (that is, before starting to execute ->resume_early
+callbacks for devices). The function doing that is resume_device_irqs().
+
+
+The IRQF_NO_SUSPEND Flag
+------------------------
+
+There are interrupts that can legitimately trigger during the entire system
+suspend-resume cycle, including the "noirq" phases of suspending and resuming
+devices as well as during the time when nonboot CPUs are taken offline and
+brought back online. That applies to timer interrupts in the first place,
+but also to IPIs and to some other special-purpose interrupts.
+
+The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
+requesting a special-purpose interrupt. It causes suspend_device_irqs() to
+leave the corresponding IRQ enabled so as to allow the interrupt to work as
+expected during the suspend-resume cycle, but does not guarantee that the
+interrupt will wake the system from a suspended state -- for such cases it is
+necessary to use enable_irq_wake().
+
+Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
+user of it. Thus, if the IRQ is shared, all of the interrupt handlers installed
+for it will be executed as usual after suspend_device_irqs(), even if the
+IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
+the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
+same time should be avoided.
+
+
+System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
+------------------------------------------------------------------
+
+System wakeup interrupts generally need to be configured to wake up the system
+from sleep states, especially if they are used for different purposes (e.g. as
+I/O interrupts) in the working state.
+
+That may involve turning on a special signal handling logic within the platform
+(such as an SoC) so that signals from a given line are routed in a different way
+during system sleep so as to trigger a system wakeup when needed. For example,
+the platform may include a dedicated interrupt controller used specifically for
+handling system wakeup events. Then, if a given interrupt line is supposed to
+wake up the system from sleep sates, the corresponding input of that interrupt
+controller needs to be enabled to receive signals from the line in question.
+After wakeup, it generally is better to disable that input to prevent the
+dedicated controller from triggering interrupts unnecessarily.
+
+The IRQ subsystem provides two helper functions to be used by device drivers for
+those purposes. Namely, enable_irq_wake() turns on the platform's logic for
+handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
+turns that logic off.
+
+Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
+in a special way. Namely, the IRQ remains enabled, by on the first interrupt
+it will be disabled, marked as pending and "suspended" so that it will be
+re-enabled by resume_device_irqs() during the subsequent system resume. Also
+the PM core is notified about the event which causes the system suspend in
+progress to be aborted (that doesn't have to happen immediately, but at one
+of the points where the suspend thread looks for pending wakeup events).
+
+This way every interrupt from a wakeup interrupt source will either cause the
+system suspend currently in progress to be aborted or wake up the system if
+already suspended. However, after suspend_device_irqs() interrupt handlers are
+not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND
+IRQs at that time, but those IRQs should not be configured for system wakeup
+using enable_irq_wake().
+
+
+Interrupts and Suspend-to-Idle
+------------------------------
+
+Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
+system sleep state that works by idling all of the processors and waiting for
+interrupts right after the "noirq" phase of suspending devices.
+
+Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
+set will bring CPUs out of idle while in that state, but they will not cause the
+IRQ subsystem to trigger a system wakeup.
+
+System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
+analogy with what they do in the full system suspend case. The only difference
+is that the wakeup from suspend-to-idle is signaled using the usual working
+state interrupt delivery mechanisms and doesn't require the platform to use
+any special interrupt handling logic for it to work.
+
+
+IRQF_NO_SUSPEND and enable_irq_wake()
+-------------------------------------
+
+There are very few valid reasons to use both enable_irq_wake() and the
+IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
+same device.
+
+First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
+interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
+directly at odds with the rules for handling system wakeup interrupts (interrupt
+handlers are not invoked after suspend_device_irqs()).
+
+Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
+to individual interrupt handlers, so sharing an IRQ between a system wakeup
+interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
+make sense.
+
+In rare cases an IRQ can be shared between a wakeup device driver and an
+IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
+must be able to discern spurious IRQs from genuine wakeup events (signalling
+the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
+ensure that the IRQ will function as a wakeup source, and must request the IRQ
+with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
+these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt
deleted file mode 100644
index 8afb29a8604a..000000000000
--- a/Documentation/power/suspend-and-interrupts.txt
+++ /dev/null
@@ -1,135 +0,0 @@
-System Suspend and Device Interrupts
-
-Copyright (C) 2014 Intel Corp.
-Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
-
-
-Suspending and Resuming Device IRQs
------------------------------------
-
-Device interrupt request lines (IRQs) are generally disabled during system
-suspend after the "late" phase of suspending devices (that is, after all of the
-->prepare, ->suspend and ->suspend_late callbacks have been executed for all
-devices). That is done by suspend_device_irqs().
-
-The rationale for doing so is that after the "late" phase of device suspend
-there is no legitimate reason why any interrupts from suspended devices should
-trigger and if any devices have not been suspended properly yet, it is better to
-block interrupts from them anyway. Also, in the past we had problems with
-interrupt handlers for shared IRQs that device drivers implementing them were
-not prepared for interrupts triggering after their devices had been suspended.
-In some cases they would attempt to access, for example, memory address spaces
-of suspended devices and cause unpredictable behavior to ensue as a result.
-Unfortunately, such problems are very difficult to debug and the introduction
-of suspend_device_irqs(), along with the "noirq" phase of device suspend and
-resume, was the only practical way to mitigate them.
-
-Device IRQs are re-enabled during system resume, right before the "early" phase
-of resuming devices (that is, before starting to execute ->resume_early
-callbacks for devices). The function doing that is resume_device_irqs().
-
-
-The IRQF_NO_SUSPEND Flag
-------------------------
-
-There are interrupts that can legitimately trigger during the entire system
-suspend-resume cycle, including the "noirq" phases of suspending and resuming
-devices as well as during the time when nonboot CPUs are taken offline and
-brought back online. That applies to timer interrupts in the first place,
-but also to IPIs and to some other special-purpose interrupts.
-
-The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
-requesting a special-purpose interrupt. It causes suspend_device_irqs() to
-leave the corresponding IRQ enabled so as to allow the interrupt to work as
-expected during the suspend-resume cycle, but does not guarantee that the
-interrupt will wake the system from a suspended state -- for such cases it is
-necessary to use enable_irq_wake().
-
-Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
-user of it. Thus, if the IRQ is shared, all of the interrupt handlers installed
-for it will be executed as usual after suspend_device_irqs(), even if the
-IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
-the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
-same time should be avoided.
-
-
-System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
-------------------------------------------------------------------
-
-System wakeup interrupts generally need to be configured to wake up the system
-from sleep states, especially if they are used for different purposes (e.g. as
-I/O interrupts) in the working state.
-
-That may involve turning on a special signal handling logic within the platform
-(such as an SoC) so that signals from a given line are routed in a different way
-during system sleep so as to trigger a system wakeup when needed. For example,
-the platform may include a dedicated interrupt controller used specifically for
-handling system wakeup events. Then, if a given interrupt line is supposed to
-wake up the system from sleep sates, the corresponding input of that interrupt
-controller needs to be enabled to receive signals from the line in question.
-After wakeup, it generally is better to disable that input to prevent the
-dedicated controller from triggering interrupts unnecessarily.
-
-The IRQ subsystem provides two helper functions to be used by device drivers for
-those purposes. Namely, enable_irq_wake() turns on the platform's logic for
-handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
-turns that logic off.
-
-Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
-in a special way. Namely, the IRQ remains enabled, by on the first interrupt
-it will be disabled, marked as pending and "suspended" so that it will be
-re-enabled by resume_device_irqs() during the subsequent system resume. Also
-the PM core is notified about the event which causes the system suspend in
-progress to be aborted (that doesn't have to happen immediately, but at one
-of the points where the suspend thread looks for pending wakeup events).
-
-This way every interrupt from a wakeup interrupt source will either cause the
-system suspend currently in progress to be aborted or wake up the system if
-already suspended. However, after suspend_device_irqs() interrupt handlers are
-not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND
-IRQs at that time, but those IRQs should not be configured for system wakeup
-using enable_irq_wake().
-
-
-Interrupts and Suspend-to-Idle
-------------------------------
-
-Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
-system sleep state that works by idling all of the processors and waiting for
-interrupts right after the "noirq" phase of suspending devices.
-
-Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
-set will bring CPUs out of idle while in that state, but they will not cause the
-IRQ subsystem to trigger a system wakeup.
-
-System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
-analogy with what they do in the full system suspend case. The only difference
-is that the wakeup from suspend-to-idle is signaled using the usual working
-state interrupt delivery mechanisms and doesn't require the platform to use
-any special interrupt handling logic for it to work.
-
-
-IRQF_NO_SUSPEND and enable_irq_wake()
--------------------------------------
-
-There are very few valid reasons to use both enable_irq_wake() and the
-IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
-same device.
-
-First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
-interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
-directly at odds with the rules for handling system wakeup interrupts (interrupt
-handlers are not invoked after suspend_device_irqs()).
-
-Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
-to individual interrupt handlers, so sharing an IRQ between a system wakeup
-interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
-make sense.
-
-In rare cases an IRQ can be shared between a wakeup device driver and an
-IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
-must be able to discern spurious IRQs from genuine wakeup events (signalling
-the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
-ensure that the IRQ will function as a wakeup source, and must request the IRQ
-with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
-these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/swsusp-and-swap-files.rst b/Documentation/power/swsusp-and-swap-files.rst
new file mode 100644
index 000000000000..a33a2919dbe4
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.rst
@@ -0,0 +1,63 @@
+===============================================
+Using swap files with software suspend (swsusp)
+===============================================
+
+ (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it. From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of appropriate filesystem driver. Unfortunately, however, it requires the
+filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk. For this reason to
+identify a swap file swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located. For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, eg.::
+
+ # dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+ # mkswap <swap_file_path>
+ # swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
+
+3) Add the following parameters to the kernel command line::
+
+ resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2) (of course, this step may be carried out automatically
+by the same application that determines the swap file's header offset using the
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.rst (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition. In particular, the swap file has to be active (ie. be present in
+/proc/swaps) so that it can be used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before. Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
deleted file mode 100644
index f281886de490..000000000000
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-Using swap files with software suspend (swsusp)
- (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-The Linux kernel handles swap files almost in the same way as it handles swap
-partitions and there are only two differences between these two types of swap
-areas:
-(1) swap files need not be contiguous,
-(2) the header of a swap file is not in the first block of the partition that
-holds it. From the swsusp's point of view (1) is not a problem, because it is
-already taken care of by the swap-handling code, but (2) has to be taken into
-consideration.
-
-In principle the location of a swap file's header may be determined with the
-help of appropriate filesystem driver. Unfortunately, however, it requires the
-filesystem holding the swap file to be mounted, and if this filesystem is
-journaled, it cannot be mounted during resume from disk. For this reason to
-identify a swap file swsusp uses the name of the partition that holds the file
-and the offset from the beginning of the partition at which the swap file's
-header is located. For convenience, this offset is expressed in <PAGE_SIZE>
-units.
-
-In order to use a swap file with swsusp, you need to:
-
-1) Create the swap file and make it active, eg.
-
-# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
-# mkswap <swap_file_path>
-# swapon <swap_file_path>
-
-2) Use an application that will bmap the swap file with the help of the
-FIBMAP ioctl and determine the location of the file's swap header, as the
-offset, in <PAGE_SIZE> units, from the beginning of the partition which
-holds the swap file.
-
-3) Add the following parameters to the kernel command line:
-
-resume=<swap_file_partition> resume_offset=<swap_file_offset>
-
-where <swap_file_partition> is the partition on which the swap file is located
-and <swap_file_offset> is the offset of the swap header determined by the
-application in 2) (of course, this step may be carried out automatically
-by the same application that determines the swap file's header offset using the
-FIBMAP ioctl)
-
-OR
-
-Use a userland suspend application that will set the partition and offset
-with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
-Documentation/power/userland-swsusp.txt (this is the only method to suspend
-to a swap file allowing the resume to be initiated from an initrd or initramfs
-image).
-
-Now, swsusp will use the swap file in the same way in which it would use a swap
-partition. In particular, the swap file has to be active (ie. be present in
-/proc/swaps) so that it can be used for suspending.
-
-Note that if the swap file used for suspending is deleted and recreated,
-the location of its header need not be the same as before. Thus every time
-this happens the value of the "resume_offset=" kernel command line parameter
-has to be updated.
diff --git a/Documentation/power/swsusp-dmcrypt.rst b/Documentation/power/swsusp-dmcrypt.rst
new file mode 100644
index 000000000000..426df59172cd
--- /dev/null
+++ b/Documentation/power/swsusp-dmcrypt.rst
@@ -0,0 +1,140 @@
+=======================================
+How to use dm-crypt and swsusp together
+=======================================
+
+Author: Andreas Steinmetz <ast@domdv.de>
+
+
+
+Some prerequisites:
+You know how dm-crypt works. If not, visit the following web page:
+http://www.saout.de/misc/dm-crypt/
+You have read Documentation/power/swsusp.rst and understand it.
+You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
+You know how to create or how to modify an initrd.
+
+Now your system is properly set up, your disk is encrypted except for
+the swap device(s) and the boot partition which may contain a mini
+system for crypto setup and/or rescue purposes. You may even have
+an initrd that does your current crypto setup already.
+
+At this point you want to encrypt your swap, too. Still you want to
+be able to suspend using swsusp. This, however, means that you
+have to be able to either enter a passphrase or that you read
+the key(s) from an external device like a pcmcia flash disk
+or an usb stick prior to resume. So you need an initrd, that sets
+up dm-crypt and then asks swsusp to resume from the encrypted
+swap device.
+
+The most important thing is that you set up dm-crypt in such
+a way that the swap device you suspend to/resume from has
+always the same major/minor within the initrd as well as
+within your running system. The easiest way to achieve this is
+to always set up this swap device first with dmsetup, so that
+it will always look like the following::
+
+ brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
+
+Now set up your kernel to use /dev/mapper/swap0 as the default
+resume partition, so your kernel .config contains::
+
+ CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
+
+Prepare your boot loader to use the initrd you will create or
+modify. For lilo the simplest setup looks like the following
+lines::
+
+ image=/boot/vmlinuz
+ initrd=/boot/initrd.gz
+ label=linux
+ append="root=/dev/ram0 init=/linuxrc rw"
+
+Finally you need to create or modify your initrd. Lets assume
+you create an initrd that reads the required dm-crypt setup
+from a pcmcia flash disk card. The card is formatted with an ext2
+fs which resides on /dev/hde1 when the card is inserted. The
+card contains at least the encrypted swap setup in a file
+named "swapkey". /etc/fstab of your initrd contains something
+like the following::
+
+ /dev/hda1 /mnt ext3 ro 0 0
+ none /proc proc defaults,noatime,nodiratime 0 0
+ none /sys sysfs defaults,noatime,nodiratime 0 0
+
+/dev/hda1 contains an unencrypted mini system that sets up all
+of your crypto devices, again by reading the setup from the
+pcmcia flash disk. What follows now is a /linuxrc for your
+initrd that allows you to resume from encrypted swap and that
+continues boot with your mini system on /dev/hda1 if resume
+does not happen::
+
+ #!/bin/sh
+ PATH=/sbin:/bin:/usr/sbin:/usr/bin
+ mount /proc
+ mount /sys
+ mapped=0
+ noresume=`grep -c noresume /proc/cmdline`
+ if [ "$*" != "" ]
+ then
+ noresume=1
+ fi
+ dmesg -n 1
+ /sbin/cardmgr -q
+ for i in 1 2 3 4 5 6 7 8 9 0
+ do
+ if [ -f /proc/ide/hde/media ]
+ then
+ usleep 500000
+ mount -t ext2 -o ro /dev/hde1 /mnt
+ if [ -f /mnt/swapkey ]
+ then
+ dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
+ fi
+ umount /mnt
+ break
+ fi
+ usleep 500000
+ done
+ killproc /sbin/cardmgr
+ dmesg -n 6
+ if [ $mapped = 1 ]
+ then
+ if [ $noresume != 0 ]
+ then
+ mkswap /dev/mapper/swap0 > /dev/null 2>&1
+ fi
+ echo 254:0 > /sys/power/resume
+ dmsetup remove swap0
+ fi
+ umount /sys
+ mount /mnt
+ umount /proc
+ cd /mnt
+ pivot_root . mnt
+ mount /proc
+ umount -l /mnt
+ umount /proc
+ exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
+
+Please don't mind the weird loop above, busybox's msh doesn't know
+the let statement. Now, what is happening in the script?
+First we have to decide if we want to try to resume, or not.
+We will not resume if booting with "noresume" or any parameters
+for init like "single" or "emergency" as boot parameters.
+
+Then we need to set up dmcrypt with the setup data from the
+pcmcia flash disk. If this succeeds we need to reset the swap
+device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
+then attempts to resume from the first device mapper device.
+Note that it is important to set the device in /sys/power/resume,
+regardless if resuming or not, otherwise later suspend will fail.
+If resume starts, script execution terminates here.
+
+Otherwise we just remove the encrypted swap device and leave it to the
+mini system on /dev/hda1 to set the whole crypto up (it is up to
+you to modify this to your taste).
+
+What then follows is the well known process to change the root
+file system and continue booting from there. I prefer to unmount
+the initrd prior to continue booting but it is up to you to modify
+this.
diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt
deleted file mode 100644
index b802fbfd95ef..000000000000
--- a/Documentation/power/swsusp-dmcrypt.txt
+++ /dev/null
@@ -1,138 +0,0 @@
-Author: Andreas Steinmetz <ast@domdv.de>
-
-
-How to use dm-crypt and swsusp together:
-========================================
-
-Some prerequisites:
-You know how dm-crypt works. If not, visit the following web page:
-http://www.saout.de/misc/dm-crypt/
-You have read Documentation/power/swsusp.txt and understand it.
-You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
-You know how to create or how to modify an initrd.
-
-Now your system is properly set up, your disk is encrypted except for
-the swap device(s) and the boot partition which may contain a mini
-system for crypto setup and/or rescue purposes. You may even have
-an initrd that does your current crypto setup already.
-
-At this point you want to encrypt your swap, too. Still you want to
-be able to suspend using swsusp. This, however, means that you
-have to be able to either enter a passphrase or that you read
-the key(s) from an external device like a pcmcia flash disk
-or an usb stick prior to resume. So you need an initrd, that sets
-up dm-crypt and then asks swsusp to resume from the encrypted
-swap device.
-
-The most important thing is that you set up dm-crypt in such
-a way that the swap device you suspend to/resume from has
-always the same major/minor within the initrd as well as
-within your running system. The easiest way to achieve this is
-to always set up this swap device first with dmsetup, so that
-it will always look like the following:
-
-brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
-
-Now set up your kernel to use /dev/mapper/swap0 as the default
-resume partition, so your kernel .config contains:
-
-CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
-
-Prepare your boot loader to use the initrd you will create or
-modify. For lilo the simplest setup looks like the following
-lines:
-
-image=/boot/vmlinuz
-initrd=/boot/initrd.gz
-label=linux
-append="root=/dev/ram0 init=/linuxrc rw"
-
-Finally you need to create or modify your initrd. Lets assume
-you create an initrd that reads the required dm-crypt setup
-from a pcmcia flash disk card. The card is formatted with an ext2
-fs which resides on /dev/hde1 when the card is inserted. The
-card contains at least the encrypted swap setup in a file
-named "swapkey". /etc/fstab of your initrd contains something
-like the following:
-
-/dev/hda1 /mnt ext3 ro 0 0
-none /proc proc defaults,noatime,nodiratime 0 0
-none /sys sysfs defaults,noatime,nodiratime 0 0
-
-/dev/hda1 contains an unencrypted mini system that sets up all
-of your crypto devices, again by reading the setup from the
-pcmcia flash disk. What follows now is a /linuxrc for your
-initrd that allows you to resume from encrypted swap and that
-continues boot with your mini system on /dev/hda1 if resume
-does not happen:
-
-#!/bin/sh
-PATH=/sbin:/bin:/usr/sbin:/usr/bin
-mount /proc
-mount /sys
-mapped=0
-noresume=`grep -c noresume /proc/cmdline`
-if [ "$*" != "" ]
-then
- noresume=1
-fi
-dmesg -n 1
-/sbin/cardmgr -q
-for i in 1 2 3 4 5 6 7 8 9 0
-do
- if [ -f /proc/ide/hde/media ]
- then
- usleep 500000
- mount -t ext2 -o ro /dev/hde1 /mnt
- if [ -f /mnt/swapkey ]
- then
- dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1
- fi
- umount /mnt
- break
- fi
- usleep 500000
-done
-killproc /sbin/cardmgr
-dmesg -n 6
-if [ $mapped = 1 ]
-then
- if [ $noresume != 0 ]
- then
- mkswap /dev/mapper/swap0 > /dev/null 2>&1
- fi
- echo 254:0 > /sys/power/resume
- dmsetup remove swap0
-fi
-umount /sys
-mount /mnt
-umount /proc
-cd /mnt
-pivot_root . mnt
-mount /proc
-umount -l /mnt
-umount /proc
-exec chroot . /sbin/init $* < dev/console > dev/console 2>&1
-
-Please don't mind the weird loop above, busybox's msh doesn't know
-the let statement. Now, what is happening in the script?
-First we have to decide if we want to try to resume, or not.
-We will not resume if booting with "noresume" or any parameters
-for init like "single" or "emergency" as boot parameters.
-
-Then we need to set up dmcrypt with the setup data from the
-pcmcia flash disk. If this succeeds we need to reset the swap
-device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
-then attempts to resume from the first device mapper device.
-Note that it is important to set the device in /sys/power/resume,
-regardless if resuming or not, otherwise later suspend will fail.
-If resume starts, script execution terminates here.
-
-Otherwise we just remove the encrypted swap device and leave it to the
-mini system on /dev/hda1 to set the whole crypto up (it is up to
-you to modify this to your taste).
-
-What then follows is the well known process to change the root
-file system and continue booting from there. I prefer to unmount
-the initrd prior to continue booting but it is up to you to modify
-this.
diff --git a/Documentation/power/swsusp.rst b/Documentation/power/swsusp.rst
new file mode 100644
index 000000000000..d000312f6965
--- /dev/null
+++ b/Documentation/power/swsusp.rst
@@ -0,0 +1,501 @@
+============
+Swap suspend
+============
+
+Some warnings, first.
+
+.. warning::
+
+ **BIG FAT WARNING**
+
+ If you touch anything on disk between suspend and resume...
+ ...kiss your data goodbye.
+
+ If you do resume from initrd after your filesystems are mounted...
+ ...bye bye root partition.
+
+ [this is actually same case as above]
+
+ If you have unsupported ( ) devices using DMA, you may have some
+ problems. If your disk driver does not support suspend... (IDE does),
+ it may cause some problems, too. If you change kernel command line
+ between suspend and resume, it may do something wrong. If you change
+ your hardware while system is suspended... well, it was not good idea;
+ but it will probably only crash.
+
+ ( ) suspend/resume support is needed to make it safe.
+
+ If you have any filesystems on USB devices mounted before software suspend,
+ they won't be accessible after resume and you may lose data, as though
+ you have unplugged the USB devices with mounted filesystems on them;
+ see the FAQ below for details. (This is not true for more traditional
+ power states like "standby", which normally don't turn USB off.)
+
+Swap partition:
+ You need to append resume=/dev/your_swap_partition to kernel command
+ line or specify it using /sys/power/resume.
+
+Swap file:
+ If using a swapfile you can also specify a resume offset using
+ resume_offset=<number> on the kernel command line or specify it
+ in /sys/power/resume_offset.
+
+After preparing then you suspend by::
+
+ echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+- If you feel ACPI works pretty well on your system, you might try::
+
+ echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+- If you would like to write hibernation image to swap and then suspend
+ to RAM (provided your platform supports it), you can try::
+
+ echo suspend > /sys/power/disk; echo disk > /sys/power/state
+
+- If you have SATA disks, you'll need recent kernels with SATA suspend
+ support. For suspend and resume to work, make sure your disk drivers
+ are built into kernel -- not modules. [There's way to make
+ suspend/resume with modular disk drivers, see FAQ, but you probably
+ should not do that.]
+
+If you want to limit the suspend image size to N bytes, do::
+
+ echo N > /sys/power/image_size
+
+before suspend (it is limited to around 2/5 of available RAM by default).
+
+- The resume process checks for the presence of the resume device,
+ if found, it then checks the contents for the hibernation image signature.
+ If both are found, it resumes the hibernation image.
+
+- The resume process may be triggered in two ways:
+
+ 1) During lateinit: If resume=/dev/your_swap_partition is specified on
+ the kernel command line, lateinit runs the resume process. If the
+ resume device has not been probed yet, the resume process fails and
+ bootup continues.
+ 2) Manually from an initrd or initramfs: May be run from
+ the init script by using the /sys/power/resume file. It is vital
+ that this be done prior to remounting any filesystems (even as
+ read-only) otherwise data may be corrupted.
+
+Article about goals and implementation of Software Suspend for Linux
+====================================================================
+
+Author: Gábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+-------------------------
+
+Nowadays it is common in several laptops that they have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. Later resuming the machine the saved state is loaded back to
+ram and the machine can continue its work. It has two real benefits. First we
+save ourselves the time machine goes down and later boots up, energy costs
+are real high when running from batteries. The other gain is that we don't have
+to interrupt our programs so processes that are calculating something for a long
+time shouldn't need to be written interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powerdowns. You must explicitly specify the swap partition to resume from with
+`resume=` kernel option. If signature is found it loads and restores saved
+state. If the option `noresume` is specified as a boot parameter, it skips
+the resuming. If the option `hibernate=nocompress` is specified as a boot
+parameter, it saves hibernation image without compression.
+
+In the meantime while the system is suspended you should not add/remove any
+of the hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use, /proc/acpi should
+work like this:
+
+In a really perfect world::
+
+ echo 1 > /proc/acpi/sleep # for standby
+ echo 2 > /proc/acpi/sleep # for suspend to ram
+ echo 3 > /proc/acpi/sleep # for suspend to ram, but with more power conservative
+ echo 4 > /proc/acpi/sleep # for suspend to disk
+ echo 5 > /proc/acpi/sleep # for shutdown unfriendly the system
+
+and perhaps::
+
+ echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q:
+ well, suspending a server is IMHO a really stupid thing,
+ but... (Diego Zuccato):
+
+A:
+ You bought new UPS for your server. How do you install it without
+ bringing machine down? Suspend to disk, rearrange power cables,
+ resume.
+
+ You have your server on UPS. Power died, and UPS is indicating 30
+ seconds to failure. What do you do? Suspend to disk.
+
+
+Q:
+ Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A:
+ We do use the regular I/O paths. However we cannot restore the data
+ to its original location as we load it. That would create an
+ inconsistent kernel state which would certainly result in an oops.
+ Instead, we load the image into unused memory and then atomically copy
+ it back to it original location. This implies, of course, a maximum
+ image size of half the amount of memory.
+
+ There are two solutions to this:
+
+ * require half of memory to be free during suspend. That way you can
+ read "new" data onto free spots, then cli and copy
+
+ * assume we had special "polling" ide driver that only uses memory
+ between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+ during suspending, but otherwise it would work...
+
+ suspend2 shares this fundamental limitation, but does not include user
+ data and disk caches into "used memory" by saving them in
+ advance. That means that the limitation goes away in practice.
+
+Q:
+ Does linux support ACPI S4?
+
+A:
+ Yes. That's what echo platform > /sys/power/disk does.
+
+Q:
+ What is 'suspend2'?
+
+A:
+ suspend2 is 'Software Suspend 2', a forked implementation of
+ suspend-to-disk which is available as separate patches for 2.4 and 2.6
+ kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+ highmem and preemption. It also has a extensible architecture that
+ allows for arbitrary transformations on the image (compression,
+ encryption) and arbitrary backends for writing the image (eg to swap
+ or an NFS share[Work In Progress]). Questions regarding suspend2
+ should be sent to the mailing list available through the suspend2
+ website, and not to the Linux Kernel Mailing List. We are working
+ toward merging suspend2 into the mainline kernel.
+
+Q:
+ What is the freezing of tasks and why are we using it?
+
+A:
+ The freezing of tasks is a mechanism by which user space processes and some
+ kernel threads are controlled during hibernation or system-wide suspend (on some
+ architectures). See freezing-of-tasks.txt for details.
+
+Q:
+ What is the difference between "platform" and "shutdown"?
+
+A:
+ shutdown:
+ save state in linux, then tell bios to powerdown
+
+ platform:
+ save state in linux, then tell bios to powerdown and blink
+ "suspended led"
+
+ "platform" is actually right thing to do where supported, but
+ "shutdown" is most reliable (except on ACPI systems).
+
+Q:
+ I do not understand why you have such strong objections to idea of
+ selective suspend.
+
+A:
+ Do selective suspend during runtime power management, that's okay. But
+ it's useless for suspend-to-disk. (And I do not see how you could use
+ it for suspend-to-ram, I hope you do not want that).
+
+ Lets see, so you suggest to
+
+ * SUSPEND all but swap device and parents
+ * Snapshot
+ * Write image to disk
+ * SUSPEND swap device and parents
+ * Powerdown
+
+ Oh no, that does not work, if swap device or its parents uses DMA,
+ you've corrupted data. You'd have to do
+
+ * SUSPEND all but swap device and parents
+ * FREEZE swap device and parents
+ * Snapshot
+ * UNFREEZE swap device and parents
+ * Write
+ * SUSPEND swap device and parents
+
+ Which means that you still need that FREEZE state, and you get more
+ complicated code. (And I have not yet introduce details like system
+ devices).
+
+Q:
+ There don't seem to be any generally useful behavioral
+ distinctions between SUSPEND and FREEZE.
+
+A:
+ Doing SUSPEND when you are asked to do FREEZE is always correct,
+ but it may be unnecessarily slow. If you want your driver to stay simple,
+ slowness may not matter to you. It can always be fixed later.
+
+ For devices like disk it does matter, you do not want to spindown for
+ FREEZE.
+
+Q:
+ After resuming, system is paging heavily, leading to very bad interactivity.
+
+A:
+ Try running::
+
+ cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
+ do
+ test -f "$file" && cat "$file" > /dev/null
+ done
+
+ after resume. swapoff -a; swapon -a may also be useful.
+
+Q:
+ What happens to devices during swsusp? They seem to be resumed
+ during system suspend?
+
+A:
+ That's correct. We need to resume them if we want to write image to
+ disk. Whole sequence goes like
+
+ **Suspend part**
+
+ running system, user asks for suspend-to-disk
+
+ user processes are stopped
+
+ suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+ with state snapshot
+
+ state snapshot: copy of whole used memory is taken with interrupts disabled
+
+ resume(): devices are woken up so that we can write image to swap
+
+ write image to swap
+
+ suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+ turn the power off
+
+ **Resume part**
+
+ (is actually pretty similar)
+
+ running system, user asks for suspend-to-disk
+
+ user processes are stopped (in common case there are none,
+ but with resume-from-initrd, no one knows)
+
+ read image from disk
+
+ suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+ with image restoration
+
+ image restoration: rewrite memory with image
+
+ resume(): devices are woken up so that system can continue
+
+ thaw all user processes
+
+Q:
+ What is this 'Encrypt suspend image' for?
+
+A:
+ First of all: it is not a replacement for dm-crypt encrypted swap.
+ It cannot protect your computer while it is suspended. Instead it does
+ protect from leaking sensitive data after resume from suspend.
+
+ Think of the following: you suspend while an application is running
+ that keeps sensitive data in memory. The application itself prevents
+ the data from being swapped out. Suspend, however, must write these
+ data to swap to be able to resume later on. Without suspend encryption
+ your sensitive data are then stored in plaintext on disk. This means
+ that after resume your sensitive data are accessible to all
+ applications having direct access to the swap device which was used
+ for suspend. If you don't need swap after resume these data can remain
+ on disk virtually forever. Thus it can happen that your system gets
+ broken in weeks later and sensitive data which you thought were
+ encrypted and protected are retrieved and stolen from the swap device.
+ To prevent this situation you should use 'Encrypt suspend image'.
+
+ During suspend a temporary key is created and this key is used to
+ encrypt the data written to disk. When, during resume, the data was
+ read back into memory the temporary key is destroyed which simply
+ means that all data written to disk during suspend are then
+ inaccessible so they can't be stolen later on. The only thing that
+ you must then take care of is that you call 'mkswap' for the swap
+ partition used for suspend as early as possible during regular
+ boot. This asserts that any temporary key from an oopsed suspend or
+ from a failed or aborted resume is erased from the swap device.
+
+ As a rule of thumb use encrypted swap to protect your data while your
+ system is shut down or suspended. Additionally use the encrypted
+ suspend image to prevent sensitive data from being stolen after
+ resume.
+
+Q:
+ Can I suspend to a swap file?
+
+A:
+ Generally, yes, you can. However, it requires you to use the "resume=" and
+ "resume_offset=" kernel command line parameters, so the resume from a swap file
+ cannot be initiated from an initrd or initramfs image. See
+ swsusp-and-swap-files.txt for details.
+
+Q:
+ Is there a maximum system RAM size that is supported by swsusp?
+
+A:
+ It should work okay with highmem.
+
+Q:
+ Does swsusp (to disk) use only one swap partition or can it use
+ multiple swap partitions (aggregate them into one logical space)?
+
+A:
+ Only one swap partition, sorry.
+
+Q:
+ If my application(s) causes lots of memory & swap space to be used
+ (over half of the total system RAM), is it correct that it is likely
+ to be useless to try to suspend to disk while that app is running?
+
+A:
+ No, it should work okay, as long as your app does not mlock()
+ it. Just prepare big enough swap partition.
+
+Q:
+ What information is useful for debugging suspend-to-disk problems?
+
+A:
+ Well, last messages on the screen are always useful. If something
+ is broken, it is usually some kernel driver, therefore trying with as
+ little as possible modules loaded helps a lot. I also prefer people to
+ suspend from console, preferably without X running. Booting with
+ init=/bin/bash, then swapon and starting suspend sequence manually
+ usually does the trick. Then it is good idea to try with latest
+ vanilla kernel.
+
+Q:
+ How can distributions ship a swsusp-supporting kernel with modular
+ disk drivers (especially SATA)?
+
+A:
+ Well, it can be done, load the drivers, then do echo into
+ /sys/power/resume file from initrd. Be sure not to mount
+ anything, not even read-only mount, or you are going to lose your
+ data.
+
+Q:
+ How do I make suspend more verbose?
+
+A:
+ If you want to see any non-error kernel messages on the virtual
+ terminal the kernel switches to during suspend, you have to set the
+ kernel console loglevel to at least 4 (KERN_WARNING), for example by
+ doing::
+
+ # save the old loglevel
+ read LOGLEVEL DUMMY < /proc/sys/kernel/printk
+ # set the loglevel so we see the progress bar.
+ # if the level is higher than needed, we leave it alone.
+ if [ $LOGLEVEL -lt 5 ]; then
+ echo 5 > /proc/sys/kernel/printk
+ fi
+
+ IMG_SZ=0
+ read IMG_SZ < /sys/power/image_size
+ echo -n disk > /sys/power/state
+ RET=$?
+ #
+ # the logic here is:
+ # if image_size > 0 (without kernel support, IMG_SZ will be zero),
+ # then try again with image_size set to zero.
+ if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
+ echo 0 > /sys/power/image_size
+ echo -n disk > /sys/power/state
+ RET=$?
+ fi
+
+ # restore previous loglevel
+ echo $LOGLEVEL > /proc/sys/kernel/printk
+ exit $RET
+
+Q:
+ Is this true that if I have a mounted filesystem on a USB device and
+ I suspend to disk, I can lose data unless the filesystem has been mounted
+ with "sync"?
+
+A:
+ That's right ... if you disconnect that device, you may lose data.
+ In fact, even with "-o sync" you can lose data if your programs have
+ information in buffers they haven't written out to a disk you disconnect,
+ or if you disconnect before the device finished saving data you wrote.
+
+ Software suspend normally powers down USB controllers, which is equivalent
+ to disconnecting all USB devices attached to your system.
+
+ Your system might well support low-power modes for its USB controllers
+ while the system is asleep, maintaining the connection, using true sleep
+ modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the
+ /sys/power/state file; write "standby" or "mem".) We've not seen any
+ hardware that can use these modes through software suspend, although in
+ theory some systems might support "platform" modes that won't break the
+ USB connections.
+
+ Remember that it's always a bad idea to unplug a disk drive containing a
+ mounted filesystem. That's true even when your system is asleep! The
+ safest thing is to unmount all filesystems on removable media (such USB,
+ Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
+ before suspending; then remount them after resuming.
+
+ There is a work-around for this problem. For more information, see
+ Documentation/driver-api/usb/persist.rst.
+
+Q:
+ Can I suspend-to-disk using a swap partition under LVM?
+
+A:
+ Yes and No. You can suspend successfully, but the kernel will not be able
+ to resume on its own. You need an initramfs that can recognize the resume
+ situation, activate the logical volume containing the swap volume (but not
+ touch any filesystems!), and eventually call::
+
+ echo -n "$major:$minor" > /sys/power/resume
+
+ where $major and $minor are the respective major and minor device numbers of
+ the swap volume.
+
+ uswsusp works with LVM, too. See http://suspend.sourceforge.net/
+
+Q:
+ I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
+ compiled with the similar configuration files. Anyway I found that
+ suspend to disk (and resume) is much slower on 2.6.16 compared to
+ 2.6.15. Any idea for why that might happen or how can I speed it up?
+
+A:
+ This is because the size of the suspend image is now greater than
+ for 2.6.15 (by saving more data we can get more responsive system
+ after resume).
+
+ There's the /sys/power/image_size knob that controls the size of the
+ image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
+ root), the 2.6.15 behavior should be restored. If it is still too
+ slow, take a look at suspend.sf.net -- userland suspend is faster and
+ supports LZF compression to speed it up further.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
deleted file mode 100644
index 236d1fb13640..000000000000
--- a/Documentation/power/swsusp.txt
+++ /dev/null
@@ -1,446 +0,0 @@
-Some warnings, first.
-
- * BIG FAT WARNING *********************************************************
- *
- * If you touch anything on disk between suspend and resume...
- * ...kiss your data goodbye.
- *
- * If you do resume from initrd after your filesystems are mounted...
- * ...bye bye root partition.
- * [this is actually same case as above]
- *
- * If you have unsupported (*) devices using DMA, you may have some
- * problems. If your disk driver does not support suspend... (IDE does),
- * it may cause some problems, too. If you change kernel command line
- * between suspend and resume, it may do something wrong. If you change
- * your hardware while system is suspended... well, it was not good idea;
- * but it will probably only crash.
- *
- * (*) suspend/resume support is needed to make it safe.
- *
- * If you have any filesystems on USB devices mounted before software suspend,
- * they won't be accessible after resume and you may lose data, as though
- * you have unplugged the USB devices with mounted filesystems on them;
- * see the FAQ below for details. (This is not true for more traditional
- * power states like "standby", which normally don't turn USB off.)
-
-Swap partition:
-You need to append resume=/dev/your_swap_partition to kernel command
-line or specify it using /sys/power/resume.
-
-Swap file:
-If using a swapfile you can also specify a resume offset using
-resume_offset=<number> on the kernel command line or specify it
-in /sys/power/resume_offset.
-
-After preparing then you suspend by
-
-echo shutdown > /sys/power/disk; echo disk > /sys/power/state
-
-. If you feel ACPI works pretty well on your system, you might try
-
-echo platform > /sys/power/disk; echo disk > /sys/power/state
-
-. If you would like to write hibernation image to swap and then suspend
-to RAM (provided your platform supports it), you can try
-
-echo suspend > /sys/power/disk; echo disk > /sys/power/state
-
-. If you have SATA disks, you'll need recent kernels with SATA suspend
-support. For suspend and resume to work, make sure your disk drivers
-are built into kernel -- not modules. [There's way to make
-suspend/resume with modular disk drivers, see FAQ, but you probably
-should not do that.]
-
-If you want to limit the suspend image size to N bytes, do
-
-echo N > /sys/power/image_size
-
-before suspend (it is limited to around 2/5 of available RAM by default).
-
-. The resume process checks for the presence of the resume device,
-if found, it then checks the contents for the hibernation image signature.
-If both are found, it resumes the hibernation image.
-
-. The resume process may be triggered in two ways:
- 1) During lateinit: If resume=/dev/your_swap_partition is specified on
- the kernel command line, lateinit runs the resume process. If the
- resume device has not been probed yet, the resume process fails and
- bootup continues.
- 2) Manually from an initrd or initramfs: May be run from
- the init script by using the /sys/power/resume file. It is vital
- that this be done prior to remounting any filesystems (even as
- read-only) otherwise data may be corrupted.
-
-Article about goals and implementation of Software Suspend for Linux
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Author: Gábor Kuti
-Last revised: 2003-10-20 by Pavel Machek
-
-Idea and goals to achieve
-
-Nowadays it is common in several laptops that they have a suspend button. It
-saves the state of the machine to a filesystem or to a partition and switches
-to standby mode. Later resuming the machine the saved state is loaded back to
-ram and the machine can continue its work. It has two real benefits. First we
-save ourselves the time machine goes down and later boots up, energy costs
-are real high when running from batteries. The other gain is that we don't have to
-interrupt our programs so processes that are calculating something for a long
-time shouldn't need to be written interruptible.
-
-swsusp saves the state of the machine into active swaps and then reboots or
-powerdowns. You must explicitly specify the swap partition to resume from with
-``resume='' kernel option. If signature is found it loads and restores saved
-state. If the option ``noresume'' is specified as a boot parameter, it skips
-the resuming. If the option ``hibernate=nocompress'' is specified as a boot
-parameter, it saves hibernation image without compression.
-
-In the meantime while the system is suspended you should not add/remove any
-of the hardware, write to the filesystems, etc.
-
-Sleep states summary
-====================
-
-There are three different interfaces you can use, /proc/acpi should
-work like this:
-
-In a really perfect world:
-echo 1 > /proc/acpi/sleep # for standby
-echo 2 > /proc/acpi/sleep # for suspend to ram
-echo 3 > /proc/acpi/sleep # for suspend to ram, but with more power conservative
-echo 4 > /proc/acpi/sleep # for suspend to disk
-echo 5 > /proc/acpi/sleep # for shutdown unfriendly the system
-
-and perhaps
-echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios
-
-Frequently Asked Questions
-==========================
-
-Q: well, suspending a server is IMHO a really stupid thing,
-but... (Diego Zuccato):
-
-A: You bought new UPS for your server. How do you install it without
-bringing machine down? Suspend to disk, rearrange power cables,
-resume.
-
-You have your server on UPS. Power died, and UPS is indicating 30
-seconds to failure. What do you do? Suspend to disk.
-
-
-Q: Maybe I'm missing something, but why don't the regular I/O paths work?
-
-A: We do use the regular I/O paths. However we cannot restore the data
-to its original location as we load it. That would create an
-inconsistent kernel state which would certainly result in an oops.
-Instead, we load the image into unused memory and then atomically copy
-it back to it original location. This implies, of course, a maximum
-image size of half the amount of memory.
-
-There are two solutions to this:
-
-* require half of memory to be free during suspend. That way you can
-read "new" data onto free spots, then cli and copy
-
-* assume we had special "polling" ide driver that only uses memory
-between 0-640KB. That way, I'd have to make sure that 0-640KB is free
-during suspending, but otherwise it would work...
-
-suspend2 shares this fundamental limitation, but does not include user
-data and disk caches into "used memory" by saving them in
-advance. That means that the limitation goes away in practice.
-
-Q: Does linux support ACPI S4?
-
-A: Yes. That's what echo platform > /sys/power/disk does.
-
-Q: What is 'suspend2'?
-
-A: suspend2 is 'Software Suspend 2', a forked implementation of
-suspend-to-disk which is available as separate patches for 2.4 and 2.6
-kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
-highmem and preemption. It also has a extensible architecture that
-allows for arbitrary transformations on the image (compression,
-encryption) and arbitrary backends for writing the image (eg to swap
-or an NFS share[Work In Progress]). Questions regarding suspend2
-should be sent to the mailing list available through the suspend2
-website, and not to the Linux Kernel Mailing List. We are working
-toward merging suspend2 into the mainline kernel.
-
-Q: What is the freezing of tasks and why are we using it?
-
-A: The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures). See freezing-of-tasks.txt for details.
-
-Q: What is the difference between "platform" and "shutdown"?
-
-A:
-
-shutdown: save state in linux, then tell bios to powerdown
-
-platform: save state in linux, then tell bios to powerdown and blink
- "suspended led"
-
-"platform" is actually right thing to do where supported, but
-"shutdown" is most reliable (except on ACPI systems).
-
-Q: I do not understand why you have such strong objections to idea of
-selective suspend.
-
-A: Do selective suspend during runtime power management, that's okay. But
-it's useless for suspend-to-disk. (And I do not see how you could use
-it for suspend-to-ram, I hope you do not want that).
-
-Lets see, so you suggest to
-
-* SUSPEND all but swap device and parents
-* Snapshot
-* Write image to disk
-* SUSPEND swap device and parents
-* Powerdown
-
-Oh no, that does not work, if swap device or its parents uses DMA,
-you've corrupted data. You'd have to do
-
-* SUSPEND all but swap device and parents
-* FREEZE swap device and parents
-* Snapshot
-* UNFREEZE swap device and parents
-* Write
-* SUSPEND swap device and parents
-
-Which means that you still need that FREEZE state, and you get more
-complicated code. (And I have not yet introduce details like system
-devices).
-
-Q: There don't seem to be any generally useful behavioral
-distinctions between SUSPEND and FREEZE.
-
-A: Doing SUSPEND when you are asked to do FREEZE is always correct,
-but it may be unnecessarily slow. If you want your driver to stay simple,
-slowness may not matter to you. It can always be fixed later.
-
-For devices like disk it does matter, you do not want to spindown for
-FREEZE.
-
-Q: After resuming, system is paging heavily, leading to very bad interactivity.
-
-A: Try running
-
-cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
-do
- test -f "$file" && cat "$file" > /dev/null
-done
-
-after resume. swapoff -a; swapon -a may also be useful.
-
-Q: What happens to devices during swsusp? They seem to be resumed
-during system suspend?
-
-A: That's correct. We need to resume them if we want to write image to
-disk. Whole sequence goes like
-
- Suspend part
- ~~~~~~~~~~~~
- running system, user asks for suspend-to-disk
-
- user processes are stopped
-
- suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
- with state snapshot
-
- state snapshot: copy of whole used memory is taken with interrupts disabled
-
- resume(): devices are woken up so that we can write image to swap
-
- write image to swap
-
- suspend(PMSG_SUSPEND): suspend devices so that we can power off
-
- turn the power off
-
- Resume part
- ~~~~~~~~~~~
- (is actually pretty similar)
-
- running system, user asks for suspend-to-disk
-
- user processes are stopped (in common case there are none, but with resume-from-initrd, no one knows)
-
- read image from disk
-
- suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
- with image restoration
-
- image restoration: rewrite memory with image
-
- resume(): devices are woken up so that system can continue
-
- thaw all user processes
-
-Q: What is this 'Encrypt suspend image' for?
-
-A: First of all: it is not a replacement for dm-crypt encrypted swap.
-It cannot protect your computer while it is suspended. Instead it does
-protect from leaking sensitive data after resume from suspend.
-
-Think of the following: you suspend while an application is running
-that keeps sensitive data in memory. The application itself prevents
-the data from being swapped out. Suspend, however, must write these
-data to swap to be able to resume later on. Without suspend encryption
-your sensitive data are then stored in plaintext on disk. This means
-that after resume your sensitive data are accessible to all
-applications having direct access to the swap device which was used
-for suspend. If you don't need swap after resume these data can remain
-on disk virtually forever. Thus it can happen that your system gets
-broken in weeks later and sensitive data which you thought were
-encrypted and protected are retrieved and stolen from the swap device.
-To prevent this situation you should use 'Encrypt suspend image'.
-
-During suspend a temporary key is created and this key is used to
-encrypt the data written to disk. When, during resume, the data was
-read back into memory the temporary key is destroyed which simply
-means that all data written to disk during suspend are then
-inaccessible so they can't be stolen later on. The only thing that
-you must then take care of is that you call 'mkswap' for the swap
-partition used for suspend as early as possible during regular
-boot. This asserts that any temporary key from an oopsed suspend or
-from a failed or aborted resume is erased from the swap device.
-
-As a rule of thumb use encrypted swap to protect your data while your
-system is shut down or suspended. Additionally use the encrypted
-suspend image to prevent sensitive data from being stolen after
-resume.
-
-Q: Can I suspend to a swap file?
-
-A: Generally, yes, you can. However, it requires you to use the "resume=" and
-"resume_offset=" kernel command line parameters, so the resume from a swap file
-cannot be initiated from an initrd or initramfs image. See
-swsusp-and-swap-files.txt for details.
-
-Q: Is there a maximum system RAM size that is supported by swsusp?
-
-A: It should work okay with highmem.
-
-Q: Does swsusp (to disk) use only one swap partition or can it use
-multiple swap partitions (aggregate them into one logical space)?
-
-A: Only one swap partition, sorry.
-
-Q: If my application(s) causes lots of memory & swap space to be used
-(over half of the total system RAM), is it correct that it is likely
-to be useless to try to suspend to disk while that app is running?
-
-A: No, it should work okay, as long as your app does not mlock()
-it. Just prepare big enough swap partition.
-
-Q: What information is useful for debugging suspend-to-disk problems?
-
-A: Well, last messages on the screen are always useful. If something
-is broken, it is usually some kernel driver, therefore trying with as
-little as possible modules loaded helps a lot. I also prefer people to
-suspend from console, preferably without X running. Booting with
-init=/bin/bash, then swapon and starting suspend sequence manually
-usually does the trick. Then it is good idea to try with latest
-vanilla kernel.
-
-Q: How can distributions ship a swsusp-supporting kernel with modular
-disk drivers (especially SATA)?
-
-A: Well, it can be done, load the drivers, then do echo into
-/sys/power/resume file from initrd. Be sure not to mount
-anything, not even read-only mount, or you are going to lose your
-data.
-
-Q: How do I make suspend more verbose?
-
-A: If you want to see any non-error kernel messages on the virtual
-terminal the kernel switches to during suspend, you have to set the
-kernel console loglevel to at least 4 (KERN_WARNING), for example by
-doing
-
- # save the old loglevel
- read LOGLEVEL DUMMY < /proc/sys/kernel/printk
- # set the loglevel so we see the progress bar.
- # if the level is higher than needed, we leave it alone.
- if [ $LOGLEVEL -lt 5 ]; then
- echo 5 > /proc/sys/kernel/printk
- fi
-
- IMG_SZ=0
- read IMG_SZ < /sys/power/image_size
- echo -n disk > /sys/power/state
- RET=$?
- #
- # the logic here is:
- # if image_size > 0 (without kernel support, IMG_SZ will be zero),
- # then try again with image_size set to zero.
- if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
- echo 0 > /sys/power/image_size
- echo -n disk > /sys/power/state
- RET=$?
- fi
-
- # restore previous loglevel
- echo $LOGLEVEL > /proc/sys/kernel/printk
- exit $RET
-
-Q: Is this true that if I have a mounted filesystem on a USB device and
-I suspend to disk, I can lose data unless the filesystem has been mounted
-with "sync"?
-
-A: That's right ... if you disconnect that device, you may lose data.
-In fact, even with "-o sync" you can lose data if your programs have
-information in buffers they haven't written out to a disk you disconnect,
-or if you disconnect before the device finished saving data you wrote.
-
-Software suspend normally powers down USB controllers, which is equivalent
-to disconnecting all USB devices attached to your system.
-
-Your system might well support low-power modes for its USB controllers
-while the system is asleep, maintaining the connection, using true sleep
-modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the
-/sys/power/state file; write "standby" or "mem".) We've not seen any
-hardware that can use these modes through software suspend, although in
-theory some systems might support "platform" modes that won't break the
-USB connections.
-
-Remember that it's always a bad idea to unplug a disk drive containing a
-mounted filesystem. That's true even when your system is asleep! The
-safest thing is to unmount all filesystems on removable media (such USB,
-Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
-before suspending; then remount them after resuming.
-
-There is a work-around for this problem. For more information, see
-Documentation/driver-api/usb/persist.rst.
-
-Q: Can I suspend-to-disk using a swap partition under LVM?
-
-A: Yes and No. You can suspend successfully, but the kernel will not be able
-to resume on its own. You need an initramfs that can recognize the resume
-situation, activate the logical volume containing the swap volume (but not
-touch any filesystems!), and eventually call
-
-echo -n "$major:$minor" > /sys/power/resume
-
-where $major and $minor are the respective major and minor device numbers of
-the swap volume.
-
-uswsusp works with LVM, too. See http://suspend.sourceforge.net/
-
-Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
-compiled with the similar configuration files. Anyway I found that
-suspend to disk (and resume) is much slower on 2.6.16 compared to
-2.6.15. Any idea for why that might happen or how can I speed it up?
-
-A: This is because the size of the suspend image is now greater than
-for 2.6.15 (by saving more data we can get more responsive system
-after resume).
-
-There's the /sys/power/image_size knob that controls the size of the
-image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
-root), the 2.6.15 behavior should be restored. If it is still too
-slow, take a look at suspend.sf.net -- userland suspend is faster and
-supports LZF compression to speed it up further.
diff --git a/Documentation/power/tricks.rst b/Documentation/power/tricks.rst
new file mode 100644
index 000000000000..ca787f142c3f
--- /dev/null
+++ b/Documentation/power/tricks.rst
@@ -0,0 +1,29 @@
+================
+swsusp/S3 tricks
+================
+
+Pavel Machek <pavel@ucw.cz>
+
+If you want to trick swsusp/S3 into working, you might want to try:
+
+* go with minimal config, turn off drivers like USB, AGP you don't
+ really need
+
+* turn off APIC and preempt
+
+* use ext2. At least it has working fsck. [If something seems to go
+ wrong, force fsck when you have a chance]
+
+* turn off modules
+
+* use vga text console, shut down X. [If you really want X, you might
+ want to try vesafb later]
+
+* try running as few processes as possible, preferably go to single
+ user mode.
+
+* due to video issues, swsusp should be easier to get working than
+ S3. Try that first.
+
+When you make it work, try to find out what exactly was it that broke
+suspend, and preferably fix that.
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
deleted file mode 100644
index a1b8f7249f4c..000000000000
--- a/Documentation/power/tricks.txt
+++ /dev/null
@@ -1,27 +0,0 @@
- swsusp/S3 tricks
- ~~~~~~~~~~~~~~~~
-Pavel Machek <pavel@ucw.cz>
-
-If you want to trick swsusp/S3 into working, you might want to try:
-
-* go with minimal config, turn off drivers like USB, AGP you don't
- really need
-
-* turn off APIC and preempt
-
-* use ext2. At least it has working fsck. [If something seems to go
- wrong, force fsck when you have a chance]
-
-* turn off modules
-
-* use vga text console, shut down X. [If you really want X, you might
- want to try vesafb later]
-
-* try running as few processes as possible, preferably go to single
- user mode.
-
-* due to video issues, swsusp should be easier to get working than
- S3. Try that first.
-
-When you make it work, try to find out what exactly was it that broke
-suspend, and preferably fix that.
diff --git a/Documentation/power/userland-swsusp.rst b/Documentation/power/userland-swsusp.rst
new file mode 100644
index 000000000000..a0fa51bb1a4d
--- /dev/null
+++ b/Documentation/power/userland-swsusp.rst
@@ -0,0 +1,191 @@
+=====================================================
+Documentation for userland software suspend interface
+=====================================================
+
+ (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.txt still apply.
+
+Second, you should read the FAQ in swsusp.txt _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel. Such utilities are available, for example, from
+<http://suspend.sourceforge.net>. You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in include/linux/suspend_ioctls.h . The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing. If open for
+reading, it is considered to be in the suspend mode. Otherwise it is
+assumed to be in the resume mode. The device cannot be open for simultaneous
+reading and writing. It is also impossible to have the device open more than
+once at a time.
+
+Even opening the device has side effects. Data structures are
+allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
+called.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE
+ freeze user space processes (the current process is
+ not frozen); this is required for SNAPSHOT_CREATE_IMAGE
+ and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE
+ thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_CREATE_IMAGE
+ create a snapshot of the system memory; the
+ last argument of ioctl() should be a pointer to an int variable,
+ the value of which will indicate whether the call returned after
+ creating the snapshot (1) or after restoring the system memory state
+ from it (0) (after resume the system finds itself finishing the
+ SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
+ has been created the read() operation can be used to transfer
+ it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE
+ restore the system memory state from the
+ uploaded snapshot image; before calling it you should transfer
+ the system memory snapshot back to the kernel using the write()
+ operation; this call will not succeed if the snapshot
+ image is not available to the kernel
+
+SNAPSHOT_FREE
+ free memory allocated for the snapshot image
+
+SNAPSHOT_PREF_IMAGE_SIZE
+ set the preferred maximum size of the image
+ (the kernel will do its best to ensure the image size will not exceed
+ this number, but if it turns out to be impossible, the kernel will
+ create the smallest image possible)
+
+SNAPSHOT_GET_IMAGE_SIZE
+ return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE
+ return the amount of available swap in bytes (the
+ last argument should be a pointer to an unsigned int variable that will
+ contain the result if the call is successful).
+
+SNAPSHOT_ALLOC_SWAP_PAGE
+ allocate a swap page from the resume partition
+ (the last argument should be a pointer to a loff_t variable that
+ will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES
+ free all swap pages allocated by
+ SNAPSHOT_ALLOC_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_AREA
+ set the resume partition and the offset (in <PAGE_SIZE>
+ units) from the beginning of the partition at which the swap header is
+ located (the last ioctl() argument should point to a struct
+ resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
+ containing the resume device specification and the offset); for swap
+ partitions the offset is always 0, but it is different from zero for
+ swap files (see Documentation/power/swsusp-and-swap-files.rst for
+ details).
+
+SNAPSHOT_PLATFORM_SUPPORT
+ enable/disable the hibernation platform support,
+ depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF
+ make the kernel transition the system to the hibernation
+ state (eg. ACPI S4) using the platform (eg. ACPI) driver
+
+SNAPSHOT_S2RAM
+ suspend to RAM; using this call causes the kernel to
+ immediately enter the suspend-to-RAM state, so this call must always
+ be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+ to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call
+ is needed to implement the suspend-to-both mechanism in which the
+ suspend image is first created, as though the system had been suspended
+ to disk, and then the system is suspended to RAM (this makes it possible
+ to resume the system from RAM if there's enough battery power or restore
+ its state on the basis of the saved suspend image otherwise)
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel. It has the following limitations:
+
+- you cannot read() more than one virtual memory page at a time
+- read()s across page boundaries are impossible (ie. if you read() 1/2 of
+ a page in the previous call, you will only be able to read()
+ **at most** 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel. It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap partition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file). However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
+mounted afterwards.
+
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image. The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read). Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header. If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+
+1. If the value is 1 (ie. the system memory snapshot has just been
+ created and the system is ready for saving it):
+
+ (a) The suspending utility MUST NOT close the snapshot device
+ _unless_ the whole suspend procedure is to be cancelled, in
+ which case, if the snapshot image has already been saved, the
+ suspending utility SHOULD destroy it, preferably by zapping
+ its header. If the suspend is not to be cancelled, the
+ system MUST be powered off or rebooted after the snapshot
+ image has been saved.
+ (b) The suspending utility SHOULD NOT attempt to perform any
+ file system operations (including reads) on the file systems
+ that were mounted before SNAPSHOT_CREATE_IMAGE has been
+ called. However, it MAY mount a file system that was not
+ mounted at that time and perform some operations on it (eg.
+ use it for saving the image).
+
+2. If the value is 0 (ie. the system state has just been restored from
+ the snapshot image), the suspending utility MUST close the snapshot
+ device. Afterwards it will be treated as a regular userland process,
+ so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
deleted file mode 100644
index bbfcd1bbedc5..000000000000
--- a/Documentation/power/userland-swsusp.txt
+++ /dev/null
@@ -1,170 +0,0 @@
-Documentation for userland software suspend interface
- (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-First, the warnings at the beginning of swsusp.txt still apply.
-
-Second, you should read the FAQ in swsusp.txt _now_ if you have not
-done it already.
-
-Now, to use the userland interface for software suspend you need special
-utilities that will read/write the system memory snapshot from/to the
-kernel. Such utilities are available, for example, from
-<http://suspend.sourceforge.net>. You may want to have a look at them if you
-are going to develop your own suspend/resume utilities.
-
-The interface consists of a character device providing the open(),
-release(), read(), and write() operations as well as several ioctl()
-commands defined in include/linux/suspend_ioctls.h . The major and minor
-numbers of the device are, respectively, 10 and 231, and they can
-be read from /sys/class/misc/snapshot/dev.
-
-The device can be open either for reading or for writing. If open for
-reading, it is considered to be in the suspend mode. Otherwise it is
-assumed to be in the resume mode. The device cannot be open for simultaneous
-reading and writing. It is also impossible to have the device open more than
-once at a time.
-
-Even opening the device has side effects. Data structures are
-allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
-called.
-
-The ioctl() commands recognized by the device are:
-
-SNAPSHOT_FREEZE - freeze user space processes (the current process is
- not frozen); this is required for SNAPSHOT_CREATE_IMAGE
- and SNAPSHOT_ATOMIC_RESTORE to succeed
-
-SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
-
-SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the
- last argument of ioctl() should be a pointer to an int variable,
- the value of which will indicate whether the call returned after
- creating the snapshot (1) or after restoring the system memory state
- from it (0) (after resume the system finds itself finishing the
- SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
- has been created the read() operation can be used to transfer
- it out of the kernel
-
-SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
- uploaded snapshot image; before calling it you should transfer
- the system memory snapshot back to the kernel using the write()
- operation; this call will not succeed if the snapshot
- image is not available to the kernel
-
-SNAPSHOT_FREE - free memory allocated for the snapshot image
-
-SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image
- (the kernel will do its best to ensure the image size will not exceed
- this number, but if it turns out to be impossible, the kernel will
- create the smallest image possible)
-
-SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image
-
-SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the
- last argument should be a pointer to an unsigned int variable that will
- contain the result if the call is successful).
-
-SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition
- (the last argument should be a pointer to a loff_t variable that
- will contain the swap page offset if the call is successful)
-
-SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by
- SNAPSHOT_ALLOC_SWAP_PAGE
-
-SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in <PAGE_SIZE>
- units) from the beginning of the partition at which the swap header is
- located (the last ioctl() argument should point to a struct
- resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
- containing the resume device specification and the offset); for swap
- partitions the offset is always 0, but it is different from zero for
- swap files (see Documentation/power/swsusp-and-swap-files.txt for
- details).
-
-SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support,
- depending on the argument value (enable, if the argument is nonzero)
-
-SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation
- state (eg. ACPI S4) using the platform (eg. ACPI) driver
-
-SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to
- immediately enter the suspend-to-RAM state, so this call must always
- be preceded by the SNAPSHOT_FREEZE call and it is also necessary
- to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call
- is needed to implement the suspend-to-both mechanism in which the
- suspend image is first created, as though the system had been suspended
- to disk, and then the system is suspended to RAM (this makes it possible
- to resume the system from RAM if there's enough battery power or restore
- its state on the basis of the saved suspend image otherwise)
-
-The device's read() operation can be used to transfer the snapshot image from
-the kernel. It has the following limitations:
-- you cannot read() more than one virtual memory page at a time
-- read()s across page boundaries are impossible (ie. if you read() 1/2 of
- a page in the previous call, you will only be able to read()
- _at_ _most_ 1/2 of the page in the next call)
-
-The device's write() operation is used for uploading the system memory snapshot
-into the kernel. It has the same limitations as the read() operation.
-
-The release() operation frees all memory allocated for the snapshot image
-and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
-Thus it is not necessary to use either SNAPSHOT_FREE or
-SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
-unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
-still frozen when the device is being closed).
-
-Currently it is assumed that the userland utilities reading/writing the
-snapshot image from/to the kernel will use a swap partition, called the resume
-partition, or a swap file as storage space (if a swap file is used, the resume
-partition is the partition that holds this file). However, this is not really
-required, as they can use, for example, a special (blank) suspend partition or
-a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
-mounted afterwards.
-
-These utilities MUST NOT make any assumptions regarding the ordering of
-data within the snapshot image. The contents of the image are entirely owned
-by the kernel and its structure may be changed in future kernel releases.
-
-The snapshot image MUST be written to the kernel unaltered (ie. all of the image
-data, metadata and header MUST be written in _exactly_ the same amount, form
-and order in which they have been read). Otherwise, the behavior of the
-resumed system may be totally unpredictable.
-
-While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
-structure of the snapshot image is consistent with the information stored
-in the image header. If any inconsistencies are detected,
-SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof
-mechanism and the userland utilities using the interface SHOULD use additional
-means, such as checksums, to ensure the integrity of the snapshot image.
-
-The suspending and resuming utilities MUST lock themselves in memory,
-preferably using mlockall(), before calling SNAPSHOT_FREEZE.
-
-The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
-in the memory location pointed to by the last argument of ioctl() and proceed
-in accordance with it:
-1. If the value is 1 (ie. the system memory snapshot has just been
- created and the system is ready for saving it):
- (a) The suspending utility MUST NOT close the snapshot device
- _unless_ the whole suspend procedure is to be cancelled, in
- which case, if the snapshot image has already been saved, the
- suspending utility SHOULD destroy it, preferably by zapping
- its header. If the suspend is not to be cancelled, the
- system MUST be powered off or rebooted after the snapshot
- image has been saved.
- (b) The suspending utility SHOULD NOT attempt to perform any
- file system operations (including reads) on the file systems
- that were mounted before SNAPSHOT_CREATE_IMAGE has been
- called. However, it MAY mount a file system that was not
- mounted at that time and perform some operations on it (eg.
- use it for saving the image).
-2. If the value is 0 (ie. the system state has just been restored from
- the snapshot image), the suspending utility MUST close the snapshot
- device. Afterwards it will be treated as a regular userland process,
- so it need not exit.
-
-The resuming utility SHOULD NOT attempt to mount any file systems that could
-be mounted before suspend and SHOULD NOT attempt to perform any operations
-involving such file systems.
-
-For details, please refer to the source code.
diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst
new file mode 100644
index 000000000000..337a2ba9f32f
--- /dev/null
+++ b/Documentation/power/video.rst
@@ -0,0 +1,213 @@
+===========================
+Video issues with S3 resume
+===========================
+
+2003-2006, Pavel Machek
+
+During S3 resume, hardware needs to be reinitialized. For most
+devices, this is easy, and kernel driver knows how to do
+it. Unfortunately there's one exception: video card. Those are usually
+initialized by BIOS, and kernel does not have enough information to
+boot video card. (Kernel usually does not even contain video card
+driver -- vesafb and vgacon are widely used).
+
+This is not problem for swsusp, because during swsusp resume, BIOS is
+run normally so video card is normally initialized. It should not be
+problem for S1 standby, because hardware should retain its state over
+that.
+
+We either have to run video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is necessary on particular
+system because video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+Userland application called s2ram has been developed; it contains long
+whitelist of systems, and automatically selects working method for a
+given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you get a system that is not in the
+whitelist, please try to find a working solution, and submit whitelist
+entry so that work does not need to be repeated.
+
+Currently, VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
+There are a few types of systems where video works after S3 resume:
+
+(1) systems where video state is preserved over S3.
+
+(2) systems where it is possible to call the video BIOS during S3
+ resume. Unfortunately, it is not correct to call the video BIOS at
+ that point, but it happens to work on some machines. Use
+ acpi_sleep=s3_bios.
+
+(3) systems that initialize video card into vga text mode and where
+ the BIOS works well enough to be able to set video mode. Use
+ acpi_sleep=s3_mode on these.
+
+(4) on some systems s3_bios kicks video into text mode, and
+ acpi_sleep=s3_bios,s3_mode is needed.
+
+(5) radeon systems, where X can soft-boot your video card. You'll need
+ a new enough X, and a plain text console (no vesafb or radeonfb). See
+ http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
+ Alternatively, you should use vbetool (6) instead.
+
+(6) other radeon systems, where vbetool is enough to bring system back
+ to life. It needs text console to be working. Do vbetool vbestate
+ save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
+ vbestate restore < /tmp/delme; setfont <whatever>, and your video
+ should work.
+
+(7) on some systems, it is possible to boot most of kernel, and then
+ POSTing bios works. Ole Rohne has patch to do just that at
+ http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
+
+(8) on some systems, you can use the video_post utility and or
+ do echo 3 > /sys/power/state && /usr/sbin/video_post - which will
+ initialize the display in console mode. If you are in X, you can switch
+ to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get
+ the display working in graphical mode again.
+
+Now, if you pass acpi_sleep=something, and it does not work with your
+bios, you'll get a hard crash during resume. Be careful. Also it is
+safest to do your experiments with plain old VGA console. The vesafb
+and radeonfb (etc) drivers have a tendency to crash the machine during
+resume.
+
+You may have a system where none of above works. At that point you
+either invent another ugly hack that works, or write proper driver for
+your video card (good luck getting docs :-(). Maybe suspending from X
+(proper X, knowing your hardware, not XF68_FBcon) might have better
+chance of working.
+
+Table of known working notebooks:
+
+
+=============================== ===============================================
+Model hack (or "how to do it")
+=============================== ===============================================
+Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI
+Acer TM 230 s3_bios (2)
+Acer TM 242FX vbetool (6)
+Acer TM C110 video_post (8)
+Acer TM C300 vga=normal (only suspend on console, not in X),
+ vbetool (6) or video_post (8)
+Acer TM 4052LCi s3_bios (2)
+Acer TM 636Lci s3_bios,s3_mode (4)
+Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text
+ console back
+Acer TM 660 ??? [#f1]_
+Acer TM 800 vga=normal, X patches, see webpage (5)
+ or vbetool (6)
+Acer TM 803 vga=normal, X patches, see webpage (5)
+ or vbetool (6)
+Acer TM 803LCi vga=normal, vbetool (6)
+Arima W730a vbetool needed (6)
+Asus L2400D s3_mode (3) [#f2]_ (S1 also works OK)
+Asus L3350M (SiS 740) (6)
+Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK)
+Asus M6887Ne vga=normal, s3_bios (2), use radeon driver
+ instead of fglrx in x.org
+Athlon64 desktop prototype s3_bios (2)
+Compal CL-50 ??? [#f1]_
+Compaq Armada E500 - P3-700 none (1) (S1 also works OK)
+Compaq Evo N620c vga=normal, s3_bios (2)
+Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1
+Dell D600, ATI RV250 vga=normal and X, or try vbestate (6)
+Dell D610 vga=normal and X (possibly vbestate (6) too,
+ but not tested)
+Dell Inspiron 4000 ??? [#f1]_
+Dell Inspiron 500m ??? [#f1]_
+Dell Inspiron 510m ???
+Dell Inspiron 5150 vbetool needed (6)
+Dell Inspiron 600m ??? [#f1]_
+Dell Inspiron 8200 ??? [#f1]_
+Dell Inspiron 8500 ??? [#f1]_
+Dell Inspiron 8600 ??? [#f1]_
+eMachines athlon64 machines vbetool needed (6) (someone please get
+ me model #s)
+HP NC6000 s3_bios, may not use radeonfb (2);
+ or vbetool (6)
+HP NX7000 ??? [#f1]_
+HP Pavilion ZD7000 vbetool post needed, need open-source nv
+ driver for X
+HP Omnibook XE3 athlon version none (1)
+HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV
+HP Omnibook XE3L-GF vbetool (6)
+HP Omnibook 5150 none (1), (S1 also works OK)
+IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294
+ Savage/IX-MV, vesafb gets "interesting"
+ but X work.
+IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with
+ BIOS 1.04 2002-08-23, but not at all with
+ BIOS 1.11 2004-11-05 :-(]
+IBM TP R32 / Type 2658-MMG none (1)
+IBM TP R40 2722B3G ??? [#f1]_
+IBM TP R50p / Type 1832-22U s3_bios (2)
+IBM TP R51 none (1)
+IBM TP T30 236681A ??? [#f1]_
+IBM TP T40 / Type 2373-MU4 none (1)
+IBM TP T40p none (1)
+IBM TP R40p s3_bios (2)
+IBM TP T41p s3_bios (2), switch to X after resume
+IBM TP T42 s3_bios (2)
+IBM ThinkPad T42p (2373-GTG) s3_bios (2)
+IBM TP X20 ??? [#f1]_
+IBM TP X30 s3_bios, s3_mode (4)
+IBM TP X31 / Type 2672-XXH none (1), use radeontool
+ (http://fdd.com/software/radeon/) to
+ turn off backlight.
+IBM TP X32 none (1), but backlight is on and video is
+ trashed after long suspend. s3_bios,
+ s3_mode (4) works too. Perhaps that gets
+ better results?
+IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4)
+IBM TP 600e none(1), but a switch to console and
+ back to X is needed
+Medion MD4220 ??? [#f1]_
+Samsung P35 vbetool needed (6)
+Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off
+Sony Vaio PCG-C1VRX/K s3_bios (2)
+Sony Vaio PCG-F403 ??? [#f1]_
+Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver
+Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use
+ radeontool (http://fdd.com/software/radeon/)
+ to turn off backlight.
+Sony Vaio PCG-N505SN ??? [#f1]_
+Sony Vaio vgn-s260 X or boot-radeon can init it (5)
+Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will
+ be blank unless you return to X.
+Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4)
+Toshiba Libretto L5 none (1)
+Toshiba Libretto 100CT/110CT vbetool (6)
+Toshiba Portege 3020CT s3_mode (3)
+Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK)
+Toshiba Satellite 4090XCDT ??? [#f1]_
+Toshiba Satellite P10-554 s3_bios,s3_mode (4)[#f3]_
+Toshiba M30 (2) xor X with nvidia driver using internal AGP
+Uniwill 244IIO ??? [#f1]_
+=============================== ===============================================
+
+Known working desktop systems
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+=================== ============================= ========================
+Mainboard Graphics card hack (or "how to do it")
+=================== ============================= ========================
+Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4)
+=================== ============================= ========================
+
+
+.. [#f1] from https://wiki.ubuntu.com/HoaryPMResults, not sure
+ which options to use. If you know, please tell me.
+
+.. [#f2] To be tested with a newer kernel.
+
+.. [#f3] Not with SMP kernel, UP only.
diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt
deleted file mode 100644
index 3e6272bc4472..000000000000
--- a/Documentation/power/video.txt
+++ /dev/null
@@ -1,185 +0,0 @@
-
- Video issues with S3 resume
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
- 2003-2006, Pavel Machek
-
-During S3 resume, hardware needs to be reinitialized. For most
-devices, this is easy, and kernel driver knows how to do
-it. Unfortunately there's one exception: video card. Those are usually
-initialized by BIOS, and kernel does not have enough information to
-boot video card. (Kernel usually does not even contain video card
-driver -- vesafb and vgacon are widely used).
-
-This is not problem for swsusp, because during swsusp resume, BIOS is
-run normally so video card is normally initialized. It should not be
-problem for S1 standby, because hardware should retain its state over
-that.
-
-We either have to run video BIOS during early resume, or interpret it
-using vbetool later, or maybe nothing is necessary on particular
-system because video state is preserved. Unfortunately different
-methods work on different systems, and no known method suits all of
-them.
-
-Userland application called s2ram has been developed; it contains long
-whitelist of systems, and automatically selects working method for a
-given system. It can be downloaded from CVS at
-www.sf.net/projects/suspend . If you get a system that is not in the
-whitelist, please try to find a working solution, and submit whitelist
-entry so that work does not need to be repeated.
-
-Currently, VBE_SAVE method (6 below) works on most
-systems. Unfortunately, vbetool only runs after userland is resumed,
-so it makes debugging of early resume problems
-hard/impossible. Methods that do not rely on userland are preferable.
-
-Details
-~~~~~~~
-
-There are a few types of systems where video works after S3 resume:
-
-(1) systems where video state is preserved over S3.
-
-(2) systems where it is possible to call the video BIOS during S3
- resume. Unfortunately, it is not correct to call the video BIOS at
- that point, but it happens to work on some machines. Use
- acpi_sleep=s3_bios.
-
-(3) systems that initialize video card into vga text mode and where
- the BIOS works well enough to be able to set video mode. Use
- acpi_sleep=s3_mode on these.
-
-(4) on some systems s3_bios kicks video into text mode, and
- acpi_sleep=s3_bios,s3_mode is needed.
-
-(5) radeon systems, where X can soft-boot your video card. You'll need
- a new enough X, and a plain text console (no vesafb or radeonfb). See
- http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
- Alternatively, you should use vbetool (6) instead.
-
-(6) other radeon systems, where vbetool is enough to bring system back
- to life. It needs text console to be working. Do vbetool vbestate
- save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
- vbestate restore < /tmp/delme; setfont <whatever>, and your video
- should work.
-
-(7) on some systems, it is possible to boot most of kernel, and then
- POSTing bios works. Ole Rohne has patch to do just that at
- http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2.
-
-(8) on some systems, you can use the video_post utility and or
- do echo 3 > /sys/power/state && /usr/sbin/video_post - which will
- initialize the display in console mode. If you are in X, you can switch
- to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get
- the display working in graphical mode again.
-
-Now, if you pass acpi_sleep=something, and it does not work with your
-bios, you'll get a hard crash during resume. Be careful. Also it is
-safest to do your experiments with plain old VGA console. The vesafb
-and radeonfb (etc) drivers have a tendency to crash the machine during
-resume.
-
-You may have a system where none of above works. At that point you
-either invent another ugly hack that works, or write proper driver for
-your video card (good luck getting docs :-(). Maybe suspending from X
-(proper X, knowing your hardware, not XF68_FBcon) might have better
-chance of working.
-
-Table of known working notebooks:
-
-Model hack (or "how to do it")
-------------------------------------------------------------------------------
-Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI
-Acer TM 230 s3_bios (2)
-Acer TM 242FX vbetool (6)
-Acer TM C110 video_post (8)
-Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8)
-Acer TM 4052LCi s3_bios (2)
-Acer TM 636Lci s3_bios,s3_mode (4)
-Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back
-Acer TM 660 ??? (*)
-Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803 vga=normal, X patches, see webpage (5) or vbetool (6)
-Acer TM 803LCi vga=normal, vbetool (6)
-Arima W730a vbetool needed (6)
-Asus L2400D s3_mode (3)(***) (S1 also works OK)
-Asus L3350M (SiS 740) (6)
-Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK)
-Asus M6887Ne vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org
-Athlon64 desktop prototype s3_bios (2)
-Compal CL-50 ??? (*)
-Compaq Armada E500 - P3-700 none (1) (S1 also works OK)
-Compaq Evo N620c vga=normal, s3_bios (2)
-Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1
-Dell D600, ATI RV250 vga=normal and X, or try vbestate (6)
-Dell D610 vga=normal and X (possibly vbestate (6) too, but not tested)
-Dell Inspiron 4000 ??? (*)
-Dell Inspiron 500m ??? (*)
-Dell Inspiron 510m ???
-Dell Inspiron 5150 vbetool needed (6)
-Dell Inspiron 600m ??? (*)
-Dell Inspiron 8200 ??? (*)
-Dell Inspiron 8500 ??? (*)
-Dell Inspiron 8600 ??? (*)
-eMachines athlon64 machines vbetool needed (6) (someone please get me model #s)
-HP NC6000 s3_bios, may not use radeonfb (2); or vbetool (6)
-HP NX7000 ??? (*)
-HP Pavilion ZD7000 vbetool post needed, need open-source nv driver for X
-HP Omnibook XE3 athlon version none (1)
-HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV
-HP Omnibook XE3L-GF vbetool (6)
-HP Omnibook 5150 none (1), (S1 also works OK)
-IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work.
-IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(]
-IBM TP R32 / Type 2658-MMG none (1)
-IBM TP R40 2722B3G ??? (*)
-IBM TP R50p / Type 1832-22U s3_bios (2)
-IBM TP R51 none (1)
-IBM TP T30 236681A ??? (*)
-IBM TP T40 / Type 2373-MU4 none (1)
-IBM TP T40p none (1)
-IBM TP R40p s3_bios (2)
-IBM TP T41p s3_bios (2), switch to X after resume
-IBM TP T42 s3_bios (2)
-IBM ThinkPad T42p (2373-GTG) s3_bios (2)
-IBM TP X20 ??? (*)
-IBM TP X30 s3_bios, s3_mode (4)
-IBM TP X31 / Type 2672-XXH none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-IBM TP X32 none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results?
-IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4)
-IBM TP 600e none(1), but a switch to console and back to X is needed
-Medion MD4220 ??? (*)
-Samsung P35 vbetool needed (6)
-Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off
-Sony Vaio PCG-C1VRX/K s3_bios (2)
-Sony Vaio PCG-F403 ??? (*)
-Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver
-Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight.
-Sony Vaio PCG-N505SN ??? (*)
-Sony Vaio vgn-s260 X or boot-radeon can init it (5)
-Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will be blank unless you return to X.
-Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4)
-Toshiba Libretto L5 none (1)
-Toshiba Libretto 100CT/110CT vbetool (6)
-Toshiba Portege 3020CT s3_mode (3)
-Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK)
-Toshiba Satellite 4090XCDT ??? (*)
-Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****)
-Toshiba M30 (2) xor X with nvidia driver using internal AGP
-Uniwill 244IIO ??? (*)
-
-Known working desktop systems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Mainboard Graphics card hack (or "how to do it")
-------------------------------------------------------------------------------
-Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4)
-
-
-(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure
- which options to use. If you know, please tell me.
-
-(***) To be tested with a newer kernel.
-
-(****) Not with SMP kernel, UP only.
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
index 0c41d6d463f3..10e7f4d16c14 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -59,7 +59,7 @@ as follows:
the default calculated size. Use this option if default
boot memory size is not sufficient for second kernel to
boot successfully. For syntax of crashkernel= parameter,
- refer to Documentation/kdump/kdump.rst. If any offset is
+ refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is
provided in crashkernel= parameter, it will be ignored
as fadump uses a predefined offset to reserve memory
for boot memory dump preservation in case of a crash.
diff --git a/Documentation/powerpc/vcpudispatch_stats.txt b/Documentation/powerpc/vcpudispatch_stats.txt
new file mode 100644
index 000000000000..e21476bfd78c
--- /dev/null
+++ b/Documentation/powerpc/vcpudispatch_stats.txt
@@ -0,0 +1,68 @@
+VCPU Dispatch Statistics:
+=========================
+
+For Shared Processor LPARs, the POWER Hypervisor maintains a relatively
+static mapping of the LPAR processors (vcpus) to physical processor
+chips (representing the "home" node) and tries to always dispatch vcpus
+on their associated physical processor chip. However, under certain
+scenarios, vcpus may be dispatched on a different processor chip (away
+from its home node).
+
+/proc/powerpc/vcpudispatch_stats can be used to obtain statistics
+related to the vcpu dispatch behavior. Writing '1' to this file enables
+collecting the statistics, while writing '0' disables the statistics.
+By default, the DTLB log for each vcpu is processed 50 times a second so
+as not to miss any entries. This processing frequency can be changed
+through /proc/powerpc/vcpudispatch_stats_freq.
+
+The statistics themselves are available by reading the procfs file
+/proc/powerpc/vcpudispatch_stats. Each line in the output corresponds to
+a vcpu as represented by the first field, followed by 8 numbers.
+
+The first number corresponds to:
+1. total vcpu dispatches since the beginning of statistics collection
+
+The next 4 numbers represent vcpu dispatch dispersions:
+2. number of times this vcpu was dispatched on the same processor as last
+ time
+3. number of times this vcpu was dispatched on a different processor core
+ as last time, but within the same chip
+4. number of times this vcpu was dispatched on a different chip
+5. number of times this vcpu was dispatches on a different socket/drawer
+(next numa boundary)
+
+The final 3 numbers represent statistics in relation to the home node of
+the vcpu:
+6. number of times this vcpu was dispatched in its home node (chip)
+7. number of times this vcpu was dispatched in a different node
+8. number of times this vcpu was dispatched in a node further away (numa
+distance)
+
+An example output:
+ $ sudo cat /proc/powerpc/vcpudispatch_stats
+ cpu0 6839 4126 2683 30 0 6821 18 0
+ cpu1 2515 1274 1229 12 0 2509 6 0
+ cpu2 2317 1198 1109 10 0 2312 5 0
+ cpu3 2259 1165 1088 6 0 2256 3 0
+ cpu4 2205 1143 1056 6 0 2202 3 0
+ cpu5 2165 1121 1038 6 0 2162 3 0
+ cpu6 2183 1127 1050 6 0 2180 3 0
+ cpu7 2193 1133 1052 8 0 2187 6 0
+ cpu8 2165 1115 1032 18 0 2156 9 0
+ cpu9 2301 1252 1033 16 0 2293 8 0
+ cpu10 2197 1138 1041 18 0 2187 10 0
+ cpu11 2273 1185 1062 26 0 2260 13 0
+ cpu12 2186 1125 1043 18 0 2177 9 0
+ cpu13 2161 1115 1030 16 0 2153 8 0
+ cpu14 2206 1153 1033 20 0 2196 10 0
+ cpu15 2163 1115 1032 16 0 2155 8 0
+
+In the output above, for vcpu0, there have been 6839 dispatches since
+statistics were enabled. 4126 of those dispatches were on the same
+physical cpu as the last time. 2683 were on a different core, but within
+the same chip, while 30 dispatches were on a different chip compared to
+its last dispatch.
+
+Also, out of the total of 6839 dispatches, we see that there have been
+6821 dispatches on the vcpu's home node, while 18 dispatches were
+outside its home node, on a neighbouring chip.
diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 18735dc460a0..2284f2221f02 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -23,15 +23,15 @@ running, the suggested command should tell you.
Again, keep in mind that this list assumes you are already functionally
running a Linux kernel. Also, not all tools are necessary on all
-systems; obviously, if you don't have any ISDN hardware, for example,
-you probably needn't concern yourself with isdn4k-utils.
+systems; obviously, if you don't have any PC Card hardware, for example,
+you probably needn't concern yourself with pcmciautils.
====================== =============== ========================================
Program Minimal version Command to check the version
====================== =============== ========================================
GNU C 4.6 gcc --version
GNU make 3.81 make --version
-binutils 2.20 ld -v
+binutils 2.21 ld -v
flex 2.5.35 flex --version
bison 2.0 bison --version
util-linux 2.10o fdformat --version
@@ -45,7 +45,6 @@ btrfs-progs 0.18 btrfsck
pcmciautils 004 pccardctl -V
quota-tools 3.09 quota -V
PPP 2.4.0 pppd --version
-isdn4k-utils 3.1pre1 isdnctrl 2>&1|grep version
nfs-utils 1.0.5 showmount --version
procps 3.2.0 ps --version
oprofile 0.9 oprofiled --version
@@ -77,9 +76,7 @@ You will need GNU make 3.81 or later to build the kernel.
Binutils
--------
-The build system has, as of 4.13, switched to using thin archives (`ar T`)
-rather than incremental linking (`ld -r`) for built-in.a intermediate steps.
-This requires binutils 2.20 or newer.
+Binutils 2.21 or newer is needed to build the kernel.
pkg-config
----------
@@ -279,12 +276,6 @@ which can be made by::
as root.
-Isdn4k-utils
-------------
-
-Due to changes in the length of the phone number field, isdn4k-utils
-needs to be recompiled or (preferably) upgraded.
-
NFS-utils
---------
@@ -448,11 +439,6 @@ PPP
- <ftp://ftp.samba.org/pub/ppp/>
-Isdn4k-utils
-------------
-
-- <ftp://ftp.isdn4linux.de/pub/isdn4linux/utils/>
-
NFS-utils
---------
diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst
index 365efc9e4aa8..8e56337d422d 100644
--- a/Documentation/process/submit-checklist.rst
+++ b/Documentation/process/submit-checklist.rst
@@ -107,7 +107,7 @@ and elsewhere regarding submitting Linux kernel patches.
and why.
26) If any ioctl's are added by the patch, then also update
- ``Documentation/ioctl/ioctl-number.txt``.
+ ``Documentation/ioctl/ioctl-number.rst``.
27) If your modified source code depends on or uses any of the kernel
APIs or features that are related to the following ``Kconfig`` symbols,
diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst
index 58bc047e7b95..1acaa14903d6 100644
--- a/Documentation/process/submitting-drivers.rst
+++ b/Documentation/process/submitting-drivers.rst
@@ -117,7 +117,7 @@ PM support:
implemented") error. You should also try to make sure that your
driver uses as little power as possible when it's not doing
anything. For the driver testing instructions see
- Documentation/power/drivers-testing.txt and for a relatively
+ Documentation/power/drivers-testing.rst and for a relatively
complete overview of the power management issues related to
drivers see :ref:`Documentation/driver-api/pm/devices.rst <driverapi_pm_devices>`.
diff --git a/Documentation/pti/pti_intel_mid.txt b/Documentation/pti/pti_intel_mid.txt
deleted file mode 100644
index e7a5b6d1f7a9..000000000000
--- a/Documentation/pti/pti_intel_mid.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-The Intel MID PTI project is HW implemented in Intel Atom
-system-on-a-chip designs based on the Parallel Trace
-Interface for MIPI P1149.7 cJTAG standard. The kernel solution
-for this platform involves the following files:
-
-./include/linux/pti.h
-./drivers/.../n_tracesink.h
-./drivers/.../n_tracerouter.c
-./drivers/.../n_tracesink.c
-./drivers/.../pti.c
-
-pti.c is the driver that enables various debugging features
-popular on platforms from certain mobile manufacturers.
-n_tracerouter.c and n_tracesink.c allow extra system information to
-be collected and routed to the pti driver, such as trace
-debugging data from a modem. Although n_tracerouter
-and n_tracesink are a part of the complete PTI solution,
-these two line disciplines can work separately from
-pti.c and route any data stream from one /dev/tty node
-to another /dev/tty node via kernel-space. This provides
-a stable, reliable connection that will not break unless
-the user-space application shuts down (plus avoids
-kernel->user->kernel context switch overheads of routing
-data).
-
-An example debugging usage for this driver system:
- *Hook /dev/ttyPTI0 to syslogd. Opening this port will also start
- a console device to further capture debugging messages to PTI.
- *Hook /dev/ttyPTI1 to modem debugging data to write to PTI HW.
- This is where n_tracerouter and n_tracesink are used.
- *Hook /dev/pti to a user-level debugging application for writing
- to PTI HW.
- *Use mipi_* Kernel Driver API in other device drivers for
- debugging to PTI by first requesting a PTI write address via
- mipi_request_masterchannel(1).
-
-Below is example pseudo-code on how a 'privileged' application
-can hook up n_tracerouter and n_tracesink to any tty on
-a system. 'Privileged' means the application has enough
-privileges to successfully manipulate the ldisc drivers
-but is not just blindly executing as 'root'. Keep in mind
-the use of ioctl(,TIOCSETD,) is not specific to the n_tracerouter
-and n_tracesink line discpline drivers but is a generic
-operation for a program to use a line discpline driver
-on a tty port other than the default n_tty.
-
-/////////// To hook up n_tracerouter and n_tracesink /////////
-
-// Note that n_tracerouter depends on n_tracesink.
-#include <errno.h>
-#define ONE_TTY "/dev/ttyOne"
-#define TWO_TTY "/dev/ttyTwo"
-
-// needed global to hand onto ldisc connection
-static int g_fd_source = -1;
-static int g_fd_sink = -1;
-
-// these two vars used to grab LDISC values from loaded ldisc drivers
-// in OS. Look at /proc/tty/ldiscs to get the right numbers from
-// the ldiscs loaded in the system.
-int source_ldisc_num, sink_ldisc_num = -1;
-int retval;
-
-g_fd_source = open(ONE_TTY, O_RDWR); // must be R/W
-g_fd_sink = open(TWO_TTY, O_RDWR); // must be R/W
-
-if (g_fd_source <= 0) || (g_fd_sink <= 0) {
- // doubt you'll want to use these exact error lines of code
- printf("Error on open(). errno: %d\n",errno);
- return errno;
-}
-
-retval = ioctl(g_fd_sink, TIOCSETD, &sink_ldisc_num);
-if (retval < 0) {
- printf("Error on ioctl(). errno: %d\n", errno);
- return errno;
-}
-
-retval = ioctl(g_fd_source, TIOCSETD, &source_ldisc_num);
-if (retval < 0) {
- printf("Error on ioctl(). errno: %d\n", errno);
- return errno;
-}
-
-/////////// To disconnect n_tracerouter and n_tracesink ////////
-
-// First make sure data through the ldiscs has stopped.
-
-// Second, disconnect ldiscs. This provides a
-// little cleaner shutdown on tty stack.
-sink_ldisc_num = 0;
-source_ldisc_num = 0;
-ioctl(g_fd_uart, TIOCSETD, &sink_ldisc_num);
-ioctl(g_fd_gadget, TIOCSETD, &source_ldisc_num);
-
-// Three, program closes connection, and cleanup:
-close(g_fd_uart);
-close(g_fd_gadget);
-g_fd_uart = g_fd_gadget = NULL;
diff --git a/Documentation/pwm.txt b/Documentation/pwm.txt
deleted file mode 100644
index 8fbf0aa3ba2d..000000000000
--- a/Documentation/pwm.txt
+++ /dev/null
@@ -1,158 +0,0 @@
-======================================
-Pulse Width Modulation (PWM) interface
-======================================
-
-This provides an overview about the Linux PWM interface
-
-PWMs are commonly used for controlling LEDs, fans or vibrators in
-cell phones. PWMs with a fixed purpose have no need implementing
-the Linux PWM API (although they could). However, PWMs are often
-found as discrete devices on SoCs which have no fixed purpose. It's
-up to the board designer to connect them to LEDs or fans. To provide
-this kind of flexibility the generic PWM API exists.
-
-Identifying PWMs
-----------------
-
-Users of the legacy PWM API use unique IDs to refer to PWM devices.
-
-Instead of referring to a PWM device via its unique ID, board setup code
-should instead register a static mapping that can be used to match PWM
-consumers to providers, as given in the following example::
-
- static struct pwm_lookup board_pwm_lookup[] = {
- PWM_LOOKUP("tegra-pwm", 0, "pwm-backlight", NULL,
- 50000, PWM_POLARITY_NORMAL),
- };
-
- static void __init board_init(void)
- {
- ...
- pwm_add_table(board_pwm_lookup, ARRAY_SIZE(board_pwm_lookup));
- ...
- }
-
-Using PWMs
-----------
-
-Legacy users can request a PWM device using pwm_request() and free it
-after usage with pwm_free().
-
-New users should use the pwm_get() function and pass to it the consumer
-device or a consumer name. pwm_put() is used to free the PWM device. Managed
-variants of these functions, devm_pwm_get() and devm_pwm_put(), also exist.
-
-After being requested, a PWM has to be configured using::
-
- int pwm_apply_state(struct pwm_device *pwm, struct pwm_state *state);
-
-This API controls both the PWM period/duty_cycle config and the
-enable/disable state.
-
-The pwm_config(), pwm_enable() and pwm_disable() functions are just wrappers
-around pwm_apply_state() and should not be used if the user wants to change
-several parameter at once. For example, if you see pwm_config() and
-pwm_{enable,disable}() calls in the same function, this probably means you
-should switch to pwm_apply_state().
-
-The PWM user API also allows one to query the PWM state with pwm_get_state().
-
-In addition to the PWM state, the PWM API also exposes PWM arguments, which
-are the reference PWM config one should use on this PWM.
-PWM arguments are usually platform-specific and allows the PWM user to only
-care about dutycycle relatively to the full period (like, duty = 50% of the
-period). struct pwm_args contains 2 fields (period and polarity) and should
-be used to set the initial PWM config (usually done in the probe function
-of the PWM user). PWM arguments are retrieved with pwm_get_args().
-
-Using PWMs with the sysfs interface
------------------------------------
-
-If CONFIG_SYSFS is enabled in your kernel configuration a simple sysfs
-interface is provided to use the PWMs from userspace. It is exposed at
-/sys/class/pwm/. Each probed PWM controller/chip will be exported as
-pwmchipN, where N is the base of the PWM chip. Inside the directory you
-will find:
-
- npwm
- The number of PWM channels this chip supports (read-only).
-
- export
- Exports a PWM channel for use with sysfs (write-only).
-
- unexport
- Unexports a PWM channel from sysfs (write-only).
-
-The PWM channels are numbered using a per-chip index from 0 to npwm-1.
-
-When a PWM channel is exported a pwmX directory will be created in the
-pwmchipN directory it is associated with, where X is the number of the
-channel that was exported. The following properties will then be available:
-
- period
- The total period of the PWM signal (read/write).
- Value is in nanoseconds and is the sum of the active and inactive
- time of the PWM.
-
- duty_cycle
- The active time of the PWM signal (read/write).
- Value is in nanoseconds and must be less than the period.
-
- polarity
- Changes the polarity of the PWM signal (read/write).
- Writes to this property only work if the PWM chip supports changing
- the polarity. The polarity can only be changed if the PWM is not
- enabled. Value is the string "normal" or "inversed".
-
- enable
- Enable/disable the PWM signal (read/write).
-
- - 0 - disabled
- - 1 - enabled
-
-Implementing a PWM driver
--------------------------
-
-Currently there are two ways to implement pwm drivers. Traditionally
-there only has been the barebone API meaning that each driver has
-to implement the pwm_*() functions itself. This means that it's impossible
-to have multiple PWM drivers in the system. For this reason it's mandatory
-for new drivers to use the generic PWM framework.
-
-A new PWM controller/chip can be added using pwmchip_add() and removed
-again with pwmchip_remove(). pwmchip_add() takes a filled in struct
-pwm_chip as argument which provides a description of the PWM chip, the
-number of PWM devices provided by the chip and the chip-specific
-implementation of the supported PWM operations to the framework.
-
-When implementing polarity support in a PWM driver, make sure to respect the
-signal conventions in the PWM framework. By definition, normal polarity
-characterizes a signal starts high for the duration of the duty cycle and
-goes low for the remainder of the period. Conversely, a signal with inversed
-polarity starts low for the duration of the duty cycle and goes high for the
-remainder of the period.
-
-Drivers are encouraged to implement ->apply() instead of the legacy
-->enable(), ->disable() and ->config() methods. Doing that should provide
-atomicity in the PWM config workflow, which is required when the PWM controls
-a critical device (like a regulator).
-
-The implementation of ->get_state() (a method used to retrieve initial PWM
-state) is also encouraged for the same reason: letting the PWM user know
-about the current PWM state would allow him to avoid glitches.
-
-Locking
--------
-
-The PWM core list manipulations are protected by a mutex, so pwm_request()
-and pwm_free() may not be called from an atomic context. Currently the
-PWM core does not enforce any locking to pwm_enable(), pwm_disable() and
-pwm_config(), so the calling context is currently driver specific. This
-is an issue derived from the former barebone API and should be fixed soon.
-
-Helpers
--------
-
-Currently a PWM can only be configured with period_ns and duty_ns. For several
-use cases freq_hz and duty_percent might be better. Instead of calculating
-this in your driver please consider adding appropriate helpers to the framework.
diff --git a/Documentation/rapidio/mport_cdev.txt b/Documentation/rapidio/mport_cdev.txt
deleted file mode 100644
index a53f786ee2e9..000000000000
--- a/Documentation/rapidio/mport_cdev.txt
+++ /dev/null
@@ -1,107 +0,0 @@
-RapidIO subsystem mport character device driver (rio_mport_cdev.c)
-==================================================================
-
-Version History:
-----------------
- 1.0.0 - Initial driver release.
-
-==================================================================
-
-I. Overview
-
-This device driver is the result of collaboration within the RapidIO.org
-Software Task Group (STG) between Texas Instruments, Freescale,
-Prodrive Technologies, Nokia Networks, BAE and IDT. Additional input was
-received from other members of RapidIO.org. The objective was to create a
-character mode driver interface which exposes the capabilities of RapidIO
-devices directly to applications, in a manner that allows the numerous and
-varied RapidIO implementations to interoperate.
-
-This driver (MPORT_CDEV) provides access to basic RapidIO subsystem operations
-for user-space applications. Most of RapidIO operations are supported through
-'ioctl' system calls.
-
-When loaded this device driver creates filesystem nodes named rio_mportX in /dev
-directory for each registered RapidIO mport device. 'X' in the node name matches
-to unique port ID assigned to each local mport device.
-
-Using available set of ioctl commands user-space applications can perform
-following RapidIO bus and subsystem operations:
-
-- Reads and writes from/to configuration registers of mport devices
- (RIO_MPORT_MAINT_READ_LOCAL/RIO_MPORT_MAINT_WRITE_LOCAL)
-- Reads and writes from/to configuration registers of remote RapidIO devices.
- This operations are defined as RapidIO Maintenance reads/writes in RIO spec.
- (RIO_MPORT_MAINT_READ_REMOTE/RIO_MPORT_MAINT_WRITE_REMOTE)
-- Set RapidIO Destination ID for mport devices (RIO_MPORT_MAINT_HDID_SET)
-- Set RapidIO Component Tag for mport devices (RIO_MPORT_MAINT_COMPTAG_SET)
-- Query logical index of mport devices (RIO_MPORT_MAINT_PORT_IDX_GET)
-- Query capabilities and RapidIO link configuration of mport devices
- (RIO_MPORT_GET_PROPERTIES)
-- Enable/Disable reporting of RapidIO doorbell events to user-space applications
- (RIO_ENABLE_DOORBELL_RANGE/RIO_DISABLE_DOORBELL_RANGE)
-- Enable/Disable reporting of RIO port-write events to user-space applications
- (RIO_ENABLE_PORTWRITE_RANGE/RIO_DISABLE_PORTWRITE_RANGE)
-- Query/Control type of events reported through this driver: doorbells,
- port-writes or both (RIO_SET_EVENT_MASK/RIO_GET_EVENT_MASK)
-- Configure/Map mport's outbound requests window(s) for specific size,
- RapidIO destination ID, hopcount and request type
- (RIO_MAP_OUTBOUND/RIO_UNMAP_OUTBOUND)
-- Configure/Map mport's inbound requests window(s) for specific size,
- RapidIO base address and local memory base address
- (RIO_MAP_INBOUND/RIO_UNMAP_INBOUND)
-- Allocate/Free contiguous DMA coherent memory buffer for DMA data transfers
- to/from remote RapidIO devices (RIO_ALLOC_DMA/RIO_FREE_DMA)
-- Initiate DMA data transfers to/from remote RapidIO devices (RIO_TRANSFER).
- Supports blocking, asynchronous and posted (a.k.a 'fire-and-forget') data
- transfer modes.
-- Check/Wait for completion of asynchronous DMA data transfer
- (RIO_WAIT_FOR_ASYNC)
-- Manage device objects supported by RapidIO subsystem (RIO_DEV_ADD/RIO_DEV_DEL).
- This allows implementation of various RapidIO fabric enumeration algorithms
- as user-space applications while using remaining functionality provided by
- kernel RapidIO subsystem.
-
-II. Hardware Compatibility
-
-This device driver uses standard interfaces defined by kernel RapidIO subsystem
-and therefore it can be used with any mport device driver registered by RapidIO
-subsystem with limitations set by available mport implementation.
-
-At this moment the most common limitation is availability of RapidIO-specific
-DMA engine framework for specific mport device. Users should verify available
-functionality of their platform when planning to use this driver:
-
-- IDT Tsi721 PCIe-to-RapidIO bridge device and its mport device driver are fully
- compatible with this driver.
-- Freescale SoCs 'fsl_rio' mport driver does not have implementation for RapidIO
- specific DMA engine support and therefore DMA data transfers mport_cdev driver
- are not available.
-
-III. Module parameters
-
-- 'dma_timeout' - DMA transfer completion timeout (in msec, default value 3000).
- This parameter set a maximum completion wait time for SYNC mode DMA
- transfer requests and for RIO_WAIT_FOR_ASYNC ioctl requests.
-
-- 'dbg_level' - This parameter allows to control amount of debug information
- generated by this device driver. This parameter is formed by set of
- bit masks that correspond to the specific functional blocks.
- For mask definitions see 'drivers/rapidio/devices/rio_mport_cdev.c'
- This parameter can be changed dynamically.
- Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
-
-IV. Known problems
-
- None.
-
-V. User-space Applications and API
-
-API library and applications that use this device driver are available from
-RapidIO.org.
-
-VI. TODO List
-
-- Add support for sending/receiving "raw" RapidIO messaging packets.
-- Add memory mapped DMA data transfers as an option when RapidIO-specific DMA
- is not available.
diff --git a/Documentation/rapidio/rapidio.txt b/Documentation/rapidio/rapidio.txt
deleted file mode 100644
index 28fbd877f85a..000000000000
--- a/Documentation/rapidio/rapidio.txt
+++ /dev/null
@@ -1,351 +0,0 @@
- The Linux RapidIO Subsystem
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The RapidIO standard is a packet-based fabric interconnect standard designed for
-use in embedded systems. Development of the RapidIO standard is directed by the
-RapidIO Trade Association (RTA). The current version of the RapidIO specification
-is publicly available for download from the RTA web-site [1].
-
-This document describes the basics of the Linux RapidIO subsystem and provides
-information on its major components.
-
-1 Overview
-----------
-
-Because the RapidIO subsystem follows the Linux device model it is integrated
-into the kernel similarly to other buses by defining RapidIO-specific device and
-bus types and registering them within the device model.
-
-The Linux RapidIO subsystem is architecture independent and therefore defines
-architecture-specific interfaces that provide support for common RapidIO
-subsystem operations.
-
-2. Core Components
-------------------
-
-A typical RapidIO network is a combination of endpoints and switches.
-Each of these components is represented in the subsystem by an associated data
-structure. The core logical components of the RapidIO subsystem are defined
-in include/linux/rio.h file.
-
-2.1 Master Port
-
-A master port (or mport) is a RapidIO interface controller that is local to the
-processor executing the Linux code. A master port generates and receives RapidIO
-packets (transactions). In the RapidIO subsystem each master port is represented
-by a rio_mport data structure. This structure contains master port specific
-resources such as mailboxes and doorbells. The rio_mport also includes a unique
-host device ID that is valid when a master port is configured as an enumerating
-host.
-
-RapidIO master ports are serviced by subsystem specific mport device drivers
-that provide functionality defined for this subsystem. To provide a hardware
-independent interface for RapidIO subsystem operations, rio_mport structure
-includes rio_ops data structure which contains pointers to hardware specific
-implementations of RapidIO functions.
-
-2.2 Device
-
-A RapidIO device is any endpoint (other than mport) or switch in the network.
-All devices are presented in the RapidIO subsystem by corresponding rio_dev data
-structure. Devices form one global device list and per-network device lists
-(depending on number of available mports and networks).
-
-2.3 Switch
-
-A RapidIO switch is a special class of device that routes packets between its
-ports towards their final destination. The packet destination port within a
-switch is defined by an internal routing table. A switch is presented in the
-RapidIO subsystem by rio_dev data structure expanded by additional rio_switch
-data structure, which contains switch specific information such as copy of the
-routing table and pointers to switch specific functions.
-
-The RapidIO subsystem defines the format and initialization method for subsystem
-specific switch drivers that are designed to provide hardware-specific
-implementation of common switch management routines.
-
-2.4 Network
-
-A RapidIO network is a combination of interconnected endpoint and switch devices.
-Each RapidIO network known to the system is represented by corresponding rio_net
-data structure. This structure includes lists of all devices and local master
-ports that form the same network. It also contains a pointer to the default
-master port that is used to communicate with devices within the network.
-
-2.5 Device Drivers
-
-RapidIO device-specific drivers follow Linux Kernel Driver Model and are
-intended to support specific RapidIO devices attached to the RapidIO network.
-
-2.6 Subsystem Interfaces
-
-RapidIO interconnect specification defines features that may be used to provide
-one or more common service layers for all participating RapidIO devices. These
-common services may act separately from device-specific drivers or be used by
-device-specific drivers. Example of such service provider is the RIONET driver
-which implements Ethernet-over-RapidIO interface. Because only one driver can be
-registered for a device, all common RapidIO services have to be registered as
-subsystem interfaces. This allows to have multiple common services attached to
-the same device without blocking attachment of a device-specific driver.
-
-3. Subsystem Initialization
----------------------------
-
-In order to initialize the RapidIO subsystem, a platform must initialize and
-register at least one master port within the RapidIO network. To register mport
-within the subsystem controller driver's initialization code calls function
-rio_register_mport() for each available master port.
-
-After all active master ports are registered with a RapidIO subsystem,
-an enumeration and/or discovery routine may be called automatically or
-by user-space command.
-
-RapidIO subsystem can be configured to be built as a statically linked or
-modular component of the kernel (see details below).
-
-4. Enumeration and Discovery
-----------------------------
-
-4.1 Overview
-------------
-
-RapidIO subsystem configuration options allow users to build enumeration and
-discovery methods as statically linked components or loadable modules.
-An enumeration/discovery method implementation and available input parameters
-define how any given method can be attached to available RapidIO mports:
-simply to all available mports OR individually to the specified mport device.
-
-Depending on selected enumeration/discovery build configuration, there are
-several methods to initiate an enumeration and/or discovery process:
-
- (a) Statically linked enumeration and discovery process can be started
- automatically during kernel initialization time using corresponding module
- parameters. This was the original method used since introduction of RapidIO
- subsystem. Now this method relies on enumerator module parameter which is
- 'rio-scan.scan' for existing basic enumeration/discovery method.
- When automatic start of enumeration/discovery is used a user has to ensure
- that all discovering endpoints are started before the enumerating endpoint
- and are waiting for enumeration to be completed.
- Configuration option CONFIG_RAPIDIO_DISC_TIMEOUT defines time that discovering
- endpoint waits for enumeration to be completed. If the specified timeout
- expires the discovery process is terminated without obtaining RapidIO network
- information. NOTE: a timed out discovery process may be restarted later using
- a user-space command as it is described below (if the given endpoint was
- enumerated successfully).
-
- (b) Statically linked enumeration and discovery process can be started by
- a command from user space. This initiation method provides more flexibility
- for a system startup compared to the option (a) above. After all participating
- endpoints have been successfully booted, an enumeration process shall be
- started first by issuing a user-space command, after an enumeration is
- completed a discovery process can be started on all remaining endpoints.
-
- (c) Modular enumeration and discovery process can be started by a command from
- user space. After an enumeration/discovery module is loaded, a network scan
- process can be started by issuing a user-space command.
- Similar to the option (b) above, an enumerator has to be started first.
-
- (d) Modular enumeration and discovery process can be started by a module
- initialization routine. In this case an enumerating module shall be loaded
- first.
-
-When a network scan process is started it calls an enumeration or discovery
-routine depending on the configured role of a master port: host or agent.
-
-Enumeration is performed by a master port if it is configured as a host port by
-assigning a host destination ID greater than or equal to zero. The host
-destination ID can be assigned to a master port using various methods depending
-on RapidIO subsystem build configuration:
-
- (a) For a statically linked RapidIO subsystem core use command line parameter
- "rapidio.hdid=" with a list of destination ID assignments in order of mport
- device registration. For example, in a system with two RapidIO controllers
- the command line parameter "rapidio.hdid=-1,7" will result in assignment of
- the host destination ID=7 to the second RapidIO controller, while the first
- one will be assigned destination ID=-1.
-
- (b) If the RapidIO subsystem core is built as a loadable module, in addition
- to the method shown above, the host destination ID(s) can be specified using
- traditional methods of passing module parameter "hdid=" during its loading:
- - from command line: "modprobe rapidio hdid=-1,7", or
- - from modprobe configuration file using configuration command "options",
- like in this example: "options rapidio hdid=-1,7". An example of modprobe
- configuration file is provided in the section below.
-
- NOTES:
- (i) if "hdid=" parameter is omitted all available mport will be assigned
- destination ID = -1;
- (ii) the "hdid=" parameter in systems with multiple mports can have
- destination ID assignments omitted from the end of list (default = -1).
-
-If the host device ID for a specific master port is set to -1, the discovery
-process will be performed for it.
-
-The enumeration and discovery routines use RapidIO maintenance transactions
-to access the configuration space of devices.
-
-NOTE: If RapidIO switch-specific device drivers are built as loadable modules
-they must be loaded before enumeration/discovery process starts.
-This requirement is cased by the fact that enumeration/discovery methods invoke
-vendor-specific callbacks on early stages.
-
-4.2 Automatic Start of Enumeration and Discovery
-------------------------------------------------
-
-Automatic enumeration/discovery start method is applicable only to built-in
-enumeration/discovery RapidIO configuration selection. To enable automatic
-enumeration/discovery start by existing basic enumerator method set use boot
-command line parameter "rio-scan.scan=1".
-
-This configuration requires synchronized start of all RapidIO endpoints that
-form a network which will be enumerated/discovered. Discovering endpoints have
-to be started before an enumeration starts to ensure that all RapidIO
-controllers have been initialized and are ready to be discovered. Configuration
-parameter CONFIG_RAPIDIO_DISC_TIMEOUT defines time (in seconds) which
-a discovering endpoint will wait for enumeration to be completed.
-
-When automatic enumeration/discovery start is selected, basic method's
-initialization routine calls rio_init_mports() to perform enumeration or
-discovery for all known mport devices.
-
-Depending on RapidIO network size and configuration this automatic
-enumeration/discovery start method may be difficult to use due to the
-requirement for synchronized start of all endpoints.
-
-4.3 User-space Start of Enumeration and Discovery
--------------------------------------------------
-
-User-space start of enumeration and discovery can be used with built-in and
-modular build configurations. For user-space controlled start RapidIO subsystem
-creates the sysfs write-only attribute file '/sys/bus/rapidio/scan'. To initiate
-an enumeration or discovery process on specific mport device, a user needs to
-write mport_ID (not RapidIO destination ID) into that file. The mport_ID is a
-sequential number (0 ... RIO_MAX_MPORTS) assigned during mport device
-registration. For example for machine with single RapidIO controller, mport_ID
-for that controller always will be 0.
-
-To initiate RapidIO enumeration/discovery on all available mports a user may
-write '-1' (or RIO_MPORT_ANY) into the scan attribute file.
-
-4.4 Basic Enumeration Method
-----------------------------
-
-This is an original enumeration/discovery method which is available since
-first release of RapidIO subsystem code. The enumeration process is
-implemented according to the enumeration algorithm outlined in the RapidIO
-Interconnect Specification: Annex I [1].
-
-This method can be configured as statically linked or loadable module.
-The method's single parameter "scan" allows to trigger the enumeration/discovery
-process from module initialization routine.
-
-This enumeration/discovery method can be started only once and does not support
-unloading if it is built as a module.
-
-The enumeration process traverses the network using a recursive depth-first
-algorithm. When a new device is found, the enumerator takes ownership of that
-device by writing into the Host Device ID Lock CSR. It does this to ensure that
-the enumerator has exclusive right to enumerate the device. If device ownership
-is successfully acquired, the enumerator allocates a new rio_dev structure and
-initializes it according to device capabilities.
-
-If the device is an endpoint, a unique device ID is assigned to it and its value
-is written into the device's Base Device ID CSR.
-
-If the device is a switch, the enumerator allocates an additional rio_switch
-structure to store switch specific information. Then the switch's vendor ID and
-device ID are queried against a table of known RapidIO switches. Each switch
-table entry contains a pointer to a switch-specific initialization routine that
-initializes pointers to the rest of switch specific operations, and performs
-hardware initialization if necessary. A RapidIO switch does not have a unique
-device ID; it relies on hopcount and routing for device ID of an attached
-endpoint if access to its configuration registers is required. If a switch (or
-chain of switches) does not have any endpoint (except enumerator) attached to
-it, a fake device ID will be assigned to configure a route to that switch.
-In the case of a chain of switches without endpoint, one fake device ID is used
-to configure a route through the entire chain and switches are differentiated by
-their hopcount value.
-
-For both endpoints and switches the enumerator writes a unique component tag
-into device's Component Tag CSR. That unique value is used by the error
-management notification mechanism to identify a device that is reporting an
-error management event.
-
-Enumeration beyond a switch is completed by iterating over each active egress
-port of that switch. For each active link, a route to a default device ID
-(0xFF for 8-bit systems and 0xFFFF for 16-bit systems) is temporarily written
-into the routing table. The algorithm recurs by calling itself with hopcount + 1
-and the default device ID in order to access the device on the active port.
-
-After the host has completed enumeration of the entire network it releases
-devices by clearing device ID locks (calls rio_clear_locks()). For each endpoint
-in the system, it sets the Discovered bit in the Port General Control CSR
-to indicate that enumeration is completed and agents are allowed to execute
-passive discovery of the network.
-
-The discovery process is performed by agents and is similar to the enumeration
-process that is described above. However, the discovery process is performed
-without changes to the existing routing because agents only gather information
-about RapidIO network structure and are building an internal map of discovered
-devices. This way each Linux-based component of the RapidIO subsystem has
-a complete view of the network. The discovery process can be performed
-simultaneously by several agents. After initializing its RapidIO master port
-each agent waits for enumeration completion by the host for the configured wait
-time period. If this wait time period expires before enumeration is completed,
-an agent skips RapidIO discovery and continues with remaining kernel
-initialization.
-
-4.5 Adding New Enumeration/Discovery Method
--------------------------------------------
-
-RapidIO subsystem code organization allows addition of new enumeration/discovery
-methods as new configuration options without significant impact to the core
-RapidIO code.
-
-A new enumeration/discovery method has to be attached to one or more mport
-devices before an enumeration/discovery process can be started. Normally,
-method's module initialization routine calls rio_register_scan() to attach
-an enumerator to a specified mport device (or devices). The basic enumerator
-implementation demonstrates this process.
-
-4.6 Using Loadable RapidIO Switch Drivers
------------------------------------------
-
-In the case when RapidIO switch drivers are built as loadable modules a user
-must ensure that they are loaded before the enumeration/discovery starts.
-This process can be automated by specifying pre- or post- dependencies in the
-RapidIO-specific modprobe configuration file as shown in the example below.
-
- File /etc/modprobe.d/rapidio.conf:
- ----------------------------------
-
- # Configure RapidIO subsystem modules
-
- # Set enumerator host destination ID (overrides kernel command line option)
- options rapidio hdid=-1,2
-
- # Load RapidIO switch drivers immediately after rapidio core module was loaded
- softdep rapidio post: idt_gen2 idtcps tsi57x
-
- # OR :
-
- # Load RapidIO switch drivers just before rio-scan enumerator module is loaded
- softdep rio-scan pre: idt_gen2 idtcps tsi57x
-
- --------------------------
-
-NOTE: In the example above, one of "softdep" commands must be removed or
-commented out to keep required module loading sequence.
-
-A. References
--------------
-
-[1] RapidIO Trade Association. RapidIO Interconnect Specifications.
- http://www.rapidio.org.
-[2] Rapidio TA. Technology Comparisons.
- http://www.rapidio.org/education/technology_comparisons/
-[3] RapidIO support for Linux.
- http://lwn.net/Articles/139118/
-[4] Matt Porter. RapidIO for Linux. Ottawa Linux Symposium, 2005
- http://www.kernel.org/doc/ols/2005/ols2005v2-pages-43-56.pdf
diff --git a/Documentation/rapidio/rio_cm.txt b/Documentation/rapidio/rio_cm.txt
deleted file mode 100644
index 27aa401f1126..000000000000
--- a/Documentation/rapidio/rio_cm.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-RapidIO subsystem Channelized Messaging character device driver (rio_cm.c)
-==========================================================================
-
-Version History:
-----------------
- 1.0.0 - Initial driver release.
-
-==========================================================================
-
-I. Overview
-
-This device driver is the result of collaboration within the RapidIO.org
-Software Task Group (STG) between Texas Instruments, Prodrive Technologies,
-Nokia Networks, BAE and IDT. Additional input was received from other members
-of RapidIO.org.
-
-The objective was to create a character mode driver interface which exposes
-messaging capabilities of RapidIO endpoint devices (mports) directly
-to applications, in a manner that allows the numerous and varied RapidIO
-implementations to interoperate.
-
-This driver (RIO_CM) provides to user-space applications shared access to
-RapidIO mailbox messaging resources.
-
-RapidIO specification (Part 2) defines that endpoint devices may have up to four
-messaging mailboxes in case of multi-packet message (up to 4KB) and
-up to 64 mailboxes if single-packet messages (up to 256 B) are used. In addition
-to protocol definition limitations, a particular hardware implementation can
-have reduced number of messaging mailboxes. RapidIO aware applications must
-therefore share the messaging resources of a RapidIO endpoint.
-
-Main purpose of this device driver is to provide RapidIO mailbox messaging
-capability to large number of user-space processes by introducing socket-like
-operations using a single messaging mailbox. This allows applications to
-use the limited RapidIO messaging hardware resources efficiently.
-
-Most of device driver's operations are supported through 'ioctl' system calls.
-
-When loaded this device driver creates a single file system node named rio_cm
-in /dev directory common for all registered RapidIO mport devices.
-
-Following ioctl commands are available to user-space applications:
-
-- RIO_CM_MPORT_GET_LIST : Returns to caller list of local mport devices that
- support messaging operations (number of entries up to RIO_MAX_MPORTS).
- Each list entry is combination of mport's index in the system and RapidIO
- destination ID assigned to the port.
-- RIO_CM_EP_GET_LIST_SIZE : Returns number of messaging capable remote endpoints
- in a RapidIO network associated with the specified mport device.
-- RIO_CM_EP_GET_LIST : Returns list of RapidIO destination IDs for messaging
- capable remote endpoints (peers) available in a RapidIO network associated
- with the specified mport device.
-- RIO_CM_CHAN_CREATE : Creates RapidIO message exchange channel data structure
- with channel ID assigned automatically or as requested by a caller.
-- RIO_CM_CHAN_BIND : Binds the specified channel data structure to the specified
- mport device.
-- RIO_CM_CHAN_LISTEN : Enables listening for connection requests on the specified
- channel.
-- RIO_CM_CHAN_ACCEPT : Accepts a connection request from peer on the specified
- channel. If wait timeout for this request is specified by a caller it is
- a blocking call. If timeout set to 0 this is non-blocking call - ioctl
- handler checks for a pending connection request and if one is not available
- exits with -EGAIN error status immediately.
-- RIO_CM_CHAN_CONNECT : Sends a connection request to a remote peer/channel.
-- RIO_CM_CHAN_SEND : Sends a data message through the specified channel.
- The handler for this request assumes that message buffer specified by
- a caller includes the reserved space for a packet header required by
- this driver.
-- RIO_CM_CHAN_RECEIVE : Receives a data message through a connected channel.
- If the channel does not have an incoming message ready to return this ioctl
- handler will wait for new message until timeout specified by a caller
- expires. If timeout value is set to 0, ioctl handler uses a default value
- defined by MAX_SCHEDULE_TIMEOUT.
-- RIO_CM_CHAN_CLOSE : Closes a specified channel and frees associated buffers.
- If the specified channel is in the CONNECTED state, sends close notification
- to the remote peer.
-
-The ioctl command codes and corresponding data structures intended for use by
-user-space applications are defined in 'include/uapi/linux/rio_cm_cdev.h'.
-
-II. Hardware Compatibility
-
-This device driver uses standard interfaces defined by kernel RapidIO subsystem
-and therefore it can be used with any mport device driver registered by RapidIO
-subsystem with limitations set by available mport HW implementation of messaging
-mailboxes.
-
-III. Module parameters
-
-- 'dbg_level' - This parameter allows to control amount of debug information
- generated by this device driver. This parameter is formed by set of
- bit masks that correspond to the specific functional block.
- For mask definitions see 'drivers/rapidio/devices/rio_cm.c'
- This parameter can be changed dynamically.
- Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
-
-- 'cmbox' - Number of RapidIO mailbox to use (default value is 1).
- This parameter allows to set messaging mailbox number that will be used
- within entire RapidIO network. It can be used when default mailbox is
- used by other device drivers or is not supported by some nodes in the
- RapidIO network.
-
-- 'chstart' - Start channel number for dynamic assignment. Default value - 256.
- Allows to exclude channel numbers below this parameter from dynamic
- allocation to avoid conflicts with software components that use
- reserved predefined channel numbers.
-
-IV. Known problems
-
- None.
-
-V. User-space Applications and API Library
-
-Messaging API library and applications that use this device driver are available
-from RapidIO.org.
-
-VI. TODO List
-
-- Add support for system notification messages (reserved channel 0).
diff --git a/Documentation/rapidio/sysfs.txt b/Documentation/rapidio/sysfs.txt
deleted file mode 100644
index a1adac888e6e..000000000000
--- a/Documentation/rapidio/sysfs.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-The RapidIO sysfs files have moved to:
-Documentation/ABI/testing/sysfs-bus-rapidio and
-Documentation/ABI/testing/sysfs-class-rapidio
diff --git a/Documentation/rapidio/tsi721.txt b/Documentation/rapidio/tsi721.txt
deleted file mode 100644
index cd2a2935d51d..000000000000
--- a/Documentation/rapidio/tsi721.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-RapidIO subsystem mport driver for IDT Tsi721 PCI Express-to-SRIO bridge.
-=========================================================================
-
-I. Overview
-
-This driver implements all currently defined RapidIO mport callback functions.
-It supports maintenance read and write operations, inbound and outbound RapidIO
-doorbells, inbound maintenance port-writes and RapidIO messaging.
-
-To generate SRIO maintenance transactions this driver uses one of Tsi721 DMA
-channels. This mechanism provides access to larger range of hop counts and
-destination IDs without need for changes in outbound window translation.
-
-RapidIO messaging support uses dedicated messaging channels for each mailbox.
-For inbound messages this driver uses destination ID matching to forward messages
-into the corresponding message queue. Messaging callbacks are implemented to be
-fully compatible with RIONET driver (Ethernet over RapidIO messaging services).
-
-1. Module parameters:
-- 'dbg_level' - This parameter allows to control amount of debug information
- generated by this device driver. This parameter is formed by set of
- This parameter can be changed bit masks that correspond to the specific
- functional block.
- For mask definitions see 'drivers/rapidio/devices/tsi721.h'
- This parameter can be changed dynamically.
- Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
-
-- 'dma_desc_per_channel' - This parameter defines number of hardware buffer
- descriptors allocated for each registered Tsi721 DMA channel.
- Its default value is 128.
-
-- 'dma_txqueue_sz' - DMA transactions queue size. Defines number of pending
- transaction requests that can be accepted by each DMA channel.
- Default value is 16.
-
-- 'dma_sel' - DMA channel selection mask. Bitmask that defines which hardware
- DMA channels (0 ... 6) will be registered with DmaEngine core.
- If bit is set to 1, the corresponding DMA channel will be registered.
- DMA channels not selected by this mask will not be used by this device
- driver. Default value is 0x7f (use all channels).
-
-- 'pcie_mrrs' - override value for PCIe Maximum Read Request Size (MRRS).
- This parameter gives an ability to override MRRS value set during PCIe
- configuration process. Tsi721 supports read request sizes up to 4096B.
- Value for this parameter must be set as defined by PCIe specification:
- 0 = 128B, 1 = 256B, 2 = 512B, 3 = 1024B, 4 = 2048B and 5 = 4096B.
- Default value is '-1' (= keep platform setting).
-
-- 'mbox_sel' - RIO messaging MBOX selection mask. This is a bitmask that defines
- messaging MBOXes are managed by this device driver. Mask bits 0 - 3
- correspond to MBOX0 - MBOX3. MBOX is under driver's control if the
- corresponding bit is set to '1'. Default value is 0x0f (= all).
-
-II. Known problems
-
- None.
-
-III. DMA Engine Support
-
-Tsi721 mport driver supports DMA data transfers between local system memory and
-remote RapidIO devices. This functionality is implemented according to SLAVE
-mode API defined by common Linux kernel DMA Engine framework.
-
-Depending on system requirements RapidIO DMA operations can be included/excluded
-by setting CONFIG_RAPIDIO_DMA_ENGINE option. Tsi721 miniport driver uses seven
-out of eight available BDMA channels to support DMA data transfers.
-One BDMA channel is reserved for generation of maintenance read/write requests.
-
-If Tsi721 mport driver have been built with RAPIDIO_DMA_ENGINE support included,
-this driver will accept DMA-specific module parameter:
- "dma_desc_per_channel" - defines number of hardware buffer descriptors used by
- each BDMA channel of Tsi721 (by default - 128).
-
-IV. Version History
-
- 1.1.0 - DMA operations re-worked to support data scatter/gather lists larger
- than hardware buffer descriptors ring.
- 1.0.0 - Initial driver release.
-
-V. License
------------------------------------------------
-
- Copyright(c) 2011 Integrated Device Technology, Inc. All rights reserved.
-
- This program is free software; you can redistribute it and/or modify it
- under the terms of the GNU General Public License as published by the Free
- Software Foundation; either version 2 of the License, or (at your option)
- any later version.
-
- This program is distributed in the hope that it will be useful, but WITHOUT
- ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- more details.
-
- You should have received a copy of the GNU General Public License along with
- this program; if not, write to the Free Software Foundation, Inc.,
- 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index c42a21b99046..523d54b60087 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -204,21 +204,21 @@ potentially expensive tree iterations. This is done at negligible runtime
overhead for maintanence; albeit larger memory footprint.
Similar to the rb_root structure, cached rbtrees are initialized to be
-empty via:
+empty via::
struct rb_root_cached mytree = RB_ROOT_CACHED;
Cached rbtree is simply a regular rb_root with an extra pointer to cache the
leftmost node. This allows rb_root_cached to exist wherever rb_root does,
which permits augmented trees to be supported as well as only a few extra
-interfaces:
+interfaces::
struct rb_node *rb_first_cached(struct rb_root_cached *tree);
void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool);
void rb_erase_cached(struct rb_node *node, struct rb_root_cached *);
Both insert and erase calls have their respective counterpart of augmented
-trees:
+trees::
void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *,
bool, struct rb_augment_callbacks *);
diff --git a/Documentation/remoteproc.txt b/Documentation/remoteproc.txt
index 77fb03acdbb4..03c3d2e568b0 100644
--- a/Documentation/remoteproc.txt
+++ b/Documentation/remoteproc.txt
@@ -314,6 +314,8 @@ Here are the various resource types that are currently supported::
* @RSC_VDEV: declare support for a virtio device, and serve as its
* virtio header.
* @RSC_LAST: just keep this one at the end
+ * @RSC_VENDOR_START: start of the vendor specific resource types range
+ * @RSC_VENDOR_END: end of the vendor specific resource types range
*
* Please note that these values are used as indices to the rproc_handle_rsc
* lookup table, so please keep them sane. Moreover, @RSC_LAST is used to
@@ -321,11 +323,13 @@ Here are the various resource types that are currently supported::
* please update it as needed.
*/
enum fw_resource_type {
- RSC_CARVEOUT = 0,
- RSC_DEVMEM = 1,
- RSC_TRACE = 2,
- RSC_VDEV = 3,
- RSC_LAST = 4,
+ RSC_CARVEOUT = 0,
+ RSC_DEVMEM = 1,
+ RSC_TRACE = 2,
+ RSC_VDEV = 3,
+ RSC_LAST = 4,
+ RSC_VENDOR_START = 128,
+ RSC_VENDOR_END = 512,
};
For more details regarding a specific resource type, please see its
diff --git a/Documentation/riscv/boot-image-header.txt b/Documentation/riscv/boot-image-header.txt
new file mode 100644
index 000000000000..1b73fea23b39
--- /dev/null
+++ b/Documentation/riscv/boot-image-header.txt
@@ -0,0 +1,50 @@
+ Boot image header in RISC-V Linux
+ =============================================
+
+Author: Atish Patra <atish.patra@wdc.com>
+Date : 20 May 2019
+
+This document only describes the boot image header details for RISC-V Linux.
+The complete booting guide will be available at Documentation/riscv/booting.txt.
+
+The following 64-byte header is present in decompressed Linux kernel image.
+
+ u32 code0; /* Executable code */
+ u32 code1; /* Executable code */
+ u64 text_offset; /* Image load offset, little endian */
+ u64 image_size; /* Effective Image size, little endian */
+ u64 flags; /* kernel flags, little endian */
+ u32 version; /* Version of this header */
+ u32 res1 = 0; /* Reserved */
+ u64 res2 = 0; /* Reserved */
+ u64 magic = 0x5643534952; /* Magic number, little endian, "RISCV" */
+ u32 res3; /* Reserved for additional RISC-V specific header */
+ u32 res4; /* Reserved for PE COFF offset */
+
+This header format is compliant with PE/COFF header and largely inspired from
+ARM64 header. Thus, both ARM64 & RISC-V header can be combined into one common
+header in future.
+
+Notes:
+- This header can also be reused to support EFI stub for RISC-V in future. EFI
+ specification needs PE/COFF image header in the beginning of the kernel image
+ in order to load it as an EFI application. In order to support EFI stub,
+ code0 should be replaced with "MZ" magic string and res5(at offset 0x3c) should
+ point to the rest of the PE/COFF header.
+
+- version field indicate header version number.
+ Bits 0:15 - Minor version
+ Bits 16:31 - Major version
+
+ This preserves compatibility across newer and older version of the header.
+ The current version is defined as 0.1.
+
+- res3 is reserved for offset to any other additional fields. This makes the
+ header extendible in future. One example would be to accommodate ISA
+ extension for RISC-V in future. For current version, it is set to be zero.
+
+- In current header, the flag field has only one field.
+ Bit 0: Kernel endianness. 1 if BE, 0 if LE.
+
+- Image size is mandatory for boot loader to load kernel image. Booting will
+ fail otherwise.
diff --git a/Documentation/riscv/index.rst b/Documentation/riscv/index.rst
index c4b906d9b5a7..e3ca0922a8c2 100644
--- a/Documentation/riscv/index.rst
+++ b/Documentation/riscv/index.rst
@@ -1,5 +1,3 @@
-:orphan:
-
===================
RISC-V architecture
===================
diff --git a/Documentation/s390/3270.rst b/Documentation/s390/3270.rst
new file mode 100644
index 000000000000..e09e77954238
--- /dev/null
+++ b/Documentation/s390/3270.rst
@@ -0,0 +1,298 @@
+===============================
+IBM 3270 Display System support
+===============================
+
+This file describes the driver that supports local channel attachment
+of IBM 3270 devices. It consists of three sections:
+
+ * Introduction
+ * Installation
+ * Operation
+
+
+Introduction
+============
+
+This paper describes installing and operating 3270 devices under
+Linux/390. A 3270 device is a block-mode rows-and-columns terminal of
+which I'm sure hundreds of millions were sold by IBM and clonemakers
+twenty and thirty years ago.
+
+You may have 3270s in-house and not know it. If you're using the
+VM-ESA operating system, define a 3270 to your virtual machine by using
+the command "DEF GRAF <hex-address>" This paper presumes you will be
+defining four 3270s with the CP/CMS commands:
+
+ - DEF GRAF 620
+ - DEF GRAF 621
+ - DEF GRAF 622
+ - DEF GRAF 623
+
+Your network connection from VM-ESA allows you to use x3270, tn3270, or
+another 3270 emulator, started from an xterm window on your PC or
+workstation. With the DEF GRAF command, an application such as xterm,
+and this Linux-390 3270 driver, you have another way of talking to your
+Linux box.
+
+This paper covers installation of the driver and operation of a
+dialed-in x3270.
+
+
+Installation
+============
+
+You install the driver by installing a patch, doing a kernel build, and
+running the configuration script (config3270.sh, in this directory).
+
+WARNING: If you are using 3270 console support, you must rerun the
+configuration script every time you change the console's address (perhaps
+by using the condev= parameter in silo's /boot/parmfile). More precisely,
+you should rerun the configuration script every time your set of 3270s,
+including the console 3270, changes subchannel identifier relative to
+one another. ReIPL as soon as possible after running the configuration
+script and the resulting /tmp/mkdev3270.
+
+If you have chosen to make tub3270 a module, you add a line to a
+configuration file under /etc/modprobe.d/. If you are working on a VM
+virtual machine, you can use DEF GRAF to define virtual 3270 devices.
+
+You may generate both 3270 and 3215 console support, or one or the
+other, or neither. If you generate both, the console type under VM is
+not changed. Use #CP Q TERM to see what the current console type is.
+Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only
+3270 console support, then the driver automatically converts your console
+at boot time to a 3270 if it is a 3215.
+
+In brief, these are the steps:
+
+ 1. Install the tub3270 patch
+ 2. (If a module) add a line to a file in `/etc/modprobe.d/*.conf`
+ 3. (If VM) define devices with DEF GRAF
+ 4. Reboot
+ 5. Configure
+
+To test that everything works, assuming VM and x3270,
+
+ 1. Bring up an x3270 window.
+ 2. Use the DIAL command in that window.
+ 3. You should immediately see a Linux login screen.
+
+Here are the installation steps in detail:
+
+ 1. The 3270 driver is a part of the official Linux kernel
+ source. Build a tree with the kernel source and any necessary
+ patches. Then do::
+
+ make oldconfig
+ (If you wish to disable 3215 console support, edit
+ .config; change CONFIG_TN3215's value to "n";
+ and rerun "make oldconfig".)
+ make image
+ make modules
+ make modules_install
+
+ 2. (Perform this step only if you have configured tub3270 as a
+ module.) Add a line to a file `/etc/modprobe.d/*.conf` to automatically
+ load the driver when it's needed. With this line added, you will see
+ login prompts appear on your 3270s as soon as boot is complete (or
+ with emulated 3270s, as soon as you dial into your vm guest using the
+ command "DIAL <vmguestname>"). Since the line-mode major number is
+ 227, the line to add should be::
+
+ alias char-major-227 tub3270
+
+ 3. Define graphic devices to your vm guest machine, if you
+ haven't already. Define them before you reboot (reipl):
+
+ - DEFINE GRAF 620
+ - DEFINE GRAF 621
+ - DEFINE GRAF 622
+ - DEFINE GRAF 623
+
+ 4. Reboot. The reboot process scans hardware devices, including
+ 3270s, and this enables the tub3270 driver once loaded to respond
+ correctly to the configuration requests of the next step. If
+ you have chosen 3270 console support, your console now behaves
+ as a 3270, not a 3215.
+
+ 5. Run the 3270 configuration script config3270. It is
+ distributed in this same directory, Documentation/s390, as
+ config3270.sh. Inspect the output script it produces,
+ /tmp/mkdev3270, and then run that script. This will create the
+ necessary character special device files and make the necessary
+ changes to /etc/inittab.
+
+ Then notify /sbin/init that /etc/inittab has changed, by issuing
+ the telinit command with the q operand::
+
+ cd Documentation/s390
+ sh config3270.sh
+ sh /tmp/mkdev3270
+ telinit q
+
+ This should be sufficient for your first time. If your 3270
+ configuration has changed and you're reusing config3270, you
+ should follow these steps::
+
+ Change 3270 configuration
+ Reboot
+ Run config3270 and /tmp/mkdev3270
+ Reboot
+
+Here are the testing steps in detail:
+
+ 1. Bring up an x3270 window, or use an actual hardware 3278 or
+ 3279, or use the 3270 emulator of your choice. You would be
+ running the emulator on your PC or workstation. You would use
+ the command, for example::
+
+ x3270 vm-esa-domain-name &
+
+ if you wanted a 3278 Model 4 with 43 rows of 80 columns, the
+ default model number. The driver does not take advantage of
+ extended attributes.
+
+ The screen you should now see contains a VM logo with input
+ lines near the bottom. Use TAB to move to the bottom line,
+ probably labeled "COMMAND ===>".
+
+ 2. Use the DIAL command instead of the LOGIN command to connect
+ to one of the virtual 3270s you defined with the DEF GRAF
+ commands::
+
+ dial my-vm-guest-name
+
+ 3. You should immediately see a login prompt from your
+ Linux-390 operating system. If that does not happen, you would
+ see instead the line "DIALED TO my-vm-guest-name 0620".
+
+ To troubleshoot: do these things.
+
+ A. Is the driver loaded? Use the lsmod command (no operands)
+ to find out. Probably it isn't. Try loading it manually, with
+ the command "insmod tub3270". Does that command give error
+ messages? Ha! There's your problem.
+
+ B. Is the /etc/inittab file modified as in installation step 3
+ above? Use the grep command to find out; for instance, issue
+ "grep 3270 /etc/inittab". Nothing found? There's your
+ problem!
+
+ C. Are the device special files created, as in installation
+ step 2 above? Use the ls -l command to find out; for instance,
+ issue "ls -l /dev/3270/tty620". The output should start with the
+ letter "c" meaning character device and should contain "227, 1"
+ just to the left of the device name. No such file? no "c"?
+ Wrong major number? Wrong minor number? There's your
+ problem!
+
+ D. Do you get the message::
+
+ "HCPDIA047E my-vm-guest-name 0620 does not exist"?
+
+ If so, you must issue the command "DEF GRAF 620" from your VM
+ 3215 console and then reboot the system.
+
+
+
+OPERATION.
+==========
+
+The driver defines three areas on the 3270 screen: the log area, the
+input area, and the status area.
+
+The log area takes up all but the bottom two lines of the screen. The
+driver writes terminal output to it, starting at the top line and going
+down. When it fills, the status area changes from "Linux Running" to
+"Linux More...". After a scrolling timeout of (default) 5 sec, the
+screen clears and more output is written, from the top down.
+
+The input area extends from the beginning of the second-to-last screen
+line to the start of the status area. You type commands in this area
+and hit ENTER to execute them.
+
+The status area initializes to "Linux Running" to give you a warm
+fuzzy feeling. When the log area fills up and output awaits, it
+changes to "Linux More...". At this time you can do several things or
+nothing. If you do nothing, the screen will clear in (default) 5 sec
+and more output will appear. You may hit ENTER with nothing typed in
+the input area to toggle between "Linux More..." and "Linux Holding",
+which indicates no scrolling will occur. (If you hit ENTER with "Linux
+Running" and nothing typed, the application receives a newline.)
+
+You may change the scrolling timeout value. For example, the following
+command line::
+
+ echo scrolltime=60 > /proc/tty/driver/tty3270
+
+changes the scrolling timeout value to 60 sec. Set scrolltime to 0 if
+you wish to prevent scrolling entirely.
+
+Other things you may do when the log area fills up are: hit PA2 to
+clear the log area and write more output to it, or hit CLEAR to clear
+the log area and the input area and write more output to the log area.
+
+Some of the Program Function (PF) and Program Attention (PA) keys are
+preassigned special functions. The ones that are not yield an alarm
+when pressed.
+
+PA1 causes a SIGINT to the currently running application. You may do
+the same thing from the input area, by typing "^C" and hitting ENTER.
+
+PA2 causes the log area to be cleared. If output awaits, it is then
+written to the log area.
+
+PF3 causes an EOF to be received as input by the application. You may
+cause an EOF also by typing "^D" and hitting ENTER.
+
+No PF key is preassigned to cause a job suspension, but you may cause a
+job suspension by typing "^Z" and hitting ENTER. You may wish to
+assign this function to a PF key. To make PF7 cause job suspension,
+execute the command::
+
+ echo pf7=^z > /proc/tty/driver/tty3270
+
+If the input you type does not end with the two characters "^n", the
+driver appends a newline character and sends it to the tty driver;
+otherwise the driver strips the "^n" and does not append a newline.
+The IBM 3215 driver behaves similarly.
+
+Pf10 causes the most recent command to be retrieved from the tube's
+command stack (default depth 20) and displayed in the input area. You
+may hit PF10 again for the next-most-recent command, and so on. A
+command is entered into the stack only when the input area is not made
+invisible (such as for password entry) and it is not identical to the
+current top entry. PF10 rotates backward through the command stack;
+PF11 rotates forward. You may assign the backward function to any PF
+key (or PA key, for that matter), say, PA3, with the command::
+
+ echo -e pa3=\\033k > /proc/tty/driver/tty3270
+
+This assigns the string ESC-k to PA3. Similarly, the string ESC-j
+performs the forward function. (Rationale: In bash with vi-mode line
+editing, ESC-k and ESC-j retrieve backward and forward history.
+Suggestions welcome.)
+
+Is a stack size of twenty commands not to your liking? Change it on
+the fly. To change to saving the last 100 commands, execute the
+command::
+
+ echo recallsize=100 > /proc/tty/driver/tty3270
+
+Have a command you issue frequently? Assign it to a PF or PA key! Use
+the command::
+
+ echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270
+
+to execute the commands mkdir foobar and cd foobar immediately when you
+hit PF24. Want to see the command line first, before you execute it?
+Use the -n option of the echo command::
+
+ echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270
+
+
+
+Happy testing! I welcome any and all comments about this document, the
+driver, etc etc.
+
+Dick Hitt <rbh00@utsglobal.com>
diff --git a/Documentation/s390/3270.txt b/Documentation/s390/3270.txt
deleted file mode 100644
index 7c715de99774..000000000000
--- a/Documentation/s390/3270.txt
+++ /dev/null
@@ -1,271 +0,0 @@
-IBM 3270 Display System support
-
-This file describes the driver that supports local channel attachment
-of IBM 3270 devices. It consists of three sections:
- * Introduction
- * Installation
- * Operation
-
-
-INTRODUCTION.
-
-This paper describes installing and operating 3270 devices under
-Linux/390. A 3270 device is a block-mode rows-and-columns terminal of
-which I'm sure hundreds of millions were sold by IBM and clonemakers
-twenty and thirty years ago.
-
-You may have 3270s in-house and not know it. If you're using the
-VM-ESA operating system, define a 3270 to your virtual machine by using
-the command "DEF GRAF <hex-address>" This paper presumes you will be
-defining four 3270s with the CP/CMS commands
-
- DEF GRAF 620
- DEF GRAF 621
- DEF GRAF 622
- DEF GRAF 623
-
-Your network connection from VM-ESA allows you to use x3270, tn3270, or
-another 3270 emulator, started from an xterm window on your PC or
-workstation. With the DEF GRAF command, an application such as xterm,
-and this Linux-390 3270 driver, you have another way of talking to your
-Linux box.
-
-This paper covers installation of the driver and operation of a
-dialed-in x3270.
-
-
-INSTALLATION.
-
-You install the driver by installing a patch, doing a kernel build, and
-running the configuration script (config3270.sh, in this directory).
-
-WARNING: If you are using 3270 console support, you must rerun the
-configuration script every time you change the console's address (perhaps
-by using the condev= parameter in silo's /boot/parmfile). More precisely,
-you should rerun the configuration script every time your set of 3270s,
-including the console 3270, changes subchannel identifier relative to
-one another. ReIPL as soon as possible after running the configuration
-script and the resulting /tmp/mkdev3270.
-
-If you have chosen to make tub3270 a module, you add a line to a
-configuration file under /etc/modprobe.d/. If you are working on a VM
-virtual machine, you can use DEF GRAF to define virtual 3270 devices.
-
-You may generate both 3270 and 3215 console support, or one or the
-other, or neither. If you generate both, the console type under VM is
-not changed. Use #CP Q TERM to see what the current console type is.
-Use #CP TERM CONMODE 3270 to change it to 3270. If you generate only
-3270 console support, then the driver automatically converts your console
-at boot time to a 3270 if it is a 3215.
-
-In brief, these are the steps:
- 1. Install the tub3270 patch
- 2. (If a module) add a line to a file in /etc/modprobe.d/*.conf
- 3. (If VM) define devices with DEF GRAF
- 4. Reboot
- 5. Configure
-
-To test that everything works, assuming VM and x3270,
- 1. Bring up an x3270 window.
- 2. Use the DIAL command in that window.
- 3. You should immediately see a Linux login screen.
-
-Here are the installation steps in detail:
-
- 1. The 3270 driver is a part of the official Linux kernel
- source. Build a tree with the kernel source and any necessary
- patches. Then do
- make oldconfig
- (If you wish to disable 3215 console support, edit
- .config; change CONFIG_TN3215's value to "n";
- and rerun "make oldconfig".)
- make image
- make modules
- make modules_install
-
- 2. (Perform this step only if you have configured tub3270 as a
- module.) Add a line to a file /etc/modprobe.d/*.conf to automatically
- load the driver when it's needed. With this line added, you will see
- login prompts appear on your 3270s as soon as boot is complete (or
- with emulated 3270s, as soon as you dial into your vm guest using the
- command "DIAL <vmguestname>"). Since the line-mode major number is
- 227, the line to add should be:
- alias char-major-227 tub3270
-
- 3. Define graphic devices to your vm guest machine, if you
- haven't already. Define them before you reboot (reipl):
- DEFINE GRAF 620
- DEFINE GRAF 621
- DEFINE GRAF 622
- DEFINE GRAF 623
-
- 4. Reboot. The reboot process scans hardware devices, including
- 3270s, and this enables the tub3270 driver once loaded to respond
- correctly to the configuration requests of the next step. If
- you have chosen 3270 console support, your console now behaves
- as a 3270, not a 3215.
-
- 5. Run the 3270 configuration script config3270. It is
- distributed in this same directory, Documentation/s390, as
- config3270.sh. Inspect the output script it produces,
- /tmp/mkdev3270, and then run that script. This will create the
- necessary character special device files and make the necessary
- changes to /etc/inittab.
-
- Then notify /sbin/init that /etc/inittab has changed, by issuing
- the telinit command with the q operand:
- cd Documentation/s390
- sh config3270.sh
- sh /tmp/mkdev3270
- telinit q
-
- This should be sufficient for your first time. If your 3270
- configuration has changed and you're reusing config3270, you
- should follow these steps:
- Change 3270 configuration
- Reboot
- Run config3270 and /tmp/mkdev3270
- Reboot
-
-Here are the testing steps in detail:
-
- 1. Bring up an x3270 window, or use an actual hardware 3278 or
- 3279, or use the 3270 emulator of your choice. You would be
- running the emulator on your PC or workstation. You would use
- the command, for example,
- x3270 vm-esa-domain-name &
- if you wanted a 3278 Model 4 with 43 rows of 80 columns, the
- default model number. The driver does not take advantage of
- extended attributes.
-
- The screen you should now see contains a VM logo with input
- lines near the bottom. Use TAB to move to the bottom line,
- probably labeled "COMMAND ===>".
-
- 2. Use the DIAL command instead of the LOGIN command to connect
- to one of the virtual 3270s you defined with the DEF GRAF
- commands:
- dial my-vm-guest-name
-
- 3. You should immediately see a login prompt from your
- Linux-390 operating system. If that does not happen, you would
- see instead the line "DIALED TO my-vm-guest-name 0620".
-
- To troubleshoot: do these things.
-
- A. Is the driver loaded? Use the lsmod command (no operands)
- to find out. Probably it isn't. Try loading it manually, with
- the command "insmod tub3270". Does that command give error
- messages? Ha! There's your problem.
-
- B. Is the /etc/inittab file modified as in installation step 3
- above? Use the grep command to find out; for instance, issue
- "grep 3270 /etc/inittab". Nothing found? There's your
- problem!
-
- C. Are the device special files created, as in installation
- step 2 above? Use the ls -l command to find out; for instance,
- issue "ls -l /dev/3270/tty620". The output should start with the
- letter "c" meaning character device and should contain "227, 1"
- just to the left of the device name. No such file? no "c"?
- Wrong major number? Wrong minor number? There's your
- problem!
-
- D. Do you get the message
- "HCPDIA047E my-vm-guest-name 0620 does not exist"?
- If so, you must issue the command "DEF GRAF 620" from your VM
- 3215 console and then reboot the system.
-
-
-
-OPERATION.
-
-The driver defines three areas on the 3270 screen: the log area, the
-input area, and the status area.
-
-The log area takes up all but the bottom two lines of the screen. The
-driver writes terminal output to it, starting at the top line and going
-down. When it fills, the status area changes from "Linux Running" to
-"Linux More...". After a scrolling timeout of (default) 5 sec, the
-screen clears and more output is written, from the top down.
-
-The input area extends from the beginning of the second-to-last screen
-line to the start of the status area. You type commands in this area
-and hit ENTER to execute them.
-
-The status area initializes to "Linux Running" to give you a warm
-fuzzy feeling. When the log area fills up and output awaits, it
-changes to "Linux More...". At this time you can do several things or
-nothing. If you do nothing, the screen will clear in (default) 5 sec
-and more output will appear. You may hit ENTER with nothing typed in
-the input area to toggle between "Linux More..." and "Linux Holding",
-which indicates no scrolling will occur. (If you hit ENTER with "Linux
-Running" and nothing typed, the application receives a newline.)
-
-You may change the scrolling timeout value. For example, the following
-command line:
- echo scrolltime=60 > /proc/tty/driver/tty3270
-changes the scrolling timeout value to 60 sec. Set scrolltime to 0 if
-you wish to prevent scrolling entirely.
-
-Other things you may do when the log area fills up are: hit PA2 to
-clear the log area and write more output to it, or hit CLEAR to clear
-the log area and the input area and write more output to the log area.
-
-Some of the Program Function (PF) and Program Attention (PA) keys are
-preassigned special functions. The ones that are not yield an alarm
-when pressed.
-
-PA1 causes a SIGINT to the currently running application. You may do
-the same thing from the input area, by typing "^C" and hitting ENTER.
-
-PA2 causes the log area to be cleared. If output awaits, it is then
-written to the log area.
-
-PF3 causes an EOF to be received as input by the application. You may
-cause an EOF also by typing "^D" and hitting ENTER.
-
-No PF key is preassigned to cause a job suspension, but you may cause a
-job suspension by typing "^Z" and hitting ENTER. You may wish to
-assign this function to a PF key. To make PF7 cause job suspension,
-execute the command:
- echo pf7=^z > /proc/tty/driver/tty3270
-
-If the input you type does not end with the two characters "^n", the
-driver appends a newline character and sends it to the tty driver;
-otherwise the driver strips the "^n" and does not append a newline.
-The IBM 3215 driver behaves similarly.
-
-Pf10 causes the most recent command to be retrieved from the tube's
-command stack (default depth 20) and displayed in the input area. You
-may hit PF10 again for the next-most-recent command, and so on. A
-command is entered into the stack only when the input area is not made
-invisible (such as for password entry) and it is not identical to the
-current top entry. PF10 rotates backward through the command stack;
-PF11 rotates forward. You may assign the backward function to any PF
-key (or PA key, for that matter), say, PA3, with the command:
- echo -e pa3=\\033k > /proc/tty/driver/tty3270
-This assigns the string ESC-k to PA3. Similarly, the string ESC-j
-performs the forward function. (Rationale: In bash with vi-mode line
-editing, ESC-k and ESC-j retrieve backward and forward history.
-Suggestions welcome.)
-
-Is a stack size of twenty commands not to your liking? Change it on
-the fly. To change to saving the last 100 commands, execute the
-command:
- echo recallsize=100 > /proc/tty/driver/tty3270
-
-Have a command you issue frequently? Assign it to a PF or PA key! Use
-the command
- echo pf24="mkdir foobar; cd foobar" > /proc/tty/driver/tty3270
-to execute the commands mkdir foobar and cd foobar immediately when you
-hit PF24. Want to see the command line first, before you execute it?
-Use the -n option of the echo command:
- echo -n pf24="mkdir foo; cd foo" > /proc/tty/driver/tty3270
-
-
-
-Happy testing! I welcome any and all comments about this document, the
-driver, etc etc.
-
-Dick Hitt <rbh00@utsglobal.com>
diff --git a/Documentation/s390/CommonIO b/Documentation/s390/CommonIO
deleted file mode 100644
index 6e0f63f343b4..000000000000
--- a/Documentation/s390/CommonIO
+++ /dev/null
@@ -1,125 +0,0 @@
-S/390 common I/O-Layer - command line parameters, procfs and debugfs entries
-============================================================================
-
-Command line parameters
------------------------
-
-* ccw_timeout_log
-
- Enable logging of debug information in case of ccw device timeouts.
-
-* cio_ignore = device[,device[,..]]
-
- device := {all | [!]ipldev | [!]condev | [!]<devno> | [!]<devno>-<devno>}
-
- The given devices will be ignored by the common I/O-layer; no detection
- and device sensing will be done on any of those devices. The subchannel to
- which the device in question is attached will be treated as if no device was
- attached.
-
- An ignored device can be un-ignored later; see the "/proc entries"-section for
- details.
-
- The devices must be given either as bus ids (0.x.abcd) or as hexadecimal
- device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you
- give a device number 0xabcd, it will be interpreted as 0.0.abcd.
-
- You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev'
- keywords can be used to refer to the CCW based boot device and CCW console
- device respectively (these are probably useful only when combined with the '!'
- operator). The '!' operator will cause the I/O-layer to _not_ ignore a device.
- The command line is parsed from left to right.
-
- For example,
- cio_ignore=0.0.0023-0.0.0042,0.0.4711
- will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device
- 0.0.4711, if detected.
- As another example,
- cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02
- will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02.
-
- By default, no devices are ignored.
-
-
-/proc entries
--------------
-
-* /proc/cio_ignore
-
- Lists the ranges of devices (by bus id) which are ignored by common I/O.
-
- You can un-ignore certain or all devices by piping to /proc/cio_ignore.
- "free all" will un-ignore all ignored devices,
- "free <device range>, <device range>, ..." will un-ignore the specified
- devices.
-
- For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored,
- - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore
- will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023
- to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored;
- - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device
- 0.0.0041;
- - echo free all > /proc/cio_ignore will un-ignore all remaining ignored
- devices.
-
- When a device is un-ignored, device recognition and sensing is performed and
- the device driver will be notified if possible, so the device will become
- available to the system. Note that un-ignoring is performed asynchronously.
-
- You can also add ranges of devices to be ignored by piping to
- /proc/cio_ignore; "add <device range>, <device range>, ..." will ignore the
- specified devices.
-
- Note: While already known devices can be added to the list of devices to be
- ignored, there will be no effect on then. However, if such a device
- disappears and then reappears, it will then be ignored. To make
- known devices go away, you need the "purge" command (see below).
-
- For example,
- "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore"
- will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
- devices.
-
- You can remove already known but now ignored devices via
- "echo purge > /proc/cio_ignore"
- All devices ignored but still registered and not online (= not in use)
- will be deregistered and thus removed from the system.
-
- The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
- compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
- numbers given as 0xabcd will be interpreted as 0.0.abcd.
-
-* /proc/cio_settle
-
- A write request to this file is blocked until all queued cio actions are
- handled. This will allow userspace to wait for pending work affecting
- device availability after changing cio_ignore or the hardware configuration.
-
-* For some of the information present in the /proc filesystem in 2.4 (namely,
- /proc/subchannels and /proc/chpids), see driver-model.txt.
- Information formerly in /proc/irq_count is now in /proc/interrupts.
-
-
-debugfs entries
----------------
-
-* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature)
-
- Some views generated by the debug feature to hold various debug outputs.
-
- - /sys/kernel/debug/s390dbf/cio_crw/sprintf
- Messages from the processing of pending channel report words (machine check
- handling).
-
- - /sys/kernel/debug/s390dbf/cio_msg/sprintf
- Various debug messages from the common I/O-layer.
-
- - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
- Logs the calling of functions in the common I/O-layer and, if applicable,
- which subchannel they were called for, as well as dumps of some data
- structures (like irb in an error case).
-
- The level of logging can be changed to be more or less verbose by piping to
- /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the
- documentation on the S/390 debug feature (Documentation/s390/s390dbf.txt)
- for details.
diff --git a/Documentation/s390/DASD b/Documentation/s390/DASD
deleted file mode 100644
index 9963f1e9c98a..000000000000
--- a/Documentation/s390/DASD
+++ /dev/null
@@ -1,73 +0,0 @@
-DASD device driver
-
-S/390's disk devices (DASDs) are managed by Linux via the DASD device
-driver. It is valid for all types of DASDs and represents them to
-Linux as block devices, namely "dd". Currently the DASD driver uses a
-single major number (254) and 4 minor numbers per volume (1 for the
-physical volume and 3 for partitions). With respect to partitions see
-below. Thus you may have up to 64 DASD devices in your system.
-
-The kernel parameter 'dasd=from-to,...' may be issued arbitrary times
-in the kernel's parameter line or not at all. The 'from' and 'to'
-parameters are to be given in hexadecimal notation without a leading
-0x.
-If you supply kernel parameters the different instances are processed
-in order of appearance and a minor number is reserved for any device
-covered by the supplied range up to 64 volumes. Additional DASDs are
-ignored. If you do not supply the 'dasd=' kernel parameter at all, the
-DASD driver registers all supported DASDs of your system to a minor
-number in ascending order of the subchannel number.
-
-The driver currently supports ECKD-devices and there are stubs for
-support of the FBA and CKD architectures. For the FBA architecture
-only some smart data structures are missing to make the support
-complete.
-We performed our testing on 3380 and 3390 type disks of different
-sizes, under VM and on the bare hardware (LPAR), using internal disks
-of the multiprise as well as a RAMAC virtual array. Disks exported by
-an Enterprise Storage Server (Seascape) should work fine as well.
-
-We currently implement one partition per volume, which is the whole
-volume, skipping the first blocks up to the volume label. These are
-reserved for IPL records and IBM's volume label to assure
-accessibility of the DASD from other OSs. In a later stage we will
-provide support of partitions, maybe VTOC oriented or using a kind of
-partition table in the label record.
-
-USAGE
-
--Low-level format (?CKD only)
-For using an ECKD-DASD as a Linux harddisk you have to low-level
-format the tracks by issuing the BLKDASDFORMAT-ioctl on that
-device. This will erase any data on that volume including IBM volume
-labels, VTOCs etc. The ioctl may take a 'struct format_data *' or
-'NULL' as an argument.
-typedef struct {
- int start_unit;
- int stop_unit;
- int blksize;
-} format_data_t;
-When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole
-disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit
-and stop_unit are the first and last track to be formatted. If
-stop_unit is -1 it implies that the DASD is formatted from start_unit
-up to the last track. blksize can be any power of two between 512 and
-4096. We recommend no blksize lower than 1024 because the ext2fs uses
-1kB blocks anyway and you gain approx. 50% of capacity increasing your
-blksize from 512 byte to 1kB.
-
--Make a filesystem
-Then you can mk??fs the filesystem of your choice on that volume or
-partition. For reasons of sanity you should build your filesystem on
-the partition /dev/dd?1 instead of the whole volume. You only lose 3kB
-but may be sure that you can reuse your data after introduction of a
-real partition table.
-
-BUGS:
-- Performance sometimes is rather low because we don't fully exploit clustering
-
-TODO-List:
-- Add IBM'S Disk layout to genhd
-- Enhance driver to use more than one major number
-- Enable usage as a module
-- Support Cache fast write and DASD fast write (ECKD)
diff --git a/Documentation/s390/Debugging390.txt b/Documentation/s390/Debugging390.txt
deleted file mode 100644
index 5ae7f868a007..000000000000
--- a/Documentation/s390/Debugging390.txt
+++ /dev/null
@@ -1,2142 +0,0 @@
-
- Debugging on Linux for s/390 & z/Architecture
- by
- Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
- Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation
- Best viewed with fixed width fonts
-
-Overview of Document:
-=====================
-This document is intended to give a good overview of how to debug Linux for
-s/390 and z/Architecture. It is not intended as a complete reference and not a
-tutorial on the fundamentals of C & assembly. It doesn't go into
-390 IO in any detail. It is intended to complement the documents in the
-reference section below & any other worthwhile references you get.
-
-It is intended like the Enterprise Systems Architecture/390 Reference Summary
-to be printed out & used as a quick cheat sheet self help style reference when
-problems occur.
-
-Contents
-========
-Register Set
-Address Spaces on Intel Linux
-Address Spaces on Linux for s/390 & z/Architecture
-The Linux for s/390 & z/Architecture Kernel Task Structure
-Register Usage & Stackframes on Linux for s/390 & z/Architecture
-A sample program with comments
-Compiling programs for debugging on Linux for s/390 & z/Architecture
-Debugging under VM
-s/390 & z/Architecture IO Overview
-Debugging IO on s/390 & z/Architecture under VM
-GDB on s/390 & z/Architecture
-Stack chaining in gdb by hand
-Examining core dumps
-ldd
-Debugging modules
-The proc file system
-SysRq
-References
-Special Thanks
-
-Register Set
-============
-The current architectures have the following registers.
-
-16 General propose registers, 32 bit on s/390 and 64 bit on z/Architecture,
-r0-r15 (or gpr0-gpr15), used for arithmetic and addressing.
-
-16 Control registers, 32 bit on s/390 and 64 bit on z/Architecture, cr0-cr15,
-kernel usage only, used for memory management, interrupt control, debugging
-control etc.
-
-16 Access registers (ar0-ar15), 32 bit on both s/390 and z/Architecture,
-normally not used by normal programs but potentially could be used as
-temporary storage. These registers have a 1:1 association with general
-purpose registers and are designed to be used in the so-called access
-register mode to select different address spaces.
-Access register 0 (and access register 1 on z/Architecture, which needs a
-64 bit pointer) is currently used by the pthread library as a pointer to
-the current running threads private area.
-
-16 64 bit floating point registers (fp0-fp15 ) IEEE & HFP floating
-point format compliant on G5 upwards & a Floating point control reg (FPC)
-4 64 bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines.
-Note:
-Linux (currently) always uses IEEE & emulates G5 IEEE format on older machines,
-( provided the kernel is configured for this ).
-
-
-The PSW is the most important register on the machine it
-is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of
-a program counter (pc), condition code register,memory space designator.
-In IBM standard notation I am counting bit 0 as the MSB.
-It has several advantages over a normal program counter
-in that you can change address translation & program counter
-in a single instruction. To change address translation,
-e.g. switching address translation off requires that you
-have a logical=physical mapping for the address you are
-currently running at.
-
- Bit Value
-s/390 z/Architecture
-0 0 Reserved ( must be 0 ) otherwise specification exception occurs.
-
-1 1 Program Event Recording 1 PER enabled,
- PER is used to facilitate debugging e.g. single stepping.
-
-2-4 2-4 Reserved ( must be 0 ).
-
-5 5 Dynamic address translation 1=DAT on.
-
-6 6 Input/Output interrupt Mask
-
-7 7 External interrupt Mask used primarily for interprocessor
- signalling and clock interrupts.
-
-8-11 8-11 PSW Key used for complex memory protection mechanism
- (not used under linux)
-
-12 12 1 on s/390 0 on z/Architecture
-
-13 13 Machine Check Mask 1=enable machine check interrupts
-
-14 14 Wait State. Set this to 1 to stop the processor except for
- interrupts and give time to other LPARS. Used in CPU idle in
- the kernel to increase overall usage of processor resources.
-
-15 15 Problem state ( if set to 1 certain instructions are disabled )
- all linux user programs run with this bit 1
- ( useful info for debugging under VM ).
-
-16-17 16-17 Address Space Control
-
- 00 Primary Space Mode:
- The register CR1 contains the primary address-space control ele-
- ment (PASCE), which points to the primary space region/segment
- table origin.
-
- 01 Access register mode
-
- 10 Secondary Space Mode:
- The register CR7 contains the secondary address-space control
- element (SASCE), which points to the secondary space region or
- segment table origin.
-
- 11 Home Space Mode:
- The register CR13 contains the home space address-space control
- element (HASCE), which points to the home space region/segment
- table origin.
-
- See "Address Spaces on Linux for s/390 & z/Architecture" below
- for more information about address space usage in Linux.
-
-18-19 18-19 Condition codes (CC)
-
-20 20 Fixed point overflow mask if 1=FPU exceptions for this event
- occur ( normally 0 )
-
-21 21 Decimal overflow mask if 1=FPU exceptions for this event occur
- ( normally 0 )
-
-22 22 Exponent underflow mask if 1=FPU exceptions for this event occur
- ( normally 0 )
-
-23 23 Significance Mask if 1=FPU exceptions for this event occur
- ( normally 0 )
-
-24-31 24-30 Reserved Must be 0.
-
- 31 Extended Addressing Mode
- 32 Basic Addressing Mode
- Used to set addressing mode
- PSW 31 PSW 32
- 0 0 24 bit
- 0 1 31 bit
- 1 1 64 bit
-
-32 1=31 bit addressing mode 0=24 bit addressing mode (for backward
- compatibility), linux always runs with this bit set to 1
-
-33-64 Instruction address.
- 33-63 Reserved must be 0
- 64-127 Address
- In 24 bits mode bits 64-103=0 bits 104-127 Address
- In 31 bits mode bits 64-96=0 bits 97-127 Address
- Note: unlike 31 bit mode on s/390 bit 96 must be zero
- when loading the address with LPSWE otherwise a
- specification exception occurs, LPSW is fully backward
- compatible.
-
-
-Prefix Page(s)
---------------
-This per cpu memory area is too intimately tied to the processor not to mention.
-It exists between the real addresses 0-4096 on s/390 and between 0-8192 on
-z/Architecture and is exchanged with one page on s/390 or two pages on
-z/Architecture in absolute storage by the set prefix instruction during Linux
-startup.
-This page is mapped to a different prefix for each processor in an SMP
-configuration (assuming the OS designer is sane of course).
-Bytes 0-512 (200 hex) on s/390 and 0-512, 4096-4544, 4604-5119 currently on
-z/Architecture are used by the processor itself for holding such information
-as exception indications and entry points for exceptions.
-Bytes after 0xc00 hex are used by linux for per processor globals on s/390 and
-z/Architecture (there is a gap on z/Architecture currently between 0xc00 and
-0x1000, too, which is used by Linux).
-The closest thing to this on traditional architectures is the interrupt
-vector table. This is a good thing & does simplify some of the kernel coding
-however it means that we now cannot catch stray NULL pointers in the
-kernel without hard coded checks.
-
-
-
-Address Spaces on Intel Linux
-=============================
-
-The traditional Intel Linux is approximately mapped as follows forgive
-the ascii art.
-0xFFFFFFFF 4GB Himem *****************
- * *
- * Kernel Space *
- * *
- ***************** ****************
-User Space Himem * User Stack * * *
-(typically 0xC0000000 3GB ) ***************** * *
- * Shared Libs * * Next Process *
- ***************** * to *
- * * <== * Run * <==
- * User Program * * *
- * Data BSS * * *
- * Text * * *
- * Sections * * *
-0x00000000 ***************** ****************
-
-Now it is easy to see that on Intel it is quite easy to recognise a kernel
-address as being one greater than user space himem (in this case 0xC0000000),
-and addresses of less than this are the ones in the current running program on
-this processor (if an smp box).
-If using the virtual machine ( VM ) as a debugger it is quite difficult to
-know which user process is running as the address space you are looking at
-could be from any process in the run queue.
-
-The limitation of Intels addressing technique is that the linux
-kernel uses a very simple real address to virtual addressing technique
-of Real Address=Virtual Address-User Space Himem.
-This means that on Intel the kernel linux can typically only address
-Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines
-can typically use.
-They can lower User Himem to 2GB or lower & thus be
-able to use 2GB of RAM however this shrinks the maximum size
-of User Space from 3GB to 2GB they have a no win limit of 4GB unless
-they go to 64 Bit.
-
-
-On 390 our limitations & strengths make us slightly different.
-For backward compatibility we are only allowed use 31 bits (2GB)
-of our 32 bit addresses, however, we use entirely separate address
-spaces for the user & kernel.
-
-This means we can support 2GB of non Extended RAM on s/390, & more
-with the Extended memory management swap device &
-currently 4TB of physical memory currently on z/Architecture.
-
-
-Address Spaces on Linux for s/390 & z/Architecture
-==================================================
-
-Our addressing scheme is basically as follows:
-
- Primary Space Home Space
-Himem 0x7fffffff 2GB on s/390 ***************** ****************
-currently 0x3ffffffffff (2^42)-1 * User Stack * * *
-on z/Architecture. ***************** * *
- * Shared Libs * * *
- ***************** * *
- * * * Kernel *
- * User Program * * *
- * Data BSS * * *
- * Text * * *
- * Sections * * *
-0x00000000 ***************** ****************
-
-This also means that we need to look at the PSW problem state bit and the
-addressing mode to decide whether we are looking at user or kernel space.
-
-User space runs in primary address mode (or access register mode within
-the vdso code).
-
-The kernel usually also runs in home space mode, however when accessing
-user space the kernel switches to primary or secondary address mode if
-the mvcos instruction is not available or if a compare-and-swap (futex)
-instruction on a user space address is performed.
-
-When also looking at the ASCE control registers, this means:
-
-User space:
-- runs in primary or access register mode
-- cr1 contains the user asce
-- cr7 contains the user asce
-- cr13 contains the kernel asce
-
-Kernel space:
-- runs in home space mode
-- cr1 contains the user or kernel asce
- -> the kernel asce is loaded when a uaccess requires primary or
- secondary address mode
-- cr7 contains the user or kernel asce, (changed with set_fs())
-- cr13 contains the kernel asce
-
-In case of uaccess the kernel changes to:
-- primary space mode in case of a uaccess (copy_to_user) and uses
- e.g. the mvcp instruction to access user space. However the kernel
- will stay in home space mode if the mvcos instruction is available
-- secondary space mode in case of futex atomic operations, so that the
- instructions come from primary address space and data from secondary
- space
-
-In case of KVM, the kernel runs in home space mode, but cr1 gets switched
-to contain the gmap asce before the SIE instruction gets executed. When
-the SIE instruction is finished, cr1 will be switched back to contain the
-user asce.
-
-
-Virtual Addresses on s/390 & z/Architecture
-===========================================
-
-A virtual address on s/390 is made up of 3 parts
-The SX (segment index, roughly corresponding to the PGD & PMD in Linux
-terminology) being bits 1-11.
-The PX (page index, corresponding to the page table entry (pte) in Linux
-terminology) being bits 12-19.
-The remaining bits BX (the byte index are the offset in the page )
-i.e. bits 20 to 31.
-
-On z/Architecture in linux we currently make up an address from 4 parts.
-The region index bits (RX) 0-32 we currently use bits 22-32
-The segment index (SX) being bits 33-43
-The page index (PX) being bits 44-51
-The byte index (BX) being bits 52-63
-
-Notes:
-1) s/390 has no PMD so the PMD is really the PGD also.
-A lot of this stuff is defined in pgtable.h.
-
-2) Also seeing as s/390's page indexes are only 1k in size
-(bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k )
-to make the best use of memory by updating 4 segment indices
-entries each time we mess with a PMD & use offsets
-0,1024,2048 & 3072 in this page as for our segment indexes.
-On z/Architecture our page indexes are now 2k in size
-( bits 12-19 x 8 bytes per pte ) we do a similar trick
-but only mess with 2 segment indices each time we mess with
-a PMD.
-
-3) As z/Architecture supports up to a massive 5-level page table lookup we
-can only use 3 currently on Linux ( as this is all the generic kernel
-currently supports ) however this may change in future
-this allows us to access ( according to my sums )
-4TB of virtual storage per process i.e.
-4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes,
-enough for another 2 or 3 of years I think :-).
-to do this we use a region-third-table designation type in
-our address space control registers.
-
-
-The Linux for s/390 & z/Architecture Kernel Task Structure
-==========================================================
-Each process/thread under Linux for S390 has its own kernel task_struct
-defined in linux/include/linux/sched.h
-The S390 on initialisation & resuming of a process on a cpu sets
-the __LC_KERNEL_STACK variable in the spare prefix area for this cpu
-(which we use for per-processor globals).
-
-The kernel stack pointer is intimately tied with the task structure for
-each processor as follows.
-
- s/390
- ************************
- * 1 page kernel stack *
- * ( 4K ) *
- ************************
- * 1 page task_struct *
- * ( 4K ) *
-8K aligned ************************
-
- z/Architecture
- ************************
- * 2 page kernel stack *
- * ( 8K ) *
- ************************
- * 2 page task_struct *
- * ( 8K ) *
-16K aligned ************************
-
-What this means is that we don't need to dedicate any register or global
-variable to point to the current running process & can retrieve it with the
-following very simple construct for s/390 & one very similar for z/Architecture.
-
-static inline struct task_struct * get_current(void)
-{
- struct task_struct *current;
- __asm__("lhi %0,-8192\n\t"
- "nr %0,15"
- : "=r" (current) );
- return current;
-}
-
-i.e. just anding the current kernel stack pointer with the mask -8192.
-Thankfully because Linux doesn't have support for nested IO interrupts
-& our devices have large buffers can survive interrupts being shut for
-short amounts of time we don't need a separate stack for interrupts.
-
-
-
-
-Register Usage & Stackframes on Linux for s/390 & z/Architecture
-=================================================================
-Overview:
----------
-This is the code that gcc produces at the top & the bottom of
-each function. It usually is fairly consistent & similar from
-function to function & if you know its layout you can probably
-make some headway in finding the ultimate cause of a problem
-after a crash without a source level debugger.
-
-Note: To follow stackframes requires a knowledge of C or Pascal &
-limited knowledge of one assembly language.
-
-It should be noted that there are some differences between the
-s/390 and z/Architecture stack layouts as the z/Architecture stack layout
-didn't have to maintain compatibility with older linkage formats.
-
-Glossary:
----------
-alloca:
-This is a built in compiler function for runtime allocation
-of extra space on the callers stack which is obviously freed
-up on function exit ( e.g. the caller may choose to allocate nothing
-of a buffer of 4k if required for temporary purposes ), it generates
-very efficient code ( a few cycles ) when compared to alternatives
-like malloc.
-
-automatics: These are local variables on the stack,
-i.e they aren't in registers & they aren't static.
-
-back-chain:
-This is a pointer to the stack pointer before entering a
-framed functions ( see frameless function ) prologue got by
-dereferencing the address of the current stack pointer,
- i.e. got by accessing the 32 bit value at the stack pointers
-current location.
-
-base-pointer:
-This is a pointer to the back of the literal pool which
-is an area just behind each procedure used to store constants
-in each function.
-
-call-clobbered: The caller probably needs to save these registers if there
-is something of value in them, on the stack or elsewhere before making a
-call to another procedure so that it can restore it later.
-
-epilogue:
-The code generated by the compiler to return to the caller.
-
-frameless-function
-A frameless function in Linux for s390 & z/Architecture is one which doesn't
-need more than the register save area (96 bytes on s/390, 160 on z/Architecture)
-given to it by the caller.
-A frameless function never:
-1) Sets up a back chain.
-2) Calls alloca.
-3) Calls other normal functions
-4) Has automatics.
-
-GOT-pointer:
-This is a pointer to the global-offset-table in ELF
-( Executable Linkable Format, Linux'es most common executable format ),
-all globals & shared library objects are found using this pointer.
-
-lazy-binding
-ELF shared libraries are typically only loaded when routines in the shared
-library are actually first called at runtime. This is lazy binding.
-
-procedure-linkage-table
-This is a table found from the GOT which contains pointers to routines
-in other shared libraries which can't be called to by easier means.
-
-prologue:
-The code generated by the compiler to set up the stack frame.
-
-outgoing-args:
-This is extra area allocated on the stack of the calling function if the
-parameters for the callee's cannot all be put in registers, the same
-area can be reused by each function the caller calls.
-
-routine-descriptor:
-A COFF executable format based concept of a procedure reference
-actually being 8 bytes or more as opposed to a simple pointer to the routine.
-This is typically defined as follows
-Routine Descriptor offset 0=Pointer to Function
-Routine Descriptor offset 4=Pointer to Table of Contents
-The table of contents/TOC is roughly equivalent to a GOT pointer.
-& it means that shared libraries etc. can be shared between several
-environments each with their own TOC.
-
-
-static-chain: This is used in nested functions a concept adopted from pascal
-by gcc not used in ansi C or C++ ( although quite useful ), basically it
-is a pointer used to reference local variables of enclosing functions.
-You might come across this stuff once or twice in your lifetime.
-
-e.g.
-The function below should return 11 though gcc may get upset & toss warnings
-about unused variables.
-int FunctionA(int a)
-{
- int b;
- FunctionC(int c)
- {
- b=c+1;
- }
- FunctionC(10);
- return(b);
-}
-
-
-s/390 & z/Architecture Register usage
-=====================================
-r0 used by syscalls/assembly call-clobbered
-r1 used by syscalls/assembly call-clobbered
-r2 argument 0 / return value 0 call-clobbered
-r3 argument 1 / return value 1 (if long long) call-clobbered
-r4 argument 2 call-clobbered
-r5 argument 3 call-clobbered
-r6 argument 4 saved
-r7 pointer-to arguments 5 to ... saved
-r8 this & that saved
-r9 this & that saved
-r10 static-chain ( if nested function ) saved
-r11 frame-pointer ( if function used alloca ) saved
-r12 got-pointer saved
-r13 base-pointer saved
-r14 return-address saved
-r15 stack-pointer saved
-
-f0 argument 0 / return value ( float/double ) call-clobbered
-f2 argument 1 call-clobbered
-f4 z/Architecture argument 2 saved
-f6 z/Architecture argument 3 saved
-The remaining floating points
-f1,f3,f5 f7-f15 are call-clobbered.
-
-Notes:
-------
-1) The only requirement is that registers which are used
-by the callee are saved, e.g. the compiler is perfectly
-capable of using r11 for purposes other than a frame a
-frame pointer if a frame pointer is not needed.
-2) In functions with variable arguments e.g. printf the calling procedure
-is identical to one without variable arguments & the same number of
-parameters. However, the prologue of this function is somewhat more
-hairy owing to it having to move these parameters to the stack to
-get va_start, va_arg & va_end to work.
-3) Access registers are currently unused by gcc but are used in
-the kernel. Possibilities exist to use them at the moment for
-temporary storage but it isn't recommended.
-4) Only 4 of the floating point registers are used for
-parameter passing as older machines such as G3 only have only 4
-& it keeps the stack frame compatible with other compilers.
-However with IEEE floating point emulation under linux on the
-older machines you are free to use the other 12.
-5) A long long or double parameter cannot be have the
-first 4 bytes in a register & the second four bytes in the
-outgoing args area. It must be purely in the outgoing args
-area if crossing this boundary.
-6) Floating point parameters are mixed with outgoing args
-on the outgoing args area in the order the are passed in as parameters.
-7) Floating point arguments 2 & 3 are saved in the outgoing args area for
-z/Architecture
-
-
-Stack Frame Layout
-------------------
-s/390 z/Architecture
-0 0 back chain ( a 0 here signifies end of back chain )
-4 8 eos ( end of stack, not used on Linux for S390 used in other linkage formats )
-8 16 glue used in other s/390 linkage formats for saved routine descriptors etc.
-12 24 glue used in other s/390 linkage formats for saved routine descriptors etc.
-16 32 scratch area
-20 40 scratch area
-24 48 saved r6 of caller function
-28 56 saved r7 of caller function
-32 64 saved r8 of caller function
-36 72 saved r9 of caller function
-40 80 saved r10 of caller function
-44 88 saved r11 of caller function
-48 96 saved r12 of caller function
-52 104 saved r13 of caller function
-56 112 saved r14 of caller function
-60 120 saved r15 of caller function
-64 128 saved f4 of caller function
-72 132 saved f6 of caller function
-80 undefined
-96 160 outgoing args passed from caller to callee
-96+x 160+x possible stack alignment ( 8 bytes desirable )
-96+x+y 160+x+y alloca space of caller ( if used )
-96+x+y+z 160+x+y+z automatics of caller ( if used )
-0 back-chain
-
-A sample program with comments.
-===============================
-
-Comments on the function test
------------------------------
-1) It didn't need to set up a pointer to the constant pool gpr13 as it is not
-used ( :-( ).
-2) This is a frameless function & no stack is bought.
-3) The compiler was clever enough to recognise that it could return the
-value in r2 as well as use it for the passed in parameter ( :-) ).
-4) The basr ( branch relative & save ) trick works as follows the instruction
-has a special case with r0,r0 with some instruction operands is understood as
-the literal value 0, some risc architectures also do this ). So now
-we are branching to the next address & the address new program counter is
-in r13,so now we subtract the size of the function prologue we have executed
-+ the size of the literal pool to get to the top of the literal pool
-0040037c int test(int b)
-{ # Function prologue below
- 40037c: 90 de f0 34 stm %r13,%r14,52(%r15) # Save registers r13 & r14
- 400380: 0d d0 basr %r13,%r0 # Set up pointer to constant pool using
- 400382: a7 da ff fa ahi %r13,-6 # basr trick
- return(5+b);
- # Huge main program
- 400386: a7 2a 00 05 ahi %r2,5 # add 5 to r2
-
- # Function epilogue below
- 40038a: 98 de f0 34 lm %r13,%r14,52(%r15) # restore registers r13 & 14
- 40038e: 07 fe br %r14 # return
-}
-
-Comments on the function main
------------------------------
-1) The compiler did this function optimally ( 8-) )
-
-Literal pool for main.
-400390: ff ff ff ec .long 0xffffffec
-main(int argc,char *argv[])
-{ # Function prologue below
- 400394: 90 bf f0 2c stm %r11,%r15,44(%r15) # Save necessary registers
- 400398: 18 0f lr %r0,%r15 # copy stack pointer to r0
- 40039a: a7 fa ff a0 ahi %r15,-96 # Make area for callee saving
- 40039e: 0d d0 basr %r13,%r0 # Set up r13 to point to
- 4003a0: a7 da ff f0 ahi %r13,-16 # literal pool
- 4003a4: 50 00 f0 00 st %r0,0(%r15) # Save backchain
-
- return(test(5)); # Main Program Below
- 4003a8: 58 e0 d0 00 l %r14,0(%r13) # load relative address of test from
- # literal pool
- 4003ac: a7 28 00 05 lhi %r2,5 # Set first parameter to 5
- 4003b0: 4d ee d0 00 bas %r14,0(%r14,%r13) # jump to test setting r14 as return
- # address using branch & save instruction.
-
- # Function Epilogue below
- 4003b4: 98 bf f0 8c lm %r11,%r15,140(%r15)# Restore necessary registers.
- 4003b8: 07 fe br %r14 # return to do program exit
-}
-
-
-Compiler updates
-----------------
-
-main(int argc,char *argv[])
-{
- 4004fc: 90 7f f0 1c stm %r7,%r15,28(%r15)
- 400500: a7 d5 00 04 bras %r13,400508 <main+0xc>
- 400504: 00 40 04 f4 .long 0x004004f4
- # compiler now puts constant pool in code to so it saves an instruction
- 400508: 18 0f lr %r0,%r15
- 40050a: a7 fa ff a0 ahi %r15,-96
- 40050e: 50 00 f0 00 st %r0,0(%r15)
- return(test(5));
- 400512: 58 10 d0 00 l %r1,0(%r13)
- 400516: a7 28 00 05 lhi %r2,5
- 40051a: 0d e1 basr %r14,%r1
- # compiler adds 1 extra instruction to epilogue this is done to
- # avoid processor pipeline stalls owing to data dependencies on g5 &
- # above as register 14 in the old code was needed directly after being loaded
- # by the lm %r11,%r15,140(%r15) for the br %14.
- 40051c: 58 40 f0 98 l %r4,152(%r15)
- 400520: 98 7f f0 7c lm %r7,%r15,124(%r15)
- 400524: 07 f4 br %r4
-}
-
-
-Hartmut ( our compiler developer ) also has been threatening to take out the
-stack backchain in optimised code as this also causes pipeline stalls, you
-have been warned.
-
-64 bit z/Architecture code disassembly
---------------------------------------
-
-If you understand the stuff above you'll understand the stuff
-below too so I'll avoid repeating myself & just say that
-some of the instructions have g's on the end of them to indicate
-they are 64 bit & the stack offsets are a bigger,
-the only other difference you'll find between 32 & 64 bit is that
-we now use f4 & f6 for floating point arguments on 64 bit.
-00000000800005b0 <test>:
-int test(int b)
-{
- return(5+b);
- 800005b0: a7 2a 00 05 ahi %r2,5
- 800005b4: b9 14 00 22 lgfr %r2,%r2 # downcast to integer
- 800005b8: 07 fe br %r14
- 800005ba: 07 07 bcr 0,%r7
-
-
-}
-
-00000000800005bc <main>:
-main(int argc,char *argv[])
-{
- 800005bc: eb bf f0 58 00 24 stmg %r11,%r15,88(%r15)
- 800005c2: b9 04 00 1f lgr %r1,%r15
- 800005c6: a7 fb ff 60 aghi %r15,-160
- 800005ca: e3 10 f0 00 00 24 stg %r1,0(%r15)
- return(test(5));
- 800005d0: a7 29 00 05 lghi %r2,5
- # brasl allows jumps > 64k & is overkill here bras would do fune
- 800005d4: c0 e5 ff ff ff ee brasl %r14,800005b0 <test>
- 800005da: e3 40 f1 10 00 04 lg %r4,272(%r15)
- 800005e0: eb bf f0 f8 00 04 lmg %r11,%r15,248(%r15)
- 800005e6: 07 f4 br %r4
-}
-
-
-
-Compiling programs for debugging on Linux for s/390 & z/Architecture
-====================================================================
--gdwarf-2 now works it should be considered the default debugging
-format for s/390 & z/Architecture as it is more reliable for debugging
-shared libraries, normal -g debugging works much better now
-Thanks to the IBM java compiler developers bug reports.
-
-This is typically done adding/appending the flags -g or -gdwarf-2 to the
-CFLAGS & LDFLAGS variables Makefile of the program concerned.
-
-If using gdb & you would like accurate displays of registers &
- stack traces compile without optimisation i.e make sure
-that there is no -O2 or similar on the CFLAGS line of the Makefile &
-the emitted gcc commands, obviously this will produce worse code
-( not advisable for shipment ) but it is an aid to the debugging process.
-
-This aids debugging because the compiler will copy parameters passed in
-in registers onto the stack so backtracing & looking at passed in
-parameters will work, however some larger programs which use inline functions
-will not compile without optimisation.
-
-Debugging with optimisation has since much improved after fixing
-some bugs, please make sure you are using gdb-5.0 or later developed
-after Nov'2000.
-
-
-
-Debugging under VM
-==================
-
-Notes
------
-Addresses & values in the VM debugger are always hex never decimal
-Address ranges are of the format <HexValue1>-<HexValue2> or
-<HexValue1>.<HexValue2>
-For example, the address range 0x2000 to 0x3000 can be described as 2000-3000
-or 2000.1000
-
-The VM Debugger is case insensitive.
-
-VM's strengths are usually other debuggers weaknesses you can get at any
-resource no matter how sensitive e.g. memory management resources, change
-address translation in the PSW. For kernel hacking you will reap dividends if
-you get good at it.
-
-The VM Debugger displays operators but not operands, and also the debugger
-displays useful information on the same line as the author of the code probably
-felt that it was a good idea not to go over the 80 columns on the screen.
-This isn't as unintuitive as it may seem as the s/390 instructions are easy to
-decode mentally and you can make a good guess at a lot of them as all the
-operands are nibble (half byte aligned).
-So if you have an objdump listing by hand, it is quite easy to follow, and if
-you don't have an objdump listing keep a copy of the s/390 Reference Summary
-or alternatively the s/390 principles of operation next to you.
-e.g. even I can guess that
-0001AFF8' LR 180F CC 0
-is a ( load register ) lr r0,r15
-
-Also it is very easy to tell the length of a 390 instruction from the 2 most
-significant bits in the instruction (not that this info is really useful except
-if you are trying to make sense of a hexdump of code).
-Here is a table
-Bits Instruction Length
-------------------------------------------
-00 2 Bytes
-01 4 Bytes
-10 4 Bytes
-11 6 Bytes
-
-The debugger also displays other useful info on the same line such as the
-addresses being operated on destination addresses of branches & condition codes.
-e.g.
-00019736' AHI A7DAFF0E CC 1
-000198BA' BRC A7840004 -> 000198C2' CC 0
-000198CE' STM 900EF068 >> 0FA95E78 CC 2
-
-
-
-Useful VM debugger commands
----------------------------
-
-I suppose I'd better mention this before I start
-to list the current active traces do
-Q TR
-there can be a maximum of 255 of these per set
-( more about trace sets later ).
-To stop traces issue a
-TR END.
-To delete a particular breakpoint issue
-TR DEL <breakpoint number>
-
-The PA1 key drops to CP mode so you can issue debugger commands,
-Doing alt c (on my 3270 console at least ) clears the screen.
-hitting b <enter> comes back to the running operating system
-from cp mode ( in our case linux ).
-It is typically useful to add shortcuts to your profile.exec file
-if you have one ( this is roughly equivalent to autoexec.bat in DOS ).
-file here are a few from mine.
-/* this gives me command history on issuing f12 */
-set pf12 retrieve
-/* this continues */
-set pf8 imm b
-/* goes to trace set a */
-set pf1 imm tr goto a
-/* goes to trace set b */
-set pf2 imm tr goto b
-/* goes to trace set c */
-set pf3 imm tr goto c
-
-
-
-Instruction Tracing
--------------------
-Setting a simple breakpoint
-TR I PSWA <address>
-To debug a particular function try
-TR I R <function address range>
-TR I on its own will single step.
-TR I DATA <MNEMONIC> <OPTIONAL RANGE> will trace for particular mnemonics
-e.g.
-TR I DATA 4D R 0197BC.4000
-will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000
-if you were inclined you could add traces for all branch instructions &
-suffix them with the run prefix so you would have a backtrace on screen
-when a program crashes.
-TR BR <INTO OR FROM> will trace branches into or out of an address.
-e.g.
-TR BR INTO 0 is often quite useful if a program is getting awkward & deciding
-to branch to 0 & crashing as this will stop at the address before in jumps to 0.
-TR I R <address range> RUN cmd d g
-single steps a range of addresses but stays running &
-displays the gprs on each step.
-
-
-
-Displaying & modifying Registers
---------------------------------
-D G will display all the gprs
-Adding a extra G to all the commands is necessary to access the full 64 bit
-content in VM on z/Architecture. Obviously this isn't required for access
-registers as these are still 32 bit.
-e.g. DGG instead of DG
-D X will display all the control registers
-D AR will display all the access registers
-D AR4-7 will display access registers 4 to 7
-CPU ALL D G will display the GRPS of all CPUS in the configuration
-D PSW will display the current PSW
-st PSW 2000 will put the value 2000 into the PSW &
-cause crash your machine.
-D PREFIX displays the prefix offset
-
-
-Displaying Memory
------------------
-To display memory mapped using the current PSW's mapping try
-D <range>
-To make VM display a message each time it hits a particular address and
-continue try
-D I<range> will disassemble/display a range of instructions.
-ST addr 32 bit word will store a 32 bit aligned address
-D T<range> will display the EBCDIC in an address (if you are that way inclined)
-D R<range> will display real addresses ( without DAT ) but with prefixing.
-There are other complex options to display if you need to get at say home space
-but are in primary space the easiest thing to do is to temporarily
-modify the PSW to the other addressing mode, display the stuff & then
-restore it.
-
-
-
-Hints
------
-If you want to issue a debugger command without halting your virtual machine
-with the PA1 key try prefixing the command with #CP e.g.
-#cp tr i pswa 2000
-also suffixing most debugger commands with RUN will cause them not
-to stop just display the mnemonic at the current instruction on the console.
-If you have several breakpoints you want to put into your program &
-you get fed up of cross referencing with System.map
-you can do the following trick for several symbols.
-grep do_signal System.map
-which emits the following among other things
-0001f4e0 T do_signal
-now you can do
-
-TR I PSWA 0001f4e0 cmd msg * do_signal
-This sends a message to your own console each time do_signal is entered.
-( As an aside I wrote a perl script once which automatically generated a REXX
-script with breakpoints on every kernel procedure, this isn't a good idea
-because there are thousands of these routines & VM can only set 255 breakpoints
-at a time so you nearly had to spend as long pruning the file down as you would
-entering the msgs by hand), however, the trick might be useful for a single
-object file. In the 3270 terminal emulator x3270 there is a very useful option
-in the file menu called "Save Screen In File" - this is very good for keeping a
-copy of traces.
-
-From CMS help <command name> will give you online help on a particular command.
-e.g.
-HELP DISPLAY
-
-Also CP has a file called profile.exec which automatically gets called
-on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session
-CP has a feature similar to doskey, it may be useful for you to
-use profile.exec to define some keystrokes.
-e.g.
-SET PF9 IMM B
-This does a single step in VM on pressing F8.
-SET PF10 ^
-This sets up the ^ key.
-which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed directly
-into some 3270 consoles.
-SET PF11 ^-
-This types the starting keystrokes for a sysrq see SysRq below.
-SET PF12 RETRIEVE
-This retrieves command history on pressing F12.
-
-
-Sometimes in VM the display is set up to scroll automatically this
-can be very annoying if there are messages you wish to look at
-to stop this do
-TERM MORE 255 255
-This will nearly stop automatic screen updates, however it will
-cause a denial of service if lots of messages go to the 3270 console,
-so it would be foolish to use this as the default on a production machine.
-
-
-Tracing particular processes
-----------------------------
-The kernel's text segment is intentionally at an address in memory that it will
-very seldom collide with text segments of user programs ( thanks Martin ),
-this simplifies debugging the kernel.
-However it is quite common for user processes to have addresses which collide
-this can make debugging a particular process under VM painful under normal
-circumstances as the process may change when doing a
-TR I R <address range>.
-Thankfully after reading VM's online help I figured out how to debug
-I particular process.
-
-Your first problem is to find the STD ( segment table designation )
-of the program you wish to debug.
-There are several ways you can do this here are a few
-1) objdump --syms <program to be debugged> | grep main
-To get the address of main in the program.
-tr i pswa <address of main>
-Start the program, if VM drops to CP on what looks like the entry
-point of the main function this is most likely the process you wish to debug.
-Now do a D X13 or D XG13 on z/Architecture.
-On 31 bit the STD is bits 1-19 ( the STO segment table origin )
-& 25-31 ( the STL segment table length ) of CR13.
-now type
-TR I R STD <CR13's value> 0.7fffffff
-e.g.
-TR I R STD 8F32E1FF 0.7fffffff
-Another very useful variation is
-TR STORE INTO STD <CR13's value> <address range>
-for finding out when a particular variable changes.
-
-An alternative way of finding the STD of a currently running process
-is to do the following, ( this method is more complex but
-could be quite convenient if you aren't updating the kernel much &
-so your kernel structures will stay constant for a reasonable period of
-time ).
-
-grep task /proc/<pid>/status
-from this you should see something like
-task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68
-This now gives you a pointer to the task structure.
-Now make CC:="s390-gcc -g" kernel/sched.s
-To get the task_struct stabinfo.
-( task_struct is defined in include/linux/sched.h ).
-Now we want to look at
-task->active_mm->pgd
-on my machine the active_mm in the task structure stab is
-active_mm:(4,12),672,32
-its offset is 672/8=84=0x54
-the pgd member in the mm_struct stab is
-pgd:(4,6)=*(29,5),96,32
-so its offset is 96/8=12=0xc
-
-so we'll
-hexdump -s 0xf160054 /dev/mem | more
-i.e. task_struct+active_mm offset
-to look at the active_mm member
-f160054 0fee cc60 0019 e334 0000 0000 0000 0011
-hexdump -s 0x0feecc6c /dev/mem | more
-i.e. active_mm+pgd offset
-feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010
-we get something like
-now do
-TR I R STD <pgd|0x7f> 0.7fffffff
-i.e. the 0x7f is added because the pgd only
-gives the page table origin & we need to set the low bits
-to the maximum possible segment table length.
-TR I R STD 0f2c007f 0.7fffffff
-on z/Architecture you'll probably need to do
-TR I R STD <pgd|0x7> 0.ffffffffffffffff
-to set the TableType to 0x1 & the Table length to 3.
-
-
-
-Tracing Program Exceptions
---------------------------
-If you get a crash which says something like
-illegal operation or specification exception followed by a register dump
-You can restart linux & trace these using the tr prog <range or value> trace
-option.
-
-
-The most common ones you will normally be tracing for is
-1=operation exception
-2=privileged operation exception
-4=protection exception
-5=addressing exception
-6=specification exception
-10=segment translation exception
-11=page translation exception
-
-The full list of these is on page 22 of the current s/390 Reference Summary.
-e.g.
-tr prog 10 will trace segment translation exceptions.
-tr prog on its own will trace all program interruption codes.
-
-Trace Sets
-----------
-On starting VM you are initially in the INITIAL trace set.
-You can do a Q TR to verify this.
-If you have a complex tracing situation where you wish to wait for instance
-till a driver is open before you start tracing IO, but know in your
-heart that you are going to have to make several runs through the code till you
-have a clue whats going on.
-
-What you can do is
-TR I PSWA <Driver open address>
-hit b to continue till breakpoint
-reach the breakpoint
-now do your
-TR GOTO B
-TR IO 7c08-7c09 inst int run
-or whatever the IO channels you wish to trace are & hit b
-
-To got back to the initial trace set do
-TR GOTO INITIAL
-& the TR I PSWA <Driver open address> will be the only active breakpoint again.
-
-
-Tracing linux syscalls under VM
--------------------------------
-Syscalls are implemented on Linux for S390 by the Supervisor call instruction
-(SVC). There 256 possibilities of these as the instruction is made up of a 0xA
-opcode and the second byte being the syscall number. They are traced using the
-simple command:
-TR SVC <Optional value or range>
-the syscalls are defined in linux/arch/s390/include/asm/unistd.h
-e.g. to trace all file opens just do
-TR SVC 5 ( as this is the syscall number of open )
-
-
-SMP Specific commands
----------------------
-To find out how many cpus you have
-Q CPUS displays all the CPU's available to your virtual machine
-To find the cpu that the current cpu VM debugger commands are being directed at
-do Q CPU to change the current cpu VM debugger commands are being directed at do
-CPU <desired cpu no>
-
-On a SMP guest issue a command to all CPUs try prefixing the command with cpu
-all. To issue a command to a particular cpu try cpu <cpu number> e.g.
-CPU 01 TR I R 2000.3000
-If you are running on a guest with several cpus & you have a IO related problem
-& cannot follow the flow of code but you know it isn't smp related.
-from the bash prompt issue
-shutdown -h now or halt.
-do a Q CPUS to find out how many cpus you have
-detach each one of them from cp except cpu 0
-by issuing a
-DETACH CPU 01-(number of cpus in configuration)
-& boot linux again.
-TR SIGP will trace inter processor signal processor instructions.
-DEFINE CPU 01-(number in configuration)
-will get your guests cpus back.
-
-
-Help for displaying ascii textstrings
--------------------------------------
-On the very latest VM Nucleus'es VM can now display ascii
-( thanks Neale for the hint ) by doing
-D TX<lowaddr>.<len>
-e.g.
-D TX0.100
-
-Alternatively
-=============
-Under older VM debuggers (I love EBDIC too) you can use following little
-program which converts a command line of hex digits to ascii text. It can be
-compiled under linux and you can copy the hex digits from your x3270 terminal
-to your xterm if you are debugging from a linuxbox.
-
-This is quite useful when looking at a parameter passed in as a text string
-under VM ( unless you are good at decoding ASCII in your head ).
-
-e.g. consider tracing an open syscall
-TR SVC 5
-We have stopped at a breakpoint
-000151B0' SVC 0A05 -> 0001909A' CC 0
-
-D 20.8 to check the SVC old psw in the prefix area and see was it from userspace
-(for the layout of the prefix area consult the "Fixed Storage Locations"
-chapter of the s/390 Reference Summary if you have it available).
-V00000020 070C2000 800151B2
-The problem state bit wasn't set & it's also too early in the boot sequence
-for it to be a userspace SVC if it was we would have to temporarily switch the
-psw to user space addressing so we could get at the first parameter of the open
-in gpr2.
-Next do a
-D G2
-GPR 2 = 00014CB4
-Now display what gpr2 is pointing to
-D 00014CB4.20
-V00014CB4 2F646576 2F636F6E 736F6C65 00001BF5
-V00014CC4 FC00014C B4001001 E0001000 B8070707
-Now copy the text till the first 00 hex ( which is the end of the string
-to an xterm & do hex2ascii on it.
-hex2ascii 2F646576 2F636F6E 736F6C65 00
-outputs
-Decoded Hex:=/ d e v / c o n s o l e 0x00
-We were opening the console device,
-
-You can compile the code below yourself for practice :-),
-/*
- * hex2ascii.c
- * a useful little tool for converting a hexadecimal command line to ascii
- *
- * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
- * (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation.
- */
-#include <stdio.h>
-
-int main(int argc,char *argv[])
-{
- int cnt1,cnt2,len,toggle=0;
- int startcnt=1;
- unsigned char c,hex;
-
- if(argc>1&&(strcmp(argv[1],"-a")==0))
- startcnt=2;
- printf("Decoded Hex:=");
- for(cnt1=startcnt;cnt1<argc;cnt1++)
- {
- len=strlen(argv[cnt1]);
- for(cnt2=0;cnt2<len;cnt2++)
- {
- c=argv[cnt1][cnt2];
- if(c>='0'&&c<='9')
- c=c-'0';
- if(c>='A'&&c<='F')
- c=c-'A'+10;
- if(c>='a'&&c<='f')
- c=c-'a'+10;
- switch(toggle)
- {
- case 0:
- hex=c<<4;
- toggle=1;
- break;
- case 1:
- hex+=c;
- if(hex<32||hex>127)
- {
- if(startcnt==1)
- printf("0x%02X ",(int)hex);
- else
- printf(".");
- }
- else
- {
- printf("%c",hex);
- if(startcnt==1)
- printf(" ");
- }
- toggle=0;
- break;
- }
- }
- }
- printf("\n");
-}
-
-
-
-
-Stack tracing under VM
-----------------------
-A basic backtrace
------------------
-
-Here are the tricks I use 9 out of 10 times it works pretty well,
-
-When your backchain reaches a dead end
---------------------------------------
-This can happen when an exception happens in the kernel and the kernel is
-entered twice. If you reach the NULL pointer at the end of the back chain you
-should be able to sniff further back if you follow the following tricks.
-1) A kernel address should be easy to recognise since it is in
-primary space & the problem state bit isn't set & also
-The Hi bit of the address is set.
-2) Another backchain should also be easy to recognise since it is an
-address pointing to another address approximately 100 bytes or 0x70 hex
-behind the current stackpointer.
-
-
-Here is some practice.
-boot the kernel & hit PA1 at some random time
-d g to display the gprs, this should display something like
-GPR 0 = 00000001 00156018 0014359C 00000000
-GPR 4 = 00000001 001B8888 000003E0 00000000
-GPR 8 = 00100080 00100084 00000000 000FE000
-GPR 12 = 00010400 8001B2DC 8001B36A 000FFED8
-Note that GPR14 is a return address but as we are real men we are going to
-trace the stack.
-display 0x40 bytes after the stack pointer.
-
-V000FFED8 000FFF38 8001B838 80014C8E 000FFF38
-V000FFEE8 00000000 00000000 000003E0 00000000
-V000FFEF8 00100080 00100084 00000000 000FE000
-V000FFF08 00010400 8001B2DC 8001B36A 000FFED8
-
-
-Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if
-you look above at our stackframe & also agrees with GPR14.
-
-now backchain
-d 000FFF38.40
-we now are taking the contents of SP to get our first backchain.
-
-V000FFF38 000FFFA0 00000000 00014995 00147094
-V000FFF48 00147090 001470A0 000003E0 00000000
-V000FFF58 00100080 00100084 00000000 001BF1D0
-V000FFF68 00010400 800149BA 80014CA6 000FFF38
-
-This displays a 2nd return address of 80014CA6
-
-now do d 000FFFA0.40 for our 3rd backchain
-
-V000FFFA0 04B52002 0001107F 00000000 00000000
-V000FFFB0 00000000 00000000 FF000000 0001107F
-V000FFFC0 00000000 00000000 00000000 00000000
-V000FFFD0 00010400 80010802 8001085A 000FFFA0
-
-
-our 3rd return address is 8001085A
-
-as the 04B52002 looks suspiciously like rubbish it is fair to assume that the
-kernel entry routines for the sake of optimisation don't set up a backchain.
-
-now look at System.map to see if the addresses make any sense.
-
-grep -i 0001b3 System.map
-outputs among other things
-0001b304 T cpu_idle
-so 8001B36A
-is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it )
-
-
-grep -i 00014 System.map
-produces among other things
-00014a78 T start_kernel
-so 0014CA6 is start_kernel+some hex number I can't add in my head.
-
-grep -i 00108 System.map
-this produces
-00010800 T _stext
-so 8001085A is _stext+0x5a
-
-Congrats you've done your first backchain.
-
-
-
-s/390 & z/Architecture IO Overview
-==================================
-
-I am not going to give a course in 390 IO architecture as this would take me
-quite a while and I'm no expert. Instead I'll give a 390 IO architecture
-summary for Dummies. If you have the s/390 principles of operation available
-read this instead. If nothing else you may find a few useful keywords in here
-and be able to use them on a web search engine to find more useful information.
-
-Unlike other bus architectures modern 390 systems do their IO using mostly
-fibre optics and devices such as tapes and disks can be shared between several
-mainframes. Also S390 can support up to 65536 devices while a high end PC based
-system might be choking with around 64.
-
-Here is some of the common IO terminology:
-
-Subchannel:
-This is the logical number most IO commands use to talk to an IO device. There
-can be up to 0x10000 (65536) of these in a configuration, typically there are a
-few hundred. Under VM for simplicity they are allocated contiguously, however
-on the native hardware they are not. They typically stay consistent between
-boots provided no new hardware is inserted or removed.
-Under Linux for s390 we use these as IRQ's and also when issuing an IO command
-(CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL,
-START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID
-of the device we wish to talk to. The most important of these instructions are
-START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO
-completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have
-up to 8 channel paths to a device, this offers redundancy if one is not
-available.
-
-Device Number:
-This number remains static and is closely tied to the hardware. There are 65536
-of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and
-another lsb 8 bits. These remain static even if more devices are inserted or
-removed from the hardware. There is a 1 to 1 mapping between subchannels and
-device numbers, provided devices aren't inserted or removed.
-
-Channel Control Words:
-CCWs are linked lists of instructions initially pointed to by an operation
-request block (ORB), which is initially given to Start Subchannel (SSCH)
-command along with the subchannel number for the IO subsystem to process
-while the CPU continues executing normal code.
-CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and
-Format 1 (31 bit). These are typically used to issue read and write (and many
-other) instructions. They consist of a length field and an absolute address
-field.
-Each IO typically gets 1 or 2 interrupts, one for channel end (primary status)
-when the channel is idle, and the second for device end (secondary status).
-Sometimes you get both concurrently. You check how the IO went on by issuing a
-TEST SUBCHANNEL at each interrupt, from which you receive an Interruption
-response block (IRB). If you get channel and device end status in the IRB
-without channel checks etc. your IO probably went okay. If you didn't you
-probably need to examine the IRB, extended status word etc.
-If an error occurs, more sophisticated control units have a facility known as
-concurrent sense. This means that if an error occurs Extended sense information
-will be presented in the Extended status word in the IRB. If not you have to
-issue a subsequent SENSE CCW command after the test subchannel.
-
-
-TPI (Test pending interrupt) can also be used for polled IO, but in
-multitasking multiprocessor systems it isn't recommended except for
-checking special cases (i.e. non looping checks for pending IO etc.).
-
-Store Subchannel and Modify Subchannel can be used to examine and modify
-operating characteristics of a subchannel (e.g. channel paths).
-
-Other IO related Terms:
-Sysplex: S390's Clustering Technology
-QDIO: S390's new high speed IO architecture to support devices such as gigabit
-ethernet, this architecture is also designed to be forward compatible with
-upcoming 64 bit machines.
-
-
-General Concepts
-
-Input Output Processors (IOP's) are responsible for communicating between
-the mainframe CPU's & the channel & relieve the mainframe CPU's from the
-burden of communicating with IO devices directly, this allows the CPU's to
-concentrate on data processing.
-
-IOP's can use one or more links ( known as channel paths ) to talk to each
-IO device. It first checks for path availability & chooses an available one,
-then starts ( & sometimes terminates IO ).
-There are two types of channel path: ESCON & the Parallel IO interface.
-
-IO devices are attached to control units, control units provide the
-logic to interface the channel paths & channel path IO protocols to
-the IO devices, they can be integrated with the devices or housed separately
-& often talk to several similar devices ( typical examples would be raid
-controllers or a control unit which connects to 1000 3270 terminals ).
-
-
- +---------------------------------------------------------------+
- | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ |
- | | CPU | | CPU | | CPU | | CPU | | Main | | Expanded | |
- | | | | | | | | | | Memory | | Storage | |
- | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ |
- |---------------------------------------------------------------+
- | IOP | IOP | IOP |
- |---------------------------------------------------------------
- | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C |
- ----------------------------------------------------------------
- || ||
- || Bus & Tag Channel Path || ESCON
- || ====================== || Channel
- || || || || Path
- +----------+ +----------+ +----------+
- | | | | | |
- | CU | | CU | | CU |
- | | | | | |
- +----------+ +----------+ +----------+
- | | | | |
-+----------+ +----------+ +----------+ +----------+ +----------+
-|I/O Device| |I/O Device| |I/O Device| |I/O Device| |I/O Device|
-+----------+ +----------+ +----------+ +----------+ +----------+
- CPU = Central Processing Unit
- C = Channel
- IOP = IP Processor
- CU = Control Unit
-
-The 390 IO systems come in 2 flavours the current 390 machines support both
-
-The Older 360 & 370 Interface,sometimes called the Parallel I/O interface,
-sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers
-Interface (OEMI).
-
-This byte wide Parallel channel path/bus has parity & data on the "Bus" cable
-and control lines on the "Tag" cable. These can operate in byte multiplex mode
-for sharing between several slow devices or burst mode and monopolize the
-channel for the whole burst. Up to 256 devices can be addressed on one of these
-cables. These cables are about one inch in diameter. The maximum unextended
-length supported by these cables is 125 Meters but this can be extended up to
-2km with a fibre optic channel extended such as a 3044. The maximum burst speed
-supported is 4.5 megabytes per second. However, some really old processors
-support only transfer rates of 3.0, 2.0 & 1.0 MB/sec.
-One of these paths can be daisy chained to up to 8 control units.
-
-
-ESCON if fibre optic it is also called FICON
-Was introduced by IBM in 1990. Has 2 fibre optic cables and uses either leds or
-lasers for communication at a signaling rate of up to 200 megabits/sec. As
-10bits are transferred for every 8 bits info this drops to 160 megabits/sec
-and to 18.6 Megabytes/sec once control info and CRC are added. ESCON only
-operates in burst mode.
-
-ESCONs typical max cable length is 3km for the led version and 20km for the
-laser version known as XDF (extended distance facility). This can be further
-extended by using an ESCON director which triples the above mentioned ranges.
-Unlike Bus & Tag as ESCON is serial it uses a packet switching architecture,
-the standard Bus & Tag control protocol is however present within the packets.
-Up to 256 devices can be attached to each control unit that uses one of these
-interfaces.
-
-Common 390 Devices include:
-Network adapters typically OSA2,3172's,2116's & OSA-E gigabit ethernet adapters,
-Consoles 3270 & 3215 (a teletype emulated under linux for a line mode console).
-DASD's direct access storage devices ( otherwise known as hard disks ).
-Tape Drives.
-CTC ( Channel to Channel Adapters ),
-ESCON or Parallel Cables used as a very high speed serial link
-between 2 machines.
-
-
-Debugging IO on s/390 & z/Architecture under VM
-===============================================
-
-Now we are ready to go on with IO tracing commands under VM
-
-A few self explanatory queries:
-Q OSA
-Q CTC
-Q DISK ( This command is CMS specific )
-Q DASD
-
-
-
-
-
-
-Q OSA on my machine returns
-OSA 7C08 ON OSA 7C08 SUBCHANNEL = 0000
-OSA 7C09 ON OSA 7C09 SUBCHANNEL = 0001
-OSA 7C14 ON OSA 7C14 SUBCHANNEL = 0002
-OSA 7C15 ON OSA 7C15 SUBCHANNEL = 0003
-
-If you have a guest with certain privileges you may be able to see devices
-which don't belong to you. To avoid this, add the option V.
-e.g.
-Q V OSA
-
-Now using the device numbers returned by this command we will
-Trace the io starting up on the first device 7c08 & 7c09
-In our simplest case we can trace the
-start subchannels
-like TR SSCH 7C08-7C09
-or the halt subchannels
-or TR HSCH 7C08-7C09
-MSCH's ,STSCH's I think you can guess the rest
-
-A good trick is tracing all the IO's and CCWS and spooling them into the reader
-of another VM guest so he can ftp the logfile back to his own machine. I'll do
-a small bit of this and give you a look at the output.
-
-1) Spool stdout to VM reader
-SP PRT TO (another vm guest ) or * for the local vm guest
-2) Fill the reader with the trace
-TR IO 7c08-7c09 INST INT CCW PRT RUN
-3) Start up linux
-i 00c
-4) Finish the trace
-TR END
-5) close the reader
-C PRT
-6) list reader contents
-RDRLIST
-7) copy it to linux4's minidisk
-RECEIVE / LOG TXT A1 ( replace
-8)
-filel & press F11 to look at it
-You should see something like:
-
-00020942' SSCH B2334000 0048813C CC 0 SCH 0000 DEV 7C08
- CPA 000FFDF0 PARM 00E2C9C4 KEY 0 FPI C0 LPM 80
- CCW 000FFDF0 E4200100 00487FE8 0000 E4240100 ........
- IDAL 43D8AFE8
- IDAL 0FB76000
-00020B0A' I/O DEV 7C08 -> 000197BC' SCH 0000 PARM 00E2C9C4
-00021628' TSCH B2354000 >> 00488164 CC 0 SCH 0000 DEV 7C08
- CCWA 000FFDF8 DEV STS 0C SCH STS 00 CNT 00EC
- KEY 0 FPI C0 CC 0 CTLS 4007
-00022238' STSCH B2344000 >> 00488108 CC 0 SCH 0000 DEV 7C08
-
-If you don't like messing up your readed ( because you possibly booted from it )
-you can alternatively spool it to another readers guest.
-
-
-Other common VM device related commands
----------------------------------------------
-These commands are listed only because they have
-been of use to me in the past & may be of use to
-you too. For more complete info on each of the commands
-use type HELP <command> from CMS.
-detaching devices
-DET <devno range>
-ATT <devno range> <guest>
-attach a device to guest * for your own guest
-READY <devno> cause VM to issue a fake interrupt.
-
-The VARY command is normally only available to VM administrators.
-VARY ON PATH <path> TO <devno range>
-VARY OFF PATH <PATH> FROM <devno range>
-This is used to switch on or off channel paths to devices.
-
-Q CHPID <channel path ID>
-This displays state of devices using this channel path
-D SCHIB <subchannel>
-This displays the subchannel information SCHIB block for the device.
-this I believe is also only available to administrators.
-DEFINE CTC <devno>
-defines a virtual CTC channel to channel connection
-2 need to be defined on each guest for the CTC driver to use.
-COUPLE devno userid remote devno
-Joins a local virtual device to a remote virtual device
-( commonly used for the CTC driver ).
-
-Building a VM ramdisk under CMS which linux can use
-def vfb-<blocksize> <subchannel> <number blocks>
-blocksize is commonly 4096 for linux.
-Formatting it
-format <subchannel> <driver letter e.g. x> (blksize <blocksize>
-
-Sharing a disk between multiple guests
-LINK userid devno1 devno2 mode password
-
-
-
-GDB on S390
-===========
-N.B. if compiling for debugging gdb works better without optimisation
-( see Compiling programs for debugging )
-
-invocation
-----------
-gdb <victim program> <optional corefile>
-
-Online help
------------
-help: gives help on commands
-e.g.
-help
-help display
-Note gdb's online help is very good use it.
-
-
-Assembly
---------
-info registers: displays registers other than floating point.
-info all-registers: displays floating points as well.
-disassemble: disassembles
-e.g.
-disassemble without parameters will disassemble the current function
-disassemble $pc $pc+10
-
-Viewing & modifying variables
------------------------------
-print or p: displays variable or register
-e.g. p/x $sp will display the stack pointer
-
-display: prints variable or register each time program stops
-e.g.
-display/x $pc will display the program counter
-display argc
-
-undisplay : undo's display's
-
-info breakpoints: shows all current breakpoints
-
-info stack: shows stack back trace (if this doesn't work too well, I'll show
-you the stacktrace by hand below).
-
-info locals: displays local variables.
-
-info args: display current procedure arguments.
-
-set args: will set argc & argv each time the victim program is invoked.
-
-set <variable>=value
-set argc=100
-set $pc=0
-
-
-
-Modifying execution
--------------------
-step: steps n lines of sourcecode
-step steps 1 line.
-step 100 steps 100 lines of code.
-
-next: like step except this will not step into subroutines
-
-stepi: steps a single machine code instruction.
-e.g. stepi 100
-
-nexti: steps a single machine code instruction but will not step into
-subroutines.
-
-finish: will run until exit of the current routine
-
-run: (re)starts a program
-
-cont: continues a program
-
-quit: exits gdb.
-
-
-breakpoints
-------------
-
-break
-sets a breakpoint
-e.g.
-
-break main
-
-break *$pc
-
-break *0x400618
-
-Here's a really useful one for large programs
-rbr
-Set a breakpoint for all functions matching REGEXP
-e.g.
-rbr 390
-will set a breakpoint with all functions with 390 in their name.
-
-info breakpoints
-lists all breakpoints
-
-delete: delete breakpoint by number or delete them all
-e.g.
-delete 1 will delete the first breakpoint
-delete will delete them all
-
-watch: This will set a watchpoint ( usually hardware assisted ),
-This will watch a variable till it changes
-e.g.
-watch cnt, will watch the variable cnt till it changes.
-As an aside unfortunately gdb's, architecture independent watchpoint code
-is inconsistent & not very good, watchpoints usually work but not always.
-
-info watchpoints: Display currently active watchpoints
-
-condition: ( another useful one )
-Specify breakpoint number N to break only if COND is true.
-Usage is `condition N COND', where N is an integer and COND is an
-expression to be evaluated whenever breakpoint N is reached.
-
-
-
-User defined functions/macros
------------------------------
-define: ( Note this is very very useful,simple & powerful )
-usage define <name> <list of commands> end
-
-examples which you should consider putting into .gdbinit in your home directory
-define d
-stepi
-disassemble $pc $pc+10
-end
-
-define e
-nexti
-disassemble $pc $pc+10
-end
-
-
-Other hard to classify stuff
-----------------------------
-signal n:
-sends the victim program a signal.
-e.g. signal 3 will send a SIGQUIT.
-
-info signals:
-what gdb does when the victim receives certain signals.
-
-list:
-e.g.
-list lists current function source
-list 1,10 list first 10 lines of current file.
-list test.c:1,10
-
-
-directory:
-Adds directories to be searched for source if gdb cannot find the source.
-(note it is a bit sensitive about slashes)
-e.g. To add the root of the filesystem to the searchpath do
-directory //
-
-
-call <function>
-This calls a function in the victim program, this is pretty powerful
-e.g.
-(gdb) call printf("hello world")
-outputs:
-$1 = 11
-
-You might now be thinking that the line above didn't work, something extra had
-to be done.
-(gdb) call fflush(stdout)
-hello world$2 = 0
-As an aside the debugger also calls malloc & free under the hood
-to make space for the "hello world" string.
-
-
-
-hints
------
-1) command completion works just like bash
-( if you are a bad typist like me this really helps )
-e.g. hit br <TAB> & cursor up & down :-).
-
-2) if you have a debugging problem that takes a few steps to recreate
-put the steps into a file called .gdbinit in your current working directory
-if you have defined a few extra useful user defined commands put these in
-your home directory & they will be read each time gdb is launched.
-
-A typical .gdbinit file might be.
-break main
-run
-break runtime_exception
-cont
-
-
-stack chaining in gdb by hand
------------------------------
-This is done using a the same trick described for VM
-p/x (*($sp+56))&0x7fffffff get the first backchain.
-
-For z/Architecture
-Replace 56 with 112 & ignore the &0x7fffffff
-in the macros below & do nasty casts to longs like the following
-as gdb unfortunately deals with printed arguments as ints which
-messes up everything.
-i.e. here is a 3rd backchain dereference
-p/x *(long *)(***(long ***)$sp+112)
-
-
-this outputs
-$5 = 0x528f18
-on my machine.
-Now you can use
-info symbol (*($sp+56))&0x7fffffff
-you might see something like.
-rl_getc + 36 in section .text telling you what is located at address 0x528f18
-Now do.
-p/x (*(*$sp+56))&0x7fffffff
-This outputs
-$6 = 0x528ed0
-Now do.
-info symbol (*(*$sp+56))&0x7fffffff
-rl_read_key + 180 in section .text
-now do
-p/x (*(**$sp+56))&0x7fffffff
-& so on.
-
-Disassembling instructions without debug info
----------------------------------------------
-gdb typically complains if there is a lack of debugging
-symbols in the disassemble command with
-"No function contains specified address." To get around
-this do
-x/<number lines to disassemble>xi <address>
-e.g.
-x/20xi 0x400730
-
-
-
-Note: Remember gdb has history just like bash you don't need to retype the
-whole line just use the up & down arrows.
-
-
-
-For more info
--------------
-From your linuxbox do
-man gdb or info gdb.
-
-core dumps
-----------
-What a core dump ?,
-A core dump is a file generated by the kernel (if allowed) which contains the
-registers and all active pages of the program which has crashed.
-From this file gdb will allow you to look at the registers, stack trace and
-memory of the program as if it just crashed on your system. It is usually
-called core and created in the current working directory.
-This is very useful in that a customer can mail a core dump to a technical
-support department and the technical support department can reconstruct what
-happened. Provided they have an identical copy of this program with debugging
-symbols compiled in and the source base of this build is available.
-In short it is far more useful than something like a crash log could ever hope
-to be.
-
-Why have I never seen one ?.
-Probably because you haven't used the command
-ulimit -c unlimited in bash
-to allow core dumps, now do
-ulimit -a
-to verify that the limit was accepted.
-
-A sample core dump
-To create this I'm going to do
-ulimit -c unlimited
-gdb
-to launch gdb (my victim app. ) now be bad & do the following from another
-telnet/xterm session to the same machine
-ps -aux | grep gdb
-kill -SIGSEGV <gdb's pid>
-or alternatively use killall -SIGSEGV gdb if you have the killall command.
-Now look at the core dump.
-./gdb core
-Displays the following
-GNU gdb 4.18
-Copyright 1998 Free Software Foundation, Inc.
-GDB is free software, covered by the GNU General Public License, and you are
-welcome to change it and/or distribute copies of it under certain conditions.
-Type "show copying" to see the conditions.
-There is absolutely no warranty for GDB. Type "show warranty" for details.
-This GDB was configured as "s390-ibm-linux"...
-Core was generated by `./gdb'.
-Program terminated with signal 11, Segmentation fault.
-Reading symbols from /usr/lib/libncurses.so.4...done.
-Reading symbols from /lib/libm.so.6...done.
-Reading symbols from /lib/libc.so.6...done.
-Reading symbols from /lib/ld-linux.so.2...done.
-#0 0x40126d1a in read () from /lib/libc.so.6
-Setting up the environment for debugging gdb.
-Breakpoint 1 at 0x4dc6f8: file utils.c, line 471.
-Breakpoint 2 at 0x4d87a4: file top.c, line 2609.
-(top-gdb) info stack
-#0 0x40126d1a in read () from /lib/libc.so.6
-#1 0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402
-#2 0x528ed0 in rl_read_key () at input.c:381
-#3 0x5167e6 in readline_internal_char () at readline.c:454
-#4 0x5168ee in readline_internal_charloop () at readline.c:507
-#5 0x51692c in readline_internal () at readline.c:521
-#6 0x5164fe in readline (prompt=0x7ffff810)
- at readline.c:349
-#7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1,
- annotation_suffix=0x4d6b44 "prompt") at top.c:2091
-#8 0x4d6cf0 in command_loop () at top.c:1345
-#9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635
-
-
-LDD
-===
-This is a program which lists the shared libraries which a library needs,
-Note you also get the relocations of the shared library text segments which
-help when using objdump --source.
-e.g.
- ldd ./gdb
-outputs
-libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000)
-libm.so.6 => /lib/libm.so.6 (0x4005e000)
-libc.so.6 => /lib/libc.so.6 (0x40084000)
-/lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000)
-
-
-Debugging shared libraries
-==========================
-Most programs use shared libraries, however it can be very painful
-when you single step instruction into a function like printf for the
-first time & you end up in functions like _dl_runtime_resolve this is
-the ld.so doing lazy binding, lazy binding is a concept in ELF where
-shared library functions are not loaded into memory unless they are
-actually used, great for saving memory but a pain to debug.
-To get around this either relink the program -static or exit gdb type
-export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing
-the program in question.
-
-
-
-Debugging modules
-=================
-As modules are dynamically loaded into the kernel their address can be
-anywhere to get around this use the -m option with insmod to emit a load
-map which can be piped into a file if required.
-
-The proc file system
-====================
-What is it ?.
-It is a filesystem created by the kernel with files which are created on demand
-by the kernel if read, or can be used to modify kernel parameters,
-it is a powerful concept.
-
-e.g.
-
-cat /proc/sys/net/ipv4/ip_forward
-On my machine outputs
-0
-telling me ip_forwarding is not on to switch it on I can do
-echo 1 > /proc/sys/net/ipv4/ip_forward
-cat it again
-cat /proc/sys/net/ipv4/ip_forward
-On my machine now outputs
-1
-IP forwarding is on.
-There is a lot of useful info in here best found by going in and having a look
-around, so I'll take you through some entries I consider important.
-
-All the processes running on the machine have their own entry defined by
-/proc/<pid>
-So lets have a look at the init process
-cd /proc/1
-
-cat cmdline
-emits
-init [2]
-
-cd /proc/1/fd
-This contains numerical entries of all the open files,
-some of these you can cat e.g. stdout (2)
-
-cat /proc/29/maps
-on my machine emits
-
-00400000-00478000 r-xp 00000000 5f:00 4103 /bin/bash
-00478000-0047e000 rw-p 00077000 5f:00 4103 /bin/bash
-0047e000-00492000 rwxp 00000000 00:00 0
-40000000-40015000 r-xp 00000000 5f:00 14382 /lib/ld-2.1.2.so
-40015000-40016000 rw-p 00014000 5f:00 14382 /lib/ld-2.1.2.so
-40016000-40017000 rwxp 00000000 00:00 0
-40017000-40018000 rw-p 00000000 00:00 0
-40018000-4001b000 r-xp 00000000 5f:00 14435 /lib/libtermcap.so.2.0.8
-4001b000-4001c000 rw-p 00002000 5f:00 14435 /lib/libtermcap.so.2.0.8
-4001c000-4010d000 r-xp 00000000 5f:00 14387 /lib/libc-2.1.2.so
-4010d000-40111000 rw-p 000f0000 5f:00 14387 /lib/libc-2.1.2.so
-40111000-40114000 rw-p 00000000 00:00 0
-40114000-4011e000 r-xp 00000000 5f:00 14408 /lib/libnss_files-2.1.2.so
-4011e000-4011f000 rw-p 00009000 5f:00 14408 /lib/libnss_files-2.1.2.so
-7fffd000-80000000 rwxp ffffe000 00:00 0
-
-
-Showing us the shared libraries init uses where they are in memory
-& memory access permissions for each virtual memory area.
-
-/proc/1/cwd is a softlink to the current working directory.
-/proc/1/root is the root of the filesystem for this process.
-
-/proc/1/mem is the current running processes memory which you
-can read & write to like a file.
-strace uses this sometimes as it is a bit faster than the
-rather inefficient ptrace interface for peeking at DATA.
-
-
-cat status
-
-Name: init
-State: S (sleeping)
-Pid: 1
-PPid: 0
-Uid: 0 0 0 0
-Gid: 0 0 0 0
-Groups:
-VmSize: 408 kB
-VmLck: 0 kB
-VmRSS: 208 kB
-VmData: 24 kB
-VmStk: 8 kB
-VmExe: 368 kB
-VmLib: 0 kB
-SigPnd: 0000000000000000
-SigBlk: 0000000000000000
-SigIgn: 7fffffffd7f0d8fc
-SigCgt: 00000000280b2603
-CapInh: 00000000fffffeff
-CapPrm: 00000000ffffffff
-CapEff: 00000000fffffeff
-
-User PSW: 070de000 80414146
-task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68
-User GPRS:
-00000400 00000000 0000000b 7ffffa90
-00000000 00000000 00000000 0045d9f4
-0045cafc 7ffffa90 7fffff18 0045cb08
-00010400 804039e8 80403af8 7ffff8b0
-User ACRS:
-00000000 00000000 00000000 00000000
-00000001 00000000 00000000 00000000
-00000000 00000000 00000000 00000000
-00000000 00000000 00000000 00000000
-Kernel BackChain CallChain BackChain CallChain
- 004b7ca8 8002bd0c 004b7d18 8002b92c
- 004b7db8 8005cd50 004b7e38 8005d12a
- 004b7f08 80019114
-Showing among other things memory usage & status of some signals &
-the processes'es registers from the kernel task_structure
-as well as a backchain which may be useful if a process crashes
-in the kernel for some unknown reason.
-
-Some driver debugging techniques
-================================
-debug feature
--------------
-Some of our drivers now support a "debug feature" in
-/proc/s390dbf see s390dbf.txt in the linux/Documentation directory
-for more info.
-e.g.
-to switch on the lcs "debug feature"
-echo 5 > /proc/s390dbf/lcs/level
-& then after the error occurred.
-cat /proc/s390dbf/lcs/sprintf >/logfile
-the logfile now contains some information which may help
-tech support resolve a problem in the field.
-
-
-
-high level debugging network drivers
-------------------------------------
-ifconfig is a quite useful command
-it gives the current state of network drivers.
-
-If you suspect your network device driver is dead
-one way to check is type
-ifconfig <network device>
-e.g. tr0
-You should see something like
-tr0 Link encap:16/4 Mbps Token Ring (New) HWaddr 00:04:AC:20:8E:48
- inet addr:9.164.185.132 Bcast:9.164.191.255 Mask:255.255.224.0
- UP BROADCAST RUNNING MULTICAST MTU:2000 Metric:1
- RX packets:246134 errors:0 dropped:0 overruns:0 frame:0
- TX packets:5 errors:0 dropped:0 overruns:0 carrier:0
- collisions:0 txqueuelen:100
-
-if the device doesn't say up
-try
-/etc/rc.d/init.d/network start
-( this starts the network stack & hopefully calls ifconfig tr0 up ).
-ifconfig looks at the output of /proc/net/dev and presents it in a more
-presentable form.
-Now ping the device from a machine in the same subnet.
-if the RX packets count & TX packets counts don't increment you probably
-have problems.
-next
-cat /proc/net/arp
-Do you see any hardware addresses in the cache if not you may have problems.
-Next try
-ping -c 5 <broadcast_addr> i.e. the Bcast field above in the output of
-ifconfig. Do you see any replies from machines other than the local machine
-if not you may have problems. also if the TX packets count in ifconfig
-hasn't incremented either you have serious problems in your driver
-(e.g. the txbusy field of the network device being stuck on )
-or you may have multiple network devices connected.
-
-
-chandev
--------
-There is a new device layer for channel devices, some
-drivers e.g. lcs are registered with this layer.
-If the device uses the channel device layer you'll be
-able to find what interrupts it uses & the current state
-of the device.
-See the manpage chandev.8 &type cat /proc/chandev for more info.
-
-
-SysRq
-=====
-This is now supported by linux for s/390 & z/Architecture.
-To enable it do compile the kernel with
-Kernel Hacking -> Magic SysRq Key Enabled
-echo "1" > /proc/sys/kernel/sysrq
-also type
-echo "8" >/proc/sys/kernel/printk
-To make printk output go to console.
-On 390 all commands are prefixed with
-^-
-e.g.
-^-t will show tasks.
-^-? or some unknown command will display help.
-The sysrq key reading is very picky ( I have to type the keys in an
- xterm session & paste them into the x3270 console )
-& it may be wise to predefine the keys as described in the VM hints above
-
-This is particularly useful for syncing disks unmounting & rebooting
-if the machine gets partially hung.
-
-Read Documentation/admin-guide/sysrq.rst for more info
-
-References:
-===========
-Enterprise Systems Architecture Reference Summary
-Enterprise Systems Architecture Principles of Operation
-Hartmut Penners s390 stack frame sheet.
-IBM Mainframe Channel Attachment a technology brief from a CISCO webpage
-Various bits of man & info pages of Linux.
-Linux & GDB source.
-Various info & man pages.
-CMS Help on tracing commands.
-Linux for s/390 Elf Application Binary Interface
-Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended )
-z/Architecture Principles of Operation SA22-7832-00
-Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the
-Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05
-
-Special Thanks
-==============
-Special thanks to Neale Ferguson who maintains a much
-prettier HTML version of this page at
-http://linuxvm.org/penguinvm/
-Bob Grainger Stefan Bader & others for reporting bugs
diff --git a/Documentation/s390/cds.rst b/Documentation/s390/cds.rst
new file mode 100644
index 000000000000..7006d8209d2e
--- /dev/null
+++ b/Documentation/s390/cds.rst
@@ -0,0 +1,530 @@
+===========================
+Linux for S/390 and zSeries
+===========================
+
+Common Device Support (CDS)
+Device Driver I/O Support Routines
+
+Authors:
+ - Ingo Adlung
+ - Cornelia Huck
+
+Copyright, IBM Corp. 1999-2002
+
+Introduction
+============
+
+This document describes the common device support routines for Linux/390.
+Different than other hardware architectures, ESA/390 has defined a unified
+I/O access method. This gives relief to the device drivers as they don't
+have to deal with different bus types, polling versus interrupt
+processing, shared versus non-shared interrupt processing, DMA versus port
+I/O (PIO), and other hardware features more. However, this implies that
+either every single device driver needs to implement the hardware I/O
+attachment functionality itself, or the operating system provides for a
+unified method to access the hardware, providing all the functionality that
+every single device driver would have to provide itself.
+
+The document does not intend to explain the ESA/390 hardware architecture in
+every detail.This information can be obtained from the ESA/390 Principles of
+Operation manual (IBM Form. No. SA22-7201).
+
+In order to build common device support for ESA/390 I/O interfaces, a
+functional layer was introduced that provides generic I/O access methods to
+the hardware.
+
+The common device support layer comprises the I/O support routines defined
+below. Some of them implement common Linux device driver interfaces, while
+some of them are ESA/390 platform specific.
+
+Note:
+ In order to write a driver for S/390, you also need to look into the interface
+ described in Documentation/s390/driver-model.rst.
+
+Note for porting drivers from 2.4:
+
+The major changes are:
+
+* The functions use a ccw_device instead of an irq (subchannel).
+* All drivers must define a ccw_driver (see driver-model.txt) and the associated
+ functions.
+* request_irq() and free_irq() are no longer done by the driver.
+* The oper_handler is (kindof) replaced by the probe() and set_online() functions
+ of the ccw_driver.
+* The not_oper_handler is (kindof) replaced by the remove() and set_offline()
+ functions of the ccw_driver.
+* The channel device layer is gone.
+* The interrupt handlers must be adapted to use a ccw_device as argument.
+ Moreover, they don't return a devstat, but an irb.
+* Before initiating an io, the options must be set via ccw_device_set_options().
+* Instead of calling read_dev_chars()/read_conf_data(), the driver issues
+ the channel program and handles the interrupt itself.
+
+ccw_device_get_ciw()
+ get commands from extended sense data.
+
+ccw_device_start(), ccw_device_start_timeout(), ccw_device_start_key(), ccw_device_start_key_timeout()
+ initiate an I/O request.
+
+ccw_device_resume()
+ resume channel program execution.
+
+ccw_device_halt()
+ terminate the current I/O request processed on the device.
+
+do_IRQ()
+ generic interrupt routine. This function is called by the interrupt entry
+ routine whenever an I/O interrupt is presented to the system. The do_IRQ()
+ routine determines the interrupt status and calls the device specific
+ interrupt handler according to the rules (flags) defined during I/O request
+ initiation with do_IO().
+
+The next chapters describe the functions other than do_IRQ() in more details.
+The do_IRQ() interface is not described, as it is called from the Linux/390
+first level interrupt handler only and does not comprise a device driver
+callable interface. Instead, the functional description of do_IO() also
+describes the input to the device specific interrupt handler.
+
+Note:
+ All explanations apply also to the 64 bit architecture s390x.
+
+
+Common Device Support (CDS) for Linux/390 Device Drivers
+========================================================
+
+General Information
+-------------------
+
+The following chapters describe the I/O related interface routines the
+Linux/390 common device support (CDS) provides to allow for device specific
+driver implementations on the IBM ESA/390 hardware platform. Those interfaces
+intend to provide the functionality required by every device driver
+implementation to allow to drive a specific hardware device on the ESA/390
+platform. Some of the interface routines are specific to Linux/390 and some
+of them can be found on other Linux platforms implementations too.
+Miscellaneous function prototypes, data declarations, and macro definitions
+can be found in the architecture specific C header file
+linux/arch/s390/include/asm/irq.h.
+
+Overview of CDS interface concepts
+----------------------------------
+
+Different to other hardware platforms, the ESA/390 architecture doesn't define
+interrupt lines managed by a specific interrupt controller and bus systems
+that may or may not allow for shared interrupts, DMA processing, etc.. Instead,
+the ESA/390 architecture has implemented a so called channel subsystem, that
+provides a unified view of the devices physically attached to the systems.
+Though the ESA/390 hardware platform knows about a huge variety of different
+peripheral attachments like disk devices (aka. DASDs), tapes, communication
+controllers, etc. they can all be accessed by a well defined access method and
+they are presenting I/O completion a unified way : I/O interruptions. Every
+single device is uniquely identified to the system by a so called subchannel,
+where the ESA/390 architecture allows for 64k devices be attached.
+
+Linux, however, was first built on the Intel PC architecture, with its two
+cascaded 8259 programmable interrupt controllers (PICs), that allow for a
+maximum of 15 different interrupt lines. All devices attached to such a system
+share those 15 interrupt levels. Devices attached to the ISA bus system must
+not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered
+interrupts. MCA, EISA, PCI and other bus systems base on level triggered
+interrupts, and therewith allow for shared IRQs. However, if multiple devices
+present their hardware status by the same (shared) IRQ, the operating system
+has to call every single device driver registered on this IRQ in order to
+determine the device driver owning the device that raised the interrupt.
+
+Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel).
+For internal use of the common I/O layer, these are still there. However,
+device drivers should use the new calling interface via the ccw_device only.
+
+During its startup the Linux/390 system checks for peripheral devices. Each
+of those devices is uniquely defined by a so called subchannel by the ESA/390
+channel subsystem. While the subchannel numbers are system generated, each
+subchannel also takes a user defined attribute, the so called device number.
+Both subchannel number and device number cannot exceed 65535. During sysfs
+initialisation, the information about control unit type and device types that
+imply specific I/O commands (channel command words - CCWs) in order to operate
+the device are gathered. Device drivers can retrieve this set of hardware
+information during their initialization step to recognize the devices they
+support using the information saved in the struct ccw_device given to them.
+This methods implies that Linux/390 doesn't require to probe for free (not
+armed) interrupt request lines (IRQs) to drive its devices with. Where
+applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS
+ccw to retrieve device characteristics in its online routine.
+
+In order to allow for easy I/O initiation the CDS layer provides a
+ccw_device_start() interface that takes a device specific channel program (one
+or more CCWs) as input sets up the required architecture specific control blocks
+and initiates an I/O request on behalf of the device driver. The
+ccw_device_start() routine allows to specify whether it expects the CDS layer
+to notify the device driver for every interrupt it observes, or with final status
+only. See ccw_device_start() for more details. A device driver must never issue
+ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead.
+
+For long running I/O request to be canceled, the CDS layer provides the
+ccw_device_halt() function. Some devices require to initially issue a HALT
+SUBCHANNEL (HSCH) command without having pending I/O requests. This function is
+also covered by ccw_device_halt().
+
+
+get_ciw() - get command information word
+
+This call enables a device driver to get information about supported commands
+from the extended SenseID data.
+
+::
+
+ struct ciw *
+ ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd);
+
+==== ========================================================
+cdev The ccw_device for which the command is to be retrieved.
+cmd The command type to be retrieved.
+==== ========================================================
+
+ccw_device_get_ciw() returns:
+
+===== ================================================================
+ NULL No extended data available, invalid device or command not found.
+!NULL The command requested.
+===== ================================================================
+
+::
+
+ ccw_device_start() - Initiate I/O Request
+
+The ccw_device_start() routines is the I/O request front-end processor. All
+device driver I/O requests must be issued using this routine. A device driver
+must not issue ESA/390 I/O commands itself. Instead the ccw_device_start()
+routine provides all interfaces required to drive arbitrary devices.
+
+This description also covers the status information passed to the device
+driver's interrupt handler as this is related to the rules (flags) defined
+with the associated I/O request when calling ccw_device_start().
+
+::
+
+ int ccw_device_start(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ unsigned long flags);
+ int ccw_device_start_timeout(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ unsigned long flags,
+ int expires);
+ int ccw_device_start_key(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ __u8 key,
+ unsigned long flags);
+ int ccw_device_start_key_timeout(struct ccw_device *cdev,
+ struct ccw1 *cpa,
+ unsigned long intparm,
+ __u8 lpm,
+ __u8 key,
+ unsigned long flags,
+ int expires);
+
+============= =============================================================
+cdev ccw_device the I/O is destined for
+cpa logical start address of channel program
+user_intparm user specific interrupt information; will be presented
+ back to the device driver's interrupt handler. Allows a
+ device driver to associate the interrupt with a
+ particular I/O request.
+lpm defines the channel path to be used for a specific I/O
+ request. A value of 0 will make cio use the opm.
+key the storage key to use for the I/O (useful for operating on a
+ storage with a storage key != default key)
+flag defines the action to be performed for I/O processing
+expires timeout value in jiffies. The common I/O layer will terminate
+ the running program after this and call the interrupt handler
+ with ERR_PTR(-ETIMEDOUT) as irb.
+============= =============================================================
+
+Possible flag values are:
+
+========================= =============================================
+DOIO_ALLOW_SUSPEND channel program may become suspended
+DOIO_DENY_PREFETCH don't allow for CCW prefetch; usually
+ this implies the channel program might
+ become modified
+DOIO_SUPPRESS_INTER don't call the handler on intermediate status
+========================= =============================================
+
+The cpa parameter points to the first format 1 CCW of a channel program::
+
+ struct ccw1 {
+ __u8 cmd_code;/* command code */
+ __u8 flags; /* flags, like IDA addressing, etc. */
+ __u16 count; /* byte count */
+ __u32 cda; /* data address */
+ } __attribute__ ((packed,aligned(8)));
+
+with the following CCW flags values defined:
+
+=================== =========================
+CCW_FLAG_DC data chaining
+CCW_FLAG_CC command chaining
+CCW_FLAG_SLI suppress incorrect length
+CCW_FLAG_SKIP skip
+CCW_FLAG_PCI PCI
+CCW_FLAG_IDA indirect addressing
+CCW_FLAG_SUSPEND suspend
+=================== =========================
+
+
+Via ccw_device_set_options(), the device driver may specify the following
+options for the device:
+
+========================= ======================================
+DOIO_EARLY_NOTIFICATION allow for early interrupt notification
+DOIO_REPORT_ALL report all interrupt conditions
+========================= ======================================
+
+
+The ccw_device_start() function returns:
+
+======== ======================================================================
+ 0 successful completion or request successfully initiated
+ -EBUSY The device is currently processing a previous I/O request, or there is
+ a status pending at the device.
+-ENODEV cdev is invalid, the device is not operational or the ccw_device is
+ not online.
+======== ======================================================================
+
+When the I/O request completes, the CDS first level interrupt handler will
+accumulate the status in a struct irb and then call the device interrupt handler.
+The intparm field will contain the value the device driver has associated with a
+particular I/O request. If a pending device status was recognized,
+intparm will be set to 0 (zero). This may happen during I/O initiation or delayed
+by an alert status notification. In any case this status is not related to the
+current (last) I/O request. In case of a delayed status notification no special
+interrupt will be presented to indicate I/O completion as the I/O request was
+never started, even though ccw_device_start() returned with successful completion.
+
+The irb may contain an error value, and the device driver should check for this
+first:
+
+========== =================================================================
+-ETIMEDOUT the common I/O layer terminated the request after the specified
+ timeout value
+-EIO the common I/O layer terminated the request due to an error state
+========== =================================================================
+
+If the concurrent sense flag in the extended status word (esw) in the irb is
+set, the field erw.scnt in the esw describes the number of device specific
+sense bytes available in the extended control word irb->scsw.ecw[]. No device
+sensing by the device driver itself is required.
+
+The device interrupt handler can use the following definitions to investigate
+the primary unit check source coded in sense byte 0 :
+
+======================= ====
+SNS0_CMD_REJECT 0x80
+SNS0_INTERVENTION_REQ 0x40
+SNS0_BUS_OUT_CHECK 0x20
+SNS0_EQUIPMENT_CHECK 0x10
+SNS0_DATA_CHECK 0x08
+SNS0_OVERRUN 0x04
+SNS0_INCOMPL_DOMAIN 0x01
+======================= ====
+
+Depending on the device status, multiple of those values may be set together.
+Please refer to the device specific documentation for details.
+
+The irb->scsw.cstat field provides the (accumulated) subchannel status :
+
+========================= ============================
+SCHN_STAT_PCI program controlled interrupt
+SCHN_STAT_INCORR_LEN incorrect length
+SCHN_STAT_PROG_CHECK program check
+SCHN_STAT_PROT_CHECK protection check
+SCHN_STAT_CHN_DATA_CHK channel data check
+SCHN_STAT_CHN_CTRL_CHK channel control check
+SCHN_STAT_INTF_CTRL_CHK interface control check
+SCHN_STAT_CHAIN_CHECK chaining check
+========================= ============================
+
+The irb->scsw.dstat field provides the (accumulated) device status :
+
+===================== =================
+DEV_STAT_ATTENTION attention
+DEV_STAT_STAT_MOD status modifier
+DEV_STAT_CU_END control unit end
+DEV_STAT_BUSY busy
+DEV_STAT_CHN_END channel end
+DEV_STAT_DEV_END device end
+DEV_STAT_UNIT_CHECK unit check
+DEV_STAT_UNIT_EXCEP unit exception
+===================== =================
+
+Please see the ESA/390 Principles of Operation manual for details on the
+individual flag meanings.
+
+Usage Notes:
+
+ccw_device_start() must be called disabled and with the ccw device lock held.
+
+The device driver is allowed to issue the next ccw_device_start() call from
+within its interrupt handler already. It is not required to schedule a
+bottom-half, unless a non deterministically long running error recovery procedure
+or similar needs to be scheduled. During I/O processing the Linux/390 generic
+I/O device driver support has already obtained the IRQ lock, i.e. the handler
+must not try to obtain it again when calling ccw_device_start() or we end in a
+deadlock situation!
+
+If a device driver relies on an I/O request to be completed prior to start the
+next it can reduce I/O processing overhead by chaining a NoOp I/O command
+CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End
+and Device-End status to be presented together, with a single interrupt.
+However, this should be used with care as it implies the channel will remain
+busy, not being able to process I/O requests for other devices on the same
+channel. Therefore e.g. read commands should never use this technique, as the
+result will be presented by a single interrupt anyway.
+
+In order to minimize I/O overhead, a device driver should use the
+DOIO_REPORT_ALL only if the device can report intermediate interrupt
+information prior to device-end the device driver urgently relies on. In this
+case all I/O interruptions are presented to the device driver until final
+status is recognized.
+
+If a device is able to recover from asynchronously presented I/O errors, it can
+perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some
+devices always report channel-end and device-end together, with a single
+interrupt, others present primary status (channel-end) when the channel is
+ready for the next I/O request and secondary status (device-end) when the data
+transmission has been completed at the device.
+
+Above flag allows to exploit this feature, e.g. for communication devices that
+can handle lost data on the network to allow for enhanced I/O processing.
+
+Unless the channel subsystem at any time presents a secondary status interrupt,
+exploiting this feature will cause only primary status interrupts to be
+presented to the device driver while overlapping I/O is performed. When a
+secondary status without error (alert status) is presented, this indicates
+successful completion for all overlapping ccw_device_start() requests that have
+been issued since the last secondary (final) status.
+
+Channel programs that intend to set the suspend flag on a channel command word
+(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the
+suspend flag will cause a channel program check. At the time the channel program
+becomes suspended an intermediate interrupt will be generated by the channel
+subsystem.
+
+ccw_device_resume() - Resume Channel Program Execution
+
+If a device driver chooses to suspend the current channel program execution by
+setting the CCW suspend flag on a particular CCW, the channel program execution
+is suspended. In order to resume channel program execution the CIO layer
+provides the ccw_device_resume() routine.
+
+::
+
+ int ccw_device_resume(struct ccw_device *cdev);
+
+==== ================================================
+cdev ccw_device the resume operation is requested for
+==== ================================================
+
+The ccw_device_resume() function returns:
+
+========= ==============================================
+ 0 suspended channel program is resumed
+ -EBUSY status pending
+ -ENODEV cdev invalid or not-operational subchannel
+ -EINVAL resume function not applicable
+-ENOTCONN there is no I/O request pending for completion
+========= ==============================================
+
+Usage Notes:
+
+Please have a look at the ccw_device_start() usage notes for more details on
+suspended channel programs.
+
+ccw_device_halt() - Halt I/O Request Processing
+
+Sometimes a device driver might need a possibility to stop the processing of
+a long-running channel program or the device might require to initially issue
+a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt()
+command is provided.
+
+ccw_device_halt() must be called disabled and with the ccw device lock held.
+
+::
+
+ int ccw_device_halt(struct ccw_device *cdev,
+ unsigned long intparm);
+
+======= =====================================================
+cdev ccw_device the halt operation is requested for
+intparm interruption parameter; value is only used if no I/O
+ is outstanding, otherwise the intparm associated with
+ the I/O request is returned
+======= =====================================================
+
+The ccw_device_halt() function returns:
+
+======= ==============================================================
+ 0 request successfully initiated
+-EBUSY the device is currently busy, or status pending.
+-ENODEV cdev invalid.
+-EINVAL The device is not operational or the ccw device is not online.
+======= ==============================================================
+
+Usage Notes:
+
+A device driver may write a never-ending channel program by writing a channel
+program that at its end loops back to its beginning by means of a transfer in
+channel (TIC) command (CCW_CMD_TIC). Usually this is performed by network
+device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is
+executed a program controlled interrupt (PCI) is generated. The device driver
+can then perform an appropriate action. Prior to interrupt of an outstanding
+read to a network device (with or without PCI flag) a ccw_device_halt()
+is required to end the pending operation.
+
+::
+
+ ccw_device_clear() - Terminage I/O Request Processing
+
+In order to terminate all I/O processing at the subchannel, the clear subchannel
+(CSCH) command is used. It can be issued via ccw_device_clear().
+
+ccw_device_clear() must be called disabled and with the ccw device lock held.
+
+::
+
+ int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm);
+
+======= ===============================================
+cdev ccw_device the clear operation is requested for
+intparm interruption parameter (see ccw_device_halt())
+======= ===============================================
+
+The ccw_device_clear() function returns:
+
+======= ==============================================================
+ 0 request successfully initiated
+-ENODEV cdev invalid
+-EINVAL The device is not operational or the ccw device is not online.
+======= ==============================================================
+
+Miscellaneous Support Routines
+------------------------------
+
+This chapter describes various routines to be used in a Linux/390 device
+driver programming environment.
+
+get_ccwdev_lock()
+
+Get the address of the device specific lock. This is then used in
+spin_lock() / spin_unlock() calls.
+
+::
+
+ __u8 ccw_device_get_path_mask(struct ccw_device *cdev);
+
+Get the mask of the path currently available for cdev.
diff --git a/Documentation/s390/cds.txt b/Documentation/s390/cds.txt
deleted file mode 100644
index 480a78ef5a1e..000000000000
--- a/Documentation/s390/cds.txt
+++ /dev/null
@@ -1,472 +0,0 @@
-Linux for S/390 and zSeries
-
-Common Device Support (CDS)
-Device Driver I/O Support Routines
-
-Authors : Ingo Adlung
- Cornelia Huck
-
-Copyright, IBM Corp. 1999-2002
-
-Introduction
-
-This document describes the common device support routines for Linux/390.
-Different than other hardware architectures, ESA/390 has defined a unified
-I/O access method. This gives relief to the device drivers as they don't
-have to deal with different bus types, polling versus interrupt
-processing, shared versus non-shared interrupt processing, DMA versus port
-I/O (PIO), and other hardware features more. However, this implies that
-either every single device driver needs to implement the hardware I/O
-attachment functionality itself, or the operating system provides for a
-unified method to access the hardware, providing all the functionality that
-every single device driver would have to provide itself.
-
-The document does not intend to explain the ESA/390 hardware architecture in
-every detail.This information can be obtained from the ESA/390 Principles of
-Operation manual (IBM Form. No. SA22-7201).
-
-In order to build common device support for ESA/390 I/O interfaces, a
-functional layer was introduced that provides generic I/O access methods to
-the hardware.
-
-The common device support layer comprises the I/O support routines defined
-below. Some of them implement common Linux device driver interfaces, while
-some of them are ESA/390 platform specific.
-
-Note:
-In order to write a driver for S/390, you also need to look into the interface
-described in Documentation/s390/driver-model.txt.
-
-Note for porting drivers from 2.4:
-The major changes are:
-* The functions use a ccw_device instead of an irq (subchannel).
-* All drivers must define a ccw_driver (see driver-model.txt) and the associated
- functions.
-* request_irq() and free_irq() are no longer done by the driver.
-* The oper_handler is (kindof) replaced by the probe() and set_online() functions
- of the ccw_driver.
-* The not_oper_handler is (kindof) replaced by the remove() and set_offline()
- functions of the ccw_driver.
-* The channel device layer is gone.
-* The interrupt handlers must be adapted to use a ccw_device as argument.
- Moreover, they don't return a devstat, but an irb.
-* Before initiating an io, the options must be set via ccw_device_set_options().
-* Instead of calling read_dev_chars()/read_conf_data(), the driver issues
- the channel program and handles the interrupt itself.
-
-ccw_device_get_ciw()
- get commands from extended sense data.
-
-ccw_device_start()
-ccw_device_start_timeout()
-ccw_device_start_key()
-ccw_device_start_key_timeout()
- initiate an I/O request.
-
-ccw_device_resume()
- resume channel program execution.
-
-ccw_device_halt()
- terminate the current I/O request processed on the device.
-
-do_IRQ()
- generic interrupt routine. This function is called by the interrupt entry
- routine whenever an I/O interrupt is presented to the system. The do_IRQ()
- routine determines the interrupt status and calls the device specific
- interrupt handler according to the rules (flags) defined during I/O request
- initiation with do_IO().
-
-The next chapters describe the functions other than do_IRQ() in more details.
-The do_IRQ() interface is not described, as it is called from the Linux/390
-first level interrupt handler only and does not comprise a device driver
-callable interface. Instead, the functional description of do_IO() also
-describes the input to the device specific interrupt handler.
-
-Note: All explanations apply also to the 64 bit architecture s390x.
-
-
-Common Device Support (CDS) for Linux/390 Device Drivers
-
-General Information
-
-The following chapters describe the I/O related interface routines the
-Linux/390 common device support (CDS) provides to allow for device specific
-driver implementations on the IBM ESA/390 hardware platform. Those interfaces
-intend to provide the functionality required by every device driver
-implementation to allow to drive a specific hardware device on the ESA/390
-platform. Some of the interface routines are specific to Linux/390 and some
-of them can be found on other Linux platforms implementations too.
-Miscellaneous function prototypes, data declarations, and macro definitions
-can be found in the architecture specific C header file
-linux/arch/s390/include/asm/irq.h.
-
-Overview of CDS interface concepts
-
-Different to other hardware platforms, the ESA/390 architecture doesn't define
-interrupt lines managed by a specific interrupt controller and bus systems
-that may or may not allow for shared interrupts, DMA processing, etc.. Instead,
-the ESA/390 architecture has implemented a so called channel subsystem, that
-provides a unified view of the devices physically attached to the systems.
-Though the ESA/390 hardware platform knows about a huge variety of different
-peripheral attachments like disk devices (aka. DASDs), tapes, communication
-controllers, etc. they can all be accessed by a well defined access method and
-they are presenting I/O completion a unified way : I/O interruptions. Every
-single device is uniquely identified to the system by a so called subchannel,
-where the ESA/390 architecture allows for 64k devices be attached.
-
-Linux, however, was first built on the Intel PC architecture, with its two
-cascaded 8259 programmable interrupt controllers (PICs), that allow for a
-maximum of 15 different interrupt lines. All devices attached to such a system
-share those 15 interrupt levels. Devices attached to the ISA bus system must
-not share interrupt levels (aka. IRQs), as the ISA bus bases on edge triggered
-interrupts. MCA, EISA, PCI and other bus systems base on level triggered
-interrupts, and therewith allow for shared IRQs. However, if multiple devices
-present their hardware status by the same (shared) IRQ, the operating system
-has to call every single device driver registered on this IRQ in order to
-determine the device driver owning the device that raised the interrupt.
-
-Up to kernel 2.4, Linux/390 used to provide interfaces via the IRQ (subchannel).
-For internal use of the common I/O layer, these are still there. However,
-device drivers should use the new calling interface via the ccw_device only.
-
-During its startup the Linux/390 system checks for peripheral devices. Each
-of those devices is uniquely defined by a so called subchannel by the ESA/390
-channel subsystem. While the subchannel numbers are system generated, each
-subchannel also takes a user defined attribute, the so called device number.
-Both subchannel number and device number cannot exceed 65535. During sysfs
-initialisation, the information about control unit type and device types that
-imply specific I/O commands (channel command words - CCWs) in order to operate
-the device are gathered. Device drivers can retrieve this set of hardware
-information during their initialization step to recognize the devices they
-support using the information saved in the struct ccw_device given to them.
-This methods implies that Linux/390 doesn't require to probe for free (not
-armed) interrupt request lines (IRQs) to drive its devices with. Where
-applicable, the device drivers can use issue the READ DEVICE CHARACTERISTICS
-ccw to retrieve device characteristics in its online routine.
-
-In order to allow for easy I/O initiation the CDS layer provides a
-ccw_device_start() interface that takes a device specific channel program (one
-or more CCWs) as input sets up the required architecture specific control blocks
-and initiates an I/O request on behalf of the device driver. The
-ccw_device_start() routine allows to specify whether it expects the CDS layer
-to notify the device driver for every interrupt it observes, or with final status
-only. See ccw_device_start() for more details. A device driver must never issue
-ESA/390 I/O commands itself, but must use the Linux/390 CDS interfaces instead.
-
-For long running I/O request to be canceled, the CDS layer provides the
-ccw_device_halt() function. Some devices require to initially issue a HALT
-SUBCHANNEL (HSCH) command without having pending I/O requests. This function is
-also covered by ccw_device_halt().
-
-
-get_ciw() - get command information word
-
-This call enables a device driver to get information about supported commands
-from the extended SenseID data.
-
-struct ciw *
-ccw_device_get_ciw(struct ccw_device *cdev, __u32 cmd);
-
-cdev - The ccw_device for which the command is to be retrieved.
-cmd - The command type to be retrieved.
-
-ccw_device_get_ciw() returns:
-NULL - No extended data available, invalid device or command not found.
-!NULL - The command requested.
-
-
-ccw_device_start() - Initiate I/O Request
-
-The ccw_device_start() routines is the I/O request front-end processor. All
-device driver I/O requests must be issued using this routine. A device driver
-must not issue ESA/390 I/O commands itself. Instead the ccw_device_start()
-routine provides all interfaces required to drive arbitrary devices.
-
-This description also covers the status information passed to the device
-driver's interrupt handler as this is related to the rules (flags) defined
-with the associated I/O request when calling ccw_device_start().
-
-int ccw_device_start(struct ccw_device *cdev,
- struct ccw1 *cpa,
- unsigned long intparm,
- __u8 lpm,
- unsigned long flags);
-int ccw_device_start_timeout(struct ccw_device *cdev,
- struct ccw1 *cpa,
- unsigned long intparm,
- __u8 lpm,
- unsigned long flags,
- int expires);
-int ccw_device_start_key(struct ccw_device *cdev,
- struct ccw1 *cpa,
- unsigned long intparm,
- __u8 lpm,
- __u8 key,
- unsigned long flags);
-int ccw_device_start_key_timeout(struct ccw_device *cdev,
- struct ccw1 *cpa,
- unsigned long intparm,
- __u8 lpm,
- __u8 key,
- unsigned long flags,
- int expires);
-
-cdev : ccw_device the I/O is destined for
-cpa : logical start address of channel program
-user_intparm : user specific interrupt information; will be presented
- back to the device driver's interrupt handler. Allows a
- device driver to associate the interrupt with a
- particular I/O request.
-lpm : defines the channel path to be used for a specific I/O
- request. A value of 0 will make cio use the opm.
-key : the storage key to use for the I/O (useful for operating on a
- storage with a storage key != default key)
-flag : defines the action to be performed for I/O processing
-expires : timeout value in jiffies. The common I/O layer will terminate
- the running program after this and call the interrupt handler
- with ERR_PTR(-ETIMEDOUT) as irb.
-
-Possible flag values are :
-
-DOIO_ALLOW_SUSPEND - channel program may become suspended
-DOIO_DENY_PREFETCH - don't allow for CCW prefetch; usually
- this implies the channel program might
- become modified
-DOIO_SUPPRESS_INTER - don't call the handler on intermediate status
-
-The cpa parameter points to the first format 1 CCW of a channel program :
-
-struct ccw1 {
- __u8 cmd_code;/* command code */
- __u8 flags; /* flags, like IDA addressing, etc. */
- __u16 count; /* byte count */
- __u32 cda; /* data address */
-} __attribute__ ((packed,aligned(8)));
-
-with the following CCW flags values defined :
-
-CCW_FLAG_DC - data chaining
-CCW_FLAG_CC - command chaining
-CCW_FLAG_SLI - suppress incorrect length
-CCW_FLAG_SKIP - skip
-CCW_FLAG_PCI - PCI
-CCW_FLAG_IDA - indirect addressing
-CCW_FLAG_SUSPEND - suspend
-
-
-Via ccw_device_set_options(), the device driver may specify the following
-options for the device:
-
-DOIO_EARLY_NOTIFICATION - allow for early interrupt notification
-DOIO_REPORT_ALL - report all interrupt conditions
-
-
-The ccw_device_start() function returns :
-
- 0 - successful completion or request successfully initiated
--EBUSY - The device is currently processing a previous I/O request, or there is
- a status pending at the device.
--ENODEV - cdev is invalid, the device is not operational or the ccw_device is
- not online.
-
-When the I/O request completes, the CDS first level interrupt handler will
-accumulate the status in a struct irb and then call the device interrupt handler.
-The intparm field will contain the value the device driver has associated with a
-particular I/O request. If a pending device status was recognized,
-intparm will be set to 0 (zero). This may happen during I/O initiation or delayed
-by an alert status notification. In any case this status is not related to the
-current (last) I/O request. In case of a delayed status notification no special
-interrupt will be presented to indicate I/O completion as the I/O request was
-never started, even though ccw_device_start() returned with successful completion.
-
-The irb may contain an error value, and the device driver should check for this
-first:
-
--ETIMEDOUT: the common I/O layer terminated the request after the specified
- timeout value
--EIO: the common I/O layer terminated the request due to an error state
-
-If the concurrent sense flag in the extended status word (esw) in the irb is
-set, the field erw.scnt in the esw describes the number of device specific
-sense bytes available in the extended control word irb->scsw.ecw[]. No device
-sensing by the device driver itself is required.
-
-The device interrupt handler can use the following definitions to investigate
-the primary unit check source coded in sense byte 0 :
-
-SNS0_CMD_REJECT 0x80
-SNS0_INTERVENTION_REQ 0x40
-SNS0_BUS_OUT_CHECK 0x20
-SNS0_EQUIPMENT_CHECK 0x10
-SNS0_DATA_CHECK 0x08
-SNS0_OVERRUN 0x04
-SNS0_INCOMPL_DOMAIN 0x01
-
-Depending on the device status, multiple of those values may be set together.
-Please refer to the device specific documentation for details.
-
-The irb->scsw.cstat field provides the (accumulated) subchannel status :
-
-SCHN_STAT_PCI - program controlled interrupt
-SCHN_STAT_INCORR_LEN - incorrect length
-SCHN_STAT_PROG_CHECK - program check
-SCHN_STAT_PROT_CHECK - protection check
-SCHN_STAT_CHN_DATA_CHK - channel data check
-SCHN_STAT_CHN_CTRL_CHK - channel control check
-SCHN_STAT_INTF_CTRL_CHK - interface control check
-SCHN_STAT_CHAIN_CHECK - chaining check
-
-The irb->scsw.dstat field provides the (accumulated) device status :
-
-DEV_STAT_ATTENTION - attention
-DEV_STAT_STAT_MOD - status modifier
-DEV_STAT_CU_END - control unit end
-DEV_STAT_BUSY - busy
-DEV_STAT_CHN_END - channel end
-DEV_STAT_DEV_END - device end
-DEV_STAT_UNIT_CHECK - unit check
-DEV_STAT_UNIT_EXCEP - unit exception
-
-Please see the ESA/390 Principles of Operation manual for details on the
-individual flag meanings.
-
-Usage Notes :
-
-ccw_device_start() must be called disabled and with the ccw device lock held.
-
-The device driver is allowed to issue the next ccw_device_start() call from
-within its interrupt handler already. It is not required to schedule a
-bottom-half, unless a non deterministically long running error recovery procedure
-or similar needs to be scheduled. During I/O processing the Linux/390 generic
-I/O device driver support has already obtained the IRQ lock, i.e. the handler
-must not try to obtain it again when calling ccw_device_start() or we end in a
-deadlock situation!
-
-If a device driver relies on an I/O request to be completed prior to start the
-next it can reduce I/O processing overhead by chaining a NoOp I/O command
-CCW_CMD_NOOP to the end of the submitted CCW chain. This will force Channel-End
-and Device-End status to be presented together, with a single interrupt.
-However, this should be used with care as it implies the channel will remain
-busy, not being able to process I/O requests for other devices on the same
-channel. Therefore e.g. read commands should never use this technique, as the
-result will be presented by a single interrupt anyway.
-
-In order to minimize I/O overhead, a device driver should use the
-DOIO_REPORT_ALL only if the device can report intermediate interrupt
-information prior to device-end the device driver urgently relies on. In this
-case all I/O interruptions are presented to the device driver until final
-status is recognized.
-
-If a device is able to recover from asynchronously presented I/O errors, it can
-perform overlapping I/O using the DOIO_EARLY_NOTIFICATION flag. While some
-devices always report channel-end and device-end together, with a single
-interrupt, others present primary status (channel-end) when the channel is
-ready for the next I/O request and secondary status (device-end) when the data
-transmission has been completed at the device.
-
-Above flag allows to exploit this feature, e.g. for communication devices that
-can handle lost data on the network to allow for enhanced I/O processing.
-
-Unless the channel subsystem at any time presents a secondary status interrupt,
-exploiting this feature will cause only primary status interrupts to be
-presented to the device driver while overlapping I/O is performed. When a
-secondary status without error (alert status) is presented, this indicates
-successful completion for all overlapping ccw_device_start() requests that have
-been issued since the last secondary (final) status.
-
-Channel programs that intend to set the suspend flag on a channel command word
-(CCW) must start the I/O operation with the DOIO_ALLOW_SUSPEND option or the
-suspend flag will cause a channel program check. At the time the channel program
-becomes suspended an intermediate interrupt will be generated by the channel
-subsystem.
-
-ccw_device_resume() - Resume Channel Program Execution
-
-If a device driver chooses to suspend the current channel program execution by
-setting the CCW suspend flag on a particular CCW, the channel program execution
-is suspended. In order to resume channel program execution the CIO layer
-provides the ccw_device_resume() routine.
-
-int ccw_device_resume(struct ccw_device *cdev);
-
-cdev - ccw_device the resume operation is requested for
-
-The ccw_device_resume() function returns:
-
- 0 - suspended channel program is resumed
--EBUSY - status pending
--ENODEV - cdev invalid or not-operational subchannel
--EINVAL - resume function not applicable
--ENOTCONN - there is no I/O request pending for completion
-
-Usage Notes:
-Please have a look at the ccw_device_start() usage notes for more details on
-suspended channel programs.
-
-ccw_device_halt() - Halt I/O Request Processing
-
-Sometimes a device driver might need a possibility to stop the processing of
-a long-running channel program or the device might require to initially issue
-a halt subchannel (HSCH) I/O command. For those purposes the ccw_device_halt()
-command is provided.
-
-ccw_device_halt() must be called disabled and with the ccw device lock held.
-
-int ccw_device_halt(struct ccw_device *cdev,
- unsigned long intparm);
-
-cdev : ccw_device the halt operation is requested for
-intparm : interruption parameter; value is only used if no I/O
- is outstanding, otherwise the intparm associated with
- the I/O request is returned
-
-The ccw_device_halt() function returns :
-
- 0 - request successfully initiated
--EBUSY - the device is currently busy, or status pending.
--ENODEV - cdev invalid.
--EINVAL - The device is not operational or the ccw device is not online.
-
-Usage Notes :
-
-A device driver may write a never-ending channel program by writing a channel
-program that at its end loops back to its beginning by means of a transfer in
-channel (TIC) command (CCW_CMD_TIC). Usually this is performed by network
-device drivers by setting the PCI CCW flag (CCW_FLAG_PCI). Once this CCW is
-executed a program controlled interrupt (PCI) is generated. The device driver
-can then perform an appropriate action. Prior to interrupt of an outstanding
-read to a network device (with or without PCI flag) a ccw_device_halt()
-is required to end the pending operation.
-
-ccw_device_clear() - Terminage I/O Request Processing
-
-In order to terminate all I/O processing at the subchannel, the clear subchannel
-(CSCH) command is used. It can be issued via ccw_device_clear().
-
-ccw_device_clear() must be called disabled and with the ccw device lock held.
-
-int ccw_device_clear(struct ccw_device *cdev, unsigned long intparm);
-
-cdev: ccw_device the clear operation is requested for
-intparm: interruption parameter (see ccw_device_halt())
-
-The ccw_device_clear() function returns:
-
- 0 - request successfully initiated
--ENODEV - cdev invalid
--EINVAL - The device is not operational or the ccw device is not online.
-
-Miscellaneous Support Routines
-
-This chapter describes various routines to be used in a Linux/390 device
-driver programming environment.
-
-get_ccwdev_lock()
-
-Get the address of the device specific lock. This is then used in
-spin_lock() / spin_unlock() calls.
-
-
-__u8 ccw_device_get_path_mask(struct ccw_device *cdev);
-
-Get the mask of the path currently available for cdev.
diff --git a/Documentation/s390/common_io.rst b/Documentation/s390/common_io.rst
new file mode 100644
index 000000000000..846485681ce7
--- /dev/null
+++ b/Documentation/s390/common_io.rst
@@ -0,0 +1,140 @@
+======================
+S/390 common I/O-Layer
+======================
+
+command line parameters, procfs and debugfs entries
+===================================================
+
+Command line parameters
+-----------------------
+
+* ccw_timeout_log
+
+ Enable logging of debug information in case of ccw device timeouts.
+
+* cio_ignore = device[,device[,..]]
+
+ device := {all | [!]ipldev | [!]condev | [!]<devno> | [!]<devno>-<devno>}
+
+ The given devices will be ignored by the common I/O-layer; no detection
+ and device sensing will be done on any of those devices. The subchannel to
+ which the device in question is attached will be treated as if no device was
+ attached.
+
+ An ignored device can be un-ignored later; see the "/proc entries"-section for
+ details.
+
+ The devices must be given either as bus ids (0.x.abcd) or as hexadecimal
+ device numbers (0xabcd or abcd, for 2.4 backward compatibility). If you
+ give a device number 0xabcd, it will be interpreted as 0.0.abcd.
+
+ You can use the 'all' keyword to ignore all devices. The 'ipldev' and 'condev'
+ keywords can be used to refer to the CCW based boot device and CCW console
+ device respectively (these are probably useful only when combined with the '!'
+ operator). The '!' operator will cause the I/O-layer to _not_ ignore a device.
+ The command line
+ is parsed from left to right.
+
+ For example::
+
+ cio_ignore=0.0.0023-0.0.0042,0.0.4711
+
+ will ignore all devices ranging from 0.0.0023 to 0.0.0042 and the device
+ 0.0.4711, if detected.
+
+ As another example::
+
+ cio_ignore=all,!0.0.4711,!0.0.fd00-0.0.fd02
+
+ will ignore all devices but 0.0.4711, 0.0.fd00, 0.0.fd01, 0.0.fd02.
+
+ By default, no devices are ignored.
+
+
+/proc entries
+-------------
+
+* /proc/cio_ignore
+
+ Lists the ranges of devices (by bus id) which are ignored by common I/O.
+
+ You can un-ignore certain or all devices by piping to /proc/cio_ignore.
+ "free all" will un-ignore all ignored devices,
+ "free <device range>, <device range>, ..." will un-ignore the specified
+ devices.
+
+ For example, if devices 0.0.0023 to 0.0.0042 and 0.0.4711 are ignored,
+
+ - echo free 0.0.0030-0.0.0032 > /proc/cio_ignore
+ will un-ignore devices 0.0.0030 to 0.0.0032 and will leave devices 0.0.0023
+ to 0.0.002f, 0.0.0033 to 0.0.0042 and 0.0.4711 ignored;
+ - echo free 0.0.0041 > /proc/cio_ignore will furthermore un-ignore device
+ 0.0.0041;
+ - echo free all > /proc/cio_ignore will un-ignore all remaining ignored
+ devices.
+
+ When a device is un-ignored, device recognition and sensing is performed and
+ the device driver will be notified if possible, so the device will become
+ available to the system. Note that un-ignoring is performed asynchronously.
+
+ You can also add ranges of devices to be ignored by piping to
+ /proc/cio_ignore; "add <device range>, <device range>, ..." will ignore the
+ specified devices.
+
+ Note: While already known devices can be added to the list of devices to be
+ ignored, there will be no effect on then. However, if such a device
+ disappears and then reappears, it will then be ignored. To make
+ known devices go away, you need the "purge" command (see below).
+
+ For example::
+
+ "echo add 0.0.a000-0.0.accc, 0.0.af00-0.0.afff > /proc/cio_ignore"
+
+ will add 0.0.a000-0.0.accc and 0.0.af00-0.0.afff to the list of ignored
+ devices.
+
+ You can remove already known but now ignored devices via::
+
+ "echo purge > /proc/cio_ignore"
+
+ All devices ignored but still registered and not online (= not in use)
+ will be deregistered and thus removed from the system.
+
+ The devices can be specified either by bus id (0.x.abcd) or, for 2.4 backward
+ compatibility, by the device number in hexadecimal (0xabcd or abcd). Device
+ numbers given as 0xabcd will be interpreted as 0.0.abcd.
+
+* /proc/cio_settle
+
+ A write request to this file is blocked until all queued cio actions are
+ handled. This will allow userspace to wait for pending work affecting
+ device availability after changing cio_ignore or the hardware configuration.
+
+* For some of the information present in the /proc filesystem in 2.4 (namely,
+ /proc/subchannels and /proc/chpids), see driver-model.txt.
+ Information formerly in /proc/irq_count is now in /proc/interrupts.
+
+
+debugfs entries
+---------------
+
+* /sys/kernel/debug/s390dbf/cio_*/ (S/390 debug feature)
+
+ Some views generated by the debug feature to hold various debug outputs.
+
+ - /sys/kernel/debug/s390dbf/cio_crw/sprintf
+ Messages from the processing of pending channel report words (machine check
+ handling).
+
+ - /sys/kernel/debug/s390dbf/cio_msg/sprintf
+ Various debug messages from the common I/O-layer.
+
+ - /sys/kernel/debug/s390dbf/cio_trace/hex_ascii
+ Logs the calling of functions in the common I/O-layer and, if applicable,
+ which subchannel they were called for, as well as dumps of some data
+ structures (like irb in an error case).
+
+ The level of logging can be changed to be more or less verbose by piping to
+ /sys/kernel/debug/s390dbf/cio_*/level a number between 0 and 6; see the
+ documentation on the S/390 debug feature (Documentation/s390/s390dbf.rst)
+ for details.
diff --git a/Documentation/s390/dasd.rst b/Documentation/s390/dasd.rst
new file mode 100644
index 000000000000..9e22247285c8
--- /dev/null
+++ b/Documentation/s390/dasd.rst
@@ -0,0 +1,84 @@
+==================
+DASD device driver
+==================
+
+S/390's disk devices (DASDs) are managed by Linux via the DASD device
+driver. It is valid for all types of DASDs and represents them to
+Linux as block devices, namely "dd". Currently the DASD driver uses a
+single major number (254) and 4 minor numbers per volume (1 for the
+physical volume and 3 for partitions). With respect to partitions see
+below. Thus you may have up to 64 DASD devices in your system.
+
+The kernel parameter 'dasd=from-to,...' may be issued arbitrary times
+in the kernel's parameter line or not at all. The 'from' and 'to'
+parameters are to be given in hexadecimal notation without a leading
+0x.
+If you supply kernel parameters the different instances are processed
+in order of appearance and a minor number is reserved for any device
+covered by the supplied range up to 64 volumes. Additional DASDs are
+ignored. If you do not supply the 'dasd=' kernel parameter at all, the
+DASD driver registers all supported DASDs of your system to a minor
+number in ascending order of the subchannel number.
+
+The driver currently supports ECKD-devices and there are stubs for
+support of the FBA and CKD architectures. For the FBA architecture
+only some smart data structures are missing to make the support
+complete.
+We performed our testing on 3380 and 3390 type disks of different
+sizes, under VM and on the bare hardware (LPAR), using internal disks
+of the multiprise as well as a RAMAC virtual array. Disks exported by
+an Enterprise Storage Server (Seascape) should work fine as well.
+
+We currently implement one partition per volume, which is the whole
+volume, skipping the first blocks up to the volume label. These are
+reserved for IPL records and IBM's volume label to assure
+accessibility of the DASD from other OSs. In a later stage we will
+provide support of partitions, maybe VTOC oriented or using a kind of
+partition table in the label record.
+
+Usage
+=====
+
+-Low-level format (?CKD only)
+For using an ECKD-DASD as a Linux harddisk you have to low-level
+format the tracks by issuing the BLKDASDFORMAT-ioctl on that
+device. This will erase any data on that volume including IBM volume
+labels, VTOCs etc. The ioctl may take a `struct format_data *` or
+'NULL' as an argument::
+
+ typedef struct {
+ int start_unit;
+ int stop_unit;
+ int blksize;
+ } format_data_t;
+
+When a NULL argument is passed to the BLKDASDFORMAT ioctl the whole
+disk is formatted to a blocksize of 1024 bytes. Otherwise start_unit
+and stop_unit are the first and last track to be formatted. If
+stop_unit is -1 it implies that the DASD is formatted from start_unit
+up to the last track. blksize can be any power of two between 512 and
+4096. We recommend no blksize lower than 1024 because the ext2fs uses
+1kB blocks anyway and you gain approx. 50% of capacity increasing your
+blksize from 512 byte to 1kB.
+
+Make a filesystem
+=================
+
+Then you can mk??fs the filesystem of your choice on that volume or
+partition. For reasons of sanity you should build your filesystem on
+the partition /dev/dd?1 instead of the whole volume. You only lose 3kB
+but may be sure that you can reuse your data after introduction of a
+real partition table.
+
+Bugs
+====
+
+- Performance sometimes is rather low because we don't fully exploit clustering
+
+TODO-List
+=========
+
+- Add IBM'S Disk layout to genhd
+- Enhance driver to use more than one major number
+- Enable usage as a module
+- Support Cache fast write and DASD fast write (ECKD)
diff --git a/Documentation/s390/debugging390.rst b/Documentation/s390/debugging390.rst
new file mode 100644
index 000000000000..73ad0b06c666
--- /dev/null
+++ b/Documentation/s390/debugging390.rst
@@ -0,0 +1,2613 @@
+=============================================
+Debugging on Linux for s/390 & z/Architecture
+=============================================
+
+Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
+
+Copyright (C) 2000-2001 IBM Deutschland Entwicklung GmbH, IBM Corporation
+
+.. Best viewed with fixed width fonts
+
+Overview of Document:
+=====================
+This document is intended to give a good overview of how to debug Linux for
+s/390 and z/Architecture. It is not intended as a complete reference and not a
+tutorial on the fundamentals of C & assembly. It doesn't go into
+390 IO in any detail. It is intended to complement the documents in the
+reference section below & any other worthwhile references you get.
+
+It is intended like the Enterprise Systems Architecture/390 Reference Summary
+to be printed out & used as a quick cheat sheet self help style reference when
+problems occur.
+
+.. Contents
+ ========
+ Register Set
+ Address Spaces on Intel Linux
+ Address Spaces on Linux for s/390 & z/Architecture
+ The Linux for s/390 & z/Architecture Kernel Task Structure
+ Register Usage & Stackframes on Linux for s/390 & z/Architecture
+ A sample program with comments
+ Compiling programs for debugging on Linux for s/390 & z/Architecture
+ Debugging under VM
+ s/390 & z/Architecture IO Overview
+ Debugging IO on s/390 & z/Architecture under VM
+ GDB on s/390 & z/Architecture
+ Stack chaining in gdb by hand
+ Examining core dumps
+ ldd
+ Debugging modules
+ The proc file system
+ SysRq
+ References
+ Special Thanks
+
+Register Set
+============
+The current architectures have the following registers.
+
+16 General propose registers, 32 bit on s/390 and 64 bit on z/Architecture,
+r0-r15 (or gpr0-gpr15), used for arithmetic and addressing.
+
+16 Control registers, 32 bit on s/390 and 64 bit on z/Architecture, cr0-cr15,
+kernel usage only, used for memory management, interrupt control, debugging
+control etc.
+
+16 Access registers (ar0-ar15), 32 bit on both s/390 and z/Architecture,
+normally not used by normal programs but potentially could be used as
+temporary storage. These registers have a 1:1 association with general
+purpose registers and are designed to be used in the so-called access
+register mode to select different address spaces.
+Access register 0 (and access register 1 on z/Architecture, which needs a
+64 bit pointer) is currently used by the pthread library as a pointer to
+the current running threads private area.
+
+16 64-bit floating point registers (fp0-fp15 ) IEEE & HFP floating
+point format compliant on G5 upwards & a Floating point control reg (FPC)
+
+4 64-bit registers (fp0,fp2,fp4 & fp6) HFP only on older machines.
+
+Note:
+ Linux (currently) always uses IEEE & emulates G5 IEEE format on older
+ machines, ( provided the kernel is configured for this ).
+
+
+The PSW is the most important register on the machine it
+is 64 bit on s/390 & 128 bit on z/Architecture & serves the roles of
+a program counter (pc), condition code register,memory space designator.
+In IBM standard notation I am counting bit 0 as the MSB.
+It has several advantages over a normal program counter
+in that you can change address translation & program counter
+in a single instruction. To change address translation,
+e.g. switching address translation off requires that you
+have a logical=physical mapping for the address you are
+currently running at.
+
++-------------------------+-------------------------------------------------+
+| Bit | |
++--------+----------------+ Value |
+| s/390 | z/Architecture | |
++========+================+=================================================+
+| 0 | 0 | Reserved (must be 0) otherwise specification |
+| | | exception occurs. |
++--------+----------------+-------------------------------------------------+
+| 1 | 1 | Program Event Recording 1 PER enabled, |
+| | | PER is used to facilitate debugging e.g. |
+| | | single stepping. |
++--------+----------------+-------------------------------------------------+
+| 2-4 | 2-4 | Reserved (must be 0). |
++--------+----------------+-------------------------------------------------+
+| 5 | 5 | Dynamic address translation 1=DAT on. |
++--------+----------------+-------------------------------------------------+
+| 6 | 6 | Input/Output interrupt Mask |
++--------+----------------+-------------------------------------------------+
+| 7 | 7 | External interrupt Mask used primarily for |
+| | | interprocessor signalling and clock interrupts. |
++--------+----------------+-------------------------------------------------+
+| 8-11 | 8-11 | PSW Key used for complex memory protection |
+| | | mechanism (not used under linux) |
++--------+----------------+-------------------------------------------------+
+| 12 | 12 | 1 on s/390 0 on z/Architecture |
++--------+----------------+-------------------------------------------------+
+| 13 | 13 | Machine Check Mask 1=enable machine check |
+| | | interrupts |
++--------+----------------+-------------------------------------------------+
+| 14 | 14 | Wait State. Set this to 1 to stop the processor |
+| | | except for interrupts and give time to other |
+| | | LPARS. Used in CPU idle in the kernel to |
+| | | increase overall usage of processor resources. |
++--------+----------------+-------------------------------------------------+
+| 15 | 15 | Problem state (if set to 1 certain instructions |
+| | | are disabled). All linux user programs run with |
+| | | this bit 1 (useful info for debugging under VM).|
++--------+----------------+-------------------------------------------------+
+| 16-17 | 16-17 | Address Space Control |
+| | | |
+| | | 00 Primary Space Mode: |
+| | | |
+| | | The register CR1 contains the primary |
+| | | address-space control element (PASCE), which |
+| | | points to the primary space region/segment |
+| | | table origin. |
+| | | |
+| | | 01 Access register mode |
+| | | |
+| | | 10 Secondary Space Mode: |
+| | | |
+| | | The register CR7 contains the secondary |
+| | | address-space control element (SASCE), which |
+| | | points to the secondary space region or |
+| | | segment table origin. |
+| | | |
+| | | 11 Home Space Mode: |
+| | | |
+| | | The register CR13 contains the home space |
+| | | address-space control element (HASCE), which |
+| | | points to the home space region/segment |
+| | | table origin. |
+| | | |
+| | | See "Address Spaces on Linux for s/390 & |
+| | | z/Architecture" below for more information |
+| | | about address space usage in Linux. |
++--------+----------------+-------------------------------------------------+
+| 18-19 | 18-19 | Condition codes (CC) |
++--------+----------------+-------------------------------------------------+
+| 20 | 20 | Fixed point overflow mask if 1=FPU exceptions |
+| | | for this event occur (normally 0) |
++--------+----------------+-------------------------------------------------+
+| 21 | 21 | Decimal overflow mask if 1=FPU exceptions for |
+| | | this event occur (normally 0) |
++--------+----------------+-------------------------------------------------+
+| 22 | 22 | Exponent underflow mask if 1=FPU exceptions |
+| | | for this event occur (normally 0) |
++--------+----------------+-------------------------------------------------+
+| 23 | 23 | Significance Mask if 1=FPU exceptions for this |
+| | | event occur (normally 0) |
++--------+----------------+-------------------------------------------------+
+| 24-31 | 24-30 | Reserved Must be 0. |
+| +----------------+-------------------------------------------------+
+| | 31 | Extended Addressing Mode |
+| +----------------+-------------------------------------------------+
+| | 32 | Basic Addressing Mode |
+| | | |
+| | | Used to set addressing mode:: |
+| | | |
+| | | +---------+----------+----------+ |
+| | | | PSW 31 | PSW 32 | | |
+| | | +---------+----------+----------+ |
+| | | | 0 | 0 | 24 bit | |
+| | | +---------+----------+----------+ |
+| | | | 0 | 1 | 31 bit | |
+| | | +---------+----------+----------+ |
+| | | | 1 | 1 | 64 bit | |
+| | | +---------+----------+----------+ |
++--------+----------------+-------------------------------------------------+
+| 32 | | 1=31 bit addressing mode 0=24 bit addressing |
+| | | mode (for backward compatibility), linux |
+| | | always runs with this bit set to 1 |
++--------+----------------+-------------------------------------------------+
+| 33-64 | | Instruction address. |
+| +----------------+-------------------------------------------------+
+| | 33-63 | Reserved must be 0 |
+| +----------------+-------------------------------------------------+
+| | 64-127 | Address |
+| | | |
+| | | - In 24 bits mode bits 64-103=0 bits 104-127 |
+| | | Address |
+| | | - In 31 bits mode bits 64-96=0 bits 97-127 |
+| | | Address |
+| | | |
+| | | Note: |
+| | | unlike 31 bit mode on s/390 bit 96 must be |
+| | | zero when loading the address with LPSWE |
+| | | otherwise a specification exception occurs, |
+| | | LPSW is fully backward compatible. |
++--------+----------------+-------------------------------------------------+
+
+Prefix Page(s)
+--------------
+This per cpu memory area is too intimately tied to the processor not to mention.
+It exists between the real addresses 0-4096 on s/390 and between 0-8192 on
+z/Architecture and is exchanged with one page on s/390 or two pages on
+z/Architecture in absolute storage by the set prefix instruction during Linux
+startup.
+
+This page is mapped to a different prefix for each processor in an SMP
+configuration (assuming the OS designer is sane of course).
+
+Bytes 0-512 (200 hex) on s/390 and 0-512, 4096-4544, 4604-5119 currently on
+z/Architecture are used by the processor itself for holding such information
+as exception indications and entry points for exceptions.
+
+Bytes after 0xc00 hex are used by linux for per processor globals on s/390 and
+z/Architecture (there is a gap on z/Architecture currently between 0xc00 and
+0x1000, too, which is used by Linux).
+
+The closest thing to this on traditional architectures is the interrupt
+vector table. This is a good thing & does simplify some of the kernel coding
+however it means that we now cannot catch stray NULL pointers in the
+kernel without hard coded checks.
+
+
+
+Address Spaces on Intel Linux
+=============================
+
+The traditional Intel Linux is approximately mapped as follows forgive
+the ascii art::
+
+ 0xFFFFFFFF 4GB Himem *****************
+ * *
+ * Kernel Space *
+ * *
+ ***************** ****************
+ User Space Himem * User Stack * * *
+ (typically 0xC0000000 3GB ) ***************** * *
+ * Shared Libs * * Next Process *
+ ***************** * to *
+ * * <== * Run * <==
+ * User Program * * *
+ * Data BSS * * *
+ * Text * * *
+ * Sections * * *
+ 0x00000000 ***************** ****************
+
+Now it is easy to see that on Intel it is quite easy to recognise a kernel
+address as being one greater than user space himem (in this case 0xC0000000),
+and addresses of less than this are the ones in the current running program on
+this processor (if an smp box).
+
+If using the virtual machine ( VM ) as a debugger it is quite difficult to
+know which user process is running as the address space you are looking at
+could be from any process in the run queue.
+
+The limitation of Intels addressing technique is that the linux
+kernel uses a very simple real address to virtual addressing technique
+of Real Address=Virtual Address-User Space Himem.
+This means that on Intel the kernel linux can typically only address
+Himem=0xFFFFFFFF-0xC0000000=1GB & this is all the RAM these machines
+can typically use.
+
+They can lower User Himem to 2GB or lower & thus be
+able to use 2GB of RAM however this shrinks the maximum size
+of User Space from 3GB to 2GB they have a no win limit of 4GB unless
+they go to 64 Bit.
+
+
+On 390 our limitations & strengths make us slightly different.
+For backward compatibility we are only allowed use 31 bits (2GB)
+of our 32 bit addresses, however, we use entirely separate address
+spaces for the user & kernel.
+
+This means we can support 2GB of non Extended RAM on s/390, & more
+with the Extended memory management swap device &
+currently 4TB of physical memory currently on z/Architecture.
+
+
+Address Spaces on Linux for s/390 & z/Architecture
+==================================================
+
+Our addressing scheme is basically as follows::
+
+ Primary Space Home Space
+ Himem 0x7fffffff 2GB on s/390 ***************** ****************
+ currently 0x3ffffffffff (2^42)-1 * User Stack * * *
+ on z/Architecture. ***************** * *
+ * Shared Libs * * *
+ ***************** * *
+ * * * Kernel *
+ * User Program * * *
+ * Data BSS * * *
+ * Text * * *
+ * Sections * * *
+ 0x00000000 ***************** ****************
+
+This also means that we need to look at the PSW problem state bit and the
+addressing mode to decide whether we are looking at user or kernel space.
+
+User space runs in primary address mode (or access register mode within
+the vdso code).
+
+The kernel usually also runs in home space mode, however when accessing
+user space the kernel switches to primary or secondary address mode if
+the mvcos instruction is not available or if a compare-and-swap (futex)
+instruction on a user space address is performed.
+
+When also looking at the ASCE control registers, this means:
+
+User space:
+
+- runs in primary or access register mode
+- cr1 contains the user asce
+- cr7 contains the user asce
+- cr13 contains the kernel asce
+
+Kernel space:
+
+- runs in home space mode
+- cr1 contains the user or kernel asce
+
+ - the kernel asce is loaded when a uaccess requires primary or
+ secondary address mode
+
+- cr7 contains the user or kernel asce, (changed with set_fs())
+- cr13 contains the kernel asce
+
+In case of uaccess the kernel changes to:
+
+- primary space mode in case of a uaccess (copy_to_user) and uses
+ e.g. the mvcp instruction to access user space. However the kernel
+ will stay in home space mode if the mvcos instruction is available
+- secondary space mode in case of futex atomic operations, so that the
+ instructions come from primary address space and data from secondary
+ space
+
+In case of KVM, the kernel runs in home space mode, but cr1 gets switched
+to contain the gmap asce before the SIE instruction gets executed. When
+the SIE instruction is finished, cr1 will be switched back to contain the
+user asce.
+
+
+Virtual Addresses on s/390 & z/Architecture
+===========================================
+
+A virtual address on s/390 is made up of 3 parts
+The SX (segment index, roughly corresponding to the PGD & PMD in Linux
+terminology) being bits 1-11.
+
+The PX (page index, corresponding to the page table entry (pte) in Linux
+terminology) being bits 12-19.
+
+The remaining bits BX (the byte index are the offset in the page )
+i.e. bits 20 to 31.
+
+On z/Architecture in linux we currently make up an address from 4 parts.
+
+- The region index bits (RX) 0-32 we currently use bits 22-32
+- The segment index (SX) being bits 33-43
+- The page index (PX) being bits 44-51
+- The byte index (BX) being bits 52-63
+
+Notes:
+ 1) s/390 has no PMD so the PMD is really the PGD also.
+ A lot of this stuff is defined in pgtable.h.
+
+ 2) Also seeing as s/390's page indexes are only 1k in size
+ (bits 12-19 x 4 bytes per pte ) we use 1 ( page 4k )
+ to make the best use of memory by updating 4 segment indices
+ entries each time we mess with a PMD & use offsets
+ 0,1024,2048 & 3072 in this page as for our segment indexes.
+ On z/Architecture our page indexes are now 2k in size
+ ( bits 12-19 x 8 bytes per pte ) we do a similar trick
+ but only mess with 2 segment indices each time we mess with
+ a PMD.
+
+ 3) As z/Architecture supports up to a massive 5-level page table lookup we
+ can only use 3 currently on Linux ( as this is all the generic kernel
+ currently supports ) however this may change in future
+ this allows us to access ( according to my sums )
+ 4TB of virtual storage per process i.e.
+ 4096*512(PTES)*1024(PMDS)*2048(PGD) = 4398046511104 bytes,
+ enough for another 2 or 3 of years I think :-).
+ to do this we use a region-third-table designation type in
+ our address space control registers.
+
+
+The Linux for s/390 & z/Architecture Kernel Task Structure
+==========================================================
+Each process/thread under Linux for S390 has its own kernel task_struct
+defined in linux/include/linux/sched.h
+The S390 on initialisation & resuming of a process on a cpu sets
+the __LC_KERNEL_STACK variable in the spare prefix area for this cpu
+(which we use for per-processor globals).
+
+The kernel stack pointer is intimately tied with the task structure for
+each processor as follows::
+
+ s/390
+ ************************
+ * 1 page kernel stack *
+ * ( 4K ) *
+ ************************
+ * 1 page task_struct *
+ * ( 4K ) *
+ 8K aligned ************************
+
+ z/Architecture
+ ************************
+ * 2 page kernel stack *
+ * ( 8K ) *
+ ************************
+ * 2 page task_struct *
+ * ( 8K ) *
+ 16K aligned ************************
+
+What this means is that we don't need to dedicate any register or global
+variable to point to the current running process & can retrieve it with the
+following very simple construct for s/390 & one very similar for
+z/Architecture::
+
+ static inline struct task_struct * get_current(void)
+ {
+ struct task_struct *current;
+ __asm__("lhi %0,-8192\n\t"
+ "nr %0,15"
+ : "=r" (current) );
+ return current;
+ }
+
+i.e. just anding the current kernel stack pointer with the mask -8192.
+Thankfully because Linux doesn't have support for nested IO interrupts
+& our devices have large buffers can survive interrupts being shut for
+short amounts of time we don't need a separate stack for interrupts.
+
+
+
+
+Register Usage & Stackframes on Linux for s/390 & z/Architecture
+=================================================================
+Overview:
+---------
+This is the code that gcc produces at the top & the bottom of
+each function. It usually is fairly consistent & similar from
+function to function & if you know its layout you can probably
+make some headway in finding the ultimate cause of a problem
+after a crash without a source level debugger.
+
+Note: To follow stackframes requires a knowledge of C or Pascal &
+limited knowledge of one assembly language.
+
+It should be noted that there are some differences between the
+s/390 and z/Architecture stack layouts as the z/Architecture stack layout
+didn't have to maintain compatibility with older linkage formats.
+
+Glossary:
+---------
+alloca:
+ This is a built in compiler function for runtime allocation
+ of extra space on the callers stack which is obviously freed
+ up on function exit ( e.g. the caller may choose to allocate nothing
+ of a buffer of 4k if required for temporary purposes ), it generates
+ very efficient code ( a few cycles ) when compared to alternatives
+ like malloc.
+
+automatics:
+ These are local variables on the stack, i.e they aren't in registers &
+ they aren't static.
+
+back-chain:
+ This is a pointer to the stack pointer before entering a
+ framed functions ( see frameless function ) prologue got by
+ dereferencing the address of the current stack pointer,
+ i.e. got by accessing the 32 bit value at the stack pointers
+ current location.
+
+base-pointer:
+ This is a pointer to the back of the literal pool which
+ is an area just behind each procedure used to store constants
+ in each function.
+
+call-clobbered:
+ The caller probably needs to save these registers if there
+ is something of value in them, on the stack or elsewhere before making a
+ call to another procedure so that it can restore it later.
+
+epilogue:
+ The code generated by the compiler to return to the caller.
+
+frameless-function:
+ A frameless function in Linux for s390 & z/Architecture is one which doesn't
+ need more than the register save area (96 bytes on s/390, 160 on z/Architecture)
+ given to it by the caller.
+
+ A frameless function never:
+
+ 1) Sets up a back chain.
+ 2) Calls alloca.
+ 3) Calls other normal functions
+ 4) Has automatics.
+
+GOT-pointer:
+ This is a pointer to the global-offset-table in ELF
+ ( Executable Linkable Format, Linux'es most common executable format ),
+ all globals & shared library objects are found using this pointer.
+
+lazy-binding
+ ELF shared libraries are typically only loaded when routines in the shared
+ library are actually first called at runtime. This is lazy binding.
+
+procedure-linkage-table
+ This is a table found from the GOT which contains pointers to routines
+ in other shared libraries which can't be called to by easier means.
+
+prologue:
+ The code generated by the compiler to set up the stack frame.
+
+outgoing-args:
+ This is extra area allocated on the stack of the calling function if the
+ parameters for the callee's cannot all be put in registers, the same
+ area can be reused by each function the caller calls.
+
+routine-descriptor:
+ A COFF executable format based concept of a procedure reference
+ actually being 8 bytes or more as opposed to a simple pointer to the routine.
+ This is typically defined as follows:
+
+ - Routine Descriptor offset 0=Pointer to Function
+ - Routine Descriptor offset 4=Pointer to Table of Contents
+
+ The table of contents/TOC is roughly equivalent to a GOT pointer.
+ & it means that shared libraries etc. can be shared between several
+ environments each with their own TOC.
+
+static-chain:
+ This is used in nested functions a concept adopted from pascal
+ by gcc not used in ansi C or C++ ( although quite useful ), basically it
+ is a pointer used to reference local variables of enclosing functions.
+ You might come across this stuff once or twice in your lifetime.
+
+ e.g.
+
+ The function below should return 11 though gcc may get upset & toss warnings
+ about unused variables::
+
+ int FunctionA(int a)
+ {
+ int b;
+ FunctionC(int c)
+ {
+ b=c+1;
+ }
+ FunctionC(10);
+ return(b);
+ }
+
+
+s/390 & z/Architecture Register usage
+=====================================
+
+======== ========================================== ===============
+r0 used by syscalls/assembly call-clobbered
+r1 used by syscalls/assembly call-clobbered
+r2 argument 0 / return value 0 call-clobbered
+r3 argument 1 / return value 1 (if long long) call-clobbered
+r4 argument 2 call-clobbered
+r5 argument 3 call-clobbered
+r6 argument 4 saved
+r7 pointer-to arguments 5 to ... saved
+r8 this & that saved
+r9 this & that saved
+r10 static-chain ( if nested function ) saved
+r11 frame-pointer ( if function used alloca ) saved
+r12 got-pointer saved
+r13 base-pointer saved
+r14 return-address saved
+r15 stack-pointer saved
+
+f0 argument 0 / return value ( float/double ) call-clobbered
+f2 argument 1 call-clobbered
+f4 z/Architecture argument 2 saved
+f6 z/Architecture argument 3 saved
+======== ========================================== ===============
+
+The remaining floating points
+f1,f3,f5 f7-f15 are call-clobbered.
+
+Notes:
+------
+1) The only requirement is that registers which are used
+ by the callee are saved, e.g. the compiler is perfectly
+ capable of using r11 for purposes other than a frame a
+ frame pointer if a frame pointer is not needed.
+2) In functions with variable arguments e.g. printf the calling procedure
+ is identical to one without variable arguments & the same number of
+ parameters. However, the prologue of this function is somewhat more
+ hairy owing to it having to move these parameters to the stack to
+ get va_start, va_arg & va_end to work.
+3) Access registers are currently unused by gcc but are used in
+ the kernel. Possibilities exist to use them at the moment for
+ temporary storage but it isn't recommended.
+4) Only 4 of the floating point registers are used for
+ parameter passing as older machines such as G3 only have only 4
+ & it keeps the stack frame compatible with other compilers.
+ However with IEEE floating point emulation under linux on the
+ older machines you are free to use the other 12.
+5) A long long or double parameter cannot be have the
+ first 4 bytes in a register & the second four bytes in the
+ outgoing args area. It must be purely in the outgoing args
+ area if crossing this boundary.
+6) Floating point parameters are mixed with outgoing args
+ on the outgoing args area in the order the are passed in as parameters.
+7) Floating point arguments 2 & 3 are saved in the outgoing args area for
+ z/Architecture
+
+
+Stack Frame Layout
+------------------
+
+========= ============== ======================================================
+s/390 z/Architecture
+========= ============== ======================================================
+0 0 back chain ( a 0 here signifies end of back chain )
+4 8 eos ( end of stack, not used on Linux for S390 used
+ in other linkage formats )
+8 16 glue used in other s/390 linkage formats for saved
+ routine descriptors etc.
+12 24 glue used in other s/390 linkage formats for saved
+ routine descriptors etc.
+16 32 scratch area
+20 40 scratch area
+24 48 saved r6 of caller function
+28 56 saved r7 of caller function
+32 64 saved r8 of caller function
+36 72 saved r9 of caller function
+40 80 saved r10 of caller function
+44 88 saved r11 of caller function
+48 96 saved r12 of caller function
+52 104 saved r13 of caller function
+56 112 saved r14 of caller function
+60 120 saved r15 of caller function
+64 128 saved f4 of caller function
+72 132 saved f6 of caller function
+80 undefined
+96 160 outgoing args passed from caller to callee
+96+x 160+x possible stack alignment ( 8 bytes desirable )
+96+x+y 160+x+y alloca space of caller ( if used )
+96+x+y+z 160+x+y+z automatics of caller ( if used )
+0 back-chain
+========= ============== ======================================================
+
+A sample program with comments.
+===============================
+
+Comments on the function test
+-----------------------------
+1) It didn't need to set up a pointer to the constant pool gpr13 as it is not
+ used ( :-( ).
+2) This is a frameless function & no stack is bought.
+3) The compiler was clever enough to recognise that it could return the
+ value in r2 as well as use it for the passed in parameter ( :-) ).
+4) The basr ( branch relative & save ) trick works as follows the instruction
+ has a special case with r0,r0 with some instruction operands is understood as
+ the literal value 0, some risc architectures also do this ). So now
+ we are branching to the next address & the address new program counter is
+ in r13,so now we subtract the size of the function prologue we have executed
+ the size of the literal pool to get to the top of the literal pool::
+
+
+ 0040037c int test(int b)
+ { # Function prologue below
+ 40037c: 90 de f0 34 stm %r13,%r14,52(%r15) # Save registers r13 & r14
+ 400380: 0d d0 basr %r13,%r0 # Set up pointer to constant pool using
+ 400382: a7 da ff fa ahi %r13,-6 # basr trick
+ return(5+b);
+ # Huge main program
+ 400386: a7 2a 00 05 ahi %r2,5 # add 5 to r2
+
+ # Function epilogue below
+ 40038a: 98 de f0 34 lm %r13,%r14,52(%r15) # restore registers r13 & 14
+ 40038e: 07 fe br %r14 # return
+ }
+
+Comments on the function main
+-----------------------------
+1) The compiler did this function optimally ( 8-) )::
+
+ Literal pool for main.
+ 400390: ff ff ff ec .long 0xffffffec
+ main(int argc,char *argv[])
+ { # Function prologue below
+ 400394: 90 bf f0 2c stm %r11,%r15,44(%r15) # Save necessary registers
+ 400398: 18 0f lr %r0,%r15 # copy stack pointer to r0
+ 40039a: a7 fa ff a0 ahi %r15,-96 # Make area for callee saving
+ 40039e: 0d d0 basr %r13,%r0 # Set up r13 to point to
+ 4003a0: a7 da ff f0 ahi %r13,-16 # literal pool
+ 4003a4: 50 00 f0 00 st %r0,0(%r15) # Save backchain
+
+ return(test(5)); # Main Program Below
+ 4003a8: 58 e0 d0 00 l %r14,0(%r13) # load relative address of test from
+ # literal pool
+ 4003ac: a7 28 00 05 lhi %r2,5 # Set first parameter to 5
+ 4003b0: 4d ee d0 00 bas %r14,0(%r14,%r13) # jump to test setting r14 as return
+ # address using branch & save instruction.
+
+ # Function Epilogue below
+ 4003b4: 98 bf f0 8c lm %r11,%r15,140(%r15)# Restore necessary registers.
+ 4003b8: 07 fe br %r14 # return to do program exit
+ }
+
+
+Compiler updates
+----------------
+
+::
+
+ main(int argc,char *argv[])
+ {
+ 4004fc: 90 7f f0 1c stm %r7,%r15,28(%r15)
+ 400500: a7 d5 00 04 bras %r13,400508 <main+0xc>
+ 400504: 00 40 04 f4 .long 0x004004f4
+ # compiler now puts constant pool in code to so it saves an instruction
+ 400508: 18 0f lr %r0,%r15
+ 40050a: a7 fa ff a0 ahi %r15,-96
+ 40050e: 50 00 f0 00 st %r0,0(%r15)
+ return(test(5));
+ 400512: 58 10 d0 00 l %r1,0(%r13)
+ 400516: a7 28 00 05 lhi %r2,5
+ 40051a: 0d e1 basr %r14,%r1
+ # compiler adds 1 extra instruction to epilogue this is done to
+ # avoid processor pipeline stalls owing to data dependencies on g5 &
+ # above as register 14 in the old code was needed directly after being loaded
+ # by the lm %r11,%r15,140(%r15) for the br %14.
+ 40051c: 58 40 f0 98 l %r4,152(%r15)
+ 400520: 98 7f f0 7c lm %r7,%r15,124(%r15)
+ 400524: 07 f4 br %r4
+ }
+
+
+Hartmut ( our compiler developer ) also has been threatening to take out the
+stack backchain in optimised code as this also causes pipeline stalls, you
+have been warned.
+
+64 bit z/Architecture code disassembly
+--------------------------------------
+
+If you understand the stuff above you'll understand the stuff
+below too so I'll avoid repeating myself & just say that
+some of the instructions have g's on the end of them to indicate
+they are 64 bit & the stack offsets are a bigger,
+the only other difference you'll find between 32 & 64 bit is that
+we now use f4 & f6 for floating point arguments on 64 bit::
+
+ 00000000800005b0 <test>:
+ int test(int b)
+ {
+ return(5+b);
+ 800005b0: a7 2a 00 05 ahi %r2,5
+ 800005b4: b9 14 00 22 lgfr %r2,%r2 # downcast to integer
+ 800005b8: 07 fe br %r14
+ 800005ba: 07 07 bcr 0,%r7
+
+
+ }
+
+ 00000000800005bc <main>:
+ main(int argc,char *argv[])
+ {
+ 800005bc: eb bf f0 58 00 24 stmg %r11,%r15,88(%r15)
+ 800005c2: b9 04 00 1f lgr %r1,%r15
+ 800005c6: a7 fb ff 60 aghi %r15,-160
+ 800005ca: e3 10 f0 00 00 24 stg %r1,0(%r15)
+ return(test(5));
+ 800005d0: a7 29 00 05 lghi %r2,5
+ # brasl allows jumps > 64k & is overkill here bras would do fune
+ 800005d4: c0 e5 ff ff ff ee brasl %r14,800005b0 <test>
+ 800005da: e3 40 f1 10 00 04 lg %r4,272(%r15)
+ 800005e0: eb bf f0 f8 00 04 lmg %r11,%r15,248(%r15)
+ 800005e6: 07 f4 br %r4
+ }
+
+
+
+Compiling programs for debugging on Linux for s/390 & z/Architecture
+====================================================================
+-gdwarf-2 now works it should be considered the default debugging
+format for s/390 & z/Architecture as it is more reliable for debugging
+shared libraries, normal -g debugging works much better now
+Thanks to the IBM java compiler developers bug reports.
+
+This is typically done adding/appending the flags -g or -gdwarf-2 to the
+CFLAGS & LDFLAGS variables Makefile of the program concerned.
+
+If using gdb & you would like accurate displays of registers &
+stack traces compile without optimisation i.e make sure
+that there is no -O2 or similar on the CFLAGS line of the Makefile &
+the emitted gcc commands, obviously this will produce worse code
+( not advisable for shipment ) but it is an aid to the debugging process.
+
+This aids debugging because the compiler will copy parameters passed in
+in registers onto the stack so backtracing & looking at passed in
+parameters will work, however some larger programs which use inline functions
+will not compile without optimisation.
+
+Debugging with optimisation has since much improved after fixing
+some bugs, please make sure you are using gdb-5.0 or later developed
+after Nov'2000.
+
+
+
+Debugging under VM
+==================
+
+Notes
+-----
+Addresses & values in the VM debugger are always hex never decimal
+Address ranges are of the format <HexValue1>-<HexValue2> or
+<HexValue1>.<HexValue2>
+For example, the address range 0x2000 to 0x3000 can be described as 2000-3000
+or 2000.1000
+
+The VM Debugger is case insensitive.
+
+VM's strengths are usually other debuggers weaknesses you can get at any
+resource no matter how sensitive e.g. memory management resources, change
+address translation in the PSW. For kernel hacking you will reap dividends if
+you get good at it.
+
+The VM Debugger displays operators but not operands, and also the debugger
+displays useful information on the same line as the author of the code probably
+felt that it was a good idea not to go over the 80 columns on the screen.
+This isn't as unintuitive as it may seem as the s/390 instructions are easy to
+decode mentally and you can make a good guess at a lot of them as all the
+operands are nibble (half byte aligned).
+So if you have an objdump listing by hand, it is quite easy to follow, and if
+you don't have an objdump listing keep a copy of the s/390 Reference Summary
+or alternatively the s/390 principles of operation next to you.
+e.g. even I can guess that
+0001AFF8' LR 180F CC 0
+is a ( load register ) lr r0,r15
+
+Also it is very easy to tell the length of a 390 instruction from the 2 most
+significant bits in the instruction (not that this info is really useful except
+if you are trying to make sense of a hexdump of code).
+Here is a table
+
+======================= ==================
+Bits Instruction Length
+======================= ==================
+00 2 Bytes
+01 4 Bytes
+10 4 Bytes
+11 6 Bytes
+======================= ==================
+
+The debugger also displays other useful info on the same line such as the
+addresses being operated on destination addresses of branches & condition codes.
+e.g.::
+
+ 00019736' AHI A7DAFF0E CC 1
+ 000198BA' BRC A7840004 -> 000198C2' CC 0
+ 000198CE' STM 900EF068 >> 0FA95E78 CC 2
+
+
+
+Useful VM debugger commands
+---------------------------
+
+I suppose I'd better mention this before I start
+to list the current active traces do::
+
+ Q TR
+
+there can be a maximum of 255 of these per set
+( more about trace sets later ).
+
+To stop traces issue a::
+
+ TR END.
+
+To delete a particular breakpoint issue::
+
+ TR DEL <breakpoint number>
+
+The PA1 key drops to CP mode so you can issue debugger commands,
+Doing alt c (on my 3270 console at least ) clears the screen.
+
+hitting b <enter> comes back to the running operating system
+from cp mode ( in our case linux ).
+
+It is typically useful to add shortcuts to your profile.exec file
+if you have one ( this is roughly equivalent to autoexec.bat in DOS ).
+file here are a few from mine::
+
+ /* this gives me command history on issuing f12 */
+ set pf12 retrieve
+ /* this continues */
+ set pf8 imm b
+ /* goes to trace set a */
+ set pf1 imm tr goto a
+ /* goes to trace set b */
+ set pf2 imm tr goto b
+ /* goes to trace set c */
+ set pf3 imm tr goto c
+
+
+
+Instruction Tracing
+-------------------
+Setting a simple breakpoint::
+
+ TR I PSWA <address>
+
+To debug a particular function try::
+
+ TR I R <function address range>
+ TR I on its own will single step.
+ TR I DATA <MNEMONIC> <OPTIONAL RANGE> will trace for particular mnemonics
+
+e.g.::
+
+ TR I DATA 4D R 0197BC.4000
+
+will trace for BAS'es ( opcode 4D ) in the range 0197BC.4000
+
+if you were inclined you could add traces for all branch instructions &
+suffix them with the run prefix so you would have a backtrace on screen
+when a program crashes::
+
+ TR BR <INTO OR FROM> will trace branches into or out of an address.
+
+e.g.::
+
+ TR BR INTO 0
+
+is often quite useful if a program is getting awkward & deciding
+to branch to 0 & crashing as this will stop at the address before in jumps to 0.
+
+::
+
+ TR I R <address range> RUN cmd d g
+
+single steps a range of addresses but stays running &
+displays the gprs on each step.
+
+
+
+Displaying & modifying Registers
+--------------------------------
+D G
+ will display all the gprs
+
+Adding a extra G to all the commands is necessary to access the full 64 bit
+content in VM on z/Architecture. Obviously this isn't required for access
+registers as these are still 32 bit.
+
+e.g.
+
+DGG
+ instead of DG
+
+D X
+ will display all the control registers
+D AR
+ will display all the access registers
+D AR4-7
+ will display access registers 4 to 7
+CPU ALL D G
+ will display the GRPS of all CPUS in the configuration
+D PSW
+ will display the current PSW
+st PSW 2000
+ will put the value 2000 into the PSW & cause crash your machine.
+D PREFIX
+ displays the prefix offset
+
+
+Displaying Memory
+-----------------
+To display memory mapped using the current PSW's mapping try::
+
+ D <range>
+
+To make VM display a message each time it hits a particular address and
+continue try:
+
+D I<range>
+ will disassemble/display a range of instructions.
+
+ST addr 32 bit word
+ will store a 32 bit aligned address
+D T<range>
+ will display the EBCDIC in an address (if you are that way inclined)
+D R<range>
+ will display real addresses ( without DAT ) but with prefixing.
+
+There are other complex options to display if you need to get at say home space
+but are in primary space the easiest thing to do is to temporarily
+modify the PSW to the other addressing mode, display the stuff & then
+restore it.
+
+
+
+Hints
+-----
+If you want to issue a debugger command without halting your virtual machine
+with the PA1 key try prefixing the command with #CP e.g.::
+
+ #cp tr i pswa 2000
+
+also suffixing most debugger commands with RUN will cause them not
+to stop just display the mnemonic at the current instruction on the console.
+
+If you have several breakpoints you want to put into your program &
+you get fed up of cross referencing with System.map
+you can do the following trick for several symbols.
+
+::
+
+ grep do_signal System.map
+
+which emits the following among other things::
+
+ 0001f4e0 T do_signal
+
+now you can do::
+
+ TR I PSWA 0001f4e0 cmd msg * do_signal
+
+This sends a message to your own console each time do_signal is entered.
+( As an aside I wrote a perl script once which automatically generated a REXX
+script with breakpoints on every kernel procedure, this isn't a good idea
+because there are thousands of these routines & VM can only set 255 breakpoints
+at a time so you nearly had to spend as long pruning the file down as you would
+entering the msgs by hand), however, the trick might be useful for a single
+object file. In the 3270 terminal emulator x3270 there is a very useful option
+in the file menu called "Save Screen In File" - this is very good for keeping a
+copy of traces.
+
+From CMS help <command name> will give you online help on a particular command.
+e.g.::
+
+ HELP DISPLAY
+
+Also CP has a file called profile.exec which automatically gets called
+on startup of CMS ( like autoexec.bat ), keeping on a DOS analogy session
+CP has a feature similar to doskey, it may be useful for you to
+use profile.exec to define some keystrokes.
+
+SET PF9 IMM B
+ This does a single step in VM on pressing F8.
+
+SET PF10 ^
+ This sets up the ^ key.
+ which can be used for ^c (ctrl-c),^z (ctrl-z) which can't be typed
+ directly into some 3270 consoles.
+
+SET PF11 ^-
+ This types the starting keystrokes for a sysrq see SysRq below.
+SET PF12 RETRIEVE
+ This retrieves command history on pressing F12.
+
+
+Sometimes in VM the display is set up to scroll automatically this
+can be very annoying if there are messages you wish to look at
+to stop this do
+
+TERM MORE 255 255
+ This will nearly stop automatic screen updates, however it will
+ cause a denial of service if lots of messages go to the 3270 console,
+ so it would be foolish to use this as the default on a production machine.
+
+
+Tracing particular processes
+----------------------------
+The kernel's text segment is intentionally at an address in memory that it will
+very seldom collide with text segments of user programs ( thanks Martin ),
+this simplifies debugging the kernel.
+However it is quite common for user processes to have addresses which collide
+this can make debugging a particular process under VM painful under normal
+circumstances as the process may change when doing a::
+
+ TR I R <address range>.
+
+Thankfully after reading VM's online help I figured out how to debug
+I particular process.
+
+Your first problem is to find the STD ( segment table designation )
+of the program you wish to debug.
+There are several ways you can do this here are a few
+
+Run::
+
+ objdump --syms <program to be debugged> | grep main
+
+To get the address of main in the program. Then::
+
+ tr i pswa <address of main>
+
+Start the program, if VM drops to CP on what looks like the entry
+point of the main function this is most likely the process you wish to debug.
+Now do a D X13 or D XG13 on z/Architecture.
+
+On 31 bit the STD is bits 1-19 ( the STO segment table origin )
+& 25-31 ( the STL segment table length ) of CR13.
+
+now type::
+
+ TR I R STD <CR13's value> 0.7fffffff
+
+e.g.::
+
+ TR I R STD 8F32E1FF 0.7fffffff
+
+Another very useful variation is::
+
+ TR STORE INTO STD <CR13's value> <address range>
+
+for finding out when a particular variable changes.
+
+An alternative way of finding the STD of a currently running process
+is to do the following, ( this method is more complex but
+could be quite convenient if you aren't updating the kernel much &
+so your kernel structures will stay constant for a reasonable period of
+time ).
+
+::
+
+ grep task /proc/<pid>/status
+
+from this you should see something like::
+
+ task: 0f160000 ksp: 0f161de8 pt_regs: 0f161f68
+
+This now gives you a pointer to the task structure.
+
+Now make::
+
+ CC:="s390-gcc -g" kernel/sched.s
+
+To get the task_struct stabinfo.
+
+( task_struct is defined in include/linux/sched.h ).
+
+Now we want to look at
+task->active_mm->pgd
+
+on my machine the active_mm in the task structure stab is
+active_mm:(4,12),672,32
+
+its offset is 672/8=84=0x54
+
+the pgd member in the mm_struct stab is
+pgd:(4,6)=*(29,5),96,32
+so its offset is 96/8=12=0xc
+
+so we'll::
+
+ hexdump -s 0xf160054 /dev/mem | more
+
+i.e. task_struct+active_mm offset
+to look at the active_mm member::
+
+ f160054 0fee cc60 0019 e334 0000 0000 0000 0011
+
+::
+
+ hexdump -s 0x0feecc6c /dev/mem | more
+
+i.e. active_mm+pgd offset::
+
+ feecc6c 0f2c 0000 0000 0001 0000 0001 0000 0010
+
+we get something like
+now do::
+
+ TR I R STD <pgd|0x7f> 0.7fffffff
+
+i.e. the 0x7f is added because the pgd only
+gives the page table origin & we need to set the low bits
+to the maximum possible segment table length.
+
+::
+
+ TR I R STD 0f2c007f 0.7fffffff
+
+on z/Architecture you'll probably need to do::
+
+ TR I R STD <pgd|0x7> 0.ffffffffffffffff
+
+to set the TableType to 0x1 & the Table length to 3.
+
+
+
+Tracing Program Exceptions
+--------------------------
+If you get a crash which says something like
+illegal operation or specification exception followed by a register dump
+You can restart linux & trace these using the tr prog <range or value> trace
+option.
+
+
+The most common ones you will normally be tracing for is:
+
+- 1=operation exception
+- 2=privileged operation exception
+- 4=protection exception
+- 5=addressing exception
+- 6=specification exception
+- 10=segment translation exception
+- 11=page translation exception
+
+The full list of these is on page 22 of the current s/390 Reference Summary.
+e.g.
+
+tr prog 10 will trace segment translation exceptions.
+
+tr prog on its own will trace all program interruption codes.
+
+Trace Sets
+----------
+On starting VM you are initially in the INITIAL trace set.
+You can do a Q TR to verify this.
+If you have a complex tracing situation where you wish to wait for instance
+till a driver is open before you start tracing IO, but know in your
+heart that you are going to have to make several runs through the code till you
+have a clue whats going on.
+
+What you can do is::
+
+ TR I PSWA <Driver open address>
+
+hit b to continue till breakpoint
+
+reach the breakpoint
+
+now do your::
+
+ TR GOTO B
+ TR IO 7c08-7c09 inst int run
+
+or whatever the IO channels you wish to trace are & hit b
+
+To got back to the initial trace set do::
+
+ TR GOTO INITIAL
+
+& the TR I PSWA <Driver open address> will be the only active breakpoint again.
+
+
+Tracing linux syscalls under VM
+-------------------------------
+Syscalls are implemented on Linux for S390 by the Supervisor call instruction
+(SVC). There 256 possibilities of these as the instruction is made up of a 0xA
+opcode and the second byte being the syscall number. They are traced using the
+simple command::
+
+ TR SVC <Optional value or range>
+
+the syscalls are defined in linux/arch/s390/include/asm/unistd.h
+e.g. to trace all file opens just do::
+
+ TR SVC 5 ( as this is the syscall number of open )
+
+
+SMP Specific commands
+---------------------
+To find out how many cpus you have
+Q CPUS displays all the CPU's available to your virtual machine
+To find the cpu that the current cpu VM debugger commands are being directed at
+do Q CPU to change the current cpu VM debugger commands are being directed at
+do::
+
+ CPU <desired cpu no>
+
+On a SMP guest issue a command to all CPUs try prefixing the command with cpu
+all. To issue a command to a particular cpu try cpu <cpu number> e.g.::
+
+ CPU 01 TR I R 2000.3000
+
+If you are running on a guest with several cpus & you have a IO related problem
+& cannot follow the flow of code but you know it isn't smp related.
+
+from the bash prompt issue::
+
+ shutdown -h now or halt.
+
+do a::
+
+ Q CPUS
+
+to find out how many cpus you have detach each one of them from cp except
+cpu 0 by issuing a::
+
+ DETACH CPU 01-(number of cpus in configuration)
+
+& boot linux again.
+
+TR SIGP
+ will trace inter processor signal processor instructions.
+
+DEFINE CPU 01-(number in configuration)
+ will get your guests cpus back.
+
+
+Help for displaying ascii textstrings
+-------------------------------------
+On the very latest VM Nucleus'es VM can now display ascii
+( thanks Neale for the hint ) by doing::
+
+ D TX<lowaddr>.<len>
+
+e.g.::
+
+ D TX0.100
+
+Alternatively
+=============
+Under older VM debuggers (I love EBDIC too) you can use following little
+program which converts a command line of hex digits to ascii text. It can be
+compiled under linux and you can copy the hex digits from your x3270 terminal
+to your xterm if you are debugging from a linuxbox.
+
+This is quite useful when looking at a parameter passed in as a text string
+under VM ( unless you are good at decoding ASCII in your head ).
+
+e.g. consider tracing an open syscall::
+
+ TR SVC 5
+
+We have stopped at a breakpoint::
+
+ 000151B0' SVC 0A05 -> 0001909A' CC 0
+
+D 20.8 to check the SVC old psw in the prefix area and see was it from userspace
+(for the layout of the prefix area consult the "Fixed Storage Locations"
+chapter of the s/390 Reference Summary if you have it available).
+
+::
+
+ V00000020 070C2000 800151B2
+
+The problem state bit wasn't set & it's also too early in the boot sequence
+for it to be a userspace SVC if it was we would have to temporarily switch the
+psw to user space addressing so we could get at the first parameter of the open
+in gpr2.
+
+Next do a::
+
+ D G2
+ GPR 2 = 00014CB4
+
+Now display what gpr2 is pointing to::
+
+ D 00014CB4.20
+ V00014CB4 2F646576 2F636F6E 736F6C65 00001BF5
+ V00014CC4 FC00014C B4001001 E0001000 B8070707
+
+Now copy the text till the first 00 hex ( which is the end of the string
+to an xterm & do hex2ascii on it::
+
+ hex2ascii 2F646576 2F636F6E 736F6C65 00
+
+outputs::
+
+ Decoded Hex:=/ d e v / c o n s o l e 0x00
+
+We were opening the console device,
+
+You can compile the code below yourself for practice :-),
+
+::
+
+ /*
+ * hex2ascii.c
+ * a useful little tool for converting a hexadecimal command line to ascii
+ *
+ * Author(s): Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com)
+ * (C) 2000 IBM Deutschland Entwicklung GmbH, IBM Corporation.
+ */
+ #include <stdio.h>
+
+ int main(int argc,char *argv[])
+ {
+ int cnt1,cnt2,len,toggle=0;
+ int startcnt=1;
+ unsigned char c,hex;
+
+ if(argc>1&&(strcmp(argv[1],"-a")==0))
+ startcnt=2;
+ printf("Decoded Hex:=");
+ for(cnt1=startcnt;cnt1<argc;cnt1++)
+ {
+ len=strlen(argv[cnt1]);
+ for(cnt2=0;cnt2<len;cnt2++)
+ {
+ c=argv[cnt1][cnt2];
+ if(c>='0'&&c<='9')
+ c=c-'0';
+ if(c>='A'&&c<='F')
+ c=c-'A'+10;
+ if(c>='a'&&c<='f')
+ c=c-'a'+10;
+ switch(toggle)
+ {
+ case 0:
+ hex=c<<4;
+ toggle=1;
+ break;
+ case 1:
+ hex+=c;
+ if(hex<32||hex>127)
+ {
+ if(startcnt==1)
+ printf("0x%02X ",(int)hex);
+ else
+ printf(".");
+ }
+ else
+ {
+ printf("%c",hex);
+ if(startcnt==1)
+ printf(" ");
+ }
+ toggle=0;
+ break;
+ }
+ }
+ }
+ printf("\n");
+ }
+
+
+
+
+Stack tracing under VM
+----------------------
+A basic backtrace
+-----------------
+
+Here are the tricks I use 9 out of 10 times it works pretty well,
+
+When your backchain reaches a dead end
+--------------------------------------
+This can happen when an exception happens in the kernel and the kernel is
+entered twice. If you reach the NULL pointer at the end of the back chain you
+should be able to sniff further back if you follow the following tricks.
+1) A kernel address should be easy to recognise since it is in
+primary space & the problem state bit isn't set & also
+The Hi bit of the address is set.
+2) Another backchain should also be easy to recognise since it is an
+address pointing to another address approximately 100 bytes or 0x70 hex
+behind the current stackpointer.
+
+
+Here is some practice.
+
+boot the kernel & hit PA1 at some random time
+
+d g to display the gprs, this should display something like::
+
+ GPR 0 = 00000001 00156018 0014359C 00000000
+ GPR 4 = 00000001 001B8888 000003E0 00000000
+ GPR 8 = 00100080 00100084 00000000 000FE000
+ GPR 12 = 00010400 8001B2DC 8001B36A 000FFED8
+
+Note that GPR14 is a return address but as we are real men we are going to
+trace the stack.
+display 0x40 bytes after the stack pointer::
+
+ V000FFED8 000FFF38 8001B838 80014C8E 000FFF38
+ V000FFEE8 00000000 00000000 000003E0 00000000
+ V000FFEF8 00100080 00100084 00000000 000FE000
+ V000FFF08 00010400 8001B2DC 8001B36A 000FFED8
+
+
+Ah now look at whats in sp+56 (sp+0x38) this is 8001B36A our saved r14 if
+you look above at our stackframe & also agrees with GPR14.
+
+now backchain::
+
+ d 000FFF38.40
+
+we now are taking the contents of SP to get our first backchain::
+
+ V000FFF38 000FFFA0 00000000 00014995 00147094
+ V000FFF48 00147090 001470A0 000003E0 00000000
+ V000FFF58 00100080 00100084 00000000 001BF1D0
+ V000FFF68 00010400 800149BA 80014CA6 000FFF38
+
+This displays a 2nd return address of 80014CA6
+
+now do::
+
+ d 000FFFA0.40
+
+for our 3rd backchain::
+
+ V000FFFA0 04B52002 0001107F 00000000 00000000
+ V000FFFB0 00000000 00000000 FF000000 0001107F
+ V000FFFC0 00000000 00000000 00000000 00000000
+ V000FFFD0 00010400 80010802 8001085A 000FFFA0
+
+
+our 3rd return address is 8001085A
+
+as the 04B52002 looks suspiciously like rubbish it is fair to assume that the
+kernel entry routines for the sake of optimisation don't set up a backchain.
+
+now look at System.map to see if the addresses make any sense::
+
+ grep -i 0001b3 System.map
+
+outputs among other things::
+
+ 0001b304 T cpu_idle
+
+so 8001B36A
+is cpu_idle+0x66 ( quiet the cpu is asleep, don't wake it )
+
+::
+
+ grep -i 00014 System.map
+
+produces among other things::
+
+ 00014a78 T start_kernel
+
+so 0014CA6 is start_kernel+some hex number I can't add in my head.
+
+::
+
+ grep -i 00108 System.map
+
+this produces::
+
+ 00010800 T _stext
+
+so 8001085A is _stext+0x5a
+
+Congrats you've done your first backchain.
+
+
+
+s/390 & z/Architecture IO Overview
+==================================
+
+I am not going to give a course in 390 IO architecture as this would take me
+quite a while and I'm no expert. Instead I'll give a 390 IO architecture
+summary for Dummies. If you have the s/390 principles of operation available
+read this instead. If nothing else you may find a few useful keywords in here
+and be able to use them on a web search engine to find more useful information.
+
+Unlike other bus architectures modern 390 systems do their IO using mostly
+fibre optics and devices such as tapes and disks can be shared between several
+mainframes. Also S390 can support up to 65536 devices while a high end PC based
+system might be choking with around 64.
+
+Here is some of the common IO terminology:
+
+Subchannel:
+ This is the logical number most IO commands use to talk to an IO device. There
+ can be up to 0x10000 (65536) of these in a configuration, typically there are a
+ few hundred. Under VM for simplicity they are allocated contiguously, however
+ on the native hardware they are not. They typically stay consistent between
+ boots provided no new hardware is inserted or removed.
+
+ Under Linux for s390 we use these as IRQ's and also when issuing an IO command
+ (CLEAR SUBCHANNEL, HALT SUBCHANNEL, MODIFY SUBCHANNEL, RESUME SUBCHANNEL,
+ START SUBCHANNEL, STORE SUBCHANNEL and TEST SUBCHANNEL). We use this as the ID
+ of the device we wish to talk to. The most important of these instructions are
+ START SUBCHANNEL (to start IO), TEST SUBCHANNEL (to check whether the IO
+ completed successfully) and HALT SUBCHANNEL (to kill IO). A subchannel can have
+ up to 8 channel paths to a device, this offers redundancy if one is not
+ available.
+
+Device Number:
+ This number remains static and is closely tied to the hardware. There are 65536
+ of these, made up of a CHPID (Channel Path ID, the most significant 8 bits) and
+ another lsb 8 bits. These remain static even if more devices are inserted or
+ removed from the hardware. There is a 1 to 1 mapping between subchannels and
+ device numbers, provided devices aren't inserted or removed.
+
+Channel Control Words:
+ CCWs are linked lists of instructions initially pointed to by an operation
+ request block (ORB), which is initially given to Start Subchannel (SSCH)
+ command along with the subchannel number for the IO subsystem to process
+ while the CPU continues executing normal code.
+ CCWs come in two flavours, Format 0 (24 bit for backward compatibility) and
+ Format 1 (31 bit). These are typically used to issue read and write (and many
+ other) instructions. They consist of a length field and an absolute address
+ field.
+
+ Each IO typically gets 1 or 2 interrupts, one for channel end (primary status)
+ when the channel is idle, and the second for device end (secondary status).
+ Sometimes you get both concurrently. You check how the IO went on by issuing a
+ TEST SUBCHANNEL at each interrupt, from which you receive an Interruption
+ response block (IRB). If you get channel and device end status in the IRB
+ without channel checks etc. your IO probably went okay. If you didn't you
+ probably need to examine the IRB, extended status word etc.
+ If an error occurs, more sophisticated control units have a facility known as
+ concurrent sense. This means that if an error occurs Extended sense information
+ will be presented in the Extended status word in the IRB. If not you have to
+ issue a subsequent SENSE CCW command after the test subchannel.
+
+
+TPI (Test pending interrupt) can also be used for polled IO, but in
+multitasking multiprocessor systems it isn't recommended except for
+checking special cases (i.e. non looping checks for pending IO etc.).
+
+Store Subchannel and Modify Subchannel can be used to examine and modify
+operating characteristics of a subchannel (e.g. channel paths).
+
+Other IO related Terms:
+
+Sysplex:
+ S390's Clustering Technology
+QDIO:
+ S390's new high speed IO architecture to support devices such as gigabit
+ ethernet, this architecture is also designed to be forward compatible with
+ upcoming 64 bit machines.
+
+
+General Concepts
+----------------
+
+Input Output Processors (IOP's) are responsible for communicating between
+the mainframe CPU's & the channel & relieve the mainframe CPU's from the
+burden of communicating with IO devices directly, this allows the CPU's to
+concentrate on data processing.
+
+IOP's can use one or more links ( known as channel paths ) to talk to each
+IO device. It first checks for path availability & chooses an available one,
+then starts ( & sometimes terminates IO ).
+There are two types of channel path: ESCON & the Parallel IO interface.
+
+IO devices are attached to control units, control units provide the
+logic to interface the channel paths & channel path IO protocols to
+the IO devices, they can be integrated with the devices or housed separately
+& often talk to several similar devices ( typical examples would be raid
+controllers or a control unit which connects to 1000 3270 terminals )::
+
+
+ +---------------------------------------------------------------+
+ | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ |
+ | | CPU | | CPU | | CPU | | CPU | | Main | | Expanded | |
+ | | | | | | | | | | Memory | | Storage | |
+ | +-----+ +-----+ +-----+ +-----+ +----------+ +----------+ |
+ |---------------------------------------------------------------+
+ | IOP | IOP | IOP |
+ |---------------------------------------------------------------
+ | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C | C |
+ ----------------------------------------------------------------
+ || ||
+ || Bus & Tag Channel Path || ESCON
+ || ====================== || Channel
+ || || || || Path
+ +----------+ +----------+ +----------+
+ | | | | | |
+ | CU | | CU | | CU |
+ | | | | | |
+ +----------+ +----------+ +----------+
+ | | | | |
+ +----------+ +----------+ +----------+ +----------+ +----------+
+ |I/O Device| |I/O Device| |I/O Device| |I/O Device| |I/O Device|
+ +----------+ +----------+ +----------+ +----------+ +----------+
+ CPU = Central Processing Unit
+ C = Channel
+ IOP = IP Processor
+ CU = Control Unit
+
+The 390 IO systems come in 2 flavours the current 390 machines support both
+
+The Older 360 & 370 Interface,sometimes called the Parallel I/O interface,
+sometimes called Bus-and Tag & sometimes Original Equipment Manufacturers
+Interface (OEMI).
+
+This byte wide Parallel channel path/bus has parity & data on the "Bus" cable
+and control lines on the "Tag" cable. These can operate in byte multiplex mode
+for sharing between several slow devices or burst mode and monopolize the
+channel for the whole burst. Up to 256 devices can be addressed on one of these
+cables. These cables are about one inch in diameter. The maximum unextended
+length supported by these cables is 125 Meters but this can be extended up to
+2km with a fibre optic channel extended such as a 3044. The maximum burst speed
+supported is 4.5 megabytes per second. However, some really old processors
+support only transfer rates of 3.0, 2.0 & 1.0 MB/sec.
+One of these paths can be daisy chained to up to 8 control units.
+
+
+ESCON if fibre optic it is also called FICON
+Was introduced by IBM in 1990. Has 2 fibre optic cables and uses either leds or
+lasers for communication at a signaling rate of up to 200 megabits/sec. As
+10bits are transferred for every 8 bits info this drops to 160 megabits/sec
+and to 18.6 Megabytes/sec once control info and CRC are added. ESCON only
+operates in burst mode.
+
+ESCONs typical max cable length is 3km for the led version and 20km for the
+laser version known as XDF (extended distance facility). This can be further
+extended by using an ESCON director which triples the above mentioned ranges.
+Unlike Bus & Tag as ESCON is serial it uses a packet switching architecture,
+the standard Bus & Tag control protocol is however present within the packets.
+Up to 256 devices can be attached to each control unit that uses one of these
+interfaces.
+
+Common 390 Devices include:
+Network adapters typically OSA2,3172's,2116's & OSA-E gigabit ethernet adapters,
+Consoles 3270 & 3215 (a teletype emulated under linux for a line mode console).
+DASD's direct access storage devices ( otherwise known as hard disks ).
+Tape Drives.
+CTC ( Channel to Channel Adapters ),
+ESCON or Parallel Cables used as a very high speed serial link
+between 2 machines.
+
+
+Debugging IO on s/390 & z/Architecture under VM
+===============================================
+
+Now we are ready to go on with IO tracing commands under VM
+
+A few self explanatory queries::
+
+ Q OSA
+ Q CTC
+ Q DISK ( This command is CMS specific )
+ Q DASD
+
+Q OSA on my machine returns::
+
+ OSA 7C08 ON OSA 7C08 SUBCHANNEL = 0000
+ OSA 7C09 ON OSA 7C09 SUBCHANNEL = 0001
+ OSA 7C14 ON OSA 7C14 SUBCHANNEL = 0002
+ OSA 7C15 ON OSA 7C15 SUBCHANNEL = 0003
+
+If you have a guest with certain privileges you may be able to see devices
+which don't belong to you. To avoid this, add the option V.
+e.g.::
+
+ Q V OSA
+
+Now using the device numbers returned by this command we will
+Trace the io starting up on the first device 7c08 & 7c09
+In our simplest case we can trace the
+start subchannels
+like TR SSCH 7C08-7C09
+or the halt subchannels
+or TR HSCH 7C08-7C09
+MSCH's ,STSCH's I think you can guess the rest
+
+A good trick is tracing all the IO's and CCWS and spooling them into the reader
+of another VM guest so he can ftp the logfile back to his own machine. I'll do
+a small bit of this and give you a look at the output.
+
+1) Spool stdout to VM reader::
+
+ SP PRT TO (another vm guest ) or * for the local vm guest
+
+2) Fill the reader with the trace::
+
+ TR IO 7c08-7c09 INST INT CCW PRT RUN
+
+3) Start up linux::
+
+ i 00c
+4) Finish the trace::
+
+ TR END
+
+5) close the reader::
+
+ C PRT
+
+6) list reader contents::
+
+ RDRLIST
+
+7) copy it to linux4's minidisk::
+
+ RECEIVE / LOG TXT A1 ( replace
+
+8)
+filel & press F11 to look at it
+You should see something like::
+
+ 00020942' SSCH B2334000 0048813C CC 0 SCH 0000 DEV 7C08
+ CPA 000FFDF0 PARM 00E2C9C4 KEY 0 FPI C0 LPM 80
+ CCW 000FFDF0 E4200100 00487FE8 0000 E4240100 ........
+ IDAL 43D8AFE8
+ IDAL 0FB76000
+ 00020B0A' I/O DEV 7C08 -> 000197BC' SCH 0000 PARM 00E2C9C4
+ 00021628' TSCH B2354000 >> 00488164 CC 0 SCH 0000 DEV 7C08
+ CCWA 000FFDF8 DEV STS 0C SCH STS 00 CNT 00EC
+ KEY 0 FPI C0 CC 0 CTLS 4007
+ 00022238' STSCH B2344000 >> 00488108 CC 0 SCH 0000 DEV 7C08
+
+If you don't like messing up your readed ( because you possibly booted from it )
+you can alternatively spool it to another readers guest.
+
+
+Other common VM device related commands
+---------------------------------------------
+These commands are listed only because they have
+been of use to me in the past & may be of use to
+you too. For more complete info on each of the commands
+use type HELP <command> from CMS.
+
+detaching devices::
+
+ DET <devno range>
+ ATT <devno range> <guest>
+
+attach a device to guest * for your own guest
+
+READY <devno>
+ cause VM to issue a fake interrupt.
+
+The VARY command is normally only available to VM administrators::
+
+ VARY ON PATH <path> TO <devno range>
+ VARY OFF PATH <PATH> FROM <devno range>
+
+This is used to switch on or off channel paths to devices.
+
+Q CHPID <channel path ID>
+ This displays state of devices using this channel path
+
+D SCHIB <subchannel>
+ This displays the subchannel information SCHIB block for the device.
+ this I believe is also only available to administrators.
+
+DEFINE CTC <devno>
+ defines a virtual CTC channel to channel connection
+ 2 need to be defined on each guest for the CTC driver to use.
+
+COUPLE devno userid remote devno
+ Joins a local virtual device to a remote virtual device
+ ( commonly used for the CTC driver ).
+
+Building a VM ramdisk under CMS which linux can use::
+
+ def vfb-<blocksize> <subchannel> <number blocks>
+
+blocksize is commonly 4096 for linux.
+
+Formatting it::
+
+ format <subchannel> <driver letter e.g. x> (blksize <blocksize>
+
+Sharing a disk between multiple guests::
+
+ LINK userid devno1 devno2 mode password
+
+
+
+GDB on S390
+===========
+N.B. if compiling for debugging gdb works better without optimisation
+( see Compiling programs for debugging )
+
+invocation
+----------
+gdb <victim program> <optional corefile>
+
+Online help
+-----------
+help: gives help on commands
+
+e.g.::
+
+ help
+ help display
+
+Note gdb's online help is very good use it.
+
+
+Assembly
+--------
+info registers:
+ displays registers other than floating point.
+
+info all-registers:
+ displays floating points as well.
+
+disassemble:
+ disassembles
+
+e.g.::
+
+ disassemble without parameters will disassemble the current function
+ disassemble $pc $pc+10
+
+Viewing & modifying variables
+-----------------------------
+print or p:
+ displays variable or register
+
+e.g. p/x $sp will display the stack pointer
+
+display:
+ prints variable or register each time program stops
+
+e.g.::
+
+ display/x $pc will display the program counter
+ display argc
+
+undisplay:
+ undo's display's
+
+info breakpoints:
+ shows all current breakpoints
+
+info stack:
+ shows stack back trace (if this doesn't work too well, I'll show
+ you the stacktrace by hand below).
+
+info locals:
+ displays local variables.
+
+info args:
+ display current procedure arguments.
+
+set args:
+ will set argc & argv each time the victim program is invoked
+
+e.g.::
+
+ set <variable>=value
+ set argc=100
+ set $pc=0
+
+
+
+Modifying execution
+-------------------
+step:
+ steps n lines of sourcecode
+
+step
+ steps 1 line.
+
+step 100
+ steps 100 lines of code.
+
+next:
+ like step except this will not step into subroutines
+
+stepi:
+ steps a single machine code instruction.
+
+e.g.::
+
+ stepi 100
+
+nexti:
+ steps a single machine code instruction but will not step into
+ subroutines.
+
+finish:
+ will run until exit of the current routine
+
+run:
+ (re)starts a program
+
+cont:
+ continues a program
+
+quit:
+ exits gdb.
+
+
+breakpoints
+------------
+
+break
+ sets a breakpoint
+
+e.g.::
+
+ break main
+ break *$pc
+ break *0x400618
+
+Here's a really useful one for large programs
+
+rbr
+ Set a breakpoint for all functions matching REGEXP
+
+e.g.::
+
+ rbr 390
+
+will set a breakpoint with all functions with 390 in their name.
+
+info breakpoints
+ lists all breakpoints
+
+delete:
+ delete breakpoint by number or delete them all
+
+e.g.
+
+delete 1
+ will delete the first breakpoint
+
+
+delete
+ will delete them all
+
+watch:
+ This will set a watchpoint ( usually hardware assisted ),
+
+This will watch a variable till it changes
+
+e.g.
+
+watch cnt
+ will watch the variable cnt till it changes.
+
+As an aside unfortunately gdb's, architecture independent watchpoint code
+is inconsistent & not very good, watchpoints usually work but not always.
+
+info watchpoints:
+ Display currently active watchpoints
+
+condition: ( another useful one )
+ Specify breakpoint number N to break only if COND is true.
+
+Usage is `condition N COND`, where N is an integer and COND is an
+expression to be evaluated whenever breakpoint N is reached.
+
+
+
+User defined functions/macros
+-----------------------------
+define: ( Note this is very very useful,simple & powerful )
+
+usage define <name> <list of commands> end
+
+examples which you should consider putting into .gdbinit in your home
+directory::
+
+ define d
+ stepi
+ disassemble $pc $pc+10
+ end
+ define e
+ nexti
+ disassemble $pc $pc+10
+ end
+
+
+Other hard to classify stuff
+----------------------------
+signal n:
+ sends the victim program a signal.
+
+e.g. `signal 3` will send a SIGQUIT.
+
+info signals:
+ what gdb does when the victim receives certain signals.
+
+list:
+
+e.g.:
+
+list
+ lists current function source
+list 1,10
+ list first 10 lines of current file.
+
+list test.c:1,10
+
+
+directory:
+ Adds directories to be searched for source if gdb cannot find the source.
+ (note it is a bit sensitive about slashes)
+
+e.g. To add the root of the filesystem to the searchpath do::
+
+ directory //
+
+
+call <function>
+This calls a function in the victim program, this is pretty powerful
+e.g.
+(gdb) call printf("hello world")
+outputs:
+$1 = 11
+
+You might now be thinking that the line above didn't work, something extra had
+to be done.
+(gdb) call fflush(stdout)
+hello world$2 = 0
+As an aside the debugger also calls malloc & free under the hood
+to make space for the "hello world" string.
+
+
+
+hints
+-----
+1) command completion works just like bash
+ ( if you are a bad typist like me this really helps )
+
+e.g. hit br <TAB> & cursor up & down :-).
+
+2) if you have a debugging problem that takes a few steps to recreate
+put the steps into a file called .gdbinit in your current working directory
+if you have defined a few extra useful user defined commands put these in
+your home directory & they will be read each time gdb is launched.
+
+A typical .gdbinit file might be.::
+
+ break main
+ run
+ break runtime_exception
+ cont
+
+
+stack chaining in gdb by hand
+-----------------------------
+This is done using a the same trick described for VM::
+
+ p/x (*($sp+56))&0x7fffffff
+
+get the first backchain.
+
+For z/Architecture
+Replace 56 with 112 & ignore the &0x7fffffff
+in the macros below & do nasty casts to longs like the following
+as gdb unfortunately deals with printed arguments as ints which
+messes up everything.
+
+i.e. here is a 3rd backchain dereference::
+
+ p/x *(long *)(***(long ***)$sp+112)
+
+
+this outputs::
+
+ $5 = 0x528f18
+
+on my machine.
+
+Now you can use::
+
+ info symbol (*($sp+56))&0x7fffffff
+
+you might see something like::
+
+ rl_getc + 36 in section .text
+
+telling you what is located at address 0x528f18
+Now do::
+
+ p/x (*(*$sp+56))&0x7fffffff
+
+This outputs::
+
+ $6 = 0x528ed0
+
+Now do::
+
+ info symbol (*(*$sp+56))&0x7fffffff
+ rl_read_key + 180 in section .text
+
+now do::
+
+ p/x (*(**$sp+56))&0x7fffffff
+
+& so on.
+
+Disassembling instructions without debug info
+---------------------------------------------
+gdb typically complains if there is a lack of debugging
+symbols in the disassemble command with
+"No function contains specified address." To get around
+this do::
+
+ x/<number lines to disassemble>xi <address>
+
+e.g.::
+
+ x/20xi 0x400730
+
+
+
+Note:
+ Remember gdb has history just like bash you don't need to retype the
+ whole line just use the up & down arrows.
+
+
+
+For more info
+-------------
+From your linuxbox do::
+
+ man gdb
+
+or::
+
+ info gdb.
+
+core dumps
+----------
+
+What a core dump ?
+^^^^^^^^^^^^^^^^^^
+
+A core dump is a file generated by the kernel (if allowed) which contains the
+registers and all active pages of the program which has crashed.
+
+From this file gdb will allow you to look at the registers, stack trace and
+memory of the program as if it just crashed on your system. It is usually
+called core and created in the current working directory.
+
+This is very useful in that a customer can mail a core dump to a technical
+support department and the technical support department can reconstruct what
+happened. Provided they have an identical copy of this program with debugging
+symbols compiled in and the source base of this build is available.
+
+In short it is far more useful than something like a crash log could ever hope
+to be.
+
+Why have I never seen one ?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Probably because you haven't used the command::
+
+ ulimit -c unlimited in bash
+
+to allow core dumps, now do::
+
+ ulimit -a
+
+to verify that the limit was accepted.
+
+A sample core dump
+ To create this I'm going to do::
+
+ ulimit -c unlimited
+ gdb
+
+to launch gdb (my victim app. ) now be bad & do the following from another
+telnet/xterm session to the same machine::
+
+ ps -aux | grep gdb
+ kill -SIGSEGV <gdb's pid>
+
+or alternatively use `killall -SIGSEGV gdb` if you have the killall command.
+
+Now look at the core dump::
+
+ ./gdb core
+
+Displays the following::
+
+ GNU gdb 4.18
+ Copyright 1998 Free Software Foundation, Inc.
+ GDB is free software, covered by the GNU General Public License, and you are
+ welcome to change it and/or distribute copies of it under certain conditions.
+ Type "show copying" to see the conditions.
+ There is absolutely no warranty for GDB. Type "show warranty" for details.
+ This GDB was configured as "s390-ibm-linux"...
+ Core was generated by `./gdb'.
+ Program terminated with signal 11, Segmentation fault.
+ Reading symbols from /usr/lib/libncurses.so.4...done.
+ Reading symbols from /lib/libm.so.6...done.
+ Reading symbols from /lib/libc.so.6...done.
+ Reading symbols from /lib/ld-linux.so.2...done.
+ #0 0x40126d1a in read () from /lib/libc.so.6
+ Setting up the environment for debugging gdb.
+ Breakpoint 1 at 0x4dc6f8: file utils.c, line 471.
+ Breakpoint 2 at 0x4d87a4: file top.c, line 2609.
+ (top-gdb) info stack
+ #0 0x40126d1a in read () from /lib/libc.so.6
+ #1 0x528f26 in rl_getc (stream=0x7ffffde8) at input.c:402
+ #2 0x528ed0 in rl_read_key () at input.c:381
+ #3 0x5167e6 in readline_internal_char () at readline.c:454
+ #4 0x5168ee in readline_internal_charloop () at readline.c:507
+ #5 0x51692c in readline_internal () at readline.c:521
+ #6 0x5164fe in readline (prompt=0x7ffff810)
+ at readline.c:349
+ #7 0x4d7a8a in command_line_input (prompt=0x564420 "(gdb) ", repeat=1,
+ annotation_suffix=0x4d6b44 "prompt") at top.c:2091
+ #8 0x4d6cf0 in command_loop () at top.c:1345
+ #9 0x4e25bc in main (argc=1, argv=0x7ffffdf4) at main.c:635
+
+
+LDD
+===
+This is a program which lists the shared libraries which a library needs,
+Note you also get the relocations of the shared library text segments which
+help when using objdump --source.
+
+e.g.::
+
+ ldd ./gdb
+
+outputs::
+
+ libncurses.so.4 => /usr/lib/libncurses.so.4 (0x40018000)
+ libm.so.6 => /lib/libm.so.6 (0x4005e000)
+ libc.so.6 => /lib/libc.so.6 (0x40084000)
+ /lib/ld-linux.so.2 => /lib/ld-linux.so.2 (0x40000000)
+
+
+Debugging shared libraries
+==========================
+Most programs use shared libraries, however it can be very painful
+when you single step instruction into a function like printf for the
+first time & you end up in functions like _dl_runtime_resolve this is
+the ld.so doing lazy binding, lazy binding is a concept in ELF where
+shared library functions are not loaded into memory unless they are
+actually used, great for saving memory but a pain to debug.
+
+To get around this either relink the program -static or exit gdb type
+export LD_BIND_NOW=true this will stop lazy binding & restart the gdb'ing
+the program in question.
+
+
+
+Debugging modules
+=================
+As modules are dynamically loaded into the kernel their address can be
+anywhere to get around this use the -m option with insmod to emit a load
+map which can be piped into a file if required.
+
+The proc file system
+====================
+What is it ?.
+It is a filesystem created by the kernel with files which are created on demand
+by the kernel if read, or can be used to modify kernel parameters,
+it is a powerful concept.
+
+e.g.::
+
+ cat /proc/sys/net/ipv4/ip_forward
+
+On my machine outputs::
+
+ 0
+
+telling me ip_forwarding is not on to switch it on I can do::
+
+ echo 1 > /proc/sys/net/ipv4/ip_forward
+
+cat it again::
+
+ cat /proc/sys/net/ipv4/ip_forward
+
+On my machine now outputs::
+
+ 1
+
+IP forwarding is on.
+
+There is a lot of useful info in here best found by going in and having a look
+around, so I'll take you through some entries I consider important.
+
+All the processes running on the machine have their own entry defined by
+/proc/<pid>
+
+So lets have a look at the init process::
+
+ cd /proc/1
+ cat cmdline
+
+emits::
+
+ init [2]
+
+::
+
+ cd /proc/1/fd
+
+This contains numerical entries of all the open files,
+some of these you can cat e.g. stdout (2)::
+
+ cat /proc/29/maps
+
+on my machine emits::
+
+ 00400000-00478000 r-xp 00000000 5f:00 4103 /bin/bash
+ 00478000-0047e000 rw-p 00077000 5f:00 4103 /bin/bash
+ 0047e000-00492000 rwxp 00000000 00:00 0
+ 40000000-40015000 r-xp 00000000 5f:00 14382 /lib/ld-2.1.2.so
+ 40015000-40016000 rw-p 00014000 5f:00 14382 /lib/ld-2.1.2.so
+ 40016000-40017000 rwxp 00000000 00:00 0
+ 40017000-40018000 rw-p 00000000 00:00 0
+ 40018000-4001b000 r-xp 00000000 5f:00 14435 /lib/libtermcap.so.2.0.8
+ 4001b000-4001c000 rw-p 00002000 5f:00 14435 /lib/libtermcap.so.2.0.8
+ 4001c000-4010d000 r-xp 00000000 5f:00 14387 /lib/libc-2.1.2.so
+ 4010d000-40111000 rw-p 000f0000 5f:00 14387 /lib/libc-2.1.2.so
+ 40111000-40114000 rw-p 00000000 00:00 0
+ 40114000-4011e000 r-xp 00000000 5f:00 14408 /lib/libnss_files-2.1.2.so
+ 4011e000-4011f000 rw-p 00009000 5f:00 14408 /lib/libnss_files-2.1.2.so
+ 7fffd000-80000000 rwxp ffffe000 00:00 0
+
+
+Showing us the shared libraries init uses where they are in memory
+& memory access permissions for each virtual memory area.
+
+/proc/1/cwd is a softlink to the current working directory.
+
+/proc/1/root is the root of the filesystem for this process.
+
+/proc/1/mem is the current running processes memory which you
+can read & write to like a file.
+
+strace uses this sometimes as it is a bit faster than the
+rather inefficient ptrace interface for peeking at DATA.
+
+::
+
+ cat status
+
+ Name: init
+ State: S (sleeping)
+ Pid: 1
+ PPid: 0
+ Uid: 0 0 0 0
+ Gid: 0 0 0 0
+ Groups:
+ VmSize: 408 kB
+ VmLck: 0 kB
+ VmRSS: 208 kB
+ VmData: 24 kB
+ VmStk: 8 kB
+ VmExe: 368 kB
+ VmLib: 0 kB
+ SigPnd: 0000000000000000
+ SigBlk: 0000000000000000
+ SigIgn: 7fffffffd7f0d8fc
+ SigCgt: 00000000280b2603
+ CapInh: 00000000fffffeff
+ CapPrm: 00000000ffffffff
+ CapEff: 00000000fffffeff
+
+ User PSW: 070de000 80414146
+ task: 004b6000 tss: 004b62d8 ksp: 004b7ca8 pt_regs: 004b7f68
+ User GPRS:
+ 00000400 00000000 0000000b 7ffffa90
+ 00000000 00000000 00000000 0045d9f4
+ 0045cafc 7ffffa90 7fffff18 0045cb08
+ 00010400 804039e8 80403af8 7ffff8b0
+ User ACRS:
+ 00000000 00000000 00000000 00000000
+ 00000001 00000000 00000000 00000000
+ 00000000 00000000 00000000 00000000
+ 00000000 00000000 00000000 00000000
+ Kernel BackChain CallChain BackChain CallChain
+ 004b7ca8 8002bd0c 004b7d18 8002b92c
+ 004b7db8 8005cd50 004b7e38 8005d12a
+ 004b7f08 80019114
+
+Showing among other things memory usage & status of some signals &
+the processes'es registers from the kernel task_structure
+as well as a backchain which may be useful if a process crashes
+in the kernel for some unknown reason.
+
+Some driver debugging techniques
+================================
+debug feature
+-------------
+Some of our drivers now support a "debug feature" in
+/proc/s390dbf see s390dbf.txt in the linux/Documentation directory
+for more info.
+
+e.g.
+to switch on the lcs "debug feature"::
+
+ echo 5 > /proc/s390dbf/lcs/level
+
+& then after the error occurred::
+
+ cat /proc/s390dbf/lcs/sprintf >/logfile
+
+the logfile now contains some information which may help
+tech support resolve a problem in the field.
+
+
+
+high level debugging network drivers
+------------------------------------
+ifconfig is a quite useful command
+it gives the current state of network drivers.
+
+If you suspect your network device driver is dead
+one way to check is type::
+
+ ifconfig <network device>
+
+e.g. tr0
+
+You should see something like::
+
+ ifconfig tr0
+ tr0 Link encap:16/4 Mbps Token Ring (New) HWaddr 00:04:AC:20:8E:48
+ inet addr:9.164.185.132 Bcast:9.164.191.255 Mask:255.255.224.0
+ UP BROADCAST RUNNING MULTICAST MTU:2000 Metric:1
+ RX packets:246134 errors:0 dropped:0 overruns:0 frame:0
+ TX packets:5 errors:0 dropped:0 overruns:0 carrier:0
+ collisions:0 txqueuelen:100
+
+if the device doesn't say up
+try::
+
+ /etc/rc.d/init.d/network start
+
+( this starts the network stack & hopefully calls ifconfig tr0 up ).
+ifconfig looks at the output of /proc/net/dev and presents it in a more
+presentable form.
+
+Now ping the device from a machine in the same subnet.
+
+if the RX packets count & TX packets counts don't increment you probably
+have problems.
+
+next::
+
+ cat /proc/net/arp
+
+Do you see any hardware addresses in the cache if not you may have problems.
+Next try::
+
+ ping -c 5 <broadcast_addr>
+
+i.e. the Bcast field above in the output of
+ifconfig. Do you see any replies from machines other than the local machine
+if not you may have problems. also if the TX packets count in ifconfig
+hasn't incremented either you have serious problems in your driver
+(e.g. the txbusy field of the network device being stuck on )
+or you may have multiple network devices connected.
+
+
+chandev
+-------
+There is a new device layer for channel devices, some
+drivers e.g. lcs are registered with this layer.
+
+If the device uses the channel device layer you'll be
+able to find what interrupts it uses & the current state
+of the device.
+
+See the manpage chandev.8 &type cat /proc/chandev for more info.
+
+
+SysRq
+=====
+This is now supported by linux for s/390 & z/Architecture.
+
+To enable it do compile the kernel with::
+
+ Kernel Hacking -> Magic SysRq Key Enabled
+
+Then::
+
+ echo "1" > /proc/sys/kernel/sysrq
+
+also type::
+
+ echo "8" >/proc/sys/kernel/printk
+
+To make printk output go to console.
+
+On 390 all commands are prefixed with::
+
+ ^-
+
+e.g.::
+
+ ^-t will show tasks.
+ ^-? or some unknown command will display help.
+
+The sysrq key reading is very picky ( I have to type the keys in an
+xterm session & paste them into the x3270 console )
+& it may be wise to predefine the keys as described in the VM hints above
+
+This is particularly useful for syncing disks unmounting & rebooting
+if the machine gets partially hung.
+
+Read Documentation/admin-guide/sysrq.rst for more info
+
+References:
+===========
+- Enterprise Systems Architecture Reference Summary
+- Enterprise Systems Architecture Principles of Operation
+- Hartmut Penners s390 stack frame sheet.
+- IBM Mainframe Channel Attachment a technology brief from a CISCO webpage
+- Various bits of man & info pages of Linux.
+- Linux & GDB source.
+- Various info & man pages.
+- CMS Help on tracing commands.
+- Linux for s/390 Elf Application Binary Interface
+- Linux for z/Series Elf Application Binary Interface ( Both Highly Recommended )
+- z/Architecture Principles of Operation SA22-7832-00
+- Enterprise Systems Architecture/390 Reference Summary SA22-7209-01 & the
+- Enterprise Systems Architecture/390 Principles of Operation SA22-7201-05
+
+Special Thanks
+==============
+Special thanks to Neale Ferguson who maintains a much
+prettier HTML version of this page at
+http://linuxvm.org/penguinvm/
+Bob Grainger Stefan Bader & others for reporting bugs
diff --git a/Documentation/s390/driver-model.rst b/Documentation/s390/driver-model.rst
new file mode 100644
index 000000000000..ad4bc2dbea43
--- /dev/null
+++ b/Documentation/s390/driver-model.rst
@@ -0,0 +1,328 @@
+=============================
+S/390 driver model interfaces
+=============================
+
+1. CCW devices
+--------------
+
+All devices which can be addressed by means of ccws are called 'CCW devices' -
+even if they aren't actually driven by ccws.
+
+All ccw devices are accessed via a subchannel, this is reflected in the
+structures under devices/::
+
+ devices/
+ - system/
+ - css0/
+ - 0.0.0000/0.0.0815/
+ - 0.0.0001/0.0.4711/
+ - 0.0.0002/
+ - 0.1.0000/0.1.1234/
+ ...
+ - defunct/
+
+In this example, device 0815 is accessed via subchannel 0 in subchannel set 0,
+device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O
+subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1.
+
+The subchannel named 'defunct' does not represent any real subchannel on the
+system; it is a pseudo subchannel where disconnected ccw devices are moved to
+if they are displaced by another ccw device becoming operational on their
+former subchannel. The ccw devices will be moved again to a proper subchannel
+if they become operational again on that subchannel.
+
+You should address a ccw device via its bus id (e.g. 0.0.4711); the device can
+be found under bus/ccw/devices/.
+
+All ccw devices export some data via sysfs.
+
+cutype:
+ The control unit type / model.
+
+devtype:
+ The device type / model, if applicable.
+
+availability:
+ Can be 'good' or 'boxed'; 'no path' or 'no device' for
+ disconnected devices.
+
+online:
+ An interface to set the device online and offline.
+ In the special case of the device being disconnected (see the
+ notify function under 1.2), piping 0 to online will forcibly delete
+ the device.
+
+The device drivers can add entries to export per-device data and interfaces.
+
+There is also some data exported on a per-subchannel basis (see under
+bus/css/devices/):
+
+chpids:
+ Via which chpids the device is connected.
+
+pimpampom:
+ The path installed, path available and path operational masks.
+
+There also might be additional data, for example for block devices.
+
+
+1.1 Bringing up a ccw device
+----------------------------
+
+This is done in several steps.
+
+a. Each driver can provide one or more parameter interfaces where parameters can
+ be specified. These interfaces are also in the driver's responsibility.
+b. After a. has been performed, if necessary, the device is finally brought up
+ via the 'online' interface.
+
+
+1.2 Writing a driver for ccw devices
+------------------------------------
+
+The basic struct ccw_device and struct ccw_driver data structures can be found
+under include/asm/ccwdev.h::
+
+ struct ccw_device {
+ spinlock_t *ccwlock;
+ struct ccw_device_private *private;
+ struct ccw_device_id id;
+
+ struct ccw_driver *drv;
+ struct device dev;
+ int online;
+
+ void (*handler) (struct ccw_device *dev, unsigned long intparm,
+ struct irb *irb);
+ };
+
+ struct ccw_driver {
+ struct module *owner;
+ struct ccw_device_id *ids;
+ int (*probe) (struct ccw_device *);
+ int (*remove) (struct ccw_device *);
+ int (*set_online) (struct ccw_device *);
+ int (*set_offline) (struct ccw_device *);
+ int (*notify) (struct ccw_device *, int);
+ struct device_driver driver;
+ char *name;
+ };
+
+The 'private' field contains data needed for internal i/o operation only, and
+is not available to the device driver.
+
+Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models
+and/or device types/models it is interested. This information can later be found
+in the struct ccw_device_id fields::
+
+ struct ccw_device_id {
+ __u16 match_flags;
+
+ __u16 cu_type;
+ __u16 dev_type;
+ __u8 cu_model;
+ __u8 dev_model;
+
+ unsigned long driver_info;
+ };
+
+The functions in ccw_driver should be used in the following way:
+
+probe:
+ This function is called by the device layer for each device the driver
+ is interested in. The driver should only allocate private structures
+ to put in dev->driver_data and create attributes (if needed). Also,
+ the interrupt handler (see below) should be set here.
+
+::
+
+ int (*probe) (struct ccw_device *cdev);
+
+Parameters:
+ cdev
+ - the device to be probed.
+
+
+remove:
+ This function is called by the device layer upon removal of the driver,
+ the device or the module. The driver should perform cleanups here.
+
+::
+
+ int (*remove) (struct ccw_device *cdev);
+
+Parameters:
+ cdev
+ - the device to be removed.
+
+
+set_online:
+ This function is called by the common I/O layer when the device is
+ activated via the 'online' attribute. The driver should finally
+ setup and activate the device here.
+
+::
+
+ int (*set_online) (struct ccw_device *);
+
+Parameters:
+ cdev
+ - the device to be activated. The common layer has
+ verified that the device is not already online.
+
+
+set_offline: This function is called by the common I/O layer when the device is
+ de-activated via the 'online' attribute. The driver should shut
+ down the device, but not de-allocate its private data.
+
+::
+
+ int (*set_offline) (struct ccw_device *);
+
+Parameters:
+ cdev
+ - the device to be deactivated. The common layer has
+ verified that the device is online.
+
+
+notify:
+ This function is called by the common I/O layer for some state changes
+ of the device.
+
+ Signalled to the driver are:
+
+ * In online state, device detached (CIO_GONE) or last path gone
+ (CIO_NO_PATH). The driver must return !0 to keep the device; for
+ return code 0, the device will be deleted as usual (also when no
+ notify function is registered). If the driver wants to keep the
+ device, it is moved into disconnected state.
+ * In disconnected state, device operational again (CIO_OPER). The
+ common I/O layer performs some sanity checks on device number and
+ Device / CU to be reasonably sure if it is still the same device.
+ If not, the old device is removed and a new one registered. By the
+ return code of the notify function the device driver signals if it
+ wants the device back: !0 for keeping, 0 to make the device being
+ removed and re-registered.
+
+::
+
+ int (*notify) (struct ccw_device *, int);
+
+Parameters:
+ cdev
+ - the device whose state changed.
+
+ event
+ - the event that happened. This can be one of CIO_GONE,
+ CIO_NO_PATH or CIO_OPER.
+
+The handler field of the struct ccw_device is meant to be set to the interrupt
+handler for the device. In order to accommodate drivers which use several
+distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device
+instead of ccw_driver.
+The handler is registered with the common layer during set_online() processing
+before the driver is called, and is deregistered during set_offline() after the
+driver has been called. Also, after registering / before deregistering, path
+grouping resp. disbanding of the path group (if applicable) are performed.
+
+::
+
+ void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb);
+
+Parameters: dev - the device the handler is called for
+ intparm - the intparm which allows the device driver to identify
+ the i/o the interrupt is associated with, or to recognize
+ the interrupt as unsolicited.
+ irb - interruption response block which contains the accumulated
+ status.
+
+The device driver is called from the common ccw_device layer and can retrieve
+information about the interrupt from the irb parameter.
+
+
+1.3 ccwgroup devices
+--------------------
+
+The ccwgroup mechanism is designed to handle devices consisting of multiple ccw
+devices, like lcs or ctc.
+
+The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to
+this attributes creates a ccwgroup device consisting of these ccw devices (if
+possible). This ccwgroup device can be set online or offline just like a normal
+ccw device.
+
+Each ccwgroup device also provides an 'ungroup' attribute to destroy the device
+again (only when offline). This is a generic ccwgroup mechanism (the driver does
+not need to implement anything beyond normal removal routines).
+
+A ccw device which is a member of a ccwgroup device carries a pointer to the
+ccwgroup device in the driver_data of its device struct. This field must not be
+touched by the driver - it should use the ccwgroup device's driver_data for its
+private data.
+
+To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in
+mind that most drivers will need to implement both a ccwgroup and a ccw
+driver.
+
+
+2. Channel paths
+-----------------
+
+Channel paths show up, like subchannels, under the channel subsystem root (css0)
+and are called 'chp0.<chpid>'. They have no driver and do not belong to any bus.
+Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect
+only the logical state and not the physical state, since we cannot track the
+latter consistently due to lacking machine support (we don't need to be aware
+of it anyway).
+
+status
+ - Can be 'online' or 'offline'.
+ Piping 'on' or 'off' sets the chpid logically online/offline.
+ Piping 'on' to an online chpid triggers path reprobing for all devices
+ the chpid connects to. This can be used to force the kernel to re-use
+ a channel path the user knows to be online, but the machine hasn't
+ created a machine check for.
+
+type
+ - The physical type of the channel path.
+
+shared
+ - Whether the channel path is shared.
+
+cmg
+ - The channel measurement group.
+
+3. System devices
+-----------------
+
+3.1 xpram
+---------
+
+xpram shows up under devices/system/ as 'xpram'.
+
+3.2 cpus
+--------
+
+For each cpu, a directory is created under devices/system/cpu/. Each cpu has an
+attribute 'online' which can be 0 or 1.
+
+
+4. Other devices
+----------------
+
+4.1 Netiucv
+-----------
+
+The netiucv driver creates an attribute 'connection' under
+bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv
+connection to the specified host.
+
+Netiucv connections show up under devices/iucv/ as "netiucv<ifnum>". The interface
+number is assigned sequentially to the connections defined via the 'connection'
+attribute.
+
+user
+ - shows the connection partner.
+
+buffer
+ - maximum buffer size. Pipe to it to change buffer size.
diff --git a/Documentation/s390/driver-model.txt b/Documentation/s390/driver-model.txt
deleted file mode 100644
index ed265cf54cde..000000000000
--- a/Documentation/s390/driver-model.txt
+++ /dev/null
@@ -1,287 +0,0 @@
-S/390 driver model interfaces
------------------------------
-
-1. CCW devices
---------------
-
-All devices which can be addressed by means of ccws are called 'CCW devices' -
-even if they aren't actually driven by ccws.
-
-All ccw devices are accessed via a subchannel, this is reflected in the
-structures under devices/:
-
-devices/
- - system/
- - css0/
- - 0.0.0000/0.0.0815/
- - 0.0.0001/0.0.4711/
- - 0.0.0002/
- - 0.1.0000/0.1.1234/
- ...
- - defunct/
-
-In this example, device 0815 is accessed via subchannel 0 in subchannel set 0,
-device 4711 via subchannel 1 in subchannel set 0, and subchannel 2 is a non-I/O
-subchannel. Device 1234 is accessed via subchannel 0 in subchannel set 1.
-
-The subchannel named 'defunct' does not represent any real subchannel on the
-system; it is a pseudo subchannel where disconnected ccw devices are moved to
-if they are displaced by another ccw device becoming operational on their
-former subchannel. The ccw devices will be moved again to a proper subchannel
-if they become operational again on that subchannel.
-
-You should address a ccw device via its bus id (e.g. 0.0.4711); the device can
-be found under bus/ccw/devices/.
-
-All ccw devices export some data via sysfs.
-
-cutype: The control unit type / model.
-
-devtype: The device type / model, if applicable.
-
-availability: Can be 'good' or 'boxed'; 'no path' or 'no device' for
- disconnected devices.
-
-online: An interface to set the device online and offline.
- In the special case of the device being disconnected (see the
- notify function under 1.2), piping 0 to online will forcibly delete
- the device.
-
-The device drivers can add entries to export per-device data and interfaces.
-
-There is also some data exported on a per-subchannel basis (see under
-bus/css/devices/):
-
-chpids: Via which chpids the device is connected.
-
-pimpampom: The path installed, path available and path operational masks.
-
-There also might be additional data, for example for block devices.
-
-
-1.1 Bringing up a ccw device
-----------------------------
-
-This is done in several steps.
-
-a. Each driver can provide one or more parameter interfaces where parameters can
- be specified. These interfaces are also in the driver's responsibility.
-b. After a. has been performed, if necessary, the device is finally brought up
- via the 'online' interface.
-
-
-1.2 Writing a driver for ccw devices
-------------------------------------
-
-The basic struct ccw_device and struct ccw_driver data structures can be found
-under include/asm/ccwdev.h.
-
-struct ccw_device {
- spinlock_t *ccwlock;
- struct ccw_device_private *private;
- struct ccw_device_id id;
-
- struct ccw_driver *drv;
- struct device dev;
- int online;
-
- void (*handler) (struct ccw_device *dev, unsigned long intparm,
- struct irb *irb);
-};
-
-struct ccw_driver {
- struct module *owner;
- struct ccw_device_id *ids;
- int (*probe) (struct ccw_device *);
- int (*remove) (struct ccw_device *);
- int (*set_online) (struct ccw_device *);
- int (*set_offline) (struct ccw_device *);
- int (*notify) (struct ccw_device *, int);
- struct device_driver driver;
- char *name;
-};
-
-The 'private' field contains data needed for internal i/o operation only, and
-is not available to the device driver.
-
-Each driver should declare in a MODULE_DEVICE_TABLE into which CU types/models
-and/or device types/models it is interested. This information can later be found
-in the struct ccw_device_id fields:
-
-struct ccw_device_id {
- __u16 match_flags;
-
- __u16 cu_type;
- __u16 dev_type;
- __u8 cu_model;
- __u8 dev_model;
-
- unsigned long driver_info;
-};
-
-The functions in ccw_driver should be used in the following way:
-probe: This function is called by the device layer for each device the driver
- is interested in. The driver should only allocate private structures
- to put in dev->driver_data and create attributes (if needed). Also,
- the interrupt handler (see below) should be set here.
-
-int (*probe) (struct ccw_device *cdev);
-
-Parameters: cdev - the device to be probed.
-
-
-remove: This function is called by the device layer upon removal of the driver,
- the device or the module. The driver should perform cleanups here.
-
-int (*remove) (struct ccw_device *cdev);
-
-Parameters: cdev - the device to be removed.
-
-
-set_online: This function is called by the common I/O layer when the device is
- activated via the 'online' attribute. The driver should finally
- setup and activate the device here.
-
-int (*set_online) (struct ccw_device *);
-
-Parameters: cdev - the device to be activated. The common layer has
- verified that the device is not already online.
-
-
-set_offline: This function is called by the common I/O layer when the device is
- de-activated via the 'online' attribute. The driver should shut
- down the device, but not de-allocate its private data.
-
-int (*set_offline) (struct ccw_device *);
-
-Parameters: cdev - the device to be deactivated. The common layer has
- verified that the device is online.
-
-
-notify: This function is called by the common I/O layer for some state changes
- of the device.
- Signalled to the driver are:
- * In online state, device detached (CIO_GONE) or last path gone
- (CIO_NO_PATH). The driver must return !0 to keep the device; for
- return code 0, the device will be deleted as usual (also when no
- notify function is registered). If the driver wants to keep the
- device, it is moved into disconnected state.
- * In disconnected state, device operational again (CIO_OPER). The
- common I/O layer performs some sanity checks on device number and
- Device / CU to be reasonably sure if it is still the same device.
- If not, the old device is removed and a new one registered. By the
- return code of the notify function the device driver signals if it
- wants the device back: !0 for keeping, 0 to make the device being
- removed and re-registered.
-
-int (*notify) (struct ccw_device *, int);
-
-Parameters: cdev - the device whose state changed.
- event - the event that happened. This can be one of CIO_GONE,
- CIO_NO_PATH or CIO_OPER.
-
-The handler field of the struct ccw_device is meant to be set to the interrupt
-handler for the device. In order to accommodate drivers which use several
-distinct handlers (e.g. multi subchannel devices), this is a member of ccw_device
-instead of ccw_driver.
-The handler is registered with the common layer during set_online() processing
-before the driver is called, and is deregistered during set_offline() after the
-driver has been called. Also, after registering / before deregistering, path
-grouping resp. disbanding of the path group (if applicable) are performed.
-
-void (*handler) (struct ccw_device *dev, unsigned long intparm, struct irb *irb);
-
-Parameters: dev - the device the handler is called for
- intparm - the intparm which allows the device driver to identify
- the i/o the interrupt is associated with, or to recognize
- the interrupt as unsolicited.
- irb - interruption response block which contains the accumulated
- status.
-
-The device driver is called from the common ccw_device layer and can retrieve
-information about the interrupt from the irb parameter.
-
-
-1.3 ccwgroup devices
---------------------
-
-The ccwgroup mechanism is designed to handle devices consisting of multiple ccw
-devices, like lcs or ctc.
-
-The ccw driver provides a 'group' attribute. Piping bus ids of ccw devices to
-this attributes creates a ccwgroup device consisting of these ccw devices (if
-possible). This ccwgroup device can be set online or offline just like a normal
-ccw device.
-
-Each ccwgroup device also provides an 'ungroup' attribute to destroy the device
-again (only when offline). This is a generic ccwgroup mechanism (the driver does
-not need to implement anything beyond normal removal routines).
-
-A ccw device which is a member of a ccwgroup device carries a pointer to the
-ccwgroup device in the driver_data of its device struct. This field must not be
-touched by the driver - it should use the ccwgroup device's driver_data for its
-private data.
-
-To implement a ccwgroup driver, please refer to include/asm/ccwgroup.h. Keep in
-mind that most drivers will need to implement both a ccwgroup and a ccw
-driver.
-
-
-2. Channel paths
------------------
-
-Channel paths show up, like subchannels, under the channel subsystem root (css0)
-and are called 'chp0.<chpid>'. They have no driver and do not belong to any bus.
-Please note, that unlike /proc/chpids in 2.4, the channel path objects reflect
-only the logical state and not the physical state, since we cannot track the
-latter consistently due to lacking machine support (we don't need to be aware
-of it anyway).
-
-status - Can be 'online' or 'offline'.
- Piping 'on' or 'off' sets the chpid logically online/offline.
- Piping 'on' to an online chpid triggers path reprobing for all devices
- the chpid connects to. This can be used to force the kernel to re-use
- a channel path the user knows to be online, but the machine hasn't
- created a machine check for.
-
-type - The physical type of the channel path.
-
-shared - Whether the channel path is shared.
-
-cmg - The channel measurement group.
-
-3. System devices
------------------
-
-3.1 xpram
----------
-
-xpram shows up under devices/system/ as 'xpram'.
-
-3.2 cpus
---------
-
-For each cpu, a directory is created under devices/system/cpu/. Each cpu has an
-attribute 'online' which can be 0 or 1.
-
-
-4. Other devices
-----------------
-
-4.1 Netiucv
------------
-
-The netiucv driver creates an attribute 'connection' under
-bus/iucv/drivers/netiucv. Piping to this attribute creates a new netiucv
-connection to the specified host.
-
-Netiucv connections show up under devices/iucv/ as "netiucv<ifnum>". The interface
-number is assigned sequentially to the connections defined via the 'connection'
-attribute.
-
-user - shows the connection partner.
-
-buffer - maximum buffer size.
- Pipe to it to change buffer size.
-
-
diff --git a/Documentation/s390/index.rst b/Documentation/s390/index.rst
new file mode 100644
index 000000000000..4602312909d3
--- /dev/null
+++ b/Documentation/s390/index.rst
@@ -0,0 +1,28 @@
+=================
+s390 Architecture
+=================
+
+.. toctree::
+ :maxdepth: 1
+
+ cds
+ 3270
+ debugging390
+ driver-model
+ monreader
+ qeth
+ s390dbf
+ vfio-ap
+ vfio-ccw
+ zfcpdump
+ dasd
+ common_io
+
+ text_files
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/s390/monreader.rst b/Documentation/s390/monreader.rst
new file mode 100644
index 000000000000..1e857575c113
--- /dev/null
+++ b/Documentation/s390/monreader.rst
@@ -0,0 +1,212 @@
+=================================================
+Linux API for read access to z/VM Monitor Records
+=================================================
+
+Date : 2004-Nov-26
+
+Author: Gerald Schaefer (geraldsc@de.ibm.com)
+
+
+
+
+Description
+===========
+This item delivers a new Linux API in the form of a misc char device that is
+usable from user space and allows read access to the z/VM Monitor Records
+collected by the `*MONITOR` System Service of z/VM.
+
+
+User Requirements
+=================
+The z/VM guest on which you want to access this API needs to be configured in
+order to allow IUCV connections to the `*MONITOR` service, i.e. it needs the
+IUCV `*MONITOR` statement in its user entry. If the monitor DCSS to be used is
+restricted (likely), you also need the NAMESAVE <DCSS NAME> statement.
+This item will use the IUCV device driver to access the z/VM services, so you
+need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1.
+
+There are two options for being able to load the monitor DCSS (examples assume
+that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the
+location of the monitor DCSS with the Class E privileged CP command Q NSS MAP
+(the values BEGPAG and ENDPAG are given in units of 4K pages).
+
+See also "CP Command and Utility Reference" (SC24-6081-00) for more information
+on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning
+and Administration" (SC24-6116-00) for more information on DCSSes.
+
+1st option:
+-----------
+You can use the CP command DEF STOR CONFIG to define a "memory hole" in your
+guest virtual storage around the address range of the DCSS.
+
+Example: DEF STOR CONFIG 0.140M 200M.200M
+
+This defines two blocks of storage, the first is 140MB in size an begins at
+address 0MB, the second is 200MB in size and begins at address 200MB,
+resulting in a total storage of 340MB. Note that the first block should
+always start at 0 and be at least 64MB in size.
+
+2nd option:
+-----------
+Your guest virtual storage has to end below the starting address of the DCSS
+and you have to specify the "mem=" kernel parameter in your parmfile with a
+value greater than the ending address of the DCSS.
+
+Example::
+
+ DEF STOR 140M
+
+This defines 140MB storage size for your guest, the parameter "mem=160M" is
+added to the parmfile.
+
+
+User Interface
+==============
+The char device is implemented as a kernel module named "monreader",
+which can be loaded via the modprobe command, or it can be compiled into the
+kernel instead. There is one optional module (or kernel) parameter, "mondcss",
+to specify the name of the monitor DCSS. If the module is compiled into the
+kernel, the kernel parameter "monreader.mondcss=<DCSS NAME>" can be specified
+in the parmfile.
+
+The default name for the DCSS is "MONDCSS" if none is specified. In case that
+there are other users already connected to the `*MONITOR` service (e.g.
+Performance Toolkit), the monitor DCSS is already defined and you have to use
+the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name
+of the monitor DCSS, if already defined, and the users connected to the
+`*MONITOR` service.
+Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor
+DCSS if your z/VM doesn't have one already, you need Class E privileges to
+define and save a DCSS.
+
+Example:
+--------
+
+::
+
+ modprobe monreader mondcss=MYDCSS
+
+This loads the module and sets the DCSS name to "MYDCSS".
+
+NOTE:
+-----
+This API provides no interface to control the `*MONITOR` service, e.g. specify
+which data should be collected. This can be done by the CP command MONITOR
+(Class E privileged), see "CP Command and Utility Reference".
+
+Device nodes with udev:
+-----------------------
+After loading the module, a char device will be created along with the device
+node /<udev directory>/monreader.
+
+Device nodes without udev:
+--------------------------
+If your distribution does not support udev, a device node will not be created
+automatically and you have to create it manually after loading the module.
+Therefore you need to know the major and minor numbers of the device. These
+numbers can be found in /sys/class/misc/monreader/dev.
+
+Typing cat /sys/class/misc/monreader/dev will give an output of the form
+<major>:<minor>. The device node can be created via the mknod command, enter
+mknod <name> c <major> <minor>, where <name> is the name of the device node
+to be created.
+
+Example:
+--------
+
+::
+
+ # modprobe monreader
+ # cat /sys/class/misc/monreader/dev
+ 10:63
+ # mknod /dev/monreader c 10 63
+
+This loads the module with the default monitor DCSS (MONDCSS) and creates a
+device node.
+
+File operations:
+----------------
+The following file operations are supported: open, release, read, poll.
+There are two alternative methods for reading: either non-blocking read in
+conjunction with polling, or blocking read without polling. IOCTLs are not
+supported.
+
+Read:
+-----
+Reading from the device provides a 12 Byte monitor control element (MCE),
+followed by a set of one or more contiguous monitor records (similar to the
+output of the CMS utility MONWRITE without the 4K control blocks). The MCE
+contains information on the type of the following record set (sample/event
+data), the monitor domains contained within it and the start and end address
+of the record set in the monitor DCSS. The start and end address can be used
+to determine the size of the record set, the end address is the address of the
+last byte of data. The start address is needed to handle "end-of-frame" records
+correctly (domain 1, record 13), i.e. it can be used to determine the record
+start offset relative to a 4K page (frame) boundary.
+
+See "Appendix A: `*MONITOR`" in the "z/VM Performance" document for a description
+of the monitor control element layout. The layout of the monitor records can
+be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html
+
+The layout of the data stream provided by the monreader device is as follows::
+
+ ...
+ <0 byte read>
+ <first MCE> \
+ <first set of records> |
+ ... |- data set
+ <last MCE> |
+ <last set of records> /
+ <0 byte read>
+ ...
+
+There may be more than one combination of MCE and corresponding record set
+within one data set and the end of each data set is indicated by a successful
+read with a return value of 0 (0 byte read).
+Any received data must be considered invalid until a complete set was
+read successfully, including the closing 0 byte read. Therefore you should
+always read the complete set into a buffer before processing the data.
+
+The maximum size of a data set can be as large as the size of the
+monitor DCSS, so design the buffer adequately or use dynamic memory allocation.
+The size of the monitor DCSS will be printed into syslog after loading the
+module. You can also use the (Class E privileged) CP command Q NSS MAP to
+list all available segments and information about them.
+
+As with most char devices, error conditions are indicated by returning a
+negative value for the number of bytes read. In this case, the errno variable
+indicates the error condition:
+
+EIO:
+ reply failed, read data is invalid and the application
+ should discard the data read since the last successful read with 0 size.
+EFAULT:
+ copy_to_user failed, read data is invalid and the application should
+ discard the data read since the last successful read with 0 size.
+EAGAIN:
+ occurs on a non-blocking read if there is no data available at the
+ moment. There is no data missing or corrupted, just try again or rather
+ use polling for non-blocking reads.
+EOVERFLOW:
+ message limit reached, the data read since the last successful
+ read with 0 size is valid but subsequent records may be missing.
+
+In the last case (EOVERFLOW) there may be missing data, in the first two cases
+(EIO, EFAULT) there will be missing data. It's up to the application if it will
+continue reading subsequent data or rather exit.
+
+Open:
+-----
+Only one user is allowed to open the char device. If it is already in use, the
+open function will fail (return a negative value) and set errno to EBUSY.
+The open function may also fail if an IUCV connection to the `*MONITOR` service
+cannot be established. In this case errno will be set to EIO and an error
+message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER
+codes are described in the "z/VM Performance" book, Appendix A.
+
+NOTE:
+-----
+As soon as the device is opened, incoming messages will be accepted and they
+will account for the message limit, i.e. opening the device without reading
+from it will provoke the "message limit reached" error (EOVERFLOW error code)
+eventually.
diff --git a/Documentation/s390/monreader.txt b/Documentation/s390/monreader.txt
deleted file mode 100644
index d3729585fdb0..000000000000
--- a/Documentation/s390/monreader.txt
+++ /dev/null
@@ -1,197 +0,0 @@
-
-Date : 2004-Nov-26
-Author: Gerald Schaefer (geraldsc@de.ibm.com)
-
-
- Linux API for read access to z/VM Monitor Records
- =================================================
-
-
-Description
-===========
-This item delivers a new Linux API in the form of a misc char device that is
-usable from user space and allows read access to the z/VM Monitor Records
-collected by the *MONITOR System Service of z/VM.
-
-
-User Requirements
-=================
-The z/VM guest on which you want to access this API needs to be configured in
-order to allow IUCV connections to the *MONITOR service, i.e. it needs the
-IUCV *MONITOR statement in its user entry. If the monitor DCSS to be used is
-restricted (likely), you also need the NAMESAVE <DCSS NAME> statement.
-This item will use the IUCV device driver to access the z/VM services, so you
-need a kernel with IUCV support. You also need z/VM version 4.4 or 5.1.
-
-There are two options for being able to load the monitor DCSS (examples assume
-that the monitor DCSS begins at 144 MB and ends at 152 MB). You can query the
-location of the monitor DCSS with the Class E privileged CP command Q NSS MAP
-(the values BEGPAG and ENDPAG are given in units of 4K pages).
-
-See also "CP Command and Utility Reference" (SC24-6081-00) for more information
-on the DEF STOR and Q NSS MAP commands, as well as "Saved Segments Planning
-and Administration" (SC24-6116-00) for more information on DCSSes.
-
-1st option:
------------
-You can use the CP command DEF STOR CONFIG to define a "memory hole" in your
-guest virtual storage around the address range of the DCSS.
-
-Example: DEF STOR CONFIG 0.140M 200M.200M
-
-This defines two blocks of storage, the first is 140MB in size an begins at
-address 0MB, the second is 200MB in size and begins at address 200MB,
-resulting in a total storage of 340MB. Note that the first block should
-always start at 0 and be at least 64MB in size.
-
-2nd option:
------------
-Your guest virtual storage has to end below the starting address of the DCSS
-and you have to specify the "mem=" kernel parameter in your parmfile with a
-value greater than the ending address of the DCSS.
-
-Example: DEF STOR 140M
-
-This defines 140MB storage size for your guest, the parameter "mem=160M" is
-added to the parmfile.
-
-
-User Interface
-==============
-The char device is implemented as a kernel module named "monreader",
-which can be loaded via the modprobe command, or it can be compiled into the
-kernel instead. There is one optional module (or kernel) parameter, "mondcss",
-to specify the name of the monitor DCSS. If the module is compiled into the
-kernel, the kernel parameter "monreader.mondcss=<DCSS NAME>" can be specified
-in the parmfile.
-
-The default name for the DCSS is "MONDCSS" if none is specified. In case that
-there are other users already connected to the *MONITOR service (e.g.
-Performance Toolkit), the monitor DCSS is already defined and you have to use
-the same DCSS. The CP command Q MONITOR (Class E privileged) shows the name
-of the monitor DCSS, if already defined, and the users connected to the
-*MONITOR service.
-Refer to the "z/VM Performance" book (SC24-6109-00) on how to create a monitor
-DCSS if your z/VM doesn't have one already, you need Class E privileges to
-define and save a DCSS.
-
-Example:
---------
-modprobe monreader mondcss=MYDCSS
-
-This loads the module and sets the DCSS name to "MYDCSS".
-
-NOTE:
------
-This API provides no interface to control the *MONITOR service, e.g. specify
-which data should be collected. This can be done by the CP command MONITOR
-(Class E privileged), see "CP Command and Utility Reference".
-
-Device nodes with udev:
------------------------
-After loading the module, a char device will be created along with the device
-node /<udev directory>/monreader.
-
-Device nodes without udev:
---------------------------
-If your distribution does not support udev, a device node will not be created
-automatically and you have to create it manually after loading the module.
-Therefore you need to know the major and minor numbers of the device. These
-numbers can be found in /sys/class/misc/monreader/dev.
-Typing cat /sys/class/misc/monreader/dev will give an output of the form
-<major>:<minor>. The device node can be created via the mknod command, enter
-mknod <name> c <major> <minor>, where <name> is the name of the device node
-to be created.
-
-Example:
---------
-# modprobe monreader
-# cat /sys/class/misc/monreader/dev
-10:63
-# mknod /dev/monreader c 10 63
-
-This loads the module with the default monitor DCSS (MONDCSS) and creates a
-device node.
-
-File operations:
-----------------
-The following file operations are supported: open, release, read, poll.
-There are two alternative methods for reading: either non-blocking read in
-conjunction with polling, or blocking read without polling. IOCTLs are not
-supported.
-
-Read:
------
-Reading from the device provides a 12 Byte monitor control element (MCE),
-followed by a set of one or more contiguous monitor records (similar to the
-output of the CMS utility MONWRITE without the 4K control blocks). The MCE
-contains information on the type of the following record set (sample/event
-data), the monitor domains contained within it and the start and end address
-of the record set in the monitor DCSS. The start and end address can be used
-to determine the size of the record set, the end address is the address of the
-last byte of data. The start address is needed to handle "end-of-frame" records
-correctly (domain 1, record 13), i.e. it can be used to determine the record
-start offset relative to a 4K page (frame) boundary.
-
-See "Appendix A: *MONITOR" in the "z/VM Performance" document for a description
-of the monitor control element layout. The layout of the monitor records can
-be found here (z/VM 5.1): http://www.vm.ibm.com/pubs/mon510/index.html
-
-The layout of the data stream provided by the monreader device is as follows:
-...
-<0 byte read>
-<first MCE> \
-<first set of records> |
-... |- data set
-<last MCE> |
-<last set of records> /
-<0 byte read>
-...
-
-There may be more than one combination of MCE and corresponding record set
-within one data set and the end of each data set is indicated by a successful
-read with a return value of 0 (0 byte read).
-Any received data must be considered invalid until a complete set was
-read successfully, including the closing 0 byte read. Therefore you should
-always read the complete set into a buffer before processing the data.
-
-The maximum size of a data set can be as large as the size of the
-monitor DCSS, so design the buffer adequately or use dynamic memory allocation.
-The size of the monitor DCSS will be printed into syslog after loading the
-module. You can also use the (Class E privileged) CP command Q NSS MAP to
-list all available segments and information about them.
-
-As with most char devices, error conditions are indicated by returning a
-negative value for the number of bytes read. In this case, the errno variable
-indicates the error condition:
-
-EIO: reply failed, read data is invalid and the application
- should discard the data read since the last successful read with 0 size.
-EFAULT: copy_to_user failed, read data is invalid and the application should
- discard the data read since the last successful read with 0 size.
-EAGAIN: occurs on a non-blocking read if there is no data available at the
- moment. There is no data missing or corrupted, just try again or rather
- use polling for non-blocking reads.
-EOVERFLOW: message limit reached, the data read since the last successful
- read with 0 size is valid but subsequent records may be missing.
-
-In the last case (EOVERFLOW) there may be missing data, in the first two cases
-(EIO, EFAULT) there will be missing data. It's up to the application if it will
-continue reading subsequent data or rather exit.
-
-Open:
------
-Only one user is allowed to open the char device. If it is already in use, the
-open function will fail (return a negative value) and set errno to EBUSY.
-The open function may also fail if an IUCV connection to the *MONITOR service
-cannot be established. In this case errno will be set to EIO and an error
-message with an IPUSER SEVER code will be printed into syslog. The IPUSER SEVER
-codes are described in the "z/VM Performance" book, Appendix A.
-
-NOTE:
------
-As soon as the device is opened, incoming messages will be accepted and they
-will account for the message limit, i.e. opening the device without reading
-from it will provoke the "message limit reached" error (EOVERFLOW error code)
-eventually.
-
diff --git a/Documentation/s390/qeth.rst b/Documentation/s390/qeth.rst
new file mode 100644
index 000000000000..f02fdaa68de0
--- /dev/null
+++ b/Documentation/s390/qeth.rst
@@ -0,0 +1,64 @@
+=============================
+IBM s390 QDIO Ethernet Driver
+=============================
+
+OSA and HiperSockets Bridge Port Support
+========================================
+
+Uevents
+-------
+
+To generate the events the device must be assigned a role of either
+a primary or a secondary Bridge Port. For more information, see
+"z/VM Connectivity, SC24-6174".
+
+When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state
+of some configured Bridge Port device on the channel changes, a udev
+event with ACTION=CHANGE is emitted on behalf of the corresponding
+ccwgroup device. The event has the following attributes:
+
+BRIDGEPORT=statechange
+ indicates that the Bridge Port device changed
+ its state.
+
+ROLE={primary|secondary|none}
+ the role assigned to the port.
+
+STATE={active|standby|inactive}
+ the newly assumed state of the port.
+
+When run on HiperSockets Bridge Capable Port hardware with host address
+notifications enabled, a udev event with ACTION=CHANGE is emitted.
+It is emitted on behalf of the corresponding ccwgroup device when a host
+or a VLAN is registered or unregistered on the network served by the device.
+The event has the following attributes:
+
+BRIDGEDHOST={reset|register|deregister|abort}
+ host address
+ notifications are started afresh, a new host or VLAN is registered or
+ deregistered on the Bridge Port HiperSockets channel, or address
+ notifications are aborted.
+
+VLAN=numeric-vlan-id
+ VLAN ID on which the event occurred. Not included
+ if no VLAN is involved in the event.
+
+MAC=xx:xx:xx:xx:xx:xx
+ MAC address of the host that is being registered
+ or deregistered from the HiperSockets channel. Not reported if the
+ event reports the creation or destruction of a VLAN.
+
+NTOK_BUSID=x.y.zzzz
+ device bus ID (CSSID, SSID and device number).
+
+NTOK_IID=xx
+ device IID.
+
+NTOK_CHPID=xx
+ device CHPID.
+
+NTOK_CHID=xxxx
+ device channel ID.
+
+Note that the `NTOK_*` attributes refer to devices other than the one
+connected to the system on which the OS is running.
diff --git a/Documentation/s390/qeth.txt b/Documentation/s390/qeth.txt
deleted file mode 100644
index aa06fcf5f8c2..000000000000
--- a/Documentation/s390/qeth.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-IBM s390 QDIO Ethernet Driver
-
-OSA and HiperSockets Bridge Port Support
-
-Uevents
-
-To generate the events the device must be assigned a role of either
-a primary or a secondary Bridge Port. For more information, see
-"z/VM Connectivity, SC24-6174".
-
-When run on an OSA or HiperSockets Bridge Capable Port hardware, and the state
-of some configured Bridge Port device on the channel changes, a udev
-event with ACTION=CHANGE is emitted on behalf of the corresponding
-ccwgroup device. The event has the following attributes:
-
-BRIDGEPORT=statechange - indicates that the Bridge Port device changed
- its state.
-
-ROLE={primary|secondary|none} - the role assigned to the port.
-
-STATE={active|standby|inactive} - the newly assumed state of the port.
-
-When run on HiperSockets Bridge Capable Port hardware with host address
-notifications enabled, a udev event with ACTION=CHANGE is emitted.
-It is emitted on behalf of the corresponding ccwgroup device when a host
-or a VLAN is registered or unregistered on the network served by the device.
-The event has the following attributes:
-
-BRIDGEDHOST={reset|register|deregister|abort} - host address
- notifications are started afresh, a new host or VLAN is registered or
- deregistered on the Bridge Port HiperSockets channel, or address
- notifications are aborted.
-
-VLAN=numeric-vlan-id - VLAN ID on which the event occurred. Not included
- if no VLAN is involved in the event.
-
-MAC=xx:xx:xx:xx:xx:xx - MAC address of the host that is being registered
- or deregistered from the HiperSockets channel. Not reported if the
- event reports the creation or destruction of a VLAN.
-
-NTOK_BUSID=x.y.zzzz - device bus ID (CSSID, SSID and device number).
-
-NTOK_IID=xx - device IID.
-
-NTOK_CHPID=xx - device CHPID.
-
-NTOK_CHID=xxxx - device channel ID.
-
-Note that the NTOK_* attributes refer to devices other than the one
-connected to the system on which the OS is running.
diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst
new file mode 100644
index 000000000000..cdb36842b898
--- /dev/null
+++ b/Documentation/s390/s390dbf.rst
@@ -0,0 +1,487 @@
+==================
+S390 Debug Feature
+==================
+
+files:
+ - arch/s390/kernel/debug.c
+ - arch/s390/include/asm/debug.h
+
+Description:
+------------
+The goal of this feature is to provide a kernel debug logging API
+where log records can be stored efficiently in memory, where each component
+(e.g. device drivers) can have one separate debug log.
+One purpose of this is to inspect the debug logs after a production system crash
+in order to analyze the reason for the crash.
+
+If the system still runs but only a subcomponent which uses dbf fails,
+it is possible to look at the debug logs on a live system via the Linux
+debugfs filesystem.
+
+The debug feature may also very useful for kernel and driver development.
+
+Design:
+-------
+Kernel components (e.g. device drivers) can register themselves at the debug
+feature with the function call :c:func:`debug_register()`.
+This function initializes a
+debug log for the caller. For each debug log exists a number of debug areas
+where exactly one is active at one time. Each debug area consists of contiguous
+pages in memory. In the debug areas there are stored debug entries (log records)
+which are written by event- and exception-calls.
+
+An event-call writes the specified debug entry to the active debug
+area and updates the log pointer for the active area. If the end
+of the active debug area is reached, a wrap around is done (ring buffer)
+and the next debug entry will be written at the beginning of the active
+debug area.
+
+An exception-call writes the specified debug entry to the log and
+switches to the next debug area. This is done in order to be sure
+that the records which describe the origin of the exception are not
+overwritten when a wrap around for the current area occurs.
+
+The debug areas themselves are also ordered in form of a ring buffer.
+When an exception is thrown in the last debug area, the following debug
+entries are then written again in the very first area.
+
+There are four versions for the event- and exception-calls: One for
+logging raw data, one for text, one for numbers (unsigned int and long),
+and one for sprintf-like formatted strings.
+
+Each debug entry contains the following data:
+
+- Timestamp
+- Cpu-Number of calling task
+- Level of debug entry (0...6)
+- Return Address to caller
+- Flag, if entry is an exception or not
+
+The debug logs can be inspected in a live system through entries in
+the debugfs-filesystem. Under the toplevel directory "``s390dbf``" there is
+a directory for each registered component, which is named like the
+corresponding component. The debugfs normally should be mounted to
+``/sys/kernel/debug`` therefore the debug feature can be accessed under
+``/sys/kernel/debug/s390dbf``.
+
+The content of the directories are files which represent different views
+to the debug log. Each component can decide which views should be
+used through registering them with the function :c:func:`debug_register_view()`.
+Predefined views for hex/ascii, sprintf and raw binary data are provided.
+It is also possible to define other views. The content of
+a view can be inspected simply by reading the corresponding debugfs file.
+
+All debug logs have an actual debug level (range from 0 to 6).
+The default level is 3. Event and Exception functions have a :c:data:`level`
+parameter. Only debug entries with a level that is lower or equal
+than the actual level are written to the log. This means, when
+writing events, high priority log entries should have a low level
+value whereas low priority entries should have a high one.
+The actual debug level can be changed with the help of the debugfs-filesystem
+through writing a number string "x" to the ``level`` debugfs file which is
+provided for every debug log. Debugging can be switched off completely
+by using "-" on the ``level`` debugfs file.
+
+Example::
+
+ > echo "-" > /sys/kernel/debug/s390dbf/dasd/level
+
+It is also possible to deactivate the debug feature globally for every
+debug log. You can change the behavior using 2 sysctl parameters in
+``/proc/sys/s390dbf``:
+
+There are currently 2 possible triggers, which stop the debug feature
+globally. The first possibility is to use the ``debug_active`` sysctl. If
+set to 1 the debug feature is running. If ``debug_active`` is set to 0 the
+debug feature is turned off.
+
+The second trigger which stops the debug feature is a kernel oops.
+That prevents the debug feature from overwriting debug information that
+happened before the oops. After an oops you can reactivate the debug feature
+by piping 1 to ``/proc/sys/s390dbf/debug_active``. Nevertheless, it's not
+suggested to use an oopsed kernel in a production environment.
+
+If you want to disallow the deactivation of the debug feature, you can use
+the ``debug_stoppable`` sysctl. If you set ``debug_stoppable`` to 0 the debug
+feature cannot be stopped. If the debug feature is already stopped, it
+will stay deactivated.
+
+Kernel Interfaces:
+------------------
+
+.. kernel-doc:: arch/s390/kernel/debug.c
+.. kernel-doc:: arch/s390/include/asm/debug.h
+
+Predefined views:
+-----------------
+
+.. code-block:: c
+
+ extern struct debug_view debug_hex_ascii_view;
+
+ extern struct debug_view debug_raw_view;
+
+ extern struct debug_view debug_sprintf_view;
+
+Examples
+--------
+
+.. code-block:: c
+
+ /*
+ * hex_ascii- + raw-view Example
+ */
+
+ #include <linux/init.h>
+ #include <asm/debug.h>
+
+ static debug_info_t *debug_info;
+
+ static int init(void)
+ {
+ /* register 4 debug areas with one page each and 4 byte data field */
+
+ debug_info = debug_register("test", 1, 4, 4 );
+ debug_register_view(debug_info, &debug_hex_ascii_view);
+ debug_register_view(debug_info, &debug_raw_view);
+
+ debug_text_event(debug_info, 4 , "one ");
+ debug_int_exception(debug_info, 4, 4711);
+ debug_event(debug_info, 3, &debug_info, 4);
+
+ return 0;
+ }
+
+ static void cleanup(void)
+ {
+ debug_unregister(debug_info);
+ }
+
+ module_init(init);
+ module_exit(cleanup);
+
+.. code-block:: c
+
+ /*
+ * sprintf-view Example
+ */
+
+ #include <linux/init.h>
+ #include <asm/debug.h>
+
+ static debug_info_t *debug_info;
+
+ static int init(void)
+ {
+ /* register 4 debug areas with one page each and data field for */
+ /* format string pointer + 2 varargs (= 3 * sizeof(long)) */
+
+ debug_info = debug_register("test", 1, 4, sizeof(long) * 3);
+ debug_register_view(debug_info, &debug_sprintf_view);
+
+ debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__);
+ debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info);
+
+ return 0;
+ }
+
+ static void cleanup(void)
+ {
+ debug_unregister(debug_info);
+ }
+
+ module_init(init);
+ module_exit(cleanup);
+
+Debugfs Interface
+-----------------
+Views to the debug logs can be investigated through reading the corresponding
+debugfs-files:
+
+Example::
+
+ > ls /sys/kernel/debug/s390dbf/dasd
+ flush hex_ascii level pages raw
+ > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s
+ 00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | ....
+ 00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE
+ 00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | ....
+ 00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP
+ 01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD
+ 01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | ....
+ 01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ...
+ 01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | ....
+ 01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE
+ 01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | ....
+
+See section about predefined views for explanation of the above output!
+
+Changing the debug level
+------------------------
+
+Example::
+
+
+ > cat /sys/kernel/debug/s390dbf/dasd/level
+ 3
+ > echo "5" > /sys/kernel/debug/s390dbf/dasd/level
+ > cat /sys/kernel/debug/s390dbf/dasd/level
+ 5
+
+Flushing debug areas
+--------------------
+Debug areas can be flushed with piping the number of the desired
+area (0...n) to the debugfs file "flush". When using "-" all debug areas
+are flushed.
+
+Examples:
+
+1. Flush debug area 0::
+
+ > echo "0" > /sys/kernel/debug/s390dbf/dasd/flush
+
+2. Flush all debug areas::
+
+ > echo "-" > /sys/kernel/debug/s390dbf/dasd/flush
+
+Changing the size of debug areas
+------------------------------------
+It is possible the change the size of debug areas through piping
+the number of pages to the debugfs file "pages". The resize request will
+also flush the debug areas.
+
+Example:
+
+Define 4 pages for the debug areas of debug feature "dasd"::
+
+ > echo "4" > /sys/kernel/debug/s390dbf/dasd/pages
+
+Stopping the debug feature
+--------------------------
+Example:
+
+1. Check if stopping is allowed::
+
+ > cat /proc/sys/s390dbf/debug_stoppable
+
+2. Stop debug feature::
+
+ > echo 0 > /proc/sys/s390dbf/debug_active
+
+crash Interface
+----------------
+The ``crash`` tool since v5.1.0 has a built-in command
+``s390dbf`` to display all the debug logs or export them to the file system.
+With this tool it is possible
+to investigate the debug logs on a live system and with a memory dump after
+a system crash.
+
+Investigating raw memory
+------------------------
+One last possibility to investigate the debug logs at a live
+system and after a system crash is to look at the raw memory
+under VM or at the Service Element.
+It is possible to find the anchor of the debug-logs through
+the ``debug_area_first`` symbol in the System map. Then one has
+to follow the correct pointers of the data-structures defined
+in debug.h and find the debug-areas in memory.
+Normally modules which use the debug feature will also have
+a global variable with the pointer to the debug-logs. Following
+this pointer it will also be possible to find the debug logs in
+memory.
+
+For this method it is recommended to use '16 * x + 4' byte (x = 0..n)
+for the length of the data field in :c:func:`debug_register()` in
+order to see the debug entries well formatted.
+
+
+Predefined Views
+----------------
+
+There are three predefined views: hex_ascii, raw and sprintf.
+The hex_ascii view shows the data field in hex and ascii representation
+(e.g. ``45 43 4b 44 | ECKD``).
+The raw view returns a bytestream as the debug areas are stored in memory.
+
+The sprintf view formats the debug entries in the same way as the sprintf
+function would do. The sprintf event/exception functions write to the
+debug entry a pointer to the format string (size = sizeof(long))
+and for each vararg a long value. So e.g. for a debug entry with a format
+string plus two varargs one would need to allocate a (3 * sizeof(long))
+byte data area in the debug_register() function.
+
+IMPORTANT:
+ Using "%s" in sprintf event functions is dangerous. You can only
+ use "%s" in the sprintf event functions, if the memory for the passed string
+ is available as long as the debug feature exists. The reason behind this is
+ that due to performance considerations only a pointer to the string is stored
+ in the debug feature. If you log a string that is freed afterwards, you will
+ get an OOPS when inspecting the debug feature, because then the debug feature
+ will access the already freed memory.
+
+NOTE:
+ If using the sprintf view do NOT use other event/exception functions
+ than the sprintf-event and -exception functions.
+
+The format of the hex_ascii and sprintf view is as follows:
+
+- Number of area
+- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated
+ Universal Time (UTC), January 1, 1970)
+- level of debug entry
+- Exception flag (* = Exception)
+- Cpu-Number of calling task
+- Return Address to caller
+- data field
+
+The format of the raw view is:
+
+- Header as described in debug.h
+- datafield
+
+A typical line of the hex_ascii view will look like the following (first line
+is only for explanation and will not be displayed when 'cating' the view)::
+
+ area time level exception cpu caller data (hex + ascii)
+ --------------------------------------------------------------------------
+ 00 00964419409:440690 1 - 00 88023fe
+
+
+Defining views
+--------------
+
+Views are specified with the 'debug_view' structure. There are defined
+callback functions which are used for reading and writing the debugfs files:
+
+.. code-block:: c
+
+ struct debug_view {
+ char name[DEBUG_MAX_PROCF_LEN];
+ debug_prolog_proc_t* prolog_proc;
+ debug_header_proc_t* header_proc;
+ debug_format_proc_t* format_proc;
+ debug_input_proc_t* input_proc;
+ void* private_data;
+ };
+
+where:
+
+.. code-block:: c
+
+ typedef int (debug_header_proc_t) (debug_info_t* id,
+ struct debug_view* view,
+ int area,
+ debug_entry_t* entry,
+ char* out_buf);
+
+ typedef int (debug_format_proc_t) (debug_info_t* id,
+ struct debug_view* view, char* out_buf,
+ const char* in_buf);
+ typedef int (debug_prolog_proc_t) (debug_info_t* id,
+ struct debug_view* view,
+ char* out_buf);
+ typedef int (debug_input_proc_t) (debug_info_t* id,
+ struct debug_view* view,
+ struct file* file, const char* user_buf,
+ size_t in_buf_size, loff_t* offset);
+
+
+The "private_data" member can be used as pointer to view specific data.
+It is not used by the debug feature itself.
+
+The output when reading a debugfs file is structured like this::
+
+ "prolog_proc output"
+
+ "header_proc output 1" "format_proc output 1"
+ "header_proc output 2" "format_proc output 2"
+ "header_proc output 3" "format_proc output 3"
+ ...
+
+When a view is read from the debugfs, the Debug Feature calls the
+'prolog_proc' once for writing the prolog.
+Then 'header_proc' and 'format_proc' are called for each
+existing debug entry.
+
+The input_proc can be used to implement functionality when it is written to
+the view (e.g. like with ``echo "0" > /sys/kernel/debug/s390dbf/dasd/level``).
+
+For header_proc there can be used the default function
+:c:func:`debug_dflt_header_fn()` which is defined in debug.h.
+and which produces the same header output as the predefined views.
+E.g::
+
+ 00 00964419409:440761 2 - 00 88023ec
+
+In order to see how to use the callback functions check the implementation
+of the default views!
+
+Example:
+
+.. code-block:: c
+
+ #include <asm/debug.h>
+
+ #define UNKNOWNSTR "data: %08x"
+
+ const char* messages[] =
+ {"This error...........\n",
+ "That error...........\n",
+ "Problem..............\n",
+ "Something went wrong.\n",
+ "Everything ok........\n",
+ NULL
+ };
+
+ static int debug_test_format_fn(
+ debug_info_t *id, struct debug_view *view,
+ char *out_buf, const char *in_buf
+ )
+ {
+ int i, rc = 0;
+
+ if (id->buf_size >= 4) {
+ int msg_nr = *((int*)in_buf);
+ if (msg_nr < sizeof(messages) / sizeof(char*) - 1)
+ rc += sprintf(out_buf, "%s", messages[msg_nr]);
+ else
+ rc += sprintf(out_buf, UNKNOWNSTR, msg_nr);
+ }
+ return rc;
+ }
+
+ struct debug_view debug_test_view = {
+ "myview", /* name of view */
+ NULL, /* no prolog */
+ &debug_dflt_header_fn, /* default header for each entry */
+ &debug_test_format_fn, /* our own format function */
+ NULL, /* no input function */
+ NULL /* no private data */
+ };
+
+test:
+=====
+
+.. code-block:: c
+
+ debug_info_t *debug_info;
+ int i;
+ ...
+ debug_info = debug_register("test", 0, 4, 4);
+ debug_register_view(debug_info, &debug_test_view);
+ for (i = 0; i < 10; i ++)
+ debug_int_event(debug_info, 1, i);
+
+::
+
+ > cat /sys/kernel/debug/s390dbf/test/myview
+ 00 00964419734:611402 1 - 00 88042ca This error...........
+ 00 00964419734:611405 1 - 00 88042ca That error...........
+ 00 00964419734:611408 1 - 00 88042ca Problem..............
+ 00 00964419734:611411 1 - 00 88042ca Something went wrong.
+ 00 00964419734:611414 1 - 00 88042ca Everything ok........
+ 00 00964419734:611417 1 - 00 88042ca data: 00000005
+ 00 00964419734:611419 1 - 00 88042ca data: 00000006
+ 00 00964419734:611422 1 - 00 88042ca data: 00000007
+ 00 00964419734:611425 1 - 00 88042ca data: 00000008
+ 00 00964419734:611428 1 - 00 88042ca data: 00000009
diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt
deleted file mode 100644
index 61329fd62e89..000000000000
--- a/Documentation/s390/s390dbf.txt
+++ /dev/null
@@ -1,667 +0,0 @@
-S390 Debug Feature
-==================
-
-files: arch/s390/kernel/debug.c
- arch/s390/include/asm/debug.h
-
-Description:
-------------
-The goal of this feature is to provide a kernel debug logging API
-where log records can be stored efficiently in memory, where each component
-(e.g. device drivers) can have one separate debug log.
-One purpose of this is to inspect the debug logs after a production system crash
-in order to analyze the reason for the crash.
-If the system still runs but only a subcomponent which uses dbf fails,
-it is possible to look at the debug logs on a live system via the Linux
-debugfs filesystem.
-The debug feature may also very useful for kernel and driver development.
-
-Design:
--------
-Kernel components (e.g. device drivers) can register themselves at the debug
-feature with the function call debug_register(). This function initializes a
-debug log for the caller. For each debug log exists a number of debug areas
-where exactly one is active at one time. Each debug area consists of contiguous
-pages in memory. In the debug areas there are stored debug entries (log records)
-which are written by event- and exception-calls.
-
-An event-call writes the specified debug entry to the active debug
-area and updates the log pointer for the active area. If the end
-of the active debug area is reached, a wrap around is done (ring buffer)
-and the next debug entry will be written at the beginning of the active
-debug area.
-
-An exception-call writes the specified debug entry to the log and
-switches to the next debug area. This is done in order to be sure
-that the records which describe the origin of the exception are not
-overwritten when a wrap around for the current area occurs.
-
-The debug areas themselves are also ordered in form of a ring buffer.
-When an exception is thrown in the last debug area, the following debug
-entries are then written again in the very first area.
-
-There are three versions for the event- and exception-calls: One for
-logging raw data, one for text and one for numbers.
-
-Each debug entry contains the following data:
-
-- Timestamp
-- Cpu-Number of calling task
-- Level of debug entry (0...6)
-- Return Address to caller
-- Flag, if entry is an exception or not
-
-The debug logs can be inspected in a live system through entries in
-the debugfs-filesystem. Under the toplevel directory "s390dbf" there is
-a directory for each registered component, which is named like the
-corresponding component. The debugfs normally should be mounted to
-/sys/kernel/debug therefore the debug feature can be accessed under
-/sys/kernel/debug/s390dbf.
-
-The content of the directories are files which represent different views
-to the debug log. Each component can decide which views should be
-used through registering them with the function debug_register_view().
-Predefined views for hex/ascii, sprintf and raw binary data are provided.
-It is also possible to define other views. The content of
-a view can be inspected simply by reading the corresponding debugfs file.
-
-All debug logs have an actual debug level (range from 0 to 6).
-The default level is 3. Event and Exception functions have a 'level'
-parameter. Only debug entries with a level that is lower or equal
-than the actual level are written to the log. This means, when
-writing events, high priority log entries should have a low level
-value whereas low priority entries should have a high one.
-The actual debug level can be changed with the help of the debugfs-filesystem
-through writing a number string "x" to the 'level' debugfs file which is
-provided for every debug log. Debugging can be switched off completely
-by using "-" on the 'level' debugfs file.
-
-Example:
-
-> echo "-" > /sys/kernel/debug/s390dbf/dasd/level
-
-It is also possible to deactivate the debug feature globally for every
-debug log. You can change the behavior using 2 sysctl parameters in
-/proc/sys/s390dbf:
-There are currently 2 possible triggers, which stop the debug feature
-globally. The first possibility is to use the "debug_active" sysctl. If
-set to 1 the debug feature is running. If "debug_active" is set to 0 the
-debug feature is turned off.
-The second trigger which stops the debug feature is a kernel oops.
-That prevents the debug feature from overwriting debug information that
-happened before the oops. After an oops you can reactivate the debug feature
-by piping 1 to /proc/sys/s390dbf/debug_active. Nevertheless, its not
-suggested to use an oopsed kernel in a production environment.
-If you want to disallow the deactivation of the debug feature, you can use
-the "debug_stoppable" sysctl. If you set "debug_stoppable" to 0 the debug
-feature cannot be stopped. If the debug feature is already stopped, it
-will stay deactivated.
-
-Kernel Interfaces:
-------------------
-
-----------------------------------------------------------------------------
-debug_info_t *debug_register(char *name, int pages, int nr_areas,
- int buf_size);
-
-Parameter: name: Name of debug log (e.g. used for debugfs entry)
- pages: number of pages, which will be allocated per area
- nr_areas: number of debug areas
- buf_size: size of data area in each debug entry
-
-Return Value: Handle for generated debug area
- NULL if register failed
-
-Description: Allocates memory for a debug log
- Must not be called within an interrupt handler
-
-----------------------------------------------------------------------------
-debug_info_t *debug_register_mode(char *name, int pages, int nr_areas,
- int buf_size, mode_t mode, uid_t uid,
- gid_t gid);
-
-Parameter: name: Name of debug log (e.g. used for debugfs entry)
- pages: Number of pages, which will be allocated per area
- nr_areas: Number of debug areas
- buf_size: Size of data area in each debug entry
- mode: File mode for debugfs files. E.g. S_IRWXUGO
- uid: User ID for debugfs files. Currently only 0 is
- supported.
- gid: Group ID for debugfs files. Currently only 0 is
- supported.
-
-Return Value: Handle for generated debug area
- NULL if register failed
-
-Description: Allocates memory for a debug log
- Must not be called within an interrupt handler
-
----------------------------------------------------------------------------
-void debug_unregister (debug_info_t * id);
-
-Parameter: id: handle for debug log
-
-Return Value: none
-
-Description: frees memory for a debug log and removes all registered debug
- views.
- Must not be called within an interrupt handler
-
----------------------------------------------------------------------------
-void debug_set_level (debug_info_t * id, int new_level);
-
-Parameter: id: handle for debug log
- new_level: new debug level
-
-Return Value: none
-
-Description: Sets new actual debug level if new_level is valid.
-
----------------------------------------------------------------------------
-bool debug_level_enabled (debug_info_t * id, int level);
-
-Parameter: id: handle for debug log
- level: debug level
-
-Return Value: True if level is less or equal to the current debug level.
-
-Description: Returns true if debug events for the specified level would be
- logged. Otherwise returns false.
----------------------------------------------------------------------------
-void debug_stop_all(void);
-
-Parameter: none
-
-Return Value: none
-
-Description: stops the debug feature if stopping is allowed. Currently
- used in case of a kernel oops.
-
----------------------------------------------------------------------------
-debug_entry_t* debug_event (debug_info_t* id, int level, void* data,
- int length);
-
-Parameter: id: handle for debug log
- level: debug level
- data: pointer to data for debug entry
- length: length of data in bytes
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry to active debug area (if level <= actual
- debug level)
-
----------------------------------------------------------------------------
-debug_entry_t* debug_int_event (debug_info_t * id, int level,
- unsigned int data);
-debug_entry_t* debug_long_event(debug_info_t * id, int level,
- unsigned long data);
-
-Parameter: id: handle for debug log
- level: debug level
- data: integer value for debug entry
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry to active debug area (if level <= actual
- debug level)
-
----------------------------------------------------------------------------
-debug_entry_t* debug_text_event (debug_info_t * id, int level,
- const char* data);
-
-Parameter: id: handle for debug log
- level: debug level
- data: string for debug entry
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry in ascii format to active debug area
- (if level <= actual debug level)
-
----------------------------------------------------------------------------
-debug_entry_t* debug_sprintf_event (debug_info_t * id, int level,
- char* string,...);
-
-Parameter: id: handle for debug log
- level: debug level
- string: format string for debug entry
- ...: varargs used as in sprintf()
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry with format string and varargs (longs) to
- active debug area (if level $<=$ actual debug level).
- floats and long long datatypes cannot be used as varargs.
-
----------------------------------------------------------------------------
-
-debug_entry_t* debug_exception (debug_info_t* id, int level, void* data,
- int length);
-
-Parameter: id: handle for debug log
- level: debug level
- data: pointer to data for debug entry
- length: length of data in bytes
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry to active debug area (if level <= actual
- debug level) and switches to next debug area
-
----------------------------------------------------------------------------
-debug_entry_t* debug_int_exception (debug_info_t * id, int level,
- unsigned int data);
-debug_entry_t* debug_long_exception(debug_info_t * id, int level,
- unsigned long data);
-
-Parameter: id: handle for debug log
- level: debug level
- data: integer value for debug entry
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry to active debug area (if level <= actual
- debug level) and switches to next debug area
-
----------------------------------------------------------------------------
-debug_entry_t* debug_text_exception (debug_info_t * id, int level,
- const char* data);
-
-Parameter: id: handle for debug log
- level: debug level
- data: string for debug entry
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry in ascii format to active debug area
- (if level <= actual debug level) and switches to next debug
- area
-
----------------------------------------------------------------------------
-debug_entry_t* debug_sprintf_exception (debug_info_t * id, int level,
- char* string,...);
-
-Parameter: id: handle for debug log
- level: debug level
- string: format string for debug entry
- ...: varargs used as in sprintf()
-
-Return Value: Address of written debug entry
-
-Description: writes debug entry with format string and varargs (longs) to
- active debug area (if level $<=$ actual debug level) and
- switches to next debug area.
- floats and long long datatypes cannot be used as varargs.
-
----------------------------------------------------------------------------
-
-int debug_register_view (debug_info_t * id, struct debug_view *view);
-
-Parameter: id: handle for debug log
- view: pointer to debug view struct
-
-Return Value: 0 : ok
- < 0: Error
-
-Description: registers new debug view and creates debugfs dir entry
-
----------------------------------------------------------------------------
-int debug_unregister_view (debug_info_t * id, struct debug_view *view);
-
-Parameter: id: handle for debug log
- view: pointer to debug view struct
-
-Return Value: 0 : ok
- < 0: Error
-
-Description: unregisters debug view and removes debugfs dir entry
-
-
-
-Predefined views:
------------------
-
-extern struct debug_view debug_hex_ascii_view;
-extern struct debug_view debug_raw_view;
-extern struct debug_view debug_sprintf_view;
-
-Examples
---------
-
-/*
- * hex_ascii- + raw-view Example
- */
-
-#include <linux/init.h>
-#include <asm/debug.h>
-
-static debug_info_t* debug_info;
-
-static int init(void)
-{
- /* register 4 debug areas with one page each and 4 byte data field */
-
- debug_info = debug_register ("test", 1, 4, 4 );
- debug_register_view(debug_info,&debug_hex_ascii_view);
- debug_register_view(debug_info,&debug_raw_view);
-
- debug_text_event(debug_info, 4 , "one ");
- debug_int_exception(debug_info, 4, 4711);
- debug_event(debug_info, 3, &debug_info, 4);
-
- return 0;
-}
-
-static void cleanup(void)
-{
- debug_unregister (debug_info);
-}
-
-module_init(init);
-module_exit(cleanup);
-
----------------------------------------------------------------------------
-
-/*
- * sprintf-view Example
- */
-
-#include <linux/init.h>
-#include <asm/debug.h>
-
-static debug_info_t* debug_info;
-
-static int init(void)
-{
- /* register 4 debug areas with one page each and data field for */
- /* format string pointer + 2 varargs (= 3 * sizeof(long)) */
-
- debug_info = debug_register ("test", 1, 4, sizeof(long) * 3);
- debug_register_view(debug_info,&debug_sprintf_view);
-
- debug_sprintf_event(debug_info, 2 , "first event in %s:%i\n",__FILE__,__LINE__);
- debug_sprintf_exception(debug_info, 1, "pointer to debug info: %p\n",&debug_info);
-
- return 0;
-}
-
-static void cleanup(void)
-{
- debug_unregister (debug_info);
-}
-
-module_init(init);
-module_exit(cleanup);
-
-
-
-Debugfs Interface
-----------------
-Views to the debug logs can be investigated through reading the corresponding
-debugfs-files:
-
-Example:
-
-> ls /sys/kernel/debug/s390dbf/dasd
-flush hex_ascii level pages raw
-> cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s
-00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | ....
-00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE
-00 00974733272:682213 2 - 02 0006adf6 07 ea 4a 90 | ....
-00 00974733272:682281 1 * 02 0006ab08 41 4c 4c 43 | EXCP
-01 00974733272:682284 2 - 02 0006ab16 45 43 4b 44 | ECKD
-01 00974733272:682287 2 - 02 0006ab28 00 00 00 04 | ....
-01 00974733272:682289 2 - 02 0006ab3e 00 00 00 20 | ...
-01 00974733272:682297 2 - 02 0006ad7e 07 ea 4a 90 | ....
-01 00974733272:684384 2 - 00 0006ade6 46 52 45 45 | FREE
-01 00974733272:684388 2 - 00 0006adf6 07 ea 4a 90 | ....
-
-See section about predefined views for explanation of the above output!
-
-Changing the debug level
-------------------------
-
-Example:
-
-
-> cat /sys/kernel/debug/s390dbf/dasd/level
-3
-> echo "5" > /sys/kernel/debug/s390dbf/dasd/level
-> cat /sys/kernel/debug/s390dbf/dasd/level
-5
-
-Flushing debug areas
---------------------
-Debug areas can be flushed with piping the number of the desired
-area (0...n) to the debugfs file "flush". When using "-" all debug areas
-are flushed.
-
-Examples:
-
-1. Flush debug area 0:
-> echo "0" > /sys/kernel/debug/s390dbf/dasd/flush
-
-2. Flush all debug areas:
-> echo "-" > /sys/kernel/debug/s390dbf/dasd/flush
-
-Changing the size of debug areas
-------------------------------------
-It is possible the change the size of debug areas through piping
-the number of pages to the debugfs file "pages". The resize request will
-also flush the debug areas.
-
-Example:
-
-Define 4 pages for the debug areas of debug feature "dasd":
-> echo "4" > /sys/kernel/debug/s390dbf/dasd/pages
-
-Stooping the debug feature
---------------------------
-Example:
-
-1. Check if stopping is allowed
-> cat /proc/sys/s390dbf/debug_stoppable
-2. Stop debug feature
-> echo 0 > /proc/sys/s390dbf/debug_active
-
-lcrash Interface
-----------------
-It is planned that the dump analysis tool lcrash gets an additional command
-'s390dbf' to display all the debug logs. With this tool it will be possible
-to investigate the debug logs on a live system and with a memory dump after
-a system crash.
-
-Investigating raw memory
-------------------------
-One last possibility to investigate the debug logs at a live
-system and after a system crash is to look at the raw memory
-under VM or at the Service Element.
-It is possible to find the anker of the debug-logs through
-the 'debug_area_first' symbol in the System map. Then one has
-to follow the correct pointers of the data-structures defined
-in debug.h and find the debug-areas in memory.
-Normally modules which use the debug feature will also have
-a global variable with the pointer to the debug-logs. Following
-this pointer it will also be possible to find the debug logs in
-memory.
-
-For this method it is recommended to use '16 * x + 4' byte (x = 0..n)
-for the length of the data field in debug_register() in
-order to see the debug entries well formatted.
-
-
-Predefined Views
-----------------
-
-There are three predefined views: hex_ascii, raw and sprintf.
-The hex_ascii view shows the data field in hex and ascii representation
-(e.g. '45 43 4b 44 | ECKD').
-The raw view returns a bytestream as the debug areas are stored in memory.
-
-The sprintf view formats the debug entries in the same way as the sprintf
-function would do. The sprintf event/exception functions write to the
-debug entry a pointer to the format string (size = sizeof(long))
-and for each vararg a long value. So e.g. for a debug entry with a format
-string plus two varargs one would need to allocate a (3 * sizeof(long))
-byte data area in the debug_register() function.
-
-IMPORTANT: Using "%s" in sprintf event functions is dangerous. You can only
-use "%s" in the sprintf event functions, if the memory for the passed string is
-available as long as the debug feature exists. The reason behind this is that
-due to performance considerations only a pointer to the string is stored in
-the debug feature. If you log a string that is freed afterwards, you will get
-an OOPS when inspecting the debug feature, because then the debug feature will
-access the already freed memory.
-
-NOTE: If using the sprintf view do NOT use other event/exception functions
-than the sprintf-event and -exception functions.
-
-The format of the hex_ascii and sprintf view is as follows:
-- Number of area
-- Timestamp (formatted as seconds and microseconds since 00:00:00 Coordinated
- Universal Time (UTC), January 1, 1970)
-- level of debug entry
-- Exception flag (* = Exception)
-- Cpu-Number of calling task
-- Return Address to caller
-- data field
-
-The format of the raw view is:
-- Header as described in debug.h
-- datafield
-
-A typical line of the hex_ascii view will look like the following (first line
-is only for explanation and will not be displayed when 'cating' the view):
-
-area time level exception cpu caller data (hex + ascii)
---------------------------------------------------------------------------
-00 00964419409:440690 1 - 00 88023fe
-
-
-Defining views
---------------
-
-Views are specified with the 'debug_view' structure. There are defined
-callback functions which are used for reading and writing the debugfs files:
-
-struct debug_view {
- char name[DEBUG_MAX_PROCF_LEN];
- debug_prolog_proc_t* prolog_proc;
- debug_header_proc_t* header_proc;
- debug_format_proc_t* format_proc;
- debug_input_proc_t* input_proc;
- void* private_data;
-};
-
-where
-
-typedef int (debug_header_proc_t) (debug_info_t* id,
- struct debug_view* view,
- int area,
- debug_entry_t* entry,
- char* out_buf);
-
-typedef int (debug_format_proc_t) (debug_info_t* id,
- struct debug_view* view, char* out_buf,
- const char* in_buf);
-typedef int (debug_prolog_proc_t) (debug_info_t* id,
- struct debug_view* view,
- char* out_buf);
-typedef int (debug_input_proc_t) (debug_info_t* id,
- struct debug_view* view,
- struct file* file, const char* user_buf,
- size_t in_buf_size, loff_t* offset);
-
-
-The "private_data" member can be used as pointer to view specific data.
-It is not used by the debug feature itself.
-
-The output when reading a debugfs file is structured like this:
-
-"prolog_proc output"
-
-"header_proc output 1" "format_proc output 1"
-"header_proc output 2" "format_proc output 2"
-"header_proc output 3" "format_proc output 3"
-...
-
-When a view is read from the debugfs, the Debug Feature calls the
-'prolog_proc' once for writing the prolog.
-Then 'header_proc' and 'format_proc' are called for each
-existing debug entry.
-
-The input_proc can be used to implement functionality when it is written to
-the view (e.g. like with 'echo "0" > /sys/kernel/debug/s390dbf/dasd/level).
-
-For header_proc there can be used the default function
-debug_dflt_header_fn() which is defined in debug.h.
-and which produces the same header output as the predefined views.
-E.g:
-00 00964419409:440761 2 - 00 88023ec
-
-In order to see how to use the callback functions check the implementation
-of the default views!
-
-Example
-
-#include <asm/debug.h>
-
-#define UNKNOWNSTR "data: %08x"
-
-const char* messages[] =
-{"This error...........\n",
- "That error...........\n",
- "Problem..............\n",
- "Something went wrong.\n",
- "Everything ok........\n",
- NULL
-};
-
-static int debug_test_format_fn(
- debug_info_t * id, struct debug_view *view,
- char *out_buf, const char *in_buf
-)
-{
- int i, rc = 0;
-
- if(id->buf_size >= 4) {
- int msg_nr = *((int*)in_buf);
- if(msg_nr < sizeof(messages)/sizeof(char*) - 1)
- rc += sprintf(out_buf, "%s", messages[msg_nr]);
- else
- rc += sprintf(out_buf, UNKNOWNSTR, msg_nr);
- }
- out:
- return rc;
-}
-
-struct debug_view debug_test_view = {
- "myview", /* name of view */
- NULL, /* no prolog */
- &debug_dflt_header_fn, /* default header for each entry */
- &debug_test_format_fn, /* our own format function */
- NULL, /* no input function */
- NULL /* no private data */
-};
-
-=====
-test:
-=====
-debug_info_t *debug_info;
-...
-debug_info = debug_register ("test", 0, 4, 4 ));
-debug_register_view(debug_info, &debug_test_view);
-for(i = 0; i < 10; i ++) debug_int_event(debug_info, 1, i);
-
-> cat /sys/kernel/debug/s390dbf/test/myview
-00 00964419734:611402 1 - 00 88042ca This error...........
-00 00964419734:611405 1 - 00 88042ca That error...........
-00 00964419734:611408 1 - 00 88042ca Problem..............
-00 00964419734:611411 1 - 00 88042ca Something went wrong.
-00 00964419734:611414 1 - 00 88042ca Everything ok........
-00 00964419734:611417 1 - 00 88042ca data: 00000005
-00 00964419734:611419 1 - 00 88042ca data: 00000006
-00 00964419734:611422 1 - 00 88042ca data: 00000007
-00 00964419734:611425 1 - 00 88042ca data: 00000008
-00 00964419734:611428 1 - 00 88042ca data: 00000009
diff --git a/Documentation/s390/text_files.rst b/Documentation/s390/text_files.rst
new file mode 100644
index 000000000000..c94d05d4fa17
--- /dev/null
+++ b/Documentation/s390/text_files.rst
@@ -0,0 +1,11 @@
+ibm 3270 changelog
+------------------
+
+.. include:: 3270.ChangeLog
+ :literal:
+
+ibm 3270 config3270.sh
+----------------------
+
+.. literalinclude:: config3270.sh
+ :language: shell
diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst
new file mode 100644
index 000000000000..b5c51f7c748d
--- /dev/null
+++ b/Documentation/s390/vfio-ap.rst
@@ -0,0 +1,866 @@
+===============================
+Adjunct Processor (AP) facility
+===============================
+
+
+Introduction
+============
+The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised
+of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards.
+The AP devices provide cryptographic functions to all CPUs assigned to a
+linux system running in an IBM Z system LPAR.
+
+The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap
+is to make AP cards available to KVM guests using the VFIO mediated device
+framework. This implementation relies considerably on the s390 virtualization
+facilities which do most of the hard work of providing direct access to AP
+devices.
+
+AP Architectural Overview
+=========================
+To facilitate the comprehension of the design, let's start with some
+definitions:
+
+* AP adapter
+
+ An AP adapter is an IBM Z adapter card that can perform cryptographic
+ functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters
+ assigned to the LPAR in which a linux host is running will be available to
+ the linux host. Each adapter is identified by a number from 0 to 255; however,
+ the maximum adapter number is determined by machine model and/or adapter type.
+ When installed, an AP adapter is accessed by AP instructions executed by any
+ CPU.
+
+ The AP adapter cards are assigned to a given LPAR via the system's Activation
+ Profile which can be edited via the HMC. When the linux host system is IPL'd
+ in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and
+ creates a sysfs device for each assigned adapter. For example, if AP adapters
+ 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following
+ sysfs device entries::
+
+ /sys/devices/ap/card04
+ /sys/devices/ap/card0a
+
+ Symbolic links to these devices will also be created in the AP bus devices
+ sub-directory::
+
+ /sys/bus/ap/devices/[card04]
+ /sys/bus/ap/devices/[card04]
+
+* AP domain
+
+ An adapter is partitioned into domains. An adapter can hold up to 256 domains
+ depending upon the adapter type and hardware configuration. A domain is
+ identified by a number from 0 to 255; however, the maximum domain number is
+ determined by machine model and/or adapter type.. A domain can be thought of
+ as a set of hardware registers and memory used for processing AP commands. A
+ domain can be configured with a secure private key used for clear key
+ encryption. A domain is classified in one of two ways depending upon how it
+ may be accessed:
+
+ * Usage domains are domains that are targeted by an AP instruction to
+ process an AP command.
+
+ * Control domains are domains that are changed by an AP command sent to a
+ usage domain; for example, to set the secure private key for the control
+ domain.
+
+ The AP usage and control domains are assigned to a given LPAR via the system's
+ Activation Profile which can be edited via the HMC. When a linux host system
+ is IPL'd in the LPAR, the AP bus module detects the AP usage and control
+ domains assigned to the LPAR. The domain number of each usage domain and
+ adapter number of each AP adapter are combined to create AP queue devices
+ (see AP Queue section below). The domain number of each control domain will be
+ represented in a bitmask and stored in a sysfs file
+ /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least
+ significant bit, correspond to domains 0-255.
+
+* AP Queue
+
+ An AP queue is the means by which an AP command is sent to a usage domain
+ inside a specific adapter. An AP queue is identified by a tuple
+ comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
+ APQI corresponds to a given usage domain number within the adapter. This tuple
+ forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
+ instructions include a field containing the APQN to identify the AP queue to
+ which the AP command is to be sent for processing.
+
+ The AP bus will create a sysfs device for each APQN that can be derived from
+ the cross product of the AP adapter and usage domain numbers detected when the
+ AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage
+ domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the
+ following sysfs entries::
+
+ /sys/devices/ap/card04/04.0006
+ /sys/devices/ap/card04/04.0047
+ /sys/devices/ap/card0a/0a.0006
+ /sys/devices/ap/card0a/0a.0047
+
+ The following symbolic links to these devices will be created in the AP bus
+ devices subdirectory::
+
+ /sys/bus/ap/devices/[04.0006]
+ /sys/bus/ap/devices/[04.0047]
+ /sys/bus/ap/devices/[0a.0006]
+ /sys/bus/ap/devices/[0a.0047]
+
+* AP Instructions:
+
+ There are three AP instructions:
+
+ * NQAP: to enqueue an AP command-request message to a queue
+ * DQAP: to dequeue an AP command-reply message from a queue
+ * PQAP: to administer the queues
+
+ AP instructions identify the domain that is targeted to process the AP
+ command; this must be one of the usage domains. An AP command may modify a
+ domain that is not one of the usage domains, but the modified domain
+ must be one of the control domains.
+
+AP and SIE
+==========
+Let's now take a look at how AP instructions executed on a guest are interpreted
+by the hardware.
+
+A satellite control block called the Crypto Control Block (CRYCB) is attached to
+our main hardware virtualization control block. The CRYCB contains three fields
+to identify the adapters, usage domains and control domains assigned to the KVM
+guest:
+
+* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
+ to the KVM guest. Each bit in the mask, from left to right (i.e. from most
+ significant to least significant bit in big endian order), corresponds to
+ an APID from 0-255. If a bit is set, the corresponding adapter is valid for
+ use by the KVM guest.
+
+* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
+ assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from
+ most significant to least significant bit in big endian order), corresponds to
+ an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue
+ is valid for use by the KVM guest.
+
+* The AP Domain Mask field is a bit mask that identifies the AP control domains
+ assigned to the KVM guest. The ADM bit mask controls which domains can be
+ changed by an AP command-request message sent to a usage domain from the
+ guest. Each bit in the mask, from left to right (i.e. from most significant to
+ least significant bit in big endian order), corresponds to a domain from
+ 0-255. If a bit is set, the corresponding domain can be modified by an AP
+ command-request message sent to a usage domain.
+
+If you recall from the description of an AP Queue, AP instructions include
+an APQN to identify the AP queue to which an AP command-request message is to be
+sent (NQAP and PQAP instructions), or from which a command-reply message is to
+be received (DQAP instruction). The validity of an APQN is defined by the matrix
+calculated from the APM and AQM; it is the cross product of all assigned adapter
+numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1
+and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6),
+(2,5) and (2,6) will be valid for the guest.
+
+The APQNs can provide secure key functionality - i.e., a private key is stored
+on the adapter card for each of its domains - so each APQN must be assigned to
+at most one guest or to the linux host::
+
+ Example 1: Valid configuration:
+ ------------------------------
+ Guest1: adapters 1,2 domains 5,6
+ Guest2: adapter 1,2 domain 7
+
+ This is valid because both guests have a unique set of APQNs:
+ Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+ Guest2 has APQNs (1,7), (2,7)
+
+ Example 2: Valid configuration:
+ ------------------------------
+ Guest1: adapters 1,2 domains 5,6
+ Guest2: adapters 3,4 domains 5,6
+
+ This is also valid because both guests have a unique set of APQNs:
+ Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
+ Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
+
+ Example 3: Invalid configuration:
+ --------------------------------
+ Guest1: adapters 1,2 domains 5,6
+ Guest2: adapter 1 domains 6,7
+
+ This is an invalid configuration because both guests have access to
+ APQN (1,6).
+
+The Design
+==========
+The design introduces three new objects:
+
+1. AP matrix device
+2. VFIO AP device driver (vfio_ap.ko)
+3. VFIO AP mediated matrix pass-through device
+
+The VFIO AP device driver
+-------------------------
+The VFIO AP (vfio_ap) device driver serves the following purposes:
+
+1. Provides the interfaces to secure APQNs for exclusive use of KVM guests.
+
+2. Sets up the VFIO mediated device interfaces to manage a mediated matrix
+ device and creates the sysfs interfaces for assigning adapters, usage
+ domains, and control domains comprising the matrix for a KVM guest.
+
+3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's
+ SIE state description to grant the guest access to a matrix of AP devices
+
+Reserve APQNs for exclusive use of KVM guests
+---------------------------------------------
+The following block diagram illustrates the mechanism by which APQNs are
+reserved::
+
+ +------------------+
+ 7 remove | |
+ +--------------------> cex4queue driver |
+ | | |
+ | +------------------+
+ |
+ |
+ | +------------------+ +----------------+
+ | 5 register driver | | 3 create | |
+ | +----------------> Device core +----------> matrix device |
+ | | | | | |
+ | | +--------^---------+ +----------------+
+ | | |
+ | | +-------------------+
+ | | +-----------------------------------+ |
+ | | | 4 register AP driver | | 2 register device
+ | | | | |
+ +--------+---+-v---+ +--------+-------+-+
+ | | | |
+ | ap_bus +--------------------- > vfio_ap driver |
+ | | 8 probe | |
+ +--------^---------+ +--^--^------------+
+ 6 edit | | |
+ apmask | +-----------------------------+ | 9 mdev create
+ aqmask | | 1 modprobe |
+ +--------+-----+---+ +----------------+-+ +----------------+
+ | | | |8 create | mediated |
+ | admin | | VFIO device core |---------> matrix |
+ | + | | | device |
+ +------+-+---------+ +--------^---------+ +--------^-------+
+ | | | |
+ | | 9 create vfio_ap-passthrough | |
+ | +------------------------------+ |
+ +-------------------------------------------------------------+
+ 10 assign adapter/domain/control domain
+
+The process for reserving an AP queue for use by a KVM guest is:
+
+1. The administrator loads the vfio_ap device driver
+2. The vfio-ap driver during its initialization will register a single 'matrix'
+ device with the device core. This will serve as the parent device for
+ all mediated matrix devices used to configure an AP matrix for a guest.
+3. The /sys/devices/vfio_ap/matrix device is created by the device core
+4. The vfio_ap device driver will register with the AP bus for AP queue devices
+ of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap
+ driver's probe and remove callback interfaces. Devices older than CEX4 queues
+ are not supported to simplify the implementation by not needlessly
+ complicating the design by supporting older devices that will go out of
+ service in the relatively near future, and for which there are few older
+ systems around on which to test.
+5. The AP bus registers the vfio_ap device driver with the device core
+6. The administrator edits the AP adapter and queue masks to reserve AP queues
+ for use by the vfio_ap device driver.
+7. The AP bus removes the AP queues reserved for the vfio_ap driver from the
+ default zcrypt cex4queue driver.
+8. The AP bus probes the vfio_ap device driver to bind the queues reserved for
+ it.
+9. The administrator creates a passthrough type mediated matrix device to be
+ used by a guest
+10. The administrator assigns the adapters, usage domains and control domains
+ to be exclusively used by a guest.
+
+Set up the VFIO mediated device interfaces
+------------------------------------------
+The VFIO AP device driver utilizes the common interface of the VFIO mediated
+device core driver to:
+
+* Register an AP mediated bus driver to add a mediated matrix device to and
+ remove it from a VFIO group.
+* Create and destroy a mediated matrix device
+* Add a mediated matrix device to and remove it from the AP mediated bus driver
+* Add a mediated matrix device to and remove it from an IOMMU group
+
+The following high-level block diagram shows the main components and interfaces
+of the VFIO AP mediated matrix device driver::
+
+ +-------------+
+ | |
+ | +---------+ | mdev_register_driver() +--------------+
+ | | Mdev | +<-----------------------+ |
+ | | bus | | | vfio_mdev.ko |
+ | | driver | +----------------------->+ |<-> VFIO user
+ | +---------+ | probe()/remove() +--------------+ APIs
+ | |
+ | MDEV CORE |
+ | MODULE |
+ | mdev.ko |
+ | +---------+ | mdev_register_device() +--------------+
+ | |Physical | +<-----------------------+ |
+ | | device | | | vfio_ap.ko |<-> matrix
+ | |interface| +----------------------->+ | device
+ | +---------+ | callback +--------------+
+ +-------------+
+
+During initialization of the vfio_ap module, the matrix device is registered
+with an 'mdev_parent_ops' structure that provides the sysfs attribute
+structures, mdev functions and callback interfaces for managing the mediated
+matrix device.
+
+* sysfs attribute structures:
+
+ supported_type_groups
+ The VFIO mediated device framework supports creation of user-defined
+ mediated device types. These mediated device types are specified
+ via the 'supported_type_groups' structure when a device is registered
+ with the mediated device framework. The registration process creates the
+ sysfs structures for each mediated device type specified in the
+ 'mdev_supported_types' sub-directory of the device being registered. Along
+ with the device type, the sysfs attributes of the mediated device type are
+ provided.
+
+ The VFIO AP device driver will register one mediated device type for
+ passthrough devices:
+
+ /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough
+
+ Only the read-only attributes required by the VFIO mdev framework will
+ be provided::
+
+ ... name
+ ... device_api
+ ... available_instances
+ ... device_api
+
+ Where:
+
+ * name:
+ specifies the name of the mediated device type
+ * device_api:
+ the mediated device type's API
+ * available_instances:
+ the number of mediated matrix passthrough devices
+ that can be created
+ * device_api:
+ specifies the VFIO API
+ mdev_attr_groups
+ This attribute group identifies the user-defined sysfs attributes of the
+ mediated device. When a device is registered with the VFIO mediated device
+ framework, the sysfs attribute files identified in the 'mdev_attr_groups'
+ structure will be created in the mediated matrix device's directory. The
+ sysfs attributes for a mediated matrix device are:
+
+ assign_adapter / unassign_adapter:
+ Write-only attributes for assigning/unassigning an AP adapter to/from the
+ mediated matrix device. To assign/unassign an adapter, the APID of the
+ adapter is echoed to the respective attribute file.
+ assign_domain / unassign_domain:
+ Write-only attributes for assigning/unassigning an AP usage domain to/from
+ the mediated matrix device. To assign/unassign a domain, the domain
+ number of the the usage domain is echoed to the respective attribute
+ file.
+ matrix:
+ A read-only file for displaying the APQNs derived from the cross product
+ of the adapter and domain numbers assigned to the mediated matrix device.
+ assign_control_domain / unassign_control_domain:
+ Write-only attributes for assigning/unassigning an AP control domain
+ to/from the mediated matrix device. To assign/unassign a control domain,
+ the ID of the domain to be assigned/unassigned is echoed to the respective
+ attribute file.
+ control_domains:
+ A read-only file for displaying the control domain numbers assigned to the
+ mediated matrix device.
+
+* functions:
+
+ create:
+ allocates the ap_matrix_mdev structure used by the vfio_ap driver to:
+
+ * Store the reference to the KVM structure for the guest using the mdev
+ * Store the AP matrix configuration for the adapters, domains, and control
+ domains assigned via the corresponding sysfs attributes files
+
+ remove:
+ deallocates the mediated matrix device's ap_matrix_mdev structure. This will
+ be allowed only if a running guest is not using the mdev.
+
+* callback interfaces
+
+ open:
+ The vfio_ap driver uses this callback to register a
+ VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix
+ device. The open is invoked when QEMU connects the VFIO iommu group
+ for the mdev matrix device to the MDEV bus. Access to the KVM structure used
+ to configure the KVM guest is provided via this callback. The KVM structure,
+ is used to configure the guest's access to the AP matrix defined via the
+ mediated matrix device's sysfs attribute files.
+ release:
+ unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
+ mdev matrix device and deconfigures the guest's AP matrix.
+
+Configure the APM, AQM and ADM in the CRYCB
+-------------------------------------------
+Configuring the AP matrix for a KVM guest will be performed when the
+VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier
+function is called when QEMU connects to KVM. The guest's AP matrix is
+configured via it's CRYCB by:
+
+* Setting the bits in the APM corresponding to the APIDs assigned to the
+ mediated matrix device via its 'assign_adapter' interface.
+* Setting the bits in the AQM corresponding to the domains assigned to the
+ mediated matrix device via its 'assign_domain' interface.
+* Setting the bits in the ADM corresponding to the domain dIDs assigned to the
+ mediated matrix device via its 'assign_control_domains' interface.
+
+The CPU model features for AP
+-----------------------------
+The AP stack relies on the presence of the AP instructions as well as two
+facilities: The AP Facilities Test (APFT) facility; and the AP Query
+Configuration Information (QCI) facility. These features/facilities are made
+available to a KVM guest via the following CPU model features:
+
+1. ap: Indicates whether the AP instructions are installed on the guest. This
+ feature will be enabled by KVM only if the AP instructions are installed
+ on the host.
+
+2. apft: Indicates the APFT facility is available on the guest. This facility
+ can be made available to the guest only if it is available on the host (i.e.,
+ facility bit 15 is set).
+
+3. apqci: Indicates the AP QCI facility is available on the guest. This facility
+ can be made available to the guest only if it is available on the host (i.e.,
+ facility bit 12 is set).
+
+Note: If the user chooses to specify a CPU model different than the 'host'
+model to QEMU, the CPU model features and facilities need to be turned on
+explicitly; for example::
+
+ /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on
+
+A guest can be precluded from using AP features/facilities by turning them off
+explicitly; for example::
+
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off
+
+Note: If the APFT facility is turned off (apft=off) for the guest, the guest
+will not see any AP devices. The zcrypt device drivers that register for type 10
+and newer AP devices - i.e., the cex4card and cex4queue device drivers - need
+the APFT facility to ascertain the facilities installed on a given AP device. If
+the APFT facility is not installed on the guest, then the probe of device
+drivers will fail since only type 10 and newer devices can be configured for
+guest use.
+
+Example
+=======
+Let's now provide an example to illustrate how KVM guests may be given
+access to AP facilities. For this example, we will show how to configure
+three guests such that executing the lszcrypt command on the guests would
+look like this:
+
+Guest1
+------
+=========== ===== ============
+CARD.DOMAIN TYPE MODE
+=========== ===== ============
+05 CEX5C CCA-Coproc
+05.0004 CEX5C CCA-Coproc
+05.00ab CEX5C CCA-Coproc
+06 CEX5A Accelerator
+06.0004 CEX5A Accelerator
+06.00ab CEX5C CCA-Coproc
+=========== ===== ============
+
+Guest2
+------
+=========== ===== ============
+CARD.DOMAIN TYPE MODE
+=========== ===== ============
+05 CEX5A Accelerator
+05.0047 CEX5A Accelerator
+05.00ff CEX5A Accelerator
+=========== ===== ============
+
+Guest2
+------
+=========== ===== ============
+CARD.DOMAIN TYPE MODE
+=========== ===== ============
+06 CEX5A Accelerator
+06.0047 CEX5A Accelerator
+06.00ff CEX5A Accelerator
+=========== ===== ============
+
+These are the steps:
+
+1. Install the vfio_ap module on the linux host. The dependency chain for the
+ vfio_ap module is:
+ * iommu
+ * s390
+ * zcrypt
+ * vfio
+ * vfio_mdev
+ * vfio_mdev_device
+ * KVM
+
+ To build the vfio_ap module, the kernel build must be configured with the
+ following Kconfig elements selected:
+ * IOMMU_SUPPORT
+ * S390
+ * ZCRYPT
+ * S390_AP_IOMMU
+ * VFIO
+ * VFIO_MDEV
+ * VFIO_MDEV_DEVICE
+ * KVM
+
+ If using make menuconfig select the following to build the vfio_ap module::
+
+ -> Device Drivers
+ -> IOMMU Hardware Support
+ select S390 AP IOMMU Support
+ -> VFIO Non-Privileged userspace driver framework
+ -> Mediated device driver frramework
+ -> VFIO driver for Mediated devices
+ -> I/O subsystem
+ -> VFIO support for AP devices
+
+2. Secure the AP queues to be used by the three guests so that the host can not
+ access them. To secure them, there are two sysfs files that specify
+ bitmasks marking a subset of the APQN range as 'usable by the default AP
+ queue device drivers' or 'not usable by the default device drivers' and thus
+ available for use by the vfio_ap device driver'. The location of the sysfs
+ files containing the masks are::
+
+ /sys/bus/ap/apmask
+ /sys/bus/ap/aqmask
+
+ The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
+ (APID). Each bit in the mask, from left to right (i.e., from most significant
+ to least significant bit in big endian order), corresponds to an APID from
+ 0-255. If a bit is set, the APID is marked as usable only by the default AP
+ queue device drivers; otherwise, the APID is usable by the vfio_ap
+ device driver.
+
+ The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
+ (APQI). Each bit in the mask, from left to right (i.e., from most significant
+ to least significant bit in big endian order), corresponds to an APQI from
+ 0-255. If a bit is set, the APQI is marked as usable only by the default AP
+ queue device drivers; otherwise, the APQI is usable by the vfio_ap device
+ driver.
+
+ Take, for example, the following mask::
+
+ 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+
+ It indicates:
+
+ 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
+ belong to the vfio_ap device driver's pool.
+
+ The APQN of each AP queue device assigned to the linux host is checked by the
+ AP bus against the set of APQNs derived from the cross product of APIDs
+ and APQIs marked as usable only by the default AP queue device drivers. If a
+ match is detected, only the default AP queue device drivers will be probed;
+ otherwise, the vfio_ap device driver will be probed.
+
+ By default, the two masks are set to reserve all APQNs for use by the default
+ AP queue device drivers. There are two ways the default masks can be changed:
+
+ 1. The sysfs mask files can be edited by echoing a string into the
+ respective sysfs mask file in one of two formats:
+
+ * An absolute hex string starting with 0x - like "0x12345678" - sets
+ the mask. If the given string is shorter than the mask, it is padded
+ with 0s on the right; for example, specifying a mask value of 0x41 is
+ the same as specifying::
+
+ 0x4100000000000000000000000000000000000000000000000000000000000000
+
+ Keep in mind that the mask reads from left to right (i.e., most
+ significant to least significant bit in big endian order), so the mask
+ above identifies device numbers 1 and 7 (01000001).
+
+ If the string is longer than the mask, the operation is terminated with
+ an error (EINVAL).
+
+ * Individual bits in the mask can be switched on and off by specifying
+ each bit number to be switched in a comma separated list. Each bit
+ number string must be prepended with a ('+') or minus ('-') to indicate
+ the corresponding bit is to be switched on ('+') or off ('-'). Some
+ valid values are:
+
+ - "+0" switches bit 0 on
+ - "-13" switches bit 13 off
+ - "+0x41" switches bit 65 on
+ - "-0xff" switches bit 255 off
+
+ The following example:
+
+ +0,-6,+0x47,-0xf0
+
+ Switches bits 0 and 71 (0x47) on
+
+ Switches bits 6 and 240 (0xf0) off
+
+ Note that the bits not specified in the list remain as they were before
+ the operation.
+
+ 2. The masks can also be changed at boot time via parameters on the kernel
+ command line like this:
+
+ ap.apmask=0xffff ap.aqmask=0x40
+
+ This would create the following masks::
+
+ apmask:
+ 0xffff000000000000000000000000000000000000000000000000000000000000
+
+ aqmask:
+ 0x4000000000000000000000000000000000000000000000000000000000000000
+
+ Resulting in these two pools::
+
+ default drivers pool: adapter 0-15, domain 1
+ alternate drivers pool: adapter 16-255, domains 0, 2-255
+
+Securing the APQNs for our example
+----------------------------------
+ To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047,
+ 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding
+ APQNs can either be removed from the default masks::
+
+ echo -5,-6 > /sys/bus/ap/apmask
+
+ echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
+
+ Or the masks can be set as follows::
+
+ echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \
+ > apmask
+
+ echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \
+ > aqmask
+
+ This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
+ 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
+ sysfs directory for the vfio_ap device driver will now contain symbolic links
+ to the AP queue devices bound to it::
+
+ /sys/bus/ap
+ ... [drivers]
+ ...... [vfio_ap]
+ ......... [05.0004]
+ ......... [05.0047]
+ ......... [05.00ab]
+ ......... [05.00ff]
+ ......... [06.0004]
+ ......... [06.0047]
+ ......... [06.00ab]
+ ......... [06.00ff]
+
+ Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
+ can be bound to the vfio_ap device driver. The reason for this is to
+ simplify the implementation by not needlessly complicating the design by
+ supporting older devices that will go out of service in the relatively near
+ future and for which there are few older systems on which to test.
+
+ The administrator, therefore, must take care to secure only AP queues that
+ can be bound to the vfio_ap device driver. The device type for a given AP
+ queue device can be read from the parent card's sysfs directory. For example,
+ to see the hardware type of the queue 05.0004:
+
+ cat /sys/bus/ap/devices/card05/hwtype
+
+ The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
+ vfio_ap device driver.
+
+3. Create the mediated devices needed to configure the AP matrixes for the
+ three guests and to provide an interface to the vfio_ap driver for
+ use by the guests::
+
+ /sys/devices/vfio_ap/matrix/
+ --- [mdev_supported_types]
+ ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
+ --------- create
+ --------- [devices]
+
+ To create the mediated devices for the three guests::
+
+ uuidgen > create
+ uuidgen > create
+ uuidgen > create
+
+ or
+
+ echo $uuid1 > create
+ echo $uuid2 > create
+ echo $uuid3 > create
+
+ This will create three mediated devices in the [devices] subdirectory named
+ after the UUID written to the create attribute file. We call them $uuid1,
+ $uuid2 and $uuid3 and this is the sysfs directory structure after creation::
+
+ /sys/devices/vfio_ap/matrix/
+ --- [mdev_supported_types]
+ ------ [vfio_ap-passthrough]
+ --------- [devices]
+ ------------ [$uuid1]
+ --------------- assign_adapter
+ --------------- assign_control_domain
+ --------------- assign_domain
+ --------------- matrix
+ --------------- unassign_adapter
+ --------------- unassign_control_domain
+ --------------- unassign_domain
+
+ ------------ [$uuid2]
+ --------------- assign_adapter
+ --------------- assign_control_domain
+ --------------- assign_domain
+ --------------- matrix
+ --------------- unassign_adapter
+ ----------------unassign_control_domain
+ ----------------unassign_domain
+
+ ------------ [$uuid3]
+ --------------- assign_adapter
+ --------------- assign_control_domain
+ --------------- assign_domain
+ --------------- matrix
+ --------------- unassign_adapter
+ ----------------unassign_control_domain
+ ----------------unassign_domain
+
+4. The administrator now needs to configure the matrixes for the mediated
+ devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
+
+ This is how the matrix is configured for Guest1::
+
+ echo 5 > assign_adapter
+ echo 6 > assign_adapter
+ echo 4 > assign_domain
+ echo 0xab > assign_domain
+
+ Control domains can similarly be assigned using the assign_control_domain
+ sysfs file.
+
+ If a mistake is made configuring an adapter, domain or control domain,
+ you can use the unassign_xxx files to unassign the adapter, domain or
+ control domain.
+
+ To display the matrix configuration for Guest1::
+
+ cat matrix
+
+ This is how the matrix is configured for Guest2::
+
+ echo 5 > assign_adapter
+ echo 0x47 > assign_domain
+ echo 0xff > assign_domain
+
+ This is how the matrix is configured for Guest3::
+
+ echo 6 > assign_adapter
+ echo 0x47 > assign_domain
+ echo 0xff > assign_domain
+
+ In order to successfully assign an adapter:
+
+ * The adapter number specified must represent a value from 0 up to the
+ maximum adapter number configured for the system. If an adapter number
+ higher than the maximum is specified, the operation will terminate with
+ an error (ENODEV).
+
+ * All APQNs that can be derived from the adapter ID and the IDs of
+ the previously assigned domains must be bound to the vfio_ap device
+ driver. If no domains have yet been assigned, then there must be at least
+ one APQN with the specified APID bound to the vfio_ap driver. If no such
+ APQNs are bound to the driver, the operation will terminate with an
+ error (EADDRNOTAVAIL).
+
+ No APQN that can be derived from the adapter ID and the IDs of the
+ previously assigned domains can be assigned to another mediated matrix
+ device. If an APQN is assigned to another mediated matrix device, the
+ operation will terminate with an error (EADDRINUSE).
+
+ In order to successfully assign a domain:
+
+ * The domain number specified must represent a value from 0 up to the
+ maximum domain number configured for the system. If a domain number
+ higher than the maximum is specified, the operation will terminate with
+ an error (ENODEV).
+
+ * All APQNs that can be derived from the domain ID and the IDs of
+ the previously assigned adapters must be bound to the vfio_ap device
+ driver. If no domains have yet been assigned, then there must be at least
+ one APQN with the specified APQI bound to the vfio_ap driver. If no such
+ APQNs are bound to the driver, the operation will terminate with an
+ error (EADDRNOTAVAIL).
+
+ No APQN that can be derived from the domain ID and the IDs of the
+ previously assigned adapters can be assigned to another mediated matrix
+ device. If an APQN is assigned to another mediated matrix device, the
+ operation will terminate with an error (EADDRINUSE).
+
+ In order to successfully assign a control domain, the domain number
+ specified must represent a value from 0 up to the maximum domain number
+ configured for the system. If a control domain number higher than the maximum
+ is specified, the operation will terminate with an error (ENODEV).
+
+5. Start Guest1::
+
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+ -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
+
+7. Start Guest2::
+
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+ -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
+
+7. Start Guest3::
+
+ /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
+ -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
+
+When the guest is shut down, the mediated matrix devices may be removed.
+
+Using our example again, to remove the mediated matrix device $uuid1::
+
+ /sys/devices/vfio_ap/matrix/
+ --- [mdev_supported_types]
+ ------ [vfio_ap-passthrough]
+ --------- [devices]
+ ------------ [$uuid1]
+ --------------- remove
+
+::
+
+ echo 1 > remove
+
+This will remove all of the mdev matrix device's sysfs structures including
+the mdev device itself. To recreate and reconfigure the mdev matrix device,
+all of the steps starting with step 3 will have to be performed again. Note
+that the remove will fail if a guest using the mdev is still running.
+
+It is not necessary to remove an mdev matrix device, but one may want to
+remove it if no guest will use it during the remaining lifetime of the linux
+host. If the mdev matrix device is removed, one may want to also reconfigure
+the pool of adapters and queues reserved for use by the default drivers.
+
+Limitations
+===========
+* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN
+ to the default drivers pool of a queue that is still assigned to a mediated
+ device in use by a guest. It is incumbent upon the administrator to
+ ensure there is no mediated device in use by a guest to which the APQN is
+ assigned lest the host be given access to the private data of the AP queue
+ device such as a private key configured specifically for the guest.
+
+* Dynamically modifying the AP matrix for a running guest (which would amount to
+ hot(un)plug of AP devices for the guest) is currently not supported
+
+* Live guest migration is not supported for guests using AP devices.
diff --git a/Documentation/s390/vfio-ap.txt b/Documentation/s390/vfio-ap.txt
deleted file mode 100644
index 65167cfe4485..000000000000
--- a/Documentation/s390/vfio-ap.txt
+++ /dev/null
@@ -1,837 +0,0 @@
-Introduction:
-============
-The Adjunct Processor (AP) facility is an IBM Z cryptographic facility comprised
-of three AP instructions and from 1 up to 256 PCIe cryptographic adapter cards.
-The AP devices provide cryptographic functions to all CPUs assigned to a
-linux system running in an IBM Z system LPAR.
-
-The AP adapter cards are exposed via the AP bus. The motivation for vfio-ap
-is to make AP cards available to KVM guests using the VFIO mediated device
-framework. This implementation relies considerably on the s390 virtualization
-facilities which do most of the hard work of providing direct access to AP
-devices.
-
-AP Architectural Overview:
-=========================
-To facilitate the comprehension of the design, let's start with some
-definitions:
-
-* AP adapter
-
- An AP adapter is an IBM Z adapter card that can perform cryptographic
- functions. There can be from 0 to 256 adapters assigned to an LPAR. Adapters
- assigned to the LPAR in which a linux host is running will be available to
- the linux host. Each adapter is identified by a number from 0 to 255; however,
- the maximum adapter number is determined by machine model and/or adapter type.
- When installed, an AP adapter is accessed by AP instructions executed by any
- CPU.
-
- The AP adapter cards are assigned to a given LPAR via the system's Activation
- Profile which can be edited via the HMC. When the linux host system is IPL'd
- in the LPAR, the AP bus detects the AP adapter cards assigned to the LPAR and
- creates a sysfs device for each assigned adapter. For example, if AP adapters
- 4 and 10 (0x0a) are assigned to the LPAR, the AP bus will create the following
- sysfs device entries:
-
- /sys/devices/ap/card04
- /sys/devices/ap/card0a
-
- Symbolic links to these devices will also be created in the AP bus devices
- sub-directory:
-
- /sys/bus/ap/devices/[card04]
- /sys/bus/ap/devices/[card04]
-
-* AP domain
-
- An adapter is partitioned into domains. An adapter can hold up to 256 domains
- depending upon the adapter type and hardware configuration. A domain is
- identified by a number from 0 to 255; however, the maximum domain number is
- determined by machine model and/or adapter type.. A domain can be thought of
- as a set of hardware registers and memory used for processing AP commands. A
- domain can be configured with a secure private key used for clear key
- encryption. A domain is classified in one of two ways depending upon how it
- may be accessed:
-
- * Usage domains are domains that are targeted by an AP instruction to
- process an AP command.
-
- * Control domains are domains that are changed by an AP command sent to a
- usage domain; for example, to set the secure private key for the control
- domain.
-
- The AP usage and control domains are assigned to a given LPAR via the system's
- Activation Profile which can be edited via the HMC. When a linux host system
- is IPL'd in the LPAR, the AP bus module detects the AP usage and control
- domains assigned to the LPAR. The domain number of each usage domain and
- adapter number of each AP adapter are combined to create AP queue devices
- (see AP Queue section below). The domain number of each control domain will be
- represented in a bitmask and stored in a sysfs file
- /sys/bus/ap/ap_control_domain_mask. The bits in the mask, from most to least
- significant bit, correspond to domains 0-255.
-
-* AP Queue
-
- An AP queue is the means by which an AP command is sent to a usage domain
- inside a specific adapter. An AP queue is identified by a tuple
- comprised of an AP adapter ID (APID) and an AP queue index (APQI). The
- APQI corresponds to a given usage domain number within the adapter. This tuple
- forms an AP Queue Number (APQN) uniquely identifying an AP queue. AP
- instructions include a field containing the APQN to identify the AP queue to
- which the AP command is to be sent for processing.
-
- The AP bus will create a sysfs device for each APQN that can be derived from
- the cross product of the AP adapter and usage domain numbers detected when the
- AP bus module is loaded. For example, if adapters 4 and 10 (0x0a) and usage
- domains 6 and 71 (0x47) are assigned to the LPAR, the AP bus will create the
- following sysfs entries:
-
- /sys/devices/ap/card04/04.0006
- /sys/devices/ap/card04/04.0047
- /sys/devices/ap/card0a/0a.0006
- /sys/devices/ap/card0a/0a.0047
-
- The following symbolic links to these devices will be created in the AP bus
- devices subdirectory:
-
- /sys/bus/ap/devices/[04.0006]
- /sys/bus/ap/devices/[04.0047]
- /sys/bus/ap/devices/[0a.0006]
- /sys/bus/ap/devices/[0a.0047]
-
-* AP Instructions:
-
- There are three AP instructions:
-
- * NQAP: to enqueue an AP command-request message to a queue
- * DQAP: to dequeue an AP command-reply message from a queue
- * PQAP: to administer the queues
-
- AP instructions identify the domain that is targeted to process the AP
- command; this must be one of the usage domains. An AP command may modify a
- domain that is not one of the usage domains, but the modified domain
- must be one of the control domains.
-
-AP and SIE:
-==========
-Let's now take a look at how AP instructions executed on a guest are interpreted
-by the hardware.
-
-A satellite control block called the Crypto Control Block (CRYCB) is attached to
-our main hardware virtualization control block. The CRYCB contains three fields
-to identify the adapters, usage domains and control domains assigned to the KVM
-guest:
-
-* The AP Mask (APM) field is a bit mask that identifies the AP adapters assigned
- to the KVM guest. Each bit in the mask, from left to right (i.e. from most
- significant to least significant bit in big endian order), corresponds to
- an APID from 0-255. If a bit is set, the corresponding adapter is valid for
- use by the KVM guest.
-
-* The AP Queue Mask (AQM) field is a bit mask identifying the AP usage domains
- assigned to the KVM guest. Each bit in the mask, from left to right (i.e. from
- most significant to least significant bit in big endian order), corresponds to
- an AP queue index (APQI) from 0-255. If a bit is set, the corresponding queue
- is valid for use by the KVM guest.
-
-* The AP Domain Mask field is a bit mask that identifies the AP control domains
- assigned to the KVM guest. The ADM bit mask controls which domains can be
- changed by an AP command-request message sent to a usage domain from the
- guest. Each bit in the mask, from left to right (i.e. from most significant to
- least significant bit in big endian order), corresponds to a domain from
- 0-255. If a bit is set, the corresponding domain can be modified by an AP
- command-request message sent to a usage domain.
-
-If you recall from the description of an AP Queue, AP instructions include
-an APQN to identify the AP queue to which an AP command-request message is to be
-sent (NQAP and PQAP instructions), or from which a command-reply message is to
-be received (DQAP instruction). The validity of an APQN is defined by the matrix
-calculated from the APM and AQM; it is the cross product of all assigned adapter
-numbers (APM) with all assigned queue indexes (AQM). For example, if adapters 1
-and 2 and usage domains 5 and 6 are assigned to a guest, the APQNs (1,5), (1,6),
-(2,5) and (2,6) will be valid for the guest.
-
-The APQNs can provide secure key functionality - i.e., a private key is stored
-on the adapter card for each of its domains - so each APQN must be assigned to
-at most one guest or to the linux host.
-
- Example 1: Valid configuration:
- ------------------------------
- Guest1: adapters 1,2 domains 5,6
- Guest2: adapter 1,2 domain 7
-
- This is valid because both guests have a unique set of APQNs:
- Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
- Guest2 has APQNs (1,7), (2,7)
-
- Example 2: Valid configuration:
- ------------------------------
- Guest1: adapters 1,2 domains 5,6
- Guest2: adapters 3,4 domains 5,6
-
- This is also valid because both guests have a unique set of APQNs:
- Guest1 has APQNs (1,5), (1,6), (2,5), (2,6);
- Guest2 has APQNs (3,5), (3,6), (4,5), (4,6)
-
- Example 3: Invalid configuration:
- --------------------------------
- Guest1: adapters 1,2 domains 5,6
- Guest2: adapter 1 domains 6,7
-
- This is an invalid configuration because both guests have access to
- APQN (1,6).
-
-The Design:
-===========
-The design introduces three new objects:
-
-1. AP matrix device
-2. VFIO AP device driver (vfio_ap.ko)
-3. VFIO AP mediated matrix pass-through device
-
-The VFIO AP device driver
--------------------------
-The VFIO AP (vfio_ap) device driver serves the following purposes:
-
-1. Provides the interfaces to secure APQNs for exclusive use of KVM guests.
-
-2. Sets up the VFIO mediated device interfaces to manage a mediated matrix
- device and creates the sysfs interfaces for assigning adapters, usage
- domains, and control domains comprising the matrix for a KVM guest.
-
-3. Configures the APM, AQM and ADM in the CRYCB referenced by a KVM guest's
- SIE state description to grant the guest access to a matrix of AP devices
-
-Reserve APQNs for exclusive use of KVM guests
----------------------------------------------
-The following block diagram illustrates the mechanism by which APQNs are
-reserved:
-
- +------------------+
- 7 remove | |
- +--------------------> cex4queue driver |
- | | |
- | +------------------+
- |
- |
- | +------------------+ +-----------------+
- | 5 register driver | | 3 create | |
- | +----------------> Device core +----------> matrix device |
- | | | | | |
- | | +--------^---------+ +-----------------+
- | | |
- | | +-------------------+
- | | +-----------------------------------+ |
- | | | 4 register AP driver | | 2 register device
- | | | | |
-+--------+---+-v---+ +--------+-------+-+
-| | | |
-| ap_bus +--------------------- > vfio_ap driver |
-| | 8 probe | |
-+--------^---------+ +--^--^------------+
-6 edit | | |
- apmask | +-----------------------------+ | 9 mdev create
- aqmask | | 1 modprobe |
-+--------+-----+---+ +----------------+-+ +------------------+
-| | | |8 create | mediated |
-| admin | | VFIO device core |---------> matrix |
-| + | | | device |
-+------+-+---------+ +--------^---------+ +--------^---------+
- | | | |
- | | 9 create vfio_ap-passthrough | |
- | +------------------------------+ |
- +-------------------------------------------------------------+
- 10 assign adapter/domain/control domain
-
-The process for reserving an AP queue for use by a KVM guest is:
-
-1. The administrator loads the vfio_ap device driver
-2. The vfio-ap driver during its initialization will register a single 'matrix'
- device with the device core. This will serve as the parent device for
- all mediated matrix devices used to configure an AP matrix for a guest.
-3. The /sys/devices/vfio_ap/matrix device is created by the device core
-4 The vfio_ap device driver will register with the AP bus for AP queue devices
- of type 10 and higher (CEX4 and newer). The driver will provide the vfio_ap
- driver's probe and remove callback interfaces. Devices older than CEX4 queues
- are not supported to simplify the implementation by not needlessly
- complicating the design by supporting older devices that will go out of
- service in the relatively near future, and for which there are few older
- systems around on which to test.
-5. The AP bus registers the vfio_ap device driver with the device core
-6. The administrator edits the AP adapter and queue masks to reserve AP queues
- for use by the vfio_ap device driver.
-7. The AP bus removes the AP queues reserved for the vfio_ap driver from the
- default zcrypt cex4queue driver.
-8. The AP bus probes the vfio_ap device driver to bind the queues reserved for
- it.
-9. The administrator creates a passthrough type mediated matrix device to be
- used by a guest
-10 The administrator assigns the adapters, usage domains and control domains
- to be exclusively used by a guest.
-
-Set up the VFIO mediated device interfaces
-------------------------------------------
-The VFIO AP device driver utilizes the common interface of the VFIO mediated
-device core driver to:
-* Register an AP mediated bus driver to add a mediated matrix device to and
- remove it from a VFIO group.
-* Create and destroy a mediated matrix device
-* Add a mediated matrix device to and remove it from the AP mediated bus driver
-* Add a mediated matrix device to and remove it from an IOMMU group
-
-The following high-level block diagram shows the main components and interfaces
-of the VFIO AP mediated matrix device driver:
-
- +-------------+
- | |
- | +---------+ | mdev_register_driver() +--------------+
- | | Mdev | +<-----------------------+ |
- | | bus | | | vfio_mdev.ko |
- | | driver | +----------------------->+ |<-> VFIO user
- | +---------+ | probe()/remove() +--------------+ APIs
- | |
- | MDEV CORE |
- | MODULE |
- | mdev.ko |
- | +---------+ | mdev_register_device() +--------------+
- | |Physical | +<-----------------------+ |
- | | device | | | vfio_ap.ko |<-> matrix
- | |interface| +----------------------->+ | device
- | +---------+ | callback +--------------+
- +-------------+
-
-During initialization of the vfio_ap module, the matrix device is registered
-with an 'mdev_parent_ops' structure that provides the sysfs attribute
-structures, mdev functions and callback interfaces for managing the mediated
-matrix device.
-
-* sysfs attribute structures:
- * supported_type_groups
- The VFIO mediated device framework supports creation of user-defined
- mediated device types. These mediated device types are specified
- via the 'supported_type_groups' structure when a device is registered
- with the mediated device framework. The registration process creates the
- sysfs structures for each mediated device type specified in the
- 'mdev_supported_types' sub-directory of the device being registered. Along
- with the device type, the sysfs attributes of the mediated device type are
- provided.
-
- The VFIO AP device driver will register one mediated device type for
- passthrough devices:
- /sys/devices/vfio_ap/matrix/mdev_supported_types/vfio_ap-passthrough
- Only the read-only attributes required by the VFIO mdev framework will
- be provided:
- ... name
- ... device_api
- ... available_instances
- ... device_api
- Where:
- * name: specifies the name of the mediated device type
- * device_api: the mediated device type's API
- * available_instances: the number of mediated matrix passthrough devices
- that can be created
- * device_api: specifies the VFIO API
- * mdev_attr_groups
- This attribute group identifies the user-defined sysfs attributes of the
- mediated device. When a device is registered with the VFIO mediated device
- framework, the sysfs attribute files identified in the 'mdev_attr_groups'
- structure will be created in the mediated matrix device's directory. The
- sysfs attributes for a mediated matrix device are:
- * assign_adapter:
- * unassign_adapter:
- Write-only attributes for assigning/unassigning an AP adapter to/from the
- mediated matrix device. To assign/unassign an adapter, the APID of the
- adapter is echoed to the respective attribute file.
- * assign_domain:
- * unassign_domain:
- Write-only attributes for assigning/unassigning an AP usage domain to/from
- the mediated matrix device. To assign/unassign a domain, the domain
- number of the the usage domain is echoed to the respective attribute
- file.
- * matrix:
- A read-only file for displaying the APQNs derived from the cross product
- of the adapter and domain numbers assigned to the mediated matrix device.
- * assign_control_domain:
- * unassign_control_domain:
- Write-only attributes for assigning/unassigning an AP control domain
- to/from the mediated matrix device. To assign/unassign a control domain,
- the ID of the domain to be assigned/unassigned is echoed to the respective
- attribute file.
- * control_domains:
- A read-only file for displaying the control domain numbers assigned to the
- mediated matrix device.
-
-* functions:
- * create:
- allocates the ap_matrix_mdev structure used by the vfio_ap driver to:
- * Store the reference to the KVM structure for the guest using the mdev
- * Store the AP matrix configuration for the adapters, domains, and control
- domains assigned via the corresponding sysfs attributes files
- * remove:
- deallocates the mediated matrix device's ap_matrix_mdev structure. This will
- be allowed only if a running guest is not using the mdev.
-
-* callback interfaces
- * open:
- The vfio_ap driver uses this callback to register a
- VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the mdev matrix
- device. The open is invoked when QEMU connects the VFIO iommu group
- for the mdev matrix device to the MDEV bus. Access to the KVM structure used
- to configure the KVM guest is provided via this callback. The KVM structure,
- is used to configure the guest's access to the AP matrix defined via the
- mediated matrix device's sysfs attribute files.
- * release:
- unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the
- mdev matrix device and deconfigures the guest's AP matrix.
-
-Configure the APM, AQM and ADM in the CRYCB:
--------------------------------------------
-Configuring the AP matrix for a KVM guest will be performed when the
-VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier
-function is called when QEMU connects to KVM. The guest's AP matrix is
-configured via it's CRYCB by:
-* Setting the bits in the APM corresponding to the APIDs assigned to the
- mediated matrix device via its 'assign_adapter' interface.
-* Setting the bits in the AQM corresponding to the domains assigned to the
- mediated matrix device via its 'assign_domain' interface.
-* Setting the bits in the ADM corresponding to the domain dIDs assigned to the
- mediated matrix device via its 'assign_control_domains' interface.
-
-The CPU model features for AP
------------------------------
-The AP stack relies on the presence of the AP instructions as well as two
-facilities: The AP Facilities Test (APFT) facility; and the AP Query
-Configuration Information (QCI) facility. These features/facilities are made
-available to a KVM guest via the following CPU model features:
-
-1. ap: Indicates whether the AP instructions are installed on the guest. This
- feature will be enabled by KVM only if the AP instructions are installed
- on the host.
-
-2. apft: Indicates the APFT facility is available on the guest. This facility
- can be made available to the guest only if it is available on the host (i.e.,
- facility bit 15 is set).
-
-3. apqci: Indicates the AP QCI facility is available on the guest. This facility
- can be made available to the guest only if it is available on the host (i.e.,
- facility bit 12 is set).
-
-Note: If the user chooses to specify a CPU model different than the 'host'
-model to QEMU, the CPU model features and facilities need to be turned on
-explicitly; for example:
-
- /usr/bin/qemu-system-s390x ... -cpu z13,ap=on,apqci=on,apft=on
-
-A guest can be precluded from using AP features/facilities by turning them off
-explicitly; for example:
-
- /usr/bin/qemu-system-s390x ... -cpu host,ap=off,apqci=off,apft=off
-
-Note: If the APFT facility is turned off (apft=off) for the guest, the guest
-will not see any AP devices. The zcrypt device drivers that register for type 10
-and newer AP devices - i.e., the cex4card and cex4queue device drivers - need
-the APFT facility to ascertain the facilities installed on a given AP device. If
-the APFT facility is not installed on the guest, then the probe of device
-drivers will fail since only type 10 and newer devices can be configured for
-guest use.
-
-Example:
-=======
-Let's now provide an example to illustrate how KVM guests may be given
-access to AP facilities. For this example, we will show how to configure
-three guests such that executing the lszcrypt command on the guests would
-look like this:
-
-Guest1
-------
-CARD.DOMAIN TYPE MODE
-------------------------------
-05 CEX5C CCA-Coproc
-05.0004 CEX5C CCA-Coproc
-05.00ab CEX5C CCA-Coproc
-06 CEX5A Accelerator
-06.0004 CEX5A Accelerator
-06.00ab CEX5C CCA-Coproc
-
-Guest2
-------
-CARD.DOMAIN TYPE MODE
-------------------------------
-05 CEX5A Accelerator
-05.0047 CEX5A Accelerator
-05.00ff CEX5A Accelerator
-
-Guest2
-------
-CARD.DOMAIN TYPE MODE
-------------------------------
-06 CEX5A Accelerator
-06.0047 CEX5A Accelerator
-06.00ff CEX5A Accelerator
-
-These are the steps:
-
-1. Install the vfio_ap module on the linux host. The dependency chain for the
- vfio_ap module is:
- * iommu
- * s390
- * zcrypt
- * vfio
- * vfio_mdev
- * vfio_mdev_device
- * KVM
-
- To build the vfio_ap module, the kernel build must be configured with the
- following Kconfig elements selected:
- * IOMMU_SUPPORT
- * S390
- * ZCRYPT
- * S390_AP_IOMMU
- * VFIO
- * VFIO_MDEV
- * VFIO_MDEV_DEVICE
- * KVM
-
- If using make menuconfig select the following to build the vfio_ap module:
- -> Device Drivers
- -> IOMMU Hardware Support
- select S390 AP IOMMU Support
- -> VFIO Non-Privileged userspace driver framework
- -> Mediated device driver frramework
- -> VFIO driver for Mediated devices
- -> I/O subsystem
- -> VFIO support for AP devices
-
-2. Secure the AP queues to be used by the three guests so that the host can not
- access them. To secure them, there are two sysfs files that specify
- bitmasks marking a subset of the APQN range as 'usable by the default AP
- queue device drivers' or 'not usable by the default device drivers' and thus
- available for use by the vfio_ap device driver'. The location of the sysfs
- files containing the masks are:
-
- /sys/bus/ap/apmask
- /sys/bus/ap/aqmask
-
- The 'apmask' is a 256-bit mask that identifies a set of AP adapter IDs
- (APID). Each bit in the mask, from left to right (i.e., from most significant
- to least significant bit in big endian order), corresponds to an APID from
- 0-255. If a bit is set, the APID is marked as usable only by the default AP
- queue device drivers; otherwise, the APID is usable by the vfio_ap
- device driver.
-
- The 'aqmask' is a 256-bit mask that identifies a set of AP queue indexes
- (APQI). Each bit in the mask, from left to right (i.e., from most significant
- to least significant bit in big endian order), corresponds to an APQI from
- 0-255. If a bit is set, the APQI is marked as usable only by the default AP
- queue device drivers; otherwise, the APQI is usable by the vfio_ap device
- driver.
-
- Take, for example, the following mask:
-
- 0x7dffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
-
- It indicates:
-
- 1, 2, 3, 4, 5, and 7-255 belong to the default drivers' pool, and 0 and 6
- belong to the vfio_ap device driver's pool.
-
- The APQN of each AP queue device assigned to the linux host is checked by the
- AP bus against the set of APQNs derived from the cross product of APIDs
- and APQIs marked as usable only by the default AP queue device drivers. If a
- match is detected, only the default AP queue device drivers will be probed;
- otherwise, the vfio_ap device driver will be probed.
-
- By default, the two masks are set to reserve all APQNs for use by the default
- AP queue device drivers. There are two ways the default masks can be changed:
-
- 1. The sysfs mask files can be edited by echoing a string into the
- respective sysfs mask file in one of two formats:
-
- * An absolute hex string starting with 0x - like "0x12345678" - sets
- the mask. If the given string is shorter than the mask, it is padded
- with 0s on the right; for example, specifying a mask value of 0x41 is
- the same as specifying:
-
- 0x4100000000000000000000000000000000000000000000000000000000000000
-
- Keep in mind that the mask reads from left to right (i.e., most
- significant to least significant bit in big endian order), so the mask
- above identifies device numbers 1 and 7 (01000001).
-
- If the string is longer than the mask, the operation is terminated with
- an error (EINVAL).
-
- * Individual bits in the mask can be switched on and off by specifying
- each bit number to be switched in a comma separated list. Each bit
- number string must be prepended with a ('+') or minus ('-') to indicate
- the corresponding bit is to be switched on ('+') or off ('-'). Some
- valid values are:
-
- "+0" switches bit 0 on
- "-13" switches bit 13 off
- "+0x41" switches bit 65 on
- "-0xff" switches bit 255 off
-
- The following example:
- +0,-6,+0x47,-0xf0
-
- Switches bits 0 and 71 (0x47) on
- Switches bits 6 and 240 (0xf0) off
-
- Note that the bits not specified in the list remain as they were before
- the operation.
-
- 2. The masks can also be changed at boot time via parameters on the kernel
- command line like this:
-
- ap.apmask=0xffff ap.aqmask=0x40
-
- This would create the following masks:
-
- apmask:
- 0xffff000000000000000000000000000000000000000000000000000000000000
-
- aqmask:
- 0x4000000000000000000000000000000000000000000000000000000000000000
-
- Resulting in these two pools:
-
- default drivers pool: adapter 0-15, domain 1
- alternate drivers pool: adapter 16-255, domains 0, 2-255
-
- Securing the APQNs for our example:
- ----------------------------------
- To secure the AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004, 06.0047,
- 06.00ab, and 06.00ff for use by the vfio_ap device driver, the corresponding
- APQNs can either be removed from the default masks:
-
- echo -5,-6 > /sys/bus/ap/apmask
-
- echo -4,-0x47,-0xab,-0xff > /sys/bus/ap/aqmask
-
- Or the masks can be set as follows:
-
- echo 0xf9ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff \
- > apmask
-
- echo 0xf7fffffffffffffffeffffffffffffffffffffffffeffffffffffffffffffffe \
- > aqmask
-
- This will result in AP queues 05.0004, 05.0047, 05.00ab, 05.00ff, 06.0004,
- 06.0047, 06.00ab, and 06.00ff getting bound to the vfio_ap device driver. The
- sysfs directory for the vfio_ap device driver will now contain symbolic links
- to the AP queue devices bound to it:
-
- /sys/bus/ap
- ... [drivers]
- ...... [vfio_ap]
- ......... [05.0004]
- ......... [05.0047]
- ......... [05.00ab]
- ......... [05.00ff]
- ......... [06.0004]
- ......... [06.0047]
- ......... [06.00ab]
- ......... [06.00ff]
-
- Keep in mind that only type 10 and newer adapters (i.e., CEX4 and later)
- can be bound to the vfio_ap device driver. The reason for this is to
- simplify the implementation by not needlessly complicating the design by
- supporting older devices that will go out of service in the relatively near
- future and for which there are few older systems on which to test.
-
- The administrator, therefore, must take care to secure only AP queues that
- can be bound to the vfio_ap device driver. The device type for a given AP
- queue device can be read from the parent card's sysfs directory. For example,
- to see the hardware type of the queue 05.0004:
-
- cat /sys/bus/ap/devices/card05/hwtype
-
- The hwtype must be 10 or higher (CEX4 or newer) in order to be bound to the
- vfio_ap device driver.
-
-3. Create the mediated devices needed to configure the AP matrixes for the
- three guests and to provide an interface to the vfio_ap driver for
- use by the guests:
-
- /sys/devices/vfio_ap/matrix/
- --- [mdev_supported_types]
- ------ [vfio_ap-passthrough] (passthrough mediated matrix device type)
- --------- create
- --------- [devices]
-
- To create the mediated devices for the three guests:
-
- uuidgen > create
- uuidgen > create
- uuidgen > create
-
- or
-
- echo $uuid1 > create
- echo $uuid2 > create
- echo $uuid3 > create
-
- This will create three mediated devices in the [devices] subdirectory named
- after the UUID written to the create attribute file. We call them $uuid1,
- $uuid2 and $uuid3 and this is the sysfs directory structure after creation:
-
- /sys/devices/vfio_ap/matrix/
- --- [mdev_supported_types]
- ------ [vfio_ap-passthrough]
- --------- [devices]
- ------------ [$uuid1]
- --------------- assign_adapter
- --------------- assign_control_domain
- --------------- assign_domain
- --------------- matrix
- --------------- unassign_adapter
- --------------- unassign_control_domain
- --------------- unassign_domain
-
- ------------ [$uuid2]
- --------------- assign_adapter
- --------------- assign_control_domain
- --------------- assign_domain
- --------------- matrix
- --------------- unassign_adapter
- ----------------unassign_control_domain
- ----------------unassign_domain
-
- ------------ [$uuid3]
- --------------- assign_adapter
- --------------- assign_control_domain
- --------------- assign_domain
- --------------- matrix
- --------------- unassign_adapter
- ----------------unassign_control_domain
- ----------------unassign_domain
-
-4. The administrator now needs to configure the matrixes for the mediated
- devices $uuid1 (for Guest1), $uuid2 (for Guest2) and $uuid3 (for Guest3).
-
- This is how the matrix is configured for Guest1:
-
- echo 5 > assign_adapter
- echo 6 > assign_adapter
- echo 4 > assign_domain
- echo 0xab > assign_domain
-
- Control domains can similarly be assigned using the assign_control_domain
- sysfs file.
-
- If a mistake is made configuring an adapter, domain or control domain,
- you can use the unassign_xxx files to unassign the adapter, domain or
- control domain.
-
- To display the matrix configuration for Guest1:
-
- cat matrix
-
- This is how the matrix is configured for Guest2:
-
- echo 5 > assign_adapter
- echo 0x47 > assign_domain
- echo 0xff > assign_domain
-
- This is how the matrix is configured for Guest3:
-
- echo 6 > assign_adapter
- echo 0x47 > assign_domain
- echo 0xff > assign_domain
-
- In order to successfully assign an adapter:
-
- * The adapter number specified must represent a value from 0 up to the
- maximum adapter number configured for the system. If an adapter number
- higher than the maximum is specified, the operation will terminate with
- an error (ENODEV).
-
- * All APQNs that can be derived from the adapter ID and the IDs of
- the previously assigned domains must be bound to the vfio_ap device
- driver. If no domains have yet been assigned, then there must be at least
- one APQN with the specified APID bound to the vfio_ap driver. If no such
- APQNs are bound to the driver, the operation will terminate with an
- error (EADDRNOTAVAIL).
-
- No APQN that can be derived from the adapter ID and the IDs of the
- previously assigned domains can be assigned to another mediated matrix
- device. If an APQN is assigned to another mediated matrix device, the
- operation will terminate with an error (EADDRINUSE).
-
- In order to successfully assign a domain:
-
- * The domain number specified must represent a value from 0 up to the
- maximum domain number configured for the system. If a domain number
- higher than the maximum is specified, the operation will terminate with
- an error (ENODEV).
-
- * All APQNs that can be derived from the domain ID and the IDs of
- the previously assigned adapters must be bound to the vfio_ap device
- driver. If no domains have yet been assigned, then there must be at least
- one APQN with the specified APQI bound to the vfio_ap driver. If no such
- APQNs are bound to the driver, the operation will terminate with an
- error (EADDRNOTAVAIL).
-
- No APQN that can be derived from the domain ID and the IDs of the
- previously assigned adapters can be assigned to another mediated matrix
- device. If an APQN is assigned to another mediated matrix device, the
- operation will terminate with an error (EADDRINUSE).
-
- In order to successfully assign a control domain, the domain number
- specified must represent a value from 0 up to the maximum domain number
- configured for the system. If a control domain number higher than the maximum
- is specified, the operation will terminate with an error (ENODEV).
-
-5. Start Guest1:
-
- /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
- -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid1 ...
-
-7. Start Guest2:
-
- /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
- -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid2 ...
-
-7. Start Guest3:
-
- /usr/bin/qemu-system-s390x ... -cpu host,ap=on,apqci=on,apft=on \
- -device vfio-ap,sysfsdev=/sys/devices/vfio_ap/matrix/$uuid3 ...
-
-When the guest is shut down, the mediated matrix devices may be removed.
-
-Using our example again, to remove the mediated matrix device $uuid1:
-
- /sys/devices/vfio_ap/matrix/
- --- [mdev_supported_types]
- ------ [vfio_ap-passthrough]
- --------- [devices]
- ------------ [$uuid1]
- --------------- remove
-
-
- echo 1 > remove
-
- This will remove all of the mdev matrix device's sysfs structures including
- the mdev device itself. To recreate and reconfigure the mdev matrix device,
- all of the steps starting with step 3 will have to be performed again. Note
- that the remove will fail if a guest using the mdev is still running.
-
- It is not necessary to remove an mdev matrix device, but one may want to
- remove it if no guest will use it during the remaining lifetime of the linux
- host. If the mdev matrix device is removed, one may want to also reconfigure
- the pool of adapters and queues reserved for use by the default drivers.
-
-Limitations
-===========
-* The KVM/kernel interfaces do not provide a way to prevent restoring an APQN
- to the default drivers pool of a queue that is still assigned to a mediated
- device in use by a guest. It is incumbent upon the administrator to
- ensure there is no mediated device in use by a guest to which the APQN is
- assigned lest the host be given access to the private data of the AP queue
- device such as a private key configured specifically for the guest.
-
-* Dynamically modifying the AP matrix for a running guest (which would amount to
- hot(un)plug of AP devices for the guest) is currently not supported
-
-* Live guest migration is not supported for guests using AP devices.
diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst
new file mode 100644
index 000000000000..1e210c6afa88
--- /dev/null
+++ b/Documentation/s390/vfio-ccw.rst
@@ -0,0 +1,326 @@
+==================================
+vfio-ccw: the basic infrastructure
+==================================
+
+Introduction
+------------
+
+Here we describe the vfio support for I/O subchannel devices for
+Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a
+virtual machine, while vfio is the means.
+
+Different than other hardware architectures, s390 has defined a unified
+I/O access method, which is so called Channel I/O. It has its own access
+patterns:
+
+- Channel programs run asynchronously on a separate (co)processor.
+- The channel subsystem will access any memory designated by the caller
+ in the channel program directly, i.e. there is no iommu involved.
+
+Thus when we introduce vfio support for these devices, we realize it
+with a mediated device (mdev) implementation. The vfio mdev will be
+added to an iommu group, so as to make itself able to be managed by the
+vfio framework. And we add read/write callbacks for special vfio I/O
+regions to pass the channel programs from the mdev to its parent device
+(the real I/O subchannel device) to do further address translation and
+to perform I/O instructions.
+
+This document does not intend to explain the s390 I/O architecture in
+every detail. More information/reference could be found here:
+
+- A good start to know Channel I/O in general:
+ https://en.wikipedia.org/wiki/Channel_I/O
+- s390 architecture:
+ s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+- The existing QEMU code which implements a simple emulated channel
+ subsystem could also be a good reference. It makes it easier to follow
+ the flow.
+ qemu/hw/s390x/css.c
+
+For vfio mediated device framework:
+- Documentation/driver-api/vfio-mediated-device.rst
+
+Motivation of vfio-ccw
+----------------------
+
+Typically, a guest virtualized via QEMU/KVM on s390 only sees
+paravirtualized virtio devices via the "Virtio Over Channel I/O
+(virtio-ccw)" transport. This makes virtio devices discoverable via
+standard operating system algorithms for handling channel devices.
+
+However this is not enough. On s390 for the majority of devices, which
+use the standard Channel I/O based mechanism, we also need to provide
+the functionality of passing through them to a QEMU virtual machine.
+This includes devices that don't have a virtio counterpart (e.g. tape
+drives) or that have specific characteristics which guests want to
+exploit.
+
+For passing a device to a guest, we want to use the same interface as
+everybody else, namely vfio. We implement this vfio support for channel
+devices via the vfio mediated device framework and the subchannel device
+driver "vfio_ccw".
+
+Access patterns of CCW devices
+------------------------------
+
+s390 architecture has implemented a so called channel subsystem, that
+provides a unified view of the devices physically attached to the
+systems. Though the s390 hardware platform knows about a huge variety of
+different peripheral attachments like disk devices (aka. DASDs), tapes,
+communication controllers, etc. They can all be accessed by a well
+defined access method and they are presenting I/O completion a unified
+way: I/O interruptions.
+
+All I/O requires the use of channel command words (CCWs). A CCW is an
+instruction to a specialized I/O channel processor. A channel program is
+a sequence of CCWs which are executed by the I/O channel subsystem. To
+issue a channel program to the channel subsystem, it is required to
+build an operation request block (ORB), which can be used to point out
+the format of the CCW and other control information to the system. The
+operating system signals the I/O channel subsystem to begin executing
+the channel program with a SSCH (start sub-channel) instruction. The
+central processor is then free to proceed with non-I/O instructions
+until interrupted. The I/O completion result is received by the
+interrupt handler in the form of interrupt response block (IRB).
+
+Back to vfio-ccw, in short:
+
+- ORBs and channel programs are built in guest kernel (with guest
+ physical addresses).
+- ORBs and channel programs are passed to the host kernel.
+- Host kernel translates the guest physical addresses to real addresses
+ and starts the I/O with issuing a privileged Channel I/O instruction
+ (e.g SSCH).
+- channel programs run asynchronously on a separate processor.
+- I/O completion will be signaled to the host with I/O interruptions.
+ And it will be copied as IRB to user space to pass it back to the
+ guest.
+
+Physical vfio ccw device and its child mdev
+-------------------------------------------
+
+As mentioned above, we realize vfio-ccw with a mdev implementation.
+
+Channel I/O does not have IOMMU hardware support, so the physical
+vfio-ccw device does not have an IOMMU level translation or isolation.
+
+Subchannel I/O instructions are all privileged instructions. When
+handling the I/O instruction interception, vfio-ccw has the software
+policing and translation how the channel program is programmed before
+it gets sent to hardware.
+
+Within this implementation, we have two drivers for two types of
+devices:
+
+- The vfio_ccw driver for the physical subchannel device.
+ This is an I/O subchannel driver for the real subchannel device. It
+ realizes a group of callbacks and registers to the mdev framework as a
+ parent (physical) device. As a consequence, mdev provides vfio_ccw a
+ generic interface (sysfs) to create mdev devices. A vfio mdev could be
+ created by vfio_ccw then and added to the mediated bus. It is the vfio
+ device that added to an IOMMU group and a vfio group.
+ vfio_ccw also provides an I/O region to accept channel program
+ request from user space and store I/O interrupt result for user
+ space to retrieve. To notify user space an I/O completion, it offers
+ an interface to setup an eventfd fd for asynchronous signaling.
+
+- The vfio_mdev driver for the mediated vfio ccw device.
+ This is provided by the mdev framework. It is a vfio device driver for
+ the mdev that created by vfio_ccw.
+ It realizes a group of vfio device driver callbacks, adds itself to a
+ vfio group, and registers itself to the mdev framework as a mdev
+ driver.
+ It uses a vfio iommu backend that uses the existing map and unmap
+ ioctls, but rather than programming them into an IOMMU for a device,
+ it simply stores the translations for use by later requests. This
+ means that a device programmed in a VM with guest physical addresses
+ can have the vfio kernel convert that address to process virtual
+ address, pin the page and program the hardware with the host physical
+ address in one step.
+ For a mdev, the vfio iommu backend will not pin the pages during the
+ VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database
+ of the iova<->vaddr mappings in this operation. And they export a
+ vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu
+ backend for the physical devices to pin and unpin pages by demand.
+
+Below is a high Level block diagram::
+
+ +-------------+
+ | |
+ | +---------+ | mdev_register_driver() +--------------+
+ | | Mdev | +<-----------------------+ |
+ | | bus | | | vfio_mdev.ko |
+ | | driver | +----------------------->+ |<-> VFIO user
+ | +---------+ | probe()/remove() +--------------+ APIs
+ | |
+ | MDEV CORE |
+ | MODULE |
+ | mdev.ko |
+ | +---------+ | mdev_register_device() +--------------+
+ | |Physical | +<-----------------------+ |
+ | | device | | | vfio_ccw.ko |<-> subchannel
+ | |interface| +----------------------->+ | device
+ | +---------+ | callback +--------------+
+ +-------------+
+
+The process of how these work together.
+
+1. vfio_ccw.ko drives the physical I/O subchannel, and registers the
+ physical device (with callbacks) to mdev framework.
+ When vfio_ccw probing the subchannel device, it registers device
+ pointer and callbacks to the mdev framework. Mdev related file nodes
+ under the device node in sysfs would be created for the subchannel
+ device, namely 'mdev_create', 'mdev_destroy' and
+ 'mdev_supported_types'.
+2. Create a mediated vfio ccw device.
+ Use the 'mdev_create' sysfs file, we need to manually create one (and
+ only one for our case) mediated device.
+3. vfio_mdev.ko drives the mediated ccw device.
+ vfio_mdev is also the vfio device drvier. It will probe the mdev and
+ add it to an iommu_group and a vfio_group. Then we could pass through
+ the mdev to a guest.
+
+vfio-ccw I/O region
+-------------------
+
+An I/O region is used to accept channel program request from user
+space and store I/O interrupt result for user space to retrieve. The
+definition of the region is::
+
+ struct ccw_io_region {
+ #define ORB_AREA_SIZE 12
+ __u8 orb_area[ORB_AREA_SIZE];
+ #define SCSW_AREA_SIZE 12
+ __u8 scsw_area[SCSW_AREA_SIZE];
+ #define IRB_AREA_SIZE 96
+ __u8 irb_area[IRB_AREA_SIZE];
+ __u32 ret_code;
+ } __packed;
+
+While starting an I/O request, orb_area should be filled with the
+guest ORB, and scsw_area should be filled with the SCSW of the Virtual
+Subchannel.
+
+irb_area stores the I/O result.
+
+ret_code stores a return code for each access of the region.
+
+vfio-ccw operation details
+--------------------------
+
+vfio-ccw follows what vfio-pci did on the s390 platform and uses
+vfio-iommu-type1 as the vfio iommu backend.
+
+* CCW translation APIs
+ A group of APIs (start with `cp_`) to do CCW translation. The CCWs
+ passed in by a user space program are organized with their guest
+ physical memory addresses. These APIs will copy the CCWs into kernel
+ space, and assemble a runnable kernel channel program by updating the
+ guest physical addresses with their corresponding host physical addresses.
+ Note that we have to use IDALs even for direct-access CCWs, as the
+ referenced memory can be located anywhere, including above 2G.
+
+* vfio_ccw device driver
+ This driver utilizes the CCW translation APIs and introduces
+ vfio_ccw, which is the driver for the I/O subchannel devices you want
+ to pass through.
+ vfio_ccw implements the following vfio ioctls::
+
+ VFIO_DEVICE_GET_INFO
+ VFIO_DEVICE_GET_IRQ_INFO
+ VFIO_DEVICE_GET_REGION_INFO
+ VFIO_DEVICE_RESET
+ VFIO_DEVICE_SET_IRQS
+
+ This provides an I/O region, so that the user space program can pass a
+ channel program to the kernel, to do further CCW translation before
+ issuing them to a real device.
+ This also provides the SET_IRQ ioctl to setup an event notifier to
+ notify the user space program the I/O completion in an asynchronous
+ way.
+
+The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a
+good example to get understand how these patches work. Here is a little
+bit more detail how an I/O request triggered by the QEMU guest will be
+handled (without error handling).
+
+Explanation:
+
+- Q1-Q7: QEMU side process.
+- K1-K5: Kernel side process.
+
+Q1.
+ Get I/O region info during initialization.
+
+Q2.
+ Setup event notifier and handler to handle I/O completion.
+
+... ...
+
+Q3.
+ Intercept a ssch instruction.
+Q4.
+ Write the guest channel program and ORB to the I/O region.
+
+ K1.
+ Copy from guest to kernel.
+ K2.
+ Translate the guest channel program to a host kernel space
+ channel program, which becomes runnable for a real device.
+ K3.
+ With the necessary information contained in the orb passed in
+ by QEMU, issue the ccwchain to the device.
+ K4.
+ Return the ssch CC code.
+Q5.
+ Return the CC code to the guest.
+
+... ...
+
+ K5.
+ Interrupt handler gets the I/O result and write the result to
+ the I/O region.
+ K6.
+ Signal QEMU to retrieve the result.
+
+Q6.
+ Get the signal and event handler reads out the result from the I/O
+ region.
+Q7.
+ Update the irb for the guest.
+
+Limitations
+-----------
+
+The current vfio-ccw implementation focuses on supporting basic commands
+needed to implement block device functionality (read/write) of DASD/ECKD
+device only. Some commands may need special handling in the future, for
+example, anything related to path grouping.
+
+DASD is a kind of storage device. While ECKD is a data recording format.
+More information for DASD and ECKD could be found here:
+https://en.wikipedia.org/wiki/Direct-access_storage_device
+https://en.wikipedia.org/wiki/Count_key_data
+
+Together with the corresponding work in QEMU, we can bring the passed
+through DASD/ECKD device online in a guest now and use it as a block
+device.
+
+While the current code allows the guest to start channel programs via
+START SUBCHANNEL, support for HALT SUBCHANNEL or CLEAR SUBCHANNEL is
+not yet implemented.
+
+vfio-ccw supports classic (command mode) channel I/O only. Transport
+mode (HPF) is not supported.
+
+QDIO subchannels are currently not supported. Classic devices other than
+DASD/ECKD might work, but have not been tested.
+
+Reference
+---------
+1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832)
+2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204)
+3. https://en.wikipedia.org/wiki/Channel_I/O
+4. Documentation/s390/cds.rst
+5. Documentation/driver-api/vfio.rst
+6. Documentation/driver-api/vfio-mediated-device.rst
diff --git a/Documentation/s390/vfio-ccw.txt b/Documentation/s390/vfio-ccw.txt
deleted file mode 100644
index 2be11ad864ff..000000000000
--- a/Documentation/s390/vfio-ccw.txt
+++ /dev/null
@@ -1,300 +0,0 @@
-vfio-ccw: the basic infrastructure
-==================================
-
-Introduction
-------------
-
-Here we describe the vfio support for I/O subchannel devices for
-Linux/s390. Motivation for vfio-ccw is to passthrough subchannels to a
-virtual machine, while vfio is the means.
-
-Different than other hardware architectures, s390 has defined a unified
-I/O access method, which is so called Channel I/O. It has its own access
-patterns:
-- Channel programs run asynchronously on a separate (co)processor.
-- The channel subsystem will access any memory designated by the caller
- in the channel program directly, i.e. there is no iommu involved.
-Thus when we introduce vfio support for these devices, we realize it
-with a mediated device (mdev) implementation. The vfio mdev will be
-added to an iommu group, so as to make itself able to be managed by the
-vfio framework. And we add read/write callbacks for special vfio I/O
-regions to pass the channel programs from the mdev to its parent device
-(the real I/O subchannel device) to do further address translation and
-to perform I/O instructions.
-
-This document does not intend to explain the s390 I/O architecture in
-every detail. More information/reference could be found here:
-- A good start to know Channel I/O in general:
- https://en.wikipedia.org/wiki/Channel_I/O
-- s390 architecture:
- s390 Principles of Operation manual (IBM Form. No. SA22-7832)
-- The existing QEMU code which implements a simple emulated channel
- subsystem could also be a good reference. It makes it easier to follow
- the flow.
- qemu/hw/s390x/css.c
-
-For vfio mediated device framework:
-- Documentation/vfio-mediated-device.txt
-
-Motivation of vfio-ccw
-----------------------
-
-Typically, a guest virtualized via QEMU/KVM on s390 only sees
-paravirtualized virtio devices via the "Virtio Over Channel I/O
-(virtio-ccw)" transport. This makes virtio devices discoverable via
-standard operating system algorithms for handling channel devices.
-
-However this is not enough. On s390 for the majority of devices, which
-use the standard Channel I/O based mechanism, we also need to provide
-the functionality of passing through them to a QEMU virtual machine.
-This includes devices that don't have a virtio counterpart (e.g. tape
-drives) or that have specific characteristics which guests want to
-exploit.
-
-For passing a device to a guest, we want to use the same interface as
-everybody else, namely vfio. We implement this vfio support for channel
-devices via the vfio mediated device framework and the subchannel device
-driver "vfio_ccw".
-
-Access patterns of CCW devices
-------------------------------
-
-s390 architecture has implemented a so called channel subsystem, that
-provides a unified view of the devices physically attached to the
-systems. Though the s390 hardware platform knows about a huge variety of
-different peripheral attachments like disk devices (aka. DASDs), tapes,
-communication controllers, etc. They can all be accessed by a well
-defined access method and they are presenting I/O completion a unified
-way: I/O interruptions.
-
-All I/O requires the use of channel command words (CCWs). A CCW is an
-instruction to a specialized I/O channel processor. A channel program is
-a sequence of CCWs which are executed by the I/O channel subsystem. To
-issue a channel program to the channel subsystem, it is required to
-build an operation request block (ORB), which can be used to point out
-the format of the CCW and other control information to the system. The
-operating system signals the I/O channel subsystem to begin executing
-the channel program with a SSCH (start sub-channel) instruction. The
-central processor is then free to proceed with non-I/O instructions
-until interrupted. The I/O completion result is received by the
-interrupt handler in the form of interrupt response block (IRB).
-
-Back to vfio-ccw, in short:
-- ORBs and channel programs are built in guest kernel (with guest
- physical addresses).
-- ORBs and channel programs are passed to the host kernel.
-- Host kernel translates the guest physical addresses to real addresses
- and starts the I/O with issuing a privileged Channel I/O instruction
- (e.g SSCH).
-- channel programs run asynchronously on a separate processor.
-- I/O completion will be signaled to the host with I/O interruptions.
- And it will be copied as IRB to user space to pass it back to the
- guest.
-
-Physical vfio ccw device and its child mdev
--------------------------------------------
-
-As mentioned above, we realize vfio-ccw with a mdev implementation.
-
-Channel I/O does not have IOMMU hardware support, so the physical
-vfio-ccw device does not have an IOMMU level translation or isolation.
-
-Subchannel I/O instructions are all privileged instructions. When
-handling the I/O instruction interception, vfio-ccw has the software
-policing and translation how the channel program is programmed before
-it gets sent to hardware.
-
-Within this implementation, we have two drivers for two types of
-devices:
-- The vfio_ccw driver for the physical subchannel device.
- This is an I/O subchannel driver for the real subchannel device. It
- realizes a group of callbacks and registers to the mdev framework as a
- parent (physical) device. As a consequence, mdev provides vfio_ccw a
- generic interface (sysfs) to create mdev devices. A vfio mdev could be
- created by vfio_ccw then and added to the mediated bus. It is the vfio
- device that added to an IOMMU group and a vfio group.
- vfio_ccw also provides an I/O region to accept channel program
- request from user space and store I/O interrupt result for user
- space to retrieve. To notify user space an I/O completion, it offers
- an interface to setup an eventfd fd for asynchronous signaling.
-
-- The vfio_mdev driver for the mediated vfio ccw device.
- This is provided by the mdev framework. It is a vfio device driver for
- the mdev that created by vfio_ccw.
- It realizes a group of vfio device driver callbacks, adds itself to a
- vfio group, and registers itself to the mdev framework as a mdev
- driver.
- It uses a vfio iommu backend that uses the existing map and unmap
- ioctls, but rather than programming them into an IOMMU for a device,
- it simply stores the translations for use by later requests. This
- means that a device programmed in a VM with guest physical addresses
- can have the vfio kernel convert that address to process virtual
- address, pin the page and program the hardware with the host physical
- address in one step.
- For a mdev, the vfio iommu backend will not pin the pages during the
- VFIO_IOMMU_MAP_DMA ioctl. Mdev framework will only maintain a database
- of the iova<->vaddr mappings in this operation. And they export a
- vfio_pin_pages and a vfio_unpin_pages interfaces from the vfio iommu
- backend for the physical devices to pin and unpin pages by demand.
-
-Below is a high Level block diagram.
-
- +-------------+
- | |
- | +---------+ | mdev_register_driver() +--------------+
- | | Mdev | +<-----------------------+ |
- | | bus | | | vfio_mdev.ko |
- | | driver | +----------------------->+ |<-> VFIO user
- | +---------+ | probe()/remove() +--------------+ APIs
- | |
- | MDEV CORE |
- | MODULE |
- | mdev.ko |
- | +---------+ | mdev_register_device() +--------------+
- | |Physical | +<-----------------------+ |
- | | device | | | vfio_ccw.ko |<-> subchannel
- | |interface| +----------------------->+ | device
- | +---------+ | callback +--------------+
- +-------------+
-
-The process of how these work together.
-1. vfio_ccw.ko drives the physical I/O subchannel, and registers the
- physical device (with callbacks) to mdev framework.
- When vfio_ccw probing the subchannel device, it registers device
- pointer and callbacks to the mdev framework. Mdev related file nodes
- under the device node in sysfs would be created for the subchannel
- device, namely 'mdev_create', 'mdev_destroy' and
- 'mdev_supported_types'.
-2. Create a mediated vfio ccw device.
- Use the 'mdev_create' sysfs file, we need to manually create one (and
- only one for our case) mediated device.
-3. vfio_mdev.ko drives the mediated ccw device.
- vfio_mdev is also the vfio device drvier. It will probe the mdev and
- add it to an iommu_group and a vfio_group. Then we could pass through
- the mdev to a guest.
-
-vfio-ccw I/O region
--------------------
-
-An I/O region is used to accept channel program request from user
-space and store I/O interrupt result for user space to retrieve. The
-definition of the region is:
-
-struct ccw_io_region {
-#define ORB_AREA_SIZE 12
- __u8 orb_area[ORB_AREA_SIZE];
-#define SCSW_AREA_SIZE 12
- __u8 scsw_area[SCSW_AREA_SIZE];
-#define IRB_AREA_SIZE 96
- __u8 irb_area[IRB_AREA_SIZE];
- __u32 ret_code;
-} __packed;
-
-While starting an I/O request, orb_area should be filled with the
-guest ORB, and scsw_area should be filled with the SCSW of the Virtual
-Subchannel.
-
-irb_area stores the I/O result.
-
-ret_code stores a return code for each access of the region.
-
-vfio-ccw operation details
---------------------------
-
-vfio-ccw follows what vfio-pci did on the s390 platform and uses
-vfio-iommu-type1 as the vfio iommu backend.
-
-* CCW translation APIs
- A group of APIs (start with 'cp_') to do CCW translation. The CCWs
- passed in by a user space program are organized with their guest
- physical memory addresses. These APIs will copy the CCWs into kernel
- space, and assemble a runnable kernel channel program by updating the
- guest physical addresses with their corresponding host physical addresses.
- Note that we have to use IDALs even for direct-access CCWs, as the
- referenced memory can be located anywhere, including above 2G.
-
-* vfio_ccw device driver
- This driver utilizes the CCW translation APIs and introduces
- vfio_ccw, which is the driver for the I/O subchannel devices you want
- to pass through.
- vfio_ccw implements the following vfio ioctls:
- VFIO_DEVICE_GET_INFO
- VFIO_DEVICE_GET_IRQ_INFO
- VFIO_DEVICE_GET_REGION_INFO
- VFIO_DEVICE_RESET
- VFIO_DEVICE_SET_IRQS
- This provides an I/O region, so that the user space program can pass a
- channel program to the kernel, to do further CCW translation before
- issuing them to a real device.
- This also provides the SET_IRQ ioctl to setup an event notifier to
- notify the user space program the I/O completion in an asynchronous
- way.
-
-The use of vfio-ccw is not limited to QEMU, while QEMU is definitely a
-good example to get understand how these patches work. Here is a little
-bit more detail how an I/O request triggered by the QEMU guest will be
-handled (without error handling).
-
-Explanation:
-Q1-Q7: QEMU side process.
-K1-K5: Kernel side process.
-
-Q1. Get I/O region info during initialization.
-Q2. Setup event notifier and handler to handle I/O completion.
-
-... ...
-
-Q3. Intercept a ssch instruction.
-Q4. Write the guest channel program and ORB to the I/O region.
- K1. Copy from guest to kernel.
- K2. Translate the guest channel program to a host kernel space
- channel program, which becomes runnable for a real device.
- K3. With the necessary information contained in the orb passed in
- by QEMU, issue the ccwchain to the device.
- K4. Return the ssch CC code.
-Q5. Return the CC code to the guest.
-
-... ...
-
- K5. Interrupt handler gets the I/O result and write the result to
- the I/O region.
- K6. Signal QEMU to retrieve the result.
-Q6. Get the signal and event handler reads out the result from the I/O
- region.
-Q7. Update the irb for the guest.
-
-Limitations
------------
-
-The current vfio-ccw implementation focuses on supporting basic commands
-needed to implement block device functionality (read/write) of DASD/ECKD
-device only. Some commands may need special handling in the future, for
-example, anything related to path grouping.
-
-DASD is a kind of storage device. While ECKD is a data recording format.
-More information for DASD and ECKD could be found here:
-https://en.wikipedia.org/wiki/Direct-access_storage_device
-https://en.wikipedia.org/wiki/Count_key_data
-
-Together with the corresponding work in QEMU, we can bring the passed
-through DASD/ECKD device online in a guest now and use it as a block
-device.
-
-While the current code allows the guest to start channel programs via
-START SUBCHANNEL, support for HALT SUBCHANNEL or CLEAR SUBCHANNEL is
-not yet implemented.
-
-vfio-ccw supports classic (command mode) channel I/O only. Transport
-mode (HPF) is not supported.
-
-QDIO subchannels are currently not supported. Classic devices other than
-DASD/ECKD might work, but have not been tested.
-
-Reference
----------
-1. ESA/s390 Principles of Operation manual (IBM Form. No. SA22-7832)
-2. ESA/390 Common I/O Device Commands manual (IBM Form. No. SA22-7204)
-3. https://en.wikipedia.org/wiki/Channel_I/O
-4. Documentation/s390/cds.txt
-5. Documentation/vfio.txt
-6. Documentation/vfio-mediated-device.txt
diff --git a/Documentation/s390/zfcpdump.rst b/Documentation/s390/zfcpdump.rst
new file mode 100644
index 000000000000..54e8e7caf7e7
--- /dev/null
+++ b/Documentation/s390/zfcpdump.rst
@@ -0,0 +1,50 @@
+==================================
+The s390 SCSI dump tool (zfcpdump)
+==================================
+
+System z machines (z900 or higher) provide hardware support for creating system
+dumps on SCSI disks. The dump process is initiated by booting a dump tool, which
+has to create a dump of the current (probably crashed) Linux image. In order to
+not overwrite memory of the crashed Linux with data of the dump tool, the
+hardware saves some memory plus the register sets of the boot CPU before the
+dump tool is loaded. There exists an SCLP hardware interface to obtain the saved
+memory afterwards. Currently 32 MB are saved.
+
+This zfcpdump implementation consists of a Linux dump kernel together with
+a user space dump tool, which are loaded together into the saved memory region
+below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in
+the s390-tools package) to make the device bootable. The operator of a Linux
+system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump
+resides on.
+
+The user space dump tool accesses the memory of the crashed system by means
+of the /proc/vmcore interface. This interface exports the crashed system's
+memory and registers in ELF core dump format. To access the memory which has
+been saved by the hardware SCLP requests will be created at the time the data
+is needed by /proc/vmcore. The tail part of the crashed systems memory which
+has not been stashed by hardware can just be copied from real memory.
+
+To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP
+has to be set.
+
+To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig".
+
+The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs
+under the following locations:
+
+* kernel: <zfcpdump directory>/zfcpdump.image
+* ramdisk: <zfcpdump directory>/zfcpdump.rd
+
+The zfcpdump directory is defined in the s390-tools package.
+
+The user space application of zfcpdump can reside in an intitramfs or an
+initrd. It can also be included in a built-in kernel initramfs. The application
+reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk.
+
+The s390-tools package version 1.24.0 and above builds an external zfcpdump
+initramfs with a user space application that writes the dump to a SCSI
+partition.
+
+For more information on how to use zfcpdump refer to the s390 'Using the Dump
+Tools book', which is available from
+http://www.ibm.com/developerworks/linux/linux390.
diff --git a/Documentation/s390/zfcpdump.txt b/Documentation/s390/zfcpdump.txt
deleted file mode 100644
index b064aa59714d..000000000000
--- a/Documentation/s390/zfcpdump.txt
+++ /dev/null
@@ -1,48 +0,0 @@
-The s390 SCSI dump tool (zfcpdump)
-
-System z machines (z900 or higher) provide hardware support for creating system
-dumps on SCSI disks. The dump process is initiated by booting a dump tool, which
-has to create a dump of the current (probably crashed) Linux image. In order to
-not overwrite memory of the crashed Linux with data of the dump tool, the
-hardware saves some memory plus the register sets of the boot CPU before the
-dump tool is loaded. There exists an SCLP hardware interface to obtain the saved
-memory afterwards. Currently 32 MB are saved.
-
-This zfcpdump implementation consists of a Linux dump kernel together with
-a user space dump tool, which are loaded together into the saved memory region
-below 32 MB. zfcpdump is installed on a SCSI disk using zipl (as contained in
-the s390-tools package) to make the device bootable. The operator of a Linux
-system can then trigger a SCSI dump by booting the SCSI disk, where zfcpdump
-resides on.
-
-The user space dump tool accesses the memory of the crashed system by means
-of the /proc/vmcore interface. This interface exports the crashed system's
-memory and registers in ELF core dump format. To access the memory which has
-been saved by the hardware SCLP requests will be created at the time the data
-is needed by /proc/vmcore. The tail part of the crashed systems memory which
-has not been stashed by hardware can just be copied from real memory.
-
-To build a dump enabled kernel the kernel config option CONFIG_CRASH_DUMP
-has to be set.
-
-To get a valid zfcpdump kernel configuration use "make zfcpdump_defconfig".
-
-The s390 zipl tool looks for the zfcpdump kernel and optional initrd/initramfs
-under the following locations:
-
-* kernel: <zfcpdump directory>/zfcpdump.image
-* ramdisk: <zfcpdump directory>/zfcpdump.rd
-
-The zfcpdump directory is defined in the s390-tools package.
-
-The user space application of zfcpdump can reside in an intitramfs or an
-initrd. It can also be included in a built-in kernel initramfs. The application
-reads from /proc/vmcore or zcore/mem and writes the system dump to a SCSI disk.
-
-The s390-tools package version 1.24.0 and above builds an external zfcpdump
-initramfs with a user space application that writes the dump to a SCSI
-partition.
-
-For more information on how to use zfcpdump refer to the s390 'Using the Dump
-Tools book', which is available from
-http://www.ibm.com/developerworks/linux/linux390.
diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst
index 058be77a4c34..69074e5de9c4 100644
--- a/Documentation/scheduler/index.rst
+++ b/Documentation/scheduler/index.rst
@@ -1,5 +1,3 @@
-:orphan:
-
===============
Linux Scheduler
===============
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst
index 873fb2775ca6..14a2f7bf63fe 100644
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -669,7 +669,7 @@ Deadline Task Scheduling
-deadline tasks cannot have an affinity mask smaller that the entire
root_domain they are created on. However, affinities can be specified
- through the cpuset facility (Documentation/cgroup-v1/cpusets.txt).
+ through the cpuset facility (Documentation/admin-guide/cgroup-v1/cpusets.rst).
5.1 SCHED_DEADLINE and cpusets HOWTO
------------------------------------
diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst
index 82406685365a..a96c72651877 100644
--- a/Documentation/scheduler/sched-design-CFS.rst
+++ b/Documentation/scheduler/sched-design-CFS.rst
@@ -222,7 +222,7 @@ SCHED_BATCH) tasks.
These options need CONFIG_CGROUPS to be defined, and let the administrator
create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See
- Documentation/cgroup-v1/cgroups.txt for more information about this filesystem.
+ Documentation/admin-guide/cgroup-v1/cgroups.rst for more information about this filesystem.
When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each
group created using the pseudo filesystem. See example steps below to create
diff --git a/Documentation/scheduler/sched-energy.rst b/Documentation/scheduler/sched-energy.rst
index fce5858c9082..9580c57a52bc 100644
--- a/Documentation/scheduler/sched-energy.rst
+++ b/Documentation/scheduler/sched-energy.rst
@@ -22,7 +22,7 @@ the highest.
The actual EM used by EAS is _not_ maintained by the scheduler, but by a
dedicated framework. For details about this framework and what it provides,
-please refer to its documentation (see Documentation/power/energy-model.txt).
+please refer to its documentation (see Documentation/power/energy-model.rst).
2. Background and Terminology
@@ -81,7 +81,7 @@ through the arch_scale_cpu_capacity() callback.
The rest of platform knowledge used by EAS is directly read from the Energy
Model (EM) framework. The EM of a platform is composed of a power cost table
-per 'performance domain' in the system (see Documentation/power/energy-model.txt
+per 'performance domain' in the system (see Documentation/power/energy-model.rst
for futher details about performance domains).
The scheduler manages references to the EM objects in the topology code when the
@@ -353,7 +353,7 @@ could be amended in the future if proven otherwise.
EAS uses the EM of a platform to estimate the impact of scheduling decisions on
energy. So, your platform must provide power cost tables to the EM framework in
order to make EAS start. To do so, please refer to documentation of the
-independent EM framework in Documentation/power/energy-model.txt.
+independent EM framework in Documentation/power/energy-model.rst.
Please also note that the scheduling domains need to be re-built after the
EM has been registered in order to start EAS.
diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c
index e4219139386a..7238b355919c 100644
--- a/Documentation/scheduler/sched-pelt.c
+++ b/Documentation/scheduler/sched-pelt.c
@@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void)
int i;
unsigned int x;
- printf("static const u32 runnable_avg_yN_inv[] = {");
+ /* To silence -Wunused-but-set-variable warnings. */
+ printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {");
for (i = 0; i < HALFLIFE; i++) {
x = ((1UL<<32)-1)*pow(y, i);
diff --git a/Documentation/scheduler/sched-rt-group.rst b/Documentation/scheduler/sched-rt-group.rst
index 79b30a21c51a..655a096ec8fb 100644
--- a/Documentation/scheduler/sched-rt-group.rst
+++ b/Documentation/scheduler/sched-rt-group.rst
@@ -133,7 +133,7 @@ This uses the cgroup virtual file system and "<cgroup>/cpu.rt_runtime_us"
to control the CPU time reserved for each control group.
For more information on working with control groups, you should read
-Documentation/cgroup-v1/cgroups.txt as well.
+Documentation/admin-guide/cgroup-v1/cgroups.rst as well.
Group settings are checked against the following limits in order to keep the
configuration schedulable:
diff --git a/Documentation/scsi/osst.txt b/Documentation/scsi/osst.txt
deleted file mode 100644
index 00c8ebb2fd18..000000000000
--- a/Documentation/scsi/osst.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-README file for the osst driver
-===============================
-(w) Kurt Garloff <garloff@suse.de> 12/2000
-
-This file describes the osst driver as of version 0.8.x/0.9.x, the released
-version of the osst driver.
-It is intended to help advanced users to understand the role of osst and to
-get them started using (and maybe debugging) it.
-It won't address issues like "How do I compile a kernel?" or "How do I load
-a module?", as these are too basic.
-Once the OnStream got merged into the official kernel, the distro makers
-will provide the OnStream support for those who are not familiar with
-hacking their kernels.
-
-
-Purpose
--------
-The osst driver was developed, because the standard SCSI tape driver in
-Linux, st, does not support the OnStream SC-x0 SCSI tape. The st is not to
-blame for that, as the OnStream tape drives do not support the standard SCSI
-command set for Serial Access Storage Devices (SASDs), which basically
-corresponds to the QIC-157 spec.
-Nevertheless, the OnStream tapes are nice pieces of hardware and therefore
-the osst driver has been written to make these tape devs supported by Linux.
-The driver is free software. It's released under the GNU GPL and planned to
-be integrated into the mainstream kernel.
-
-
-Implementation
---------------
-The osst is a new high-level SCSI driver, just like st, sr, sd and sg. It
-can be compiled into the kernel or loaded as a module.
-As it represents a new device, it got assigned a new device node: /dev/osstX
-are character devices with major no 206 and minor numbers like the /dev/stX
-devices. If those are not present, you may create them by calling
-Makedevs.sh as root (see below).
-The driver started being a copy of st and as such, the osst devices'
-behavior looks very much the same as st to the userspace applications.
-
-
-History
--------
-In the first place, osst shared its identity very much with st. That meant
-that it used the same kernel structures and the same device node as st.
-So you could only have either of them being present in the kernel. This has
-been fixed by registering an own device, now.
-st and osst can coexist, each only accessing the devices it can support by
-themselves.
-
-
-Installation
-------------
-osst got integrated into the linux kernel. Select it during kernel
-configuration as module or compile statically into the kernel.
-Compile your kernel and install the modules.
-
-Now, your osst driver is inside the kernel or available as a module,
-depending on your choice during kernel config. You may still need to create
-the device nodes by calling the Makedevs.sh script (see below) manually.
-
-To load your module, you may use the command
-modprobe osst
-as root. dmesg should show you, whether your OnStream tapes have been
-recognized.
-
-If you want to have the module autoloaded on access to /dev/osst, you may
-add something like
-alias char-major-206 osst
-to a file under /etc/modprobe.d/ directory.
-
-You may find it convenient to create a symbolic link
-ln -s nosst0 /dev/tape
-to make programs assuming a default name of /dev/tape more convenient to
-use.
-
-The device nodes for osst have to be created. Use the Makedevs.sh script
-attached to this file.
-
-
-Using it
---------
-You may use the OnStream tape driver with your standard backup software,
-which may be tar, cpio, amanda, arkeia, BRU, Lone Tar, ...
-by specifying /dev/(n)osst0 as the tape device to use or using the above
-symlink trick. The IOCTLs to control tape operation are also mostly
-supported and you may try the mt (or mt_st) program to jump between
-filemarks, eject the tape, ...
-
-There's one limitation: You need to use a block size of 32kB.
-
-(This limitation is worked on and will be fixed in version 0.8.8 of
- this driver.)
-
-If you just want to get started with standard software, here is an example
-for creating and restoring a full backup:
-# Backup
-tar cvf - / --exclude /proc | buffer -s 32k -m 24M -B -t -o /dev/nosst0
-# Restore
-buffer -s 32k -m 8M -B -t -i /dev/osst0 | tar xvf - -C /
-
-The buffer command has been used to buffer the data before it goes to the
-tape (or the file system) in order to smooth out the data stream and prevent
-the tape from needing to stop and rewind. The OnStream does have an internal
-buffer and a variable speed which help this, but especially on writing, the
-buffering still proves useful in most cases. It also pads the data to
-guarantees the block size of 32k. (Otherwise you may pass the -b64 option to
-tar.)
-Expect something like 1.8MB/s for the SC-x0 drives and 0.9MB/s for the DI-30.
-The USB drive will give you about 0.7MB/s.
-On a fast machine, you may profit from software data compression (z flag for
-tar).
-
-
-USB and IDE
------------
-Via the SCSI emulation layers usb-storage and ide-scsi, you can also use the
-osst driver to drive the USB-30 and the DI-30 drives. (Unfortunately, there
-is no such layer for the parallel port, otherwise the DP-30 would work as
-well.) For the USB support, you need the latest 2.4.0-test kernels and the
-latest usb-storage driver from
-http://www.linux-usb.org/
-http://sourceforge.net/cvs/?group_id=3581
-
-Note that the ide-tape driver as of 1.16f uses a slightly outdated on-tape
-format and therefore is not completely interoperable with osst tapes.
-
-The ADR-x0 line is fully SCSI-2 compliant and is supported by st, not osst.
-The on-tape format is supposed to be compatible with the one used by osst.
-
-
-Feedback and updates
---------------------
-The driver development is coordinated through a mailing list
-<osst@linux1.onstream.nl>
-a CVS repository and some web pages.
-The tester's pages which contain recent news and updated drivers to download
-can be found on
-http://sourceforge.net/projects/osst/
-
-If you find any problems, please have a look at the tester's page in order
-to see whether the problem is already known and solved. Otherwise, please
-report it to the mailing list. Your feedback is welcome. (This holds also
-for reports of successful usage, of course.)
-In case of trouble, please do always provide the following info:
-* driver and kernel version used (see syslog)
-* driver messages (syslog)
-* SCSI config and OnStream Firmware (/proc/scsi/scsi)
-* description of error. Is it reproducible?
-* software and commands used
-
-You may subscribe to the mailing list, BTW, it's a majordomo list.
-
-
-Status
-------
-0.8.0 was the first widespread BETA release. Since then a lot of reports
-have been sent, but mostly reported success or only minor trouble.
-All the issues have been addressed.
-Check the web pages for more info about the current developments.
-0.9.x is the tree for the 2.3/2.4 kernel.
-
-
-Acknowledgments
-----------------
-The driver has been started by making a copy of Kai Makisara's st driver.
-Most of the development has been done by Willem Riede. The presence of the
-userspace program osg (onstreamsg) from Terry Hardie has been rather
-helpful. The same holds for Gadi Oxman's ide-tape support for the DI-30.
-I did add some patches to those drivers as well and coordinated things a
-little bit.
-Note that most of them did mostly spend their spare time for the creation of
-this driver.
-The people from OnStream, especially Jack Bombeeck did support this project
-and always tried to answer HW or FW related questions. Furthermore, he
-pushed the FW developers to do the right things.
-SuSE did support this project by allowing me to work on it during my working
-time for them and by integrating the driver into their distro.
-
-More people did help by sending useful comments. Sorry to those who have
-been forgotten. Thanks to all the GNU/FSF and Linux developers who made this
-platform such an interesting, nice and stable platform.
-Thanks go to those who tested the drivers and did send useful reports. Your
-help is needed!
-
-
-Makedevs.sh
------------
-#!/bin/sh
-# Script to create OnStream SC-x0 device nodes (major 206)
-# Usage: Makedevs.sh [nos [path to dev]]
-# $Id: README.osst.kernel,v 1.4 2000/12/20 14:13:15 garloff Exp $
-major=206
-nrs=4
-dir=/dev
-test -z "$1" || nrs=$1
-test -z "$2" || dir=$2
-declare -i nr
-nr=0
-test -d $dir || mkdir -p $dir
-while test $nr -lt $nrs; do
- mknod $dir/osst$nr c $major $nr
- chown 0.disk $dir/osst$nr; chmod 660 $dir/osst$nr;
- mknod $dir/nosst$nr c $major $[nr+128]
- chown 0.disk $dir/nosst$nr; chmod 660 $dir/nosst$nr;
- mknod $dir/osst${nr}l c $major $[nr+32]
- chown 0.disk $dir/osst${nr}l; chmod 660 $dir/osst${nr}l;
- mknod $dir/nosst${nr}l c $major $[nr+160]
- chown 0.disk $dir/nosst${nr}l; chmod 660 $dir/nosst${nr}l;
- mknod $dir/osst${nr}m c $major $[nr+64]
- chown 0.disk $dir/osst${nr}m; chmod 660 $dir/osst${nr}m;
- mknod $dir/nosst${nr}m c $major $[nr+192]
- chown 0.disk $dir/nosst${nr}m; chmod 660 $dir/nosst${nr}m;
- mknod $dir/osst${nr}a c $major $[nr+96]
- chown 0.disk $dir/osst${nr}a; chmod 660 $dir/osst${nr}a;
- mknod $dir/nosst${nr}a c $major $[nr+224]
- chown 0.disk $dir/nosst${nr}a; chmod 660 $dir/nosst${nr}a;
- let nr+=1
-done
diff --git a/Documentation/scsi/ufs.txt b/Documentation/scsi/ufs.txt
index 1769f71c4c20..81842ec3e116 100644
--- a/Documentation/scsi/ufs.txt
+++ b/Documentation/scsi/ufs.txt
@@ -158,6 +158,13 @@ send SG_IO with the applicable sg_io_v4:
If you wish to read or write a descriptor, use the appropriate xferp of
sg_io_v4.
+The userspace tool that interacts with the ufs-bsg endpoint and uses its
+upiu-based protocol is available at:
+
+ https://github.com/westerndigitalcorporation/ufs-tool
+
+For more detailed information about the tool and its supported
+features, please see the tool's README.
UFS Specifications can be found at,
UFS - http://www.jedec.org/sites/default/files/docs/JESD220.pdf
diff --git a/Documentation/security/IMA-templates.rst b/Documentation/security/IMA-templates.rst
index 2cd0e273cc9a..3d1cca287aa4 100644
--- a/Documentation/security/IMA-templates.rst
+++ b/Documentation/security/IMA-templates.rst
@@ -69,15 +69,16 @@ descriptors by adding their identifier to the format string
algorithm (field format: [<hash algo>:]digest, where the digest
prefix is shown only if the hash algorithm is not SHA1 or MD5);
- 'n-ng': the name of the event, without size limitations;
- - 'sig': the file signature.
+ - 'sig': the file signature;
+ - 'buf': the buffer data that was used to generate the hash without size limitations;
Below, there is the list of defined template descriptors:
- "ima": its format is ``d|n``;
- "ima-ng" (default): its format is ``d-ng|n-ng``;
- - "ima-sig": its format is ``d-ng|n-ng|sig``.
-
+ - "ima-sig": its format is ``d-ng|n-ng|sig``;
+ - "ima-buf": its format is ``d-ng|n-ng|buf``;
Use
diff --git a/Documentation/security/index.rst b/Documentation/security/index.rst
index aad6d92ffe31..fc503dd689a7 100644
--- a/Documentation/security/index.rst
+++ b/Documentation/security/index.rst
@@ -8,7 +8,10 @@ Security Documentation
credentials
IMA-templates
keys/index
- LSM
+ lsm
+ lsm-development
+ sak
SCTP
self-protection
+ siphash
tpm/index
diff --git a/Documentation/security/keys/core.rst b/Documentation/security/keys/core.rst
index 3fd60dcb2dc6..d6d8b0b756b6 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -433,6 +433,10 @@ The main syscalls are:
/sbin/request-key will be invoked in an attempt to obtain a key. The
callout_info string will be passed as an argument to the program.
+ To link a key into the destination keyring the key must grant link
+ permission on the key to the caller and the keyring must grant write
+ permission.
+
See also Documentation/security/keys/request-key.rst.
@@ -577,6 +581,27 @@ The keyctl syscall functions are:
added.
+ * Move a key from one keyring to another::
+
+ long keyctl(KEYCTL_MOVE,
+ key_serial_t id,
+ key_serial_t from_ring_id,
+ key_serial_t to_ring_id,
+ unsigned int flags);
+
+ Move the key specified by "id" from the keyring specified by
+ "from_ring_id" to the keyring specified by "to_ring_id". If the two
+ keyrings are the same, nothing is done.
+
+ "flags" can have KEYCTL_MOVE_EXCL set in it to cause the operation to fail
+ with EEXIST if a matching key exists in the destination keyring, otherwise
+ such a key will be replaced.
+
+ A process must have link permission on the key for this function to be
+ successful and write permission on both keyrings. Any errors that can
+ occur from KEYCTL_LINK also apply on the destination keyring here.
+
+
* Unlink a key or keyring from another keyring::
long keyctl(KEYCTL_UNLINK, key_serial_t keyring, key_serial_t key);
@@ -1077,49 +1102,43 @@ payload contents" for more information.
See also Documentation/security/keys/request-key.rst.
+ * To search for a key in a specific domain, call:
+
+ struct key *request_key_tag(const struct key_type *type,
+ const char *description,
+ struct key_tag *domain_tag,
+ const char *callout_info);
+
+ This is identical to request_key(), except that a domain tag may be
+ specifies that causes search algorithm to only match keys matching that
+ tag. The domain_tag may be NULL, specifying a global domain that is
+ separate from any nominated domain.
+
+
* To search for a key, passing auxiliary data to the upcaller, call::
struct key *request_key_with_auxdata(const struct key_type *type,
const char *description,
+ struct key_tag *domain_tag,
const void *callout_info,
size_t callout_len,
void *aux);
- This is identical to request_key(), except that the auxiliary data is
- passed to the key_type->request_key() op if it exists, and the callout_info
- is a blob of length callout_len, if given (the length may be 0).
-
-
- * A key can be requested asynchronously by calling one of::
-
- struct key *request_key_async(const struct key_type *type,
- const char *description,
- const void *callout_info,
- size_t callout_len);
-
- or::
-
- struct key *request_key_async_with_auxdata(const struct key_type *type,
- const char *description,
- const char *callout_info,
- size_t callout_len,
- void *aux);
-
- which are asynchronous equivalents of request_key() and
- request_key_with_auxdata() respectively.
+ This is identical to request_key_tag(), except that the auxiliary data is
+ passed to the key_type->request_key() op if it exists, and the
+ callout_info is a blob of length callout_len, if given (the length may be
+ 0).
- These two functions return with the key potentially still under
- construction. To wait for construction completion, the following should be
- called::
- int wait_for_key_construction(struct key *key, bool intr);
+ * To search for a key under RCU conditions, call::
- The function will wait for the key to finish being constructed and then
- invokes key_validate() to return an appropriate value to indicate the state
- of the key (0 indicates the key is usable).
+ struct key *request_key_rcu(const struct key_type *type,
+ const char *description,
+ struct key_tag *domain_tag);
- If intr is true, then the wait can be interrupted by a signal, in which
- case error ERESTARTSYS will be returned.
+ which is similar to request_key_tag() except that it does not check for
+ keys that are under construction and it will not call out to userspace to
+ construct a key if it can't find a match.
* When it is no longer required, the key should be released using::
@@ -1159,11 +1178,13 @@ payload contents" for more information.
key_ref_t keyring_search(key_ref_t keyring_ref,
const struct key_type *type,
- const char *description)
+ const char *description,
+ bool recurse)
- This searches the keyring tree specified for a matching key. Error ENOKEY
- is returned upon failure (use IS_ERR/PTR_ERR to determine). If successful,
- the returned key will need to be released.
+ This searches the specified keyring only (recurse == false) or keyring tree
+ (recurse == true) specified for a matching key. Error ENOKEY is returned
+ upon failure (use IS_ERR/PTR_ERR to determine). If successful, the returned
+ key will need to be released.
The possession attribute from the keyring reference is used to control
access through the permissions mask and is propagated to the returned key
diff --git a/Documentation/security/keys/request-key.rst b/Documentation/security/keys/request-key.rst
index 600ad67d1707..35f2296b704a 100644
--- a/Documentation/security/keys/request-key.rst
+++ b/Documentation/security/keys/request-key.rst
@@ -15,26 +15,25 @@ The process starts by either the kernel requesting a service by calling
or::
+ struct key *request_key_tag(const struct key_type *type,
+ const char *description,
+ const struct key_tag *domain_tag,
+ const char *callout_info);
+
+or::
+
struct key *request_key_with_auxdata(const struct key_type *type,
const char *description,
+ const struct key_tag *domain_tag,
const char *callout_info,
size_t callout_len,
void *aux);
or::
- struct key *request_key_async(const struct key_type *type,
- const char *description,
- const char *callout_info,
- size_t callout_len);
-
-or::
-
- struct key *request_key_async_with_auxdata(const struct key_type *type,
- const char *description,
- const char *callout_info,
- size_t callout_len,
- void *aux);
+ struct key *request_key_rcu(const struct key_type *type,
+ const char *description,
+ const struct key_tag *domain_tag);
Or by userspace invoking the request_key system call::
@@ -48,14 +47,18 @@ does not need to link the key to a keyring to prevent it from being immediately
destroyed. The kernel interface returns a pointer directly to the key, and
it's up to the caller to destroy the key.
-The request_key*_with_auxdata() calls are like the in-kernel request_key*()
-calls, except that they permit auxiliary data to be passed to the upcaller (the
-default is NULL). This is only useful for those key types that define their
-own upcall mechanism rather than using /sbin/request-key.
+The request_key_tag() call is like the in-kernel request_key(), except that it
+also takes a domain tag that allows keys to be separated by namespace and
+killed off as a group.
+
+The request_key_with_auxdata() calls is like the request_key_tag() call, except
+that they permit auxiliary data to be passed to the upcaller (the default is
+NULL). This is only useful for those key types that define their own upcall
+mechanism rather than using /sbin/request-key.
-The two async in-kernel calls may return keys that are still in the process of
-being constructed. The two non-async ones will wait for construction to
-complete first.
+The request_key_rcu() call is like the request_key_tag() call, except that it
+doesn't check for keys that are under construction and doesn't attempt to
+construct missing keys.
The userspace interface links the key to a keyring associated with the process
to prevent the key from going away, and returns the serial number of the key to
@@ -148,7 +151,7 @@ The Search Algorithm
A search of any particular keyring proceeds in the following fashion:
- 1) When the key management code searches for a key (keyring_search_aux) it
+ 1) When the key management code searches for a key (keyring_search_rcu) it
firstly calls key_permission(SEARCH) on the keyring it's starting with,
if this denies permission, it doesn't search further.
@@ -167,6 +170,9 @@ The process stops immediately a valid key is found with permission granted to
use it. Any error from a previous match attempt is discarded and the key is
returned.
+When request_key() is invoked, if CONFIG_KEYS_REQUEST_CACHE=y, a per-task
+one-key cache is first checked for a match.
+
When search_process_keyrings() is invoked, it performs the following searches
until one succeeds:
@@ -186,7 +192,9 @@ until one succeeds:
c) The calling process's session keyring is searched.
The moment one succeeds, all pending errors are discarded and the found key is
-returned.
+returned. If CONFIG_KEYS_REQUEST_CACHE=y, then that key is placed in the
+per-task cache, displacing the previous key. The cache is cleared on exit or
+just prior to resumption of userspace.
Only if all these fail does the whole thing fail with the highest priority
error. Note that several errors may have come from LSM.
diff --git a/Documentation/security/LSM.rst b/Documentation/security/lsm-development.rst
index 31d92bc5fdd2..31d92bc5fdd2 100644
--- a/Documentation/security/LSM.rst
+++ b/Documentation/security/lsm-development.rst
diff --git a/Documentation/lsm.txt b/Documentation/security/lsm.rst
index ad4dfd020e0d..ad4dfd020e0d 100644
--- a/Documentation/lsm.txt
+++ b/Documentation/security/lsm.rst
diff --git a/Documentation/SAK.txt b/Documentation/security/sak.rst
index 260e1d3687bd..260e1d3687bd 100644
--- a/Documentation/SAK.txt
+++ b/Documentation/security/sak.rst
diff --git a/Documentation/siphash.txt b/Documentation/security/siphash.rst
index 9965821ab333..9965821ab333 100644
--- a/Documentation/siphash.txt
+++ b/Documentation/security/siphash.rst
diff --git a/Documentation/security/tpm/index.rst b/Documentation/security/tpm/index.rst
index af77a7bbb070..3296533e54cf 100644
--- a/Documentation/security/tpm/index.rst
+++ b/Documentation/security/tpm/index.rst
@@ -5,3 +5,4 @@ Trusted Platform Module documentation
.. toctree::
tpm_vtpm_proxy
+ xen-tpmfront
diff --git a/Documentation/security/tpm/xen-tpmfront.rst b/Documentation/security/tpm/xen-tpmfront.rst
new file mode 100644
index 000000000000..00d5b1db227d
--- /dev/null
+++ b/Documentation/security/tpm/xen-tpmfront.rst
@@ -0,0 +1,124 @@
+=============================
+Virtual TPM interface for Xen
+=============================
+
+Authors: Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA)
+
+This document describes the virtual Trusted Platform Module (vTPM) subsystem for
+Xen. The reader is assumed to have familiarity with building and installing Xen,
+Linux, and a basic understanding of the TPM and vTPM concepts.
+
+Introduction
+------------
+
+The goal of this work is to provide a TPM functionality to a virtual guest
+operating system (in Xen terms, a DomU). This allows programs to interact with
+a TPM in a virtual system the same way they interact with a TPM on the physical
+system. Each guest gets its own unique, emulated, software TPM. However, each
+of the vTPM's secrets (Keys, NVRAM, etc) are managed by a vTPM Manager domain,
+which seals the secrets to the Physical TPM. If the process of creating each of
+these domains (manager, vTPM, and guest) is trusted, the vTPM subsystem extends
+the chain of trust rooted in the hardware TPM to virtual machines in Xen. Each
+major component of vTPM is implemented as a separate domain, providing secure
+separation guaranteed by the hypervisor. The vTPM domains are implemented in
+mini-os to reduce memory and processor overhead.
+
+This mini-os vTPM subsystem was built on top of the previous vTPM work done by
+IBM and Intel corporation.
+
+
+Design Overview
+---------------
+
+The architecture of vTPM is described below::
+
+ +------------------+
+ | Linux DomU | ...
+ | | ^ |
+ | v | |
+ | xen-tpmfront |
+ +------------------+
+ | ^
+ v |
+ +------------------+
+ | mini-os/tpmback |
+ | | ^ |
+ | v | |
+ | vtpm-stubdom | ...
+ | | ^ |
+ | v | |
+ | mini-os/tpmfront |
+ +------------------+
+ | ^
+ v |
+ +------------------+
+ | mini-os/tpmback |
+ | | ^ |
+ | v | |
+ | vtpmmgr-stubdom |
+ | | ^ |
+ | v | |
+ | mini-os/tpm_tis |
+ +------------------+
+ | ^
+ v |
+ +------------------+
+ | Hardware TPM |
+ +------------------+
+
+* Linux DomU:
+ The Linux based guest that wants to use a vTPM. There may be
+ more than one of these.
+
+* xen-tpmfront.ko:
+ Linux kernel virtual TPM frontend driver. This driver
+ provides vTPM access to a Linux-based DomU.
+
+* mini-os/tpmback:
+ Mini-os TPM backend driver. The Linux frontend driver
+ connects to this backend driver to facilitate communications
+ between the Linux DomU and its vTPM. This driver is also
+ used by vtpmmgr-stubdom to communicate with vtpm-stubdom.
+
+* vtpm-stubdom:
+ A mini-os stub domain that implements a vTPM. There is a
+ one to one mapping between running vtpm-stubdom instances and
+ logical vtpms on the system. The vTPM Platform Configuration
+ Registers (PCRs) are normally all initialized to zero.
+
+* mini-os/tpmfront:
+ Mini-os TPM frontend driver. The vTPM mini-os domain
+ vtpm-stubdom uses this driver to communicate with
+ vtpmmgr-stubdom. This driver is also used in mini-os
+ domains such as pv-grub that talk to the vTPM domain.
+
+* vtpmmgr-stubdom:
+ A mini-os domain that implements the vTPM manager. There is
+ only one vTPM manager and it should be running during the
+ entire lifetime of the machine. This domain regulates
+ access to the physical TPM on the system and secures the
+ persistent state of each vTPM.
+
+* mini-os/tpm_tis:
+ Mini-os TPM version 1.2 TPM Interface Specification (TIS)
+ driver. This driver used by vtpmmgr-stubdom to talk directly to
+ the hardware TPM. Communication is facilitated by mapping
+ hardware memory pages into vtpmmgr-stubdom.
+
+* Hardware TPM:
+ The physical TPM that is soldered onto the motherboard.
+
+
+Integration With Xen
+--------------------
+
+Support for the vTPM driver was added in Xen using the libxl toolstack in Xen
+4.3. See the Xen documentation (docs/misc/vtpm.txt) for details on setting up
+the vTPM and vTPM Manager stub domains. Once the stub domains are running, a
+vTPM device is set up in the same manner as a disk or network device in the
+domain's configuration file.
+
+In order to use features such as IMA that require a TPM to be loaded prior to
+the initrd, the xen-tpmfront driver must be compiled in to the kernel. If not
+using such features, the driver can be compiled as a module and will be loaded
+as usual.
diff --git a/Documentation/security/tpm/xen-tpmfront.txt b/Documentation/security/tpm/xen-tpmfront.txt
deleted file mode 100644
index 69346de87ff3..000000000000
--- a/Documentation/security/tpm/xen-tpmfront.txt
+++ /dev/null
@@ -1,113 +0,0 @@
-Virtual TPM interface for Xen
-
-Authors: Matthew Fioravante (JHUAPL), Daniel De Graaf (NSA)
-
-This document describes the virtual Trusted Platform Module (vTPM) subsystem for
-Xen. The reader is assumed to have familiarity with building and installing Xen,
-Linux, and a basic understanding of the TPM and vTPM concepts.
-
-INTRODUCTION
-
-The goal of this work is to provide a TPM functionality to a virtual guest
-operating system (in Xen terms, a DomU). This allows programs to interact with
-a TPM in a virtual system the same way they interact with a TPM on the physical
-system. Each guest gets its own unique, emulated, software TPM. However, each
-of the vTPM's secrets (Keys, NVRAM, etc) are managed by a vTPM Manager domain,
-which seals the secrets to the Physical TPM. If the process of creating each of
-these domains (manager, vTPM, and guest) is trusted, the vTPM subsystem extends
-the chain of trust rooted in the hardware TPM to virtual machines in Xen. Each
-major component of vTPM is implemented as a separate domain, providing secure
-separation guaranteed by the hypervisor. The vTPM domains are implemented in
-mini-os to reduce memory and processor overhead.
-
-This mini-os vTPM subsystem was built on top of the previous vTPM work done by
-IBM and Intel corporation.
-
-
-DESIGN OVERVIEW
----------------
-
-The architecture of vTPM is described below:
-
-+------------------+
-| Linux DomU | ...
-| | ^ |
-| v | |
-| xen-tpmfront |
-+------------------+
- | ^
- v |
-+------------------+
-| mini-os/tpmback |
-| | ^ |
-| v | |
-| vtpm-stubdom | ...
-| | ^ |
-| v | |
-| mini-os/tpmfront |
-+------------------+
- | ^
- v |
-+------------------+
-| mini-os/tpmback |
-| | ^ |
-| v | |
-| vtpmmgr-stubdom |
-| | ^ |
-| v | |
-| mini-os/tpm_tis |
-+------------------+
- | ^
- v |
-+------------------+
-| Hardware TPM |
-+------------------+
-
- * Linux DomU: The Linux based guest that wants to use a vTPM. There may be
- more than one of these.
-
- * xen-tpmfront.ko: Linux kernel virtual TPM frontend driver. This driver
- provides vTPM access to a Linux-based DomU.
-
- * mini-os/tpmback: Mini-os TPM backend driver. The Linux frontend driver
- connects to this backend driver to facilitate communications
- between the Linux DomU and its vTPM. This driver is also
- used by vtpmmgr-stubdom to communicate with vtpm-stubdom.
-
- * vtpm-stubdom: A mini-os stub domain that implements a vTPM. There is a
- one to one mapping between running vtpm-stubdom instances and
- logical vtpms on the system. The vTPM Platform Configuration
- Registers (PCRs) are normally all initialized to zero.
-
- * mini-os/tpmfront: Mini-os TPM frontend driver. The vTPM mini-os domain
- vtpm-stubdom uses this driver to communicate with
- vtpmmgr-stubdom. This driver is also used in mini-os
- domains such as pv-grub that talk to the vTPM domain.
-
- * vtpmmgr-stubdom: A mini-os domain that implements the vTPM manager. There is
- only one vTPM manager and it should be running during the
- entire lifetime of the machine. This domain regulates
- access to the physical TPM on the system and secures the
- persistent state of each vTPM.
-
- * mini-os/tpm_tis: Mini-os TPM version 1.2 TPM Interface Specification (TIS)
- driver. This driver used by vtpmmgr-stubdom to talk directly to
- the hardware TPM. Communication is facilitated by mapping
- hardware memory pages into vtpmmgr-stubdom.
-
- * Hardware TPM: The physical TPM that is soldered onto the motherboard.
-
-
-INTEGRATION WITH XEN
---------------------
-
-Support for the vTPM driver was added in Xen using the libxl toolstack in Xen
-4.3. See the Xen documentation (docs/misc/vtpm.txt) for details on setting up
-the vTPM and vTPM Manager stub domains. Once the stub domains are running, a
-vTPM device is set up in the same manner as a disk or network device in the
-domain's configuration file.
-
-In order to use features such as IMA that require a TPM to be loaded prior to
-the initrd, the xen-tpmfront driver must be compiled in to the kernel. If not
-using such features, the driver can be compiled as a module and will be loaded
-as usual.
diff --git a/Documentation/serial/driver.rst b/Documentation/serial/driver.rst
deleted file mode 100644
index 4537119bf624..000000000000
--- a/Documentation/serial/driver.rst
+++ /dev/null
@@ -1,549 +0,0 @@
-====================
-Low Level Serial API
-====================
-
-
-This document is meant as a brief overview of some aspects of the new serial
-driver. It is not complete, any questions you have should be directed to
-<rmk@arm.linux.org.uk>
-
-The reference implementation is contained within amba-pl011.c.
-
-
-
-Low Level Serial Hardware Driver
---------------------------------
-
-The low level serial hardware driver is responsible for supplying port
-information (defined by uart_port) and a set of control methods (defined
-by uart_ops) to the core serial driver. The low level driver is also
-responsible for handling interrupts for the port, and providing any
-console support.
-
-
-Console Support
----------------
-
-The serial core provides a few helper functions. This includes identifing
-the correct port structure (via uart_get_console) and decoding command line
-arguments (uart_parse_options).
-
-There is also a helper function (uart_console_write) which performs a
-character by character write, translating newlines to CRLF sequences.
-Driver writers are recommended to use this function rather than implementing
-their own version.
-
-
-Locking
--------
-
-It is the responsibility of the low level hardware driver to perform the
-necessary locking using port->lock. There are some exceptions (which
-are described in the uart_ops listing below.)
-
-There are two locks. A per-port spinlock, and an overall semaphore.
-
-From the core driver perspective, the port->lock locks the following
-data::
-
- port->mctrl
- port->icount
- port->state->xmit.head (circ_buf->head)
- port->state->xmit.tail (circ_buf->tail)
-
-The low level driver is free to use this lock to provide any additional
-locking.
-
-The port_sem semaphore is used to protect against ports being added/
-removed or reconfigured at inappropriate times. Since v2.6.27, this
-semaphore has been the 'mutex' member of the tty_port struct, and
-commonly referred to as the port mutex.
-
-
-uart_ops
---------
-
-The uart_ops structure is the main interface between serial_core and the
-hardware specific driver. It contains all the methods to control the
-hardware.
-
- tx_empty(port)
- This function tests whether the transmitter fifo and shifter
- for the port described by 'port' is empty. If it is empty,
- this function should return TIOCSER_TEMT, otherwise return 0.
- If the port does not support this operation, then it should
- return TIOCSER_TEMT.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- This call must not sleep
-
- set_mctrl(port, mctrl)
- This function sets the modem control lines for port described
- by 'port' to the state described by mctrl. The relevant bits
- of mctrl are:
-
- - TIOCM_RTS RTS signal.
- - TIOCM_DTR DTR signal.
- - TIOCM_OUT1 OUT1 signal.
- - TIOCM_OUT2 OUT2 signal.
- - TIOCM_LOOP Set the port into loopback mode.
-
- If the appropriate bit is set, the signal should be driven
- active. If the bit is clear, the signal should be driven
- inactive.
-
- Locking: port->lock taken.
-
- Interrupts: locally disabled.
-
- This call must not sleep
-
- get_mctrl(port)
- Returns the current state of modem control inputs. The state
- of the outputs should not be returned, since the core keeps
- track of their state. The state information should include:
-
- - TIOCM_CAR state of DCD signal
- - TIOCM_CTS state of CTS signal
- - TIOCM_DSR state of DSR signal
- - TIOCM_RI state of RI signal
-
- The bit is set if the signal is currently driven active. If
- the port does not support CTS, DCD or DSR, the driver should
- indicate that the signal is permanently active. If RI is
- not available, the signal should not be indicated as active.
-
- Locking: port->lock taken.
-
- Interrupts: locally disabled.
-
- This call must not sleep
-
- stop_tx(port)
- Stop transmitting characters. This might be due to the CTS
- line becoming inactive or the tty layer indicating we want
- to stop transmission due to an XOFF character.
-
- The driver should stop transmitting characters as soon as
- possible.
-
- Locking: port->lock taken.
-
- Interrupts: locally disabled.
-
- This call must not sleep
-
- start_tx(port)
- Start transmitting characters.
-
- Locking: port->lock taken.
-
- Interrupts: locally disabled.
-
- This call must not sleep
-
- throttle(port)
- Notify the serial driver that input buffers for the line discipline are
- close to full, and it should somehow signal that no more characters
- should be sent to the serial port.
- This will be called only if hardware assisted flow control is enabled.
-
- Locking: serialized with .unthrottle() and termios modification by the
- tty layer.
-
- unthrottle(port)
- Notify the serial driver that characters can now be sent to the serial
- port without fear of overrunning the input buffers of the line
- disciplines.
-
- This will be called only if hardware assisted flow control is enabled.
-
- Locking: serialized with .throttle() and termios modification by the
- tty layer.
-
- send_xchar(port,ch)
- Transmit a high priority character, even if the port is stopped.
- This is used to implement XON/XOFF flow control and tcflow(). If
- the serial driver does not implement this function, the tty core
- will append the character to the circular buffer and then call
- start_tx() / stop_tx() to flush the data out.
-
- Do not transmit if ch == '\0' (__DISABLED_CHAR).
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- stop_rx(port)
- Stop receiving characters; the port is in the process of
- being closed.
-
- Locking: port->lock taken.
-
- Interrupts: locally disabled.
-
- This call must not sleep
-
- enable_ms(port)
- Enable the modem status interrupts.
-
- This method may be called multiple times. Modem status
- interrupts should be disabled when the shutdown method is
- called.
-
- Locking: port->lock taken.
-
- Interrupts: locally disabled.
-
- This call must not sleep
-
- break_ctl(port,ctl)
- Control the transmission of a break signal. If ctl is
- nonzero, the break signal should be transmitted. The signal
- should be terminated when another call is made with a zero
- ctl.
-
- Locking: caller holds tty_port->mutex
-
- startup(port)
- Grab any interrupt resources and initialise any low level driver
- state. Enable the port for reception. It should not activate
- RTS nor DTR; this will be done via a separate call to set_mctrl.
-
- This method will only be called when the port is initially opened.
-
- Locking: port_sem taken.
-
- Interrupts: globally disabled.
-
- shutdown(port)
- Disable the port, disable any break condition that may be in
- effect, and free any interrupt resources. It should not disable
- RTS nor DTR; this will have already been done via a separate
- call to set_mctrl.
-
- Drivers must not access port->state once this call has completed.
-
- This method will only be called when there are no more users of
- this port.
-
- Locking: port_sem taken.
-
- Interrupts: caller dependent.
-
- flush_buffer(port)
- Flush any write buffers, reset any DMA state and stop any
- ongoing DMA transfers.
-
- This will be called whenever the port->state->xmit circular
- buffer is cleared.
-
- Locking: port->lock taken.
-
- Interrupts: locally disabled.
-
- This call must not sleep
-
- set_termios(port,termios,oldtermios)
- Change the port parameters, including word length, parity, stop
- bits. Update read_status_mask and ignore_status_mask to indicate
- the types of events we are interested in receiving. Relevant
- termios->c_cflag bits are:
-
- CSIZE
- - word size
- CSTOPB
- - 2 stop bits
- PARENB
- - parity enable
- PARODD
- - odd parity (when PARENB is in force)
- CREAD
- - enable reception of characters (if not set,
- still receive characters from the port, but
- throw them away.
- CRTSCTS
- - if set, enable CTS status change reporting
- CLOCAL
- - if not set, enable modem status change
- reporting.
-
- Relevant termios->c_iflag bits are:
-
- INPCK
- - enable frame and parity error events to be
- passed to the TTY layer.
- BRKINT / PARMRK
- - both of these enable break events to be
- passed to the TTY layer.
-
- IGNPAR
- - ignore parity and framing errors
- IGNBRK
- - ignore break errors, If IGNPAR is also
- set, ignore overrun errors as well.
-
- The interaction of the iflag bits is as follows (parity error
- given as an example):
-
- =============== ======= ====== =============================
- Parity error INPCK IGNPAR
- =============== ======= ====== =============================
- n/a 0 n/a character received, marked as
- TTY_NORMAL
- None 1 n/a character received, marked as
- TTY_NORMAL
- Yes 1 0 character received, marked as
- TTY_PARITY
- Yes 1 1 character discarded
- =============== ======= ====== =============================
-
- Other flags may be used (eg, xon/xoff characters) if your
- hardware supports hardware "soft" flow control.
-
- Locking: caller holds tty_port->mutex
-
- Interrupts: caller dependent.
-
- This call must not sleep
-
- set_ldisc(port,termios)
- Notifier for discipline change. See Documentation/serial/tty.rst.
-
- Locking: caller holds tty_port->mutex
-
- pm(port,state,oldstate)
- Perform any power management related activities on the specified
- port. State indicates the new state (defined by
- enum uart_pm_state), oldstate indicates the previous state.
-
- This function should not be used to grab any resources.
-
- This will be called when the port is initially opened and finally
- closed, except when the port is also the system console. This
- will occur even if CONFIG_PM is not set.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- type(port)
- Return a pointer to a string constant describing the specified
- port, or return NULL, in which case the string 'unknown' is
- substituted.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- release_port(port)
- Release any memory and IO region resources currently in use by
- the port.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- request_port(port)
- Request any memory and IO region resources required by the port.
- If any fail, no resources should be registered when this function
- returns, and it should return -EBUSY on failure.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- config_port(port,type)
- Perform any autoconfiguration steps required for the port. `type`
- contains a bit mask of the required configuration. UART_CONFIG_TYPE
- indicates that the port requires detection and identification.
- port->type should be set to the type found, or PORT_UNKNOWN if
- no port was detected.
-
- UART_CONFIG_IRQ indicates autoconfiguration of the interrupt signal,
- which should be probed using standard kernel autoprobing techniques.
- This is not necessary on platforms where ports have interrupts
- internally hard wired (eg, system on a chip implementations).
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- verify_port(port,serinfo)
- Verify the new serial port information contained within serinfo is
- suitable for this port type.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- ioctl(port,cmd,arg)
- Perform any port specific IOCTLs. IOCTL commands must be defined
- using the standard numbering system found in <asm/ioctl.h>
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- poll_init(port)
- Called by kgdb to perform the minimal hardware initialization needed
- to support poll_put_char() and poll_get_char(). Unlike ->startup()
- this should not request interrupts.
-
- Locking: tty_mutex and tty_port->mutex taken.
-
- Interrupts: n/a.
-
- poll_put_char(port,ch)
- Called by kgdb to write a single character directly to the serial
- port. It can and should block until there is space in the TX FIFO.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- This call must not sleep
-
- poll_get_char(port)
- Called by kgdb to read a single character directly from the serial
- port. If data is available, it should be returned; otherwise
- the function should return NO_POLL_CHAR immediately.
-
- Locking: none.
-
- Interrupts: caller dependent.
-
- This call must not sleep
-
-Other functions
----------------
-
-uart_update_timeout(port,cflag,baud)
- Update the FIFO drain timeout, port->timeout, according to the
- number of bits, parity, stop bits and baud rate.
-
- Locking: caller is expected to take port->lock
-
- Interrupts: n/a
-
-uart_get_baud_rate(port,termios,old,min,max)
- Return the numeric baud rate for the specified termios, taking
- account of the special 38400 baud "kludge". The B0 baud rate
- is mapped to 9600 baud.
-
- If the baud rate is not within min..max, then if old is non-NULL,
- the original baud rate will be tried. If that exceeds the
- min..max constraint, 9600 baud will be returned. termios will
- be updated to the baud rate in use.
-
- Note: min..max must always allow 9600 baud to be selected.
-
- Locking: caller dependent.
-
- Interrupts: n/a
-
-uart_get_divisor(port,baud)
- Return the divisor (baud_base / baud) for the specified baud
- rate, appropriately rounded.
-
- If 38400 baud and custom divisor is selected, return the
- custom divisor instead.
-
- Locking: caller dependent.
-
- Interrupts: n/a
-
-uart_match_port(port1,port2)
- This utility function can be used to determine whether two
- uart_port structures describe the same port.
-
- Locking: n/a
-
- Interrupts: n/a
-
-uart_write_wakeup(port)
- A driver is expected to call this function when the number of
- characters in the transmit buffer have dropped below a threshold.
-
- Locking: port->lock should be held.
-
- Interrupts: n/a
-
-uart_register_driver(drv)
- Register a uart driver with the core driver. We in turn register
- with the tty layer, and initialise the core driver per-port state.
-
- drv->port should be NULL, and the per-port structures should be
- registered using uart_add_one_port after this call has succeeded.
-
- Locking: none
-
- Interrupts: enabled
-
-uart_unregister_driver()
- Remove all references to a driver from the core driver. The low
- level driver must have removed all its ports via the
- uart_remove_one_port() if it registered them with uart_add_one_port().
-
- Locking: none
-
- Interrupts: enabled
-
-**uart_suspend_port()**
-
-**uart_resume_port()**
-
-**uart_add_one_port()**
-
-**uart_remove_one_port()**
-
-Other notes
------------
-
-It is intended some day to drop the 'unused' entries from uart_port, and
-allow low level drivers to register their own individual uart_port's with
-the core. This will allow drivers to use uart_port as a pointer to a
-structure containing both the uart_port entry with their own extensions,
-thus::
-
- struct my_port {
- struct uart_port port;
- int my_stuff;
- };
-
-Modem control lines via GPIO
-----------------------------
-
-Some helpers are provided in order to set/get modem control lines via GPIO.
-
-mctrl_gpio_init(port, idx):
- This will get the {cts,rts,...}-gpios from device tree if they are
- present and request them, set direction etc, and return an
- allocated structure. `devm_*` functions are used, so there's no need
- to call mctrl_gpio_free().
- As this sets up the irq handling make sure to not handle changes to the
- gpio input lines in your driver, too.
-
-mctrl_gpio_free(dev, gpios):
- This will free the requested gpios in mctrl_gpio_init().
- As `devm_*` functions are used, there's generally no need to call
- this function.
-
-mctrl_gpio_to_gpiod(gpios, gidx)
- This returns the gpio_desc structure associated to the modem line
- index.
-
-mctrl_gpio_set(gpios, mctrl):
- This will sets the gpios according to the mctrl state.
-
-mctrl_gpio_get(gpios, mctrl):
- This will update mctrl with the gpios values.
-
-mctrl_gpio_enable_ms(gpios):
- Enables irqs and handling of changes to the ms lines.
-
-mctrl_gpio_disable_ms(gpios):
- Disables irqs and handling of changes to the ms lines.
diff --git a/Documentation/serial/index.rst b/Documentation/serial/index.rst
deleted file mode 100644
index d0ba22ea23bf..000000000000
--- a/Documentation/serial/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-:orphan:
-
-==========================
-Support for Serial devices
-==========================
-
-.. toctree::
- :maxdepth: 1
-
-
- driver
- tty
-
-Serial drivers
-==============
-
-.. toctree::
- :maxdepth: 1
-
- cyclades_z
- moxa-smartio
- n_gsm
- rocket
- serial-iso7816
- serial-rs485
-
-.. only:: subproject and html
-
- Indices
- =======
-
- * :ref:`genindex`
diff --git a/Documentation/sparc/index.rst b/Documentation/sparc/index.rst
index 91f7d6643dd5..71cff621f243 100644
--- a/Documentation/sparc/index.rst
+++ b/Documentation/sparc/index.rst
@@ -1,5 +1,3 @@
-:orphan:
-
==================
Sparc Architecture
==================
diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
deleted file mode 100644
index 30d6a64e53f7..000000000000
--- a/Documentation/switchtec.txt
+++ /dev/null
@@ -1,102 +0,0 @@
-========================
-Linux Switchtec Support
-========================
-
-Microsemi's "Switchtec" line of PCI switch devices is already
-supported by the kernel with standard PCI switch drivers. However, the
-Switchtec device advertises a special management endpoint which
-enables some additional functionality. This includes:
-
-* Packet and Byte Counters
-* Firmware Upgrades
-* Event and Error logs
-* Querying port link status
-* Custom user firmware commands
-
-The switchtec kernel module implements this functionality.
-
-
-Interface
-=========
-
-The primary means of communicating with the Switchtec management firmware is
-through the Memory-mapped Remote Procedure Call (MRPC) interface.
-Commands are submitted to the interface with a 4-byte command
-identifier and up to 1KB of command specific data. The firmware will
-respond with a 4-byte return code and up to 1KB of command-specific
-data. The interface only processes a single command at a time.
-
-
-Userspace Interface
-===================
-
-The MRPC interface will be exposed to userspace through a simple char
-device: /dev/switchtec#, one for each management endpoint in the system.
-
-The char device has the following semantics:
-
-* A write must consist of at least 4 bytes and no more than 1028 bytes.
- The first 4 bytes will be interpreted as the Command ID and the
- remainder will be used as the input data. A write will send the
- command to the firmware to begin processing.
-
-* Each write must be followed by exactly one read. Any double write will
- produce an error and any read that doesn't follow a write will
- produce an error.
-
-* A read will block until the firmware completes the command and return
- the 4-byte Command Return Value plus up to 1024 bytes of output
- data. (The length will be specified by the size parameter of the read
- call -- reading less than 4 bytes will produce an error.)
-
-* The poll call will also be supported for userspace applications that
- need to do other things while waiting for the command to complete.
-
-The following IOCTLs are also supported by the device:
-
-* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number
- of partitions in the device.
-
-* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for
- any specified partition in flash.
-
-* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps
- indicating all uncleared events.
-
-* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags
- for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct
- with the event_id, index and flags set (index being the partition or PFF
- number for non-global events). It returns whether the event has
- occurred, the number of times and any event specific data. The flags
- can be used to clear the count or enable and disable actions to
- happen when the event occurs.
- By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag,
- you can set an event to trigger a poll command to return with
- POLLPRI. In this way, userspace can wait for events to occur.
-
-* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert
- between PCI Function Framework number (used by the event system)
- and Switchtec Logic Port ID and Partition number (which is more
- user friendly).
-
-
-Non-Transparent Bridge (NTB) Driver
-===================================
-
-An NTB hardware driver is provided for the Switchtec hardware in
-ntb_hw_switchtec. Currently, it only supports switches configured with
-exactly 2 NT partitions and zero or more non-NT partitions. It also requires
-the following configuration settings:
-
-* Both NT partitions must be able to access each other's GAS spaces.
- Thus, the bits in the GAS Access Vector under Management Settings
- must be set to support this.
-* Kernel configuration MUST include support for NTB (CONFIG_NTB needs
- to be set)
-
-NT EP BAR 2 will be dynamically configured as a Direct Window, and
-the configuration file does not need to configure it explicitly.
-
-Please refer to Documentation/ntb.txt in Linux source tree for an overall
-understanding of the Linux NTB stack. ntb_hw_switchtec works as an NTB
-Hardware Driver in this stack.
diff --git a/Documentation/sysctl/README b/Documentation/sysctl/README
deleted file mode 100644
index d5f24ab0ecc3..000000000000
--- a/Documentation/sysctl/README
+++ /dev/null
@@ -1,76 +0,0 @@
-Documentation for /proc/sys/ kernel version 2.2.10
- (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
-
-'Why', I hear you ask, 'would anyone even _want_ documentation
-for them sysctl files? If anybody really needs it, it's all in
-the source...'
-
-Well, this documentation is written because some people either
-don't know they need to tweak something, or because they don't
-have the time or knowledge to read the source code.
-
-Furthermore, the programmers who built sysctl have built it to
-be actually used, not just for the fun of programming it :-)
-
-==============================================================
-
-Legal blurb:
-
-As usual, there are two main things to consider:
-1. you get what you pay for
-2. it's free
-
-The consequences are that I won't guarantee the correctness of
-this document, and if you come to me complaining about how you
-screwed up your system because of wrong documentation, I won't
-feel sorry for you. I might even laugh at you...
-
-But of course, if you _do_ manage to screw up your system using
-only the sysctl options used in this file, I'd like to hear of
-it. Not only to have a great laugh, but also to make sure that
-you're the last RTFMing person to screw up.
-
-In short, e-mail your suggestions, corrections and / or horror
-stories to: <riel@nl.linux.org>
-
-Rik van Riel.
-
-==============================================================
-
-Introduction:
-
-Sysctl is a means of configuring certain aspects of the kernel
-at run-time, and the /proc/sys/ directory is there so that you
-don't even need special tools to do it!
-In fact, there are only four things needed to use these config
-facilities:
-- a running Linux system
-- root access
-- common sense (this is especially hard to come by these days)
-- knowledge of what all those values mean
-
-As a quick 'ls /proc/sys' will show, the directory consists of
-several (arch-dependent?) subdirs. Each subdir is mainly about
-one part of the kernel, so you can do configuration on a piece
-by piece basis, or just some 'thematic frobbing'.
-
-The subdirs are about:
-abi/ execution domains & personalities
-debug/ <empty>
-dev/ device specific information (eg dev/cdrom/info)
-fs/ specific filesystems
- filehandle, inode, dentry and quota tuning
- binfmt_misc <Documentation/admin-guide/binfmt-misc.rst>
-kernel/ global kernel info / tuning
- miscellaneous stuff
-net/ networking stuff, for documentation look in:
- <Documentation/networking/>
-proc/ <empty>
-sunrpc/ SUN Remote Procedure Call (NFS)
-vm/ memory management tuning
- buffer and cache management
-user/ Per user per user namespace limits
-
-These are the subdirs I have on my system. There might be more
-or other subdirs in another setup. If you see another dir, I'd
-really like to hear about it :-)
diff --git a/Documentation/sysctl/abi.txt b/Documentation/sysctl/abi.txt
deleted file mode 100644
index 63f4ebcf652c..000000000000
--- a/Documentation/sysctl/abi.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-Documentation for /proc/sys/abi/* kernel version 2.6.0.test2
- (c) 2003, Fabian Frederick <ffrederick@users.sourceforge.net>
-
-For general info : README.
-
-==============================================================
-
-This path is binary emulation relevant aka personality types aka abi.
-When a process is executed, it's linked to an exec_domain whose
-personality is defined using values available from /proc/sys/abi.
-You can find further details about abi in include/linux/personality.h.
-
-Here are the files featuring in 2.6 kernel :
-
-- defhandler_coff
-- defhandler_elf
-- defhandler_lcall7
-- defhandler_libcso
-- fake_utsname
-- trace
-
-===========================================================
-defhandler_coff:
-defined value :
-PER_SCOSVR3
-0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE
-
-===========================================================
-defhandler_elf:
-defined value :
-PER_LINUX
-0
-
-===========================================================
-defhandler_lcall7:
-defined value :
-PER_SVR4
-0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
-
-===========================================================
-defhandler_libsco:
-defined value:
-PER_SVR4
-0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
-
-===========================================================
-fake_utsname:
-Unused
-
-===========================================================
-trace:
-Unused
-
-===========================================================
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
deleted file mode 100644
index ebc679bcb2dc..000000000000
--- a/Documentation/sysctl/fs.txt
+++ /dev/null
@@ -1,374 +0,0 @@
-Documentation for /proc/sys/fs/* kernel version 2.2.10
- (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
- (c) 2009, Shen Feng<shen@cn.fujitsu.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains documentation for the sysctl files in
-/proc/sys/fs/ and is valid for Linux kernel version 2.2.
-
-The files in this directory can be used to tune and monitor
-miscellaneous and general things in the operation of the Linux
-kernel. Since some of the files _can_ be used to screw up your
-system, it is advisable to read both documentation and source
-before actually making adjustments.
-
-1. /proc/sys/fs
-----------------------------------------------------------
-
-Currently, these files are in /proc/sys/fs:
-- aio-max-nr
-- aio-nr
-- dentry-state
-- dquot-max
-- dquot-nr
-- file-max
-- file-nr
-- inode-max
-- inode-nr
-- inode-state
-- nr_open
-- overflowuid
-- overflowgid
-- pipe-user-pages-hard
-- pipe-user-pages-soft
-- protected_fifos
-- protected_hardlinks
-- protected_regular
-- protected_symlinks
-- suid_dumpable
-- super-max
-- super-nr
-
-==============================================================
-
-aio-nr & aio-max-nr:
-
-aio-nr is the running total of the number of events specified on the
-io_setup system call for all currently active aio contexts. If aio-nr
-reaches aio-max-nr then io_setup will fail with EAGAIN. Note that
-raising aio-max-nr does not result in the pre-allocation or re-sizing
-of any kernel data structures.
-
-==============================================================
-
-dentry-state:
-
-From linux/include/linux/dcache.h:
---------------------------------------------------------------
-struct dentry_stat_t dentry_stat {
- int nr_dentry;
- int nr_unused;
- int age_limit; /* age in seconds */
- int want_pages; /* pages requested by system */
- int nr_negative; /* # of unused negative dentries */
- int dummy; /* Reserved for future use */
-};
---------------------------------------------------------------
-
-Dentries are dynamically allocated and deallocated.
-
-nr_dentry shows the total number of dentries allocated (active
-+ unused). nr_unused shows the number of dentries that are not
-actively used, but are saved in the LRU list for future reuse.
-
-Age_limit is the age in seconds after which dcache entries
-can be reclaimed when memory is short and want_pages is
-nonzero when shrink_dcache_pages() has been called and the
-dcache isn't pruned yet.
-
-nr_negative shows the number of unused dentries that are also
-negative dentries which do not map to any files. Instead,
-they help speeding up rejection of non-existing files provided
-by the users.
-
-==============================================================
-
-dquot-max & dquot-nr:
-
-The file dquot-max shows the maximum number of cached disk
-quota entries.
-
-The file dquot-nr shows the number of allocated disk quota
-entries and the number of free disk quota entries.
-
-If the number of free cached disk quotas is very low and
-you have some awesome number of simultaneous system users,
-you might want to raise the limit.
-
-==============================================================
-
-file-max & file-nr:
-
-The value in file-max denotes the maximum number of file-
-handles that the Linux kernel will allocate. When you get lots
-of error messages about running out of file handles, you might
-want to increase this limit.
-
-Historically,the kernel was able to allocate file handles
-dynamically, but not to free them again. The three values in
-file-nr denote the number of allocated file handles, the number
-of allocated but unused file handles, and the maximum number of
-file handles. Linux 2.6 always reports 0 as the number of free
-file handles -- this is not an error, it just means that the
-number of allocated file handles exactly matches the number of
-used file handles.
-
-Attempts to allocate more file descriptors than file-max are
-reported with printk, look for "VFS: file-max limit <number>
-reached".
-==============================================================
-
-nr_open:
-
-This denotes the maximum number of file-handles a process can
-allocate. Default value is 1024*1024 (1048576) which should be
-enough for most machines. Actual limit depends on RLIMIT_NOFILE
-resource limit.
-
-==============================================================
-
-inode-max, inode-nr & inode-state:
-
-As with file handles, the kernel allocates the inode structures
-dynamically, but can't free them yet.
-
-The value in inode-max denotes the maximum number of inode
-handlers. This value should be 3-4 times larger than the value
-in file-max, since stdin, stdout and network sockets also
-need an inode struct to handle them. When you regularly run
-out of inodes, you need to increase this value.
-
-The file inode-nr contains the first two items from
-inode-state, so we'll skip to that file...
-
-Inode-state contains three actual numbers and four dummies.
-The actual numbers are, in order of appearance, nr_inodes,
-nr_free_inodes and preshrink.
-
-Nr_inodes stands for the number of inodes the system has
-allocated, this can be slightly more than inode-max because
-Linux allocates them one pageful at a time.
-
-Nr_free_inodes represents the number of free inodes (?) and
-preshrink is nonzero when the nr_inodes > inode-max and the
-system needs to prune the inode list instead of allocating
-more.
-
-==============================================================
-
-overflowgid & overflowuid:
-
-Some filesystems only support 16-bit UIDs and GIDs, although in Linux
-UIDs and GIDs are 32 bits. When one of these filesystems is mounted
-with writes enabled, any UID or GID that would exceed 65535 is translated
-to a fixed value before being written to disk.
-
-These sysctls allow you to change the value of the fixed UID and GID.
-The default is 65534.
-
-==============================================================
-
-pipe-user-pages-hard:
-
-Maximum total number of pages a non-privileged user may allocate for pipes.
-Once this limit is reached, no new pipes may be allocated until usage goes
-below the limit again. When set to 0, no limit is applied, which is the default
-setting.
-
-==============================================================
-
-pipe-user-pages-soft:
-
-Maximum total number of pages a non-privileged user may allocate for pipes
-before the pipe size gets limited to a single page. Once this limit is reached,
-new pipes will be limited to a single page in size for this user in order to
-limit total memory usage, and trying to increase them using fcntl() will be
-denied until usage goes below the limit again. The default value allows to
-allocate up to 1024 pipes at their default size. When set to 0, no limit is
-applied.
-
-==============================================================
-
-protected_fifos:
-
-The intent of this protection is to avoid unintentional writes to
-an attacker-controlled FIFO, where a program expected to create a regular
-file.
-
-When set to "0", writing to FIFOs is unrestricted.
-
-When set to "1" don't allow O_CREAT open on FIFOs that we don't own
-in world writable sticky directories, unless they are owned by the
-owner of the directory.
-
-When set to "2" it also applies to group writable sticky directories.
-
-This protection is based on the restrictions in Openwall.
-
-==============================================================
-
-protected_hardlinks:
-
-A long-standing class of security issues is the hardlink-based
-time-of-check-time-of-use race, most commonly seen in world-writable
-directories like /tmp. The common method of exploitation of this flaw
-is to cross privilege boundaries when following a given hardlink (i.e. a
-root process follows a hardlink created by another user). Additionally,
-on systems without separated partitions, this stops unauthorized users
-from "pinning" vulnerable setuid/setgid files against being upgraded by
-the administrator, or linking to special files.
-
-When set to "0", hardlink creation behavior is unrestricted.
-
-When set to "1" hardlinks cannot be created by users if they do not
-already own the source file, or do not have read/write access to it.
-
-This protection is based on the restrictions in Openwall and grsecurity.
-
-==============================================================
-
-protected_regular:
-
-This protection is similar to protected_fifos, but it
-avoids writes to an attacker-controlled regular file, where a program
-expected to create one.
-
-When set to "0", writing to regular files is unrestricted.
-
-When set to "1" don't allow O_CREAT open on regular files that we
-don't own in world writable sticky directories, unless they are
-owned by the owner of the directory.
-
-When set to "2" it also applies to group writable sticky directories.
-
-==============================================================
-
-protected_symlinks:
-
-A long-standing class of security issues is the symlink-based
-time-of-check-time-of-use race, most commonly seen in world-writable
-directories like /tmp. The common method of exploitation of this flaw
-is to cross privilege boundaries when following a given symlink (i.e. a
-root process follows a symlink belonging to another user). For a likely
-incomplete list of hundreds of examples across the years, please see:
-http://cve.mitre.org/cgi-bin/cvekey.cgi?keyword=/tmp
-
-When set to "0", symlink following behavior is unrestricted.
-
-When set to "1" symlinks are permitted to be followed only when outside
-a sticky world-writable directory, or when the uid of the symlink and
-follower match, or when the directory owner matches the symlink's owner.
-
-This protection is based on the restrictions in Openwall and grsecurity.
-
-==============================================================
-
-suid_dumpable:
-
-This value can be used to query and set the core dump mode for setuid
-or otherwise protected/tainted binaries. The modes are
-
-0 - (default) - traditional behaviour. Any process which has changed
- privilege levels or is execute only will not be dumped.
-1 - (debug) - all processes dump core when possible. The core dump is
- owned by the current user and no security is applied. This is
- intended for system debugging situations only. Ptrace is unchecked.
- This is insecure as it allows regular users to examine the memory
- contents of privileged processes.
-2 - (suidsafe) - any binary which normally would not be dumped is dumped
- anyway, but only if the "core_pattern" kernel sysctl is set to
- either a pipe handler or a fully qualified path. (For more details
- on this limitation, see CVE-2006-2451.) This mode is appropriate
- when administrators are attempting to debug problems in a normal
- environment, and either have a core dump pipe handler that knows
- to treat privileged core dumps with care, or specific directory
- defined for catching core dumps. If a core dump happens without
- a pipe handler or fully qualifid path, a message will be emitted
- to syslog warning about the lack of a correct setting.
-
-==============================================================
-
-super-max & super-nr:
-
-These numbers control the maximum number of superblocks, and
-thus the maximum number of mounted filesystems the kernel
-can have. You only need to increase super-max if you need to
-mount more filesystems than the current value in super-max
-allows you to.
-
-==============================================================
-
-aio-nr & aio-max-nr:
-
-aio-nr shows the current system-wide number of asynchronous io
-requests. aio-max-nr allows you to change the maximum value
-aio-nr can grow to.
-
-==============================================================
-
-mount-max:
-
-This denotes the maximum number of mounts that may exist
-in a mount namespace.
-
-==============================================================
-
-
-2. /proc/sys/fs/binfmt_misc
-----------------------------------------------------------
-
-Documentation for the files in /proc/sys/fs/binfmt_misc is
-in Documentation/admin-guide/binfmt-misc.rst.
-
-
-3. /proc/sys/fs/mqueue - POSIX message queues filesystem
-----------------------------------------------------------
-
-The "mqueue" filesystem provides the necessary kernel features to enable the
-creation of a user space library that implements the POSIX message queues
-API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System
-Interfaces specification.)
-
-The "mqueue" filesystem contains values for determining/setting the amount of
-resources used by the file system.
-
-/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the
-maximum number of message queues allowed on the system.
-
-/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the
-maximum number of messages in a queue value. In fact it is the limiting value
-for another (user) limit which is set in mq_open invocation. This attribute of
-a queue must be less or equal then msg_max.
-
-/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the
-maximum message size value (it is every message queue's attribute set during
-its creation).
-
-/proc/sys/fs/mqueue/msg_default is a read/write file for setting/getting the
-default number of messages in a queue value if attr parameter of mq_open(2) is
-NULL. If it exceed msg_max, the default value is initialized msg_max.
-
-/proc/sys/fs/mqueue/msgsize_default is a read/write file for setting/getting
-the default message size value if attr parameter of mq_open(2) is NULL. If it
-exceed msgsize_max, the default value is initialized msgsize_max.
-
-4. /proc/sys/fs/epoll - Configuration options for the epoll interface
---------------------------------------------------------
-
-This directory contains configuration options for the epoll(7) interface.
-
-max_user_watches
-----------------
-
-Every epoll file descriptor can store a number of files to be monitored
-for event readiness. Each one of these monitored files constitutes a "watch".
-This configuration option sets the maximum number of "watches" that are
-allowed for each user.
-Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes
-on a 64bit one.
-The current default value for max_user_watches is the 1/32 of the available
-low memory, divided for the "watch" cost in bytes.
-
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
deleted file mode 100644
index 92f7f34b021a..000000000000
--- a/Documentation/sysctl/kernel.txt
+++ /dev/null
@@ -1,1145 +0,0 @@
-Documentation for /proc/sys/kernel/* kernel version 2.2.10
- (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
- (c) 2009, Shen Feng<shen@cn.fujitsu.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains documentation for the sysctl files in
-/proc/sys/kernel/ and is valid for Linux kernel version 2.2.
-
-The files in this directory can be used to tune and monitor
-miscellaneous and general things in the operation of the Linux
-kernel. Since some of the files _can_ be used to screw up your
-system, it is advisable to read both documentation and source
-before actually making adjustments.
-
-Currently, these files might (depending on your configuration)
-show up in /proc/sys/kernel:
-
-- acct
-- acpi_video_flags
-- auto_msgmni
-- bootloader_type [ X86 only ]
-- bootloader_version [ X86 only ]
-- callhome [ S390 only ]
-- cap_last_cap
-- core_pattern
-- core_pipe_limit
-- core_uses_pid
-- ctrl-alt-del
-- dmesg_restrict
-- domainname
-- hostname
-- hotplug
-- hardlockup_all_cpu_backtrace
-- hardlockup_panic
-- hung_task_panic
-- hung_task_check_count
-- hung_task_timeout_secs
-- hung_task_check_interval_secs
-- hung_task_warnings
-- hyperv_record_panic_msg
-- kexec_load_disabled
-- kptr_restrict
-- l2cr [ PPC only ]
-- modprobe ==> Documentation/debugging-modules.txt
-- modules_disabled
-- msg_next_id [ sysv ipc ]
-- msgmax
-- msgmnb
-- msgmni
-- nmi_watchdog
-- osrelease
-- ostype
-- overflowgid
-- overflowuid
-- panic
-- panic_on_oops
-- panic_on_stackoverflow
-- panic_on_unrecovered_nmi
-- panic_on_warn
-- panic_print
-- panic_on_rcu_stall
-- perf_cpu_time_max_percent
-- perf_event_paranoid
-- perf_event_max_stack
-- perf_event_mlock_kb
-- perf_event_max_contexts_per_stack
-- pid_max
-- powersave-nap [ PPC only ]
-- printk
-- printk_delay
-- printk_ratelimit
-- printk_ratelimit_burst
-- pty ==> Documentation/filesystems/devpts.txt
-- randomize_va_space
-- real-root-dev ==> Documentation/admin-guide/initrd.rst
-- reboot-cmd [ SPARC only ]
-- rtsig-max
-- rtsig-nr
-- sched_energy_aware
-- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst
-- sem
-- sem_next_id [ sysv ipc ]
-- sg-big-buff [ generic SCSI device (sg) ]
-- shm_next_id [ sysv ipc ]
-- shm_rmid_forced
-- shmall
-- shmmax [ sysv ipc ]
-- shmmni
-- softlockup_all_cpu_backtrace
-- soft_watchdog
-- stack_erasing
-- stop-a [ SPARC only ]
-- sysrq ==> Documentation/admin-guide/sysrq.rst
-- sysctl_writes_strict
-- tainted ==> Documentation/admin-guide/tainted-kernels.rst
-- threads-max
-- unknown_nmi_panic
-- watchdog
-- watchdog_thresh
-- version
-
-==============================================================
-
-acct:
-
-highwater lowwater frequency
-
-If BSD-style process accounting is enabled these values control
-its behaviour. If free space on filesystem where the log lives
-goes below <lowwater>% accounting suspends. If free space gets
-above <highwater>% accounting resumes. <Frequency> determines
-how often do we check the amount of free space (value is in
-seconds). Default:
-4 2 30
-That is, suspend accounting if there left <= 2% free; resume it
-if we got >=4%; consider information about amount of free space
-valid for 30 seconds.
-
-==============================================================
-
-acpi_video_flags:
-
-flags
-
-See Doc*/kernel/power/video.txt, it allows mode of video boot to be
-set during run time.
-
-==============================================================
-
-auto_msgmni:
-
-This variable has no effect and may be removed in future kernel
-releases. Reading it always returns 0.
-Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni
-upon memory add/remove or upon ipc namespace creation/removal.
-Echoing "1" into this file enabled msgmni automatic recomputing.
-Echoing "0" turned it off. auto_msgmni default value was 1.
-
-
-==============================================================
-
-bootloader_type:
-
-x86 bootloader identification
-
-This gives the bootloader type number as indicated by the bootloader,
-shifted left by 4, and OR'd with the low four bits of the bootloader
-version. The reason for this encoding is that this used to match the
-type_of_loader field in the kernel header; the encoding is kept for
-backwards compatibility. That is, if the full bootloader type number
-is 0x15 and the full version number is 0x234, this file will contain
-the value 340 = 0x154.
-
-See the type_of_loader and ext_loader_type fields in
-Documentation/x86/boot.rst for additional information.
-
-==============================================================
-
-bootloader_version:
-
-x86 bootloader version
-
-The complete bootloader version number. In the example above, this
-file will contain the value 564 = 0x234.
-
-See the type_of_loader and ext_loader_ver fields in
-Documentation/x86/boot.rst for additional information.
-
-==============================================================
-
-callhome:
-
-Controls the kernel's callhome behavior in case of a kernel panic.
-
-The s390 hardware allows an operating system to send a notification
-to a service organization (callhome) in case of an operating system panic.
-
-When the value in this file is 0 (which is the default behavior)
-nothing happens in case of a kernel panic. If this value is set to "1"
-the complete kernel oops message is send to the IBM customer service
-organization in case the mainframe the Linux operating system is running
-on has a service contract with IBM.
-
-==============================================================
-
-cap_last_cap
-
-Highest valid capability of the running kernel. Exports
-CAP_LAST_CAP from the kernel.
-
-==============================================================
-
-core_pattern:
-
-core_pattern is used to specify a core dumpfile pattern name.
-. max length 127 characters; default value is "core"
-. core_pattern is used as a pattern template for the output filename;
- certain string patterns (beginning with '%') are substituted with
- their actual values.
-. backward compatibility with core_uses_pid:
- If core_pattern does not include "%p" (default does not)
- and core_uses_pid is set, then .PID will be appended to
- the filename.
-. corename format specifiers:
- %<NUL> '%' is dropped
- %% output one '%'
- %p pid
- %P global pid (init PID namespace)
- %i tid
- %I global tid (init PID namespace)
- %u uid (in initial user namespace)
- %g gid (in initial user namespace)
- %d dump mode, matches PR_SET_DUMPABLE and
- /proc/sys/fs/suid_dumpable
- %s signal number
- %t UNIX time of dump
- %h hostname
- %e executable filename (may be shortened)
- %E executable path
- %<OTHER> both are dropped
-. If the first character of the pattern is a '|', the kernel will treat
- the rest of the pattern as a command to run. The core dump will be
- written to the standard input of that program instead of to a file.
-
-==============================================================
-
-core_pipe_limit:
-
-This sysctl is only applicable when core_pattern is configured to pipe
-core files to a user space helper (when the first character of
-core_pattern is a '|', see above). When collecting cores via a pipe
-to an application, it is occasionally useful for the collecting
-application to gather data about the crashing process from its
-/proc/pid directory. In order to do this safely, the kernel must wait
-for the collecting process to exit, so as not to remove the crashing
-processes proc files prematurely. This in turn creates the
-possibility that a misbehaving userspace collecting process can block
-the reaping of a crashed process simply by never exiting. This sysctl
-defends against that. It defines how many concurrent crashing
-processes may be piped to user space applications in parallel. If
-this value is exceeded, then those crashing processes above that value
-are noted via the kernel log and their cores are skipped. 0 is a
-special value, indicating that unlimited processes may be captured in
-parallel, but that no waiting will take place (i.e. the collecting
-process is not guaranteed access to /proc/<crashing pid>/). This
-value defaults to 0.
-
-==============================================================
-
-core_uses_pid:
-
-The default coredump filename is "core". By setting
-core_uses_pid to 1, the coredump filename becomes core.PID.
-If core_pattern does not include "%p" (default does not)
-and core_uses_pid is set, then .PID will be appended to
-the filename.
-
-==============================================================
-
-ctrl-alt-del:
-
-When the value in this file is 0, ctrl-alt-del is trapped and
-sent to the init(1) program to handle a graceful restart.
-When, however, the value is > 0, Linux's reaction to a Vulcan
-Nerve Pinch (tm) will be an immediate reboot, without even
-syncing its dirty buffers.
-
-Note: when a program (like dosemu) has the keyboard in 'raw'
-mode, the ctrl-alt-del is intercepted by the program before it
-ever reaches the kernel tty layer, and it's up to the program
-to decide what to do with it.
-
-==============================================================
-
-dmesg_restrict:
-
-This toggle indicates whether unprivileged users are prevented
-from using dmesg(8) to view messages from the kernel's log buffer.
-When dmesg_restrict is set to (0) there are no restrictions. When
-dmesg_restrict is set set to (1), users must have CAP_SYSLOG to use
-dmesg(8).
-
-The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the
-default value of dmesg_restrict.
-
-==============================================================
-
-domainname & hostname:
-
-These files can be used to set the NIS/YP domainname and the
-hostname of your box in exactly the same way as the commands
-domainname and hostname, i.e.:
-# echo "darkstar" > /proc/sys/kernel/hostname
-# echo "mydomain" > /proc/sys/kernel/domainname
-has the same effect as
-# hostname "darkstar"
-# domainname "mydomain"
-
-Note, however, that the classic darkstar.frop.org has the
-hostname "darkstar" and DNS (Internet Domain Name Server)
-domainname "frop.org", not to be confused with the NIS (Network
-Information Service) or YP (Yellow Pages) domainname. These two
-domain names are in general different. For a detailed discussion
-see the hostname(1) man page.
-
-==============================================================
-hardlockup_all_cpu_backtrace:
-
-This value controls the hard lockup detector behavior when a hard
-lockup condition is detected as to whether or not to gather further
-debug information. If enabled, arch-specific all-CPU stack dumping
-will be initiated.
-
-0: do nothing. This is the default behavior.
-
-1: on detection capture more debug information.
-==============================================================
-
-hardlockup_panic:
-
-This parameter can be used to control whether the kernel panics
-when a hard lockup is detected.
-
- 0 - don't panic on hard lockup
- 1 - panic on hard lockup
-
-See Documentation/lockup-watchdogs.txt for more information. This can
-also be set using the nmi_watchdog kernel parameter.
-
-==============================================================
-
-hotplug:
-
-Path for the hotplug policy agent.
-Default value is "/sbin/hotplug".
-
-==============================================================
-
-hung_task_panic:
-
-Controls the kernel's behavior when a hung task is detected.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-0: continue operation. This is the default behavior.
-
-1: panic immediately.
-
-==============================================================
-
-hung_task_check_count:
-
-The upper bound on the number of tasks that are checked.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-==============================================================
-
-hung_task_timeout_secs:
-
-When a task in D state did not get scheduled
-for more than this value report a warning.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-0: means infinite timeout - no checking done.
-Possible values to set are in range {0..LONG_MAX/HZ}.
-
-==============================================================
-
-hung_task_check_interval_secs:
-
-Hung task check interval. If hung task checking is enabled
-(see hung_task_timeout_secs), the check is done every
-hung_task_check_interval_secs seconds.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
-0 (default): means use hung_task_timeout_secs as checking interval.
-Possible values to set are in range {0..LONG_MAX/HZ}.
-
-==============================================================
-
-hung_task_warnings:
-
-The maximum number of warnings to report. During a check interval
-if a hung task is detected, this value is decreased by 1.
-When this value reaches 0, no more warnings will be reported.
-This file shows up if CONFIG_DETECT_HUNG_TASK is enabled.
-
--1: report an infinite number of warnings.
-
-==============================================================
-
-hyperv_record_panic_msg:
-
-Controls whether the panic kmsg data should be reported to Hyper-V.
-
-0: do not report panic kmsg data.
-
-1: report the panic kmsg data. This is the default behavior.
-
-==============================================================
-
-kexec_load_disabled:
-
-A toggle indicating if the kexec_load syscall has been disabled. This
-value defaults to 0 (false: kexec_load enabled), but can be set to 1
-(true: kexec_load disabled). Once true, kexec can no longer be used, and
-the toggle cannot be set back to false. This allows a kexec image to be
-loaded before disabling the syscall, allowing a system to set up (and
-later use) an image without it being altered. Generally used together
-with the "modules_disabled" sysctl.
-
-==============================================================
-
-kptr_restrict:
-
-This toggle indicates whether restrictions are placed on
-exposing kernel addresses via /proc and other interfaces.
-
-When kptr_restrict is set to 0 (the default) the address is hashed before
-printing. (This is the equivalent to %p.)
-
-When kptr_restrict is set to (1), kernel pointers printed using the %pK
-format specifier will be replaced with 0's unless the user has CAP_SYSLOG
-and effective user and group ids are equal to the real ids. This is
-because %pK checks are done at read() time rather than open() time, so
-if permissions are elevated between the open() and the read() (e.g via
-a setuid binary) then %pK will not leak kernel pointers to unprivileged
-users. Note, this is a temporary solution only. The correct long-term
-solution is to do the permission checks at open() time. Consider removing
-world read permissions from files that use %pK, and using dmesg_restrict
-to protect against uses of %pK in dmesg(8) if leaking kernel pointer
-values to unprivileged users is a concern.
-
-When kptr_restrict is set to (2), kernel pointers printed using
-%pK will be replaced with 0's regardless of privileges.
-
-==============================================================
-
-l2cr: (PPC only)
-
-This flag controls the L2 cache of G3 processor boards. If
-0, the cache is disabled. Enabled if nonzero.
-
-==============================================================
-
-modules_disabled:
-
-A toggle value indicating if modules are allowed to be loaded
-in an otherwise modular kernel. This toggle defaults to off
-(0), but can be set true (1). Once true, modules can be
-neither loaded nor unloaded, and the toggle cannot be set back
-to false. Generally used with the "kexec_load_disabled" toggle.
-
-==============================================================
-
-msg_next_id, sem_next_id, and shm_next_id:
-
-These three toggles allows to specify desired id for next allocated IPC
-object: message, semaphore or shared memory respectively.
-
-By default they are equal to -1, which means generic allocation logic.
-Possible values to set are in range {0..INT_MAX}.
-
-Notes:
-1) kernel doesn't guarantee, that new object will have desired id. So,
-it's up to userspace, how to handle an object with "wrong" id.
-2) Toggle with non-default value will be set back to -1 by kernel after
-successful IPC object allocation. If an IPC object allocation syscall
-fails, it is undefined if the value remains unmodified or is reset to -1.
-
-==============================================================
-
-nmi_watchdog:
-
-This parameter can be used to control the NMI watchdog
-(i.e. the hard lockup detector) on x86 systems.
-
- 0 - disable the hard lockup detector
- 1 - enable the hard lockup detector
-
-The hard lockup detector monitors each CPU for its ability to respond to
-timer interrupts. The mechanism utilizes CPU performance counter registers
-that are programmed to generate Non-Maskable Interrupts (NMIs) periodically
-while a CPU is busy. Hence, the alternative name 'NMI watchdog'.
-
-The NMI watchdog is disabled by default if the kernel is running as a guest
-in a KVM virtual machine. This default can be overridden by adding
-
- nmi_watchdog=1
-
-to the guest kernel command line (see Documentation/admin-guide/kernel-parameters.rst).
-
-==============================================================
-
-numa_balancing
-
-Enables/disables automatic page fault based NUMA memory
-balancing. Memory is moved automatically to nodes
-that access it often.
-
-Enables/disables automatic NUMA memory balancing. On NUMA machines, there
-is a performance penalty if remote memory is accessed by a CPU. When this
-feature is enabled the kernel samples what task thread is accessing memory
-by periodically unmapping pages and later trapping a page fault. At the
-time of the page fault, it is determined if the data being accessed should
-be migrated to a local memory node.
-
-The unmapping of pages and trapping faults incur additional overhead that
-ideally is offset by improved memory locality but there is no universal
-guarantee. If the target workload is already bound to NUMA nodes then this
-feature should be disabled. Otherwise, if the system overhead from the
-feature is too high then the rate the kernel samples for NUMA hinting
-faults may be controlled by the numa_balancing_scan_period_min_ms,
-numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
-
-==============================================================
-
-numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms,
-numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb
-
-Automatic NUMA balancing scans tasks address space and unmaps pages to
-detect if pages are properly placed or if the data should be migrated to a
-memory node local to where the task is running. Every "scan delay" the task
-scans the next "scan size" number of pages in its address space. When the
-end of the address space is reached the scanner restarts from the beginning.
-
-In combination, the "scan delay" and "scan size" determine the scan rate.
-When "scan delay" decreases, the scan rate increases. The scan delay and
-hence the scan rate of every task is adaptive and depends on historical
-behaviour. If pages are properly placed then the scan delay increases,
-otherwise the scan delay decreases. The "scan size" is not adaptive but
-the higher the "scan size", the higher the scan rate.
-
-Higher scan rates incur higher system overhead as page faults must be
-trapped and potentially data must be migrated. However, the higher the scan
-rate, the more quickly a tasks memory is migrated to a local node if the
-workload pattern changes and minimises performance impact due to remote
-memory accesses. These sysctls control the thresholds for scan delays and
-the number of pages scanned.
-
-numa_balancing_scan_period_min_ms is the minimum time in milliseconds to
-scan a tasks virtual memory. It effectively controls the maximum scanning
-rate for each task.
-
-numa_balancing_scan_delay_ms is the starting "scan delay" used for a task
-when it initially forks.
-
-numa_balancing_scan_period_max_ms is the maximum time in milliseconds to
-scan a tasks virtual memory. It effectively controls the minimum scanning
-rate for each task.
-
-numa_balancing_scan_size_mb is how many megabytes worth of pages are
-scanned for a given scan.
-
-==============================================================
-
-osrelease, ostype & version:
-
-# cat osrelease
-2.1.88
-# cat ostype
-Linux
-# cat version
-#5 Wed Feb 25 21:49:24 MET 1998
-
-The files osrelease and ostype should be clear enough. Version
-needs a little more clarification however. The '#5' means that
-this is the fifth kernel built from this source base and the
-date behind it indicates the time the kernel was built.
-The only way to tune these values is to rebuild the kernel :-)
-
-==============================================================
-
-overflowgid & overflowuid:
-
-if your architecture did not always support 32-bit UIDs (i.e. arm,
-i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to
-applications that use the old 16-bit UID/GID system calls, if the
-actual UID or GID would exceed 65535.
-
-These sysctls allow you to change the value of the fixed UID and GID.
-The default is 65534.
-
-==============================================================
-
-panic:
-
-The value in this file represents the number of seconds the kernel
-waits before rebooting on a panic. When you use the software watchdog,
-the recommended setting is 60.
-
-==============================================================
-
-panic_on_io_nmi:
-
-Controls the kernel's behavior when a CPU receives an NMI caused by
-an IO error.
-
-0: try to continue operation (default)
-
-1: panic immediately. The IO error triggered an NMI. This indicates a
- serious system condition which could result in IO data corruption.
- Rather than continuing, panicking might be a better choice. Some
- servers issue this sort of NMI when the dump button is pushed,
- and you can use this option to take a crash dump.
-
-==============================================================
-
-panic_on_oops:
-
-Controls the kernel's behaviour when an oops or BUG is encountered.
-
-0: try to continue operation
-
-1: panic immediately. If the `panic' sysctl is also non-zero then the
- machine will be rebooted.
-
-==============================================================
-
-panic_on_stackoverflow:
-
-Controls the kernel's behavior when detecting the overflows of
-kernel, IRQ and exception stacks except a user stack.
-This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
-
-0: try to continue operation.
-
-1: panic immediately.
-
-==============================================================
-
-panic_on_unrecovered_nmi:
-
-The default Linux behaviour on an NMI of either memory or unknown is
-to continue operation. For many environments such as scientific
-computing it is preferable that the box is taken out and the error
-dealt with than an uncorrected parity/ECC error get propagated.
-
-A small number of systems do generate NMI's for bizarre random reasons
-such as power management so the default is off. That sysctl works like
-the existing panic controls already in that directory.
-
-==============================================================
-
-panic_on_warn:
-
-Calls panic() in the WARN() path when set to 1. This is useful to avoid
-a kernel rebuild when attempting to kdump at the location of a WARN().
-
-0: only WARN(), default behaviour.
-
-1: call panic() after printing out WARN() location.
-
-==============================================================
-
-panic_print:
-
-Bitmask for printing system info when panic happens. User can chose
-combination of the following bits:
-
-bit 0: print all tasks info
-bit 1: print system memory info
-bit 2: print timer info
-bit 3: print locks info if CONFIG_LOCKDEP is on
-bit 4: print ftrace buffer
-
-So for example to print tasks and memory info on panic, user can:
- echo 3 > /proc/sys/kernel/panic_print
-
-==============================================================
-
-panic_on_rcu_stall:
-
-When set to 1, calls panic() after RCU stall detection messages. This
-is useful to define the root cause of RCU stalls using a vmcore.
-
-0: do not panic() when RCU stall takes place, default behavior.
-
-1: panic() after printing RCU stall messages.
-
-==============================================================
-
-perf_cpu_time_max_percent:
-
-Hints to the kernel how much CPU time it should be allowed to
-use to handle perf sampling events. If the perf subsystem
-is informed that its samples are exceeding this limit, it
-will drop its sampling frequency to attempt to reduce its CPU
-usage.
-
-Some perf sampling happens in NMIs. If these samples
-unexpectedly take too long to execute, the NMIs can become
-stacked up next to each other so much that nothing else is
-allowed to execute.
-
-0: disable the mechanism. Do not monitor or correct perf's
- sampling rate no matter how CPU time it takes.
-
-1-100: attempt to throttle perf's sample rate to this
- percentage of CPU. Note: the kernel calculates an
- "expected" length of each sample event. 100 here means
- 100% of that expected length. Even if this is set to
- 100, you may still see sample throttling if this
- length is exceeded. Set to 0 if you truly do not care
- how much CPU is consumed.
-
-==============================================================
-
-perf_event_paranoid:
-
-Controls use of the performance events system by unprivileged
-users (without CAP_SYS_ADMIN). The default value is 2.
-
- -1: Allow use of (almost) all events by all users
- Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
->=0: Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN
- Disallow raw tracepoint access by users without CAP_SYS_ADMIN
->=1: Disallow CPU event access by users without CAP_SYS_ADMIN
->=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
-
-==============================================================
-
-perf_event_max_stack:
-
-Controls maximum number of stack frames to copy for (attr.sample_type &
-PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
-'perf record -g' or 'perf trace --call-graph fp'.
-
-This can only be done when no events are in use that have callchains
-enabled, otherwise writing to this file will return -EBUSY.
-
-The default value is 127.
-
-==============================================================
-
-perf_event_mlock_kb:
-
-Control size of per-cpu ring buffer not counted agains mlock limit.
-
-The default value is 512 + 1 page
-
-==============================================================
-
-perf_event_max_contexts_per_stack:
-
-Controls maximum number of stack frame context entries for
-(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
-instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
-
-This can only be done when no events are in use that have callchains
-enabled, otherwise writing to this file will return -EBUSY.
-
-The default value is 8.
-
-==============================================================
-
-pid_max:
-
-PID allocation wrap value. When the kernel's next PID value
-reaches this value, it wraps back to a minimum PID value.
-PIDs of value pid_max or larger are not allocated.
-
-==============================================================
-
-ns_last_pid:
-
-The last pid allocated in the current (the one task using this sysctl
-lives in) pid namespace. When selecting a pid for a next task on fork
-kernel tries to allocate a number starting from this one.
-
-==============================================================
-
-powersave-nap: (PPC only)
-
-If set, Linux-PPC will use the 'nap' mode of powersaving,
-otherwise the 'doze' mode will be used.
-
-==============================================================
-
-printk:
-
-The four values in printk denote: console_loglevel,
-default_message_loglevel, minimum_console_loglevel and
-default_console_loglevel respectively.
-
-These values influence printk() behavior when printing or
-logging error messages. See 'man 2 syslog' for more info on
-the different loglevels.
-
-- console_loglevel: messages with a higher priority than
- this will be printed to the console
-- default_message_loglevel: messages without an explicit priority
- will be printed with this priority
-- minimum_console_loglevel: minimum (highest) value to which
- console_loglevel can be set
-- default_console_loglevel: default value for console_loglevel
-
-==============================================================
-
-printk_delay:
-
-Delay each printk message in printk_delay milliseconds
-
-Value from 0 - 10000 is allowed.
-
-==============================================================
-
-printk_ratelimit:
-
-Some warning messages are rate limited. printk_ratelimit specifies
-the minimum length of time between these messages (in jiffies), by
-default we allow one every 5 seconds.
-
-A value of 0 will disable rate limiting.
-
-==============================================================
-
-printk_ratelimit_burst:
-
-While long term we enforce one message per printk_ratelimit
-seconds, we do allow a burst of messages to pass through.
-printk_ratelimit_burst specifies the number of messages we can
-send before ratelimiting kicks in.
-
-==============================================================
-
-printk_devkmsg:
-
-Control the logging to /dev/kmsg from userspace:
-
-ratelimit: default, ratelimited
-on: unlimited logging to /dev/kmsg from userspace
-off: logging to /dev/kmsg disabled
-
-The kernel command line parameter printk.devkmsg= overrides this and is
-a one-time setting until next reboot: once set, it cannot be changed by
-this sysctl interface anymore.
-
-==============================================================
-
-randomize_va_space:
-
-This option can be used to select the type of process address
-space randomization that is used in the system, for architectures
-that support this feature.
-
-0 - Turn the process address space randomization off. This is the
- default for architectures that do not support this feature anyways,
- and kernels that are booted with the "norandmaps" parameter.
-
-1 - Make the addresses of mmap base, stack and VDSO page randomized.
- This, among other things, implies that shared libraries will be
- loaded to random addresses. Also for PIE-linked binaries, the
- location of code start is randomized. This is the default if the
- CONFIG_COMPAT_BRK option is enabled.
-
-2 - Additionally enable heap randomization. This is the default if
- CONFIG_COMPAT_BRK is disabled.
-
- There are a few legacy applications out there (such as some ancient
- versions of libc.so.5 from 1996) that assume that brk area starts
- just after the end of the code+bss. These applications break when
- start of the brk area is randomized. There are however no known
- non-legacy applications that would be broken this way, so for most
- systems it is safe to choose full randomization.
-
- Systems with ancient and/or broken binaries should be configured
- with CONFIG_COMPAT_BRK enabled, which excludes the heap from process
- address space randomization.
-
-==============================================================
-
-reboot-cmd: (Sparc only)
-
-??? This seems to be a way to give an argument to the Sparc
-ROM/Flash boot loader. Maybe to tell it what to do after
-rebooting. ???
-
-==============================================================
-
-rtsig-max & rtsig-nr:
-
-The file rtsig-max can be used to tune the maximum number
-of POSIX realtime (queued) signals that can be outstanding
-in the system.
-
-rtsig-nr shows the number of RT signals currently queued.
-
-==============================================================
-
-sched_energy_aware:
-
-Enables/disables Energy Aware Scheduling (EAS). EAS starts
-automatically on platforms where it can run (that is,
-platforms with asymmetric CPU topologies and having an Energy
-Model available). If your platform happens to meet the
-requirements for EAS but you do not want to use it, change
-this value to 0.
-
-==============================================================
-
-sched_schedstats:
-
-Enables/disables scheduler statistics. Enabling this feature
-incurs a small amount of overhead in the scheduler but is
-useful for debugging and performance tuning.
-
-==============================================================
-
-sg-big-buff:
-
-This file shows the size of the generic SCSI (sg) buffer.
-You can't tune it just yet, but you could change it on
-compile time by editing include/scsi/sg.h and changing
-the value of SG_BIG_BUFF.
-
-There shouldn't be any reason to change this value. If
-you can come up with one, you probably know what you
-are doing anyway :)
-
-==============================================================
-
-shmall:
-
-This parameter sets the total amount of shared memory pages that
-can be used system wide. Hence, SHMALL should always be at least
-ceil(shmmax/PAGE_SIZE).
-
-If you are not sure what the default PAGE_SIZE is on your Linux
-system, you can run the following command:
-
-# getconf PAGE_SIZE
-
-==============================================================
-
-shmmax:
-
-This value can be used to query and set the run time limit
-on the maximum shared memory segment size that can be created.
-Shared memory segments up to 1Gb are now supported in the
-kernel. This value defaults to SHMMAX.
-
-==============================================================
-
-shm_rmid_forced:
-
-Linux lets you set resource limits, including how much memory one
-process can consume, via setrlimit(2). Unfortunately, shared memory
-segments are allowed to exist without association with any process, and
-thus might not be counted against any resource limits. If enabled,
-shared memory segments are automatically destroyed when their attach
-count becomes zero after a detach or a process termination. It will
-also destroy segments that were created, but never attached to, on exit
-from the process. The only use left for IPC_RMID is to immediately
-destroy an unattached segment. Of course, this breaks the way things are
-defined, so some applications might stop working. Note that this
-feature will do you no good unless you also configure your resource
-limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't
-need this.
-
-Note that if you change this from 0 to 1, already created segments
-without users and with a dead originative process will be destroyed.
-
-==============================================================
-
-sysctl_writes_strict:
-
-Control how file position affects the behavior of updating sysctl values
-via the /proc/sys interface:
-
- -1 - Legacy per-write sysctl value handling, with no printk warnings.
- Each write syscall must fully contain the sysctl value to be
- written, and multiple writes on the same sysctl file descriptor
- will rewrite the sysctl value, regardless of file position.
- 0 - Same behavior as above, but warn about processes that perform writes
- to a sysctl file descriptor when the file position is not 0.
- 1 - (default) Respect file position when writing sysctl strings. Multiple
- writes will append to the sysctl value buffer. Anything past the max
- length of the sysctl value buffer will be ignored. Writes to numeric
- sysctl entries must always be at file position 0 and the value must
- be fully contained in the buffer sent in the write syscall.
-
-==============================================================
-
-softlockup_all_cpu_backtrace:
-
-This value controls the soft lockup detector thread's behavior
-when a soft lockup condition is detected as to whether or not
-to gather further debug information. If enabled, each cpu will
-be issued an NMI and instructed to capture stack trace.
-
-This feature is only applicable for architectures which support
-NMI.
-
-0: do nothing. This is the default behavior.
-
-1: on detection capture more debug information.
-
-==============================================================
-
-soft_watchdog
-
-This parameter can be used to control the soft lockup detector.
-
- 0 - disable the soft lockup detector
- 1 - enable the soft lockup detector
-
-The soft lockup detector monitors CPUs for threads that are hogging the CPUs
-without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads
-from running. The mechanism depends on the CPUs ability to respond to timer
-interrupts which are needed for the 'watchdog/N' threads to be woken up by
-the watchdog timer function, otherwise the NMI watchdog - if enabled - can
-detect a hard lockup condition.
-
-==============================================================
-
-stack_erasing
-
-This parameter can be used to control kernel stack erasing at the end
-of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK.
-
-That erasing reduces the information which kernel stack leak bugs
-can reveal and blocks some uninitialized stack variable attacks.
-The tradeoff is the performance impact: on a single CPU system kernel
-compilation sees a 1% slowdown, other systems and workloads may vary.
-
- 0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated.
-
- 1: kernel stack erasing is enabled (default), it is performed before
- returning to the userspace at the end of syscalls.
-==============================================================
-
-tainted
-
-Non-zero if the kernel has been tainted. Numeric values, which can be
-ORed together. The letters are seen in "Tainted" line of Oops reports.
-
- 1 (P): proprietary module was loaded
- 2 (F): module was force loaded
- 4 (S): SMP kernel oops on an officially SMP incapable processor
- 8 (R): module was force unloaded
- 16 (M): processor reported a Machine Check Exception (MCE)
- 32 (B): bad page referenced or some unexpected page flags
- 64 (U): taint requested by userspace application
- 128 (D): kernel died recently, i.e. there was an OOPS or BUG
- 256 (A): an ACPI table was overridden by user
- 512 (W): kernel issued warning
- 1024 (C): staging driver was loaded
- 2048 (I): workaround for bug in platform firmware applied
- 4096 (O): externally-built ("out-of-tree") module was loaded
- 8192 (E): unsigned module was loaded
- 16384 (L): soft lockup occurred
- 32768 (K): kernel has been live patched
- 65536 (X): Auxiliary taint, defined and used by for distros
-131072 (T): The kernel was built with the struct randomization plugin
-
-See Documentation/admin-guide/tainted-kernels.rst for more information.
-
-==============================================================
-
-threads-max
-
-This value controls the maximum number of threads that can be created
-using fork().
-
-During initialization the kernel sets this value such that even if the
-maximum number of threads is created, the thread structures occupy only
-a part (1/8th) of the available RAM pages.
-
-The minimum value that can be written to threads-max is 20.
-The maximum value that can be written to threads-max is given by the
-constant FUTEX_TID_MASK (0x3fffffff).
-If a value outside of this range is written to threads-max an error
-EINVAL occurs.
-
-The value written is checked against the available RAM pages. If the
-thread structures would occupy too much (more than 1/8th) of the
-available RAM pages threads-max is reduced accordingly.
-
-==============================================================
-
-unknown_nmi_panic:
-
-The value in this file affects behavior of handling NMI. When the
-value is non-zero, unknown NMI is trapped and then panic occurs. At
-that time, kernel debugging information is displayed on console.
-
-NMI switch that most IA32 servers have fires unknown NMI up, for
-example. If a system hangs up, try pressing the NMI switch.
-
-==============================================================
-
-watchdog:
-
-This parameter can be used to disable or enable the soft lockup detector
-_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time.
-
- 0 - disable both lockup detectors
- 1 - enable both lockup detectors
-
-The soft lockup detector and the NMI watchdog can also be disabled or
-enabled individually, using the soft_watchdog and nmi_watchdog parameters.
-If the watchdog parameter is read, for example by executing
-
- cat /proc/sys/kernel/watchdog
-
-the output of this command (0 or 1) shows the logical OR of soft_watchdog
-and nmi_watchdog.
-
-==============================================================
-
-watchdog_cpumask:
-
-This value can be used to control on which cpus the watchdog may run.
-The default cpumask is all possible cores, but if NO_HZ_FULL is
-enabled in the kernel config, and cores are specified with the
-nohz_full= boot argument, those cores are excluded by default.
-Offline cores can be included in this mask, and if the core is later
-brought online, the watchdog will be started based on the mask value.
-
-Typically this value would only be touched in the nohz_full case
-to re-enable cores that by default were not running the watchdog,
-if a kernel lockup was suspected on those cores.
-
-The argument value is the standard cpulist format for cpumasks,
-so for example to enable the watchdog on cores 0, 2, 3, and 4 you
-might say:
-
- echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask
-
-==============================================================
-
-watchdog_thresh:
-
-This value can be used to control the frequency of hrtimer and NMI
-events and the soft and hard lockup thresholds. The default threshold
-is 10 seconds.
-
-The softlockup threshold is (2 * watchdog_thresh). Setting this
-tunable to zero will disable lockup detection altogether.
-
-==============================================================
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
deleted file mode 100644
index 2ae91d3873bb..000000000000
--- a/Documentation/sysctl/net.txt
+++ /dev/null
@@ -1,422 +0,0 @@
-Documentation for /proc/sys/net/*
- (c) 1999 Terrehon Bowden <terrehon@pacbell.net>
- Bodo Bauer <bb@ricochet.net>
- (c) 2000 Jorge Nerin <comandante@zaralinux.com>
- (c) 2009 Shen Feng <shen@cn.fujitsu.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/net
-
-The interface to the networking parts of the kernel is located in
-/proc/sys/net. The following table shows all possible subdirectories. You may
-see only some of them, depending on your kernel's configuration.
-
-
-Table : Subdirectories in /proc/sys/net
-..............................................................................
- Directory Content Directory Content
- core General parameter appletalk Appletalk protocol
- unix Unix domain sockets netrom NET/ROM
- 802 E802 protocol ax25 AX25
- ethernet Ethernet protocol rose X.25 PLP layer
- ipv4 IP version 4 x25 X.25 protocol
- ipx IPX token-ring IBM token ring
- bridge Bridging decnet DEC net
- ipv6 IP version 6 tipc TIPC
-..............................................................................
-
-1. /proc/sys/net/core - Network core options
--------------------------------------------------------
-
-bpf_jit_enable
---------------
-
-This enables the BPF Just in Time (JIT) compiler. BPF is a flexible
-and efficient infrastructure allowing to execute bytecode at various
-hook points. It is used in a number of Linux kernel subsystems such
-as networking (e.g. XDP, tc), tracing (e.g. kprobes, uprobes, tracepoints)
-and security (e.g. seccomp). LLVM has a BPF back end that can compile
-restricted C into a sequence of BPF instructions. After program load
-through bpf(2) and passing a verifier in the kernel, a JIT will then
-translate these BPF proglets into native CPU instructions. There are
-two flavors of JITs, the newer eBPF JIT currently supported on:
- - x86_64
- - x86_32
- - arm64
- - arm32
- - ppc64
- - sparc64
- - mips64
- - s390x
- - riscv
-
-And the older cBPF JIT supported on the following archs:
- - mips
- - ppc
- - sparc
-
-eBPF JITs are a superset of cBPF JITs, meaning the kernel will
-migrate cBPF instructions into eBPF instructions and then JIT
-compile them transparently. Older cBPF JITs can only translate
-tcpdump filters, seccomp rules, etc, but not mentioned eBPF
-programs loaded through bpf(2).
-
-Values :
- 0 - disable the JIT (default value)
- 1 - enable the JIT
- 2 - enable the JIT and ask the compiler to emit traces on kernel log.
-
-bpf_jit_harden
---------------
-
-This enables hardening for the BPF JIT compiler. Supported are eBPF
-JIT backends. Enabling hardening trades off performance, but can
-mitigate JIT spraying.
-Values :
- 0 - disable JIT hardening (default value)
- 1 - enable JIT hardening for unprivileged users only
- 2 - enable JIT hardening for all users
-
-bpf_jit_kallsyms
-----------------
-
-When BPF JIT compiler is enabled, then compiled images are unknown
-addresses to the kernel, meaning they neither show up in traces nor
-in /proc/kallsyms. This enables export of these addresses, which can
-be used for debugging/tracing. If bpf_jit_harden is enabled, this
-feature is disabled.
-Values :
- 0 - disable JIT kallsyms export (default value)
- 1 - enable JIT kallsyms export for privileged users only
-
-bpf_jit_limit
--------------
-
-This enforces a global limit for memory allocations to the BPF JIT
-compiler in order to reject unprivileged JIT requests once it has
-been surpassed. bpf_jit_limit contains the value of the global limit
-in bytes.
-
-dev_weight
---------------
-
-The maximum number of packets that kernel can handle on a NAPI interrupt,
-it's a Per-CPU variable. For drivers that support LRO or GRO_HW, a hardware
-aggregated packet is counted as one packet in this context.
-
-Default: 64
-
-dev_weight_rx_bias
---------------
-
-RPS (e.g. RFS, aRFS) processing is competing with the registered NAPI poll function
-of the driver for the per softirq cycle netdev_budget. This parameter influences
-the proportion of the configured netdev_budget that is spent on RPS based packet
-processing during RX softirq cycles. It is further meant for making current
-dev_weight adaptable for asymmetric CPU needs on RX/TX side of the network stack.
-(see dev_weight_tx_bias) It is effective on a per CPU basis. Determination is based
-on dev_weight and is calculated multiplicative (dev_weight * dev_weight_rx_bias).
-Default: 1
-
-dev_weight_tx_bias
---------------
-
-Scales the maximum number of packets that can be processed during a TX softirq cycle.
-Effective on a per CPU basis. Allows scaling of current dev_weight for asymmetric
-net stack processing needs. Be careful to avoid making TX softirq processing a CPU hog.
-Calculation is based on dev_weight (dev_weight * dev_weight_tx_bias).
-Default: 1
-
-default_qdisc
---------------
-
-The default queuing discipline to use for network devices. This allows
-overriding the default of pfifo_fast with an alternative. Since the default
-queuing discipline is created without additional parameters so is best suited
-to queuing disciplines that work well without configuration like stochastic
-fair queue (sfq), CoDel (codel) or fair queue CoDel (fq_codel). Don't use
-queuing disciplines like Hierarchical Token Bucket or Deficit Round Robin
-which require setting up classes and bandwidths. Note that physical multiqueue
-interfaces still use mq as root qdisc, which in turn uses this default for its
-leaves. Virtual devices (like e.g. lo or veth) ignore this setting and instead
-default to noqueue.
-Default: pfifo_fast
-
-busy_read
-----------------
-Low latency busy poll timeout for socket reads. (needs CONFIG_NET_RX_BUSY_POLL)
-Approximate time in us to busy loop waiting for packets on the device queue.
-This sets the default value of the SO_BUSY_POLL socket option.
-Can be set or overridden per socket by setting socket option SO_BUSY_POLL,
-which is the preferred method of enabling. If you need to enable the feature
-globally via sysctl, a value of 50 is recommended.
-Will increase power usage.
-Default: 0 (off)
-
-busy_poll
-----------------
-Low latency busy poll timeout for poll and select. (needs CONFIG_NET_RX_BUSY_POLL)
-Approximate time in us to busy loop waiting for events.
-Recommended value depends on the number of sockets you poll on.
-For several sockets 50, for several hundreds 100.
-For more than that you probably want to use epoll.
-Note that only sockets with SO_BUSY_POLL set will be busy polled,
-so you want to either selectively set SO_BUSY_POLL on those sockets or set
-sysctl.net.busy_read globally.
-Will increase power usage.
-Default: 0 (off)
-
-rmem_default
-------------
-
-The default setting of the socket receive buffer in bytes.
-
-rmem_max
---------
-
-The maximum receive socket buffer size in bytes.
-
-tstamp_allow_data
------------------
-Allow processes to receive tx timestamps looped together with the original
-packet contents. If disabled, transmit timestamp requests from unprivileged
-processes are dropped unless socket option SOF_TIMESTAMPING_OPT_TSONLY is set.
-Default: 1 (on)
-
-
-wmem_default
-------------
-
-The default setting (in bytes) of the socket send buffer.
-
-wmem_max
---------
-
-The maximum send socket buffer size in bytes.
-
-message_burst and message_cost
-------------------------------
-
-These parameters are used to limit the warning messages written to the kernel
-log from the networking code. They enforce a rate limit to make a
-denial-of-service attack impossible. A higher message_cost factor, results in
-fewer messages that will be written. Message_burst controls when messages will
-be dropped. The default settings limit warning messages to one every five
-seconds.
-
-warnings
---------
-
-This sysctl is now unused.
-
-This was used to control console messages from the networking stack that
-occur because of problems on the network like duplicate address or bad
-checksums.
-
-These messages are now emitted at KERN_DEBUG and can generally be enabled
-and controlled by the dynamic_debug facility.
-
-netdev_budget
--------------
-
-Maximum number of packets taken from all interfaces in one polling cycle (NAPI
-poll). In one polling cycle interfaces which are registered to polling are
-probed in a round-robin manner. Also, a polling cycle may not exceed
-netdev_budget_usecs microseconds, even if netdev_budget has not been
-exhausted.
-
-netdev_budget_usecs
----------------------
-
-Maximum number of microseconds in one NAPI polling cycle. Polling
-will exit when either netdev_budget_usecs have elapsed during the
-poll cycle or the number of packets processed reaches netdev_budget.
-
-netdev_max_backlog
-------------------
-
-Maximum number of packets, queued on the INPUT side, when the interface
-receives packets faster than kernel can process them.
-
-netdev_rss_key
---------------
-
-RSS (Receive Side Scaling) enabled drivers use a 40 bytes host key that is
-randomly generated.
-Some user space might need to gather its content even if drivers do not
-provide ethtool -x support yet.
-
-myhost:~# cat /proc/sys/net/core/netdev_rss_key
-84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8: ... (52 bytes total)
-
-File contains nul bytes if no driver ever called netdev_rss_key_fill() function.
-Note:
-/proc/sys/net/core/netdev_rss_key contains 52 bytes of key,
-but most drivers only use 40 bytes of it.
-
-myhost:~# ethtool -x eth0
-RX flow hash indirection table for eth0 with 8 RX ring(s):
- 0: 0 1 2 3 4 5 6 7
-RSS hash key:
-84:50:f4:00:a8:15:d1:a7:e9:7f:1d:60:35:c7:47:25:42:97:74:ca:56:bb:b6:a1:d8:43:e3:c9:0c:fd:17:55:c2:3a:4d:69:ed:f1:42:89
-
-netdev_tstamp_prequeue
-----------------------
-
-If set to 0, RX packet timestamps can be sampled after RPS processing, when
-the target CPU processes packets. It might give some delay on timestamps, but
-permit to distribute the load on several cpus.
-
-If set to 1 (default), timestamps are sampled as soon as possible, before
-queueing.
-
-optmem_max
-----------
-
-Maximum ancillary buffer size allowed per socket. Ancillary data is a sequence
-of struct cmsghdr structures with appended data.
-
-fb_tunnels_only_for_init_net
-----------------------------
-
-Controls if fallback tunnels (like tunl0, gre0, gretap0, erspan0,
-sit0, ip6tnl0, ip6gre0) are automatically created when a new
-network namespace is created, if corresponding tunnel is present
-in initial network namespace.
-If set to 1, these devices are not automatically created, and
-user space is responsible for creating them if needed.
-
-Default : 0 (for compatibility reasons)
-
-devconf_inherit_init_net
-----------------------------
-
-Controls if a new network namespace should inherit all current
-settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By
-default, we keep the current behavior: for IPv4 we inherit all current
-settings from init_net and for IPv6 we reset all settings to default.
-
-If set to 1, both IPv4 and IPv6 settings are forced to inherit from
-current ones in init_net. If set to 2, both IPv4 and IPv6 settings are
-forced to reset to their default values.
-
-Default : 0 (for compatibility reasons)
-
-2. /proc/sys/net/unix - Parameters for Unix domain sockets
--------------------------------------------------------
-
-There is only one file in this directory.
-unix_dgram_qlen limits the max number of datagrams queued in Unix domain
-socket's buffer. It will not take effect unless PF_UNIX flag is specified.
-
-
-3. /proc/sys/net/ipv4 - IPV4 settings
--------------------------------------------------------
-Please see: Documentation/networking/ip-sysctl.txt and ipvs-sysctl.txt for
-descriptions of these entries.
-
-
-4. Appletalk
--------------------------------------------------------
-
-The /proc/sys/net/appletalk directory holds the Appletalk configuration data
-when Appletalk is loaded. The configurable parameters are:
-
-aarp-expiry-time
-----------------
-
-The amount of time we keep an ARP entry before expiring it. Used to age out
-old hosts.
-
-aarp-resolve-time
------------------
-
-The amount of time we will spend trying to resolve an Appletalk address.
-
-aarp-retransmit-limit
----------------------
-
-The number of times we will retransmit a query before giving up.
-
-aarp-tick-time
---------------
-
-Controls the rate at which expires are checked.
-
-The directory /proc/net/appletalk holds the list of active Appletalk sockets
-on a machine.
-
-The fields indicate the DDP type, the local address (in network:node format)
-the remote address, the size of the transmit pending queue, the size of the
-received queue (bytes waiting for applications to read) the state and the uid
-owning the socket.
-
-/proc/net/atalk_iface lists all the interfaces configured for appletalk.It
-shows the name of the interface, its Appletalk address, the network range on
-that address (or network number for phase 1 networks), and the status of the
-interface.
-
-/proc/net/atalk_route lists each known network route. It lists the target
-(network) that the route leads to, the router (may be directly connected), the
-route flags, and the device the route is using.
-
-
-5. IPX
--------------------------------------------------------
-
-The IPX protocol has no tunable values in proc/sys/net.
-
-The IPX protocol does, however, provide proc/net/ipx. This lists each IPX
-socket giving the local and remote addresses in Novell format (that is
-network:node:port). In accordance with the strange Novell tradition,
-everything but the port is in hex. Not_Connected is displayed for sockets that
-are not tied to a specific remote address. The Tx and Rx queue sizes indicate
-the number of bytes pending for transmission and reception. The state
-indicates the state the socket is in and the uid is the owning uid of the
-socket.
-
-The /proc/net/ipx_interface file lists all IPX interfaces. For each interface
-it gives the network number, the node number, and indicates if the network is
-the primary network. It also indicates which device it is bound to (or
-Internal for internal networks) and the Frame Type if appropriate. Linux
-supports 802.3, 802.2, 802.2 SNAP and DIX (Blue Book) ethernet framing for
-IPX.
-
-The /proc/net/ipx_route table holds a list of IPX routes. For each route it
-gives the destination network, the router node (or Directly) and the network
-address of the router (or Connected) for internal networks.
-
-6. TIPC
--------------------------------------------------------
-
-tipc_rmem
-----------
-
-The TIPC protocol now has a tunable for the receive memory, similar to the
-tcp_rmem - i.e. a vector of 3 INTEGERs: (min, default, max)
-
- # cat /proc/sys/net/tipc/tipc_rmem
- 4252725 34021800 68043600
- #
-
-The max value is set to CONN_OVERLOAD_LIMIT, and the default and min values
-are scaled (shifted) versions of that same value. Note that the min value
-is not at this point in time used in any meaningful way, but the triplet is
-preserved in order to be consistent with things like tcp_rmem.
-
-named_timeout
---------------
-
-TIPC name table updates are distributed asynchronously in a cluster, without
-any form of transaction handling. This means that different race scenarios are
-possible. One such is that a name withdrawal sent out by one node and received
-by another node may arrive after a second, overlapping name publication already
-has been accepted from a third node, although the conflicting updates
-originally may have been issued in the correct sequential order.
-If named_timeout is nonzero, failed topology updates will be placed on a defer
-queue until another event arrives that clears the error, or until the timeout
-expires. Value is in milliseconds.
diff --git a/Documentation/sysctl/sunrpc.txt b/Documentation/sysctl/sunrpc.txt
deleted file mode 100644
index ae1ecac6f85a..000000000000
--- a/Documentation/sysctl/sunrpc.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-Documentation for /proc/sys/sunrpc/* kernel version 2.2.10
- (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/sunrpc and is valid for Linux kernel version 2.2.
-
-The files in this directory can be used to (re)set the debug
-flags of the SUN Remote Procedure Call (RPC) subsystem in
-the Linux kernel. This stuff is used for NFS, KNFSD and
-maybe a few other things as well.
-
-The files in there are used to control the debugging flags:
-rpc_debug, nfs_debug, nfsd_debug and nlm_debug.
-
-These flags are for kernel hackers only. You should read the
-source code in net/sunrpc/ for more information.
diff --git a/Documentation/sysctl/user.txt b/Documentation/sysctl/user.txt
deleted file mode 100644
index a5882865836e..000000000000
--- a/Documentation/sysctl/user.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-Documentation for /proc/sys/user/* kernel version 4.9.0
- (c) 2016 Eric Biederman <ebiederm@xmission.com>
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/user.
-
-The files in this directory can be used to override the default
-limits on the number of namespaces and other objects that have
-per user per user namespace limits.
-
-The primary purpose of these limits is to stop programs that
-malfunction and attempt to create a ridiculous number of objects,
-before the malfunction becomes a system wide problem. It is the
-intention that the defaults of these limits are set high enough that
-no program in normal operation should run into these limits.
-
-The creation of per user per user namespace objects are charged to
-the user in the user namespace who created the object and
-verified to be below the per user limit in that user namespace.
-
-The creation of objects is also charged to all of the users
-who created user namespaces the creation of the object happens
-in (user namespaces can be nested) and verified to be below the per user
-limits in the user namespaces of those users.
-
-This recursive counting of created objects ensures that creating a
-user namespace does not allow a user to escape their current limits.
-
-Currently, these files are in /proc/sys/user:
-
-- max_cgroup_namespaces
-
- The maximum number of cgroup namespaces that any user in the current
- user namespace may create.
-
-- max_ipc_namespaces
-
- The maximum number of ipc namespaces that any user in the current
- user namespace may create.
-
-- max_mnt_namespaces
-
- The maximum number of mount namespaces that any user in the current
- user namespace may create.
-
-- max_net_namespaces
-
- The maximum number of network namespaces that any user in the
- current user namespace may create.
-
-- max_pid_namespaces
-
- The maximum number of pid namespaces that any user in the current
- user namespace may create.
-
-- max_user_namespaces
-
- The maximum number of user namespaces that any user in the current
- user namespace may create.
-
-- max_uts_namespaces
-
- The maximum number of user namespaces that any user in the current
- user namespace may create.
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
deleted file mode 100644
index 749322060f10..000000000000
--- a/Documentation/sysctl/vm.txt
+++ /dev/null
@@ -1,946 +0,0 @@
-Documentation for /proc/sys/vm/* kernel version 2.6.29
- (c) 1998, 1999, Rik van Riel <riel@nl.linux.org>
- (c) 2008 Peter W. Morreale <pmorreale@novell.com>
-
-For general info and legal blurb, please look in README.
-
-==============================================================
-
-This file contains the documentation for the sysctl files in
-/proc/sys/vm and is valid for Linux kernel version 2.6.29.
-
-The files in this directory can be used to tune the operation
-of the virtual memory (VM) subsystem of the Linux kernel and
-the writeout of dirty data to disk.
-
-Default values and initialization routines for most of these
-files can be found in mm/swap.c.
-
-Currently, these files are in /proc/sys/vm:
-
-- admin_reserve_kbytes
-- block_dump
-- compact_memory
-- compact_unevictable_allowed
-- dirty_background_bytes
-- dirty_background_ratio
-- dirty_bytes
-- dirty_expire_centisecs
-- dirty_ratio
-- dirtytime_expire_seconds
-- dirty_writeback_centisecs
-- drop_caches
-- extfrag_threshold
-- hugetlb_shm_group
-- laptop_mode
-- legacy_va_layout
-- lowmem_reserve_ratio
-- max_map_count
-- memory_failure_early_kill
-- memory_failure_recovery
-- min_free_kbytes
-- min_slab_ratio
-- min_unmapped_ratio
-- mmap_min_addr
-- mmap_rnd_bits
-- mmap_rnd_compat_bits
-- nr_hugepages
-- nr_hugepages_mempolicy
-- nr_overcommit_hugepages
-- nr_trim_pages (only if CONFIG_MMU=n)
-- numa_zonelist_order
-- oom_dump_tasks
-- oom_kill_allocating_task
-- overcommit_kbytes
-- overcommit_memory
-- overcommit_ratio
-- page-cluster
-- panic_on_oom
-- percpu_pagelist_fraction
-- stat_interval
-- stat_refresh
-- numa_stat
-- swappiness
-- unprivileged_userfaultfd
-- user_reserve_kbytes
-- vfs_cache_pressure
-- watermark_boost_factor
-- watermark_scale_factor
-- zone_reclaim_mode
-
-==============================================================
-
-admin_reserve_kbytes
-
-The amount of free memory in the system that should be reserved for users
-with the capability cap_sys_admin.
-
-admin_reserve_kbytes defaults to min(3% of free pages, 8MB)
-
-That should provide enough for the admin to log in and kill a process,
-if necessary, under the default overcommit 'guess' mode.
-
-Systems running under overcommit 'never' should increase this to account
-for the full Virtual Memory Size of programs used to recover. Otherwise,
-root may not be able to log in to recover the system.
-
-How do you calculate a minimum useful reserve?
-
-sshd or login + bash (or some other shell) + top (or ps, kill, etc.)
-
-For overcommit 'guess', we can sum resident set sizes (RSS).
-On x86_64 this is about 8MB.
-
-For overcommit 'never', we can take the max of their virtual sizes (VSZ)
-and add the sum of their RSS.
-On x86_64 this is about 128MB.
-
-Changing this takes effect whenever an application requests memory.
-
-==============================================================
-
-block_dump
-
-block_dump enables block I/O debugging when set to a nonzero value. More
-information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
-
-==============================================================
-
-compact_memory
-
-Available only when CONFIG_COMPACTION is set. When 1 is written to the file,
-all zones are compacted such that free memory is available in contiguous
-blocks where possible. This can be important for example in the allocation of
-huge pages although processes will also directly compact memory as required.
-
-==============================================================
-
-compact_unevictable_allowed
-
-Available only when CONFIG_COMPACTION is set. When set to 1, compaction is
-allowed to examine the unevictable lru (mlocked pages) for pages to compact.
-This should be used on systems where stalls for minor page faults are an
-acceptable trade for large contiguous free memory. Set to 0 to prevent
-compaction from moving pages that are unevictable. Default value is 1.
-
-==============================================================
-
-dirty_background_bytes
-
-Contains the amount of dirty memory at which the background kernel
-flusher threads will start writeback.
-
-Note: dirty_background_bytes is the counterpart of dirty_background_ratio. Only
-one of them may be specified at a time. When one sysctl is written it is
-immediately taken into account to evaluate the dirty memory limits and the
-other appears as 0 when read.
-
-==============================================================
-
-dirty_background_ratio
-
-Contains, as a percentage of total available memory that contains free pages
-and reclaimable pages, the number of pages at which the background kernel
-flusher threads will start writing out dirty data.
-
-The total available memory is not equal to total system memory.
-
-==============================================================
-
-dirty_bytes
-
-Contains the amount of dirty memory at which a process generating disk writes
-will itself start writeback.
-
-Note: dirty_bytes is the counterpart of dirty_ratio. Only one of them may be
-specified at a time. When one sysctl is written it is immediately taken into
-account to evaluate the dirty memory limits and the other appears as 0 when
-read.
-
-Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any
-value lower than this limit will be ignored and the old configuration will be
-retained.
-
-==============================================================
-
-dirty_expire_centisecs
-
-This tunable is used to define when dirty data is old enough to be eligible
-for writeout by the kernel flusher threads. It is expressed in 100'ths
-of a second. Data which has been dirty in-memory for longer than this
-interval will be written out next time a flusher thread wakes up.
-
-==============================================================
-
-dirty_ratio
-
-Contains, as a percentage of total available memory that contains free pages
-and reclaimable pages, the number of pages at which a process which is
-generating disk writes will itself start writing out dirty data.
-
-The total available memory is not equal to total system memory.
-
-==============================================================
-
-dirtytime_expire_seconds
-
-When a lazytime inode is constantly having its pages dirtied, the inode with
-an updated timestamp will never get chance to be written out. And, if the
-only thing that has happened on the file system is a dirtytime inode caused
-by an atime update, a worker will be scheduled to make sure that inode
-eventually gets pushed out to disk. This tunable is used to define when dirty
-inode is old enough to be eligible for writeback by the kernel flusher threads.
-And, it is also used as the interval to wakeup dirtytime_writeback thread.
-
-==============================================================
-
-dirty_writeback_centisecs
-
-The kernel flusher threads will periodically wake up and write `old' data
-out to disk. This tunable expresses the interval between those wakeups, in
-100'ths of a second.
-
-Setting this to zero disables periodic writeback altogether.
-
-==============================================================
-
-drop_caches
-
-Writing to this will cause the kernel to drop clean caches, as well as
-reclaimable slab objects like dentries and inodes. Once dropped, their
-memory becomes free.
-
-To free pagecache:
- echo 1 > /proc/sys/vm/drop_caches
-To free reclaimable slab objects (includes dentries and inodes):
- echo 2 > /proc/sys/vm/drop_caches
-To free slab objects and pagecache:
- echo 3 > /proc/sys/vm/drop_caches
-
-This is a non-destructive operation and will not free any dirty objects.
-To increase the number of objects freed by this operation, the user may run
-`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the
-number of dirty objects on the system and create more candidates to be
-dropped.
-
-This file is not a means to control the growth of the various kernel caches
-(inodes, dentries, pagecache, etc...) These objects are automatically
-reclaimed by the kernel when memory is needed elsewhere on the system.
-
-Use of this file can cause performance problems. Since it discards cached
-objects, it may cost a significant amount of I/O and CPU to recreate the
-dropped objects, especially if they were under heavy use. Because of this,
-use outside of a testing or debugging environment is not recommended.
-
-You may see informational messages in your kernel log when this file is
-used:
-
- cat (1234): drop_caches: 3
-
-These are informational only. They do not mean that anything is wrong
-with your system. To disable them, echo 4 (bit 2) into drop_caches.
-
-==============================================================
-
-extfrag_threshold
-
-This parameter affects whether the kernel will compact memory or direct
-reclaim to satisfy a high-order allocation. The extfrag/extfrag_index file in
-debugfs shows what the fragmentation index for each order is in each zone in
-the system. Values tending towards 0 imply allocations would fail due to lack
-of memory, values towards 1000 imply failures are due to fragmentation and -1
-implies that the allocation will succeed as long as watermarks are met.
-
-The kernel will not compact memory in a zone if the
-fragmentation index is <= extfrag_threshold. The default value is 500.
-
-==============================================================
-
-highmem_is_dirtyable
-
-Available only for systems with CONFIG_HIGHMEM enabled (32b systems).
-
-This parameter controls whether the high memory is considered for dirty
-writers throttling. This is not the case by default which means that
-only the amount of memory directly visible/usable by the kernel can
-be dirtied. As a result, on systems with a large amount of memory and
-lowmem basically depleted writers might be throttled too early and
-streaming writes can get very slow.
-
-Changing the value to non zero would allow more memory to be dirtied
-and thus allow writers to write more data which can be flushed to the
-storage more effectively. Note this also comes with a risk of pre-mature
-OOM killer because some writers (e.g. direct block device writes) can
-only use the low memory and they can fill it up with dirty data without
-any throttling.
-
-==============================================================
-
-hugetlb_shm_group
-
-hugetlb_shm_group contains group id that is allowed to create SysV
-shared memory segment using hugetlb page.
-
-==============================================================
-
-laptop_mode
-
-laptop_mode is a knob that controls "laptop mode". All the things that are
-controlled by this knob are discussed in Documentation/laptops/laptop-mode.txt.
-
-==============================================================
-
-legacy_va_layout
-
-If non-zero, this sysctl disables the new 32-bit mmap layout - the kernel
-will use the legacy (2.4) layout for all processes.
-
-==============================================================
-
-lowmem_reserve_ratio
-
-For some specialised workloads on highmem machines it is dangerous for
-the kernel to allow process memory to be allocated from the "lowmem"
-zone. This is because that memory could then be pinned via the mlock()
-system call, or by unavailability of swapspace.
-
-And on large highmem machines this lack of reclaimable lowmem memory
-can be fatal.
-
-So the Linux page allocator has a mechanism which prevents allocations
-which _could_ use highmem from using too much lowmem. This means that
-a certain amount of lowmem is defended from the possibility of being
-captured into pinned user memory.
-
-(The same argument applies to the old 16 megabyte ISA DMA region. This
-mechanism will also defend that region from allocations which could use
-highmem or lowmem).
-
-The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is
-in defending these lower zones.
-
-If you have a machine which uses highmem or ISA DMA and your
-applications are using mlock(), or if you are running with no swap then
-you probably should change the lowmem_reserve_ratio setting.
-
-The lowmem_reserve_ratio is an array. You can see them by reading this file.
--
-% cat /proc/sys/vm/lowmem_reserve_ratio
-256 256 32
--
-
-But, these values are not used directly. The kernel calculates # of protection
-pages for each zones from them. These are shown as array of protection pages
-in /proc/zoneinfo like followings. (This is an example of x86-64 box).
-Each zone has an array of protection pages like this.
-
--
-Node 0, zone DMA
- pages free 1355
- min 3
- low 3
- high 4
- :
- :
- numa_other 0
- protection: (0, 2004, 2004, 2004)
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- pagesets
- cpu: 0 pcp: 0
- :
--
-These protections are added to score to judge whether this zone should be used
-for page allocation or should be reclaimed.
-
-In this example, if normal pages (index=2) are required to this DMA zone and
-watermark[WMARK_HIGH] is used for watermark, the kernel judges this zone should
-not be used because pages_free(1355) is smaller than watermark + protection[2]
-(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
-normal page requirement. If requirement is DMA zone(index=0), protection[0]
-(=0) is used.
-
-zone[i]'s protection[j] is calculated by following expression.
-
-(i < j):
- zone[i]->protection[j]
- = (total sums of managed_pages from zone[i+1] to zone[j] on the node)
- / lowmem_reserve_ratio[i];
-(i = j):
- (should not be protected. = 0;
-(i > j):
- (not necessary, but looks 0)
-
-The default values of lowmem_reserve_ratio[i] are
- 256 (if zone[i] means DMA or DMA32 zone)
- 32 (others).
-As above expression, they are reciprocal number of ratio.
-256 means 1/256. # of protection pages becomes about "0.39%" of total managed
-pages of higher zones on the node.
-
-If you would like to protect more pages, smaller values are effective.
-The minimum value is 1 (1/1 -> 100%). The value less than 1 completely
-disables protection of the pages.
-
-==============================================================
-
-max_map_count:
-
-This file contains the maximum number of memory map areas a process
-may have. Memory map areas are used as a side-effect of calling
-malloc, directly by mmap, mprotect, and madvise, and also when loading
-shared libraries.
-
-While most applications need less than a thousand maps, certain
-programs, particularly malloc debuggers, may consume lots of them,
-e.g., up to one or two maps per allocation.
-
-The default value is 65536.
-
-=============================================================
-
-memory_failure_early_kill:
-
-Control how to kill processes when uncorrected memory error (typically
-a 2bit error in a memory module) is detected in the background by hardware
-that cannot be handled by the kernel. In some cases (like the page
-still having a valid copy on disk) the kernel will handle the failure
-transparently without affecting any applications. But if there is
-no other uptodate copy of the data it will kill to prevent any data
-corruptions from propagating.
-
-1: Kill all processes that have the corrupted and not reloadable page mapped
-as soon as the corruption is detected. Note this is not supported
-for a few types of pages, like kernel internally allocated data or
-the swap cache, but works for the majority of user pages.
-
-0: Only unmap the corrupted page from all processes and only kill a process
-who tries to access it.
-
-The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
-handle this if they want to.
-
-This is only active on architectures/platforms with advanced machine
-check handling and depends on the hardware capabilities.
-
-Applications can override this setting individually with the PR_MCE_KILL prctl
-
-==============================================================
-
-memory_failure_recovery
-
-Enable memory failure recovery (when supported by the platform)
-
-1: Attempt recovery.
-
-0: Always panic on a memory failure.
-
-==============================================================
-
-min_free_kbytes:
-
-This is used to force the Linux VM to keep a minimum number
-of kilobytes free. The VM uses this number to compute a
-watermark[WMARK_MIN] value for each lowmem zone in the system.
-Each lowmem zone gets a number of reserved free pages based
-proportionally on its size.
-
-Some minimal amount of memory is needed to satisfy PF_MEMALLOC
-allocations; if you set this to lower than 1024KB, your system will
-become subtly broken, and prone to deadlock under high loads.
-
-Setting this too high will OOM your machine instantly.
-
-=============================================================
-
-min_slab_ratio:
-
-This is available only on NUMA kernels.
-
-A percentage of the total pages in each zone. On Zone reclaim
-(fallback from the local zone occurs) slabs will be reclaimed if more
-than this percentage of pages in a zone are reclaimable slab pages.
-This insures that the slab growth stays under control even in NUMA
-systems that rarely perform global reclaim.
-
-The default is 5 percent.
-
-Note that slab reclaim is triggered in a per zone / node fashion.
-The process of reclaiming slab memory is currently not node specific
-and may not be fast.
-
-=============================================================
-
-min_unmapped_ratio:
-
-This is available only on NUMA kernels.
-
-This is a percentage of the total pages in each zone. Zone reclaim will
-only occur if more than this percentage of pages are in a state that
-zone_reclaim_mode allows to be reclaimed.
-
-If zone_reclaim_mode has the value 4 OR'd, then the percentage is compared
-against all file-backed unmapped pages including swapcache pages and tmpfs
-files. Otherwise, only unmapped pages backed by normal files but not tmpfs
-files and similar are considered.
-
-The default is 1 percent.
-
-==============================================================
-
-mmap_min_addr
-
-This file indicates the amount of address space which a user process will
-be restricted from mmapping. Since kernel null dereference bugs could
-accidentally operate based on the information in the first couple of pages
-of memory userspace processes should not be allowed to write to them. By
-default this value is set to 0 and no protections will be enforced by the
-security module. Setting this value to something like 64k will allow the
-vast majority of applications to work correctly and provide defense in depth
-against future potential kernel bugs.
-
-==============================================================
-
-mmap_rnd_bits:
-
-This value can be used to select the number of bits to use to
-determine the random offset to the base address of vma regions
-resulting from mmap allocations on architectures which support
-tuning address space randomization. This value will be bounded
-by the architecture's minimum and maximum supported values.
-
-This value can be changed after boot using the
-/proc/sys/vm/mmap_rnd_bits tunable
-
-==============================================================
-
-mmap_rnd_compat_bits:
-
-This value can be used to select the number of bits to use to
-determine the random offset to the base address of vma regions
-resulting from mmap allocations for applications run in
-compatibility mode on architectures which support tuning address
-space randomization. This value will be bounded by the
-architecture's minimum and maximum supported values.
-
-This value can be changed after boot using the
-/proc/sys/vm/mmap_rnd_compat_bits tunable
-
-==============================================================
-
-nr_hugepages
-
-Change the minimum size of the hugepage pool.
-
-See Documentation/admin-guide/mm/hugetlbpage.rst
-
-==============================================================
-
-nr_hugepages_mempolicy
-
-Change the size of the hugepage pool at run-time on a specific
-set of NUMA nodes.
-
-See Documentation/admin-guide/mm/hugetlbpage.rst
-
-==============================================================
-
-nr_overcommit_hugepages
-
-Change the maximum size of the hugepage pool. The maximum is
-nr_hugepages + nr_overcommit_hugepages.
-
-See Documentation/admin-guide/mm/hugetlbpage.rst
-
-==============================================================
-
-nr_trim_pages
-
-This is available only on NOMMU kernels.
-
-This value adjusts the excess page trimming behaviour of power-of-2 aligned
-NOMMU mmap allocations.
-
-A value of 0 disables trimming of allocations entirely, while a value of 1
-trims excess pages aggressively. Any value >= 1 acts as the watermark where
-trimming of allocations is initiated.
-
-The default value is 1.
-
-See Documentation/nommu-mmap.txt for more information.
-
-==============================================================
-
-numa_zonelist_order
-
-This sysctl is only for NUMA and it is deprecated. Anything but
-Node order will fail!
-
-'where the memory is allocated from' is controlled by zonelists.
-(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation.
- you may be able to read ZONE_DMA as ZONE_DMA32...)
-
-In non-NUMA case, a zonelist for GFP_KERNEL is ordered as following.
-ZONE_NORMAL -> ZONE_DMA
-This means that a memory allocation request for GFP_KERNEL will
-get memory from ZONE_DMA only when ZONE_NORMAL is not available.
-
-In NUMA case, you can think of following 2 types of order.
-Assume 2 node NUMA and below is zonelist of Node(0)'s GFP_KERNEL
-
-(A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL
-(B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA.
-
-Type(A) offers the best locality for processes on Node(0), but ZONE_DMA
-will be used before ZONE_NORMAL exhaustion. This increases possibility of
-out-of-memory(OOM) of ZONE_DMA because ZONE_DMA is tend to be small.
-
-Type(B) cannot offer the best locality but is more robust against OOM of
-the DMA zone.
-
-Type(A) is called as "Node" order. Type (B) is "Zone" order.
-
-"Node order" orders the zonelists by node, then by zone within each node.
-Specify "[Nn]ode" for node order
-
-"Zone Order" orders the zonelists by zone type, then by node within each
-zone. Specify "[Zz]one" for zone order.
-
-Specify "[Dd]efault" to request automatic configuration.
-
-On 32-bit, the Normal zone needs to be preserved for allocations accessible
-by the kernel, so "zone" order will be selected.
-
-On 64-bit, devices that require DMA32/DMA are relatively rare, so "node"
-order will be selected.
-
-Default order is recommended unless this is causing problems for your
-system/application.
-
-==============================================================
-
-oom_dump_tasks
-
-Enables a system-wide task dump (excluding kernel threads) to be produced
-when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
-score, and name. This is helpful to determine why the OOM killer was
-invoked, to identify the rogue task that caused it, and to determine why
-the OOM killer chose the task it did to kill.
-
-If this is set to zero, this information is suppressed. On very
-large systems with thousands of tasks it may not be feasible to dump
-the memory state information for each one. Such systems should not
-be forced to incur a performance penalty in OOM conditions when the
-information may not be desired.
-
-If this is set to non-zero, this information is shown whenever the
-OOM killer actually kills a memory-hogging task.
-
-The default value is 1 (enabled).
-
-==============================================================
-
-oom_kill_allocating_task
-
-This enables or disables killing the OOM-triggering task in
-out-of-memory situations.
-
-If this is set to zero, the OOM killer will scan through the entire
-tasklist and select a task based on heuristics to kill. This normally
-selects a rogue memory-hogging task that frees up a large amount of
-memory when killed.
-
-If this is set to non-zero, the OOM killer simply kills the task that
-triggered the out-of-memory condition. This avoids the expensive
-tasklist scan.
-
-If panic_on_oom is selected, it takes precedence over whatever value
-is used in oom_kill_allocating_task.
-
-The default value is 0.
-
-==============================================================
-
-overcommit_kbytes:
-
-When overcommit_memory is set to 2, the committed address space is not
-permitted to exceed swap plus this amount of physical RAM. See below.
-
-Note: overcommit_kbytes is the counterpart of overcommit_ratio. Only one
-of them may be specified at a time. Setting one disables the other (which
-then appears as 0 when read).
-
-==============================================================
-
-overcommit_memory:
-
-This value contains a flag that enables memory overcommitment.
-
-When this flag is 0, the kernel attempts to estimate the amount
-of free memory left when userspace requests more memory.
-
-When this flag is 1, the kernel pretends there is always enough
-memory until it actually runs out.
-
-When this flag is 2, the kernel uses a "never overcommit"
-policy that attempts to prevent any overcommit of memory.
-Note that user_reserve_kbytes affects this policy.
-
-This feature can be very useful because there are a lot of
-programs that malloc() huge amounts of memory "just-in-case"
-and don't use much of it.
-
-The default value is 0.
-
-See Documentation/vm/overcommit-accounting.rst and
-mm/util.c::__vm_enough_memory() for more information.
-
-==============================================================
-
-overcommit_ratio:
-
-When overcommit_memory is set to 2, the committed address
-space is not permitted to exceed swap plus this percentage
-of physical RAM. See above.
-
-==============================================================
-
-page-cluster
-
-page-cluster controls the number of pages up to which consecutive pages
-are read in from swap in a single attempt. This is the swap counterpart
-to page cache readahead.
-The mentioned consecutivity is not in terms of virtual/physical addresses,
-but consecutive on swap space - that means they were swapped out together.
-
-It is a logarithmic value - setting it to zero means "1 page", setting
-it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
-Zero disables swap readahead completely.
-
-The default value is three (eight pages at a time). There may be some
-small benefits in tuning this to a different value if your workload is
-swap-intensive.
-
-Lower values mean lower latencies for initial faults, but at the same time
-extra faults and I/O delays for following faults if they would have been part of
-that consecutive pages readahead would have brought in.
-
-=============================================================
-
-panic_on_oom
-
-This enables or disables panic on out-of-memory feature.
-
-If this is set to 0, the kernel will kill some rogue process,
-called oom_killer. Usually, oom_killer can kill rogue processes and
-system will survive.
-
-If this is set to 1, the kernel panics when out-of-memory happens.
-However, if a process limits using nodes by mempolicy/cpusets,
-and those nodes become memory exhaustion status, one process
-may be killed by oom-killer. No panic occurs in this case.
-Because other nodes' memory may be free. This means system total status
-may be not fatal yet.
-
-If this is set to 2, the kernel panics compulsorily even on the
-above-mentioned. Even oom happens under memory cgroup, the whole
-system panics.
-
-The default value is 0.
-1 and 2 are for failover of clustering. Please select either
-according to your policy of failover.
-panic_on_oom=2+kdump gives you very strong tool to investigate
-why oom happens. You can get snapshot.
-
-=============================================================
-
-percpu_pagelist_fraction
-
-This is the fraction of pages at most (high mark pcp->high) in each zone that
-are allocated for each per cpu page list. The min value for this is 8. It
-means that we don't allow more than 1/8th of pages in each zone to be
-allocated in any single per_cpu_pagelist. This entry only changes the value
-of hot per cpu pagelists. User can specify a number like 100 to allocate
-1/100th of each zone to each per cpu page list.
-
-The batch value of each per cpu pagelist is also updated as a result. It is
-set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8)
-
-The initial value is zero. Kernel does not use this value at boot time to set
-the high water marks for each per cpu page list. If the user writes '0' to this
-sysctl, it will revert to this default behavior.
-
-==============================================================
-
-stat_interval
-
-The time interval between which vm statistics are updated. The default
-is 1 second.
-
-==============================================================
-
-stat_refresh
-
-Any read or write (by root only) flushes all the per-cpu vm statistics
-into their global totals, for more accurate reports when testing
-e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
-
-As a side-effect, it also checks for negative totals (elsewhere reported
-as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
-(At time of writing, a few stats are known sometimes to be found negative,
-with no ill effects: errors and warnings on these stats are suppressed.)
-
-==============================================================
-
-numa_stat
-
-This interface allows runtime configuration of numa statistics.
-
-When page allocation performance becomes a bottleneck and you can tolerate
-some possible tool breakage and decreased numa counter precision, you can
-do:
- echo 0 > /proc/sys/vm/numa_stat
-
-When page allocation performance is not a bottleneck and you want all
-tooling to work, you can do:
- echo 1 > /proc/sys/vm/numa_stat
-
-==============================================================
-
-swappiness
-
-This control is used to define how aggressive the kernel will swap
-memory pages. Higher values will increase aggressiveness, lower values
-decrease the amount of swap. A value of 0 instructs the kernel not to
-initiate swap until the amount of free and file-backed pages is less
-than the high water mark in a zone.
-
-The default value is 60.
-
-==============================================================
-
-unprivileged_userfaultfd
-
-This flag controls whether unprivileged users can use the userfaultfd
-system calls. Set this to 1 to allow unprivileged users to use the
-userfaultfd system calls, or set this to 0 to restrict userfaultfd to only
-privileged users (with SYS_CAP_PTRACE capability).
-
-The default value is 1.
-
-==============================================================
-
-- user_reserve_kbytes
-
-When overcommit_memory is set to 2, "never overcommit" mode, reserve
-min(3% of current process size, user_reserve_kbytes) of free memory.
-This is intended to prevent a user from starting a single memory hogging
-process, such that they cannot recover (kill the hog).
-
-user_reserve_kbytes defaults to min(3% of the current process size, 128MB).
-
-If this is reduced to zero, then the user will be allowed to allocate
-all free memory with a single process, minus admin_reserve_kbytes.
-Any subsequent attempts to execute a command will result in
-"fork: Cannot allocate memory".
-
-Changing this takes effect whenever an application requests memory.
-
-==============================================================
-
-vfs_cache_pressure
-------------------
-
-This percentage value controls the tendency of the kernel to reclaim
-the memory which is used for caching of directory and inode objects.
-
-At the default value of vfs_cache_pressure=100 the kernel will attempt to
-reclaim dentries and inodes at a "fair" rate with respect to pagecache and
-swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
-to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
-never reclaim dentries and inodes due to memory pressure and this can easily
-lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
-causes the kernel to prefer to reclaim dentries and inodes.
-
-Increasing vfs_cache_pressure significantly beyond 100 may have negative
-performance impact. Reclaim code needs to take various locks to find freeable
-directory and inode objects. With vfs_cache_pressure=1000, it will look for
-ten times more freeable objects than there are.
-
-=============================================================
-
-watermark_boost_factor:
-
-This factor controls the level of reclaim when memory is being fragmented.
-It defines the percentage of the high watermark of a zone that will be
-reclaimed if pages of different mobility are being mixed within pageblocks.
-The intent is that compaction has less work to do in the future and to
-increase the success rate of future high-order allocations such as SLUB
-allocations, THP and hugetlbfs pages.
-
-To make it sensible with respect to the watermark_scale_factor
-parameter, the unit is in fractions of 10,000. The default value of
-15,000 on !DISCONTIGMEM configurations means that up to 150% of the high
-watermark will be reclaimed in the event of a pageblock being mixed due
-to fragmentation. The level of reclaim is determined by the number of
-fragmentation events that occurred in the recent past. If this value is
-smaller than a pageblock then a pageblocks worth of pages will be reclaimed
-(e.g. 2MB on 64-bit x86). A boost factor of 0 will disable the feature.
-
-=============================================================
-
-watermark_scale_factor:
-
-This factor controls the aggressiveness of kswapd. It defines the
-amount of memory left in a node/system before kswapd is woken up and
-how much memory needs to be free before kswapd goes back to sleep.
-
-The unit is in fractions of 10,000. The default value of 10 means the
-distances between watermarks are 0.1% of the available memory in the
-node/system. The maximum value is 1000, or 10% of memory.
-
-A high rate of threads entering direct reclaim (allocstall) or kswapd
-going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
-that the number of free pages kswapd maintains for latency reasons is
-too small for the allocation bursts occurring in the system. This knob
-can then be used to tune kswapd aggressiveness accordingly.
-
-==============================================================
-
-zone_reclaim_mode:
-
-Zone_reclaim_mode allows someone to set more or less aggressive approaches to
-reclaim memory when a zone runs out of memory. If it is set to zero then no
-zone reclaim occurs. Allocations will be satisfied from other zones / nodes
-in the system.
-
-This is value ORed together of
-
-1 = Zone reclaim on
-2 = Zone reclaim writes dirty pages out
-4 = Zone reclaim swaps pages
-
-zone_reclaim_mode is disabled by default. For file servers or workloads
-that benefit from having their data cached, zone_reclaim_mode should be
-left disabled as the caching effect is likely to be more important than
-data locality.
-
-zone_reclaim may be enabled if it's known that the workload is partitioned
-such that each partition fits within a NUMA node and that accessing remote
-memory would cause a measurable performance reduction. The page allocator
-will then reclaim easily reusable pages (those page cache pages that are
-currently not used) before allocating off node pages.
-
-Allowing zone reclaim to write out pages stops processes that are
-writing large amounts of data from dirtying pages on other nodes. Zone
-reclaim will write out dirty pages if a zone fills up and so effectively
-throttle the process. This may decrease the performance of a single process
-since it cannot use all of system memory to buffer the outgoing writes
-anymore but it preserve the memory on other nodes so that the performance
-of other processes running on other nodes will not be affected.
-
-Allowing regular swap effectively restricts allocations to the local
-node unless explicitly overridden by memory policies or cpuset
-configurations.
-
-============ End of Document =================================
diff --git a/Documentation/target/index.rst b/Documentation/target/index.rst
index b68f48982392..4b24f81f747e 100644
--- a/Documentation/target/index.rst
+++ b/Documentation/target/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
==================
TCM Virtual Device
diff --git a/Documentation/thermal/cpu-cooling-api.rst b/Documentation/thermal/cpu-cooling-api.rst
new file mode 100644
index 000000000000..645d914c45a6
--- /dev/null
+++ b/Documentation/thermal/cpu-cooling-api.rst
@@ -0,0 +1,107 @@
+=======================
+CPU cooling APIs How To
+=======================
+
+Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
+
+Updated: 6 Jan 2015
+
+Copyright (c) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
+
+0. Introduction
+===============
+
+The generic cpu cooling(freq clipping) provides registration/unregistration APIs
+to the caller. The binding of the cooling devices to the trip point is left for
+the user. The registration APIs returns the cooling device pointer.
+
+1. cpu cooling APIs
+===================
+
+1.1 cpufreq registration/unregistration APIs
+--------------------------------------------
+
+ ::
+
+ struct thermal_cooling_device
+ *cpufreq_cooling_register(struct cpumask *clip_cpus)
+
+ This interface function registers the cpufreq cooling device with the name
+ "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
+ cooling devices.
+
+ clip_cpus:
+ cpumask of cpus where the frequency constraints will happen.
+
+ ::
+
+ struct thermal_cooling_device
+ *of_cpufreq_cooling_register(struct cpufreq_policy *policy)
+
+ This interface function registers the cpufreq cooling device with
+ the name "thermal-cpufreq-%x" linking it with a device tree node, in
+ order to bind it via the thermal DT code. This api can support multiple
+ instances of cpufreq cooling devices.
+
+ policy:
+ CPUFreq policy.
+
+
+ ::
+
+ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
+
+ This interface function unregisters the "thermal-cpufreq-%x" cooling device.
+
+ cdev: Cooling device pointer which has to be unregistered.
+
+2. Power models
+===============
+
+The power API registration functions provide a simple power model for
+CPUs. The current power is calculated as dynamic power (static power isn't
+supported currently). This power model requires that the operating-points of
+the CPUs are registered using the kernel's opp library and the
+`cpufreq_frequency_table` is assigned to the `struct device` of the
+cpu. If you are using CONFIG_CPUFREQ_DT then the
+`cpufreq_frequency_table` should already be assigned to the cpu
+device.
+
+The dynamic power consumption of a processor depends on many factors.
+For a given processor implementation the primary factors are:
+
+- The time the processor spends running, consuming dynamic power, as
+ compared to the time in idle states where dynamic consumption is
+ negligible. Herein we refer to this as 'utilisation'.
+- The voltage and frequency levels as a result of DVFS. The DVFS
+ level is a dominant factor governing power consumption.
+- In running time the 'execution' behaviour (instruction types, memory
+ access patterns and so forth) causes, in most cases, a second order
+ variation. In pathological cases this variation can be significant,
+ but typically it is of a much lesser impact than the factors above.
+
+A high level dynamic power consumption model may then be represented as::
+
+ Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
+
+f(run) here represents the described execution behaviour and its
+result has a units of Watts/Hz/Volt^2 (this often expressed in
+mW/MHz/uVolt^2)
+
+The detailed behaviour for f(run) could be modelled on-line. However,
+in practice, such an on-line model has dependencies on a number of
+implementation specific processor support and characterisation
+factors. Therefore, in initial implementation that contribution is
+represented as a constant coefficient. This is a simplification
+consistent with the relative contribution to overall power variation.
+
+In this simplified representation our model becomes::
+
+ Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
+
+Where `capacitance` is a constant that represents an indicative
+running time dynamic power coefficient in fundamental units of
+mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range
+from 100 to 500. For reference, the approximate values for the SoC in
+ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
+140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt
deleted file mode 100644
index 7df567eaea1a..000000000000
--- a/Documentation/thermal/cpu-cooling-api.txt
+++ /dev/null
@@ -1,92 +0,0 @@
-CPU cooling APIs How To
-===================================
-
-Written by Amit Daniel Kachhap <amit.kachhap@linaro.org>
-
-Updated: 6 Jan 2015
-
-Copyright (c) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com)
-
-0. Introduction
-
-The generic cpu cooling(freq clipping) provides registration/unregistration APIs
-to the caller. The binding of the cooling devices to the trip point is left for
-the user. The registration APIs returns the cooling device pointer.
-
-1. cpu cooling APIs
-
-1.1 cpufreq registration/unregistration APIs
-1.1.1 struct thermal_cooling_device *cpufreq_cooling_register(
- struct cpumask *clip_cpus)
-
- This interface function registers the cpufreq cooling device with the name
- "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
- cooling devices.
-
- clip_cpus: cpumask of cpus where the frequency constraints will happen.
-
-1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register(
- struct cpufreq_policy *policy)
-
- This interface function registers the cpufreq cooling device with
- the name "thermal-cpufreq-%x" linking it with a device tree node, in
- order to bind it via the thermal DT code. This api can support multiple
- instances of cpufreq cooling devices.
-
- policy: CPUFreq policy.
-
-1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
-
- This interface function unregisters the "thermal-cpufreq-%x" cooling device.
-
- cdev: Cooling device pointer which has to be unregistered.
-
-2. Power models
-
-The power API registration functions provide a simple power model for
-CPUs. The current power is calculated as dynamic power (static power isn't
-supported currently). This power model requires that the operating-points of
-the CPUs are registered using the kernel's opp library and the
-`cpufreq_frequency_table` is assigned to the `struct device` of the
-cpu. If you are using CONFIG_CPUFREQ_DT then the
-`cpufreq_frequency_table` should already be assigned to the cpu
-device.
-
-The dynamic power consumption of a processor depends on many factors.
-For a given processor implementation the primary factors are:
-
-- The time the processor spends running, consuming dynamic power, as
- compared to the time in idle states where dynamic consumption is
- negligible. Herein we refer to this as 'utilisation'.
-- The voltage and frequency levels as a result of DVFS. The DVFS
- level is a dominant factor governing power consumption.
-- In running time the 'execution' behaviour (instruction types, memory
- access patterns and so forth) causes, in most cases, a second order
- variation. In pathological cases this variation can be significant,
- but typically it is of a much lesser impact than the factors above.
-
-A high level dynamic power consumption model may then be represented as:
-
-Pdyn = f(run) * Voltage^2 * Frequency * Utilisation
-
-f(run) here represents the described execution behaviour and its
-result has a units of Watts/Hz/Volt^2 (this often expressed in
-mW/MHz/uVolt^2)
-
-The detailed behaviour for f(run) could be modelled on-line. However,
-in practice, such an on-line model has dependencies on a number of
-implementation specific processor support and characterisation
-factors. Therefore, in initial implementation that contribution is
-represented as a constant coefficient. This is a simplification
-consistent with the relative contribution to overall power variation.
-
-In this simplified representation our model becomes:
-
-Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation
-
-Where `capacitance` is a constant that represents an indicative
-running time dynamic power coefficient in fundamental units of
-mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range
-from 100 to 500. For reference, the approximate values for the SoC in
-ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and
-140 for the Cortex-A53 cluster.
diff --git a/Documentation/thermal/exynos_thermal b/Documentation/thermal/exynos_thermal
deleted file mode 100644
index 9010c4416967..000000000000
--- a/Documentation/thermal/exynos_thermal
+++ /dev/null
@@ -1,77 +0,0 @@
-Kernel driver exynos_tmu
-=================
-
-Supported chips:
-* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
- Datasheet: Not publicly available
-
-Authors: Donggeun Kim <dg77.kim@samsung.com>
-Authors: Amit Daniel <amit.daniel@samsung.com>
-
-TMU controller Description:
----------------------------
-
-This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
-
-The chip only exposes the measured 8-bit temperature code value
-through a register.
-Temperature can be taken from the temperature code.
-There are three equations converting from temperature to temperature code.
-
-The three equations are:
- 1. Two point trimming
- Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
-
- 2. One point trimming
- Tc = T + TI1 - 25
-
- 3. No trimming
- Tc = T + 50
-
- Tc: Temperature code, T: Temperature,
- TI1: Trimming info for 25 degree Celsius (stored at TRIMINFO register)
- Temperature code measured at 25 degree Celsius which is unchanged
- TI2: Trimming info for 85 degree Celsius (stored at TRIMINFO register)
- Temperature code measured at 85 degree Celsius which is unchanged
-
-TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
-when temperature exceeds pre-defined levels.
-The maximum number of configurable threshold is five.
-The threshold levels are defined as follows:
- Level_0: current temperature > trigger_level_0 + threshold
- Level_1: current temperature > trigger_level_1 + threshold
- Level_2: current temperature > trigger_level_2 + threshold
- Level_3: current temperature > trigger_level_3 + threshold
-
- The threshold and each trigger_level are set
- through the corresponding registers.
-
-When an interrupt occurs, this driver notify kernel thermal framework
-with the function exynos_report_trigger.
-Although an interrupt condition for level_0 can be set,
-it can be used to synchronize the cooling action.
-
-TMU driver description:
------------------------
-
-The exynos thermal driver is structured as,
-
- Kernel Core thermal framework
- (thermal_core.c, step_wise.c, cpu_cooling.c)
- ^
- |
- |
-TMU configuration data -------> TMU Driver <------> Exynos Core thermal wrapper
-(exynos_tmu_data.c) (exynos_tmu.c) (exynos_thermal_common.c)
-(exynos_tmu_data.h) (exynos_tmu.h) (exynos_thermal_common.h)
-
-a) TMU configuration data: This consist of TMU register offsets/bitfields
- described through structure exynos_tmu_registers. Also several
- other platform data (struct exynos_tmu_platform_data) members
- are used to configure the TMU.
-b) TMU driver: This component initialises the TMU controller and sets different
- thresholds. It invokes core thermal implementation with the call
- exynos_report_trigger.
-c) Exynos Core thermal wrapper: This provides 3 wrapper function to use the
- Kernel core thermal framework. They are exynos_unregister_thermal,
- exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal.rst b/Documentation/thermal/exynos_thermal.rst
new file mode 100644
index 000000000000..5bd556566c70
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal.rst
@@ -0,0 +1,90 @@
+========================
+Kernel driver exynos_tmu
+========================
+
+Supported chips:
+
+* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC
+
+ Datasheet: Not publicly available
+
+Authors: Donggeun Kim <dg77.kim@samsung.com>
+Authors: Amit Daniel <amit.daniel@samsung.com>
+
+TMU controller Description:
+---------------------------
+
+This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC.
+
+The chip only exposes the measured 8-bit temperature code value
+through a register.
+Temperature can be taken from the temperature code.
+There are three equations converting from temperature to temperature code.
+
+The three equations are:
+ 1. Two point trimming::
+
+ Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1
+
+ 2. One point trimming::
+
+ Tc = T + TI1 - 25
+
+ 3. No trimming::
+
+ Tc = T + 50
+
+ Tc:
+ Temperature code, T: Temperature,
+ TI1:
+ Trimming info for 25 degree Celsius (stored at TRIMINFO register)
+ Temperature code measured at 25 degree Celsius which is unchanged
+ TI2:
+ Trimming info for 85 degree Celsius (stored at TRIMINFO register)
+ Temperature code measured at 85 degree Celsius which is unchanged
+
+TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt
+when temperature exceeds pre-defined levels.
+The maximum number of configurable threshold is five.
+The threshold levels are defined as follows::
+
+ Level_0: current temperature > trigger_level_0 + threshold
+ Level_1: current temperature > trigger_level_1 + threshold
+ Level_2: current temperature > trigger_level_2 + threshold
+ Level_3: current temperature > trigger_level_3 + threshold
+
+The threshold and each trigger_level are set
+through the corresponding registers.
+
+When an interrupt occurs, this driver notify kernel thermal framework
+with the function exynos_report_trigger.
+Although an interrupt condition for level_0 can be set,
+it can be used to synchronize the cooling action.
+
+TMU driver description:
+-----------------------
+
+The exynos thermal driver is structured as::
+
+ Kernel Core thermal framework
+ (thermal_core.c, step_wise.c, cpu_cooling.c)
+ ^
+ |
+ |
+ TMU configuration data -----> TMU Driver <----> Exynos Core thermal wrapper
+ (exynos_tmu_data.c) (exynos_tmu.c) (exynos_thermal_common.c)
+ (exynos_tmu_data.h) (exynos_tmu.h) (exynos_thermal_common.h)
+
+a) TMU configuration data:
+ This consist of TMU register offsets/bitfields
+ described through structure exynos_tmu_registers. Also several
+ other platform data (struct exynos_tmu_platform_data) members
+ are used to configure the TMU.
+b) TMU driver:
+ This component initialises the TMU controller and sets different
+ thresholds. It invokes core thermal implementation with the call
+ exynos_report_trigger.
+c) Exynos Core thermal wrapper:
+ This provides 3 wrapper function to use the
+ Kernel core thermal framework. They are exynos_unregister_thermal,
+ exynos_register_thermal and exynos_report_trigger.
diff --git a/Documentation/thermal/exynos_thermal_emulation b/Documentation/thermal/exynos_thermal_emulation
deleted file mode 100644
index b15efec6ca28..000000000000
--- a/Documentation/thermal/exynos_thermal_emulation
+++ /dev/null
@@ -1,53 +0,0 @@
-EXYNOS EMULATION MODE
-========================
-
-Copyright (C) 2012 Samsung Electronics
-
-Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
-
-Description
------------
-
-Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal management unit.
-Thermal emulation mode supports software debug for TMU's operation. User can set temperature
-manually with software code and TMU will read current temperature from user value not from
-sensor's value.
-
-Enabling CONFIG_THERMAL_EMULATION option will make this support available.
-When it's enabled, sysfs node will be created as
-/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
-
-The sysfs node, 'emul_node', will contain value 0 for the initial state. When you input any
-temperature you want to update to sysfs node, it automatically enable emulation mode and
-current temperature will be changed into it.
-(Exynos also supports user changeable delay time which would be used to delay of
- changing temperature. However, this node only uses same delay of real sensing time, 938us.)
-
-Exynos emulation mode requires synchronous of value changing and enabling. It means when you
-want to update the any value of delay or next temperature, then you have to enable emulation
-mode at the same time. (Or you have to keep the mode enabling.) If you don't, it fails to
-change the value to updated one and just use last succeessful value repeatedly. That's why
-this node gives users the right to change termerpature only. Just one interface makes it more
-simply to use.
-
-Disabling emulation mode only requires writing value 0 to sysfs node.
-
-
-TEMP 120 |
- |
- 100 |
- |
- 80 |
- | +-----------
- 60 | | |
- | +-------------| |
- 40 | | | |
- | | | |
- 20 | | | +----------
- | | | | |
- 0 |______________|_____________|__________|__________|_________
- A A A A TIME
- |<----->| |<----->| |<----->| |
- | 938us | | | | | |
-emulation : 0 50 | 70 | 20 | 0
-current temp : sensor 50 70 20 sensor
diff --git a/Documentation/thermal/exynos_thermal_emulation.rst b/Documentation/thermal/exynos_thermal_emulation.rst
new file mode 100644
index 000000000000..c21d10838bc5
--- /dev/null
+++ b/Documentation/thermal/exynos_thermal_emulation.rst
@@ -0,0 +1,61 @@
+=====================
+Exynos Emulation Mode
+=====================
+
+Copyright (C) 2012 Samsung Electronics
+
+Written by Jonghwa Lee <jonghwa3.lee@samsung.com>
+
+Description
+-----------
+
+Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal
+management unit. Thermal emulation mode supports software debug for
+TMU's operation. User can set temperature manually with software code
+and TMU will read current temperature from user value not from sensor's
+value.
+
+Enabling CONFIG_THERMAL_EMULATION option will make this support
+available. When it's enabled, sysfs node will be created as
+/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp.
+
+The sysfs node, 'emul_node', will contain value 0 for the initial state.
+When you input any temperature you want to update to sysfs node, it
+automatically enable emulation mode and current temperature will be
+changed into it.
+
+(Exynos also supports user changeable delay time which would be used to
+delay of changing temperature. However, this node only uses same delay
+of real sensing time, 938us.)
+
+Exynos emulation mode requires synchronous of value changing and
+enabling. It means when you want to update the any value of delay or
+next temperature, then you have to enable emulation mode at the same
+time. (Or you have to keep the mode enabling.) If you don't, it fails to
+change the value to updated one and just use last succeessful value
+repeatedly. That's why this node gives users the right to change
+termerpature only. Just one interface makes it more simply to use.
+
+Disabling emulation mode only requires writing value 0 to sysfs node.
+
+::
+
+
+ TEMP 120 |
+ |
+ 100 |
+ |
+ 80 |
+ | +-----------
+ 60 | | |
+ | +-------------| |
+ 40 | | | |
+ | | | |
+ 20 | | | +----------
+ | | | | |
+ 0 |______________|_____________|__________|__________|_________
+ A A A A TIME
+ |<----->| |<----->| |<----->| |
+ | 938us | | | | | |
+ emulation : 0 50 | 70 | 20 | 0
+ current temp: sensor 50 70 20 sensor
diff --git a/Documentation/thermal/index.rst b/Documentation/thermal/index.rst
new file mode 100644
index 000000000000..8c1c00146cad
--- /dev/null
+++ b/Documentation/thermal/index.rst
@@ -0,0 +1,18 @@
+:orphan:
+
+=======
+Thermal
+=======
+
+.. toctree::
+ :maxdepth: 1
+
+ cpu-cooling-api
+ sysfs-api
+ power_allocator
+
+ exynos_thermal
+ exynos_thermal_emulation
+ intel_powerclamp
+ nouveau_thermal
+ x86_pkg_temperature_thermal
diff --git a/Documentation/thermal/intel_powerclamp.rst b/Documentation/thermal/intel_powerclamp.rst
new file mode 100644
index 000000000000..3f6dfb0b3ea6
--- /dev/null
+++ b/Documentation/thermal/intel_powerclamp.rst
@@ -0,0 +1,320 @@
+=======================
+Intel Powerclamp Driver
+=======================
+
+By:
+ - Arjan van de Ven <arjan@linux.intel.com>
+ - Jacob Pan <jacob.jun.pan@linux.intel.com>
+
+.. Contents:
+
+ (*) Introduction
+ - Goals and Objectives
+
+ (*) Theory of Operation
+ - Idle Injection
+ - Calibration
+
+ (*) Performance Analysis
+ - Effectiveness and Limitations
+ - Power vs Performance
+ - Scalability
+ - Calibration
+ - Comparison with Alternative Techniques
+
+ (*) Usage and Interfaces
+ - Generic Thermal Layer (sysfs)
+ - Kernel APIs (TBD)
+
+INTRODUCTION
+============
+
+Consider the situation where a system’s power consumption must be
+reduced at runtime, due to power budget, thermal constraint, or noise
+level, and where active cooling is not preferred. Software managed
+passive power reduction must be performed to prevent the hardware
+actions that are designed for catastrophic scenarios.
+
+Currently, P-states, T-states (clock modulation), and CPU offlining
+are used for CPU throttling.
+
+On Intel CPUs, C-states provide effective power reduction, but so far
+they’re only used opportunistically, based on workload. With the
+development of intel_powerclamp driver, the method of synchronizing
+idle injection across all online CPU threads was introduced. The goal
+is to achieve forced and controllable C-state residency.
+
+Test/Analysis has been made in the areas of power, performance,
+scalability, and user experience. In many cases, clear advantage is
+shown over taking the CPU offline or modulating the CPU clock.
+
+
+THEORY OF OPERATION
+===================
+
+Idle Injection
+--------------
+
+On modern Intel processors (Nehalem or later), package level C-state
+residency is available in MSRs, thus also available to the kernel.
+
+These MSRs are::
+
+ #define MSR_PKG_C2_RESIDENCY 0x60D
+ #define MSR_PKG_C3_RESIDENCY 0x3F8
+ #define MSR_PKG_C6_RESIDENCY 0x3F9
+ #define MSR_PKG_C7_RESIDENCY 0x3FA
+
+If the kernel can also inject idle time to the system, then a
+closed-loop control system can be established that manages package
+level C-state. The intel_powerclamp driver is conceived as such a
+control system, where the target set point is a user-selected idle
+ratio (based on power reduction), and the error is the difference
+between the actual package level C-state residency ratio and the target idle
+ratio.
+
+Injection is controlled by high priority kernel threads, spawned for
+each online CPU.
+
+These kernel threads, with SCHED_FIFO class, are created to perform
+clamping actions of controlled duty ratio and duration. Each per-CPU
+thread synchronizes its idle time and duration, based on the rounding
+of jiffies, so accumulated errors can be prevented to avoid a jittery
+effect. Threads are also bound to the CPU such that they cannot be
+migrated, unless the CPU is taken offline. In this case, threads
+belong to the offlined CPUs will be terminated immediately.
+
+Running as SCHED_FIFO and relatively high priority, also allows such
+scheme to work for both preemptable and non-preemptable kernels.
+Alignment of idle time around jiffies ensures scalability for HZ
+values. This effect can be better visualized using a Perf timechart.
+The following diagram shows the behavior of kernel thread
+kidle_inject/cpu. During idle injection, it runs monitor/mwait idle
+for a given "duration", then relinquishes the CPU to other tasks,
+until the next time interval.
+
+The NOHZ schedule tick is disabled during idle time, but interrupts
+are not masked. Tests show that the extra wakeups from scheduler tick
+have a dramatic impact on the effectiveness of the powerclamp driver
+on large scale systems (Westmere system with 80 processors).
+
+::
+
+ CPU0
+ ____________ ____________
+ kidle_inject/0 | sleep | mwait | sleep |
+ _________| |________| |_______
+ duration
+ CPU1
+ ____________ ____________
+ kidle_inject/1 | sleep | mwait | sleep |
+ _________| |________| |_______
+ ^
+ |
+ |
+ roundup(jiffies, interval)
+
+Only one CPU is allowed to collect statistics and update global
+control parameters. This CPU is referred to as the controlling CPU in
+this document. The controlling CPU is elected at runtime, with a
+policy that favors BSP, taking into account the possibility of a CPU
+hot-plug.
+
+In terms of dynamics of the idle control system, package level idle
+time is considered largely as a non-causal system where its behavior
+cannot be based on the past or current input. Therefore, the
+intel_powerclamp driver attempts to enforce the desired idle time
+instantly as given input (target idle ratio). After injection,
+powerclamp monitors the actual idle for a given time window and adjust
+the next injection accordingly to avoid over/under correction.
+
+When used in a causal control system, such as a temperature control,
+it is up to the user of this driver to implement algorithms where
+past samples and outputs are included in the feedback. For example, a
+PID-based thermal controller can use the powerclamp driver to
+maintain a desired target temperature, based on integral and
+derivative gains of the past samples.
+
+
+
+Calibration
+-----------
+During scalability testing, it is observed that synchronized actions
+among CPUs become challenging as the number of cores grows. This is
+also true for the ability of a system to enter package level C-states.
+
+To make sure the intel_powerclamp driver scales well, online
+calibration is implemented. The goals for doing such a calibration
+are:
+
+a) determine the effective range of idle injection ratio
+b) determine the amount of compensation needed at each target ratio
+
+Compensation to each target ratio consists of two parts:
+
+ a) steady state error compensation
+ This is to offset the error occurring when the system can
+ enter idle without extra wakeups (such as external interrupts).
+
+ b) dynamic error compensation
+ When an excessive amount of wakeups occurs during idle, an
+ additional idle ratio can be added to quiet interrupts, by
+ slowing down CPU activities.
+
+A debugfs file is provided for the user to examine compensation
+progress and results, such as on a Westmere system::
+
+ [jacob@nex01 ~]$ cat
+ /sys/kernel/debug/intel_powerclamp/powerclamp_calib
+ controlling cpu: 0
+ pct confidence steady dynamic (compensation)
+ 0 0 0 0
+ 1 1 0 0
+ 2 1 1 0
+ 3 3 1 0
+ 4 3 1 0
+ 5 3 1 0
+ 6 3 1 0
+ 7 3 1 0
+ 8 3 1 0
+ ...
+ 30 3 2 0
+ 31 3 2 0
+ 32 3 1 0
+ 33 3 2 0
+ 34 3 1 0
+ 35 3 2 0
+ 36 3 1 0
+ 37 3 2 0
+ 38 3 1 0
+ 39 3 2 0
+ 40 3 3 0
+ 41 3 1 0
+ 42 3 2 0
+ 43 3 1 0
+ 44 3 1 0
+ 45 3 2 0
+ 46 3 3 0
+ 47 3 0 0
+ 48 3 2 0
+ 49 3 3 0
+
+Calibration occurs during runtime. No offline method is available.
+Steady state compensation is used only when confidence levels of all
+adjacent ratios have reached satisfactory level. A confidence level
+is accumulated based on clean data collected at runtime. Data
+collected during a period without extra interrupts is considered
+clean.
+
+To compensate for excessive amounts of wakeup during idle, additional
+idle time is injected when such a condition is detected. Currently,
+we have a simple algorithm to double the injection ratio. A possible
+enhancement might be to throttle the offending IRQ, such as delaying
+EOI for level triggered interrupts. But it is a challenge to be
+non-intrusive to the scheduler or the IRQ core code.
+
+
+CPU Online/Offline
+------------------
+Per-CPU kernel threads are started/stopped upon receiving
+notifications of CPU hotplug activities. The intel_powerclamp driver
+keeps track of clamping kernel threads, even after they are migrated
+to other CPUs, after a CPU offline event.
+
+
+Performance Analysis
+====================
+This section describes the general performance data collected on
+multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P).
+
+Effectiveness and Limitations
+-----------------------------
+The maximum range that idle injection is allowed is capped at 50
+percent. As mentioned earlier, since interrupts are allowed during
+forced idle time, excessive interrupts could result in less
+effectiveness. The extreme case would be doing a ping -f to generated
+flooded network interrupts without much CPU acknowledgement. In this
+case, little can be done from the idle injection threads. In most
+normal cases, such as scp a large file, applications can be throttled
+by the powerclamp driver, since slowing down the CPU also slows down
+network protocol processing, which in turn reduces interrupts.
+
+When control parameters change at runtime by the controlling CPU, it
+may take an additional period for the rest of the CPUs to catch up
+with the changes. During this time, idle injection is out of sync,
+thus not able to enter package C- states at the expected ratio. But
+this effect is minor, in that in most cases change to the target
+ratio is updated much less frequently than the idle injection
+frequency.
+
+Scalability
+-----------
+Tests also show a minor, but measurable, difference between the 4P/8P
+Ivy Bridge system and the 80P Westmere server under 50% idle ratio.
+More compensation is needed on Westmere for the same amount of
+target idle ratio. The compensation also increases as the idle ratio
+gets larger. The above reason constitutes the need for the
+calibration code.
+
+On the IVB 8P system, compared to an offline CPU, powerclamp can
+achieve up to 40% better performance per watt. (measured by a spin
+counter summed over per CPU counting threads spawned for all running
+CPUs).
+
+Usage and Interfaces
+====================
+The powerclamp driver is registered to the generic thermal layer as a
+cooling device. Currently, it’s not bound to any thermal zones::
+
+ jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . *
+ cur_state:0
+ max_state:50
+ type:intel_powerclamp
+
+cur_state allows user to set the desired idle percentage. Writing 0 to
+cur_state will stop idle injection. Writing a value between 1 and
+max_state will start the idle injection. Reading cur_state returns the
+actual and current idle percentage. This may not be the same value
+set by the user in that current idle percentage depends on workload
+and includes natural idle. When idle injection is disabled, reading
+cur_state returns value -1 instead of 0 which is to avoid confusing
+100% busy state with the disabled state.
+
+Example usage:
+- To inject 25% idle time::
+
+ $ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state
+
+If the system is not busy and has more than 25% idle time already,
+then the powerclamp driver will not start idle injection. Using Top
+will not show idle injection kernel threads.
+
+If the system is busy (spin test below) and has less than 25% natural
+idle time, powerclamp kernel threads will do idle injection. Forced
+idle time is accounted as normal idle in that common code path is
+taken as the idle task.
+
+In this example, 24.1% idle is shown. This helps the system admin or
+user determine the cause of slowdown, when a powerclamp driver is in action::
+
+
+ Tasks: 197 total, 1 running, 196 sleeping, 0 stopped, 0 zombie
+ Cpu(s): 71.2%us, 4.7%sy, 0.0%ni, 24.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st
+ Mem: 3943228k total, 1689632k used, 2253596k free, 74960k buffers
+ Swap: 4087804k total, 0k used, 4087804k free, 945336k cached
+
+ PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
+ 3352 jacob 20 0 262m 644 428 S 286 0.0 0:17.16 spin
+ 3341 root -51 0 0 0 0 D 25 0.0 0:01.62 kidle_inject/0
+ 3344 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/3
+ 3342 root -51 0 0 0 0 D 25 0.0 0:01.61 kidle_inject/1
+ 3343 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/2
+ 2935 jacob 20 0 696m 125m 35m S 5 3.3 0:31.11 firefox
+ 1546 root 20 0 158m 20m 6640 S 3 0.5 0:26.97 Xorg
+ 2100 jacob 20 0 1223m 88m 30m S 3 2.3 0:23.68 compiz
+
+Tests have shown that by using the powerclamp driver as a cooling
+device, a PID based userspace thermal controller can manage to
+control CPU temperature effectively, when no other thermal influence
+is added. For example, a UltraBook user can compile the kernel under
+certain temperature (below most active trip points).
diff --git a/Documentation/thermal/intel_powerclamp.txt b/Documentation/thermal/intel_powerclamp.txt
deleted file mode 100644
index b5df21168fbc..000000000000
--- a/Documentation/thermal/intel_powerclamp.txt
+++ /dev/null
@@ -1,317 +0,0 @@
- =======================
- INTEL POWERCLAMP DRIVER
- =======================
-By: Arjan van de Ven <arjan@linux.intel.com>
- Jacob Pan <jacob.jun.pan@linux.intel.com>
-
-Contents:
- (*) Introduction
- - Goals and Objectives
-
- (*) Theory of Operation
- - Idle Injection
- - Calibration
-
- (*) Performance Analysis
- - Effectiveness and Limitations
- - Power vs Performance
- - Scalability
- - Calibration
- - Comparison with Alternative Techniques
-
- (*) Usage and Interfaces
- - Generic Thermal Layer (sysfs)
- - Kernel APIs (TBD)
-
-============
-INTRODUCTION
-============
-
-Consider the situation where a system’s power consumption must be
-reduced at runtime, due to power budget, thermal constraint, or noise
-level, and where active cooling is not preferred. Software managed
-passive power reduction must be performed to prevent the hardware
-actions that are designed for catastrophic scenarios.
-
-Currently, P-states, T-states (clock modulation), and CPU offlining
-are used for CPU throttling.
-
-On Intel CPUs, C-states provide effective power reduction, but so far
-they’re only used opportunistically, based on workload. With the
-development of intel_powerclamp driver, the method of synchronizing
-idle injection across all online CPU threads was introduced. The goal
-is to achieve forced and controllable C-state residency.
-
-Test/Analysis has been made in the areas of power, performance,
-scalability, and user experience. In many cases, clear advantage is
-shown over taking the CPU offline or modulating the CPU clock.
-
-
-===================
-THEORY OF OPERATION
-===================
-
-Idle Injection
---------------
-
-On modern Intel processors (Nehalem or later), package level C-state
-residency is available in MSRs, thus also available to the kernel.
-
-These MSRs are:
- #define MSR_PKG_C2_RESIDENCY 0x60D
- #define MSR_PKG_C3_RESIDENCY 0x3F8
- #define MSR_PKG_C6_RESIDENCY 0x3F9
- #define MSR_PKG_C7_RESIDENCY 0x3FA
-
-If the kernel can also inject idle time to the system, then a
-closed-loop control system can be established that manages package
-level C-state. The intel_powerclamp driver is conceived as such a
-control system, where the target set point is a user-selected idle
-ratio (based on power reduction), and the error is the difference
-between the actual package level C-state residency ratio and the target idle
-ratio.
-
-Injection is controlled by high priority kernel threads, spawned for
-each online CPU.
-
-These kernel threads, with SCHED_FIFO class, are created to perform
-clamping actions of controlled duty ratio and duration. Each per-CPU
-thread synchronizes its idle time and duration, based on the rounding
-of jiffies, so accumulated errors can be prevented to avoid a jittery
-effect. Threads are also bound to the CPU such that they cannot be
-migrated, unless the CPU is taken offline. In this case, threads
-belong to the offlined CPUs will be terminated immediately.
-
-Running as SCHED_FIFO and relatively high priority, also allows such
-scheme to work for both preemptable and non-preemptable kernels.
-Alignment of idle time around jiffies ensures scalability for HZ
-values. This effect can be better visualized using a Perf timechart.
-The following diagram shows the behavior of kernel thread
-kidle_inject/cpu. During idle injection, it runs monitor/mwait idle
-for a given "duration", then relinquishes the CPU to other tasks,
-until the next time interval.
-
-The NOHZ schedule tick is disabled during idle time, but interrupts
-are not masked. Tests show that the extra wakeups from scheduler tick
-have a dramatic impact on the effectiveness of the powerclamp driver
-on large scale systems (Westmere system with 80 processors).
-
-CPU0
- ____________ ____________
-kidle_inject/0 | sleep | mwait | sleep |
- _________| |________| |_______
- duration
-CPU1
- ____________ ____________
-kidle_inject/1 | sleep | mwait | sleep |
- _________| |________| |_______
- ^
- |
- |
- roundup(jiffies, interval)
-
-Only one CPU is allowed to collect statistics and update global
-control parameters. This CPU is referred to as the controlling CPU in
-this document. The controlling CPU is elected at runtime, with a
-policy that favors BSP, taking into account the possibility of a CPU
-hot-plug.
-
-In terms of dynamics of the idle control system, package level idle
-time is considered largely as a non-causal system where its behavior
-cannot be based on the past or current input. Therefore, the
-intel_powerclamp driver attempts to enforce the desired idle time
-instantly as given input (target idle ratio). After injection,
-powerclamp monitors the actual idle for a given time window and adjust
-the next injection accordingly to avoid over/under correction.
-
-When used in a causal control system, such as a temperature control,
-it is up to the user of this driver to implement algorithms where
-past samples and outputs are included in the feedback. For example, a
-PID-based thermal controller can use the powerclamp driver to
-maintain a desired target temperature, based on integral and
-derivative gains of the past samples.
-
-
-
-Calibration
------------
-During scalability testing, it is observed that synchronized actions
-among CPUs become challenging as the number of cores grows. This is
-also true for the ability of a system to enter package level C-states.
-
-To make sure the intel_powerclamp driver scales well, online
-calibration is implemented. The goals for doing such a calibration
-are:
-
-a) determine the effective range of idle injection ratio
-b) determine the amount of compensation needed at each target ratio
-
-Compensation to each target ratio consists of two parts:
-
- a) steady state error compensation
- This is to offset the error occurring when the system can
- enter idle without extra wakeups (such as external interrupts).
-
- b) dynamic error compensation
- When an excessive amount of wakeups occurs during idle, an
- additional idle ratio can be added to quiet interrupts, by
- slowing down CPU activities.
-
-A debugfs file is provided for the user to examine compensation
-progress and results, such as on a Westmere system.
-[jacob@nex01 ~]$ cat
-/sys/kernel/debug/intel_powerclamp/powerclamp_calib
-controlling cpu: 0
-pct confidence steady dynamic (compensation)
-0 0 0 0
-1 1 0 0
-2 1 1 0
-3 3 1 0
-4 3 1 0
-5 3 1 0
-6 3 1 0
-7 3 1 0
-8 3 1 0
-...
-30 3 2 0
-31 3 2 0
-32 3 1 0
-33 3 2 0
-34 3 1 0
-35 3 2 0
-36 3 1 0
-37 3 2 0
-38 3 1 0
-39 3 2 0
-40 3 3 0
-41 3 1 0
-42 3 2 0
-43 3 1 0
-44 3 1 0
-45 3 2 0
-46 3 3 0
-47 3 0 0
-48 3 2 0
-49 3 3 0
-
-Calibration occurs during runtime. No offline method is available.
-Steady state compensation is used only when confidence levels of all
-adjacent ratios have reached satisfactory level. A confidence level
-is accumulated based on clean data collected at runtime. Data
-collected during a period without extra interrupts is considered
-clean.
-
-To compensate for excessive amounts of wakeup during idle, additional
-idle time is injected when such a condition is detected. Currently,
-we have a simple algorithm to double the injection ratio. A possible
-enhancement might be to throttle the offending IRQ, such as delaying
-EOI for level triggered interrupts. But it is a challenge to be
-non-intrusive to the scheduler or the IRQ core code.
-
-
-CPU Online/Offline
-------------------
-Per-CPU kernel threads are started/stopped upon receiving
-notifications of CPU hotplug activities. The intel_powerclamp driver
-keeps track of clamping kernel threads, even after they are migrated
-to other CPUs, after a CPU offline event.
-
-
-=====================
-Performance Analysis
-=====================
-This section describes the general performance data collected on
-multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P).
-
-Effectiveness and Limitations
------------------------------
-The maximum range that idle injection is allowed is capped at 50
-percent. As mentioned earlier, since interrupts are allowed during
-forced idle time, excessive interrupts could result in less
-effectiveness. The extreme case would be doing a ping -f to generated
-flooded network interrupts without much CPU acknowledgement. In this
-case, little can be done from the idle injection threads. In most
-normal cases, such as scp a large file, applications can be throttled
-by the powerclamp driver, since slowing down the CPU also slows down
-network protocol processing, which in turn reduces interrupts.
-
-When control parameters change at runtime by the controlling CPU, it
-may take an additional period for the rest of the CPUs to catch up
-with the changes. During this time, idle injection is out of sync,
-thus not able to enter package C- states at the expected ratio. But
-this effect is minor, in that in most cases change to the target
-ratio is updated much less frequently than the idle injection
-frequency.
-
-Scalability
------------
-Tests also show a minor, but measurable, difference between the 4P/8P
-Ivy Bridge system and the 80P Westmere server under 50% idle ratio.
-More compensation is needed on Westmere for the same amount of
-target idle ratio. The compensation also increases as the idle ratio
-gets larger. The above reason constitutes the need for the
-calibration code.
-
-On the IVB 8P system, compared to an offline CPU, powerclamp can
-achieve up to 40% better performance per watt. (measured by a spin
-counter summed over per CPU counting threads spawned for all running
-CPUs).
-
-====================
-Usage and Interfaces
-====================
-The powerclamp driver is registered to the generic thermal layer as a
-cooling device. Currently, it’s not bound to any thermal zones.
-
-jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . *
-cur_state:0
-max_state:50
-type:intel_powerclamp
-
-cur_state allows user to set the desired idle percentage. Writing 0 to
-cur_state will stop idle injection. Writing a value between 1 and
-max_state will start the idle injection. Reading cur_state returns the
-actual and current idle percentage. This may not be the same value
-set by the user in that current idle percentage depends on workload
-and includes natural idle. When idle injection is disabled, reading
-cur_state returns value -1 instead of 0 which is to avoid confusing
-100% busy state with the disabled state.
-
-Example usage:
-- To inject 25% idle time
-$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state
-"
-
-If the system is not busy and has more than 25% idle time already,
-then the powerclamp driver will not start idle injection. Using Top
-will not show idle injection kernel threads.
-
-If the system is busy (spin test below) and has less than 25% natural
-idle time, powerclamp kernel threads will do idle injection. Forced
-idle time is accounted as normal idle in that common code path is
-taken as the idle task.
-
-In this example, 24.1% idle is shown. This helps the system admin or
-user determine the cause of slowdown, when a powerclamp driver is in action.
-
-
-Tasks: 197 total, 1 running, 196 sleeping, 0 stopped, 0 zombie
-Cpu(s): 71.2%us, 4.7%sy, 0.0%ni, 24.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st
-Mem: 3943228k total, 1689632k used, 2253596k free, 74960k buffers
-Swap: 4087804k total, 0k used, 4087804k free, 945336k cached
-
- PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
- 3352 jacob 20 0 262m 644 428 S 286 0.0 0:17.16 spin
- 3341 root -51 0 0 0 0 D 25 0.0 0:01.62 kidle_inject/0
- 3344 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/3
- 3342 root -51 0 0 0 0 D 25 0.0 0:01.61 kidle_inject/1
- 3343 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/2
- 2935 jacob 20 0 696m 125m 35m S 5 3.3 0:31.11 firefox
- 1546 root 20 0 158m 20m 6640 S 3 0.5 0:26.97 Xorg
- 2100 jacob 20 0 1223m 88m 30m S 3 2.3 0:23.68 compiz
-
-Tests have shown that by using the powerclamp driver as a cooling
-device, a PID based userspace thermal controller can manage to
-control CPU temperature effectively, when no other thermal influence
-is added. For example, a UltraBook user can compile the kernel under
-certain temperature (below most active trip points).
diff --git a/Documentation/thermal/nouveau_thermal b/Documentation/thermal/nouveau_thermal
deleted file mode 100644
index 6e17a11efcb0..000000000000
--- a/Documentation/thermal/nouveau_thermal
+++ /dev/null
@@ -1,82 +0,0 @@
-Kernel driver nouveau
-===================
-
-Supported chips:
-* NV43+
-
-Authors: Martin Peres (mupuf) <martin.peres@free.fr>
-
-Description
----------
-
-This driver allows to read the GPU core temperature, drive the GPU fan and
-set temperature alarms.
-
-Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau
-cannot access any of the i2c external monitoring chips it may find. If you
-have one of those, temperature and/or fan management through Nouveau's HWMON
-interface is likely not to work. This document may then not cover your situation
-entirely.
-
-Temperature management
---------------------
-
-Temperature is exposed under as a read-only HWMON attribute temp1_input.
-
-In order to protect the GPU from overheating, Nouveau supports 4 configurable
-temperature thresholds:
-
- * Fan_boost: Fan speed is set to 100% when reaching this temperature;
- * Downclock: The GPU will be downclocked to reduce its power dissipation;
- * Critical: The GPU is put on hold to further lower power dissipation;
- * Shutdown: Shut the computer down to protect your GPU.
-
-WARNING: Some of these thresholds may not be used by Nouveau depending
-on your chipset.
-
-The default value for these thresholds comes from the GPU's vbios. These
-thresholds can be configured thanks to the following HWMON attributes:
-
- * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst;
- * Downclock: temp1_max and temp1_max_hyst;
- * Critical: temp1_crit and temp1_crit_hyst;
- * Shutdown: temp1_emergency and temp1_emergency_hyst.
-
-NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget
-to multiply!
-
-Fan management
-------------
-
-Not all cards have a drivable fan. If you do, then the following HWMON
-attributes should be available:
-
- * pwm1_enable: Current fan management mode (NONE, MANUAL or AUTO);
- * pwm1: Current PWM value (power percentage);
- * pwm1_min: The minimum PWM speed allowed;
- * pwm1_max: The maximum PWM speed allowed (bypassed when hitting Fan_boost);
-
-You may also have the following attribute:
-
- * fan1_input: Speed in RPM of your fan.
-
-Your fan can be driven in different modes:
-
- * 0: The fan is left untouched;
- * 1: The fan can be driven in manual (use pwm1 to change the speed);
- * 2; The fan is driven automatically depending on the temperature.
-
-NOTE: Be sure to use the manual mode if you want to drive the fan speed manually
-
-NOTE2: When operating in manual mode outside the vbios-defined
-[PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate
-depending on your hardware.
-
-Bug reports
----------
-
-Thermal management on Nouveau is new and may not work on all cards. If you have
-inquiries, please ping mupuf on IRC (#nouveau, freenode).
-
-Bug reports should be filled on Freedesktop's bug tracker. Please follow
-http://nouveau.freedesktop.org/wiki/Bugs
diff --git a/Documentation/thermal/nouveau_thermal.rst b/Documentation/thermal/nouveau_thermal.rst
new file mode 100644
index 000000000000..37255fd6735d
--- /dev/null
+++ b/Documentation/thermal/nouveau_thermal.rst
@@ -0,0 +1,96 @@
+=====================
+Kernel driver nouveau
+=====================
+
+Supported chips:
+
+* NV43+
+
+Authors: Martin Peres (mupuf) <martin.peres@free.fr>
+
+Description
+-----------
+
+This driver allows to read the GPU core temperature, drive the GPU fan and
+set temperature alarms.
+
+Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau
+cannot access any of the i2c external monitoring chips it may find. If you
+have one of those, temperature and/or fan management through Nouveau's HWMON
+interface is likely not to work. This document may then not cover your situation
+entirely.
+
+Temperature management
+----------------------
+
+Temperature is exposed under as a read-only HWMON attribute temp1_input.
+
+In order to protect the GPU from overheating, Nouveau supports 4 configurable
+temperature thresholds:
+
+ * Fan_boost:
+ Fan speed is set to 100% when reaching this temperature;
+ * Downclock:
+ The GPU will be downclocked to reduce its power dissipation;
+ * Critical:
+ The GPU is put on hold to further lower power dissipation;
+ * Shutdown:
+ Shut the computer down to protect your GPU.
+
+WARNING:
+ Some of these thresholds may not be used by Nouveau depending
+ on your chipset.
+
+The default value for these thresholds comes from the GPU's vbios. These
+thresholds can be configured thanks to the following HWMON attributes:
+
+ * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst;
+ * Downclock: temp1_max and temp1_max_hyst;
+ * Critical: temp1_crit and temp1_crit_hyst;
+ * Shutdown: temp1_emergency and temp1_emergency_hyst.
+
+NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget
+to multiply!
+
+Fan management
+--------------
+
+Not all cards have a drivable fan. If you do, then the following HWMON
+attributes should be available:
+
+ * pwm1_enable:
+ Current fan management mode (NONE, MANUAL or AUTO);
+ * pwm1:
+ Current PWM value (power percentage);
+ * pwm1_min:
+ The minimum PWM speed allowed;
+ * pwm1_max:
+ The maximum PWM speed allowed (bypassed when hitting Fan_boost);
+
+You may also have the following attribute:
+
+ * fan1_input:
+ Speed in RPM of your fan.
+
+Your fan can be driven in different modes:
+
+ * 0: The fan is left untouched;
+ * 1: The fan can be driven in manual (use pwm1 to change the speed);
+ * 2; The fan is driven automatically depending on the temperature.
+
+NOTE:
+ Be sure to use the manual mode if you want to drive the fan speed manually
+
+NOTE2:
+ When operating in manual mode outside the vbios-defined
+ [PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate
+ depending on your hardware.
+
+Bug reports
+-----------
+
+Thermal management on Nouveau is new and may not work on all cards. If you have
+inquiries, please ping mupuf on IRC (#nouveau, freenode).
+
+Bug reports should be filled on Freedesktop's bug tracker. Please follow
+http://nouveau.freedesktop.org/wiki/Bugs
diff --git a/Documentation/thermal/power_allocator.rst b/Documentation/thermal/power_allocator.rst
new file mode 100644
index 000000000000..67b6a3297238
--- /dev/null
+++ b/Documentation/thermal/power_allocator.rst
@@ -0,0 +1,271 @@
+=================================
+Power allocator governor tunables
+=================================
+
+Trip points
+-----------
+
+The governor works optimally with the following two passive trip points:
+
+1. "switch on" trip point: temperature above which the governor
+ control loop starts operating. This is the first passive trip
+ point of the thermal zone.
+
+2. "desired temperature" trip point: it should be higher than the
+ "switch on" trip point. This the target temperature the governor
+ is controlling for. This is the last passive trip point of the
+ thermal zone.
+
+PID Controller
+--------------
+
+The power allocator governor implements a
+Proportional-Integral-Derivative controller (PID controller) with
+temperature as the control input and power as the controlled output:
+
+ P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
+
+where
+ - e = desired_temperature - current_temperature
+ - err_integral is the sum of previous errors
+ - diff_err = e - previous_error
+
+It is similar to the one depicted below::
+
+ k_d
+ |
+ current_temp |
+ | v
+ | +----------+ +---+
+ | +----->| diff_err |-->| X |------+
+ | | +----------+ +---+ |
+ | | | tdp actor
+ | | k_i | | get_requested_power()
+ | | | | | | |
+ | | | | | | | ...
+ v | v v v v v
+ +---+ | +-------+ +---+ +---+ +---+ +----------+
+ | S |-----+----->| sum e |----->| X |--->| S |-->| S |-->|power |
+ +---+ | +-------+ +---+ +---+ +---+ |allocation|
+ ^ | ^ +----------+
+ | | | | |
+ | | +---+ | | |
+ | +------->| X |-------------------+ v v
+ | +---+ granted performance
+ desired_temperature ^
+ |
+ |
+ k_po/k_pu
+
+Sustainable power
+-----------------
+
+An estimate of the sustainable dissipatable power (in mW) should be
+provided while registering the thermal zone. This estimates the
+sustained power that can be dissipated at the desired control
+temperature. This is the maximum sustained power for allocation at
+the desired maximum temperature. The actual sustained power can vary
+for a number of reasons. The closed loop controller will take care of
+variations such as environmental conditions, and some factors related
+to the speed-grade of the silicon. `sustainable_power` is therefore
+simply an estimate, and may be tuned to affect the aggressiveness of
+the thermal ramp. For reference, the sustainable power of a 4" phone
+is typically 2000mW, while on a 10" tablet is around 4500mW (may vary
+depending on screen size).
+
+If you are using device tree, do add it as a property of the
+thermal-zone. For example::
+
+ thermal-zones {
+ soc_thermal {
+ polling-delay = <1000>;
+ polling-delay-passive = <100>;
+ sustainable-power = <2500>;
+ ...
+
+Instead, if the thermal zone is registered from the platform code, pass a
+`thermal_zone_params` that has a `sustainable_power`. If no
+`thermal_zone_params` were being passed, then something like below
+will suffice::
+
+ static const struct thermal_zone_params tz_params = {
+ .sustainable_power = 3500,
+ };
+
+and then pass `tz_params` as the 5th parameter to
+`thermal_zone_device_register()`
+
+k_po and k_pu
+-------------
+
+The implementation of the PID controller in the power allocator
+thermal governor allows the configuration of two proportional term
+constants: `k_po` and `k_pu`. `k_po` is the proportional term
+constant during temperature overshoot periods (current temperature is
+above "desired temperature" trip point). Conversely, `k_pu` is the
+proportional term constant during temperature undershoot periods
+(current temperature below "desired temperature" trip point).
+
+These controls are intended as the primary mechanism for configuring
+the permitted thermal "ramp" of the system. For instance, a lower
+`k_pu` value will provide a slower ramp, at the cost of capping
+available capacity at a low temperature. On the other hand, a high
+value of `k_pu` will result in the governor granting very high power
+while temperature is low, and may lead to temperature overshooting.
+
+The default value for `k_pu` is::
+
+ 2 * sustainable_power / (desired_temperature - switch_on_temp)
+
+This means that at `switch_on_temp` the output of the controller's
+proportional term will be 2 * `sustainable_power`. The default value
+for `k_po` is::
+
+ sustainable_power / (desired_temperature - switch_on_temp)
+
+Focusing on the proportional and feed forward values of the PID
+controller equation we have::
+
+ P_max = k_p * e + sustainable_power
+
+The proportional term is proportional to the difference between the
+desired temperature and the current one. When the current temperature
+is the desired one, then the proportional component is zero and
+`P_max` = `sustainable_power`. That is, the system should operate in
+thermal equilibrium under constant load. `sustainable_power` is only
+an estimate, which is the reason for closed-loop control such as this.
+
+Expanding `k_pu` we get::
+
+ P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
+ sustainable_power
+
+where:
+
+ - T_set is the desired temperature
+ - T is the current temperature
+ - T_on is the switch on temperature
+
+When the current temperature is the switch_on temperature, the above
+formula becomes::
+
+ P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
+ sustainable_power = 2 * sustainable_power + sustainable_power =
+ 3 * sustainable_power
+
+Therefore, the proportional term alone linearly decreases power from
+3 * `sustainable_power` to `sustainable_power` as the temperature
+rises from the switch on temperature to the desired temperature.
+
+k_i and integral_cutoff
+-----------------------
+
+`k_i` configures the PID loop's integral term constant. This term
+allows the PID controller to compensate for long term drift and for
+the quantized nature of the output control: cooling devices can't set
+the exact power that the governor requests. When the temperature
+error is below `integral_cutoff`, errors are accumulated in the
+integral term. This term is then multiplied by `k_i` and the result
+added to the output of the controller. Typically `k_i` is set low (1
+or 2) and `integral_cutoff` is 0.
+
+k_d
+---
+
+`k_d` configures the PID loop's derivative term constant. It's
+recommended to leave it as the default: 0.
+
+Cooling device power API
+========================
+
+Cooling devices controlled by this governor must supply the additional
+"power" API in their `cooling_device_ops`. It consists on three ops:
+
+1. ::
+
+ int get_requested_power(struct thermal_cooling_device *cdev,
+ struct thermal_zone_device *tz, u32 *power);
+
+
+@cdev:
+ The `struct thermal_cooling_device` pointer
+@tz:
+ thermal zone in which we are currently operating
+@power:
+ pointer in which to store the calculated power
+
+`get_requested_power()` calculates the power requested by the device
+in milliwatts and stores it in @power . It should return 0 on
+success, -E* on failure. This is currently used by the power
+allocator governor to calculate how much power to give to each cooling
+device.
+
+2. ::
+
+ int state2power(struct thermal_cooling_device *cdev, struct
+ thermal_zone_device *tz, unsigned long state,
+ u32 *power);
+
+@cdev:
+ The `struct thermal_cooling_device` pointer
+@tz:
+ thermal zone in which we are currently operating
+@state:
+ A cooling device state
+@power:
+ pointer in which to store the equivalent power
+
+Convert cooling device state @state into power consumption in
+milliwatts and store it in @power. It should return 0 on success, -E*
+on failure. This is currently used by thermal core to calculate the
+maximum power that an actor can consume.
+
+3. ::
+
+ int power2state(struct thermal_cooling_device *cdev, u32 power,
+ unsigned long *state);
+
+@cdev:
+ The `struct thermal_cooling_device` pointer
+@power:
+ power in milliwatts
+@state:
+ pointer in which to store the resulting state
+
+Calculate a cooling device state that would make the device consume at
+most @power mW and store it in @state. It should return 0 on success,
+-E* on failure. This is currently used by the thermal core to convert
+a given power set by the power allocator governor to a state that the
+cooling device can set. It is a function because this conversion may
+depend on external factors that may change so this function should the
+best conversion given "current circumstances".
+
+Cooling device weights
+----------------------
+
+Weights are a mechanism to bias the allocation among cooling
+devices. They express the relative power efficiency of different
+cooling devices. Higher weight can be used to express higher power
+efficiency. Weighting is relative such that if each cooling device
+has a weight of one they are considered equal. This is particularly
+useful in heterogeneous systems where two cooling devices may perform
+the same kind of compute, but with different efficiency. For example,
+a system with two different types of processors.
+
+If the thermal zone is registered using
+`thermal_zone_device_register()` (i.e., platform code), then weights
+are passed as part of the thermal zone's `thermal_bind_parameters`.
+If the platform is registered using device tree, then they are passed
+as the `contribution` property of each map in the `cooling-maps` node.
+
+Limitations of the power allocator governor
+===========================================
+
+The power allocator governor's PID controller works best if there is a
+periodic tick. If you have a driver that calls
+`thermal_zone_device_update()` (or anything that ends up calling the
+governor's `throttle()` function) repetitively, the governor response
+won't be very good. Note that this is not particular to this
+governor, step-wise will also misbehave if you call its throttle()
+faster than the normal thermal framework tick (due to interrupts for
+example) as it will overreact.
diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt
deleted file mode 100644
index 9fb0ff06dca9..000000000000
--- a/Documentation/thermal/power_allocator.txt
+++ /dev/null
@@ -1,247 +0,0 @@
-Power allocator governor tunables
-=================================
-
-Trip points
------------
-
-The governor works optimally with the following two passive trip points:
-
-1. "switch on" trip point: temperature above which the governor
- control loop starts operating. This is the first passive trip
- point of the thermal zone.
-
-2. "desired temperature" trip point: it should be higher than the
- "switch on" trip point. This the target temperature the governor
- is controlling for. This is the last passive trip point of the
- thermal zone.
-
-PID Controller
---------------
-
-The power allocator governor implements a
-Proportional-Integral-Derivative controller (PID controller) with
-temperature as the control input and power as the controlled output:
-
- P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power
-
-where
- e = desired_temperature - current_temperature
- err_integral is the sum of previous errors
- diff_err = e - previous_error
-
-It is similar to the one depicted below:
-
- k_d
- |
-current_temp |
- | v
- | +----------+ +---+
- | +----->| diff_err |-->| X |------+
- | | +----------+ +---+ |
- | | | tdp actor
- | | k_i | | get_requested_power()
- | | | | | | |
- | | | | | | | ...
- v | v v v v v
- +---+ | +-------+ +---+ +---+ +---+ +----------+
- | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power |
- +---+ | +-------+ +---+ +---+ +---+ |allocation|
- ^ | ^ +----------+
- | | | | |
- | | +---+ | | |
- | +------->| X |-------------------+ v v
- | +---+ granted performance
-desired_temperature ^
- |
- |
- k_po/k_pu
-
-Sustainable power
------------------
-
-An estimate of the sustainable dissipatable power (in mW) should be
-provided while registering the thermal zone. This estimates the
-sustained power that can be dissipated at the desired control
-temperature. This is the maximum sustained power for allocation at
-the desired maximum temperature. The actual sustained power can vary
-for a number of reasons. The closed loop controller will take care of
-variations such as environmental conditions, and some factors related
-to the speed-grade of the silicon. `sustainable_power` is therefore
-simply an estimate, and may be tuned to affect the aggressiveness of
-the thermal ramp. For reference, the sustainable power of a 4" phone
-is typically 2000mW, while on a 10" tablet is around 4500mW (may vary
-depending on screen size).
-
-If you are using device tree, do add it as a property of the
-thermal-zone. For example:
-
- thermal-zones {
- soc_thermal {
- polling-delay = <1000>;
- polling-delay-passive = <100>;
- sustainable-power = <2500>;
- ...
-
-Instead, if the thermal zone is registered from the platform code, pass a
-`thermal_zone_params` that has a `sustainable_power`. If no
-`thermal_zone_params` were being passed, then something like below
-will suffice:
-
- static const struct thermal_zone_params tz_params = {
- .sustainable_power = 3500,
- };
-
-and then pass `tz_params` as the 5th parameter to
-`thermal_zone_device_register()`
-
-k_po and k_pu
--------------
-
-The implementation of the PID controller in the power allocator
-thermal governor allows the configuration of two proportional term
-constants: `k_po` and `k_pu`. `k_po` is the proportional term
-constant during temperature overshoot periods (current temperature is
-above "desired temperature" trip point). Conversely, `k_pu` is the
-proportional term constant during temperature undershoot periods
-(current temperature below "desired temperature" trip point).
-
-These controls are intended as the primary mechanism for configuring
-the permitted thermal "ramp" of the system. For instance, a lower
-`k_pu` value will provide a slower ramp, at the cost of capping
-available capacity at a low temperature. On the other hand, a high
-value of `k_pu` will result in the governor granting very high power
-while temperature is low, and may lead to temperature overshooting.
-
-The default value for `k_pu` is:
-
- 2 * sustainable_power / (desired_temperature - switch_on_temp)
-
-This means that at `switch_on_temp` the output of the controller's
-proportional term will be 2 * `sustainable_power`. The default value
-for `k_po` is:
-
- sustainable_power / (desired_temperature - switch_on_temp)
-
-Focusing on the proportional and feed forward values of the PID
-controller equation we have:
-
- P_max = k_p * e + sustainable_power
-
-The proportional term is proportional to the difference between the
-desired temperature and the current one. When the current temperature
-is the desired one, then the proportional component is zero and
-`P_max` = `sustainable_power`. That is, the system should operate in
-thermal equilibrium under constant load. `sustainable_power` is only
-an estimate, which is the reason for closed-loop control such as this.
-
-Expanding `k_pu` we get:
- P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) +
- sustainable_power
-
-where
- T_set is the desired temperature
- T is the current temperature
- T_on is the switch on temperature
-
-When the current temperature is the switch_on temperature, the above
-formula becomes:
-
- P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) +
- sustainable_power = 2 * sustainable_power + sustainable_power =
- 3 * sustainable_power
-
-Therefore, the proportional term alone linearly decreases power from
-3 * `sustainable_power` to `sustainable_power` as the temperature
-rises from the switch on temperature to the desired temperature.
-
-k_i and integral_cutoff
------------------------
-
-`k_i` configures the PID loop's integral term constant. This term
-allows the PID controller to compensate for long term drift and for
-the quantized nature of the output control: cooling devices can't set
-the exact power that the governor requests. When the temperature
-error is below `integral_cutoff`, errors are accumulated in the
-integral term. This term is then multiplied by `k_i` and the result
-added to the output of the controller. Typically `k_i` is set low (1
-or 2) and `integral_cutoff` is 0.
-
-k_d
----
-
-`k_d` configures the PID loop's derivative term constant. It's
-recommended to leave it as the default: 0.
-
-Cooling device power API
-========================
-
-Cooling devices controlled by this governor must supply the additional
-"power" API in their `cooling_device_ops`. It consists on three ops:
-
-1. int get_requested_power(struct thermal_cooling_device *cdev,
- struct thermal_zone_device *tz, u32 *power);
-@cdev: The `struct thermal_cooling_device` pointer
-@tz: thermal zone in which we are currently operating
-@power: pointer in which to store the calculated power
-
-`get_requested_power()` calculates the power requested by the device
-in milliwatts and stores it in @power . It should return 0 on
-success, -E* on failure. This is currently used by the power
-allocator governor to calculate how much power to give to each cooling
-device.
-
-2. int state2power(struct thermal_cooling_device *cdev, struct
- thermal_zone_device *tz, unsigned long state, u32 *power);
-@cdev: The `struct thermal_cooling_device` pointer
-@tz: thermal zone in which we are currently operating
-@state: A cooling device state
-@power: pointer in which to store the equivalent power
-
-Convert cooling device state @state into power consumption in
-milliwatts and store it in @power. It should return 0 on success, -E*
-on failure. This is currently used by thermal core to calculate the
-maximum power that an actor can consume.
-
-3. int power2state(struct thermal_cooling_device *cdev, u32 power,
- unsigned long *state);
-@cdev: The `struct thermal_cooling_device` pointer
-@power: power in milliwatts
-@state: pointer in which to store the resulting state
-
-Calculate a cooling device state that would make the device consume at
-most @power mW and store it in @state. It should return 0 on success,
--E* on failure. This is currently used by the thermal core to convert
-a given power set by the power allocator governor to a state that the
-cooling device can set. It is a function because this conversion may
-depend on external factors that may change so this function should the
-best conversion given "current circumstances".
-
-Cooling device weights
-----------------------
-
-Weights are a mechanism to bias the allocation among cooling
-devices. They express the relative power efficiency of different
-cooling devices. Higher weight can be used to express higher power
-efficiency. Weighting is relative such that if each cooling device
-has a weight of one they are considered equal. This is particularly
-useful in heterogeneous systems where two cooling devices may perform
-the same kind of compute, but with different efficiency. For example,
-a system with two different types of processors.
-
-If the thermal zone is registered using
-`thermal_zone_device_register()` (i.e., platform code), then weights
-are passed as part of the thermal zone's `thermal_bind_parameters`.
-If the platform is registered using device tree, then they are passed
-as the `contribution` property of each map in the `cooling-maps` node.
-
-Limitations of the power allocator governor
-===========================================
-
-The power allocator governor's PID controller works best if there is a
-periodic tick. If you have a driver that calls
-`thermal_zone_device_update()` (or anything that ends up calling the
-governor's `throttle()` function) repetitively, the governor response
-won't be very good. Note that this is not particular to this
-governor, step-wise will also misbehave if you call its throttle()
-faster than the normal thermal framework tick (due to interrupts for
-example) as it will overreact.
diff --git a/Documentation/thermal/sysfs-api.rst b/Documentation/thermal/sysfs-api.rst
new file mode 100644
index 000000000000..e4930761d3e5
--- /dev/null
+++ b/Documentation/thermal/sysfs-api.rst
@@ -0,0 +1,798 @@
+===================================
+Generic Thermal Sysfs driver How To
+===================================
+
+Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
+
+Updated: 2 January 2008
+
+Copyright (c) 2008 Intel Corporation
+
+
+0. Introduction
+===============
+
+The generic thermal sysfs provides a set of interfaces for thermal zone
+devices (sensors) and thermal cooling devices (fan, processor...) to register
+with the thermal management solution and to be a part of it.
+
+This how-to focuses on enabling new thermal zone and cooling devices to
+participate in thermal management.
+This solution is platform independent and any type of thermal zone devices
+and cooling devices should be able to make use of the infrastructure.
+
+The main task of the thermal sysfs driver is to expose thermal zone attributes
+as well as cooling device attributes to the user space.
+An intelligent thermal management application can make decisions based on
+inputs from thermal zone attributes (the current temperature and trip point
+temperature) and throttle appropriate devices.
+
+- `[0-*]` denotes any positive number starting from 0
+- `[1-*]` denotes any positive number starting from 1
+
+1. thermal sysfs driver interface functions
+===========================================
+
+1.1 thermal zone device interface
+---------------------------------
+
+ ::
+
+ struct thermal_zone_device
+ *thermal_zone_device_register(char *type,
+ int trips, int mask, void *devdata,
+ struct thermal_zone_device_ops *ops,
+ const struct thermal_zone_params *tzp,
+ int passive_delay, int polling_delay))
+
+ This interface function adds a new thermal zone device (sensor) to
+ /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the
+ thermal cooling devices registered at the same time.
+
+ type:
+ the thermal zone type.
+ trips:
+ the total number of trip points this thermal zone supports.
+ mask:
+ Bit string: If 'n'th bit is set, then trip point 'n' is writeable.
+ devdata:
+ device private data
+ ops:
+ thermal zone device call-backs.
+
+ .bind:
+ bind the thermal zone device with a thermal cooling device.
+ .unbind:
+ unbind the thermal zone device with a thermal cooling device.
+ .get_temp:
+ get the current temperature of the thermal zone.
+ .set_trips:
+ set the trip points window. Whenever the current temperature
+ is updated, the trip points immediately below and above the
+ current temperature are found.
+ .get_mode:
+ get the current mode (enabled/disabled) of the thermal zone.
+
+ - "enabled" means the kernel thermal management is
+ enabled.
+ - "disabled" will prevent kernel thermal driver action
+ upon trip points so that user applications can take
+ charge of thermal management.
+ .set_mode:
+ set the mode (enabled/disabled) of the thermal zone.
+ .get_trip_type:
+ get the type of certain trip point.
+ .get_trip_temp:
+ get the temperature above which the certain trip point
+ will be fired.
+ .set_emul_temp:
+ set the emulation temperature which helps in debugging
+ different threshold temperature points.
+ tzp:
+ thermal zone platform parameters.
+ passive_delay:
+ number of milliseconds to wait between polls when
+ performing passive cooling.
+ polling_delay:
+ number of milliseconds to wait between polls when checking
+ whether trip points have been crossed (0 for interrupt driven systems).
+
+ ::
+
+ void thermal_zone_device_unregister(struct thermal_zone_device *tz)
+
+ This interface function removes the thermal zone device.
+ It deletes the corresponding entry from /sys/class/thermal folder and
+ unbinds all the thermal cooling devices it uses.
+
+ ::
+
+ struct thermal_zone_device
+ *thermal_zone_of_sensor_register(struct device *dev, int sensor_id,
+ void *data,
+ const struct thermal_zone_of_device_ops *ops)
+
+ This interface adds a new sensor to a DT thermal zone.
+ This function will search the list of thermal zones described in
+ device tree and look for the zone that refer to the sensor device
+ pointed by dev->of_node as temperature providers. For the zone
+ pointing to the sensor node, the sensor will be added to the DT
+ thermal zone device.
+
+ The parameters for this interface are:
+
+ dev:
+ Device node of sensor containing valid node pointer in
+ dev->of_node.
+ sensor_id:
+ a sensor identifier, in case the sensor IP has more
+ than one sensors
+ data:
+ a private pointer (owned by the caller) that will be
+ passed back, when a temperature reading is needed.
+ ops:
+ `struct thermal_zone_of_device_ops *`.
+
+ ============== =======================================
+ get_temp a pointer to a function that reads the
+ sensor temperature. This is mandatory
+ callback provided by sensor driver.
+ set_trips a pointer to a function that sets a
+ temperature window. When this window is
+ left the driver must inform the thermal
+ core via thermal_zone_device_update.
+ get_trend a pointer to a function that reads the
+ sensor temperature trend.
+ set_emul_temp a pointer to a function that sets
+ sensor emulated temperature.
+ ============== =======================================
+
+ The thermal zone temperature is provided by the get_temp() function
+ pointer of thermal_zone_of_device_ops. When called, it will
+ have the private pointer @data back.
+
+ It returns error pointer if fails otherwise valid thermal zone device
+ handle. Caller should check the return handle with IS_ERR() for finding
+ whether success or not.
+
+ ::
+
+ void thermal_zone_of_sensor_unregister(struct device *dev,
+ struct thermal_zone_device *tzd)
+
+ This interface unregisters a sensor from a DT thermal zone which was
+ successfully added by interface thermal_zone_of_sensor_register().
+ This function removes the sensor callbacks and private data from the
+ thermal zone device registered with thermal_zone_of_sensor_register()
+ interface. It will also silent the zone by remove the .get_temp() and
+ get_trend() thermal zone device callbacks.
+
+ ::
+
+ struct thermal_zone_device
+ *devm_thermal_zone_of_sensor_register(struct device *dev,
+ int sensor_id,
+ void *data,
+ const struct thermal_zone_of_device_ops *ops)
+
+ This interface is resource managed version of
+ thermal_zone_of_sensor_register().
+
+ All details of thermal_zone_of_sensor_register() described in
+ section 1.1.3 is applicable here.
+
+ The benefit of using this interface to register sensor is that it
+ is not require to explicitly call thermal_zone_of_sensor_unregister()
+ in error path or during driver unbinding as this is done by driver
+ resource manager.
+
+ ::
+
+ void devm_thermal_zone_of_sensor_unregister(struct device *dev,
+ struct thermal_zone_device *tzd)
+
+ This interface is resource managed version of
+ thermal_zone_of_sensor_unregister().
+ All details of thermal_zone_of_sensor_unregister() described in
+ section 1.1.4 is applicable here.
+ Normally this function will not need to be called and the resource
+ management code will ensure that the resource is freed.
+
+ ::
+
+ int thermal_zone_get_slope(struct thermal_zone_device *tz)
+
+ This interface is used to read the slope attribute value
+ for the thermal zone device, which might be useful for platform
+ drivers for temperature calculations.
+
+ ::
+
+ int thermal_zone_get_offset(struct thermal_zone_device *tz)
+
+ This interface is used to read the offset attribute value
+ for the thermal zone device, which might be useful for platform
+ drivers for temperature calculations.
+
+1.2 thermal cooling device interface
+------------------------------------
+
+
+ ::
+
+ struct thermal_cooling_device
+ *thermal_cooling_device_register(char *name,
+ void *devdata, struct thermal_cooling_device_ops *)
+
+ This interface function adds a new thermal cooling device (fan/processor/...)
+ to /sys/class/thermal/ folder as `cooling_device[0-*]`. It tries to bind itself
+ to all the thermal zone devices registered at the same time.
+
+ name:
+ the cooling device name.
+ devdata:
+ device private data.
+ ops:
+ thermal cooling devices call-backs.
+
+ .get_max_state:
+ get the Maximum throttle state of the cooling device.
+ .get_cur_state:
+ get the Currently requested throttle state of the
+ cooling device.
+ .set_cur_state:
+ set the Current throttle state of the cooling device.
+
+ ::
+
+ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
+
+ This interface function removes the thermal cooling device.
+ It deletes the corresponding entry from /sys/class/thermal folder and
+ unbinds itself from all the thermal zone devices using it.
+
+1.3 interface for binding a thermal zone device with a thermal cooling device
+-----------------------------------------------------------------------------
+
+ ::
+
+ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+ int trip, struct thermal_cooling_device *cdev,
+ unsigned long upper, unsigned long lower, unsigned int weight);
+
+ This interface function binds a thermal cooling device to a particular trip
+ point of a thermal zone device.
+
+ This function is usually called in the thermal zone device .bind callback.
+
+ tz:
+ the thermal zone device
+ cdev:
+ thermal cooling device
+ trip:
+ indicates which trip point in this thermal zone the cooling device
+ is associated with.
+ upper:
+ the Maximum cooling state for this trip point.
+ THERMAL_NO_LIMIT means no upper limit,
+ and the cooling device can be in max_state.
+ lower:
+ the Minimum cooling state can be used for this trip point.
+ THERMAL_NO_LIMIT means no lower limit,
+ and the cooling device can be in cooling state 0.
+ weight:
+ the influence of this cooling device in this thermal
+ zone. See 1.4.1 below for more information.
+
+ ::
+
+ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
+ int trip, struct thermal_cooling_device *cdev);
+
+ This interface function unbinds a thermal cooling device from a particular
+ trip point of a thermal zone device. This function is usually called in
+ the thermal zone device .unbind callback.
+
+ tz:
+ the thermal zone device
+ cdev:
+ thermal cooling device
+ trip:
+ indicates which trip point in this thermal zone the cooling device
+ is associated with.
+
+1.4 Thermal Zone Parameters
+---------------------------
+
+ ::
+
+ struct thermal_bind_params
+
+ This structure defines the following parameters that are used to bind
+ a zone with a cooling device for a particular trip point.
+
+ .cdev:
+ The cooling device pointer
+ .weight:
+ The 'influence' of a particular cooling device on this
+ zone. This is relative to the rest of the cooling
+ devices. For example, if all cooling devices have a
+ weight of 1, then they all contribute the same. You can
+ use percentages if you want, but it's not mandatory. A
+ weight of 0 means that this cooling device doesn't
+ contribute to the cooling of this zone unless all cooling
+ devices have a weight of 0. If all weights are 0, then
+ they all contribute the same.
+ .trip_mask:
+ This is a bit mask that gives the binding relation between
+ this thermal zone and cdev, for a particular trip point.
+ If nth bit is set, then the cdev and thermal zone are bound
+ for trip point n.
+ .binding_limits:
+ This is an array of cooling state limits. Must have
+ exactly 2 * thermal_zone.number_of_trip_points. It is an
+ array consisting of tuples <lower-state upper-state> of
+ state limits. Each trip will be associated with one state
+ limit tuple when binding. A NULL pointer means
+ <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
+ These limits are used when binding a cdev to a trip point.
+ .match:
+ This call back returns success(0) if the 'tz and cdev' need to
+ be bound, as per platform data.
+
+ ::
+
+ struct thermal_zone_params
+
+ This structure defines the platform level parameters for a thermal zone.
+ This data, for each thermal zone should come from the platform layer.
+ This is an optional feature where some platforms can choose not to
+ provide this data.
+
+ .governor_name:
+ Name of the thermal governor used for this zone
+ .no_hwmon:
+ a boolean to indicate if the thermal to hwmon sysfs interface
+ is required. when no_hwmon == false, a hwmon sysfs interface
+ will be created. when no_hwmon == true, nothing will be done.
+ In case the thermal_zone_params is NULL, the hwmon interface
+ will be created (for backward compatibility).
+ .num_tbps:
+ Number of thermal_bind_params entries for this zone
+ .tbp:
+ thermal_bind_params entries
+
+2. sysfs attributes structure
+=============================
+
+== ================
+RO read only value
+WO write only value
+RW read/write value
+== ================
+
+Thermal sysfs attributes will be represented under /sys/class/thermal.
+Hwmon sysfs I/F extension is also available under /sys/class/hwmon
+if hwmon is compiled in or built as a module.
+
+Thermal zone device sys I/F, created once it's registered::
+
+ /sys/class/thermal/thermal_zone[0-*]:
+ |---type: Type of the thermal zone
+ |---temp: Current temperature
+ |---mode: Working mode of the thermal zone
+ |---policy: Thermal governor used for this zone
+ |---available_policies: Available thermal governors for this zone
+ |---trip_point_[0-*]_temp: Trip point temperature
+ |---trip_point_[0-*]_type: Trip point type
+ |---trip_point_[0-*]_hyst: Hysteresis value for this trip point
+ |---emul_temp: Emulated temperature set node
+ |---sustainable_power: Sustainable dissipatable power
+ |---k_po: Proportional term during temperature overshoot
+ |---k_pu: Proportional term during temperature undershoot
+ |---k_i: PID's integral term in the power allocator gov
+ |---k_d: PID's derivative term in the power allocator
+ |---integral_cutoff: Offset above which errors are accumulated
+ |---slope: Slope constant applied as linear extrapolation
+ |---offset: Offset constant applied as linear extrapolation
+
+Thermal cooling device sys I/F, created once it's registered::
+
+ /sys/class/thermal/cooling_device[0-*]:
+ |---type: Type of the cooling device(processor/fan/...)
+ |---max_state: Maximum cooling state of the cooling device
+ |---cur_state: Current cooling state of the cooling device
+ |---stats: Directory containing cooling device's statistics
+ |---stats/reset: Writing any value resets the statistics
+ |---stats/time_in_state_ms: Time (msec) spent in various cooling states
+ |---stats/total_trans: Total number of times cooling state is changed
+ |---stats/trans_table: Cooing state transition table
+
+
+Then next two dynamic attributes are created/removed in pairs. They represent
+the relationship between a thermal zone and its associated cooling device.
+They are created/removed for each successful execution of
+thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
+
+::
+
+ /sys/class/thermal/thermal_zone[0-*]:
+ |---cdev[0-*]: [0-*]th cooling device in current thermal zone
+ |---cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with
+ |---cdev[0-*]_weight: Influence of the cooling device in
+ this thermal zone
+
+Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
+the generic thermal driver also creates a hwmon sysfs I/F for each _type_
+of thermal zone device. E.g. the generic thermal driver registers one hwmon
+class device and build the associated hwmon sysfs I/F for all the registered
+ACPI thermal zones.
+
+::
+
+ /sys/class/hwmon/hwmon[0-*]:
+ |---name: The type of the thermal zone devices
+ |---temp[1-*]_input: The current temperature of thermal zone [1-*]
+ |---temp[1-*]_critical: The critical trip point of thermal zone [1-*]
+
+Please read Documentation/hwmon/sysfs-interface.rst for additional information.
+
+Thermal zone attributes
+-----------------------
+
+type
+ Strings which represent the thermal zone type.
+ This is given by thermal zone driver as part of registration.
+ E.g: "acpitz" indicates it's an ACPI thermal device.
+ In order to keep it consistent with hwmon sys attribute; this should
+ be a short, lowercase string, not containing spaces nor dashes.
+ RO, Required
+
+temp
+ Current temperature as reported by thermal zone (sensor).
+ Unit: millidegree Celsius
+ RO, Required
+
+mode
+ One of the predefined values in [enabled, disabled].
+ This file gives information about the algorithm that is currently
+ managing the thermal zone. It can be either default kernel based
+ algorithm or user space application.
+
+ enabled
+ enable Kernel Thermal management.
+ disabled
+ Preventing kernel thermal zone driver actions upon
+ trip points so that user application can take full
+ charge of the thermal management.
+
+ RW, Optional
+
+policy
+ One of the various thermal governors used for a particular zone.
+
+ RW, Required
+
+available_policies
+ Available thermal governors which can be used for a particular zone.
+
+ RO, Required
+
+`trip_point_[0-*]_temp`
+ The temperature above which trip point will be fired.
+
+ Unit: millidegree Celsius
+
+ RO, Optional
+
+`trip_point_[0-*]_type`
+ Strings which indicate the type of the trip point.
+
+ E.g. it can be one of critical, hot, passive, `active[0-*]` for ACPI
+ thermal zone.
+
+ RO, Optional
+
+`trip_point_[0-*]_hyst`
+ The hysteresis value for a trip point, represented as an integer
+ Unit: Celsius
+ RW, Optional
+
+`cdev[0-*]`
+ Sysfs link to the thermal cooling device node where the sys I/F
+ for cooling device throttling control represents.
+
+ RO, Optional
+
+`cdev[0-*]_trip_point`
+ The trip point in this thermal zone which `cdev[0-*]` is associated
+ with; -1 means the cooling device is not associated with any trip
+ point.
+
+ RO, Optional
+
+`cdev[0-*]_weight`
+ The influence of `cdev[0-*]` in this thermal zone. This value
+ is relative to the rest of cooling devices in the thermal
+ zone. For example, if a cooling device has a weight double
+ than that of other, it's twice as effective in cooling the
+ thermal zone.
+
+ RW, Optional
+
+passive
+ Attribute is only present for zones in which the passive cooling
+ policy is not supported by native thermal driver. Default is zero
+ and can be set to a temperature (in millidegrees) to enable a
+ passive trip point for the zone. Activation is done by polling with
+ an interval of 1 second.
+
+ Unit: millidegrees Celsius
+
+ Valid values: 0 (disabled) or greater than 1000
+
+ RW, Optional
+
+emul_temp
+ Interface to set the emulated temperature method in thermal zone
+ (sensor). After setting this temperature, the thermal zone may pass
+ this temperature to platform emulation function if registered or
+ cache it locally. This is useful in debugging different temperature
+ threshold and its associated cooling action. This is write only node
+ and writing 0 on this node should disable emulation.
+ Unit: millidegree Celsius
+
+ WO, Optional
+
+ WARNING:
+ Be careful while enabling this option on production systems,
+ because userland can easily disable the thermal policy by simply
+ flooding this sysfs node with low temperature values.
+
+sustainable_power
+ An estimate of the sustained power that can be dissipated by
+ the thermal zone. Used by the power allocator governor. For
+ more information see Documentation/thermal/power_allocator.rst
+
+ Unit: milliwatts
+
+ RW, Optional
+
+k_po
+ The proportional term of the power allocator governor's PID
+ controller during temperature overshoot. Temperature overshoot
+ is when the current temperature is above the "desired
+ temperature" trip point. For more information see
+ Documentation/thermal/power_allocator.rst
+
+ RW, Optional
+
+k_pu
+ The proportional term of the power allocator governor's PID
+ controller during temperature undershoot. Temperature undershoot
+ is when the current temperature is below the "desired
+ temperature" trip point. For more information see
+ Documentation/thermal/power_allocator.rst
+
+ RW, Optional
+
+k_i
+ The integral term of the power allocator governor's PID
+ controller. This term allows the PID controller to compensate
+ for long term drift. For more information see
+ Documentation/thermal/power_allocator.rst
+
+ RW, Optional
+
+k_d
+ The derivative term of the power allocator governor's PID
+ controller. For more information see
+ Documentation/thermal/power_allocator.rst
+
+ RW, Optional
+
+integral_cutoff
+ Temperature offset from the desired temperature trip point
+ above which the integral term of the power allocator
+ governor's PID controller starts accumulating errors. For
+ example, if integral_cutoff is 0, then the integral term only
+ accumulates error when temperature is above the desired
+ temperature trip point. For more information see
+ Documentation/thermal/power_allocator.rst
+
+ Unit: millidegree Celsius
+
+ RW, Optional
+
+slope
+ The slope constant used in a linear extrapolation model
+ to determine a hotspot temperature based off the sensor's
+ raw readings. It is up to the device driver to determine
+ the usage of these values.
+
+ RW, Optional
+
+offset
+ The offset constant used in a linear extrapolation model
+ to determine a hotspot temperature based off the sensor's
+ raw readings. It is up to the device driver to determine
+ the usage of these values.
+
+ RW, Optional
+
+Cooling device attributes
+-------------------------
+
+type
+ String which represents the type of device, e.g:
+
+ - for generic ACPI: should be "Fan", "Processor" or "LCD"
+ - for memory controller device on intel_menlow platform:
+ should be "Memory controller".
+
+ RO, Required
+
+max_state
+ The maximum permissible cooling state of this cooling device.
+
+ RO, Required
+
+cur_state
+ The current cooling state of this cooling device.
+ The value can any integer numbers between 0 and max_state:
+
+ - cur_state == 0 means no cooling
+ - cur_state == max_state means the maximum cooling.
+
+ RW, Required
+
+stats/reset
+ Writing any value resets the cooling device's statistics.
+ WO, Required
+
+stats/time_in_state_ms:
+ The amount of time spent by the cooling device in various cooling
+ states. The output will have "<state> <time>" pair in each line, which
+ will mean this cooling device spent <time> msec of time at <state>.
+ Output will have one line for each of the supported states. usertime
+ units here is 10mS (similar to other time exported in /proc).
+ RO, Required
+
+
+stats/total_trans:
+ A single positive value showing the total number of times the state of a
+ cooling device is changed.
+
+ RO, Required
+
+stats/trans_table:
+ This gives fine grained information about all the cooling state
+ transitions. The cat output here is a two dimensional matrix, where an
+ entry <i,j> (row i, column j) represents the number of transitions from
+ State_i to State_j. If the transition table is bigger than PAGE_SIZE,
+ reading this will return an -EFBIG error.
+ RO, Required
+
+3. A simple implementation
+==========================
+
+ACPI thermal zone may support multiple trip points like critical, hot,
+passive, active. If an ACPI thermal zone supports critical, passive,
+active[0] and active[1] at the same time, it may register itself as a
+thermal_zone_device (thermal_zone1) with 4 trip points in all.
+It has one processor and one fan, which are both registered as
+thermal_cooling_device. Both are considered to have the same
+effectiveness in cooling the thermal zone.
+
+If the processor is listed in _PSL method, and the fan is listed in _AL0
+method, the sys I/F structure will be built like this::
+
+ /sys/class/thermal:
+ |thermal_zone1:
+ |---type: acpitz
+ |---temp: 37000
+ |---mode: enabled
+ |---policy: step_wise
+ |---available_policies: step_wise fair_share
+ |---trip_point_0_temp: 100000
+ |---trip_point_0_type: critical
+ |---trip_point_1_temp: 80000
+ |---trip_point_1_type: passive
+ |---trip_point_2_temp: 70000
+ |---trip_point_2_type: active0
+ |---trip_point_3_temp: 60000
+ |---trip_point_3_type: active1
+ |---cdev0: --->/sys/class/thermal/cooling_device0
+ |---cdev0_trip_point: 1 /* cdev0 can be used for passive */
+ |---cdev0_weight: 1024
+ |---cdev1: --->/sys/class/thermal/cooling_device3
+ |---cdev1_trip_point: 2 /* cdev1 can be used for active[0]*/
+ |---cdev1_weight: 1024
+
+ |cooling_device0:
+ |---type: Processor
+ |---max_state: 8
+ |---cur_state: 0
+
+ |cooling_device3:
+ |---type: Fan
+ |---max_state: 2
+ |---cur_state: 0
+
+ /sys/class/hwmon:
+ |hwmon0:
+ |---name: acpitz
+ |---temp1_input: 37000
+ |---temp1_crit: 100000
+
+4. Event Notification
+=====================
+
+The framework includes a simple notification mechanism, in the form of a
+netlink event. Netlink socket initialization is done during the _init_
+of the framework. Drivers which intend to use the notification mechanism
+just need to call thermal_generate_netlink_event() with two arguments viz
+(originator, event). The originator is a pointer to struct thermal_zone_device
+from where the event has been originated. An integer which represents the
+thermal zone device will be used in the message to identify the zone. The
+event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL,
+THERMAL_DEV_FAULT}. Notification can be sent when the current temperature
+crosses any of the configured thresholds.
+
+5. Export Symbol APIs
+=====================
+
+5.1. get_tz_trend
+-----------------
+
+This function returns the trend of a thermal zone, i.e the rate of change
+of temperature of the thermal zone. Ideally, the thermal sensor drivers
+are supposed to implement the callback. If they don't, the thermal
+framework calculated the trend by comparing the previous and the current
+temperature values.
+
+5.2. get_thermal_instance
+-------------------------
+
+This function returns the thermal_instance corresponding to a given
+{thermal_zone, cooling_device, trip_point} combination. Returns NULL
+if such an instance does not exist.
+
+5.3. thermal_notify_framework
+-----------------------------
+
+This function handles the trip events from sensor drivers. It starts
+throttling the cooling devices according to the policy configured.
+For CRITICAL and HOT trip points, this notifies the respective drivers,
+and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
+The throttling policy is based on the configured platform data; if no
+platform data is provided, this uses the step_wise throttling policy.
+
+5.4. thermal_cdev_update
+------------------------
+
+This function serves as an arbitrator to set the state of a cooling
+device. It sets the cooling device to the deepest cooling state if
+possible.
+
+6. thermal_emergency_poweroff
+=============================
+
+On an event of critical trip temperature crossing. Thermal framework
+allows the system to shutdown gracefully by calling orderly_poweroff().
+In the event of a failure of orderly_poweroff() to shut down the system
+we are in danger of keeping the system alive at undesirably high
+temperatures. To mitigate this high risk scenario we program a work
+queue to fire after a pre-determined number of seconds to start
+an emergency shutdown of the device using the kernel_power_off()
+function. In case kernel_power_off() fails then finally
+emergency_restart() is called in the worst case.
+
+The delay should be carefully profiled so as to give adequate time for
+orderly_poweroff(). In case of failure of an orderly_poweroff() the
+emergency poweroff kicks in after the delay has elapsed and shuts down
+the system.
+
+If set to 0 emergency poweroff will not be supported. So a carefully
+profiled non-zero positive value is a must for emergerncy poweroff to be
+triggered.
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
deleted file mode 100644
index c3fa500df92c..000000000000
--- a/Documentation/thermal/sysfs-api.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-Generic Thermal Sysfs driver How To
-===================================
-
-Written by Sujith Thomas <sujith.thomas@intel.com>, Zhang Rui <rui.zhang@intel.com>
-
-Updated: 2 January 2008
-
-Copyright (c) 2008 Intel Corporation
-
-
-0. Introduction
-
-The generic thermal sysfs provides a set of interfaces for thermal zone
-devices (sensors) and thermal cooling devices (fan, processor...) to register
-with the thermal management solution and to be a part of it.
-
-This how-to focuses on enabling new thermal zone and cooling devices to
-participate in thermal management.
-This solution is platform independent and any type of thermal zone devices
-and cooling devices should be able to make use of the infrastructure.
-
-The main task of the thermal sysfs driver is to expose thermal zone attributes
-as well as cooling device attributes to the user space.
-An intelligent thermal management application can make decisions based on
-inputs from thermal zone attributes (the current temperature and trip point
-temperature) and throttle appropriate devices.
-
-[0-*] denotes any positive number starting from 0
-[1-*] denotes any positive number starting from 1
-
-1. thermal sysfs driver interface functions
-
-1.1 thermal zone device interface
-1.1.1 struct thermal_zone_device *thermal_zone_device_register(char *type,
- int trips, int mask, void *devdata,
- struct thermal_zone_device_ops *ops,
- const struct thermal_zone_params *tzp,
- int passive_delay, int polling_delay))
-
- This interface function adds a new thermal zone device (sensor) to
- /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the
- thermal cooling devices registered at the same time.
-
- type: the thermal zone type.
- trips: the total number of trip points this thermal zone supports.
- mask: Bit string: If 'n'th bit is set, then trip point 'n' is writeable.
- devdata: device private data
- ops: thermal zone device call-backs.
- .bind: bind the thermal zone device with a thermal cooling device.
- .unbind: unbind the thermal zone device with a thermal cooling device.
- .get_temp: get the current temperature of the thermal zone.
- .set_trips: set the trip points window. Whenever the current temperature
- is updated, the trip points immediately below and above the
- current temperature are found.
- .get_mode: get the current mode (enabled/disabled) of the thermal zone.
- - "enabled" means the kernel thermal management is enabled.
- - "disabled" will prevent kernel thermal driver action upon trip points
- so that user applications can take charge of thermal management.
- .set_mode: set the mode (enabled/disabled) of the thermal zone.
- .get_trip_type: get the type of certain trip point.
- .get_trip_temp: get the temperature above which the certain trip point
- will be fired.
- .set_emul_temp: set the emulation temperature which helps in debugging
- different threshold temperature points.
- tzp: thermal zone platform parameters.
- passive_delay: number of milliseconds to wait between polls when
- performing passive cooling.
- polling_delay: number of milliseconds to wait between polls when checking
- whether trip points have been crossed (0 for interrupt driven systems).
-
-
-1.1.2 void thermal_zone_device_unregister(struct thermal_zone_device *tz)
-
- This interface function removes the thermal zone device.
- It deletes the corresponding entry from /sys/class/thermal folder and
- unbinds all the thermal cooling devices it uses.
-
-1.1.3 struct thermal_zone_device *thermal_zone_of_sensor_register(
- struct device *dev, int sensor_id, void *data,
- const struct thermal_zone_of_device_ops *ops)
-
- This interface adds a new sensor to a DT thermal zone.
- This function will search the list of thermal zones described in
- device tree and look for the zone that refer to the sensor device
- pointed by dev->of_node as temperature providers. For the zone
- pointing to the sensor node, the sensor will be added to the DT
- thermal zone device.
-
- The parameters for this interface are:
- dev: Device node of sensor containing valid node pointer in
- dev->of_node.
- sensor_id: a sensor identifier, in case the sensor IP has more
- than one sensors
- data: a private pointer (owned by the caller) that will be
- passed back, when a temperature reading is needed.
- ops: struct thermal_zone_of_device_ops *.
-
- get_temp: a pointer to a function that reads the
- sensor temperature. This is mandatory
- callback provided by sensor driver.
- set_trips: a pointer to a function that sets a
- temperature window. When this window is
- left the driver must inform the thermal
- core via thermal_zone_device_update.
- get_trend: a pointer to a function that reads the
- sensor temperature trend.
- set_emul_temp: a pointer to a function that sets
- sensor emulated temperature.
- The thermal zone temperature is provided by the get_temp() function
- pointer of thermal_zone_of_device_ops. When called, it will
- have the private pointer @data back.
-
- It returns error pointer if fails otherwise valid thermal zone device
- handle. Caller should check the return handle with IS_ERR() for finding
- whether success or not.
-
-1.1.4 void thermal_zone_of_sensor_unregister(struct device *dev,
- struct thermal_zone_device *tzd)
-
- This interface unregisters a sensor from a DT thermal zone which was
- successfully added by interface thermal_zone_of_sensor_register().
- This function removes the sensor callbacks and private data from the
- thermal zone device registered with thermal_zone_of_sensor_register()
- interface. It will also silent the zone by remove the .get_temp() and
- get_trend() thermal zone device callbacks.
-
-1.1.5 struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
- struct device *dev, int sensor_id,
- void *data, const struct thermal_zone_of_device_ops *ops)
-
- This interface is resource managed version of
- thermal_zone_of_sensor_register().
- All details of thermal_zone_of_sensor_register() described in
- section 1.1.3 is applicable here.
- The benefit of using this interface to register sensor is that it
- is not require to explicitly call thermal_zone_of_sensor_unregister()
- in error path or during driver unbinding as this is done by driver
- resource manager.
-
-1.1.6 void devm_thermal_zone_of_sensor_unregister(struct device *dev,
- struct thermal_zone_device *tzd)
-
- This interface is resource managed version of
- thermal_zone_of_sensor_unregister().
- All details of thermal_zone_of_sensor_unregister() described in
- section 1.1.4 is applicable here.
- Normally this function will not need to be called and the resource
- management code will ensure that the resource is freed.
-
-1.1.7 int thermal_zone_get_slope(struct thermal_zone_device *tz)
-
- This interface is used to read the slope attribute value
- for the thermal zone device, which might be useful for platform
- drivers for temperature calculations.
-
-1.1.8 int thermal_zone_get_offset(struct thermal_zone_device *tz)
-
- This interface is used to read the offset attribute value
- for the thermal zone device, which might be useful for platform
- drivers for temperature calculations.
-
-1.2 thermal cooling device interface
-1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name,
- void *devdata, struct thermal_cooling_device_ops *)
-
- This interface function adds a new thermal cooling device (fan/processor/...)
- to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself
- to all the thermal zone devices registered at the same time.
- name: the cooling device name.
- devdata: device private data.
- ops: thermal cooling devices call-backs.
- .get_max_state: get the Maximum throttle state of the cooling device.
- .get_cur_state: get the Currently requested throttle state of the cooling device.
- .set_cur_state: set the Current throttle state of the cooling device.
-
-1.2.2 void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
-
- This interface function removes the thermal cooling device.
- It deletes the corresponding entry from /sys/class/thermal folder and
- unbinds itself from all the thermal zone devices using it.
-
-1.3 interface for binding a thermal zone device with a thermal cooling device
-1.3.1 int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
- int trip, struct thermal_cooling_device *cdev,
- unsigned long upper, unsigned long lower, unsigned int weight);
-
- This interface function binds a thermal cooling device to a particular trip
- point of a thermal zone device.
- This function is usually called in the thermal zone device .bind callback.
- tz: the thermal zone device
- cdev: thermal cooling device
- trip: indicates which trip point in this thermal zone the cooling device
- is associated with.
- upper:the Maximum cooling state for this trip point.
- THERMAL_NO_LIMIT means no upper limit,
- and the cooling device can be in max_state.
- lower:the Minimum cooling state can be used for this trip point.
- THERMAL_NO_LIMIT means no lower limit,
- and the cooling device can be in cooling state 0.
- weight: the influence of this cooling device in this thermal
- zone. See 1.4.1 below for more information.
-
-1.3.2 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
- int trip, struct thermal_cooling_device *cdev);
-
- This interface function unbinds a thermal cooling device from a particular
- trip point of a thermal zone device. This function is usually called in
- the thermal zone device .unbind callback.
- tz: the thermal zone device
- cdev: thermal cooling device
- trip: indicates which trip point in this thermal zone the cooling device
- is associated with.
-
-1.4 Thermal Zone Parameters
-1.4.1 struct thermal_bind_params
- This structure defines the following parameters that are used to bind
- a zone with a cooling device for a particular trip point.
- .cdev: The cooling device pointer
- .weight: The 'influence' of a particular cooling device on this
- zone. This is relative to the rest of the cooling
- devices. For example, if all cooling devices have a
- weight of 1, then they all contribute the same. You can
- use percentages if you want, but it's not mandatory. A
- weight of 0 means that this cooling device doesn't
- contribute to the cooling of this zone unless all cooling
- devices have a weight of 0. If all weights are 0, then
- they all contribute the same.
- .trip_mask:This is a bit mask that gives the binding relation between
- this thermal zone and cdev, for a particular trip point.
- If nth bit is set, then the cdev and thermal zone are bound
- for trip point n.
- .binding_limits: This is an array of cooling state limits. Must have
- exactly 2 * thermal_zone.number_of_trip_points. It is an
- array consisting of tuples <lower-state upper-state> of
- state limits. Each trip will be associated with one state
- limit tuple when binding. A NULL pointer means
- <THERMAL_NO_LIMITS THERMAL_NO_LIMITS> on all trips.
- These limits are used when binding a cdev to a trip point.
- .match: This call back returns success(0) if the 'tz and cdev' need to
- be bound, as per platform data.
-1.4.2 struct thermal_zone_params
- This structure defines the platform level parameters for a thermal zone.
- This data, for each thermal zone should come from the platform layer.
- This is an optional feature where some platforms can choose not to
- provide this data.
- .governor_name: Name of the thermal governor used for this zone
- .no_hwmon: a boolean to indicate if the thermal to hwmon sysfs interface
- is required. when no_hwmon == false, a hwmon sysfs interface
- will be created. when no_hwmon == true, nothing will be done.
- In case the thermal_zone_params is NULL, the hwmon interface
- will be created (for backward compatibility).
- .num_tbps: Number of thermal_bind_params entries for this zone
- .tbp: thermal_bind_params entries
-
-2. sysfs attributes structure
-
-RO read only value
-WO write only value
-RW read/write value
-
-Thermal sysfs attributes will be represented under /sys/class/thermal.
-Hwmon sysfs I/F extension is also available under /sys/class/hwmon
-if hwmon is compiled in or built as a module.
-
-Thermal zone device sys I/F, created once it's registered:
-/sys/class/thermal/thermal_zone[0-*]:
- |---type: Type of the thermal zone
- |---temp: Current temperature
- |---mode: Working mode of the thermal zone
- |---policy: Thermal governor used for this zone
- |---available_policies: Available thermal governors for this zone
- |---trip_point_[0-*]_temp: Trip point temperature
- |---trip_point_[0-*]_type: Trip point type
- |---trip_point_[0-*]_hyst: Hysteresis value for this trip point
- |---emul_temp: Emulated temperature set node
- |---sustainable_power: Sustainable dissipatable power
- |---k_po: Proportional term during temperature overshoot
- |---k_pu: Proportional term during temperature undershoot
- |---k_i: PID's integral term in the power allocator gov
- |---k_d: PID's derivative term in the power allocator
- |---integral_cutoff: Offset above which errors are accumulated
- |---slope: Slope constant applied as linear extrapolation
- |---offset: Offset constant applied as linear extrapolation
-
-Thermal cooling device sys I/F, created once it's registered:
-/sys/class/thermal/cooling_device[0-*]:
- |---type: Type of the cooling device(processor/fan/...)
- |---max_state: Maximum cooling state of the cooling device
- |---cur_state: Current cooling state of the cooling device
- |---stats: Directory containing cooling device's statistics
- |---stats/reset: Writing any value resets the statistics
- |---stats/time_in_state_ms: Time (msec) spent in various cooling states
- |---stats/total_trans: Total number of times cooling state is changed
- |---stats/trans_table: Cooing state transition table
-
-
-Then next two dynamic attributes are created/removed in pairs. They represent
-the relationship between a thermal zone and its associated cooling device.
-They are created/removed for each successful execution of
-thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
-
-/sys/class/thermal/thermal_zone[0-*]:
- |---cdev[0-*]: [0-*]th cooling device in current thermal zone
- |---cdev[0-*]_trip_point: Trip point that cdev[0-*] is associated with
- |---cdev[0-*]_weight: Influence of the cooling device in
- this thermal zone
-
-Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
-the generic thermal driver also creates a hwmon sysfs I/F for each _type_
-of thermal zone device. E.g. the generic thermal driver registers one hwmon
-class device and build the associated hwmon sysfs I/F for all the registered
-ACPI thermal zones.
-
-/sys/class/hwmon/hwmon[0-*]:
- |---name: The type of the thermal zone devices
- |---temp[1-*]_input: The current temperature of thermal zone [1-*]
- |---temp[1-*]_critical: The critical trip point of thermal zone [1-*]
-
-Please read Documentation/hwmon/sysfs-interface.rst for additional information.
-
-***************************
-* Thermal zone attributes *
-***************************
-
-type
- Strings which represent the thermal zone type.
- This is given by thermal zone driver as part of registration.
- E.g: "acpitz" indicates it's an ACPI thermal device.
- In order to keep it consistent with hwmon sys attribute; this should
- be a short, lowercase string, not containing spaces nor dashes.
- RO, Required
-
-temp
- Current temperature as reported by thermal zone (sensor).
- Unit: millidegree Celsius
- RO, Required
-
-mode
- One of the predefined values in [enabled, disabled].
- This file gives information about the algorithm that is currently
- managing the thermal zone. It can be either default kernel based
- algorithm or user space application.
- enabled = enable Kernel Thermal management.
- disabled = Preventing kernel thermal zone driver actions upon
- trip points so that user application can take full
- charge of the thermal management.
- RW, Optional
-
-policy
- One of the various thermal governors used for a particular zone.
- RW, Required
-
-available_policies
- Available thermal governors which can be used for a particular zone.
- RO, Required
-
-trip_point_[0-*]_temp
- The temperature above which trip point will be fired.
- Unit: millidegree Celsius
- RO, Optional
-
-trip_point_[0-*]_type
- Strings which indicate the type of the trip point.
- E.g. it can be one of critical, hot, passive, active[0-*] for ACPI
- thermal zone.
- RO, Optional
-
-trip_point_[0-*]_hyst
- The hysteresis value for a trip point, represented as an integer
- Unit: Celsius
- RW, Optional
-
-cdev[0-*]
- Sysfs link to the thermal cooling device node where the sys I/F
- for cooling device throttling control represents.
- RO, Optional
-
-cdev[0-*]_trip_point
- The trip point in this thermal zone which cdev[0-*] is associated
- with; -1 means the cooling device is not associated with any trip
- point.
- RO, Optional
-
-cdev[0-*]_weight
- The influence of cdev[0-*] in this thermal zone. This value
- is relative to the rest of cooling devices in the thermal
- zone. For example, if a cooling device has a weight double
- than that of other, it's twice as effective in cooling the
- thermal zone.
- RW, Optional
-
-passive
- Attribute is only present for zones in which the passive cooling
- policy is not supported by native thermal driver. Default is zero
- and can be set to a temperature (in millidegrees) to enable a
- passive trip point for the zone. Activation is done by polling with
- an interval of 1 second.
- Unit: millidegrees Celsius
- Valid values: 0 (disabled) or greater than 1000
- RW, Optional
-
-emul_temp
- Interface to set the emulated temperature method in thermal zone
- (sensor). After setting this temperature, the thermal zone may pass
- this temperature to platform emulation function if registered or
- cache it locally. This is useful in debugging different temperature
- threshold and its associated cooling action. This is write only node
- and writing 0 on this node should disable emulation.
- Unit: millidegree Celsius
- WO, Optional
-
- WARNING: Be careful while enabling this option on production systems,
- because userland can easily disable the thermal policy by simply
- flooding this sysfs node with low temperature values.
-
-sustainable_power
- An estimate of the sustained power that can be dissipated by
- the thermal zone. Used by the power allocator governor. For
- more information see Documentation/thermal/power_allocator.txt
- Unit: milliwatts
- RW, Optional
-
-k_po
- The proportional term of the power allocator governor's PID
- controller during temperature overshoot. Temperature overshoot
- is when the current temperature is above the "desired
- temperature" trip point. For more information see
- Documentation/thermal/power_allocator.txt
- RW, Optional
-
-k_pu
- The proportional term of the power allocator governor's PID
- controller during temperature undershoot. Temperature undershoot
- is when the current temperature is below the "desired
- temperature" trip point. For more information see
- Documentation/thermal/power_allocator.txt
- RW, Optional
-
-k_i
- The integral term of the power allocator governor's PID
- controller. This term allows the PID controller to compensate
- for long term drift. For more information see
- Documentation/thermal/power_allocator.txt
- RW, Optional
-
-k_d
- The derivative term of the power allocator governor's PID
- controller. For more information see
- Documentation/thermal/power_allocator.txt
- RW, Optional
-
-integral_cutoff
- Temperature offset from the desired temperature trip point
- above which the integral term of the power allocator
- governor's PID controller starts accumulating errors. For
- example, if integral_cutoff is 0, then the integral term only
- accumulates error when temperature is above the desired
- temperature trip point. For more information see
- Documentation/thermal/power_allocator.txt
- Unit: millidegree Celsius
- RW, Optional
-
-slope
- The slope constant used in a linear extrapolation model
- to determine a hotspot temperature based off the sensor's
- raw readings. It is up to the device driver to determine
- the usage of these values.
- RW, Optional
-
-offset
- The offset constant used in a linear extrapolation model
- to determine a hotspot temperature based off the sensor's
- raw readings. It is up to the device driver to determine
- the usage of these values.
- RW, Optional
-
-*****************************
-* Cooling device attributes *
-*****************************
-
-type
- String which represents the type of device, e.g:
- - for generic ACPI: should be "Fan", "Processor" or "LCD"
- - for memory controller device on intel_menlow platform:
- should be "Memory controller".
- RO, Required
-
-max_state
- The maximum permissible cooling state of this cooling device.
- RO, Required
-
-cur_state
- The current cooling state of this cooling device.
- The value can any integer numbers between 0 and max_state:
- - cur_state == 0 means no cooling
- - cur_state == max_state means the maximum cooling.
- RW, Required
-
-stats/reset
- Writing any value resets the cooling device's statistics.
- WO, Required
-
-stats/time_in_state_ms:
- The amount of time spent by the cooling device in various cooling
- states. The output will have "<state> <time>" pair in each line, which
- will mean this cooling device spent <time> msec of time at <state>.
- Output will have one line for each of the supported states. usertime
- units here is 10mS (similar to other time exported in /proc).
- RO, Required
-
-stats/total_trans:
- A single positive value showing the total number of times the state of a
- cooling device is changed.
- RO, Required
-
-stats/trans_table:
- This gives fine grained information about all the cooling state
- transitions. The cat output here is a two dimensional matrix, where an
- entry <i,j> (row i, column j) represents the number of transitions from
- State_i to State_j. If the transition table is bigger than PAGE_SIZE,
- reading this will return an -EFBIG error.
- RO, Required
-
-3. A simple implementation
-
-ACPI thermal zone may support multiple trip points like critical, hot,
-passive, active. If an ACPI thermal zone supports critical, passive,
-active[0] and active[1] at the same time, it may register itself as a
-thermal_zone_device (thermal_zone1) with 4 trip points in all.
-It has one processor and one fan, which are both registered as
-thermal_cooling_device. Both are considered to have the same
-effectiveness in cooling the thermal zone.
-
-If the processor is listed in _PSL method, and the fan is listed in _AL0
-method, the sys I/F structure will be built like this:
-
-/sys/class/thermal:
-
-|thermal_zone1:
- |---type: acpitz
- |---temp: 37000
- |---mode: enabled
- |---policy: step_wise
- |---available_policies: step_wise fair_share
- |---trip_point_0_temp: 100000
- |---trip_point_0_type: critical
- |---trip_point_1_temp: 80000
- |---trip_point_1_type: passive
- |---trip_point_2_temp: 70000
- |---trip_point_2_type: active0
- |---trip_point_3_temp: 60000
- |---trip_point_3_type: active1
- |---cdev0: --->/sys/class/thermal/cooling_device0
- |---cdev0_trip_point: 1 /* cdev0 can be used for passive */
- |---cdev0_weight: 1024
- |---cdev1: --->/sys/class/thermal/cooling_device3
- |---cdev1_trip_point: 2 /* cdev1 can be used for active[0]*/
- |---cdev1_weight: 1024
-
-|cooling_device0:
- |---type: Processor
- |---max_state: 8
- |---cur_state: 0
-
-|cooling_device3:
- |---type: Fan
- |---max_state: 2
- |---cur_state: 0
-
-/sys/class/hwmon:
-
-|hwmon0:
- |---name: acpitz
- |---temp1_input: 37000
- |---temp1_crit: 100000
-
-4. Event Notification
-
-The framework includes a simple notification mechanism, in the form of a
-netlink event. Netlink socket initialization is done during the _init_
-of the framework. Drivers which intend to use the notification mechanism
-just need to call thermal_generate_netlink_event() with two arguments viz
-(originator, event). The originator is a pointer to struct thermal_zone_device
-from where the event has been originated. An integer which represents the
-thermal zone device will be used in the message to identify the zone. The
-event will be one of:{THERMAL_AUX0, THERMAL_AUX1, THERMAL_CRITICAL,
-THERMAL_DEV_FAULT}. Notification can be sent when the current temperature
-crosses any of the configured thresholds.
-
-5. Export Symbol APIs:
-
-5.1: get_tz_trend:
-This function returns the trend of a thermal zone, i.e the rate of change
-of temperature of the thermal zone. Ideally, the thermal sensor drivers
-are supposed to implement the callback. If they don't, the thermal
-framework calculated the trend by comparing the previous and the current
-temperature values.
-
-5.2:get_thermal_instance:
-This function returns the thermal_instance corresponding to a given
-{thermal_zone, cooling_device, trip_point} combination. Returns NULL
-if such an instance does not exist.
-
-5.3:thermal_notify_framework:
-This function handles the trip events from sensor drivers. It starts
-throttling the cooling devices according to the policy configured.
-For CRITICAL and HOT trip points, this notifies the respective drivers,
-and does actual throttling for other trip points i.e ACTIVE and PASSIVE.
-The throttling policy is based on the configured platform data; if no
-platform data is provided, this uses the step_wise throttling policy.
-
-5.4:thermal_cdev_update:
-This function serves as an arbitrator to set the state of a cooling
-device. It sets the cooling device to the deepest cooling state if
-possible.
-
-6. thermal_emergency_poweroff:
-
-On an event of critical trip temperature crossing. Thermal framework
-allows the system to shutdown gracefully by calling orderly_poweroff().
-In the event of a failure of orderly_poweroff() to shut down the system
-we are in danger of keeping the system alive at undesirably high
-temperatures. To mitigate this high risk scenario we program a work
-queue to fire after a pre-determined number of seconds to start
-an emergency shutdown of the device using the kernel_power_off()
-function. In case kernel_power_off() fails then finally
-emergency_restart() is called in the worst case.
-
-The delay should be carefully profiled so as to give adequate time for
-orderly_poweroff(). In case of failure of an orderly_poweroff() the
-emergency poweroff kicks in after the delay has elapsed and shuts down
-the system.
-
-If set to 0 emergency poweroff will not be supported. So a carefully
-profiled non-zero positive value is a must for emergerncy poweroff to be
-triggered.
diff --git a/Documentation/thermal/x86_pkg_temperature_thermal b/Documentation/thermal/x86_pkg_temperature_thermal
deleted file mode 100644
index 17a3a4c0a0ca..000000000000
--- a/Documentation/thermal/x86_pkg_temperature_thermal
+++ /dev/null
@@ -1,47 +0,0 @@
-Kernel driver: x86_pkg_temp_thermal
-===================
-
-Supported chips:
-* x86: with package level thermal management
-(Verify using: CPUID.06H:EAX[bit 6] =1)
-
-Authors: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
-
-Reference
----
-Intel® 64 and IA-32 Architectures Software Developer’s Manual (Jan, 2013):
-Chapter 14.6: PACKAGE LEVEL THERMAL MANAGEMENT
-
-Description
----------
-
-This driver register CPU digital temperature package level sensor as a thermal
-zone with maximum two user mode configurable trip points. Number of trip points
-depends on the capability of the package. Once the trip point is violated,
-user mode can receive notification via thermal notification mechanism and can
-take any action to control temperature.
-
-
-Threshold management
---------------------
-Each package will register as a thermal zone under /sys/class/thermal.
-Example:
-/sys/class/thermal/thermal_zone1
-
-This contains two trip points:
-- trip_point_0_temp
-- trip_point_1_temp
-
-User can set any temperature between 0 to TJ-Max temperature. Temperature units
-are in milli-degree Celsius. Refer to "Documentation/thermal/sysfs-api.txt" for
-thermal sys-fs details.
-
-Any value other than 0 in these trip points, can trigger thermal notifications.
-Setting 0, stops sending thermal notifications.
-
-Thermal notifications: To get kobject-uevent notifications, set the thermal zone
-policy to "user_space". For example: echo -n "user_space" > policy
-
-
-
-
diff --git a/Documentation/thermal/x86_pkg_temperature_thermal.rst b/Documentation/thermal/x86_pkg_temperature_thermal.rst
new file mode 100644
index 000000000000..f134dbd3f5a9
--- /dev/null
+++ b/Documentation/thermal/x86_pkg_temperature_thermal.rst
@@ -0,0 +1,55 @@
+===================================
+Kernel driver: x86_pkg_temp_thermal
+===================================
+
+Supported chips:
+
+* x86: with package level thermal management
+
+(Verify using: CPUID.06H:EAX[bit 6] =1)
+
+Authors: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+
+Reference
+---------
+
+Intel® 64 and IA-32 Architectures Software Developer’s Manual (Jan, 2013):
+Chapter 14.6: PACKAGE LEVEL THERMAL MANAGEMENT
+
+Description
+-----------
+
+This driver register CPU digital temperature package level sensor as a thermal
+zone with maximum two user mode configurable trip points. Number of trip points
+depends on the capability of the package. Once the trip point is violated,
+user mode can receive notification via thermal notification mechanism and can
+take any action to control temperature.
+
+
+Threshold management
+--------------------
+Each package will register as a thermal zone under /sys/class/thermal.
+
+Example::
+
+ /sys/class/thermal/thermal_zone1
+
+This contains two trip points:
+
+- trip_point_0_temp
+- trip_point_1_temp
+
+User can set any temperature between 0 to TJ-Max temperature. Temperature units
+are in milli-degree Celsius. Refer to "Documentation/thermal/sysfs-api.rst" for
+thermal sys-fs details.
+
+Any value other than 0 in these trip points, can trigger thermal notifications.
+Setting 0, stops sending thermal notifications.
+
+Thermal notifications:
+To get kobject-uevent notifications, set the thermal zone
+policy to "user_space".
+
+For example::
+
+ echo -n "user_space" > policy
diff --git a/Documentation/timers/index.rst b/Documentation/timers/index.rst
index 91f6f8263c48..df510ad0c989 100644
--- a/Documentation/timers/index.rst
+++ b/Documentation/timers/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
======
timers
diff --git a/Documentation/trace/coresight-cpu-debug.txt b/Documentation/trace/coresight-cpu-debug.txt
index f07e38094b40..1a660a39e3c0 100644
--- a/Documentation/trace/coresight-cpu-debug.txt
+++ b/Documentation/trace/coresight-cpu-debug.txt
@@ -151,7 +151,7 @@ At the runtime you can disable idle states with below methods:
It is possible to disable CPU idle states by way of the PM QoS
subsystem, more specifically by using the "/dev/cpu_dma_latency"
-interface (see Documentation/power/pm_qos_interface.txt for more
+interface (see Documentation/power/pm_qos_interface.rst for more
details). As specified in the PM QoS documentation the requested
parameter will stay in effect until the file descriptor is released.
For example:
diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst
index 7d2b0178d3f3..fbb314bfa112 100644
--- a/Documentation/trace/kprobetrace.rst
+++ b/Documentation/trace/kprobetrace.rst
@@ -51,15 +51,17 @@ Synopsis of kprobe_events
$argN : Fetch the Nth function argument. (N >= 1) (\*1)
$retval : Fetch return value.(\*2)
$comm : Fetch current task comm.
- +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(\*3)
+ +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*3)(\*4)
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
- (x8/x16/x32/x64), "string" and bitfield are supported.
+ (x8/x16/x32/x64), "string", "ustring" and bitfield
+ are supported.
(\*1) only for the probe on function entry (offs == 0).
(\*2) only for return probe.
(\*3) this is useful for fetching a field of data structures.
+ (\*4) "u" means user-space dereference. See :ref:`user_mem_access`.
Types
-----
@@ -77,7 +79,8 @@ apply it to registers/stack-entries etc. (for example, '$stack1:x8[8]' is
wrong, but '+8($stack):x8[8]' is OK.)
String type is a special type, which fetches a "null-terminated" string from
kernel space. This means it will fail and store NULL if the string container
-has been paged out.
+has been paged out. "ustring" type is an alternative of string for user-space.
+See :ref:`user_mem_access` for more info..
The string array type is a bit different from other types. For other base
types, <base-type>[1] is equal to <base-type> (e.g. +0(%di):x32[1] is same
as +0(%di):x32.) But string[1] is not equal to string. The string type itself
@@ -92,6 +95,25 @@ Symbol type('symbol') is an alias of u32 or u64 type (depends on BITS_PER_LONG)
which shows given pointer in "symbol+offset" style.
For $comm, the default type is "string"; any other type is invalid.
+.. _user_mem_access:
+User Memory Access
+------------------
+Kprobe events supports user-space memory access. For that purpose, you can use
+either user-space dereference syntax or 'ustring' type.
+
+The user-space dereference syntax allows you to access a field of a data
+structure in user-space. This is done by adding the "u" prefix to the
+dereference syntax. For example, +u4(%si) means it will read memory from the
+address in the register %si offset by 4, and the memory is expected to be in
+user-space. You can use this for strings too, e.g. +u0(%si):string will read
+a string from the address in the register %si that is expected to be in user-
+space. 'ustring' is a shortcut way of performing the same task. That is,
++0(%si):ustring is equivalent to +u0(%si):string.
+
+Note that kprobe-event provides the user-memory access syntax but it doesn't
+use it transparently. This means if you use normal dereference or string type
+for user memory, it might fail, and may always fail on some archs. The user
+has to carefully check if the target data is in kernel or user space.
Per-Probe Event Filtering
-------------------------
@@ -124,6 +146,20 @@ You can check the total number of probe hits and probe miss-hits via
The first column is event name, the second is the number of probe hits,
the third is the number of probe miss-hits.
+Kernel Boot Parameter
+---------------------
+You can add and enable new kprobe events when booting up the kernel by
+"kprobe_event=" parameter. The parameter accepts a semicolon-delimited
+kprobe events, which format is similar to the kprobe_events.
+The difference is that the probe definition parameters are comma-delimited
+instead of space. For example, adding myprobe event on do_sys_open like below
+
+ p:myprobe do_sys_open dfd=%ax filename=%dx flags=%cx mode=+4($stack)
+
+should be below for kernel boot parameter (just replace spaces with comma)
+
+ p:myprobe,do_sys_open,dfd=%ax,filename=%dx,flags=%cx,mode=+4($stack)
+
Usage examples
--------------
diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst
index 0b21305fabdc..6e75a6c5a2c8 100644
--- a/Documentation/trace/uprobetracer.rst
+++ b/Documentation/trace/uprobetracer.rst
@@ -42,16 +42,18 @@ Synopsis of uprobe_tracer
@+OFFSET : Fetch memory at OFFSET (OFFSET from same file as PATH)
$stackN : Fetch Nth entry of stack (N >= 0)
$stack : Fetch stack address.
- $retval : Fetch return value.(*)
+ $retval : Fetch return value.(\*1)
$comm : Fetch current task comm.
- +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
+ +|-[u]OFFS(FETCHARG) : Fetch memory at FETCHARG +|- OFFS address.(\*2)(\*3)
NAME=FETCHARG : Set NAME as the argument name of FETCHARG.
FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types
(u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types
(x8/x16/x32/x64), "string" and bitfield are supported.
- (*) only for return probe.
- (**) this is useful for fetching a field of data structures.
+ (\*1) only for return probe.
+ (\*2) this is useful for fetching a field of data structures.
+ (\*3) Unlike kprobe event, "u" prefix will just be ignored, becuse uprobe
+ events can access only user-space memory.
Types
-----
diff --git a/Documentation/translations/it_IT/kernel-hacking/locking.rst b/Documentation/translations/it_IT/kernel-hacking/locking.rst
index 5fd8a1abd2be..b9a6be4b8499 100644
--- a/Documentation/translations/it_IT/kernel-hacking/locking.rst
+++ b/Documentation/translations/it_IT/kernel-hacking/locking.rst
@@ -1404,7 +1404,7 @@ Riferimento per l'API dei Futex
Approfondimenti
===============
-- ``Documentation/locking/spinlocks.txt``: la guida di Linus Torvalds agli
+- ``Documentation/locking/spinlocks.rst``: la guida di Linus Torvalds agli
spinlock del kernel.
- Unix Systems for Modern Architectures: Symmetric Multiprocessing and
diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst
index ea74cae958d7..995ee69fab11 100644
--- a/Documentation/translations/it_IT/process/submit-checklist.rst
+++ b/Documentation/translations/it_IT/process/submit-checklist.rst
@@ -117,7 +117,7 @@ sottomissione delle patch, in particolare
sorgenti che ne spieghi la logica: cosa fanno e perché.
25) Se la patch aggiunge nuove chiamate ioctl, allora aggiornate
- ``Documentation/ioctl/ioctl-number.txt``.
+ ``Documentation/ioctl/ioctl-number.rst``.
26) Se il codice che avete modificato dipende o usa una qualsiasi interfaccia o
funzionalità del kernel che è associata a uno dei seguenti simboli
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 07725b1df002..a33c2a536542 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -24,7 +24,7 @@ Documentation/memory-barriers.txt
=========================
ì €ìž: David Howells <dhowells@redhat.com>
- Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ Paul E. McKenney <paulmck@linux.ibm.com>
Will Deacon <will.deacon@arm.com>
Peter Zijlstra <peterz@infradead.org>
diff --git a/Documentation/translations/zh_CN/arm/Booting b/Documentation/translations/zh_CN/arm/Booting
index 1fe866f8218f..562e9a2957e6 100644
--- a/Documentation/translations/zh_CN/arm/Booting
+++ b/Documentation/translations/zh_CN/arm/Booting
@@ -1,4 +1,4 @@
-Chinese translated version of Documentation/arm/Booting
+Chinese translated version of Documentation/arm/booting.rst
If you have any comment or update to the content, please contact the
original document maintainer directly. However, if you have a problem
@@ -9,7 +9,7 @@ or if there is a problem with the translation.
Maintainer: Russell King <linux@arm.linux.org.uk>
Chinese maintainer: Fu Wei <tekkamanninja@gmail.com>
---------------------------------------------------------------------
-Documentation/arm/Booting 的中文翻译
+Documentation/arm/booting.rst 的中文翻译
如果想评论或更新本文的内容,请直接è”系原文档的维护者。如果你使用英文
交æµæœ‰å›°éš¾çš„è¯ï¼Œä¹Ÿå¯ä»¥å‘中文版维护者求助。如果本翻译更新ä¸åŠæ—¶æˆ–者翻
diff --git a/Documentation/translations/zh_CN/arm/kernel_user_helpers.txt b/Documentation/translations/zh_CN/arm/kernel_user_helpers.txt
index cd7fc8f34cf9..99af4363984d 100644
--- a/Documentation/translations/zh_CN/arm/kernel_user_helpers.txt
+++ b/Documentation/translations/zh_CN/arm/kernel_user_helpers.txt
@@ -1,4 +1,4 @@
-Chinese translated version of Documentation/arm/kernel_user_helpers.txt
+Chinese translated version of Documentation/arm/kernel_user_helpers.rst
If you have any comment or update to the content, please contact the
original document maintainer directly. However, if you have a problem
@@ -10,7 +10,7 @@ Maintainer: Nicolas Pitre <nicolas.pitre@linaro.org>
Dave Martin <dave.martin@linaro.org>
Chinese maintainer: Fu Wei <tekkamanninja@gmail.com>
---------------------------------------------------------------------
-Documentation/arm/kernel_user_helpers.txt 的中文翻译
+Documentation/arm/kernel_user_helpers.rst 的中文翻译
如果想评论或更新本文的内容,请直接è”系原文档的维护者。如果你使用英文
交æµæœ‰å›°éš¾çš„è¯ï¼Œä¹Ÿå¯ä»¥å‘中文版维护者求助。如果本翻译更新ä¸åŠæ—¶æˆ–者翻
diff --git a/Documentation/translations/zh_CN/arm64/booting.txt b/Documentation/translations/zh_CN/arm64/booting.txt
index 3bfbf66e5a5e..4e373d128d98 100644
--- a/Documentation/translations/zh_CN/arm64/booting.txt
+++ b/Documentation/translations/zh_CN/arm64/booting.txt
@@ -236,7 +236,7 @@ AArch64 内核当å‰æ²¡æœ‰æ供自解压代ç ï¼Œå› æ­¤å¦‚果使用了压缩内
*译者注: ARM DEN 0022A 已更新到 ARM DEN 0022C。
设备树必须包å«ä¸€ä¸ª ‘psci’ 节点,请å‚考以下文档:
- Documentation/devicetree/bindings/arm/psci.txt
+ Documentation/devicetree/bindings/arm/psci.yaml
- 辅助 CPU 通用寄存器设置
diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt
index 452271dda141..ee1f37da5b23 100644
--- a/Documentation/translations/zh_CN/filesystems/sysfs.txt
+++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt
@@ -288,7 +288,7 @@ dev/ 包å«ä¸¤ä¸ªå­ç›®å½•ï¼š char/ å’Œ block/。在这两个å­ç›®å½•ä¸­ï¼Œæœ‰ä
中相应的设备。/sys/dev æ供一个通过一个 stat(2) æ“作结果,查找
设备 sysfs 接å£å¿«æ·çš„方法。
-更多有关 driver-model 的特性信æ¯å¯ä»¥åœ¨ Documentation/driver-model/
+更多有关 driver-model 的特性信æ¯å¯ä»¥åœ¨ Documentation/driver-api/driver-model/
中找到。
diff --git a/Documentation/translations/zh_CN/gpio.txt b/Documentation/translations/zh_CN/gpio.txt
index 4cb1ba8b8fed..a23ee14fc927 100644
--- a/Documentation/translations/zh_CN/gpio.txt
+++ b/Documentation/translations/zh_CN/gpio.txt
@@ -1,4 +1,4 @@
-Chinese translated version of Documentation/gpio
+Chinese translated version of Documentation/admin-guide/gpio
If you have any comment or update to the content, please contact the
original document maintainer directly. However, if you have a problem
@@ -10,7 +10,7 @@ Maintainer: Grant Likely <grant.likely@secretlab.ca>
Linus Walleij <linus.walleij@linaro.org>
Chinese maintainer: Fu Wei <tekkamanninja@gmail.com>
---------------------------------------------------------------------
-Documentation/gpio 的中文翻译
+Documentation/admin-guide/gpio 的中文翻译
如果想评论或更新本文的内容,请直接è”系原文档的维护者。如果你使用英文
交æµæœ‰å›°éš¾çš„è¯ï¼Œä¹Ÿå¯ä»¥å‘中文版维护者求助。如果本翻译更新ä¸åŠæ—¶æˆ–者翻
diff --git a/Documentation/translations/zh_CN/oops-tracing.txt b/Documentation/translations/zh_CN/oops-tracing.txt
index 368ddd05b304..c5f3bda7abcb 100644
--- a/Documentation/translations/zh_CN/oops-tracing.txt
+++ b/Documentation/translations/zh_CN/oops-tracing.txt
@@ -53,8 +53,8 @@ cat /proc/kmsg > file, 然而你必须介入中止传输, kmsg是一个“æ°
(2)用串å£ç»ˆç«¯å¯åŠ¨ï¼ˆè¯·å‚看Documentation/admin-guide/serial-console.rst),è¿è¡Œä¸€ä¸ªnull
modem到å¦ä¸€å°æœºå™¨å¹¶ç”¨ä½ å–œæ¬¢çš„通讯工具获å–输出。Minicom工作地很好。
-(3)使用Kdump(请å‚看Documentation/kdump/kdump.rst),
-使用在Documentation/kdump/gdbmacros.txt中定义的dmesg gdbå®ï¼Œä»Žæ—§çš„内存中æå–内核
+(3)使用Kdump(请å‚看Documentation/admin-guide/kdump/kdump.rst),
+使用在Documentation/admin-guide/kdump/gdbmacros.txt中定义的dmesg gdbå®ï¼Œä»Žæ—§çš„内存中æå–内核
环形缓冲区。
完整信æ¯
diff --git a/Documentation/translations/zh_CN/process/submit-checklist.rst b/Documentation/translations/zh_CN/process/submit-checklist.rst
index f4785d2b0491..8738c55e42a2 100644
--- a/Documentation/translations/zh_CN/process/submit-checklist.rst
+++ b/Documentation/translations/zh_CN/process/submit-checklist.rst
@@ -97,7 +97,7 @@ Linux内核补ä¸æ交清å•
24) 所有内存å±éšœä¾‹å¦‚ ``barrier()``, ``rmb()``, ``wmb()`` 都需è¦æºä»£ç ä¸­çš„注
释æ¥è§£é‡Šå®ƒä»¬æ­£åœ¨æ‰§è¡Œçš„æ“作åŠå…¶åŽŸå› çš„逻辑。
-25) 如果补ä¸æ·»åŠ äº†ä»»ä½•ioctl,那么也è¦æ›´æ–° ``Documentation/ioctl/ioctl-number.txt``
+25) 如果补ä¸æ·»åŠ äº†ä»»ä½•ioctl,那么也è¦æ›´æ–° ``Documentation/ioctl/ioctl-number.rst``
26) 如果修改åŽçš„æºä»£ç ä¾èµ–或使用与以下 ``Kconfig`` 符å·ç›¸å…³çš„任何内核API或
功能,则在ç¦ç”¨ç›¸å…³ ``Kconfig`` 符å·å’Œ/或 ``=m`` (如果该选项å¯ç”¨ï¼‰çš„情况
diff --git a/Documentation/translations/zh_CN/process/submitting-drivers.rst b/Documentation/translations/zh_CN/process/submitting-drivers.rst
index 72f4f45c98de..d99885c27aed 100644
--- a/Documentation/translations/zh_CN/process/submitting-drivers.rst
+++ b/Documentation/translations/zh_CN/process/submitting-drivers.rst
@@ -97,7 +97,7 @@ Linux 2.6:
函数定义æˆè¿”回 -ENOSYS(功能未实现)错误。你还应该å°è¯•ç¡®
ä¿ä½ çš„驱动在什么都ä¸å¹²çš„情况下将耗电é™åˆ°æœ€ä½Žã€‚è¦èŽ·å¾—驱动
程åºæµ‹è¯•çš„指导,请å‚阅
- Documentation/power/drivers-testing.txt。有关驱动程åºç”µ
+ Documentation/power/drivers-testing.rst。有关驱动程åºç”µ
æºç®¡ç†é—®é¢˜ç›¸å¯¹å…¨é¢çš„概述,请å‚阅
Documentation/driver-api/pm/devices.rst。
diff --git a/Documentation/translations/zh_CN/sparse.txt b/Documentation/translations/zh_CN/sparse.txt
index 47fc4a06ebe8..0f444b83d639 100644
--- a/Documentation/translations/zh_CN/sparse.txt
+++ b/Documentation/translations/zh_CN/sparse.txt
@@ -73,10 +73,6 @@ __bitwise"类型。
git://git.kernel.org/pub/scm/linux/kernel/git/josh/sparse.git
-DaveJ 把æ¯å°æ—¶è‡ªåŠ¨ç”Ÿæˆçš„ git æºç æ ‘ tar 包放在以下地å€ï¼š
-
- http://www.codemonkey.org.uk/projects/git-snapshots/sparse/
-
一旦你下载了æºç ï¼Œåªè¦ä»¥æ™®é€šç”¨æˆ·èº«ä»½è¿è¡Œï¼š
make
diff --git a/Documentation/usb/acm.txt b/Documentation/usb/acm.rst
index e8bda98e9b51..e8bda98e9b51 100644
--- a/Documentation/usb/acm.txt
+++ b/Documentation/usb/acm.rst
diff --git a/Documentation/usb/authorization.txt b/Documentation/usb/authorization.rst
index 9e53909d04c2..9e53909d04c2 100644
--- a/Documentation/usb/authorization.txt
+++ b/Documentation/usb/authorization.rst
diff --git a/Documentation/usb/chipidea.txt b/Documentation/usb/chipidea.rst
index 68473abe2823..68473abe2823 100644
--- a/Documentation/usb/chipidea.txt
+++ b/Documentation/usb/chipidea.rst
diff --git a/Documentation/usb/dwc3.txt b/Documentation/usb/dwc3.rst
index f94a7ba16573..f94a7ba16573 100644
--- a/Documentation/usb/dwc3.txt
+++ b/Documentation/usb/dwc3.rst
diff --git a/Documentation/usb/ehci.txt b/Documentation/usb/ehci.rst
index 31f650e7c1b4..31f650e7c1b4 100644
--- a/Documentation/usb/ehci.txt
+++ b/Documentation/usb/ehci.rst
diff --git a/Documentation/usb/functionfs.txt b/Documentation/usb/functionfs.rst
index 7fdc6d840ac5..7fdc6d840ac5 100644
--- a/Documentation/usb/functionfs.txt
+++ b/Documentation/usb/functionfs.rst
diff --git a/Documentation/usb/gadget-testing.rst b/Documentation/usb/gadget-testing.rst
new file mode 100644
index 000000000000..2eeb3e9299e4
--- /dev/null
+++ b/Documentation/usb/gadget-testing.rst
@@ -0,0 +1,934 @@
+==============
+Gadget Testing
+==============
+
+This file summarizes information on basic testing of USB functions
+provided by gadgets.
+
+.. contents
+
+ 1. ACM function
+ 2. ECM function
+ 3. ECM subset function
+ 4. EEM function
+ 5. FFS function
+ 6. HID function
+ 7. LOOPBACK function
+ 8. MASS STORAGE function
+ 9. MIDI function
+ 10. NCM function
+ 11. OBEX function
+ 12. PHONET function
+ 13. RNDIS function
+ 14. SERIAL function
+ 15. SOURCESINK function
+ 16. UAC1 function (legacy implementation)
+ 17. UAC2 function
+ 18. UVC function
+ 19. PRINTER function
+ 20. UAC1 function (new API)
+
+
+1. ACM function
+===============
+
+The function is provided by usb_f_acm.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "acm".
+The ACM function provides just one attribute in its function directory:
+
+ port_num
+
+The attribute is read-only.
+
+There can be at most 4 ACM/generic serial/OBEX ports in the system.
+
+
+Testing the ACM function
+------------------------
+
+On the host::
+
+ cat > /dev/ttyACM<X>
+
+On the device::
+
+ cat /dev/ttyGS<Y>
+
+then the other way round
+
+On the device::
+
+ cat > /dev/ttyGS<Y>
+
+On the host::
+
+ cat /dev/ttyACM<X>
+
+2. ECM function
+===============
+
+The function is provided by usb_f_ecm.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "ecm".
+The ECM function provides these attributes in its function directory:
+
+ =============== ==================================================
+ ifname network device interface name associated with this
+ function instance
+ qmult queue length multiplier for high and super speed
+ host_addr MAC address of host's end of this
+ Ethernet over USB link
+ dev_addr MAC address of device's end of this
+ Ethernet over USB link
+ =============== ==================================================
+
+and after creating the functions/ecm.<instance name> they contain default
+values: qmult is 5, dev_addr and host_addr are randomly selected.
+Except for ifname they can be written to until the function is linked to a
+configuration. The ifname is read-only and contains the name of the interface
+which was assigned by the net core, e. g. usb0.
+
+Testing the ECM function
+------------------------
+
+Configure IP addresses of the device and the host. Then:
+
+On the device::
+
+ ping <host's IP>
+
+On the host::
+
+ ping <device's IP>
+
+3. ECM subset function
+======================
+
+The function is provided by usb_f_ecm_subset.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "geth".
+The ECM subset function provides these attributes in its function directory:
+
+ =============== ==================================================
+ ifname network device interface name associated with this
+ function instance
+ qmult queue length multiplier for high and super speed
+ host_addr MAC address of host's end of this
+ Ethernet over USB link
+ dev_addr MAC address of device's end of this
+ Ethernet over USB link
+ =============== ==================================================
+
+and after creating the functions/ecm.<instance name> they contain default
+values: qmult is 5, dev_addr and host_addr are randomly selected.
+Except for ifname they can be written to until the function is linked to a
+configuration. The ifname is read-only and contains the name of the interface
+which was assigned by the net core, e. g. usb0.
+
+Testing the ECM subset function
+-------------------------------
+
+Configure IP addresses of the device and the host. Then:
+
+On the device::
+
+ ping <host's IP>
+
+On the host::
+
+ ping <device's IP>
+
+4. EEM function
+===============
+
+The function is provided by usb_f_eem.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "eem".
+The EEM function provides these attributes in its function directory:
+
+ =============== ==================================================
+ ifname network device interface name associated with this
+ function instance
+ qmult queue length multiplier for high and super speed
+ host_addr MAC address of host's end of this
+ Ethernet over USB link
+ dev_addr MAC address of device's end of this
+ Ethernet over USB link
+ =============== ==================================================
+
+and after creating the functions/eem.<instance name> they contain default
+values: qmult is 5, dev_addr and host_addr are randomly selected.
+Except for ifname they can be written to until the function is linked to a
+configuration. The ifname is read-only and contains the name of the interface
+which was assigned by the net core, e. g. usb0.
+
+Testing the EEM function
+------------------------
+
+Configure IP addresses of the device and the host. Then:
+
+On the device::
+
+ ping <host's IP>
+
+On the host::
+
+ ping <device's IP>
+
+5. FFS function
+===============
+
+The function is provided by usb_f_fs.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "ffs".
+The function directory is intentionally empty and not modifiable.
+
+After creating the directory there is a new instance (a "device") of FunctionFS
+available in the system. Once a "device" is available, the user should follow
+the standard procedure for using FunctionFS (mount it, run the userspace
+process which implements the function proper). The gadget should be enabled
+by writing a suitable string to usb_gadget/<gadget>/UDC.
+
+Testing the FFS function
+------------------------
+
+On the device: start the function's userspace daemon, enable the gadget
+
+On the host: use the USB function provided by the device
+
+6. HID function
+===============
+
+The function is provided by usb_f_hid.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "hid".
+The HID function provides these attributes in its function directory:
+
+ =============== ===========================================
+ protocol HID protocol to use
+ report_desc data to be used in HID reports, except data
+ passed with /dev/hidg<X>
+ report_length HID report length
+ subclass HID subclass to use
+ =============== ===========================================
+
+For a keyboard the protocol and the subclass are 1, the report_length is 8,
+while the report_desc is::
+
+ $ hd my_report_desc
+ 00000000 05 01 09 06 a1 01 05 07 19 e0 29 e7 15 00 25 01 |..........)...%.|
+ 00000010 75 01 95 08 81 02 95 01 75 08 81 03 95 05 75 01 |u.......u.....u.|
+ 00000020 05 08 19 01 29 05 91 02 95 01 75 03 91 03 95 06 |....).....u.....|
+ 00000030 75 08 15 00 25 65 05 07 19 00 29 65 81 00 c0 |u...%e....)e...|
+ 0000003f
+
+Such a sequence of bytes can be stored to the attribute with echo::
+
+ $ echo -ne \\x05\\x01\\x09\\x06\\xa1.....
+
+Testing the HID function
+------------------------
+
+Device:
+
+- create the gadget
+- connect the gadget to a host, preferably not the one used
+ to control the gadget
+- run a program which writes to /dev/hidg<N>, e.g.
+ a userspace program found in Documentation/usb/gadget_hid.rst::
+
+ $ ./hid_gadget_test /dev/hidg0 keyboard
+
+Host:
+
+- observe the keystrokes from the gadget
+
+7. LOOPBACK function
+====================
+
+The function is provided by usb_f_ss_lb.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "Loopback".
+The LOOPBACK function provides these attributes in its function directory:
+
+ =============== =======================
+ qlen depth of loopback queue
+ bulk_buflen buffer length
+ =============== =======================
+
+Testing the LOOPBACK function
+-----------------------------
+
+device: run the gadget
+
+host: test-usb (tools/usb/testusb.c)
+
+8. MASS STORAGE function
+========================
+
+The function is provided by usb_f_mass_storage.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "mass_storage".
+The MASS STORAGE function provides these attributes in its directory:
+files:
+
+ =============== ==============================================
+ stall Set to permit function to halt bulk endpoints.
+ Disabled on some USB devices known not to work
+ correctly. You should set it to true.
+ num_buffers Number of pipeline buffers. Valid numbers
+ are 2..4. Available only if
+ CONFIG_USB_GADGET_DEBUG_FILES is set.
+ =============== ==============================================
+
+and a default lun.0 directory corresponding to SCSI LUN #0.
+
+A new lun can be added with mkdir::
+
+ $ mkdir functions/mass_storage.0/partition.5
+
+Lun numbering does not have to be continuous, except for lun #0 which is
+created by default. A maximum of 8 luns can be specified and they all must be
+named following the <name>.<number> scheme. The numbers can be 0..8.
+Probably a good convention is to name the luns "lun.<number>",
+although it is not mandatory.
+
+In each lun directory there are the following attribute files:
+
+ =============== ==============================================
+ file The path to the backing file for the LUN.
+ Required if LUN is not marked as removable.
+ ro Flag specifying access to the LUN shall be
+ read-only. This is implied if CD-ROM emulation
+ is enabled as well as when it was impossible
+ to open "filename" in R/W mode.
+ removable Flag specifying that LUN shall be indicated as
+ being removable.
+ cdrom Flag specifying that LUN shall be reported as
+ being a CD-ROM.
+ nofua Flag specifying that FUA flag
+ in SCSI WRITE(10,12)
+ =============== ==============================================
+
+Testing the MASS STORAGE function
+---------------------------------
+
+device: connect the gadget, enable it
+host: dmesg, see the USB drives appear (if system configured to automatically
+mount)
+
+9. MIDI function
+================
+
+The function is provided by usb_f_midi.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "midi".
+The MIDI function provides these attributes in its function directory:
+
+ =============== ====================================
+ buflen MIDI buffer length
+ id ID string for the USB MIDI adapter
+ in_ports number of MIDI input ports
+ index index value for the USB MIDI adapter
+ out_ports number of MIDI output ports
+ qlen USB read request queue length
+ =============== ====================================
+
+Testing the MIDI function
+-------------------------
+
+There are two cases: playing a mid from the gadget to
+the host and playing a mid from the host to the gadget.
+
+1) Playing a mid from the gadget to the host:
+
+host::
+
+ $ arecordmidi -l
+ Port Client name Port name
+ 14:0 Midi Through Midi Through Port-0
+ 24:0 MIDI Gadget MIDI Gadget MIDI 1
+ $ arecordmidi -p 24:0 from_gadget.mid
+
+gadget::
+
+ $ aplaymidi -l
+ Port Client name Port name
+ 20:0 f_midi f_midi
+
+ $ aplaymidi -p 20:0 to_host.mid
+
+2) Playing a mid from the host to the gadget
+
+gadget::
+
+ $ arecordmidi -l
+ Port Client name Port name
+ 20:0 f_midi f_midi
+
+ $ arecordmidi -p 20:0 from_host.mid
+
+host::
+
+ $ aplaymidi -l
+ Port Client name Port name
+ 14:0 Midi Through Midi Through Port-0
+ 24:0 MIDI Gadget MIDI Gadget MIDI 1
+
+ $ aplaymidi -p24:0 to_gadget.mid
+
+The from_gadget.mid should sound identical to the to_host.mid.
+
+The from_host.id should sound identical to the to_gadget.mid.
+
+MIDI files can be played to speakers/headphones with e.g. timidity installed::
+
+ $ aplaymidi -l
+ Port Client name Port name
+ 14:0 Midi Through Midi Through Port-0
+ 24:0 MIDI Gadget MIDI Gadget MIDI 1
+ 128:0 TiMidity TiMidity port 0
+ 128:1 TiMidity TiMidity port 1
+ 128:2 TiMidity TiMidity port 2
+ 128:3 TiMidity TiMidity port 3
+
+ $ aplaymidi -p 128:0 file.mid
+
+MIDI ports can be logically connected using the aconnect utility, e.g.::
+
+ $ aconnect 24:0 128:0 # try it on the host
+
+After the gadget's MIDI port is connected to timidity's MIDI port,
+whatever is played at the gadget side with aplaymidi -l is audible
+in host's speakers/headphones.
+
+10. NCM function
+================
+
+The function is provided by usb_f_ncm.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "ncm".
+The NCM function provides these attributes in its function directory:
+
+ =============== ==================================================
+ ifname network device interface name associated with this
+ function instance
+ qmult queue length multiplier for high and super speed
+ host_addr MAC address of host's end of this
+ Ethernet over USB link
+ dev_addr MAC address of device's end of this
+ Ethernet over USB link
+ =============== ==================================================
+
+and after creating the functions/ncm.<instance name> they contain default
+values: qmult is 5, dev_addr and host_addr are randomly selected.
+Except for ifname they can be written to until the function is linked to a
+configuration. The ifname is read-only and contains the name of the interface
+which was assigned by the net core, e. g. usb0.
+
+Testing the NCM function
+------------------------
+
+Configure IP addresses of the device and the host. Then:
+
+On the device::
+
+ ping <host's IP>
+
+On the host::
+
+ ping <device's IP>
+
+11. OBEX function
+=================
+
+The function is provided by usb_f_obex.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "obex".
+The OBEX function provides just one attribute in its function directory:
+
+ port_num
+
+The attribute is read-only.
+
+There can be at most 4 ACM/generic serial/OBEX ports in the system.
+
+Testing the OBEX function
+-------------------------
+
+On device::
+
+ seriald -f /dev/ttyGS<Y> -s 1024
+
+On host::
+
+ serialc -v <vendorID> -p <productID> -i<interface#> -a1 -s1024 \
+ -t<out endpoint addr> -r<in endpoint addr>
+
+where seriald and serialc are Felipe's utilities found here:
+
+ https://github.com/felipebalbi/usb-tools.git master
+
+12. PHONET function
+===================
+
+The function is provided by usb_f_phonet.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "phonet".
+The PHONET function provides just one attribute in its function directory:
+
+ =============== ==================================================
+ ifname network device interface name associated with this
+ function instance
+ =============== ==================================================
+
+Testing the PHONET function
+---------------------------
+
+It is not possible to test the SOCK_STREAM protocol without a specific piece
+of hardware, so only SOCK_DGRAM has been tested. For the latter to work,
+in the past I had to apply the patch mentioned here:
+
+http://www.spinics.net/lists/linux-usb/msg85689.html
+
+These tools are required:
+
+git://git.gitorious.org/meego-cellular/phonet-utils.git
+
+On the host::
+
+ $ ./phonet -a 0x10 -i usbpn0
+ $ ./pnroute add 0x6c usbpn0
+ $./pnroute add 0x10 usbpn0
+ $ ifconfig usbpn0 up
+
+On the device::
+
+ $ ./phonet -a 0x6c -i upnlink0
+ $ ./pnroute add 0x10 upnlink0
+ $ ifconfig upnlink0 up
+
+Then a test program can be used::
+
+ http://www.spinics.net/lists/linux-usb/msg85690.html
+
+On the device::
+
+ $ ./pnxmit -a 0x6c -r
+
+On the host::
+
+ $ ./pnxmit -a 0x10 -s 0x6c
+
+As a result some data should be sent from host to device.
+Then the other way round:
+
+On the host::
+
+ $ ./pnxmit -a 0x10 -r
+
+On the device::
+
+ $ ./pnxmit -a 0x6c -s 0x10
+
+13. RNDIS function
+==================
+
+The function is provided by usb_f_rndis.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "rndis".
+The RNDIS function provides these attributes in its function directory:
+
+ =============== ==================================================
+ ifname network device interface name associated with this
+ function instance
+ qmult queue length multiplier for high and super speed
+ host_addr MAC address of host's end of this
+ Ethernet over USB link
+ dev_addr MAC address of device's end of this
+ Ethernet over USB link
+ =============== ==================================================
+
+and after creating the functions/rndis.<instance name> they contain default
+values: qmult is 5, dev_addr and host_addr are randomly selected.
+Except for ifname they can be written to until the function is linked to a
+configuration. The ifname is read-only and contains the name of the interface
+which was assigned by the net core, e. g. usb0.
+
+Testing the RNDIS function
+--------------------------
+
+Configure IP addresses of the device and the host. Then:
+
+On the device::
+
+ ping <host's IP>
+
+On the host::
+
+ ping <device's IP>
+
+14. SERIAL function
+===================
+
+The function is provided by usb_f_gser.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "gser".
+The SERIAL function provides just one attribute in its function directory:
+
+ port_num
+
+The attribute is read-only.
+
+There can be at most 4 ACM/generic serial/OBEX ports in the system.
+
+Testing the SERIAL function
+---------------------------
+
+On host::
+
+ insmod usbserial
+ echo VID PID >/sys/bus/usb-serial/drivers/generic/new_id
+
+On host::
+
+ cat > /dev/ttyUSB<X>
+
+On target::
+
+ cat /dev/ttyGS<Y>
+
+then the other way round
+
+On target::
+
+ cat > /dev/ttyGS<Y>
+
+On host::
+
+ cat /dev/ttyUSB<X>
+
+15. SOURCESINK function
+=======================
+
+The function is provided by usb_f_ss_lb.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "SourceSink".
+The SOURCESINK function provides these attributes in its function directory:
+
+ =============== ==================================
+ pattern 0 (all zeros), 1 (mod63), 2 (none)
+ isoc_interval 1..16
+ isoc_maxpacket 0 - 1023 (fs), 0 - 1024 (hs/ss)
+ isoc_mult 0..2 (hs/ss only)
+ isoc_maxburst 0..15 (ss only)
+ bulk_buflen buffer length
+ bulk_qlen depth of queue for bulk
+ iso_qlen depth of queue for iso
+ =============== ==================================
+
+Testing the SOURCESINK function
+-------------------------------
+
+device: run the gadget
+
+host: test-usb (tools/usb/testusb.c)
+
+
+16. UAC1 function (legacy implementation)
+=========================================
+
+The function is provided by usb_f_uac1_legacy.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory
+is "uac1_legacy".
+The uac1 function provides these attributes in its function directory:
+
+ =============== ====================================
+ audio_buf_size audio buffer size
+ fn_cap capture pcm device file name
+ fn_cntl control device file name
+ fn_play playback pcm device file name
+ req_buf_size ISO OUT endpoint request buffer size
+ req_count ISO OUT endpoint request count
+ =============== ====================================
+
+The attributes have sane default values.
+
+Testing the UAC1 function
+-------------------------
+
+device: run the gadget
+
+host::
+
+ aplay -l # should list our USB Audio Gadget
+
+17. UAC2 function
+=================
+
+The function is provided by usb_f_uac2.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "uac2".
+The uac2 function provides these attributes in its function directory:
+
+ =============== ====================================================
+ c_chmask capture channel mask
+ c_srate capture sampling rate
+ c_ssize capture sample size (bytes)
+ p_chmask playback channel mask
+ p_srate playback sampling rate
+ p_ssize playback sample size (bytes)
+ req_number the number of pre-allocated request for both capture
+ and playback
+ =============== ====================================================
+
+The attributes have sane default values.
+
+Testing the UAC2 function
+-------------------------
+
+device: run the gadget
+host: aplay -l # should list our USB Audio Gadget
+
+This function does not require real hardware support, it just
+sends a stream of audio data to/from the host. In order to
+actually hear something at the device side, a command similar
+to this must be used at the device side::
+
+ $ arecord -f dat -t wav -D hw:2,0 | aplay -D hw:0,0 &
+
+e.g.::
+
+ $ arecord -f dat -t wav -D hw:CARD=UAC2Gadget,DEV=0 | \
+ aplay -D default:CARD=OdroidU3
+
+18. UVC function
+================
+
+The function is provided by usb_f_uvc.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "uvc".
+The uvc function provides these attributes in its function directory:
+
+ =================== ================================================
+ streaming_interval interval for polling endpoint for data transfers
+ streaming_maxburst bMaxBurst for super speed companion descriptor
+ streaming_maxpacket maximum packet size this endpoint is capable of
+ sending or receiving when this configuration is
+ selected
+ =================== ================================================
+
+There are also "control" and "streaming" subdirectories, each of which contain
+a number of their subdirectories. There are some sane defaults provided, but
+the user must provide the following:
+
+ ================== ====================================================
+ control header create in control/header, link from control/class/fs
+ and/or control/class/ss
+ streaming header create in streaming/header, link from
+ streaming/class/fs and/or streaming/class/hs and/or
+ streaming/class/ss
+ format description create in streaming/mjpeg and/or
+ streaming/uncompressed
+ frame description create in streaming/mjpeg/<format> and/or in
+ streaming/uncompressed/<format>
+ ================== ====================================================
+
+Each frame description contains frame interval specification, and each
+such specification consists of a number of lines with an inverval value
+in each line. The rules stated above are best illustrated with an example::
+
+ # mkdir functions/uvc.usb0/control/header/h
+ # cd functions/uvc.usb0/control/
+ # ln -s header/h class/fs
+ # ln -s header/h class/ss
+ # mkdir -p functions/uvc.usb0/streaming/uncompressed/u/360p
+ # cat <<EOF > functions/uvc.usb0/streaming/uncompressed/u/360p/dwFrameInterval
+ 666666
+ 1000000
+ 5000000
+ EOF
+ # cd $GADGET_CONFIGFS_ROOT
+ # mkdir functions/uvc.usb0/streaming/header/h
+ # cd functions/uvc.usb0/streaming/header/h
+ # ln -s ../../uncompressed/u
+ # cd ../../class/fs
+ # ln -s ../../header/h
+ # cd ../../class/hs
+ # ln -s ../../header/h
+ # cd ../../class/ss
+ # ln -s ../../header/h
+
+
+Testing the UVC function
+------------------------
+
+device: run the gadget, modprobe vivid::
+
+ # uvc-gadget -u /dev/video<uvc video node #> -v /dev/video<vivid video node #>
+
+where uvc-gadget is this program:
+ http://git.ideasonboard.org/uvc-gadget.git
+
+with these patches:
+
+ http://www.spinics.net/lists/linux-usb/msg99220.html
+
+host::
+
+ luvcview -f yuv
+
+19. PRINTER function
+====================
+
+The function is provided by usb_f_printer.ko module.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "printer".
+The printer function provides these attributes in its function directory:
+
+ ========== ===========================================
+ pnp_string Data to be passed to the host in pnp string
+ q_len Number of requests per endpoint
+ ========== ===========================================
+
+Testing the PRINTER function
+----------------------------
+
+The most basic testing:
+
+device: run the gadget::
+
+ # ls -l /devices/virtual/usb_printer_gadget/
+
+should show g_printer<number>.
+
+If udev is active, then /dev/g_printer<number> should appear automatically.
+
+host:
+
+If udev is active, then e.g. /dev/usb/lp0 should appear.
+
+host->device transmission:
+
+device::
+
+ # cat /dev/g_printer<number>
+
+host::
+
+ # cat > /dev/usb/lp0
+
+device->host transmission::
+
+ # cat > /dev/g_printer<number>
+
+host::
+
+ # cat /dev/usb/lp0
+
+More advanced testing can be done with the prn_example
+described in Documentation/usb/gadget_printer.rst.
+
+
+20. UAC1 function (virtual ALSA card, using u_audio API)
+========================================================
+
+The function is provided by usb_f_uac1.ko module.
+It will create a virtual ALSA card and the audio streams are simply
+sinked to and sourced from it.
+
+Function-specific configfs interface
+------------------------------------
+
+The function name to use when creating the function directory is "uac1".
+The uac1 function provides these attributes in its function directory:
+
+ ========== ====================================================
+ c_chmask capture channel mask
+ c_srate capture sampling rate
+ c_ssize capture sample size (bytes)
+ p_chmask playback channel mask
+ p_srate playback sampling rate
+ p_ssize playback sample size (bytes)
+ req_number the number of pre-allocated request for both capture
+ and playback
+ ========== ====================================================
+
+The attributes have sane default values.
+
+Testing the UAC1 function
+-------------------------
+
+device: run the gadget
+host: aplay -l # should list our USB Audio Gadget
+
+This function does not require real hardware support, it just
+sends a stream of audio data to/from the host. In order to
+actually hear something at the device side, a command similar
+to this must be used at the device side::
+
+ $ arecord -f dat -t wav -D hw:2,0 | aplay -D hw:0,0 &
+
+e.g.::
+
+ $ arecord -f dat -t wav -D hw:CARD=UAC1Gadget,DEV=0 | \
+ aplay -D default:CARD=OdroidU3
diff --git a/Documentation/usb/gadget-testing.txt b/Documentation/usb/gadget-testing.txt
deleted file mode 100644
index 7d7f2340af42..000000000000
--- a/Documentation/usb/gadget-testing.txt
+++ /dev/null
@@ -1,934 +0,0 @@
-==============
-Gadget Testing
-==============
-
-This file summarizes information on basic testing of USB functions
-provided by gadgets.
-
-.. contents
-
- 1. ACM function
- 2. ECM function
- 3. ECM subset function
- 4. EEM function
- 5. FFS function
- 6. HID function
- 7. LOOPBACK function
- 8. MASS STORAGE function
- 9. MIDI function
- 10. NCM function
- 11. OBEX function
- 12. PHONET function
- 13. RNDIS function
- 14. SERIAL function
- 15. SOURCESINK function
- 16. UAC1 function (legacy implementation)
- 17. UAC2 function
- 18. UVC function
- 19. PRINTER function
- 20. UAC1 function (new API)
-
-
-1. ACM function
-===============
-
-The function is provided by usb_f_acm.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "acm".
-The ACM function provides just one attribute in its function directory:
-
- port_num
-
-The attribute is read-only.
-
-There can be at most 4 ACM/generic serial/OBEX ports in the system.
-
-
-Testing the ACM function
-------------------------
-
-On the host::
-
- cat > /dev/ttyACM<X>
-
-On the device::
-
- cat /dev/ttyGS<Y>
-
-then the other way round
-
-On the device::
-
- cat > /dev/ttyGS<Y>
-
-On the host::
-
- cat /dev/ttyACM<X>
-
-2. ECM function
-===============
-
-The function is provided by usb_f_ecm.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "ecm".
-The ECM function provides these attributes in its function directory:
-
- =============== ==================================================
- ifname network device interface name associated with this
- function instance
- qmult queue length multiplier for high and super speed
- host_addr MAC address of host's end of this
- Ethernet over USB link
- dev_addr MAC address of device's end of this
- Ethernet over USB link
- =============== ==================================================
-
-and after creating the functions/ecm.<instance name> they contain default
-values: qmult is 5, dev_addr and host_addr are randomly selected.
-Except for ifname they can be written to until the function is linked to a
-configuration. The ifname is read-only and contains the name of the interface
-which was assigned by the net core, e. g. usb0.
-
-Testing the ECM function
-------------------------
-
-Configure IP addresses of the device and the host. Then:
-
-On the device::
-
- ping <host's IP>
-
-On the host::
-
- ping <device's IP>
-
-3. ECM subset function
-======================
-
-The function is provided by usb_f_ecm_subset.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "geth".
-The ECM subset function provides these attributes in its function directory:
-
- =============== ==================================================
- ifname network device interface name associated with this
- function instance
- qmult queue length multiplier for high and super speed
- host_addr MAC address of host's end of this
- Ethernet over USB link
- dev_addr MAC address of device's end of this
- Ethernet over USB link
- =============== ==================================================
-
-and after creating the functions/ecm.<instance name> they contain default
-values: qmult is 5, dev_addr and host_addr are randomly selected.
-Except for ifname they can be written to until the function is linked to a
-configuration. The ifname is read-only and contains the name of the interface
-which was assigned by the net core, e. g. usb0.
-
-Testing the ECM subset function
--------------------------------
-
-Configure IP addresses of the device and the host. Then:
-
-On the device::
-
- ping <host's IP>
-
-On the host::
-
- ping <device's IP>
-
-4. EEM function
-===============
-
-The function is provided by usb_f_eem.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "eem".
-The EEM function provides these attributes in its function directory:
-
- =============== ==================================================
- ifname network device interface name associated with this
- function instance
- qmult queue length multiplier for high and super speed
- host_addr MAC address of host's end of this
- Ethernet over USB link
- dev_addr MAC address of device's end of this
- Ethernet over USB link
- =============== ==================================================
-
-and after creating the functions/eem.<instance name> they contain default
-values: qmult is 5, dev_addr and host_addr are randomly selected.
-Except for ifname they can be written to until the function is linked to a
-configuration. The ifname is read-only and contains the name of the interface
-which was assigned by the net core, e. g. usb0.
-
-Testing the EEM function
-------------------------
-
-Configure IP addresses of the device and the host. Then:
-
-On the device::
-
- ping <host's IP>
-
-On the host::
-
- ping <device's IP>
-
-5. FFS function
-===============
-
-The function is provided by usb_f_fs.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "ffs".
-The function directory is intentionally empty and not modifiable.
-
-After creating the directory there is a new instance (a "device") of FunctionFS
-available in the system. Once a "device" is available, the user should follow
-the standard procedure for using FunctionFS (mount it, run the userspace
-process which implements the function proper). The gadget should be enabled
-by writing a suitable string to usb_gadget/<gadget>/UDC.
-
-Testing the FFS function
-------------------------
-
-On the device: start the function's userspace daemon, enable the gadget
-
-On the host: use the USB function provided by the device
-
-6. HID function
-===============
-
-The function is provided by usb_f_hid.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "hid".
-The HID function provides these attributes in its function directory:
-
- =============== ===========================================
- protocol HID protocol to use
- report_desc data to be used in HID reports, except data
- passed with /dev/hidg<X>
- report_length HID report length
- subclass HID subclass to use
- =============== ===========================================
-
-For a keyboard the protocol and the subclass are 1, the report_length is 8,
-while the report_desc is::
-
- $ hd my_report_desc
- 00000000 05 01 09 06 a1 01 05 07 19 e0 29 e7 15 00 25 01 |..........)...%.|
- 00000010 75 01 95 08 81 02 95 01 75 08 81 03 95 05 75 01 |u.......u.....u.|
- 00000020 05 08 19 01 29 05 91 02 95 01 75 03 91 03 95 06 |....).....u.....|
- 00000030 75 08 15 00 25 65 05 07 19 00 29 65 81 00 c0 |u...%e....)e...|
- 0000003f
-
-Such a sequence of bytes can be stored to the attribute with echo::
-
- $ echo -ne \\x05\\x01\\x09\\x06\\xa1.....
-
-Testing the HID function
-------------------------
-
-Device:
-
-- create the gadget
-- connect the gadget to a host, preferably not the one used
- to control the gadget
-- run a program which writes to /dev/hidg<N>, e.g.
- a userspace program found in Documentation/usb/gadget_hid.txt::
-
- $ ./hid_gadget_test /dev/hidg0 keyboard
-
-Host:
-
-- observe the keystrokes from the gadget
-
-7. LOOPBACK function
-====================
-
-The function is provided by usb_f_ss_lb.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "Loopback".
-The LOOPBACK function provides these attributes in its function directory:
-
- =============== =======================
- qlen depth of loopback queue
- bulk_buflen buffer length
- =============== =======================
-
-Testing the LOOPBACK function
------------------------------
-
-device: run the gadget
-
-host: test-usb (tools/usb/testusb.c)
-
-8. MASS STORAGE function
-========================
-
-The function is provided by usb_f_mass_storage.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "mass_storage".
-The MASS STORAGE function provides these attributes in its directory:
-files:
-
- =============== ==============================================
- stall Set to permit function to halt bulk endpoints.
- Disabled on some USB devices known not to work
- correctly. You should set it to true.
- num_buffers Number of pipeline buffers. Valid numbers
- are 2..4. Available only if
- CONFIG_USB_GADGET_DEBUG_FILES is set.
- =============== ==============================================
-
-and a default lun.0 directory corresponding to SCSI LUN #0.
-
-A new lun can be added with mkdir::
-
- $ mkdir functions/mass_storage.0/partition.5
-
-Lun numbering does not have to be continuous, except for lun #0 which is
-created by default. A maximum of 8 luns can be specified and they all must be
-named following the <name>.<number> scheme. The numbers can be 0..8.
-Probably a good convention is to name the luns "lun.<number>",
-although it is not mandatory.
-
-In each lun directory there are the following attribute files:
-
- =============== ==============================================
- file The path to the backing file for the LUN.
- Required if LUN is not marked as removable.
- ro Flag specifying access to the LUN shall be
- read-only. This is implied if CD-ROM emulation
- is enabled as well as when it was impossible
- to open "filename" in R/W mode.
- removable Flag specifying that LUN shall be indicated as
- being removable.
- cdrom Flag specifying that LUN shall be reported as
- being a CD-ROM.
- nofua Flag specifying that FUA flag
- in SCSI WRITE(10,12)
- =============== ==============================================
-
-Testing the MASS STORAGE function
----------------------------------
-
-device: connect the gadget, enable it
-host: dmesg, see the USB drives appear (if system configured to automatically
-mount)
-
-9. MIDI function
-================
-
-The function is provided by usb_f_midi.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "midi".
-The MIDI function provides these attributes in its function directory:
-
- =============== ====================================
- buflen MIDI buffer length
- id ID string for the USB MIDI adapter
- in_ports number of MIDI input ports
- index index value for the USB MIDI adapter
- out_ports number of MIDI output ports
- qlen USB read request queue length
- =============== ====================================
-
-Testing the MIDI function
--------------------------
-
-There are two cases: playing a mid from the gadget to
-the host and playing a mid from the host to the gadget.
-
-1) Playing a mid from the gadget to the host:
-
-host::
-
- $ arecordmidi -l
- Port Client name Port name
- 14:0 Midi Through Midi Through Port-0
- 24:0 MIDI Gadget MIDI Gadget MIDI 1
- $ arecordmidi -p 24:0 from_gadget.mid
-
-gadget::
-
- $ aplaymidi -l
- Port Client name Port name
- 20:0 f_midi f_midi
-
- $ aplaymidi -p 20:0 to_host.mid
-
-2) Playing a mid from the host to the gadget
-
-gadget::
-
- $ arecordmidi -l
- Port Client name Port name
- 20:0 f_midi f_midi
-
- $ arecordmidi -p 20:0 from_host.mid
-
-host::
-
- $ aplaymidi -l
- Port Client name Port name
- 14:0 Midi Through Midi Through Port-0
- 24:0 MIDI Gadget MIDI Gadget MIDI 1
-
- $ aplaymidi -p24:0 to_gadget.mid
-
-The from_gadget.mid should sound identical to the to_host.mid.
-
-The from_host.id should sound identical to the to_gadget.mid.
-
-MIDI files can be played to speakers/headphones with e.g. timidity installed::
-
- $ aplaymidi -l
- Port Client name Port name
- 14:0 Midi Through Midi Through Port-0
- 24:0 MIDI Gadget MIDI Gadget MIDI 1
- 128:0 TiMidity TiMidity port 0
- 128:1 TiMidity TiMidity port 1
- 128:2 TiMidity TiMidity port 2
- 128:3 TiMidity TiMidity port 3
-
- $ aplaymidi -p 128:0 file.mid
-
-MIDI ports can be logically connected using the aconnect utility, e.g.::
-
- $ aconnect 24:0 128:0 # try it on the host
-
-After the gadget's MIDI port is connected to timidity's MIDI port,
-whatever is played at the gadget side with aplaymidi -l is audible
-in host's speakers/headphones.
-
-10. NCM function
-================
-
-The function is provided by usb_f_ncm.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "ncm".
-The NCM function provides these attributes in its function directory:
-
- =============== ==================================================
- ifname network device interface name associated with this
- function instance
- qmult queue length multiplier for high and super speed
- host_addr MAC address of host's end of this
- Ethernet over USB link
- dev_addr MAC address of device's end of this
- Ethernet over USB link
- =============== ==================================================
-
-and after creating the functions/ncm.<instance name> they contain default
-values: qmult is 5, dev_addr and host_addr are randomly selected.
-Except for ifname they can be written to until the function is linked to a
-configuration. The ifname is read-only and contains the name of the interface
-which was assigned by the net core, e. g. usb0.
-
-Testing the NCM function
-------------------------
-
-Configure IP addresses of the device and the host. Then:
-
-On the device::
-
- ping <host's IP>
-
-On the host::
-
- ping <device's IP>
-
-11. OBEX function
-=================
-
-The function is provided by usb_f_obex.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "obex".
-The OBEX function provides just one attribute in its function directory:
-
- port_num
-
-The attribute is read-only.
-
-There can be at most 4 ACM/generic serial/OBEX ports in the system.
-
-Testing the OBEX function
--------------------------
-
-On device::
-
- seriald -f /dev/ttyGS<Y> -s 1024
-
-On host::
-
- serialc -v <vendorID> -p <productID> -i<interface#> -a1 -s1024 \
- -t<out endpoint addr> -r<in endpoint addr>
-
-where seriald and serialc are Felipe's utilities found here:
-
- https://github.com/felipebalbi/usb-tools.git master
-
-12. PHONET function
-===================
-
-The function is provided by usb_f_phonet.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "phonet".
-The PHONET function provides just one attribute in its function directory:
-
- =============== ==================================================
- ifname network device interface name associated with this
- function instance
- =============== ==================================================
-
-Testing the PHONET function
----------------------------
-
-It is not possible to test the SOCK_STREAM protocol without a specific piece
-of hardware, so only SOCK_DGRAM has been tested. For the latter to work,
-in the past I had to apply the patch mentioned here:
-
-http://www.spinics.net/lists/linux-usb/msg85689.html
-
-These tools are required:
-
-git://git.gitorious.org/meego-cellular/phonet-utils.git
-
-On the host::
-
- $ ./phonet -a 0x10 -i usbpn0
- $ ./pnroute add 0x6c usbpn0
- $./pnroute add 0x10 usbpn0
- $ ifconfig usbpn0 up
-
-On the device::
-
- $ ./phonet -a 0x6c -i upnlink0
- $ ./pnroute add 0x10 upnlink0
- $ ifconfig upnlink0 up
-
-Then a test program can be used::
-
- http://www.spinics.net/lists/linux-usb/msg85690.html
-
-On the device::
-
- $ ./pnxmit -a 0x6c -r
-
-On the host::
-
- $ ./pnxmit -a 0x10 -s 0x6c
-
-As a result some data should be sent from host to device.
-Then the other way round:
-
-On the host::
-
- $ ./pnxmit -a 0x10 -r
-
-On the device::
-
- $ ./pnxmit -a 0x6c -s 0x10
-
-13. RNDIS function
-==================
-
-The function is provided by usb_f_rndis.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "rndis".
-The RNDIS function provides these attributes in its function directory:
-
- =============== ==================================================
- ifname network device interface name associated with this
- function instance
- qmult queue length multiplier for high and super speed
- host_addr MAC address of host's end of this
- Ethernet over USB link
- dev_addr MAC address of device's end of this
- Ethernet over USB link
- =============== ==================================================
-
-and after creating the functions/rndis.<instance name> they contain default
-values: qmult is 5, dev_addr and host_addr are randomly selected.
-Except for ifname they can be written to until the function is linked to a
-configuration. The ifname is read-only and contains the name of the interface
-which was assigned by the net core, e. g. usb0.
-
-Testing the RNDIS function
---------------------------
-
-Configure IP addresses of the device and the host. Then:
-
-On the device::
-
- ping <host's IP>
-
-On the host::
-
- ping <device's IP>
-
-14. SERIAL function
-===================
-
-The function is provided by usb_f_gser.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "gser".
-The SERIAL function provides just one attribute in its function directory:
-
- port_num
-
-The attribute is read-only.
-
-There can be at most 4 ACM/generic serial/OBEX ports in the system.
-
-Testing the SERIAL function
----------------------------
-
-On host::
-
- insmod usbserial
- echo VID PID >/sys/bus/usb-serial/drivers/generic/new_id
-
-On host::
-
- cat > /dev/ttyUSB<X>
-
-On target::
-
- cat /dev/ttyGS<Y>
-
-then the other way round
-
-On target::
-
- cat > /dev/ttyGS<Y>
-
-On host::
-
- cat /dev/ttyUSB<X>
-
-15. SOURCESINK function
-=======================
-
-The function is provided by usb_f_ss_lb.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "SourceSink".
-The SOURCESINK function provides these attributes in its function directory:
-
- =============== ==================================
- pattern 0 (all zeros), 1 (mod63), 2 (none)
- isoc_interval 1..16
- isoc_maxpacket 0 - 1023 (fs), 0 - 1024 (hs/ss)
- isoc_mult 0..2 (hs/ss only)
- isoc_maxburst 0..15 (ss only)
- bulk_buflen buffer length
- bulk_qlen depth of queue for bulk
- iso_qlen depth of queue for iso
- =============== ==================================
-
-Testing the SOURCESINK function
--------------------------------
-
-device: run the gadget
-
-host: test-usb (tools/usb/testusb.c)
-
-
-16. UAC1 function (legacy implementation)
-=========================================
-
-The function is provided by usb_f_uac1_legacy.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory
-is "uac1_legacy".
-The uac1 function provides these attributes in its function directory:
-
- =============== ====================================
- audio_buf_size audio buffer size
- fn_cap capture pcm device file name
- fn_cntl control device file name
- fn_play playback pcm device file name
- req_buf_size ISO OUT endpoint request buffer size
- req_count ISO OUT endpoint request count
- =============== ====================================
-
-The attributes have sane default values.
-
-Testing the UAC1 function
--------------------------
-
-device: run the gadget
-
-host::
-
- aplay -l # should list our USB Audio Gadget
-
-17. UAC2 function
-=================
-
-The function is provided by usb_f_uac2.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "uac2".
-The uac2 function provides these attributes in its function directory:
-
- =============== ====================================================
- c_chmask capture channel mask
- c_srate capture sampling rate
- c_ssize capture sample size (bytes)
- p_chmask playback channel mask
- p_srate playback sampling rate
- p_ssize playback sample size (bytes)
- req_number the number of pre-allocated request for both capture
- and playback
- =============== ====================================================
-
-The attributes have sane default values.
-
-Testing the UAC2 function
--------------------------
-
-device: run the gadget
-host: aplay -l # should list our USB Audio Gadget
-
-This function does not require real hardware support, it just
-sends a stream of audio data to/from the host. In order to
-actually hear something at the device side, a command similar
-to this must be used at the device side::
-
- $ arecord -f dat -t wav -D hw:2,0 | aplay -D hw:0,0 &
-
-e.g.::
-
- $ arecord -f dat -t wav -D hw:CARD=UAC2Gadget,DEV=0 | \
- aplay -D default:CARD=OdroidU3
-
-18. UVC function
-================
-
-The function is provided by usb_f_uvc.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "uvc".
-The uvc function provides these attributes in its function directory:
-
- =================== ================================================
- streaming_interval interval for polling endpoint for data transfers
- streaming_maxburst bMaxBurst for super speed companion descriptor
- streaming_maxpacket maximum packet size this endpoint is capable of
- sending or receiving when this configuration is
- selected
- =================== ================================================
-
-There are also "control" and "streaming" subdirectories, each of which contain
-a number of their subdirectories. There are some sane defaults provided, but
-the user must provide the following:
-
- ================== ====================================================
- control header create in control/header, link from control/class/fs
- and/or control/class/ss
- streaming header create in streaming/header, link from
- streaming/class/fs and/or streaming/class/hs and/or
- streaming/class/ss
- format description create in streaming/mjpeg and/or
- streaming/uncompressed
- frame description create in streaming/mjpeg/<format> and/or in
- streaming/uncompressed/<format>
- ================== ====================================================
-
-Each frame description contains frame interval specification, and each
-such specification consists of a number of lines with an inverval value
-in each line. The rules stated above are best illustrated with an example::
-
- # mkdir functions/uvc.usb0/control/header/h
- # cd functions/uvc.usb0/control/
- # ln -s header/h class/fs
- # ln -s header/h class/ss
- # mkdir -p functions/uvc.usb0/streaming/uncompressed/u/360p
- # cat <<EOF > functions/uvc.usb0/streaming/uncompressed/u/360p/dwFrameInterval
- 666666
- 1000000
- 5000000
- EOF
- # cd $GADGET_CONFIGFS_ROOT
- # mkdir functions/uvc.usb0/streaming/header/h
- # cd functions/uvc.usb0/streaming/header/h
- # ln -s ../../uncompressed/u
- # cd ../../class/fs
- # ln -s ../../header/h
- # cd ../../class/hs
- # ln -s ../../header/h
- # cd ../../class/ss
- # ln -s ../../header/h
-
-
-Testing the UVC function
-------------------------
-
-device: run the gadget, modprobe vivid::
-
- # uvc-gadget -u /dev/video<uvc video node #> -v /dev/video<vivid video node #>
-
-where uvc-gadget is this program:
- http://git.ideasonboard.org/uvc-gadget.git
-
-with these patches:
-
- http://www.spinics.net/lists/linux-usb/msg99220.html
-
-host::
-
- luvcview -f yuv
-
-19. PRINTER function
-====================
-
-The function is provided by usb_f_printer.ko module.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "printer".
-The printer function provides these attributes in its function directory:
-
- ========== ===========================================
- pnp_string Data to be passed to the host in pnp string
- q_len Number of requests per endpoint
- ========== ===========================================
-
-Testing the PRINTER function
-----------------------------
-
-The most basic testing:
-
-device: run the gadget::
-
- # ls -l /devices/virtual/usb_printer_gadget/
-
-should show g_printer<number>.
-
-If udev is active, then /dev/g_printer<number> should appear automatically.
-
-host:
-
-If udev is active, then e.g. /dev/usb/lp0 should appear.
-
-host->device transmission:
-
-device::
-
- # cat /dev/g_printer<number>
-
-host::
-
- # cat > /dev/usb/lp0
-
-device->host transmission::
-
- # cat > /dev/g_printer<number>
-
-host::
-
- # cat /dev/usb/lp0
-
-More advanced testing can be done with the prn_example
-described in Documentation/usb/gadget_printer.txt.
-
-
-20. UAC1 function (virtual ALSA card, using u_audio API)
-========================================================
-
-The function is provided by usb_f_uac1.ko module.
-It will create a virtual ALSA card and the audio streams are simply
-sinked to and sourced from it.
-
-Function-specific configfs interface
-------------------------------------
-
-The function name to use when creating the function directory is "uac1".
-The uac1 function provides these attributes in its function directory:
-
- ========== ====================================================
- c_chmask capture channel mask
- c_srate capture sampling rate
- c_ssize capture sample size (bytes)
- p_chmask playback channel mask
- p_srate playback sampling rate
- p_ssize playback sample size (bytes)
- req_number the number of pre-allocated request for both capture
- and playback
- ========== ====================================================
-
-The attributes have sane default values.
-
-Testing the UAC1 function
--------------------------
-
-device: run the gadget
-host: aplay -l # should list our USB Audio Gadget
-
-This function does not require real hardware support, it just
-sends a stream of audio data to/from the host. In order to
-actually hear something at the device side, a command similar
-to this must be used at the device side::
-
- $ arecord -f dat -t wav -D hw:2,0 | aplay -D hw:0,0 &
-
-e.g.::
-
- $ arecord -f dat -t wav -D hw:CARD=UAC1Gadget,DEV=0 | \
- aplay -D default:CARD=OdroidU3
diff --git a/Documentation/usb/gadget_configfs.txt b/Documentation/usb/gadget_configfs.rst
index 54fb08baae22..54fb08baae22 100644
--- a/Documentation/usb/gadget_configfs.txt
+++ b/Documentation/usb/gadget_configfs.rst
diff --git a/Documentation/usb/gadget_hid.txt b/Documentation/usb/gadget_hid.rst
index 098d563040cc..098d563040cc 100644
--- a/Documentation/usb/gadget_hid.txt
+++ b/Documentation/usb/gadget_hid.rst
diff --git a/Documentation/usb/gadget_multi.txt b/Documentation/usb/gadget_multi.rst
index 9806b55af301..9806b55af301 100644
--- a/Documentation/usb/gadget_multi.txt
+++ b/Documentation/usb/gadget_multi.rst
diff --git a/Documentation/usb/gadget_printer.txt b/Documentation/usb/gadget_printer.rst
index 5e5516c69075..5e5516c69075 100644
--- a/Documentation/usb/gadget_printer.txt
+++ b/Documentation/usb/gadget_printer.rst
diff --git a/Documentation/usb/gadget_serial.txt b/Documentation/usb/gadget_serial.rst
index dce8bc1fb1f2..dce8bc1fb1f2 100644
--- a/Documentation/usb/gadget_serial.txt
+++ b/Documentation/usb/gadget_serial.rst
diff --git a/Documentation/usb/index.rst b/Documentation/usb/index.rst
new file mode 100644
index 000000000000..e55386a4abfb
--- /dev/null
+++ b/Documentation/usb/index.rst
@@ -0,0 +1,39 @@
+===========
+USB support
+===========
+
+.. toctree::
+ :maxdepth: 1
+
+ acm
+ authorization
+ chipidea
+ dwc3
+ ehci
+ functionfs
+ gadget_configfs
+ gadget_hid
+ gadget_multi
+ gadget_printer
+ gadget_serial
+ gadget-testing
+ iuu_phoenix
+ mass-storage
+ misc_usbsevseg
+ mtouchusb
+ ohci
+ rio
+ usbip_protocol
+ usbmon
+ usb-serial
+ wusb-design-overview
+
+ usb-help
+ text_files
+
+.. only:: subproject and html
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/usb/iuu_phoenix.txt b/Documentation/usb/iuu_phoenix.rst
index b76268728450..b76268728450 100644
--- a/Documentation/usb/iuu_phoenix.txt
+++ b/Documentation/usb/iuu_phoenix.rst
diff --git a/Documentation/usb/mass-storage.txt b/Documentation/usb/mass-storage.rst
index d181b47c3cb6..d181b47c3cb6 100644
--- a/Documentation/usb/mass-storage.txt
+++ b/Documentation/usb/mass-storage.rst
diff --git a/Documentation/usb/misc_usbsevseg.txt b/Documentation/usb/misc_usbsevseg.rst
index 6274aee083ed..6274aee083ed 100644
--- a/Documentation/usb/misc_usbsevseg.txt
+++ b/Documentation/usb/misc_usbsevseg.rst
diff --git a/Documentation/usb/mtouchusb.txt b/Documentation/usb/mtouchusb.rst
index d1111b74bf75..d1111b74bf75 100644
--- a/Documentation/usb/mtouchusb.txt
+++ b/Documentation/usb/mtouchusb.rst
diff --git a/Documentation/usb/ohci.txt b/Documentation/usb/ohci.rst
index bb3c49719e6b..bb3c49719e6b 100644
--- a/Documentation/usb/ohci.txt
+++ b/Documentation/usb/ohci.rst
diff --git a/Documentation/usb/rio.txt b/Documentation/usb/rio.rst
index ea73475471db..ea73475471db 100644
--- a/Documentation/usb/rio.txt
+++ b/Documentation/usb/rio.rst
diff --git a/Documentation/usb/text_files.rst b/Documentation/usb/text_files.rst
new file mode 100644
index 000000000000..6a8d3fcf64b6
--- /dev/null
+++ b/Documentation/usb/text_files.rst
@@ -0,0 +1,29 @@
+Linux CDC ACM inf
+-----------------
+
+.. include:: linux-cdc-acm.inf
+ :literal:
+
+Linux inf
+---------
+
+.. include:: linux.inf
+ :literal:
+
+USB devfs drop permissions source
+---------------------------------
+
+.. literalinclude:: usbdevfs-drop-permissions.c
+ :language: c
+
+WUSB command line script to manipulate auth credentials
+-------------------------------------------------------
+
+.. literalinclude:: wusb-cbaf
+ :language: shell
+
+Credits
+-------
+
+.. include:: CREDITS
+ :literal:
diff --git a/Documentation/usb/usb-help.txt b/Documentation/usb/usb-help.rst
index dc23ecd4d802..dc23ecd4d802 100644
--- a/Documentation/usb/usb-help.txt
+++ b/Documentation/usb/usb-help.rst
diff --git a/Documentation/usb/usb-serial.txt b/Documentation/usb/usb-serial.rst
index 8fa7dbd3da9a..8fa7dbd3da9a 100644
--- a/Documentation/usb/usb-serial.txt
+++ b/Documentation/usb/usb-serial.rst
diff --git a/Documentation/usb/usbip_protocol.txt b/Documentation/usb/usbip_protocol.rst
index 988c832166cd..988c832166cd 100644
--- a/Documentation/usb/usbip_protocol.txt
+++ b/Documentation/usb/usbip_protocol.rst
diff --git a/Documentation/usb/usbmon.txt b/Documentation/usb/usbmon.rst
index b0bd51080799..b0bd51080799 100644
--- a/Documentation/usb/usbmon.txt
+++ b/Documentation/usb/usbmon.rst
diff --git a/Documentation/usb/WUSB-Design-overview.txt b/Documentation/usb/wusb-design-overview.rst
index dc5e21609bb5..dc5e21609bb5 100644
--- a/Documentation/usb/WUSB-Design-overview.txt
+++ b/Documentation/usb/wusb-design-overview.rst
diff --git a/Documentation/userspace-api/accelerators/ocxl.rst b/Documentation/userspace-api/accelerators/ocxl.rst
new file mode 100644
index 000000000000..14cefc020e2d
--- /dev/null
+++ b/Documentation/userspace-api/accelerators/ocxl.rst
@@ -0,0 +1,176 @@
+========================================================
+OpenCAPI (Open Coherent Accelerator Processor Interface)
+========================================================
+
+OpenCAPI is an interface between processors and accelerators. It aims
+at being low-latency and high-bandwidth. The specification is
+developed by the `OpenCAPI Consortium <http://opencapi.org/>`_.
+
+It allows an accelerator (which could be a FPGA, ASICs, ...) to access
+the host memory coherently, using virtual addresses. An OpenCAPI
+device can also host its own memory, that can be accessed from the
+host.
+
+OpenCAPI is known in linux as 'ocxl', as the open, processor-agnostic
+evolution of 'cxl' (the driver for the IBM CAPI interface for
+powerpc), which was named that way to avoid confusion with the ISDN
+CAPI subsystem.
+
+
+High-level view
+===============
+
+OpenCAPI defines a Data Link Layer (DL) and Transaction Layer (TL), to
+be implemented on top of a physical link. Any processor or device
+implementing the DL and TL can start sharing memory.
+
+::
+
+ +-----------+ +-------------+
+ | | | |
+ | | | Accelerated |
+ | Processor | | Function |
+ | | +--------+ | Unit | +--------+
+ | |--| Memory | | (AFU) |--| Memory |
+ | | +--------+ | | +--------+
+ +-----------+ +-------------+
+ | |
+ +-----------+ +-------------+
+ | TL | | TLX |
+ +-----------+ +-------------+
+ | |
+ +-----------+ +-------------+
+ | DL | | DLX |
+ +-----------+ +-------------+
+ | |
+ | PHY |
+ +---------------------------------------+
+
+
+
+Device discovery
+================
+
+OpenCAPI relies on a PCI-like configuration space, implemented on the
+device. So the host can discover AFUs by querying the config space.
+
+OpenCAPI devices in Linux are treated like PCI devices (with a few
+caveats). The firmware is expected to abstract the hardware as if it
+was a PCI link. A lot of the existing PCI infrastructure is reused:
+devices are scanned and BARs are assigned during the standard PCI
+enumeration. Commands like 'lspci' can therefore be used to see what
+devices are available.
+
+The configuration space defines the AFU(s) that can be found on the
+physical adapter, such as its name, how many memory contexts it can
+work with, the size of its MMIO areas, ...
+
+
+
+MMIO
+====
+
+OpenCAPI defines two MMIO areas for each AFU:
+
+* the global MMIO area, with registers pertinent to the whole AFU.
+* a per-process MMIO area, which has a fixed size for each context.
+
+
+
+AFU interrupts
+==============
+
+OpenCAPI includes the possibility for an AFU to send an interrupt to a
+host process. It is done through a 'intrp_req' defined in the
+Transaction Layer, specifying a 64-bit object handle which defines the
+interrupt.
+
+The driver allows a process to allocate an interrupt and obtain its
+64-bit object handle, that can be passed to the AFU.
+
+
+
+char devices
+============
+
+The driver creates one char device per AFU found on the physical
+device. A physical device may have multiple functions and each
+function can have multiple AFUs. At the time of this writing though,
+it has only been tested with devices exporting only one AFU.
+
+Char devices can be found in /dev/ocxl/ and are named as:
+/dev/ocxl/<AFU name>.<location>.<index>
+
+where <AFU name> is a max 20-character long name, as found in the
+config space of the AFU.
+<location> is added by the driver and can help distinguish devices
+when a system has more than one instance of the same OpenCAPI device.
+<index> is also to help distinguish AFUs in the unlikely case where a
+device carries multiple copies of the same AFU.
+
+
+
+Sysfs class
+===========
+
+An ocxl class is added for the devices representing the AFUs. See
+/sys/class/ocxl. The layout is described in
+Documentation/ABI/testing/sysfs-class-ocxl
+
+
+
+User API
+========
+
+open
+----
+
+Based on the AFU definition found in the config space, an AFU may
+support working with more than one memory context, in which case the
+associated char device may be opened multiple times by different
+processes.
+
+
+ioctl
+-----
+
+OCXL_IOCTL_ATTACH:
+
+ Attach the memory context of the calling process to the AFU so that
+ the AFU can access its memory.
+
+OCXL_IOCTL_IRQ_ALLOC:
+
+ Allocate an AFU interrupt and return an identifier.
+
+OCXL_IOCTL_IRQ_FREE:
+
+ Free a previously allocated AFU interrupt.
+
+OCXL_IOCTL_IRQ_SET_FD:
+
+ Associate an event fd to an AFU interrupt so that the user process
+ can be notified when the AFU sends an interrupt.
+
+OCXL_IOCTL_GET_METADATA:
+
+ Obtains configuration information from the card, such at the size of
+ MMIO areas, the AFU version, and the PASID for the current context.
+
+OCXL_IOCTL_ENABLE_P9_WAIT:
+
+ Allows the AFU to wake a userspace thread executing 'wait'. Returns
+ information to userspace to allow it to configure the AFU. Note that
+ this is only available on POWER9.
+
+OCXL_IOCTL_GET_FEATURES:
+
+ Reports on which CPU features that affect OpenCAPI are usable from
+ userspace.
+
+
+mmap
+----
+
+A process can mmap the per-process MMIO area for interactions with the
+AFU.
diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index a3233da7fa88..ad494da40009 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -20,6 +20,7 @@ place where this information is gathered.
seccomp_filter
unshare
spec_ctrl
+ accelerators/ocxl
.. only:: subproject and html
diff --git a/Documentation/vfio-mediated-device.txt b/Documentation/vfio-mediated-device.txt
deleted file mode 100644
index c3f69bcaf96e..000000000000
--- a/Documentation/vfio-mediated-device.txt
+++ /dev/null
@@ -1,414 +0,0 @@
-.. include:: <isonum.txt>
-
-=====================
-VFIO Mediated devices
-=====================
-
-:Copyright: |copy| 2016, NVIDIA CORPORATION. All rights reserved.
-:Author: Neo Jia <cjia@nvidia.com>
-:Author: Kirti Wankhede <kwankhede@nvidia.com>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License version 2 as
-published by the Free Software Foundation.
-
-
-Virtual Function I/O (VFIO) Mediated devices[1]
-===============================================
-
-The number of use cases for virtualizing DMA devices that do not have built-in
-SR_IOV capability is increasing. Previously, to virtualize such devices,
-developers had to create their own management interfaces and APIs, and then
-integrate them with user space software. To simplify integration with user space
-software, we have identified common requirements and a unified management
-interface for such devices.
-
-The VFIO driver framework provides unified APIs for direct device access. It is
-an IOMMU/device-agnostic framework for exposing direct device access to user
-space in a secure, IOMMU-protected environment. This framework is used for
-multiple devices, such as GPUs, network adapters, and compute accelerators. With
-direct device access, virtual machines or user space applications have direct
-access to the physical device. This framework is reused for mediated devices.
-
-The mediated core driver provides a common interface for mediated device
-management that can be used by drivers of different devices. This module
-provides a generic interface to perform these operations:
-
-* Create and destroy a mediated device
-* Add a mediated device to and remove it from a mediated bus driver
-* Add a mediated device to and remove it from an IOMMU group
-
-The mediated core driver also provides an interface to register a bus driver.
-For example, the mediated VFIO mdev driver is designed for mediated devices and
-supports VFIO APIs. The mediated bus driver adds a mediated device to and
-removes it from a VFIO group.
-
-The following high-level block diagram shows the main components and interfaces
-in the VFIO mediated driver framework. The diagram shows NVIDIA, Intel, and IBM
-devices as examples, as these devices are the first devices to use this module::
-
- +---------------+
- | |
- | +-----------+ | mdev_register_driver() +--------------+
- | | | +<------------------------+ |
- | | mdev | | | |
- | | bus | +------------------------>+ vfio_mdev.ko |<-> VFIO user
- | | driver | | probe()/remove() | | APIs
- | | | | +--------------+
- | +-----------+ |
- | |
- | MDEV CORE |
- | MODULE |
- | mdev.ko |
- | +-----------+ | mdev_register_device() +--------------+
- | | | +<------------------------+ |
- | | | | | nvidia.ko |<-> physical
- | | | +------------------------>+ | device
- | | | | callbacks +--------------+
- | | Physical | |
- | | device | | mdev_register_device() +--------------+
- | | interface | |<------------------------+ |
- | | | | | i915.ko |<-> physical
- | | | +------------------------>+ | device
- | | | | callbacks +--------------+
- | | | |
- | | | | mdev_register_device() +--------------+
- | | | +<------------------------+ |
- | | | | | ccw_device.ko|<-> physical
- | | | +------------------------>+ | device
- | | | | callbacks +--------------+
- | +-----------+ |
- +---------------+
-
-
-Registration Interfaces
-=======================
-
-The mediated core driver provides the following types of registration
-interfaces:
-
-* Registration interface for a mediated bus driver
-* Physical device driver interface
-
-Registration Interface for a Mediated Bus Driver
-------------------------------------------------
-
-The registration interface for a mediated bus driver provides the following
-structure to represent a mediated device's driver::
-
- /*
- * struct mdev_driver [2] - Mediated device's driver
- * @name: driver name
- * @probe: called when new device created
- * @remove: called when device removed
- * @driver: device driver structure
- */
- struct mdev_driver {
- const char *name;
- int (*probe) (struct device *dev);
- void (*remove) (struct device *dev);
- struct device_driver driver;
- };
-
-A mediated bus driver for mdev should use this structure in the function calls
-to register and unregister itself with the core driver:
-
-* Register::
-
- extern int mdev_register_driver(struct mdev_driver *drv,
- struct module *owner);
-
-* Unregister::
-
- extern void mdev_unregister_driver(struct mdev_driver *drv);
-
-The mediated bus driver is responsible for adding mediated devices to the VFIO
-group when devices are bound to the driver and removing mediated devices from
-the VFIO when devices are unbound from the driver.
-
-
-Physical Device Driver Interface
---------------------------------
-
-The physical device driver interface provides the mdev_parent_ops[3] structure
-to define the APIs to manage work in the mediated core driver that is related
-to the physical device.
-
-The structures in the mdev_parent_ops structure are as follows:
-
-* dev_attr_groups: attributes of the parent device
-* mdev_attr_groups: attributes of the mediated device
-* supported_config: attributes to define supported configurations
-
-The functions in the mdev_parent_ops structure are as follows:
-
-* create: allocate basic resources in a driver for a mediated device
-* remove: free resources in a driver when a mediated device is destroyed
-
-(Note that mdev-core provides no implicit serialization of create/remove
-callbacks per mdev parent device, per mdev type, or any other categorization.
-Vendor drivers are expected to be fully asynchronous in this respect or
-provide their own internal resource protection.)
-
-The callbacks in the mdev_parent_ops structure are as follows:
-
-* open: open callback of mediated device
-* close: close callback of mediated device
-* ioctl: ioctl callback of mediated device
-* read : read emulation callback
-* write: write emulation callback
-* mmap: mmap emulation callback
-
-A driver should use the mdev_parent_ops structure in the function call to
-register itself with the mdev core driver::
-
- extern int mdev_register_device(struct device *dev,
- const struct mdev_parent_ops *ops);
-
-However, the mdev_parent_ops structure is not required in the function call
-that a driver should use to unregister itself with the mdev core driver::
-
- extern void mdev_unregister_device(struct device *dev);
-
-
-Mediated Device Management Interface Through sysfs
-==================================================
-
-The management interface through sysfs enables user space software, such as
-libvirt, to query and configure mediated devices in a hardware-agnostic fashion.
-This management interface provides flexibility to the underlying physical
-device's driver to support features such as:
-
-* Mediated device hot plug
-* Multiple mediated devices in a single virtual machine
-* Multiple mediated devices from different physical devices
-
-Links in the mdev_bus Class Directory
--------------------------------------
-The /sys/class/mdev_bus/ directory contains links to devices that are registered
-with the mdev core driver.
-
-Directories and files under the sysfs for Each Physical Device
---------------------------------------------------------------
-
-::
-
- |- [parent physical device]
- |--- Vendor-specific-attributes [optional]
- |--- [mdev_supported_types]
- | |--- [<type-id>]
- | | |--- create
- | | |--- name
- | | |--- available_instances
- | | |--- device_api
- | | |--- description
- | | |--- [devices]
- | |--- [<type-id>]
- | | |--- create
- | | |--- name
- | | |--- available_instances
- | | |--- device_api
- | | |--- description
- | | |--- [devices]
- | |--- [<type-id>]
- | |--- create
- | |--- name
- | |--- available_instances
- | |--- device_api
- | |--- description
- | |--- [devices]
-
-* [mdev_supported_types]
-
- The list of currently supported mediated device types and their details.
-
- [<type-id>], device_api, and available_instances are mandatory attributes
- that should be provided by vendor driver.
-
-* [<type-id>]
-
- The [<type-id>] name is created by adding the device driver string as a prefix
- to the string provided by the vendor driver. This format of this name is as
- follows::
-
- sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name);
-
- (or using mdev_parent_dev(mdev) to arrive at the parent device outside
- of the core mdev code)
-
-* device_api
-
- This attribute should show which device API is being created, for example,
- "vfio-pci" for a PCI device.
-
-* available_instances
-
- This attribute should show the number of devices of type <type-id> that can be
- created.
-
-* [device]
-
- This directory contains links to the devices of type <type-id> that have been
- created.
-
-* name
-
- This attribute should show human readable name. This is optional attribute.
-
-* description
-
- This attribute should show brief features/description of the type. This is
- optional attribute.
-
-Directories and Files Under the sysfs for Each mdev Device
-----------------------------------------------------------
-
-::
-
- |- [parent phy device]
- |--- [$MDEV_UUID]
- |--- remove
- |--- mdev_type {link to its type}
- |--- vendor-specific-attributes [optional]
-
-* remove (write only)
-
-Writing '1' to the 'remove' file destroys the mdev device. The vendor driver can
-fail the remove() callback if that device is active and the vendor driver
-doesn't support hot unplug.
-
-Example::
-
- # echo 1 > /sys/bus/mdev/devices/$mdev_UUID/remove
-
-Mediated device Hot plug
-------------------------
-
-Mediated devices can be created and assigned at runtime. The procedure to hot
-plug a mediated device is the same as the procedure to hot plug a PCI device.
-
-Translation APIs for Mediated Devices
-=====================================
-
-The following APIs are provided for translating user pfn to host pfn in a VFIO
-driver::
-
- extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
- int npage, int prot, unsigned long *phys_pfn);
-
- extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
- int npage);
-
-These functions call back into the back-end IOMMU module by using the pin_pages
-and unpin_pages callbacks of the struct vfio_iommu_driver_ops[4]. Currently
-these callbacks are supported in the TYPE1 IOMMU module. To enable them for
-other IOMMU backend modules, such as PPC64 sPAPR module, they need to provide
-these two callback functions.
-
-Using the Sample Code
-=====================
-
-mtty.c in samples/vfio-mdev/ directory is a sample driver program to
-demonstrate how to use the mediated device framework.
-
-The sample driver creates an mdev device that simulates a serial port over a PCI
-card.
-
-1. Build and load the mtty.ko module.
-
- This step creates a dummy device, /sys/devices/virtual/mtty/mtty/
-
- Files in this device directory in sysfs are similar to the following::
-
- # tree /sys/devices/virtual/mtty/mtty/
- /sys/devices/virtual/mtty/mtty/
- |-- mdev_supported_types
- | |-- mtty-1
- | | |-- available_instances
- | | |-- create
- | | |-- device_api
- | | |-- devices
- | | `-- name
- | `-- mtty-2
- | |-- available_instances
- | |-- create
- | |-- device_api
- | |-- devices
- | `-- name
- |-- mtty_dev
- | `-- sample_mtty_dev
- |-- power
- | |-- autosuspend_delay_ms
- | |-- control
- | |-- runtime_active_time
- | |-- runtime_status
- | `-- runtime_suspended_time
- |-- subsystem -> ../../../../class/mtty
- `-- uevent
-
-2. Create a mediated device by using the dummy device that you created in the
- previous step::
-
- # echo "83b8f4f2-509f-382f-3c1e-e6bfe0fa1001" > \
- /sys/devices/virtual/mtty/mtty/mdev_supported_types/mtty-2/create
-
-3. Add parameters to qemu-kvm::
-
- -device vfio-pci,\
- sysfsdev=/sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001
-
-4. Boot the VM.
-
- In the Linux guest VM, with no hardware on the host, the device appears
- as follows::
-
- # lspci -s 00:05.0 -xxvv
- 00:05.0 Serial controller: Device 4348:3253 (rev 10) (prog-if 02 [16550])
- Subsystem: Device 4348:3253
- Physical Slot: 5
- Control: I/O+ Mem- BusMaster- SpecCycle- MemWINV- VGASnoop- ParErr-
- Stepping- SERR- FastB2B- DisINTx-
- Status: Cap- 66MHz- UDF- FastB2B- ParErr- DEVSEL=medium >TAbort-
- <TAbort- <MAbort- >SERR- <PERR- INTx-
- Interrupt: pin A routed to IRQ 10
- Region 0: I/O ports at c150 [size=8]
- Region 1: I/O ports at c158 [size=8]
- Kernel driver in use: serial
- 00: 48 43 53 32 01 00 00 02 10 02 00 07 00 00 00 00
- 10: 51 c1 00 00 59 c1 00 00 00 00 00 00 00 00 00 00
- 20: 00 00 00 00 00 00 00 00 00 00 00 00 48 43 53 32
- 30: 00 00 00 00 00 00 00 00 00 00 00 00 0a 01 00 00
-
- In the Linux guest VM, dmesg output for the device is as follows:
-
- serial 0000:00:05.0: PCI INT A -> Link[LNKA] -> GSI 10 (level, high) -> IRQ 10
- 0000:00:05.0: ttyS1 at I/O 0xc150 (irq = 10) is a 16550A
- 0000:00:05.0: ttyS2 at I/O 0xc158 (irq = 10) is a 16550A
-
-
-5. In the Linux guest VM, check the serial ports::
-
- # setserial -g /dev/ttyS*
- /dev/ttyS0, UART: 16550A, Port: 0x03f8, IRQ: 4
- /dev/ttyS1, UART: 16550A, Port: 0xc150, IRQ: 10
- /dev/ttyS2, UART: 16550A, Port: 0xc158, IRQ: 10
-
-6. Using minicom or any terminal emulation program, open port /dev/ttyS1 or
- /dev/ttyS2 with hardware flow control disabled.
-
-7. Type data on the minicom terminal or send data to the terminal emulation
- program and read the data.
-
- Data is loop backed from hosts mtty driver.
-
-8. Destroy the mediated device that you created::
-
- # echo 1 > /sys/bus/mdev/devices/83b8f4f2-509f-382f-3c1e-e6bfe0fa1001/remove
-
-References
-==========
-
-1. See Documentation/vfio.txt for more information on VFIO.
-2. struct mdev_driver in include/linux/mdev.h
-3. struct mdev_parent_ops in include/linux/mdev.h
-4. struct vfio_iommu_driver_ops in include/linux/vfio.h
diff --git a/Documentation/virtual/index.rst b/Documentation/virtual/index.rst
new file mode 100644
index 000000000000..062ffb527043
--- /dev/null
+++ b/Documentation/virtual/index.rst
@@ -0,0 +1,18 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============================
+Linux Virtualization Support
+============================
+
+.. toctree::
+ :maxdepth: 2
+
+ kvm/index
+ paravirt_ops
+
+.. only:: html and subproject
+
+ Indices
+ =======
+
+ * :ref:`genindex`
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 68984c284c40..e54a3f51ddc5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1079,7 +1079,7 @@ yet and must be cleared on entry.
4.35 KVM_SET_USER_MEMORY_REGION
-Capability: KVM_CAP_USER_MEM
+Capability: KVM_CAP_USER_MEMORY
Architectures: all
Type: vm ioctl
Parameters: struct kvm_userspace_memory_region (in)
@@ -3857,43 +3857,59 @@ Type: vcpu ioctl
Parameters: struct kvm_nested_state (in/out)
Returns: 0 on success, -1 on error
Errors:
- E2BIG: the total state size (including the fixed-size part of struct
- kvm_nested_state) exceeds the value of 'size' specified by
+ E2BIG: the total state size exceeds the value of 'size' specified by
the user; the size required will be written into size.
struct kvm_nested_state {
__u16 flags;
__u16 format;
__u32 size;
+
union {
- struct kvm_vmx_nested_state vmx;
- struct kvm_svm_nested_state svm;
+ struct kvm_vmx_nested_state_hdr vmx;
+ struct kvm_svm_nested_state_hdr svm;
+
+ /* Pad the header to 128 bytes. */
__u8 pad[120];
- };
- __u8 data[0];
+ } hdr;
+
+ union {
+ struct kvm_vmx_nested_state_data vmx[0];
+ struct kvm_svm_nested_state_data svm[0];
+ } data;
};
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
+#define KVM_STATE_NESTED_EVMCS 0x00000004
+
+#define KVM_STATE_NESTED_FORMAT_VMX 0
+#define KVM_STATE_NESTED_FORMAT_SVM 1
-#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
-#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
+#define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000
-struct kvm_vmx_nested_state {
+#define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001
+#define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002
+
+struct kvm_vmx_nested_state_hdr {
__u64 vmxon_pa;
- __u64 vmcs_pa;
+ __u64 vmcs12_pa;
struct {
__u16 flags;
} smm;
};
+struct kvm_vmx_nested_state_data {
+ __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+ __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
+};
+
This ioctl copies the vcpu's nested virtualization state from the kernel to
userspace.
-The maximum size of the state, including the fixed-size part of struct
-kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to
-the KVM_CHECK_EXTENSION ioctl().
+The maximum size of the state can be retrieved by passing KVM_CAP_NESTED_STATE
+to the KVM_CHECK_EXTENSION ioctl().
4.115 KVM_SET_NESTED_STATE
@@ -3903,8 +3919,8 @@ Type: vcpu ioctl
Parameters: struct kvm_nested_state (in)
Returns: 0 on success, -1 on error
-This copies the vcpu's kvm_nested_state struct from userspace to the kernel. For
-the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
+This copies the vcpu's kvm_nested_state struct from userspace to the kernel.
+For the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
4.116 KVM_(UN)REGISTER_COALESCED_MMIO
@@ -4065,6 +4081,37 @@ KVM_ARM_VCPU_FINALIZE call.
See KVM_ARM_VCPU_INIT for details of vcpu features that require finalization
using this ioctl.
+4.120 KVM_SET_PMU_EVENT_FILTER
+
+Capability: KVM_CAP_PMU_EVENT_FILTER
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_pmu_event_filter (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_pmu_event_filter {
+ __u32 action;
+ __u32 nevents;
+ __u32 fixed_counter_bitmap;
+ __u32 flags;
+ __u32 pad[4];
+ __u64 events[0];
+};
+
+This ioctl restricts the set of PMU events that the guest can program.
+The argument holds a list of events which will be allowed or denied.
+The eventsel+umask of each event the guest attempts to program is compared
+against the events field to determine whether the guest should have access.
+The events field only controls general purpose counters; fixed purpose
+counters are controlled by the fixed_counter_bitmap.
+
+No flags are defined yet, the field must be zero.
+
+Valid values for 'action':
+#define KVM_PMU_EVENT_ALLOW 0
+#define KVM_PMU_EVENT_DENY 1
+
+
5. The kvm_run structure
------------------------
@@ -4893,6 +4940,8 @@ Valid bits in args[0] are
#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
#define KVM_X86_DISABLE_EXITS_HLT (1 << 1)
+#define KVM_X86_DISABLE_EXITS_PAUSE (1 << 2)
+#define KVM_X86_DISABLE_EXITS_CSTATE (1 << 3)
Enabling this capability on a VM provides userspace with a way to no
longer intercept some instructions for improved latency in some
diff --git a/Documentation/virtual/kvm/arm/psci.txt b/Documentation/virtual/kvm/arm/psci.txt
index aafdab887b04..559586fc9d37 100644
--- a/Documentation/virtual/kvm/arm/psci.txt
+++ b/Documentation/virtual/kvm/arm/psci.txt
@@ -28,3 +28,34 @@ The following register is defined:
- Allows any PSCI version implemented by KVM and compatible with
v0.2 to be set with SET_ONE_REG
- Affects the whole VM (even if the register view is per-vcpu)
+
+* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
+ Holds the state of the firmware support to mitigate CVE-2017-5715, as
+ offered by KVM to the guest via a HVC call. The workaround is described
+ under SMCCC_ARCH_WORKAROUND_1 in [1].
+ Accepted values are:
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL: KVM does not offer
+ firmware support for the workaround. The mitigation status for the
+ guest is unknown.
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL: The workaround HVC call is
+ available to the guest and required for the mitigation.
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED: The workaround HVC call
+ is available to the guest, but it is not needed on this VCPU.
+
+* KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
+ Holds the state of the firmware support to mitigate CVE-2018-3639, as
+ offered by KVM to the guest via a HVC call. The workaround is described
+ under SMCCC_ARCH_WORKAROUND_2 in [1].
+ Accepted values are:
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL: A workaround is not
+ available. KVM does not offer firmware support for the workaround.
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN: The workaround state is
+ unknown. KVM does not offer firmware support for the workaround.
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL: The workaround is available,
+ and can be disabled by a vCPU. If
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED is set, it is active for
+ this vCPU.
+ KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED: The workaround is
+ always active on this vCPU or it is not needed.
+
+[1] https://developer.arm.com/-/media/developer/pdf/ARM_DEN_0070A_Firmware_interfaces_for_mitigating_CVE-2017-5715.pdf
diff --git a/Documentation/virtual/kvm/cpuid.rst b/Documentation/virtual/kvm/cpuid.rst
new file mode 100644
index 000000000000..01b081f6e7ea
--- /dev/null
+++ b/Documentation/virtual/kvm/cpuid.rst
@@ -0,0 +1,107 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============
+KVM CPUID bits
+==============
+
+:Author: Glauber Costa <glommer@gmail.com>
+
+A guest running on a kvm host, can check some of its features using
+cpuid. This is not always guaranteed to work, since userspace can
+mask-out some, or even all KVM-related cpuid features before launching
+a guest.
+
+KVM cpuid functions are:
+
+function: KVM_CPUID_SIGNATURE (0x40000000)
+
+returns::
+
+ eax = 0x40000001
+ ebx = 0x4b4d564b
+ ecx = 0x564b4d56
+ edx = 0x4d
+
+Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
+The value in eax corresponds to the maximum cpuid function present in this leaf,
+and will be updated if more functions are added in the future.
+Note also that old hosts set eax value to 0x0. This should
+be interpreted as if the value was 0x40000001.
+This function queries the presence of KVM cpuid leafs.
+
+function: define KVM_CPUID_FEATURES (0x40000001)
+
+returns::
+
+ ebx, ecx
+ eax = an OR'ed group of (1 << flag)
+
+where ``flag`` is defined as below:
+
+================================= =========== ================================
+flag value meaning
+================================= =========== ================================
+KVM_FEATURE_CLOCKSOURCE 0 kvmclock available at msrs
+ 0x11 and 0x12
+
+KVM_FEATURE_NOP_IO_DELAY 1 not necessary to perform delays
+ on PIO operations
+
+KVM_FEATURE_MMU_OP 2 deprecated
+
+KVM_FEATURE_CLOCKSOURCE2 3 kvmclock available at msrs
+
+ 0x4b564d00 and 0x4b564d01
+KVM_FEATURE_ASYNC_PF 4 async pf can be enabled by
+ writing to msr 0x4b564d02
+
+KVM_FEATURE_STEAL_TIME 5 steal time can be enabled by
+ writing to msr 0x4b564d03
+
+KVM_FEATURE_PV_EOI 6 paravirtualized end of interrupt
+ handler can be enabled by
+ writing to msr 0x4b564d04
+
+KVM_FEATURE_PV_UNHAULT 7 guest checks this feature bit
+ before enabling paravirtualized
+ spinlock support
+
+KVM_FEATURE_PV_TLB_FLUSH 9 guest checks this feature bit
+ before enabling paravirtualized
+ tlb flush
+
+KVM_FEATURE_ASYNC_PF_VMEXIT 10 paravirtualized async PF VM EXIT
+ can be enabled by setting bit 2
+ when writing to msr 0x4b564d02
+
+KVM_FEATURE_PV_SEND_IPI 11 guest checks this feature bit
+ before enabling paravirtualized
+ sebd IPIs
+
+KVM_FEATURE_PV_POLL_CONTROL 12 host-side polling on HLT can
+ be disabled by writing
+ to msr 0x4b564d05.
+
+KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit
+ before using paravirtualized
+ sched yield.
+
+KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side
+ per-cpu warps are expeced in
+ kvmclock
+================================= =========== ================================
+
+::
+
+ edx = an OR'ed group of (1 << flag)
+
+Where ``flag`` here is defined as below:
+
+================== ============ =================================
+flag value meaning
+================== ============ =================================
+KVM_HINTS_REALTIME 0 guest checks this feature bit to
+ determine that vCPUs are never
+ preempted for an unlimited time
+ allowing optimizations
+================== ============ =================================
diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
deleted file mode 100644
index 97ca1940a0dc..000000000000
--- a/Documentation/virtual/kvm/cpuid.txt
+++ /dev/null
@@ -1,83 +0,0 @@
-KVM CPUID bits
-Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
-=====================================================
-
-A guest running on a kvm host, can check some of its features using
-cpuid. This is not always guaranteed to work, since userspace can
-mask-out some, or even all KVM-related cpuid features before launching
-a guest.
-
-KVM cpuid functions are:
-
-function: KVM_CPUID_SIGNATURE (0x40000000)
-returns : eax = 0x40000001,
- ebx = 0x4b4d564b,
- ecx = 0x564b4d56,
- edx = 0x4d.
-Note that this value in ebx, ecx and edx corresponds to the string "KVMKVMKVM".
-The value in eax corresponds to the maximum cpuid function present in this leaf,
-and will be updated if more functions are added in the future.
-Note also that old hosts set eax value to 0x0. This should
-be interpreted as if the value was 0x40000001.
-This function queries the presence of KVM cpuid leafs.
-
-
-function: define KVM_CPUID_FEATURES (0x40000001)
-returns : ebx, ecx
- eax = an OR'ed group of (1 << flag), where each flags is:
-
-
-flag || value || meaning
-=============================================================================
-KVM_FEATURE_CLOCKSOURCE || 0 || kvmclock available at msrs
- || || 0x11 and 0x12.
-------------------------------------------------------------------------------
-KVM_FEATURE_NOP_IO_DELAY || 1 || not necessary to perform delays
- || || on PIO operations.
-------------------------------------------------------------------------------
-KVM_FEATURE_MMU_OP || 2 || deprecated.
-------------------------------------------------------------------------------
-KVM_FEATURE_CLOCKSOURCE2 || 3 || kvmclock available at msrs
- || || 0x4b564d00 and 0x4b564d01
-------------------------------------------------------------------------------
-KVM_FEATURE_ASYNC_PF || 4 || async pf can be enabled by
- || || writing to msr 0x4b564d02
-------------------------------------------------------------------------------
-KVM_FEATURE_STEAL_TIME || 5 || steal time can be enabled by
- || || writing to msr 0x4b564d03.
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_EOI || 6 || paravirtualized end of interrupt
- || || handler can be enabled by writing
- || || to msr 0x4b564d04.
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit
- || || before enabling paravirtualized
- || || spinlock support.
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_TLB_FLUSH || 9 || guest checks this feature bit
- || || before enabling paravirtualized
- || || tlb flush.
-------------------------------------------------------------------------------
-KVM_FEATURE_ASYNC_PF_VMEXIT || 10 || paravirtualized async PF VM exit
- || || can be enabled by setting bit 2
- || || when writing to msr 0x4b564d02
-------------------------------------------------------------------------------
-KVM_FEATURE_PV_SEND_IPI || 11 || guest checks this feature bit
- || || before using paravirtualized
- || || send IPIs.
-------------------------------------------------------------------------------
-KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side
- || || per-cpu warps are expected in
- || || kvmclock.
-------------------------------------------------------------------------------
-
- edx = an OR'ed group of (1 << flag), where each flags is:
-
-
-flag || value || meaning
-==================================================================================
-KVM_HINTS_REALTIME || 0 || guest checks this feature bit to
- || || determine that vCPUs are never
- || || preempted for an unlimited time,
- || || allowing optimizations
-----------------------------------------------------------------------------------
diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt
index da24c138c8d1..da210651f714 100644
--- a/Documentation/virtual/kvm/hypercalls.txt
+++ b/Documentation/virtual/kvm/hypercalls.txt
@@ -141,3 +141,14 @@ a0 corresponds to the APIC ID in the third argument (a2), bit 1
corresponds to the APIC ID a2+1, and so on.
Returns the number of CPUs to which the IPIs were delivered successfully.
+
+7. KVM_HC_SCHED_YIELD
+------------------------
+Architecture: x86
+Status: active
+Purpose: Hypercall used to yield if the IPI target vCPU is preempted
+
+a0: destination APIC ID
+
+Usage example: When sending a call-function IPI-many to vCPUs, yield if
+any of the IPI target vCPUs was preempted.
diff --git a/Documentation/virtual/kvm/index.rst b/Documentation/virtual/kvm/index.rst
new file mode 100644
index 000000000000..0b206a06f5be
--- /dev/null
+++ b/Documentation/virtual/kvm/index.rst
@@ -0,0 +1,11 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===
+KVM
+===
+
+.. toctree::
+ :maxdepth: 2
+
+ amd-memory-encryption
+ cpuid
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 1bb8bcaf8497..635cd6eaf714 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -15,8 +15,6 @@ The acquisition orders for mutexes are as follows:
On x86, vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock.
-For spinlocks, kvm_lock is taken outside kvm->mmu_lock.
-
Everything else is a leaf: no other lock is taken inside the critical
sections.
@@ -169,7 +167,7 @@ which time it will be set using the Dirty tracking mechanism described above.
------------
Name: kvm_lock
-Type: spinlock_t
+Type: mutex
Arch: any
Protects: - vm_list
diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt
index f3f0d57ced8e..df1f4338b3ca 100644
--- a/Documentation/virtual/kvm/msr.txt
+++ b/Documentation/virtual/kvm/msr.txt
@@ -273,3 +273,12 @@ MSR_KVM_EOI_EN: 0x4b564d04
guest must both read the least significant bit in the memory area and
clear it using a single CPU instruction, such as test and clear, or
compare and exchange.
+
+MSR_KVM_POLL_CONTROL: 0x4b564d05
+ Control host-side polling.
+
+ data: Bit 0 enables (1) or disables (0) host-side HLT polling logic.
+
+ KVM guests can request the host not to poll on HLT, for example if
+ they are performing polling themselves.
+
diff --git a/Documentation/virtual/paravirt_ops.rst b/Documentation/virtual/paravirt_ops.rst
new file mode 100644
index 000000000000..6b789d27cead
--- /dev/null
+++ b/Documentation/virtual/paravirt_ops.rst
@@ -0,0 +1,35 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+============
+Paravirt_ops
+============
+
+Linux provides support for different hypervisor virtualization technologies.
+Historically different binary kernels would be required in order to support
+different hypervisors, this restriction was removed with pv_ops.
+Linux pv_ops is a virtualization API which enables support for different
+hypervisors. It allows each hypervisor to override critical operations and
+allows a single kernel binary to run on all supported execution environments
+including native machine -- without any hypervisors.
+
+pv_ops provides a set of function pointers which represent operations
+corresponding to low level critical instructions and high level
+functionalities in various areas. pv-ops allows for optimizations at run
+time by enabling binary patching of the low-ops critical operations
+at boot time.
+
+pv_ops operations are classified into three categories:
+
+- simple indirect call
+ These operations correspond to high level functionality where it is
+ known that the overhead of indirect call isn't very important.
+
+- indirect call which allows optimization with binary patch
+ Usually these operations correspond to low level critical instructions. They
+ are called frequently and are performance critical. The overhead is
+ very important.
+
+- a set of macros for hand written assembly code
+ Hand written assembly codes (.S files) also need paravirtualization
+ because they include sensitive instructions or some of code paths in
+ them are very performance critical.
diff --git a/Documentation/virtual/paravirt_ops.txt b/Documentation/virtual/paravirt_ops.txt
deleted file mode 100644
index d4881c00e339..000000000000
--- a/Documentation/virtual/paravirt_ops.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-Paravirt_ops
-============
-
-Linux provides support for different hypervisor virtualization technologies.
-Historically different binary kernels would be required in order to support
-different hypervisors, this restriction was removed with pv_ops.
-Linux pv_ops is a virtualization API which enables support for different
-hypervisors. It allows each hypervisor to override critical operations and
-allows a single kernel binary to run on all supported execution environments
-including native machine -- without any hypervisors.
-
-pv_ops provides a set of function pointers which represent operations
-corresponding to low level critical instructions and high level
-functionalities in various areas. pv-ops allows for optimizations at run
-time by enabling binary patching of the low-ops critical operations
-at boot time.
-
-pv_ops operations are classified into three categories:
-
-- simple indirect call
- These operations correspond to high level functionality where it is
- known that the overhead of indirect call isn't very important.
-
-- indirect call which allows optimization with binary patch
- Usually these operations correspond to low level critical instructions. They
- are called frequently and are performance critical. The overhead is
- very important.
-
-- a set of macros for hand written assembly code
- Hand written assembly codes (.S files) also need paravirtualization
- because they include sensitive instructions or some of code paths in
- them are very performance critical.
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 7cdf7282e022..7d90964abbb0 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -10,7 +10,7 @@ of this being specialized struct page for such memory (see sections 5 to 7 of
this document).
HMM also provides optional helpers for SVM (Share Virtual Memory), i.e.,
-allowing a device to transparently access program address coherently with
+allowing a device to transparently access program addresses coherently with
the CPU meaning that any valid pointer on the CPU is also a valid pointer
for the device. This is becoming mandatory to simplify the use of advanced
heterogeneous computing where GPU, DSP, or FPGA are used to perform various
@@ -22,8 +22,8 @@ expose the hardware limitations that are inherent to many platforms. The third
section gives an overview of the HMM design. The fourth section explains how
CPU page-table mirroring works and the purpose of HMM in this context. The
fifth section deals with how device memory is represented inside the kernel.
-Finally, the last section presents a new migration helper that allows lever-
-aging the device DMA engine.
+Finally, the last section presents a new migration helper that allows
+leveraging the device DMA engine.
.. contents:: :local:
@@ -39,20 +39,20 @@ address space. I use shared address space to refer to the opposite situation:
i.e., one in which any application memory region can be used by a device
transparently.
-Split address space happens because device can only access memory allocated
-through device specific API. This implies that all memory objects in a program
+Split address space happens because devices can only access memory allocated
+through a device specific API. This implies that all memory objects in a program
are not equal from the device point of view which complicates large programs
that rely on a wide set of libraries.
-Concretely this means that code that wants to leverage devices like GPUs needs
-to copy object between generically allocated memory (malloc, mmap private, mmap
+Concretely, this means that code that wants to leverage devices like GPUs needs
+to copy objects between generically allocated memory (malloc, mmap private, mmap
share) and memory allocated through the device driver API (this still ends up
with an mmap but of the device file).
For flat data sets (array, grid, image, ...) this isn't too hard to achieve but
-complex data sets (list, tree, ...) are hard to get right. Duplicating a
+for complex data sets (list, tree, ...) it's hard to get right. Duplicating a
complex data set needs to re-map all the pointer relations between each of its
-elements. This is error prone and program gets harder to debug because of the
+elements. This is error prone and programs get harder to debug because of the
duplicate data set and addresses.
Split address space also means that libraries cannot transparently use data
@@ -77,12 +77,12 @@ I/O bus, device memory characteristics
I/O buses cripple shared address spaces due to a few limitations. Most I/O
buses only allow basic memory access from device to main memory; even cache
-coherency is often optional. Access to device memory from CPU is even more
+coherency is often optional. Access to device memory from a CPU is even more
limited. More often than not, it is not cache coherent.
If we only consider the PCIE bus, then a device can access main memory (often
through an IOMMU) and be cache coherent with the CPUs. However, it only allows
-a limited set of atomic operations from device on main memory. This is worse
+a limited set of atomic operations from the device on main memory. This is worse
in the other direction: the CPU can only access a limited range of the device
memory and cannot perform atomic operations on it. Thus device memory cannot
be considered the same as regular memory from the kernel point of view.
@@ -93,20 +93,20 @@ The final limitation is latency. Access to main memory from the device has an
order of magnitude higher latency than when the device accesses its own memory.
Some platforms are developing new I/O buses or additions/modifications to PCIE
-to address some of these limitations (OpenCAPI, CCIX). They mainly allow two-
-way cache coherency between CPU and device and allow all atomic operations the
+to address some of these limitations (OpenCAPI, CCIX). They mainly allow
+two-way cache coherency between CPU and device and allow all atomic operations the
architecture supports. Sadly, not all platforms are following this trend and
some major architectures are left without hardware solutions to these problems.
So for shared address space to make sense, not only must we allow devices to
access any memory but we must also permit any memory to be migrated to device
-memory while device is using it (blocking CPU access while it happens).
+memory while the device is using it (blocking CPU access while it happens).
Shared address space and migration
==================================
-HMM intends to provide two main features. First one is to share the address
+HMM intends to provide two main features. The first one is to share the address
space by duplicating the CPU page table in the device page table so the same
address points to the same physical memory for any valid main memory address in
the process address space.
@@ -121,14 +121,14 @@ why HMM provides helpers to factor out everything that can be while leaving the
hardware specific details to the device driver.
The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that
-allows allocating a struct page for each page of the device memory. Those pages
+allows allocating a struct page for each page of device memory. Those pages
are special because the CPU cannot map them. However, they allow migrating
main memory to device memory using existing migration mechanisms and everything
-looks like a page is swapped out to disk from the CPU point of view. Using a
-struct page gives the easiest and cleanest integration with existing mm mech-
-anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
+looks like a page that is swapped out to disk from the CPU point of view. Using a
+struct page gives the easiest and cleanest integration with existing mm
+mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE
memory for the device memory and second to perform migration. Policy decisions
-of what and when to migrate things is left to the device driver.
+of what and when to migrate is left to the device driver.
Note that any CPU access to a device page triggers a page fault and a migration
back to main memory. For example, when a page backing a given CPU address A is
@@ -136,8 +136,8 @@ migrated from a main memory page to a device page, then any CPU access to
address A triggers a page fault and initiates a migration back to main memory.
With these two features, HMM not only allows a device to mirror process address
-space and keeping both CPU and device page table synchronized, but also lever-
-ages device memory by migrating the part of the data set that is actively being
+space and keeps both CPU and device page tables synchronized, but also
+leverages device memory by migrating the part of the data set that is actively being
used by the device.
@@ -151,21 +151,28 @@ registration of an hmm_mirror struct::
int hmm_mirror_register(struct hmm_mirror *mirror,
struct mm_struct *mm);
- int hmm_mirror_register_locked(struct hmm_mirror *mirror,
- struct mm_struct *mm);
-
-The locked variant is to be used when the driver is already holding mmap_sem
-of the mm in write mode. The mirror struct has a set of callbacks that are used
+The mirror struct has a set of callbacks that are used
to propagate CPU page tables::
struct hmm_mirror_ops {
+ /* release() - release hmm_mirror
+ *
+ * @mirror: pointer to struct hmm_mirror
+ *
+ * This is called when the mm_struct is being released. The callback
+ * must ensure that all access to any pages obtained from this mirror
+ * is halted before the callback returns. All future access should
+ * fault.
+ */
+ void (*release)(struct hmm_mirror *mirror);
+
/* sync_cpu_device_pagetables() - synchronize page tables
*
* @mirror: pointer to struct hmm_mirror
- * @update_type: type of update that occurred to the CPU page table
- * @start: virtual start address of the range to update
- * @end: virtual end address of the range to update
+ * @update: update information (see struct mmu_notifier_range)
+ * Return: -EAGAIN if update.blockable false and callback need to
+ * block, 0 otherwise.
*
* This callback ultimately originates from mmu_notifiers when the CPU
* page table is updated. The device driver must update its page table
@@ -176,14 +183,12 @@ to propagate CPU page tables::
* page tables are completely updated (TLBs flushed, etc); this is a
* synchronous call.
*/
- void (*update)(struct hmm_mirror *mirror,
- enum hmm_update action,
- unsigned long start,
- unsigned long end);
+ int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+ const struct hmm_update *update);
};
The device driver must perform the update action to the range (mark range
-read only, or fully unmap, ...). The device must be done with the update before
+read only, or fully unmap, etc.). The device must complete the update before
the driver callback returns.
When the device driver wants to populate a range of virtual addresses, it can
@@ -194,17 +199,18 @@ use either::
The first one (hmm_range_snapshot()) will only fetch present CPU page table
entries and will not trigger a page fault on missing or non-present entries.
-The second one does trigger a page fault on missing or read-only entry if the
-write parameter is true. Page faults use the generic mm page fault code path
-just like a CPU page fault.
+The second one does trigger a page fault on missing or read-only entries if
+write access is requested (see below). Page faults use the generic mm page
+fault code path just like a CPU page fault.
Both functions copy CPU page table entries into their pfns array argument. Each
entry in that array corresponds to an address in the virtual range. HMM
provides a set of flags to help the driver identify special CPU page table
entries.
-Locking with the update() callback is the most important aspect the driver must
-respect in order to keep things properly synchronized. The usage pattern is::
+Locking within the sync_cpu_device_pagetables() callback is the most important
+aspect the driver must respect in order to keep things properly synchronized.
+The usage pattern is::
int driver_populate_range(...)
{
@@ -239,11 +245,11 @@ respect in order to keep things properly synchronized. The usage pattern is::
hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
goto again;
}
- hmm_mirror_unregister(&range);
+ hmm_range_unregister(&range);
return ret;
}
take_lock(driver->update);
- if (!range.valid) {
+ if (!hmm_range_valid(&range)) {
release_lock(driver->update);
up_read(&mm->mmap_sem);
goto again;
@@ -251,15 +257,15 @@ respect in order to keep things properly synchronized. The usage pattern is::
// Use pfns array content to update device page table
- hmm_mirror_unregister(&range);
+ hmm_range_unregister(&range);
release_lock(driver->update);
up_read(&mm->mmap_sem);
return 0;
}
The driver->update lock is the same lock that the driver takes inside its
-update() callback. That lock must be held before checking the range.valid
-field to avoid any race with a concurrent CPU page table update.
+sync_cpu_device_pagetables() callback. That lock must be held before calling
+hmm_range_valid() to avoid any race with a concurrent CPU page table update.
HMM implements all this on top of the mmu_notifier API because we wanted a
simpler API and also to be able to perform optimizations latter on like doing
@@ -279,46 +285,47 @@ concurrently).
Leverage default_flags and pfn_flags_mask
=========================================
-The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows
-to set fault or snapshot policy for a whole range instead of having to set them
-for each entries in the range.
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify
+fault or snapshot policy for the whole range instead of having to set them
+for each entry in the pfns array.
+
+For instance, if the device flags for range.flags are::
-For instance if the device flags for device entries are:
- VALID (1 << 63)
- WRITE (1 << 62)
+ range.flags[HMM_PFN_VALID] = (1 << 63);
+ range.flags[HMM_PFN_WRITE] = (1 << 62);
-Now let say that device driver wants to fault with at least read a range then
-it does set::
+and the device driver wants pages for a range with at least read permission,
+it sets::
range->default_flags = (1 << 63);
range->pfn_flags_mask = 0;
-and calls hmm_range_fault() as described above. This will fill fault all page
+and calls hmm_range_fault() as described above. This will fill fault all pages
in the range with at least read permission.
-Now let say driver wants to do the same except for one page in the range for
-which its want to have write. Now driver set::
+Now let's say the driver wants to do the same except for one page in the range for
+which it wants to have write permission. Now driver set::
range->default_flags = (1 << 63);
range->pfn_flags_mask = (1 << 62);
range->pfns[index_of_write] = (1 << 62);
-With this HMM will fault in all page with at least read (ie valid) and for the
+With this, HMM will fault in all pages with at least read (i.e., valid) and for the
address == range->start + (index_of_write << PAGE_SHIFT) it will fault with
-write permission ie if the CPU pte does not have write permission set then HMM
+write permission i.e., if the CPU pte does not have write permission set then HMM
will call handle_mm_fault().
-Note that HMM will populate the pfns array with write permission for any entry
-that have write permission within the CPU pte no matter what are the values set
+Note that HMM will populate the pfns array with write permission for any page
+that is mapped with CPU write permission no matter what values are set
in default_flags or pfn_flags_mask.
Represent and manage device memory from core kernel point of view
=================================================================
-Several different designs were tried to support device memory. First one used
-a device specific data structure to keep information about migrated memory and
-HMM hooked itself in various places of mm code to handle any access to
+Several different designs were tried to support device memory. The first one
+used a device specific data structure to keep information about migrated memory
+and HMM hooked itself in various places of mm code to handle any access to
addresses that were backed by device memory. It turns out that this ended up
replicating most of the fields of struct page and also needed many kernel code
paths to be updated to understand this new kind of memory.
@@ -329,33 +336,6 @@ directly using struct page for device memory which left most kernel code paths
unaware of the difference. We only need to make sure that no one ever tries to
map those pages from the CPU side.
-HMM provides a set of helpers to register and hotplug device memory as a new
-region needing a struct page. This is offered through a very simple API::
-
- struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
- struct device *device,
- unsigned long size);
- void hmm_devmem_remove(struct hmm_devmem *devmem);
-
-The hmm_devmem_ops is where most of the important things are::
-
- struct hmm_devmem_ops {
- void (*free)(struct hmm_devmem *devmem, struct page *page);
- int (*fault)(struct hmm_devmem *devmem,
- struct vm_area_struct *vma,
- unsigned long addr,
- struct page *page,
- unsigned flags,
- pmd_t *pmdp);
- };
-
-The first callback (free()) happens when the last reference on a device page is
-dropped. This means the device page is now free and no longer used by anyone.
-The second callback happens whenever the CPU tries to access a device page
-which it cannot do. This second callback must trigger a migration back to
-system memory.
-
-
Migration to and from device memory
===================================
@@ -417,9 +397,9 @@ willing to pay to keep all the code simpler.
Memory cgroup (memcg) and rss accounting
========================================
-For now device memory is accounted as any regular page in rss counters (either
+For now, device memory is accounted as any regular page in rss counters (either
anonymous if device page is used for anonymous, file if device page is used for
-file backed page or shmem if device page is used for shared memory). This is a
+file backed page, or shmem if device page is used for shared memory). This is a
deliberate choice to keep existing applications, that might start using device
memory without knowing about it, running unimpacted.
@@ -439,6 +419,6 @@ get more experience in how device memory is used and its impact on memory
resource control.
-Note that device memory can never be pinned by device driver nor through GUP
+Note that device memory can never be pinned by a device driver nor through GUP
and thus such memory is always free upon process exit. Or when last reference
is dropped in case of shared memory or file backed memory.
diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst
index 382f72ace1fc..58a12376b7df 100644
--- a/Documentation/vm/memory-model.rst
+++ b/Documentation/vm/memory-model.rst
@@ -181,3 +181,43 @@ that is eventually passed to vmemmap_populate() through a long chain
of function calls. The vmemmap_populate() implementation may use the
`vmem_altmap` along with :c:func:`altmap_alloc_block_buf` helper to
allocate memory map on the persistent memory device.
+
+ZONE_DEVICE
+===========
+The `ZONE_DEVICE` facility builds upon `SPARSEMEM_VMEMMAP` to offer
+`struct page` `mem_map` services for device driver identified physical
+address ranges. The "device" aspect of `ZONE_DEVICE` relates to the fact
+that the page objects for these address ranges are never marked online,
+and that a reference must be taken against the device, not just the page
+to keep the memory pinned for active use. `ZONE_DEVICE`, via
+:c:func:`devm_memremap_pages`, performs just enough memory hotplug to
+turn on :c:func:`pfn_to_page`, :c:func:`page_to_pfn`, and
+:c:func:`get_user_pages` service for the given range of pfns. Since the
+page reference count never drops below 1 the page is never tracked as
+free memory and the page's `struct list_head lru` space is repurposed
+for back referencing to the host device / driver that mapped the memory.
+
+While `SPARSEMEM` presents memory as a collection of sections,
+optionally collected into memory blocks, `ZONE_DEVICE` users have a need
+for smaller granularity of populating the `mem_map`. Given that
+`ZONE_DEVICE` memory is never marked online it is subsequently never
+subject to its memory ranges being exposed through the sysfs memory
+hotplug api on memory block boundaries. The implementation relies on
+this lack of user-api constraint to allow sub-section sized memory
+ranges to be specified to :c:func:`arch_add_memory`, the top-half of
+memory hotplug. Sub-section support allows for 2MB as the cross-arch
+common alignment granularity for :c:func:`devm_memremap_pages`.
+
+The users of `ZONE_DEVICE` are:
+
+* pmem: Map platform persistent memory to be used as a direct-I/O target
+ via DAX mappings.
+
+* hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()`
+ event callbacks to allow a device-driver to coordinate memory management
+ events related to device-memory, typically GPU memory. See
+ Documentation/vm/hmm.rst.
+
+* p2pdma: Create `struct page` objects to allow peer devices in a
+ PCI/-E topology to coordinate direct-DMA operations between themselves,
+ i.e. bypass host memory.
diff --git a/Documentation/vm/numa.rst b/Documentation/vm/numa.rst
index 461d5d57cd4f..99fdeca917ca 100644
--- a/Documentation/vm/numa.rst
+++ b/Documentation/vm/numa.rst
@@ -67,7 +67,7 @@ nodes. Each emulated node will manage a fraction of the underlying cells'
physical memory. NUMA emluation is useful for testing NUMA kernel and
application features on non-NUMA platforms, and as a sort of memory resource
management mechanism when used together with cpusets.
-[see Documentation/cgroup-v1/cpusets.txt]
+[see Documentation/admin-guide/cgroup-v1/cpusets.rst]
For each node with memory, Linux constructs an independent memory management
subsystem, complete with its own free page lists, in-use page lists, usage
@@ -114,7 +114,7 @@ allocation behavior using Linux NUMA memory policy. [see
System administrators can restrict the CPUs and nodes' memories that a non-
privileged user can specify in the scheduling or NUMA commands and functions
-using control groups and CPUsets. [see Documentation/cgroup-v1/cpusets.txt]
+using control groups and CPUsets. [see Documentation/admin-guide/cgroup-v1/cpusets.rst]
On architectures that do not hide memoryless nodes, Linux will include only
zones [nodes] with memory in the zonelists. This means that for a memoryless
diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst
index f68d61335abb..1d6cd7db4e43 100644
--- a/Documentation/vm/page_migration.rst
+++ b/Documentation/vm/page_migration.rst
@@ -41,7 +41,7 @@ locations.
Larger installations usually partition the system using cpusets into
sections of nodes. Paul Jackson has equipped cpusets with the ability to
move pages when a task is moved to another cpuset (See
-Documentation/cgroup-v1/cpusets.txt).
+Documentation/admin-guide/cgroup-v1/cpusets.rst).
Cpusets allows the automation of process locality. If a task is moved to
a new cpuset then also all its pages are moved with it so that the
performance of the process does not sink dramatically. Also the pages
diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/vm/unevictable-lru.rst
index b8e29f977f2d..17d0861b0f1d 100644
--- a/Documentation/vm/unevictable-lru.rst
+++ b/Documentation/vm/unevictable-lru.rst
@@ -98,7 +98,7 @@ Memory Control Group Interaction
--------------------------------
The unevictable LRU facility interacts with the memory control group [aka
-memory controller; see Documentation/cgroup-v1/memory.txt] by extending the
+memory controller; see Documentation/admin-guide/cgroup-v1/memory.rst] by extending the
lru_list enum.
The memory controller data structure automatically gets a per-zone unevictable
@@ -439,7 +439,7 @@ Compacting MLOCKED Pages
The unevictable LRU can be scanned for compactable regions and the default
behavior is to do so. /proc/sys/vm/compact_unevictable_allowed controls
-this behavior (see Documentation/sysctl/vm.txt). Once scanning of the
+this behavior (see Documentation/admin-guide/sysctl/vm.rst). Once scanning of the
unevictable LRU is enabled, the work of compaction is mostly handled by
the page migration code and the same work flow as described in MIGRATING
MLOCKED PAGES will apply.
diff --git a/Documentation/w1/w1.netlink b/Documentation/w1/w1.netlink
index ef2727192d69..94ad4c420828 100644
--- a/Documentation/w1/w1.netlink
+++ b/Documentation/w1/w1.netlink
@@ -183,7 +183,7 @@ acknowledge number is set to seq+1.
Additional documantion, source code examples.
============================================
-1. Documentation/connector
+1. Documentation/driver-api/connector.rst
2. http://www.ioremap.net/archive/w1
This archive includes userspace application w1d.c which uses
read/write/search commands for all master/slave devices found on the bus.
diff --git a/Documentation/watchdog/hpwdt.rst b/Documentation/watchdog/hpwdt.rst
index 94a96371113e..c165d92cfd12 100644
--- a/Documentation/watchdog/hpwdt.rst
+++ b/Documentation/watchdog/hpwdt.rst
@@ -39,6 +39,10 @@ Last reviewed: 08/20/2018
Default value is set when compiling the kernel. If it is set
to "Y", then there is no way of disabling the watchdog once
it has been started.
+ kdumptimeout Minimum timeout in seconds to apply upon receipt of an NMI
+ before calling panic. (-1) disables the watchdog. When value
+ is > 0, the timer is reprogrammed with the greater of
+ value or current timeout value.
============ ================================================================
NOTE:
diff --git a/Documentation/watchdog/index.rst b/Documentation/watchdog/index.rst
index 33a0de631e84..c177645081d8 100644
--- a/Documentation/watchdog/index.rst
+++ b/Documentation/watchdog/index.rst
@@ -1,4 +1,4 @@
-:orphan:
+.. SPDX-License-Identifier: GPL-2.0
======================
Linux Watchdog Support
diff --git a/Documentation/watchdog/watchdog-parameters.rst b/Documentation/watchdog/watchdog-parameters.rst
index b121caae7798..a3985cc5aeda 100644
--- a/Documentation/watchdog/watchdog-parameters.rst
+++ b/Documentation/watchdog/watchdog-parameters.rst
@@ -13,6 +13,17 @@ modules.
-------------------------------------------------
+watchdog core:
+ open_timeout:
+ Maximum time, in seconds, for which the watchdog framework will take
+ care of pinging a running hardware watchdog until userspace opens the
+ corresponding /dev/watchdogN device. A value of 0 means an infinite
+ timeout. Setting this to a non-zero value can be useful to ensure that
+ either userspace comes up properly, or the board gets reset and allows
+ fallback logic in the bootloader to try something else.
+
+-------------------------------------------------
+
acquirewdt:
wdt_stop:
Acquire WDT 'stop' io port (default 0x43)
diff --git a/Documentation/x86/exception-tables.rst b/Documentation/x86/exception-tables.rst
index 24596c8210b5..ed6d4b0cf62c 100644
--- a/Documentation/x86/exception-tables.rst
+++ b/Documentation/x86/exception-tables.rst
@@ -35,7 +35,7 @@ page fault handler::
void do_page_fault(struct pt_regs *regs, unsigned long error_code)
in arch/x86/mm/fault.c. The parameters on the stack are set up by
-the low level assembly glue in arch/x86/kernel/entry_32.S. The parameter
+the low level assembly glue in arch/x86/entry/entry_32.S. The parameter
regs is a pointer to the saved registers on the stack, error_code
contains a reason code for the exception.
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index f2de1b2d3ac7..af64c4bb4447 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -20,6 +20,8 @@ x86-specific Documentation
mtrr
pat
intel_mpx
+ intel-iommu
+ intel_txt
amd-memory-encryption
pti
mds
diff --git a/Documentation/Intel-IOMMU.txt b/Documentation/x86/intel-iommu.rst
index 9dae6b47e398..9dae6b47e398 100644
--- a/Documentation/Intel-IOMMU.txt
+++ b/Documentation/x86/intel-iommu.rst
diff --git a/Documentation/intel_txt.txt b/Documentation/x86/intel_txt.rst
index d83c1a2122c9..d83c1a2122c9 100644
--- a/Documentation/intel_txt.txt
+++ b/Documentation/x86/intel_txt.rst
diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst
index 6e28dbe818ab..e29739904e37 100644
--- a/Documentation/x86/topology.rst
+++ b/Documentation/x86/topology.rst
@@ -9,7 +9,7 @@ representation in the kernel. Update/change when doing changes to the
respective code.
The architecture-agnostic topology definitions are in
-Documentation/cputopology.txt. This file holds x86-specific
+Documentation/admin-guide/cputopology.rst. This file holds x86-specific
differences/specialities which must not necessarily apply to the generic
definitions. Thus, the way to read up on Linux topology on x86 is to start
with the generic one and look at this one in parallel for the x86 specifics.
@@ -49,6 +49,10 @@ Package-related topology information in the kernel:
The number of cores in a package. This information is retrieved via CPUID.
+ - cpuinfo_x86.x86_max_dies:
+
+ The number of dies in a package. This information is retrieved via CPUID.
+
- cpuinfo_x86.phys_proc_id:
The physical ID of the package. This information is retrieved via CPUID
diff --git a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
index 04df57b9aa3f..ff9bcfd2cc14 100644
--- a/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
+++ b/Documentation/x86/x86_64/fake-numa-for-cpusets.rst
@@ -15,7 +15,7 @@ assign them to cpusets and their attached tasks. This is a way of limiting the
amount of system memory that are available to a certain class of tasks.
For more information on the features of cpusets, see
-Documentation/cgroup-v1/cpusets.txt.
+Documentation/admin-guide/cgroup-v1/cpusets.rst.
There are a number of different configurations you can use for your needs. For
more information on the numa=fake command line option and its various ways of
configuring fake nodes, see Documentation/x86/x86_64/boot-options.rst.
@@ -40,7 +40,7 @@ A machine may be split as follows with "numa=fake=4*512," as reported by dmesg::
On node 3 totalpages: 131072
Now following the instructions for mounting the cpusets filesystem from
-Documentation/cgroup-v1/cpusets.txt, you can assign fake nodes (i.e. contiguous memory
+Documentation/admin-guide/cgroup-v1/cpusets.rst, you can assign fake nodes (i.e. contiguous memory
address spaces) to individual cpusets::
[root@xroads /]# mkdir exampleset
diff --git a/Documentation/xilinx/index.rst b/Documentation/xilinx/index.rst
deleted file mode 100644
index 01cc1a0714df..000000000000
--- a/Documentation/xilinx/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-:orphan:
-
-===========
-Xilinx FPGA
-===========
-
-.. toctree::
- :maxdepth: 1
-
- eemi
-
-.. only:: subproject and html
-
- Indices
- =======
-
- * :ref:`genindex`
diff --git a/Documentation/xtensa/atomctl.rst b/Documentation/xtensa/atomctl.rst
new file mode 100644
index 000000000000..1ecbd0ba9a2e
--- /dev/null
+++ b/Documentation/xtensa/atomctl.rst
@@ -0,0 +1,51 @@
+===========================================
+Atomic Operation Control (ATOMCTL) Register
+===========================================
+
+We Have Atomic Operation Control (ATOMCTL) Register.
+This register determines the effect of using a S32C1I instruction
+with various combinations of:
+
+ 1. With and without an Coherent Cache Controller which
+ can do Atomic Transactions to the memory internally.
+
+ 2. With and without An Intelligent Memory Controller which
+ can do Atomic Transactions itself.
+
+The Core comes up with a default value of for the three types of cache ops::
+
+ 0x28: (WB: Internal, WT: Internal, BY:Exception)
+
+On the FPGA Cards we typically simulate an Intelligent Memory controller
+which can implement RCW transactions. For FPGA cards with an External
+Memory controller we let it to the atomic operations internally while
+doing a Cached (WB) transaction and use the Memory RCW for un-cached
+operations.
+
+For systems without an coherent cache controller, non-MX, we always
+use the memory controllers RCW, thought non-MX controlers likely
+support the Internal Operation.
+
+CUSTOMER-WARNING:
+ Virtually all customers buy their memory controllers from vendors that
+ don't support atomic RCW memory transactions and will likely want to
+ configure this register to not use RCW.
+
+Developers might find using RCW in Bypass mode convenient when testing
+with the cache being bypassed; for example studying cache alias problems.
+
+See Section 4.3.12.4 of ISA; Bits::
+
+ WB WT BY
+ 5 4 | 3 2 | 1 0
+
+========= ================== ================== ===============
+ 2 Bit
+ Field
+ Values WB - Write Back WT - Write Thru BY - Bypass
+========= ================== ================== ===============
+ 0 Exception Exception Exception
+ 1 RCW Transaction RCW Transaction RCW Transaction
+ 2 Internal Operation Internal Operation Reserved
+ 3 Reserved Reserved Reserved
+========= ================== ================== ===============
diff --git a/Documentation/xtensa/atomctl.txt b/Documentation/xtensa/atomctl.txt
deleted file mode 100644
index 1da783ac200c..000000000000
--- a/Documentation/xtensa/atomctl.txt
+++ /dev/null
@@ -1,44 +0,0 @@
-We Have Atomic Operation Control (ATOMCTL) Register.
-This register determines the effect of using a S32C1I instruction
-with various combinations of:
-
- 1. With and without an Coherent Cache Controller which
- can do Atomic Transactions to the memory internally.
-
- 2. With and without An Intelligent Memory Controller which
- can do Atomic Transactions itself.
-
-The Core comes up with a default value of for the three types of cache ops:
-
- 0x28: (WB: Internal, WT: Internal, BY:Exception)
-
-On the FPGA Cards we typically simulate an Intelligent Memory controller
-which can implement RCW transactions. For FPGA cards with an External
-Memory controller we let it to the atomic operations internally while
-doing a Cached (WB) transaction and use the Memory RCW for un-cached
-operations.
-
-For systems without an coherent cache controller, non-MX, we always
-use the memory controllers RCW, thought non-MX controlers likely
-support the Internal Operation.
-
-CUSTOMER-WARNING:
- Virtually all customers buy their memory controllers from vendors that
- don't support atomic RCW memory transactions and will likely want to
- configure this register to not use RCW.
-
-Developers might find using RCW in Bypass mode convenient when testing
-with the cache being bypassed; for example studying cache alias problems.
-
-See Section 4.3.12.4 of ISA; Bits:
-
- WB WT BY
- 5 4 | 3 2 | 1 0
- 2 Bit
- Field
- Values WB - Write Back WT - Write Thru BY - Bypass
---------- --------------- ----------------- ----------------
- 0 Exception Exception Exception
- 1 RCW Transaction RCW Transaction RCW Transaction
- 2 Internal Operation Internal Operation Reserved
- 3 Reserved Reserved Reserved
diff --git a/Documentation/xtensa/booting.rst b/Documentation/xtensa/booting.rst
new file mode 100644
index 000000000000..e1b83707e5b6
--- /dev/null
+++ b/Documentation/xtensa/booting.rst
@@ -0,0 +1,22 @@
+=====================================
+Passing boot parameters to the kernel
+=====================================
+
+Boot parameters are represented as a TLV list in the memory. Please see
+arch/xtensa/include/asm/bootparam.h for definition of the bp_tag structure and
+tag value constants. First entry in the list must have type BP_TAG_FIRST, last
+entry must have type BP_TAG_LAST. The address of the first list entry is
+passed to the kernel in the register a2. The address type depends on MMU type:
+
+- For configurations without MMU, with region protection or with MPU the
+ address must be the physical address.
+- For configurations with region translarion MMU or with MMUv3 and CONFIG_MMU=n
+ the address must be a valid address in the current mapping. The kernel will
+ not change the mapping on its own.
+- For configurations with MMUv2 the address must be a virtual address in the
+ default virtual mapping (0xd0000000..0xffffffff).
+- For configurations with MMUv3 and CONFIG_MMU=y the address may be either a
+ virtual or physical address. In either case it must be within the default
+ virtual mapping. It is considered physical if it is within the range of
+ physical addresses covered by the default KSEG mapping (XCHAL_KSEG_PADDR..
+ XCHAL_KSEG_PADDR + XCHAL_KSEG_SIZE), otherwise it is considered virtual.
diff --git a/Documentation/xtensa/booting.txt b/Documentation/xtensa/booting.txt
deleted file mode 100644
index 402b33a2619f..000000000000
--- a/Documentation/xtensa/booting.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-Passing boot parameters to the kernel.
-
-Boot parameters are represented as a TLV list in the memory. Please see
-arch/xtensa/include/asm/bootparam.h for definition of the bp_tag structure and
-tag value constants. First entry in the list must have type BP_TAG_FIRST, last
-entry must have type BP_TAG_LAST. The address of the first list entry is
-passed to the kernel in the register a2. The address type depends on MMU type:
-- For configurations without MMU, with region protection or with MPU the
- address must be the physical address.
-- For configurations with region translarion MMU or with MMUv3 and CONFIG_MMU=n
- the address must be a valid address in the current mapping. The kernel will
- not change the mapping on its own.
-- For configurations with MMUv2 the address must be a virtual address in the
- default virtual mapping (0xd0000000..0xffffffff).
-- For configurations with MMUv3 and CONFIG_MMU=y the address may be either a
- virtual or physical address. In either case it must be within the default
- virtual mapping. It is considered physical if it is within the range of
- physical addresses covered by the default KSEG mapping (XCHAL_KSEG_PADDR..
- XCHAL_KSEG_PADDR + XCHAL_KSEG_SIZE), otherwise it is considered virtual.
diff --git a/Documentation/xtensa/index.rst b/Documentation/xtensa/index.rst
new file mode 100644
index 000000000000..52fa04eb39a3
--- /dev/null
+++ b/Documentation/xtensa/index.rst
@@ -0,0 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===================
+Xtensa Architecture
+===================
+
+.. toctree::
+ :maxdepth: 1
+
+ atomctl
+ booting
+ mmu
diff --git a/Documentation/xtensa/mmu.rst b/Documentation/xtensa/mmu.rst
new file mode 100644
index 000000000000..e52a12960fdc
--- /dev/null
+++ b/Documentation/xtensa/mmu.rst
@@ -0,0 +1,195 @@
+=============================
+MMUv3 initialization sequence
+=============================
+
+The code in the initialize_mmu macro sets up MMUv3 memory mapping
+identically to MMUv2 fixed memory mapping. Depending on
+CONFIG_INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX symbol this code is
+located in addresses it was linked for (symbol undefined), or not
+(symbol defined), so it needs to be position-independent.
+
+The code has the following assumptions:
+
+ - This code fragment is run only on an MMU v3.
+ - TLBs are in their reset state.
+ - ITLBCFG and DTLBCFG are zero (reset state).
+ - RASID is 0x04030201 (reset state).
+ - PS.RING is zero (reset state).
+ - LITBASE is zero (reset state, PC-relative literals); required to be PIC.
+
+TLB setup proceeds along the following steps.
+
+ Legend:
+
+ - VA = virtual address (two upper nibbles of it);
+ - PA = physical address (two upper nibbles of it);
+ - pc = physical range that contains this code;
+
+After step 2, we jump to virtual address in the range 0x40000000..0x5fffffff
+or 0x00000000..0x1fffffff, depending on whether the kernel was loaded below
+0x40000000 or above. That address corresponds to next instruction to execute
+in this code. After step 4, we jump to intended (linked) address of this code.
+The scheme below assumes that the kernel is loaded below 0x40000000.
+
+ ====== ===== ===== ===== ===== ====== ===== =====
+ - Step0 Step1 Step2 Step3 Step4 Step5
+
+ VA PA PA PA PA VA PA PA
+ ====== ===== ===== ===== ===== ====== ===== =====
+ E0..FF -> E0 -> E0 -> E0 F0..FF -> F0 -> F0
+ C0..DF -> C0 -> C0 -> C0 E0..EF -> F0 -> F0
+ A0..BF -> A0 -> A0 -> A0 D8..DF -> 00 -> 00
+ 80..9F -> 80 -> 80 -> 80 D0..D7 -> 00 -> 00
+ 60..7F -> 60 -> 60 -> 60
+ 40..5F -> 40 -> pc -> pc 40..5F -> pc
+ 20..3F -> 20 -> 20 -> 20
+ 00..1F -> 00 -> 00 -> 00
+ ====== ===== ===== ===== ===== ====== ===== =====
+
+The default location of IO peripherals is above 0xf0000000. This may be changed
+using a "ranges" property in a device tree simple-bus node. See the Devicetree
+Specification, section 4.5 for details on the syntax and semantics of
+simple-bus nodes. The following limitations apply:
+
+1. Only top level simple-bus nodes are considered
+
+2. Only one (first) simple-bus node is considered
+
+3. Empty "ranges" properties are not supported
+
+4. Only the first triplet in the "ranges" property is considered
+
+5. The parent-bus-address value is rounded down to the nearest 256MB boundary
+
+6. The IO area covers the entire 256MB segment of parent-bus-address; the
+ "ranges" triplet length field is ignored
+
+
+MMUv3 address space layouts.
+============================
+
+Default MMUv2-compatible layout::
+
+ Symbol VADDR Size
+ +------------------+
+ | Userspace | 0x00000000 TASK_SIZE
+ +------------------+ 0x40000000
+ +------------------+
+ | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE
+ +------------------+
+ | KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE
+ +------------------+ 0x8e400000
+ +------------------+
+ | VMALLOC area | VMALLOC_START 0xc0000000 128MB - 64KB
+ +------------------+ VMALLOC_END
+ | Cache aliasing | TLBTEMP_BASE_1 0xc7ff0000 DCACHE_WAY_SIZE
+ | remap area 1 |
+ +------------------+
+ | Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE
+ | remap area 2 |
+ +------------------+
+ +------------------+
+ | KMAP area | PKMAP_BASE PTRS_PER_PTE *
+ | | DCACHE_N_COLORS *
+ | | PAGE_SIZE
+ | | (4MB * DCACHE_N_COLORS)
+ +------------------+
+ | Atomic KMAP area | FIXADDR_START KM_TYPE_NR *
+ | | NR_CPUS *
+ | | DCACHE_N_COLORS *
+ | | PAGE_SIZE
+ +------------------+ FIXADDR_TOP 0xcffff000
+ +------------------+
+ | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xd0000000 128MB
+ +------------------+
+ | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xd8000000 128MB
+ +------------------+
+ | Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB
+ +------------------+
+ | Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB
+ +------------------+
+
+
+256MB cached + 256MB uncached layout::
+
+ Symbol VADDR Size
+ +------------------+
+ | Userspace | 0x00000000 TASK_SIZE
+ +------------------+ 0x40000000
+ +------------------+
+ | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE
+ +------------------+
+ | KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE
+ +------------------+ 0x8e400000
+ +------------------+
+ | VMALLOC area | VMALLOC_START 0xa0000000 128MB - 64KB
+ +------------------+ VMALLOC_END
+ | Cache aliasing | TLBTEMP_BASE_1 0xa7ff0000 DCACHE_WAY_SIZE
+ | remap area 1 |
+ +------------------+
+ | Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE
+ | remap area 2 |
+ +------------------+
+ +------------------+
+ | KMAP area | PKMAP_BASE PTRS_PER_PTE *
+ | | DCACHE_N_COLORS *
+ | | PAGE_SIZE
+ | | (4MB * DCACHE_N_COLORS)
+ +------------------+
+ | Atomic KMAP area | FIXADDR_START KM_TYPE_NR *
+ | | NR_CPUS *
+ | | DCACHE_N_COLORS *
+ | | PAGE_SIZE
+ +------------------+ FIXADDR_TOP 0xaffff000
+ +------------------+
+ | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xb0000000 256MB
+ +------------------+
+ | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 256MB
+ +------------------+
+ +------------------+
+ | Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB
+ +------------------+
+ | Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB
+ +------------------+
+
+
+512MB cached + 512MB uncached layout::
+
+ Symbol VADDR Size
+ +------------------+
+ | Userspace | 0x00000000 TASK_SIZE
+ +------------------+ 0x40000000
+ +------------------+
+ | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE
+ +------------------+
+ | KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE
+ +------------------+ 0x8e400000
+ +------------------+
+ | VMALLOC area | VMALLOC_START 0x90000000 128MB - 64KB
+ +------------------+ VMALLOC_END
+ | Cache aliasing | TLBTEMP_BASE_1 0x97ff0000 DCACHE_WAY_SIZE
+ | remap area 1 |
+ +------------------+
+ | Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE
+ | remap area 2 |
+ +------------------+
+ +------------------+
+ | KMAP area | PKMAP_BASE PTRS_PER_PTE *
+ | | DCACHE_N_COLORS *
+ | | PAGE_SIZE
+ | | (4MB * DCACHE_N_COLORS)
+ +------------------+
+ | Atomic KMAP area | FIXADDR_START KM_TYPE_NR *
+ | | NR_CPUS *
+ | | DCACHE_N_COLORS *
+ | | PAGE_SIZE
+ +------------------+ FIXADDR_TOP 0x9ffff000
+ +------------------+
+ | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xa0000000 512MB
+ +------------------+
+ | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 512MB
+ +------------------+
+ | Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB
+ +------------------+
+ | Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB
+ +------------------+
diff --git a/Documentation/xtensa/mmu.txt b/Documentation/xtensa/mmu.txt
deleted file mode 100644
index 318114de63f3..000000000000
--- a/Documentation/xtensa/mmu.txt
+++ /dev/null
@@ -1,189 +0,0 @@
-MMUv3 initialization sequence.
-
-The code in the initialize_mmu macro sets up MMUv3 memory mapping
-identically to MMUv2 fixed memory mapping. Depending on
-CONFIG_INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX symbol this code is
-located in addresses it was linked for (symbol undefined), or not
-(symbol defined), so it needs to be position-independent.
-
-The code has the following assumptions:
- This code fragment is run only on an MMU v3.
- TLBs are in their reset state.
- ITLBCFG and DTLBCFG are zero (reset state).
- RASID is 0x04030201 (reset state).
- PS.RING is zero (reset state).
- LITBASE is zero (reset state, PC-relative literals); required to be PIC.
-
-TLB setup proceeds along the following steps.
-
- Legend:
- VA = virtual address (two upper nibbles of it);
- PA = physical address (two upper nibbles of it);
- pc = physical range that contains this code;
-
-After step 2, we jump to virtual address in the range 0x40000000..0x5fffffff
-or 0x00000000..0x1fffffff, depending on whether the kernel was loaded below
-0x40000000 or above. That address corresponds to next instruction to execute
-in this code. After step 4, we jump to intended (linked) address of this code.
-The scheme below assumes that the kernel is loaded below 0x40000000.
-
- Step0 Step1 Step2 Step3 Step4 Step5
- ===== ===== ===== ===== ===== =====
- VA PA PA PA PA VA PA PA
- ------ -- -- -- -- ------ -- --
- E0..FF -> E0 -> E0 -> E0 F0..FF -> F0 -> F0
- C0..DF -> C0 -> C0 -> C0 E0..EF -> F0 -> F0
- A0..BF -> A0 -> A0 -> A0 D8..DF -> 00 -> 00
- 80..9F -> 80 -> 80 -> 80 D0..D7 -> 00 -> 00
- 60..7F -> 60 -> 60 -> 60
- 40..5F -> 40 -> pc -> pc 40..5F -> pc
- 20..3F -> 20 -> 20 -> 20
- 00..1F -> 00 -> 00 -> 00
-
-The default location of IO peripherals is above 0xf0000000. This may be changed
-using a "ranges" property in a device tree simple-bus node. See the Devicetree
-Specification, section 4.5 for details on the syntax and semantics of
-simple-bus nodes. The following limitations apply:
-
-1. Only top level simple-bus nodes are considered
-
-2. Only one (first) simple-bus node is considered
-
-3. Empty "ranges" properties are not supported
-
-4. Only the first triplet in the "ranges" property is considered
-
-5. The parent-bus-address value is rounded down to the nearest 256MB boundary
-
-6. The IO area covers the entire 256MB segment of parent-bus-address; the
- "ranges" triplet length field is ignored
-
-
-MMUv3 address space layouts.
-============================
-
-Default MMUv2-compatible layout.
-
- Symbol VADDR Size
-+------------------+
-| Userspace | 0x00000000 TASK_SIZE
-+------------------+ 0x40000000
-+------------------+
-| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE
-+------------------+
-| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE
-+------------------+ 0x8e400000
-+------------------+
-| VMALLOC area | VMALLOC_START 0xc0000000 128MB - 64KB
-+------------------+ VMALLOC_END
-| Cache aliasing | TLBTEMP_BASE_1 0xc7ff0000 DCACHE_WAY_SIZE
-| remap area 1 |
-+------------------+
-| Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE
-| remap area 2 |
-+------------------+
-+------------------+
-| KMAP area | PKMAP_BASE PTRS_PER_PTE *
-| | DCACHE_N_COLORS *
-| | PAGE_SIZE
-| | (4MB * DCACHE_N_COLORS)
-+------------------+
-| Atomic KMAP area | FIXADDR_START KM_TYPE_NR *
-| | NR_CPUS *
-| | DCACHE_N_COLORS *
-| | PAGE_SIZE
-+------------------+ FIXADDR_TOP 0xcffff000
-+------------------+
-| Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xd0000000 128MB
-+------------------+
-| Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xd8000000 128MB
-+------------------+
-| Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB
-+------------------+
-| Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB
-+------------------+
-
-
-256MB cached + 256MB uncached layout.
-
- Symbol VADDR Size
-+------------------+
-| Userspace | 0x00000000 TASK_SIZE
-+------------------+ 0x40000000
-+------------------+
-| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE
-+------------------+
-| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE
-+------------------+ 0x8e400000
-+------------------+
-| VMALLOC area | VMALLOC_START 0xa0000000 128MB - 64KB
-+------------------+ VMALLOC_END
-| Cache aliasing | TLBTEMP_BASE_1 0xa7ff0000 DCACHE_WAY_SIZE
-| remap area 1 |
-+------------------+
-| Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE
-| remap area 2 |
-+------------------+
-+------------------+
-| KMAP area | PKMAP_BASE PTRS_PER_PTE *
-| | DCACHE_N_COLORS *
-| | PAGE_SIZE
-| | (4MB * DCACHE_N_COLORS)
-+------------------+
-| Atomic KMAP area | FIXADDR_START KM_TYPE_NR *
-| | NR_CPUS *
-| | DCACHE_N_COLORS *
-| | PAGE_SIZE
-+------------------+ FIXADDR_TOP 0xaffff000
-+------------------+
-| Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xb0000000 256MB
-+------------------+
-| Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 256MB
-+------------------+
-+------------------+
-| Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB
-+------------------+
-| Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB
-+------------------+
-
-
-512MB cached + 512MB uncached layout.
-
- Symbol VADDR Size
-+------------------+
-| Userspace | 0x00000000 TASK_SIZE
-+------------------+ 0x40000000
-+------------------+
-| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE
-+------------------+
-| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE
-+------------------+ 0x8e400000
-+------------------+
-| VMALLOC area | VMALLOC_START 0x90000000 128MB - 64KB
-+------------------+ VMALLOC_END
-| Cache aliasing | TLBTEMP_BASE_1 0x97ff0000 DCACHE_WAY_SIZE
-| remap area 1 |
-+------------------+
-| Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE
-| remap area 2 |
-+------------------+
-+------------------+
-| KMAP area | PKMAP_BASE PTRS_PER_PTE *
-| | DCACHE_N_COLORS *
-| | PAGE_SIZE
-| | (4MB * DCACHE_N_COLORS)
-+------------------+
-| Atomic KMAP area | FIXADDR_START KM_TYPE_NR *
-| | NR_CPUS *
-| | DCACHE_N_COLORS *
-| | PAGE_SIZE
-+------------------+ FIXADDR_TOP 0x9ffff000
-+------------------+
-| Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xa0000000 512MB
-+------------------+
-| Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 512MB
-+------------------+
-| Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB
-+------------------+
-| Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB
-+------------------+