summaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>2024-11-05 09:36:29 +0100
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2024-11-05 09:36:29 +0100
commit9365f0de4303f82ed4c2db1c39d3de824b249d80 (patch)
tree2429440aa61c31608557adae2d8fd9228dee7622 /Documentation
parentf36ee841165b2234db8346eb8d5381626e5ab524 (diff)
parent59b723cd2adbac2a34fc8e12c74ae26ae45bf230 (diff)
downloadlinux-stable-9365f0de4303f82ed4c2db1c39d3de824b249d80.tar.gz
linux-stable-9365f0de4303f82ed4c2db1c39d3de824b249d80.tar.bz2
linux-stable-9365f0de4303f82ed4c2db1c39d3de824b249d80.zip
Merge 6.12-rc6 into char-misc-next
We need the char/misc/iio fixes in here as well. Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/admin-guide/pm/cpufreq.rst20
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml24
-rw-r--r--Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml19
-rw-r--r--Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml21
-rw-r--r--Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml5
-rw-r--r--Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml18
-rw-r--r--Documentation/devicetree/bindings/sound/rockchip,rk3308-codec.yaml4
-rw-r--r--Documentation/filesystems/caching/cachefiles.rst2
-rw-r--r--Documentation/filesystems/netfs_library.rst1
-rw-r--r--Documentation/iio/ad7380.rst13
-rw-r--r--Documentation/networking/packet_mmap.rst5
-rw-r--r--Documentation/rust/arch-support.rst2
-rw-r--r--Documentation/userspace-api/mseal.rst307
-rw-r--r--Documentation/virt/kvm/api.rst16
-rw-r--r--Documentation/virt/kvm/locking.rst2
15 files changed, 250 insertions, 209 deletions
diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst
index fe1be4ad88cb..a21369eba034 100644
--- a/Documentation/admin-guide/pm/cpufreq.rst
+++ b/Documentation/admin-guide/pm/cpufreq.rst
@@ -425,8 +425,8 @@ This governor exposes only one tunable:
``rate_limit_us``
Minimum time (in microseconds) that has to pass between two consecutive
- runs of governor computations (default: 1000 times the scaling driver's
- transition latency).
+ runs of governor computations (default: 1.5 times the scaling driver's
+ transition latency or the maximum 2ms).
The purpose of this tunable is to reduce the scheduler context overhead
of the governor which might be excessive without it.
@@ -474,17 +474,17 @@ This governor exposes the following tunables:
This is how often the governor's worker routine should run, in
microseconds.
- Typically, it is set to values of the order of 10000 (10 ms). Its
- default value is equal to the value of ``cpuinfo_transition_latency``
- for each policy this governor is attached to (but since the unit here
- is greater by 1000, this means that the time represented by
- ``sampling_rate`` is 1000 times greater than the transition latency by
- default).
+ Typically, it is set to values of the order of 2000 (2 ms). Its
+ default value is to add a 50% breathing room
+ to ``cpuinfo_transition_latency`` on each policy this governor is
+ attached to. The minimum is typically the length of two scheduler
+ ticks.
If this tunable is per-policy, the following shell command sets the time
- represented by it to be 750 times as high as the transition latency::
+ represented by it to be 1.5 times as high as the transition latency
+ (the default)::
- # echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate
+ # echo `$(($(cat cpuinfo_transition_latency) * 3 / 2)) > ondemand/sampling_rate
``up_threshold``
If the estimated CPU load is above this value (in percent), the governor
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
index 3a82aec9021c..497c0eb4ed0b 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,dpi.yaml
@@ -63,6 +63,16 @@ properties:
- const: sleep
power-domains:
+ description: |
+ The MediaTek DPI module is typically associated with one of the
+ following multimedia power domains:
+ POWER_DOMAIN_DISPLAY
+ POWER_DOMAIN_VDOSYS
+ POWER_DOMAIN_MM
+ The specific power domain used varies depending on the SoC design.
+
+ It is recommended to explicitly add the appropriate power domain
+ property to the DPI node in the device tree.
maxItems: 1
port:
@@ -79,20 +89,6 @@ required:
- clock-names
- port
-allOf:
- - if:
- not:
- properties:
- compatible:
- contains:
- enum:
- - mediatek,mt6795-dpi
- - mediatek,mt8173-dpi
- - mediatek,mt8186-dpi
- then:
- properties:
- power-domains: false
-
additionalProperties: false
examples:
diff --git a/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml b/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
index e4affc854f3d..4b6ff546757e 100644
--- a/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
+++ b/Documentation/devicetree/bindings/display/mediatek/mediatek,split.yaml
@@ -38,6 +38,7 @@ properties:
description: A phandle and PM domain specifier as defined by bindings of
the power controller specified by phandle. See
Documentation/devicetree/bindings/power/power-domain.yaml for details.
+ maxItems: 1
mediatek,gce-client-reg:
description:
@@ -57,6 +58,9 @@ properties:
clocks:
items:
- description: SPLIT Clock
+ - description: Used for interfacing with the HDMI RX signal source.
+ - description: Paired with receiving HDMI RX metadata.
+ minItems: 1
required:
- compatible
@@ -72,9 +76,24 @@ allOf:
const: mediatek,mt8195-mdp3-split
then:
+ properties:
+ clocks:
+ minItems: 3
+
required:
- mediatek,gce-client-reg
+ - if:
+ properties:
+ compatible:
+ contains:
+ const: mediatek,mt8173-disp-split
+
+ then:
+ properties:
+ clocks:
+ maxItems: 1
+
additionalProperties: false
examples:
diff --git a/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml b/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
index bd19abb867d9..0065d6508824 100644
--- a/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
+++ b/Documentation/devicetree/bindings/iio/adc/adi,ad7380.yaml
@@ -67,6 +67,10 @@ properties:
A 2.5V to 3.3V supply for the external reference voltage. When omitted,
the internal 2.5V reference is used.
+ refin-supply:
+ description:
+ A 2.5V to 3.3V supply for external reference voltage, for ad7380-4 only.
+
aina-supply:
description:
The common mode voltage supply for the AINA- pin on pseudo-differential
@@ -135,6 +139,23 @@ allOf:
ainc-supply: false
aind-supply: false
+ # ad7380-4 uses refin-supply as external reference.
+ # All other chips from ad738x family use refio as optional external reference.
+ # When refio-supply is omitted, internal reference is used.
+ - if:
+ properties:
+ compatible:
+ enum:
+ - adi,ad7380-4
+ then:
+ properties:
+ refio-supply: false
+ required:
+ - refin-supply
+ else:
+ properties:
+ refin-supply: false
+
examples:
- |
#include <dt-bindings/interrupt-controller/irq.h>
diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
index dcf4fa55fbba..380a9222a51d 100644
--- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
+++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
@@ -154,8 +154,6 @@ allOf:
- qcom,sm8550-qmp-gen4x2-pcie-phy
- qcom,sm8650-qmp-gen3x2-pcie-phy
- qcom,sm8650-qmp-gen4x2-pcie-phy
- - qcom,x1e80100-qmp-gen3x2-pcie-phy
- - qcom,x1e80100-qmp-gen4x2-pcie-phy
then:
properties:
clocks:
@@ -171,6 +169,8 @@ allOf:
- qcom,sc8280xp-qmp-gen3x1-pcie-phy
- qcom,sc8280xp-qmp-gen3x2-pcie-phy
- qcom,sc8280xp-qmp-gen3x4-pcie-phy
+ - qcom,x1e80100-qmp-gen3x2-pcie-phy
+ - qcom,x1e80100-qmp-gen4x2-pcie-phy
- qcom,x1e80100-qmp-gen4x4-pcie-phy
then:
properties:
@@ -201,6 +201,7 @@ allOf:
- qcom,sm8550-qmp-gen4x2-pcie-phy
- qcom,sm8650-qmp-gen4x2-pcie-phy
- qcom,x1e80100-qmp-gen4x2-pcie-phy
+ - qcom,x1e80100-qmp-gen4x4-pcie-phy
then:
properties:
resets:
diff --git a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml
index ab3206ffa4af..beef193aaaeb 100644
--- a/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml
+++ b/Documentation/devicetree/bindings/sound/davinci-mcasp-audio.yaml
@@ -102,21 +102,21 @@ properties:
default: 2
interrupts:
- oneOf:
- - minItems: 1
- items:
- - description: TX interrupt
- - description: RX interrupt
- - items:
- - description: common/combined interrupt
+ minItems: 1
+ maxItems: 2
interrupt-names:
oneOf:
- - minItems: 1
+ - description: TX interrupt
+ const: tx
+ - description: RX interrupt
+ const: rx
+ - description: TX and RX interrupts
items:
- const: tx
- const: rx
- - const: common
+ - description: Common/combined interrupt
+ const: common
fck_parent:
$ref: /schemas/types.yaml#/definitions/string
diff --git a/Documentation/devicetree/bindings/sound/rockchip,rk3308-codec.yaml b/Documentation/devicetree/bindings/sound/rockchip,rk3308-codec.yaml
index ecf3d7d968c8..2cf229a076f0 100644
--- a/Documentation/devicetree/bindings/sound/rockchip,rk3308-codec.yaml
+++ b/Documentation/devicetree/bindings/sound/rockchip,rk3308-codec.yaml
@@ -48,6 +48,10 @@ properties:
- const: mclk_rx
- const: hclk
+ port:
+ $ref: audio-graph-port.yaml#
+ unevaluatedProperties: false
+
resets:
maxItems: 1
diff --git a/Documentation/filesystems/caching/cachefiles.rst b/Documentation/filesystems/caching/cachefiles.rst
index e04a27bdbe19..b3ccc782cb3b 100644
--- a/Documentation/filesystems/caching/cachefiles.rst
+++ b/Documentation/filesystems/caching/cachefiles.rst
@@ -115,7 +115,7 @@ set up cache ready for use. The following script commands are available:
This mask can also be set through sysfs, eg::
- echo 5 >/sys/modules/cachefiles/parameters/debug
+ echo 5 > /sys/module/cachefiles/parameters/debug
Starting the Cache
diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst
index f0d2cb257bb8..73f0bfd7e903 100644
--- a/Documentation/filesystems/netfs_library.rst
+++ b/Documentation/filesystems/netfs_library.rst
@@ -592,4 +592,3 @@ API Function Reference
.. kernel-doc:: include/linux/netfs.h
.. kernel-doc:: fs/netfs/buffered_read.c
-.. kernel-doc:: fs/netfs/io.c
diff --git a/Documentation/iio/ad7380.rst b/Documentation/iio/ad7380.rst
index 9c784c1e652e..6f70b49b9ef2 100644
--- a/Documentation/iio/ad7380.rst
+++ b/Documentation/iio/ad7380.rst
@@ -41,13 +41,22 @@ supports only 1 SDO line.
Reference voltage
-----------------
-2 possible reference voltage sources are supported:
+ad7380-4
+~~~~~~~~
+
+ad7380-4 supports only an external reference voltage (2.5V to 3.3V). It must be
+declared in the device tree as ``refin-supply``.
+
+All other devices from ad738x family
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All other devices from ad738x support 2 possible reference voltage sources:
- Internal reference (2.5V)
- External reference (2.5V to 3.3V)
The source is determined by the device tree. If ``refio-supply`` is present,
-then the external reference is used, else the internal reference is used.
+then it is used as external reference, else the internal reference is used.
Oversampling and resolution boost
---------------------------------
diff --git a/Documentation/networking/packet_mmap.rst b/Documentation/networking/packet_mmap.rst
index dca15d15feaf..02370786e77b 100644
--- a/Documentation/networking/packet_mmap.rst
+++ b/Documentation/networking/packet_mmap.rst
@@ -16,7 +16,7 @@ ii) transmit network traffic, or any other that needs raw
Howto can be found at:
- https://sites.google.com/site/packetmmap/
+ https://web.archive.org/web/20220404160947/https://sites.google.com/site/packetmmap/
Please send your comments to
- Ulisses Alonso Camaró <uaca@i.hate.spam.alumni.uv.es>
@@ -166,7 +166,8 @@ As capture, each frame contains two parts::
/* bind socket to eth0 */
bind(this->socket, (struct sockaddr *)&my_addr, sizeof(struct sockaddr_ll));
- A complete tutorial is available at: https://sites.google.com/site/packetmmap/
+ A complete tutorial is available at:
+ https://web.archive.org/web/20220404160947/https://sites.google.com/site/packetmmap/
By default, the user should put data at::
diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst
index 750ff371570a..54be7ddf3e57 100644
--- a/Documentation/rust/arch-support.rst
+++ b/Documentation/rust/arch-support.rst
@@ -17,7 +17,7 @@ Architecture Level of support Constraints
============= ================ ==============================================
``arm64`` Maintained Little Endian only.
``loongarch`` Maintained \-
-``riscv`` Maintained ``riscv64`` only.
+``riscv`` Maintained ``riscv64`` and LLVM/Clang only.
``um`` Maintained \-
``x86`` Maintained ``x86_64`` only.
============= ================ ==============================================
diff --git a/Documentation/userspace-api/mseal.rst b/Documentation/userspace-api/mseal.rst
index 4132eec995a3..41102f74c5e2 100644
--- a/Documentation/userspace-api/mseal.rst
+++ b/Documentation/userspace-api/mseal.rst
@@ -23,177 +23,166 @@ applications can additionally seal security critical data at runtime.
A similar feature already exists in the XNU kernel with the
VM_FLAGS_PERMANENT flag [1] and on OpenBSD with the mimmutable syscall [2].
-User API
-========
-mseal()
------------
-The mseal() syscall has the following signature:
-
-``int mseal(void addr, size_t len, unsigned long flags)``
-
-**addr/len**: virtual memory address range.
-
-The address range set by ``addr``/``len`` must meet:
- - The start address must be in an allocated VMA.
- - The start address must be page aligned.
- - The end address (``addr`` + ``len``) must be in an allocated VMA.
- - no gap (unallocated memory) between start and end address.
-
-The ``len`` will be paged aligned implicitly by the kernel.
-
-**flags**: reserved for future use.
-
-**return values**:
-
-- ``0``: Success.
-
-- ``-EINVAL``:
- - Invalid input ``flags``.
- - The start address (``addr``) is not page aligned.
- - Address range (``addr`` + ``len``) overflow.
-
-- ``-ENOMEM``:
- - The start address (``addr``) is not allocated.
- - The end address (``addr`` + ``len``) is not allocated.
- - A gap (unallocated memory) between start and end address.
-
-- ``-EPERM``:
- - sealing is supported only on 64-bit CPUs, 32-bit is not supported.
-
-- For above error cases, users can expect the given memory range is
- unmodified, i.e. no partial update.
-
-- There might be other internal errors/cases not listed here, e.g.
- error during merging/splitting VMAs, or the process reaching the max
- number of supported VMAs. In those cases, partial updates to the given
- memory range could happen. However, those cases should be rare.
-
-**Blocked operations after sealing**:
- Unmapping, moving to another location, and shrinking the size,
- via munmap() and mremap(), can leave an empty space, therefore
- can be replaced with a VMA with a new set of attributes.
-
- Moving or expanding a different VMA into the current location,
- via mremap().
-
- Modifying a VMA via mmap(MAP_FIXED).
-
- Size expansion, via mremap(), does not appear to pose any
- specific risks to sealed VMAs. It is included anyway because
- the use case is unclear. In any case, users can rely on
- merging to expand a sealed VMA.
-
- mprotect() and pkey_mprotect().
-
- Some destructive madvice() behaviors (e.g. MADV_DONTNEED)
- for anonymous memory, when users don't have write permission to the
- memory. Those behaviors can alter region contents by discarding pages,
- effectively a memset(0) for anonymous memory.
-
- Kernel will return -EPERM for blocked operations.
-
- For blocked operations, one can expect the given address is unmodified,
- i.e. no partial update. Note, this is different from existing mm
- system call behaviors, where partial updates are made till an error is
- found and returned to userspace. To give an example:
-
- Assume following code sequence:
-
- - ptr = mmap(null, 8192, PROT_NONE);
- - munmap(ptr + 4096, 4096);
- - ret1 = mprotect(ptr, 8192, PROT_READ);
- - mseal(ptr, 4096);
- - ret2 = mprotect(ptr, 8192, PROT_NONE);
-
- ret1 will be -ENOMEM, the page from ptr is updated to PROT_READ.
-
- ret2 will be -EPERM, the page remains to be PROT_READ.
-
-**Note**:
-
-- mseal() only works on 64-bit CPUs, not 32-bit CPU.
-
-- users can call mseal() multiple times, mseal() on an already sealed memory
- is a no-action (not error).
-
-- munseal() is not supported.
-
-Use cases:
-==========
+SYSCALL
+=======
+mseal syscall signature
+-----------------------
+ ``int mseal(void \* addr, size_t len, unsigned long flags)``
+
+ **addr**/**len**: virtual memory address range.
+ The address range set by **addr**/**len** must meet:
+ - The start address must be in an allocated VMA.
+ - The start address must be page aligned.
+ - The end address (**addr** + **len**) must be in an allocated VMA.
+ - no gap (unallocated memory) between start and end address.
+
+ The ``len`` will be paged aligned implicitly by the kernel.
+
+ **flags**: reserved for future use.
+
+ **Return values**:
+ - **0**: Success.
+ - **-EINVAL**:
+ * Invalid input ``flags``.
+ * The start address (``addr``) is not page aligned.
+ * Address range (``addr`` + ``len``) overflow.
+ - **-ENOMEM**:
+ * The start address (``addr``) is not allocated.
+ * The end address (``addr`` + ``len``) is not allocated.
+ * A gap (unallocated memory) between start and end address.
+ - **-EPERM**:
+ * sealing is supported only on 64-bit CPUs, 32-bit is not supported.
+
+ **Note about error return**:
+ - For above error cases, users can expect the given memory range is
+ unmodified, i.e. no partial update.
+ - There might be other internal errors/cases not listed here, e.g.
+ error during merging/splitting VMAs, or the process reaching the maximum
+ number of supported VMAs. In those cases, partial updates to the given
+ memory range could happen. However, those cases should be rare.
+
+ **Architecture support**:
+ mseal only works on 64-bit CPUs, not 32-bit CPUs.
+
+ **Idempotent**:
+ users can call mseal multiple times. mseal on an already sealed memory
+ is a no-action (not error).
+
+ **no munseal**
+ Once mapping is sealed, it can't be unsealed. The kernel should never
+ have munseal, this is consistent with other sealing feature, e.g.
+ F_SEAL_SEAL for file.
+
+Blocked mm syscall for sealed mapping
+-------------------------------------
+ It might be important to note: **once the mapping is sealed, it will
+ stay in the process's memory until the process terminates**.
+
+ Example::
+
+ *ptr = mmap(0, 4096, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+ rc = mseal(ptr, 4096, 0);
+ /* munmap will fail */
+ rc = munmap(ptr, 4096);
+ assert(rc < 0);
+
+ Blocked mm syscall:
+ - munmap
+ - mmap
+ - mremap
+ - mprotect and pkey_mprotect
+ - some destructive madvise behaviors: MADV_DONTNEED, MADV_FREE,
+ MADV_DONTNEED_LOCKED, MADV_FREE, MADV_DONTFORK, MADV_WIPEONFORK
+
+ The first set of syscalls to block is munmap, mremap, mmap. They can
+ either leave an empty space in the address space, therefore allowing
+ replacement with a new mapping with new set of attributes, or can
+ overwrite the existing mapping with another mapping.
+
+ mprotect and pkey_mprotect are blocked because they changes the
+ protection bits (RWX) of the mapping.
+
+ Certain destructive madvise behaviors, specifically MADV_DONTNEED,
+ MADV_FREE, MADV_DONTNEED_LOCKED, and MADV_WIPEONFORK, can introduce
+ risks when applied to anonymous memory by threads lacking write
+ permissions. Consequently, these operations are prohibited under such
+ conditions. The aforementioned behaviors have the potential to modify
+ region contents by discarding pages, effectively performing a memset(0)
+ operation on the anonymous memory.
+
+ Kernel will return -EPERM for blocked syscalls.
+
+ When blocked syscall return -EPERM due to sealing, the memory regions may
+ or may not be changed, depends on the syscall being blocked:
+
+ - munmap: munmap is atomic. If one of VMAs in the given range is
+ sealed, none of VMAs are updated.
+ - mprotect, pkey_mprotect, madvise: partial update might happen, e.g.
+ when mprotect over multiple VMAs, mprotect might update the beginning
+ VMAs before reaching the sealed VMA and return -EPERM.
+ - mmap and mremap: undefined behavior.
+
+Use cases
+=========
- glibc:
The dynamic linker, during loading ELF executables, can apply sealing to
- non-writable memory segments.
-
-- Chrome browser: protect some security sensitive data-structures.
+ mapping segments.
-Notes on which memory to seal:
-==============================
+- Chrome browser: protect some security sensitive data structures.
-It might be important to note that sealing changes the lifetime of a mapping,
-i.e. the sealed mapping won’t be unmapped till the process terminates or the
-exec system call is invoked. Applications can apply sealing to any virtual
-memory region from userspace, but it is crucial to thoroughly analyze the
-mapping's lifetime prior to apply the sealing.
+When not to use mseal
+=====================
+Applications can apply sealing to any virtual memory region from userspace,
+but it is *crucial to thoroughly analyze the mapping's lifetime* prior to
+apply the sealing. This is because the sealed mapping *won’t be unmapped*
+until the process terminates or the exec system call is invoked.
For example:
+ - aio/shm
+ aio/shm can call mmap and munmap on behalf of userspace, e.g.
+ ksys_shmdt() in shm.c. The lifetimes of those mapping are not tied to
+ the lifetime of the process. If those memories are sealed from userspace,
+ then munmap will fail, causing leaks in VMA address space during the
+ lifetime of the process.
+
+ - ptr allocated by malloc (heap)
+ Don't use mseal on the memory ptr return from malloc().
+ malloc() is implemented by allocator, e.g. by glibc. Heap manager might
+ allocate a ptr from brk or mapping created by mmap.
+ If an app calls mseal on a ptr returned from malloc(), this can affect
+ the heap manager's ability to manage the mappings; the outcome is
+ non-deterministic.
+
+ Example::
+
+ ptr = malloc(size);
+ /* don't call mseal on ptr return from malloc. */
+ mseal(ptr, size);
+ /* free will success, allocator can't shrink heap lower than ptr */
+ free(ptr);
+
+mseal doesn't block
+===================
+In a nutshell, mseal blocks certain mm syscall from modifying some of VMA's
+attributes, such as protection bits (RWX). Sealed mappings doesn't mean the
+memory is immutable.
-- aio/shm
-
- aio/shm can call mmap()/munmap() on behalf of userspace, e.g. ksys_shmdt() in
- shm.c. The lifetime of those mapping are not tied to the lifetime of the
- process. If those memories are sealed from userspace, then munmap() will fail,
- causing leaks in VMA address space during the lifetime of the process.
-
-- Brk (heap)
-
- Currently, userspace applications can seal parts of the heap by calling
- malloc() and mseal().
- let's assume following calls from user space:
-
- - ptr = malloc(size);
- - mprotect(ptr, size, RO);
- - mseal(ptr, size);
- - free(ptr);
-
- Technically, before mseal() is added, the user can change the protection of
- the heap by calling mprotect(RO). As long as the user changes the protection
- back to RW before free(), the memory range can be reused.
-
- Adding mseal() into the picture, however, the heap is then sealed partially,
- the user can still free it, but the memory remains to be RO. If the address
- is re-used by the heap manager for another malloc, the process might crash
- soon after. Therefore, it is important not to apply sealing to any memory
- that might get recycled.
-
- Furthermore, even if the application never calls the free() for the ptr,
- the heap manager may invoke the brk system call to shrink the size of the
- heap. In the kernel, the brk-shrink will call munmap(). Consequently,
- depending on the location of the ptr, the outcome of brk-shrink is
- nondeterministic.
-
-
-Additional notes:
-=================
As Jann Horn pointed out in [3], there are still a few ways to write
-to RO memory, which is, in a way, by design. Those cases are not covered
-by mseal(). If applications want to block such cases, sandbox tools (such as
-seccomp, LSM, etc) might be considered.
+to RO memory, which is, in a way, by design. And those could be blocked
+by different security measures.
Those cases are:
-- Write to read-only memory through /proc/self/mem interface.
-- Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
-- userfaultfd.
+ - Write to read-only memory through /proc/self/mem interface (FOLL_FORCE).
+ - Write to read-only memory through ptrace (such as PTRACE_POKETEXT).
+ - userfaultfd.
The idea that inspired this patch comes from Stephen Röttger’s work in V8
CFI [4]. Chrome browser in ChromeOS will be the first user of this API.
-Reference:
-==========
-[1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
-
-[2] https://man.openbsd.org/mimmutable.2
-
-[3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
-
-[4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc
+Reference
+=========
+- [1] https://github.com/apple-oss-distributions/xnu/blob/1031c584a5e37aff177559b9f69dbd3c8c3fd30a/osfmk/mach/vm_statistics.h#L274
+- [2] https://man.openbsd.org/mimmutable.2
+- [3] https://lore.kernel.org/lkml/CAG48ez3ShUYey+ZAFsU2i1RpQn0a5eOs2hzQ426FkcgnfUGLvA@mail.gmail.com
+- [4] https://docs.google.com/document/d/1O2jwK4dxI3nRcOJuPYkonhTkNQfbmwdvxQMyXgeaRHo/edit#heading=h.bvaojj9fu6hc
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index e32471977d0a..edc070c6e19b 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -8098,13 +8098,15 @@ KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS By default, KVM emulates MONITOR/MWAIT (if
KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT is
disabled.
-KVM_X86_QUIRK_SLOT_ZAP_ALL By default, KVM invalidates all SPTEs in
- fast way for memslot deletion when VM type
- is KVM_X86_DEFAULT_VM.
- When this quirk is disabled or when VM type
- is other than KVM_X86_DEFAULT_VM, KVM zaps
- only leaf SPTEs that are within the range of
- the memslot being deleted.
+KVM_X86_QUIRK_SLOT_ZAP_ALL By default, for KVM_X86_DEFAULT_VM VMs, KVM
+ invalidates all SPTEs in all memslots and
+ address spaces when a memslot is deleted or
+ moved. When this quirk is disabled (or the
+ VM type isn't KVM_X86_DEFAULT_VM), KVM only
+ ensures the backing memory of the deleted
+ or moved memslot isn't reachable, i.e KVM
+ _may_ invalidate only SPTEs related to the
+ memslot.
=================================== ============================================
7.32 KVM_CAP_MAX_VCPU_ID
diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst
index 20a9a37d1cdd..1bedd56e2fe3 100644
--- a/Documentation/virt/kvm/locking.rst
+++ b/Documentation/virt/kvm/locking.rst
@@ -136,7 +136,7 @@ For direct sp, we can easily avoid it since the spte of direct sp is fixed
to gfn. For indirect sp, we disabled fast page fault for simplicity.
A solution for indirect sp could be to pin the gfn, for example via
-kvm_vcpu_gfn_to_pfn_atomic, before the cmpxchg. After the pinning:
+gfn_to_pfn_memslot_atomic, before the cmpxchg. After the pinning:
- We have held the refcount of pfn; that means the pfn can not be freed and
be reused for another gfn.