diff --git a/.clang-format b/.clang-format index 196ca317bd1f24ad57ad9ded549bc0b7994d8111..e92e6dd1780d0b2eeebf34d8cbac5bc18bbc8204 100644 --- a/.clang-format +++ b/.clang-format @@ -86,6 +86,8 @@ ForEachMacros: - 'bio_for_each_segment_all' - 'bio_list_for_each' - 'bip_for_each_vec' + - 'bitmap_for_each_clear_region' + - 'bitmap_for_each_set_region' - 'blkg_for_each_descendant_post' - 'blkg_for_each_descendant_pre' - 'blk_queue_for_each_rl' @@ -115,6 +117,7 @@ ForEachMacros: - 'drm_client_for_each_connector_iter' - 'drm_client_for_each_modeset' - 'drm_connector_for_each_possible_encoder' + - 'drm_for_each_bridge_in_chain' - 'drm_for_each_connector_iter' - 'drm_for_each_crtc' - 'drm_for_each_encoder' @@ -136,12 +139,16 @@ ForEachMacros: - 'for_each_bio' - 'for_each_board_func_rsrc' - 'for_each_bvec' + - 'for_each_card_auxs' + - 'for_each_card_auxs_safe' - 'for_each_card_components' - - 'for_each_card_links' - - 'for_each_card_links_safe' + - 'for_each_card_dapms' + - 'for_each_card_pre_auxs' - 'for_each_card_prelinks' - 'for_each_card_rtds' - 'for_each_card_rtds_safe' + - 'for_each_card_widgets' + - 'for_each_card_widgets_safe' - 'for_each_cgroup_storage_type' - 'for_each_child_of_node' - 'for_each_clear_bit' @@ -156,6 +163,7 @@ ForEachMacros: - 'for_each_cpu_and' - 'for_each_cpu_not' - 'for_each_cpu_wrap' + - 'for_each_dapm_widgets' - 'for_each_dev_addr' - 'for_each_dev_scope' - 'for_each_displayid_db' @@ -186,10 +194,12 @@ ForEachMacros: - 'for_each_ip_tunnel_rcu' - 'for_each_irq_nr' - 'for_each_link_codecs' + - 'for_each_link_cpus' - 'for_each_link_platforms' - 'for_each_lru' - 'for_each_matching_node' - 'for_each_matching_node_and_match' + - 'for_each_member' - 'for_each_memblock' - 'for_each_memblock_type' - 'for_each_memcg_cache_index' @@ -200,9 +210,11 @@ ForEachMacros: - 'for_each_msi_entry' - 'for_each_msi_entry_safe' - 'for_each_net' + - 'for_each_net_continue_reverse' - 'for_each_netdev' - 'for_each_netdev_continue' - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_continue_reverse' - 'for_each_netdev_feature' - 'for_each_netdev_in_bond_rcu' - 'for_each_netdev_rcu' @@ -242,6 +254,7 @@ ForEachMacros: - 'for_each_pci_bridge' - 'for_each_pci_dev' - 'for_each_pci_msi_entry' + - 'for_each_pcm_streams' - 'for_each_populated_zone' - 'for_each_possible_cpu' - 'for_each_present_cpu' @@ -252,12 +265,15 @@ ForEachMacros: - 'for_each_property_of_node' - 'for_each_registered_fb' - 'for_each_reserved_mem_region' - - 'for_each_rtd_codec_dai' - - 'for_each_rtd_codec_dai_rollback' - - 'for_each_rtdcom' - - 'for_each_rtdcom_safe' + - 'for_each_rtd_codec_dais' + - 'for_each_rtd_codec_dais_rollback' + - 'for_each_rtd_components' + - 'for_each_rtd_cpu_dais' + - 'for_each_rtd_cpu_dais_rollback' + - 'for_each_rtd_dais' - 'for_each_set_bit' - 'for_each_set_bit_from' + - 'for_each_set_clump8' - 'for_each_sg' - 'for_each_sg_dma_page' - 'for_each_sg_page' @@ -267,6 +283,7 @@ ForEachMacros: - 'for_each_subelement_id' - '__for_each_thread' - 'for_each_thread' + - 'for_each_wakeup_source' - 'for_each_zone' - 'for_each_zone_zonelist' - 'for_each_zone_zonelist_nodemask' @@ -325,11 +342,13 @@ ForEachMacros: - 'klp_for_each_object' - 'klp_for_each_object_safe' - 'klp_for_each_object_static' + - 'kunit_suite_for_each_test_case' - 'kvm_for_each_memslot' - 'kvm_for_each_vcpu' - 'list_for_each' - 'list_for_each_codec' - 'list_for_each_codec_safe' + - 'list_for_each_continue' - 'list_for_each_entry' - 'list_for_each_entry_continue' - 'list_for_each_entry_continue_rcu' @@ -351,6 +370,7 @@ ForEachMacros: - 'llist_for_each_entry' - 'llist_for_each_entry_safe' - 'llist_for_each_safe' + - 'mci_for_each_dimm' - 'media_device_for_each_entity' - 'media_device_for_each_intf' - 'media_device_for_each_link' @@ -376,6 +396,7 @@ ForEachMacros: - 'of_property_for_each_string' - 'of_property_for_each_u32' - 'pci_bus_for_each_resource' + - 'pcm_for_each_format' - 'ping_portaddr_for_each_entry' - 'plist_for_each' - 'plist_for_each_continue' @@ -444,10 +465,16 @@ ForEachMacros: - 'virtio_device_for_each_vq' - 'xa_for_each' - 'xa_for_each_marked' + - 'xa_for_each_range' - 'xa_for_each_start' - 'xas_for_each' - 'xas_for_each_conflict' - 'xas_for_each_marked' + - 'xbc_array_for_each_value' + - 'xbc_for_each_key_value' + - 'xbc_node_for_each_array_value' + - 'xbc_node_for_each_child' + - 'xbc_node_for_each_key_value' - 'zorro_for_each_dev' #IncludeBlocks: Preserve # Unknown to clang-format-5.0 @@ -465,7 +492,7 @@ KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 -NamespaceIndentation: Inner +NamespaceIndentation: None #ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 ObjCBlockIndentWidth: 8 ObjCSpaceAfterProperty: true diff --git a/.gitignore b/.gitignore index 72ef86a5570d28015d0ccb95ccd212bf8820c1c2..2258e906f01c84f36c6cf5de2a3e76f2877b0cc0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only # # NOTE! Don't add files that are generated in specific # subdirectories here. Add them in the ".gitignore" file diff --git a/.mailmap b/.mailmap index d9d5c80252f9cf2d2f140d97fb86a87c1ecf4b22..4f906b4e9785f7628ff5fe79fd4da3654ffc7bdc 100644 --- a/.mailmap +++ b/.mailmap @@ -18,6 +18,7 @@ Aleksey Gorelov Aleksandar Markovic Alex Shi Alex Shi +Alexandre Belloni Alexei Starovoitov Alexei Starovoitov Alexei Starovoitov @@ -27,6 +28,8 @@ Andi Shyti Andreas Herrmann Andrey Ryabinin Andrew Morton +Andrew Murray +Andrew Murray Andrew Vasquez Andy Adamson Antoine Tenart @@ -74,6 +77,7 @@ Dmitry Safonov <0x7f454c46@gmail.com> Domen Puncer Douglas Gilbert Ed L. Cashin +Erik Kaneda Evgeniy Polyakov Felipe W Damasio Felix Kuhling @@ -138,6 +142,7 @@ Juha Yrjola Juha Yrjola Juha Yrjola Julien Thierry +Kamil Konieczny Kay Sievers Kenneth W Chen Konstantin Khlebnikov @@ -205,10 +210,15 @@ Oleksij Rempel Oleksij Rempel Oleksij Rempel Oleksij Rempel +Pali Rohár Paolo 'Blaisorblade' Giarrusso Patrick Mochel Paul Burton Paul Burton +Paul E. McKenney +Paul E. McKenney +Paul E. McKenney +Paul E. McKenney Peter A Jonsson Peter Oruba Peter Oruba @@ -216,7 +226,9 @@ Pratyush Anand Praveen BP Punit Agrawal Qais Yousef +Quentin Monnet Quentin Perret +Rafael J. Wysocki Rajesh Shah Ralf Baechle Ralf Wildenhues @@ -233,9 +245,11 @@ Santosh Shilimkar Santosh Shilimkar Sascha Hauer S.Çağlar Onur +Sakari Ailus Sean Nyekjaer Sebastian Reichel Sebastian Reichel +Sedat Dilek Shiraz Hashim Shuah Khan Shuah Khan @@ -252,6 +266,7 @@ Sumit Semwal Tejun Heo Thomas Graf Thomas Pedersen +Tiezhu Yang Todor Tomov Tony Luck TripleX Chung @@ -273,6 +288,8 @@ Vladimir Davydov Vladimir Davydov Takashi YOSHII Will Deacon +Wolfram Sang +Wolfram Sang Yakir Yang Yusuke Goda Gustavo Padovan diff --git a/COPYING b/COPYING index da4cb28febe66172a9fdf1a235525ae6c00cde1d..a635a38ef9405fdfcfe97f3a435393c1e9cae971 100644 --- a/COPYING +++ b/COPYING @@ -16,3 +16,5 @@ In addition, other licenses may also apply. Please see: Documentation/process/license-rules.rst for more details. + +All contributions to the Linux Kernel are subject to this COPYING file. diff --git a/CREDITS b/CREDITS index 9602b0fa1c958da4b605127c2b8da0628efeb34e..032b5994f4760a13770d1629b8b057195d9266c2 100644 --- a/CREDITS +++ b/CREDITS @@ -567,6 +567,11 @@ D: Original author of Amiga FFS filesystem S: Orlando, Florida S: USA +N: Paul Burton +E: paulburton@kernel.org +W: https://pburton.com +D: MIPS maintainer 2018-2020 + N: Lennert Buytenhek E: kernel@wantstofly.org D: Original (2.4) rewrite of the ethernet bridging code @@ -3302,7 +3307,9 @@ S: France N: Aleksa Sarai E: cyphar@cyphar.com W: https://www.cyphar.com/ -D: `pids` cgroup subsystem +D: /sys/fs/cgroup/pids +D: openat2(2) +S: Sydney, Australia N: Dipankar Sarma E: dipankar@in.ibm.com diff --git a/Documentation/.gitignore b/Documentation/.gitignore index e74fec8693b29ce52094c0f5a055200db18e8570..d6dc7c9b8e25020f1f3b28811df2291c38695d5f 100644 --- a/Documentation/.gitignore +++ b/Documentation/.gitignore @@ -1,2 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only output *.pyc diff --git a/Documentation/ABI/obsolete/sysfs-kernel-fadump_enabled b/Documentation/ABI/obsolete/sysfs-kernel-fadump_enabled new file mode 100644 index 0000000000000000000000000000000000000000..e9c2de8b368892fc5684356b682a191ba67729c0 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-kernel-fadump_enabled @@ -0,0 +1,9 @@ +This ABI is renamed and moved to a new location /sys/kernel/fadump/enabled. + +What: /sys/kernel/fadump_enabled +Date: Feb 2012 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Primarily used to identify whether the FADump is enabled in + the kernel or not. +User: Kdump service diff --git a/Documentation/ABI/obsolete/sysfs-kernel-fadump_registered b/Documentation/ABI/obsolete/sysfs-kernel-fadump_registered new file mode 100644 index 0000000000000000000000000000000000000000..0360be39c98e9d809613dd856b3d5b7fda3d5b9b --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-kernel-fadump_registered @@ -0,0 +1,10 @@ +This ABI is renamed and moved to a new location /sys/kernel/fadump/registered.¬ + +What: /sys/kernel/fadump_registered +Date: Feb 2012 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + Helps to control the dump collect feature from userspace. + Setting 1 to this file enables the system to collect the + dump and 0 to disable it. +User: Kdump service diff --git a/Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem b/Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem new file mode 100644 index 0000000000000000000000000000000000000000..6ce0b129ab12db7eb84c5ed99ced0e48ca46dc27 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-kernel-fadump_release_mem @@ -0,0 +1,10 @@ +This ABI is renamed and moved to a new location /sys/kernel/fadump/release_mem.¬ + +What: /sys/kernel/fadump_release_mem +Date: Feb 2012 +Contact: linuxppc-dev@lists.ozlabs.org +Description: write only + This is a special sysfs file and only available when + the system is booted to capture the vmcore using FADump. + It is used to release the memory reserved by FADump to + save the crash dump. diff --git a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot b/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot new file mode 100644 index 0000000000000000000000000000000000000000..49ed9c8fd1e56dc553399983b2af61684ac6414a --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot @@ -0,0 +1,23 @@ +What: /sys/fs/selinux/checkreqprot +Date: April 2005 (predates git) +KernelVersion: 2.6.12-rc2 (predates git) +Contact: selinux@vger.kernel.org +Description: + + The selinuxfs "checkreqprot" node allows SELinux to be configured + to check the protection requested by userspace for mmap/mprotect + calls instead of the actual protection applied by the kernel. + This was a compatibility mechanism for legacy userspace and + for the READ_IMPLIES_EXEC personality flag. However, if set to + 1, it weakens security by allowing mappings to be made executable + without authorization by policy. The default value of checkreqprot + at boot was changed starting in Linux v4.4 to 0 (i.e. check the + actual protection), and Android and Linux distributions have been + explicitly writing a "0" to /sys/fs/selinux/checkreqprot during + initialization for some time. Support for setting checkreqprot to 1 + will be removed in a future kernel release, at which point the kernel + will always cease using checkreqprot internally and will always + check the actual protections being applied upon mmap/mprotect calls. + The checkreqprot selinuxfs node will remain for backward compatibility + but will discard writes of the "0" value and will reject writes of the + "1" value when this mechanism is removed. diff --git a/Documentation/ABI/obsolete/sysfs-selinux-disable b/Documentation/ABI/obsolete/sysfs-selinux-disable new file mode 100644 index 0000000000000000000000000000000000000000..c340278e3cf83705e85685926d1b06c46dc74a2c --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-selinux-disable @@ -0,0 +1,26 @@ +What: /sys/fs/selinux/disable +Date: April 2005 (predates git) +KernelVersion: 2.6.12-rc2 (predates git) +Contact: selinux@vger.kernel.org +Description: + + The selinuxfs "disable" node allows SELinux to be disabled at runtime + prior to a policy being loaded into the kernel. If disabled via this + mechanism, SELinux will remain disabled until the system is rebooted. + + The preferred method of disabling SELinux is via the "selinux=0" boot + parameter, but the selinuxfs "disable" node was created to make it + easier for systems with primitive bootloaders that did not allow for + easy modification of the kernel command line. Unfortunately, allowing + for SELinux to be disabled at runtime makes it difficult to secure the + kernel's LSM hooks using the "__ro_after_init" feature. + + Thankfully, the need for the SELinux runtime disable appears to be + gone, the default Kconfig configuration disables this selinuxfs node, + and only one of the major distributions, Fedora, supports disabling + SELinux at runtime. Fedora is in the process of removing the + selinuxfs "disable" node and once that is complete we will start the + slow process of removing this code from the kernel. + + More information on /sys/fs/selinux/disable can be found under the + CONFIG_SECURITY_SELINUX_DISABLE Kconfig option. diff --git a/Documentation/ABI/removed/sysfs-kernel-fadump_release_opalcore b/Documentation/ABI/removed/sysfs-kernel-fadump_release_opalcore new file mode 100644 index 0000000000000000000000000000000000000000..a8d46cd0f4e61dfefa3e2b460914b993c717b001 --- /dev/null +++ b/Documentation/ABI/removed/sysfs-kernel-fadump_release_opalcore @@ -0,0 +1,9 @@ +This ABI is moved to /sys/firmware/opal/mpipl/release_core. + +What: /sys/kernel/fadump_release_opalcore +Date: Sep 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: write only + The sysfs file is available when the system is booted to + collect the dump on OPAL based machine. It used to release + the memory used to collect the opalcore. diff --git a/Documentation/ABI/testing/sysfs-kernel-uids b/Documentation/ABI/removed/sysfs-kernel-uids similarity index 91% rename from Documentation/ABI/testing/sysfs-kernel-uids rename to Documentation/ABI/removed/sysfs-kernel-uids index 4182b706181666657eb0704109786bd225e78625..dc4463f190a7da78685b4fda4239934095fde85c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-uids +++ b/Documentation/ABI/removed/sysfs-kernel-uids @@ -1,5 +1,5 @@ What: /sys/kernel/uids//cpu_shares -Date: December 2007 +Date: December 2007, finally removed in kernel v2.6.34-rc1 Contact: Dhaval Giani Srivatsa Vaddagiri Description: diff --git a/Documentation/ABI/stable/sysfs-class-tpm b/Documentation/ABI/stable/sysfs-class-tpm index c0e23830f56a453ceb15610b67527220882dd072..58e94e7d55be5f8ec949545c677b148f0ec46868 100644 --- a/Documentation/ABI/stable/sysfs-class-tpm +++ b/Documentation/ABI/stable/sysfs-class-tpm @@ -1,7 +1,7 @@ What: /sys/class/tpm/tpmX/device/ Date: April 2005 KernelVersion: 2.6.12 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The device/ directory under a specific TPM instance exposes the properties of that TPM chip @@ -9,7 +9,7 @@ Description: The device/ directory under a specific TPM instance exposes What: /sys/class/tpm/tpmX/device/active Date: April 2006 KernelVersion: 2.6.17 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "active" property prints a '1' if the TPM chip is accepting commands. An inactive TPM chip still contains all the state of an active chip (Storage Root Key, NVRAM, etc), and can be @@ -21,7 +21,7 @@ Description: The "active" property prints a '1' if the TPM chip is accepting What: /sys/class/tpm/tpmX/device/cancel Date: June 2005 KernelVersion: 2.6.13 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "cancel" property allows you to cancel the currently pending TPM command. Writing any value to cancel will call the TPM vendor specific cancel operation. @@ -29,7 +29,7 @@ Description: The "cancel" property allows you to cancel the currently What: /sys/class/tpm/tpmX/device/caps Date: April 2005 KernelVersion: 2.6.12 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "caps" property contains TPM manufacturer and version info. Example output: @@ -46,7 +46,7 @@ Description: The "caps" property contains TPM manufacturer and version info. What: /sys/class/tpm/tpmX/device/durations Date: March 2011 KernelVersion: 3.1 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "durations" property shows the 3 vendor-specific values used to wait for a short, medium and long TPM command. All TPM commands are categorized as short, medium or long in @@ -69,7 +69,7 @@ Description: The "durations" property shows the 3 vendor-specific values What: /sys/class/tpm/tpmX/device/enabled Date: April 2006 KernelVersion: 2.6.17 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "enabled" property prints a '1' if the TPM chip is enabled, meaning that it should be visible to the OS. This property may be visible but produce a '0' after some operation that @@ -78,7 +78,7 @@ Description: The "enabled" property prints a '1' if the TPM chip is enabled, What: /sys/class/tpm/tpmX/device/owned Date: April 2006 KernelVersion: 2.6.17 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "owned" property produces a '1' if the TPM_TakeOwnership ordinal has been executed successfully in the chip. A '0' indicates that ownership hasn't been taken. @@ -86,7 +86,7 @@ Description: The "owned" property produces a '1' if the TPM_TakeOwnership What: /sys/class/tpm/tpmX/device/pcrs Date: April 2005 KernelVersion: 2.6.12 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "pcrs" property will dump the current value of all Platform Configuration Registers in the TPM. Note that since these values may be constantly changing, the output is only valid @@ -109,7 +109,7 @@ Description: The "pcrs" property will dump the current value of all Platform What: /sys/class/tpm/tpmX/device/pubek Date: April 2005 KernelVersion: 2.6.12 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "pubek" property will return the TPM's public endorsement key if possible. If the TPM has had ownership established and is version 1.2, the pubek will not be available without the @@ -161,7 +161,7 @@ Description: The "pubek" property will return the TPM's public endorsement What: /sys/class/tpm/tpmX/device/temp_deactivated Date: April 2006 KernelVersion: 2.6.17 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "temp_deactivated" property returns a '1' if the chip has been temporarily deactivated, usually until the next power cycle. Whether a warm boot (reboot) will clear a TPM chip @@ -170,7 +170,7 @@ Description: The "temp_deactivated" property returns a '1' if the chip has What: /sys/class/tpm/tpmX/device/timeouts Date: March 2011 KernelVersion: 3.1 -Contact: tpmdd-devel@lists.sf.net +Contact: linux-integrity@vger.kernel.org Description: The "timeouts" property shows the 4 vendor-specific values for the TPM's interface spec timeouts. The use of these timeouts is defined by the TPM interface spec that the chip @@ -183,3 +183,14 @@ Description: The "timeouts" property shows the 4 vendor-specific values The four timeout values are shown in usecs, with a trailing "[original]" or "[adjusted]" depending on whether the values were scaled by the driver to be reported in usec from msecs. + +What: /sys/class/tpm/tpmX/tpm_version_major +Date: October 2019 +KernelVersion: 5.5 +Contact: linux-integrity@vger.kernel.org +Description: The "tpm_version_major" property shows the TCG spec major version + implemented by the TPM device. + + Example output: + + 2 diff --git a/Documentation/ABI/stable/sysfs-driver-dma-idxd b/Documentation/ABI/stable/sysfs-driver-dma-idxd new file mode 100644 index 0000000000000000000000000000000000000000..f4be46cc6cb60e16570b7a3694743ddb9037e879 --- /dev/null +++ b/Documentation/ABI/stable/sysfs-driver-dma-idxd @@ -0,0 +1,171 @@ +What: sys/bus/dsa/devices/dsa/cdev_major +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The major number that the character device driver assigned to + this device. + +What: sys/bus/dsa/devices/dsa/errors +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The error information for this device. + +What: sys/bus/dsa/devices/dsa/max_batch_size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The largest number of work descriptors in a batch. + +What: sys/bus/dsa/devices/dsa/max_work_queues_size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum work queue size supported by this device. + +What: sys/bus/dsa/devices/dsa/max_engines +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum number of engines supported by this device. + +What: sys/bus/dsa/devices/dsa/max_groups +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum number of groups can be created under this device. + +What: sys/bus/dsa/devices/dsa/max_tokens +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The total number of bandwidth tokens supported by this device. + The bandwidth tokens represent resources within the DSA + implementation, and these resources are allocated by engines to + support operations. + +What: sys/bus/dsa/devices/dsa/max_transfer_size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The number of bytes to be read from the source address to + perform the operation. The maximum transfer size is dependent on + the workqueue the descriptor was submitted to. + +What: sys/bus/dsa/devices/dsa/max_work_queues +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum work queue number that this device supports. + +What: sys/bus/dsa/devices/dsa/numa_node +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The numa node number for this device. + +What: sys/bus/dsa/devices/dsa/op_cap +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The operation capability bit mask specify the operation types + supported by the this device. + +What: sys/bus/dsa/devices/dsa/state +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The state information of this device. It can be either enabled + or disabled. + +What: sys/bus/dsa/devices/dsa/group. +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The assigned group under this device. + +What: sys/bus/dsa/devices/dsa/engine. +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The assigned engine under this device. + +What: sys/bus/dsa/devices/dsa/wq. +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The assigned work queue under this device. + +What: sys/bus/dsa/devices/dsa/configurable +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: To indicate if this device is configurable or not. + +What: sys/bus/dsa/devices/dsa/token_limit +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The maximum number of bandwidth tokens that may be in use at + one time by operations that access low bandwidth memory in the + device. + +What: sys/bus/dsa/devices/wq./group_id +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The group id that this work queue belongs to. + +What: sys/bus/dsa/devices/wq./size +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The work queue size for this work queue. + +What: sys/bus/dsa/devices/wq./type +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The type of this work queue, it can be "kernel" type for work + queue usages in the kernel space or "user" type for work queue + usages by applications in user space. + +What: sys/bus/dsa/devices/wq./cdev_minor +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The minor number assigned to this work queue by the character + device driver. + +What: sys/bus/dsa/devices/wq./mode +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The work queue mode type for this work queue. + +What: sys/bus/dsa/devices/wq./priority +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The priority value of this work queue, it is a vlue relative to + other work queue in the same group to control quality of service + for dispatching work from multiple workqueues in the same group. + +What: sys/bus/dsa/devices/wq./state +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The current state of the work queue. + +What: sys/bus/dsa/devices/wq./threshold +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The number of entries in this work queue that may be filled + via a limited portal. + +What: sys/bus/dsa/devices/engine./group_id +Date: Oct 25, 2019 +KernelVersion: 5.6.0 +Contact: dmaengine@vger.kernel.org +Description: The group that this engine belongs to. diff --git a/Documentation/ABI/stable/sysfs-driver-mlxreg-io b/Documentation/ABI/stable/sysfs-driver-mlxreg-io index 05601a90a9b6f2251c81c881256221b9bf1f42cb..b0d90cc696a83dd66732fa9ceb687b8f73892726 100644 --- a/Documentation/ABI/stable/sysfs-driver-mlxreg-io +++ b/Documentation/ABI/stable/sysfs-driver-mlxreg-io @@ -1,5 +1,4 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/asic_health - Date: June 2018 KernelVersion: 4.19 Contact: Vadim Pasternak @@ -19,7 +18,6 @@ Description: These files show with which CPLD versions have been burned The files are read only. What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/fan_dir - Date: December 2018 KernelVersion: 5.0 Contact: Vadim Pasternak @@ -30,7 +28,6 @@ Description: This file shows the system fans direction: The files are read only. What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/cpld3_version - Date: November 2018 KernelVersion: 5.0 Contact: Vadim Pasternak @@ -40,7 +37,6 @@ Description: These files show with which CPLD versions have been burned The files are read only. What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/jtag_enable - Date: November 2018 KernelVersion: 5.0 Contact: Vadim Pasternak @@ -108,7 +104,6 @@ What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_pwr_fail What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_comex What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_system What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_voltmon_upgrade_fail - Date: November 2018 KernelVersion: 5.0 Contact: Vadim Pasternak @@ -130,6 +125,12 @@ Description: These files show with which CPLD versions have been burned The files are read only. +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_thermal +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_wd +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_asic +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_reload_bios +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sff_wd +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_swb_wd Date: June 2019 KernelVersion: 5.3 Contact: Vadim Pasternak @@ -143,9 +144,65 @@ Description: These files show the system reset cause, as following: The files are read only. -What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_thermal -What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_comex_wd -What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_from_asic -What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_reload_bios -What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sff_wd -What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_swb_wd +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/config1 +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/config2 +Date: January 2020 +KernelVersion: 5.6 +Contact: Vadim Pasternak +Description: These files show system static topology identification + like system's static I2C topology, number and type of FPGA + devices within the system and so on. + + The files are read only. + +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_ac_pwr_fail +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_platform +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_soc +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/reset_sw_pwr_off +Date: January 2020 +KernelVersion: 5.6 +Contact: Vadim Pasternak +Description: These files show the system reset causes, as following: reset + due to AC power failure, reset invoked from software by + assertion reset signal through CPLD. reset caused by signal + asserted by SOC through ACPI register, reset invoked from + software by assertion power off signal through CPLD. + + The files are read only. + +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/pcie_asic_reset_dis +Date: January 2020 +KernelVersion: 5.6 +Contact: Vadim Pasternak +Description: This file allows to retain ASIC up during PCIe root complex + reset, when attribute is set 1. + + The file is read/write. + +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/vpd_wp +Date: January 2020 +KernelVersion: 5.6 +Contact: Vadim Pasternak +Description: This file allows to overwrite system VPD hardware wrtie + protection when attribute is set 1. + + The file is read/write. + +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/voltreg_update_status +Date: January 2020 +KernelVersion: 5.6 +Contact: Vadim Pasternak +Description: This file exposes the configuration update status of burnable + voltage regulator devices. The status values are as following: + 0 - OK; 1 - CRC failure; 2 = I2C failure; 3 - in progress. + + The file is read only. + +What: /sys/devices/platform/mlxplat/mlxreg-io/hwmon/hwmon*/ufm_version +Date: January 2020 +KernelVersion: 5.6 +Contact: Vadim Pasternak +Description: This file exposes the firmware version of burnable voltage + regulator devices. + + The file is read only. diff --git a/Documentation/ABI/testing/configfs-most b/Documentation/ABI/testing/configfs-most new file mode 100644 index 0000000000000000000000000000000000000000..ed67a4d9f6d63f8bc410c2560df45236bb2d7927 --- /dev/null +++ b/Documentation/ABI/testing/configfs-most @@ -0,0 +1,196 @@ +What: /sys/kernel/config/most_ +Date: March 8, 2019 +KernelVersion: 5.2 +Description: Interface is used to configure and connect device channels + to component drivers. + + Attributes are visible only when configfs is mounted. To mount + configfs in /sys/kernel/config directory use: + # mount -t configfs none /sys/kernel/config/ + + +What: /sys/kernel/config/most_cdev/ +Date: March 8, 2019 +KernelVersion: 5.2 +Description: + The attributes: + + buffer_size configure the buffer size for this channel + + subbuffer_size configure the sub-buffer size for this channel + (needed for synchronous and isochrnous data) + + + num_buffers configure number of buffers used for this + channel + + datatype configure type of data that will travel over + this channel + + direction configure whether this link will be an input + or output + + dbr_size configure DBR data buffer size (this is used + for MediaLB communication only) + + packets_per_xact + configure the number of packets that will be + collected from the network before being + transmitted via USB (this is used for USB + communication only) + + device name of the device the link is to be attached to + + channel name of the channel the link is to be attached to + + comp_params pass parameters needed by some components + + create_link write '1' to this attribute to trigger the + creation of the link. In case of speculative + configuration, the creation is post-poned until + a physical device is being attached to the bus. + + destroy_link write '1' to this attribute to destroy an + active link + +What: /sys/kernel/config/most_video/ +Date: March 8, 2019 +KernelVersion: 5.2 +Description: + The attributes: + + buffer_size configure the buffer size for this channel + + subbuffer_size configure the sub-buffer size for this channel + (needed for synchronous and isochrnous data) + + + num_buffers configure number of buffers used for this + channel + + datatype configure type of data that will travel over + this channel + + direction configure whether this link will be an input + or output + + dbr_size configure DBR data buffer size (this is used + for MediaLB communication only) + + packets_per_xact + configure the number of packets that will be + collected from the network before being + transmitted via USB (this is used for USB + communication only) + + device name of the device the link is to be attached to + + channel name of the channel the link is to be attached to + + comp_params pass parameters needed by some components + + create_link write '1' to this attribute to trigger the + creation of the link. In case of speculative + configuration, the creation is post-poned until + a physical device is being attached to the bus. + + destroy_link write '1' to this attribute to destroy an + active link + +What: /sys/kernel/config/most_net/ +Date: March 8, 2019 +KernelVersion: 5.2 +Description: + The attributes: + + buffer_size configure the buffer size for this channel + + subbuffer_size configure the sub-buffer size for this channel + (needed for synchronous and isochrnous data) + + + num_buffers configure number of buffers used for this + channel + + datatype configure type of data that will travel over + this channel + + direction configure whether this link will be an input + or output + + dbr_size configure DBR data buffer size (this is used + for MediaLB communication only) + + packets_per_xact + configure the number of packets that will be + collected from the network before being + transmitted via USB (this is used for USB + communication only) + + device name of the device the link is to be attached to + + channel name of the channel the link is to be attached to + + comp_params pass parameters needed by some components + + create_link write '1' to this attribute to trigger the + creation of the link. In case of speculative + configuration, the creation is post-poned until + a physical device is being attached to the bus. + + destroy_link write '1' to this attribute to destroy an + active link + +What: /sys/kernel/config/most_sound/ +Date: March 8, 2019 +KernelVersion: 5.2 +Description: + The attributes: + + create_card write '1' to this attribute to trigger the + registration of the sound card with the ALSA + subsystem. + +What: /sys/kernel/config/most_sound// +Date: March 8, 2019 +KernelVersion: 5.2 +Description: + The attributes: + + buffer_size configure the buffer size for this channel + + subbuffer_size configure the sub-buffer size for this channel + (needed for synchronous and isochrnous data) + + + num_buffers configure number of buffers used for this + channel + + datatype configure type of data that will travel over + this channel + + direction configure whether this link will be an input + or output + + dbr_size configure DBR data buffer size (this is used + for MediaLB communication only) + + packets_per_xact + configure the number of packets that will be + collected from the network before being + transmitted via USB (this is used for USB + communication only) + + device name of the device the link is to be attached to + + channel name of the channel the link is to be attached to + + comp_params pass parameters needed by some components + + create_link write '1' to this attribute to trigger the + creation of the link. In case of speculative + configuration, the creation is post-poned until + a physical device is being attached to the bus. + + destroy_link write '1' to this attribute to destroy an + active link diff --git a/Documentation/ABI/testing/configfs-usb-gadget b/Documentation/ABI/testing/configfs-usb-gadget index 95a36589a66b564fc358c52056075c18888eca4a..4594cc2435e8cab6a8f3040343cf86d6ed2a232b 100644 --- a/Documentation/ABI/testing/configfs-usb-gadget +++ b/Documentation/ABI/testing/configfs-usb-gadget @@ -16,6 +16,10 @@ Description: write UDC's name found in /sys/class/udc/* to bind a gadget, empty string "" to unbind. + max_speed - maximum speed the driver supports. Valid + names are super-speed-plus, super-speed, + high-speed, full-speed, and low-speed. + bDeviceClass - USB device class code bDeviceSubClass - USB device subclass code bDeviceProtocol - USB device protocol code diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs index f0ac14b70ecbf4107afe8dd36aa60fed2d2dfc90..a73601c5121e4773cbe673e876562d13fd41b6d3 100644 --- a/Documentation/ABI/testing/debugfs-driver-habanalabs +++ b/Documentation/ABI/testing/debugfs-driver-habanalabs @@ -43,6 +43,20 @@ Description: Allows the root user to read or write directly through the If the IOMMU is disabled, it also allows the root user to read or write from the host a device VA of a host mapped memory +What: /sys/kernel/debug/habanalabs/hl/data64 +Date: Jan 2020 +KernelVersion: 5.6 +Contact: oded.gabbay@gmail.com +Description: Allows the root user to read or write 64 bit data directly + through the device's PCI bar. Writing to this file generates a + write transaction while reading from the file generates a read + transaction. This custom interface is needed (instead of using + the generic Linux user-space PCI mapping) because the DDR bar + is very small compared to the DDR memory and only the driver can + move the bar before and after the transaction. + If the IOMMU is disabled, it also allows the root user to read + or write from the host a device VA of a host mapped memory + What: /sys/kernel/debug/habanalabs/hl/device Date: Jan 2019 KernelVersion: 5.1 diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy index 29aaedf332461cb5256f7fe0d792a179f6cb6b39..cd572912c593898fe82a377be4aa3dda354eaea9 100644 --- a/Documentation/ABI/testing/ima_policy +++ b/Documentation/ABI/testing/ima_policy @@ -25,11 +25,11 @@ Description: lsm: [[subj_user=] [subj_role=] [subj_type=] [obj_user=] [obj_role=] [obj_type=]] option: [[appraise_type=]] [template=] [permit_directio] - [appraise_flag=] + [appraise_flag=] [keyrings=] base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK] [FIRMWARE_CHECK] [KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK] - [KEXEC_CMDLINE] + [KEXEC_CMDLINE] [KEY_CHECK] mask:= [[^]MAY_READ] [[^]MAY_WRITE] [[^]MAY_APPEND] [[^]MAY_EXEC] fsmagic:= hex value @@ -42,6 +42,9 @@ Description: appraise_flag:= [check_blacklist] Currently, blacklist check is only for files signed with appended signature. + keyrings:= list of keyrings + (eg, .builtin_trusted_keys|.ima). Only valid + when action is "measure" and func is KEY_CHECK. template:= name of a defined IMA template type (eg, ima-ng). Only valid when action is "measure". pcr:= decimal value @@ -113,3 +116,12 @@ Description: Example of appraise rule allowing modsig appended signatures: appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig + + Example of measure rule using KEY_CHECK to measure all keys: + + measure func=KEY_CHECK + + Example of measure rule using KEY_CHECK to only measure + keys added to .builtin_trusted_keys or .ima keyring: + + measure func=KEY_CHECK keyrings=.builtin_trusted_keys|.ima diff --git a/Documentation/ABI/testing/rtc-cdev b/Documentation/ABI/testing/rtc-cdev index 97447283f13bf0eac96bc09cd89438ddb73e3524..25910c3c3d7eb11a44b5f527778cc8311828e911 100644 --- a/Documentation/ABI/testing/rtc-cdev +++ b/Documentation/ABI/testing/rtc-cdev @@ -33,6 +33,14 @@ Description: Requires a separate RTC_PIE_ON call to enable the periodic interrupts. + * RTC_VL_READ: Read the voltage inputs status of the RTC when + supported. The value is a bit field of RTC_VL_*, giving the + status of the main and backup voltages. + + * RTC_VL_CLEAR: Clear the voltage status of the RTC. Some RTCs + need user interaction when the backup power provider is + replaced or charged to be able to clear the status. + The ioctl() calls supported by the older /dev/rtc interface are also supported by the newer RTC class framework. However, because the chips and systems are not standardized, some PC/AT diff --git a/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti b/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti new file mode 100644 index 0000000000000000000000000000000000000000..9d11502b43907c051183a76595cdac6f041b57a8 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-coresight-devices-cti @@ -0,0 +1,241 @@ +What: /sys/bus/coresight/devices//enable +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Enable/Disable the CTI hardware. + +What: /sys/bus/coresight/devices//powered +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Indicate if the CTI hardware is powered. + +What: /sys/bus/coresight/devices//ctmid +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Display the associated CTM ID + +What: /sys/bus/coresight/devices//nr_trigger_cons +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Number of devices connected to triggers on this CTI + +What: /sys/bus/coresight/devices//triggers/name +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Name of connected device + +What: /sys/bus/coresight/devices//triggers/in_signals +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Input trigger signals from connected device + +What: /sys/bus/coresight/devices//triggers/in_types +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Functional types for the input trigger signals + from connected device + +What: /sys/bus/coresight/devices//triggers/out_signals +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Output trigger signals to connected device + +What: /sys/bus/coresight/devices//triggers/out_types +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Functional types for the output trigger signals + to connected device + +What: /sys/bus/coresight/devices//regs/inout_sel +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Select the index for inen and outen registers. + +What: /sys/bus/coresight/devices//regs/inen +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Read or write the CTIINEN register selected by inout_sel. + +What: /sys/bus/coresight/devices//regs/outen +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Read or write the CTIOUTEN register selected by inout_sel. + +What: /sys/bus/coresight/devices//regs/gate +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Read or write CTIGATE register. + +What: /sys/bus/coresight/devices//regs/asicctl +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Read or write ASICCTL register. + +What: /sys/bus/coresight/devices//regs/intack +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Write the INTACK register. + +What: /sys/bus/coresight/devices//regs/appset +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Set CTIAPPSET register to activate channel. Read back to + determine current value of register. + +What: /sys/bus/coresight/devices//regs/appclear +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Write APPCLEAR register to deactivate channel. + +What: /sys/bus/coresight/devices//regs/apppulse +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Write APPPULSE to pulse a channel active for one clock + cycle. + +What: /sys/bus/coresight/devices//regs/chinstatus +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Read current status of channel inputs. + +What: /sys/bus/coresight/devices//regs/choutstatus +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) read current status of channel outputs. + +What: /sys/bus/coresight/devices//regs/triginstatus +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) read current status of input trigger signals + +What: /sys/bus/coresight/devices//regs/trigoutstatus +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) read current status of output trigger signals. + +What: /sys/bus/coresight/devices//channels/trigin_attach +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Attach a CTI input trigger to a CTM channel. + +What: /sys/bus/coresight/devices//channels/trigin_detach +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Detach a CTI input trigger from a CTM channel. + +What: /sys/bus/coresight/devices//channels/trigout_attach +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Attach a CTI output trigger to a CTM channel. + +What: /sys/bus/coresight/devices//channels/trigout_detach +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Detach a CTI output trigger from a CTM channel. + +What: /sys/bus/coresight/devices//channels/chan_gate_enable +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Enable CTIGATE for single channel (W) or list enabled + channels through the gate (R). + +What: /sys/bus/coresight/devices//channels/chan_gate_disable +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Disable CTIGATE for single channel. + +What: /sys/bus/coresight/devices//channels/chan_set +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Activate a single channel. + +What: /sys/bus/coresight/devices//channels/chan_clear +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Deactivate a single channel. + +What: /sys/bus/coresight/devices//channels/chan_pulse +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Pulse a single channel - activate for a single clock cycle. + +What: /sys/bus/coresight/devices//channels/trigout_filtered +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) List of output triggers filtered across all connections. + +What: /sys/bus/coresight/devices//channels/trig_filter_enable +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Enable or disable trigger output signal filtering. + +What: /sys/bus/coresight/devices//channels/chan_inuse +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) show channels with at least one attached trigger signal. + +What: /sys/bus/coresight/devices//channels/chan_free +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) show channels with no attached trigger signals. + +What: /sys/bus/coresight/devices//channels/chan_xtrigs_sel +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (RW) Write channel number to select a channel to view, read to + see selected channel number. + +What: /sys/bus/coresight/devices//channels/chan_xtrigs_in +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Read to see input triggers connected to selected view + channel. + +What: /sys/bus/coresight/devices//channels/chan_xtrigs_out +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (R) Read to see output triggers connected to selected view + channel. + +What: /sys/bus/coresight/devices//channels/chan_xtrigs_reset +Date: March 2020 +KernelVersion 5.7 +Contact: Mike Leach or Mathieu Poirier +Description: (W) Clear all channel / trigger programming. diff --git a/Documentation/ABI/testing/sysfs-bus-counter-104-quad-8 b/Documentation/ABI/testing/sysfs-bus-counter-104-quad-8 index 46b1f33b2fcef71a7312a142462dad0a9165033a..eac32180c40d965c6f60a65c2ebdf092c3f9e649 100644 --- a/Documentation/ABI/testing/sysfs-bus-counter-104-quad-8 +++ b/Documentation/ABI/testing/sysfs-bus-counter-104-quad-8 @@ -1,3 +1,28 @@ +What: /sys/bus/counter/devices/counterX/signalY/cable_fault +KernelVersion: 5.7 +Contact: linux-iio@vger.kernel.org +Description: + Read-only attribute that indicates whether a differential + encoder cable fault (not connected or loose wires) is detected + for the respective channel of Signal Y. Valid attribute values + are boolean. Detection must first be enabled via the + corresponding cable_fault_enable attribute. + +What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable +KernelVersion: 5.7 +Contact: linux-iio@vger.kernel.org +Description: + Whether detection of differential encoder cable faults for the + respective channel of Signal Y is enabled. Valid attribute + values are boolean. + +What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler +KernelVersion: 5.7 +Contact: linux-iio@vger.kernel.org +Description: + Filter clock factor for input Signal Y. This prescaler value + affects the inputs of both quadrature pair signals. + What: /sys/bus/counter/devices/counterX/signalY/index_polarity KernelVersion: 5.2 Contact: linux-iio@vger.kernel.org diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index faaa2166d74152202f5ccb0d188cc898e49a177f..d3e53a6d8331bb3894456a15b23a9f505b97cea3 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1726,3 +1726,16 @@ Contact: linux-iio@vger.kernel.org Description: List of valid periods (in seconds) for which the light intensity must be above the threshold level before interrupt is asserted. + +What: /sys/bus/iio/devices/iio:deviceX/in_filter_notch_center_frequency +KernelVersion: 5.5 +Contact: linux-iio@vger.kernel.org +Description: + Center frequency in Hz for a notch filter. Used i.e. for line + noise suppression. + +What: /sys/bus/iio/devices/iio:deviceX/in_temp_thermocouple_type +KernelVersion: 5.5 +Contact: linux-iio@vger.kernel.org +Description: + One of the following thermocouple types: B, E, J, K, N, R, S, T. diff --git a/Documentation/ABI/testing/sysfs-bus-iio-adc-ad7192 b/Documentation/ABI/testing/sysfs-bus-iio-adc-ad7192 index 7627d3be08f511de84097df45a7ba4d066e4fad6..f8315202c8f0df2bd4b7216f5cf8d3c2780fcf3f 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio-adc-ad7192 +++ b/Documentation/ABI/testing/sysfs-bus-iio-adc-ad7192 @@ -2,17 +2,22 @@ What: /sys/bus/iio/devices/iio:deviceX/ac_excitation_en KernelVersion: Contact: linux-iio@vger.kernel.org Description: - Reading gives the state of AC excitation. - Writing '1' enables AC excitation. + This attribute, if available, is used to enable the AC + excitation mode found on some converters. In ac excitation mode, + the polarity of the excitation voltage is reversed on + alternate cycles, to eliminate DC errors. What: /sys/bus/iio/devices/iio:deviceX/bridge_switch_en KernelVersion: Contact: linux-iio@vger.kernel.org Description: - This bridge switch is used to disconnect it when there is a - need to minimize the system current consumption. - Reading gives the state of the bridge switch. - Writing '1' enables the bridge switch. + This attribute, if available, is used to close or open the + bridge power down switch found on some converters. + In bridge applications, such as strain gauges and load cells, + the bridge itself consumes the majority of the current in the + system. To minimize the current consumption of the system, + the bridge can be disconnected (when it is not being used + using the bridge_switch_en attribute. What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration KernelVersion: @@ -21,6 +26,13 @@ Description: Initiates the system calibration procedure. This is done on a single channel at a time. Write '1' to start the calibration. +What: /sys/bus/iio/devices/iio:deviceX/in_voltage2-voltage2_shorted_raw +KernelVersion: +Contact: linux-iio@vger.kernel.org +Description: + Measure voltage from AIN2 pin connected to AIN(+) + and AIN(-) shorted. + What: /sys/bus/iio/devices/iio:deviceX/in_voltagex_sys_calibration_mode_available KernelVersion: Contact: linux-iio@vger.kernel.org diff --git a/Documentation/ABI/testing/sysfs-bus-iio-dma-buffer b/Documentation/ABI/testing/sysfs-bus-iio-dma-buffer new file mode 100644 index 0000000000000000000000000000000000000000..d526e657100122e8263ed96e67eff841c65df6ce --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-iio-dma-buffer @@ -0,0 +1,19 @@ +What: /sys/bus/iio/devices/iio:deviceX/buffer/length_align_bytes +KernelVersion: 5.4 +Contact: linux-iio@vger.kernel.org +Description: + DMA buffers tend to have a alignment requirement for the + buffers. If this alignment requirement is not met samples might + be dropped from the buffer. + + This property reports the alignment requirements in bytes. + This means that the buffer size in bytes needs to be a integer + multiple of the number reported by this file. + + The alignment requirements in number of sample sets will depend + on the enabled channels and the bytes per channel. This means + that the alignment requirement in samples sets might change + depending on which and how many channels are enabled. Whereas + the alignment requirement reported in bytes by this property + will remain static and does not depend on which channels are + enabled. diff --git a/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc b/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc index 456cb62b384c2a138d8076dfc3c4fa0325b170db..7fd2601c2831b3ad1392b4615681f123fbc70572 100644 --- a/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc +++ b/Documentation/ABI/testing/sysfs-bus-intel_th-devices-msc @@ -40,3 +40,11 @@ Description: (RW) Trigger window switch for the MSC's buffer, in triggering a window switch for the buffer. Returns an error in any other operating mode or attempts to write something other than "1". +What: /sys/bus/intel_th/devices/-msc/stop_on_full +Date: March 2020 +KernelVersion: 5.7 +Contact: Alexander Shishkin +Description: (RW) Configure whether trace stops when the last available window + becomes full (1/y/Y) or wraps around and continues until the next + window becomes available again (0/n/N). + diff --git a/Documentation/ABI/testing/sysfs-bus-mdio b/Documentation/ABI/testing/sysfs-bus-mdio new file mode 100644 index 0000000000000000000000000000000000000000..da86efc7781b254bcfdb97efbb419c66208244ba --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-mdio @@ -0,0 +1,63 @@ +What: /sys/bus/mdio_bus/devices/.../statistics/ +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + This folder contains statistics about global and per + MDIO bus address statistics. + +What: /sys/bus/mdio_bus/devices/.../statistics/transfers +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfers for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/errors +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfer errors for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/writes +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of write transactions for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/reads +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of read transactions for this MDIO bus. + +What: /sys/bus/mdio_bus/devices/.../statistics/transfers_ +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfers for this MDIO bus address. + +What: /sys/bus/mdio_bus/devices/.../statistics/errors_ +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of transfer errors for this MDIO bus address. + +What: /sys/bus/mdio_bus/devices/.../statistics/writes_ +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of write transactions for this MDIO bus address. + +What: /sys/bus/mdio_bus/devices/.../statistics/reads_ +Date: January 2020 +KernelVersion: 5.6 +Contact: netdev@vger.kernel.org +Description: + Total number of read transactions for this MDIO bus address. diff --git a/Documentation/ABI/testing/sysfs-bus-most b/Documentation/ABI/testing/sysfs-bus-most new file mode 100644 index 0000000000000000000000000000000000000000..6b1d06e3285e0b00790e853fe38e62021886f9bc --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-most @@ -0,0 +1,295 @@ +What: /sys/bus/most/devices/.../description +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Provides information about the interface type and the physical + location of the device. Hardware attached via USB, for instance, + might return <1-1.1:1.0> +Users: + +What: /sys/bus/most/devices/.../interface +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the type of peripheral interface the device uses. +Users: + +What: /sys/bus/most/devices/.../dci +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + If the network interface controller is attached via USB, a dci + directory is created that allows applications to read and + write the controller's DCI registers. +Users: + +What: /sys/bus/most/devices/.../dci/arb_address +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to set an arbitrary DCI register address an + application wants to read from or write to. +Users: + +What: /sys/bus/most/devices/.../dci/arb_value +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to read and write the DCI register whose address + is stored in arb_address. +Users: + +What: /sys/bus/most/devices/.../dci/mep_eui48_hi +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MAC address. +Users: + +What: /sys/bus/most/devices/.../dci/mep_eui48_lo +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MAC address. +Users: + +What: /sys/bus/most/devices/.../dci/mep_eui48_mi +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MAC address. +Users: + +What: /sys/bus/most/devices/.../dci/mep_filter +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MEP filter address. +Users: + +What: /sys/bus/most/devices/.../dci/mep_hash0 +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MEP hash table. +Users: + +What: /sys/bus/most/devices/.../dci/mep_hash1 +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MEP hash table. +Users: + +What: /sys/bus/most/devices/.../dci/mep_hash2 +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MEP hash table. +Users: + +What: /sys/bus/most/devices/.../dci/mep_hash3 +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to check and configure the MEP hash table. +Users: + +What: /sys/bus/most/devices/.../dci/ni_state +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the current network interface state. +Users: + +What: /sys/bus/most/devices/.../dci/node_address +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the current node address. +Users: + +What: /sys/bus/most/devices/.../dci/node_position +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the current node position. +Users: + +What: /sys/bus/most/devices/.../dci/packet_bandwidth +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the configured packet bandwidth. +Users: + +What: /sys/bus/most/devices/.../dci/sync_ep +Date: June 2016 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Triggers the controller's synchronization process for a certain + endpoint. +Users: + +What: /sys/bus/most/devices/...// +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + For every channel of the device a directory is created, whose + name is dictated by the HDM. This enables an application to + collect information about the channel's capabilities and + configure it. +Users: + +What: /sys/bus/most/devices/...//available_datatypes +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the data types the current channel can transport. +Users: + +What: /sys/bus/most/devices/...//available_directions +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the directions the current channel is capable of. +Users: + +What: /sys/bus/most/devices/...//number_of_packet_buffers +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the number of packet buffers the current channel can + handle. +Users: + +What: /sys/bus/most/devices/...//number_of_stream_buffers +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the number of streaming buffers the current channel can + handle. +Users: + +What: /sys/bus/most/devices/...//size_of_packet_buffer +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the size of a packet buffer the current channel can + handle. +Users: + +What: /sys/bus/most/devices/...//size_of_stream_buffer +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates the size of a streaming buffer the current channel can + handle. +Users: + +What: /sys/bus/most/devices/...//set_number_of_buffers +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is to configure the number of buffers of the current channel. +Users: + +What: /sys/bus/most/devices/...//set_buffer_size +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is to configure the size of a buffer of the current channel. +Users: + +What: /sys/bus/most/devices/...//set_direction +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is to configure the direction of the current channel. + The following strings will be accepted: + 'dir_tx', + 'dir_rx' +Users: + +What: /sys/bus/most/devices/...//set_datatype +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is to configure the data type of the current channel. + The following strings will be accepted: + 'control', + 'async', + 'sync', + 'isoc_avp' +Users: + +What: /sys/bus/most/devices/...//set_subbuffer_size +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is to configure the subbuffer size of the current channel. +Users: + +What: /sys/bus/most/devices/...//set_packets_per_xact +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is to configure the number of packets per transaction of + the current channel. This is only needed network interface + controller is attached via USB. +Users: + +What: /sys/bus/most/devices/...//channel_starving +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + Indicates whether current channel ran out of buffers. +Users: + +What: /sys/bus/most/drivers/most_core/components +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to retrieve a list of registered components. +Users: + +What: /sys/bus/most/drivers/most_core/links +Date: March 2017 +KernelVersion: 4.15 +Contact: Christian Gromm +Description: + This is used to retrieve a list of established links. +Users: diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq index 75897e2fde43e4f6991bead8245f72dfadafa295..9758eb85ade3ea461787f2707c14eb400df5a660 100644 --- a/Documentation/ABI/testing/sysfs-class-devfreq +++ b/Documentation/ABI/testing/sysfs-class-devfreq @@ -55,12 +55,15 @@ What: /sys/class/devfreq/.../trans_stat Date: October 2012 Contact: MyungJoo Ham Description: - This ABI shows the statistics of devfreq behavior on a - specific device. It shows the time spent in each state and - the number of transitions between states. + This ABI shows or clears the statistics of devfreq behavior + on a specific device. It shows the time spent in each state + and the number of transitions between states. In order to activate this ABI, the devfreq target device driver should provide the list of available frequencies - with its profile. + with its profile. If need to reset the statistics of devfreq + behavior on a specific device, enter 0(zero) to 'trans_stat' + as following: + echo 0 > /sys/class/devfreq/.../trans_stat What: /sys/class/devfreq/.../userspace/set_freq Date: September 2011 diff --git a/Documentation/ABI/testing/sysfs-class-power b/Documentation/ABI/testing/sysfs-class-power index 27edc06e2495262ecdfec25275501ce7ffca1277..bf3b48f022dc1d802fe90bd951ed1d6291b73aaa 100644 --- a/Documentation/ABI/testing/sysfs-class-power +++ b/Documentation/ABI/testing/sysfs-class-power @@ -189,7 +189,8 @@ Description: Access: Read Valid values: "Unknown", "Good", "Overheat", "Dead", "Over voltage", "Unspecified failure", "Cold", - "Watchdog timer expire", "Safety timer expire" + "Watchdog timer expire", "Safety timer expire", + "Over current" What: /sys/class/power_supply//precharge_current Date: June 2017 diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec index d7647b258c3ca84255b3a95ceebb1c486dabf9f3..b834671522d6f98c76a66981c2e9ef36693ad82b 100644 --- a/Documentation/ABI/testing/sysfs-class-typec +++ b/Documentation/ABI/testing/sysfs-class-typec @@ -20,13 +20,13 @@ Date: April 2017 Contact: Heikki Krogerus Description: The supported power roles. This attribute can be used to request - power role swap on the port when the port supports USB Power - Delivery. Swapping is supported as synchronous operation, so - write(2) to the attribute will not return until the operation - has finished. The attribute is notified about role changes so - that poll(2) on the attribute wakes up. Change on the role will - also generate uevent KOBJ_CHANGE. The current role is show in - brackets, for example "[source] sink" when in source mode. + power role swap on the port. Swapping is supported as + synchronous operation, so write(2) to the attribute will not + return until the operation has finished. The attribute is + notified about role changes so that poll(2) on the attribute + wakes up. Change on the role will also generate uevent + KOBJ_CHANGE. The current role is show in brackets, for example + "[source] sink" when in source mode. Valid values: source, sink @@ -108,6 +108,15 @@ Contact: Heikki Krogerus Description: Revision number of the supported USB Type-C specification. +What: /sys/class/typec//orientation +Date: February 2020 +Contact: Badhri Jagan Sridharan +Description: + Indicates the active orientation of the Type-C connector. + Valid values: + - "normal": CC1 orientation + - "reverse": CC2 orientation + - "unknown": Orientation cannot be determined. USB Type-C partner devices (eg. /sys/class/typec/port0-partner/) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index fc20cde63d1eac1fd563385542c7659c965c5e18..b39531a3c5bc7a1efe9d60620256c2193f7bf2e6 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -196,6 +196,12 @@ Description: does not reflect it. Likewise, if one enables a deep state but a lighter state still is disabled, then this has no effect. +What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/default_status +Date: December 2019 +KernelVersion: v5.6 +Contact: Linux power management list +Description: + (RO) The default status of this state, "enabled" or "disabled". What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency Date: March 2014 @@ -486,6 +492,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds + /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 diff --git a/Documentation/ABI/testing/sysfs-driver-jz4780-efuse b/Documentation/ABI/testing/sysfs-driver-jz4780-efuse new file mode 100644 index 0000000000000000000000000000000000000000..bb6f5d6ceea0cd370340b3d5b44132f21639316d --- /dev/null +++ b/Documentation/ABI/testing/sysfs-driver-jz4780-efuse @@ -0,0 +1,16 @@ +What: /sys/devices/*//nvmem +Date: December 2017 +Contact: PrasannaKumar Muralidharan +Description: read-only access to the efuse on the Ingenic JZ4780 SoC + The SoC has a one time programmable 8K efuse that is + split into segments. The driver supports read only. + The segments are + 0x000 64 bit Random Number + 0x008 128 bit Ingenic Chip ID + 0x018 128 bit Customer ID + 0x028 3520 bit Reserved + 0x1E0 8 bit Protect Segment + 0x1E1 2296 bit HDMI Key + 0x300 2048 bit Security boot key +Users: any user space application which wants to read the Chip + and Customer ID diff --git a/Documentation/ABI/testing/sysfs-driver-pciback b/Documentation/ABI/testing/sysfs-driver-pciback index 6a733bfa37e6530dc3a0c1e9a0ace7ff43cc1b80..73308c2b81b043d54654e04d10ae8d273a7a731a 100644 --- a/Documentation/ABI/testing/sysfs-driver-pciback +++ b/Documentation/ABI/testing/sysfs-driver-pciback @@ -11,3 +11,16 @@ Description: #echo 00:19.0-E0:2:FF > /sys/bus/pci/drivers/pciback/quirks will allow the guest to read and write to the configuration register 0x0E. + +What: /sys/bus/pci/drivers/pciback/allow_interrupt_control +Date: Jan 2020 +KernelVersion: 5.6 +Contact: xen-devel@lists.xenproject.org +Description: + List of devices which can have interrupt control flag (INTx, + MSI, MSI-X) set by a connected guest. It is meant to be set + only when the guest is a stubdomain hosting device model (qemu) + and the actual device is assigned to a HVM. It is not safe + (similar to permissive attribute) to set for a devices assigned + to a PV guest. The device is automatically removed from this + list when the connected pcifront terminates. diff --git a/Documentation/ABI/testing/sysfs-driver-uacce b/Documentation/ABI/testing/sysfs-driver-uacce new file mode 100644 index 0000000000000000000000000000000000000000..08f2591138afd3a8c7b66625d0bb676913b9afe7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-driver-uacce @@ -0,0 +1,39 @@ +What: /sys/class/uacce//api +Date: Feb 2020 +KernelVersion: 5.7 +Contact: linux-accelerators@lists.ozlabs.org +Description: Api of the device + Can be any string and up to userspace to parse. + Application use the api to match the correct driver + +What: /sys/class/uacce//flags +Date: Feb 2020 +KernelVersion: 5.7 +Contact: linux-accelerators@lists.ozlabs.org +Description: Attributes of the device, see UACCE_DEV_xxx flag defined in uacce.h + +What: /sys/class/uacce//available_instances +Date: Feb 2020 +KernelVersion: 5.7 +Contact: linux-accelerators@lists.ozlabs.org +Description: Available instances left of the device + Return -ENODEV if uacce_ops get_available_instances is not provided + +What: /sys/class/uacce//algorithms +Date: Feb 2020 +KernelVersion: 5.7 +Contact: linux-accelerators@lists.ozlabs.org +Description: Algorithms supported by this accelerator, separated by new line. + Can be any string and up to userspace to parse. + +What: /sys/class/uacce//region_mmio_size +Date: Feb 2020 +KernelVersion: 5.7 +Contact: linux-accelerators@lists.ozlabs.org +Description: Size (bytes) of mmio region queue file + +What: /sys/class/uacce//region_dus_size +Date: Feb 2020 +KernelVersion: 5.7 +Contact: linux-accelerators@lists.ozlabs.org +Description: Size (bytes) of dus region queue file diff --git a/Documentation/ABI/testing/sysfs-driver-xen-blkback b/Documentation/ABI/testing/sysfs-driver-xen-blkback index 4e7babb3ba1fecc673018253d4adbb860e0cb8a4..ecb7942ff1462bb980ce6166ddbd2abc84f3f8e1 100644 --- a/Documentation/ABI/testing/sysfs-driver-xen-blkback +++ b/Documentation/ABI/testing/sysfs-driver-xen-blkback @@ -25,3 +25,13 @@ Description: allocated without being in use. The time is in seconds, 0 means indefinitely long. The default is 60 seconds. + +What: /sys/module/xen_blkback/parameters/buffer_squeeze_duration_ms +Date: December 2019 +KernelVersion: 5.6 +Contact: SeongJae Park +Description: + When memory pressure is reported to blkback this option + controls the duration in milliseconds that blkback will not + cache any page not backed by a grant mapping. + The default is 10ms. diff --git a/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups new file mode 100644 index 0000000000000000000000000000000000000000..3a2dfe542e8c51780b9454f9d97e89e03f14739f --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups @@ -0,0 +1,21 @@ +What: /sys/firmware/opal/sensor_groups +Date: August 2017 +Contact: Linux for PowerPC mailing list +Description: Sensor groups directory for POWER9 powernv servers + + Each folder in this directory contains a sensor group + which are classified based on type of the sensor + like power, temperature, frequency, current, etc. They + can also indicate the group of sensors belonging to + different owners like CSM, Profiler, Job-Scheduler + +What: /sys/firmware/opal/sensor_groups//clear +Date: August 2017 +Contact: Linux for PowerPC mailing list +Description: Sysfs file to clear the min-max of all the sensors + belonging to the group. + + Writing 1 to this file will clear the minimum and + maximum values of all the sensors in the group. + In POWER9, the min-max of a sensor is the historical minimum + and maximum value of the sensor cached by OCC. diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index aedeae1e8ec180f699f32a7f05b6b6b81105372c..bd8a0d19abe67d6f0acaa891f805c2e21181d06b 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -1,37 +1,40 @@ What: /sys/fs/f2fs//gc_max_sleep_time Date: July 2013 Contact: "Namjae Jeon" -Description: - Controls the maximun sleep time for gc_thread. Time - is in milliseconds. +Description: Controls the maximum sleep time for gc_thread. Time + is in milliseconds. What: /sys/fs/f2fs//gc_min_sleep_time Date: July 2013 Contact: "Namjae Jeon" -Description: - Controls the minimum sleep time for gc_thread. Time - is in milliseconds. +Description: Controls the minimum sleep time for gc_thread. Time + is in milliseconds. What: /sys/fs/f2fs//gc_no_gc_sleep_time Date: July 2013 Contact: "Namjae Jeon" -Description: - Controls the default sleep time for gc_thread. Time - is in milliseconds. +Description: Controls the default sleep time for gc_thread. Time + is in milliseconds. What: /sys/fs/f2fs//gc_idle Date: July 2013 Contact: "Namjae Jeon" -Description: - Controls the victim selection policy for garbage collection. +Description: Controls the victim selection policy for garbage collection. + Setting gc_idle = 0(default) will disable this option. Setting + gc_idle = 1 will select the Cost Benefit approach & setting + gc_idle = 2 will select the greedy approach. What: /sys/fs/f2fs//reclaim_segments Date: October 2013 Contact: "Jaegeuk Kim" -Description: - Controls the issue rate of segment discard commands. - -What: /sys/fs/f2fs//max_blkaddr +Description: This parameter controls the number of prefree segments to be + reclaimed. If the number of prefree segments is larger than + the number of segments in the proportion to the percentage + over total volume size, f2fs tries to conduct checkpoint to + reclaim the prefree segments to free segments. + By default, 5% over total # of segments. + +What: /sys/fs/f2fs//main_blkaddr Date: November 2019 Contact: "Ramon Pantin" Description: @@ -40,227 +43,283 @@ Description: What: /sys/fs/f2fs//ipu_policy Date: November 2013 Contact: "Jaegeuk Kim" -Description: - Controls the in-place-update policy. +Description: Controls the in-place-update policy. + updates in f2fs. User can set: + 0x01: F2FS_IPU_FORCE, 0x02: F2FS_IPU_SSR, + 0x04: F2FS_IPU_UTIL, 0x08: F2FS_IPU_SSR_UTIL, + 0x10: F2FS_IPU_FSYNC, 0x20: F2FS_IPU_ASYNC, + 0x40: F2FS_IPU_NOCACHE. + Refer segment.h for details. What: /sys/fs/f2fs//min_ipu_util Date: November 2013 Contact: "Jaegeuk Kim" -Description: - Controls the FS utilization condition for the in-place-update - policies. +Description: Controls the FS utilization condition for the in-place-update + policies. It is used by F2FS_IPU_UTIL and F2FS_IPU_SSR_UTIL policies. What: /sys/fs/f2fs//min_fsync_blocks Date: September 2014 Contact: "Jaegeuk Kim" -Description: - Controls the dirty page count condition for the in-place-update - policies. +Description: Controls the dirty page count condition for the in-place-update + policies. What: /sys/fs/f2fs//min_seq_blocks Date: August 2018 Contact: "Jaegeuk Kim" -Description: - Controls the dirty page count condition for batched sequential - writes in ->writepages. - +Description: Controls the dirty page count condition for batched sequential + writes in writepages. What: /sys/fs/f2fs//min_hot_blocks Date: March 2017 Contact: "Jaegeuk Kim" -Description: - Controls the dirty page count condition for redefining hot data. +Description: Controls the dirty page count condition for redefining hot data. What: /sys/fs/f2fs//min_ssr_sections Date: October 2017 Contact: "Chao Yu" -Description: - Controls the fee section threshold to trigger SSR allocation. +Description: Controls the free section threshold to trigger SSR allocation. + If this is large, SSR mode will be enabled early. What: /sys/fs/f2fs//max_small_discards Date: November 2013 Contact: "Jaegeuk Kim" -Description: - Controls the issue rate of small discard commands. +Description: Controls the issue rate of discard commands that consist of small + blocks less than 2MB. The candidates to be discarded are cached until + checkpoint is triggered, and issued during the checkpoint. + By default, it is disabled with 0. -What: /sys/fs/f2fs//discard_granularity -Date: July 2017 -Contact: "Chao Yu" -Description: - Controls discard granularity of inner discard thread, inner thread +What: /sys/fs/f2fs//discard_granularity +Date: July 2017 +Contact: "Chao Yu" +Description: Controls discard granularity of inner discard thread. Inner thread will not issue discards with size that is smaller than granularity. - The unit size is one block, now only support configuring in range - of [1, 512]. + The unit size is one block(4KB), now only support configuring + in range of [1, 512]. Default value is 4(=16KB). -What: /sys/fs/f2fs//umount_discard_timeout -Date: January 2019 -Contact: "Jaegeuk Kim" -Description: - Set timeout to issue discard commands during umount. - Default: 5 secs +What: /sys/fs/f2fs//umount_discard_timeout +Date: January 2019 +Contact: "Jaegeuk Kim" +Description: Set timeout to issue discard commands during umount. + Default: 5 secs What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" -Description: - Controls the number of trials to find a victim segment. +Description: Controls the number of trials to find a victim segment + when conducting SSR and cleaning operations. The default value + is 4096 which covers 8GB block address range. What: /sys/fs/f2fs//migration_granularity Date: October 2018 Contact: "Chao Yu" -Description: - Controls migration granularity of garbage collection on large - section, it can let GC move partial segment{s} of one section - in one GC cycle, so that dispersing heavy overhead GC to - multiple lightweight one. +Description: Controls migration granularity of garbage collection on large + section, it can let GC move partial segment{s} of one section + in one GC cycle, so that dispersing heavy overhead GC to + multiple lightweight one. What: /sys/fs/f2fs//dir_level Date: March 2014 Contact: "Jaegeuk Kim" -Description: - Controls the directory level for large directory. +Description: Controls the directory level for large directory. If a + directory has a number of files, it can reduce the file lookup + latency by increasing this dir_level value. Otherwise, it + needs to decrease this value to reduce the space overhead. + The default value is 0. What: /sys/fs/f2fs//ram_thresh Date: March 2014 Contact: "Jaegeuk Kim" -Description: - Controls the memory footprint used by f2fs. +Description: Controls the memory footprint used by free nids and cached + nat entries. By default, 1 is set, which indicates + 10 MB / 1 GB RAM. What: /sys/fs/f2fs//batched_trim_sections Date: February 2015 Contact: "Jaegeuk Kim" -Description: - Controls the trimming rate in batch mode. - +Description: Controls the trimming rate in batch mode. + What: /sys/fs/f2fs//cp_interval Date: October 2015 Contact: "Jaegeuk Kim" -Description: - Controls the checkpoint timing. +Description: Controls the checkpoint timing, set to 60 seconds by default. What: /sys/fs/f2fs//idle_interval Date: January 2016 Contact: "Jaegeuk Kim" -Description: - Controls the idle timing for all paths other than - discard and gc path. +Description: Controls the idle timing of system, if there is no FS operation + during given interval. + Set to 5 seconds by default. What: /sys/fs/f2fs//discard_idle_interval Date: September 2018 Contact: "Chao Yu" Contact: "Sahitya Tummala" -Description: - Controls the idle timing for discard path. +Description: Controls the idle timing of discard thread given + this time interval. + Default is 5 secs. What: /sys/fs/f2fs//gc_idle_interval Date: September 2018 Contact: "Chao Yu" Contact: "Sahitya Tummala" -Description: - Controls the idle timing for gc path. +Description: Controls the idle timing for gc path. Set to 5 seconds by default. What: /sys/fs/f2fs//iostat_enable Date: August 2017 Contact: "Chao Yu" -Description: - Controls to enable/disable IO stat. +Description: Controls to enable/disable IO stat. What: /sys/fs/f2fs//ra_nid_pages Date: October 2015 Contact: "Chao Yu" -Description: - Controls the count of nid pages to be readaheaded. +Description: Controls the count of nid pages to be readaheaded. + When building free nids, F2FS reads NAT blocks ahead for + speed up. Default is 0. What: /sys/fs/f2fs//dirty_nats_ratio Date: January 2016 Contact: "Chao Yu" -Description: - Controls dirty nat entries ratio threshold, if current - ratio exceeds configured threshold, checkpoint will - be triggered for flushing dirty nat entries. +Description: Controls dirty nat entries ratio threshold, if current + ratio exceeds configured threshold, checkpoint will + be triggered for flushing dirty nat entries. What: /sys/fs/f2fs//lifetime_write_kbytes Date: January 2016 Contact: "Shuoran Liu" -Description: - Shows total written kbytes issued to disk. +Description: Shows total written kbytes issued to disk. What: /sys/fs/f2fs//features Date: July 2017 Contact: "Jaegeuk Kim" -Description: - Shows all enabled features in current device. +Description: Shows all enabled features in current device. What: /sys/fs/f2fs//inject_rate Date: May 2016 Contact: "Sheng Yong" -Description: - Controls the injection rate. +Description: Controls the injection rate of arbitrary faults. What: /sys/fs/f2fs//inject_type Date: May 2016 Contact: "Sheng Yong" -Description: - Controls the injection type. +Description: Controls the injection type of arbitrary faults. + +What: /sys/fs/f2fs//dirty_segments +Date: October 2017 +Contact: "Jaegeuk Kim" +Description: Shows the number of dirty segments. What: /sys/fs/f2fs//reserved_blocks Date: June 2017 Contact: "Chao Yu" -Description: - Controls target reserved blocks in system, the threshold - is soft, it could exceed current available user space. +Description: Controls target reserved blocks in system, the threshold + is soft, it could exceed current available user space. What: /sys/fs/f2fs//current_reserved_blocks Date: October 2017 Contact: "Yunlong Song" Contact: "Chao Yu" -Description: - Shows current reserved blocks in system, it may be temporarily - smaller than target_reserved_blocks, but will gradually - increase to target_reserved_blocks when more free blocks are - freed by user later. +Description: Shows current reserved blocks in system, it may be temporarily + smaller than target_reserved_blocks, but will gradually + increase to target_reserved_blocks when more free blocks are + freed by user later. What: /sys/fs/f2fs//gc_urgent Date: August 2017 Contact: "Jaegeuk Kim" -Description: - Do background GC agressively +Description: Do background GC agressively when set. When gc_urgent = 1, + background thread starts to do GC by given gc_urgent_sleep_time + interval. It is set to 0 by default. What: /sys/fs/f2fs//gc_urgent_sleep_time Date: August 2017 Contact: "Jaegeuk Kim" -Description: - Controls sleep time of GC urgent mode +Description: Controls sleep time of GC urgent mode. Set to 500ms by default. What: /sys/fs/f2fs//readdir_ra Date: November 2017 Contact: "Sheng Yong" -Description: - Controls readahead inode block in readdir. +Description: Controls readahead inode block in readdir. Enabled by default. + +What: /sys/fs/f2fs//gc_pin_file_thresh +Date: January 2018 +Contact: Jaegeuk Kim +Description: This indicates how many GC can be failed for the pinned + file. If it exceeds this, F2FS doesn't guarantee its pinning + state. 2048 trials is set by default. What: /sys/fs/f2fs//extension_list Date: Feburary 2018 Contact: "Chao Yu" -Description: - Used to control configure extension list: - - Query: cat /sys/fs/f2fs//extension_list - - Add: echo '[h/c]extension' > /sys/fs/f2fs//extension_list - - Del: echo '[h/c]!extension' > /sys/fs/f2fs//extension_list - - [h] means add/del hot file extension - - [c] means add/del cold file extension +Description: Used to control configure extension list: + - Query: cat /sys/fs/f2fs//extension_list + - Add: echo '[h/c]extension' > /sys/fs/f2fs//extension_list + - Del: echo '[h/c]!extension' > /sys/fs/f2fs//extension_list + - [h] means add/del hot file extension + - [c] means add/del cold file extension What: /sys/fs/f2fs//unusable Date April 2019 Contact: "Daniel Rosenberg" -Description: - If checkpoint=disable, it displays the number of blocks that are unusable. - If checkpoint=enable it displays the enumber of blocks that would be unusable - if checkpoint=disable were to be set. +Description: If checkpoint=disable, it displays the number of blocks that + are unusable. + If checkpoint=enable it displays the enumber of blocks that + would be unusable if checkpoint=disable were to be set. What: /sys/fs/f2fs//encoding Date July 2019 Contact: "Daniel Rosenberg" -Description: - Displays name and version of the encoding set for the filesystem. - If no encoding is set, displays (none) +Description: Displays name and version of the encoding set for the filesystem. + If no encoding is set, displays (none) + +What: /sys/fs/f2fs//free_segments +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Number of free segments in disk. + +What: /sys/fs/f2fs//cp_foreground_calls +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Number of checkpoint operations performed on demand. Available when + CONFIG_F2FS_STAT_FS=y. + +What: /sys/fs/f2fs//cp_background_calls +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Number of checkpoint operations performed in the background to + free segments. Available when CONFIG_F2FS_STAT_FS=y. + +What: /sys/fs/f2fs//gc_foreground_calls +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Number of garbage collection operations performed on demand. + Available when CONFIG_F2FS_STAT_FS=y. + +What: /sys/fs/f2fs//gc_background_calls +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Number of garbage collection operations triggered in background. + Available when CONFIG_F2FS_STAT_FS=y. + +What: /sys/fs/f2fs//moved_blocks_foreground +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Number of blocks moved by garbage collection in foreground. + Available when CONFIG_F2FS_STAT_FS=y. + +What: /sys/fs/f2fs//moved_blocks_background +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Number of blocks moved by garbage collection in background. + Available when CONFIG_F2FS_STAT_FS=y. + +What: /sys/fs/f2fs//avg_vblocks +Date: September 2019 +Contact: "Hridya Valsaraju" +Description: Average number of valid blocks. + Available when CONFIG_F2FS_STAT_FS=y. + +What: /sys/fs/f2fs//mounted_time_sec +Date: February 2020 +Contact: "Jaegeuk Kim" +Description: Show the mounted time in secs of this partition. diff --git a/Documentation/ABI/testing/sysfs-kernel-fadump b/Documentation/ABI/testing/sysfs-kernel-fadump new file mode 100644 index 0000000000000000000000000000000000000000..8f7a64a81783a53b4d696f79df0d5ab72cee8475 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-fadump @@ -0,0 +1,40 @@ +What: /sys/kernel/fadump/* +Date: Dec 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: + The /sys/kernel/fadump/* is a collection of FADump sysfs + file provide information about the configuration status + of Firmware Assisted Dump (FADump). + +What: /sys/kernel/fadump/enabled +Date: Dec 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Primarily used to identify whether the FADump is enabled in + the kernel or not. +User: Kdump service + +What: /sys/kernel/fadump/registered +Date: Dec 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + Helps to control the dump collect feature from userspace. + Setting 1 to this file enables the system to collect the + dump and 0 to disable it. +User: Kdump service + +What: /sys/kernel/fadump/release_mem +Date: Dec 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: write only + This is a special sysfs file and only available when + the system is booted to capture the vmcore using FADump. + It is used to release the memory reserved by FADump to + save the crash dump. + +What: /sys/kernel/fadump/mem_reserved +Date: Dec 2019 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Provide information about the amount of memory reserved by + FADump to save the crash dump in bytes. diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi index 9e99f29096127227919d16afb5aa7f14317dfc1d..1efac0ddb417e0b04ffeeb11573eca9488147f06 100644 --- a/Documentation/ABI/testing/sysfs-platform-asus-wmi +++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi @@ -46,3 +46,13 @@ Description: * 0 - normal, * 1 - overboost, * 2 - silent + +What: /sys/devices/platform//throttle_thermal_policy +Date: Dec 2019 +KernelVersion: 5.6 +Contact: "Leonid Maksymchuk" +Description: + Throttle thermal policy mode: + * 0 - default, + * 1 - overboost, + * 2 - silent diff --git a/Documentation/ABI/testing/sysfs-platform-dell-laptop b/Documentation/ABI/testing/sysfs-platform-dell-laptop index 8c6a0b8e113133c127b1505c61e09ff1d347d551..9b917c7453dee769d17f90c593861255c2ef6daf 100644 --- a/Documentation/ABI/testing/sysfs-platform-dell-laptop +++ b/Documentation/ABI/testing/sysfs-platform-dell-laptop @@ -2,7 +2,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_enabled Date: December 2014 KernelVersion: 3.19 Contact: Gabriele Mazzotta , - Pali Rohár + Pali Rohár Description: This file allows to control the automatic keyboard illumination mode on some systems that have an ambient @@ -13,7 +13,7 @@ What: /sys/class/leds/dell::kbd_backlight/als_setting Date: December 2014 KernelVersion: 3.19 Contact: Gabriele Mazzotta , - Pali Rohár + Pali Rohár Description: This file allows to specifiy the on/off threshold value, as reported by the ambient light sensor. @@ -22,7 +22,7 @@ What: /sys/class/leds/dell::kbd_backlight/start_triggers Date: December 2014 KernelVersion: 3.19 Contact: Gabriele Mazzotta , - Pali Rohár + Pali Rohár Description: This file allows to control the input triggers that turn on the keyboard backlight illumination that is @@ -45,7 +45,7 @@ What: /sys/class/leds/dell::kbd_backlight/stop_timeout Date: December 2014 KernelVersion: 3.19 Contact: Gabriele Mazzotta , - Pali Rohár + Pali Rohár Description: This file allows to specify the interval after which the keyboard illumination is disabled because of inactivity. diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 6f87b9dd384b8b7faf72be043625df5dba5d7afc..5e6ead29124ccefb781645ba1b00af77f242367e 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -407,3 +407,16 @@ Contact: Kalesh Singh Description: The /sys/power/suspend_stats/last_failed_step file contains the last failed step in the suspend/resume path. + +What: /sys/power/sync_on_suspend +Date: October 2019 +Contact: Jonas Meurer +Description: + This file controls whether or not the kernel will sync() + filesystems during system suspend (after freezing user space + and before suspending devices). + + Writing a "1" to this file enables the sync() and writing a "0" + disables it. Reads from the file return the current value. + The default is "1" if the build-time "SUSPEND_SKIP_SYNC" config + flag is unset, or "0" otherwise. diff --git a/Documentation/ABI/testing/sysfs-tty b/Documentation/ABI/testing/sysfs-tty index 9eb3c2b6b0406eeec340f31a47f837ea4f542258..e157130a6792f3a4f9c2049f3e9fcb3f92e89fcf 100644 --- a/Documentation/ABI/testing/sysfs-tty +++ b/Documentation/ABI/testing/sysfs-tty @@ -154,3 +154,10 @@ Description: device specification. For example, when user sets 7bytes on 16550A, which has 1/4/8/14 bytes trigger, the RX trigger is automatically changed to 4 bytes. + +What: /sys/class/tty/ttyS0/console +Date: February 2020 +Contact: Andy Shevchenko +Description: + Allows user to detach or attach back the given device as + kernel console. It shows and accepts a boolean variable. diff --git a/Documentation/ABI/testing/usb-charger-uevent b/Documentation/ABI/testing/usb-charger-uevent new file mode 100644 index 0000000000000000000000000000000000000000..419a92dd0d86a9a8ef1a521aef8b50284ad373ff --- /dev/null +++ b/Documentation/ABI/testing/usb-charger-uevent @@ -0,0 +1,46 @@ +What: Raise a uevent when a USB charger is inserted or removed +Date: 2020-01-14 +KernelVersion: 5.6 +Contact: linux-usb@vger.kernel.org +Description: There are two USB charger states: + USB_CHARGER_ABSENT + USB_CHARGER_PRESENT + There are five USB charger types: + USB_CHARGER_UNKNOWN_TYPE: Charger type is unknown + USB_CHARGER_SDP_TYPE: Standard Downstream Port + USB_CHARGER_CDP_TYPE: Charging Downstream Port + USB_CHARGER_DCP_TYPE: Dedicated Charging Port + USB_CHARGER_ACA_TYPE: Accessory Charging Adapter + https://www.usb.org/document-library/battery-charging-v12-spec-and-adopters-agreement + + Here are two examples taken using udevadm monitor -p when + USB charger is online: + UDEV change /devices/soc0/usbphynop1 (platform) + ACTION=change + DEVPATH=/devices/soc0/usbphynop1 + DRIVER=usb_phy_generic + MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv + OF_COMPATIBLE_0=usb-nop-xceiv + OF_COMPATIBLE_N=1 + OF_FULLNAME=/usbphynop1 + OF_NAME=usbphynop1 + SEQNUM=2493 + SUBSYSTEM=platform + USB_CHARGER_STATE=USB_CHARGER_PRESENT + USB_CHARGER_TYPE=USB_CHARGER_SDP_TYPE + USEC_INITIALIZED=227422826 + + USB charger is offline: + KERNEL change /devices/soc0/usbphynop1 (platform) + ACTION=change + DEVPATH=/devices/soc0/usbphynop1 + DRIVER=usb_phy_generic + MODALIAS=of:Nusbphynop1T(null)Cusb-nop-xceiv + OF_COMPATIBLE_0=usb-nop-xceiv + OF_COMPATIBLE_N=1 + OF_FULLNAME=/usbphynop1 + OF_NAME=usbphynop1 + SEQNUM=2494 + SUBSYSTEM=platform + USB_CHARGER_STATE=USB_CHARGER_ABSENT + USB_CHARGER_TYPE=USB_CHARGER_UNKNOWN_TYPE diff --git a/Documentation/Makefile b/Documentation/Makefile index d77bb607aea4cf5b0db8d42ed896ef5a8e6c99c6..cc786d11a02829e1ba1e078f302021f21f992b30 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -2,7 +2,8 @@ # Makefile for Sphinx documentation # -subdir-y := devicetree/bindings/ +# for cleaning +subdir- := devicetree/bindings # Check for broken documentation file references ifeq ($(CONFIG_WARN_MISSING_DOCUMENTS),y) @@ -13,7 +14,7 @@ endif SPHINXBUILD = sphinx-build SPHINXOPTS = SPHINXDIRS = . -_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst)) +_SPHINXDIRS = $(sort $(patsubst $(srctree)/Documentation/%/index.rst,%,$(wildcard $(srctree)/Documentation/*/index.rst))) SPHINX_CONF = conf.py PAPER = BUILDDIR = $(obj)/output diff --git a/Documentation/PCI/boot-interrupts.rst b/Documentation/PCI/boot-interrupts.rst new file mode 100644 index 0000000000000000000000000000000000000000..d078ef3eb19209acaab81725423192c19ef5426c --- /dev/null +++ b/Documentation/PCI/boot-interrupts.rst @@ -0,0 +1,155 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============== +Boot Interrupts +=============== + +:Author: - Sean V Kelley + +Overview +======== + +On PCI Express, interrupts are represented with either MSI or inbound +interrupt messages (Assert_INTx/Deassert_INTx). The integrated IO-APIC in a +given Core IO converts the legacy interrupt messages from PCI Express to +MSI interrupts. If the IO-APIC is disabled (via the mask bits in the +IO-APIC table entries), the messages are routed to the legacy PCH. This +in-band interrupt mechanism was traditionally necessary for systems that +did not support the IO-APIC and for boot. Intel in the past has used the +term "boot interrupts" to describe this mechanism. Further, the PCI Express +protocol describes this in-band legacy wire-interrupt INTx mechanism for +I/O devices to signal PCI-style level interrupts. The subsequent paragraphs +describe problems with the Core IO handling of INTx message routing to the +PCH and mitigation within BIOS and the OS. + + +Issue +===== + +When in-band legacy INTx messages are forwarded to the PCH, they in turn +trigger a new interrupt for which the OS likely lacks a handler. When an +interrupt goes unhandled over time, they are tracked by the Linux kernel as +Spurious Interrupts. The IRQ will be disabled by the Linux kernel after it +reaches a specific count with the error "nobody cared". This disabled IRQ +now prevents valid usage by an existing interrupt which may happen to share +the IRQ line. + + irq 19: nobody cared (try booting with the "irqpoll" option) + CPU: 0 PID: 2988 Comm: irq/34-nipalk Tainted: 4.14.87-rt49-02410-g4a640ec-dirty #1 + Hardware name: National Instruments NI PXIe-8880/NI PXIe-8880, BIOS 2.1.5f1 01/09/2020 + Call Trace: + + ? dump_stack+0x46/0x5e + ? __report_bad_irq+0x2e/0xb0 + ? note_interrupt+0x242/0x290 + ? nNIKAL100_memoryRead16+0x8/0x10 [nikal] + ? handle_irq_event_percpu+0x55/0x70 + ? handle_irq_event+0x4f/0x80 + ? handle_fasteoi_irq+0x81/0x180 + ? handle_irq+0x1c/0x30 + ? do_IRQ+0x41/0xd0 + ? common_interrupt+0x84/0x84 + + + handlers: + irq_default_primary_handler threaded usb_hcd_irq + Disabling IRQ #19 + + +Conditions +========== + +The use of threaded interrupts is the most likely condition to trigger +this problem today. Threaded interrupts may not be reenabled after the IRQ +handler wakes. These "one shot" conditions mean that the threaded interrupt +needs to keep the interrupt line masked until the threaded handler has run. +Especially when dealing with high data rate interrupts, the thread needs to +run to completion; otherwise some handlers will end up in stack overflows +since the interrupt of the issuing device is still active. + +Affected Chipsets +================= + +The legacy interrupt forwarding mechanism exists today in a number of +devices including but not limited to chipsets from AMD/ATI, Broadcom, and +Intel. Changes made through the mitigations below have been applied to +drivers/pci/quirks.c + +Starting with ICX there are no longer any IO-APICs in the Core IO's +devices. IO-APIC is only in the PCH. Devices connected to the Core IO's +PCIe Root Ports will use native MSI/MSI-X mechanisms. + +Mitigations +=========== + +The mitigations take the form of PCI quirks. The preference has been to +first identify and make use of a means to disable the routing to the PCH. +In such a case a quirk to disable boot interrupt generation can be +added.[1] + + Intel® 6300ESB I/O Controller Hub + Alternate Base Address Register: + BIE: Boot Interrupt Enable + 0 = Boot interrupt is enabled. + 1 = Boot interrupt is disabled. + + Intel® Sandy Bridge through Sky Lake based Xeon servers: + Coherent Interface Protocol Interrupt Control + dis_intx_route2pch/dis_intx_route2ich/dis_intx_route2dmi2: + When this bit is set. Local INTx messages received from the + Intel® Quick Data DMA/PCI Express ports are not routed to legacy + PCH - they are either converted into MSI via the integrated IO-APIC + (if the IO-APIC mask bit is clear in the appropriate entries) + or cause no further action (when mask bit is set) + +In the absence of a way to directly disable the routing, another approach +has been to make use of PCI Interrupt pin to INTx routing tables for +purposes of redirecting the interrupt handler to the rerouted interrupt +line by default. Therefore, on chipsets where this INTx routing cannot be +disabled, the Linux kernel will reroute the valid interrupt to its legacy +interrupt. This redirection of the handler will prevent the occurrence of +the spurious interrupt detection which would ordinarily disable the IRQ +line due to excessive unhandled counts.[2] + +The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS exists to enable (or +disable) the redirection of the interrupt handler to the PCH interrupt +line. The option can be overridden by either pci=ioapicreroute or +pci=noioapicreroute.[3] + + +More Documentation +================== + +There is an overview of the legacy interrupt handling in several datasheets +(6300ESB and 6700PXH below). While largely the same, it provides insight +into the evolution of its handling with chipsets. + +Example of disabling of the boot interrupt +------------------------------------------ + +Intel® 6300ESB I/O Controller Hub (Document # 300641-004US) + 5.7.3 Boot Interrupt + https://www.intel.com/content/dam/doc/datasheet/6300esb-io-controller-hub-datasheet.pdf + +Intel® Xeon® Processor E5-1600/2400/2600/4600 v3 Product Families +Datasheet - Volume 2: Registers (Document # 330784-003) + 6.6.41 cipintrc Coherent Interface Protocol Interrupt Control + https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/xeon-e5-v3-datasheet-vol-2.pdf + +Example of handler rerouting +---------------------------- + +Intel® 6700PXH 64-bit PCI Hub (Document # 302628) + 2.15.2 PCI Express Legacy INTx Support and Boot Interrupt + https://www.intel.com/content/dam/doc/datasheet/6700pxh-64-bit-pci-hub-datasheet.pdf + + +If you have any legacy PCI interrupt questions that aren't answered, email me. + +Cheers, + Sean V Kelley + sean.v.kelley@linux.intel.com + +[1] https://lore.kernel.org/r/12131949181903-git-send-email-sassmann@suse.de/ +[2] https://lore.kernel.org/r/12131949182094-git-send-email-sassmann@suse.de/ +[3] https://lore.kernel.org/r/487C8EA7.6020205@suse.de/ diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index 6768305e4c266035682f58bc25ce20ee2ddc6a74..8f66feaafd4f3d44e3ad307f46eb469e1af9a142 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -16,3 +16,4 @@ Linux PCI Bus Subsystem pci-error-recovery pcieaer-howto endpoint/index + boot-interrupts diff --git a/Documentation/PCI/msi-howto.rst b/Documentation/PCI/msi-howto.rst index 994cbb660adef4150c1ef5e8b378ee2c3d280c16..aa2046af69f7d1b2a5e1c4de414be1acd2d41ec0 100644 --- a/Documentation/PCI/msi-howto.rst +++ b/Documentation/PCI/msi-howto.rst @@ -283,5 +283,5 @@ or disabled (0). If 0 is found in any of the msi_bus files belonging to bridges between the PCI root and the device, MSIs are disabled. It is also worth checking the device driver to see whether it supports MSIs. -For example, it may contain calls to pci_irq_alloc_vectors() with the +For example, it may contain calls to pci_alloc_irq_vectors() with the PCI_IRQ_MSI or PCI_IRQ_MSIX flags. diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst index 6864f9a70f5f0b119b91ef6b9b4b7806cd4116d5..8c016d8c986232d61971ee655180c9df35739db1 100644 --- a/Documentation/PCI/pci.rst +++ b/Documentation/PCI/pci.rst @@ -239,7 +239,7 @@ from the PCI device config space. Use the values in the pci_dev structure as the PCI "bus address" might have been remapped to a "host physical" address by the arch/chip-set specific kernel support. -See Documentation/io-mapping.txt for how to access device registers +See Documentation/driver-api/io-mapping.rst for how to access device registers or device memory. The device driver needs to call pci_request_region() to verify diff --git a/Documentation/PCI/pcieaer-howto.rst b/Documentation/PCI/pcieaer-howto.rst index 18bdefaafd1a89a42ed70a1740ca44b4bfd86a4f..0b36b9ebfa4b4296ec1a4fc55ac9b443a7bdea3c 100644 --- a/Documentation/PCI/pcieaer-howto.rst +++ b/Documentation/PCI/pcieaer-howto.rst @@ -156,12 +156,6 @@ default reset_link function, but different upstream ports might have different specifications to reset pci express link, so all upstream ports should provide their own reset_link functions. -In struct pcie_port_service_driver, a new pointer, reset_link, is -added. -:: - - pci_ers_result_t (*reset_link) (struct pci_dev *dev); - Section 3.2.2.2 provides more detailed info on when to call reset_link. @@ -212,15 +206,10 @@ error_detected(dev, pci_channel_io_frozen) to all drivers within a hierarchy in question. Then, performing link reset at upstream is necessary. As different kinds of devices might use different approaches to reset link, AER port service driver is required to provide the -function to reset link. Firstly, kernel looks for if the upstream -component has an aer driver. If it has, kernel uses the reset_link -callback of the aer driver. If the upstream component has no aer driver -and the port is downstream port, we will perform a hot reset as the -default by setting the Secondary Bus Reset bit of the Bridge Control -register associated with the downstream port. As for upstream ports, -they should provide their own aer service drivers with reset_link -function. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER and -reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes +function to reset link via callback parameter of pcie_do_recovery() +function. If reset_link is not NULL, recovery function will use it +to reset the link. If error_detected returns PCI_ERS_RESULT_CAN_RECOVER +and reset_link returns PCI_ERS_RESULT_RECOVERED, the error handling goes to mmio_enabled. helper functions @@ -243,9 +232,9 @@ messages to root port when an error is detected. :: - int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);` + int pci_aer_clear_nonfatal_status(struct pci_dev *dev);` -pci_cleanup_aer_uncorrect_error_status cleanups the uncorrectable +pci_aer_clear_nonfatal_status clears non-fatal errors in the uncorrectable error status register. Frequent Asked Questions diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index 1a8b129cfc04d56b9030df92f5eae73bd345e213..83ae3b79a6439eab57cda595ed77748f39b69ca9 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -4,7 +4,7 @@ A Tour Through TREE_RCU's Grace-Period Memory Ordering August 8, 2017 -This article was contributed by Paul E. McKenney +This article was contributed by Paul E. McKenney Introduction ============ @@ -48,7 +48,7 @@ Tree RCU Grace Period Memory Ordering Building Blocks The workhorse for RCU's grace-period memory ordering is the critical section for the ``rcu_node`` structure's -``->lock``. These critical sections use helper functions for lock +``->lock``. These critical sections use helper functions for lock acquisition, including ``raw_spin_lock_rcu_node()``, ``raw_spin_lock_irq_rcu_node()``, and ``raw_spin_lock_irqsave_rcu_node()``. Their lock-release counterparts are ``raw_spin_unlock_rcu_node()``, @@ -102,9 +102,9 @@ lock-acquisition and lock-release functions:: 23 r3 = READ_ONCE(x); 24 } 25 - 26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0); + 26 WARN_ON(r1 == 0 && r2 == 0 && r3 == 0); -The ``WARN_ON()`` is evaluated at “the end of time”, +The ``WARN_ON()`` is evaluated at "the end of time", after all changes have propagated throughout the system. Without the ``smp_mb__after_unlock_lock()`` provided by the acquisition functions, this ``WARN_ON()`` could trigger, for example diff --git a/Documentation/RCU/NMI-RCU.rst b/Documentation/RCU/NMI-RCU.rst new file mode 100644 index 0000000000000000000000000000000000000000..180958388ff94afb557b20922b720956900bab4b --- /dev/null +++ b/Documentation/RCU/NMI-RCU.rst @@ -0,0 +1,124 @@ +.. _NMI_rcu_doc: + +Using RCU to Protect Dynamic NMI Handlers +========================================= + + +Although RCU is usually used to protect read-mostly data structures, +it is possible to use RCU to provide dynamic non-maskable interrupt +handlers, as well as dynamic irq handlers. This document describes +how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer +work in "arch/x86/oprofile/nmi_timer_int.c" and in +"arch/x86/kernel/traps.c". + +The relevant pieces of code are listed below, each followed by a +brief explanation:: + + static int dummy_nmi_callback(struct pt_regs *regs, int cpu) + { + return 0; + } + +The dummy_nmi_callback() function is a "dummy" NMI handler that does +nothing, but returns zero, thus saying that it did nothing, allowing +the NMI handler to take the default machine-specific action:: + + static nmi_callback_t nmi_callback = dummy_nmi_callback; + +This nmi_callback variable is a global function pointer to the current +NMI handler:: + + void do_nmi(struct pt_regs * regs, long error_code) + { + int cpu; + + nmi_enter(); + + cpu = smp_processor_id(); + ++nmi_count(cpu); + + if (!rcu_dereference_sched(nmi_callback)(regs, cpu)) + default_do_nmi(regs); + + nmi_exit(); + } + +The do_nmi() function processes each NMI. It first disables preemption +in the same way that a hardware irq would, then increments the per-CPU +count of NMIs. It then invokes the NMI handler stored in the nmi_callback +function pointer. If this handler returns zero, do_nmi() invokes the +default_do_nmi() function to handle a machine-specific NMI. Finally, +preemption is restored. + +In theory, rcu_dereference_sched() is not needed, since this code runs +only on i386, which in theory does not need rcu_dereference_sched() +anyway. However, in practice it is a good documentation aid, particularly +for anyone attempting to do something similar on Alpha or on systems +with aggressive optimizing compilers. + +Quick Quiz: + Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only? + +:ref:`Answer to Quick Quiz ` + +Back to the discussion of NMI and RCU:: + + void set_nmi_callback(nmi_callback_t callback) + { + rcu_assign_pointer(nmi_callback, callback); + } + +The set_nmi_callback() function registers an NMI handler. Note that any +data that is to be used by the callback must be initialized up -before- +the call to set_nmi_callback(). On architectures that do not order +writes, the rcu_assign_pointer() ensures that the NMI handler sees the +initialized values:: + + void unset_nmi_callback(void) + { + rcu_assign_pointer(nmi_callback, dummy_nmi_callback); + } + +This function unregisters an NMI handler, restoring the original +dummy_nmi_handler(). However, there may well be an NMI handler +currently executing on some other CPU. We therefore cannot free +up any data structures used by the old NMI handler until execution +of it completes on all other CPUs. + +One way to accomplish this is via synchronize_rcu(), perhaps as +follows:: + + unset_nmi_callback(); + synchronize_rcu(); + kfree(my_nmi_data); + +This works because (as of v4.20) synchronize_rcu() blocks until all +CPUs complete any preemption-disabled segments of code that they were +executing. +Since NMI handlers disable preemption, synchronize_rcu() is guaranteed +not to return until all ongoing NMI handlers exit. It is therefore safe +to free up the handler's data as soon as synchronize_rcu() returns. + +Important note: for this to work, the architecture in question must +invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively. + +.. _answer_quick_quiz_NMI: + +Answer to Quick Quiz: + Why might the rcu_dereference_sched() be necessary on Alpha, given that the code referenced by the pointer is read-only? + + The caller to set_nmi_callback() might well have + initialized some data that is to be used by the new NMI + handler. In this case, the rcu_dereference_sched() would + be needed, because otherwise a CPU that received an NMI + just after the new handler was set might see the pointer + to the new NMI handler, but the old pre-initialized + version of the handler's data. + + This same sad story can happen on other CPUs when using + a compiler with aggressive pointer-value speculation + optimizations. + + More important, the rcu_dereference_sched() makes it + clear to someone reading the code that the pointer is + being protected by RCU-sched. diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt deleted file mode 100644 index 881353fd5bff1cbc1f3dead8009f5b86cb65832a..0000000000000000000000000000000000000000 --- a/Documentation/RCU/NMI-RCU.txt +++ /dev/null @@ -1,121 +0,0 @@ -Using RCU to Protect Dynamic NMI Handlers - - -Although RCU is usually used to protect read-mostly data structures, -it is possible to use RCU to provide dynamic non-maskable interrupt -handlers, as well as dynamic irq handlers. This document describes -how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer -work in "arch/x86/oprofile/nmi_timer_int.c" and in -"arch/x86/kernel/traps.c". - -The relevant pieces of code are listed below, each followed by a -brief explanation. - - static int dummy_nmi_callback(struct pt_regs *regs, int cpu) - { - return 0; - } - -The dummy_nmi_callback() function is a "dummy" NMI handler that does -nothing, but returns zero, thus saying that it did nothing, allowing -the NMI handler to take the default machine-specific action. - - static nmi_callback_t nmi_callback = dummy_nmi_callback; - -This nmi_callback variable is a global function pointer to the current -NMI handler. - - void do_nmi(struct pt_regs * regs, long error_code) - { - int cpu; - - nmi_enter(); - - cpu = smp_processor_id(); - ++nmi_count(cpu); - - if (!rcu_dereference_sched(nmi_callback)(regs, cpu)) - default_do_nmi(regs); - - nmi_exit(); - } - -The do_nmi() function processes each NMI. It first disables preemption -in the same way that a hardware irq would, then increments the per-CPU -count of NMIs. It then invokes the NMI handler stored in the nmi_callback -function pointer. If this handler returns zero, do_nmi() invokes the -default_do_nmi() function to handle a machine-specific NMI. Finally, -preemption is restored. - -In theory, rcu_dereference_sched() is not needed, since this code runs -only on i386, which in theory does not need rcu_dereference_sched() -anyway. However, in practice it is a good documentation aid, particularly -for anyone attempting to do something similar on Alpha or on systems -with aggressive optimizing compilers. - -Quick Quiz: Why might the rcu_dereference_sched() be necessary on Alpha, - given that the code referenced by the pointer is read-only? - - -Back to the discussion of NMI and RCU... - - void set_nmi_callback(nmi_callback_t callback) - { - rcu_assign_pointer(nmi_callback, callback); - } - -The set_nmi_callback() function registers an NMI handler. Note that any -data that is to be used by the callback must be initialized up -before- -the call to set_nmi_callback(). On architectures that do not order -writes, the rcu_assign_pointer() ensures that the NMI handler sees the -initialized values. - - void unset_nmi_callback(void) - { - rcu_assign_pointer(nmi_callback, dummy_nmi_callback); - } - -This function unregisters an NMI handler, restoring the original -dummy_nmi_handler(). However, there may well be an NMI handler -currently executing on some other CPU. We therefore cannot free -up any data structures used by the old NMI handler until execution -of it completes on all other CPUs. - -One way to accomplish this is via synchronize_rcu(), perhaps as -follows: - - unset_nmi_callback(); - synchronize_rcu(); - kfree(my_nmi_data); - -This works because (as of v4.20) synchronize_rcu() blocks until all -CPUs complete any preemption-disabled segments of code that they were -executing. -Since NMI handlers disable preemption, synchronize_rcu() is guaranteed -not to return until all ongoing NMI handlers exit. It is therefore safe -to free up the handler's data as soon as synchronize_rcu() returns. - -Important note: for this to work, the architecture in question must -invoke nmi_enter() and nmi_exit() on NMI entry and exit, respectively. - - -Answer to Quick Quiz - - Why might the rcu_dereference_sched() be necessary on Alpha, given - that the code referenced by the pointer is read-only? - - Answer: The caller to set_nmi_callback() might well have - initialized some data that is to be used by the new NMI - handler. In this case, the rcu_dereference_sched() would - be needed, because otherwise a CPU that received an NMI - just after the new handler was set might see the pointer - to the new NMI handler, but the old pre-initialized - version of the handler's data. - - This same sad story can happen on other CPUs when using - a compiler with aggressive pointer-value speculation - optimizations. - - More important, the rcu_dereference_sched() makes it - clear to someone reading the code that the pointer is - being protected by RCU-sched. diff --git a/Documentation/RCU/arrayRCU.rst b/Documentation/RCU/arrayRCU.rst new file mode 100644 index 0000000000000000000000000000000000000000..4051ea3871eff0075843d9eee1725b178e4090fd --- /dev/null +++ b/Documentation/RCU/arrayRCU.rst @@ -0,0 +1,165 @@ +.. _array_rcu_doc: + +Using RCU to Protect Read-Mostly Arrays +======================================= + +Although RCU is more commonly used to protect linked lists, it can +also be used to protect arrays. Three situations are as follows: + +1. :ref:`Hash Tables ` + +2. :ref:`Static Arrays ` + +3. :ref:`Resizable Arrays ` + +Each of these three situations involves an RCU-protected pointer to an +array that is separately indexed. It might be tempting to consider use +of RCU to instead protect the index into an array, however, this use +case is **not** supported. The problem with RCU-protected indexes into +arrays is that compilers can play way too many optimization games with +integers, which means that the rules governing handling of these indexes +are far more trouble than they are worth. If RCU-protected indexes into +arrays prove to be particularly valuable (which they have not thus far), +explicit cooperation from the compiler will be required to permit them +to be safely used. + +That aside, each of the three RCU-protected pointer situations are +described in the following sections. + +.. _hash_tables: + +Situation 1: Hash Tables +------------------------ + +Hash tables are often implemented as an array, where each array entry +has a linked-list hash chain. Each hash chain can be protected by RCU +as described in the listRCU.txt document. This approach also applies +to other array-of-list situations, such as radix trees. + +.. _static_arrays: + +Situation 2: Static Arrays +-------------------------- + +Static arrays, where the data (rather than a pointer to the data) is +located in each array element, and where the array is never resized, +have not been used with RCU. Rik van Riel recommends using seqlock in +this situation, which would also have minimal read-side overhead as long +as updates are rare. + +Quick Quiz: + Why is it so important that updates be rare when using seqlock? + +:ref:`Answer to Quick Quiz ` + +.. _resizable_arrays: + +Situation 3: Resizable Arrays +------------------------------ + +Use of RCU for resizable arrays is demonstrated by the grow_ary() +function formerly used by the System V IPC code. The array is used +to map from semaphore, message-queue, and shared-memory IDs to the data +structure that represents the corresponding IPC construct. The grow_ary() +function does not acquire any locks; instead its caller must hold the +ids->sem semaphore. + +The grow_ary() function, shown below, does some limit checks, allocates a +new ipc_id_ary, copies the old to the new portion of the new, initializes +the remainder of the new, updates the ids->entries pointer to point to +the new array, and invokes ipc_rcu_putref() to free up the old array. +Note that rcu_assign_pointer() is used to update the ids->entries pointer, +which includes any memory barriers required on whatever architecture +you are running on:: + + static int grow_ary(struct ipc_ids* ids, int newsize) + { + struct ipc_id_ary* new; + struct ipc_id_ary* old; + int i; + int size = ids->entries->size; + + if(newsize > IPCMNI) + newsize = IPCMNI; + if(newsize <= size) + return newsize; + + new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize + + sizeof(struct ipc_id_ary)); + if(new == NULL) + return size; + new->size = newsize; + memcpy(new->p, ids->entries->p, + sizeof(struct kern_ipc_perm *)*size + + sizeof(struct ipc_id_ary)); + for(i=size;ip[i] = NULL; + } + old = ids->entries; + + /* + * Use rcu_assign_pointer() to make sure the memcpyed + * contents of the new array are visible before the new + * array becomes visible. + */ + rcu_assign_pointer(ids->entries, new); + + ipc_rcu_putref(old); + return newsize; + } + +The ipc_rcu_putref() function decrements the array's reference count +and then, if the reference count has dropped to zero, uses call_rcu() +to free the array after a grace period has elapsed. + +The array is traversed by the ipc_lock() function. This function +indexes into the array under the protection of rcu_read_lock(), +using rcu_dereference() to pick up the pointer to the array so +that it may later safely be dereferenced -- memory barriers are +required on the Alpha CPU. Since the size of the array is stored +with the array itself, there can be no array-size mismatches, so +a simple check suffices. The pointer to the structure corresponding +to the desired IPC object is placed in "out", with NULL indicating +a non-existent entry. After acquiring "out->lock", the "out->deleted" +flag indicates whether the IPC object is in the process of being +deleted, and, if not, the pointer is returned:: + + struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) + { + struct kern_ipc_perm* out; + int lid = id % SEQ_MULTIPLIER; + struct ipc_id_ary* entries; + + rcu_read_lock(); + entries = rcu_dereference(ids->entries); + if(lid >= entries->size) { + rcu_read_unlock(); + return NULL; + } + out = entries->p[lid]; + if(out == NULL) { + rcu_read_unlock(); + return NULL; + } + spin_lock(&out->lock); + + /* ipc_rmid() may have already freed the ID while ipc_lock + * was spinning: here verify that the structure is still valid + */ + if (out->deleted) { + spin_unlock(&out->lock); + rcu_read_unlock(); + return NULL; + } + return out; + } + +.. _answer_quick_quiz_seqlock: + +Answer to Quick Quiz: + Why is it so important that updates be rare when using seqlock? + + The reason that it is important that updates be rare when + using seqlock is that frequent updates can livelock readers. + One way to avoid this problem is to assign a seqlock for + each array entry rather than to the entire array. diff --git a/Documentation/RCU/arrayRCU.txt b/Documentation/RCU/arrayRCU.txt deleted file mode 100644 index f05a9afb2c39b61efb841886e6f272adfa9da797..0000000000000000000000000000000000000000 --- a/Documentation/RCU/arrayRCU.txt +++ /dev/null @@ -1,153 +0,0 @@ -Using RCU to Protect Read-Mostly Arrays - - -Although RCU is more commonly used to protect linked lists, it can -also be used to protect arrays. Three situations are as follows: - -1. Hash Tables - -2. Static Arrays - -3. Resizeable Arrays - -Each of these three situations involves an RCU-protected pointer to an -array that is separately indexed. It might be tempting to consider use -of RCU to instead protect the index into an array, however, this use -case is -not- supported. The problem with RCU-protected indexes into -arrays is that compilers can play way too many optimization games with -integers, which means that the rules governing handling of these indexes -are far more trouble than they are worth. If RCU-protected indexes into -arrays prove to be particularly valuable (which they have not thus far), -explicit cooperation from the compiler will be required to permit them -to be safely used. - -That aside, each of the three RCU-protected pointer situations are -described in the following sections. - - -Situation 1: Hash Tables - -Hash tables are often implemented as an array, where each array entry -has a linked-list hash chain. Each hash chain can be protected by RCU -as described in the listRCU.txt document. This approach also applies -to other array-of-list situations, such as radix trees. - - -Situation 2: Static Arrays - -Static arrays, where the data (rather than a pointer to the data) is -located in each array element, and where the array is never resized, -have not been used with RCU. Rik van Riel recommends using seqlock in -this situation, which would also have minimal read-side overhead as long -as updates are rare. - -Quick Quiz: Why is it so important that updates be rare when - using seqlock? - - -Situation 3: Resizeable Arrays - -Use of RCU for resizeable arrays is demonstrated by the grow_ary() -function formerly used by the System V IPC code. The array is used -to map from semaphore, message-queue, and shared-memory IDs to the data -structure that represents the corresponding IPC construct. The grow_ary() -function does not acquire any locks; instead its caller must hold the -ids->sem semaphore. - -The grow_ary() function, shown below, does some limit checks, allocates a -new ipc_id_ary, copies the old to the new portion of the new, initializes -the remainder of the new, updates the ids->entries pointer to point to -the new array, and invokes ipc_rcu_putref() to free up the old array. -Note that rcu_assign_pointer() is used to update the ids->entries pointer, -which includes any memory barriers required on whatever architecture -you are running on. - - static int grow_ary(struct ipc_ids* ids, int newsize) - { - struct ipc_id_ary* new; - struct ipc_id_ary* old; - int i; - int size = ids->entries->size; - - if(newsize > IPCMNI) - newsize = IPCMNI; - if(newsize <= size) - return newsize; - - new = ipc_rcu_alloc(sizeof(struct kern_ipc_perm *)*newsize + - sizeof(struct ipc_id_ary)); - if(new == NULL) - return size; - new->size = newsize; - memcpy(new->p, ids->entries->p, - sizeof(struct kern_ipc_perm *)*size + - sizeof(struct ipc_id_ary)); - for(i=size;ip[i] = NULL; - } - old = ids->entries; - - /* - * Use rcu_assign_pointer() to make sure the memcpyed - * contents of the new array are visible before the new - * array becomes visible. - */ - rcu_assign_pointer(ids->entries, new); - - ipc_rcu_putref(old); - return newsize; - } - -The ipc_rcu_putref() function decrements the array's reference count -and then, if the reference count has dropped to zero, uses call_rcu() -to free the array after a grace period has elapsed. - -The array is traversed by the ipc_lock() function. This function -indexes into the array under the protection of rcu_read_lock(), -using rcu_dereference() to pick up the pointer to the array so -that it may later safely be dereferenced -- memory barriers are -required on the Alpha CPU. Since the size of the array is stored -with the array itself, there can be no array-size mismatches, so -a simple check suffices. The pointer to the structure corresponding -to the desired IPC object is placed in "out", with NULL indicating -a non-existent entry. After acquiring "out->lock", the "out->deleted" -flag indicates whether the IPC object is in the process of being -deleted, and, if not, the pointer is returned. - - struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id) - { - struct kern_ipc_perm* out; - int lid = id % SEQ_MULTIPLIER; - struct ipc_id_ary* entries; - - rcu_read_lock(); - entries = rcu_dereference(ids->entries); - if(lid >= entries->size) { - rcu_read_unlock(); - return NULL; - } - out = entries->p[lid]; - if(out == NULL) { - rcu_read_unlock(); - return NULL; - } - spin_lock(&out->lock); - - /* ipc_rmid() may have already freed the ID while ipc_lock - * was spinning: here verify that the structure is still valid - */ - if (out->deleted) { - spin_unlock(&out->lock); - rcu_read_unlock(); - return NULL; - } - return out; - } - - -Answer to Quick Quiz: - - The reason that it is important that updates be rare when - using seqlock is that frequent updates can livelock readers. - One way to avoid this problem is to assign a seqlock for - each array entry rather than to the entire array. diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst index 5c99185710fa4b083e144c988090aedd4fb96a7a..81a0a1e5f767e870c3daaeffbd67ca2bbc958071 100644 --- a/Documentation/RCU/index.rst +++ b/Documentation/RCU/index.rst @@ -7,8 +7,13 @@ RCU concepts .. toctree:: :maxdepth: 3 + arrayRCU + rcubarrier + rcu_dereference + whatisRCU rcu listRCU + NMI-RCU UP Design/Memory-Ordering/Tree-RCU-Memory-Ordering diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index 7956ff33042b06bfc8ceb4628a066030f2b4553f..2a643e293fb41e80f638955a98349f26ebd1b450 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -4,12 +4,61 @@ Using RCU to Protect Read-Mostly Linked Lists ============================================= One of the best applications of RCU is to protect read-mostly linked lists -("struct list_head" in list.h). One big advantage of this approach +(``struct list_head`` in list.h). One big advantage of this approach is that all of the required memory barriers are included for you in the list macros. This document describes several applications of RCU, with the best fits first. -Example 1: Read-Side Action Taken Outside of Lock, No In-Place Updates + +Example 1: Read-mostly list: Deferred Destruction +------------------------------------------------- + +A widely used usecase for RCU lists in the kernel is lockless iteration over +all processes in the system. ``task_struct::tasks`` represents the list node that +links all the processes. The list can be traversed in parallel to any list +additions or removals. + +The traversal of the list is done using ``for_each_process()`` which is defined +by the 2 macros:: + + #define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) + + #define for_each_process(p) \ + for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + +The code traversing the list of all processes typically looks like:: + + rcu_read_lock(); + for_each_process(p) { + /* Do something with p */ + } + rcu_read_unlock(); + +The simplified code for removing a process from a task list is:: + + void release_task(struct task_struct *p) + { + write_lock(&tasklist_lock); + list_del_rcu(&p->tasks); + write_unlock(&tasklist_lock); + call_rcu(&p->rcu, delayed_put_task_struct); + } + +When a process exits, ``release_task()`` calls ``list_del_rcu(&p->tasks)`` under +``tasklist_lock`` writer lock protection, to remove the task from the list of +all tasks. The ``tasklist_lock`` prevents concurrent list additions/removals +from corrupting the list. Readers using ``for_each_process()`` are not protected +with the ``tasklist_lock``. To prevent readers from noticing changes in the list +pointers, the ``task_struct`` object is freed only after one or more grace +periods elapse (with the help of call_rcu()). This deferring of destruction +ensures that any readers traversing the list will see valid ``p->tasks.next`` +pointers and deletion/freeing can happen in parallel with traversal of the list. +This pattern is also called an **existence lock**, since RCU pins the object in +memory until all existing readers finish. + + +Example 2: Read-Side Action Taken Outside of Lock: No In-Place Updates ---------------------------------------------------------------------- The best applications are cases where, if reader-writer locking were @@ -26,7 +75,7 @@ added or deleted, rather than being modified in place. A straightforward example of this use of RCU may be found in the system-call auditing support. For example, a reader-writer locked -implementation of audit_filter_task() might be as follows:: +implementation of ``audit_filter_task()`` might be as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -34,7 +83,7 @@ implementation of audit_filter_task() might be as follows:: enum audit_state state; read_lock(&auditsc_lock); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry(e, &audit_tsklist, list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { read_unlock(&auditsc_lock); @@ -58,7 +107,7 @@ This means that RCU can be easily applied to the read side, as follows:: enum audit_state state; rcu_read_lock(); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry_rcu(e, &audit_tsklist, list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { rcu_read_unlock(); @@ -69,18 +118,18 @@ This means that RCU can be easily applied to the read side, as follows:: return AUDIT_BUILD_CONTEXT; } -The read_lock() and read_unlock() calls have become rcu_read_lock() +The ``read_lock()`` and ``read_unlock()`` calls have become rcu_read_lock() and rcu_read_unlock(), respectively, and the list_for_each_entry() has -become list_for_each_entry_rcu(). The _rcu() list-traversal primitives +become list_for_each_entry_rcu(). The **_rcu()** list-traversal primitives insert the read-side memory barriers that are required on DEC Alpha CPUs. -The changes to the update side are also straightforward. A reader-writer -lock might be used as follows for deletion and insertion:: +The changes to the update side are also straightforward. A reader-writer lock +might be used as follows for deletion and insertion:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; write_lock(&auditsc_lock); list_for_each_entry(e, list, list) { @@ -113,9 +162,9 @@ Following are the RCU equivalents for these two functions:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; - /* Do not use the _rcu iterator here, since this is the only + /* No need to use the _rcu iterator here, since this is the only * deletion routine. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -139,45 +188,45 @@ Following are the RCU equivalents for these two functions:: return 0; } -Normally, the write_lock() and write_unlock() would be replaced by -a spin_lock() and a spin_unlock(), but in this case, all callers hold -audit_netlink_sem, so no additional locking is required. The auditsc_lock -can therefore be eliminated, since use of RCU eliminates the need for -writers to exclude readers. Normally, the write_lock() calls would -be converted into spin_lock() calls. +Normally, the ``write_lock()`` and ``write_unlock()`` would be replaced by a +spin_lock() and a spin_unlock(). But in this case, all callers hold +``audit_filter_mutex``, so no additional locking is required. The +``auditsc_lock`` can therefore be eliminated, since use of RCU eliminates the +need for writers to exclude readers. The list_del(), list_add(), and list_add_tail() primitives have been replaced by list_del_rcu(), list_add_rcu(), and list_add_tail_rcu(). -The _rcu() list-manipulation primitives add memory barriers that are -needed on weakly ordered CPUs (most of them!). The list_del_rcu() -primitive omits the pointer poisoning debug-assist code that would -otherwise cause concurrent readers to fail spectacularly. +The **_rcu()** list-manipulation primitives add memory barriers that are needed on +weakly ordered CPUs (most of them!). The list_del_rcu() primitive omits the +pointer poisoning debug-assist code that would otherwise cause concurrent +readers to fail spectacularly. -So, when readers can tolerate stale data and when entries are either added -or deleted, without in-place modification, it is very easy to use RCU! +So, when readers can tolerate stale data and when entries are either added or +deleted, without in-place modification, it is very easy to use RCU! -Example 2: Handling In-Place Updates + +Example 3: Handling In-Place Updates ------------------------------------ -The system-call auditing code does not update auditing rules in place. -However, if it did, reader-writer-locked code to do so might look as -follows (presumably, the field_count is only permitted to decrease, -otherwise, the added fields would need to be filled in):: +The system-call auditing code does not update auditing rules in place. However, +if it did, the reader-writer-locked code to do so might look as follows +(assuming only ``field_count`` is updated, otherwise, the added fields would +need to be filled in):: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, __u32 newaction, __u32 newfield_count) { - struct audit_entry *e; - struct audit_newentry *ne; + struct audit_entry *e; + struct audit_entry *ne; write_lock(&auditsc_lock); - /* Note: audit_netlink_sem held by caller. */ + /* Note: audit_filter_mutex held by caller. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { e->rule.action = newaction; - e->rule.file_count = newfield_count; + e->rule.field_count = newfield_count; write_unlock(&auditsc_lock); return 0; } @@ -188,16 +237,16 @@ otherwise, the added fields would need to be filled in):: The RCU version creates a copy, updates the copy, then replaces the old entry with the newly updated entry. This sequence of actions, allowing -concurrent reads while doing a copy to perform an update, is what gives -RCU ("read-copy update") its name. The RCU code is as follows:: +concurrent reads while making a copy to perform an update, is what gives +RCU (*read-copy update*) its name. The RCU code is as follows:: static inline int audit_upd_rule(struct audit_rule *rule, struct list_head *list, __u32 newaction, __u32 newfield_count) { - struct audit_entry *e; - struct audit_newentry *ne; + struct audit_entry *e; + struct audit_entry *ne; list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -206,7 +255,7 @@ RCU ("read-copy update") its name. The RCU code is as follows:: return -ENOMEM; audit_copy_rule(&ne->rule, &e->rule); ne->rule.action = newaction; - ne->rule.file_count = newfield_count; + ne->rule.field_count = newfield_count; list_replace_rcu(&e->list, &ne->list); call_rcu(&e->rcu, audit_free_rule); return 0; @@ -215,34 +264,45 @@ RCU ("read-copy update") its name. The RCU code is as follows:: return -EFAULT; /* No matching rule */ } -Again, this assumes that the caller holds audit_netlink_sem. Normally, -the reader-writer lock would become a spinlock in this sort of code. +Again, this assumes that the caller holds ``audit_filter_mutex``. Normally, the +writer lock would become a spinlock in this sort of code. -Example 3: Eliminating Stale Data +Another use of this pattern can be found in the openswitch driver's *connection +tracking table* code in ``ct_limit_set()``. The table holds connection tracking +entries and has a limit on the maximum entries. There is one such table +per-zone and hence one *limit* per zone. The zones are mapped to their limits +through a hashtable using an RCU-managed hlist for the hash chains. When a new +limit is set, a new limit object is allocated and ``ct_limit_set()`` is called +to replace the old limit object with the new one using list_replace_rcu(). +The old limit object is then freed after a grace period using kfree_rcu(). + + +Example 4: Eliminating Stale Data --------------------------------- -The auditing examples above tolerate stale data, as do most algorithms +The auditing example above tolerates stale data, as do most algorithms that are tracking external state. Because there is a delay from the time the external state changes before Linux becomes aware of the change, -additional RCU-induced staleness is normally not a problem. +additional RCU-induced staleness is generally not a problem. However, there are many examples where stale data cannot be tolerated. -One example in the Linux kernel is the System V IPC (see the ipc_lock() -function in ipc/util.c). This code checks a "deleted" flag under a -per-entry spinlock, and, if the "deleted" flag is set, pretends that the +One example in the Linux kernel is the System V IPC (see the shm_lock() +function in ipc/shm.c). This code checks a *deleted* flag under a +per-entry spinlock, and, if the *deleted* flag is set, pretends that the entry does not exist. For this to be helpful, the search function must -return holding the per-entry spinlock, as ipc_lock() does in fact do. +return holding the per-entry spinlock, as shm_lock() does in fact do. + +.. _quick_quiz: Quick Quiz: - Why does the search function need to return holding the per-entry lock for - this deleted-flag technique to be helpful? + For the deleted-flag technique to be helpful, why is it necessary + to hold the per-entry lock while returning from the search function? -:ref:`Answer to Quick Quiz ` +:ref:`Answer to Quick Quiz ` -If the system-call audit module were to ever need to reject stale data, -one way to accomplish this would be to add a "deleted" flag and a "lock" -spinlock to the audit_entry structure, and modify audit_filter_task() -as follows:: +If the system-call audit module were to ever need to reject stale data, one way +to accomplish this would be to add a ``deleted`` flag and a ``lock`` spinlock to the +audit_entry structure, and modify ``audit_filter_task()`` as follows:: static enum audit_state audit_filter_task(struct task_struct *tsk) { @@ -267,20 +327,20 @@ as follows:: } Note that this example assumes that entries are only added and deleted. -Additional mechanism is required to deal correctly with the -update-in-place performed by audit_upd_rule(). For one thing, -audit_upd_rule() would need additional memory barriers to ensure -that the list_add_rcu() was really executed before the list_del_rcu(). +Additional mechanism is required to deal correctly with the update-in-place +performed by ``audit_upd_rule()``. For one thing, ``audit_upd_rule()`` would +need additional memory barriers to ensure that the list_add_rcu() was really +executed before the list_del_rcu(). -The audit_del_rule() function would need to set the "deleted" -flag under the spinlock as follows:: +The ``audit_del_rule()`` function would need to set the ``deleted`` flag under the +spinlock as follows:: static inline int audit_del_rule(struct audit_rule *rule, struct list_head *list) { - struct audit_entry *e; + struct audit_entry *e; - /* Do not need to use the _rcu iterator here, since this + /* No need to use the _rcu iterator here, since this * is the only deletion routine. */ list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { @@ -295,6 +355,91 @@ flag under the spinlock as follows:: return -EFAULT; /* No matching rule */ } +This too assumes that the caller holds ``audit_filter_mutex``. + + +Example 5: Skipping Stale Objects +--------------------------------- + +For some usecases, reader performance can be improved by skipping stale objects +during read-side list traversal if the object in concern is pending destruction +after one or more grace periods. One such example can be found in the timerfd +subsystem. When a ``CLOCK_REALTIME`` clock is reprogrammed - for example due to +setting of the system time, then all programmed timerfds that depend on this +clock get triggered and processes waiting on them to expire are woken up in +advance of their scheduled expiry. To facilitate this, all such timers are added +to an RCU-managed ``cancel_list`` when they are setup in +``timerfd_setup_cancel()``:: + + static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) + { + spin_lock(&ctx->cancel_lock); + if ((ctx->clockid == CLOCK_REALTIME && + (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } + spin_unlock(&ctx->cancel_lock); + } + +When a timerfd is freed (fd is closed), then the ``might_cancel`` flag of the +timerfd object is cleared, the object removed from the ``cancel_list`` and +destroyed:: + + int timerfd_release(struct inode *inode, struct file *file) + { + struct timerfd_ctx *ctx = file->private_data; + + spin_lock(&ctx->cancel_lock); + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } + spin_unlock(&ctx->cancel_lock); + + hrtimer_cancel(&ctx->t.tmr); + kfree_rcu(ctx, rcu); + return 0; + } + +If the ``CLOCK_REALTIME`` clock is set, for example by a time server, the +hrtimer framework calls ``timerfd_clock_was_set()`` which walks the +``cancel_list`` and wakes up processes waiting on the timerfd. While iterating +the ``cancel_list``, the ``might_cancel`` flag is consulted to skip stale +objects:: + + void timerfd_clock_was_set(void) + { + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs != ktime_mono_to_real(0)) { + ctx->moffs = KTIME_MAX; + ctx->ticks++; + wake_up_locked_poll(&ctx->wqh, EPOLLIN); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); + } + +The key point here is, because RCU-traversal of the ``cancel_list`` happens +while objects are being added and removed to the list, sometimes the traversal +can step on an object that has been removed from the list. In this example, it +is seen that it is better to skip such objects using a flag. + + Summary ------- @@ -303,19 +448,21 @@ the most amenable to use of RCU. The simplest case is where entries are either added or deleted from the data structure (or atomically modified in place), but non-atomic in-place modifications can be handled by making a copy, updating the copy, then replacing the original with the copy. -If stale data cannot be tolerated, then a "deleted" flag may be used +If stale data cannot be tolerated, then a *deleted* flag may be used in conjunction with a per-entry spinlock in order to allow the search function to reject newly deleted data. -.. _answer_quick_quiz_list: +.. _quick_quiz_answer: Answer to Quick Quiz: - Why does the search function need to return holding the per-entry - lock for this deleted-flag technique to be helpful? + For the deleted-flag technique to be helpful, why is it necessary + to hold the per-entry lock while returning from the search function? If the search function drops the per-entry lock before returning, then the caller will be processing stale data in any case. If it is really OK to be processing stale data, then you don't need a - "deleted" flag. If processing stale data really is a problem, + *deleted* flag. If processing stale data really is a problem, then you need to hold the per-entry lock across all of the code that uses the value that was returned. + +:ref:`Back to Quick Quiz ` diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.txt index 9c015976b174123f52b2fd1a046bcb616df423b3..b8096316fd116228b5323357b5eb879bb3dc1a45 100644 --- a/Documentation/RCU/lockdep-splat.txt +++ b/Documentation/RCU/lockdep-splat.txt @@ -99,7 +99,7 @@ With this change, the rcu_dereference() is always within an RCU read-side critical section, which again would have suppressed the above lockdep-RCU splat. -But in this particular case, we don't actually deference the pointer +But in this particular case, we don't actually dereference the pointer returned from rcu_dereference(). Instead, that pointer is just compared to the cic pointer, which means that the rcu_dereference() can be replaced by rcu_access_pointer() as follows: diff --git a/Documentation/RCU/rcu.rst b/Documentation/RCU/rcu.rst index 8dfb437dacc319ce98eb0a46f7edc1aa7eca57c9..0e03c6ef3147a3ea06cef16fd3fd931ce7c4081a 100644 --- a/Documentation/RCU/rcu.rst +++ b/Documentation/RCU/rcu.rst @@ -11,8 +11,8 @@ must be long enough that any readers accessing the item being deleted have since dropped their references. For example, an RCU-protected deletion from a linked list would first remove the item from the list, wait for a grace period to elapse, then free the element. See the -Documentation/RCU/listRCU.rst file for more information on using RCU with -linked lists. +:ref:`Documentation/RCU/listRCU.rst ` for more information on +using RCU with linked lists. Frequently Asked Questions -------------------------- @@ -50,7 +50,7 @@ Frequently Asked Questions - If I am running on a uniprocessor kernel, which can only do one thing at a time, why should I wait for a grace period? - See the Documentation/RCU/UP.rst file for more information. + See :ref:`Documentation/RCU/UP.rst ` for more information. - How can I see where RCU is currently used in the Linux kernel? @@ -68,18 +68,18 @@ Frequently Asked Questions - Why the name "RCU"? - "RCU" stands for "read-copy update". The file Documentation/RCU/listRCU.rst - has more information on where this name came from, search for - "read-copy update" to find it. + "RCU" stands for "read-copy update". + :ref:`Documentation/RCU/listRCU.rst ` has more information on where + this name came from, search for "read-copy update" to find it. - I hear that RCU is patented? What is with that? Yes, it is. There are several known patents related to RCU, - search for the string "Patent" in RTFP.txt to find them. + search for the string "Patent" in Documentation/RCU/RTFP.txt to find them. Of these, one was allowed to lapse by the assignee, and the others have been contributed to the Linux kernel under GPL. There are now also LGPL implementations of user-level RCU - available (http://liburcu.org/). + available (https://liburcu.org/). - I hear that RCU needs work in order to support realtime kernels? @@ -88,5 +88,5 @@ Frequently Asked Questions - Where can I find more information on RCU? - See the RTFP.txt file in this directory. + See the Documentation/RCU/RTFP.txt file. Or point your browser at (http://www.rdrop.com/users/paulmck/RCU/). diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst new file mode 100644 index 0000000000000000000000000000000000000000..c9667eb0d44469308c401f8270bbf7ea261dab6d --- /dev/null +++ b/Documentation/RCU/rcu_dereference.rst @@ -0,0 +1,463 @@ +.. _rcu_dereference_doc: + +PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference() +=============================================================== + +Most of the time, you can use values from rcu_dereference() or one of +the similar primitives without worries. Dereferencing (prefix "*"), +field selection ("->"), assignment ("="), address-of ("&"), addition and +subtraction of constants, and casts all work quite naturally and safely. + +It is nevertheless possible to get into trouble with other operations. +Follow these rules to keep your RCU code working properly: + +- You must use one of the rcu_dereference() family of primitives + to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU + will complain. Worse yet, your code can see random memory-corruption + bugs due to games that compilers and DEC Alpha can play. + Without one of the rcu_dereference() primitives, compilers + can reload the value, and won't your code have fun with two + different values for a single pointer! Without rcu_dereference(), + DEC Alpha can load a pointer, dereference that pointer, and + return data preceding initialization that preceded the store of + the pointer. + + In addition, the volatile cast in rcu_dereference() prevents the + compiler from deducing the resulting pointer value. Please see + the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH" + for an example where the compiler can in fact deduce the exact + value of the pointer, and thus cause misordering. + +- You are only permitted to use rcu_dereference on pointer values. + The compiler simply knows too much about integral values to + trust it to carry dependencies through integer operations. + There are a very few exceptions, namely that you can temporarily + cast the pointer to uintptr_t in order to: + + - Set bits and clear bits down in the must-be-zero low-order + bits of that pointer. This clearly means that the pointer + must have alignment constraints, for example, this does + -not- work in general for char* pointers. + + - XOR bits to translate pointers, as is done in some + classic buddy-allocator algorithms. + + It is important to cast the value back to pointer before + doing much of anything else with it. + +- Avoid cancellation when using the "+" and "-" infix arithmetic + operators. For example, for a given variable "x", avoid + "(x-(uintptr_t)x)" for char* pointers. The compiler is within its + rights to substitute zero for this sort of expression, so that + subsequent accesses no longer depend on the rcu_dereference(), + again possibly resulting in bugs due to misordering. + + Of course, if "p" is a pointer from rcu_dereference(), and "a" + and "b" are integers that happen to be equal, the expression + "p+a-b" is safe because its value still necessarily depends on + the rcu_dereference(), thus maintaining proper ordering. + +- If you are using RCU to protect JITed functions, so that the + "()" function-invocation operator is applied to a value obtained + (directly or indirectly) from rcu_dereference(), you may need to + interact directly with the hardware to flush instruction caches. + This issue arises on some systems when a newly JITed function is + using the same memory that was used by an earlier JITed function. + +- Do not use the results from relational operators ("==", "!=", + ">", ">=", "<", or "<=") when dereferencing. For example, + the following (quite strange) code is buggy:: + + int *p; + int *q; + + ... + + p = rcu_dereference(gp) + q = &global_q; + q += p > &oom_p; + r1 = *q; /* BUGGY!!! */ + + As before, the reason this is buggy is that relational operators + are often compiled using branches. And as before, although + weak-memory machines such as ARM or PowerPC do order stores + after such branches, but can speculate loads, which can again + result in misordering bugs. + +- Be very careful about comparing pointers obtained from + rcu_dereference() against non-NULL values. As Linus Torvalds + explained, if the two pointers are equal, the compiler could + substitute the pointer you are comparing against for the pointer + obtained from rcu_dereference(). For example:: + + p = rcu_dereference(gp); + if (p == &default_struct) + do_default(p->a); + + Because the compiler now knows that the value of "p" is exactly + the address of the variable "default_struct", it is free to + transform this code into the following:: + + p = rcu_dereference(gp); + if (p == &default_struct) + do_default(default_struct.a); + + On ARM and Power hardware, the load from "default_struct.a" + can now be speculated, such that it might happen before the + rcu_dereference(). This could result in bugs due to misordering. + + However, comparisons are OK in the following cases: + + - The comparison was against the NULL pointer. If the + compiler knows that the pointer is NULL, you had better + not be dereferencing it anyway. If the comparison is + non-equal, the compiler is none the wiser. Therefore, + it is safe to compare pointers from rcu_dereference() + against NULL pointers. + + - The pointer is never dereferenced after being compared. + Since there are no subsequent dereferences, the compiler + cannot use anything it learned from the comparison + to reorder the non-existent subsequent dereferences. + This sort of comparison occurs frequently when scanning + RCU-protected circular linked lists. + + Note that if checks for being within an RCU read-side + critical section are not required and the pointer is never + dereferenced, rcu_access_pointer() should be used in place + of rcu_dereference(). + + - The comparison is against a pointer that references memory + that was initialized "a long time ago." The reason + this is safe is that even if misordering occurs, the + misordering will not affect the accesses that follow + the comparison. So exactly how long ago is "a long + time ago"? Here are some possibilities: + + - Compile time. + + - Boot time. + + - Module-init time for module code. + + - Prior to kthread creation for kthread code. + + - During some prior acquisition of the lock that + we now hold. + + - Before mod_timer() time for a timer handler. + + There are many other possibilities involving the Linux + kernel's wide array of primitives that cause code to + be invoked at a later time. + + - The pointer being compared against also came from + rcu_dereference(). In this case, both pointers depend + on one rcu_dereference() or another, so you get proper + ordering either way. + + That said, this situation can make certain RCU usage + bugs more likely to happen. Which can be a good thing, + at least if they happen during testing. An example + of such an RCU usage bug is shown in the section titled + "EXAMPLE OF AMPLIFIED RCU-USAGE BUG". + + - All of the accesses following the comparison are stores, + so that a control dependency preserves the needed ordering. + That said, it is easy to get control dependencies wrong. + Please see the "CONTROL DEPENDENCIES" section of + Documentation/memory-barriers.txt for more details. + + - The pointers are not equal -and- the compiler does + not have enough information to deduce the value of the + pointer. Note that the volatile cast in rcu_dereference() + will normally prevent the compiler from knowing too much. + + However, please note that if the compiler knows that the + pointer takes on only one of two values, a not-equal + comparison will provide exactly the information that the + compiler needs to deduce the value of the pointer. + +- Disable any value-speculation optimizations that your compiler + might provide, especially if you are making use of feedback-based + optimizations that take data collected from prior runs. Such + value-speculation optimizations reorder operations by design. + + There is one exception to this rule: Value-speculation + optimizations that leverage the branch-prediction hardware are + safe on strongly ordered systems (such as x86), but not on weakly + ordered systems (such as ARM or Power). Choose your compiler + command-line options wisely! + + +EXAMPLE OF AMPLIFIED RCU-USAGE BUG +---------------------------------- + +Because updaters can run concurrently with RCU readers, RCU readers can +see stale and/or inconsistent values. If RCU readers need fresh or +consistent values, which they sometimes do, they need to take proper +precautions. To see this, consider the following code fragment:: + + struct foo { + int a; + int b; + int c; + }; + struct foo *gp1; + struct foo *gp2; + + void updater(void) + { + struct foo *p; + + p = kmalloc(...); + if (p == NULL) + deal_with_it(); + p->a = 42; /* Each field in its own cache line. */ + p->b = 43; + p->c = 44; + rcu_assign_pointer(gp1, p); + p->b = 143; + p->c = 144; + rcu_assign_pointer(gp2, p); + } + + void reader(void) + { + struct foo *p; + struct foo *q; + int r1, r2; + + p = rcu_dereference(gp2); + if (p == NULL) + return; + r1 = p->b; /* Guaranteed to get 143. */ + q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ + if (p == q) { + /* The compiler decides that q->c is same as p->c. */ + r2 = p->c; /* Could get 44 on weakly order system. */ + } + do_something_with(r1, r2); + } + +You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible, +but you should not be. After all, the updater might have been invoked +a second time between the time reader() loaded into "r1" and the time +that it loaded into "r2". The fact that this same result can occur due +to some reordering from the compiler and CPUs is beside the point. + +But suppose that the reader needs a consistent view? + +Then one approach is to use locking, for example, as follows:: + + struct foo { + int a; + int b; + int c; + spinlock_t lock; + }; + struct foo *gp1; + struct foo *gp2; + + void updater(void) + { + struct foo *p; + + p = kmalloc(...); + if (p == NULL) + deal_with_it(); + spin_lock(&p->lock); + p->a = 42; /* Each field in its own cache line. */ + p->b = 43; + p->c = 44; + spin_unlock(&p->lock); + rcu_assign_pointer(gp1, p); + spin_lock(&p->lock); + p->b = 143; + p->c = 144; + spin_unlock(&p->lock); + rcu_assign_pointer(gp2, p); + } + + void reader(void) + { + struct foo *p; + struct foo *q; + int r1, r2; + + p = rcu_dereference(gp2); + if (p == NULL) + return; + spin_lock(&p->lock); + r1 = p->b; /* Guaranteed to get 143. */ + q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ + if (p == q) { + /* The compiler decides that q->c is same as p->c. */ + r2 = p->c; /* Locking guarantees r2 == 144. */ + } + spin_unlock(&p->lock); + do_something_with(r1, r2); + } + +As always, use the right tool for the job! + + +EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH +----------------------------------------- + +If a pointer obtained from rcu_dereference() compares not-equal to some +other pointer, the compiler normally has no clue what the value of the +first pointer might be. This lack of knowledge prevents the compiler +from carrying out optimizations that otherwise might destroy the ordering +guarantees that RCU depends on. And the volatile cast in rcu_dereference() +should prevent the compiler from guessing the value. + +But without rcu_dereference(), the compiler knows more than you might +expect. Consider the following code fragment:: + + struct foo { + int a; + int b; + }; + static struct foo variable1; + static struct foo variable2; + static struct foo *gp = &variable1; + + void updater(void) + { + initialize_foo(&variable2); + rcu_assign_pointer(gp, &variable2); + /* + * The above is the only store to gp in this translation unit, + * and the address of gp is not exported in any way. + */ + } + + int reader(void) + { + struct foo *p; + + p = gp; + barrier(); + if (p == &variable1) + return p->a; /* Must be variable1.a. */ + else + return p->b; /* Must be variable2.b. */ + } + +Because the compiler can see all stores to "gp", it knows that the only +possible values of "gp" are "variable1" on the one hand and "variable2" +on the other. The comparison in reader() therefore tells the compiler +the exact value of "p" even in the not-equals case. This allows the +compiler to make the return values independent of the load from "gp", +in turn destroying the ordering between this load and the loads of the +return values. This can result in "p->b" returning pre-initialization +garbage values. + +In short, rcu_dereference() is -not- optional when you are going to +dereference the resulting pointer. + + +WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE? +------------------------------------------------------------ + +First, please avoid using rcu_dereference_raw() and also please avoid +using rcu_dereference_check() and rcu_dereference_protected() with a +second argument with a constant value of 1 (or true, for that matter). +With that caution out of the way, here is some guidance for which +member of the rcu_dereference() to use in various situations: + +1. If the access needs to be within an RCU read-side critical + section, use rcu_dereference(). With the new consolidated + RCU flavors, an RCU read-side critical section is entered + using rcu_read_lock(), anything that disables bottom halves, + anything that disables interrupts, or anything that disables + preemption. + +2. If the access might be within an RCU read-side critical section + on the one hand, or protected by (say) my_lock on the other, + use rcu_dereference_check(), for example:: + + p1 = rcu_dereference_check(p->rcu_protected_pointer, + lockdep_is_held(&my_lock)); + + +3. If the access might be within an RCU read-side critical section + on the one hand, or protected by either my_lock or your_lock on + the other, again use rcu_dereference_check(), for example:: + + p1 = rcu_dereference_check(p->rcu_protected_pointer, + lockdep_is_held(&my_lock) || + lockdep_is_held(&your_lock)); + +4. If the access is on the update side, so that it is always protected + by my_lock, use rcu_dereference_protected():: + + p1 = rcu_dereference_protected(p->rcu_protected_pointer, + lockdep_is_held(&my_lock)); + + This can be extended to handle multiple locks as in #3 above, + and both can be extended to check other conditions as well. + +5. If the protection is supplied by the caller, and is thus unknown + to this code, that is the rare case when rcu_dereference_raw() + is appropriate. In addition, rcu_dereference_raw() might be + appropriate when the lockdep expression would be excessively + complex, except that a better approach in that case might be to + take a long hard look at your synchronization design. Still, + there are data-locking cases where any one of a very large number + of locks or reference counters suffices to protect the pointer, + so rcu_dereference_raw() does have its place. + + However, its place is probably quite a bit smaller than one + might expect given the number of uses in the current kernel. + Ditto for its synonym, rcu_dereference_check( ... , 1), and + its close relative, rcu_dereference_protected(... , 1). + + +SPARSE CHECKING OF RCU-PROTECTED POINTERS +----------------------------------------- + +The sparse static-analysis tool checks for direct access to RCU-protected +pointers, which can result in "interesting" bugs due to compiler +optimizations involving invented loads and perhaps also load tearing. +For example, suppose someone mistakenly does something like this:: + + p = q->rcu_protected_pointer; + do_something_with(p->a); + do_something_else_with(p->b); + +If register pressure is high, the compiler might optimize "p" out +of existence, transforming the code to something like this:: + + do_something_with(q->rcu_protected_pointer->a); + do_something_else_with(q->rcu_protected_pointer->b); + +This could fatally disappoint your code if q->rcu_protected_pointer +changed in the meantime. Nor is this a theoretical problem: Exactly +this sort of bug cost Paul E. McKenney (and several of his innocent +colleagues) a three-day weekend back in the early 1990s. + +Load tearing could of course result in dereferencing a mashup of a pair +of pointers, which also might fatally disappoint your code. + +These problems could have been avoided simply by making the code instead +read as follows:: + + p = rcu_dereference(q->rcu_protected_pointer); + do_something_with(p->a); + do_something_else_with(p->b); + +Unfortunately, these sorts of bugs can be extremely hard to spot during +review. This is where the sparse tool comes into play, along with the +"__rcu" marker. If you mark a pointer declaration, whether in a structure +or as a formal parameter, with "__rcu", which tells sparse to complain if +this pointer is accessed directly. It will also cause sparse to complain +if a pointer not marked with "__rcu" is accessed using rcu_dereference() +and friends. For example, ->rcu_protected_pointer might be declared as +follows:: + + struct foo __rcu *rcu_protected_pointer; + +Use of "__rcu" is opt-in. If you choose not to use it, then you should +ignore the sparse warnings. diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt deleted file mode 100644 index bf699e8cfc75ca18fe2ee94472ee46e6d2a6a9c8..0000000000000000000000000000000000000000 --- a/Documentation/RCU/rcu_dereference.txt +++ /dev/null @@ -1,456 +0,0 @@ -PROPER CARE AND FEEDING OF RETURN VALUES FROM rcu_dereference() - -Most of the time, you can use values from rcu_dereference() or one of -the similar primitives without worries. Dereferencing (prefix "*"), -field selection ("->"), assignment ("="), address-of ("&"), addition and -subtraction of constants, and casts all work quite naturally and safely. - -It is nevertheless possible to get into trouble with other operations. -Follow these rules to keep your RCU code working properly: - -o You must use one of the rcu_dereference() family of primitives - to load an RCU-protected pointer, otherwise CONFIG_PROVE_RCU - will complain. Worse yet, your code can see random memory-corruption - bugs due to games that compilers and DEC Alpha can play. - Without one of the rcu_dereference() primitives, compilers - can reload the value, and won't your code have fun with two - different values for a single pointer! Without rcu_dereference(), - DEC Alpha can load a pointer, dereference that pointer, and - return data preceding initialization that preceded the store of - the pointer. - - In addition, the volatile cast in rcu_dereference() prevents the - compiler from deducing the resulting pointer value. Please see - the section entitled "EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH" - for an example where the compiler can in fact deduce the exact - value of the pointer, and thus cause misordering. - -o You are only permitted to use rcu_dereference on pointer values. - The compiler simply knows too much about integral values to - trust it to carry dependencies through integer operations. - There are a very few exceptions, namely that you can temporarily - cast the pointer to uintptr_t in order to: - - o Set bits and clear bits down in the must-be-zero low-order - bits of that pointer. This clearly means that the pointer - must have alignment constraints, for example, this does - -not- work in general for char* pointers. - - o XOR bits to translate pointers, as is done in some - classic buddy-allocator algorithms. - - It is important to cast the value back to pointer before - doing much of anything else with it. - -o Avoid cancellation when using the "+" and "-" infix arithmetic - operators. For example, for a given variable "x", avoid - "(x-(uintptr_t)x)" for char* pointers. The compiler is within its - rights to substitute zero for this sort of expression, so that - subsequent accesses no longer depend on the rcu_dereference(), - again possibly resulting in bugs due to misordering. - - Of course, if "p" is a pointer from rcu_dereference(), and "a" - and "b" are integers that happen to be equal, the expression - "p+a-b" is safe because its value still necessarily depends on - the rcu_dereference(), thus maintaining proper ordering. - -o If you are using RCU to protect JITed functions, so that the - "()" function-invocation operator is applied to a value obtained - (directly or indirectly) from rcu_dereference(), you may need to - interact directly with the hardware to flush instruction caches. - This issue arises on some systems when a newly JITed function is - using the same memory that was used by an earlier JITed function. - -o Do not use the results from relational operators ("==", "!=", - ">", ">=", "<", or "<=") when dereferencing. For example, - the following (quite strange) code is buggy: - - int *p; - int *q; - - ... - - p = rcu_dereference(gp) - q = &global_q; - q += p > &oom_p; - r1 = *q; /* BUGGY!!! */ - - As before, the reason this is buggy is that relational operators - are often compiled using branches. And as before, although - weak-memory machines such as ARM or PowerPC do order stores - after such branches, but can speculate loads, which can again - result in misordering bugs. - -o Be very careful about comparing pointers obtained from - rcu_dereference() against non-NULL values. As Linus Torvalds - explained, if the two pointers are equal, the compiler could - substitute the pointer you are comparing against for the pointer - obtained from rcu_dereference(). For example: - - p = rcu_dereference(gp); - if (p == &default_struct) - do_default(p->a); - - Because the compiler now knows that the value of "p" is exactly - the address of the variable "default_struct", it is free to - transform this code into the following: - - p = rcu_dereference(gp); - if (p == &default_struct) - do_default(default_struct.a); - - On ARM and Power hardware, the load from "default_struct.a" - can now be speculated, such that it might happen before the - rcu_dereference(). This could result in bugs due to misordering. - - However, comparisons are OK in the following cases: - - o The comparison was against the NULL pointer. If the - compiler knows that the pointer is NULL, you had better - not be dereferencing it anyway. If the comparison is - non-equal, the compiler is none the wiser. Therefore, - it is safe to compare pointers from rcu_dereference() - against NULL pointers. - - o The pointer is never dereferenced after being compared. - Since there are no subsequent dereferences, the compiler - cannot use anything it learned from the comparison - to reorder the non-existent subsequent dereferences. - This sort of comparison occurs frequently when scanning - RCU-protected circular linked lists. - - Note that if checks for being within an RCU read-side - critical section are not required and the pointer is never - dereferenced, rcu_access_pointer() should be used in place - of rcu_dereference(). - - o The comparison is against a pointer that references memory - that was initialized "a long time ago." The reason - this is safe is that even if misordering occurs, the - misordering will not affect the accesses that follow - the comparison. So exactly how long ago is "a long - time ago"? Here are some possibilities: - - o Compile time. - - o Boot time. - - o Module-init time for module code. - - o Prior to kthread creation for kthread code. - - o During some prior acquisition of the lock that - we now hold. - - o Before mod_timer() time for a timer handler. - - There are many other possibilities involving the Linux - kernel's wide array of primitives that cause code to - be invoked at a later time. - - o The pointer being compared against also came from - rcu_dereference(). In this case, both pointers depend - on one rcu_dereference() or another, so you get proper - ordering either way. - - That said, this situation can make certain RCU usage - bugs more likely to happen. Which can be a good thing, - at least if they happen during testing. An example - of such an RCU usage bug is shown in the section titled - "EXAMPLE OF AMPLIFIED RCU-USAGE BUG". - - o All of the accesses following the comparison are stores, - so that a control dependency preserves the needed ordering. - That said, it is easy to get control dependencies wrong. - Please see the "CONTROL DEPENDENCIES" section of - Documentation/memory-barriers.txt for more details. - - o The pointers are not equal -and- the compiler does - not have enough information to deduce the value of the - pointer. Note that the volatile cast in rcu_dereference() - will normally prevent the compiler from knowing too much. - - However, please note that if the compiler knows that the - pointer takes on only one of two values, a not-equal - comparison will provide exactly the information that the - compiler needs to deduce the value of the pointer. - -o Disable any value-speculation optimizations that your compiler - might provide, especially if you are making use of feedback-based - optimizations that take data collected from prior runs. Such - value-speculation optimizations reorder operations by design. - - There is one exception to this rule: Value-speculation - optimizations that leverage the branch-prediction hardware are - safe on strongly ordered systems (such as x86), but not on weakly - ordered systems (such as ARM or Power). Choose your compiler - command-line options wisely! - - -EXAMPLE OF AMPLIFIED RCU-USAGE BUG - -Because updaters can run concurrently with RCU readers, RCU readers can -see stale and/or inconsistent values. If RCU readers need fresh or -consistent values, which they sometimes do, they need to take proper -precautions. To see this, consider the following code fragment: - - struct foo { - int a; - int b; - int c; - }; - struct foo *gp1; - struct foo *gp2; - - void updater(void) - { - struct foo *p; - - p = kmalloc(...); - if (p == NULL) - deal_with_it(); - p->a = 42; /* Each field in its own cache line. */ - p->b = 43; - p->c = 44; - rcu_assign_pointer(gp1, p); - p->b = 143; - p->c = 144; - rcu_assign_pointer(gp2, p); - } - - void reader(void) - { - struct foo *p; - struct foo *q; - int r1, r2; - - p = rcu_dereference(gp2); - if (p == NULL) - return; - r1 = p->b; /* Guaranteed to get 143. */ - q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ - if (p == q) { - /* The compiler decides that q->c is same as p->c. */ - r2 = p->c; /* Could get 44 on weakly order system. */ - } - do_something_with(r1, r2); - } - -You might be surprised that the outcome (r1 == 143 && r2 == 44) is possible, -but you should not be. After all, the updater might have been invoked -a second time between the time reader() loaded into "r1" and the time -that it loaded into "r2". The fact that this same result can occur due -to some reordering from the compiler and CPUs is beside the point. - -But suppose that the reader needs a consistent view? - -Then one approach is to use locking, for example, as follows: - - struct foo { - int a; - int b; - int c; - spinlock_t lock; - }; - struct foo *gp1; - struct foo *gp2; - - void updater(void) - { - struct foo *p; - - p = kmalloc(...); - if (p == NULL) - deal_with_it(); - spin_lock(&p->lock); - p->a = 42; /* Each field in its own cache line. */ - p->b = 43; - p->c = 44; - spin_unlock(&p->lock); - rcu_assign_pointer(gp1, p); - spin_lock(&p->lock); - p->b = 143; - p->c = 144; - spin_unlock(&p->lock); - rcu_assign_pointer(gp2, p); - } - - void reader(void) - { - struct foo *p; - struct foo *q; - int r1, r2; - - p = rcu_dereference(gp2); - if (p == NULL) - return; - spin_lock(&p->lock); - r1 = p->b; /* Guaranteed to get 143. */ - q = rcu_dereference(gp1); /* Guaranteed non-NULL. */ - if (p == q) { - /* The compiler decides that q->c is same as p->c. */ - r2 = p->c; /* Locking guarantees r2 == 144. */ - } - spin_unlock(&p->lock); - do_something_with(r1, r2); - } - -As always, use the right tool for the job! - - -EXAMPLE WHERE THE COMPILER KNOWS TOO MUCH - -If a pointer obtained from rcu_dereference() compares not-equal to some -other pointer, the compiler normally has no clue what the value of the -first pointer might be. This lack of knowledge prevents the compiler -from carrying out optimizations that otherwise might destroy the ordering -guarantees that RCU depends on. And the volatile cast in rcu_dereference() -should prevent the compiler from guessing the value. - -But without rcu_dereference(), the compiler knows more than you might -expect. Consider the following code fragment: - - struct foo { - int a; - int b; - }; - static struct foo variable1; - static struct foo variable2; - static struct foo *gp = &variable1; - - void updater(void) - { - initialize_foo(&variable2); - rcu_assign_pointer(gp, &variable2); - /* - * The above is the only store to gp in this translation unit, - * and the address of gp is not exported in any way. - */ - } - - int reader(void) - { - struct foo *p; - - p = gp; - barrier(); - if (p == &variable1) - return p->a; /* Must be variable1.a. */ - else - return p->b; /* Must be variable2.b. */ - } - -Because the compiler can see all stores to "gp", it knows that the only -possible values of "gp" are "variable1" on the one hand and "variable2" -on the other. The comparison in reader() therefore tells the compiler -the exact value of "p" even in the not-equals case. This allows the -compiler to make the return values independent of the load from "gp", -in turn destroying the ordering between this load and the loads of the -return values. This can result in "p->b" returning pre-initialization -garbage values. - -In short, rcu_dereference() is -not- optional when you are going to -dereference the resulting pointer. - - -WHICH MEMBER OF THE rcu_dereference() FAMILY SHOULD YOU USE? - -First, please avoid using rcu_dereference_raw() and also please avoid -using rcu_dereference_check() and rcu_dereference_protected() with a -second argument with a constant value of 1 (or true, for that matter). -With that caution out of the way, here is some guidance for which -member of the rcu_dereference() to use in various situations: - -1. If the access needs to be within an RCU read-side critical - section, use rcu_dereference(). With the new consolidated - RCU flavors, an RCU read-side critical section is entered - using rcu_read_lock(), anything that disables bottom halves, - anything that disables interrupts, or anything that disables - preemption. - -2. If the access might be within an RCU read-side critical section - on the one hand, or protected by (say) my_lock on the other, - use rcu_dereference_check(), for example: - - p1 = rcu_dereference_check(p->rcu_protected_pointer, - lockdep_is_held(&my_lock)); - - -3. If the access might be within an RCU read-side critical section - on the one hand, or protected by either my_lock or your_lock on - the other, again use rcu_dereference_check(), for example: - - p1 = rcu_dereference_check(p->rcu_protected_pointer, - lockdep_is_held(&my_lock) || - lockdep_is_held(&your_lock)); - -4. If the access is on the update side, so that it is always protected - by my_lock, use rcu_dereference_protected(): - - p1 = rcu_dereference_protected(p->rcu_protected_pointer, - lockdep_is_held(&my_lock)); - - This can be extended to handle multiple locks as in #3 above, - and both can be extended to check other conditions as well. - -5. If the protection is supplied by the caller, and is thus unknown - to this code, that is the rare case when rcu_dereference_raw() - is appropriate. In addition, rcu_dereference_raw() might be - appropriate when the lockdep expression would be excessively - complex, except that a better approach in that case might be to - take a long hard look at your synchronization design. Still, - there are data-locking cases where any one of a very large number - of locks or reference counters suffices to protect the pointer, - so rcu_dereference_raw() does have its place. - - However, its place is probably quite a bit smaller than one - might expect given the number of uses in the current kernel. - Ditto for its synonym, rcu_dereference_check( ... , 1), and - its close relative, rcu_dereference_protected(... , 1). - - -SPARSE CHECKING OF RCU-PROTECTED POINTERS - -The sparse static-analysis tool checks for direct access to RCU-protected -pointers, which can result in "interesting" bugs due to compiler -optimizations involving invented loads and perhaps also load tearing. -For example, suppose someone mistakenly does something like this: - - p = q->rcu_protected_pointer; - do_something_with(p->a); - do_something_else_with(p->b); - -If register pressure is high, the compiler might optimize "p" out -of existence, transforming the code to something like this: - - do_something_with(q->rcu_protected_pointer->a); - do_something_else_with(q->rcu_protected_pointer->b); - -This could fatally disappoint your code if q->rcu_protected_pointer -changed in the meantime. Nor is this a theoretical problem: Exactly -this sort of bug cost Paul E. McKenney (and several of his innocent -colleagues) a three-day weekend back in the early 1990s. - -Load tearing could of course result in dereferencing a mashup of a pair -of pointers, which also might fatally disappoint your code. - -These problems could have been avoided simply by making the code instead -read as follows: - - p = rcu_dereference(q->rcu_protected_pointer); - do_something_with(p->a); - do_something_else_with(p->b); - -Unfortunately, these sorts of bugs can be extremely hard to spot during -review. This is where the sparse tool comes into play, along with the -"__rcu" marker. If you mark a pointer declaration, whether in a structure -or as a formal parameter, with "__rcu", which tells sparse to complain if -this pointer is accessed directly. It will also cause sparse to complain -if a pointer not marked with "__rcu" is accessed using rcu_dereference() -and friends. For example, ->rcu_protected_pointer might be declared as -follows: - - struct foo __rcu *rcu_protected_pointer; - -Use of "__rcu" is opt-in. If you choose not to use it, then you should -ignore the sparse warnings. diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst new file mode 100644 index 0000000000000000000000000000000000000000..f64f4413a47c4583d0de3ea94dc0f305cabbfcb0 --- /dev/null +++ b/Documentation/RCU/rcubarrier.rst @@ -0,0 +1,353 @@ +.. _rcu_barrier: + +RCU and Unloadable Modules +========================== + +[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/] + +RCU (read-copy update) is a synchronization mechanism that can be thought +of as a replacement for read-writer locking (among other things), but with +very low-overhead readers that are immune to deadlock, priority inversion, +and unbounded latency. RCU read-side critical sections are delimited +by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT +kernels, generate no code whatsoever. + +This means that RCU writers are unaware of the presence of concurrent +readers, so that RCU updates to shared data must be undertaken quite +carefully, leaving an old version of the data structure in place until all +pre-existing readers have finished. These old versions are needed because +such readers might hold a reference to them. RCU updates can therefore be +rather expensive, and RCU is thus best suited for read-mostly situations. + +How can an RCU writer possibly determine when all readers are finished, +given that readers might well leave absolutely no trace of their +presence? There is a synchronize_rcu() primitive that blocks until all +pre-existing readers have completed. An updater wishing to delete an +element p from a linked list might do the following, while holding an +appropriate lock, of course:: + + list_del_rcu(p); + synchronize_rcu(); + kfree(p); + +But the above code cannot be used in IRQ context -- the call_rcu() +primitive must be used instead. This primitive takes a pointer to an +rcu_head struct placed within the RCU-protected data structure and +another pointer to a function that may be invoked later to free that +structure. Code to delete an element p from the linked list from IRQ +context might then be as follows:: + + list_del_rcu(p); + call_rcu(&p->rcu, p_callback); + +Since call_rcu() never blocks, this code can safely be used from within +IRQ context. The function p_callback() might be defined as follows:: + + static void p_callback(struct rcu_head *rp) + { + struct pstruct *p = container_of(rp, struct pstruct, rcu); + + kfree(p); + } + + +Unloading Modules That Use call_rcu() +------------------------------------- + +But what if p_callback is defined in an unloadable module? + +If we unload the module while some RCU callbacks are pending, +the CPUs executing these callbacks are going to be severely +disappointed when they are later invoked, as fancifully depicted at +http://lwn.net/images/ns/kernel/rcu-drop.jpg. + +We could try placing a synchronize_rcu() in the module-exit code path, +but this is not sufficient. Although synchronize_rcu() does wait for a +grace period to elapse, it does not wait for the callbacks to complete. + +One might be tempted to try several back-to-back synchronize_rcu() +calls, but this is still not guaranteed to work. If there is a very +heavy RCU-callback load, then some of the callbacks might be deferred +in order to allow other processing to proceed. Such deferral is required +in realtime kernels in order to avoid excessive scheduling latencies. + + +rcu_barrier() +------------- + +We instead need the rcu_barrier() primitive. Rather than waiting for +a grace period to elapse, rcu_barrier() waits for all outstanding RCU +callbacks to complete. Please note that rcu_barrier() does **not** imply +synchronize_rcu(), in particular, if there are no RCU callbacks queued +anywhere, rcu_barrier() is within its rights to return immediately, +without waiting for a grace period to elapse. + +Pseudo-code using rcu_barrier() is as follows: + + 1. Prevent any new RCU callbacks from being posted. + 2. Execute rcu_barrier(). + 3. Allow the module to be unloaded. + +There is also an srcu_barrier() function for SRCU, and you of course +must match the flavor of rcu_barrier() with that of call_rcu(). If your +module uses multiple flavors of call_rcu(), then it must also use multiple +flavors of rcu_barrier() when unloading that module. For example, if +it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on +srcu_struct_2, then the following three lines of code will be required +when unloading:: + + 1 rcu_barrier(); + 2 srcu_barrier(&srcu_struct_1); + 3 srcu_barrier(&srcu_struct_2); + +The rcutorture module makes use of rcu_barrier() in its exit function +as follows:: + + 1 static void + 2 rcu_torture_cleanup(void) + 3 { + 4 int i; + 5 + 6 fullstop = 1; + 7 if (shuffler_task != NULL) { + 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + 9 kthread_stop(shuffler_task); + 10 } + 11 shuffler_task = NULL; + 12 + 13 if (writer_task != NULL) { + 14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + 15 kthread_stop(writer_task); + 16 } + 17 writer_task = NULL; + 18 + 19 if (reader_tasks != NULL) { + 20 for (i = 0; i < nrealreaders; i++) { + 21 if (reader_tasks[i] != NULL) { + 22 VERBOSE_PRINTK_STRING( + 23 "Stopping rcu_torture_reader task"); + 24 kthread_stop(reader_tasks[i]); + 25 } + 26 reader_tasks[i] = NULL; + 27 } + 28 kfree(reader_tasks); + 29 reader_tasks = NULL; + 30 } + 31 rcu_torture_current = NULL; + 32 + 33 if (fakewriter_tasks != NULL) { + 34 for (i = 0; i < nfakewriters; i++) { + 35 if (fakewriter_tasks[i] != NULL) { + 36 VERBOSE_PRINTK_STRING( + 37 "Stopping rcu_torture_fakewriter task"); + 38 kthread_stop(fakewriter_tasks[i]); + 39 } + 40 fakewriter_tasks[i] = NULL; + 41 } + 42 kfree(fakewriter_tasks); + 43 fakewriter_tasks = NULL; + 44 } + 45 + 46 if (stats_task != NULL) { + 47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + 48 kthread_stop(stats_task); + 49 } + 50 stats_task = NULL; + 51 + 52 /* Wait for all RCU callbacks to fire. */ + 53 rcu_barrier(); + 54 + 55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + 56 + 57 if (cur_ops->cleanup != NULL) + 58 cur_ops->cleanup(); + 59 if (atomic_read(&n_rcu_torture_error)) + 60 rcu_torture_print_module_parms("End of test: FAILURE"); + 61 else + 62 rcu_torture_print_module_parms("End of test: SUCCESS"); + 63 } + +Line 6 sets a global variable that prevents any RCU callbacks from +re-posting themselves. This will not be necessary in most cases, since +RCU callbacks rarely include calls to call_rcu(). However, the rcutorture +module is an exception to this rule, and therefore needs to set this +global variable. + +Lines 7-50 stop all the kernel tasks associated with the rcutorture +module. Therefore, once execution reaches line 53, no more rcutorture +RCU callbacks will be posted. The rcu_barrier() call on line 53 waits +for any pre-existing callbacks to complete. + +Then lines 55-62 print status and do operation-specific cleanup, and +then return, permitting the module-unload operation to be completed. + +.. _rcubarrier_quiz_1: + +Quick Quiz #1: + Is there any other situation where rcu_barrier() might + be required? + +:ref:`Answer to Quick Quiz #1 ` + +Your module might have additional complications. For example, if your +module invokes call_rcu() from timers, you will need to first cancel all +the timers, and only then invoke rcu_barrier() to wait for any remaining +RCU callbacks to complete. + +Of course, if you module uses call_rcu(), you will need to invoke +rcu_barrier() before unloading. Similarly, if your module uses +call_srcu(), you will need to invoke srcu_barrier() before unloading, +and on the same srcu_struct structure. If your module uses call_rcu() +**and** call_srcu(), then you will need to invoke rcu_barrier() **and** +srcu_barrier(). + + +Implementing rcu_barrier() +-------------------------- + +Dipankar Sarma's implementation of rcu_barrier() makes use of the fact +that RCU callbacks are never reordered once queued on one of the per-CPU +queues. His implementation queues an RCU callback on each of the per-CPU +callback queues, and then waits until they have all started executing, at +which point, all earlier RCU callbacks are guaranteed to have completed. + +The original code for rcu_barrier() was as follows:: + + 1 void rcu_barrier(void) + 2 { + 3 BUG_ON(in_interrupt()); + 4 /* Take cpucontrol mutex to protect against CPU hotplug */ + 5 mutex_lock(&rcu_barrier_mutex); + 6 init_completion(&rcu_barrier_completion); + 7 atomic_set(&rcu_barrier_cpu_count, 0); + 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); + 9 wait_for_completion(&rcu_barrier_completion); + 10 mutex_unlock(&rcu_barrier_mutex); + 11 } + +Line 3 verifies that the caller is in process context, and lines 5 and 10 +use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the +global completion and counters at a time, which are initialized on lines +6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is +shown below. Note that the final "1" in on_each_cpu()'s argument list +ensures that all the calls to rcu_barrier_func() will have completed +before on_each_cpu() returns. Line 9 then waits for the completion. + +This code was rewritten in 2008 and several times thereafter, but this +still gives the general idea. + +The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() +to post an RCU callback, as follows:: + + 1 static void rcu_barrier_func(void *notused) + 2 { + 3 int cpu = smp_processor_id(); + 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + 5 struct rcu_head *head; + 6 + 7 head = &rdp->barrier; + 8 atomic_inc(&rcu_barrier_cpu_count); + 9 call_rcu(head, rcu_barrier_callback); + 10 } + +Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure, +which contains the struct rcu_head that needed for the later call to +call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line +8 increments a global counter. This counter will later be decremented +by the callback. Line 9 then registers the rcu_barrier_callback() on +the current CPU's queue. + +The rcu_barrier_callback() function simply atomically decrements the +rcu_barrier_cpu_count variable and finalizes the completion when it +reaches zero, as follows:: + + 1 static void rcu_barrier_callback(struct rcu_head *notused) + 2 { + 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + 4 complete(&rcu_barrier_completion); + 5 } + +.. _rcubarrier_quiz_2: + +Quick Quiz #2: + What happens if CPU 0's rcu_barrier_func() executes + immediately (thus incrementing rcu_barrier_cpu_count to the + value one), but the other CPU's rcu_barrier_func() invocations + are delayed for a full grace period? Couldn't this result in + rcu_barrier() returning prematurely? + +:ref:`Answer to Quick Quiz #2 ` + +The current rcu_barrier() implementation is more complex, due to the need +to avoid disturbing idle CPUs (especially on battery-powered systems) +and the need to minimally disturb non-idle CPUs in real-time systems. +However, the code above illustrates the concepts. + + +rcu_barrier() Summary +--------------------- + +The rcu_barrier() primitive has seen relatively little use, since most +code using RCU is in the core kernel rather than in modules. However, if +you are using RCU from an unloadable module, you need to use rcu_barrier() +so that your module may be safely unloaded. + + +Answers to Quick Quizzes +------------------------ + +.. _answer_rcubarrier_quiz_1: + +Quick Quiz #1: + Is there any other situation where rcu_barrier() might + be required? + +Answer: Interestingly enough, rcu_barrier() was not originally + implemented for module unloading. Nikita Danilov was using + RCU in a filesystem, which resulted in a similar situation at + filesystem-unmount time. Dipankar Sarma coded up rcu_barrier() + in response, so that Nikita could invoke it during the + filesystem-unmount process. + + Much later, yours truly hit the RCU module-unload problem when + implementing rcutorture, and found that rcu_barrier() solves + this problem as well. + +:ref:`Back to Quick Quiz #1 ` + +.. _answer_rcubarrier_quiz_2: + +Quick Quiz #2: + What happens if CPU 0's rcu_barrier_func() executes + immediately (thus incrementing rcu_barrier_cpu_count to the + value one), but the other CPU's rcu_barrier_func() invocations + are delayed for a full grace period? Couldn't this result in + rcu_barrier() returning prematurely? + +Answer: This cannot happen. The reason is that on_each_cpu() has its last + argument, the wait flag, set to "1". This flag is passed through + to smp_call_function() and further to smp_call_function_on_cpu(), + causing this latter to spin until the cross-CPU invocation of + rcu_barrier_func() has completed. This by itself would prevent + a grace period from completing on non-CONFIG_PREEMPT kernels, + since each CPU must undergo a context switch (or other quiescent + state) before the grace period can complete. However, this is + of no use in CONFIG_PREEMPT kernels. + + Therefore, on_each_cpu() disables preemption across its call + to smp_call_function() and also across the local call to + rcu_barrier_func(). This prevents the local CPU from context + switching, again preventing grace periods from completing. This + means that all CPUs have executed rcu_barrier_func() before + the first rcu_barrier_callback() can possibly execute, in turn + preventing rcu_barrier_cpu_count from prematurely reaching zero. + + Currently, -rt implementations of RCU keep but a single global + queue for RCU callbacks, and thus do not suffer from this + problem. However, when the -rt RCU eventually does have per-CPU + callback queues, things will have to change. One simple change + is to add an rcu_read_lock() before line 8 of rcu_barrier() + and an rcu_read_unlock() after line 8 of this same function. If + you can think of a better change, please let me know! + +:ref:`Back to Quick Quiz #2 ` diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt deleted file mode 100644 index a2782df697328e3293769b429b5321c82e0e0b16..0000000000000000000000000000000000000000 --- a/Documentation/RCU/rcubarrier.txt +++ /dev/null @@ -1,325 +0,0 @@ -RCU and Unloadable Modules - -[Originally published in LWN Jan. 14, 2007: http://lwn.net/Articles/217484/] - -RCU (read-copy update) is a synchronization mechanism that can be thought -of as a replacement for read-writer locking (among other things), but with -very low-overhead readers that are immune to deadlock, priority inversion, -and unbounded latency. RCU read-side critical sections are delimited -by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT -kernels, generate no code whatsoever. - -This means that RCU writers are unaware of the presence of concurrent -readers, so that RCU updates to shared data must be undertaken quite -carefully, leaving an old version of the data structure in place until all -pre-existing readers have finished. These old versions are needed because -such readers might hold a reference to them. RCU updates can therefore be -rather expensive, and RCU is thus best suited for read-mostly situations. - -How can an RCU writer possibly determine when all readers are finished, -given that readers might well leave absolutely no trace of their -presence? There is a synchronize_rcu() primitive that blocks until all -pre-existing readers have completed. An updater wishing to delete an -element p from a linked list might do the following, while holding an -appropriate lock, of course: - - list_del_rcu(p); - synchronize_rcu(); - kfree(p); - -But the above code cannot be used in IRQ context -- the call_rcu() -primitive must be used instead. This primitive takes a pointer to an -rcu_head struct placed within the RCU-protected data structure and -another pointer to a function that may be invoked later to free that -structure. Code to delete an element p from the linked list from IRQ -context might then be as follows: - - list_del_rcu(p); - call_rcu(&p->rcu, p_callback); - -Since call_rcu() never blocks, this code can safely be used from within -IRQ context. The function p_callback() might be defined as follows: - - static void p_callback(struct rcu_head *rp) - { - struct pstruct *p = container_of(rp, struct pstruct, rcu); - - kfree(p); - } - - -Unloading Modules That Use call_rcu() - -But what if p_callback is defined in an unloadable module? - -If we unload the module while some RCU callbacks are pending, -the CPUs executing these callbacks are going to be severely -disappointed when they are later invoked, as fancifully depicted at -http://lwn.net/images/ns/kernel/rcu-drop.jpg. - -We could try placing a synchronize_rcu() in the module-exit code path, -but this is not sufficient. Although synchronize_rcu() does wait for a -grace period to elapse, it does not wait for the callbacks to complete. - -One might be tempted to try several back-to-back synchronize_rcu() -calls, but this is still not guaranteed to work. If there is a very -heavy RCU-callback load, then some of the callbacks might be deferred -in order to allow other processing to proceed. Such deferral is required -in realtime kernels in order to avoid excessive scheduling latencies. - - -rcu_barrier() - -We instead need the rcu_barrier() primitive. Rather than waiting for -a grace period to elapse, rcu_barrier() waits for all outstanding RCU -callbacks to complete. Please note that rcu_barrier() does -not- imply -synchronize_rcu(), in particular, if there are no RCU callbacks queued -anywhere, rcu_barrier() is within its rights to return immediately, -without waiting for a grace period to elapse. - -Pseudo-code using rcu_barrier() is as follows: - - 1. Prevent any new RCU callbacks from being posted. - 2. Execute rcu_barrier(). - 3. Allow the module to be unloaded. - -There is also an srcu_barrier() function for SRCU, and you of course -must match the flavor of rcu_barrier() with that of call_rcu(). If your -module uses multiple flavors of call_rcu(), then it must also use multiple -flavors of rcu_barrier() when unloading that module. For example, if -it uses call_rcu(), call_srcu() on srcu_struct_1, and call_srcu() on -srcu_struct_2(), then the following three lines of code will be required -when unloading: - - 1 rcu_barrier(); - 2 srcu_barrier(&srcu_struct_1); - 3 srcu_barrier(&srcu_struct_2); - -The rcutorture module makes use of rcu_barrier() in its exit function -as follows: - - 1 static void - 2 rcu_torture_cleanup(void) - 3 { - 4 int i; - 5 - 6 fullstop = 1; - 7 if (shuffler_task != NULL) { - 8 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - 9 kthread_stop(shuffler_task); -10 } -11 shuffler_task = NULL; -12 -13 if (writer_task != NULL) { -14 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); -15 kthread_stop(writer_task); -16 } -17 writer_task = NULL; -18 -19 if (reader_tasks != NULL) { -20 for (i = 0; i < nrealreaders; i++) { -21 if (reader_tasks[i] != NULL) { -22 VERBOSE_PRINTK_STRING( -23 "Stopping rcu_torture_reader task"); -24 kthread_stop(reader_tasks[i]); -25 } -26 reader_tasks[i] = NULL; -27 } -28 kfree(reader_tasks); -29 reader_tasks = NULL; -30 } -31 rcu_torture_current = NULL; -32 -33 if (fakewriter_tasks != NULL) { -34 for (i = 0; i < nfakewriters; i++) { -35 if (fakewriter_tasks[i] != NULL) { -36 VERBOSE_PRINTK_STRING( -37 "Stopping rcu_torture_fakewriter task"); -38 kthread_stop(fakewriter_tasks[i]); -39 } -40 fakewriter_tasks[i] = NULL; -41 } -42 kfree(fakewriter_tasks); -43 fakewriter_tasks = NULL; -44 } -45 -46 if (stats_task != NULL) { -47 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); -48 kthread_stop(stats_task); -49 } -50 stats_task = NULL; -51 -52 /* Wait for all RCU callbacks to fire. */ -53 rcu_barrier(); -54 -55 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ -56 -57 if (cur_ops->cleanup != NULL) -58 cur_ops->cleanup(); -59 if (atomic_read(&n_rcu_torture_error)) -60 rcu_torture_print_module_parms("End of test: FAILURE"); -61 else -62 rcu_torture_print_module_parms("End of test: SUCCESS"); -63 } - -Line 6 sets a global variable that prevents any RCU callbacks from -re-posting themselves. This will not be necessary in most cases, since -RCU callbacks rarely include calls to call_rcu(). However, the rcutorture -module is an exception to this rule, and therefore needs to set this -global variable. - -Lines 7-50 stop all the kernel tasks associated with the rcutorture -module. Therefore, once execution reaches line 53, no more rcutorture -RCU callbacks will be posted. The rcu_barrier() call on line 53 waits -for any pre-existing callbacks to complete. - -Then lines 55-62 print status and do operation-specific cleanup, and -then return, permitting the module-unload operation to be completed. - -Quick Quiz #1: Is there any other situation where rcu_barrier() might - be required? - -Your module might have additional complications. For example, if your -module invokes call_rcu() from timers, you will need to first cancel all -the timers, and only then invoke rcu_barrier() to wait for any remaining -RCU callbacks to complete. - -Of course, if you module uses call_rcu(), you will need to invoke -rcu_barrier() before unloading. Similarly, if your module uses -call_srcu(), you will need to invoke srcu_barrier() before unloading, -and on the same srcu_struct structure. If your module uses call_rcu() --and- call_srcu(), then you will need to invoke rcu_barrier() -and- -srcu_barrier(). - - -Implementing rcu_barrier() - -Dipankar Sarma's implementation of rcu_barrier() makes use of the fact -that RCU callbacks are never reordered once queued on one of the per-CPU -queues. His implementation queues an RCU callback on each of the per-CPU -callback queues, and then waits until they have all started executing, at -which point, all earlier RCU callbacks are guaranteed to have completed. - -The original code for rcu_barrier() was as follows: - - 1 void rcu_barrier(void) - 2 { - 3 BUG_ON(in_interrupt()); - 4 /* Take cpucontrol mutex to protect against CPU hotplug */ - 5 mutex_lock(&rcu_barrier_mutex); - 6 init_completion(&rcu_barrier_completion); - 7 atomic_set(&rcu_barrier_cpu_count, 0); - 8 on_each_cpu(rcu_barrier_func, NULL, 0, 1); - 9 wait_for_completion(&rcu_barrier_completion); -10 mutex_unlock(&rcu_barrier_mutex); -11 } - -Line 3 verifies that the caller is in process context, and lines 5 and 10 -use rcu_barrier_mutex to ensure that only one rcu_barrier() is using the -global completion and counters at a time, which are initialized on lines -6 and 7. Line 8 causes each CPU to invoke rcu_barrier_func(), which is -shown below. Note that the final "1" in on_each_cpu()'s argument list -ensures that all the calls to rcu_barrier_func() will have completed -before on_each_cpu() returns. Line 9 then waits for the completion. - -This code was rewritten in 2008 and several times thereafter, but this -still gives the general idea. - -The rcu_barrier_func() runs on each CPU, where it invokes call_rcu() -to post an RCU callback, as follows: - - 1 static void rcu_barrier_func(void *notused) - 2 { - 3 int cpu = smp_processor_id(); - 4 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - 5 struct rcu_head *head; - 6 - 7 head = &rdp->barrier; - 8 atomic_inc(&rcu_barrier_cpu_count); - 9 call_rcu(head, rcu_barrier_callback); -10 } - -Lines 3 and 4 locate RCU's internal per-CPU rcu_data structure, -which contains the struct rcu_head that needed for the later call to -call_rcu(). Line 7 picks up a pointer to this struct rcu_head, and line -8 increments a global counter. This counter will later be decremented -by the callback. Line 9 then registers the rcu_barrier_callback() on -the current CPU's queue. - -The rcu_barrier_callback() function simply atomically decrements the -rcu_barrier_cpu_count variable and finalizes the completion when it -reaches zero, as follows: - - 1 static void rcu_barrier_callback(struct rcu_head *notused) - 2 { - 3 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) - 4 complete(&rcu_barrier_completion); - 5 } - -Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes - immediately (thus incrementing rcu_barrier_cpu_count to the - value one), but the other CPU's rcu_barrier_func() invocations - are delayed for a full grace period? Couldn't this result in - rcu_barrier() returning prematurely? - -The current rcu_barrier() implementation is more complex, due to the need -to avoid disturbing idle CPUs (especially on battery-powered systems) -and the need to minimally disturb non-idle CPUs in real-time systems. -However, the code above illustrates the concepts. - - -rcu_barrier() Summary - -The rcu_barrier() primitive has seen relatively little use, since most -code using RCU is in the core kernel rather than in modules. However, if -you are using RCU from an unloadable module, you need to use rcu_barrier() -so that your module may be safely unloaded. - - -Answers to Quick Quizzes - -Quick Quiz #1: Is there any other situation where rcu_barrier() might - be required? - -Answer: Interestingly enough, rcu_barrier() was not originally - implemented for module unloading. Nikita Danilov was using - RCU in a filesystem, which resulted in a similar situation at - filesystem-unmount time. Dipankar Sarma coded up rcu_barrier() - in response, so that Nikita could invoke it during the - filesystem-unmount process. - - Much later, yours truly hit the RCU module-unload problem when - implementing rcutorture, and found that rcu_barrier() solves - this problem as well. - -Quick Quiz #2: What happens if CPU 0's rcu_barrier_func() executes - immediately (thus incrementing rcu_barrier_cpu_count to the - value one), but the other CPU's rcu_barrier_func() invocations - are delayed for a full grace period? Couldn't this result in - rcu_barrier() returning prematurely? - -Answer: This cannot happen. The reason is that on_each_cpu() has its last - argument, the wait flag, set to "1". This flag is passed through - to smp_call_function() and further to smp_call_function_on_cpu(), - causing this latter to spin until the cross-CPU invocation of - rcu_barrier_func() has completed. This by itself would prevent - a grace period from completing on non-CONFIG_PREEMPT kernels, - since each CPU must undergo a context switch (or other quiescent - state) before the grace period can complete. However, this is - of no use in CONFIG_PREEMPT kernels. - - Therefore, on_each_cpu() disables preemption across its call - to smp_call_function() and also across the local call to - rcu_barrier_func(). This prevents the local CPU from context - switching, again preventing grace periods from completing. This - means that all CPUs have executed rcu_barrier_func() before - the first rcu_barrier_callback() can possibly execute, in turn - preventing rcu_barrier_cpu_count from prematurely reaching zero. - - Currently, -rt implementations of RCU keep but a single global - queue for RCU callbacks, and thus do not suffer from this - problem. However, when the -rt RCU eventually does have per-CPU - callback queues, things will have to change. One simple change - is to add an rcu_read_lock() before line 8 of rcu_barrier() - and an rcu_read_unlock() after line 8 of this same function. If - you can think of a better change, please let me know! diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index f48f4621ccbc2b261e6e9e924ca19b9f508c02a5..a360a8796710a2c7e7d5c7571d7562a83c4d86ef 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed for each CPU: - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 The "last_accelerate:" prints the low-order 16 bits (in hex) of the jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from -rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback -status, so that an "l" indicates that all callbacks were lazy at the start -of the last idle period and an "L" indicates that there are currently -no non-lazy callbacks (in both cases, "." is printed otherwise, as -shown above) and "D" indicates that dyntick-idle processing is enabled -("." is printed otherwise, for example, if disabled via the "nohz=" -kernel boot parameter). +rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle +processing is enabled. If the grace period ends just as the stall warning starts printing, there will be a spurious stall-warning message, which will include diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index a41a0384d20c81e327856c87b5a0827ab075e821..af712a3c5b6a56f4e311649037c1e78b8f935b06 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -124,9 +124,14 @@ using a dynamically allocated srcu_struct (hence "srcud-" rather than debugging. The final "T" entry contains the totals of the counters. -USAGE +USAGE ON SPECIFIC KERNEL BUILDS -The following script may be used to torture RCU: +It is sometimes desirable to torture RCU on a specific kernel build, +for example, when preparing to put that kernel build into production. +In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m +so that the test can be started using modprobe and terminated using rmmod. + +For example, the following script may be used to torture RCU: #!/bin/sh @@ -142,8 +147,136 @@ checked for such errors. The "rmmod" command forces a "SUCCESS", two are self-explanatory, while the last indicates that while there were no RCU failures, CPU-hotplug problems were detected. -However, the tools/testing/selftests/rcutorture/bin/kvm.sh script -provides better automation, including automatic failure analysis. -It assumes a qemu/kvm-enabled platform, and runs guest OSes out of initrd. -See tools/testing/selftests/rcutorture/doc/initrd.txt for instructions -on setting up such an initrd. + +USAGE ON MAINLINE KERNELS + +When using rcutorture to test changes to RCU itself, it is often +necessary to build a number of kernels in order to test that change +across a broad range of combinations of the relevant Kconfig options +and of the relevant kernel boot parameters. In this situation, use +of modprobe and rmmod can be quite time-consuming and error-prone. + +Therefore, the tools/testing/selftests/rcutorture/bin/kvm.sh +script is available for mainline testing for x86, arm64, and +powerpc. By default, it will run the series of tests specified by +tools/testing/selftests/rcutorture/configs/rcu/CFLIST, with each test +running for 30 minutes within a guest OS using a minimal userspace +supplied by an automatically generated initrd. After the tests are +complete, the resulting build products and console output are analyzed +for errors and the results of the runs are summarized. + +On larger systems, rcutorture testing can be accelerated by passing the +--cpus argument to kvm.sh. For example, on a 64-CPU system, "--cpus 43" +would use up to 43 CPUs to run tests concurrently, which as of v5.4 would +complete all the scenarios in two batches, reducing the time to complete +from about eight hours to about one hour (not counting the time to build +the sixteen kernels). The "--dryrun sched" argument will not run tests, +but rather tell you how the tests would be scheduled into batches. This +can be useful when working out how many CPUs to specify in the --cpus +argument. + +Not all changes require that all scenarios be run. For example, a change +to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the +--configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'". +Large systems can run multiple copies of of the full set of scenarios, +for example, a system with 448 hardware threads can run five instances +of the full set concurrently. To make this happen: + + kvm.sh --cpus 448 --configs '5*CFLIST' + +Alternatively, such a system can run 56 concurrent instances of a single +eight-CPU scenario: + + kvm.sh --cpus 448 --configs '56*TREE04' + +Or 28 concurrent instances of each of two eight-CPU scenarios: + + kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04' + +Of course, each concurrent instance will use memory, which can be +limited using the --memory argument, which defaults to 512M. Small +values for memory may require disabling the callback-flooding tests +using the --bootargs parameter discussed below. + +Sometimes additional debugging is useful, and in such cases the --kconfig +parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'". + +Kernel boot arguments can also be supplied, for example, to control +rcutorture's module parameters. For example, to test a change to RCU's +CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'". +This will of course result in the scripting reporting a failure, namely +the resuling RCU CPU stall warning. As noted above, reducing memory may +require disabling rcutorture's callback-flooding tests: + + kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \ + --bootargs 'rcutorture.fwd_progress=0' + +Sometimes all that is needed is a full set of kernel builds. This is +what the --buildonly argument does. + +Finally, the --trust-make argument allows each kernel build to reuse what +it can from the previous kernel build. + +There are additional more arcane arguments that are documented in the +source code of the kvm.sh script. + +If a run contains failures, the number of buildtime and runtime failures +is listed at the end of the kvm.sh output, which you really should redirect +to a file. The build products and console output of each run is kept in +tools/testing/selftests/rcutorture/res in timestamped directories. A +given directory can be supplied to kvm-find-errors.sh in order to have +it cycle you through summaries of errors and full error logs. For example: + + tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \ + tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23 + +However, it is often more convenient to access the files directly. +Files pertaining to all scenarios in a run reside in the top-level +directory (2020.01.20-15.54.23 in the example above), while per-scenario +files reside in a subdirectory named after the scenario (for example, +"TREE04"). If a given scenario ran more than once (as in "--configs +'56*TREE04'" above), the directories corresponding to the second and +subsequent runs of that scenario include a sequence number, for example, +"TREE04.2", "TREE04.3", and so on. + +The most frequently used file in the top-level directory is testid.txt. +If the test ran in a git repository, then this file contains the commit +that was tested and any uncommitted changes in diff format. + +The most frequently used files in each per-scenario-run directory are: + +.config: This file contains the Kconfig options. + +Make.out: This contains build output for a specific scenario. + +console.log: This contains the console output for a specific scenario. + This file may be examined once the kernel has booted, but + it might not exist if the build failed. + +vmlinux: This contains the kernel, which can be useful with tools like + objdump and gdb. + +A number of additional files are available, but are less frequently used. +Many are intended for debugging of rcutorture itself or of its scripting. + +As of v5.4, a successful run with the default set of scenarios produces +the following summary at the end of the run on a 12-CPU system: + +SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ] +SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ] +SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ] +SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ] +TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ] +TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ] +TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ] +TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198 +TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631 +TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ] +TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844 +TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497 +CPU count limited from 16 to 12 +TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961 +TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997 +TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732 +CPU count limited from 16 to 12 +TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011 diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst new file mode 100644 index 0000000000000000000000000000000000000000..c7f147b8034f0223dac324c0d044bdc30e97761f --- /dev/null +++ b/Documentation/RCU/whatisRCU.rst @@ -0,0 +1,1154 @@ +.. _whatisrcu_doc: + +What is RCU? -- "Read, Copy, Update" +====================================== + +Please note that the "What is RCU?" LWN series is an excellent place +to start learning about RCU: + +| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ +| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ +| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ +| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ +| 2010 Big API Table http://lwn.net/Articles/419086/ +| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ +| 2014 Big API Table http://lwn.net/Articles/609973/ + + +What is RCU? + +RCU is a synchronization mechanism that was added to the Linux kernel +during the 2.5 development effort that is optimized for read-mostly +situations. Although RCU is actually quite simple once you understand it, +getting there can sometimes be a challenge. Part of the problem is that +most of the past descriptions of RCU have been written with the mistaken +assumption that there is "one true way" to describe RCU. Instead, +the experience has been that different people must take different paths +to arrive at an understanding of RCU. This document provides several +different paths, as follows: + +:ref:`1. RCU OVERVIEW <1_whatisRCU>` + +:ref:`2. WHAT IS RCU'S CORE API? <2_whatisRCU>` + +:ref:`3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? <3_whatisRCU>` + +:ref:`4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? <4_whatisRCU>` + +:ref:`5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? <5_whatisRCU>` + +:ref:`6. ANALOGY WITH READER-WRITER LOCKING <6_whatisRCU>` + +:ref:`7. FULL LIST OF RCU APIs <7_whatisRCU>` + +:ref:`8. ANSWERS TO QUICK QUIZZES <8_whatisRCU>` + +People who prefer starting with a conceptual overview should focus on +Section 1, though most readers will profit by reading this section at +some point. People who prefer to start with an API that they can then +experiment with should focus on Section 2. People who prefer to start +with example uses should focus on Sections 3 and 4. People who need to +understand the RCU implementation should focus on Section 5, then dive +into the kernel source code. People who reason best by analogy should +focus on Section 6. Section 7 serves as an index to the docbook API +documentation, and Section 8 is the traditional answer key. + +So, start with the section that makes the most sense to you and your +preferred method of learning. If you need to know everything about +everything, feel free to read the whole thing -- but if you are really +that type of person, you have perused the source code and will therefore +never need this document anyway. ;-) + +.. _1_whatisRCU: + +1. RCU OVERVIEW +---------------- + +The basic idea behind RCU is to split updates into "removal" and +"reclamation" phases. The removal phase removes references to data items +within a data structure (possibly by replacing them with references to +new versions of these data items), and can run concurrently with readers. +The reason that it is safe to run the removal phase concurrently with +readers is the semantics of modern CPUs guarantee that readers will see +either the old or the new version of the data structure rather than a +partially updated reference. The reclamation phase does the work of reclaiming +(e.g., freeing) the data items removed from the data structure during the +removal phase. Because reclaiming data items can disrupt any readers +concurrently referencing those data items, the reclamation phase must +not start until readers no longer hold references to those data items. + +Splitting the update into removal and reclamation phases permits the +updater to perform the removal phase immediately, and to defer the +reclamation phase until all readers active during the removal phase have +completed, either by blocking until they finish or by registering a +callback that is invoked after they finish. Only readers that are active +during the removal phase need be considered, because any reader starting +after the removal phase will be unable to gain a reference to the removed +data items, and therefore cannot be disrupted by the reclamation phase. + +So the typical RCU update sequence goes something like the following: + +a. Remove pointers to a data structure, so that subsequent + readers cannot gain a reference to it. + +b. Wait for all previous readers to complete their RCU read-side + critical sections. + +c. At this point, there cannot be any readers who hold references + to the data structure, so it now may safely be reclaimed + (e.g., kfree()d). + +Step (b) above is the key idea underlying RCU's deferred destruction. +The ability to wait until all readers are done allows RCU readers to +use much lighter-weight synchronization, in some cases, absolutely no +synchronization at all. In contrast, in more conventional lock-based +schemes, readers must use heavy-weight synchronization in order to +prevent an updater from deleting the data structure out from under them. +This is because lock-based updaters typically update data items in place, +and must therefore exclude readers. In contrast, RCU-based updaters +typically take advantage of the fact that writes to single aligned +pointers are atomic on modern CPUs, allowing atomic insertion, removal, +and replacement of data items in a linked structure without disrupting +readers. Concurrent RCU readers can then continue accessing the old +versions, and can dispense with the atomic operations, memory barriers, +and communications cache misses that are so expensive on present-day +SMP computer systems, even in absence of lock contention. + +In the three-step procedure shown above, the updater is performing both +the removal and the reclamation step, but it is often helpful for an +entirely different thread to do the reclamation, as is in fact the case +in the Linux kernel's directory-entry cache (dcache). Even if the same +thread performs both the update step (step (a) above) and the reclamation +step (step (c) above), it is often helpful to think of them separately. +For example, RCU readers and updaters need not communicate at all, +but RCU provides implicit low-overhead communication between readers +and reclaimers, namely, in step (b) above. + +So how the heck can a reclaimer tell when a reader is done, given +that readers are not doing any sort of synchronization operations??? +Read on to learn about how RCU's API makes this easy. + +.. _2_whatisRCU: + +2. WHAT IS RCU'S CORE API? +--------------------------- + +The core RCU API is quite small: + +a. rcu_read_lock() +b. rcu_read_unlock() +c. synchronize_rcu() / call_rcu() +d. rcu_assign_pointer() +e. rcu_dereference() + +There are many other members of the RCU API, but the rest can be +expressed in terms of these five, though most implementations instead +express synchronize_rcu() in terms of the call_rcu() callback API. + +The five core RCU APIs are described below, the other 18 will be enumerated +later. See the kernel docbook documentation for more info, or look directly +at the function header comments. + +rcu_read_lock() +^^^^^^^^^^^^^^^ + void rcu_read_lock(void); + + Used by a reader to inform the reclaimer that the reader is + entering an RCU read-side critical section. It is illegal + to block while in an RCU read-side critical section, though + kernels built with CONFIG_PREEMPT_RCU can preempt RCU + read-side critical sections. Any RCU-protected data structure + accessed during an RCU read-side critical section is guaranteed to + remain unreclaimed for the full duration of that critical section. + Reference counts may be used in conjunction with RCU to maintain + longer-term references to data structures. + +rcu_read_unlock() +^^^^^^^^^^^^^^^^^ + void rcu_read_unlock(void); + + Used by a reader to inform the reclaimer that the reader is + exiting an RCU read-side critical section. Note that RCU + read-side critical sections may be nested and/or overlapping. + +synchronize_rcu() +^^^^^^^^^^^^^^^^^ + void synchronize_rcu(void); + + Marks the end of updater code and the beginning of reclaimer + code. It does this by blocking until all pre-existing RCU + read-side critical sections on all CPUs have completed. + Note that synchronize_rcu() will **not** necessarily wait for + any subsequent RCU read-side critical sections to complete. + For example, consider the following sequence of events:: + + CPU 0 CPU 1 CPU 2 + ----------------- ------------------------- --------------- + 1. rcu_read_lock() + 2. enters synchronize_rcu() + 3. rcu_read_lock() + 4. rcu_read_unlock() + 5. exits synchronize_rcu() + 6. rcu_read_unlock() + + To reiterate, synchronize_rcu() waits only for ongoing RCU + read-side critical sections to complete, not necessarily for + any that begin after synchronize_rcu() is invoked. + + Of course, synchronize_rcu() does not necessarily return + **immediately** after the last pre-existing RCU read-side critical + section completes. For one thing, there might well be scheduling + delays. For another thing, many RCU implementations process + requests in batches in order to improve efficiencies, which can + further delay synchronize_rcu(). + + Since synchronize_rcu() is the API that must figure out when + readers are done, its implementation is key to RCU. For RCU + to be useful in all but the most read-intensive situations, + synchronize_rcu()'s overhead must also be quite small. + + The call_rcu() API is a callback form of synchronize_rcu(), + and is described in more detail in a later section. Instead of + blocking, it registers a function and argument which are invoked + after all ongoing RCU read-side critical sections have completed. + This callback variant is particularly useful in situations where + it is illegal to block or where update-side performance is + critically important. + + However, the call_rcu() API should not be used lightly, as use + of the synchronize_rcu() API generally results in simpler code. + In addition, the synchronize_rcu() API has the nice property + of automatically limiting update rate should grace periods + be delayed. This property results in system resilience in face + of denial-of-service attacks. Code using call_rcu() should limit + update rate in order to gain this same sort of resilience. See + checklist.txt for some approaches to limiting the update rate. + +rcu_assign_pointer() +^^^^^^^^^^^^^^^^^^^^ + void rcu_assign_pointer(p, typeof(p) v); + + Yes, rcu_assign_pointer() **is** implemented as a macro, though it + would be cool to be able to declare a function in this manner. + (Compiler experts will no doubt disagree.) + + The updater uses this function to assign a new value to an + RCU-protected pointer, in order to safely communicate the change + in value from the updater to the reader. This macro does not + evaluate to an rvalue, but it does execute any memory-barrier + instructions required for a given CPU architecture. + + Perhaps just as important, it serves to document (1) which + pointers are protected by RCU and (2) the point at which a + given structure becomes accessible to other CPUs. That said, + rcu_assign_pointer() is most frequently used indirectly, via + the _rcu list-manipulation primitives such as list_add_rcu(). + +rcu_dereference() +^^^^^^^^^^^^^^^^^ + typeof(p) rcu_dereference(p); + + Like rcu_assign_pointer(), rcu_dereference() must be implemented + as a macro. + + The reader uses rcu_dereference() to fetch an RCU-protected + pointer, which returns a value that may then be safely + dereferenced. Note that rcu_dereference() does not actually + dereference the pointer, instead, it protects the pointer for + later dereferencing. It also executes any needed memory-barrier + instructions for a given CPU architecture. Currently, only Alpha + needs memory barriers within rcu_dereference() -- on other CPUs, + it compiles to nothing, not even a compiler directive. + + Common coding practice uses rcu_dereference() to copy an + RCU-protected pointer to a local variable, then dereferences + this local variable, for example as follows:: + + p = rcu_dereference(head.next); + return p->data; + + However, in this case, one could just as easily combine these + into one statement:: + + return rcu_dereference(head.next)->data; + + If you are going to be fetching multiple fields from the + RCU-protected structure, using the local variable is of + course preferred. Repeated rcu_dereference() calls look + ugly, do not guarantee that the same pointer will be returned + if an update happened while in the critical section, and incur + unnecessary overhead on Alpha CPUs. + + Note that the value returned by rcu_dereference() is valid + only within the enclosing RCU read-side critical section [1]_. + For example, the following is **not** legal:: + + rcu_read_lock(); + p = rcu_dereference(head.next); + rcu_read_unlock(); + x = p->address; /* BUG!!! */ + rcu_read_lock(); + y = p->data; /* BUG!!! */ + rcu_read_unlock(); + + Holding a reference from one RCU read-side critical section + to another is just as illegal as holding a reference from + one lock-based critical section to another! Similarly, + using a reference outside of the critical section in which + it was acquired is just as illegal as doing so with normal + locking. + + As with rcu_assign_pointer(), an important function of + rcu_dereference() is to document which pointers are protected by + RCU, in particular, flagging a pointer that is subject to changing + at any time, including immediately after the rcu_dereference(). + And, again like rcu_assign_pointer(), rcu_dereference() is + typically used indirectly, via the _rcu list-manipulation + primitives, such as list_for_each_entry_rcu() [2]_. + +.. [1] The variant rcu_dereference_protected() can be used outside + of an RCU read-side critical section as long as the usage is + protected by locks acquired by the update-side code. This variant + avoids the lockdep warning that would happen when using (for + example) rcu_dereference() without rcu_read_lock() protection. + Using rcu_dereference_protected() also has the advantage + of permitting compiler optimizations that rcu_dereference() + must prohibit. The rcu_dereference_protected() variant takes + a lockdep expression to indicate which locks must be acquired + by the caller. If the indicated protection is not provided, + a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst + and the API's code comments for more details and example usage. + +.. [2] If the list_for_each_entry_rcu() instance might be used by + update-side code as well as by RCU readers, then an additional + lockdep expression can be added to its list of arguments. + For example, given an additional "lock_is_held(&mylock)" argument, + the RCU lockdep code would complain only if this instance was + invoked outside of an RCU read-side critical section and without + the protection of mylock. + +The following diagram shows how each API communicates among the +reader, updater, and reclaimer. +:: + + + rcu_assign_pointer() + +--------+ + +---------------------->| reader |---------+ + | +--------+ | + | | | + | | | Protect: + | | | rcu_read_lock() + | | | rcu_read_unlock() + | rcu_dereference() | | + +---------+ | | + | updater |<----------------+ | + +---------+ V + | +-----------+ + +----------------------------------->| reclaimer | + +-----------+ + Defer: + synchronize_rcu() & call_rcu() + + +The RCU infrastructure observes the time sequence of rcu_read_lock(), +rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in +order to determine when (1) synchronize_rcu() invocations may return +to their callers and (2) call_rcu() callbacks may be invoked. Efficient +implementations of the RCU infrastructure make heavy use of batching in +order to amortize their overhead over many uses of the corresponding APIs. + +There are at least three flavors of RCU usage in the Linux kernel. The diagram +above shows the most common one. On the updater side, the rcu_assign_pointer(), +sychronize_rcu() and call_rcu() primitives used are the same for all three +flavors. However for protection (on the reader side), the primitives used vary +depending on the flavor: + +a. rcu_read_lock() / rcu_read_unlock() + rcu_dereference() + +b. rcu_read_lock_bh() / rcu_read_unlock_bh() + local_bh_disable() / local_bh_enable() + rcu_dereference_bh() + +c. rcu_read_lock_sched() / rcu_read_unlock_sched() + preempt_disable() / preempt_enable() + local_irq_save() / local_irq_restore() + hardirq enter / hardirq exit + NMI enter / NMI exit + rcu_dereference_sched() + +These three flavors are used as follows: + +a. RCU applied to normal data structures. + +b. RCU applied to networking data structures that may be subjected + to remote denial-of-service attacks. + +c. RCU applied to scheduler and interrupt/NMI-handler tasks. + +Again, most uses will be of (a). The (b) and (c) cases are important +for specialized uses, but are relatively uncommon. + +.. _3_whatisRCU: + +3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? +----------------------------------------------- + +This section shows a simple use of the core RCU API to protect a +global pointer to a dynamically allocated structure. More-typical +uses of RCU may be found in :ref:`listRCU.rst `, +:ref:`arrayRCU.rst `, and :ref:`NMI-RCU.rst `. +:: + + struct foo { + int a; + char b; + long c; + }; + DEFINE_SPINLOCK(foo_mutex); + + struct foo __rcu *gbl_foo; + + /* + * Create a new struct foo that is the same as the one currently + * pointed to by gbl_foo, except that field "a" is replaced + * with "new_a". Points gbl_foo to the new structure, and + * frees up the old structure after a grace period. + * + * Uses rcu_assign_pointer() to ensure that concurrent readers + * see the initialized version of the new structure. + * + * Uses synchronize_rcu() to ensure that any readers that might + * have references to the old structure complete before freeing + * the old structure. + */ + void foo_update_a(int new_a) + { + struct foo *new_fp; + struct foo *old_fp; + + new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); + spin_lock(&foo_mutex); + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); + *new_fp = *old_fp; + new_fp->a = new_a; + rcu_assign_pointer(gbl_foo, new_fp); + spin_unlock(&foo_mutex); + synchronize_rcu(); + kfree(old_fp); + } + + /* + * Return the value of field "a" of the current gbl_foo + * structure. Use rcu_read_lock() and rcu_read_unlock() + * to ensure that the structure does not get deleted out + * from under us, and use rcu_dereference() to ensure that + * we see the initialized version of the structure (important + * for DEC Alpha and for people reading the code). + */ + int foo_get_a(void) + { + int retval; + + rcu_read_lock(); + retval = rcu_dereference(gbl_foo)->a; + rcu_read_unlock(); + return retval; + } + +So, to sum up: + +- Use rcu_read_lock() and rcu_read_unlock() to guard RCU + read-side critical sections. + +- Within an RCU read-side critical section, use rcu_dereference() + to dereference RCU-protected pointers. + +- Use some solid scheme (such as locks or semaphores) to + keep concurrent updates from interfering with each other. + +- Use rcu_assign_pointer() to update an RCU-protected pointer. + This primitive protects concurrent readers from the updater, + **not** concurrent updates from each other! You therefore still + need to use locking (or something similar) to keep concurrent + rcu_assign_pointer() primitives from interfering with each other. + +- Use synchronize_rcu() **after** removing a data element from an + RCU-protected data structure, but **before** reclaiming/freeing + the data element, in order to wait for the completion of all + RCU read-side critical sections that might be referencing that + data item. + +See checklist.txt for additional rules to follow when using RCU. +And again, more-typical uses of RCU may be found in :ref:`listRCU.rst +`, :ref:`arrayRCU.rst `, and :ref:`NMI-RCU.rst +`. + +.. _4_whatisRCU: + +4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? +-------------------------------------------- + +In the example above, foo_update_a() blocks until a grace period elapses. +This is quite simple, but in some cases one cannot afford to wait so +long -- there might be other high-priority work to be done. + +In such cases, one uses call_rcu() rather than synchronize_rcu(). +The call_rcu() API is as follows:: + + void call_rcu(struct rcu_head * head, + void (*func)(struct rcu_head *head)); + +This function invokes func(head) after a grace period has elapsed. +This invocation might happen from either softirq or process context, +so the function is not permitted to block. The foo struct needs to +have an rcu_head structure added, perhaps as follows:: + + struct foo { + int a; + char b; + long c; + struct rcu_head rcu; + }; + +The foo_update_a() function might then be written as follows:: + + /* + * Create a new struct foo that is the same as the one currently + * pointed to by gbl_foo, except that field "a" is replaced + * with "new_a". Points gbl_foo to the new structure, and + * frees up the old structure after a grace period. + * + * Uses rcu_assign_pointer() to ensure that concurrent readers + * see the initialized version of the new structure. + * + * Uses call_rcu() to ensure that any readers that might have + * references to the old structure complete before freeing the + * old structure. + */ + void foo_update_a(int new_a) + { + struct foo *new_fp; + struct foo *old_fp; + + new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); + spin_lock(&foo_mutex); + old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); + *new_fp = *old_fp; + new_fp->a = new_a; + rcu_assign_pointer(gbl_foo, new_fp); + spin_unlock(&foo_mutex); + call_rcu(&old_fp->rcu, foo_reclaim); + } + +The foo_reclaim() function might appear as follows:: + + void foo_reclaim(struct rcu_head *rp) + { + struct foo *fp = container_of(rp, struct foo, rcu); + + foo_cleanup(fp->a); + + kfree(fp); + } + +The container_of() primitive is a macro that, given a pointer into a +struct, the type of the struct, and the pointed-to field within the +struct, returns a pointer to the beginning of the struct. + +The use of call_rcu() permits the caller of foo_update_a() to +immediately regain control, without needing to worry further about the +old version of the newly updated element. It also clearly shows the +RCU distinction between updater, namely foo_update_a(), and reclaimer, +namely foo_reclaim(). + +The summary of advice is the same as for the previous section, except +that we are now using call_rcu() rather than synchronize_rcu(): + +- Use call_rcu() **after** removing a data element from an + RCU-protected data structure in order to register a callback + function that will be invoked after the completion of all RCU + read-side critical sections that might be referencing that + data item. + +If the callback for call_rcu() is not doing anything more than calling +kfree() on the structure, you can use kfree_rcu() instead of call_rcu() +to avoid having to write your own callback:: + + kfree_rcu(old_fp, rcu); + +Again, see checklist.txt for additional rules governing the use of RCU. + +.. _5_whatisRCU: + +5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? +------------------------------------------------ + +One of the nice things about RCU is that it has extremely simple "toy" +implementations that are a good first step towards understanding the +production-quality implementations in the Linux kernel. This section +presents two such "toy" implementations of RCU, one that is implemented +in terms of familiar locking primitives, and another that more closely +resembles "classic" RCU. Both are way too simple for real-world use, +lacking both functionality and performance. However, they are useful +in getting a feel for how RCU works. See kernel/rcu/update.c for a +production-quality implementation, and see: + + http://www.rdrop.com/users/paulmck/RCU + +for papers describing the Linux kernel RCU implementation. The OLS'01 +and OLS'02 papers are a good introduction, and the dissertation provides +more details on the current implementation as of early 2004. + + +5A. "TOY" IMPLEMENTATION #1: LOCKING +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section presents a "toy" RCU implementation that is based on +familiar locking primitives. Its overhead makes it a non-starter for +real-life use, as does its lack of scalability. It is also unsuitable +for realtime use, since it allows scheduling latency to "bleed" from +one read-side critical section to another. It also assumes recursive +reader-writer locks: If you try this with non-recursive locks, and +you allow nested rcu_read_lock() calls, you can deadlock. + +However, it is probably the easiest implementation to relate to, so is +a good starting point. + +It is extremely simple:: + + static DEFINE_RWLOCK(rcu_gp_mutex); + + void rcu_read_lock(void) + { + read_lock(&rcu_gp_mutex); + } + + void rcu_read_unlock(void) + { + read_unlock(&rcu_gp_mutex); + } + + void synchronize_rcu(void) + { + write_lock(&rcu_gp_mutex); + smp_mb__after_spinlock(); + write_unlock(&rcu_gp_mutex); + } + +[You can ignore rcu_assign_pointer() and rcu_dereference() without missing +much. But here are simplified versions anyway. And whatever you do, +don't forget about them when submitting patches making use of RCU!]:: + + #define rcu_assign_pointer(p, v) \ + ({ \ + smp_store_release(&(p), (v)); \ + }) + + #define rcu_dereference(p) \ + ({ \ + typeof(p) _________p1 = READ_ONCE(p); \ + (_________p1); \ + }) + + +The rcu_read_lock() and rcu_read_unlock() primitive read-acquire +and release a global reader-writer lock. The synchronize_rcu() +primitive write-acquires this same lock, then releases it. This means +that once synchronize_rcu() exits, all RCU read-side critical sections +that were in progress before synchronize_rcu() was called are guaranteed +to have completed -- there is no way that synchronize_rcu() would have +been able to write-acquire the lock otherwise. The smp_mb__after_spinlock() +promotes synchronize_rcu() to a full memory barrier in compliance with +the "Memory-Barrier Guarantees" listed in: + + Documentation/RCU/Design/Requirements/Requirements.rst + +It is possible to nest rcu_read_lock(), since reader-writer locks may +be recursively acquired. Note also that rcu_read_lock() is immune +from deadlock (an important property of RCU). The reason for this is +that the only thing that can block rcu_read_lock() is a synchronize_rcu(). +But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex, +so there can be no deadlock cycle. + +.. _quiz_1: + +Quick Quiz #1: + Why is this argument naive? How could a deadlock + occur when using this algorithm in a real-world Linux + kernel? How could this deadlock be avoided? + +:ref:`Answers to Quick Quiz <8_whatisRCU>` + +5B. "TOY" EXAMPLE #2: CLASSIC RCU +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section presents a "toy" RCU implementation that is based on +"classic RCU". It is also short on performance (but only for updates) and +on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT +kernels. The definitions of rcu_dereference() and rcu_assign_pointer() +are the same as those shown in the preceding section, so they are omitted. +:: + + void rcu_read_lock(void) { } + + void rcu_read_unlock(void) { } + + void synchronize_rcu(void) + { + int cpu; + + for_each_possible_cpu(cpu) + run_on(cpu); + } + +Note that rcu_read_lock() and rcu_read_unlock() do absolutely nothing. +This is the great strength of classic RCU in a non-preemptive kernel: +read-side overhead is precisely zero, at least on non-Alpha CPUs. +And there is absolutely no way that rcu_read_lock() can possibly +participate in a deadlock cycle! + +The implementation of synchronize_rcu() simply schedules itself on each +CPU in turn. The run_on() primitive can be implemented straightforwardly +in terms of the sched_setaffinity() primitive. Of course, a somewhat less +"toy" implementation would restore the affinity upon completion rather +than just leaving all tasks running on the last CPU, but when I said +"toy", I meant **toy**! + +So how the heck is this supposed to work??? + +Remember that it is illegal to block while in an RCU read-side critical +section. Therefore, if a given CPU executes a context switch, we know +that it must have completed all preceding RCU read-side critical sections. +Once **all** CPUs have executed a context switch, then **all** preceding +RCU read-side critical sections will have completed. + +So, suppose that we remove a data item from its structure and then invoke +synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed +that there are no RCU read-side critical sections holding a reference +to that data item, so we can safely reclaim it. + +.. _quiz_2: + +Quick Quiz #2: + Give an example where Classic RCU's read-side + overhead is **negative**. + +:ref:`Answers to Quick Quiz <8_whatisRCU>` + +.. _quiz_3: + +Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in + PREEMPT_RT, where normal spinlocks can block??? + +:ref:`Answers to Quick Quiz <8_whatisRCU>` + +.. _6_whatisRCU: + +6. ANALOGY WITH READER-WRITER LOCKING +-------------------------------------- + +Although RCU can be used in many different ways, a very common use of +RCU is analogous to reader-writer locking. The following unified +diff shows how closely related RCU and reader-writer locking can be. +:: + + @@ -5,5 +5,5 @@ struct el { + int data; + /* Other data fields */ + }; + -rwlock_t listmutex; + +spinlock_t listmutex; + struct el head; + + @@ -13,15 +14,15 @@ + struct list_head *lp; + struct el *p; + + - read_lock(&listmutex); + - list_for_each_entry(p, head, lp) { + + rcu_read_lock(); + + list_for_each_entry_rcu(p, head, lp) { + if (p->key == key) { + *result = p->data; + - read_unlock(&listmutex); + + rcu_read_unlock(); + return 1; + } + } + - read_unlock(&listmutex); + + rcu_read_unlock(); + return 0; + } + + @@ -29,15 +30,16 @@ + { + struct el *p; + + - write_lock(&listmutex); + + spin_lock(&listmutex); + list_for_each_entry(p, head, lp) { + if (p->key == key) { + - list_del(&p->list); + - write_unlock(&listmutex); + + list_del_rcu(&p->list); + + spin_unlock(&listmutex); + + synchronize_rcu(); + kfree(p); + return 1; + } + } + - write_unlock(&listmutex); + + spin_unlock(&listmutex); + return 0; + } + +Or, for those who prefer a side-by-side listing:: + + 1 struct el { 1 struct el { + 2 struct list_head list; 2 struct list_head list; + 3 long key; 3 long key; + 4 spinlock_t mutex; 4 spinlock_t mutex; + 5 int data; 5 int data; + 6 /* Other data fields */ 6 /* Other data fields */ + 7 }; 7 }; + 8 rwlock_t listmutex; 8 spinlock_t listmutex; + 9 struct el head; 9 struct el head; + +:: + + 1 int search(long key, int *result) 1 int search(long key, int *result) + 2 { 2 { + 3 struct list_head *lp; 3 struct list_head *lp; + 4 struct el *p; 4 struct el *p; + 5 5 + 6 read_lock(&listmutex); 6 rcu_read_lock(); + 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { + 8 if (p->key == key) { 8 if (p->key == key) { + 9 *result = p->data; 9 *result = p->data; + 10 read_unlock(&listmutex); 10 rcu_read_unlock(); + 11 return 1; 11 return 1; + 12 } 12 } + 13 } 13 } + 14 read_unlock(&listmutex); 14 rcu_read_unlock(); + 15 return 0; 15 return 0; + 16 } 16 } + +:: + + 1 int delete(long key) 1 int delete(long key) + 2 { 2 { + 3 struct el *p; 3 struct el *p; + 4 4 + 5 write_lock(&listmutex); 5 spin_lock(&listmutex); + 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) { + 7 if (p->key == key) { 7 if (p->key == key) { + 8 list_del(&p->list); 8 list_del_rcu(&p->list); + 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex); + 10 synchronize_rcu(); + 10 kfree(p); 11 kfree(p); + 11 return 1; 12 return 1; + 12 } 13 } + 13 } 14 } + 14 write_unlock(&listmutex); 15 spin_unlock(&listmutex); + 15 return 0; 16 return 0; + 16 } 17 } + +Either way, the differences are quite small. Read-side locking moves +to rcu_read_lock() and rcu_read_unlock, update-side locking moves from +a reader-writer lock to a simple spinlock, and a synchronize_rcu() +precedes the kfree(). + +However, there is one potential catch: the read-side and update-side +critical sections can now run concurrently. In many cases, this will +not be a problem, but it is necessary to check carefully regardless. +For example, if multiple independent list updates must be seen as +a single atomic update, converting to RCU will require special care. + +Also, the presence of synchronize_rcu() means that the RCU version of +delete() can now block. If this is a problem, there is a callback-based +mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can +be used in place of synchronize_rcu(). + +.. _7_whatisRCU: + +7. FULL LIST OF RCU APIs +------------------------- + +The RCU APIs are documented in docbook-format header comments in the +Linux-kernel source code, but it helps to have a full list of the +APIs, since there does not appear to be a way to categorize them +in docbook. Here is the list, by category. + +RCU list traversal:: + + list_entry_rcu + list_entry_lockless + list_first_entry_rcu + list_next_rcu + list_for_each_entry_rcu + list_for_each_entry_continue_rcu + list_for_each_entry_from_rcu + list_first_or_null_rcu + list_next_or_null_rcu + hlist_first_rcu + hlist_next_rcu + hlist_pprev_rcu + hlist_for_each_entry_rcu + hlist_for_each_entry_rcu_bh + hlist_for_each_entry_from_rcu + hlist_for_each_entry_continue_rcu + hlist_for_each_entry_continue_rcu_bh + hlist_nulls_first_rcu + hlist_nulls_for_each_entry_rcu + hlist_bl_first_rcu + hlist_bl_for_each_entry_rcu + +RCU pointer/list update:: + + rcu_assign_pointer + list_add_rcu + list_add_tail_rcu + list_del_rcu + list_replace_rcu + hlist_add_behind_rcu + hlist_add_before_rcu + hlist_add_head_rcu + hlist_add_tail_rcu + hlist_del_rcu + hlist_del_init_rcu + hlist_replace_rcu + list_splice_init_rcu + list_splice_tail_init_rcu + hlist_nulls_del_init_rcu + hlist_nulls_del_rcu + hlist_nulls_add_head_rcu + hlist_bl_add_head_rcu + hlist_bl_del_init_rcu + hlist_bl_del_rcu + hlist_bl_set_first_rcu + +RCU:: + + Critical sections Grace period Barrier + + rcu_read_lock synchronize_net rcu_barrier + rcu_read_unlock synchronize_rcu + rcu_dereference synchronize_rcu_expedited + rcu_read_lock_held call_rcu + rcu_dereference_check kfree_rcu + rcu_dereference_protected + +bh:: + + Critical sections Grace period Barrier + + rcu_read_lock_bh call_rcu rcu_barrier + rcu_read_unlock_bh synchronize_rcu + [local_bh_disable] synchronize_rcu_expedited + [and friends] + rcu_dereference_bh + rcu_dereference_bh_check + rcu_dereference_bh_protected + rcu_read_lock_bh_held + +sched:: + + Critical sections Grace period Barrier + + rcu_read_lock_sched call_rcu rcu_barrier + rcu_read_unlock_sched synchronize_rcu + [preempt_disable] synchronize_rcu_expedited + [and friends] + rcu_read_lock_sched_notrace + rcu_read_unlock_sched_notrace + rcu_dereference_sched + rcu_dereference_sched_check + rcu_dereference_sched_protected + rcu_read_lock_sched_held + + +SRCU:: + + Critical sections Grace period Barrier + + srcu_read_lock call_srcu srcu_barrier + srcu_read_unlock synchronize_srcu + srcu_dereference synchronize_srcu_expedited + srcu_dereference_check + srcu_read_lock_held + +SRCU: Initialization/cleanup:: + + DEFINE_SRCU + DEFINE_STATIC_SRCU + init_srcu_struct + cleanup_srcu_struct + +All: lockdep-checked RCU-protected pointer access:: + + rcu_access_pointer + rcu_dereference_raw + RCU_LOCKDEP_WARN + rcu_sleep_check + RCU_NONIDLE + +See the comment headers in the source code (or the docbook generated +from them) for more information. + +However, given that there are no fewer than four families of RCU APIs +in the Linux kernel, how do you choose which one to use? The following +list can be helpful: + +a. Will readers need to block? If so, you need SRCU. + +b. What about the -rt patchset? If readers would need to block + in an non-rt kernel, you need SRCU. If readers would block + in a -rt kernel, but not in a non-rt kernel, SRCU is not + necessary. (The -rt patchset turns spinlocks into sleeplocks, + hence this distinction.) + +c. Do you need to treat NMI handlers, hardirq handlers, + and code segments with preemption disabled (whether + via preempt_disable(), local_irq_save(), local_bh_disable(), + or some other mechanism) as if they were explicit RCU readers? + If so, RCU-sched is the only choice that will work for you. + +d. Do you need RCU grace periods to complete even in the face + of softirq monopolization of one or more of the CPUs? For + example, is your code subject to network-based denial-of-service + attacks? If so, you should disable softirq across your readers, + for example, by using rcu_read_lock_bh(). + +e. Is your workload too update-intensive for normal use of + RCU, but inappropriate for other synchronization mechanisms? + If so, consider SLAB_TYPESAFE_BY_RCU (which was originally + named SLAB_DESTROY_BY_RCU). But please be careful! + +f. Do you need read-side critical sections that are respected + even though they are in the middle of the idle loop, during + user-mode execution, or on an offlined CPU? If so, SRCU is the + only choice that will work for you. + +g. Otherwise, use RCU. + +Of course, this all assumes that you have determined that RCU is in fact +the right tool for your job. + +.. _8_whatisRCU: + +8. ANSWERS TO QUICK QUIZZES +---------------------------- + +Quick Quiz #1: + Why is this argument naive? How could a deadlock + occur when using this algorithm in a real-world Linux + kernel? [Referring to the lock-based "toy" RCU + algorithm.] + +Answer: + Consider the following sequence of events: + + 1. CPU 0 acquires some unrelated lock, call it + "problematic_lock", disabling irq via + spin_lock_irqsave(). + + 2. CPU 1 enters synchronize_rcu(), write-acquiring + rcu_gp_mutex. + + 3. CPU 0 enters rcu_read_lock(), but must wait + because CPU 1 holds rcu_gp_mutex. + + 4. CPU 1 is interrupted, and the irq handler + attempts to acquire problematic_lock. + + The system is now deadlocked. + + One way to avoid this deadlock is to use an approach like + that of CONFIG_PREEMPT_RT, where all normal spinlocks + become blocking locks, and all irq handlers execute in + the context of special tasks. In this case, in step 4 + above, the irq handler would block, allowing CPU 1 to + release rcu_gp_mutex, avoiding the deadlock. + + Even in the absence of deadlock, this RCU implementation + allows latency to "bleed" from readers to other + readers through synchronize_rcu(). To see this, + consider task A in an RCU read-side critical section + (thus read-holding rcu_gp_mutex), task B blocked + attempting to write-acquire rcu_gp_mutex, and + task C blocked in rcu_read_lock() attempting to + read_acquire rcu_gp_mutex. Task A's RCU read-side + latency is holding up task C, albeit indirectly via + task B. + + Realtime RCU implementations therefore use a counter-based + approach where tasks in RCU read-side critical sections + cannot be blocked by tasks executing synchronize_rcu(). + +:ref:`Back to Quick Quiz #1 ` + +Quick Quiz #2: + Give an example where Classic RCU's read-side + overhead is **negative**. + +Answer: + Imagine a single-CPU system with a non-CONFIG_PREEMPT + kernel where a routing table is used by process-context + code, but can be updated by irq-context code (for example, + by an "ICMP REDIRECT" packet). The usual way of handling + this would be to have the process-context code disable + interrupts while searching the routing table. Use of + RCU allows such interrupt-disabling to be dispensed with. + Thus, without RCU, you pay the cost of disabling interrupts, + and with RCU you don't. + + One can argue that the overhead of RCU in this + case is negative with respect to the single-CPU + interrupt-disabling approach. Others might argue that + the overhead of RCU is merely zero, and that replacing + the positive overhead of the interrupt-disabling scheme + with the zero-overhead RCU scheme does not constitute + negative overhead. + + In real life, of course, things are more complex. But + even the theoretical possibility of negative overhead for + a synchronization primitive is a bit unexpected. ;-) + +:ref:`Back to Quick Quiz #2 ` + +Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in + PREEMPT_RT, where normal spinlocks can block??? + +Answer: + Just as PREEMPT_RT permits preemption of spinlock + critical sections, it permits preemption of RCU + read-side critical sections. It also permits + spinlocks blocking while in RCU read-side critical + sections. + + Why the apparent inconsistency? Because it is + possible to use priority boosting to keep the RCU + grace periods short if need be (for example, if running + short of memory). In contrast, if blocking waiting + for (say) network reception, there is no way to know + what should be boosted. Especially given that the + process we need to boost might well be a human being + who just went out for a pizza or something. And although + a computer-operated cattle prod might arouse serious + interest, it might also provoke serious objections. + Besides, how does the computer know what pizza parlor + the human being went to??? + +:ref:`Back to Quick Quiz #3 ` + +ACKNOWLEDGEMENTS + +My thanks to the people who helped make this human-readable, including +Jon Walpole, Josh Triplett, Serge Hallyn, Suzanne Wood, and Alan Stern. + + +For more information, see http://www.rdrop.com/users/paulmck/RCU. diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt deleted file mode 100644 index 58ba05c4d97f91904234417275fae88092685dcd..0000000000000000000000000000000000000000 --- a/Documentation/RCU/whatisRCU.txt +++ /dev/null @@ -1,1079 +0,0 @@ -What is RCU? -- "Read, Copy, Update" - -Please note that the "What is RCU?" LWN series is an excellent place -to start learning about RCU: - -1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ -2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ -3. RCU part 3: the RCU API http://lwn.net/Articles/264090/ -4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/ - 2010 Big API Table http://lwn.net/Articles/419086/ -5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/ - 2014 Big API Table http://lwn.net/Articles/609973/ - - -What is RCU? - -RCU is a synchronization mechanism that was added to the Linux kernel -during the 2.5 development effort that is optimized for read-mostly -situations. Although RCU is actually quite simple once you understand it, -getting there can sometimes be a challenge. Part of the problem is that -most of the past descriptions of RCU have been written with the mistaken -assumption that there is "one true way" to describe RCU. Instead, -the experience has been that different people must take different paths -to arrive at an understanding of RCU. This document provides several -different paths, as follows: - -1. RCU OVERVIEW -2. WHAT IS RCU'S CORE API? -3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? -4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? -5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? -6. ANALOGY WITH READER-WRITER LOCKING -7. FULL LIST OF RCU APIs -8. ANSWERS TO QUICK QUIZZES - -People who prefer starting with a conceptual overview should focus on -Section 1, though most readers will profit by reading this section at -some point. People who prefer to start with an API that they can then -experiment with should focus on Section 2. People who prefer to start -with example uses should focus on Sections 3 and 4. People who need to -understand the RCU implementation should focus on Section 5, then dive -into the kernel source code. People who reason best by analogy should -focus on Section 6. Section 7 serves as an index to the docbook API -documentation, and Section 8 is the traditional answer key. - -So, start with the section that makes the most sense to you and your -preferred method of learning. If you need to know everything about -everything, feel free to read the whole thing -- but if you are really -that type of person, you have perused the source code and will therefore -never need this document anyway. ;-) - - -1. RCU OVERVIEW - -The basic idea behind RCU is to split updates into "removal" and -"reclamation" phases. The removal phase removes references to data items -within a data structure (possibly by replacing them with references to -new versions of these data items), and can run concurrently with readers. -The reason that it is safe to run the removal phase concurrently with -readers is the semantics of modern CPUs guarantee that readers will see -either the old or the new version of the data structure rather than a -partially updated reference. The reclamation phase does the work of reclaiming -(e.g., freeing) the data items removed from the data structure during the -removal phase. Because reclaiming data items can disrupt any readers -concurrently referencing those data items, the reclamation phase must -not start until readers no longer hold references to those data items. - -Splitting the update into removal and reclamation phases permits the -updater to perform the removal phase immediately, and to defer the -reclamation phase until all readers active during the removal phase have -completed, either by blocking until they finish or by registering a -callback that is invoked after they finish. Only readers that are active -during the removal phase need be considered, because any reader starting -after the removal phase will be unable to gain a reference to the removed -data items, and therefore cannot be disrupted by the reclamation phase. - -So the typical RCU update sequence goes something like the following: - -a. Remove pointers to a data structure, so that subsequent - readers cannot gain a reference to it. - -b. Wait for all previous readers to complete their RCU read-side - critical sections. - -c. At this point, there cannot be any readers who hold references - to the data structure, so it now may safely be reclaimed - (e.g., kfree()d). - -Step (b) above is the key idea underlying RCU's deferred destruction. -The ability to wait until all readers are done allows RCU readers to -use much lighter-weight synchronization, in some cases, absolutely no -synchronization at all. In contrast, in more conventional lock-based -schemes, readers must use heavy-weight synchronization in order to -prevent an updater from deleting the data structure out from under them. -This is because lock-based updaters typically update data items in place, -and must therefore exclude readers. In contrast, RCU-based updaters -typically take advantage of the fact that writes to single aligned -pointers are atomic on modern CPUs, allowing atomic insertion, removal, -and replacement of data items in a linked structure without disrupting -readers. Concurrent RCU readers can then continue accessing the old -versions, and can dispense with the atomic operations, memory barriers, -and communications cache misses that are so expensive on present-day -SMP computer systems, even in absence of lock contention. - -In the three-step procedure shown above, the updater is performing both -the removal and the reclamation step, but it is often helpful for an -entirely different thread to do the reclamation, as is in fact the case -in the Linux kernel's directory-entry cache (dcache). Even if the same -thread performs both the update step (step (a) above) and the reclamation -step (step (c) above), it is often helpful to think of them separately. -For example, RCU readers and updaters need not communicate at all, -but RCU provides implicit low-overhead communication between readers -and reclaimers, namely, in step (b) above. - -So how the heck can a reclaimer tell when a reader is done, given -that readers are not doing any sort of synchronization operations??? -Read on to learn about how RCU's API makes this easy. - - -2. WHAT IS RCU'S CORE API? - -The core RCU API is quite small: - -a. rcu_read_lock() -b. rcu_read_unlock() -c. synchronize_rcu() / call_rcu() -d. rcu_assign_pointer() -e. rcu_dereference() - -There are many other members of the RCU API, but the rest can be -expressed in terms of these five, though most implementations instead -express synchronize_rcu() in terms of the call_rcu() callback API. - -The five core RCU APIs are described below, the other 18 will be enumerated -later. See the kernel docbook documentation for more info, or look directly -at the function header comments. - -rcu_read_lock() - - void rcu_read_lock(void); - - Used by a reader to inform the reclaimer that the reader is - entering an RCU read-side critical section. It is illegal - to block while in an RCU read-side critical section, though - kernels built with CONFIG_PREEMPT_RCU can preempt RCU - read-side critical sections. Any RCU-protected data structure - accessed during an RCU read-side critical section is guaranteed to - remain unreclaimed for the full duration of that critical section. - Reference counts may be used in conjunction with RCU to maintain - longer-term references to data structures. - -rcu_read_unlock() - - void rcu_read_unlock(void); - - Used by a reader to inform the reclaimer that the reader is - exiting an RCU read-side critical section. Note that RCU - read-side critical sections may be nested and/or overlapping. - -synchronize_rcu() - - void synchronize_rcu(void); - - Marks the end of updater code and the beginning of reclaimer - code. It does this by blocking until all pre-existing RCU - read-side critical sections on all CPUs have completed. - Note that synchronize_rcu() will -not- necessarily wait for - any subsequent RCU read-side critical sections to complete. - For example, consider the following sequence of events: - - CPU 0 CPU 1 CPU 2 - ----------------- ------------------------- --------------- - 1. rcu_read_lock() - 2. enters synchronize_rcu() - 3. rcu_read_lock() - 4. rcu_read_unlock() - 5. exits synchronize_rcu() - 6. rcu_read_unlock() - - To reiterate, synchronize_rcu() waits only for ongoing RCU - read-side critical sections to complete, not necessarily for - any that begin after synchronize_rcu() is invoked. - - Of course, synchronize_rcu() does not necessarily return - -immediately- after the last pre-existing RCU read-side critical - section completes. For one thing, there might well be scheduling - delays. For another thing, many RCU implementations process - requests in batches in order to improve efficiencies, which can - further delay synchronize_rcu(). - - Since synchronize_rcu() is the API that must figure out when - readers are done, its implementation is key to RCU. For RCU - to be useful in all but the most read-intensive situations, - synchronize_rcu()'s overhead must also be quite small. - - The call_rcu() API is a callback form of synchronize_rcu(), - and is described in more detail in a later section. Instead of - blocking, it registers a function and argument which are invoked - after all ongoing RCU read-side critical sections have completed. - This callback variant is particularly useful in situations where - it is illegal to block or where update-side performance is - critically important. - - However, the call_rcu() API should not be used lightly, as use - of the synchronize_rcu() API generally results in simpler code. - In addition, the synchronize_rcu() API has the nice property - of automatically limiting update rate should grace periods - be delayed. This property results in system resilience in face - of denial-of-service attacks. Code using call_rcu() should limit - update rate in order to gain this same sort of resilience. See - checklist.txt for some approaches to limiting the update rate. - -rcu_assign_pointer() - - void rcu_assign_pointer(p, typeof(p) v); - - Yes, rcu_assign_pointer() -is- implemented as a macro, though it - would be cool to be able to declare a function in this manner. - (Compiler experts will no doubt disagree.) - - The updater uses this function to assign a new value to an - RCU-protected pointer, in order to safely communicate the change - in value from the updater to the reader. This macro does not - evaluate to an rvalue, but it does execute any memory-barrier - instructions required for a given CPU architecture. - - Perhaps just as important, it serves to document (1) which - pointers are protected by RCU and (2) the point at which a - given structure becomes accessible to other CPUs. That said, - rcu_assign_pointer() is most frequently used indirectly, via - the _rcu list-manipulation primitives such as list_add_rcu(). - -rcu_dereference() - - typeof(p) rcu_dereference(p); - - Like rcu_assign_pointer(), rcu_dereference() must be implemented - as a macro. - - The reader uses rcu_dereference() to fetch an RCU-protected - pointer, which returns a value that may then be safely - dereferenced. Note that rcu_dereference() does not actually - dereference the pointer, instead, it protects the pointer for - later dereferencing. It also executes any needed memory-barrier - instructions for a given CPU architecture. Currently, only Alpha - needs memory barriers within rcu_dereference() -- on other CPUs, - it compiles to nothing, not even a compiler directive. - - Common coding practice uses rcu_dereference() to copy an - RCU-protected pointer to a local variable, then dereferences - this local variable, for example as follows: - - p = rcu_dereference(head.next); - return p->data; - - However, in this case, one could just as easily combine these - into one statement: - - return rcu_dereference(head.next)->data; - - If you are going to be fetching multiple fields from the - RCU-protected structure, using the local variable is of - course preferred. Repeated rcu_dereference() calls look - ugly, do not guarantee that the same pointer will be returned - if an update happened while in the critical section, and incur - unnecessary overhead on Alpha CPUs. - - Note that the value returned by rcu_dereference() is valid - only within the enclosing RCU read-side critical section [1]. - For example, the following is -not- legal: - - rcu_read_lock(); - p = rcu_dereference(head.next); - rcu_read_unlock(); - x = p->address; /* BUG!!! */ - rcu_read_lock(); - y = p->data; /* BUG!!! */ - rcu_read_unlock(); - - Holding a reference from one RCU read-side critical section - to another is just as illegal as holding a reference from - one lock-based critical section to another! Similarly, - using a reference outside of the critical section in which - it was acquired is just as illegal as doing so with normal - locking. - - As with rcu_assign_pointer(), an important function of - rcu_dereference() is to document which pointers are protected by - RCU, in particular, flagging a pointer that is subject to changing - at any time, including immediately after the rcu_dereference(). - And, again like rcu_assign_pointer(), rcu_dereference() is - typically used indirectly, via the _rcu list-manipulation - primitives, such as list_for_each_entry_rcu() [2]. - - [1] The variant rcu_dereference_protected() can be used outside - of an RCU read-side critical section as long as the usage is - protected by locks acquired by the update-side code. This variant - avoids the lockdep warning that would happen when using (for - example) rcu_dereference() without rcu_read_lock() protection. - Using rcu_dereference_protected() also has the advantage - of permitting compiler optimizations that rcu_dereference() - must prohibit. The rcu_dereference_protected() variant takes - a lockdep expression to indicate which locks must be acquired - by the caller. If the indicated protection is not provided, - a lockdep splat is emitted. See Documentation/RCU/Design/Requirements/Requirements.rst - and the API's code comments for more details and example usage. - - [2] If the list_for_each_entry_rcu() instance might be used by - update-side code as well as by RCU readers, then an additional - lockdep expression can be added to its list of arguments. - For example, given an additional "lock_is_held(&mylock)" argument, - the RCU lockdep code would complain only if this instance was - invoked outside of an RCU read-side critical section and without - the protection of mylock. - -The following diagram shows how each API communicates among the -reader, updater, and reclaimer. - - - rcu_assign_pointer() - +--------+ - +---------------------->| reader |---------+ - | +--------+ | - | | | - | | | Protect: - | | | rcu_read_lock() - | | | rcu_read_unlock() - | rcu_dereference() | | - +---------+ | | - | updater |<----------------+ | - +---------+ V - | +-----------+ - +----------------------------------->| reclaimer | - +-----------+ - Defer: - synchronize_rcu() & call_rcu() - - -The RCU infrastructure observes the time sequence of rcu_read_lock(), -rcu_read_unlock(), synchronize_rcu(), and call_rcu() invocations in -order to determine when (1) synchronize_rcu() invocations may return -to their callers and (2) call_rcu() callbacks may be invoked. Efficient -implementations of the RCU infrastructure make heavy use of batching in -order to amortize their overhead over many uses of the corresponding APIs. - -There are at least three flavors of RCU usage in the Linux kernel. The diagram -above shows the most common one. On the updater side, the rcu_assign_pointer(), -sychronize_rcu() and call_rcu() primitives used are the same for all three -flavors. However for protection (on the reader side), the primitives used vary -depending on the flavor: - -a. rcu_read_lock() / rcu_read_unlock() - rcu_dereference() - -b. rcu_read_lock_bh() / rcu_read_unlock_bh() - local_bh_disable() / local_bh_enable() - rcu_dereference_bh() - -c. rcu_read_lock_sched() / rcu_read_unlock_sched() - preempt_disable() / preempt_enable() - local_irq_save() / local_irq_restore() - hardirq enter / hardirq exit - NMI enter / NMI exit - rcu_dereference_sched() - -These three flavors are used as follows: - -a. RCU applied to normal data structures. - -b. RCU applied to networking data structures that may be subjected - to remote denial-of-service attacks. - -c. RCU applied to scheduler and interrupt/NMI-handler tasks. - -Again, most uses will be of (a). The (b) and (c) cases are important -for specialized uses, but are relatively uncommon. - - -3. WHAT ARE SOME EXAMPLE USES OF CORE RCU API? - -This section shows a simple use of the core RCU API to protect a -global pointer to a dynamically allocated structure. More-typical -uses of RCU may be found in listRCU.txt, arrayRCU.txt, and NMI-RCU.txt. - - struct foo { - int a; - char b; - long c; - }; - DEFINE_SPINLOCK(foo_mutex); - - struct foo __rcu *gbl_foo; - - /* - * Create a new struct foo that is the same as the one currently - * pointed to by gbl_foo, except that field "a" is replaced - * with "new_a". Points gbl_foo to the new structure, and - * frees up the old structure after a grace period. - * - * Uses rcu_assign_pointer() to ensure that concurrent readers - * see the initialized version of the new structure. - * - * Uses synchronize_rcu() to ensure that any readers that might - * have references to the old structure complete before freeing - * the old structure. - */ - void foo_update_a(int new_a) - { - struct foo *new_fp; - struct foo *old_fp; - - new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); - spin_lock(&foo_mutex); - old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); - *new_fp = *old_fp; - new_fp->a = new_a; - rcu_assign_pointer(gbl_foo, new_fp); - spin_unlock(&foo_mutex); - synchronize_rcu(); - kfree(old_fp); - } - - /* - * Return the value of field "a" of the current gbl_foo - * structure. Use rcu_read_lock() and rcu_read_unlock() - * to ensure that the structure does not get deleted out - * from under us, and use rcu_dereference() to ensure that - * we see the initialized version of the structure (important - * for DEC Alpha and for people reading the code). - */ - int foo_get_a(void) - { - int retval; - - rcu_read_lock(); - retval = rcu_dereference(gbl_foo)->a; - rcu_read_unlock(); - return retval; - } - -So, to sum up: - -o Use rcu_read_lock() and rcu_read_unlock() to guard RCU - read-side critical sections. - -o Within an RCU read-side critical section, use rcu_dereference() - to dereference RCU-protected pointers. - -o Use some solid scheme (such as locks or semaphores) to - keep concurrent updates from interfering with each other. - -o Use rcu_assign_pointer() to update an RCU-protected pointer. - This primitive protects concurrent readers from the updater, - -not- concurrent updates from each other! You therefore still - need to use locking (or something similar) to keep concurrent - rcu_assign_pointer() primitives from interfering with each other. - -o Use synchronize_rcu() -after- removing a data element from an - RCU-protected data structure, but -before- reclaiming/freeing - the data element, in order to wait for the completion of all - RCU read-side critical sections that might be referencing that - data item. - -See checklist.txt for additional rules to follow when using RCU. -And again, more-typical uses of RCU may be found in listRCU.txt, -arrayRCU.txt, and NMI-RCU.txt. - - -4. WHAT IF MY UPDATING THREAD CANNOT BLOCK? - -In the example above, foo_update_a() blocks until a grace period elapses. -This is quite simple, but in some cases one cannot afford to wait so -long -- there might be other high-priority work to be done. - -In such cases, one uses call_rcu() rather than synchronize_rcu(). -The call_rcu() API is as follows: - - void call_rcu(struct rcu_head * head, - void (*func)(struct rcu_head *head)); - -This function invokes func(head) after a grace period has elapsed. -This invocation might happen from either softirq or process context, -so the function is not permitted to block. The foo struct needs to -have an rcu_head structure added, perhaps as follows: - - struct foo { - int a; - char b; - long c; - struct rcu_head rcu; - }; - -The foo_update_a() function might then be written as follows: - - /* - * Create a new struct foo that is the same as the one currently - * pointed to by gbl_foo, except that field "a" is replaced - * with "new_a". Points gbl_foo to the new structure, and - * frees up the old structure after a grace period. - * - * Uses rcu_assign_pointer() to ensure that concurrent readers - * see the initialized version of the new structure. - * - * Uses call_rcu() to ensure that any readers that might have - * references to the old structure complete before freeing the - * old structure. - */ - void foo_update_a(int new_a) - { - struct foo *new_fp; - struct foo *old_fp; - - new_fp = kmalloc(sizeof(*new_fp), GFP_KERNEL); - spin_lock(&foo_mutex); - old_fp = rcu_dereference_protected(gbl_foo, lockdep_is_held(&foo_mutex)); - *new_fp = *old_fp; - new_fp->a = new_a; - rcu_assign_pointer(gbl_foo, new_fp); - spin_unlock(&foo_mutex); - call_rcu(&old_fp->rcu, foo_reclaim); - } - -The foo_reclaim() function might appear as follows: - - void foo_reclaim(struct rcu_head *rp) - { - struct foo *fp = container_of(rp, struct foo, rcu); - - foo_cleanup(fp->a); - - kfree(fp); - } - -The container_of() primitive is a macro that, given a pointer into a -struct, the type of the struct, and the pointed-to field within the -struct, returns a pointer to the beginning of the struct. - -The use of call_rcu() permits the caller of foo_update_a() to -immediately regain control, without needing to worry further about the -old version of the newly updated element. It also clearly shows the -RCU distinction between updater, namely foo_update_a(), and reclaimer, -namely foo_reclaim(). - -The summary of advice is the same as for the previous section, except -that we are now using call_rcu() rather than synchronize_rcu(): - -o Use call_rcu() -after- removing a data element from an - RCU-protected data structure in order to register a callback - function that will be invoked after the completion of all RCU - read-side critical sections that might be referencing that - data item. - -If the callback for call_rcu() is not doing anything more than calling -kfree() on the structure, you can use kfree_rcu() instead of call_rcu() -to avoid having to write your own callback: - - kfree_rcu(old_fp, rcu); - -Again, see checklist.txt for additional rules governing the use of RCU. - - -5. WHAT ARE SOME SIMPLE IMPLEMENTATIONS OF RCU? - -One of the nice things about RCU is that it has extremely simple "toy" -implementations that are a good first step towards understanding the -production-quality implementations in the Linux kernel. This section -presents two such "toy" implementations of RCU, one that is implemented -in terms of familiar locking primitives, and another that more closely -resembles "classic" RCU. Both are way too simple for real-world use, -lacking both functionality and performance. However, they are useful -in getting a feel for how RCU works. See kernel/rcu/update.c for a -production-quality implementation, and see: - - http://www.rdrop.com/users/paulmck/RCU - -for papers describing the Linux kernel RCU implementation. The OLS'01 -and OLS'02 papers are a good introduction, and the dissertation provides -more details on the current implementation as of early 2004. - - -5A. "TOY" IMPLEMENTATION #1: LOCKING - -This section presents a "toy" RCU implementation that is based on -familiar locking primitives. Its overhead makes it a non-starter for -real-life use, as does its lack of scalability. It is also unsuitable -for realtime use, since it allows scheduling latency to "bleed" from -one read-side critical section to another. It also assumes recursive -reader-writer locks: If you try this with non-recursive locks, and -you allow nested rcu_read_lock() calls, you can deadlock. - -However, it is probably the easiest implementation to relate to, so is -a good starting point. - -It is extremely simple: - - static DEFINE_RWLOCK(rcu_gp_mutex); - - void rcu_read_lock(void) - { - read_lock(&rcu_gp_mutex); - } - - void rcu_read_unlock(void) - { - read_unlock(&rcu_gp_mutex); - } - - void synchronize_rcu(void) - { - write_lock(&rcu_gp_mutex); - smp_mb__after_spinlock(); - write_unlock(&rcu_gp_mutex); - } - -[You can ignore rcu_assign_pointer() and rcu_dereference() without missing -much. But here are simplified versions anyway. And whatever you do, -don't forget about them when submitting patches making use of RCU!] - - #define rcu_assign_pointer(p, v) \ - ({ \ - smp_store_release(&(p), (v)); \ - }) - - #define rcu_dereference(p) \ - ({ \ - typeof(p) _________p1 = READ_ONCE(p); \ - (_________p1); \ - }) - - -The rcu_read_lock() and rcu_read_unlock() primitive read-acquire -and release a global reader-writer lock. The synchronize_rcu() -primitive write-acquires this same lock, then releases it. This means -that once synchronize_rcu() exits, all RCU read-side critical sections -that were in progress before synchronize_rcu() was called are guaranteed -to have completed -- there is no way that synchronize_rcu() would have -been able to write-acquire the lock otherwise. The smp_mb__after_spinlock() -promotes synchronize_rcu() to a full memory barrier in compliance with -the "Memory-Barrier Guarantees" listed in: - - Documentation/RCU/Design/Requirements/Requirements.rst - -It is possible to nest rcu_read_lock(), since reader-writer locks may -be recursively acquired. Note also that rcu_read_lock() is immune -from deadlock (an important property of RCU). The reason for this is -that the only thing that can block rcu_read_lock() is a synchronize_rcu(). -But synchronize_rcu() does not acquire any locks while holding rcu_gp_mutex, -so there can be no deadlock cycle. - -Quick Quiz #1: Why is this argument naive? How could a deadlock - occur when using this algorithm in a real-world Linux - kernel? How could this deadlock be avoided? - - -5B. "TOY" EXAMPLE #2: CLASSIC RCU - -This section presents a "toy" RCU implementation that is based on -"classic RCU". It is also short on performance (but only for updates) and -on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT -kernels. The definitions of rcu_dereference() and rcu_assign_pointer() -are the same as those shown in the preceding section, so they are omitted. - - void rcu_read_lock(void) { } - - void rcu_read_unlock(void) { } - - void synchronize_rcu(void) - { - int cpu; - - for_each_possible_cpu(cpu) - run_on(cpu); - } - -Note that rcu_read_lock() and rcu_read_unlock() do absolutely nothing. -This is the great strength of classic RCU in a non-preemptive kernel: -read-side overhead is precisely zero, at least on non-Alpha CPUs. -And there is absolutely no way that rcu_read_lock() can possibly -participate in a deadlock cycle! - -The implementation of synchronize_rcu() simply schedules itself on each -CPU in turn. The run_on() primitive can be implemented straightforwardly -in terms of the sched_setaffinity() primitive. Of course, a somewhat less -"toy" implementation would restore the affinity upon completion rather -than just leaving all tasks running on the last CPU, but when I said -"toy", I meant -toy-! - -So how the heck is this supposed to work??? - -Remember that it is illegal to block while in an RCU read-side critical -section. Therefore, if a given CPU executes a context switch, we know -that it must have completed all preceding RCU read-side critical sections. -Once -all- CPUs have executed a context switch, then -all- preceding -RCU read-side critical sections will have completed. - -So, suppose that we remove a data item from its structure and then invoke -synchronize_rcu(). Once synchronize_rcu() returns, we are guaranteed -that there are no RCU read-side critical sections holding a reference -to that data item, so we can safely reclaim it. - -Quick Quiz #2: Give an example where Classic RCU's read-side - overhead is -negative-. - -Quick Quiz #3: If it is illegal to block in an RCU read-side - critical section, what the heck do you do in - PREEMPT_RT, where normal spinlocks can block??? - - -6. ANALOGY WITH READER-WRITER LOCKING - -Although RCU can be used in many different ways, a very common use of -RCU is analogous to reader-writer locking. The following unified -diff shows how closely related RCU and reader-writer locking can be. - - @@ -5,5 +5,5 @@ struct el { - int data; - /* Other data fields */ - }; - -rwlock_t listmutex; - +spinlock_t listmutex; - struct el head; - - @@ -13,15 +14,15 @@ - struct list_head *lp; - struct el *p; - - - read_lock(&listmutex); - - list_for_each_entry(p, head, lp) { - + rcu_read_lock(); - + list_for_each_entry_rcu(p, head, lp) { - if (p->key == key) { - *result = p->data; - - read_unlock(&listmutex); - + rcu_read_unlock(); - return 1; - } - } - - read_unlock(&listmutex); - + rcu_read_unlock(); - return 0; - } - - @@ -29,15 +30,16 @@ - { - struct el *p; - - - write_lock(&listmutex); - + spin_lock(&listmutex); - list_for_each_entry(p, head, lp) { - if (p->key == key) { - - list_del(&p->list); - - write_unlock(&listmutex); - + list_del_rcu(&p->list); - + spin_unlock(&listmutex); - + synchronize_rcu(); - kfree(p); - return 1; - } - } - - write_unlock(&listmutex); - + spin_unlock(&listmutex); - return 0; - } - -Or, for those who prefer a side-by-side listing: - - 1 struct el { 1 struct el { - 2 struct list_head list; 2 struct list_head list; - 3 long key; 3 long key; - 4 spinlock_t mutex; 4 spinlock_t mutex; - 5 int data; 5 int data; - 6 /* Other data fields */ 6 /* Other data fields */ - 7 }; 7 }; - 8 rwlock_t listmutex; 8 spinlock_t listmutex; - 9 struct el head; 9 struct el head; - - 1 int search(long key, int *result) 1 int search(long key, int *result) - 2 { 2 { - 3 struct list_head *lp; 3 struct list_head *lp; - 4 struct el *p; 4 struct el *p; - 5 5 - 6 read_lock(&listmutex); 6 rcu_read_lock(); - 7 list_for_each_entry(p, head, lp) { 7 list_for_each_entry_rcu(p, head, lp) { - 8 if (p->key == key) { 8 if (p->key == key) { - 9 *result = p->data; 9 *result = p->data; -10 read_unlock(&listmutex); 10 rcu_read_unlock(); -11 return 1; 11 return 1; -12 } 12 } -13 } 13 } -14 read_unlock(&listmutex); 14 rcu_read_unlock(); -15 return 0; 15 return 0; -16 } 16 } - - 1 int delete(long key) 1 int delete(long key) - 2 { 2 { - 3 struct el *p; 3 struct el *p; - 4 4 - 5 write_lock(&listmutex); 5 spin_lock(&listmutex); - 6 list_for_each_entry(p, head, lp) { 6 list_for_each_entry(p, head, lp) { - 7 if (p->key == key) { 7 if (p->key == key) { - 8 list_del(&p->list); 8 list_del_rcu(&p->list); - 9 write_unlock(&listmutex); 9 spin_unlock(&listmutex); - 10 synchronize_rcu(); -10 kfree(p); 11 kfree(p); -11 return 1; 12 return 1; -12 } 13 } -13 } 14 } -14 write_unlock(&listmutex); 15 spin_unlock(&listmutex); -15 return 0; 16 return 0; -16 } 17 } - -Either way, the differences are quite small. Read-side locking moves -to rcu_read_lock() and rcu_read_unlock, update-side locking moves from -a reader-writer lock to a simple spinlock, and a synchronize_rcu() -precedes the kfree(). - -However, there is one potential catch: the read-side and update-side -critical sections can now run concurrently. In many cases, this will -not be a problem, but it is necessary to check carefully regardless. -For example, if multiple independent list updates must be seen as -a single atomic update, converting to RCU will require special care. - -Also, the presence of synchronize_rcu() means that the RCU version of -delete() can now block. If this is a problem, there is a callback-based -mechanism that never blocks, namely call_rcu() or kfree_rcu(), that can -be used in place of synchronize_rcu(). - - -7. FULL LIST OF RCU APIs - -The RCU APIs are documented in docbook-format header comments in the -Linux-kernel source code, but it helps to have a full list of the -APIs, since there does not appear to be a way to categorize them -in docbook. Here is the list, by category. - -RCU list traversal: - - list_entry_rcu - list_first_entry_rcu - list_next_rcu - list_for_each_entry_rcu - list_for_each_entry_continue_rcu - list_for_each_entry_from_rcu - hlist_first_rcu - hlist_next_rcu - hlist_pprev_rcu - hlist_for_each_entry_rcu - hlist_for_each_entry_rcu_bh - hlist_for_each_entry_from_rcu - hlist_for_each_entry_continue_rcu - hlist_for_each_entry_continue_rcu_bh - hlist_nulls_first_rcu - hlist_nulls_for_each_entry_rcu - hlist_bl_first_rcu - hlist_bl_for_each_entry_rcu - -RCU pointer/list update: - - rcu_assign_pointer - list_add_rcu - list_add_tail_rcu - list_del_rcu - list_replace_rcu - hlist_add_behind_rcu - hlist_add_before_rcu - hlist_add_head_rcu - hlist_del_rcu - hlist_del_init_rcu - hlist_replace_rcu - list_splice_init_rcu() - hlist_nulls_del_init_rcu - hlist_nulls_del_rcu - hlist_nulls_add_head_rcu - hlist_bl_add_head_rcu - hlist_bl_del_init_rcu - hlist_bl_del_rcu - hlist_bl_set_first_rcu - -RCU: Critical sections Grace period Barrier - - rcu_read_lock synchronize_net rcu_barrier - rcu_read_unlock synchronize_rcu - rcu_dereference synchronize_rcu_expedited - rcu_read_lock_held call_rcu - rcu_dereference_check kfree_rcu - rcu_dereference_protected - -bh: Critical sections Grace period Barrier - - rcu_read_lock_bh call_rcu rcu_barrier - rcu_read_unlock_bh synchronize_rcu - [local_bh_disable] synchronize_rcu_expedited - [and friends] - rcu_dereference_bh - rcu_dereference_bh_check - rcu_dereference_bh_protected - rcu_read_lock_bh_held - -sched: Critical sections Grace period Barrier - - rcu_read_lock_sched call_rcu rcu_barrier - rcu_read_unlock_sched synchronize_rcu - [preempt_disable] synchronize_rcu_expedited - [and friends] - rcu_read_lock_sched_notrace - rcu_read_unlock_sched_notrace - rcu_dereference_sched - rcu_dereference_sched_check - rcu_dereference_sched_protected - rcu_read_lock_sched_held - - -SRCU: Critical sections Grace period Barrier - - srcu_read_lock call_srcu srcu_barrier - srcu_read_unlock synchronize_srcu - srcu_dereference synchronize_srcu_expedited - srcu_dereference_check - srcu_read_lock_held - -SRCU: Initialization/cleanup - DEFINE_SRCU - DEFINE_STATIC_SRCU - init_srcu_struct - cleanup_srcu_struct - -All: lockdep-checked RCU-protected pointer access - - rcu_access_pointer - rcu_dereference_raw - RCU_LOCKDEP_WARN - rcu_sleep_check - RCU_NONIDLE - -See the comment headers in the source code (or the docbook generated -from them) for more information. - -However, given that there are no fewer than four families of RCU APIs -in the Linux kernel, how do you choose which one to use? The following -list can be helpful: - -a. Will readers need to block? If so, you need SRCU. - -b. What about the -rt patchset? If readers would need to block - in an non-rt kernel, you need SRCU. If readers would block - in a -rt kernel, but not in a non-rt kernel, SRCU is not - necessary. (The -rt patchset turns spinlocks into sleeplocks, - hence this distinction.) - -c. Do you need to treat NMI handlers, hardirq handlers, - and code segments with preemption disabled (whether - via preempt_disable(), local_irq_save(), local_bh_disable(), - or some other mechanism) as if they were explicit RCU readers? - If so, RCU-sched is the only choice that will work for you. - -d. Do you need RCU grace periods to complete even in the face - of softirq monopolization of one or more of the CPUs? For - example, is your code subject to network-based denial-of-service - attacks? If so, you should disable softirq across your readers, - for example, by using rcu_read_lock_bh(). - -e. Is your workload too update-intensive for normal use of - RCU, but inappropriate for other synchronization mechanisms? - If so, consider SLAB_TYPESAFE_BY_RCU (which was originally - named SLAB_DESTROY_BY_RCU). But please be careful! - -f. Do you need read-side critical sections that are respected - even though they are in the middle of the idle loop, during - user-mode execution, or on an offlined CPU? If so, SRCU is the - only choice that will work for you. - -g. Otherwise, use RCU. - -Of course, this all assumes that you have determined that RCU is in fact -the right tool for your job. - - -8. ANSWERS TO QUICK QUIZZES - -Quick Quiz #1: Why is this argument naive? How could a deadlock - occur when using this algorithm in a real-world Linux - kernel? [Referring to the lock-based "toy" RCU - algorithm.] - -Answer: Consider the following sequence of events: - - 1. CPU 0 acquires some unrelated lock, call it - "problematic_lock", disabling irq via - spin_lock_irqsave(). - - 2. CPU 1 enters synchronize_rcu(), write-acquiring - rcu_gp_mutex. - - 3. CPU 0 enters rcu_read_lock(), but must wait - because CPU 1 holds rcu_gp_mutex. - - 4. CPU 1 is interrupted, and the irq handler - attempts to acquire problematic_lock. - - The system is now deadlocked. - - One way to avoid this deadlock is to use an approach like - that of CONFIG_PREEMPT_RT, where all normal spinlocks - become blocking locks, and all irq handlers execute in - the context of special tasks. In this case, in step 4 - above, the irq handler would block, allowing CPU 1 to - release rcu_gp_mutex, avoiding the deadlock. - - Even in the absence of deadlock, this RCU implementation - allows latency to "bleed" from readers to other - readers through synchronize_rcu(). To see this, - consider task A in an RCU read-side critical section - (thus read-holding rcu_gp_mutex), task B blocked - attempting to write-acquire rcu_gp_mutex, and - task C blocked in rcu_read_lock() attempting to - read_acquire rcu_gp_mutex. Task A's RCU read-side - latency is holding up task C, albeit indirectly via - task B. - - Realtime RCU implementations therefore use a counter-based - approach where tasks in RCU read-side critical sections - cannot be blocked by tasks executing synchronize_rcu(). - -Quick Quiz #2: Give an example where Classic RCU's read-side - overhead is -negative-. - -Answer: Imagine a single-CPU system with a non-CONFIG_PREEMPT - kernel where a routing table is used by process-context - code, but can be updated by irq-context code (for example, - by an "ICMP REDIRECT" packet). The usual way of handling - this would be to have the process-context code disable - interrupts while searching the routing table. Use of - RCU allows such interrupt-disabling to be dispensed with. - Thus, without RCU, you pay the cost of disabling interrupts, - and with RCU you don't. - - One can argue that the overhead of RCU in this - case is negative with respect to the single-CPU - interrupt-disabling approach. Others might argue that - the overhead of RCU is merely zero, and that replacing - the positive overhead of the interrupt-disabling scheme - with the zero-overhead RCU scheme does not constitute - negative overhead. - - In real life, of course, things are more complex. But - even the theoretical possibility of negative overhead for - a synchronization primitive is a bit unexpected. ;-) - -Quick Quiz #3: If it is illegal to block in an RCU read-side - critical section, what the heck do you do in - PREEMPT_RT, where normal spinlocks can block??? - -Answer: Just as PREEMPT_RT permits preemption of spinlock - critical sections, it permits preemption of RCU - read-side critical sections. It also permits - spinlocks blocking while in RCU read-side critical - sections. - - Why the apparent inconsistency? Because it is - possible to use priority boosting to keep the RCU - grace periods short if need be (for example, if running - short of memory). In contrast, if blocking waiting - for (say) network reception, there is no way to know - what should be boosted. Especially given that the - process we need to boost might well be a human being - who just went out for a pizza or something. And although - a computer-operated cattle prod might arouse serious - interest, it might also provoke serious objections. - Besides, how does the computer know what pizza parlor - the human being went to??? - - -ACKNOWLEDGEMENTS - -My thanks to the people who helped make this human-readable, including -Jon Walpole, Josh Triplett, Serge Hallyn, Suzanne Wood, and Alan Stern. - - -For more information, see http://www.rdrop.com/users/paulmck/RCU. diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index 621111ce5740142786eef7f3f8177b58708cabb3..f2b3439edcc2cc772c7f92cd3510060df6b5e8b8 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -1,3 +1,5 @@ +.. _psi: + ================================ PSI - Pressure Stall Information ================================ diff --git a/Documentation/admin-guide/acpi/fan_performance_states.rst b/Documentation/admin-guide/acpi/fan_performance_states.rst new file mode 100644 index 0000000000000000000000000000000000000000..98fe5c3331214df8745ba8fab3e25875bdec7f8f --- /dev/null +++ b/Documentation/admin-guide/acpi/fan_performance_states.rst @@ -0,0 +1,62 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=========================== +ACPI Fan Performance States +=========================== + +When the optional _FPS object is present under an ACPI device representing a +fan (for example, PNP0C0B or INT3404), the ACPI fan driver creates additional +"state*" attributes in the sysfs directory of the ACPI device in question. +These attributes list properties of fan performance states. + +For more information on _FPS refer to the ACPI specification at: + +http://uefi.org/specifications + +For instance, the contents of the INT3404 ACPI device sysfs directory +may look as follows:: + + $ ls -l /sys/bus/acpi/devices/INT3404:00/ + total 0 + ... + -r--r--r-- 1 root root 4096 Dec 13 20:38 state0 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state1 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state10 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state11 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state2 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state3 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state4 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state5 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state6 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state7 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state8 + -r--r--r-- 1 root root 4096 Dec 13 20:38 state9 + -r--r--r-- 1 root root 4096 Dec 13 01:00 status + ... + +where each of the "state*" files represents one performance state of the fan +and contains a colon-separated list of 5 integer numbers (fields) with the +following interpretation:: + + control_percent:trip_point_index:speed_rpm:noise_level_mdb:power_mw + +* ``control_percent``: The percent value to be used to set the fan speed to a + specific level using the _FSL object (0-100). + +* ``trip_point_index``: The active cooling trip point number that corresponds + to this performance state (0-9). + +* ``speed_rpm``: Speed of the fan in rotations per minute. + +* ``noise_level_mdb``: Audible noise emitted by the fan in this state in + millidecibels. + +* ``power_mw``: Power draw of the fan in this state in milliwatts. + +For example:: + + $cat /sys/bus/acpi/devices/INT3404:00/state1 + 25:0:3200:12500:1250 + +When a given field is not populated or its value provided by the platform +firmware is invalid, the "not-defined" string is shown instead of the value. diff --git a/Documentation/admin-guide/acpi/index.rst b/Documentation/admin-guide/acpi/index.rst index 4d13eeea1ecac3ca70d8a9214237e0dd2a05d331..71277689ad97f452fb91ff8738601f6e163d1d65 100644 --- a/Documentation/admin-guide/acpi/index.rst +++ b/Documentation/admin-guide/acpi/index.rst @@ -12,3 +12,4 @@ the Linux ACPI support. dsdt-override ssdt-overlays cppc_sysfs + fan_performance_states diff --git a/Documentation/admin-guide/binderfs.rst b/Documentation/admin-guide/binderfs.rst index c009671f8434918c2867e258e5192464fbf0486c..8243af9b3510e8c428f36c2879bc9bcb7e645e12 100644 --- a/Documentation/admin-guide/binderfs.rst +++ b/Documentation/admin-guide/binderfs.rst @@ -33,6 +33,12 @@ max a per-instance limit. If ``max=`` is set then only ```` number of binder devices can be allocated in this binderfs instance. +stats + Using ``stats=global`` enables global binder statistics. + ``stats=global`` is only available for a binderfs instance mounted in the + initial user namespace. An attempt to use the option to mount a binderfs + instance in another user namespace will return a permission error. + Allocating binder Devices ------------------------- diff --git a/Documentation/admin-guide/binfmt-misc.rst b/Documentation/admin-guide/binfmt-misc.rst index 97b0d792707825b2d43ef221eca4861604b8ccd1..7a864131e5ea767280f9448683dd332b71f1af10 100644 --- a/Documentation/admin-guide/binfmt-misc.rst +++ b/Documentation/admin-guide/binfmt-misc.rst @@ -1,5 +1,5 @@ -Kernel Support for miscellaneous (your favourite) Binary Formats v1.1 -===================================================================== +Kernel Support for miscellaneous Binary Formats (binfmt_misc) +============================================================= This Kernel feature allows you to invoke almost (for restrictions see below) every program by simply typing its name in the shell. @@ -140,8 +140,8 @@ Hints ----- If you want to pass special arguments to your interpreter, you can -write a wrapper script for it. See Documentation/admin-guide/java.rst for an -example. +write a wrapper script for it. +See :doc:`Documentation/admin-guide/java.rst <./java>` for an example. Your interpreter should NOT look in the PATH for the filename; the kernel passes it the full filename (or the file descriptor) to use. Using ``$PATH`` can diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 6eccf13219ffec2f8ac0f37af8e9b9187b8fdcb1..a6fd1f9b5faf52841d5fc281ef215c1ea019362d 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -1,15 +1,15 @@ ======================================== -zram: Compressed RAM based block devices +zram: Compressed RAM-based block devices ======================================== Introduction ============ -The zram module creates RAM based block devices named /dev/zram +The zram module creates RAM-based block devices named /dev/zram ( = 0, 1, ...). Pages written to these disks are compressed and stored in memory itself. These disks allow very fast I/O and compression provides -good amounts of memory savings. Some of the usecases include /tmp storage, -use as swap disks, various caches under /var and maybe many more :) +good amounts of memory savings. Some of the use cases include /tmp storage, +use as swap disks, various caches under /var and maybe many more. :) Statistics for individual zram devices are exported through sysfs nodes at /sys/block/zram/ @@ -43,17 +43,17 @@ The list of possible return codes: ======== ============================================================= -EBUSY an attempt to modify an attribute that cannot be changed once - the device has been initialised. Please reset device first; + the device has been initialised. Please reset device first. -ENOMEM zram was not able to allocate enough memory to fulfil your - needs; + needs. -EINVAL invalid input has been provided. ======== ============================================================= -If you use 'echo', the returned value that is changed by 'echo' utility, +If you use 'echo', the returned value is set by the 'echo' utility, and, in general case, something like:: echo 3 > /sys/block/zram0/max_comp_streams - if [ $? -ne 0 ]; + if [ $? -ne 0 ]; then handle_error fi @@ -65,7 +65,8 @@ should suffice. :: modprobe zram num_devices=4 - This creates 4 devices: /dev/zram{0,1,2,3} + +This creates 4 devices: /dev/zram{0,1,2,3} num_devices parameter is optional and tells zram how many devices should be pre-created. Default: 1. @@ -73,12 +74,12 @@ pre-created. Default: 1. 2) Set max number of compression streams ======================================== -Regardless the value passed to this attribute, ZRAM will always -allocate multiple compression streams - one per online CPUs - thus +Regardless of the value passed to this attribute, ZRAM will always +allocate multiple compression streams - one per online CPU - thus allowing several concurrent compression operations. The number of allocated compression streams goes down when some of the CPUs become offline. There is no single-compression-stream mode anymore, -unless you are running a UP system or has only 1 CPU online. +unless you are running a UP system or have only 1 CPU online. To find out how many streams are currently available:: @@ -89,7 +90,7 @@ To find out how many streams are currently available:: Using comp_algorithm device attribute one can see available and currently selected (shown in square brackets) compression algorithms, -change selected compression algorithm (once the device is initialised +or change the selected compression algorithm (once the device is initialised there is no way to change compression algorithm). Examples:: @@ -167,9 +168,9 @@ Examples:: zram provides a control interface, which enables dynamic (on-demand) device addition and removal. -In order to add a new /dev/zramX device, perform read operation on hot_add -attribute. This will return either new device's device id (meaning that you -can use /dev/zram) or error code. +In order to add a new /dev/zramX device, perform a read operation on the hot_add +attribute. This will return either the new device's device id (meaning that you +can use /dev/zram) or an error code. Example:: @@ -186,8 +187,8 @@ execute:: Per-device statistics are exported as various nodes under /sys/block/zram/ -A brief description of exported device attributes. For more details please -read Documentation/ABI/testing/sysfs-block-zram. +A brief description of exported device attributes follows. For more details +please read Documentation/ABI/testing/sysfs-block-zram. ====================== ====== =============================================== Name access description @@ -245,13 +246,11 @@ whitespace: File /sys/block/zram/mm_stat -The stat file represents device's mm statistics. It consists of a single +The mm_stat file represents the device's mm statistics. It consists of a single line of text and contains the following stats separated by whitespace: ================ ============================================================= orig_data_size uncompressed size of data stored in this disk. - This excludes same-element-filled pages (same_pages) since - no memory is allocated for them. Unit: bytes compr_data_size compressed size of data stored in this disk mem_used_total the amount of memory allocated for this disk. This @@ -261,7 +260,7 @@ line of text and contains the following stats separated by whitespace: Unit: bytes mem_limit the maximum amount of memory ZRAM can use to store the compressed data - mem_used_max the maximum amount of memory zram have consumed to + mem_used_max the maximum amount of memory zram has consumed to store the data same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. @@ -271,7 +270,7 @@ line of text and contains the following stats separated by whitespace: File /sys/block/zram/bd_stat -The stat file represents device's backing device statistics. It consists of +The bd_stat file represents a device's backing device statistics. It consists of a single line of text and contains the following stats separated by whitespace: ============== ============================================================= @@ -316,9 +315,9 @@ To use the feature, admin should set up backing device via:: echo /dev/sda5 > /sys/block/zramX/backing_dev before disksize setting. It supports only partition at this moment. -If admin want to use incompressible page writeback, they could do via:: +If admin wants to use incompressible page writeback, they could do via:: - echo huge > /sys/block/zramX/write + echo huge > /sys/block/zramX/writeback To use idle page writeback, first, user need to declare zram pages as idle:: @@ -326,7 +325,7 @@ as idle:: echo all > /sys/block/zramX/idle From now on, any pages on zram are idle pages. The idle mark -will be removed until someone request access of the block. +will be removed until someone requests access of the block. IOW, unless there is access request, those pages are still idle pages. Admin can request writeback of those idle pages at right timing via:: @@ -341,16 +340,16 @@ to guarantee storage health for entire product life. To overcome the concern, zram supports "writeback_limit" feature. The "writeback_limit_enable"'s default value is 0 so that it doesn't limit -any writeback. IOW, if admin want to apply writeback budget, he should +any writeback. IOW, if admin wants to apply writeback budget, he should enable writeback_limit_enable via:: $ echo 1 > /sys/block/zramX/writeback_limit_enable Once writeback_limit_enable is set, zram doesn't allow any writeback -until admin set the budget via /sys/block/zramX/writeback_limit. +until admin sets the budget via /sys/block/zramX/writeback_limit. (If admin doesn't enable writeback_limit_enable, writeback_limit's value -assigned via /sys/block/zramX/writeback_limit is meaninless.) +assigned via /sys/block/zramX/writeback_limit is meaningless.) If admin want to limit writeback as per-day 400M, he could do it like below:: @@ -361,13 +360,13 @@ like below:: /sys/block/zram0/writeback_limit. $ echo 1 > /sys/block/zram0/writeback_limit_enable -If admin want to allow further write again once the bugdet is exausted, +If admins want to allow further write again once the bugdet is exhausted, he could do it like below:: $ echo $((400<>4K_SHIFT)) > \ /sys/block/zram0/writeback_limit -If admin want to see remaining writeback budget since he set:: +If admin wants to see remaining writeback budget since last set:: $ cat /sys/block/zramX/writeback_limit @@ -375,12 +374,12 @@ If admin want to disable writeback limit, he could do:: $ echo 0 > /sys/block/zramX/writeback_limit_enable -The writeback_limit count will reset whenever you reset zram(e.g., +The writeback_limit count will reset whenever you reset zram (e.g., system reboot, echo 1 > /sys/block/zramX/reset) so keeping how many of writeback happened until you reset the zram to allocate extra writeback budget in next setting is user's job. -If admin want to measure writeback count in a certain period, he could +If admin wants to measure writeback count in a certain period, he could know it via /sys/block/zram0/bd_stat's 3rd column. memory tracking diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst new file mode 100644 index 0000000000000000000000000000000000000000..d6b3b77a412966c441997ee9efdb54a7360ef983 --- /dev/null +++ b/Documentation/admin-guide/bootconfig.rst @@ -0,0 +1,218 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _bootconfig: + +================== +Boot Configuration +================== + +:Author: Masami Hiramatsu + +Overview +======== + +The boot configuration expands the current kernel command line to support +additional key-value data when booting the kernel in an efficient way. +This allows administrators to pass a structured-Key config file. + +Config File Syntax +================== + +The boot config syntax is a simple structured key-value. Each key consists +of dot-connected-words, and key and value are connected by ``=``. The value +has to be terminated by semi-colon (``;``) or newline (``\n``). +For array value, array entries are separated by comma (``,``). :: + + KEY[.WORD[...]] = VALUE[, VALUE2[...]][;] + +Unlike the kernel command line syntax, spaces are OK around the comma and ``=``. + +Each key word must contain only alphabets, numbers, dash (``-``) or underscore +(``_``). And each value only contains printable characters or spaces except +for delimiters such as semi-colon (``;``), new-line (``\n``), comma (``,``), +hash (``#``) and closing brace (``}``). + +If you want to use those delimiters in a value, you can use either double- +quotes (``"VALUE"``) or single-quotes (``'VALUE'``) to quote it. Note that +you can not escape these quotes. + +There can be a key which doesn't have value or has an empty value. Those keys +are used for checking if the key exists or not (like a boolean). + +Key-Value Syntax +---------------- + +The boot config file syntax allows user to merge partially same word keys +by brace. For example:: + + foo.bar.baz = value1 + foo.bar.qux.quux = value2 + +These can be written also in:: + + foo.bar { + baz = value1 + qux.quux = value2 + } + +Or more shorter, written as following:: + + foo.bar { baz = value1; qux.quux = value2 } + +In both styles, same key words are automatically merged when parsing it +at boot time. So you can append similar trees or key-values. + +Same-key Values +--------------- + +It is prohibited that two or more values or arrays share a same-key. +For example,:: + + foo = bar, baz + foo = qux # !ERROR! we can not re-define same key + +If you want to append the value to existing key as an array member, +you can use ``+=`` operator. For example:: + + foo = bar, baz + foo += qux + +In this case, the key ``foo`` has ``bar``, ``baz`` and ``qux``. + +However, a sub-key and a value can not co-exist under a parent key. +For example, following config is NOT allowed.:: + + foo = value1 + foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist + + +Comments +-------- + +The config syntax accepts shell-script style comments. The comments starting +with hash ("#") until newline ("\n") will be ignored. + +:: + + # comment line + foo = value # value is set to foo. + bar = 1, # 1st element + 2, # 2nd element + 3 # 3rd element + +This is parsed as below:: + + foo = value + bar = 1, 2, 3 + +Note that you can not put a comment between value and delimiter(``,`` or +``;``). This means following config has a syntax error :: + + key = 1 # comment + ,2 + + +/proc/bootconfig +================ + +/proc/bootconfig is a user-space interface of the boot config. +Unlike /proc/cmdline, this file shows the key-value style list. +Each key-value pair is shown in each line with following style:: + + KEY[.WORDS...] = "[VALUE]"[,"VALUE2"...] + + +Boot Kernel With a Boot Config +============================== + +Since the boot configuration file is loaded with initrd, it will be added +to the end of the initrd (initramfs) image file with size, checksum and +12-byte magic word as below. + +[initrd][bootconfig][size(u32)][checksum(u32)][#BOOTCONFIG\n] + +The Linux kernel decodes the last part of the initrd image in memory to +get the boot configuration data. +Because of this "piggyback" method, there is no need to change or +update the boot loader and the kernel image itself. + +To do this operation, Linux kernel provides "bootconfig" command under +tools/bootconfig, which allows admin to apply or delete the config file +to/from initrd image. You can build it by the following command:: + + # make -C tools/bootconfig + +To add your boot config file to initrd image, run bootconfig as below +(Old data is removed automatically if exists):: + + # tools/bootconfig/bootconfig -a your-config /boot/initrd.img-X.Y.Z + +To remove the config from the image, you can use -d option as below:: + + # tools/bootconfig/bootconfig -d /boot/initrd.img-X.Y.Z + +Then add "bootconfig" on the normal kernel command line to tell the +kernel to look for the bootconfig at the end of the initrd file. + +Config File Limitation +====================== + +Currently the maximum config size size is 32KB and the total key-words (not +key-value entries) must be under 1024 nodes. +Note: this is not the number of entries but nodes, an entry must consume +more than 2 nodes (a key-word and a value). So theoretically, it will be +up to 512 key-value pairs. If keys contains 3 words in average, it can +contain 256 key-value pairs. In most cases, the number of config items +will be under 100 entries and smaller than 8KB, so it would be enough. +If the node number exceeds 1024, parser returns an error even if the file +size is smaller than 32KB. +Anyway, since bootconfig command verifies it when appending a boot config +to initrd image, user can notice it before boot. + + +Bootconfig APIs +=============== + +User can query or loop on key-value pairs, also it is possible to find +a root (prefix) key node and find key-values under that node. + +If you have a key string, you can query the value directly with the key +using xbc_find_value(). If you want to know what keys exist in the boot +config, you can use xbc_for_each_key_value() to iterate key-value pairs. +Note that you need to use xbc_array_for_each_value() for accessing +each array's value, e.g.:: + + vnode = NULL; + xbc_find_value("key.word", &vnode); + if (vnode && xbc_node_is_array(vnode)) + xbc_array_for_each_value(vnode, value) { + printk("%s ", value); + } + +If you want to focus on keys which have a prefix string, you can use +xbc_find_node() to find a node by the prefix string, and iterate +keys under the prefix node with xbc_node_for_each_key_value(). + +But the most typical usage is to get the named value under prefix +or get the named array under prefix as below:: + + root = xbc_find_node("key.prefix"); + value = xbc_node_find_value(root, "option", &vnode); + ... + xbc_node_for_each_array_value(root, "array-option", value, anode) { + ... + } + +This accesses a value of "key.prefix.option" and an array of +"key.prefix.array-option". + +Locking is not needed, since after initialization, the config becomes +read-only. All data and keys must be copied if you need to modify it. + + +Functions and structures +======================== + +.. kernel-doc:: include/linux/bootconfig.h +.. kernel-doc:: lib/bootconfig.c + diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst index 86a6ae995d5472063e393b86b274fac3e5bff0d1..7ade3abd342ae08820c9b026f04f080c3b882fda 100644 --- a/Documentation/admin-guide/cgroup-v1/cpusets.rst +++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst @@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file automatically tracks the value of node_states[N_MEMORY]--i.e., nodes with memory--using the cpuset_track_online_nodes() hook. +The cpuset.effective_cpus and cpuset.effective_mems files are +normally read-only copies of cpuset.cpus and cpuset.mems files +respectively. If the cpuset cgroup filesystem is mounted with the +special "cpuset_v2_mode" option, the behavior of these files will become +similar to the corresponding files in cpuset v2. In other words, hotplug +events will not change cpuset.cpus and cpuset.mems. Those events will +only affect cpuset.effective_cpus and cpuset.effective_mems which show +the actual cpus and memory nodes that are currently used by this cpuset. +See Documentation/admin-guide/cgroup-v2.rst for more information about +cpuset v2 behavior. + 1.4 What are exclusive cpusets ? -------------------------------- diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst index a3902aa253a96013be62ec3fbabc1d7d1fb4d59d..338f2c7d7a1cd418c20643e3d7fee00b4baf34da 100644 --- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -2,13 +2,6 @@ HugeTLB Controller ================== -The HugeTLB controller allows to limit the HugeTLB usage per control group and -enforces the controller limit during page fault. Since HugeTLB doesn't -support page reclaim, enforcing the limit at page fault time implies that, -the application will get SIGBUS signal if it tries to access HugeTLB pages -beyond its limit. This requires the application to know beforehand how much -HugeTLB pages it would require for its use. - HugeTLB controller can be created by first mounting the cgroup filesystem. # mount -t cgroup -o hugetlb none /sys/fs/cgroup @@ -28,10 +21,14 @@ process (bash) into it. Brief summary of control files:: - hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage - hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded - hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb - hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit + hugetlb..rsvd.limit_in_bytes # set/show limit of "hugepagesize" hugetlb reservations + hugetlb..rsvd.max_usage_in_bytes # show max "hugepagesize" hugetlb reservations and no-reserve faults + hugetlb..rsvd.usage_in_bytes # show current reservations and no-reserve faults for "hugepagesize" hugetlb + hugetlb..rsvd.failcnt # show the number of allocation failure due to HugeTLB reservation limit + hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb faults + hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded + hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb + hugetlb..failcnt # show the number of allocation failure due to HugeTLB usage limit For a system supporting three hugepage sizes (64k, 32M and 1G), the control files include:: @@ -40,11 +37,95 @@ files include:: hugetlb.1GB.max_usage_in_bytes hugetlb.1GB.usage_in_bytes hugetlb.1GB.failcnt + hugetlb.1GB.rsvd.limit_in_bytes + hugetlb.1GB.rsvd.max_usage_in_bytes + hugetlb.1GB.rsvd.usage_in_bytes + hugetlb.1GB.rsvd.failcnt hugetlb.64KB.limit_in_bytes hugetlb.64KB.max_usage_in_bytes hugetlb.64KB.usage_in_bytes hugetlb.64KB.failcnt + hugetlb.64KB.rsvd.limit_in_bytes + hugetlb.64KB.rsvd.max_usage_in_bytes + hugetlb.64KB.rsvd.usage_in_bytes + hugetlb.64KB.rsvd.failcnt hugetlb.32MB.limit_in_bytes hugetlb.32MB.max_usage_in_bytes hugetlb.32MB.usage_in_bytes hugetlb.32MB.failcnt + hugetlb.32MB.rsvd.limit_in_bytes + hugetlb.32MB.rsvd.max_usage_in_bytes + hugetlb.32MB.rsvd.usage_in_bytes + hugetlb.32MB.rsvd.failcnt + + +1. Page fault accounting + +hugetlb..limit_in_bytes +hugetlb..max_usage_in_bytes +hugetlb..usage_in_bytes +hugetlb..failcnt + +The HugeTLB controller allows users to limit the HugeTLB usage (page fault) per +control group and enforces the limit during page fault. Since HugeTLB +doesn't support page reclaim, enforcing the limit at page fault time implies +that, the application will get SIGBUS signal if it tries to fault in HugeTLB +pages beyond its limit. Therefore the application needs to know exactly how many +HugeTLB pages it uses before hand, and the sysadmin needs to make sure that +there are enough available on the machine for all the users to avoid processes +getting SIGBUS. + + +2. Reservation accounting + +hugetlb..rsvd.limit_in_bytes +hugetlb..rsvd.max_usage_in_bytes +hugetlb..rsvd.usage_in_bytes +hugetlb..rsvd.failcnt + +The HugeTLB controller allows to limit the HugeTLB reservations per control +group and enforces the controller limit at reservation time and at the fault of +HugeTLB memory for which no reservation exists. Since reservation limits are +enforced at reservation time (on mmap or shget), reservation limits never causes +the application to get SIGBUS signal if the memory was reserved before hand. For +MAP_NORESERVE allocations, the reservation limit behaves the same as the fault +limit, enforcing memory usage at fault time and causing the application to +receive a SIGBUS if it's crossing its limit. + +Reservation limits are superior to page fault limits described above, since +reservation limits are enforced at reservation time (on mmap or shget), and +never causes the application to get SIGBUS signal if the memory was reserved +before hand. This allows for easier fallback to alternatives such as +non-HugeTLB memory for example. In the case of page fault accounting, it's very +hard to avoid processes getting SIGBUS since the sysadmin needs precisely know +the HugeTLB usage of all the tasks in the system and make sure there is enough +pages to satisfy all requests. Avoiding tasks getting SIGBUS on overcommited +systems is practically impossible with page fault accounting. + + +3. Caveats with shared memory + +For shared HugeTLB memory, both HugeTLB reservation and page faults are charged +to the first task that causes the memory to be reserved or faulted, and all +subsequent uses of this reserved or faulted memory is done without charging. + +Shared HugeTLB memory is only uncharged when it is unreserved or deallocated. +This is usually when the HugeTLB file is deleted, and not when the task that +caused the reservation or fault has exited. + + +4. Caveats with HugeTLB cgroup offline. + +When a HugeTLB cgroup goes offline with some reservations or faults still +charged to it, the behavior is as follows: + +- The fault charges are charged to the parent HugeTLB cgroup (reparented), +- the reservation charges remain on the offline HugeTLB cgroup. + +This means that if a HugeTLB cgroup gets offlined while there is still HugeTLB +reservations charged to it, that cgroup persists as a zombie until all HugeTLB +reservations are uncharged. HugeTLB reservations behave in this manner to match +the memory controller whose cgroups also persist as zombie until all charged +memory is uncharged. Also, the tracking of HugeTLB reservations is a bit more +complex compared to the tracking of HugeTLB faults, so it is significantly +harder to reparent reservations at offline time. diff --git a/Documentation/admin-guide/cgroup-v1/index.rst b/Documentation/admin-guide/cgroup-v1/index.rst index 10bf48bae0b045d54d9d3d5cc9ab4399a7b391f6..226f64473e8e215cdb29d36214a0665db1c8e738 100644 --- a/Documentation/admin-guide/cgroup-v1/index.rst +++ b/Documentation/admin-guide/cgroup-v1/index.rst @@ -1,3 +1,5 @@ +.. _cgroup-v1: + ======================== Control Groups version 1 ======================== diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 0636bcb60b5a3a644d8411ad68d1171b9fbc534c..bcc80269bb6ac6cc2492b128fd5e167254350737 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -9,7 +9,7 @@ This is the authoritative documentation on the design, interface and conventions of cgroup v2. It describes all userland-visible aspects of cgroup including core and specific controller behaviors. All future changes must be reflected in this document. Documentation for -v1 is available under Documentation/admin-guide/cgroup-v1/. +v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst `. .. CONTENTS @@ -61,6 +61,8 @@ v1 is available under Documentation/admin-guide/cgroup-v1/. 5-6. Device 5-7. RDMA 5-7-1. RDMA Interface Files + 5-8. HugeTLB + 5.8-1. HugeTLB Interface Files 5-8. Misc 5-8-1. perf_event 5-N. Non-normative information @@ -186,6 +188,17 @@ cgroup v2 currently supports the following mount options. modified through remount from the init namespace. The mount option is ignored on non-init namespace mounts. + memory_recursiveprot + + Recursively apply memory.min and memory.low protection to + entire subtrees, without requiring explicit downward + propagation into leaf cgroups. This allows protecting entire + subtrees from one another, while retaining free competition + within those subtrees. This should have been the default + behavior but is a mount-option to avoid regressing setups + relying on the original semantics (e.g. specifying bogusly + high 'bypass' protection values at higher tree levels). + Organizing Processes and Threads -------------------------------- @@ -1021,7 +1034,7 @@ All time durations are in microseconds. A read-only nested-key file which exists on non-root cgroups. Shows pressure stall information for CPU. See - Documentation/accounting/psi.rst for details. + :ref:`Documentation/accounting/psi.rst ` for details. cpu.uclamp.min A read-write single value file which exists on non-root cgroups. @@ -1101,7 +1114,7 @@ PAGE_SIZE multiple when read back. proportionally to the overage, reducing reclaim pressure for smaller overages. - Effective min boundary is limited by memory.min values of + Effective min boundary is limited by memory.min values of all ancestor cgroups. If there is memory.min overcommitment (child cgroup or cgroups are requiring more protected memory than parent will allow), then each child cgroup will get @@ -1311,53 +1324,41 @@ PAGE_SIZE multiple when read back. Number of major page faults incurred workingset_refault - Number of refaults of previously evicted pages workingset_activate - Number of refaulted pages that were immediately activated workingset_nodereclaim - Number of times a shadow node has been reclaimed pgrefill - Amount of scanned pages (in an active LRU list) pgscan - Amount of scanned pages (in an inactive LRU list) pgsteal - Amount of reclaimed pages pgactivate - Amount of pages moved to the active LRU list pgdeactivate - Amount of pages moved to the inactive LRU list pglazyfree - Amount of pages postponed to be freed under memory pressure pglazyfreed - Amount of reclaimed lazyfree pages thp_fault_alloc - Number of transparent hugepages which were allocated to satisfy a page fault, including COW faults. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE is not set. thp_collapse_alloc - Number of transparent hugepages which were allocated to allow collapsing an existing range of pages. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE is not set. @@ -1401,7 +1402,7 @@ PAGE_SIZE multiple when read back. A read-only nested-key file which exists on non-root cgroups. Shows pressure stall information for memory. See - Documentation/accounting/psi.rst for details. + :ref:`Documentation/accounting/psi.rst ` for details. Usage Guidelines @@ -1476,7 +1477,7 @@ IO Interface Files dios Number of discard IOs ====== ===================== - An example read output follows: + An example read output follows:: 8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353 dbytes=0 dios=0 8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 dbytes=50331648 dios=3021 @@ -1641,7 +1642,7 @@ IO Interface Files A read-only nested-key file which exists on non-root cgroups. Shows pressure stall information for IO. See - Documentation/accounting/psi.rst for details. + :ref:`Documentation/accounting/psi.rst ` for details. Writeback @@ -1851,7 +1852,7 @@ Cpuset Interface Files from the requested CPUs. The CPU numbers are comma-separated numbers or ranges. - For example: + For example:: # cat cpuset.cpus 0-4,6,8-10 @@ -1890,7 +1891,7 @@ Cpuset Interface Files from the requested memory nodes. The memory node numbers are comma-separated numbers or ranges. - For example: + For example:: # cat cpuset.mems 0-1,3 @@ -2056,6 +2057,33 @@ RDMA Interface Files mlx4_0 hca_handle=1 hca_object=20 ocrdma1 hca_handle=1 hca_object=23 +HugeTLB +------- + +The HugeTLB controller allows to limit the HugeTLB usage per control group and +enforces the controller limit during page fault. + +HugeTLB Interface Files +~~~~~~~~~~~~~~~~~~~~~~~ + + hugetlb..current + Show current usage for "hugepagesize" hugetlb. It exists for all + the cgroup except root. + + hugetlb..max + Set/show the hard limit of "hugepagesize" hugetlb usage. + The default value is "max". It exists for all the cgroup except root. + + hugetlb..events + A read-only flat-keyed file which exists on non-root cgroups. + + max + The number of allocation failure due to HugeTLB limit + + hugetlb..events.local + Similar to hugetlb..events but the fields in the file + are local to the cgroup i.e. not hierarchical. The file modified event + generated on this file reflects only the local events. Misc ---- diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index c00f9f11e3f3f6133d4b741244e034ae2a3c430e..8439d2ae689b4e2bc99f4110340c4cfef8f6d1e4 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -182,12 +182,15 @@ fix_padding space-efficient. If this option is not present, large padding is used - that is for compatibility with older kernels. - -The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can -be changed when reloading the target (load an inactive table and swap the -tables with suspend and resume). The other arguments should not be changed -when reloading the target because the layout of disk data depend on them -and the reloaded target would be non-functional. +allow_discards + Allow block discard requests (a.k.a. TRIM) for the integrity device. + Discards are only allowed to devices using internal hash. + +The journal mode (D/J), buffer_sectors, journal_watermark, commit_time and +allow_discards can be changed when reloading the target (load an inactive +table and swap the tables with suspend and resume). The other arguments +should not be changed when reloading the target because the layout of disk +data depend on them and the reloaded target would be non-functional. The layout of the formatted block device: diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 252e5ef324e564562d1d7d1826d9fe183497fd87..0dc2eb8e44e5fa56ad19cb1191f182090ee614f8 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -54,6 +54,9 @@ If you make a mistake with the syntax, the write will fail thus:: /dynamic_debug/control -bash: echo: write error: Invalid argument +Note, for systems without 'debugfs' enabled, the control file can be +found in ``/proc/dynamic_debug/control``. + Viewing Dynamic Debug Behaviour =============================== diff --git a/Documentation/driver-api/edid.rst b/Documentation/admin-guide/edid.rst similarity index 98% rename from Documentation/driver-api/edid.rst rename to Documentation/admin-guide/edid.rst index b1b5acd501ed4bf44858cd2f9c9629edf211f144..80deeb21a265fa8f4df01b71096613ddb6fcb87b 100644 --- a/Documentation/driver-api/edid.rst +++ b/Documentation/admin-guide/edid.rst @@ -11,11 +11,13 @@ Today, with the advent of Kernel Mode Setting, a graphics board is either correctly working because all components follow the standards - or the computer is unusable, because the screen remains dark after booting or it displays the wrong area. Cases when this happens are: + - The graphics board does not recognize the monitor. - The graphics board is unable to detect any EDID data. - The graphics board incorrectly forwards EDID data to the driver. - The monitor sends no or bogus EDID data. - A KVM sends its own EDID data instead of querying the connected monitor. + Adding the kernel parameter "nomodeset" helps in most cases, but causes restrictions later on. @@ -32,7 +34,7 @@ individual data for a specific misbehaving monitor, commented sources and a Makefile environment are given here. To create binary EDID and C source code files from the existing data -material, simply type "make". +material, simply type "make" in tools/edid/. If you want to create your own EDID file, copy the file 1024x768.S, replace the settings with your own data and add a new target to the diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst index 9bc93f0ce0c907fa2cb2b54a86bbb2c7ee417570..9443fcef18760389ba49a6a6d328dd8888f0f397 100644 --- a/Documentation/admin-guide/ext4.rst +++ b/Documentation/admin-guide/ext4.rst @@ -92,6 +92,8 @@ Currently Available * efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force the ordering) * Case-insensitive file name lookups +* file-based encryption support (fscrypt) +* file-based verity support (fsverity) [1] Filesystems with a block size of 1k may see a limit imposed by the directory hash tree having a maximum depth of two. diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 0795e3c2643f2946c3f1a58418bc6d28c376ec06..ca4dbdd9016d5a873b11381dbe75bdff5ee13038 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -14,3 +14,4 @@ are configurable at compile, boot or run time. mds tsx_async_abort multihit.rst + special-register-buffer-data-sampling.rst diff --git a/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst new file mode 100644 index 0000000000000000000000000000000000000000..47b1b3afac994beb278d4923e262a1da7bad5a23 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/special-register-buffer-data-sampling.rst @@ -0,0 +1,149 @@ +.. SPDX-License-Identifier: GPL-2.0 + +SRBDS - Special Register Buffer Data Sampling +============================================= + +SRBDS is a hardware vulnerability that allows MDS :doc:`mds` techniques to +infer values returned from special register accesses. Special register +accesses are accesses to off core registers. According to Intel's evaluation, +the special register reads that have a security expectation of privacy are +RDRAND, RDSEED and SGX EGETKEY. + +When RDRAND, RDSEED and EGETKEY instructions are used, the data is moved +to the core through the special register mechanism that is susceptible +to MDS attacks. + +Affected processors +-------------------- +Core models (desktop, mobile, Xeon-E3) that implement RDRAND and/or RDSEED may +be affected. + +A processor is affected by SRBDS if its Family_Model and stepping is +in the following list, with the exception of the listed processors +exporting MDS_NO while Intel TSX is available yet not enabled. The +latter class of processors are only affected when Intel TSX is enabled +by software using TSX_CTRL_MSR otherwise they are not affected. + + ============= ============ ======== + common name Family_Model Stepping + ============= ============ ======== + IvyBridge 06_3AH All + + Haswell 06_3CH All + Haswell_L 06_45H All + Haswell_G 06_46H All + + Broadwell_G 06_47H All + Broadwell 06_3DH All + + Skylake_L 06_4EH All + Skylake 06_5EH All + + Kabylake_L 06_8EH <= 0xC + Kabylake 06_9EH <= 0xD + ============= ============ ======== + +Related CVEs +------------ + +The following CVE entry is related to this SRBDS issue: + + ============== ===== ===================================== + CVE-2020-0543 SRBDS Special Register Buffer Data Sampling + ============== ===== ===================================== + +Attack scenarios +---------------- +An unprivileged user can extract values returned from RDRAND and RDSEED +executed on another core or sibling thread using MDS techniques. + + +Mitigation mechanism +------------------- +Intel will release microcode updates that modify the RDRAND, RDSEED, and +EGETKEY instructions to overwrite secret special register data in the shared +staging buffer before the secret data can be accessed by another logical +processor. + +During execution of the RDRAND, RDSEED, or EGETKEY instructions, off-core +accesses from other logical processors will be delayed until the special +register read is complete and the secret data in the shared staging buffer is +overwritten. + +This has three effects on performance: + +#. RDRAND, RDSEED, or EGETKEY instructions have higher latency. + +#. Executing RDRAND at the same time on multiple logical processors will be + serialized, resulting in an overall reduction in the maximum RDRAND + bandwidth. + +#. Executing RDRAND, RDSEED or EGETKEY will delay memory accesses from other + logical processors that miss their core caches, with an impact similar to + legacy locked cache-line-split accesses. + +The microcode updates provide an opt-out mechanism (RNGDS_MITG_DIS) to disable +the mitigation for RDRAND and RDSEED instructions executed outside of Intel +Software Guard Extensions (Intel SGX) enclaves. On logical processors that +disable the mitigation using this opt-out mechanism, RDRAND and RDSEED do not +take longer to execute and do not impact performance of sibling logical +processors memory accesses. The opt-out mechanism does not affect Intel SGX +enclaves (including execution of RDRAND or RDSEED inside an enclave, as well +as EGETKEY execution). + +IA32_MCU_OPT_CTRL MSR Definition +-------------------------------- +Along with the mitigation for this issue, Intel added a new thread-scope +IA32_MCU_OPT_CTRL MSR, (address 0x123). The presence of this MSR and +RNGDS_MITG_DIS (bit 0) is enumerated by CPUID.(EAX=07H,ECX=0).EDX[SRBDS_CTRL = +9]==1. This MSR is introduced through the microcode update. + +Setting IA32_MCU_OPT_CTRL[0] (RNGDS_MITG_DIS) to 1 for a logical processor +disables the mitigation for RDRAND and RDSEED executed outside of an Intel SGX +enclave on that logical processor. Opting out of the mitigation for a +particular logical processor does not affect the RDRAND and RDSEED mitigations +for other logical processors. + +Note that inside of an Intel SGX enclave, the mitigation is applied regardless +of the value of RNGDS_MITG_DS. + +Mitigation control on the kernel command line +--------------------------------------------- +The kernel command line allows control over the SRBDS mitigation at boot time +with the option "srbds=". The option for this is: + + ============= ============================================================= + off This option disables SRBDS mitigation for RDRAND and RDSEED on + affected platforms. + ============= ============================================================= + +SRBDS System Information +----------------------- +The Linux kernel provides vulnerability status information through sysfs. For +SRBDS this can be accessed by the following sysfs file: +/sys/devices/system/cpu/vulnerabilities/srbds + +The possible values contained in this file are: + + ============================== ============================================= + Not affected Processor not vulnerable + Vulnerable Processor vulnerable and mitigation disabled + Vulnerable: No microcode Processor vulnerable and microcode is missing + mitigation + Mitigation: Microcode Processor is vulnerable and mitigation is in + effect. + Mitigation: TSX disabled Processor is only vulnerable when TSX is + enabled while this system was booted with TSX + disabled. + Unknown: Dependent on + hypervisor status Running on virtual guest processor that is + affected but with no way to know if host + processor is mitigated or vulnerable. + ============================== ============================================= + +SRBDS Default mitigation +------------------------ +This new microcode serializes processor access during execution of RDRAND, +RDSEED ensures that the shared buffer is overwritten before it is released for +reuse. Use the "srbds=off" kernel command line to disable the mitigation for +RDRAND and RDSEED. diff --git a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst index af6865b822d21d18946016c1458a4dc3567050e9..68d96f0e9c9543c64e9b3a8ccaf658b63c9ea82e 100644 --- a/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst +++ b/Documentation/admin-guide/hw-vuln/tsx_async_abort.rst @@ -136,8 +136,6 @@ enables the mitigation by default. The mitigation can be controlled at boot time via a kernel command line option. See :ref:`taa_mitigation_control_command_line`. -.. _virt_mechanism: - Virtualization mitigation ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 4405b74853121c6fa3ab8e9b294aada80c3a72dd..5a6269fb85938e27f6dd1085debfb3ad66c63426 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -64,6 +64,7 @@ configure specific aspects of kernel behavior to your liking. binderfs binfmt-misc blockdev/index + bootconfig braille-console btmrvl cgroup-v1/index @@ -74,8 +75,10 @@ configure specific aspects of kernel behavior to your liking. cputopology dell_rbu device-mapper/index + edid efi-stub ext4 + nfs/index gpio/index highuid hw_random diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst index df5b8345c41df7d81f892d05fb27e97c24ed0f0e..9b14b0c2c9c453cff8add85cf792b6f6ff0777ed 100644 --- a/Documentation/admin-guide/iostats.rst +++ b/Documentation/admin-guide/iostats.rst @@ -100,7 +100,7 @@ Field 10 -- # of milliseconds spent doing I/Os (unsigned int) Since 5.0 this field counts jiffies when at least one request was started or completed. If request runs more than 2 jiffies then some - I/O time will not be accounted unless there are other requests. + I/O time might be not accounted in case of concurrent requests. Field 11 -- weighted # of milliseconds spent doing I/Os (unsigned int) This field is incremented at each I/O start, I/O completion, I/O @@ -143,6 +143,9 @@ are summed (possibly overflowing the unsigned long variable they are summed to) and the result given to the user. There is no convenient user interface for accessing the per-CPU counters themselves. +Since 4.19 request times are measured with nanoseconds precision and +truncated to milliseconds before showing in this interface. + Disks vs Partitions ------------------- diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 727a03fb26c999704b70fd2b53bc7136c9c2dc77..5e2ce88d6edad0d8ed9ec143301c696d4040525d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -22,11 +22,13 @@ default: 0 acpi_backlight= [HW,ACPI] - acpi_backlight=vendor - acpi_backlight=video - If set to vendor, prefer vendor specific driver + { vendor | video | native | none } + If set to vendor, prefer vendor-specific driver (e.g. thinkpad_acpi, sony_acpi, etc.) instead of the ACPI video.ko driver. + If set to video, use the ACPI video.ko driver. + If set to native, use the device's native backlight mode. + If set to none, disable the ACPI backlight interface. acpi_force_32bit_fadt_addr force FADT to use 32 bit addresses rather than the @@ -441,9 +443,18 @@ no delay (0). Format: integer + bootconfig [KNL] + Extended command line options can be added to an initrd + and this will cause the kernel to look for it. + + See Documentation/admin-guide/bootconfig.rst + bert_disable [ACPI] Disable BERT OS support on buggy BIOSes. + bgrt_disable [ACPI][X86] + Disable BGRT to avoid flickering OEM logo. + bttv.card= [HW,V4L] bttv (bt848 + bt878 based grabber cards) bttv.radio= Most important insmod options are available as kernel args too. @@ -515,7 +526,8 @@ 1 -- check protection requested by application. Default value is set via a kernel config option. Value can be changed at runtime via - /selinux/checkreqprot. + /sys/fs/selinux/checkreqprot. + Setting checkreqprot to 1 is deprecated. cio_ignore= [S390] See Documentation/s390/common_io.rst for details. @@ -673,7 +685,7 @@ coredump_filter= [KNL] Change the default value for /proc//coredump_filter. - See also Documentation/filesystems/proc.txt. + See also Documentation/filesystems/proc.rst. coresight_cpu_debug.enable [ARM,ARM64] @@ -838,6 +850,18 @@ dump out devices still on the deferred probe list after retrying. + dfltcc= [HW,S390] + Format: { on | off | def_only | inf_only | always } + on: s390 zlib hardware support for compression on + level 1 and decompression (default) + off: No s390 zlib hardware support + def_only: s390 zlib hardware support for deflate + only (compression on level 1) + inf_only: s390 zlib hardware support for inflate + only (decompression) + always: Same as 'on' but ignores the selected compression + level always using hardware support (used for debugging) + dhash_entries= [KNL] Set number of hash buckets for dentry cache. @@ -938,7 +962,7 @@ edid/1680x1050.bin, or edid/1920x1080.bin is given and no file with the same name exists. Details and instructions how to build your own EDID data are - available in Documentation/driver-api/edid.rst. An EDID + available in Documentation/admin-guide/edid.rst. An EDID data set will only be used for a particular connector, if its name and a colon are prepended to the EDID name. Each connector may use a unique EDID data @@ -968,10 +992,6 @@ Documentation/admin-guide/dynamic-debug-howto.rst for details. - nompx [X86] Disables Intel Memory Protection Extensions. - See Documentation/x86/intel_mpx.rst for more - information about the feature. - nopku [X86] Disable Memory Protection Keys CPU feature found in some Intel CPUs. @@ -1081,6 +1101,12 @@ A valid base address must be provided, and the serial port must already be setup and configured. + ec_imx21, + ec_imx6q, + Start an early, polled-mode, output-only console on the + Freescale i.MX UART at the specified address. The UART + must already be setup and configured. + ar3700_uart, Start an early, polled-mode console on the Armada 3700 serial port at the specified @@ -1169,10 +1195,10 @@ efi= [EFI] Format: { "old_map", "nochunk", "noruntime", "debug", - "nosoftreserve" } + "nosoftreserve", "disable_early_pci_dma", + "no_disable_early_pci_dma" } old_map [X86-64]: switch to the old ioremap-based EFI - runtime services mapping. 32-bit still uses this one by - default. + runtime services mapping. [Needs CONFIG_X86_UV=y] nochunk: disable reading files in "chunks" in the EFI boot stub, as chunking can cause problems with some firmware implementations. @@ -1184,6 +1210,10 @@ claim. Specify efi=nosoftreserve to disable this reservation and treat the memory by its base type (i.e. EFI_CONVENTIONAL_MEMORY / "System RAM"). + disable_early_pci_dma: Disable the busmaster bit on all + PCI bridges while in the EFI boot stub + no_disable_early_pci_dma: Leave the busmaster bit set + on all PCI bridges while in the EFI boot stub efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of @@ -1249,7 +1279,8 @@ 0 -- permissive (log only, no denials). 1 -- enforcing (deny and log). Default value is 0. - Value can be changed at runtime via /selinux/enforce. + Value can be changed at runtime via + /sys/fs/selinux/enforce. erst_disable [ACPI] Disable Error Record Serialization Table (ERST) @@ -1331,6 +1362,24 @@ can be changed at run time by the max_graph_depth file in the tracefs tracing directory. default: 0 (no limit) + fw_devlink= [KNL] Create device links between consumer and supplier + devices by scanning the firmware to infer the + consumer/supplier relationships. This feature is + especially useful when drivers are loaded as modules as + it ensures proper ordering of tasks like device probing + (suppliers first, then consumers), supplier boot state + clean up (only after all consumers have probed), + suspend/resume & runtime PM (consumers first, then + suppliers). + Format: { off | permissive | on | rpm } + off -- Don't create device links from firmware info. + permissive -- Create device links from firmware info + but use it only for ordering boot state clean + up (sync_state() calls). + on -- Create device links from firmware info and use it + to enforce probe and suspend/resume ordering. + rpm -- Like "on", but also use to order runtime PM. + gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad support via parallel port (up to 5 devices per port) @@ -1422,6 +1471,14 @@ hpet_mmap= [X86, HPET_MMAP] Allow userspace to mmap HPET registers. Default set by CONFIG_HPET_MMAP_DEFAULT. + hugetlb_cma= [HW] The size of a cma area used for allocation + of gigantic hugepages. + Format: nn[KMGTPE] + + Reserve a cma area of given size and allocate gigantic + hugepages using the cma allocator. If enabled, the + boot-time allocation of gigantic hugepages is skipped. + hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. On x86-64 and powerpc, this option can be specified @@ -1756,7 +1813,7 @@ provided by tboot because it makes the system vulnerable to DMA attacks. nobounce [Default off] - Disable bounce buffer for unstrusted devices such as + Disable bounce buffer for untrusted devices such as the Thunderbolt devices. This will treat the untrusted devices as the trusted ones, hence might expose security risks of DMA attacks. @@ -1860,7 +1917,7 @@ No delay ip= [IP_PNP] - See Documentation/filesystems/nfs/nfsroot.txt. + See Documentation/admin-guide/nfs/nfsroot.rst. ipcmni_extend [KNL] Extend the maximum number of unique System V IPC identifiers from 32,768 to 16,777,216. @@ -1937,9 +1994,31 @@ begins at 0 and the maximum value is "number of CPUs in system - 1". - The format of is described above. - + managed_irq + + Isolate from being targeted by managed interrupts + which have an interrupt mask containing isolated + CPUs. The affinity of managed interrupts is + handled by the kernel and cannot be changed via + the /proc/irq/* interfaces. + + This isolation is best effort and only effective + if the automatically assigned interrupt mask of a + device queue contains isolated and housekeeping + CPUs. If housekeeping CPUs are online then such + interrupts are directed to the housekeeping CPU + so that IO submitted on the housekeeping CPU + cannot disturb the isolated CPU. + + If a queue's affinity mask contains only isolated + CPUs then this parameter has no effect on the + interrupt routing decision, though interrupts are + only delivered when tasks running on those + isolated CPUs submit IO. IO submitted on + housekeeping CPUs has no influence on those + queues. + The format of is described above. iucv= [HW,NET] @@ -2498,13 +2577,22 @@ For details see: Documentation/admin-guide/hw-vuln/mds.rst mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory - Amount of memory to be used when the kernel is not able - to see the whole system memory or for test. + Amount of memory to be used in cases as follows: + + 1 for test; + 2 when the kernel is not able to see the whole system memory; + 3 memory that lies after 'mem=' boundary is excluded from + the hypervisor, then assigned to KVM guests. + [X86] Work as limiting max address. Use together with memmap= to avoid physical address space collisions. Without memmap= PCI devices could be placed at addresses belonging to unused RAM. + Note that this only takes effects during boot time since + in above case 3, memory may need be hot added after boot + if system memory of hypervisor is not sufficient. + mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel memory. @@ -2750,7 +2838,7 @@ ,[,,,,] mtdparts= [MTD] - See drivers/mtd/cmdlinepart.c. + See drivers/mtd/parsers/cmdlinepart.c multitce=off [PPC] This parameter disables the use of the pSeries firmware feature for updating multiple TCE entries @@ -2808,13 +2896,13 @@ Default value is 0. nfsaddrs= [NFS] Deprecated. Use ip= instead. - See Documentation/filesystems/nfs/nfsroot.txt. + See Documentation/admin-guide/nfs/nfsroot.rst. nfsroot= [NFS] nfs root filesystem for disk-less boxes. - See Documentation/filesystems/nfs/nfsroot.txt. + See Documentation/admin-guide/nfs/nfsroot.rst. nfsrootdebug [NFS] enable nfsroot debugging messages. - See Documentation/filesystems/nfs/nfsroot.txt. + See Documentation/admin-guide/nfs/nfsroot.rst. nfs.callback_nr_threads= [NFSv4] set the total number of threads that the @@ -3129,7 +3217,7 @@ [X86,PV_OPS] Disable paravirtualized VMware scheduler clock and use the default one. - no-steal-acc [X86,KVM,ARM64] Disable paravirtualized steal time + no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time accounting. steal time is computed, but won't influence scheduler behaviour @@ -3240,12 +3328,6 @@ This can be set from sysctl after boot. See Documentation/admin-guide/sysctl/vm.rst for details. - of_devlink [OF, KNL] Create device links between consumer and - supplier devices by scanning the devictree to infer the - consumer/supplier relationships. A consumer device - will not be probed until all the supplier devices have - probed successfully. - ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. See Documentation/debugging-via-ohci1394.txt for more info. @@ -3653,6 +3735,9 @@ Override pmtimer IOPort with a hex value. e.g. pmtmr=0x508 + pm_debug_messages [SUSPEND,KNL] + Enable suspend/resume debug messages during boot up. + pnp.debug=1 [PNP] Enable PNP debug messages (depends on the CONFIG_PNP_DEBUG_MESSAGES option). Change at run-time @@ -3754,6 +3839,11 @@ before loading. See Documentation/admin-guide/blockdev/ramdisk.rst. + prot_virt= [S390] enable hosting protected virtual machines + isolated from the hypervisor (if hardware supports + that). + Format: + psi= [KNL] Enable or disable pressure stall information tracking. Format: @@ -3939,6 +4029,15 @@ Set threshold of queued RCU callbacks below which batch limiting is re-enabled. + rcutree.qovld= [KNL] + Set threshold of queued RCU callbacks beyond which + RCU's force-quiescent-state scan will aggressively + enlist help from cond_resched() and sched IPIs to + help CPUs more quickly reach quiescent states. + Set to less than zero to make this be set based + on rcutree.qhimark at boot time and to zero to + disable more aggressive help enlistment. + rcutree.rcu_idle_gp_delay= [KNL] Set wakeup interval for idle CPUs that have RCU callbacks (RCU_FAST_NO_HZ=y). @@ -3982,6 +4081,19 @@ test until boot completes in order to avoid interference. + rcuperf.kfree_rcu_test= [KNL] + Set to measure performance of kfree_rcu() flooding. + + rcuperf.kfree_nthreads= [KNL] + The number of threads running loops of kfree_rcu(). + + rcuperf.kfree_alloc_num= [KNL] + Number of allocations and frees done in an iteration. + + rcuperf.kfree_loops= [KNL] + Number of loops doing rcuperf.kfree_alloc_num number + of allocations and frees. + rcuperf.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value @@ -4141,6 +4253,12 @@ rcupdate.rcu_cpu_stall_suppress= [KNL] Suppress RCU CPU stall warning messages. + rcupdate.rcu_cpu_stall_suppress_at_boot= [KNL] + Suppress RCU CPU stall warning messages and + rcutorture writer stall warnings that occur + during early boot, that is, during the time + before the init task is spawned. + rcupdate.rcu_cpu_stall_timeout= [KNL] Set timeout for RCU CPU stall warning messages. @@ -4334,6 +4452,22 @@ incurs a small amount of overhead in the scheduler but is useful for debugging and performance tuning. + sched_thermal_decay_shift= + [KNL, SMP] Set a decay shift for scheduler thermal + pressure signal. Thermal pressure signal follows the + default decay period of other scheduler pelt + signals(usually 32 ms but configurable). Setting + sched_thermal_decay_shift will left shift the decay + period for the thermal pressure signal by the shift + value. + i.e. with the default pelt decay period of 32 ms + sched_thermal_decay_shift thermal pressure decay pr + 1 64 ms + 2 128 ms + and so on. + Format: integer between 0 and 10 + Default is 0. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. @@ -4352,9 +4486,7 @@ See security/selinux/Kconfig help text. 0 -- disable. 1 -- enable. - Default value is set via kernel config option. - If enabled at boot time, /selinux/disable can be used - later to disable prior to initial policy load. + Default value is 1. apparmor= [APPARMOR] Disable or enable AppArmor at boot time Format: { "0" | "1" } @@ -4458,10 +4590,10 @@ Format: A nonzero value instructs the soft-lockup detector - to panic the machine when a soft-lockup occurs. This - is also controlled by CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC - which is the respective build-time switch to that - functionality. + to panic the machine when a soft-lockup occurs. It is + also controlled by the kernel.softlockup_panic sysctl + and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the + respective build-time switch to that functionality. softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate @@ -4603,6 +4735,48 @@ spia_pedr= spia_peddr= + split_lock_detect= + [X86] Enable split lock detection + + When enabled (and if hardware support is present), atomic + instructions that access data across cache line + boundaries will result in an alignment check exception. + + off - not enabled + + warn - the kernel will emit rate limited warnings + about applications triggering the #AC + exception. This mode is the default on CPUs + that supports split lock detection. + + fatal - the kernel will send SIGBUS to applications + that trigger the #AC exception. + + If an #AC exception is hit in the kernel or in + firmware (i.e. not while executing in user mode) + the kernel will oops in either "warn" or "fatal" + mode. + + srbds= [X86,INTEL] + Control the Special Register Buffer Data Sampling + (SRBDS) mitigation. + + Certain CPUs are vulnerable to an MDS-like + exploit which can leak bits from the random + number generator. + + By default, this issue is mitigated by + microcode. However, the microcode fix can cause + the RDRAND and RDSEED instructions to become + much slower. Among other effects, this will + result in reduced throughput from /dev/urandom. + + The microcode mitigation can be disabled with + the following option: + + off: Disable mitigation and remove + performance impact to RDRAND and RDSEED + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the @@ -4815,6 +4989,10 @@ topology updates sent by the hypervisor to this LPAR. + torture.disable_onoff_at_boot= [KNL] + Prevent the CPU-hotplug component of torturing + until after init has spawned. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] @@ -5029,8 +5207,7 @@ usbcore.old_scheme_first= [USB] Start with the old device initialization - scheme, applies only to low and full-speed devices - (default 0 = off). + scheme (default 0 = off). usbcore.usbfs_memory_mb= [USB] Memory limit (in MB) for buffers allocated by diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst index baeeba8762ae4325be2eec70af80f36b50a56b45..21818aca470868995fc4599e5ba73cac52955d51 100644 --- a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst +++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst @@ -234,7 +234,7 @@ To reduce its OS jitter, do any of the following: Such a workqueue can be confined to a given subset of the CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs files. The set of WQ_SYSFS workqueues can be displayed using - "ls sys/devices/virtual/workqueue". That said, the workqueues + "ls /sys/devices/virtual/workqueue". That said, the workqueues maintainer would like to caution people against indiscriminately sprinkling WQ_SYSFS across all the workqueues. The reason for caution is that it is easy to add WQ_SYSFS, but because sysfs is diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index bd5714547cee0cbb06edd31d59c0d4f12476ad33..2f31de8f7c749ec70a8e6e33fc7b779276410e26 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -310,6 +310,11 @@ thp_fault_fallback is incremented if a page fault fails to allocate a huge page and instead falls back to using small pages. +thp_fault_fallback_charge + is incremented if a page fault fails to charge a huge page and + instead falls back to using small pages even though the + allocation was successful. + thp_collapse_alloc_failed is incremented if khugepaged found a range of pages that should be collapsed into one huge page but failed @@ -319,6 +324,15 @@ thp_file_alloc is incremented every time a file huge page is successfully allocated. +thp_file_fallback + is incremented if a file huge page is attempted to be allocated + but fails and instead falls back to using small pages. + +thp_file_fallback_charge + is incremented if a file huge page cannot be charged and instead + falls back to using small pages even though the allocation was + successful. + thp_file_mapped is incremented every time a file huge page is mapped into user address space. diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 5048cf661a8ae72e9d04a066d574922f92facf22..c30176e67900459e3d3ce4c634606a0642f17850 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -108,6 +108,57 @@ UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an half copied page since it'll keep userfaulting until the copy has finished. +Notes: + +- If you requested UFFDIO_REGISTER_MODE_MISSING when registering then + you must provide some kind of page in your thread after reading from + the uffd. You must provide either UFFDIO_COPY or UFFDIO_ZEROPAGE. + The normal behavior of the OS automatically providing a zero page on + an annonymous mmaping is not in place. + +- None of the page-delivering ioctls default to the range that you + registered with. You must fill in all fields for the appropriate + ioctl struct including the range. + +- You get the address of the access that triggered the missing page + event out of a struct uffd_msg that you read in the thread from the + uffd. You can supply as many pages as you want with UFFDIO_COPY or + UFFDIO_ZEROPAGE. Keep in mind that unless you used DONTWAKE then + the first of any of those IOCTLs wakes up the faulting thread. + +- Be sure to test for all errors including (pollfd[0].revents & + POLLERR). This can happen, e.g. when ranges supplied were + incorrect. + +Write Protect Notifications +--------------------------- + +This is equivalent to (but faster than) using mprotect and a SIGSEGV +signal handler. + +Firstly you need to register a range with UFFDIO_REGISTER_MODE_WP. +Instead of using mprotect(2) you use ioctl(uffd, UFFDIO_WRITEPROTECT, +struct *uffdio_writeprotect) while mode = UFFDIO_WRITEPROTECT_MODE_WP +in the struct passed in. The range does not default to and does not +have to be identical to the range you registered with. You can write +protect as many ranges as you like (inside the registered range). +Then, in the thread reading from uffd the struct will have +msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP set. Now you send +ioctl(uffd, UFFDIO_WRITEPROTECT, struct *uffdio_writeprotect) again +while pagefault.mode does not have UFFDIO_WRITEPROTECT_MODE_WP set. +This wakes up the thread which will continue to run with writes. This +allows you to do the bookkeeping about the write in the uffd reading +thread before the ioctl. + +If you registered with both UFFDIO_REGISTER_MODE_MISSING and +UFFDIO_REGISTER_MODE_WP then you need to think about the sequence in +which you supply a page and undo write protect. Note that there is a +difference between writes into a WP area and into a !WP area. The +former will have UFFD_PAGEFAULT_FLAG_WP set, the latter +UFFD_PAGEFAULT_FLAG_WRITE. The latter did not fail on protection but +you still need to supply a page when UFFDIO_REGISTER_MODE_MISSING was +used. + QEMU/KVM ======== diff --git a/Documentation/admin-guide/nfs/fault_injection.rst b/Documentation/admin-guide/nfs/fault_injection.rst new file mode 100644 index 0000000000000000000000000000000000000000..eb029c0c15ce5022baf5b198d42d776639094b2c --- /dev/null +++ b/Documentation/admin-guide/nfs/fault_injection.rst @@ -0,0 +1,70 @@ +=================== +NFS Fault Injection +=================== + +Fault injection is a method for forcing errors that may not normally occur, or +may be difficult to reproduce. Forcing these errors in a controlled environment +can help the developer find and fix bugs before their code is shipped in a +production system. Injecting an error on the Linux NFS server will allow us to +observe how the client reacts and if it manages to recover its state correctly. + +NFSD_FAULT_INJECTION must be selected when configuring the kernel to use this +feature. + + +Using Fault Injection +===================== +On the client, mount the fault injection server through NFS v4.0+ and do some +work over NFS (open files, take locks, ...). + +On the server, mount the debugfs filesystem to and ls +/nfsd. This will show a list of files that will be used for +injecting faults on the NFS server. As root, write a number n to the file +corresponding to the action you want the server to take. The server will then +process the first n items it finds. So if you want to forget 5 locks, echo '5' +to /nfsd/forget_locks. A value of 0 will tell the server to forget +all corresponding items. A log message will be created containing the number +of items forgotten (check dmesg). + +Go back to work on the client and check if the client recovered from the error +correctly. + + +Available Faults +================ +forget_clients: + The NFS server keeps a list of clients that have placed a mount call. If + this list is cleared, the server will have no knowledge of who the client + is, forcing the client to reauthenticate with the server. + +forget_openowners: + The NFS server keeps a list of what files are currently opened and who + they were opened by. Clearing this list will force the client to reopen + its files. + +forget_locks: + The NFS server keeps a list of what files are currently locked in the VFS. + Clearing this list will force the client to reclaim its locks (files are + unlocked through the VFS as they are cleared from this list). + +forget_delegations: + A delegation is used to assure the client that a file, or part of a file, + has not changed since the delegation was awarded. Clearing this list will + force the client to reacquire its delegation before accessing the file + again. + +recall_delegations: + Delegations can be recalled by the server when another client attempts to + access a file. This test will notify the client that its delegation has + been revoked, forcing the client to reacquire the delegation before using + the file again. + + +tools/nfs/inject_faults.sh script +================================= +This script has been created to ease the fault injection process. This script +will detect the mounted debugfs directory and write to the files located there +based on the arguments passed by the user. For example, running +`inject_faults.sh forget_locks 1` as root will instruct the server to forget +one lock. Running `inject_faults forget_locks` will instruct the server to +forgetall locks. diff --git a/Documentation/admin-guide/nfs/index.rst b/Documentation/admin-guide/nfs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..6b5a3c90fac5622a9519bbce35a0ea2fc932f5a6 --- /dev/null +++ b/Documentation/admin-guide/nfs/index.rst @@ -0,0 +1,15 @@ +============= +NFS +============= + +.. toctree:: + :maxdepth: 1 + + nfs-client + nfsroot + nfs-rdma + nfsd-admin-interfaces + nfs-idmapper + pnfs-block-server + pnfs-scsi-server + fault_injection diff --git a/Documentation/admin-guide/nfs/nfs-client.rst b/Documentation/admin-guide/nfs/nfs-client.rst new file mode 100644 index 0000000000000000000000000000000000000000..c4b777c7584b499f236be315e894f5a693c7262f --- /dev/null +++ b/Documentation/admin-guide/nfs/nfs-client.rst @@ -0,0 +1,141 @@ +========== +NFS Client +========== + +The NFS client +============== + +The NFS version 2 protocol was first documented in RFC1094 (March 1989). +Since then two more major releases of NFS have been published, with NFSv3 +being documented in RFC1813 (June 1995), and NFSv4 in RFC3530 (April +2003). + +The Linux NFS client currently supports all the above published versions, +and work is in progress on adding support for minor version 1 of the NFSv4 +protocol. + +The purpose of this document is to provide information on some of the +special features of the NFS client that can be configured by system +administrators. + + +The nfs4_unique_id parameter +============================ + +NFSv4 requires clients to identify themselves to servers with a unique +string. File open and lock state shared between one client and one server +is associated with this identity. To support robust NFSv4 state recovery +and transparent state migration, this identity string must not change +across client reboots. + +Without any other intervention, the Linux client uses a string that contains +the local system's node name. System administrators, however, often do not +take care to ensure that node names are fully qualified and do not change +over the lifetime of a client system. Node names can have other +administrative requirements that require particular behavior that does not +work well as part of an nfs_client_id4 string. + +The nfs.nfs4_unique_id boot parameter specifies a unique string that can be +used instead of a system's node name when an NFS client identifies itself to +a server. Thus, if the system's node name is not unique, or it changes, its +nfs.nfs4_unique_id stays the same, preventing collision with other clients +or loss of state during NFS reboot recovery or transparent state migration. + +The nfs.nfs4_unique_id string is typically a UUID, though it can contain +anything that is believed to be unique across all NFS clients. An +nfs4_unique_id string should be chosen when a client system is installed, +just as a system's root file system gets a fresh UUID in its label at +install time. + +The string should remain fixed for the lifetime of the client. It can be +changed safely if care is taken that the client shuts down cleanly and all +outstanding NFSv4 state has expired, to prevent loss of NFSv4 state. + +This string can be stored in an NFS client's grub.conf, or it can be provided +via a net boot facility such as PXE. It may also be specified as an nfs.ko +module parameter. Specifying a uniquifier string is not support for NFS +clients running in containers. + + +The DNS resolver +================ + +NFSv4 allows for one server to refer the NFS client to data that has been +migrated onto another server by means of the special "fs_locations" +attribute. See `RFC3530 Section 6: Filesystem Migration and Replication`_ and +`Implementation Guide for Referrals in NFSv4`_. + +.. _RFC3530 Section 6\: Filesystem Migration and Replication: http://tools.ietf.org/html/rfc3530#section-6 +.. _Implementation Guide for Referrals in NFSv4: http://tools.ietf.org/html/draft-ietf-nfsv4-referrals-00 + +The fs_locations information can take the form of either an ip address and +a path, or a DNS hostname and a path. The latter requires the NFS client to +do a DNS lookup in order to mount the new volume, and hence the need for an +upcall to allow userland to provide this service. + +Assuming that the user has the 'rpc_pipefs' filesystem mounted in the usual +/var/lib/nfs/rpc_pipefs, the upcall consists of the following steps: + + (1) The process checks the dns_resolve cache to see if it contains a + valid entry. If so, it returns that entry and exits. + + (2) If no valid entry exists, the helper script '/sbin/nfs_cache_getent' + (may be changed using the 'nfs.cache_getent' kernel boot parameter) + is run, with two arguments: + - the cache name, "dns_resolve" + - the hostname to resolve + + (3) After looking up the corresponding ip address, the helper script + writes the result into the rpc_pipefs pseudo-file + '/var/lib/nfs/rpc_pipefs/cache/dns_resolve/channel' + in the following (text) format: + + " \n" + + Where is in the usual IPv4 (123.456.78.90) or IPv6 + (ffee:ddcc:bbaa:9988:7766:5544:3322:1100, ffee::1100, ...) format. + is identical to the second argument of the helper + script, and is the 'time to live' of this cache entry (in + units of seconds). + + .. note:: + If is invalid, say the string "0", then a negative + entry is created, which will cause the kernel to treat the hostname + as having no valid DNS translation. + + + + +A basic sample /sbin/nfs_cache_getent +===================================== +.. code-block:: sh + + #!/bin/bash + # + ttl=600 + # + cut=/usr/bin/cut + getent=/usr/bin/getent + rpc_pipefs=/var/lib/nfs/rpc_pipefs + # + die() + { + echo "Usage: $0 cache_name entry_name" + exit 1 + } + + [ $# -lt 2 ] && die + cachename="$1" + cache_path=${rpc_pipefs}/cache/${cachename}/channel + + case "${cachename}" in + dns_resolve) + name="$2" + result="$(${getent} hosts ${name} | ${cut} -f1 -d\ )" + [ -z "${result}" ] && result="0" + ;; + *) + die + ;; + esac + echo "${result} ${name} ${ttl}" >${cache_path} diff --git a/Documentation/admin-guide/nfs/nfs-idmapper.rst b/Documentation/admin-guide/nfs/nfs-idmapper.rst new file mode 100644 index 0000000000000000000000000000000000000000..58b8e63412d5272856c8cdf0a1efcf5f70a11fdc --- /dev/null +++ b/Documentation/admin-guide/nfs/nfs-idmapper.rst @@ -0,0 +1,78 @@ +============= +NFS ID Mapper +============= + +Id mapper is used by NFS to translate user and group ids into names, and to +translate user and group names into ids. Part of this translation involves +performing an upcall to userspace to request the information. There are two +ways NFS could obtain this information: placing a call to /sbin/request-key +or by placing a call to the rpc.idmap daemon. + +NFS will attempt to call /sbin/request-key first. If this succeeds, the +result will be cached using the generic request-key cache. This call should +only fail if /etc/request-key.conf is not configured for the id_resolver key +type, see the "Configuring" section below if you wish to use the request-key +method. + +If the call to /sbin/request-key fails (if /etc/request-key.conf is not +configured with the id_resolver key type), then the idmapper will ask the +legacy rpc.idmap daemon for the id mapping. This result will be stored +in a custom NFS idmap cache. + + +Configuring +=========== + +The file /etc/request-key.conf will need to be modified so /sbin/request-key can +direct the upcall. The following line should be added: + +``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...`` +``#====== ======= =============== =============== ===============================`` +``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600`` + + +This will direct all id_resolver requests to the program /usr/sbin/nfs.idmap. +The last parameter, 600, defines how many seconds into the future the key will +expire. This parameter is optional for /usr/sbin/nfs.idmap. When the timeout +is not specified, nfs.idmap will default to 600 seconds. + +id mapper uses for key descriptions:: + + uid: Find the UID for the given user + gid: Find the GID for the given group + user: Find the user name for the given UID + group: Find the group name for the given GID + +You can handle any of these individually, rather than using the generic upcall +program. If you would like to use your own program for a uid lookup then you +would edit your request-key.conf so it look similar to this: + +``#OP TYPE DESCRIPTION CALLOUT INFO PROGRAM ARG1 ARG2 ARG3 ...`` +``#====== ======= =============== =============== ===============================`` +``create id_resolver uid:* * /some/other/program %k %d 600`` +``create id_resolver * * /usr/sbin/nfs.idmap %k %d 600`` + + +Notice that the new line was added above the line for the generic program. +request-key will find the first matching line and corresponding program. In +this case, /some/other/program will handle all uid lookups and +/usr/sbin/nfs.idmap will handle gid, user, and group lookups. + +See Documentation/security/keys/request-key.rst for more information +about the request-key function. + + +nfs.idmap +========= + +nfs.idmap is designed to be called by request-key, and should not be run "by +hand". This program takes two arguments, a serialized key and a key +description. The serialized key is first converted into a key_serial_t, and +then passed as an argument to keyctl_instantiate (both are part of keyutils.h). + +The actual lookups are performed by functions found in nfsidmap.h. nfs.idmap +determines the correct function to call by looking at the first part of the +description string. For example, a uid lookup description will appear as +"uid:user@domain". + +nfs.idmap will return 0 if the key was instantiated, and non-zero otherwise. diff --git a/Documentation/admin-guide/nfs/nfs-rdma.rst b/Documentation/admin-guide/nfs/nfs-rdma.rst new file mode 100644 index 0000000000000000000000000000000000000000..ef0f3678b1fb8f2d6f882eef20901619c71c0368 --- /dev/null +++ b/Documentation/admin-guide/nfs/nfs-rdma.rst @@ -0,0 +1,292 @@ +=================== +Setting up NFS/RDMA +=================== + +:Author: + NetApp and Open Grid Computing (May 29, 2008) + +.. warning:: + This document is probably obsolete. + +Overview +======== + +This document describes how to install and setup the Linux NFS/RDMA client +and server software. + +The NFS/RDMA client was first included in Linux 2.6.24. The NFS/RDMA server +was first included in the following release, Linux 2.6.25. + +In our testing, we have obtained excellent performance results (full 10Gbit +wire bandwidth at minimal client CPU) under many workloads. The code passes +the full Connectathon test suite and operates over both Infiniband and iWARP +RDMA adapters. + +Getting Help +============ + +If you get stuck, you can ask questions on the +nfs-rdma-devel@lists.sourceforge.net mailing list. + +Installation +============ + +These instructions are a step by step guide to building a machine for +use with NFS/RDMA. + +- Install an RDMA device + + Any device supported by the drivers in drivers/infiniband/hw is acceptable. + + Testing has been performed using several Mellanox-based IB cards, the + Ammasso AMS1100 iWARP adapter, and the Chelsio cxgb3 iWARP adapter. + +- Install a Linux distribution and tools + + The first kernel release to contain both the NFS/RDMA client and server was + Linux 2.6.25 Therefore, a distribution compatible with this and subsequent + Linux kernel release should be installed. + + The procedures described in this document have been tested with + distributions from Red Hat's Fedora Project (http://fedora.redhat.com/). + +- Install nfs-utils-1.1.2 or greater on the client + + An NFS/RDMA mount point can be obtained by using the mount.nfs command in + nfs-utils-1.1.2 or greater (nfs-utils-1.1.1 was the first nfs-utils + version with support for NFS/RDMA mounts, but for various reasons we + recommend using nfs-utils-1.1.2 or greater). To see which version of + mount.nfs you are using, type: + + .. code-block:: sh + + $ /sbin/mount.nfs -V + + If the version is less than 1.1.2 or the command does not exist, + you should install the latest version of nfs-utils. + + Download the latest package from: http://www.kernel.org/pub/linux/utils/nfs + + Uncompress the package and follow the installation instructions. + + If you will not need the idmapper and gssd executables (you do not need + these to create an NFS/RDMA enabled mount command), the installation + process can be simplified by disabling these features when running + configure: + + .. code-block:: sh + + $ ./configure --disable-gss --disable-nfsv4 + + To build nfs-utils you will need the tcp_wrappers package installed. For + more information on this see the package's README and INSTALL files. + + After building the nfs-utils package, there will be a mount.nfs binary in + the utils/mount directory. This binary can be used to initiate NFS v2, v3, + or v4 mounts. To initiate a v4 mount, the binary must be called + mount.nfs4. The standard technique is to create a symlink called + mount.nfs4 to mount.nfs. + + This mount.nfs binary should be installed at /sbin/mount.nfs as follows: + + .. code-block:: sh + + $ sudo cp utils/mount/mount.nfs /sbin/mount.nfs + + In this location, mount.nfs will be invoked automatically for NFS mounts + by the system mount command. + + .. note:: + mount.nfs and therefore nfs-utils-1.1.2 or greater is only needed + on the NFS client machine. You do not need this specific version of + nfs-utils on the server. Furthermore, only the mount.nfs command from + nfs-utils-1.1.2 is needed on the client. + +- Install a Linux kernel with NFS/RDMA + + The NFS/RDMA client and server are both included in the mainline Linux + kernel version 2.6.25 and later. This and other versions of the Linux + kernel can be found at: https://www.kernel.org/pub/linux/kernel/ + + Download the sources and place them in an appropriate location. + +- Configure the RDMA stack + + Make sure your kernel configuration has RDMA support enabled. Under + Device Drivers -> InfiniBand support, update the kernel configuration + to enable InfiniBand support [NOTE: the option name is misleading. Enabling + InfiniBand support is required for all RDMA devices (IB, iWARP, etc.)]. + + Enable the appropriate IB HCA support (mlx4, mthca, ehca, ipath, etc.) or + iWARP adapter support (amso, cxgb3, etc.). + + If you are using InfiniBand, be sure to enable IP-over-InfiniBand support. + +- Configure the NFS client and server + + Your kernel configuration must also have NFS file system support and/or + NFS server support enabled. These and other NFS related configuration + options can be found under File Systems -> Network File Systems. + +- Build, install, reboot + + The NFS/RDMA code will be enabled automatically if NFS and RDMA + are turned on. The NFS/RDMA client and server are configured via the hidden + SUNRPC_XPRT_RDMA config option that depends on SUNRPC and INFINIBAND. The + value of SUNRPC_XPRT_RDMA will be: + + #. N if either SUNRPC or INFINIBAND are N, in this case the NFS/RDMA client + and server will not be built + + #. M if both SUNRPC and INFINIBAND are on (M or Y) and at least one is M, + in this case the NFS/RDMA client and server will be built as modules + + #. Y if both SUNRPC and INFINIBAND are Y, in this case the NFS/RDMA client + and server will be built into the kernel + + Therefore, if you have followed the steps above and turned no NFS and RDMA, + the NFS/RDMA client and server will be built. + + Build a new kernel, install it, boot it. + +Check RDMA and NFS Setup +======================== + +Before configuring the NFS/RDMA software, it is a good idea to test +your new kernel to ensure that the kernel is working correctly. +In particular, it is a good idea to verify that the RDMA stack +is functioning as expected and standard NFS over TCP/IP and/or UDP/IP +is working properly. + +- Check RDMA Setup + + If you built the RDMA components as modules, load them at + this time. For example, if you are using a Mellanox Tavor/Sinai/Arbel + card: + + .. code-block:: sh + + $ modprobe ib_mthca + $ modprobe ib_ipoib + + If you are using InfiniBand, make sure there is a Subnet Manager (SM) + running on the network. If your IB switch has an embedded SM, you can + use it. Otherwise, you will need to run an SM, such as OpenSM, on one + of your end nodes. + + If an SM is running on your network, you should see the following: + + .. code-block:: sh + + $ cat /sys/class/infiniband/driverX/ports/1/state + 4: ACTIVE + + where driverX is mthca0, ipath5, ehca3, etc. + + To further test the InfiniBand software stack, use IPoIB (this + assumes you have two IB hosts named host1 and host2): + + .. code-block:: sh + + host1$ ip link set dev ib0 up + host1$ ip address add dev ib0 a.b.c.x + host2$ ip link set dev ib0 up + host2$ ip address add dev ib0 a.b.c.y + host1$ ping a.b.c.y + host2$ ping a.b.c.x + + For other device types, follow the appropriate procedures. + +- Check NFS Setup + + For the NFS components enabled above (client and/or server), + test their functionality over standard Ethernet using TCP/IP or UDP/IP. + +NFS/RDMA Setup +============== + +We recommend that you use two machines, one to act as the client and +one to act as the server. + +One time configuration: +----------------------- + +- On the server system, configure the /etc/exports file and start the NFS/RDMA server. + + Exports entries with the following formats have been tested:: + + /vol0 192.168.0.47(fsid=0,rw,async,insecure,no_root_squash) + /vol0 192.168.0.0/255.255.255.0(fsid=0,rw,async,insecure,no_root_squash) + + The IP address(es) is(are) the client's IPoIB address for an InfiniBand + HCA or the client's iWARP address(es) for an RNIC. + + .. note:: + The "insecure" option must be used because the NFS/RDMA client does + not use a reserved port. + +Each time a machine boots: +-------------------------- + +- Load and configure the RDMA drivers + + For InfiniBand using a Mellanox adapter: + + .. code-block:: sh + + $ modprobe ib_mthca + $ modprobe ib_ipoib + $ ip li set dev ib0 up + $ ip addr add dev ib0 a.b.c.d + + .. note:: + Please use unique addresses for the client and server! + +- Start the NFS server + + If the NFS/RDMA server was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in + kernel config), load the RDMA transport module: + + .. code-block:: sh + + $ modprobe svcrdma + + Regardless of how the server was built (module or built-in), start the + server: + + .. code-block:: sh + + $ /etc/init.d/nfs start + + or + + .. code-block:: sh + + $ service nfs start + + Instruct the server to listen on the RDMA transport: + + .. code-block:: sh + + $ echo rdma 20049 > /proc/fs/nfsd/portlist + +- On the client system + + If the NFS/RDMA client was built as a module (CONFIG_SUNRPC_XPRT_RDMA=m in + kernel config), load the RDMA client module: + + .. code-block:: sh + + $ modprobe xprtrdma.ko + + Regardless of how the client was built (module or built-in), use this + command to mount the NFS/RDMA server: + + .. code-block:: sh + + $ mount -o rdma,port=20049 :/ /mnt + + To verify that the mount is using RDMA, run "cat /proc/mounts" and check + the "proto" field for the given mount. + + Congratulations! You're using NFS/RDMA! diff --git a/Documentation/admin-guide/nfs/nfsd-admin-interfaces.rst b/Documentation/admin-guide/nfs/nfsd-admin-interfaces.rst new file mode 100644 index 0000000000000000000000000000000000000000..c05926f79054b1e43a51f03adf927a27dd4e0d91 --- /dev/null +++ b/Documentation/admin-guide/nfs/nfsd-admin-interfaces.rst @@ -0,0 +1,40 @@ +================================== +Administrative interfaces for nfsd +================================== + +Note that normally these interfaces are used only by the utilities in +nfs-utils. + +nfsd is controlled mainly by pseudofiles under the "nfsd" filesystem, +which is normally mounted at /proc/fs/nfsd/. + +The server is always started by the first write of a nonzero value to +nfsd/threads. + +Before doing that, NFSD can be told which sockets to listen on by +writing to nfsd/portlist; that write may be: + + - an ascii-encoded file descriptor, which should refer to a + bound (and listening, for tcp) socket, or + - "transportname port", where transportname is currently either + "udp", "tcp", or "rdma". + +If nfsd is started without doing any of these, then it will create one +udp and one tcp listener at port 2049 (see nfsd_init_socks). + +On startup, nfsd and lockd grace periods start. nfsd is shut down by a write of +0 to nfsd/threads. All locks and state are thrown away at that point. + +Between startup and shutdown, the number of threads may be adjusted up +or down by additional writes to nfsd/threads or by writes to +nfsd/pool_threads. + +For more detail about files under nfsd/ and what they control, see +fs/nfsd/nfsctl.c; most of them have detailed comments. + +Implementation notes +==================== + +Note that the rpc server requires the caller to serialize addition and +removal of listening sockets, and startup and shutdown of the server. +For nfsd this is done using nfsd_mutex. diff --git a/Documentation/admin-guide/nfs/nfsroot.rst b/Documentation/admin-guide/nfs/nfsroot.rst new file mode 100644 index 0000000000000000000000000000000000000000..82a4fda057f986012b76424623869fd854b72d59 --- /dev/null +++ b/Documentation/admin-guide/nfs/nfsroot.rst @@ -0,0 +1,364 @@ +=============================================== +Mounting the root filesystem via NFS (nfsroot) +=============================================== + +:Authors: + Written 1996 by Gero Kuhlmann + + Updated 1997 by Martin Mares + + Updated 2006 by Nico Schottelius + + Updated 2006 by Horms + + Updated 2018 by Chris Novakovic + + + +In order to use a diskless system, such as an X-terminal or printer server for +example, it is necessary for the root filesystem to be present on a non-disk +device. This may be an initramfs (see +Documentation/filesystems/ramfs-rootfs-initramfs.txt), a ramdisk (see +Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The +following text describes on how to use NFS for the root filesystem. For the rest +of this text 'client' means the diskless system, and 'server' means the NFS +server. + + + + +Enabling nfsroot capabilities +============================= + +In order to use nfsroot, NFS client support needs to be selected as +built-in during configuration. Once this has been selected, the nfsroot +option will become available, which should also be selected. + +In the networking options, kernel level autoconfiguration can be selected, +along with the types of autoconfiguration to support. Selecting all of +DHCP, BOOTP and RARP is safe. + + + + +Kernel command line +=================== + +When the kernel has been loaded by a boot loader (see below) it needs to be +told what root fs device to use. And in the case of nfsroot, where to find +both the server and the name of the directory on the server to mount as root. +This can be established using the following kernel command line parameters: + + +root=/dev/nfs + This is necessary to enable the pseudo-NFS-device. Note that it's not a + real device but just a synonym to tell the kernel to use NFS instead of + a real device. + + +nfsroot=[:][,] + If the `nfsroot' parameter is NOT given on the command line, + the default ``"/tftpboot/%s"`` will be used. + + Specifies the IP address of the NFS server. + The default address is determined by the ip parameter + (see below). This parameter allows the use of different + servers for IP autoconfiguration and NFS. + + Name of the directory on the server to mount as root. + If there is a "%s" token in the string, it will be + replaced by the ASCII-representation of the client's + IP address. + + Standard NFS options. All options are separated by commas. + The following defaults are used:: + + port = as given by server portmap daemon + rsize = 4096 + wsize = 4096 + timeo = 7 + retrans = 3 + acregmin = 3 + acregmax = 60 + acdirmin = 30 + acdirmax = 60 + flags = hard, nointr, noposix, cto, ac + + +ip=::::::::: + This parameter tells the kernel how to configure IP addresses of devices + and also how to set up the IP routing table. It was originally called + nfsaddrs, but now the boot-time IP configuration works independently of + NFS, so it was renamed to ip and the old name remained as an alias for + compatibility reasons. + + If this parameter is missing from the kernel command line, all fields are + assumed to be empty, and the defaults mentioned below apply. In general + this means that the kernel tries to configure everything using + autoconfiguration. + + The parameter can appear alone as the value to the ip + parameter (without all the ':' characters before). If the value is + "ip=off" or "ip=none", no autoconfiguration will take place, otherwise + autoconfiguration will take place. The most common way to use this + is "ip=dhcp". + + IP address of the client. + Default: Determined using autoconfiguration. + + IP address of the NFS server. + If RARP is used to determine + the client address and this parameter is NOT empty only + replies from the specified server are accepted. + + Only required for NFS root. That is autoconfiguration + will not be triggered if it is missing and NFS root is not + in operation. + + Value is exported to /proc/net/pnp with the prefix "bootserver " + (see below). + + Default: Determined using autoconfiguration. + The address of the autoconfiguration server is used. + + IP address of a gateway if the server is on a different subnet. + Default: Determined using autoconfiguration. + + Netmask for local network interface. + If unspecified the netmask is derived from the client IP address + assuming classful addressing. + + Default: Determined using autoconfiguration. + + Name of the client. + If a '.' character is present, anything + before the first '.' is used as the client's hostname, and anything + after it is used as its NIS domain name. May be supplied by + autoconfiguration, but its absence will not trigger autoconfiguration. + If specified and DHCP is used, the user-provided hostname (and NIS + domain name, if present) will be carried in the DHCP request; this + may cause a DNS record to be created or updated for the client. + + Default: Client IP address is used in ASCII notation. + + Name of network device to use. + Default: If the host only has one device, it is used. + Otherwise the device is determined using + autoconfiguration. This is done by sending + autoconfiguration requests out of all devices, + and using the device that received the first reply. + + Method to use for autoconfiguration. + In the case of options + which specify multiple autoconfiguration protocols, + requests are sent using all protocols, and the first one + to reply is used. + + Only autoconfiguration protocols that have been compiled + into the kernel will be used, regardless of the value of + this option:: + + off or none: don't use autoconfiguration + (do static IP assignment instead) + on or any: use any protocol available in the kernel + (default) + dhcp: use DHCP + bootp: use BOOTP + rarp: use RARP + both: use both BOOTP and RARP but not DHCP + (old option kept for backwards compatibility) + + if dhcp is used, the client identifier can be used by following + format "ip=dhcp,client-id-type,client-id-value" + + Default: any + + IP address of primary nameserver. + Value is exported to /proc/net/pnp with the prefix "nameserver " + (see below). + + Default: None if not using autoconfiguration; determined + automatically if using autoconfiguration. + + IP address of secondary nameserver. + See . + + IP address of a Network Time Protocol (NTP) server. + Value is exported to /proc/net/ipconfig/ntp_servers, but is + otherwise unused (see below). + + Default: None if not using autoconfiguration; determined + automatically if using autoconfiguration. + + After configuration (whether manual or automatic) is complete, two files + are created in the following format; lines are omitted if their respective + value is empty following configuration: + + - /proc/net/pnp: + + #PROTO: (depending on configuration method) + domain (if autoconfigured, the DNS domain) + nameserver (primary name server IP) + nameserver (secondary name server IP) + nameserver (tertiary name server IP) + bootserver (NFS server IP) + + - /proc/net/ipconfig/ntp_servers: + + (NTP server IP) + (NTP server IP) + (NTP server IP) + + and (in /proc/net/pnp) and and + (in /proc/net/ipconfig/ntp_servers) are requested during autoconfiguration; + they cannot be specified as part of the "ip=" kernel command line parameter. + + Because the "domain" and "nameserver" options are recognised by DNS + resolvers, /etc/resolv.conf is often linked to /proc/net/pnp on systems + that use an NFS root filesystem. + + Note that the kernel will not synchronise the system time with any NTP + servers it discovers; this is the responsibility of a user space process + (e.g. an initrd/initramfs script that passes the IP addresses listed in + /proc/net/ipconfig/ntp_servers to an NTP client before mounting the real + root filesystem if it is on NFS). + + +nfsrootdebug + This parameter enables debugging messages to appear in the kernel + log at boot time so that administrators can verify that the correct + NFS mount options, server address, and root path are passed to the + NFS client. + + +rdinit= + To specify which file contains the program that starts system + initialization, administrators can use this command line parameter. + The default value of this parameter is "/init". If the specified + file exists and the kernel can execute it, root filesystem related + kernel command line parameters, including 'nfsroot=', are ignored. + + A description of the process of mounting the root file system can be + found in Documentation/driver-api/early-userspace/early_userspace_support.rst + + +Boot Loader +=========== + +To get the kernel into memory different approaches can be used. +They depend on various facilities being available: + + +- Booting from a floppy using syslinux + + When building kernels, an easy way to create a boot floppy that uses + syslinux is to use the zdisk or bzdisk make targets which use zimage + and bzimage images respectively. Both targets accept the + FDARGS parameter which can be used to set the kernel command line. + + e.g:: + + make bzdisk FDARGS="root=/dev/nfs" + + Note that the user running this command will need to have + access to the floppy drive device, /dev/fd0 + + For more information on syslinux, including how to create bootdisks + for prebuilt kernels, see http://syslinux.zytor.com/ + + .. note:: + Previously it was possible to write a kernel directly to + a floppy using dd, configure the boot device using rdev, and + boot using the resulting floppy. Linux no longer supports this + method of booting. + +- Booting from a cdrom using isolinux + + When building kernels, an easy way to create a bootable cdrom that + uses isolinux is to use the isoimage target which uses a bzimage + image. Like zdisk and bzdisk, this target accepts the FDARGS + parameter which can be used to set the kernel command line. + + e.g:: + + make isoimage FDARGS="root=/dev/nfs" + + The resulting iso image will be arch//boot/image.iso + This can be written to a cdrom using a variety of tools including + cdrecord. + + e.g:: + + cdrecord dev=ATAPI:1,0,0 arch/x86/boot/image.iso + + For more information on isolinux, including how to create bootdisks + for prebuilt kernels, see http://syslinux.zytor.com/ + +- Using LILO + + When using LILO all the necessary command line parameters may be + specified using the 'append=' directive in the LILO configuration + file. + + However, to use the 'root=' directive you also need to create + a dummy root device, which may be removed after LILO is run. + + e.g:: + + mknod /dev/boot255 c 0 255 + + For information on configuring LILO, please refer to its documentation. + +- Using GRUB + + When using GRUB, kernel parameter are simply appended after the kernel + specification: kernel + +- Using loadlin + + loadlin may be used to boot Linux from a DOS command prompt without + requiring a local hard disk to mount as root. This has not been + thoroughly tested by the authors of this document, but in general + it should be possible configure the kernel command line similarly + to the configuration of LILO. + + Please refer to the loadlin documentation for further information. + +- Using a boot ROM + + This is probably the most elegant way of booting a diskless client. + With a boot ROM the kernel is loaded using the TFTP protocol. The + authors of this document are not aware of any no commercial boot + ROMs that support booting Linux over the network. However, there + are two free implementations of a boot ROM, netboot-nfs and + etherboot, both of which are available on sunsite.unc.edu, and both + of which contain everything you need to boot a diskless Linux client. + +- Using pxelinux + + Pxelinux may be used to boot linux using the PXE boot loader + which is present on many modern network cards. + + When using pxelinux, the kernel image is specified using + "kernel ". The nfsroot parameters + are passed to the kernel by adding them to the "append" line. + It is common to use serial console in conjunction with pxeliunx, + see Documentation/admin-guide/serial-console.rst for more information. + + For more information on isolinux, including how to create bootdisks + for prebuilt kernels, see http://syslinux.zytor.com/ + + + + +Credits +======= + + The nfsroot code in the kernel and the RARP support have been written + by Gero Kuhlmann . + + The rest of the IP layer autoconfiguration code has been written + by Martin Mares . + + In order to write the initial version of nfsroot I would like to thank + Jens-Uwe Mager for his help. diff --git a/Documentation/admin-guide/nfs/pnfs-block-server.rst b/Documentation/admin-guide/nfs/pnfs-block-server.rst new file mode 100644 index 0000000000000000000000000000000000000000..b00a2e705cc4d6c95e5d54ba644e7581d9213703 --- /dev/null +++ b/Documentation/admin-guide/nfs/pnfs-block-server.rst @@ -0,0 +1,42 @@ +=================================== +pNFS block layout server user guide +=================================== + +The Linux NFS server now supports the pNFS block layout extension. In this +case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition +to handling all the metadata access to the NFS export also hands out layouts +to the clients to directly access the underlying block devices that are +shared with the client. + +To use pNFS block layouts with with the Linux NFS server the exported file +system needs to support the pNFS block layouts (currently just XFS), and the +file system must sit on shared storage (typically iSCSI) that is accessible +to the clients in addition to the MDS. As of now the file system needs to +sit directly on the exported volume, striping or concatenation of +volumes on the MDS and clients is not supported yet. + +On the server, pNFS block volume support is automatically if the file system +support it. On the client make sure the kernel has the CONFIG_PNFS_BLOCK +option enabled, the blkmapd daemon from nfs-utils is running, and the +file system is mounted using the NFSv4.1 protocol version (mount -o vers=4.1). + +If the nfsd server needs to fence a non-responding client it calls +/sbin/nfsd-recall-failed with the first argument set to the IP address of +the client, and the second argument set to the device node without the /dev +prefix for the file system to be fenced. Below is an example file that shows +how to translate the device into a serial number from SCSI EVPD 0x80:: + + cat > /sbin/nfsd-recall-failed << EOF + +.. code-block:: sh + + #!/bin/sh + + CLIENT="$1" + DEV="/dev/$2" + EVPD=`sg_inq --page=0x80 ${DEV} | \ + grep "Unit serial number:" | \ + awk -F ': ' '{print $2}'` + + echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log + EOF diff --git a/Documentation/admin-guide/nfs/pnfs-scsi-server.rst b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst new file mode 100644 index 0000000000000000000000000000000000000000..d2f6ee5580711769f127612e8726cc1e2053a5a4 --- /dev/null +++ b/Documentation/admin-guide/nfs/pnfs-scsi-server.rst @@ -0,0 +1,24 @@ + +================================== +pNFS SCSI layout server user guide +================================== + +This document describes support for pNFS SCSI layouts in the Linux NFS server. +With pNFS SCSI layouts, the NFS server acts as Metadata Server (MDS) for pNFS, +which in addition to handling all the metadata access to the NFS export, +also hands out layouts to the clients so that they can directly access the +underlying SCSI LUNs that are shared with the client. + +To use pNFS SCSI layouts with with the Linux NFS server, the exported file +system needs to support the pNFS SCSI layouts (currently just XFS), and the +file system must sit on a SCSI LUN that is accessible to the clients in +addition to the MDS. As of now the file system needs to sit directly on the +exported LUN, striping or concatenation of LUNs on the MDS and clients +is not supported yet. + +On a server built with CONFIG_NFSD_SCSI, the pNFS SCSI volume support is +automatically enabled if the file system is exported using the "pnfs" +option and the underlying SCSI device support persistent reservations. +On the client make sure the kernel has the CONFIG_PNFS_BLOCK option +enabled, and the file system is mounted using the NFSv4.1 protocol +version (mount -o vers=4.1). diff --git a/Documentation/admin-guide/perf/imx-ddr.rst b/Documentation/admin-guide/perf/imx-ddr.rst index 3726a10a03bae81d16e43cf6913cb3cfd1a4fc2a..f05f56c73b7d846d5793cbf0da62946f625069eb 100644 --- a/Documentation/admin-guide/perf/imx-ddr.rst +++ b/Documentation/admin-guide/perf/imx-ddr.rst @@ -43,7 +43,8 @@ value 1 for supported. AXI_ID and AXI_MASKING are mapped on DPCR1 register in performance counter. When non-masked bits are matching corresponding AXI_ID bits then counter is - incremented. Perf counter is incremented if + incremented. Perf counter is incremented if:: + AxID && AXI_MASKING == AXI_ID && AXI_MASKING This filter doesn't support filter different AXI ID for axid-read and axid-write diff --git a/Documentation/admin-guide/pm/cpufreq_drivers.rst b/Documentation/admin-guide/pm/cpufreq_drivers.rst new file mode 100644 index 0000000000000000000000000000000000000000..9a134ae65803ffe7dc90cda9f616e5856b2b5350 --- /dev/null +++ b/Documentation/admin-guide/pm/cpufreq_drivers.rst @@ -0,0 +1,274 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================================= +Legacy Documentation of CPU Performance Scaling Drivers +======================================================= + +Included below are historic documents describing assorted +:doc:`CPU performance scaling ` drivers. They are reproduced verbatim, +with the original white space formatting and indentation preserved, except for +the added leading space character in every line of text. + + +AMD PowerNow! Drivers +===================== + +:: + + PowerNow! and Cool'n'Quiet are AMD names for frequency + management capabilities in AMD processors. As the hardware + implementation changes in new generations of the processors, + there is a different cpu-freq driver for each generation. + + Note that the driver's will not load on the "wrong" hardware, + so it is safe to try each driver in turn when in doubt as to + which is the correct driver. + + Note that the functionality to change frequency (and voltage) + is not available in all processors. The drivers will refuse + to load on processors without this capability. The capability + is detected with the cpuid instruction. + + The drivers use BIOS supplied tables to obtain frequency and + voltage information appropriate for a particular platform. + Frequency transitions will be unavailable if the BIOS does + not supply these tables. + + 6th Generation: powernow-k6 + + 7th Generation: powernow-k7: Athlon, Duron, Geode. + + 8th Generation: powernow-k8: Athlon, Athlon 64, Opteron, Sempron. + Documentation on this functionality in 8th generation processors + is available in the "BIOS and Kernel Developer's Guide", publication + 26094, in chapter 9, available for download from www.amd.com. + + BIOS supplied data, for powernow-k7 and for powernow-k8, may be + from either the PSB table or from ACPI objects. The ACPI support + is only available if the kernel config sets CONFIG_ACPI_PROCESSOR. + The powernow-k8 driver will attempt to use ACPI if so configured, + and fall back to PST if that fails. + The powernow-k7 driver will try to use the PSB support first, and + fall back to ACPI if the PSB support fails. A module parameter, + acpi_force, is provided to force ACPI support to be used instead + of PSB support. + + +``cpufreq-nforce2`` +=================== + +:: + + The cpufreq-nforce2 driver changes the FSB on nVidia nForce2 platforms. + + This works better than on other platforms, because the FSB of the CPU + can be controlled independently from the PCI/AGP clock. + + The module has two options: + + fid: multiplier * 10 (for example 8.5 = 85) + min_fsb: minimum FSB + + If not set, fid is calculated from the current CPU speed and the FSB. + min_fsb defaults to FSB at boot time - 50 MHz. + + IMPORTANT: The available range is limited downwards! + Also the minimum available FSB can differ, for systems + booting with 200 MHz, 150 should always work. + + +``pcc-cpufreq`` +=============== + +:: + + /* + * pcc-cpufreq.txt - PCC interface documentation + * + * Copyright (C) 2009 Red Hat, Matthew Garrett + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar + */ + + + Processor Clocking Control Driver + --------------------------------- + + Contents: + --------- + 1. Introduction + 1.1 PCC interface + 1.1.1 Get Average Frequency + 1.1.2 Set Desired Frequency + 1.2 Platforms affected + 2. Driver and /sys details + 2.1 scaling_available_frequencies + 2.2 cpuinfo_transition_latency + 2.3 cpuinfo_cur_freq + 2.4 related_cpus + 3. Caveats + + 1. Introduction: + ---------------- + Processor Clocking Control (PCC) is an interface between the platform + firmware and OSPM. It is a mechanism for coordinating processor + performance (ie: frequency) between the platform firmware and the OS. + + The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC + interface. + + OS utilizes the PCC interface to inform platform firmware what frequency the + OS wants for a logical processor. The platform firmware attempts to achieve + the requested frequency. If the request for the target frequency could not be + satisfied by platform firmware, then it usually means that power budget + conditions are in place, and "power capping" is taking place. + + 1.1 PCC interface: + ------------------ + The complete PCC specification is available here: + https://acpica.org/sites/acpica/files/Processor-Clocking-Control-v1p0.pdf + + PCC relies on a shared memory region that provides a channel for communication + between the OS and platform firmware. PCC also implements a "doorbell" that + is used by the OS to inform the platform firmware that a command has been + sent. + + The ACPI PCCH() method is used to discover the location of the PCC shared + memory region. The shared memory region header contains the "command" and + "status" interface. PCCH() also contains details on how to access the platform + doorbell. + + The following commands are supported by the PCC interface: + * Get Average Frequency + * Set Desired Frequency + + The ACPI PCCP() method is implemented for each logical processor and is + used to discover the offsets for the input and output buffers in the shared + memory region. + + When PCC mode is enabled, the platform will not expose processor performance + or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore, + the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for + AMD) will not load. + + However, OSPM remains in control of policy. The governor (eg: "ondemand") + computes the required performance for each processor based on server workload. + The PCC driver fills in the command interface, and the input buffer and + communicates the request to the platform firmware. The platform firmware is + responsible for delivering the requested performance. + + Each PCC command is "global" in scope and can affect all the logical CPUs in + the system. Therefore, PCC is capable of performing "group" updates. With PCC + the OS is capable of getting/setting the frequency of all the logical CPUs in + the system with a single call to the BIOS. + + 1.1.1 Get Average Frequency: + ---------------------------- + This command is used by the OSPM to query the running frequency of the + processor since the last time this command was completed. The output buffer + indicates the average unhalted frequency of the logical processor expressed as + a percentage of the nominal (ie: maximum) CPU frequency. The output buffer + also signifies if the CPU frequency is limited by a power budget condition. + + 1.1.2 Set Desired Frequency: + ---------------------------- + This command is used by the OSPM to communicate to the platform firmware the + desired frequency for a logical processor. The output buffer is currently + ignored by OSPM. The next invocation of "Get Average Frequency" will inform + OSPM if the desired frequency was achieved or not. + + 1.2 Platforms affected: + ----------------------- + The PCC driver will load on any system where the platform firmware: + * supports the PCC interface, and the associated PCCH() and PCCP() methods + * assumes responsibility for managing the hardware clocking controls in order + to deliver the requested processor performance + + Currently, certain HP ProLiant platforms implement the PCC interface. On those + platforms PCC is the "default" choice. + + However, it is possible to disable this interface via a BIOS setting. In + such an instance, as is also the case on platforms where the PCC interface + is not implemented, the PCC driver will fail to load silently. + + 2. Driver and /sys details: + --------------------------- + When the driver loads, it merely prints the lowest and the highest CPU + frequencies supported by the platform firmware. + + The PCC driver loads with a message such as: + pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933 + MHz + + This means that the OPSM can request the CPU to run at any frequency in + between the limits (1600 MHz, and 2933 MHz) specified in the message. + + Internally, there is no need for the driver to convert the "target" frequency + to a corresponding P-state. + + The VERSION number for the driver will be of the format v.xy.ab. + eg: 1.00.02 + ----- -- + | | + | -- this will increase with bug fixes/enhancements to the driver + |-- this is the version of the PCC specification the driver adheres to + + + The following is a brief discussion on some of the fields exported via the + /sys filesystem and how their values are affected by the PCC driver: + + 2.1 scaling_available_frequencies: + ---------------------------------- + scaling_available_frequencies is not created in /sys. No intermediate + frequencies need to be listed because the BIOS will try to achieve any + frequency, within limits, requested by the governor. A frequency does not have + to be strictly associated with a P-state. + + 2.2 cpuinfo_transition_latency: + ------------------------------- + The cpuinfo_transition_latency field is 0. The PCC specification does + not include a field to expose this value currently. + + 2.3 cpuinfo_cur_freq: + --------------------- + A) Often cpuinfo_cur_freq will show a value different than what is declared + in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq. + This is due to "turbo boost" available on recent Intel processors. If certain + conditions are met the BIOS can achieve a slightly higher speed than requested + by OSPM. An example: + + scaling_cur_freq : 2933000 + cpuinfo_cur_freq : 3196000 + + B) There is a round-off error associated with the cpuinfo_cur_freq value. + Since the driver obtains the current frequency as a "percentage" (%) of the + nominal frequency from the BIOS, sometimes, the values displayed by + scaling_cur_freq and cpuinfo_cur_freq may not match. An example: + + scaling_cur_freq : 1600000 + cpuinfo_cur_freq : 1583000 + + In this example, the nominal frequency is 2933 MHz. The driver obtains the + current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency: + + 54% of 2933 MHz = 1583 MHz + + Nominal frequency is the maximum frequency of the processor, and it usually + corresponds to the frequency of the P0 P-state. + + 2.4 related_cpus: + ----------------- + The related_cpus field is identical to affected_cpus. + + affected_cpus : 4 + related_cpus : 4 + + Currently, the PCC driver does not evaluate _PSD. The platforms that support + PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination + to ensure that the same frequency is requested of all dependent CPUs. + + 3. Caveats: + ----------- + The "cpufreq_stats" module in its present form cannot be loaded and + expected to work with the PCC driver. Since the "cpufreq_stats" module + provides information wrt each P-state, it is not applicable to the PCC driver. diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index e70b365dbc6030e758c20f6aa6165498393dd42d..5605cc6f95605b5cf2489f88c42655b1696898f2 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -506,6 +506,9 @@ object corresponding to it, as follows: ``disable`` Whether or not this idle state is disabled. +``default_status`` + The default status of this state, "enabled" or "disabled". + ``latency`` Exit latency of the idle state in microseconds. @@ -580,20 +583,17 @@ Power Management Quality of Service for CPUs The power management quality of service (PM QoS) framework in the Linux kernel allows kernel code and user space processes to set constraints on various energy-efficiency features of the kernel to prevent performance from dropping -below a required level. The PM QoS constraints can be set globally, in -predefined categories referred to as PM QoS classes, or against individual -devices. +below a required level. CPU idle time management can be affected by PM QoS in two ways, through the -global constraint in the ``PM_QOS_CPU_DMA_LATENCY`` class and through the -resume latency constraints for individual CPUs. Kernel code (e.g. device -drivers) can set both of them with the help of special internal interfaces -provided by the PM QoS framework. User space can modify the former by opening -the :file:`cpu_dma_latency` special device file under :file:`/dev/` and writing -a binary value (interpreted as a signed 32-bit integer) to it. In turn, the -resume latency constraint for a CPU can be modified by user space by writing a -string (representing a signed 32-bit integer) to the -:file:`power/pm_qos_resume_latency_us` file under +global CPU latency limit and through the resume latency constraints for +individual CPUs. Kernel code (e.g. device drivers) can set both of them with +the help of special internal interfaces provided by the PM QoS framework. User +space can modify the former by opening the :file:`cpu_dma_latency` special +device file under :file:`/dev/` and writing a binary value (interpreted as a +signed 32-bit integer) to it. In turn, the resume latency constraint for a CPU +can be modified from user space by writing a string (representing a signed +32-bit integer) to the :file:`power/pm_qos_resume_latency_us` file under :file:`/sys/devices/system/cpu/cpu/` in ``sysfs``, where the CPU number ```` is allocated at the system initialization time. Negative values will be rejected in both cases and, also in both cases, the written integer @@ -602,52 +602,54 @@ number will be interpreted as a requested PM QoS constraint in microseconds. The requested value is not automatically applied as a new constraint, however, as it may be less restrictive (greater in this particular case) than another constraint previously requested by someone else. For this reason, the PM QoS -framework maintains a list of requests that have been made so far in each -global class and for each device, aggregates them and applies the effective -(minimum in this particular case) value as the new constraint. +framework maintains a list of requests that have been made so far for the +global CPU latency limit and for each individual CPU, aggregates them and +applies the effective (minimum in this particular case) value as the new +constraint. In fact, opening the :file:`cpu_dma_latency` special device file causes a new -PM QoS request to be created and added to the priority list of requests in the -``PM_QOS_CPU_DMA_LATENCY`` class and the file descriptor coming from the -"open" operation represents that request. If that file descriptor is then -used for writing, the number written to it will be associated with the PM QoS -request represented by it as a new requested constraint value. Next, the -priority list mechanism will be used to determine the new effective value of -the entire list of requests and that effective value will be set as a new -constraint. Thus setting a new requested constraint value will only change the -real constraint if the effective "list" value is affected by it. In particular, -for the ``PM_QOS_CPU_DMA_LATENCY`` class it only affects the real constraint if -it is the minimum of the requested constraints in the list. The process holding -a file descriptor obtained by opening the :file:`cpu_dma_latency` special device -file controls the PM QoS request associated with that file descriptor, but it -controls this particular PM QoS request only. +PM QoS request to be created and added to a global priority list of CPU latency +limit requests and the file descriptor coming from the "open" operation +represents that request. If that file descriptor is then used for writing, the +number written to it will be associated with the PM QoS request represented by +it as a new requested limit value. Next, the priority list mechanism will be +used to determine the new effective value of the entire list of requests and +that effective value will be set as a new CPU latency limit. Thus requesting a +new limit value will only change the real limit if the effective "list" value is +affected by it, which is the case if it is the minimum of the requested values +in the list. + +The process holding a file descriptor obtained by opening the +:file:`cpu_dma_latency` special device file controls the PM QoS request +associated with that file descriptor, but it controls this particular PM QoS +request only. Closing the :file:`cpu_dma_latency` special device file or, more precisely, the file descriptor obtained while opening it, causes the PM QoS request associated -with that file descriptor to be removed from the ``PM_QOS_CPU_DMA_LATENCY`` -class priority list and destroyed. If that happens, the priority list mechanism -will be used, again, to determine the new effective value for the whole list -and that value will become the new real constraint. +with that file descriptor to be removed from the global priority list of CPU +latency limit requests and destroyed. If that happens, the priority list +mechanism will be used again, to determine the new effective value for the whole +list and that value will become the new limit. -In turn, for each CPU there is only one resume latency PM QoS request -associated with the :file:`power/pm_qos_resume_latency_us` file under +In turn, for each CPU there is one resume latency PM QoS request associated with +the :file:`power/pm_qos_resume_latency_us` file under :file:`/sys/devices/system/cpu/cpu/` in ``sysfs`` and writing to it causes this single PM QoS request to be updated regardless of which user space process does that. In other words, this PM QoS request is shared by the entire user space, so access to the file associated with it needs to be arbitrated to avoid confusion. [Arguably, the only legitimate use of this mechanism in practice is to pin a process to the CPU in question and let it use the -``sysfs`` interface to control the resume latency constraint for it.] It -still only is a request, however. It is a member of a priority list used to +``sysfs`` interface to control the resume latency constraint for it.] It is +still only a request, however. It is an entry in a priority list used to determine the effective value to be set as the resume latency constraint for the CPU in question every time the list of requests is updated this way or another (there may be other requests coming from kernel code in that list). CPU idle time governors are expected to regard the minimum of the global -effective ``PM_QOS_CPU_DMA_LATENCY`` class constraint and the effective -resume latency constraint for the given CPU as the upper limit for the exit -latency of the idle states they can select for that CPU. They should never -select any idle states with exit latency beyond that limit. +(effective) CPU latency limit and the effective resume latency constraint for +the given CPU as the upper limit for the exit latency of the idle states that +they are allowed to select for that CPU. They should never select any idle +states with exit latency beyond that limit. Idle States Control Via Kernel Command Line diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst new file mode 100644 index 0000000000000000000000000000000000000000..89309e1b0e484d3312865f2ea97f6301d1c1cefd --- /dev/null +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -0,0 +1,268 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +============================================== +``intel_idle`` CPU Idle Time Management Driver +============================================== + +:Copyright: |copy| 2020 Intel Corporation + +:Author: Rafael J. Wysocki + + +General Information +=================== + +``intel_idle`` is a part of the +:doc:`CPU idle time management subsystem ` in the Linux kernel +(``CPUIdle``). It is the default CPU idle time management driver for the +Nehalem and later generations of Intel processors, but the level of support for +a particular processor model in it depends on whether or not it recognizes that +processor model and may also depend on information coming from the platform +firmware. [To understand ``intel_idle`` it is necessary to know how ``CPUIdle`` +works in general, so this is the time to get familiar with :doc:`cpuidle` if you +have not done that yet.] + +``intel_idle`` uses the ``MWAIT`` instruction to inform the processor that the +logical CPU executing it is idle and so it may be possible to put some of the +processor's functional blocks into low-power states. That instruction takes two +arguments (passed in the ``EAX`` and ``ECX`` registers of the target CPU), the +first of which, referred to as a *hint*, can be used by the processor to +determine what can be done (for details refer to Intel Software Developer’s +Manual [1]_). Accordingly, ``intel_idle`` refuses to work with processors in +which the support for the ``MWAIT`` instruction has been disabled (for example, +via the platform firmware configuration menu) or which do not support that +instruction at all. + +``intel_idle`` is not modular, so it cannot be unloaded, which means that the +only way to pass early-configuration-time parameters to it is via the kernel +command line. + + +.. _intel-idle-enumeration-of-states: + +Enumeration of Idle States +========================== + +Each ``MWAIT`` hint value is interpreted by the processor as a license to +reconfigure itself in a certain way in order to save energy. The processor +configurations (with reduced power draw) resulting from that are referred to +as C-states (in the ACPI terminology) or idle states. The list of meaningful +``MWAIT`` hint values and idle states (i.e. low-power configurations of the +processor) corresponding to them depends on the processor model and it may also +depend on the configuration of the platform. + +In order to create a list of available idle states required by the ``CPUIdle`` +subsystem (see :ref:`idle-states-representation` in :doc:`cpuidle`), +``intel_idle`` can use two sources of information: static tables of idle states +for different processor models included in the driver itself and the ACPI tables +of the system. The former are always used if the processor model at hand is +recognized by ``intel_idle`` and the latter are used if that is required for +the given processor model (which is the case for all server processor models +recognized by ``intel_idle``) or if the processor model is not recognized. +[There is a module parameter that can be used to make the driver use the ACPI +tables with any processor model recognized by it; see +`below `_.] + +If the ACPI tables are going to be used for building the list of available idle +states, ``intel_idle`` first looks for a ``_CST`` object under one of the ACPI +objects corresponding to the CPUs in the system (refer to the ACPI specification +[2]_ for the description of ``_CST`` and its output package). Because the +``CPUIdle`` subsystem expects that the list of idle states supplied by the +driver will be suitable for all of the CPUs handled by it and ``intel_idle`` is +registered as the ``CPUIdle`` driver for all of the CPUs in the system, the +driver looks for the first ``_CST`` object returning at least one valid idle +state description and such that all of the idle states included in its return +package are of the FFH (Functional Fixed Hardware) type, which means that the +``MWAIT`` instruction is expected to be used to tell the processor that it can +enter one of them. The return package of that ``_CST`` is then assumed to be +applicable to all of the other CPUs in the system and the idle state +descriptions extracted from it are stored in a preliminary list of idle states +coming from the ACPI tables. [This step is skipped if ``intel_idle`` is +configured to ignore the ACPI tables; see `below `_.] + +Next, the first (index 0) entry in the list of available idle states is +initialized to represent a "polling idle state" (a pseudo-idle state in which +the target CPU continuously fetches and executes instructions), and the +subsequent (real) idle state entries are populated as follows. + +If the processor model at hand is recognized by ``intel_idle``, there is a +(static) table of idle state descriptions for it in the driver. In that case, +the "internal" table is the primary source of information on idle states and the +information from it is copied to the final list of available idle states. If +using the ACPI tables for the enumeration of idle states is not required +(depending on the processor model), all of the listed idle state are enabled by +default (so all of them will be taken into consideration by ``CPUIdle`` +governors during CPU idle state selection). Otherwise, some of the listed idle +states may not be enabled by default if there are no matching entries in the +preliminary list of idle states coming from the ACPI tables. In that case user +space still can enable them later (on a per-CPU basis) with the help of +the ``disable`` idle state attribute in ``sysfs`` (see +:ref:`idle-states-representation` in :doc:`cpuidle`). This basically means that +the idle states "known" to the driver may not be enabled by default if they have +not been exposed by the platform firmware (through the ACPI tables). + +If the given processor model is not recognized by ``intel_idle``, but it +supports ``MWAIT``, the preliminary list of idle states coming from the ACPI +tables is used for building the final list that will be supplied to the +``CPUIdle`` core during driver registration. For each idle state in that list, +the description, ``MWAIT`` hint and exit latency are copied to the corresponding +entry in the final list of idle states. The name of the idle state represented +by it (to be returned by the ``name`` idle state attribute in ``sysfs``) is +"CX_ACPI", where X is the index of that idle state in the final list (note that +the minimum value of X is 1, because 0 is reserved for the "polling" state), and +its target residency is based on the exit latency value. Specifically, for +C1-type idle states the exit latency value is also used as the target residency +(for compatibility with the majority of the "internal" tables of idle states for +various processor models recognized by ``intel_idle``) and for the other idle +state types (C2 and C3) the target residency value is 3 times the exit latency +(again, that is because it reflects the target residency to exit latency ratio +in the majority of cases for the processor models recognized by ``intel_idle``). +All of the idle states in the final list are enabled by default in this case. + + +.. _intel-idle-initialization: + +Initialization +============== + +The initialization of ``intel_idle`` starts with checking if the kernel command +line options forbid the use of the ``MWAIT`` instruction. If that is the case, +an error code is returned right away. + +The next step is to check whether or not the processor model is known to the +driver, which determines the idle states enumeration method (see +`above `_), and whether or not the processor +supports ``MWAIT`` (the initialization fails if that is not the case). Then, +the ``MWAIT`` support in the processor is enumerated through ``CPUID`` and the +driver initialization fails if the level of support is not as expected (for +example, if the total number of ``MWAIT`` substates returned is 0). + +Next, if the driver is not configured to ignore the ACPI tables (see +`below `_), the idle states information provided by the +platform firmware is extracted from them. + +Then, ``CPUIdle`` device objects are allocated for all CPUs and the list of +available idle states is created as explained +`above `_. + +Finally, ``intel_idle`` is registered with the help of cpuidle_register_driver() +as the ``CPUIdle`` driver for all CPUs in the system and a CPU online callback +for configuring individual CPUs is registered via cpuhp_setup_state(), which +(among other things) causes the callback routine to be invoked for all of the +CPUs present in the system at that time (each CPU executes its own instance of +the callback routine). That routine registers a ``CPUIdle`` device for the CPU +running it (which enables the ``CPUIdle`` subsystem to operate that CPU) and +optionally performs some CPU-specific initialization actions that may be +required for the given processor model. + + +.. _intel-idle-parameters: + +Kernel Command Line Options and Module Parameters +================================================= + +The *x86* architecture support code recognizes three kernel command line +options related to CPU idle time management: ``idle=poll``, ``idle=halt``, +and ``idle=nomwait``. If any of them is present in the kernel command line, the +``MWAIT`` instruction is not allowed to be used, so the initialization of +``intel_idle`` will fail. + +Apart from that there are four module parameters recognized by ``intel_idle`` +itself that can be set via the kernel command line (they cannot be updated via +sysfs, so that is the only way to change their values). + +The ``max_cstate`` parameter value is the maximum idle state index in the list +of idle states supplied to the ``CPUIdle`` core during the registration of the +driver. It is also the maximum number of regular (non-polling) idle states that +can be used by ``intel_idle``, so the enumeration of idle states is terminated +after finding that number of usable idle states (the other idle states that +potentially might have been used if ``max_cstate`` had been greater are not +taken into consideration at all). Setting ``max_cstate`` can prevent +``intel_idle`` from exposing idle states that are regarded as "too deep" for +some reason to the ``CPUIdle`` core, but it does so by making them effectively +invisible until the system is shut down and started again which may not always +be desirable. In practice, it is only really necessary to do that if the idle +states in question cannot be enabled during system startup, because in the +working state of the system the CPU power management quality of service (PM +QoS) feature can be used to prevent ``CPUIdle`` from touching those idle states +even if they have been enumerated (see :ref:`cpu-pm-qos` in :doc:`cpuidle`). +Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail. + +The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle`` +if the kernel has been configured with ACPI support) can be set to make the +driver ignore the system's ACPI tables entirely or use them for all of the +recognized processor models, respectively (they both are unset by default and +``use_acpi`` has no effect if ``no_acpi`` is set). + +The value of the ``states_off`` module parameter (0 by default) represents a +list of idle states to be disabled by default in the form of a bitmask. + +Namely, the positions of the bits that are set in the ``states_off`` value are +the indices of idle states to be disabled by default (as reflected by the names +of the corresponding idle state directories in ``sysfs``, :file:`state0`, +:file:`state1` ... :file:`state` ..., where ```` is the index of the given +idle state; see :ref:`idle-states-representation` in :doc:`cpuidle`). + +For example, if ``states_off`` is equal to 3, the driver will disable idle +states 0 and 1 by default, and if it is equal to 8, idle state 3 will be +disabled by default and so on (bit positions beyond the maximum idle state index +are ignored). + +The idle states disabled this way can be enabled (on a per-CPU basis) from user +space via ``sysfs``. + + +.. _intel-idle-core-and-package-idle-states: + +Core and Package Levels of Idle States +====================================== + +Typically, in a processor supporting the ``MWAIT`` instruction there are (at +least) two levels of idle states (or C-states). One level, referred to as +"core C-states", covers individual cores in the processor, whereas the other +level, referred to as "package C-states", covers the entire processor package +and it may also involve other components of the system (GPUs, memory +controllers, I/O hubs etc.). + +Some of the ``MWAIT`` hint values allow the processor to use core C-states only +(most importantly, that is the case for the ``MWAIT`` hint value corresponding +to the ``C1`` idle state), but the majority of them give it a license to put +the target core (i.e. the core containing the logical CPU executing ``MWAIT`` +with the given hint value) into a specific core C-state and then (if possible) +to enter a specific package C-state at the deeper level. For example, the +``MWAIT`` hint value representing the ``C3`` idle state allows the processor to +put the target core into the low-power state referred to as "core ``C3``" (or +``CC3``), which happens if all of the logical CPUs (SMT siblings) in that core +have executed ``MWAIT`` with the ``C3`` hint value (or with a hint value +representing a deeper idle state), and in addition to that (in the majority of +cases) it gives the processor a license to put the entire package (possibly +including some non-CPU components such as a GPU or a memory controller) into the +low-power state referred to as "package ``C3``" (or ``PC3``), which happens if +all of the cores have gone into the ``CC3`` state and (possibly) some additional +conditions are satisfied (for instance, if the GPU is covered by ``PC3``, it may +be required to be in a certain GPU-specific low-power state for ``PC3`` to be +reachable). + +As a rule, there is no simple way to make the processor use core C-states only +if the conditions for entering the corresponding package C-states are met, so +the logical CPU executing ``MWAIT`` with a hint value that is not core-level +only (like for ``C1``) must always assume that this may cause the processor to +enter a package C-state. [That is why the exit latency and target residency +values corresponding to the majority of ``MWAIT`` hint values in the "internal" +tables of idle states in ``intel_idle`` reflect the properties of package +C-states.] If using package C-states is not desirable at all, either +:ref:`PM QoS ` or the ``max_cstate`` module parameter of +``intel_idle`` described `above `_ must be used to +restrict the range of permissible idle states to the ones with core-level only +``MWAIT`` hint values (like ``C1``). + + +References +========== + +.. [1] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 2B*, + https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-vol-2b-manual.html + +.. [2] *Advanced Configuration and Power Interface (ACPI) Specification*, + https://uefi.org/specifications diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 67e414e34f37997e8bc620e904797cb3f3aa13c4..ad392f3aee06149f12a4e6554fec7efedc568ede 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -734,10 +734,10 @@ References ========== .. [1] Kristen Accardi, *Balancing Power and Performance in the Linux Kernel*, - http://events.linuxfoundation.org/sites/events/files/slides/LinuxConEurope_2015.pdf + https://events.static.linuxfound.org/sites/events/files/slides/LinuxConEurope_2015.pdf .. [2] *Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3: System Programming Guide*, - http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html + https://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-system-programming-manual-325384.html .. [3] *Advanced Configuration and Power Interface Specification*, https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf diff --git a/Documentation/admin-guide/pm/sleep-states.rst b/Documentation/admin-guide/pm/sleep-states.rst index cd3a28cb81f424d0962046ebb69292c214041150..ee55a460c639bcbaee09a1fa9b4e555895ce730c 100644 --- a/Documentation/admin-guide/pm/sleep-states.rst +++ b/Documentation/admin-guide/pm/sleep-states.rst @@ -153,8 +153,11 @@ for the given CPU architecture includes the low-level code for system resume. Basic ``sysfs`` Interfaces for System Suspend and Hibernation ============================================================= -The following files located in the :file:`/sys/power/` directory can be used by -user space for sleep states control. +The power management subsystem provides userspace with a unified ``sysfs`` +interface for system sleep regardless of the underlying system architecture or +platform. That interface is located in the :file:`/sys/power/` directory +(assuming that ``sysfs`` is mounted at :file:`/sys`) and it consists of the +following attributes (files): ``state`` This file contains a list of strings representing sleep states supported @@ -162,9 +165,9 @@ user space for sleep states control. to start a transition of the system into the sleep state represented by that string. - In particular, the strings "disk", "freeze" and "standby" represent the + In particular, the "disk", "freeze" and "standby" strings represent the :ref:`hibernation `, :ref:`suspend-to-idle ` and - :ref:`standby ` sleep states, respectively. The string "mem" + :ref:`standby ` sleep states, respectively. The "mem" string is interpreted in accordance with the contents of the ``mem_sleep`` file described below. @@ -177,7 +180,7 @@ user space for sleep states control. associated with the "mem" string in the ``state`` file described above. The strings that may be present in this file are "s2idle", "shallow" - and "deep". The string "s2idle" always represents :ref:`suspend-to-idle + and "deep". The "s2idle" string always represents :ref:`suspend-to-idle ` and, by convention, "shallow" and "deep" represent :ref:`standby ` and :ref:`suspend-to-RAM `, respectively. @@ -185,15 +188,17 @@ user space for sleep states control. Writing one of the listed strings into this file causes the system suspend variant represented by it to be associated with the "mem" string in the ``state`` file. The string representing the suspend variant - currently associated with the "mem" string in the ``state`` file - is listed in square brackets. + currently associated with the "mem" string in the ``state`` file is + shown in square brackets. If the kernel does not support system suspend, this file is not present. ``disk`` - This file contains a list of strings representing different operations - that can be carried out after the hibernation image has been saved. The - possible options are as follows: + This file controls the operating mode of hibernation (Suspend-to-Disk). + Specifically, it tells the kernel what to do after creating a + hibernation image. + + Reading from it returns a list of supported options encoded as: ``platform`` Put the system into a special low-power state (e.g. ACPI S4) to @@ -201,6 +206,11 @@ user space for sleep states control. platform firmware to take a simplified initialization path after wakeup. + It is only available if the platform provides a special + mechanism to put the system to sleep after creating a + hibernation image (platforms with ACPI do that as a rule, for + example). + ``shutdown`` Power off the system. @@ -214,22 +224,53 @@ user space for sleep states control. the hibernation image and continue. Otherwise, use the image to restore the previous state of the system. + It is available if system suspend is supported. + ``test_resume`` Diagnostic operation. Load the image as though the system had just woken up from hibernation and the currently running kernel instance was a restore kernel and follow up with full system resume. - Writing one of the listed strings into this file causes the option + Writing one of the strings listed above into this file causes the option represented by it to be selected. - The currently selected option is shown in square brackets which means + The currently selected option is shown in square brackets, which means that the operation represented by it will be carried out after creating - and saving the image next time hibernation is triggered by writing - ``disk`` to :file:`/sys/power/state`. + and saving the image when hibernation is triggered by writing ``disk`` + to :file:`/sys/power/state`. If the kernel does not support hibernation, this file is not present. +``image_size`` + This file controls the size of hibernation images. + + It can be written a string representing a non-negative integer that will + be used as a best-effort upper limit of the image size, in bytes. The + hibernation core will do its best to ensure that the image size will not + exceed that number, but if that turns out to be impossible to achieve, a + hibernation image will still be created and its size will be as small as + possible. In particular, writing '0' to this file causes the size of + hibernation images to be minimum. + + Reading from it returns the current image size limit, which is set to + around 2/5 of the available RAM size by default. + +``pm_trace`` + This file controls the "PM trace" mechanism saving the last suspend + or resume event point in the RTC memory across reboots. It helps to + debug hard lockups or reboots due to device driver failures that occur + during system suspend or resume (which is more common) more effectively. + + If it contains "1", the fingerprint of each suspend/resume event point + in turn will be stored in the RTC memory (overwriting the actual RTC + information), so it will survive a system crash if one occurs right + after storing it and it can be used later to identify the driver that + caused the crash to happen. + + It contains "0" by default, which may be changed to "1" by writing a + string representing a nonzero integer into it. + According to the above, there are two ways to make the system go into the :ref:`suspend-to-idle ` state. The first one is to write "freeze" directly to :file:`/sys/power/state`. The second one is to write "s2idle" to @@ -244,6 +285,7 @@ system go into the :ref:`suspend-to-RAM ` state (write "deep" into The default suspend variant (ie. the one to be used without writing anything into :file:`/sys/power/mem_sleep`) is either "deep" (on the majority of systems supporting :ref:`suspend-to-RAM `) or "s2idle", but it can be overridden -by the value of the "mem_sleep_default" parameter in the kernel command line. -On some ACPI-based systems, depending on the information in the ACPI tables, the -default may be "s2idle" even if :ref:`suspend-to-RAM ` is supported. +by the value of the ``mem_sleep_default`` parameter in the kernel command line. +On some systems with ACPI, depending on the information in the ACPI tables, the +default may be "s2idle" even if :ref:`suspend-to-RAM ` is supported in +principle. diff --git a/Documentation/admin-guide/pm/suspend-flows.rst b/Documentation/admin-guide/pm/suspend-flows.rst new file mode 100644 index 0000000000000000000000000000000000000000..c479d7462647b3dca2a2d1367d2ca9d8b5e37b92 --- /dev/null +++ b/Documentation/admin-guide/pm/suspend-flows.rst @@ -0,0 +1,270 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: + +========================= +System Suspend Code Flows +========================= + +:Copyright: |copy| 2020 Intel Corporation + +:Author: Rafael J. Wysocki + +At least one global system-wide transition needs to be carried out for the +system to get from the working state into one of the supported +:doc:`sleep states `. Hibernation requires more than one +transition to occur for this purpose, but the other sleep states, commonly +referred to as *system-wide suspend* (or simply *system suspend*) states, need +only one. + +For those sleep states, the transition from the working state of the system into +the target sleep state is referred to as *system suspend* too (in the majority +of cases, whether this means a transition or a sleep state of the system should +be clear from the context) and the transition back from the sleep state into the +working state is referred to as *system resume*. + +The kernel code flows associated with the suspend and resume transitions for +different sleep states of the system are quite similar, but there are some +significant differences between the :ref:`suspend-to-idle ` code flows +and the code flows related to the :ref:`suspend-to-RAM ` and +:ref:`standby ` sleep states. + +The :ref:`suspend-to-RAM ` and :ref:`standby ` sleep states +cannot be implemented without platform support and the difference between them +boils down to the platform-specific actions carried out by the suspend and +resume hooks that need to be provided by the platform driver to make them +available. Apart from that, the suspend and resume code flows for these sleep +states are mostly identical, so they both together will be referred to as +*platform-dependent suspend* states in what follows. + + +.. _s2idle_suspend: + +Suspend-to-idle Suspend Code Flow +================================= + +The following steps are taken in order to transition the system from the working +state to the :ref:`suspend-to-idle ` sleep state: + + 1. Invoking system-wide suspend notifiers. + + Kernel subsystems can register callbacks to be invoked when the suspend + transition is about to occur and when the resume transition has finished. + + That allows them to prepare for the change of the system state and to clean + up after getting back to the working state. + + 2. Freezing tasks. + + Tasks are frozen primarily in order to avoid unchecked hardware accesses + from user space through MMIO regions or I/O registers exposed directly to + it and to prevent user space from entering the kernel while the next step + of the transition is in progress (which might have been problematic for + various reasons). + + All user space tasks are intercepted as though they were sent a signal and + put into uninterruptible sleep until the end of the subsequent system resume + transition. + + The kernel threads that choose to be frozen during system suspend for + specific reasons are frozen subsequently, but they are not intercepted. + Instead, they are expected to periodically check whether or not they need + to be frozen and to put themselves into uninterruptible sleep if so. [Note, + however, that kernel threads can use locking and other concurrency controls + available in kernel space to synchronize themselves with system suspend and + resume, which can be much more precise than the freezing, so the latter is + not a recommended option for kernel threads.] + + 3. Suspending devices and reconfiguring IRQs. + + Devices are suspended in four phases called *prepare*, *suspend*, + *late suspend* and *noirq suspend* (see :ref:`driverapi_pm_devices` for more + information on what exactly happens in each phase). + + Every device is visited in each phase, but typically it is not physically + accessed in more than two of them. + + The runtime PM API is disabled for every device during the *late* suspend + phase and high-level ("action") interrupt handlers are prevented from being + invoked before the *noirq* suspend phase. + + Interrupts are still handled after that, but they are only acknowledged to + interrupt controllers without performing any device-specific actions that + would be triggered in the working state of the system (those actions are + deferred till the subsequent system resume transition as described + `below `_). + + IRQs associated with system wakeup devices are "armed" so that the resume + transition of the system is started when one of them signals an event. + + 4. Freezing the scheduler tick and suspending timekeeping. + + When all devices have been suspended, CPUs enter the idle loop and are put + into the deepest available idle state. While doing that, each of them + "freezes" its own scheduler tick so that the timer events associated with + the tick do not occur until the CPU is woken up by another interrupt source. + + The last CPU to enter the idle state also stops the timekeeping which + (among other things) prevents high resolution timers from triggering going + forward until the first CPU that is woken up restarts the timekeeping. + That allows the CPUs to stay in the deep idle state relatively long in one + go. + + From this point on, the CPUs can only be woken up by non-timer hardware + interrupts. If that happens, they go back to the idle state unless the + interrupt that woke up one of them comes from an IRQ that has been armed for + system wakeup, in which case the system resume transition is started. + + +.. _s2idle_resume: + +Suspend-to-idle Resume Code Flow +================================ + +The following steps are taken in order to transition the system from the +:ref:`suspend-to-idle ` sleep state into the working state: + + 1. Resuming timekeeping and unfreezing the scheduler tick. + + When one of the CPUs is woken up (by a non-timer hardware interrupt), it + leaves the idle state entered in the last step of the preceding suspend + transition, restarts the timekeeping (unless it has been restarted already + by another CPU that woke up earlier) and the scheduler tick on that CPU is + unfrozen. + + If the interrupt that has woken up the CPU was armed for system wakeup, + the system resume transition begins. + + 2. Resuming devices and restoring the working-state configuration of IRQs. + + Devices are resumed in four phases called *noirq resume*, *early resume*, + *resume* and *complete* (see :ref:`driverapi_pm_devices` for more + information on what exactly happens in each phase). + + Every device is visited in each phase, but typically it is not physically + accessed in more than two of them. + + The working-state configuration of IRQs is restored after the *noirq* resume + phase and the runtime PM API is re-enabled for every device whose driver + supports it during the *early* resume phase. + + 3. Thawing tasks. + + Tasks frozen in step 2 of the preceding `suspend `_ + transition are "thawed", which means that they are woken up from the + uninterruptible sleep that they went into at that time and user space tasks + are allowed to exit the kernel. + + 4. Invoking system-wide resume notifiers. + + This is analogous to step 1 of the `suspend `_ transition + and the same set of callbacks is invoked at this point, but a different + "notification type" parameter value is passed to them. + + +Platform-dependent Suspend Code Flow +==================================== + +The following steps are taken in order to transition the system from the working +state to platform-dependent suspend state: + + 1. Invoking system-wide suspend notifiers. + + This step is the same as step 1 of the suspend-to-idle suspend transition + described `above `_. + + 2. Freezing tasks. + + This step is the same as step 2 of the suspend-to-idle suspend transition + described `above `_. + + 3. Suspending devices and reconfiguring IRQs. + + This step is analogous to step 3 of the suspend-to-idle suspend transition + described `above `_, but the arming of IRQs for system + wakeup generally does not have any effect on the platform. + + There are platforms that can go into a very deep low-power state internally + when all CPUs in them are in sufficiently deep idle states and all I/O + devices have been put into low-power states. On those platforms, + suspend-to-idle can reduce system power very effectively. + + On the other platforms, however, low-level components (like interrupt + controllers) need to be turned off in a platform-specific way (implemented + in the hooks provided by the platform driver) to achieve comparable power + reduction. + + That usually prevents in-band hardware interrupts from waking up the system, + which must be done in a special platform-dependent way. Then, the + configuration of system wakeup sources usually starts when system wakeup + devices are suspended and is finalized by the platform suspend hooks later + on. + + 4. Disabling non-boot CPUs. + + On some platforms the suspend hooks mentioned above must run in a one-CPU + configuration of the system (in particular, the hardware cannot be accessed + by any code running in parallel with the platform suspend hooks that may, + and often do, trap into the platform firmware in order to finalize the + suspend transition). + + For this reason, the CPU offline/online (CPU hotplug) framework is used + to take all of the CPUs in the system, except for one (the boot CPU), + offline (typically, the CPUs that have been taken offline go into deep idle + states). + + This means that all tasks are migrated away from those CPUs and all IRQs are + rerouted to the only CPU that remains online. + + 5. Suspending core system components. + + This prepares the core system components for (possibly) losing power going + forward and suspends the timekeeping. + + 6. Platform-specific power removal. + + This is expected to remove power from all of the system components except + for the memory controller and RAM (in order to preserve the contents of the + latter) and some devices designated for system wakeup. + + In many cases control is passed to the platform firmware which is expected + to finalize the suspend transition as needed. + + +Platform-dependent Resume Code Flow +=================================== + +The following steps are taken in order to transition the system from a +platform-dependent suspend state into the working state: + + 1. Platform-specific system wakeup. + + The platform is woken up by a signal from one of the designated system + wakeup devices (which need not be an in-band hardware interrupt) and + control is passed back to the kernel (the working configuration of the + platform may need to be restored by the platform firmware before the + kernel gets control again). + + 2. Resuming core system components. + + The suspend-time configuration of the core system components is restored and + the timekeeping is resumed. + + 3. Re-enabling non-boot CPUs. + + The CPUs disabled in step 4 of the preceding suspend transition are taken + back online and their suspend-time configuration is restored. + + 4. Resuming devices and restoring the working-state configuration of IRQs. + + This step is the same as step 2 of the suspend-to-idle suspend transition + described `above `_. + + 5. Thawing tasks. + + This step is the same as step 3 of the suspend-to-idle suspend transition + described `above `_. + + 6. Invoking system-wide resume notifiers. + + This step is the same as step 4 of the suspend-to-idle suspend transition + described `above `_. diff --git a/Documentation/admin-guide/pm/system-wide.rst b/Documentation/admin-guide/pm/system-wide.rst index 2b1f987b34f007a3d2cffd56a4b64fee3f2aa4c4..1a1924d7100693f9c01dc03237025b9e989a86d1 100644 --- a/Documentation/admin-guide/pm/system-wide.rst +++ b/Documentation/admin-guide/pm/system-wide.rst @@ -8,3 +8,4 @@ System-Wide Power Management :maxdepth: 2 sleep-states + suspend-flows diff --git a/Documentation/admin-guide/pm/working-state.rst b/Documentation/admin-guide/pm/working-state.rst index fc298eb1234b07ba70e35ae26d1dd750834419d7..0a38cdf39df1ed95cdae43ad7870585b636d5ee3 100644 --- a/Documentation/admin-guide/pm/working-state.rst +++ b/Documentation/admin-guide/pm/working-state.rst @@ -8,6 +8,8 @@ Working-State Power Management :maxdepth: 2 cpuidle + intel_idle cpufreq intel_pstate + cpufreq_drivers intel_epb diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index def074807cee9a35bf2bf824738069537808daa5..0d427fd1094194204d0fed72caf4b468e2d62017 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -2,262 +2,197 @@ Documentation for /proc/sys/kernel/ =================================== -kernel version 2.2.10 +.. See scripts/check-sysctl-docs to keep this up to date + Copyright (c) 1998, 1999, Rik van Riel Copyright (c) 2009, Shen Feng -For general info and legal blurb, please look in index.rst. +For general info and legal blurb, please look in :doc:`index`. ------------------------------------------------------------------------------ This file contains documentation for the sysctl files in -/proc/sys/kernel/ and is valid for Linux kernel version 2.2. +``/proc/sys/kernel/`` and is valid for Linux kernel version 2.2. The files in this directory can be used to tune and monitor miscellaneous and general things in the operation of the Linux -kernel. Since some of the files _can_ be used to screw up your +kernel. Since some of the files *can* be used to screw up your system, it is advisable to read both documentation and source before actually making adjustments. Currently, these files might (depending on your configuration) -show up in /proc/sys/kernel: - -- acct -- acpi_video_flags -- auto_msgmni -- bootloader_type [ X86 only ] -- bootloader_version [ X86 only ] -- cap_last_cap -- core_pattern -- core_pipe_limit -- core_uses_pid -- ctrl-alt-del -- dmesg_restrict -- domainname -- hostname -- hotplug -- hardlockup_all_cpu_backtrace -- hardlockup_panic -- hung_task_panic -- hung_task_check_count -- hung_task_timeout_secs -- hung_task_check_interval_secs -- hung_task_warnings -- hyperv_record_panic_msg -- kexec_load_disabled -- kptr_restrict -- l2cr [ PPC only ] -- modprobe ==> Documentation/debugging-modules.txt -- modules_disabled -- msg_next_id [ sysv ipc ] -- msgmax -- msgmnb -- msgmni -- nmi_watchdog -- osrelease -- ostype -- overflowgid -- overflowuid -- panic -- panic_on_oops -- panic_on_stackoverflow -- panic_on_unrecovered_nmi -- panic_on_warn -- panic_print -- panic_on_rcu_stall -- perf_cpu_time_max_percent -- perf_event_paranoid -- perf_event_max_stack -- perf_event_mlock_kb -- perf_event_max_contexts_per_stack -- pid_max -- powersave-nap [ PPC only ] -- printk -- printk_delay -- printk_ratelimit -- printk_ratelimit_burst -- pty ==> Documentation/filesystems/devpts.txt -- randomize_va_space -- real-root-dev ==> Documentation/admin-guide/initrd.rst -- reboot-cmd [ SPARC only ] -- rtsig-max -- rtsig-nr -- sched_energy_aware -- seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst -- sem -- sem_next_id [ sysv ipc ] -- sg-big-buff [ generic SCSI device (sg) ] -- shm_next_id [ sysv ipc ] -- shm_rmid_forced -- shmall -- shmmax [ sysv ipc ] -- shmmni -- softlockup_all_cpu_backtrace -- soft_watchdog -- stack_erasing -- stop-a [ SPARC only ] -- sysrq ==> Documentation/admin-guide/sysrq.rst -- sysctl_writes_strict -- tainted ==> Documentation/admin-guide/tainted-kernels.rst -- threads-max -- unknown_nmi_panic -- watchdog -- watchdog_thresh -- version - - -acct: -===== +show up in ``/proc/sys/kernel``: + +.. contents:: :local: + + +acct +==== + +:: -highwater lowwater frequency + highwater lowwater frequency If BSD-style process accounting is enabled these values control its behaviour. If free space on filesystem where the log lives -goes below % accounting suspends. If free space gets -above % accounting resumes. determines +goes below ``lowwater``% accounting suspends. If free space gets +above ``highwater``% accounting resumes. ``frequency`` determines how often do we check the amount of free space (value is in seconds). Default: -4 2 30 -That is, suspend accounting if there left <= 2% free; resume it -if we got >=4%; consider information about amount of free space -valid for 30 seconds. +:: -acpi_video_flags: -================= + 4 2 30 + +That is, suspend accounting if free space drops below 2%; resume it +if it increases to at least 4%; consider information about amount of +free space valid for 30 seconds. -flags -See Doc*/kernel/power/video.txt, it allows mode of video boot to be -set during run time. +acpi_video_flags +================ +See :doc:`/power/video`. This allows the video resume mode to be set, +in a similar fashion to the ``acpi_sleep`` kernel parameter, by +combining the following values: + += ======= +1 s3_bios +2 s3_mode +4 s3_beep += ======= -auto_msgmni: -============ + +auto_msgmni +=========== This variable has no effect and may be removed in future kernel releases. Reading it always returns 0. -Up to Linux 3.17, it enabled/disabled automatic recomputing of msgmni -upon memory add/remove or upon ipc namespace creation/removal. +Up to Linux 3.17, it enabled/disabled automatic recomputing of +`msgmni`_ +upon memory add/remove or upon IPC namespace creation/removal. Echoing "1" into this file enabled msgmni automatic recomputing. -Echoing "0" turned it off. auto_msgmni default value was 1. - +Echoing "0" turned it off. The default value was 1. -bootloader_type: -================ -x86 bootloader identification +bootloader_type (x86 only) +========================== This gives the bootloader type number as indicated by the bootloader, shifted left by 4, and OR'd with the low four bits of the bootloader version. The reason for this encoding is that this used to match the -type_of_loader field in the kernel header; the encoding is kept for +``type_of_loader`` field in the kernel header; the encoding is kept for backwards compatibility. That is, if the full bootloader type number is 0x15 and the full version number is 0x234, this file will contain the value 340 = 0x154. -See the type_of_loader and ext_loader_type fields in -Documentation/x86/boot.rst for additional information. - +See the ``type_of_loader`` and ``ext_loader_type`` fields in +:doc:`/x86/boot` for additional information. -bootloader_version: -=================== -x86 bootloader version +bootloader_version (x86 only) +============================= The complete bootloader version number. In the example above, this file will contain the value 564 = 0x234. -See the type_of_loader and ext_loader_ver fields in -Documentation/x86/boot.rst for additional information. +See the ``type_of_loader`` and ``ext_loader_ver`` fields in +:doc:`/x86/boot` for additional information. -cap_last_cap: -============= +cap_last_cap +============ Highest valid capability of the running kernel. Exports -CAP_LAST_CAP from the kernel. +``CAP_LAST_CAP`` from the kernel. -core_pattern: -============= +core_pattern +============ -core_pattern is used to specify a core dumpfile pattern name. +``core_pattern`` is used to specify a core dumpfile pattern name. * max length 127 characters; default value is "core" -* core_pattern is used as a pattern template for the output filename; - certain string patterns (beginning with '%') are substituted with - their actual values. -* backward compatibility with core_uses_pid: +* ``core_pattern`` is used as a pattern template for the output + filename; certain string patterns (beginning with '%') are + substituted with their actual values. +* backward compatibility with ``core_uses_pid``: - If core_pattern does not include "%p" (default does not) - and core_uses_pid is set, then .PID will be appended to + If ``core_pattern`` does not include "%p" (default does not) + and ``core_uses_pid`` is set, then .PID will be appended to the filename. -* corename format specifiers:: - - % '%' is dropped - %% output one '%' - %p pid - %P global pid (init PID namespace) - %i tid - %I global tid (init PID namespace) - %u uid (in initial user namespace) - %g gid (in initial user namespace) - %d dump mode, matches PR_SET_DUMPABLE and - /proc/sys/fs/suid_dumpable - %s signal number - %t UNIX time of dump - %h hostname - %e executable filename (may be shortened) - %E executable path - % both are dropped +* corename format specifiers + + ======== ========================================== + % '%' is dropped + %% output one '%' + %p pid + %P global pid (init PID namespace) + %i tid + %I global tid (init PID namespace) + %u uid (in initial user namespace) + %g gid (in initial user namespace) + %d dump mode, matches ``PR_SET_DUMPABLE`` and + ``/proc/sys/fs/suid_dumpable`` + %s signal number + %t UNIX time of dump + %h hostname + %e executable filename (may be shortened) + %E executable path + %c maximum size of core file by resource limit RLIMIT_CORE + % both are dropped + ======== ========================================== * If the first character of the pattern is a '|', the kernel will treat the rest of the pattern as a command to run. The core dump will be written to the standard input of that program instead of to a file. -core_pipe_limit: -================ +core_pipe_limit +=============== -This sysctl is only applicable when core_pattern is configured to pipe -core files to a user space helper (when the first character of -core_pattern is a '|', see above). When collecting cores via a pipe -to an application, it is occasionally useful for the collecting -application to gather data about the crashing process from its -/proc/pid directory. In order to do this safely, the kernel must wait -for the collecting process to exit, so as not to remove the crashing -processes proc files prematurely. This in turn creates the -possibility that a misbehaving userspace collecting process can block -the reaping of a crashed process simply by never exiting. This sysctl -defends against that. It defines how many concurrent crashing -processes may be piped to user space applications in parallel. If -this value is exceeded, then those crashing processes above that value -are noted via the kernel log and their cores are skipped. 0 is a -special value, indicating that unlimited processes may be captured in -parallel, but that no waiting will take place (i.e. the collecting -process is not guaranteed access to /proc//). This -value defaults to 0. - - -core_uses_pid: -============== +This sysctl is only applicable when `core_pattern`_ is configured to +pipe core files to a user space helper (when the first character of +``core_pattern`` is a '|', see above). +When collecting cores via a pipe to an application, it is occasionally +useful for the collecting application to gather data about the +crashing process from its ``/proc/pid`` directory. +In order to do this safely, the kernel must wait for the collecting +process to exit, so as not to remove the crashing processes proc files +prematurely. +This in turn creates the possibility that a misbehaving userspace +collecting process can block the reaping of a crashed process simply +by never exiting. +This sysctl defends against that. +It defines how many concurrent crashing processes may be piped to user +space applications in parallel. +If this value is exceeded, then those crashing processes above that +value are noted via the kernel log and their cores are skipped. +0 is a special value, indicating that unlimited processes may be +captured in parallel, but that no waiting will take place (i.e. the +collecting process is not guaranteed access to ``/proc//``). +This value defaults to 0. + + +core_uses_pid +============= The default coredump filename is "core". By setting -core_uses_pid to 1, the coredump filename becomes core.PID. -If core_pattern does not include "%p" (default does not) -and core_uses_pid is set, then .PID will be appended to +``core_uses_pid`` to 1, the coredump filename becomes core.PID. +If `core_pattern`_ does not include "%p" (default does not) +and ``core_uses_pid`` is set, then .PID will be appended to the filename. -ctrl-alt-del: -============= +ctrl-alt-del +============ When the value in this file is 0, ctrl-alt-del is trapped and -sent to the init(1) program to handle a graceful restart. +sent to the ``init(1)`` program to handle a graceful restart. When, however, the value is > 0, Linux's reaction to a Vulcan Nerve Pinch (tm) will be an immediate reboot, without even syncing its dirty buffers. @@ -269,21 +204,22 @@ Note: to decide what to do with it. -dmesg_restrict: -=============== +dmesg_restrict +============== This toggle indicates whether unprivileged users are prevented -from using dmesg(8) to view messages from the kernel's log buffer. -When dmesg_restrict is set to (0) there are no restrictions. When -dmesg_restrict is set set to (1), users must have CAP_SYSLOG to use -dmesg(8). +from using ``dmesg(8)`` to view messages from the kernel's log +buffer. +When ``dmesg_restrict`` is set to 0 there are no restrictions. +When ``dmesg_restrict`` is set set to 1, users must have +``CAP_SYSLOG`` to use ``dmesg(8)``. -The kernel config option CONFIG_SECURITY_DMESG_RESTRICT sets the -default value of dmesg_restrict. +The kernel config option ``CONFIG_SECURITY_DMESG_RESTRICT`` sets the +default value of ``dmesg_restrict``. -domainname & hostname: -====================== +domainname & hostname +===================== These files can be used to set the NIS/YP domainname and the hostname of your box in exactly the same way as the commands @@ -302,167 +238,219 @@ hostname "darkstar" and DNS (Internet Domain Name Server) domainname "frop.org", not to be confused with the NIS (Network Information Service) or YP (Yellow Pages) domainname. These two domain names are in general different. For a detailed discussion -see the hostname(1) man page. +see the ``hostname(1)`` man page. -hardlockup_all_cpu_backtrace: -============================= +hardlockup_all_cpu_backtrace +============================ This value controls the hard lockup detector behavior when a hard lockup condition is detected as to whether or not to gather further debug information. If enabled, arch-specific all-CPU stack dumping will be initiated. -0: do nothing. This is the default behavior. - -1: on detection capture more debug information. += ============================================ +0 Do nothing. This is the default behavior. +1 On detection capture more debug information. += ============================================ -hardlockup_panic: -================= +hardlockup_panic +================ This parameter can be used to control whether the kernel panics when a hard lockup is detected. - 0 - don't panic on hard lockup - 1 - panic on hard lockup += =========================== +0 Don't panic on hard lockup. +1 Panic on hard lockup. += =========================== -See Documentation/admin-guide/lockup-watchdogs.rst for more information. This can -also be set using the nmi_watchdog kernel parameter. +See :doc:`/admin-guide/lockup-watchdogs` for more information. +This can also be set using the nmi_watchdog kernel parameter. -hotplug: -======== +hotplug +======= Path for the hotplug policy agent. -Default value is "/sbin/hotplug". +Default value is "``/sbin/hotplug``". -hung_task_panic: -================ +hung_task_panic +=============== Controls the kernel's behavior when a hung task is detected. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. - -0: continue operation. This is the default behavior. +This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled. -1: panic immediately. += ================================================= +0 Continue operation. This is the default behavior. +1 Panic immediately. += ================================================= -hung_task_check_count: -====================== +hung_task_check_count +===================== The upper bound on the number of tasks that are checked. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. +This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled. -hung_task_timeout_secs: -======================= +hung_task_timeout_secs +====================== When a task in D state did not get scheduled for more than this value report a warning. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. +This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled. -0: means infinite timeout - no checking done. +0 means infinite timeout, no checking is done. -Possible values to set are in range {0..LONG_MAX/HZ}. +Possible values to set are in range {0:``LONG_MAX``/``HZ``}. -hung_task_check_interval_secs: -============================== +hung_task_check_interval_secs +============================= Hung task check interval. If hung task checking is enabled -(see hung_task_timeout_secs), the check is done every -hung_task_check_interval_secs seconds. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. +(see `hung_task_timeout_secs`_), the check is done every +``hung_task_check_interval_secs`` seconds. +This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled. -0 (default): means use hung_task_timeout_secs as checking interval. -Possible values to set are in range {0..LONG_MAX/HZ}. +0 (default) means use ``hung_task_timeout_secs`` as checking +interval. +Possible values to set are in range {0:``LONG_MAX``/``HZ``}. -hung_task_warnings: -=================== + +hung_task_warnings +================== The maximum number of warnings to report. During a check interval if a hung task is detected, this value is decreased by 1. When this value reaches 0, no more warnings will be reported. -This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. +This file shows up if ``CONFIG_DETECT_HUNG_TASK`` is enabled. -1: report an infinite number of warnings. -hyperv_record_panic_msg: -======================== +hyperv_record_panic_msg +======================= Controls whether the panic kmsg data should be reported to Hyper-V. -0: do not report panic kmsg data. += ========================================================= +0 Do not report panic kmsg data. +1 Report the panic kmsg data. This is the default behavior. += ========================================================= -1: report the panic kmsg data. This is the default behavior. +kexec_load_disabled +=================== -kexec_load_disabled: -==================== - -A toggle indicating if the kexec_load syscall has been disabled. This -value defaults to 0 (false: kexec_load enabled), but can be set to 1 -(true: kexec_load disabled). Once true, kexec can no longer be used, and -the toggle cannot be set back to false. This allows a kexec image to be -loaded before disabling the syscall, allowing a system to set up (and -later use) an image without it being altered. Generally used together -with the "modules_disabled" sysctl. +A toggle indicating if the ``kexec_load`` syscall has been disabled. +This value defaults to 0 (false: ``kexec_load`` enabled), but can be +set to 1 (true: ``kexec_load`` disabled). +Once true, kexec can no longer be used, and the toggle cannot be set +back to false. +This allows a kexec image to be loaded before disabling the syscall, +allowing a system to set up (and later use) an image without it being +altered. +Generally used together with the `modules_disabled`_ sysctl. -kptr_restrict: -============== +kptr_restrict +============= This toggle indicates whether restrictions are placed on -exposing kernel addresses via /proc and other interfaces. - -When kptr_restrict is set to 0 (the default) the address is hashed before -printing. (This is the equivalent to %p.) +exposing kernel addresses via ``/proc`` and other interfaces. + +When ``kptr_restrict`` is set to 0 (the default) the address is hashed +before printing. +(This is the equivalent to %p.) + +When ``kptr_restrict`` is set to 1, kernel pointers printed using the +%pK format specifier will be replaced with 0s unless the user has +``CAP_SYSLOG`` and effective user and group ids are equal to the real +ids. +This is because %pK checks are done at read() time rather than open() +time, so if permissions are elevated between the open() and the read() +(e.g via a setuid binary) then %pK will not leak kernel pointers to +unprivileged users. +Note, this is a temporary solution only. +The correct long-term solution is to do the permission checks at +open() time. +Consider removing world read permissions from files that use %pK, and +using `dmesg_restrict`_ to protect against uses of %pK in ``dmesg(8)`` +if leaking kernel pointer values to unprivileged users is a concern. + +When ``kptr_restrict`` is set to 2, kernel pointers printed using +%pK will be replaced with 0s regardless of privileges. + + +modprobe +======== -When kptr_restrict is set to (1), kernel pointers printed using the %pK -format specifier will be replaced with 0's unless the user has CAP_SYSLOG -and effective user and group ids are equal to the real ids. This is -because %pK checks are done at read() time rather than open() time, so -if permissions are elevated between the open() and the read() (e.g via -a setuid binary) then %pK will not leak kernel pointers to unprivileged -users. Note, this is a temporary solution only. The correct long-term -solution is to do the permission checks at open() time. Consider removing -world read permissions from files that use %pK, and using dmesg_restrict -to protect against uses of %pK in dmesg(8) if leaking kernel pointer -values to unprivileged users is a concern. +The full path to the usermode helper for autoloading kernel modules, +by default "/sbin/modprobe". This binary is executed when the kernel +requests a module. For example, if userspace passes an unknown +filesystem type to mount(), then the kernel will automatically request +the corresponding filesystem module by executing this usermode helper. +This usermode helper should insert the needed module into the kernel. -When kptr_restrict is set to (2), kernel pointers printed using -%pK will be replaced with 0's regardless of privileges. +This sysctl only affects module autoloading. It has no effect on the +ability to explicitly insert modules. +This sysctl can be used to debug module loading requests:: -l2cr: (PPC only) -================ + echo '#! /bin/sh' > /tmp/modprobe + echo 'echo "$@" >> /tmp/modprobe.log' >> /tmp/modprobe + echo 'exec /sbin/modprobe "$@"' >> /tmp/modprobe + chmod a+x /tmp/modprobe + echo /tmp/modprobe > /proc/sys/kernel/modprobe -This flag controls the L2 cache of G3 processor boards. If -0, the cache is disabled. Enabled if nonzero. +Alternatively, if this sysctl is set to the empty string, then module +autoloading is completely disabled. The kernel will not try to +execute a usermode helper at all, nor will it call the +kernel_module_request LSM hook. +If CONFIG_STATIC_USERMODEHELPER=y is set in the kernel configuration, +then the configured static usermode helper overrides this sysctl, +except that the empty string is still accepted to completely disable +module autoloading as described above. -modules_disabled: -================= +modules_disabled +================ A toggle value indicating if modules are allowed to be loaded in an otherwise modular kernel. This toggle defaults to off (0), but can be set true (1). Once true, modules can be neither loaded nor unloaded, and the toggle cannot be set back -to false. Generally used with the "kexec_load_disabled" toggle. +to false. Generally used with the `kexec_load_disabled`_ toggle. + + +.. _msgmni: + +msgmax, msgmnb, and msgmni +========================== + +``msgmax`` is the maximum size of an IPC message, in bytes. 8192 by +default (``MSGMAX``). +``msgmnb`` is the maximum size of an IPC queue, in bytes. 16384 by +default (``MSGMNB``). -msg_next_id, sem_next_id, and shm_next_id: -========================================== +``msgmni`` is the maximum number of IPC queues. 32000 by default +(``MSGMNI``). + + +msg_next_id, sem_next_id, and shm_next_id (System V IPC) +======================================================== These three toggles allows to specify desired id for next allocated IPC object: message, semaphore or shared memory respectively. By default they are equal to -1, which means generic allocation logic. -Possible values to set are in range {0..INT_MAX}. +Possible values to set are in range {0:``INT_MAX``}. Notes: 1) kernel doesn't guarantee, that new object will have desired id. So, @@ -471,16 +459,16 @@ Notes: successful IPC object allocation. If an IPC object allocation syscall fails, it is undefined if the value remains unmodified or is reset to -1. - -nmi_watchdog: -============= +nmi_watchdog +============ This parameter can be used to control the NMI watchdog (i.e. the hard lockup detector) on x86 systems. -0 - disable the hard lockup detector - -1 - enable the hard lockup detector += ================================= +0 Disable the hard lockup detector. +1 Enable the hard lockup detector. += ================================= The hard lockup detector monitors each CPU for its ability to respond to timer interrupts. The mechanism utilizes CPU performance counter registers @@ -492,11 +480,11 @@ in a KVM virtual machine. This default can be overridden by adding:: nmi_watchdog=1 -to the guest kernel command line (see Documentation/admin-guide/kernel-parameters.rst). +to the guest kernel command line (see :doc:`/admin-guide/kernel-parameters`). -numa_balancing: -=============== +numa_balancing +============== Enables/disables automatic page fault based NUMA memory balancing. Memory is moved automatically to nodes @@ -514,9 +502,10 @@ ideally is offset by improved memory locality but there is no universal guarantee. If the target workload is already bound to NUMA nodes then this feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting -faults may be controlled by the numa_balancing_scan_period_min_ms, +faults may be controlled by the `numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, -numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls. +numa_balancing_scan_size_mb`_, and numa_balancing_settle_count sysctls. + numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, numa_balancing_scan_size_mb =============================================================================================================================== @@ -542,23 +531,23 @@ workload pattern changes and minimises performance impact due to remote memory accesses. These sysctls control the thresholds for scan delays and the number of pages scanned. -numa_balancing_scan_period_min_ms is the minimum time in milliseconds to +``numa_balancing_scan_period_min_ms`` is the minimum time in milliseconds to scan a tasks virtual memory. It effectively controls the maximum scanning rate for each task. -numa_balancing_scan_delay_ms is the starting "scan delay" used for a task +``numa_balancing_scan_delay_ms`` is the starting "scan delay" used for a task when it initially forks. -numa_balancing_scan_period_max_ms is the maximum time in milliseconds to +``numa_balancing_scan_period_max_ms`` is the maximum time in milliseconds to scan a tasks virtual memory. It effectively controls the minimum scanning rate for each task. -numa_balancing_scan_size_mb is how many megabytes worth of pages are +``numa_balancing_scan_size_mb`` is how many megabytes worth of pages are scanned for a given scan. -osrelease, ostype & version: -============================ +osrelease, ostype & version +=========================== :: @@ -569,15 +558,16 @@ osrelease, ostype & version: # cat version #5 Wed Feb 25 21:49:24 MET 1998 -The files osrelease and ostype should be clear enough. Version +The files ``osrelease`` and ``ostype`` should be clear enough. +``version`` needs a little more clarification however. The '#5' means that this is the fifth kernel built from this source base and the date behind it indicates the time the kernel was built. The only way to tune these values is to rebuild the kernel :-) -overflowgid & overflowuid: -========================== +overflowgid & overflowuid +========================= if your architecture did not always support 32-bit UIDs (i.e. arm, i386, m68k, sh, and sparc32), a fixed UID and GID will be returned to @@ -588,108 +578,119 @@ These sysctls allow you to change the value of the fixed UID and GID. The default is 65534. +panic +===== + +The value in this file determines the behaviour of the kernel on a panic: -====== -The value in this file represents the number of seconds the kernel -waits before rebooting on a panic. When you use the software watchdog, -the recommended setting is 60. +* if zero, the kernel will loop forever; +* if negative, the kernel will reboot immediately; +* if positive, the kernel will reboot after the corresponding number + of seconds. +When you use the software watchdog, the recommended setting is 60. -panic_on_io_nmi: -================ + +panic_on_io_nmi +=============== Controls the kernel's behavior when a CPU receives an NMI caused by an IO error. -0: try to continue operation (default) - -1: panic immediately. The IO error triggered an NMI. This indicates a - serious system condition which could result in IO data corruption. - Rather than continuing, panicking might be a better choice. Some - servers issue this sort of NMI when the dump button is pushed, - and you can use this option to take a crash dump. += ================================================================== +0 Try to continue operation (default). +1 Panic immediately. The IO error triggered an NMI. This indicates a + serious system condition which could result in IO data corruption. + Rather than continuing, panicking might be a better choice. Some + servers issue this sort of NMI when the dump button is pushed, + and you can use this option to take a crash dump. += ================================================================== -panic_on_oops: -============== +panic_on_oops +============= Controls the kernel's behaviour when an oops or BUG is encountered. -0: try to continue operation - -1: panic immediately. If the `panic` sysctl is also non-zero then the - machine will be rebooted. += =================================================================== +0 Try to continue operation. +1 Panic immediately. If the `panic` sysctl is also non-zero then the + machine will be rebooted. += =================================================================== -panic_on_stackoverflow: -======================= +panic_on_stackoverflow +====================== Controls the kernel's behavior when detecting the overflows of kernel, IRQ and exception stacks except a user stack. -This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled. - -0: try to continue operation. +This file shows up if ``CONFIG_DEBUG_STACKOVERFLOW`` is enabled. -1: panic immediately. += ========================== +0 Try to continue operation. +1 Panic immediately. += ========================== -panic_on_unrecovered_nmi: -========================= +panic_on_unrecovered_nmi +======================== The default Linux behaviour on an NMI of either memory or unknown is to continue operation. For many environments such as scientific computing it is preferable that the box is taken out and the error dealt with than an uncorrected parity/ECC error get propagated. -A small number of systems do generate NMI's for bizarre random reasons +A small number of systems do generate NMIs for bizarre random reasons such as power management so the default is off. That sysctl works like the existing panic controls already in that directory. -panic_on_warn: -============== +panic_on_warn +============= Calls panic() in the WARN() path when set to 1. This is useful to avoid a kernel rebuild when attempting to kdump at the location of a WARN(). -0: only WARN(), default behaviour. - -1: call panic() after printing out WARN() location. += ================================================ +0 Only WARN(), default behaviour. +1 Call panic() after printing out WARN() location. += ================================================ -panic_print: -============ +panic_print +=========== Bitmask for printing system info when panic happens. User can chose combination of the following bits: -===== ======================================== +===== ============================================ bit 0 print all tasks info bit 1 print system memory info bit 2 print timer info -bit 3 print locks info if CONFIG_LOCKDEP is on +bit 3 print locks info if ``CONFIG_LOCKDEP`` is on bit 4 print ftrace buffer -===== ======================================== +===== ============================================ So for example to print tasks and memory info on panic, user can:: echo 3 > /proc/sys/kernel/panic_print -panic_on_rcu_stall: -=================== +panic_on_rcu_stall +================== When set to 1, calls panic() after RCU stall detection messages. This is useful to define the root cause of RCU stalls using a vmcore. -0: do not panic() when RCU stall takes place, default behavior. += ============================================================ +0 Do not panic() when RCU stall takes place, default behavior. +1 panic() after printing RCU stall messages. += ============================================================ -1: panic() after printing RCU stall messages. - -perf_cpu_time_max_percent: -========================== +perf_cpu_time_max_percent +========================= Hints to the kernel how much CPU time it should be allowed to use to handle perf sampling events. If the perf subsystem @@ -702,171 +703,179 @@ unexpectedly take too long to execute, the NMIs can become stacked up next to each other so much that nothing else is allowed to execute. -0: - disable the mechanism. Do not monitor or correct perf's - sampling rate no matter how CPU time it takes. +===== ======================================================== +0 Disable the mechanism. Do not monitor or correct perf's + sampling rate no matter how CPU time it takes. -1-100: - attempt to throttle perf's sample rate to this - percentage of CPU. Note: the kernel calculates an - "expected" length of each sample event. 100 here means - 100% of that expected length. Even if this is set to - 100, you may still see sample throttling if this - length is exceeded. Set to 0 if you truly do not care - how much CPU is consumed. +1-100 Attempt to throttle perf's sample rate to this + percentage of CPU. Note: the kernel calculates an + "expected" length of each sample event. 100 here means + 100% of that expected length. Even if this is set to + 100, you may still see sample throttling if this + length is exceeded. Set to 0 if you truly do not care + how much CPU is consumed. +===== ======================================================== -perf_event_paranoid: -==================== +perf_event_paranoid +=================== Controls use of the performance events system by unprivileged users (without CAP_SYS_ADMIN). The default value is 2. === ================================================================== - -1 Allow use of (almost) all events by all users + -1 Allow use of (almost) all events by all users. - Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK + Ignore mlock limit after perf_event_mlock_kb without + ``CAP_IPC_LOCK``. ->=0 Disallow ftrace function tracepoint by users without CAP_SYS_ADMIN +>=0 Disallow ftrace function tracepoint by users without + ``CAP_SYS_ADMIN``. - Disallow raw tracepoint access by users without CAP_SYS_ADMIN + Disallow raw tracepoint access by users without ``CAP_SYS_ADMIN``. ->=1 Disallow CPU event access by users without CAP_SYS_ADMIN +>=1 Disallow CPU event access by users without ``CAP_SYS_ADMIN``. ->=2 Disallow kernel profiling by users without CAP_SYS_ADMIN +>=2 Disallow kernel profiling by users without ``CAP_SYS_ADMIN``. === ================================================================== -perf_event_max_stack: -===================== +perf_event_max_stack +==================== -Controls maximum number of stack frames to copy for (attr.sample_type & -PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using -'perf record -g' or 'perf trace --call-graph fp'. +Controls maximum number of stack frames to copy for (``attr.sample_type & +PERF_SAMPLE_CALLCHAIN``) configured events, for instance, when using +'``perf record -g``' or '``perf trace --call-graph fp``'. This can only be done when no events are in use that have callchains -enabled, otherwise writing to this file will return -EBUSY. +enabled, otherwise writing to this file will return ``-EBUSY``. The default value is 127. -perf_event_mlock_kb: -==================== +perf_event_mlock_kb +=================== Control size of per-cpu ring buffer not counted agains mlock limit. The default value is 512 + 1 page -perf_event_max_contexts_per_stack: -================================== +perf_event_max_contexts_per_stack +================================= Controls maximum number of stack frame context entries for -(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for -instance, when using 'perf record -g' or 'perf trace --call-graph fp'. +(``attr.sample_type & PERF_SAMPLE_CALLCHAIN``) configured events, for +instance, when using '``perf record -g``' or '``perf trace --call-graph fp``'. This can only be done when no events are in use that have callchains -enabled, otherwise writing to this file will return -EBUSY. +enabled, otherwise writing to this file will return ``-EBUSY``. The default value is 8. -pid_max: -======== +pid_max +======= PID allocation wrap value. When the kernel's next PID value reaches this value, it wraps back to a minimum PID value. -PIDs of value pid_max or larger are not allocated. +PIDs of value ``pid_max`` or larger are not allocated. -ns_last_pid: -============ +ns_last_pid +=========== The last pid allocated in the current (the one task using this sysctl lives in) pid namespace. When selecting a pid for a next task on fork kernel tries to allocate a number starting from this one. -powersave-nap: (PPC only) -========================= +powersave-nap (PPC only) +======================== If set, Linux-PPC will use the 'nap' mode of powersaving, otherwise the 'doze' mode will be used. + ============================================================== -printk: -======= +printk +====== -The four values in printk denote: console_loglevel, -default_message_loglevel, minimum_console_loglevel and -default_console_loglevel respectively. +The four values in printk denote: ``console_loglevel``, +``default_message_loglevel``, ``minimum_console_loglevel`` and +``default_console_loglevel`` respectively. These values influence printk() behavior when printing or -logging error messages. See 'man 2 syslog' for more info on +logging error messages. See '``man 2 syslog``' for more info on the different loglevels. -- console_loglevel: - messages with a higher priority than - this will be printed to the console -- default_message_loglevel: - messages without an explicit priority - will be printed with this priority -- minimum_console_loglevel: - minimum (highest) value to which - console_loglevel can be set -- default_console_loglevel: - default value for console_loglevel +======================== ===================================== +console_loglevel messages with a higher priority than + this will be printed to the console +default_message_loglevel messages without an explicit priority + will be printed with this priority +minimum_console_loglevel minimum (highest) value to which + console_loglevel can be set +default_console_loglevel default value for console_loglevel +======================== ===================================== -printk_delay: -============= +printk_delay +============ -Delay each printk message in printk_delay milliseconds +Delay each printk message in ``printk_delay`` milliseconds Value from 0 - 10000 is allowed. -printk_ratelimit: -================= +printk_ratelimit +================ -Some warning messages are rate limited. printk_ratelimit specifies +Some warning messages are rate limited. ``printk_ratelimit`` specifies the minimum length of time between these messages (in seconds). The default value is 5 seconds. A value of 0 will disable rate limiting. -printk_ratelimit_burst: -======================= +printk_ratelimit_burst +====================== -While long term we enforce one message per printk_ratelimit +While long term we enforce one message per `printk_ratelimit`_ seconds, we do allow a burst of messages to pass through. -printk_ratelimit_burst specifies the number of messages we can +``printk_ratelimit_burst`` specifies the number of messages we can send before ratelimiting kicks in. The default value is 10 messages. -printk_devkmsg: -=============== - -Control the logging to /dev/kmsg from userspace: - -ratelimit: - default, ratelimited +printk_devkmsg +============== -on: unlimited logging to /dev/kmsg from userspace +Control the logging to ``/dev/kmsg`` from userspace: -off: logging to /dev/kmsg disabled +========= ============================================= +ratelimit default, ratelimited +on unlimited logging to /dev/kmsg from userspace +off logging to /dev/kmsg disabled +========= ============================================= -The kernel command line parameter printk.devkmsg= overrides this and is +The kernel command line parameter ``printk.devkmsg=`` overrides this and is a one-time setting until next reboot: once set, it cannot be changed by this sysctl interface anymore. +============================================================== -randomize_va_space: -=================== + +pty +=== + +See Documentation/filesystems/devpts.txt. + + +randomize_va_space +================== This option can be used to select the type of process address space randomization that is used in the system, for architectures @@ -881,10 +890,10 @@ that support this feature. This, among other things, implies that shared libraries will be loaded to random addresses. Also for PIE-linked binaries, the location of code start is randomized. This is the default if the - CONFIG_COMPAT_BRK option is enabled. + ``CONFIG_COMPAT_BRK`` option is enabled. 2 Additionally enable heap randomization. This is the default if - CONFIG_COMPAT_BRK is disabled. + ``CONFIG_COMPAT_BRK`` is disabled. There are a few legacy applications out there (such as some ancient versions of libc.so.5 from 1996) that assume that brk area starts @@ -894,31 +903,27 @@ that support this feature. systems it is safe to choose full randomization. Systems with ancient and/or broken binaries should be configured - with CONFIG_COMPAT_BRK enabled, which excludes the heap from process + with ``CONFIG_COMPAT_BRK`` enabled, which excludes the heap from process address space randomization. == =========================================================================== -reboot-cmd: (Sparc only) -======================== - -??? This seems to be a way to give an argument to the Sparc -ROM/Flash boot loader. Maybe to tell it what to do after -rebooting. ??? +real-root-dev +============= +See :doc:`/admin-guide/initrd`. -rtsig-max & rtsig-nr: -===================== -The file rtsig-max can be used to tune the maximum number -of POSIX realtime (queued) signals that can be outstanding -in the system. +reboot-cmd (SPARC only) +======================= -rtsig-nr shows the number of RT signals currently queued. +??? This seems to be a way to give an argument to the Sparc +ROM/Flash boot loader. Maybe to tell it what to do after +rebooting. ??? -sched_energy_aware: -=================== +sched_energy_aware +================== Enables/disables Energy Aware Scheduling (EAS). EAS starts automatically on platforms where it can run (that is, @@ -928,75 +933,88 @@ requirements for EAS but you do not want to use it, change this value to 0. -sched_schedstats: -================= +sched_schedstats +================ Enables/disables scheduler statistics. Enabling this feature incurs a small amount of overhead in the scheduler but is useful for debugging and performance tuning. -sg-big-buff: -============ +seccomp +======= + +See :doc:`/userspace-api/seccomp_filter`. + + +sg-big-buff +=========== This file shows the size of the generic SCSI (sg) buffer. You can't tune it just yet, but you could change it on -compile time by editing include/scsi/sg.h and changing -the value of SG_BIG_BUFF. +compile time by editing ``include/scsi/sg.h`` and changing +the value of ``SG_BIG_BUFF``. There shouldn't be any reason to change this value. If you can come up with one, you probably know what you are doing anyway :) -shmall: -======= +shmall +====== This parameter sets the total amount of shared memory pages that -can be used system wide. Hence, SHMALL should always be at least -ceil(shmmax/PAGE_SIZE). +can be used system wide. Hence, ``shmall`` should always be at least +``ceil(shmmax/PAGE_SIZE)``. -If you are not sure what the default PAGE_SIZE is on your Linux -system, you can run the following command: +If you are not sure what the default ``PAGE_SIZE`` is on your Linux +system, you can run the following command:: # getconf PAGE_SIZE -shmmax: -======= +shmmax +====== This value can be used to query and set the run time limit on the maximum shared memory segment size that can be created. Shared memory segments up to 1Gb are now supported in the -kernel. This value defaults to SHMMAX. +kernel. This value defaults to ``SHMMAX``. -shm_rmid_forced: -================ +shmmni +====== + +This value determines the maximum number of shared memory segments. +4096 by default (``SHMMNI``). + + +shm_rmid_forced +=============== Linux lets you set resource limits, including how much memory one -process can consume, via setrlimit(2). Unfortunately, shared memory +process can consume, via ``setrlimit(2)``. Unfortunately, shared memory segments are allowed to exist without association with any process, and thus might not be counted against any resource limits. If enabled, shared memory segments are automatically destroyed when their attach count becomes zero after a detach or a process termination. It will also destroy segments that were created, but never attached to, on exit -from the process. The only use left for IPC_RMID is to immediately +from the process. The only use left for ``IPC_RMID`` is to immediately destroy an unattached segment. Of course, this breaks the way things are defined, so some applications might stop working. Note that this feature will do you no good unless you also configure your resource -limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't +limits (in particular, ``RLIMIT_AS`` and ``RLIMIT_NPROC``). Most systems don't need this. Note that if you change this from 0 to 1, already created segments without users and with a dead originative process will be destroyed. -sysctl_writes_strict: -===================== +sysctl_writes_strict +==================== Control how file position affects the behavior of updating sysctl values -via the /proc/sys interface: +via the ``/proc/sys`` interface: == ====================================================================== -1 Legacy per-write sysctl value handling, with no printk warnings. @@ -1013,8 +1031,8 @@ via the /proc/sys interface: == ====================================================================== -softlockup_all_cpu_backtrace: -============================= +softlockup_all_cpu_backtrace +============================ This value controls the soft lockup detector thread's behavior when a soft lockup condition is detected as to whether or not @@ -1024,43 +1042,80 @@ be issued an NMI and instructed to capture stack trace. This feature is only applicable for architectures which support NMI. -0: do nothing. This is the default behavior. += ============================================ +0 Do nothing. This is the default behavior. +1 On detection capture more debug information. += ============================================ -1: on detection capture more debug information. +softlockup_panic +================= -soft_watchdog: -============== +This parameter can be used to control whether the kernel panics +when a soft lockup is detected. -This parameter can be used to control the soft lockup detector. += ============================================ +0 Don't panic on soft lockup. +1 Panic on soft lockup. += ============================================ - 0 - disable the soft lockup detector +This can also be set using the softlockup_panic kernel parameter. - 1 - enable the soft lockup detector + +soft_watchdog +============= + +This parameter can be used to control the soft lockup detector. + += ================================= +0 Disable the soft lockup detector. +1 Enable the soft lockup detector. += ================================= The soft lockup detector monitors CPUs for threads that are hogging the CPUs without rescheduling voluntarily, and thus prevent the 'watchdog/N' threads from running. The mechanism depends on the CPUs ability to respond to timer interrupts which are needed for the 'watchdog/N' threads to be woken up by -the watchdog timer function, otherwise the NMI watchdog - if enabled - can +the watchdog timer function, otherwise the NMI watchdog — if enabled — can detect a hard lockup condition. -stack_erasing: -============== +stack_erasing +============= This parameter can be used to control kernel stack erasing at the end -of syscalls for kernels built with CONFIG_GCC_PLUGIN_STACKLEAK. +of syscalls for kernels built with ``CONFIG_GCC_PLUGIN_STACKLEAK``. That erasing reduces the information which kernel stack leak bugs can reveal and blocks some uninitialized stack variable attacks. The tradeoff is the performance impact: on a single CPU system kernel compilation sees a 1% slowdown, other systems and workloads may vary. - 0: kernel stack erasing is disabled, STACKLEAK_METRICS are not updated. += ==================================================================== +0 Kernel stack erasing is disabled, STACKLEAK_METRICS are not updated. +1 Kernel stack erasing is enabled (default), it is performed before + returning to the userspace at the end of syscalls. += ==================================================================== + + +stop-a (SPARC only) +=================== + +Controls Stop-A: + += ==================================== +0 Stop-A has no effect. +1 Stop-A breaks to the PROM (default). += ==================================== + +Stop-A is always enabled on a panic, so that the user can return to +the boot PROM. - 1: kernel stack erasing is enabled (default), it is performed before - returning to the userspace at the end of syscalls. + +sysrq +===== + +See :doc:`/admin-guide/sysrq`. tainted @@ -1090,30 +1145,30 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. 131072 `(T)` The kernel was built with the struct randomization plugin ====== ===== ============================================================== -See Documentation/admin-guide/tainted-kernels.rst for more information. +See :doc:`/admin-guide/tainted-kernels` for more information. -threads-max: -============ +threads-max +=========== This value controls the maximum number of threads that can be created -using fork(). +using ``fork()``. During initialization the kernel sets this value such that even if the maximum number of threads is created, the thread structures occupy only a part (1/8th) of the available RAM pages. -The minimum value that can be written to threads-max is 1. +The minimum value that can be written to ``threads-max`` is 1. -The maximum value that can be written to threads-max is given by the -constant FUTEX_TID_MASK (0x3fffffff). +The maximum value that can be written to ``threads-max`` is given by the +constant ``FUTEX_TID_MASK`` (0x3fffffff). -If a value outside of this range is written to threads-max an error -EINVAL occurs. +If a value outside of this range is written to ``threads-max`` an +``EINVAL`` error occurs. -unknown_nmi_panic: -================== +unknown_nmi_panic +================= The value in this file affects behavior of handling NMI. When the value is non-zero, unknown NMI is trapped and then panic occurs. At @@ -1123,37 +1178,39 @@ NMI switch that most IA32 servers have fires unknown NMI up, for example. If a system hangs up, try pressing the NMI switch. -watchdog: -========= +watchdog +======== This parameter can be used to disable or enable the soft lockup detector -_and_ the NMI watchdog (i.e. the hard lockup detector) at the same time. - - 0 - disable both lockup detectors +*and* the NMI watchdog (i.e. the hard lockup detector) at the same time. - 1 - enable both lockup detectors += ============================== +0 Disable both lockup detectors. +1 Enable both lockup detectors. += ============================== The soft lockup detector and the NMI watchdog can also be disabled or -enabled individually, using the soft_watchdog and nmi_watchdog parameters. -If the watchdog parameter is read, for example by executing:: +enabled individually, using the ``soft_watchdog`` and ``nmi_watchdog`` +parameters. +If the ``watchdog`` parameter is read, for example by executing:: cat /proc/sys/kernel/watchdog -the output of this command (0 or 1) shows the logical OR of soft_watchdog -and nmi_watchdog. +the output of this command (0 or 1) shows the logical OR of +``soft_watchdog`` and ``nmi_watchdog``. -watchdog_cpumask: -================= +watchdog_cpumask +================ This value can be used to control on which cpus the watchdog may run. -The default cpumask is all possible cores, but if NO_HZ_FULL is +The default cpumask is all possible cores, but if ``NO_HZ_FULL`` is enabled in the kernel config, and cores are specified with the -nohz_full= boot argument, those cores are excluded by default. +``nohz_full=`` boot argument, those cores are excluded by default. Offline cores can be included in this mask, and if the core is later brought online, the watchdog will be started based on the mask value. -Typically this value would only be touched in the nohz_full case +Typically this value would only be touched in the ``nohz_full`` case to re-enable cores that by default were not running the watchdog, if a kernel lockup was suspected on those cores. @@ -1164,12 +1221,12 @@ might say:: echo 0,2-4 > /proc/sys/kernel/watchdog_cpumask -watchdog_thresh: -================ +watchdog_thresh +=============== This value can be used to control the frequency of hrtimer and NMI events and the soft and hard lockup thresholds. The default threshold is 10 seconds. -The softlockup threshold is (2 * watchdog_thresh). Setting this +The softlockup threshold is (``2 * watchdog_thresh``). Setting this tunable to zero will disable lockup detection altogether. diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 287b98708a40915a80ba76455621b507cce161bd..e043c9213388e2e9860ca80bcef1fc50a3609ffa 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -67,7 +67,8 @@ two flavors of JITs, the newer eBPF JIT currently supported on: - sparc64 - mips64 - s390x - - riscv + - riscv64 + - riscv32 And the older cBPF JIT supported on the following archs: diff --git a/Documentation/admin-guide/sysctl/user.rst b/Documentation/admin-guide/sysctl/user.rst index 650eaa03f15eaac3a5e8833da9b70db2c01bff01..c4582458933953e1efaa7dec2f3a85ba44b08bd3 100644 --- a/Documentation/admin-guide/sysctl/user.rst +++ b/Documentation/admin-guide/sysctl/user.rst @@ -65,6 +65,12 @@ max_pid_namespaces The maximum number of pid namespaces that any user in the current user namespace may create. +max_time_namespaces +=================== + + The maximum number of time namespaces that any user in the current + user namespace may create. + max_user_namespaces =================== diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 64aeee1009cab2265d7832cf0694e03b43ce9559..0329a4d3fa9ecf89046d32086372394b070d1681 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -128,6 +128,9 @@ allowed to examine the unevictable lru (mlocked pages) for pages to compact. This should be used on systems where stalls for minor page faults are an acceptable trade for large contiguous free memory. Set to 0 to prevent compaction from moving pages that are unevictable. Default value is 1. +On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due +to compaction, which would block the task from becomming active until the fault +is resolved. dirty_background_bytes diff --git a/Documentation/admin-guide/sysrq.rst b/Documentation/admin-guide/sysrq.rst index 72b2cfb066f455833410d665210d6d164910f2e1..a46209f4636c65929bd3a98a916a52a33459fe48 100644 --- a/Documentation/admin-guide/sysrq.rst +++ b/Documentation/admin-guide/sysrq.rst @@ -48,9 +48,10 @@ always allowed (by a user with admin privileges). How do I use the magic SysRq key? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -On x86 - You press the key combo :kbd:`ALT-SysRq-`. +On x86 + You press the key combo :kbd:`ALT-SysRq-`. -.. note:: + .. note:: Some keyboards may not have a key labeled 'SysRq'. The 'SysRq' key is also known as the 'Print Screen' key. Also some keyboards cannot @@ -58,14 +59,15 @@ On x86 - You press the key combo :kbd:`ALT-SysRq-`. have better luck with press :kbd:`Alt`, press :kbd:`SysRq`, release :kbd:`SysRq`, press :kbd:``, release everything. -On SPARC - You press :kbd:`ALT-STOP-`, I believe. +On SPARC + You press :kbd:`ALT-STOP-`, I believe. On the serial console (PC style standard serial ports only) You send a ``BREAK``, then within 5 seconds a command key. Sending ``BREAK`` twice is interpreted as a normal BREAK. On PowerPC - Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:``, + Press :kbd:`ALT - Print Screen` (or :kbd:`F13`) - :kbd:``. :kbd:`Print Screen` (or :kbd:`F13`) - :kbd:`` may suffice. On other @@ -73,7 +75,7 @@ On other let me know so I can add them to this section. On all - write a character to /proc/sysrq-trigger. e.g.:: + Write a character to /proc/sysrq-trigger. e.g.:: echo t > /proc/sysrq-trigger @@ -282,7 +284,7 @@ Just ask them on the linux-kernel mailing list: Credits ~~~~~~~ -Written by Mydraal -Updated by Adam Sulmicki -Updated by Jeremy M. Dolan 2001/01/28 10:15:59 -Added to by Crutcher Dunnavant +- Written by Mydraal +- Updated by Adam Sulmicki +- Updated by Jeremy M. Dolan 2001/01/28 10:15:59 +- Added to by Crutcher Dunnavant diff --git a/Documentation/admin-guide/thunderbolt.rst b/Documentation/admin-guide/thunderbolt.rst index 898ad78f3cc7daf083c62672bdff94f4c17a69c2..10c4f0ce2ad001729e337f03ca470a2135ae8cbf 100644 --- a/Documentation/admin-guide/thunderbolt.rst +++ b/Documentation/admin-guide/thunderbolt.rst @@ -1,6 +1,28 @@ -============= - Thunderbolt -============= +.. SPDX-License-Identifier: GPL-2.0 + +====================== + USB4 and Thunderbolt +====================== +USB4 is the public specification based on Thunderbolt 3 protocol with +some differences at the register level among other things. Connection +manager is an entity running on the host router (host controller) +responsible for enumerating routers and establishing tunnels. A +connection manager can be implemented either in firmware or software. +Typically PCs come with a firmware connection manager for Thunderbolt 3 +and early USB4 capable systems. Apple systems on the other hand use +software connection manager and the later USB4 compliant devices follow +the suit. + +The Linux Thunderbolt driver supports both and can detect at runtime which +connection manager implementation is to be used. To be on the safe side the +software connection manager in Linux also advertises security level +``user`` which means PCIe tunneling is disabled by default. The +documentation below applies to both implementations with the exception that +the software connection manager only supports ``user`` security level and +is expected to be accompanied with an IOMMU based DMA protection. + +Security levels and how to use them +----------------------------------- The interface presented here is not meant for end users. Instead there should be a userspace tool that handles all the low-level details, keeps a database of the authorized devices and prompts users for new connections. @@ -18,8 +40,6 @@ This will authorize all devices automatically when they appear. However, keep in mind that this bypasses the security levels and makes the system vulnerable to DMA attacks. -Security levels and how to use them ------------------------------------ Starting with Intel Falcon Ridge Thunderbolt controller there are 4 security levels available. Intel Titan Ridge added one more security level (usbonly). The reason for these is the fact that the connected devices can diff --git a/Documentation/arm/microchip.rst b/Documentation/arm/microchip.rst index 1adf53dfc4949d254a2bcdec9cb808f0bd8e815e..05e5f2dfb814bfc4f9afc10898469e5e0316c241 100644 --- a/Documentation/arm/microchip.rst +++ b/Documentation/arm/microchip.rst @@ -92,6 +92,12 @@ the Microchip website: http://www.microchip.com. http://ww1.microchip.com/downloads/en/DeviceDoc/DS60001517A.pdf + - sam9x60 + + * Datasheet + + http://ww1.microchip.com/downloads/en/DeviceDoc/SAM9X60-Data-Sheet-DS60001579A.pdf + * ARM Cortex-A5 based SoCs - sama5d3 family diff --git a/Documentation/arm/tcm.rst b/Documentation/arm/tcm.rst index effd9c7bc968cef23df155e22410325ef10365e1..b256f97838839a98ed86dde227660617b8e283fc 100644 --- a/Documentation/arm/tcm.rst +++ b/Documentation/arm/tcm.rst @@ -4,18 +4,18 @@ ARM TCM (Tightly-Coupled Memory) handling in Linux Written by Linus Walleij -Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory). +Some ARM SoCs have a so-called TCM (Tightly-Coupled Memory). This is usually just a few (4-64) KiB of RAM inside the ARM processor. -Due to being embedded inside the CPU The TCM has a +Due to being embedded inside the CPU, the TCM has a Harvard-architecture, so there is an ITCM (instruction TCM) and a DTCM (data TCM). The DTCM can not contain any instructions, but the ITCM can actually contain data. The size of DTCM or ITCM is minimum 4KiB so the typical minimum configuration is 4KiB ITCM and 4KiB DTCM. -ARM CPU:s have special registers to read out status, physical +ARM CPUs have special registers to read out status, physical location and size of TCM memories. arch/arm/include/asm/cputype.h defines a CPUID_TCM register that you can read out from the system control coprocessor. Documentation from ARM can be found diff --git a/Documentation/arm64/amu.rst b/Documentation/arm64/amu.rst new file mode 100644 index 0000000000000000000000000000000000000000..036783ee327f252726f645d4684b6673f8a4aad9 --- /dev/null +++ b/Documentation/arm64/amu.rst @@ -0,0 +1,117 @@ +======================================================= +Activity Monitors Unit (AMU) extension in AArch64 Linux +======================================================= + +Author: Ionela Voinescu + +Date: 2019-09-10 + +This document briefly describes the provision of Activity Monitors Unit +support in AArch64 Linux. + + +Architecture overview +--------------------- + +The activity monitors extension is an optional extension introduced by the +ARMv8.4 CPU architecture. + +The activity monitors unit, implemented in each CPU, provides performance +counters intended for system management use. The AMU extension provides a +system register interface to the counter registers and also supports an +optional external memory-mapped interface. + +Version 1 of the Activity Monitors architecture implements a counter group +of four fixed and architecturally defined 64-bit event counters. + +- CPU cycle counter: increments at the frequency of the CPU. +- Constant counter: increments at the fixed frequency of the system + clock. +- Instructions retired: increments with every architecturally executed + instruction. +- Memory stall cycles: counts instruction dispatch stall cycles caused by + misses in the last level cache within the clock domain. + +When in WFI or WFE these counters do not increment. + +The Activity Monitors architecture provides space for up to 16 architected +event counters. Future versions of the architecture may use this space to +implement additional architected event counters. + +Additionally, version 1 implements a counter group of up to 16 auxiliary +64-bit event counters. + +On cold reset all counters reset to 0. + + +Basic support +------------- + +The kernel can safely run a mix of CPUs with and without support for the +activity monitors extension. Therefore, when CONFIG_ARM64_AMU_EXTN is +selected we unconditionally enable the capability to allow any late CPU +(secondary or hotplugged) to detect and use the feature. + +When the feature is detected on a CPU, we flag the availability of the +feature but this does not guarantee the correct functionality of the +counters, only the presence of the extension. + +Firmware (code running at higher exception levels, e.g. arm-tf) support is +needed to: + +- Enable access for lower exception levels (EL2 and EL1) to the AMU + registers. +- Enable the counters. If not enabled these will read as 0. +- Save/restore the counters before/after the CPU is being put/brought up + from the 'off' power state. + +When using kernels that have this feature enabled but boot with broken +firmware the user may experience panics or lockups when accessing the +counter registers. Even if these symptoms are not observed, the values +returned by the register reads might not correctly reflect reality. Most +commonly, the counters will read as 0, indicating that they are not +enabled. + +If proper support is not provided in firmware it's best to disable +CONFIG_ARM64_AMU_EXTN. To be noted that for security reasons, this does not +bypass the setting of AMUSERENR_EL0 to trap accesses from EL0 (userspace) to +EL1 (kernel). Therefore, firmware should still ensure accesses to AMU registers +are not trapped in EL2/EL3. + +The fixed counters of AMUv1 are accessible though the following system +register definitions: + +- SYS_AMEVCNTR0_CORE_EL0 +- SYS_AMEVCNTR0_CONST_EL0 +- SYS_AMEVCNTR0_INST_RET_EL0 +- SYS_AMEVCNTR0_MEM_STALL_EL0 + +Auxiliary platform specific counters can be accessed using +SYS_AMEVCNTR1_EL0(n), where n is a value between 0 and 15. + +Details can be found in: arch/arm64/include/asm/sysreg.h. + + +Userspace access +---------------- + +Currently, access from userspace to the AMU registers is disabled due to: + +- Security reasons: they might expose information about code executed in + secure mode. +- Purpose: AMU counters are intended for system management use. + +Also, the presence of the feature is not visible to userspace. + + +Virtualization +-------------- + +Currently, access from userspace (EL0) and kernelspace (EL1) on the KVM +guest side is disabled due to: + +- Security reasons: they might expose information about code executed + by other guests or the host. + +Any attempt to access the AMU registers will result in an UNDEFINED +exception being injected into the guest. diff --git a/Documentation/arm64/booting.rst b/Documentation/arm64/booting.rst index 5d78a6f5b0aed0e495f0618d9408c0edd54f9911..a3f1a47b6f1c54fa94954b456a2cda7bcbdc776b 100644 --- a/Documentation/arm64/booting.rst +++ b/Documentation/arm64/booting.rst @@ -248,6 +248,20 @@ Before jumping into the kernel, the following conditions must be met: - HCR_EL2.APK (bit 40) must be initialised to 0b1 - HCR_EL2.API (bit 41) must be initialised to 0b1 + For CPUs with Activity Monitors Unit v1 (AMUv1) extension present: + - If EL3 is present: + CPTR_EL3.TAM (bit 30) must be initialised to 0b0 + CPTR_EL2.TAM (bit 30) must be initialised to 0b0 + AMCNTENSET0_EL0 must be initialised to 0b1111 + AMCNTENSET1_EL0 must be initialised to a platform specific value + having 0b1 set for the corresponding bit for each of the auxiliary + counters present. + - If the kernel is entered at EL1: + AMCNTENSET0_EL0 must be initialised to 0b1111 + AMCNTENSET1_EL0 must be initialised to a platform specific value + having 0b1 set for the corresponding bit for each of the auxiliary + counters present. + The requirements described above for CPU mode, caches, MMUs, architected timers, coherency and system registers apply to all CPUs. All CPUs must enter the kernel in the same exception level. diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst index b6e44884e3ada435d49f1badd4e2c749a5b6b448..41937a8091aaa2a9c2252fc3f98615a4bbb25436 100644 --- a/Documentation/arm64/cpu-feature-registers.rst +++ b/Documentation/arm64/cpu-feature-registers.rst @@ -117,6 +117,8 @@ infrastructure: +------------------------------+---------+---------+ | Name | bits | visible | +------------------------------+---------+---------+ + | RNDR | [63-60] | y | + +------------------------------+---------+---------+ | TS | [55-52] | y | +------------------------------+---------+---------+ | FHM | [51-48] | y | @@ -200,6 +202,12 @@ infrastructure: +------------------------------+---------+---------+ | Name | bits | visible | +------------------------------+---------+---------+ + | I8MM | [55-52] | y | + +------------------------------+---------+---------+ + | DGH | [51-48] | y | + +------------------------------+---------+---------+ + | BF16 | [47-44] | y | + +------------------------------+---------+---------+ | SB | [39-36] | y | +------------------------------+---------+---------+ | FRINTTS | [35-32] | y | @@ -234,10 +242,18 @@ infrastructure: +------------------------------+---------+---------+ | Name | bits | visible | +------------------------------+---------+---------+ + | F64MM | [59-56] | y | + +------------------------------+---------+---------+ + | F32MM | [55-52] | y | + +------------------------------+---------+---------+ + | I8MM | [47-44] | y | + +------------------------------+---------+---------+ | SM4 | [43-40] | y | +------------------------------+---------+---------+ | SHA3 | [35-32] | y | +------------------------------+---------+---------+ + | BF16 | [23-20] | y | + +------------------------------+---------+---------+ | BitPerm | [19-16] | y | +------------------------------+---------+---------+ | AES | [7-4] | y | diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst index 7fa3d215ae6a80e5d3dd9b6db3bd221d4b6d8c48..7dfb97dfe416058f20ef3863c9185d9a071f46ec 100644 --- a/Documentation/arm64/elf_hwcaps.rst +++ b/Documentation/arm64/elf_hwcaps.rst @@ -204,6 +204,37 @@ HWCAP2_FRINT Functionality implied by ID_AA64ISAR1_EL1.FRINTTS == 0b0001. +HWCAP2_SVEI8MM + + Functionality implied by ID_AA64ZFR0_EL1.I8MM == 0b0001. + +HWCAP2_SVEF32MM + + Functionality implied by ID_AA64ZFR0_EL1.F32MM == 0b0001. + +HWCAP2_SVEF64MM + + Functionality implied by ID_AA64ZFR0_EL1.F64MM == 0b0001. + +HWCAP2_SVEBF16 + + Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0001. + +HWCAP2_I8MM + + Functionality implied by ID_AA64ISAR1_EL1.I8MM == 0b0001. + +HWCAP2_BF16 + + Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0001. + +HWCAP2_DGH + + Functionality implied by ID_AA64ISAR1_EL1.DGH == 0b0001. + +HWCAP2_RNG + + Functionality implied by ID_AA64ISAR0_EL1.RNDR == 0b0001. 4. Unused AT_HWCAP bits ----------------------- diff --git a/Documentation/arm64/index.rst b/Documentation/arm64/index.rst index 5c0c69dc58aa3887e52453f3630afdb9ed8e2bd1..09cbb4ed2237769cef9484e77ce07ddfa8a5e335 100644 --- a/Documentation/arm64/index.rst +++ b/Documentation/arm64/index.rst @@ -6,6 +6,7 @@ ARM64 Architecture :maxdepth: 1 acpi_object_usage + amu arm-acpi booting cpu-feature-registers diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index 02e02175e6f56a75da568cd0112bf71769b8f938..cf03b3290800c25ea7c5d8cba093b0189e3d727a 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -129,7 +129,7 @@ this logic. As a single binary will need to support both 48-bit and 52-bit VA spaces, the VMEMMAP must be sized large enough for 52-bit VAs and -also must be sized large enought to accommodate a fixed PAGE_OFFSET. +also must be sized large enough to accommodate a fixed PAGE_OFFSET. Most code in the kernel should not need to consider the VA_BITS, for code that does need to know the VA size the variables are diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 99b2545455ff9a6e66e3931bc4c8e847d0dbf12f..2c08c628febdf3c7a17c2129047d9f53e246a779 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -88,6 +88,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A76 | #1463225 | ARM64_ERRATUM_1463225 | +----------------+-----------------+-----------------+-----------------------------+ +| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 | ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1188873,1418040| ARM64_ERRATUM_1418040 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Neoverse-N1 | #1349291 | N/A | @@ -108,6 +110,8 @@ stable kernels. +----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | +----------------+-----------------+-----------------+-----------------------------+ +| Cavium | ThunderX GICv3 | #38539 | N/A | ++----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | +----------------+-----------------+-----------------+-----------------------------+ | Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 | diff --git a/Documentation/asm-annotations.rst b/Documentation/asm-annotations.rst index f55c2bb74d006bd0409f79a88b9078c29d9665a5..32ea57483378ddeea5edc84278fe3d412920fcc5 100644 --- a/Documentation/asm-annotations.rst +++ b/Documentation/asm-annotations.rst @@ -73,10 +73,11 @@ The new macros are prefixed with the ``SYM_`` prefix and can be divided into three main groups: 1. ``SYM_FUNC_*`` -- to annotate C-like functions. This means functions with - standard C calling conventions, i.e. the stack contains a return address at - the predefined place and a return from the function can happen in a - standard way. When frame pointers are enabled, save/restore of frame - pointer shall happen at the start/end of a function, respectively, too. + standard C calling conventions. For example, on x86, this means that the + stack contains a return address at the predefined place and a return from + the function can happen in a standard way. When frame pointers are enabled, + save/restore of frame pointer shall happen at the start/end of a function, + respectively, too. Checking tools like ``objtool`` should ensure such marked functions conform to these rules. The tools can also easily annotate these functions with diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index 86fa66c87172138f818526941373652dde74e40b..ad303a2569d33e7d0799bf67e05dd7245429625b 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -47,7 +47,7 @@ Having a real iterator, and making biovecs immutable, has a number of advantages: * Before, iterating over bios was very awkward when you weren't processing - exactly one bvec at a time - for example, bio_copy_data() in fs/bio.c, + exactly one bvec at a time - for example, bio_copy_data() in block/bio.c, which copies the contents of one bio into another. Because the biovecs wouldn't necessarily be the same size, the old code was tricky convoluted - it had to walk two different bios at the same time, keeping both bi_idx and diff --git a/Documentation/block/capability.rst b/Documentation/block/capability.rst index 2cf258d64bbedf92494c763208d31b760f2ea74c..160a5148b915f8a151b516bcd10e8ee4d107086a 100644 --- a/Documentation/block/capability.rst +++ b/Documentation/block/capability.rst @@ -2,17 +2,9 @@ Generic Block Device Capability =============================== -This file documents the sysfs file block//capability +This file documents the sysfs file ``block//capability``. -capability is a hex word indicating which capabilities a specific disk -supports. For more information on bits not listed here, see -include/linux/genhd.h +``capability`` is a bitfield, printed in hexadecimal, indicating which +capabilities a specific block device supports: -GENHD_FL_MEDIA_CHANGE_NOTIFY ----------------------------- - -Value: 4 - -When this bit is set, the disk supports Asynchronous Notification -of media change events. These events will be broadcast to user -space via kernel uevent. +.. kernel-doc:: include/linux/genhd.h diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index c9856b927055b1e804b5087887945ea508038289..38c15c6fcb144b8d3b1a6ba33437a4c7a909ca84 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -20,11 +20,11 @@ Reporting bugs Q: How do I report bugs for BPF kernel code? -------------------------------------------- A: Since all BPF kernel development as well as bpftool and iproute2 BPF -loader development happens through the netdev kernel mailing list, +loader development happens through the bpf kernel mailing list, please report any found issues around BPF to the following mailing list: - netdev@vger.kernel.org + bpf@vger.kernel.org This may also include issues related to XDP, BPF tracing, etc. @@ -46,17 +46,12 @@ Submitting patches Q: To which mailing list do I need to submit my BPF patches? ------------------------------------------------------------ -A: Please submit your BPF patches to the netdev kernel mailing list: +A: Please submit your BPF patches to the bpf kernel mailing list: - netdev@vger.kernel.org - -Historically, BPF came out of networking and has always been maintained -by the kernel networking community. Although these days BPF touches -many other subsystems as well, the patches are still routed mainly -through the networking community. + bpf@vger.kernel.org In case your patch has changes in various different subsystems (e.g. -tracing, security, etc), make sure to Cc the related kernel mailing +networking, tracing, security, etc), make sure to Cc the related kernel mailing lists and maintainers from there as well, so they are able to review the changes and provide their Acked-by's to the patches. @@ -168,7 +163,7 @@ a BPF point of view. Be aware that this is not a final verdict that the patch will automatically get accepted into net or net-next trees eventually: -On the netdev kernel mailing list reviews can come in at any point +On the bpf kernel mailing list reviews can come in at any point in time. If discussions around a patch conclude that they cannot get included as-is, we will either apply a follow-up fix or drop them from the trees entirely. Therefore, we also reserve to rebase @@ -494,15 +489,15 @@ A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have that set up, proceed with building the latest LLVM and clang version from the git repositories:: - $ git clone http://llvm.org/git/llvm.git - $ cd llvm/tools - $ git clone --depth 1 http://llvm.org/git/clang.git - $ cd ..; mkdir build; cd build - $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + $ git clone https://github.com/llvm/llvm-project.git + $ mkdir -p llvm-project/llvm/build/install + $ cd llvm-project/llvm/build + $ cmake .. -G "Ninja" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + -DLLVM_ENABLE_PROJECTS="clang" \ -DBUILD_SHARED_LIBS=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_BUILD_RUNTIME=OFF - $ make -j $(getconf _NPROCESSORS_ONLN) + $ ninja The built binaries can then be found in the build/bin/ directory, where you can point the PATH variable to. diff --git a/Documentation/bpf/bpf_lsm.rst b/Documentation/bpf/bpf_lsm.rst new file mode 100644 index 0000000000000000000000000000000000000000..1c0a75a51d79b03b932b3c9f54ef12fce38d769c --- /dev/null +++ b/Documentation/bpf/bpf_lsm.rst @@ -0,0 +1,142 @@ +.. SPDX-License-Identifier: GPL-2.0+ +.. Copyright (C) 2020 Google LLC. + +================ +LSM BPF Programs +================ + +These BPF programs allow runtime instrumentation of the LSM hooks by privileged +users to implement system-wide MAC (Mandatory Access Control) and Audit +policies using eBPF. + +Structure +--------- + +The example shows an eBPF program that can be attached to the ``file_mprotect`` +LSM hook: + +.. c:function:: int file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot); + +Other LSM hooks which can be instrumented can be found in +``include/linux/lsm_hooks.h``. + +eBPF programs that use :doc:`/bpf/btf` do not need to include kernel headers +for accessing information from the attached eBPF program's context. They can +simply declare the structures in the eBPF program and only specify the fields +that need to be accessed. + +.. code-block:: c + + struct mm_struct { + unsigned long start_brk, brk, start_stack; + } __attribute__((preserve_access_index)); + + struct vm_area_struct { + unsigned long start_brk, brk, start_stack; + unsigned long vm_start, vm_end; + struct mm_struct *vm_mm; + } __attribute__((preserve_access_index)); + + +.. note:: The order of the fields is irrelevant. + +This can be further simplified (if one has access to the BTF information at +build time) by generating the ``vmlinux.h`` with: + +.. code-block:: console + + # bpftool btf dump file format c > vmlinux.h + +.. note:: ``path-to-btf-vmlinux`` can be ``/sys/kernel/btf/vmlinux`` if the + build environment matches the environment the BPF programs are + deployed in. + +The ``vmlinux.h`` can then simply be included in the BPF programs without +requiring the definition of the types. + +The eBPF programs can be declared using the``BPF_PROG`` +macros defined in `tools/lib/bpf/bpf_tracing.h`_. In this +example: + + * ``"lsm/file_mprotect"`` indicates the LSM hook that the program must + be attached to + * ``mprotect_audit`` is the name of the eBPF program + +.. code-block:: c + + SEC("lsm/file_mprotect") + int BPF_PROG(mprotect_audit, struct vm_area_struct *vma, + unsigned long reqprot, unsigned long prot, int ret) + { + /* ret is the return value from the previous BPF program + * or 0 if it's the first hook. + */ + if (ret != 0) + return ret; + + int is_heap; + + is_heap = (vma->vm_start >= vma->vm_mm->start_brk && + vma->vm_end <= vma->vm_mm->brk); + + /* Return an -EPERM or write information to the perf events buffer + * for auditing + */ + if (is_heap) + return -EPERM; + } + +The ``__attribute__((preserve_access_index))`` is a clang feature that allows +the BPF verifier to update the offsets for the access at runtime using the +:doc:`/bpf/btf` information. Since the BPF verifier is aware of the types, it +also validates all the accesses made to the various types in the eBPF program. + +Loading +------- + +eBPF programs can be loaded with the :manpage:`bpf(2)` syscall's +``BPF_PROG_LOAD`` operation: + +.. code-block:: c + + struct bpf_object *obj; + + obj = bpf_object__open("./my_prog.o"); + bpf_object__load(obj); + +This can be simplified by using a skeleton header generated by ``bpftool``: + +.. code-block:: console + + # bpftool gen skeleton my_prog.o > my_prog.skel.h + +and the program can be loaded by including ``my_prog.skel.h`` and using +the generated helper, ``my_prog__open_and_load``. + +Attachment to LSM Hooks +----------------------- + +The LSM allows attachment of eBPF programs as LSM hooks using :manpage:`bpf(2)` +syscall's ``BPF_RAW_TRACEPOINT_OPEN`` operation or more simply by +using the libbpf helper ``bpf_program__attach_lsm``. + +The program can be detached from the LSM hook by *destroying* the ``link`` +link returned by ``bpf_program__attach_lsm`` using ``bpf_link__destroy``. + +One can also use the helpers generated in ``my_prog.skel.h`` i.e. +``my_prog__attach`` for attachment and ``my_prog__destroy`` for cleaning up. + +Examples +-------- + +An example eBPF program can be found in +`tools/testing/selftests/bpf/progs/lsm.c`_ and the corresponding +userspace code in `tools/testing/selftests/bpf/prog_tests/test_lsm.c`_ + +.. Links +.. _tools/lib/bpf/bpf_tracing.h: + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/lib/bpf/bpf_tracing.h +.. _tools/testing/selftests/bpf/progs/lsm.c: + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/progs/lsm.c +.. _tools/testing/selftests/bpf/prog_tests/test_lsm.c: + https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/tools/testing/selftests/bpf/prog_tests/test_lsm.c diff --git a/Documentation/bpf/drgn.rst b/Documentation/bpf/drgn.rst new file mode 100644 index 0000000000000000000000000000000000000000..41f223c3161ec142b329038406564dd9519db1b1 --- /dev/null +++ b/Documentation/bpf/drgn.rst @@ -0,0 +1,213 @@ +.. SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +============== +BPF drgn tools +============== + +drgn scripts is a convenient and easy to use mechanism to retrieve arbitrary +kernel data structures. drgn is not relying on kernel UAPI to read the data. +Instead it's reading directly from ``/proc/kcore`` or vmcore and pretty prints +the data based on DWARF debug information from vmlinux. + +This document describes BPF related drgn tools. + +See `drgn/tools`_ for all tools available at the moment and `drgn/doc`_ for +more details on drgn itself. + +bpf_inspect.py +-------------- + +Description +=========== + +`bpf_inspect.py`_ is a tool intended to inspect BPF programs and maps. It can +iterate over all programs and maps in the system and print basic information +about these objects, including id, type and name. + +The main use-case `bpf_inspect.py`_ covers is to show BPF programs of types +``BPF_PROG_TYPE_EXT`` and ``BPF_PROG_TYPE_TRACING`` attached to other BPF +programs via ``freplace``/``fentry``/``fexit`` mechanisms, since there is no +user-space API to get this information. + +Getting started +=============== + +List BPF programs (full names are obtained from BTF):: + + % sudo bpf_inspect.py prog + 27: BPF_PROG_TYPE_TRACEPOINT tracepoint__tcp__tcp_send_reset + 4632: BPF_PROG_TYPE_CGROUP_SOCK_ADDR tw_ipt_bind + 49464: BPF_PROG_TYPE_RAW_TRACEPOINT raw_tracepoint__sched_process_exit + +List BPF maps:: + + % sudo bpf_inspect.py map + 2577: BPF_MAP_TYPE_HASH tw_ipt_vips + 4050: BPF_MAP_TYPE_STACK_TRACE stack_traces + 4069: BPF_MAP_TYPE_PERCPU_ARRAY ned_dctcp_cntr + +Find BPF programs attached to BPF program ``test_pkt_access``:: + + % sudo bpf_inspect.py p | grep test_pkt_access + 650: BPF_PROG_TYPE_SCHED_CLS test_pkt_access + 654: BPF_PROG_TYPE_TRACING test_main linked:[650->25: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access()] + 655: BPF_PROG_TYPE_TRACING test_subprog1 linked:[650->29: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog1()] + 656: BPF_PROG_TYPE_TRACING test_subprog2 linked:[650->31: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog2()] + 657: BPF_PROG_TYPE_TRACING test_subprog3 linked:[650->21: BPF_TRAMP_FEXIT test_pkt_access->test_pkt_access_subprog3()] + 658: BPF_PROG_TYPE_EXT new_get_skb_len linked:[650->16: BPF_TRAMP_REPLACE test_pkt_access->get_skb_len()] + 659: BPF_PROG_TYPE_EXT new_get_skb_ifindex linked:[650->23: BPF_TRAMP_REPLACE test_pkt_access->get_skb_ifindex()] + 660: BPF_PROG_TYPE_EXT new_get_constant linked:[650->19: BPF_TRAMP_REPLACE test_pkt_access->get_constant()] + +It can be seen that there is a program ``test_pkt_access``, id 650 and there +are multiple other tracing and ext programs attached to functions in +``test_pkt_access``. + +For example the line:: + + 658: BPF_PROG_TYPE_EXT new_get_skb_len linked:[650->16: BPF_TRAMP_REPLACE test_pkt_access->get_skb_len()] + +, means that BPF program id 658, type ``BPF_PROG_TYPE_EXT``, name +``new_get_skb_len`` replaces (``BPF_TRAMP_REPLACE``) function ``get_skb_len()`` +that has BTF id 16 in BPF program id 650, name ``test_pkt_access``. + +Getting help: + +.. code-block:: none + + % sudo bpf_inspect.py + usage: bpf_inspect.py [-h] {prog,p,map,m} ... + + drgn script to list BPF programs or maps and their properties + unavailable via kernel API. + + See https://github.com/osandov/drgn/ for more details on drgn. + + optional arguments: + -h, --help show this help message and exit + + subcommands: + {prog,p,map,m} + prog (p) list BPF programs + map (m) list BPF maps + +Customization +============= + +The script is intended to be customized by developers to print relevant +information about BPF programs, maps and other objects. + +For example, to print ``struct bpf_prog_aux`` for BPF program id 53077: + +.. code-block:: none + + % git diff + diff --git a/tools/bpf_inspect.py b/tools/bpf_inspect.py + index 650e228..aea2357 100755 + --- a/tools/bpf_inspect.py + +++ b/tools/bpf_inspect.py + @@ -112,7 +112,9 @@ def list_bpf_progs(args): + if linked: + linked = f" linked:[{linked}]" + + - print(f"{id_:>6}: {type_:32} {name:32} {linked}") + + if id_ == 53077: + + print(f"{id_:>6}: {type_:32} {name:32}") + + print(f"{bpf_prog.aux}") + + + def list_bpf_maps(args): + +It produces the output:: + + % sudo bpf_inspect.py p + 53077: BPF_PROG_TYPE_XDP tw_xdp_policer + *(struct bpf_prog_aux *)0xffff8893fad4b400 = { + .refcnt = (atomic64_t){ + .counter = (long)58, + }, + .used_map_cnt = (u32)1, + .max_ctx_offset = (u32)8, + .max_pkt_offset = (u32)15, + .max_tp_access = (u32)0, + .stack_depth = (u32)8, + .id = (u32)53077, + .func_cnt = (u32)0, + .func_idx = (u32)0, + .attach_btf_id = (u32)0, + .linked_prog = (struct bpf_prog *)0x0, + .verifier_zext = (bool)0, + .offload_requested = (bool)0, + .attach_btf_trace = (bool)0, + .func_proto_unreliable = (bool)0, + .trampoline_prog_type = (enum bpf_tramp_prog_type)BPF_TRAMP_FENTRY, + .trampoline = (struct bpf_trampoline *)0x0, + .tramp_hlist = (struct hlist_node){ + .next = (struct hlist_node *)0x0, + .pprev = (struct hlist_node **)0x0, + }, + .attach_func_proto = (const struct btf_type *)0x0, + .attach_func_name = (const char *)0x0, + .func = (struct bpf_prog **)0x0, + .jit_data = (void *)0x0, + .poke_tab = (struct bpf_jit_poke_descriptor *)0x0, + .size_poke_tab = (u32)0, + .ksym_tnode = (struct latch_tree_node){ + .node = (struct rb_node [2]){ + { + .__rb_parent_color = (unsigned long)18446612956263126665, + .rb_right = (struct rb_node *)0x0, + .rb_left = (struct rb_node *)0xffff88a0be3d0088, + }, + { + .__rb_parent_color = (unsigned long)18446612956263126689, + .rb_right = (struct rb_node *)0x0, + .rb_left = (struct rb_node *)0xffff88a0be3d00a0, + }, + }, + }, + .ksym_lnode = (struct list_head){ + .next = (struct list_head *)0xffff88bf481830b8, + .prev = (struct list_head *)0xffff888309f536b8, + }, + .ops = (const struct bpf_prog_ops *)xdp_prog_ops+0x0 = 0xffffffff820fa350, + .used_maps = (struct bpf_map **)0xffff889ff795de98, + .prog = (struct bpf_prog *)0xffffc9000cf2d000, + .user = (struct user_struct *)root_user+0x0 = 0xffffffff82444820, + .load_time = (u64)2408348759285319, + .cgroup_storage = (struct bpf_map *[2]){}, + .name = (char [16])"tw_xdp_policer", + .security = (void *)0xffff889ff795d548, + .offload = (struct bpf_prog_offload *)0x0, + .btf = (struct btf *)0xffff8890ce6d0580, + .func_info = (struct bpf_func_info *)0xffff889ff795d240, + .func_info_aux = (struct bpf_func_info_aux *)0xffff889ff795de20, + .linfo = (struct bpf_line_info *)0xffff888a707afc00, + .jited_linfo = (void **)0xffff8893fad48600, + .func_info_cnt = (u32)1, + .nr_linfo = (u32)37, + .linfo_idx = (u32)0, + .num_exentries = (u32)0, + .extable = (struct exception_table_entry *)0xffffffffa032d950, + .stats = (struct bpf_prog_stats *)0x603fe3a1f6d0, + .work = (struct work_struct){ + .data = (atomic_long_t){ + .counter = (long)0, + }, + .entry = (struct list_head){ + .next = (struct list_head *)0x0, + .prev = (struct list_head *)0x0, + }, + .func = (work_func_t)0x0, + }, + .rcu = (struct callback_head){ + .next = (struct callback_head *)0x0, + .func = (void (*)(struct callback_head *))0x0, + }, + } + + +.. Links +.. _drgn/doc: https://drgn.readthedocs.io/en/latest/ +.. _drgn/tools: https://github.com/osandov/drgn/tree/master/tools +.. _bpf_inspect.py: + https://github.com/osandov/drgn/blob/master/tools/bpf_inspect.py diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 4f5410b6144169d84e61b5b310e42dc450e967e1..f99677f3572f6275d0f190278edb2bb3fe310c46 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -45,14 +45,16 @@ Program types prog_cgroup_sockopt prog_cgroup_sysctl prog_flow_dissector + bpf_lsm -Testing BPF -=========== +Testing and debugging BPF +========================= .. toctree:: :maxdepth: 1 + drgn s390 diff --git a/Documentation/conf.py b/Documentation/conf.py index 3c7bdf4cd31ffd4fa27b6e622d8b9ba6b2c1764a..9ae8e9abf846a1ddb43946e8b55cfadb32178d83 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -38,7 +38,11 @@ needs_sphinx = '1.3' # ones. extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig', 'automarkup', - 'maintainers_include'] + 'maintainers_include', 'sphinx.ext.autosectionlabel' ] + +# Ensure that autosectionlabel will produce unique names +autosectionlabel_prefix_document = True +autosectionlabel_maxdepth = 2 # The name of the math extension changed on Sphinx 1.4 if (major == 1 and minor > 3) or (major > 1): diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index ab0eae1c153a42e5ebfb3b17f0e1046ab956855b..0897ad12c11917f53578ceea06e1e2788b88983e 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -8,38 +8,81 @@ This is the beginning of a manual for core kernel APIs. The conversion Core utilities ============== +This section has general and "core core" documentation. The first is a +massive grab-bag of kerneldoc info left over from the docbook days; it +should really be broken up someday when somebody finds the energy to do +it. + .. toctree:: :maxdepth: 1 kernel-api - assoc_array - atomic_ops - cachetlb - refcount-vs-atomic - cpu_hotplug - idr - local_ops workqueue - genericirq - xarray - librs - genalloc - errseq - packing printk-formats + symbol-namespaces + +Data structures and low-level utilities +======================================= + +Library functionality that is used throughout the kernel. + +.. toctree:: + :maxdepth: 1 + + kobject + assoc_array + xarray + idr circular-buffers generic-radix-tree - memory-allocation - mm-api - gfp_mask-from-fs-io + packing timekeeping - boot-time-mm + errseq + +Concurrency primitives +====================== + +How Linux keeps everything from happening at the same time. See +:doc:`/locking/index` for more related documentation. + +.. toctree:: + :maxdepth: 1 + + atomic_ops + refcount-vs-atomic + local_ops + padata + ../RCU/index + +Low-level hardware management +============================= + +Cache management, managing CPU hotplug, etc. + +.. toctree:: + :maxdepth: 1 + + cachetlb + cpu_hotplug memory-hotplug + genericirq protection-keys - ../RCU/index - gcc-plugins - symbol-namespaces +Memory management +================= + +How to allocate and use memory in the kernel. Note that there is a lot +more memory-management documentation in :doc:`/vm/index`. + +.. toctree:: + :maxdepth: 1 + + memory-allocation + mm-api + genalloc + pin_user_pages + boot-time-mm + gfp_mask-from-fs-io Interfaces for kernel debugging =============================== @@ -50,6 +93,16 @@ Interfaces for kernel debugging debug-objects tracepoint +Everything else +=============== + +Documents that don't fit elsewhere or which have yet to be categorized. + +.. toctree:: + :maxdepth: 1 + + librs + .. only:: subproject and html Indices diff --git a/Documentation/core-api/kobject.rst b/Documentation/core-api/kobject.rst new file mode 100644 index 0000000000000000000000000000000000000000..1f62d4d7d9660f962fea723c2641c8e7bc3e141c --- /dev/null +++ b/Documentation/core-api/kobject.rst @@ -0,0 +1,434 @@ +===================================================================== +Everything you never wanted to know about kobjects, ksets, and ktypes +===================================================================== + +:Author: Greg Kroah-Hartman +:Last updated: December 19, 2007 + +Based on an original article by Jon Corbet for lwn.net written October 1, +2003 and located at http://lwn.net/Articles/51437/ + +Part of the difficulty in understanding the driver model - and the kobject +abstraction upon which it is built - is that there is no obvious starting +place. Dealing with kobjects requires understanding a few different types, +all of which make reference to each other. In an attempt to make things +easier, we'll take a multi-pass approach, starting with vague terms and +adding detail as we go. To that end, here are some quick definitions of +some terms we will be working with. + + - A kobject is an object of type struct kobject. Kobjects have a name + and a reference count. A kobject also has a parent pointer (allowing + objects to be arranged into hierarchies), a specific type, and, + usually, a representation in the sysfs virtual filesystem. + + Kobjects are generally not interesting on their own; instead, they are + usually embedded within some other structure which contains the stuff + the code is really interested in. + + No structure should **EVER** have more than one kobject embedded within it. + If it does, the reference counting for the object is sure to be messed + up and incorrect, and your code will be buggy. So do not do this. + + - A ktype is the type of object that embeds a kobject. Every structure + that embeds a kobject needs a corresponding ktype. The ktype controls + what happens to the kobject when it is created and destroyed. + + - A kset is a group of kobjects. These kobjects can be of the same ktype + or belong to different ktypes. The kset is the basic container type for + collections of kobjects. Ksets contain their own kobjects, but you can + safely ignore that implementation detail as the kset core code handles + this kobject automatically. + + When you see a sysfs directory full of other directories, generally each + of those directories corresponds to a kobject in the same kset. + +We'll look at how to create and manipulate all of these types. A bottom-up +approach will be taken, so we'll go back to kobjects. + + +Embedding kobjects +================== + +It is rare for kernel code to create a standalone kobject, with one major +exception explained below. Instead, kobjects are used to control access to +a larger, domain-specific object. To this end, kobjects will be found +embedded in other structures. If you are used to thinking of things in +object-oriented terms, kobjects can be seen as a top-level, abstract class +from which other classes are derived. A kobject implements a set of +capabilities which are not particularly useful by themselves, but are +nice to have in other objects. The C language does not allow for the +direct expression of inheritance, so other techniques - such as structure +embedding - must be used. + +(As an aside, for those familiar with the kernel linked list implementation, +this is analogous as to how "list_head" structs are rarely useful on +their own, but are invariably found embedded in the larger objects of +interest.) + +So, for example, the UIO code in ``drivers/uio/uio.c`` has a structure that +defines the memory region associated with a uio device:: + + struct uio_map { + struct kobject kobj; + struct uio_mem *mem; + }; + +If you have a struct uio_map structure, finding its embedded kobject is +just a matter of using the kobj member. Code that works with kobjects will +often have the opposite problem, however: given a struct kobject pointer, +what is the pointer to the containing structure? You must avoid tricks +(such as assuming that the kobject is at the beginning of the structure) +and, instead, use the container_of() macro, found in ````:: + + container_of(pointer, type, member) + +where: + + * ``pointer`` is the pointer to the embedded kobject, + * ``type`` is the type of the containing structure, and + * ``member`` is the name of the structure field to which ``pointer`` points. + +The return value from container_of() is a pointer to the corresponding +container type. So, for example, a pointer ``kp`` to a struct kobject +embedded **within** a struct uio_map could be converted to a pointer to the +**containing** uio_map structure with:: + + struct uio_map *u_map = container_of(kp, struct uio_map, kobj); + +For convenience, programmers often define a simple macro for **back-casting** +kobject pointers to the containing type. Exactly this happens in the +earlier ``drivers/uio/uio.c``, as you can see here:: + + struct uio_map { + struct kobject kobj; + struct uio_mem *mem; + }; + + #define to_map(map) container_of(map, struct uio_map, kobj) + +where the macro argument "map" is a pointer to the struct kobject in +question. That macro is subsequently invoked with:: + + struct uio_map *map = to_map(kobj); + + +Initialization of kobjects +========================== + +Code which creates a kobject must, of course, initialize that object. Some +of the internal fields are setup with a (mandatory) call to kobject_init():: + + void kobject_init(struct kobject *kobj, struct kobj_type *ktype); + +The ktype is required for a kobject to be created properly, as every kobject +must have an associated kobj_type. After calling kobject_init(), to +register the kobject with sysfs, the function kobject_add() must be called:: + + int kobject_add(struct kobject *kobj, struct kobject *parent, + const char *fmt, ...); + +This sets up the parent of the kobject and the name for the kobject +properly. If the kobject is to be associated with a specific kset, +kobj->kset must be assigned before calling kobject_add(). If a kset is +associated with a kobject, then the parent for the kobject can be set to +NULL in the call to kobject_add() and then the kobject's parent will be the +kset itself. + +As the name of the kobject is set when it is added to the kernel, the name +of the kobject should never be manipulated directly. If you must change +the name of the kobject, call kobject_rename():: + + int kobject_rename(struct kobject *kobj, const char *new_name); + +kobject_rename does not perform any locking or have a solid notion of +what names are valid so the caller must provide their own sanity checking +and serialization. + +There is a function called kobject_set_name() but that is legacy cruft and +is being removed. If your code needs to call this function, it is +incorrect and needs to be fixed. + +To properly access the name of the kobject, use the function +kobject_name():: + + const char *kobject_name(const struct kobject * kobj); + +There is a helper function to both initialize and add the kobject to the +kernel at the same time, called surprisingly enough kobject_init_and_add():: + + int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, + struct kobject *parent, const char *fmt, ...); + +The arguments are the same as the individual kobject_init() and +kobject_add() functions described above. + + +Uevents +======= + +After a kobject has been registered with the kobject core, you need to +announce to the world that it has been created. This can be done with a +call to kobject_uevent():: + + int kobject_uevent(struct kobject *kobj, enum kobject_action action); + +Use the **KOBJ_ADD** action for when the kobject is first added to the kernel. +This should be done only after any attributes or children of the kobject +have been initialized properly, as userspace will instantly start to look +for them when this call happens. + +When the kobject is removed from the kernel (details on how to do that are +below), the uevent for **KOBJ_REMOVE** will be automatically created by the +kobject core, so the caller does not have to worry about doing that by +hand. + + +Reference counts +================ + +One of the key functions of a kobject is to serve as a reference counter +for the object in which it is embedded. As long as references to the object +exist, the object (and the code which supports it) must continue to exist. +The low-level functions for manipulating a kobject's reference counts are:: + + struct kobject *kobject_get(struct kobject *kobj); + void kobject_put(struct kobject *kobj); + +A successful call to kobject_get() will increment the kobject's reference +counter and return the pointer to the kobject. + +When a reference is released, the call to kobject_put() will decrement the +reference count and, possibly, free the object. Note that kobject_init() +sets the reference count to one, so the code which sets up the kobject will +need to do a kobject_put() eventually to release that reference. + +Because kobjects are dynamic, they must not be declared statically or on +the stack, but instead, always allocated dynamically. Future versions of +the kernel will contain a run-time check for kobjects that are created +statically and will warn the developer of this improper usage. + +If all that you want to use a kobject for is to provide a reference counter +for your structure, please use the struct kref instead; a kobject would be +overkill. For more information on how to use struct kref, please see the +file Documentation/kref.txt in the Linux kernel source tree. + + +Creating "simple" kobjects +========================== + +Sometimes all that a developer wants is a way to create a simple directory +in the sysfs hierarchy, and not have to mess with the whole complication of +ksets, show and store functions, and other details. This is the one +exception where a single kobject should be created. To create such an +entry, use the function:: + + struct kobject *kobject_create_and_add(char *name, struct kobject *parent); + +This function will create a kobject and place it in sysfs in the location +underneath the specified parent kobject. To create simple attributes +associated with this kobject, use:: + + int sysfs_create_file(struct kobject *kobj, struct attribute *attr); + +or:: + + int sysfs_create_group(struct kobject *kobj, struct attribute_group *grp); + +Both types of attributes used here, with a kobject that has been created +with the kobject_create_and_add(), can be of type kobj_attribute, so no +special custom attribute is needed to be created. + +See the example module, ``samples/kobject/kobject-example.c`` for an +implementation of a simple kobject and attributes. + + + +ktypes and release methods +========================== + +One important thing still missing from the discussion is what happens to a +kobject when its reference count reaches zero. The code which created the +kobject generally does not know when that will happen; if it did, there +would be little point in using a kobject in the first place. Even +predictable object lifecycles become more complicated when sysfs is brought +in as other portions of the kernel can get a reference on any kobject that +is registered in the system. + +The end result is that a structure protected by a kobject cannot be freed +before its reference count goes to zero. The reference count is not under +the direct control of the code which created the kobject. So that code must +be notified asynchronously whenever the last reference to one of its +kobjects goes away. + +Once you registered your kobject via kobject_add(), you must never use +kfree() to free it directly. The only safe way is to use kobject_put(). It +is good practice to always use kobject_put() after kobject_init() to avoid +errors creeping in. + +This notification is done through a kobject's release() method. Usually +such a method has a form like:: + + void my_object_release(struct kobject *kobj) + { + struct my_object *mine = container_of(kobj, struct my_object, kobj); + + /* Perform any additional cleanup on this object, then... */ + kfree(mine); + } + +One important point cannot be overstated: every kobject must have a +release() method, and the kobject must persist (in a consistent state) +until that method is called. If these constraints are not met, the code is +flawed. Note that the kernel will warn you if you forget to provide a +release() method. Do not try to get rid of this warning by providing an +"empty" release function. + +If all your cleanup function needs to do is call kfree(), then you must +create a wrapper function which uses container_of() to upcast to the correct +type (as shown in the example above) and then calls kfree() on the overall +structure. + +Note, the name of the kobject is available in the release function, but it +must NOT be changed within this callback. Otherwise there will be a memory +leak in the kobject core, which makes people unhappy. + +Interestingly, the release() method is not stored in the kobject itself; +instead, it is associated with the ktype. So let us introduce struct +kobj_type:: + + struct kobj_type { + void (*release)(struct kobject *kobj); + const struct sysfs_ops *sysfs_ops; + struct attribute **default_attrs; + const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); + const void *(*namespace)(struct kobject *kobj); + }; + +This structure is used to describe a particular type of kobject (or, more +correctly, of containing object). Every kobject needs to have an associated +kobj_type structure; a pointer to that structure must be specified when you +call kobject_init() or kobject_init_and_add(). + +The release field in struct kobj_type is, of course, a pointer to the +release() method for this type of kobject. The other two fields (sysfs_ops +and default_attrs) control how objects of this type are represented in +sysfs; they are beyond the scope of this document. + +The default_attrs pointer is a list of default attributes that will be +automatically created for any kobject that is registered with this ktype. + + +ksets +===== + +A kset is merely a collection of kobjects that want to be associated with +each other. There is no restriction that they be of the same ktype, but be +very careful if they are not. + +A kset serves these functions: + + - It serves as a bag containing a group of objects. A kset can be used by + the kernel to track "all block devices" or "all PCI device drivers." + + - A kset is also a subdirectory in sysfs, where the associated kobjects + with the kset can show up. Every kset contains a kobject which can be + set up to be the parent of other kobjects; the top-level directories of + the sysfs hierarchy are constructed in this way. + + - Ksets can support the "hotplugging" of kobjects and influence how + uevent events are reported to user space. + +In object-oriented terms, "kset" is the top-level container class; ksets +contain their own kobject, but that kobject is managed by the kset code and +should not be manipulated by any other user. + +A kset keeps its children in a standard kernel linked list. Kobjects point +back to their containing kset via their kset field. In almost all cases, +the kobjects belonging to a kset have that kset (or, strictly, its embedded +kobject) in their parent. + +As a kset contains a kobject within it, it should always be dynamically +created and never declared statically or on the stack. To create a new +kset use:: + + struct kset *kset_create_and_add(const char *name, + struct kset_uevent_ops *u, + struct kobject *parent); + +When you are finished with the kset, call:: + + void kset_unregister(struct kset *kset); + +to destroy it. This removes the kset from sysfs and decrements its reference +count. When the reference count goes to zero, the kset will be released. +Because other references to the kset may still exist, the release may happen +after kset_unregister() returns. + +An example of using a kset can be seen in the +``samples/kobject/kset-example.c`` file in the kernel tree. + +If a kset wishes to control the uevent operations of the kobjects +associated with it, it can use the struct kset_uevent_ops to handle it:: + + struct kset_uevent_ops { + int (*filter)(struct kset *kset, struct kobject *kobj); + const char *(*name)(struct kset *kset, struct kobject *kobj); + int (*uevent)(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env); + }; + + +The filter function allows a kset to prevent a uevent from being emitted to +userspace for a specific kobject. If the function returns 0, the uevent +will not be emitted. + +The name function will be called to override the default name of the kset +that the uevent sends to userspace. By default, the name will be the same +as the kset itself, but this function, if present, can override that name. + +The uevent function will be called when the uevent is about to be sent to +userspace to allow more environment variables to be added to the uevent. + +One might ask how, exactly, a kobject is added to a kset, given that no +functions which perform that function have been presented. The answer is +that this task is handled by kobject_add(). When a kobject is passed to +kobject_add(), its kset member should point to the kset to which the +kobject will belong. kobject_add() will handle the rest. + +If the kobject belonging to a kset has no parent kobject set, it will be +added to the kset's directory. Not all members of a kset do necessarily +live in the kset directory. If an explicit parent kobject is assigned +before the kobject is added, the kobject is registered with the kset, but +added below the parent kobject. + + +Kobject removal +=============== + +After a kobject has been registered with the kobject core successfully, it +must be cleaned up when the code is finished with it. To do that, call +kobject_put(). By doing this, the kobject core will automatically clean up +all of the memory allocated by this kobject. If a ``KOBJ_ADD`` uevent has been +sent for the object, a corresponding ``KOBJ_REMOVE`` uevent will be sent, and +any other sysfs housekeeping will be handled for the caller properly. + +If you need to do a two-stage delete of the kobject (say you are not +allowed to sleep when you need to destroy the object), then call +kobject_del() which will unregister the kobject from sysfs. This makes the +kobject "invisible", but it is not cleaned up, and the reference count of +the object is still the same. At a later time call kobject_put() to finish +the cleanup of the memory associated with the kobject. + +kobject_del() can be used to drop the reference to the parent object, if +circular references are constructed. It is valid in some cases, that a +parent objects references a child. Circular references _must_ be broken +with an explicit call to kobject_del(), so that a release functions will be +called, and the objects in the former circle release each other. + + +Example code to copy from +========================= + +For a more complete example of using ksets and kobjects properly, see the +example programs ``samples/kobject/{kobject-example.c,kset-example.c}``, +which will be built as loadable modules if you select ``CONFIG_SAMPLE_KOBJECT``. diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index be726986ff75c5b2b2d968844aac7d2a13432fea..2adffb3f7914d42d740e099eb44b4b1028d3ef36 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -73,6 +73,9 @@ File Mapping and Page Cache .. kernel-doc:: mm/truncate.c :export: +.. kernel-doc:: include/linux/pagemap.h + :internal: + Memory pools ============ diff --git a/Documentation/core-api/padata.rst b/Documentation/core-api/padata.rst new file mode 100644 index 0000000000000000000000000000000000000000..9a24c111781d9305f559ade66a889b5f3b9e298d --- /dev/null +++ b/Documentation/core-api/padata.rst @@ -0,0 +1,169 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +The padata parallel execution mechanism +======================================= + +:Date: December 2019 + +Padata is a mechanism by which the kernel can farm jobs out to be done in +parallel on multiple CPUs while retaining their ordering. It was developed for +use with the IPsec code, which needs to be able to perform encryption and +decryption on large numbers of packets without reordering those packets. The +crypto developers made a point of writing padata in a sufficiently general +fashion that it could be put to other uses as well. + +Usage +===== + +Initializing +------------ + +The first step in using padata is to set up a padata_instance structure for +overall control of how jobs are to be run:: + + #include + + struct padata_instance *padata_alloc_possible(const char *name); + +'name' simply identifies the instance. + +There are functions for enabling and disabling the instance:: + + int padata_start(struct padata_instance *pinst); + void padata_stop(struct padata_instance *pinst); + +These functions are setting or clearing the "PADATA_INIT" flag; if that flag is +not set, other functions will refuse to work. padata_start() returns zero on +success (flag set) or -EINVAL if the padata cpumask contains no active CPU +(flag not set). padata_stop() clears the flag and blocks until the padata +instance is unused. + +Finally, complete padata initialization by allocating a padata_shell:: + + struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); + +A padata_shell is used to submit a job to padata and allows a series of such +jobs to be serialized independently. A padata_instance may have one or more +padata_shells associated with it, each allowing a separate series of jobs. + +Modifying cpumasks +------------------ + +The CPUs used to run jobs can be changed in two ways, programatically with +padata_set_cpumask() or via sysfs. The former is defined:: + + int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask); + +Here cpumask_type is one of PADATA_CPU_PARALLEL or PADATA_CPU_SERIAL, where a +parallel cpumask describes which processors will be used to execute jobs +submitted to this instance in parallel and a serial cpumask defines which +processors are allowed to be used as the serialization callback processor. +cpumask specifies the new cpumask to use. + +There may be sysfs files for an instance's cpumasks. For example, pcrypt's +live in /sys/kernel/pcrypt/. Within an instance's directory +there are two files, parallel_cpumask and serial_cpumask, and either cpumask +may be changed by echoing a bitmask into the file, for example:: + + echo f > /sys/kernel/pcrypt/pencrypt/parallel_cpumask + +Reading one of these files shows the user-supplied cpumask, which may be +different from the 'usable' cpumask. + +Padata maintains two pairs of cpumasks internally, the user-supplied cpumasks +and the 'usable' cpumasks. (Each pair consists of a parallel and a serial +cpumask.) The user-supplied cpumasks default to all possible CPUs on instance +allocation and may be changed as above. The usable cpumasks are always a +subset of the user-supplied cpumasks and contain only the online CPUs in the +user-supplied masks; these are the cpumasks padata actually uses. So it is +legal to supply a cpumask to padata that contains offline CPUs. Once an +offline CPU in the user-supplied cpumask comes online, padata is going to use +it. + +Changing the CPU masks are expensive operations, so it should not be done with +great frequency. + +Running A Job +------------- + +Actually submitting work to the padata instance requires the creation of a +padata_priv structure, which represents one job:: + + struct padata_priv { + /* Other stuff here... */ + void (*parallel)(struct padata_priv *padata); + void (*serial)(struct padata_priv *padata); + }; + +This structure will almost certainly be embedded within some larger +structure specific to the work to be done. Most of its fields are private to +padata, but the structure should be zeroed at initialisation time, and the +parallel() and serial() functions should be provided. Those functions will +be called in the process of getting the work done as we will see +momentarily. + +The submission of the job is done with:: + + int padata_do_parallel(struct padata_shell *ps, + struct padata_priv *padata, int *cb_cpu); + +The ps and padata structures must be set up as described above; cb_cpu +points to the preferred CPU to be used for the final callback when the job is +done; it must be in the current instance's CPU mask (if not the cb_cpu pointer +is updated to point to the CPU actually chosen). The return value from +padata_do_parallel() is zero on success, indicating that the job is in +progress. -EBUSY means that somebody, somewhere else is messing with the +instance's CPU mask, while -EINVAL is a complaint about cb_cpu not being in the +serial cpumask, no online CPUs in the parallel or serial cpumasks, or a stopped +instance. + +Each job submitted to padata_do_parallel() will, in turn, be passed to +exactly one call to the above-mentioned parallel() function, on one CPU, so +true parallelism is achieved by submitting multiple jobs. parallel() runs with +software interrupts disabled and thus cannot sleep. The parallel() +function gets the padata_priv structure pointer as its lone parameter; +information about the actual work to be done is probably obtained by using +container_of() to find the enclosing structure. + +Note that parallel() has no return value; the padata subsystem assumes that +parallel() will take responsibility for the job from this point. The job +need not be completed during this call, but, if parallel() leaves work +outstanding, it should be prepared to be called again with a new job before +the previous one completes. + +Serializing Jobs +---------------- + +When a job does complete, parallel() (or whatever function actually finishes +the work) should inform padata of the fact with a call to:: + + void padata_do_serial(struct padata_priv *padata); + +At some point in the future, padata_do_serial() will trigger a call to the +serial() function in the padata_priv structure. That call will happen on +the CPU requested in the initial call to padata_do_parallel(); it, too, is +run with local software interrupts disabled. +Note that this call may be deferred for a while since the padata code takes +pains to ensure that jobs are completed in the order in which they were +submitted. + +Destroying +---------- + +Cleaning up a padata instance predictably involves calling the three free +functions that correspond to the allocation in reverse:: + + void padata_free_shell(struct padata_shell *ps); + void padata_stop(struct padata_instance *pinst); + void padata_free(struct padata_instance *pinst); + +It is the user's responsibility to ensure all outstanding jobs are complete +before any of the above are called. + +Interface +========= + +.. kernel-doc:: include/linux/padata.h +.. kernel-doc:: kernel/padata.c diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst new file mode 100644 index 0000000000000000000000000000000000000000..2e939ff10b86c6a0ba98b031abc93d810a4e65fe --- /dev/null +++ b/Documentation/core-api/pin_user_pages.rst @@ -0,0 +1,256 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================================== +pin_user_pages() and related calls +==================================================== + +.. contents:: :local: + +Overview +======== + +This document describes the following functions:: + + pin_user_pages() + pin_user_pages_fast() + pin_user_pages_remote() + +Basic description of FOLL_PIN +============================= + +FOLL_PIN and FOLL_LONGTERM are flags that can be passed to the get_user_pages*() +("gup") family of functions. FOLL_PIN has significant interactions and +interdependencies with FOLL_LONGTERM, so both are covered here. + +FOLL_PIN is internal to gup, meaning that it should not appear at the gup call +sites. This allows the associated wrapper functions (pin_user_pages*() and +others) to set the correct combination of these flags, and to check for problems +as well. + +FOLL_LONGTERM, on the other hand, *is* allowed to be set at the gup call sites. +This is in order to avoid creating a large number of wrapper functions to cover +all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the +pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so +that's a natural dividing line, and a good point to make separate wrapper calls. +In other words, use pin_user_pages*() for DMA-pinned pages, and +get_user_pages*() for other cases. There are four cases described later on in +this document, to further clarify that concept. + +FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However, +multiple threads and call sites are free to pin the same struct pages, via both +FOLL_PIN and FOLL_GET. It's just the call site that needs to choose one or the +other, not the struct page(s). + +The FOLL_PIN implementation is nearly the same as FOLL_GET, except that FOLL_PIN +uses a different reference counting technique. + +FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying that is, +FOLL_LONGTERM is a specific case, more restrictive case of FOLL_PIN. + +Which flags are set by each wrapper +=================================== + +For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup +flags the caller provides. The caller is required to pass in a non-null struct +pages* array, and the function then pins pages by incrementing each by a special +value: GUP_PIN_COUNTING_BIAS. + +For huge pages (and in fact, any compound page of more than 2 pages), the +GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting +is achieved, by using the 3rd struct page in the compound page. A new struct +page field, hpage_pinned_refcount, has been added in order to support this. + +This approach for compound pages avoids the counting upper limit problems that +are discussed below. Those limitations would have been aggravated severely by +huge pages, because each tail page adds a refcount to the head page. And in +fact, testing revealed that, without a separate hpage_pinned_refcount field, +page overflows were seen in some huge page stress tests. + +This also means that huge pages and compound pages (of order > 1) do not suffer +from the false positives problem that is mentioned below.:: + + Function + -------- + pin_user_pages FOLL_PIN is always set internally by this function. + pin_user_pages_fast FOLL_PIN is always set internally by this function. + pin_user_pages_remote FOLL_PIN is always set internally by this function. + +For these get_user_pages*() functions, FOLL_GET might not even be specified. +Behavior is a little more complex than above. If FOLL_GET was *not* specified, +but the caller passed in a non-null struct pages* array, then the function +sets FOLL_GET for you, and proceeds to pin pages by incrementing the refcount +of each page by +1.:: + + Function + -------- + get_user_pages FOLL_GET is sometimes set internally by this function. + get_user_pages_fast FOLL_GET is sometimes set internally by this function. + get_user_pages_remote FOLL_GET is sometimes set internally by this function. + +Tracking dma-pinned pages +========================= + +Some of the key design constraints, and solutions, for tracking dma-pinned +pages: + +* An actual reference count, per struct page, is required. This is because + multiple processes may pin and unpin a page. + +* False positives (reporting that a page is dma-pinned, when in fact it is not) + are acceptable, but false negatives are not. + +* struct page may not be increased in size for this, and all fields are already + used. + +* Given the above, we can overload the page->_refcount field by using, sort of, + the upper bits in that field for a dma-pinned count. "Sort of", means that, + rather than dividing page->_refcount into bit fields, we simple add a medium- + large value (GUP_PIN_COUNTING_BIAS, initially chosen to be 1024: 10 bits) to + page->_refcount. This provides fuzzy behavior: if a page has get_page() called + on it 1024 times, then it will appear to have a single dma-pinned count. + And again, that's acceptable. + +This also leads to limitations: there are only 31-10==21 bits available for a +counter that increments 10 bits at a time. + +* Callers must specifically request "dma-pinned tracking of pages". In other + words, just calling get_user_pages() will not suffice; a new set of functions, + pin_user_page() and related, must be used. + +FOLL_PIN, FOLL_GET, FOLL_LONGTERM: when to use which flags +========================================================== + +Thanks to Jan Kara, Vlastimil Babka and several other -mm people, for describing +these categories: + +CASE 1: Direct IO (DIO) +----------------------- +There are GUP references to pages that are serving +as DIO buffers. These buffers are needed for a relatively short time (so they +are not "long term"). No special synchronization with page_mkclean() or +munmap() is provided. Therefore, flags to set at the call site are: :: + + FOLL_PIN + +...but rather than setting FOLL_PIN directly, call sites should use one of +the pin_user_pages*() routines that set FOLL_PIN. + +CASE 2: RDMA +------------ +There are GUP references to pages that are serving as DMA +buffers. These buffers are needed for a long time ("long term"). No special +synchronization with page_mkclean() or munmap() is provided. Therefore, flags +to set at the call site are: :: + + FOLL_PIN | FOLL_LONGTERM + +NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's +because DAX pages do not have a separate page cache, and so "pinning" implies +locking down file system blocks, which is not (yet) supported in that way. + +CASE 3: Hardware with page faulting support +------------------------------------------- +Here, a well-written driver doesn't normally need to pin pages at all. However, +if the driver does choose to do so, it can register MMU notifiers for the range, +and will be called back upon invalidation. Either way (avoiding page pinning, or +using MMU notifiers to unpin upon request), there is proper synchronization with +both filesystem and mm (page_mkclean(), munmap(), etc). + +Therefore, neither flag needs to be set. + +In this case, ideally, neither get_user_pages() nor pin_user_pages() should be +called. Instead, the software should be written so that it does not pin pages. +This allows mm and filesystems to operate more efficiently and reliably. + +CASE 4: Pinning for struct page manipulation only +------------------------------------------------- +Here, normal GUP calls are sufficient, so neither flag needs to be set. + +page_maybe_dma_pinned(): the whole point of pinning +=================================================== + +The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able +to query, "is this page DMA-pinned?" That allows code such as page_mkclean() +(and file system writeback code in general) to make informed decisions about +what to do when a page cannot be unmapped due to such pins. + +What to do in those cases is the subject of a years-long series of discussions +and debates (see the References at the end of this document). It's a TODO item +here: fill in the details once that's worked out. Meanwhile, it's safe to say +that having this available: :: + + static inline bool page_maybe_dma_pinned(struct page *page) + +...is a prerequisite to solving the long-running gup+DMA problem. + +Another way of thinking about FOLL_GET, FOLL_PIN, and FOLL_LONGTERM +=================================================================== + +Another way of thinking about these flags is as a progression of restrictions: +FOLL_GET is for struct page manipulation, without affecting the data that the +struct page refers to. FOLL_PIN is a *replacement* for FOLL_GET, and is for +short term pins on pages whose data *will* get accessed. As such, FOLL_PIN is +a "more severe" form of pinning. And finally, FOLL_LONGTERM is an even more +restrictive case that has FOLL_PIN as a prerequisite: this is for pages that +will be pinned longterm, and whose data will be accessed. + +Unit testing +============ +This file:: + + tools/testing/selftests/vm/gup_benchmark.c + +has the following new calls to exercise the new pin*() wrapper functions: + +* PIN_FAST_BENCHMARK (./gup_benchmark -a) +* PIN_BENCHMARK (./gup_benchmark -b) + +You can monitor how many total dma-pinned pages have been acquired and released +since the system was booted, via two new /proc/vmstat entries: :: + + /proc/vmstat/nr_foll_pin_acquired + /proc/vmstat/nr_foll_pin_released + +Under normal conditions, these two values will be equal unless there are any +long-term [R]DMA pins in place, or during pin/unpin transitions. + +* nr_foll_pin_acquired: This is the number of logical pins that have been + acquired since the system was powered on. For huge pages, the head page is + pinned once for each page (head page and each tail page) within the huge page. + This follows the same sort of behavior that get_user_pages() uses for huge + pages: the head page is refcounted once for each tail or head page in the huge + page, when get_user_pages() is applied to a huge page. + +* nr_foll_pin_released: The number of logical pins that have been released since + the system was powered on. Note that pages are released (unpinned) on a + PAGE_SIZE granularity, even if the original pin was applied to a huge page. + Becaused of the pin count behavior described above in "nr_foll_pin_acquired", + the accounting balances out, so that after doing this:: + + pin_user_pages(huge_page); + for (each page in huge_page) + unpin_user_page(page); + +...the following is expected:: + + nr_foll_pin_released == nr_foll_pin_acquired + +(...unless it was already out of balance due to a long-term RDMA pin being in +place.) + +Other diagnostics +================= + +dump_page() has been enhanced slightly, to handle these new counting fields, and +to better report on compound pages in general. Specifically, for compound pages +with order > 1, the exact (hpage_pinned_refcount) pincount is reported. + +References +========== + +* `Some slow progress on get_user_pages() (Apr 2, 2019) `_ +* `DMA and get_user_pages() (LPC: Dec 12, 2018) `_ +* `The trouble with get_user_pages() (Apr 30, 2018) `_ +* `LWN kernel index: get_user_pages() `_ + +John Hubbard, October, 2019 diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 8ebe46b1af39d88171acc6055759db0b8799ff20..5dfcc4592b23efe335673ea13ba34fbbe4da6338 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -112,6 +112,20 @@ used when printing stack backtraces. The specifier takes into consideration the effect of compiler optimisations which may occur when tail-calls are used and marked with the noreturn GCC attribute. +Probed Pointers from BPF / tracing +---------------------------------- + +:: + + %pks kernel string + %pus user string + +The ``k`` and ``u`` specifiers are used for printing prior probed memory from +either kernel memory (k) or user memory (u). The subsequent ``s`` specifier +results in printing a string. For direct use in regular vsnprintf() the (k) +and (u) annotation is ignored, however, when used out of BPF's bpf_trace_printk(), +for example, it reads the memory it is pointing to without faulting. + Kernel Pointers --------------- diff --git a/Documentation/core-api/timekeeping.rst b/Documentation/core-api/timekeeping.rst index c0ffa30c7c37e3060731645a55bf01922b5ca93f..729e24864fe738a3f5bd79a62d56d9e7c4542c85 100644 --- a/Documentation/core-api/timekeeping.rst +++ b/Documentation/core-api/timekeeping.rst @@ -154,9 +154,9 @@ architectures. These are the recommended replacements: Use ktime_get() or ktime_get_ts64() instead. -.. c:function:: struct timeval do_gettimeofday( void ) - struct timespec getnstimeofday( void ) - struct timespec64 getnstimeofday64( void ) +.. c:function:: void do_gettimeofday( struct timeval * ) + void getnstimeofday( struct timespec * ) + void getnstimeofday64( struct timespec64 * ) void ktime_get_real_ts( struct timespec * ) ktime_get_real_ts64() is a direct replacement, but consider using diff --git a/Documentation/cpu-freq/amd-powernow.txt b/Documentation/cpu-freq/amd-powernow.txt deleted file mode 100644 index 254da155fa47d5d1ab9b5080fe9870d9d60506c6..0000000000000000000000000000000000000000 --- a/Documentation/cpu-freq/amd-powernow.txt +++ /dev/null @@ -1,38 +0,0 @@ - -PowerNow! and Cool'n'Quiet are AMD names for frequency -management capabilities in AMD processors. As the hardware -implementation changes in new generations of the processors, -there is a different cpu-freq driver for each generation. - -Note that the driver's will not load on the "wrong" hardware, -so it is safe to try each driver in turn when in doubt as to -which is the correct driver. - -Note that the functionality to change frequency (and voltage) -is not available in all processors. The drivers will refuse -to load on processors without this capability. The capability -is detected with the cpuid instruction. - -The drivers use BIOS supplied tables to obtain frequency and -voltage information appropriate for a particular platform. -Frequency transitions will be unavailable if the BIOS does -not supply these tables. - -6th Generation: powernow-k6 - -7th Generation: powernow-k7: Athlon, Duron, Geode. - -8th Generation: powernow-k8: Athlon, Athlon 64, Opteron, Sempron. -Documentation on this functionality in 8th generation processors -is available in the "BIOS and Kernel Developer's Guide", publication -26094, in chapter 9, available for download from www.amd.com. - -BIOS supplied data, for powernow-k7 and for powernow-k8, may be -from either the PSB table or from ACPI objects. The ACPI support -is only available if the kernel config sets CONFIG_ACPI_PROCESSOR. -The powernow-k8 driver will attempt to use ACPI if so configured, -and fall back to PST if that fails. -The powernow-k7 driver will try to use the PSB support first, and -fall back to ACPI if the PSB support fails. A module parameter, -acpi_force, is provided to force ACPI support to be used instead -of PSB support. diff --git a/Documentation/cpu-freq/core.rst b/Documentation/cpu-freq/core.rst new file mode 100644 index 0000000000000000000000000000000000000000..33cb90bd1d8f9fac3b21cdb8190b768c30980224 --- /dev/null +++ b/Documentation/cpu-freq/core.rst @@ -0,0 +1,113 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================================================= +General description of the CPUFreq core and CPUFreq notifiers +============================================================= + +Authors: + - Dominik Brodowski + - David Kimdon + - Rafael J. Wysocki + - Viresh Kumar + +.. Contents: + + 1. CPUFreq core and interfaces + 2. CPUFreq notifiers + 3. CPUFreq Table Generation with Operating Performance Point (OPP) + +1. General Information +====================== + +The CPUFreq core code is located in drivers/cpufreq/cpufreq.c. This +cpufreq code offers a standardized interface for the CPUFreq +architecture drivers (those pieces of code that do actual +frequency transitions), as well as to "notifiers". These are device +drivers or other part of the kernel that need to be informed of +policy changes (ex. thermal modules like ACPI) or of all +frequency changes (ex. timing code) or even need to force certain +speed limits (like LCD drivers on ARM architecture). Additionally, the +kernel "constant" loops_per_jiffy is updated on frequency changes +here. + +Reference counting of the cpufreq policies is done by cpufreq_cpu_get +and cpufreq_cpu_put, which make sure that the cpufreq driver is +correctly registered with the core, and will not be unloaded until +cpufreq_put_cpu is called. That also ensures that the respective cpufreq +policy doesn't get freed while being used. + +2. CPUFreq notifiers +==================== + +CPUFreq notifiers conform to the standard kernel notifier interface. +See linux/include/linux/notifier.h for details on notifiers. + +There are two different CPUFreq notifiers - policy notifiers and +transition notifiers. + + +2.1 CPUFreq policy notifiers +---------------------------- + +These are notified when a new policy is created or removed. + +The phase is specified in the second argument to the notifier. The phase is +CPUFREQ_CREATE_POLICY when the policy is first created and it is +CPUFREQ_REMOVE_POLICY when the policy is removed. + +The third argument, a ``void *pointer``, points to a struct cpufreq_policy +consisting of several values, including min, max (the lower and upper +frequencies (in kHz) of the new policy). + + +2.2 CPUFreq transition notifiers +-------------------------------- + +These are notified twice for each online CPU in the policy, when the +CPUfreq driver switches the CPU core frequency and this change has no +any external implications. + +The second argument specifies the phase - CPUFREQ_PRECHANGE or +CPUFREQ_POSTCHANGE. + +The third argument is a struct cpufreq_freqs with the following +values: + +===== =========================== +cpu number of the affected CPU +old old frequency +new new frequency +flags flags of the cpufreq driver +===== =========================== + +3. CPUFreq Table Generation with Operating Performance Point (OPP) +================================================================== +For details about OPP, see Documentation/power/opp.rst + +dev_pm_opp_init_cpufreq_table - + This function provides a ready to use conversion routine to translate + the OPP layer's internal information about the available frequencies + into a format readily providable to cpufreq. + + .. Warning:: + + Do not use this function in interrupt context. + + Example:: + + soc_pm_init() + { + /* Do things */ + r = dev_pm_opp_init_cpufreq_table(dev, &freq_table); + if (!r) + policy->freq_table = freq_table; + /* Do other things */ + } + + .. note:: + + This function is available only if CONFIG_CPU_FREQ is enabled in + addition to CONFIG_PM_OPP. + +dev_pm_opp_free_cpufreq_table + Free up the table allocated by dev_pm_opp_init_cpufreq_table diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt deleted file mode 100644 index ed577d9c154b630f7d6e801abe7b749e455ac750..0000000000000000000000000000000000000000 --- a/Documentation/cpu-freq/core.txt +++ /dev/null @@ -1,112 +0,0 @@ - CPU frequency and voltage scaling code in the Linux(TM) kernel - - - L i n u x C P U F r e q - - C P U F r e q C o r e - - - Dominik Brodowski - David Kimdon - Rafael J. Wysocki - Viresh Kumar - - - - Clock scaling allows you to change the clock speed of the CPUs on the - fly. This is a nice method to save battery power, because the lower - the clock speed, the less power the CPU consumes. - - -Contents: ---------- -1. CPUFreq core and interfaces -2. CPUFreq notifiers -3. CPUFreq Table Generation with Operating Performance Point (OPP) - -1. General Information -======================= - -The CPUFreq core code is located in drivers/cpufreq/cpufreq.c. This -cpufreq code offers a standardized interface for the CPUFreq -architecture drivers (those pieces of code that do actual -frequency transitions), as well as to "notifiers". These are device -drivers or other part of the kernel that need to be informed of -policy changes (ex. thermal modules like ACPI) or of all -frequency changes (ex. timing code) or even need to force certain -speed limits (like LCD drivers on ARM architecture). Additionally, the -kernel "constant" loops_per_jiffy is updated on frequency changes -here. - -Reference counting of the cpufreq policies is done by cpufreq_cpu_get -and cpufreq_cpu_put, which make sure that the cpufreq driver is -correctly registered with the core, and will not be unloaded until -cpufreq_put_cpu is called. That also ensures that the respective cpufreq -policy doesn't get freed while being used. - -2. CPUFreq notifiers -==================== - -CPUFreq notifiers conform to the standard kernel notifier interface. -See linux/include/linux/notifier.h for details on notifiers. - -There are two different CPUFreq notifiers - policy notifiers and -transition notifiers. - - -2.1 CPUFreq policy notifiers ----------------------------- - -These are notified when a new policy is created or removed. - -The phase is specified in the second argument to the notifier. The phase is -CPUFREQ_CREATE_POLICY when the policy is first created and it is -CPUFREQ_REMOVE_POLICY when the policy is removed. - -The third argument, a void *pointer, points to a struct cpufreq_policy -consisting of several values, including min, max (the lower and upper -frequencies (in kHz) of the new policy). - - -2.2 CPUFreq transition notifiers --------------------------------- - -These are notified twice for each online CPU in the policy, when the -CPUfreq driver switches the CPU core frequency and this change has no -any external implications. - -The second argument specifies the phase - CPUFREQ_PRECHANGE or -CPUFREQ_POSTCHANGE. - -The third argument is a struct cpufreq_freqs with the following -values: -cpu - number of the affected CPU -old - old frequency -new - new frequency -flags - flags of the cpufreq driver - -3. CPUFreq Table Generation with Operating Performance Point (OPP) -================================================================== -For details about OPP, see Documentation/power/opp.rst - -dev_pm_opp_init_cpufreq_table - - This function provides a ready to use conversion routine to translate - the OPP layer's internal information about the available frequencies - into a format readily providable to cpufreq. - - WARNING: Do not use this function in interrupt context. - - Example: - soc_pm_init() - { - /* Do things */ - r = dev_pm_opp_init_cpufreq_table(dev, &freq_table); - if (!r) - policy->freq_table = freq_table; - /* Do other things */ - } - - NOTE: This function is available only if CONFIG_CPU_FREQ is enabled in - addition to CONFIG_PM_OPP. - -dev_pm_opp_free_cpufreq_table - Free up the table allocated by dev_pm_opp_init_cpufreq_table diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst new file mode 100644 index 0000000000000000000000000000000000000000..a697278ce19034e87a806912418ecb93379273d5 --- /dev/null +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -0,0 +1,292 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +How to Implement a new CPUFreq Processor Driver +=============================================== + +Authors: + + + - Dominik Brodowski + - Rafael J. Wysocki + - Viresh Kumar + +.. Contents + + 1. What To Do? + 1.1 Initialization + 1.2 Per-CPU Initialization + 1.3 verify + 1.4 target/target_index or setpolicy? + 1.5 target/target_index + 1.6 setpolicy + 1.7 get_intermediate and target_intermediate + 2. Frequency Table Helpers + + + +1. What To Do? +============== + +So, you just got a brand-new CPU / chipset with datasheets and want to +add cpufreq support for this CPU / chipset? Great. Here are some hints +on what is necessary: + + +1.1 Initialization +------------------ + +First of all, in an __initcall level 7 (module_init()) or later +function check whether this kernel runs on the right CPU and the right +chipset. If so, register a struct cpufreq_driver with the CPUfreq core +using cpufreq_register_driver() + +What shall this struct cpufreq_driver contain? + + .name - The name of this driver. + + .init - A pointer to the per-policy initialization function. + + .verify - A pointer to a "verification" function. + + .setpolicy _or_ .fast_switch _or_ .target _or_ .target_index - See + below on the differences. + +And optionally + + .flags - Hints for the cpufreq core. + + .driver_data - cpufreq driver specific data. + + .resolve_freq - Returns the most appropriate frequency for a target + frequency. Doesn't change the frequency though. + + .get_intermediate and target_intermediate - Used to switch to stable + frequency while changing CPU frequency. + + .get - Returns current frequency of the CPU. + + .bios_limit - Returns HW/BIOS max frequency limitations for the CPU. + + .exit - A pointer to a per-policy cleanup function called during + CPU_POST_DEAD phase of cpu hotplug process. + + .stop_cpu - A pointer to a per-policy stop function called during + CPU_DOWN_PREPARE phase of cpu hotplug process. + + .suspend - A pointer to a per-policy suspend function which is called + with interrupts disabled and _after_ the governor is stopped for the + policy. + + .resume - A pointer to a per-policy resume function which is called + with interrupts disabled and _before_ the governor is started again. + + .ready - A pointer to a per-policy ready function which is called after + the policy is fully initialized. + + .attr - A pointer to a NULL-terminated list of "struct freq_attr" which + allow to export values to sysfs. + + .boost_enabled - If set, boost frequencies are enabled. + + .set_boost - A pointer to a per-policy function to enable/disable boost + frequencies. + + +1.2 Per-CPU Initialization +-------------------------- + +Whenever a new CPU is registered with the device model, or after the +cpufreq driver registers itself, the per-policy initialization function +cpufreq_driver.init is called if no cpufreq policy existed for the CPU. +Note that the .init() and .exit() routines are called only once for the +policy and not for each CPU managed by the policy. It takes a ``struct +cpufreq_policy *policy`` as argument. What to do now? + +If necessary, activate the CPUfreq support on your CPU. + +Then, the driver must fill in the following values: + ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.min_freq _and_ | | +|policy->cpuinfo.max_freq | the minimum and maximum frequency | +| | (in kHz) which is supported by | +| | this CPU | ++-----------------------------------+--------------------------------------+ +|policy->cpuinfo.transition_latency | the time it takes on this CPU to | +| | switch between two frequencies in | +| | nanoseconds (if appropriate, else | +| | specify CPUFREQ_ETERNAL) | ++-----------------------------------+--------------------------------------+ +|policy->cur | The current operating frequency of | +| | this CPU (if appropriate) | ++-----------------------------------+--------------------------------------+ +|policy->min, | | +|policy->max, | | +|policy->policy and, if necessary, | | +|policy->governor | must contain the "default policy" for| +| | this CPU. A few moments later, | +| | cpufreq_driver.verify and either | +| | cpufreq_driver.setpolicy or | +| | cpufreq_driver.target/target_index is| +| | called with these values. | ++-----------------------------------+--------------------------------------+ +|policy->cpus | Update this with the masks of the | +| | (online + offline) CPUs that do DVFS | +| | along with this CPU (i.e. that share| +| | clock/voltage rails with it). | ++-----------------------------------+--------------------------------------+ + +For setting some of these values (cpuinfo.min[max]_freq, policy->min[max]), the +frequency table helpers might be helpful. See the section 2 for more information +on them. + + +1.3 verify +---------- + +When the user decides a new policy (consisting of +"policy,governor,min,max") shall be set, this policy must be validated +so that incompatible values can be corrected. For verifying these +values cpufreq_verify_within_limits(``struct cpufreq_policy *policy``, +``unsigned int min_freq``, ``unsigned int max_freq``) function might be helpful. +See section 2 for details on frequency table helpers. + +You need to make sure that at least one valid frequency (or operating +range) is within policy->min and policy->max. If necessary, increase +policy->max first, and only if this is no solution, decrease policy->min. + + +1.4 target or target_index or setpolicy or fast_switch? +------------------------------------------------------- + +Most cpufreq drivers or even most cpu frequency scaling algorithms +only allow the CPU frequency to be set to predefined fixed values. For +these, you use the ->target(), ->target_index() or ->fast_switch() +callbacks. + +Some cpufreq capable processors switch the frequency between certain +limits on their own. These shall use the ->setpolicy() callback. + + +1.5. target/target_index +------------------------ + +The target_index call has two arguments: ``struct cpufreq_policy *policy``, +and ``unsigned int`` index (into the exposed frequency table). + +The CPUfreq driver must set the new frequency when called here. The +actual frequency must be determined by freq_table[index].frequency. + +It should always restore to earlier frequency (i.e. policy->restore_freq) in +case of errors, even if we switched to intermediate frequency earlier. + +Deprecated +---------- +The target call has three arguments: ``struct cpufreq_policy *policy``, +unsigned int target_frequency, unsigned int relation. + +The CPUfreq driver must set the new frequency when called here. The +actual frequency must be determined using the following rules: + +- keep close to "target_freq" +- policy->min <= new_freq <= policy->max (THIS MUST BE VALID!!!) +- if relation==CPUFREQ_REL_L, try to select a new_freq higher than or equal + target_freq. ("L for lowest, but no lower than") +- if relation==CPUFREQ_REL_H, try to select a new_freq lower than or equal + target_freq. ("H for highest, but no higher than") + +Here again the frequency table helper might assist you - see section 2 +for details. + +1.6. fast_switch +---------------- + +This function is used for frequency switching from scheduler's context. +Not all drivers are expected to implement it, as sleeping from within +this callback isn't allowed. This callback must be highly optimized to +do switching as fast as possible. + +This function has two arguments: ``struct cpufreq_policy *policy`` and +``unsigned int target_frequency``. + + +1.7 setpolicy +------------- + +The setpolicy call only takes a ``struct cpufreq_policy *policy`` as +argument. You need to set the lower limit of the in-processor or +in-chipset dynamic frequency switching to policy->min, the upper limit +to policy->max, and -if supported- select a performance-oriented +setting when policy->policy is CPUFREQ_POLICY_PERFORMANCE, and a +powersaving-oriented setting when CPUFREQ_POLICY_POWERSAVE. Also check +the reference implementation in drivers/cpufreq/longrun.c + +1.8 get_intermediate and target_intermediate +-------------------------------------------- + +Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset. + +get_intermediate should return a stable intermediate frequency platform wants to +switch to, and target_intermediate() should set CPU to that frequency, before +jumping to the frequency corresponding to 'index'. Core will take care of +sending notifications and driver doesn't have to handle them in +target_intermediate() or target_index(). + +Drivers can return '0' from get_intermediate() in case they don't wish to switch +to intermediate frequency for some target frequency. In that case core will +directly call ->target_index(). + +NOTE: ->target_index() should restore to policy->restore_freq in case of +failures as core would send notifications for that. + + +2. Frequency Table Helpers +========================== + +As most cpufreq processors only allow for being set to a few specific +frequencies, a "frequency table" with some functions might assist in +some work of the processor driver. Such a "frequency table" consists of +an array of struct cpufreq_frequency_table entries, with driver specific +values in "driver_data", the corresponding frequency in "frequency" and +flags set. At the end of the table, you need to add a +cpufreq_frequency_table entry with frequency set to CPUFREQ_TABLE_END. +And if you want to skip one entry in the table, set the frequency to +CPUFREQ_ENTRY_INVALID. The entries don't need to be in sorted in any +particular order, but if they are cpufreq core will do DVFS a bit +quickly for them as search for best match is faster. + +The cpufreq table is verified automatically by the core if the policy contains a +valid pointer in its policy->freq_table field. + +cpufreq_frequency_table_verify() assures that at least one valid +frequency is within policy->min and policy->max, and all other criteria +are met. This is helpful for the ->verify call. + +cpufreq_frequency_table_target() is the corresponding frequency table +helper for the ->target stage. Just pass the values to this function, +and this function returns the of the frequency table entry which +contains the frequency the CPU shall be set to. + +The following macros can be used as iterators over cpufreq_frequency_table: + +cpufreq_for_each_entry(pos, table) - iterates over all entries of frequency +table. + +cpufreq_for_each_valid_entry(pos, table) - iterates over all entries, +excluding CPUFREQ_ENTRY_INVALID frequencies. +Use arguments "pos" - a ``cpufreq_frequency_table *`` as a loop cursor and +"table" - the ``cpufreq_frequency_table *`` you want to iterate over. + +For example:: + + struct cpufreq_frequency_table *pos, *driver_freq_table; + + cpufreq_for_each_entry(pos, driver_freq_table) { + /* Do something with pos */ + pos->frequency = ... + } + +If you need to work with the position of pos within driver_freq_table, +do not subtract the pointers, as it is quite costly. Instead, use the +macros cpufreq_for_each_entry_idx() and cpufreq_for_each_valid_entry_idx(). diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt deleted file mode 100644 index 6e353d00cdc687eac454c4c586f7075cb73399eb..0000000000000000000000000000000000000000 --- a/Documentation/cpu-freq/cpu-drivers.txt +++ /dev/null @@ -1,295 +0,0 @@ - CPU frequency and voltage scaling code in the Linux(TM) kernel - - - L i n u x C P U F r e q - - C P U D r i v e r s - - - information for developers - - - - Dominik Brodowski - Rafael J. Wysocki - Viresh Kumar - - - - Clock scaling allows you to change the clock speed of the CPUs on the - fly. This is a nice method to save battery power, because the lower - the clock speed, the less power the CPU consumes. - - -Contents: ---------- -1. What To Do? -1.1 Initialization -1.2 Per-CPU Initialization -1.3 verify -1.4 target/target_index or setpolicy? -1.5 target/target_index -1.6 setpolicy -1.7 get_intermediate and target_intermediate -2. Frequency Table Helpers - - - -1. What To Do? -============== - -So, you just got a brand-new CPU / chipset with datasheets and want to -add cpufreq support for this CPU / chipset? Great. Here are some hints -on what is necessary: - - -1.1 Initialization ------------------- - -First of all, in an __initcall level 7 (module_init()) or later -function check whether this kernel runs on the right CPU and the right -chipset. If so, register a struct cpufreq_driver with the CPUfreq core -using cpufreq_register_driver() - -What shall this struct cpufreq_driver contain? - - .name - The name of this driver. - - .init - A pointer to the per-policy initialization function. - - .verify - A pointer to a "verification" function. - - .setpolicy _or_ .fast_switch _or_ .target _or_ .target_index - See - below on the differences. - -And optionally - - .flags - Hints for the cpufreq core. - - .driver_data - cpufreq driver specific data. - - .resolve_freq - Returns the most appropriate frequency for a target - frequency. Doesn't change the frequency though. - - .get_intermediate and target_intermediate - Used to switch to stable - frequency while changing CPU frequency. - - .get - Returns current frequency of the CPU. - - .bios_limit - Returns HW/BIOS max frequency limitations for the CPU. - - .exit - A pointer to a per-policy cleanup function called during - CPU_POST_DEAD phase of cpu hotplug process. - - .stop_cpu - A pointer to a per-policy stop function called during - CPU_DOWN_PREPARE phase of cpu hotplug process. - - .suspend - A pointer to a per-policy suspend function which is called - with interrupts disabled and _after_ the governor is stopped for the - policy. - - .resume - A pointer to a per-policy resume function which is called - with interrupts disabled and _before_ the governor is started again. - - .ready - A pointer to a per-policy ready function which is called after - the policy is fully initialized. - - .attr - A pointer to a NULL-terminated list of "struct freq_attr" which - allow to export values to sysfs. - - .boost_enabled - If set, boost frequencies are enabled. - - .set_boost - A pointer to a per-policy function to enable/disable boost - frequencies. - - -1.2 Per-CPU Initialization --------------------------- - -Whenever a new CPU is registered with the device model, or after the -cpufreq driver registers itself, the per-policy initialization function -cpufreq_driver.init is called if no cpufreq policy existed for the CPU. -Note that the .init() and .exit() routines are called only once for the -policy and not for each CPU managed by the policy. It takes a struct -cpufreq_policy *policy as argument. What to do now? - -If necessary, activate the CPUfreq support on your CPU. - -Then, the driver must fill in the following values: - -policy->cpuinfo.min_freq _and_ -policy->cpuinfo.max_freq - the minimum and maximum frequency - (in kHz) which is supported by - this CPU -policy->cpuinfo.transition_latency the time it takes on this CPU to - switch between two frequencies in - nanoseconds (if appropriate, else - specify CPUFREQ_ETERNAL) - -policy->cur The current operating frequency of - this CPU (if appropriate) -policy->min, -policy->max, -policy->policy and, if necessary, -policy->governor must contain the "default policy" for - this CPU. A few moments later, - cpufreq_driver.verify and either - cpufreq_driver.setpolicy or - cpufreq_driver.target/target_index is called - with these values. -policy->cpus Update this with the masks of the - (online + offline) CPUs that do DVFS - along with this CPU (i.e. that share - clock/voltage rails with it). - -For setting some of these values (cpuinfo.min[max]_freq, policy->min[max]), the -frequency table helpers might be helpful. See the section 2 for more information -on them. - - -1.3 verify ----------- - -When the user decides a new policy (consisting of -"policy,governor,min,max") shall be set, this policy must be validated -so that incompatible values can be corrected. For verifying these -values cpufreq_verify_within_limits(struct cpufreq_policy *policy, -unsigned int min_freq, unsigned int max_freq) function might be helpful. -See section 2 for details on frequency table helpers. - -You need to make sure that at least one valid frequency (or operating -range) is within policy->min and policy->max. If necessary, increase -policy->max first, and only if this is no solution, decrease policy->min. - - -1.4 target or target_index or setpolicy or fast_switch? -------------------------------------------------------- - -Most cpufreq drivers or even most cpu frequency scaling algorithms -only allow the CPU frequency to be set to predefined fixed values. For -these, you use the ->target(), ->target_index() or ->fast_switch() -callbacks. - -Some cpufreq capable processors switch the frequency between certain -limits on their own. These shall use the ->setpolicy() callback. - - -1.5. target/target_index ------------------------- - -The target_index call has two arguments: struct cpufreq_policy *policy, -and unsigned int index (into the exposed frequency table). - -The CPUfreq driver must set the new frequency when called here. The -actual frequency must be determined by freq_table[index].frequency. - -It should always restore to earlier frequency (i.e. policy->restore_freq) in -case of errors, even if we switched to intermediate frequency earlier. - -Deprecated: ----------- -The target call has three arguments: struct cpufreq_policy *policy, -unsigned int target_frequency, unsigned int relation. - -The CPUfreq driver must set the new frequency when called here. The -actual frequency must be determined using the following rules: - -- keep close to "target_freq" -- policy->min <= new_freq <= policy->max (THIS MUST BE VALID!!!) -- if relation==CPUFREQ_REL_L, try to select a new_freq higher than or equal - target_freq. ("L for lowest, but no lower than") -- if relation==CPUFREQ_REL_H, try to select a new_freq lower than or equal - target_freq. ("H for highest, but no higher than") - -Here again the frequency table helper might assist you - see section 2 -for details. - -1.6. fast_switch ----------------- - -This function is used for frequency switching from scheduler's context. -Not all drivers are expected to implement it, as sleeping from within -this callback isn't allowed. This callback must be highly optimized to -do switching as fast as possible. - -This function has two arguments: struct cpufreq_policy *policy and -unsigned int target_frequency. - - -1.7 setpolicy -------------- - -The setpolicy call only takes a struct cpufreq_policy *policy as -argument. You need to set the lower limit of the in-processor or -in-chipset dynamic frequency switching to policy->min, the upper limit -to policy->max, and -if supported- select a performance-oriented -setting when policy->policy is CPUFREQ_POLICY_PERFORMANCE, and a -powersaving-oriented setting when CPUFREQ_POLICY_POWERSAVE. Also check -the reference implementation in drivers/cpufreq/longrun.c - -1.8 get_intermediate and target_intermediate --------------------------------------------- - -Only for drivers with target_index() and CPUFREQ_ASYNC_NOTIFICATION unset. - -get_intermediate should return a stable intermediate frequency platform wants to -switch to, and target_intermediate() should set CPU to that frequency, before -jumping to the frequency corresponding to 'index'. Core will take care of -sending notifications and driver doesn't have to handle them in -target_intermediate() or target_index(). - -Drivers can return '0' from get_intermediate() in case they don't wish to switch -to intermediate frequency for some target frequency. In that case core will -directly call ->target_index(). - -NOTE: ->target_index() should restore to policy->restore_freq in case of -failures as core would send notifications for that. - - -2. Frequency Table Helpers -========================== - -As most cpufreq processors only allow for being set to a few specific -frequencies, a "frequency table" with some functions might assist in -some work of the processor driver. Such a "frequency table" consists of -an array of struct cpufreq_frequency_table entries, with driver specific -values in "driver_data", the corresponding frequency in "frequency" and -flags set. At the end of the table, you need to add a -cpufreq_frequency_table entry with frequency set to CPUFREQ_TABLE_END. -And if you want to skip one entry in the table, set the frequency to -CPUFREQ_ENTRY_INVALID. The entries don't need to be in sorted in any -particular order, but if they are cpufreq core will do DVFS a bit -quickly for them as search for best match is faster. - -The cpufreq table is verified automatically by the core if the policy contains a -valid pointer in its policy->freq_table field. - -cpufreq_frequency_table_verify() assures that at least one valid -frequency is within policy->min and policy->max, and all other criteria -are met. This is helpful for the ->verify call. - -cpufreq_frequency_table_target() is the corresponding frequency table -helper for the ->target stage. Just pass the values to this function, -and this function returns the of the frequency table entry which -contains the frequency the CPU shall be set to. - -The following macros can be used as iterators over cpufreq_frequency_table: - -cpufreq_for_each_entry(pos, table) - iterates over all entries of frequency -table. - -cpufreq_for_each_valid_entry(pos, table) - iterates over all entries, -excluding CPUFREQ_ENTRY_INVALID frequencies. -Use arguments "pos" - a cpufreq_frequency_table * as a loop cursor and -"table" - the cpufreq_frequency_table * you want to iterate over. - -For example: - - struct cpufreq_frequency_table *pos, *driver_freq_table; - - cpufreq_for_each_entry(pos, driver_freq_table) { - /* Do something with pos */ - pos->frequency = ... - } - -If you need to work with the position of pos within driver_freq_table, -do not subtract the pointers, as it is quite costly. Instead, use the -macros cpufreq_for_each_entry_idx() and cpufreq_for_each_valid_entry_idx(). diff --git a/Documentation/cpu-freq/cpufreq-nforce2.txt b/Documentation/cpu-freq/cpufreq-nforce2.txt deleted file mode 100644 index babce13150265f173fc32dfa72ce1b91cf234afd..0000000000000000000000000000000000000000 --- a/Documentation/cpu-freq/cpufreq-nforce2.txt +++ /dev/null @@ -1,19 +0,0 @@ - -The cpufreq-nforce2 driver changes the FSB on nVidia nForce2 platforms. - -This works better than on other platforms, because the FSB of the CPU -can be controlled independently from the PCI/AGP clock. - -The module has two options: - - fid: multiplier * 10 (for example 8.5 = 85) - min_fsb: minimum FSB - -If not set, fid is calculated from the current CPU speed and the FSB. -min_fsb defaults to FSB at boot time - 50 MHz. - -IMPORTANT: The available range is limited downwards! - Also the minimum available FSB can differ, for systems - booting with 200 MHz, 150 should always work. - - diff --git a/Documentation/cpu-freq/cpufreq-stats.rst b/Documentation/cpu-freq/cpufreq-stats.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ad695b1c7db0071305693a63b3ea45b4d39d5a9 --- /dev/null +++ b/Documentation/cpu-freq/cpufreq-stats.rst @@ -0,0 +1,136 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========================================== +General Description of sysfs CPUFreq Stats +========================================== + +information for users + + +Author: Venkatesh Pallipadi + +.. Contents + + 1. Introduction + 2. Statistics Provided (with example) + 3. Configuring cpufreq-stats + + +1. Introduction +=============== + +cpufreq-stats is a driver that provides CPU frequency statistics for each CPU. +These statistics are provided in /sysfs as a bunch of read_only interfaces. This +interface (when configured) will appear in a separate directory under cpufreq +in /sysfs (/devices/system/cpu/cpuX/cpufreq/stats/) for each CPU. +Various statistics will form read_only files under this directory. + +This driver is designed to be independent of any particular cpufreq_driver +that may be running on your CPU. So, it will work with any cpufreq_driver. + + +2. Statistics Provided (with example) +===================================== + +cpufreq stats provides following statistics (explained in detail below). + +- time_in_state +- total_trans +- trans_table + +All the statistics will be from the time the stats driver has been inserted +(or the time the stats were reset) to the time when a read of a particular +statistic is done. Obviously, stats driver will not have any information +about the frequency transitions before the stats driver insertion. + +:: + + :/sys/devices/system/cpu/cpu0/cpufreq/stats # ls -l + total 0 + drwxr-xr-x 2 root root 0 May 14 16:06 . + drwxr-xr-x 3 root root 0 May 14 15:58 .. + --w------- 1 root root 4096 May 14 16:06 reset + -r--r--r-- 1 root root 4096 May 14 16:06 time_in_state + -r--r--r-- 1 root root 4096 May 14 16:06 total_trans + -r--r--r-- 1 root root 4096 May 14 16:06 trans_table + +- **reset** + +Write-only attribute that can be used to reset the stat counters. This can be +useful for evaluating system behaviour under different governors without the +need for a reboot. + +- **time_in_state** + +This gives the amount of time spent in each of the frequencies supported by +this CPU. The cat output will have "