From 3d0186bb068e6cc6c23dc1d2f0b1cf64894c40ea Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 16 Jun 2018 17:32:07 -0400 Subject: Update email address Redirect some older email addresses that are in the git logs. Signed-off-by: Matthew Wilcox --- .mailmap | 7 +++++++ MAINTAINERS | 6 +++--- arch/parisc/kernel/syscall.S | 2 +- drivers/input/keyboard/hilkbd.c | 2 +- drivers/pci/hotplug/acpiphp.h | 2 +- drivers/pci/hotplug/acpiphp_core.c | 4 ++-- drivers/pci/hotplug/acpiphp_glue.c | 2 +- fs/isofs/dir.c | 2 +- include/linux/xarray.h | 2 +- 9 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.mailmap b/.mailmap index 285e09645b31..89f532caf639 100644 --- a/.mailmap +++ b/.mailmap @@ -119,6 +119,13 @@ Mark Brown Mark Yao Martin Kepplinger Martin Kepplinger +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox +Matthew Wilcox Matthieu CASTET Mauro Carvalho Chehab Mauro Carvalho Chehab diff --git a/MAINTAINERS b/MAINTAINERS index a255240d1452..612e5dbf3431 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -536,7 +536,7 @@ F: Documentation/hwmon/adt7475 F: drivers/hwmon/adt7475.c ADVANSYS SCSI DRIVER -M: Matthew Wilcox +M: Matthew Wilcox M: Hannes Reinecke L: linux-scsi@vger.kernel.org S: Maintained @@ -4364,7 +4364,7 @@ S: Maintained F: drivers/i2c/busses/i2c-diolan-u2c.c FILESYSTEM DIRECT ACCESS (DAX) -M: Matthew Wilcox +M: Matthew Wilcox M: Ross Zwisler M: Jan Kara L: linux-fsdevel@vger.kernel.org @@ -8626,7 +8626,7 @@ F: drivers/message/fusion/ F: drivers/scsi/mpt3sas/ LSILOGIC/SYMBIOS/NCR 53C8XX and 53C1010 PCI-SCSI drivers -M: Matthew Wilcox +M: Matthew Wilcox L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/sym53c8xx_2/ diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index f453997a7b8f..a9bc90dc4ae7 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -2,7 +2,7 @@ * Linux/PA-RISC Project (http://www.parisc-linux.org/) * * System call entry code / Linux gateway page - * Copyright (c) Matthew Wilcox 1999 + * Copyright (c) Matthew Wilcox 1999 * Licensed under the GNU GPL. * thanks to Philipp Rumpf, Mike Shaver and various others * sorry about the wall, puffin.. diff --git a/drivers/input/keyboard/hilkbd.c b/drivers/input/keyboard/hilkbd.c index 5c7afdec192c..f5c5ae8b6c06 100644 --- a/drivers/input/keyboard/hilkbd.c +++ b/drivers/input/keyboard/hilkbd.c @@ -2,7 +2,7 @@ * linux/drivers/hil/hilkbd.c * * Copyright (C) 1998 Philip Blundell - * Copyright (C) 1999 Matthew Wilcox + * Copyright (C) 1999 Matthew Wilcox * Copyright (C) 1999-2007 Helge Deller * * Very basic HP Human Interface Loop (HIL) driver. diff --git a/drivers/pci/hotplug/acpiphp.h b/drivers/pci/hotplug/acpiphp.h index e438a2d734f2..14cef4cf592b 100644 --- a/drivers/pci/hotplug/acpiphp.h +++ b/drivers/pci/hotplug/acpiphp.h @@ -8,7 +8,7 @@ * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) * Copyright (C) 2002,2003 NEC Corporation - * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) + * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org) * Copyright (C) 2003-2005 Hewlett Packard * * All rights reserved. diff --git a/drivers/pci/hotplug/acpiphp_core.c b/drivers/pci/hotplug/acpiphp_core.c index ad32ffbc4b91..0447b169e7ff 100644 --- a/drivers/pci/hotplug/acpiphp_core.c +++ b/drivers/pci/hotplug/acpiphp_core.c @@ -8,7 +8,7 @@ * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) * Copyright (C) 2002,2003 NEC Corporation - * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) + * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org) * Copyright (C) 2003-2005 Hewlett Packard * * All rights reserved. @@ -40,7 +40,7 @@ bool acpiphp_disabled; static struct acpiphp_attention_info *attention_info; #define DRIVER_VERSION "0.5" -#define DRIVER_AUTHOR "Greg Kroah-Hartman , Takayoshi Kochi , Matthew Wilcox " +#define DRIVER_AUTHOR "Greg Kroah-Hartman , Takayoshi Kochi , Matthew Wilcox " #define DRIVER_DESC "ACPI Hot Plug PCI Controller Driver" MODULE_AUTHOR(DRIVER_AUTHOR); diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 12afa7fdf77e..e4c46637f32f 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -5,7 +5,7 @@ * Copyright (C) 2002,2003 Takayoshi Kochi (t-kochi@bq.jp.nec.com) * Copyright (C) 2002 Hiroshi Aono (h-aono@ap.jp.nec.com) * Copyright (C) 2002,2003 NEC Corporation - * Copyright (C) 2003-2005 Matthew Wilcox (matthew.wilcox@hp.com) + * Copyright (C) 2003-2005 Matthew Wilcox (willy@infradead.org) * Copyright (C) 2003-2005 Hewlett Packard * Copyright (C) 2005 Rajesh Shah (rajesh.shah@intel.com) * Copyright (C) 2005 Intel Corporation diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 947ce22f5b3c..f0fe641893a5 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -46,7 +46,7 @@ int isofs_name_translate(struct iso_directory_record *de, char *new, struct inod return i; } -/* Acorn extensions written by Matthew Wilcox 1998 */ +/* Acorn extensions written by Matthew Wilcox 1998 */ int get_acorn_filename(struct iso_directory_record *de, char *retname, struct inode *inode) { diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 2dfc8006fe64..9e4c86853fa4 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -4,7 +4,7 @@ /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation - * Author: Matthew Wilcox + * Author: Matthew Wilcox */ #include -- cgit v1.2.3 From 66ee620f06f99d72475db6eb638559ba608c7dee Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 25 Jun 2018 06:56:50 -0400 Subject: idr: Permit any valid kernel pointer to be stored An upcoming change to the encoding of internal entries will set the bottom two bits to 0b10. Unfortunately, m68k only aligns some data structures to 2 bytes, so the IDR will interpret them as internal entries and things will go badly wrong. Change the radix tree so that it stops either when the node indicates that it's the bottom of the tree (shift == 0) or when the entry is not an internal entry. This means we cannot insert an arbitrary kernel pointer as a multiorder entry, but the IDR does not permit multiorder entries. Annoyingly, this means the IDR can no longer take advantage of the radix tree's ability to store a single entry at offset 0 without allocating memory. A pointer which is 2-byte aligned cannot be stored directly in the root as it would be indistinguishable from a node, so we must allocate a node in order to store a 2-byte pointer at index 0. The idr_replace() function does not take a GFP flags argument, so cannot allocate memory. If a user inserts a 4-byte aligned pointer at index 0 and then replaces it with a 2-byte aligned pointer, we must be able to store it. Arbitrary pointer values are still not permitted; pointers of the form 2 + (i * 4) for values of i between 0 and 1023 are reserved for the implementation. These are not valid kernel pointers as they would point into the zero page. This change does cause a runtime memory consumption regression for the IDA. I will recover that later. Signed-off-by: Matthew Wilcox Tested-by: Guenter Roeck --- lib/idr.c | 4 --- lib/radix-tree.c | 21 +++++++++---- tools/testing/radix-tree/idr-test.c | 63 +++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 10 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index fab2fd5bc326..729e381e23b4 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -39,8 +39,6 @@ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned int base = idr->idr_base; unsigned int id = *nextid; - if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr))) - return -EINVAL; if (WARN_ON_ONCE(!(idr->idr_rt.gfp_mask & ROOT_IS_IDR))) idr->idr_rt.gfp_mask |= IDR_RT_MARKER; @@ -295,8 +293,6 @@ void *idr_replace(struct idr *idr, void *ptr, unsigned long id) void __rcu **slot = NULL; void *entry; - if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr))) - return ERR_PTR(-EINVAL); id -= idr->idr_base; entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index bc03ecc4dfd2..a904a8ddd174 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -703,6 +703,14 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, if (!radix_tree_is_internal_node(child) && node->shift) break; + /* + * For an IDR, we must not shrink entry 0 into the root in + * case somebody calls idr_replace() with a pointer that + * appears to be an internal entry + */ + if (!node->shift && is_idr(root)) + break; + if (radix_tree_is_internal_node(child)) entry_to_node(child)->parent = NULL; @@ -875,8 +883,8 @@ static void radix_tree_free_nodes(struct radix_tree_node *node) for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); - if (radix_tree_is_internal_node(entry) && - !is_sibling_entry(child, entry)) { + if (radix_tree_is_internal_node(entry) && child->shift && + !is_sibling_entry(child, entry)) { child = entry_to_node(entry); offset = 0; continue; @@ -1049,6 +1057,8 @@ void *__radix_tree_lookup(const struct radix_tree_root *root, parent = entry_to_node(node); offset = radix_tree_descend(parent, &node, index); slot = parent->slots + offset; + if (parent->shift == 0) + break; } if (nodep) @@ -1123,9 +1133,6 @@ static inline void replace_sibling_entries(struct radix_tree_node *node, static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int exceptional) { - if (WARN_ON_ONCE(radix_tree_is_internal_node(item))) - return; - if (node && (count || exceptional)) { node->count += count; node->exceptional += exceptional; @@ -1784,7 +1791,7 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, goto restart; if (child == RADIX_TREE_RETRY) break; - } while (radix_tree_is_internal_node(child)); + } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); @@ -2150,6 +2157,8 @@ void __rcu **idr_get_free(struct radix_tree_root *root, shift = error; child = rcu_dereference_raw(root->rnode); } + if (start == 0 && shift == 0) + shift = RADIX_TREE_MAP_SHIFT; while (shift) { shift -= RADIX_TREE_MAP_SHIFT; diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index 321ba92c70d2..f620c831a4b5 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c @@ -227,6 +227,66 @@ void idr_u32_test(int base) idr_u32_test1(&idr, 0xffffffff); } +static void idr_align_test(struct idr *idr) +{ + char name[] = "Motorola 68000"; + int i, id; + void *entry; + + for (i = 0; i < 9; i++) { + BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i); + idr_for_each_entry(idr, entry, id); + } + idr_destroy(idr); + + for (i = 1; i < 10; i++) { + BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 1); + idr_for_each_entry(idr, entry, id); + } + idr_destroy(idr); + + for (i = 2; i < 11; i++) { + BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 2); + idr_for_each_entry(idr, entry, id); + } + idr_destroy(idr); + + for (i = 3; i < 12; i++) { + BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != i - 3); + idr_for_each_entry(idr, entry, id); + } + idr_destroy(idr); + + for (i = 0; i < 8; i++) { + BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != 0); + BUG_ON(idr_alloc(idr, &name[i + 1], 0, 0, GFP_KERNEL) != 1); + idr_for_each_entry(idr, entry, id); + idr_remove(idr, 1); + idr_for_each_entry(idr, entry, id); + idr_remove(idr, 0); + BUG_ON(!idr_is_empty(idr)); + } + + for (i = 0; i < 8; i++) { + BUG_ON(idr_alloc(idr, NULL, 0, 0, GFP_KERNEL) != 0); + idr_for_each_entry(idr, entry, id); + idr_replace(idr, &name[i], 0); + idr_for_each_entry(idr, entry, id); + BUG_ON(idr_find(idr, 0) != &name[i]); + idr_remove(idr, 0); + } + + for (i = 0; i < 8; i++) { + BUG_ON(idr_alloc(idr, &name[i], 0, 0, GFP_KERNEL) != 0); + BUG_ON(idr_alloc(idr, NULL, 0, 0, GFP_KERNEL) != 1); + idr_remove(idr, 1); + idr_for_each_entry(idr, entry, id); + idr_replace(idr, &name[i + 1], 0); + idr_for_each_entry(idr, entry, id); + idr_remove(idr, 0); + } +} + void idr_checks(void) { unsigned long i; @@ -307,6 +367,7 @@ void idr_checks(void) idr_u32_test(4); idr_u32_test(1); idr_u32_test(0); + idr_align_test(&idr); } #define module_init(x) @@ -341,6 +402,7 @@ void ida_check_nomem(void) */ void ida_check_conv_user(void) { +#if 0 DEFINE_IDA(ida); unsigned long i; @@ -358,6 +420,7 @@ void ida_check_conv_user(void) IDA_BUG_ON(&ida, id != i); } ida_destroy(&ida); +#endif } void ida_check_random(void) -- cgit v1.2.3 From 3159f943aafdbacb2f94c38fdaadabf2bbde2a14 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Nov 2017 13:30:42 -0400 Subject: xarray: Replace exceptional entries Introduce xarray value entries and tagged pointers to replace radix tree exceptional entries. This is a slight change in encoding to allow the use of an extra bit (we can now store BITS_PER_LONG - 1 bits in a value entry). It is also a change in emphasis; exceptional entries are intimidating and different. As the comment explains, you can choose to store values or pointers in the xarray and they are both first-class citizens. Signed-off-by: Matthew Wilcox Reviewed-by: Josef Bacik --- arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +- arch/powerpc/include/asm/nohash/64/pgtable.h | 4 +- drivers/gpu/drm/i915/i915_gem.c | 17 ++-- drivers/staging/erofs/utils.c | 18 ++--- fs/btrfs/compression.c | 2 +- fs/dax.c | 112 +++++++++++++-------------- fs/proc/task_mmu.c | 2 +- include/linux/radix-tree.h | 36 ++------- include/linux/swapops.h | 19 ++--- include/linux/xarray.h | 102 ++++++++++++++++++++++++ lib/idr.c | 60 ++++++-------- lib/radix-tree.c | 21 +++-- mm/filemap.c | 10 +-- mm/khugepaged.c | 2 +- mm/madvise.c | 2 +- mm/memcontrol.c | 2 +- mm/mincore.c | 2 +- mm/readahead.c | 2 +- mm/shmem.c | 10 +-- mm/swap.c | 2 +- mm/truncate.c | 12 +-- mm/workingset.c | 13 ++-- tools/testing/radix-tree/idr-test.c | 6 +- tools/testing/radix-tree/linux/radix-tree.h | 1 - tools/testing/radix-tree/multiorder.c | 47 ++++++----- tools/testing/radix-tree/test.c | 2 +- 26 files changed, 278 insertions(+), 232 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 2fdc865ca374..62039e557ac0 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -723,9 +723,7 @@ static inline bool pte_user(pte_t pte) BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ } while (0) -/* - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; - */ + #define SWP_TYPE_BITS 5 #define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ & ((1UL << SWP_TYPE_BITS) - 1)) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 7cd6809f4d33..05765c2d2c1f 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -313,9 +313,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define MAX_SWAPFILES_CHECK() do { \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ } while (0) -/* - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; - */ + #define SWP_TYPE_BITS 5 #define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ & ((1UL << SWP_TYPE_BITS) - 1)) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index fcc73a6ab503..316730b45f84 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -5996,7 +5996,8 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj, count = __sg_page_count(sg); while (idx + count <= n) { - unsigned long exception, i; + void *entry; + unsigned long i; int ret; /* If we cannot allocate and insert this entry, or the @@ -6011,12 +6012,9 @@ i915_gem_object_get_sg(struct drm_i915_gem_object *obj, if (ret && ret != -EEXIST) goto scan; - exception = - RADIX_TREE_EXCEPTIONAL_ENTRY | - idx << RADIX_TREE_EXCEPTIONAL_SHIFT; + entry = xa_mk_value(idx); for (i = 1; i < count; i++) { - ret = radix_tree_insert(&iter->radix, idx + i, - (void *)exception); + ret = radix_tree_insert(&iter->radix, idx + i, entry); if (ret && ret != -EEXIST) goto scan; } @@ -6054,15 +6052,14 @@ lookup: GEM_BUG_ON(!sg); /* If this index is in the middle of multi-page sg entry, - * the radixtree will contain an exceptional entry that points + * the radix tree will contain a value entry that points * to the start of that range. We will return the pointer to * the base page and the offset of this page within the * sg entry's range. */ *offset = 0; - if (unlikely(radix_tree_exception(sg))) { - unsigned long base = - (unsigned long)sg >> RADIX_TREE_EXCEPTIONAL_SHIFT; + if (unlikely(xa_is_value(sg))) { + unsigned long base = xa_to_value(sg); sg = radix_tree_lookup(&iter->radix, base); GEM_BUG_ON(!sg); diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c index 595cf90af9bb..bdee9bd09f11 100644 --- a/drivers/staging/erofs/utils.c +++ b/drivers/staging/erofs/utils.c @@ -35,7 +35,6 @@ static atomic_long_t erofs_global_shrink_cnt; #ifdef CONFIG_EROFS_FS_ZIP -/* radix_tree and the future XArray both don't use tagptr_t yet */ struct erofs_workgroup *erofs_find_workgroup( struct super_block *sb, pgoff_t index, bool *tag) { @@ -47,9 +46,8 @@ repeat: rcu_read_lock(); grp = radix_tree_lookup(&sbi->workstn_tree, index); if (grp != NULL) { - *tag = radix_tree_exceptional_entry(grp); - grp = (void *)((unsigned long)grp & - ~RADIX_TREE_EXCEPTIONAL_ENTRY); + *tag = xa_pointer_tag(grp); + grp = xa_untag_pointer(grp); if (erofs_workgroup_get(grp, &oldcount)) { /* prefer to relax rcu read side */ @@ -83,9 +81,7 @@ int erofs_register_workgroup(struct super_block *sb, sbi = EROFS_SB(sb); erofs_workstn_lock(sbi); - if (tag) - grp = (void *)((unsigned long)grp | - 1UL << RADIX_TREE_EXCEPTIONAL_SHIFT); + grp = xa_tag_pointer(grp, tag); err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp); @@ -131,9 +127,7 @@ repeat: for (i = 0; i < found; ++i) { int cnt; - struct erofs_workgroup *grp = (void *) - ((unsigned long)batch[i] & - ~RADIX_TREE_EXCEPTIONAL_ENTRY); + struct erofs_workgroup *grp = xa_untag_pointer(batch[i]); first_index = grp->index + 1; @@ -150,8 +144,8 @@ repeat: #endif continue; - if (radix_tree_delete(&sbi->workstn_tree, - grp->index) != grp) { + if (xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree, + grp->index)) != grp) { #ifdef EROFS_FS_HAS_MANAGED_CACHE skip: erofs_workgroup_unfreeze(grp, 1); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9bfa66592aa7..fd25e125303c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -440,7 +440,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, rcu_read_lock(); page = radix_tree_lookup(&mapping->i_pages, pg_index); rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { misses++; if (misses > 4) break; diff --git a/fs/dax.c b/fs/dax.c index b68ce484e1be..ebcec36335eb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -59,56 +59,57 @@ static int __init init_dax_wait_table(void) fs_initcall(init_dax_wait_table); /* - * We use lowest available bit in exceptional entry for locking, one bit for - * the entry size (PMD) and two more to tell us if the entry is a zero page or - * an empty entry that is just used for locking. In total four special bits. + * DAX pagecache entries use XArray value entries so they can't be mistaken + * for pages. We use one bit for locking, one bit for the entry size (PMD) + * and two more to tell us if the entry is a zero page or an empty entry that + * is just used for locking. In total four special bits. * * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem * block allocation. */ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) +#define DAX_SHIFT (4) +#define DAX_LOCKED (1UL << 0) +#define DAX_PMD (1UL << 1) +#define DAX_ZERO_PAGE (1UL << 2) +#define DAX_EMPTY (1UL << 3) static unsigned long dax_radix_pfn(void *entry) { - return (unsigned long)entry >> RADIX_DAX_SHIFT; + return xa_to_value(entry) >> DAX_SHIFT; } static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) { - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | - (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); + return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) | + DAX_LOCKED); } static unsigned int dax_radix_order(void *entry) { - if ((unsigned long)entry & RADIX_DAX_PMD) + if (xa_to_value(entry) & DAX_PMD) return PMD_SHIFT - PAGE_SHIFT; return 0; } static int dax_is_pmd_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_PMD; + return xa_to_value(entry) & DAX_PMD; } static int dax_is_pte_entry(void *entry) { - return !((unsigned long)entry & RADIX_DAX_PMD); + return !(xa_to_value(entry) & DAX_PMD); } static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; + return xa_to_value(entry) & DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_EMPTY; + return xa_to_value(entry) & DAX_EMPTY; } /* @@ -186,9 +187,9 @@ static void dax_wake_mapping_entry_waiter(struct address_space *mapping, */ static inline int slot_locked(struct address_space *mapping, void **slot) { - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - return entry & RADIX_DAX_ENTRY_LOCK; + unsigned long entry = xa_to_value( + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); + return entry & DAX_LOCKED; } /* @@ -196,12 +197,11 @@ static inline int slot_locked(struct address_space *mapping, void **slot) */ static inline void *lock_slot(struct address_space *mapping, void **slot) { - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; + unsigned long v = xa_to_value( + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); + void *entry = xa_mk_value(v | DAX_LOCKED); + radix_tree_replace_slot(&mapping->i_pages, slot, entry); + return entry; } /* @@ -209,17 +209,16 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) */ static inline void *unlock_slot(struct address_space *mapping, void **slot) { - unsigned long entry = (unsigned long) - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - - entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(&mapping->i_pages, slot, (void *)entry); - return (void *)entry; + unsigned long v = xa_to_value( + radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); + void *entry = xa_mk_value(v & ~DAX_LOCKED); + radix_tree_replace_slot(&mapping->i_pages, slot, entry); + return entry; } /* * Lookup entry in radix tree, wait for it to become unlocked if it is - * exceptional entry and return it. The caller must call + * a DAX entry and return it. The caller must call * put_unlocked_mapping_entry() when he decided not to lock the entry or * put_locked_mapping_entry() when he locked the entry and now wants to * unlock it. @@ -242,7 +241,7 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); if (!entry || - WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || + WARN_ON_ONCE(!xa_is_value(entry)) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; @@ -283,7 +282,7 @@ static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) xa_lock_irq(&mapping->i_pages); entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + if (WARN_ON_ONCE(!entry || !xa_is_value(entry) || !slot_locked(mapping, slot))) { xa_unlock_irq(&mapping->i_pages); return; @@ -472,12 +471,11 @@ void dax_unlock_mapping_entry(struct page *page) } /* - * Find radix tree entry at given index. If it points to an exceptional entry, - * return it with the radix tree entry locked. If the radix tree doesn't - * contain given index, create an empty exceptional entry for the index and - * return with it locked. + * Find radix tree entry at given index. If it is a DAX entry, return it + * with the radix tree entry locked. If the radix tree doesn't contain the + * given index, create an empty entry for the index and return with it locked. * - * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will + * When requesting an entry with size DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. This error will * happen if there are any 4k entries within the 2MiB range that we are * requesting. @@ -507,13 +505,13 @@ restart: xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); - if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { + if (WARN_ON_ONCE(entry && !xa_is_value(entry))) { entry = ERR_PTR(-EIO); goto out_unlock; } if (entry) { - if (size_flag & RADIX_DAX_PMD) { + if (size_flag & DAX_PMD) { if (dax_is_pte_entry(entry)) { put_unlocked_mapping_entry(mapping, index, entry); @@ -584,7 +582,7 @@ restart: true); } - entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); + entry = dax_radix_locked_entry(0, size_flag | DAX_EMPTY); err = __radix_tree_insert(&mapping->i_pages, index, dax_radix_order(entry), entry); @@ -673,8 +671,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (index >= end) break; - if (WARN_ON_ONCE( - !radix_tree_exceptional_entry(pvec_ent))) + if (WARN_ON_ONCE(!xa_is_value(pvec_ent))) continue; xa_lock_irq(&mapping->i_pages); @@ -713,7 +710,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, xa_lock_irq(pages); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || @@ -729,8 +726,8 @@ out: return ret; } /* - * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree - * entry to get unlocked before deleting it. + * Delete DAX entry at @index from @mapping. Wait for it + * to be unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { @@ -740,7 +737,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the * radix tree (usually fs-private i_mmap_sem for writing). Since the - * caller has seen exceptional entry for this index, we better find it + * caller has seen a DAX entry for this index, we better find it * at that index as well... */ WARN_ON_ONCE(!ret); @@ -748,7 +745,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) } /* - * Invalidate exceptional DAX entry if it is clean. + * Invalidate DAX entry if it is clean. */ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) @@ -802,7 +799,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, @@ -940,13 +937,13 @@ static int dax_writeback_one(struct dax_device *dax_dev, * A page got tagged dirty in DAX mapping? Something is seriously * wrong. */ - if (WARN_ON(!radix_tree_exceptional_entry(entry))) + if (WARN_ON(!xa_is_value(entry))) return -EIO; xa_lock_irq(pages); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ - if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) + if (!entry2 || WARN_ON_ONCE(!xa_is_value(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. We have to @@ -1123,8 +1120,9 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, - false); + dax_insert_mapping_entry(mapping, vmf, entry, pfn, + DAX_ZERO_PAGE, false); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1514,7 +1512,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pfn = page_to_pfn_t(zero_page); ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); + DAX_PMD | DAX_ZERO_PAGE, false); ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1597,7 +1595,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * is already in the tree, for instance), it will return -EEXIST and * we just fall back to 4k entries. */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + entry = grab_mapping_entry(mapping, pgoff, DAX_PMD); if (IS_ERR(entry)) goto fallback; @@ -1635,7 +1633,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, goto finish_iomap; entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_PMD, write && !sync); + DAX_PMD, write && !sync); /* * If we are doing synchronous page fault and inode needs fsync, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5ea1d64cb0b4..669abb617321 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -521,7 +521,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) mss->swap += PAGE_SIZE; else put_page(page); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 34149e8b5f73..e9e76ab4fbbf 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -28,34 +28,26 @@ #include #include #include +#include /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: * * 00 - data pointer - * 01 - internal entry - * 10 - exceptional entry - * 11 - this bit combination is currently unused/reserved + * 10 - internal entry + * x1 - value entry * * The internal entry may be a pointer to the next level in the tree, a * sibling entry, or an indicator that the entry in this slot has been moved * to another location in the tree and the lookup should be restarted. While * NULL fits the 'data pointer' pattern, it means that there is no entry in * the tree for this index (no matter what level of the tree it is found at). - * This means that you cannot store NULL in the tree as a value for the index. + * This means that storing a NULL entry in the tree is the same as deleting + * the entry from the tree. */ #define RADIX_TREE_ENTRY_MASK 3UL -#define RADIX_TREE_INTERNAL_NODE 1UL - -/* - * Most users of the radix tree store pointers but shmem/tmpfs stores swap - * entries in the same tree. They are marked as exceptional entries to - * distinguish them from pointers to struct page. - * EXCEPTIONAL_ENTRY tests the bit, EXCEPTIONAL_SHIFT shifts content past it. - */ -#define RADIX_TREE_EXCEPTIONAL_ENTRY 2 -#define RADIX_TREE_EXCEPTIONAL_SHIFT 2 +#define RADIX_TREE_INTERNAL_NODE 2UL static inline bool radix_tree_is_internal_node(void *ptr) { @@ -83,11 +75,10 @@ static inline bool radix_tree_is_internal_node(void *ptr) /* * @count is the count of every non-NULL element in the ->slots array - * whether that is an exceptional entry, a retry entry, a user pointer, + * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @exceptional is the count of every element in ->slots which is - * either radix_tree_exceptional_entry() or is a sibling entry for an - * exceptional entry. + * either a value entry or a sibling of a value entry. */ struct radix_tree_node { unsigned char shift; /* Bits remaining in each slot */ @@ -268,17 +259,6 @@ static inline int radix_tree_deref_retry(void *arg) return unlikely(radix_tree_is_internal_node(arg)); } -/** - * radix_tree_exceptional_entry - radix_tree_deref_slot gave exceptional entry? - * @arg: value returned by radix_tree_deref_slot - * Returns: 0 if well-aligned pointer, non-0 if exceptional entry. - */ -static inline int radix_tree_exceptional_entry(void *arg) -{ - /* Not unlikely because radix_tree_exception often tested first */ - return (unsigned long)arg & RADIX_TREE_EXCEPTIONAL_ENTRY; -} - /** * radix_tree_exception - radix_tree_deref_slot returned either exception? * @arg: value returned by radix_tree_deref_slot diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 22af9d8a84ae..4d961668e5fc 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -18,9 +18,8 @@ * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ -#define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \ - (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT)) -#define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1) +#define SWP_TYPE_SHIFT (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT) +#define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* * Store a type+offset into a swp_entry_t in an arch-independent format @@ -29,8 +28,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) { swp_entry_t ret; - ret.val = (type << SWP_TYPE_SHIFT(ret)) | - (offset & SWP_OFFSET_MASK(ret)); + ret.val = (type << SWP_TYPE_SHIFT) | (offset & SWP_OFFSET_MASK); return ret; } @@ -40,7 +38,7 @@ static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) */ static inline unsigned swp_type(swp_entry_t entry) { - return (entry.val >> SWP_TYPE_SHIFT(entry)); + return (entry.val >> SWP_TYPE_SHIFT); } /* @@ -49,7 +47,7 @@ static inline unsigned swp_type(swp_entry_t entry) */ static inline pgoff_t swp_offset(swp_entry_t entry) { - return entry.val & SWP_OFFSET_MASK(entry); + return entry.val & SWP_OFFSET_MASK; } #ifdef CONFIG_MMU @@ -90,16 +88,13 @@ static inline swp_entry_t radix_to_swp_entry(void *arg) { swp_entry_t entry; - entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT; + entry.val = xa_to_value(arg); return entry; } static inline void *swp_to_radix_entry(swp_entry_t entry) { - unsigned long value; - - value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT; - return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(entry.val); } #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 9e4c86853fa4..5c8acfc4ff55 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -5,9 +5,111 @@ * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox + * + * See Documentation/core-api/xarray.rst for how to use the XArray. */ +#include #include +#include + +/* + * The bottom two bits of the entry determine how the XArray interprets + * the contents: + * + * 00: Pointer entry + * 10: Internal entry + * x1: Value entry or tagged pointer + * + * Attempting to store internal entries in the XArray is a bug. + */ + +#define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) + +/** + * xa_mk_value() - Create an XArray entry from an integer. + * @v: Value to store in XArray. + * + * Context: Any context. + * Return: An entry suitable for storing in the XArray. + */ +static inline void *xa_mk_value(unsigned long v) +{ + WARN_ON((long)v < 0); + return (void *)((v << 1) | 1); +} + +/** + * xa_to_value() - Get value stored in an XArray entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value stored in the XArray entry. + */ +static inline unsigned long xa_to_value(const void *entry) +{ + return (unsigned long)entry >> 1; +} + +/** + * xa_is_value() - Determine if an entry is a value. + * @entry: XArray entry. + * + * Context: Any context. + * Return: True if the entry is a value, false if it is a pointer. + */ +static inline bool xa_is_value(const void *entry) +{ + return (unsigned long)entry & 1; +} + +/** + * xa_tag_pointer() - Create an XArray entry for a tagged pointer. + * @p: Plain pointer. + * @tag: Tag value (0, 1 or 3). + * + * If the user of the XArray prefers, they can tag their pointers instead + * of storing value entries. Three tags are available (0, 1 and 3). + * These are distinct from the xa_mark_t as they are not replicated up + * through the array and cannot be searched for. + * + * Context: Any context. + * Return: An XArray entry. + */ +static inline void *xa_tag_pointer(void *p, unsigned long tag) +{ + return (void *)((unsigned long)p | tag); +} + +/** + * xa_untag_pointer() - Turn an XArray entry into a plain pointer. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the untagged version of the pointer. + * + * Context: Any context. + * Return: A pointer. + */ +static inline void *xa_untag_pointer(void *entry) +{ + return (void *)((unsigned long)entry & ~3UL); +} + +/** + * xa_pointer_tag() - Get the tag stored in an XArray entry. + * @entry: XArray entry. + * + * If you have stored a tagged pointer in the XArray, call this function + * to get the tag of that pointer. + * + * Context: Any context. + * Return: A tag. + */ +static inline unsigned int xa_pointer_tag(void *entry) +{ + return (unsigned long)entry & 3UL; +} #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) diff --git a/lib/idr.c b/lib/idr.c index 729e381e23b4..88419fbc5737 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -338,11 +338,8 @@ EXPORT_SYMBOL(idr_replace); * by the number of bits in the leaf bitmap before doing a radix tree lookup. * * As an optimisation, if there are only a few low bits set in any given - * leaf, instead of allocating a 128-byte bitmap, we use the 'exceptional - * entry' functionality of the radix tree to store BITS_PER_LONG - 2 bits - * directly in the entry. By being really tricksy, we could store - * BITS_PER_LONG - 1 bits, but there're diminishing returns after optimising - * for 0-3 allocated IDs. + * leaf, instead of allocating a 128-byte bitmap, we store the bits + * directly in the entry. * * We allow the radix tree 'exceptional' count to get out of date. Nothing * in the IDA nor the radix tree code checks it. If it becomes important @@ -366,12 +363,11 @@ static int ida_get_new_above(struct ida *ida, int start) struct radix_tree_iter iter; struct ida_bitmap *bitmap; unsigned long index; - unsigned bit, ebit; + unsigned bit; int new; index = start / IDA_BITMAP_BITS; bit = start % IDA_BITMAP_BITS; - ebit = bit + RADIX_TREE_EXCEPTIONAL_SHIFT; slot = radix_tree_iter_init(&iter, index); for (;;) { @@ -386,25 +382,23 @@ static int ida_get_new_above(struct ida *ida, int start) return PTR_ERR(slot); } } - if (iter.index > index) { + if (iter.index > index) bit = 0; - ebit = RADIX_TREE_EXCEPTIONAL_SHIFT; - } new = iter.index * IDA_BITMAP_BITS; bitmap = rcu_dereference_raw(*slot); - if (radix_tree_exception(bitmap)) { - unsigned long tmp = (unsigned long)bitmap; - ebit = find_next_zero_bit(&tmp, BITS_PER_LONG, ebit); - if (ebit < BITS_PER_LONG) { - tmp |= 1UL << ebit; - rcu_assign_pointer(*slot, (void *)tmp); - return new + ebit - - RADIX_TREE_EXCEPTIONAL_SHIFT; + if (xa_is_value(bitmap)) { + unsigned long tmp = xa_to_value(bitmap); + int vbit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, + bit); + if (vbit < BITS_PER_XA_VALUE) { + tmp |= 1UL << vbit; + rcu_assign_pointer(*slot, xa_mk_value(tmp)); + return new + vbit; } bitmap = this_cpu_xchg(ida_bitmap, NULL); if (!bitmap) return -EAGAIN; - bitmap->bitmap[0] = tmp >> RADIX_TREE_EXCEPTIONAL_SHIFT; + bitmap->bitmap[0] = tmp; rcu_assign_pointer(*slot, bitmap); } @@ -425,17 +419,14 @@ static int ida_get_new_above(struct ida *ida, int start) new += bit; if (new < 0) return -ENOSPC; - if (ebit < BITS_PER_LONG) { - bitmap = (void *)((1UL << ebit) | - RADIX_TREE_EXCEPTIONAL_ENTRY); - radix_tree_iter_replace(root, &iter, slot, - bitmap); - return new; + if (bit < BITS_PER_XA_VALUE) { + bitmap = xa_mk_value(1UL << bit); + } else { + bitmap = this_cpu_xchg(ida_bitmap, NULL); + if (!bitmap) + return -EAGAIN; + __set_bit(bit, bitmap->bitmap); } - bitmap = this_cpu_xchg(ida_bitmap, NULL); - if (!bitmap) - return -EAGAIN; - __set_bit(bit, bitmap->bitmap); radix_tree_iter_replace(root, &iter, slot, bitmap); } @@ -457,9 +448,9 @@ static void ida_remove(struct ida *ida, int id) goto err; bitmap = rcu_dereference_raw(*slot); - if (radix_tree_exception(bitmap)) { + if (xa_is_value(bitmap)) { btmp = (unsigned long *)slot; - offset += RADIX_TREE_EXCEPTIONAL_SHIFT; + offset += 1; /* Intimate knowledge of the value encoding */ if (offset >= BITS_PER_LONG) goto err; } else { @@ -470,9 +461,8 @@ static void ida_remove(struct ida *ida, int id) __clear_bit(offset, btmp); radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE); - if (radix_tree_exception(bitmap)) { - if (rcu_dereference_raw(*slot) == - (void *)RADIX_TREE_EXCEPTIONAL_ENTRY) + if (xa_is_value(bitmap)) { + if (xa_to_value(rcu_dereference_raw(*slot)) == 0) radix_tree_iter_delete(&ida->ida_rt, &iter, slot); } else if (bitmap_empty(btmp, IDA_BITMAP_BITS)) { kfree(bitmap); @@ -503,7 +493,7 @@ void ida_destroy(struct ida *ida) xa_lock_irqsave(&ida->ida_rt, flags); radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) { struct ida_bitmap *bitmap = rcu_dereference_raw(*slot); - if (!radix_tree_exception(bitmap)) + if (!xa_is_value(bitmap)) kfree(bitmap); radix_tree_iter_delete(&ida->ida_rt, &iter, slot); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index a904a8ddd174..b6c0e7f3a894 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -340,14 +340,12 @@ static void dump_ida_node(void *entry, unsigned long index) for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) dump_ida_node(node->slots[i], index | (i << node->shift)); - } else if (radix_tree_exceptional_entry(entry)) { + } else if (xa_is_value(entry)) { pr_debug("ida excp: %p offset %d indices %lu-%lu data %lx\n", entry, (int)(index & RADIX_TREE_MAP_MASK), index * IDA_BITMAP_BITS, - index * IDA_BITMAP_BITS + BITS_PER_LONG - - RADIX_TREE_EXCEPTIONAL_SHIFT, - (unsigned long)entry >> - RADIX_TREE_EXCEPTIONAL_SHIFT); + index * IDA_BITMAP_BITS + BITS_PER_XA_VALUE, + xa_to_value(entry)); } else { struct ida_bitmap *bitmap = entry; @@ -656,7 +654,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, BUG_ON(shift > BITS_PER_LONG); if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; - } else if (radix_tree_exceptional_entry(entry)) { + } else if (xa_is_value(entry)) { /* Moving an exceptional root->rnode to a node */ node->exceptional = 1; } @@ -955,12 +953,12 @@ static inline int insert_entries(struct radix_tree_node *node, !is_sibling_entry(node, old) && (old != RADIX_TREE_RETRY)) radix_tree_free_nodes(old); - if (radix_tree_exceptional_entry(old)) + if (xa_is_value(old)) node->exceptional--; } if (node) { node->count += n; - if (radix_tree_exceptional_entry(item)) + if (xa_is_value(item)) node->exceptional += n; } return n; @@ -974,7 +972,7 @@ static inline int insert_entries(struct radix_tree_node *node, rcu_assign_pointer(*slot, item); if (node) { node->count++; - if (radix_tree_exceptional_entry(item)) + if (xa_is_value(item)) node->exceptional++; } return 1; @@ -1190,8 +1188,7 @@ void __radix_tree_replace(struct radix_tree_root *root, radix_tree_update_node_t update_node) { void *old = rcu_dereference_raw(*slot); - int exceptional = !!radix_tree_exceptional_entry(item) - - !!radix_tree_exceptional_entry(old); + int exceptional = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* @@ -1992,7 +1989,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); - int exceptional = radix_tree_exceptional_entry(old) ? -1 : 0; + int exceptional = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; diff --git a/mm/filemap.c b/mm/filemap.c index 52517f28e6f4..4de14e75c4ec 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -127,7 +127,7 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (!radix_tree_exceptional_entry(p)) + if (!xa_is_value(p)) return -EEXIST; mapping->nrexceptional--; @@ -336,7 +336,7 @@ page_cache_tree_delete_batch(struct address_space *mapping, break; page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!tail_pages) { /* @@ -1355,7 +1355,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, struct page *page; page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + if (!page || xa_is_value(page)) break; index++; if (index == 0) @@ -1396,7 +1396,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, struct page *page; page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || radix_tree_exceptional_entry(page)) + if (!page || xa_is_value(page)) break; index--; if (index == ULONG_MAX) @@ -1539,7 +1539,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, repeat: page = find_get_entry(mapping, offset); - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) page = NULL; if (!page) goto no_page; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a31d740e6cd1..4820e4adf853 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1369,7 +1369,7 @@ static void collapse_shmem(struct mm_struct *mm, page = radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock); - if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { + if (xa_is_value(page) || !PageUptodate(page)) { xa_unlock_irq(&mapping->i_pages); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, diff --git a/mm/madvise.c b/mm/madvise.c index 972a9eaa898b..9d802566c494 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -251,7 +251,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; page = find_get_entry(mapping, index); - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { if (page) put_page(page); continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e79cb59552d9..29d9d1a69b36 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4750,7 +4750,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, /* shmem/tmpfs may report page out on swap: account for that too. */ if (shmem_mapping(mapping)) { page = find_get_entry(mapping, pgoff); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); if (do_memsw_account()) *entry = swp; diff --git a/mm/mincore.c b/mm/mincore.c index fc37afe226e6..4985965aa20a 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -66,7 +66,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * shmem/tmpfs may return swap: account for swapcache * page too. */ - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swp_entry_t swp = radix_to_swp_entry(page); page = find_get_page(swap_address_space(swp), swp_offset(swp)); diff --git a/mm/readahead.c b/mm/readahead.c index 4e630143a0ba..de657077d41d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -179,7 +179,7 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, rcu_read_lock(); page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch of * contiguous pages before continuing with the next diff --git a/mm/shmem.c b/mm/shmem.c index 446942677cd4..c1062760fe41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -709,7 +709,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, continue; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) swapped++; if (need_resched()) { @@ -824,7 +824,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, @@ -921,7 +921,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (unfalloc) continue; if (shmem_free_swap(mapping, index, page)) { @@ -1643,7 +1643,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, repeat: swap.val = 0; page = find_lock_entry(mapping, index); - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { swap = radix_to_swp_entry(page); page = NULL; } @@ -2578,7 +2578,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, index = indices[i]; } page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { + if (page && !xa_is_value(page)) { if (!PageUptodate(page)) page = NULL; } diff --git a/mm/swap.c b/mm/swap.c index 26fc9b5f1b6c..4c5c7fcc6e46 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -965,7 +965,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) for (i = 0, j = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) + if (!xa_is_value(page)) pvec->pages[j++] = page; } pvec->nr = j; diff --git a/mm/truncate.c b/mm/truncate.c index 1d2fb2dca96f..ed778555c9f3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -70,7 +70,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, return; for (j = 0; j < pagevec_count(pvec); j++) - if (radix_tree_exceptional_entry(pvec->pages[j])) + if (xa_is_value(pvec->pages[j])) break; if (j == pagevec_count(pvec)) @@ -85,7 +85,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, struct page *page = pvec->pages[i]; pgoff_t index = indices[i]; - if (!radix_tree_exceptional_entry(page)) { + if (!xa_is_value(page)) { pvec->pages[j++] = page; continue; } @@ -347,7 +347,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; if (!trylock_page(page)) @@ -442,7 +442,7 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) + if (xa_is_value(page)) continue; lock_page(page); @@ -561,7 +561,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { invalidate_exceptional_entry(mapping, index, page); continue; @@ -692,7 +692,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (index > end) break; - if (radix_tree_exceptional_entry(page)) { + if (xa_is_value(page)) { if (!invalidate_exceptional_entry2(mapping, index, page)) ret = -EBUSY; diff --git a/mm/workingset.c b/mm/workingset.c index 4516dd790129..bb109b3afac2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -155,8 +155,8 @@ * refault distance will immediately activate the refaulting page. */ -#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ - NODES_SHIFT + \ +#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \ + NODES_SHIFT + \ MEM_CGROUP_ID_SHIFT) #define EVICTION_MASK (~0UL >> EVICTION_SHIFT) @@ -173,20 +173,19 @@ static unsigned int bucket_order __read_mostly; static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction) { eviction >>= bucket_order; + eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; - eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); - return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); + return xa_mk_value(eviction); } static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, unsigned long *evictionp) { - unsigned long entry = (unsigned long)shadow; + unsigned long entry = xa_to_value(shadow); int memcgid, nid; - entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); @@ -453,7 +452,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { - if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) + if (WARN_ON_ONCE(!xa_is_value(node->slots[i]))) goto out_invalid; if (WARN_ON_ONCE(!node->exceptional)) goto out_invalid; diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index f620c831a4b5..a5a4494922a6 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c @@ -19,7 +19,7 @@ #include "test.h" -#define DUMMY_PTR ((void *)0x12) +#define DUMMY_PTR ((void *)0x10) int item_idr_free(int id, void *p, void *data) { @@ -411,11 +411,11 @@ void ida_check_conv_user(void) int id = ida_alloc(&ida, GFP_NOWAIT); if (id == -ENOMEM) { IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) != - BITS_PER_LONG - 2); + BITS_PER_XA_VALUE); id = ida_alloc(&ida, GFP_KERNEL); } else { IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) == - BITS_PER_LONG - 2); + BITS_PER_XA_VALUE); } IDA_BUG_ON(&ida, id != i); } diff --git a/tools/testing/radix-tree/linux/radix-tree.h b/tools/testing/radix-tree/linux/radix-tree.h index 24f13d27a8da..d1635a5bef02 100644 --- a/tools/testing/radix-tree/linux/radix-tree.h +++ b/tools/testing/radix-tree/linux/radix-tree.h @@ -2,7 +2,6 @@ #ifndef _TEST_RADIX_TREE_H #define _TEST_RADIX_TREE_H -#include "generated/map-shift.h" #include "../../../../include/linux/radix-tree.h" extern int kmalloc_verbose; diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 7bf405638b0b..2b4f4dba1882 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -39,12 +39,11 @@ static void __multiorder_tag_test(int index, int order) /* * Verify we get collisions for covered indices. We try and fail to - * insert an exceptional entry so we don't leak memory via + * insert a value entry so we don't leak memory via * item_insert_order(). */ for_each_index(i, base, order) { - err = __radix_tree_insert(&tree, i, order, - (void *)(0xA0 | RADIX_TREE_EXCEPTIONAL_ENTRY)); + err = __radix_tree_insert(&tree, i, order, xa_mk_value(0xA0)); assert(err == -EEXIST); } @@ -380,8 +379,8 @@ static void multiorder_join1(unsigned long index, } /* - * Check that the accounting of exceptional entries is handled correctly - * by joining an exceptional entry to a normal pointer. + * Check that the accounting of value entries is handled correctly + * by joining a value entry to a normal pointer. */ static void multiorder_join2(unsigned order1, unsigned order2) { @@ -391,9 +390,9 @@ static void multiorder_join2(unsigned order1, unsigned order2) void *item2; item_insert_order(&tree, 0, order2); - radix_tree_insert(&tree, 1 << order2, (void *)0x12UL); + radix_tree_insert(&tree, 1 << order2, xa_mk_value(5)); item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); - assert(item2 == (void *)0x12UL); + assert(item2 == xa_mk_value(5)); assert(node->exceptional == 1); item2 = radix_tree_lookup(&tree, 0); @@ -407,7 +406,7 @@ static void multiorder_join2(unsigned order1, unsigned order2) } /* - * This test revealed an accounting bug for exceptional entries at one point. + * This test revealed an accounting bug for value entries at one point. * Nodes were being freed back into the pool with an elevated exception count * by radix_tree_join() and then radix_tree_split() was failing to zero the * count of exceptional entries. @@ -421,16 +420,16 @@ static void multiorder_join3(unsigned int order) unsigned long i; for (i = 0; i < (1 << order); i++) { - radix_tree_insert(&tree, i, (void *)0x12UL); + radix_tree_insert(&tree, i, xa_mk_value(5)); } - radix_tree_join(&tree, 0, order, (void *)0x16UL); + radix_tree_join(&tree, 0, order, xa_mk_value(7)); rcu_barrier(); radix_tree_split(&tree, 0, 0); radix_tree_for_each_slot(slot, &tree, &iter, 0) { - radix_tree_iter_replace(&tree, &iter, slot, (void *)0x12UL); + radix_tree_iter_replace(&tree, &iter, slot, xa_mk_value(5)); } __radix_tree_lookup(&tree, 0, &node, NULL); @@ -517,10 +516,10 @@ static void __multiorder_split2(int old_order, int new_order) struct radix_tree_node *node; void *item; - __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + __radix_tree_insert(&tree, 0, old_order, xa_mk_value(5)); item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == (void *)0x12); + assert(item == xa_mk_value(5)); assert(node->exceptional > 0); radix_tree_split(&tree, 0, new_order); @@ -530,7 +529,7 @@ static void __multiorder_split2(int old_order, int new_order) } item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item != (void *)0x12); + assert(item != xa_mk_value(5)); assert(node->exceptional == 0); item_kill_tree(&tree); @@ -544,40 +543,40 @@ static void __multiorder_split3(int old_order, int new_order) struct radix_tree_node *node; void *item; - __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + __radix_tree_insert(&tree, 0, old_order, xa_mk_value(5)); item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == (void *)0x12); + assert(item == xa_mk_value(5)); assert(node->exceptional > 0); radix_tree_split(&tree, 0, new_order); radix_tree_for_each_slot(slot, &tree, &iter, 0) { - radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16); + radix_tree_iter_replace(&tree, &iter, slot, xa_mk_value(7)); } item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == (void *)0x16); + assert(item == xa_mk_value(7)); assert(node->exceptional > 0); item_kill_tree(&tree); - __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + __radix_tree_insert(&tree, 0, old_order, xa_mk_value(5)); item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == (void *)0x12); + assert(item == xa_mk_value(5)); assert(node->exceptional > 0); radix_tree_split(&tree, 0, new_order); radix_tree_for_each_slot(slot, &tree, &iter, 0) { if (iter.index == (1 << new_order)) radix_tree_iter_replace(&tree, &iter, slot, - (void *)0x16); + xa_mk_value(7)); else radix_tree_iter_replace(&tree, &iter, slot, NULL); } item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL); - assert(item == (void *)0x16); + assert(item == xa_mk_value(7)); assert(node->count == node->exceptional); do { node = node->parent; @@ -610,13 +609,13 @@ static void multiorder_account(void) item_insert_order(&tree, 0, 5); - __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); + __radix_tree_insert(&tree, 1 << 5, 5, xa_mk_value(5)); __radix_tree_lookup(&tree, 0, &node, NULL); assert(node->count == node->exceptional * 2); radix_tree_delete(&tree, 1 << 5); assert(node->exceptional == 0); - __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); + __radix_tree_insert(&tree, 1 << 5, 5, xa_mk_value(5)); __radix_tree_lookup(&tree, 1 << 5, &node, &slot); assert(node->count == node->exceptional * 2); __radix_tree_replace(&tree, node, slot, NULL, NULL); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index def6015570b2..62de66c314b7 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -295,7 +295,7 @@ void item_kill_tree(struct radix_tree_root *root) int nfound; radix_tree_for_each_slot(slot, root, &iter, 0) { - if (radix_tree_exceptional_entry(*slot)) + if (xa_is_value(*slot)) radix_tree_delete(root, iter.index); } -- cgit v1.2.3 From 02c02bf12c5d838603eed44195d3e91f094e2ab2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 3 Nov 2017 23:09:45 -0400 Subject: xarray: Change definition of sibling entries Instead of storing a pointer to the slot containing the canonical entry, store the offset of the slot. Produces slightly more efficient code (~300 bytes) and simplifies the implementation. Signed-off-by: Matthew Wilcox Reviewed-by: Josef Bacik --- include/linux/radix-tree.h | 5 +- include/linux/xarray.h | 92 +++++++++++++++++++++++++++ lib/Kconfig | 7 ++ lib/radix-tree.c | 64 ++++++------------- tools/testing/radix-tree/Makefile | 2 +- tools/testing/radix-tree/generated/autoconf.h | 1 + 6 files changed, 121 insertions(+), 50 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index e9e76ab4fbbf..60f3d8eb2cb7 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -59,10 +59,7 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_TAGS 3 -#ifndef RADIX_TREE_MAP_SHIFT -#define RADIX_TREE_MAP_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) -#endif - +#define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 5c8acfc4ff55..4d1cd7a083e8 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -22,6 +22,12 @@ * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. + * + * Most internal entries are pointers to the next node in the tree. + * The following internal entries have a special meaning: + * + * 0-62: Sibling entries + * 256: Retry entry */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) @@ -111,6 +117,42 @@ static inline unsigned int xa_pointer_tag(void *entry) return (unsigned long)entry & 3UL; } +/* + * xa_mk_internal() - Create an internal entry. + * @v: Value to turn into an internal entry. + * + * Context: Any context. + * Return: An XArray internal entry corresponding to this value. + */ +static inline void *xa_mk_internal(unsigned long v) +{ + return (void *)((v << 2) | 2); +} + +/* + * xa_to_internal() - Extract the value from an internal entry. + * @entry: XArray entry. + * + * Context: Any context. + * Return: The value which was stored in the internal entry. + */ +static inline unsigned long xa_to_internal(const void *entry) +{ + return (unsigned long)entry >> 2; +} + +/* + * xa_is_internal() - Is the entry an internal entry? + * @entry: XArray entry. + * + * Context: Any context. + * Return: %true if the entry is an internal entry. + */ +static inline bool xa_is_internal(const void *entry) +{ + return ((unsigned long)entry & 3) == 2; +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -123,4 +165,54 @@ static inline unsigned int xa_pointer_tag(void *entry) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) +/* Everything below here is the Advanced API. Proceed with caution. */ + +/* + * The xarray is constructed out of a set of 'chunks' of pointers. Choosing + * the best chunk size requires some tradeoffs. A power of two recommends + * itself so that we can walk the tree based purely on shifts and masks. + * Generally, the larger the better; as the number of slots per level of the + * tree increases, the less tall the tree needs to be. But that needs to be + * balanced against the memory consumption of each node. On a 64-bit system, + * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we + * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. + */ +#ifndef XA_CHUNK_SHIFT +#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) +#endif +#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) +#define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) + +/* Private */ +static inline bool xa_is_node(const void *entry) +{ + return xa_is_internal(entry) && (unsigned long)entry > 4096; +} + +/* Private */ +static inline void *xa_mk_sibling(unsigned int offset) +{ + return xa_mk_internal(offset); +} + +/* Private */ +static inline unsigned long xa_to_sibling(const void *entry) +{ + return xa_to_internal(entry); +} + +/** + * xa_is_sibling() - Is the entry a sibling entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a sibling entry. + */ +static inline bool xa_is_sibling(const void *entry) +{ + return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && + (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); +} + +#define XA_RETRY_ENTRY xa_mk_internal(256) + #endif /* _LINUX_XARRAY_H */ diff --git a/lib/Kconfig b/lib/Kconfig index a3928d4438b5..40bfa6ccd294 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -399,8 +399,15 @@ config INTERVAL_TREE for more information. +config XARRAY_MULTI + bool + help + Support entries which occupy multiple consecutive indices in the + XArray. + config RADIX_TREE_MULTIORDER bool + select XARRAY_MULTI config ASSOCIATIVE_ARRAY bool diff --git a/lib/radix-tree.c b/lib/radix-tree.c index b6c0e7f3a894..59b28111eabc 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -38,6 +38,7 @@ #include #include #include +#include /* Number of nodes in fully populated tree of given height */ @@ -98,24 +99,7 @@ static inline void *node_to_entry(void *ptr) return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); } -#define RADIX_TREE_RETRY node_to_entry(NULL) - -#ifdef CONFIG_RADIX_TREE_MULTIORDER -/* Sibling slots point directly to another slot in the same node */ -static inline -bool is_sibling_entry(const struct radix_tree_node *parent, void *node) -{ - void __rcu **ptr = node; - return (parent->slots <= ptr) && - (ptr < parent->slots + RADIX_TREE_MAP_SIZE); -} -#else -static inline -bool is_sibling_entry(const struct radix_tree_node *parent, void *node) -{ - return false; -} -#endif +#define RADIX_TREE_RETRY XA_RETRY_ENTRY static inline unsigned long get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot) @@ -129,16 +113,10 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent, unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); -#ifdef CONFIG_RADIX_TREE_MULTIORDER - if (radix_tree_is_internal_node(entry)) { - if (is_sibling_entry(parent, entry)) { - void __rcu **sibentry; - sibentry = (void __rcu **) entry_to_node(entry); - offset = get_slot_offset(parent, sibentry); - entry = rcu_dereference_raw(*sibentry); - } + if (xa_is_sibling(entry)) { + offset = xa_to_sibling(entry); + entry = rcu_dereference_raw(parent->slots[offset]); } -#endif *nodep = (void *)entry; return offset; @@ -300,10 +278,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) } else if (!radix_tree_is_internal_node(entry)) { pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", entry, i, first, last, node); - } else if (is_sibling_entry(node, entry)) { + } else if (xa_is_sibling(entry)) { pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", entry, i, first, last, node, - *(void **)entry_to_node(entry)); + node->slots[xa_to_sibling(entry)]); } else { dump_node(entry_to_node(entry), first); } @@ -881,8 +859,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node) for (;;) { void *entry = rcu_dereference_raw(child->slots[offset]); - if (radix_tree_is_internal_node(entry) && child->shift && - !is_sibling_entry(child, entry)) { + if (xa_is_node(entry) && child->shift) { child = entry_to_node(entry); offset = 0; continue; @@ -904,7 +881,7 @@ static void radix_tree_free_nodes(struct radix_tree_node *node) static inline int insert_entries(struct radix_tree_node *node, void __rcu **slot, void *item, unsigned order, bool replace) { - struct radix_tree_node *child; + void *sibling; unsigned i, n, tag, offset, tags = 0; if (node) { @@ -922,7 +899,7 @@ static inline int insert_entries(struct radix_tree_node *node, offset = offset & ~(n - 1); slot = &node->slots[offset]; } - child = node_to_entry(slot); + sibling = xa_mk_sibling(offset); for (i = 0; i < n; i++) { if (slot[i]) { @@ -939,7 +916,7 @@ static inline int insert_entries(struct radix_tree_node *node, for (i = 0; i < n; i++) { struct radix_tree_node *old = rcu_dereference_raw(slot[i]); if (i) { - rcu_assign_pointer(slot[i], child); + rcu_assign_pointer(slot[i], sibling); for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) if (tags & (1 << tag)) tag_clear(node, tag, offset + i); @@ -949,9 +926,7 @@ static inline int insert_entries(struct radix_tree_node *node, if (tags & (1 << tag)) tag_set(node, tag, offset); } - if (radix_tree_is_internal_node(old) && - !is_sibling_entry(node, old) && - (old != RADIX_TREE_RETRY)) + if (xa_is_node(old)) radix_tree_free_nodes(old); if (xa_is_value(old)) node->exceptional--; @@ -1112,10 +1087,10 @@ static inline void replace_sibling_entries(struct radix_tree_node *node, void __rcu **slot, int count, int exceptional) { #ifdef CONFIG_RADIX_TREE_MULTIORDER - void *ptr = node_to_entry(slot); - unsigned offset = get_slot_offset(node, slot) + 1; + unsigned offset = get_slot_offset(node, slot); + void *ptr = xa_mk_sibling(offset); - while (offset < RADIX_TREE_MAP_SIZE) { + while (++offset < RADIX_TREE_MAP_SIZE) { if (rcu_dereference_raw(node->slots[offset]) != ptr) break; if (count < 0) { @@ -1123,7 +1098,6 @@ static inline void replace_sibling_entries(struct radix_tree_node *node, node->count--; } node->exceptional += exceptional; - offset++; } #endif } @@ -1319,8 +1293,7 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index, tags |= 1 << tag; for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { - if (!is_sibling_entry(parent, - rcu_dereference_raw(parent->slots[end]))) + if (!xa_is_sibling(rcu_dereference_raw(parent->slots[end]))) break; for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) if (tags & (1 << tag)) @@ -1618,7 +1591,7 @@ static void __rcu **skip_siblings(struct radix_tree_node **nodep, { while (iter->index < iter->next_index) { *nodep = rcu_dereference_raw(*slot); - if (*nodep && !is_sibling_entry(iter->node, *nodep)) + if (*nodep && !xa_is_sibling(*nodep)) return slot; slot++; iter->index = __radix_tree_iter_add(iter, 1); @@ -1769,7 +1742,7 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); - if (is_sibling_entry(node, slot)) + if (xa_is_sibling(slot)) continue; if (slot) break; @@ -2283,6 +2256,7 @@ void __init radix_tree_init(void) BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK); + BUILD_BUG_ON(XA_CHUNK_SIZE > 255); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 37baecc3766f..12adcf9ffe86 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -46,6 +46,6 @@ idr.c: ../../../lib/idr.c generated/map-shift.h: @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ - echo "#define RADIX_TREE_MAP_SHIFT $(SHIFT)" > \ + echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ generated/map-shift.h; \ fi diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h index cf88dc5b8832..ca8e03ad19ac 100644 --- a/tools/testing/radix-tree/generated/autoconf.h +++ b/tools/testing/radix-tree/generated/autoconf.h @@ -1 +1,2 @@ #define CONFIG_RADIX_TREE_MULTIORDER 1 +#define CONFIG_XARRAY_MULTI 1 -- cgit v1.2.3 From f8d5d0cc145cc21bfc56ef807dc28102aebbf228 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 7 Nov 2017 16:30:10 -0500 Subject: xarray: Add definition of struct xarray This is a direct replacement for struct radix_tree_root. Some of the struct members have changed name; convert those, and use a #define so that radix_tree users continue to work without change. Signed-off-by: Matthew Wilcox Reviewed-by: Josef Bacik --- include/linux/radix-tree.h | 28 ++++-------- include/linux/xarray.h | 70 +++++++++++++++++++++++++++++ lib/Makefile | 2 +- lib/idr.c | 4 +- lib/radix-tree.c | 75 ++++++++++++++++---------------- lib/xarray.c | 44 +++++++++++++++++++ tools/testing/radix-tree/Makefile | 5 ++- tools/testing/radix-tree/linux/bug.h | 1 + tools/testing/radix-tree/linux/kconfig.h | 1 + tools/testing/radix-tree/multiorder.c | 6 +-- tools/testing/radix-tree/test.c | 6 +-- tools/testing/radix-tree/xarray.c | 7 +++ 12 files changed, 181 insertions(+), 68 deletions(-) create mode 100644 lib/xarray.c create mode 100644 tools/testing/radix-tree/linux/kconfig.h create mode 100644 tools/testing/radix-tree/xarray.c diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 60f3d8eb2cb7..0b080bd88ab7 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -30,6 +30,9 @@ #include #include +/* Keep unconverted code working */ +#define radix_tree_root xarray + /* * The bottom two bits of the slot determine how the remaining bits in the * slot are interpreted: @@ -92,36 +95,21 @@ struct radix_tree_node { unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; -/* The IDR tag is stored in the low bits of the GFP flags */ +/* The IDR tag is stored in the low bits of xa_flags */ #define ROOT_IS_IDR ((__force gfp_t)4) -/* The top bits of gfp_mask are used to store the root tags */ +/* The top bits of xa_flags are used to store the root tags */ #define ROOT_TAG_SHIFT (__GFP_BITS_SHIFT) -struct radix_tree_root { - spinlock_t xa_lock; - gfp_t gfp_mask; - struct radix_tree_node __rcu *rnode; -}; - -#define RADIX_TREE_INIT(name, mask) { \ - .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ - .gfp_mask = (mask), \ - .rnode = NULL, \ -} +#define RADIX_TREE_INIT(name, mask) XARRAY_INIT(name, mask) #define RADIX_TREE(name, mask) \ struct radix_tree_root name = RADIX_TREE_INIT(name, mask) -#define INIT_RADIX_TREE(root, mask) \ -do { \ - spin_lock_init(&(root)->xa_lock); \ - (root)->gfp_mask = (mask); \ - (root)->rnode = NULL; \ -} while (0) +#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask) static inline bool radix_tree_empty(const struct radix_tree_root *root) { - return root->rnode == NULL; + return root->xa_head == NULL; } /** diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 4d1cd7a083e8..9122cf8bf52a 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -10,6 +10,8 @@ */ #include +#include +#include #include #include @@ -153,6 +155,74 @@ static inline bool xa_is_internal(const void *entry) return ((unsigned long)entry & 3) == 2; } +/** + * struct xarray - The anchor of the XArray. + * @xa_lock: Lock that protects the contents of the XArray. + * + * To use the xarray, define it statically or embed it in your data structure. + * It is a very small data structure, so it does not usually make sense to + * allocate it separately and keep a pointer to it in your data structure. + * + * You may use the xa_lock to protect your own data structures as well. + */ +/* + * If all of the entries in the array are NULL, @xa_head is a NULL pointer. + * If the only non-NULL entry in the array is at index 0, @xa_head is that + * entry. If any other entry in the array is non-NULL, @xa_head points + * to an @xa_node. + */ +struct xarray { + spinlock_t xa_lock; +/* private: The rest of the data structure is not to be used directly. */ + gfp_t xa_flags; + void __rcu * xa_head; +}; + +#define XARRAY_INIT(name, flags) { \ + .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ + .xa_flags = flags, \ + .xa_head = NULL, \ +} + +/** + * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. + * @name: A string that names your XArray. + * @flags: XA_FLAG values. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name and flags. It is + * equivalent to calling xa_init_flags() on the array, but it does the + * initialisation at compiletime instead of runtime. + */ +#define DEFINE_XARRAY_FLAGS(name, flags) \ + struct xarray name = XARRAY_INIT(name, flags) + +/** + * DEFINE_XARRAY() - Define an XArray. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of XArrays. It declares + * and initialises an empty XArray with the chosen name. It is equivalent + * to calling xa_init() on the array, but it does the initialisation at + * compiletime instead of runtime. + */ +#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) + +void xa_init_flags(struct xarray *, gfp_t flags); + +/** + * xa_init() - Initialise an empty XArray. + * @xa: XArray. + * + * An empty XArray is full of NULL entries. + * + * Context: Any context. + */ +static inline void xa_init(struct xarray *xa) +{ + xa_init_flags(xa, 0); +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) diff --git a/lib/Makefile b/lib/Makefile index ca3f7ebb900d..057385f1f145 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -18,7 +18,7 @@ KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o timerqueue.o\ + rbtree.o radix-tree.o timerqueue.o xarray.o \ idr.o int_sqrt.o extable.o \ sha1.o chacha20.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ diff --git a/lib/idr.c b/lib/idr.c index 88419fbc5737..9a366b5be2c2 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -39,8 +39,8 @@ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned int base = idr->idr_base; unsigned int id = *nextid; - if (WARN_ON_ONCE(!(idr->idr_rt.gfp_mask & ROOT_IS_IDR))) - idr->idr_rt.gfp_mask |= IDR_RT_MARKER; + if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) + idr->idr_rt.xa_flags |= IDR_RT_MARKER; id = (id < base) ? 0 : id - base; radix_tree_iter_init(&iter, id); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 59b28111eabc..299d4bdba109 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -124,7 +124,7 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent, static inline gfp_t root_gfp_mask(const struct radix_tree_root *root) { - return root->gfp_mask & (__GFP_BITS_MASK & ~GFP_ZONEMASK); + return root->xa_flags & (__GFP_BITS_MASK & ~GFP_ZONEMASK); } static inline void tag_set(struct radix_tree_node *node, unsigned int tag, @@ -147,32 +147,32 @@ static inline int tag_get(const struct radix_tree_node *node, unsigned int tag, static inline void root_tag_set(struct radix_tree_root *root, unsigned tag) { - root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); + root->xa_flags |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag) { - root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); + root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT)); } static inline void root_tag_clear_all(struct radix_tree_root *root) { - root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1; + root->xa_flags &= (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1); } static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag) { - return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT)); + return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT)); } static inline unsigned root_tags_get(const struct radix_tree_root *root) { - return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT; + return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT; } static inline bool is_idr(const struct radix_tree_root *root) { - return !!(root->gfp_mask & ROOT_IS_IDR); + return !!(root->xa_flags & ROOT_IS_IDR); } /* @@ -291,12 +291,12 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) /* For debug */ static void radix_tree_dump(struct radix_tree_root *root) { - pr_debug("radix root: %p rnode %p tags %x\n", - root, root->rnode, - root->gfp_mask >> ROOT_TAG_SHIFT); - if (!radix_tree_is_internal_node(root->rnode)) + pr_debug("radix root: %p xa_head %p tags %x\n", + root, root->xa_head, + root->xa_flags >> ROOT_TAG_SHIFT); + if (!radix_tree_is_internal_node(root->xa_head)) return; - dump_node(entry_to_node(root->rnode), 0); + dump_node(entry_to_node(root->xa_head), 0); } static void dump_ida_node(void *entry, unsigned long index) @@ -340,9 +340,9 @@ static void dump_ida_node(void *entry, unsigned long index) static void ida_dump(struct ida *ida) { struct radix_tree_root *root = &ida->ida_rt; - pr_debug("ida: %p node %p free %d\n", ida, root->rnode, - root->gfp_mask >> ROOT_TAG_SHIFT); - dump_ida_node(root->rnode, 0); + pr_debug("ida: %p node %p free %d\n", ida, root->xa_head, + root->xa_flags >> ROOT_TAG_SHIFT); + dump_ida_node(root->xa_head, 0); } #endif @@ -576,7 +576,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { - struct radix_tree_node *node = rcu_dereference_raw(root->rnode); + struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); *nodep = node; @@ -605,7 +605,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, while (index > shift_maxindex(maxshift)) maxshift += RADIX_TREE_MAP_SHIFT; - entry = rcu_dereference_raw(root->rnode); + entry = rcu_dereference_raw(root->xa_head); if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE))) goto out; @@ -633,7 +633,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; } else if (xa_is_value(entry)) { - /* Moving an exceptional root->rnode to a node */ + /* Moving an exceptional root->xa_head to a node */ node->exceptional = 1; } /* @@ -642,7 +642,7 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, */ node->slots[0] = (void __rcu *)entry; entry = node_to_entry(node); - rcu_assign_pointer(root->rnode, entry); + rcu_assign_pointer(root->xa_head, entry); shift += RADIX_TREE_MAP_SHIFT; } while (shift <= maxshift); out: @@ -659,7 +659,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, bool shrunk = false; for (;;) { - struct radix_tree_node *node = rcu_dereference_raw(root->rnode); + struct radix_tree_node *node = rcu_dereference_raw(root->xa_head); struct radix_tree_node *child; if (!radix_tree_is_internal_node(node)) @@ -695,9 +695,9 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, * moving the node from one part of the tree to another: if it * was safe to dereference the old pointer to it * (node->slots[0]), it will be safe to dereference the new - * one (root->rnode) as far as dependent read barriers go. + * one (root->xa_head) as far as dependent read barriers go. */ - root->rnode = (void __rcu *)child; + root->xa_head = (void __rcu *)child; if (is_idr(root) && !tag_get(node, IDR_FREE, 0)) root_tag_clear(root, IDR_FREE); @@ -745,9 +745,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node_to_entry(node) == - rcu_dereference_raw(root->rnode)) - deleted |= radix_tree_shrink(root, - update_node); + rcu_dereference_raw(root->xa_head)) + deleted |= radix_tree_shrink(root, update_node); return deleted; } @@ -762,7 +761,7 @@ static bool delete_node(struct radix_tree_root *root, */ if (!is_idr(root)) root_tag_clear_all(root); - root->rnode = NULL; + root->xa_head = NULL; } WARN_ON_ONCE(!list_empty(&node->private_list)); @@ -787,7 +786,7 @@ static bool delete_node(struct radix_tree_root *root, * at position @index in the radix tree @root. * * Until there is more than one item in the tree, no nodes are - * allocated and @root->rnode is used as a direct slot instead of + * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. * * Returns -ENOMEM, or 0 for success. @@ -797,7 +796,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; - void __rcu **slot = (void __rcu **)&root->rnode; + void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; unsigned long max = index | ((1UL << order) - 1); @@ -813,7 +812,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, if (error < 0) return error; shift = error; - child = rcu_dereference_raw(root->rnode); + child = rcu_dereference_raw(root->xa_head); } while (shift > order) { @@ -1004,7 +1003,7 @@ EXPORT_SYMBOL(__radix_tree_insert); * tree @root. * * Until there is more than one item in the tree, no nodes are - * allocated and @root->rnode is used as a direct slot instead of + * allocated and @root->xa_head is used as a direct slot instead of * pointing to a node, in which case *@nodep will be NULL. */ void *__radix_tree_lookup(const struct radix_tree_root *root, @@ -1017,7 +1016,7 @@ void *__radix_tree_lookup(const struct radix_tree_root *root, restart: parent = NULL; - slot = (void __rcu **)&root->rnode; + slot = (void __rcu **)&root->xa_head; radix_tree_load_root(root, &node, &maxindex); if (index > maxindex) return NULL; @@ -1168,9 +1167,9 @@ void __radix_tree_replace(struct radix_tree_root *root, /* * This function supports replacing exceptional entries and * deleting entries, but that needs accounting against the - * node unless the slot is root->rnode. + * node unless the slot is root->xa_head. */ - WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->rnode) && + WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && (count || exceptional)); replace_slot(slot, item, node, count, exceptional); @@ -1722,7 +1721,7 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, iter->tags = 1; iter->node = NULL; __set_iter_shift(iter, 0); - return (void __rcu **)&root->rnode; + return (void __rcu **)&root->xa_head; } do { @@ -2109,7 +2108,7 @@ void __rcu **idr_get_free(struct radix_tree_root *root, unsigned long max) { struct radix_tree_node *node = NULL, *child; - void __rcu **slot = (void __rcu **)&root->rnode; + void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex, start = iter->next_index; unsigned int shift, offset = 0; @@ -2125,7 +2124,7 @@ void __rcu **idr_get_free(struct radix_tree_root *root, if (error < 0) return ERR_PTR(error); shift = error; - child = rcu_dereference_raw(root->rnode); + child = rcu_dereference_raw(root->xa_head); } if (start == 0 && shift == 0) shift = RADIX_TREE_MAP_SHIFT; @@ -2190,10 +2189,10 @@ void __rcu **idr_get_free(struct radix_tree_root *root, */ void idr_destroy(struct idr *idr) { - struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.rnode); + struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head); if (radix_tree_is_internal_node(node)) radix_tree_free_nodes(node); - idr->idr_rt.rnode = NULL; + idr->idr_rt.xa_head = NULL; root_tag_set(&idr->idr_rt, IDR_FREE); } EXPORT_SYMBOL(idr_destroy); diff --git a/lib/xarray.c b/lib/xarray.c new file mode 100644 index 000000000000..862f4c64c754 --- /dev/null +++ b/lib/xarray.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * XArray implementation + * Copyright (c) 2017 Microsoft Corporation + * Author: Matthew Wilcox + */ + +#include +#include + +/* + * Coding conventions in this file: + * + * @xa is used to refer to the entire xarray. + * @xas is the 'xarray operation state'. It may be either a pointer to + * an xa_state, or an xa_state stored on the stack. This is an unfortunate + * ambiguity. + * @index is the index of the entry being operated on + * @mark is an xa_mark_t; a small number indicating one of the mark bits. + * @node refers to an xa_node; usually the primary one being operated on by + * this function. + * @offset is the index into the slots array inside an xa_node. + * @parent refers to the @xa_node closer to the head than @node. + * @entry refers to something stored in a slot in the xarray + */ + +/** + * xa_init_flags() - Initialise an empty XArray with flags. + * @xa: XArray. + * @flags: XA_FLAG values. + * + * If you need to initialise an XArray with special flags (eg you need + * to take the lock from interrupt context), use this function instead + * of xa_init(). + * + * Context: Any context. + */ +void xa_init_flags(struct xarray *xa, gfp_t flags) +{ + spin_lock_init(&xa->xa_lock); + xa->xa_flags = flags; + xa->xa_head = NULL; +} +EXPORT_SYMBOL(xa_init_flags); diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 12adcf9ffe86..c0cf1c79efd5 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -5,7 +5,7 @@ CFLAGS += -I. -I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \ LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu TARGETS = main idr-test multiorder -CORE_OFILES := radix-tree.o idr.o linux.o test.o find_bit.o +CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o @@ -35,6 +35,7 @@ vpath %.c ../../lib $(OFILES): Makefile *.h */*.h generated/map-shift.h \ ../../include/linux/*.h \ ../../include/asm/*.h \ + ../../../include/linux/xarray.h \ ../../../include/linux/radix-tree.h \ ../../../include/linux/idr.h @@ -44,6 +45,8 @@ radix-tree.c: ../../../lib/radix-tree.c idr.c: ../../../lib/idr.c sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ +xarray.o: ../../../lib/xarray.c + generated/map-shift.h: @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h index 23b8ed52f8c8..03dc8a57eb99 100644 --- a/tools/testing/radix-tree/linux/bug.h +++ b/tools/testing/radix-tree/linux/bug.h @@ -1 +1,2 @@ +#include #include "asm/bug.h" diff --git a/tools/testing/radix-tree/linux/kconfig.h b/tools/testing/radix-tree/linux/kconfig.h new file mode 100644 index 000000000000..6c8675859913 --- /dev/null +++ b/tools/testing/radix-tree/linux/kconfig.h @@ -0,0 +1 @@ +#include "../../../../include/linux/kconfig.h" diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 2b4f4dba1882..080aea450430 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -192,13 +192,13 @@ static void multiorder_shrink(unsigned long index, int order) assert(item_insert_order(&tree, 0, order) == 0); - node = tree.rnode; + node = tree.xa_head; assert(item_insert(&tree, index) == 0); - assert(node != tree.rnode); + assert(node != tree.xa_head); assert(item_delete(&tree, index) != 0); - assert(node == tree.rnode); + assert(node == tree.xa_head); for (i = 0; i < max; i++) { struct item *item = item_lookup(&tree, i); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 62de66c314b7..70ddf964d51c 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -281,7 +281,7 @@ static int verify_node(struct radix_tree_node *slot, unsigned int tag, void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) { - struct radix_tree_node *node = root->rnode; + struct radix_tree_node *node = root->xa_head; if (!radix_tree_is_internal_node(node)) return; verify_node(node, tag, !!root_tag_get(root, tag)); @@ -311,13 +311,13 @@ void item_kill_tree(struct radix_tree_root *root) } } assert(radix_tree_gang_lookup(root, (void **)items, 0, 32) == 0); - assert(root->rnode == NULL); + assert(root->xa_head == NULL); } void tree_verify_min_height(struct radix_tree_root *root, int maxindex) { unsigned shift; - struct radix_tree_node *node = root->rnode; + struct radix_tree_node *node = root->xa_head; if (!radix_tree_is_internal_node(node)) { assert(maxindex == 0); return; diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c new file mode 100644 index 000000000000..9bbd667172a7 --- /dev/null +++ b/tools/testing/radix-tree/xarray.c @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * xarray.c: Userspace shim for XArray test-suite + * Copyright (c) 2018 Matthew Wilcox + */ + +#include "../../../lib/xarray.c" -- cgit v1.2.3 From 01959dfe771c6893365482ec78dc1d9cbbbe6de8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 9 Nov 2017 09:23:56 -0500 Subject: xarray: Define struct xa_node This is a direct replacement for struct radix_tree_node. A couple of struct members have changed name, so convert those. Use a #define so that radix tree users continue to work without change. Signed-off-by: Matthew Wilcox Reviewed-by: Josef Bacik --- include/linux/radix-tree.h | 29 +++------------------ include/linux/xarray.h | 27 ++++++++++++++++++++ lib/radix-tree.c | 48 +++++++++++++++++------------------ mm/workingset.c | 16 ++++++------ tools/testing/radix-tree/multiorder.c | 30 +++++++++++----------- 5 files changed, 77 insertions(+), 73 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 0b080bd88ab7..15388b7e38b9 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -32,6 +32,7 @@ /* Keep unconverted code working */ #define radix_tree_root xarray +#define radix_tree_node xa_node /* * The bottom two bits of the slot determine how the remaining bits in the @@ -60,41 +61,17 @@ static inline bool radix_tree_is_internal_node(void *ptr) /*** radix-tree API starts here ***/ -#define RADIX_TREE_MAX_TAGS 3 - #define RADIX_TREE_MAP_SHIFT XA_CHUNK_SHIFT #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) +#define RADIX_TREE_MAX_TAGS XA_MAX_MARKS +#define RADIX_TREE_TAG_LONGS XA_MARK_LONGS #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* - * @count is the count of every non-NULL element in the ->slots array - * whether that is a value entry, a retry entry, a user pointer, - * a sibling entry or a pointer to the next level of the tree. - * @exceptional is the count of every element in ->slots which is - * either a value entry or a sibling of a value entry. - */ -struct radix_tree_node { - unsigned char shift; /* Bits remaining in each slot */ - unsigned char offset; /* Slot offset in parent */ - unsigned char count; /* Total entry count */ - unsigned char exceptional; /* Exceptional entry count */ - struct radix_tree_node *parent; /* Used when ascending tree */ - struct radix_tree_root *root; /* The tree we belong to */ - union { - struct list_head private_list; /* For tree user */ - struct rcu_head rcu_head; /* Used when freeing node */ - }; - void __rcu *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; -}; - /* The IDR tag is stored in the low bits of xa_flags */ #define ROOT_IS_IDR ((__force gfp_t)4) /* The top bits of xa_flags are used to store the root tags */ diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 9122cf8bf52a..52141dfc5a90 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -252,6 +252,33 @@ static inline void xa_init(struct xarray *xa) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) +#define XA_MAX_MARKS 3 +#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) + +/* + * @count is the count of every non-NULL element in the ->slots array + * whether that is a value entry, a retry entry, a user pointer, + * a sibling entry or a pointer to the next level of the tree. + * @nr_values is the count of every element in ->slots which is + * either a value entry or a sibling of a value entry. + */ +struct xa_node { + unsigned char shift; /* Bits remaining in each slot */ + unsigned char offset; /* Slot offset in parent */ + unsigned char count; /* Total entry count */ + unsigned char nr_values; /* Value entry count */ + struct xa_node __rcu *parent; /* NULL at top of tree */ + struct xarray *array; /* The array we belong to */ + union { + struct list_head private_list; /* For tree user */ + struct rcu_head rcu_head; /* Used when freeing node */ + }; + void __rcu *slots[XA_CHUNK_SIZE]; + union { + unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; + unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; + }; +}; /* Private */ static inline bool xa_is_node(const void *entry) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 299d4bdba109..8a568cea1237 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -260,11 +260,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", + pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d nr_values %d\n", node, node->offset, index, index | node_maxindex(node), node->parent, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->exceptional); + node->shift, node->count, node->nr_values); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -354,7 +354,7 @@ static struct radix_tree_node * radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, struct radix_tree_root *root, unsigned int shift, unsigned int offset, - unsigned int count, unsigned int exceptional) + unsigned int count, unsigned int nr_values) { struct radix_tree_node *ret = NULL; @@ -401,9 +401,9 @@ out: ret->shift = shift; ret->offset = offset; ret->count = count; - ret->exceptional = exceptional; + ret->nr_values = nr_values; ret->parent = parent; - ret->root = root; + ret->array = root; } return ret; } @@ -633,8 +633,8 @@ static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp, if (radix_tree_is_internal_node(entry)) { entry_to_node(entry)->parent = node; } else if (xa_is_value(entry)) { - /* Moving an exceptional root->xa_head to a node */ - node->exceptional = 1; + /* Moving a value entry root->xa_head to a node */ + node->nr_values = 1; } /* * entry was already in the radix tree, so we do not need @@ -928,12 +928,12 @@ static inline int insert_entries(struct radix_tree_node *node, if (xa_is_node(old)) radix_tree_free_nodes(old); if (xa_is_value(old)) - node->exceptional--; + node->nr_values--; } if (node) { node->count += n; if (xa_is_value(item)) - node->exceptional += n; + node->nr_values += n; } return n; } @@ -947,7 +947,7 @@ static inline int insert_entries(struct radix_tree_node *node, if (node) { node->count++; if (xa_is_value(item)) - node->exceptional++; + node->nr_values++; } return 1; } @@ -1083,7 +1083,7 @@ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) EXPORT_SYMBOL(radix_tree_lookup); static inline void replace_sibling_entries(struct radix_tree_node *node, - void __rcu **slot, int count, int exceptional) + void __rcu **slot, int count, int values) { #ifdef CONFIG_RADIX_TREE_MULTIORDER unsigned offset = get_slot_offset(node, slot); @@ -1096,18 +1096,18 @@ static inline void replace_sibling_entries(struct radix_tree_node *node, node->slots[offset] = NULL; node->count--; } - node->exceptional += exceptional; + node->nr_values += values; } #endif } static void replace_slot(void __rcu **slot, void *item, - struct radix_tree_node *node, int count, int exceptional) + struct radix_tree_node *node, int count, int values) { - if (node && (count || exceptional)) { + if (node && (count || values)) { node->count += count; - node->exceptional += exceptional; - replace_sibling_entries(node, slot, count, exceptional); + node->nr_values += values; + replace_sibling_entries(node, slot, count, values); } rcu_assign_pointer(*slot, item); @@ -1161,17 +1161,17 @@ void __radix_tree_replace(struct radix_tree_root *root, radix_tree_update_node_t update_node) { void *old = rcu_dereference_raw(*slot); - int exceptional = !!xa_is_value(item) - !!xa_is_value(old); + int values = !!xa_is_value(item) - !!xa_is_value(old); int count = calculate_count(root, node, slot, item, old); /* - * This function supports replacing exceptional entries and + * This function supports replacing value entries and * deleting entries, but that needs accounting against the * node unless the slot is root->xa_head. */ WARN_ON_ONCE(!node && (slot != (void __rcu **)&root->xa_head) && - (count || exceptional)); - replace_slot(slot, item, node, count, exceptional); + (count || values)); + replace_slot(slot, item, node, count, values); if (!node) return; @@ -1193,7 +1193,7 @@ void __radix_tree_replace(struct radix_tree_root *root, * across slot lookup and replacement. * * NOTE: This cannot be used to switch between non-entries (empty slots), - * regular entries, and exceptional entries, as that requires accounting + * regular entries, and value entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or * deleting, use __radix_tree_lookup() and __radix_tree_replace() or * radix_tree_iter_replace(). @@ -1301,7 +1301,7 @@ int radix_tree_split(struct radix_tree_root *root, unsigned long index, rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); } rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); - parent->exceptional -= (end - offset); + parent->nr_values -= (end - offset); if (order == parent->shift) return 0; @@ -1961,7 +1961,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { void *old = rcu_dereference_raw(*slot); - int exceptional = xa_is_value(old) ? -1 : 0; + int values = xa_is_value(old) ? -1 : 0; unsigned offset = get_slot_offset(node, slot); int tag; @@ -1971,7 +1971,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root, for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) node_tag_clear(root, node, tag, offset); - replace_slot(slot, NULL, node, -1, exceptional); + replace_slot(slot, NULL, node, -1, values); return node && delete_node(root, node, NULL); } diff --git a/mm/workingset.c b/mm/workingset.c index bb109b3afac2..c201cbb8c00f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -349,7 +349,7 @@ void workingset_update_node(struct radix_tree_node *node) * already where they should be. The list_empty() test is safe * as node->private_list is protected by the i_pages lock. */ - if (node->count && node->count == node->exceptional) { + if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) list_lru_add(&shadow_nodes, &node->private_list); } else { @@ -428,8 +428,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, i_pages); + node = container_of(item, struct xa_node, private_list); + mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { @@ -446,25 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; - if (WARN_ON_ONCE(node->count != node->exceptional)) + if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { if (WARN_ON_ONCE(!xa_is_value(node->slots[i]))) goto out_invalid; - if (WARN_ON_ONCE(!node->exceptional)) + if (WARN_ON_ONCE(!node->nr_values)) goto out_invalid; if (WARN_ON_ONCE(!mapping->nrexceptional)) goto out_invalid; node->slots[i] = NULL; - node->exceptional--; + node->nr_values--; node->count--; mapping->nrexceptional--; } } - if (WARN_ON_ONCE(node->exceptional)) + if (WARN_ON_ONCE(node->nr_values)) goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->i_pages, node, diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 080aea450430..60786fa55302 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -393,7 +393,7 @@ static void multiorder_join2(unsigned order1, unsigned order2) radix_tree_insert(&tree, 1 << order2, xa_mk_value(5)); item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); assert(item2 == xa_mk_value(5)); - assert(node->exceptional == 1); + assert(node->nr_values == 1); item2 = radix_tree_lookup(&tree, 0); free(item2); @@ -401,7 +401,7 @@ static void multiorder_join2(unsigned order1, unsigned order2) radix_tree_join(&tree, 0, order1, item1); item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); assert(item2 == item1); - assert(node->exceptional == 0); + assert(node->nr_values == 0); item_kill_tree(&tree); } @@ -409,7 +409,7 @@ static void multiorder_join2(unsigned order1, unsigned order2) * This test revealed an accounting bug for value entries at one point. * Nodes were being freed back into the pool with an elevated exception count * by radix_tree_join() and then radix_tree_split() was failing to zero the - * count of exceptional entries. + * count of value entries. */ static void multiorder_join3(unsigned int order) { @@ -433,7 +433,7 @@ static void multiorder_join3(unsigned int order) } __radix_tree_lookup(&tree, 0, &node, NULL); - assert(node->exceptional == node->count); + assert(node->nr_values == node->count); item_kill_tree(&tree); } @@ -520,7 +520,7 @@ static void __multiorder_split2(int old_order, int new_order) item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item == xa_mk_value(5)); - assert(node->exceptional > 0); + assert(node->nr_values > 0); radix_tree_split(&tree, 0, new_order); radix_tree_for_each_slot(slot, &tree, &iter, 0) { @@ -530,7 +530,7 @@ static void __multiorder_split2(int old_order, int new_order) item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item != xa_mk_value(5)); - assert(node->exceptional == 0); + assert(node->nr_values == 0); item_kill_tree(&tree); } @@ -547,7 +547,7 @@ static void __multiorder_split3(int old_order, int new_order) item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item == xa_mk_value(5)); - assert(node->exceptional > 0); + assert(node->nr_values > 0); radix_tree_split(&tree, 0, new_order); radix_tree_for_each_slot(slot, &tree, &iter, 0) { @@ -556,7 +556,7 @@ static void __multiorder_split3(int old_order, int new_order) item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item == xa_mk_value(7)); - assert(node->exceptional > 0); + assert(node->nr_values > 0); item_kill_tree(&tree); @@ -564,7 +564,7 @@ static void __multiorder_split3(int old_order, int new_order) item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item == xa_mk_value(5)); - assert(node->exceptional > 0); + assert(node->nr_values > 0); radix_tree_split(&tree, 0, new_order); radix_tree_for_each_slot(slot, &tree, &iter, 0) { @@ -577,13 +577,13 @@ static void __multiorder_split3(int old_order, int new_order) item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL); assert(item == xa_mk_value(7)); - assert(node->count == node->exceptional); + assert(node->count == node->nr_values); do { node = node->parent; if (!node) break; assert(node->count == 1); - assert(node->exceptional == 0); + assert(node->nr_values == 0); } while (1); item_kill_tree(&tree); @@ -611,15 +611,15 @@ static void multiorder_account(void) __radix_tree_insert(&tree, 1 << 5, 5, xa_mk_value(5)); __radix_tree_lookup(&tree, 0, &node, NULL); - assert(node->count == node->exceptional * 2); + assert(node->count == node->nr_values * 2); radix_tree_delete(&tree, 1 << 5); - assert(node->exceptional == 0); + assert(node->nr_values == 0); __radix_tree_insert(&tree, 1 << 5, 5, xa_mk_value(5)); __radix_tree_lookup(&tree, 1 << 5, &node, &slot); - assert(node->count == node->exceptional * 2); + assert(node->count == node->nr_values * 2); __radix_tree_replace(&tree, node, slot, NULL, NULL); - assert(node->exceptional == 0); + assert(node->nr_values == 0); item_kill_tree(&tree); } -- cgit v1.2.3 From 992a8e60e3fea77c55589fc1c5af2304e78a5baa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 23 Nov 2017 22:57:20 -0500 Subject: xarray: Add documentation This is documentation on how to use the XArray, not details about its internal implementation. Signed-off-by: Matthew Wilcox Acked-by: Josef Bacik --- Documentation/core-api/index.rst | 1 + Documentation/core-api/xarray.rst | 404 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 405 insertions(+) create mode 100644 Documentation/core-api/xarray.rst diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 26b735cefb93..df3105aa75f4 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -21,6 +21,7 @@ Core utilities local_ops workqueue genericirq + xarray flexible-arrays librs genalloc diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst new file mode 100644 index 000000000000..4b101b4f3aa1 --- /dev/null +++ b/Documentation/core-api/xarray.rst @@ -0,0 +1,404 @@ +.. SPDX-License-Identifier: GPL-2.0+ + +====== +XArray +====== + +:Author: Matthew Wilcox + +Overview +======== + +The XArray is an abstract data type which behaves like a very large array +of pointers. It meets many of the same needs as a hash or a conventional +resizable array. Unlike a hash, it allows you to sensibly go to the +next or previous entry in a cache-efficient manner. In contrast to a +resizable array, there is no need to copy data or change MMU mappings in +order to grow the array. It is more memory-efficient, parallelisable +and cache friendly than a doubly-linked list. It takes advantage of +RCU to perform lookups without locking. + +The XArray implementation is efficient when the indices used are densely +clustered; hashing the object and using the hash as the index will not +perform well. The XArray is optimised for small indices, but still has +good performance with large indices. If your index can be larger than +``ULONG_MAX`` then the XArray is not the data type for you. The most +important user of the XArray is the page cache. + +Each non-``NULL`` entry in the array has three bits associated with +it called marks. Each mark may be set or cleared independently of +the others. You can iterate over entries which are marked. + +Normal pointers may be stored in the XArray directly. They must be 4-byte +aligned, which is true for any pointer returned from :c:func:`kmalloc` and +:c:func:`alloc_page`. It isn't true for arbitrary user-space pointers, +nor for function pointers. You can store pointers to statically allocated +objects, as long as those objects have an alignment of at least 4. + +You can also store integers between 0 and ``LONG_MAX`` in the XArray. +You must first convert it into an entry using :c:func:`xa_mk_value`. +When you retrieve an entry from the XArray, you can check whether it is +a value entry by calling :c:func:`xa_is_value`, and convert it back to +an integer by calling :c:func:`xa_to_value`. + +Some users want to store tagged pointers instead of using the marks +described above. They can call :c:func:`xa_tag_pointer` to create an +entry with a tag, :c:func:`xa_untag_pointer` to turn a tagged entry +back into an untagged pointer and :c:func:`xa_pointer_tag` to retrieve +the tag of an entry. Tagged pointers use the same bits that are used +to distinguish value entries from normal pointers, so each user must +decide whether they want to store value entries or tagged pointers in +any particular XArray. + +The XArray does not support storing :c:func:`IS_ERR` pointers as some +conflict with value entries or internal entries. + +An unusual feature of the XArray is the ability to create entries which +occupy a range of indices. Once stored to, looking up any index in +the range will return the same entry as looking up any other index in +the range. Setting a mark on one index will set it on all of them. +Storing to any index will store to all of them. Multi-index entries can +be explicitly split into smaller entries, or storing ``NULL`` into any +entry will cause the XArray to forget about the range. + +Normal API +========== + +Start by initialising an XArray, either with :c:func:`DEFINE_XARRAY` +for statically allocated XArrays or :c:func:`xa_init` for dynamically +allocated ones. A freshly-initialised XArray contains a ``NULL`` +pointer at every index. + +You can then set entries using :c:func:`xa_store` and get entries +using :c:func:`xa_load`. xa_store will overwrite any entry with the +new entry and return the previous entry stored at that index. You can +use :c:func:`xa_erase` instead of calling :c:func:`xa_store` with a +``NULL`` entry. There is no difference between an entry that has never +been stored to and one that has most recently had ``NULL`` stored to it. + +You can conditionally replace an entry at an index by using +:c:func:`xa_cmpxchg`. Like :c:func:`cmpxchg`, it will only succeed if +the entry at that index has the 'old' value. It also returns the entry +which was at that index; if it returns the same entry which was passed as +'old', then :c:func:`xa_cmpxchg` succeeded. + +If you want to only store a new entry to an index if the current entry +at that index is ``NULL``, you can use :c:func:`xa_insert` which +returns ``-EEXIST`` if the entry is not empty. + +You can enquire whether a mark is set on an entry by using +:c:func:`xa_get_mark`. If the entry is not ``NULL``, you can set a mark +on it by using :c:func:`xa_set_mark` and remove the mark from an entry by +calling :c:func:`xa_clear_mark`. You can ask whether any entry in the +XArray has a particular mark set by calling :c:func:`xa_marked`. + +You can copy entries out of the XArray into a plain array by calling +:c:func:`xa_extract`. Or you can iterate over the present entries in +the XArray by calling :c:func:`xa_for_each`. You may prefer to use +:c:func:`xa_find` or :c:func:`xa_find_after` to move to the next present +entry in the XArray. + +Finally, you can remove all entries from an XArray by calling +:c:func:`xa_destroy`. If the XArray entries are pointers, you may wish +to free the entries first. You can do this by iterating over all present +entries in the XArray using the :c:func:`xa_for_each` iterator. + +Memory allocation +----------------- + +The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_reserve` +and :c:func:`xa_insert` functions take a gfp_t parameter in case +the XArray needs to allocate memory to store this entry. +If the entry is being deleted, no memory allocation needs to be performed, +and the GFP flags specified will be ignored. + +It is possible for no memory to be allocatable, particularly if you pass +a restrictive set of GFP flags. In that case, the functions return a +special value which can be turned into an errno using :c:func:`xa_err`. +If you don't need to know exactly which error occurred, using +:c:func:`xa_is_err` is slightly more efficient. + +Locking +------- + +When using the Normal API, you do not have to worry about locking. +The XArray uses RCU and an internal spinlock to synchronise access: + +No lock needed: + * :c:func:`xa_empty` + * :c:func:`xa_marked` + +Takes RCU read lock: + * :c:func:`xa_load` + * :c:func:`xa_for_each` + * :c:func:`xa_find` + * :c:func:`xa_find_after` + * :c:func:`xa_extract` + * :c:func:`xa_get_mark` + +Takes xa_lock internally: + * :c:func:`xa_store` + * :c:func:`xa_insert` + * :c:func:`xa_erase` + * :c:func:`xa_erase_bh` + * :c:func:`xa_erase_irq` + * :c:func:`xa_cmpxchg` + * :c:func:`xa_destroy` + * :c:func:`xa_set_mark` + * :c:func:`xa_clear_mark` + +Assumes xa_lock held on entry: + * :c:func:`__xa_store` + * :c:func:`__xa_insert` + * :c:func:`__xa_erase` + * :c:func:`__xa_cmpxchg` + * :c:func:`__xa_set_mark` + * :c:func:`__xa_clear_mark` + +If you want to take advantage of the lock to protect the data structures +that you are storing in the XArray, you can call :c:func:`xa_lock` +before calling :c:func:`xa_load`, then take a reference count on the +object you have found before calling :c:func:`xa_unlock`. This will +prevent stores from removing the object from the array between looking +up the object and incrementing the refcount. You can also use RCU to +avoid dereferencing freed memory, but an explanation of that is beyond +the scope of this document. + +The XArray does not disable interrupts or softirqs while modifying +the array. It is safe to read the XArray from interrupt or softirq +context as the RCU lock provides enough protection. + +If, for example, you want to store entries in the XArray in process +context and then erase them in softirq context, you can do that this way:: + + void foo_init(struct foo *foo) + { + xa_init_flags(&foo->array, XA_FLAGS_LOCK_BH); + } + + int foo_store(struct foo *foo, unsigned long index, void *entry) + { + int err; + + xa_lock_bh(&foo->array); + err = xa_err(__xa_store(&foo->array, index, entry, GFP_KERNEL)); + if (!err) + foo->count++; + xa_unlock_bh(&foo->array); + return err; + } + + /* foo_erase() is only called from softirq context */ + void foo_erase(struct foo *foo, unsigned long index) + { + xa_lock(&foo->array); + __xa_erase(&foo->array, index); + foo->count--; + xa_unlock(&foo->array); + } + +If you are going to modify the XArray from interrupt or softirq context, +you need to initialise the array using :c:func:`xa_init_flags`, passing +``XA_FLAGS_LOCK_IRQ`` or ``XA_FLAGS_LOCK_BH``. + +The above example also shows a common pattern of wanting to extend the +coverage of the xa_lock on the store side to protect some statistics +associated with the array. + +Sharing the XArray with interrupt context is also possible, either +using :c:func:`xa_lock_irqsave` in both the interrupt handler and process +context, or :c:func:`xa_lock_irq` in process context and :c:func:`xa_lock` +in the interrupt handler. Some of the more common patterns have helper +functions such as :c:func:`xa_erase_bh` and :c:func:`xa_erase_irq`. + +Sometimes you need to protect access to the XArray with a mutex because +that lock sits above another mutex in the locking hierarchy. That does +not entitle you to use functions like :c:func:`__xa_erase` without taking +the xa_lock; the xa_lock is used for lockdep validation and will be used +for other purposes in the future. + +The :c:func:`__xa_set_mark` and :c:func:`__xa_clear_mark` functions are also +available for situations where you look up an entry and want to atomically +set or clear a mark. It may be more efficient to use the advanced API +in this case, as it will save you from walking the tree twice. + +Advanced API +============ + +The advanced API offers more flexibility and better performance at the +cost of an interface which can be harder to use and has fewer safeguards. +No locking is done for you by the advanced API, and you are required +to use the xa_lock while modifying the array. You can choose whether +to use the xa_lock or the RCU lock while doing read-only operations on +the array. You can mix advanced and normal operations on the same array; +indeed the normal API is implemented in terms of the advanced API. The +advanced API is only available to modules with a GPL-compatible license. + +The advanced API is based around the xa_state. This is an opaque data +structure which you declare on the stack using the :c:func:`XA_STATE` +macro. This macro initialises the xa_state ready to start walking +around the XArray. It is used as a cursor to maintain the position +in the XArray and let you compose various operations together without +having to restart from the top every time. + +The xa_state is also used to store errors. You can call +:c:func:`xas_error` to retrieve the error. All operations check whether +the xa_state is in an error state before proceeding, so there's no need +for you to check for an error after each call; you can make multiple +calls in succession and only check at a convenient point. The only +errors currently generated by the XArray code itself are ``ENOMEM`` and +``EINVAL``, but it supports arbitrary errors in case you want to call +:c:func:`xas_set_err` yourself. + +If the xa_state is holding an ``ENOMEM`` error, calling :c:func:`xas_nomem` +will attempt to allocate more memory using the specified gfp flags and +cache it in the xa_state for the next attempt. The idea is that you take +the xa_lock, attempt the operation and drop the lock. The operation +attempts to allocate memory while holding the lock, but it is more +likely to fail. Once you have dropped the lock, :c:func:`xas_nomem` +can try harder to allocate more memory. It will return ``true`` if it +is worth retrying the operation (i.e. that there was a memory error *and* +more memory was allocated). If it has previously allocated memory, and +that memory wasn't used, and there is no error (or some error that isn't +``ENOMEM``), then it will free the memory previously allocated. + +Internal Entries +---------------- + +The XArray reserves some entries for its own purposes. These are never +exposed through the normal API, but when using the advanced API, it's +possible to see them. Usually the best way to handle them is to pass them +to :c:func:`xas_retry`, and retry the operation if it returns ``true``. + +.. flat-table:: + :widths: 1 1 6 + + * - Name + - Test + - Usage + + * - Node + - :c:func:`xa_is_node` + - An XArray node. May be visible when using a multi-index xa_state. + + * - Sibling + - :c:func:`xa_is_sibling` + - A non-canonical entry for a multi-index entry. The value indicates + which slot in this node has the canonical entry. + + * - Retry + - :c:func:`xa_is_retry` + - This entry is currently being modified by a thread which has the + xa_lock. The node containing this entry may be freed at the end + of this RCU period. You should restart the lookup from the head + of the array. + +Other internal entries may be added in the future. As far as possible, they +will be handled by :c:func:`xas_retry`. + +Additional functionality +------------------------ + +The :c:func:`xas_create_range` function allocates all the necessary memory +to store every entry in a range. It will set ENOMEM in the xa_state if +it cannot allocate memory. + +You can use :c:func:`xas_init_marks` to reset the marks on an entry +to their default state. This is usually all marks clear, unless the +XArray is marked with ``XA_FLAGS_TRACK_FREE``, in which case mark 0 is set +and all other marks are clear. Replacing one entry with another using +:c:func:`xas_store` will not reset the marks on that entry; if you want +the marks reset, you should do that explicitly. + +The :c:func:`xas_load` will walk the xa_state as close to the entry +as it can. If you know the xa_state has already been walked to the +entry and need to check that the entry hasn't changed, you can use +:c:func:`xas_reload` to save a function call. + +If you need to move to a different index in the XArray, call +:c:func:`xas_set`. This resets the cursor to the top of the tree, which +will generally make the next operation walk the cursor to the desired +spot in the tree. If you want to move to the next or previous index, +call :c:func:`xas_next` or :c:func:`xas_prev`. Setting the index does +not walk the cursor around the array so does not require a lock to be +held, while moving to the next or previous index does. + +You can search for the next present entry using :c:func:`xas_find`. This +is the equivalent of both :c:func:`xa_find` and :c:func:`xa_find_after`; +if the cursor has been walked to an entry, then it will find the next +entry after the one currently referenced. If not, it will return the +entry at the index of the xa_state. Using :c:func:`xas_next_entry` to +move to the next present entry instead of :c:func:`xas_find` will save +a function call in the majority of cases at the expense of emitting more +inline code. + +The :c:func:`xas_find_marked` function is similar. If the xa_state has +not been walked, it will return the entry at the index of the xa_state, +if it is marked. Otherwise, it will return the first marked entry after +the entry referenced by the xa_state. The :c:func:`xas_next_marked` +function is the equivalent of :c:func:`xas_next_entry`. + +When iterating over a range of the XArray using :c:func:`xas_for_each` +or :c:func:`xas_for_each_marked`, it may be necessary to temporarily stop +the iteration. The :c:func:`xas_pause` function exists for this purpose. +After you have done the necessary work and wish to resume, the xa_state +is in an appropriate state to continue the iteration after the entry +you last processed. If you have interrupts disabled while iterating, +then it is good manners to pause the iteration and reenable interrupts +every ``XA_CHECK_SCHED`` entries. + +The :c:func:`xas_get_mark`, :c:func:`xas_set_mark` and +:c:func:`xas_clear_mark` functions require the xa_state cursor to have +been moved to the appropriate location in the xarray; they will do +nothing if you have called :c:func:`xas_pause` or :c:func:`xas_set` +immediately before. + +You can call :c:func:`xas_set_update` to have a callback function +called each time the XArray updates a node. This is used by the page +cache workingset code to maintain its list of nodes which contain only +shadow entries. + +Multi-Index Entries +------------------- + +The XArray has the ability to tie multiple indices together so that +operations on one index affect all indices. For example, storing into +any index will change the value of the entry retrieved from any index. +Setting or clearing a mark on any index will set or clear the mark +on every index that is tied together. The current implementation +only allows tying ranges which are aligned powers of two together; +eg indices 64-127 may be tied together, but 2-6 may not be. This may +save substantial quantities of memory; for example tying 512 entries +together will save over 4kB. + +You can create a multi-index entry by using :c:func:`XA_STATE_ORDER` +or :c:func:`xas_set_order` followed by a call to :c:func:`xas_store`. +Calling :c:func:`xas_load` with a multi-index xa_state will walk the +xa_state to the right location in the tree, but the return value is not +meaningful, potentially being an internal entry or ``NULL`` even when there +is an entry stored within the range. Calling :c:func:`xas_find_conflict` +will return the first entry within the range or ``NULL`` if there are no +entries in the range. The :c:func:`xas_for_each_conflict` iterator will +iterate over every entry which overlaps the specified range. + +If :c:func:`xas_load` encounters a multi-index entry, the xa_index +in the xa_state will not be changed. When iterating over an XArray +or calling :c:func:`xas_find`, if the initial index is in the middle +of a multi-index entry, it will not be altered. Subsequent calls +or iterations will move the index to the first index in the range. +Each entry will only be returned once, no matter how many indices it +occupies. + +Using :c:func:`xas_next` or :c:func:`xas_prev` with a multi-index xa_state +is not supported. Using either of these functions on a multi-index entry +will reveal sibling entries; these should be skipped over by the caller. + +Storing ``NULL`` into any index of a multi-index entry will set the entry +at every index to ``NULL`` and dissolve the tie. Splitting a multi-index +entry into entries occupying smaller ranges is not yet supported. + +Functions and structures +======================== + +.. kernel-doc:: include/linux/xarray.h +.. kernel-doc:: lib/xarray.c -- cgit v1.2.3 From ad3d6c7263e368abdc151e1cc13dc78aa39cc7a7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 7 Nov 2017 14:57:46 -0500 Subject: xarray: Add XArray load operation The xa_load function brings with it a lot of infrastructure; xa_empty(), xa_is_err(), and large chunks of the XArray advanced API that are used to implement xa_load. As the test-suite demonstrates, it is possible to use the XArray functions on a radix tree. The radix tree functions depend on the GFP flags being stored in the root of the tree, so it's not possible to use the radix tree functions on an XArray. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 336 ++++++++++++++++++++++++++++++ lib/Kconfig.debug | 3 + lib/Makefile | 1 + lib/radix-tree.c | 43 ---- lib/test_xarray.c | 87 ++++++++ lib/xarray.c | 195 +++++++++++++++++ tools/include/linux/kernel.h | 1 + tools/testing/radix-tree/.gitignore | 1 + tools/testing/radix-tree/Makefile | 6 +- tools/testing/radix-tree/linux/kernel.h | 1 + tools/testing/radix-tree/linux/rcupdate.h | 2 + tools/testing/radix-tree/main.c | 1 + tools/testing/radix-tree/test.h | 1 + tools/testing/radix-tree/xarray.c | 28 +++ 14 files changed, 661 insertions(+), 45 deletions(-) create mode 100644 lib/test_xarray.c diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 52141dfc5a90..a0df8217068c 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -30,6 +32,10 @@ * * 0-62: Sibling entries * 256: Retry entry + * + * Errors are also represented as internal entries, but use the negative + * space (-4094 to -2). They're never stored in the slots array; only + * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) @@ -155,6 +161,42 @@ static inline bool xa_is_internal(const void *entry) return ((unsigned long)entry & 3) == 2; } +/** + * xa_is_err() - Report whether an XArray operation returned an error + * @entry: Result from calling an XArray function + * + * If an XArray operation cannot complete an operation, it will return + * a special value indicating an error. This function tells you + * whether an error occurred; xa_err() tells you which error occurred. + * + * Context: Any context. + * Return: %true if the entry indicates an error. + */ +static inline bool xa_is_err(const void *entry) +{ + return unlikely(xa_is_internal(entry)); +} + +/** + * xa_err() - Turn an XArray result into an errno. + * @entry: Result from calling an XArray function. + * + * If an XArray operation cannot complete an operation, it will return + * a special pointer value which encodes an errno. This function extracts + * the errno from the pointer value, or returns 0 if the pointer does not + * represent an errno. + * + * Context: Any context. + * Return: A negative errno or 0. + */ +static inline int xa_err(void *entry) +{ + /* xa_to_internal() would not do sign extension. */ + if (xa_is_err(entry)) + return (long)entry >> 2; + return 0; +} + /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. @@ -209,6 +251,7 @@ struct xarray { #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) void xa_init_flags(struct xarray *, gfp_t flags); +void *xa_load(struct xarray *, unsigned long index); /** * xa_init() - Initialise an empty XArray. @@ -223,6 +266,18 @@ static inline void xa_init(struct xarray *xa) xa_init_flags(xa, 0); } +/** + * xa_empty() - Determine if an array has any present entries. + * @xa: XArray. + * + * Context: Any context. + * Return: %true if the array contains only NULL pointers. + */ +static inline bool xa_empty(const struct xarray *xa) +{ + return xa->xa_head == NULL; +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -280,6 +335,65 @@ struct xa_node { }; }; +void xa_dump(const struct xarray *); +void xa_dump_node(const struct xa_node *); + +#ifdef XA_DEBUG +#define XA_BUG_ON(xa, x) do { \ + if (x) { \ + xa_dump(xa); \ + BUG(); \ + } \ + } while (0) +#define XA_NODE_BUG_ON(node, x) do { \ + if (x) { \ + if (node) xa_dump_node(node); \ + BUG(); \ + } \ + } while (0) +#else +#define XA_BUG_ON(xa, x) do { } while (0) +#define XA_NODE_BUG_ON(node, x) do { } while (0) +#endif + +/* Private */ +static inline void *xa_head(const struct xarray *xa) +{ + return rcu_dereference_check(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_head_locked(const struct xarray *xa) +{ + return rcu_dereference_protected(xa->xa_head, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_check(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline void *xa_entry_locked(const struct xarray *xa, + const struct xa_node *node, unsigned int offset) +{ + XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); + return rcu_dereference_protected(node->slots[offset], + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_to_node(const void *entry) +{ + return (struct xa_node *)((unsigned long)entry - 2); +} + /* Private */ static inline bool xa_is_node(const void *entry) { @@ -312,4 +426,226 @@ static inline bool xa_is_sibling(const void *entry) #define XA_RETRY_ENTRY xa_mk_internal(256) +/** + * xa_is_retry() - Is the entry a retry entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a retry entry. + */ +static inline bool xa_is_retry(const void *entry) +{ + return unlikely(entry == XA_RETRY_ENTRY); +} + +/** + * typedef xa_update_node_t - A callback function from the XArray. + * @node: The node which is being processed + * + * This function is called every time the XArray updates the count of + * present and value entries in a node. It allows advanced users to + * maintain the private_list in the node. + * + * Context: The xa_lock is held and interrupts may be disabled. + * Implementations should not drop the xa_lock, nor re-enable + * interrupts. + */ +typedef void (*xa_update_node_t)(struct xa_node *node); + +/* + * The xa_state is opaque to its users. It contains various different pieces + * of state involved in the current operation on the XArray. It should be + * declared on the stack and passed between the various internal routines. + * The various elements in it should not be accessed directly, but only + * through the provided accessor functions. The below documentation is for + * the benefit of those working on the code, not for users of the XArray. + * + * @xa_node usually points to the xa_node containing the slot we're operating + * on (and @xa_offset is the offset in the slots array). If there is a + * single entry in the array at index 0, there are no allocated xa_nodes to + * point to, and so we store %NULL in @xa_node. @xa_node is set to + * the value %XAS_RESTART if the xa_state is not walked to the correct + * position in the tree of nodes for this operation. If an error occurs + * during an operation, it is set to an %XAS_ERROR value. If we run off the + * end of the allocated nodes, it is set to %XAS_BOUNDS. + */ +struct xa_state { + struct xarray *xa; + unsigned long xa_index; + unsigned char xa_shift; + unsigned char xa_sibs; + unsigned char xa_offset; + unsigned char xa_pad; /* Helps gcc generate better code */ + struct xa_node *xa_node; + struct xa_node *xa_alloc; + xa_update_node_t xa_update; +}; + +/* + * We encode errnos in the xas->xa_node. If an error has happened, we need to + * drop the lock to fix it, and once we've done so the xa_state is invalid. + */ +#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) +#define XAS_BOUNDS ((struct xa_node *)1UL) +#define XAS_RESTART ((struct xa_node *)3UL) + +#define __XA_STATE(array, index, shift, sibs) { \ + .xa = array, \ + .xa_index = index, \ + .xa_shift = shift, \ + .xa_sibs = sibs, \ + .xa_offset = 0, \ + .xa_pad = 0, \ + .xa_node = XAS_RESTART, \ + .xa_alloc = NULL, \ + .xa_update = NULL \ +} + +/** + * XA_STATE() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * + * Declare and initialise an xa_state on the stack. + */ +#define XA_STATE(name, array, index) \ + struct xa_state name = __XA_STATE(array, index, 0, 0) + +/** + * XA_STATE_ORDER() - Declare an XArray operation state. + * @name: Name of this operation state (usually xas). + * @array: Array to operate on. + * @index: Initial index of interest. + * @order: Order of entry. + * + * Declare and initialise an xa_state on the stack. This variant of + * XA_STATE() allows you to specify the 'order' of the element you + * want to operate on.` + */ +#define XA_STATE_ORDER(name, array, index, order) \ + struct xa_state name = __XA_STATE(array, \ + (index >> order) << order, \ + order - (order % XA_CHUNK_SHIFT), \ + (1U << (order % XA_CHUNK_SHIFT)) - 1) + +#define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) +#define xas_trylock(xas) xa_trylock((xas)->xa) +#define xas_lock(xas) xa_lock((xas)->xa) +#define xas_unlock(xas) xa_unlock((xas)->xa) +#define xas_lock_bh(xas) xa_lock_bh((xas)->xa) +#define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) +#define xas_lock_irq(xas) xa_lock_irq((xas)->xa) +#define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) +#define xas_lock_irqsave(xas, flags) \ + xa_lock_irqsave((xas)->xa, flags) +#define xas_unlock_irqrestore(xas, flags) \ + xa_unlock_irqrestore((xas)->xa, flags) + +/** + * xas_error() - Return an errno stored in the xa_state. + * @xas: XArray operation state. + * + * Return: 0 if no error has been noted. A negative errno if one has. + */ +static inline int xas_error(const struct xa_state *xas) +{ + return xa_err(xas->xa_node); +} + +/** + * xas_set_err() - Note an error in the xa_state. + * @xas: XArray operation state. + * @err: Negative error number. + * + * Only call this function with a negative @err; zero or positive errors + * will probably not behave the way you think they should. If you want + * to clear the error from an xa_state, use xas_reset(). + */ +static inline void xas_set_err(struct xa_state *xas, long err) +{ + xas->xa_node = XA_ERROR(err); +} + +/** + * xas_invalid() - Is the xas in a retry or error state? + * @xas: XArray operation state. + * + * Return: %true if the xas cannot be used for operations. + */ +static inline bool xas_invalid(const struct xa_state *xas) +{ + return (unsigned long)xas->xa_node & 3; +} + +/** + * xas_valid() - Is the xas a valid cursor into the array? + * @xas: XArray operation state. + * + * Return: %true if the xas can be used for operations. + */ +static inline bool xas_valid(const struct xa_state *xas) +{ + return !xas_invalid(xas); +} + +/** + * xas_reset() - Reset an XArray operation state. + * @xas: XArray operation state. + * + * Resets the error or walk state of the @xas so future walks of the + * array will start from the root. Use this if you have dropped the + * xarray lock and want to reuse the xa_state. + * + * Context: Any context. + */ +static inline void xas_reset(struct xa_state *xas) +{ + xas->xa_node = XAS_RESTART; +} + +/** + * xas_retry() - Retry the operation if appropriate. + * @xas: XArray operation state. + * @entry: Entry from xarray. + * + * The advanced functions may sometimes return an internal entry, such as + * a retry entry or a zero entry. This function sets up the @xas to restart + * the walk from the head of the array if needed. + * + * Context: Any context. + * Return: true if the operation needs to be retried. + */ +static inline bool xas_retry(struct xa_state *xas, const void *entry) +{ + if (!xa_is_retry(entry)) + return false; + xas_reset(xas); + return true; +} + +void *xas_load(struct xa_state *); + +/** + * xas_reload() - Refetch an entry from the xarray. + * @xas: XArray operation state. + * + * Use this function to check that a previously loaded entry still has + * the same value. This is useful for the lockless pagecache lookup where + * we walk the array with only the RCU lock to protect us, lock the page, + * then check that the page hasn't moved since we looked it up. + * + * The caller guarantees that @xas is still valid. If it may be in an + * error or restart state, call xas_load() instead. + * + * Return: The entry at this location in the xarray. + */ +static inline void *xas_reload(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (node) + return xa_entry(xas->xa, node, xas->xa_offset); + return xa_head(xas->xa); +} + #endif /* _LINUX_XARRAY_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4966c4fbe7f7..091155e12422 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1813,6 +1813,9 @@ config TEST_BITFIELD config TEST_UUID tristate "Test functions located in the uuid module at runtime" +config TEST_XARRAY + tristate "Test the XArray code at runtime" + config TEST_OVERFLOW tristate "Test check_*_overflow() functions at runtime" diff --git a/lib/Makefile b/lib/Makefile index 057385f1f145..e6f809556af5 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -68,6 +68,7 @@ obj-$(CONFIG_TEST_PRINTF) += test_printf.o obj-$(CONFIG_TEST_BITMAP) += test_bitmap.o obj-$(CONFIG_TEST_BITFIELD) += test_bitfield.o obj-$(CONFIG_TEST_UUID) += test_uuid.o +obj-$(CONFIG_TEST_XARRAY) += test_xarray.o obj-$(CONFIG_TEST_PARMAN) += test_parman.o obj-$(CONFIG_TEST_KMOD) += test_kmod.o obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8a568cea1237..b8e961428484 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -256,49 +256,6 @@ static unsigned long next_index(unsigned long index, } #ifndef __KERNEL__ -static void dump_node(struct radix_tree_node *node, unsigned long index) -{ - unsigned long i; - - pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d nr_values %d\n", - node, node->offset, index, index | node_maxindex(node), - node->parent, - node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->nr_values); - - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - unsigned long first = index | (i << node->shift); - unsigned long last = first | ((1UL << node->shift) - 1); - void *entry = node->slots[i]; - if (!entry) - continue; - if (entry == RADIX_TREE_RETRY) { - pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", - i, first, last, node); - } else if (!radix_tree_is_internal_node(entry)) { - pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", - entry, i, first, last, node); - } else if (xa_is_sibling(entry)) { - pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", - entry, i, first, last, node, - node->slots[xa_to_sibling(entry)]); - } else { - dump_node(entry_to_node(entry), first); - } - } -} - -/* For debug */ -static void radix_tree_dump(struct radix_tree_root *root) -{ - pr_debug("radix root: %p xa_head %p tags %x\n", - root, root->xa_head, - root->xa_flags >> ROOT_TAG_SHIFT); - if (!radix_tree_is_internal_node(root->xa_head)) - return; - dump_node(entry_to_node(root->xa_head), 0); -} - static void dump_ida_node(void *entry, unsigned long index) { unsigned long i; diff --git a/lib/test_xarray.c b/lib/test_xarray.c new file mode 100644 index 000000000000..a7248b87617f --- /dev/null +++ b/lib/test_xarray.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * test_xarray.c: Test the XArray API + * Copyright (c) 2017-2018 Microsoft Corporation + * Author: Matthew Wilcox + */ + +#include +#include + +static unsigned int tests_run; +static unsigned int tests_passed; + +#ifndef XA_DEBUG +# ifdef __KERNEL__ +void xa_dump(const struct xarray *xa) { } +# endif +#undef XA_BUG_ON +#define XA_BUG_ON(xa, x) do { \ + tests_run++; \ + if (x) { \ + printk("BUG at %s:%d\n", __func__, __LINE__); \ + xa_dump(xa); \ + dump_stack(); \ + } else { \ + tests_passed++; \ + } \ +} while (0) +#endif + +static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + radix_tree_insert(xa, index, xa_mk_value(index)); + return NULL; +} + +static void xa_erase_index(struct xarray *xa, unsigned long index) +{ + radix_tree_delete(xa, index); +} + +static noinline void check_xa_load(struct xarray *xa) +{ + unsigned long i, j; + + for (i = 0; i < 1024; i++) { + for (j = 0; j < 1024; j++) { + void *entry = xa_load(xa, j); + if (j < i) + XA_BUG_ON(xa, xa_to_value(entry) != j); + else + XA_BUG_ON(xa, entry); + } + XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL); + } + + for (i = 0; i < 1024; i++) { + for (j = 0; j < 1024; j++) { + void *entry = xa_load(xa, j); + if (j >= i) + XA_BUG_ON(xa, xa_to_value(entry) != j); + else + XA_BUG_ON(xa, entry); + } + xa_erase_index(xa, i); + } + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static RADIX_TREE(array, GFP_KERNEL); + +static int xarray_checks(void) +{ + check_xa_load(&array); + + printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); + return (tests_run == tests_passed) ? 0 : -EINVAL; +} + +static void xarray_exit(void) +{ +} + +module_init(xarray_checks); +module_exit(xarray_exit); +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); diff --git a/lib/xarray.c b/lib/xarray.c index 862f4c64c754..19cfcbc69a68 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -24,6 +24,100 @@ * @entry refers to something stored in a slot in the xarray */ +/* extracts the offset within this node from the index */ +static unsigned int get_offset(unsigned long index, struct xa_node *node) +{ + return (index >> node->shift) & XA_CHUNK_MASK; +} + +/* move the index either forwards (find) or backwards (sibling slot) */ +static void xas_move_index(struct xa_state *xas, unsigned long offset) +{ + unsigned int shift = xas->xa_node->shift; + xas->xa_index &= ~XA_CHUNK_MASK << shift; + xas->xa_index += offset << shift; +} + +static void *set_bounds(struct xa_state *xas) +{ + xas->xa_node = XAS_BOUNDS; + return NULL; +} + +/* + * Starts a walk. If the @xas is already valid, we assume that it's on + * the right path and just return where we've got to. If we're in an + * error state, return NULL. If the index is outside the current scope + * of the xarray, return NULL without changing @xas->xa_node. Otherwise + * set @xas->xa_node to NULL and return the current head of the array. + */ +static void *xas_start(struct xa_state *xas) +{ + void *entry; + + if (xas_valid(xas)) + return xas_reload(xas); + if (xas_error(xas)) + return NULL; + + entry = xa_head(xas->xa); + if (!xa_is_node(entry)) { + if (xas->xa_index) + return set_bounds(xas); + } else { + if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK) + return set_bounds(xas); + } + + xas->xa_node = NULL; + return entry; +} + +static void *xas_descend(struct xa_state *xas, struct xa_node *node) +{ + unsigned int offset = get_offset(xas->xa_index, node); + void *entry = xa_entry(xas->xa, node, offset); + + xas->xa_node = node; + if (xa_is_sibling(entry)) { + offset = xa_to_sibling(entry); + entry = xa_entry(xas->xa, node, offset); + } + + xas->xa_offset = offset; + return entry; +} + +/** + * xas_load() - Load an entry from the XArray (advanced). + * @xas: XArray operation state. + * + * Usually walks the @xas to the appropriate state to load the entry + * stored at xa_index. However, it will do nothing and return %NULL if + * @xas is in an error state. xas_load() will never expand the tree. + * + * If the xa_state is set up to operate on a multi-index entry, xas_load() + * may return %NULL or an internal entry, even if there are entries + * present within the range specified by @xas. + * + * Context: Any context. The caller should hold the xa_lock or the RCU lock. + * Return: Usually an entry in the XArray, but see description for exceptions. + */ +void *xas_load(struct xa_state *xas) +{ + void *entry = xas_start(xas); + + while (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + + if (xas->xa_shift > node->shift) + break; + entry = xas_descend(xas, node); + } + return entry; +} +EXPORT_SYMBOL_GPL(xas_load); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. @@ -42,3 +136,104 @@ void xa_init_flags(struct xarray *xa, gfp_t flags) xa->xa_head = NULL; } EXPORT_SYMBOL(xa_init_flags); + +/** + * xa_load() - Load an entry from an XArray. + * @xa: XArray. + * @index: index into array. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry at @index in @xa. + */ +void *xa_load(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + do { + entry = xas_load(&xas); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + return entry; +} +EXPORT_SYMBOL(xa_load); + +#ifdef XA_DEBUG +void xa_dump_node(const struct xa_node *node) +{ + unsigned i, j; + + if (!node) + return; + if ((unsigned long)node & 3) { + pr_cont("node %px\n", node); + return; + } + + pr_cont("node %px %s %d parent %px shift %d count %d values %d " + "array %px list %px %px marks", + node, node->parent ? "offset" : "max", node->offset, + node->parent, node->shift, node->count, node->nr_values, + node->array, node->private_list.prev, node->private_list.next); + for (i = 0; i < XA_MAX_MARKS; i++) + for (j = 0; j < XA_MARK_LONGS; j++) + pr_cont(" %lx", node->marks[i][j]); + pr_cont("\n"); +} + +void xa_dump_index(unsigned long index, unsigned int shift) +{ + if (!shift) + pr_info("%lu: ", index); + else if (shift >= BITS_PER_LONG) + pr_info("0-%lu: ", ~0UL); + else + pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1)); +} + +void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) +{ + if (!entry) + return; + + xa_dump_index(index, shift); + + if (xa_is_node(entry)) { + if (shift == 0) { + pr_cont("%px\n", entry); + } else { + unsigned long i; + struct xa_node *node = xa_to_node(entry); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + xa_dump_entry(node->slots[i], + index + (i << node->shift), node->shift); + } + } else if (xa_is_value(entry)) + pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (!xa_is_internal(entry)) + pr_cont("%px\n", entry); + else if (xa_is_retry(entry)) + pr_cont("retry (%ld)\n", xa_to_internal(entry)); + else if (xa_is_sibling(entry)) + pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); + else + pr_cont("UNKNOWN ENTRY (%px)\n", entry); +} + +void xa_dump(const struct xarray *xa) +{ + void *entry = xa->xa_head; + unsigned int shift = 0; + + pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, + xa->xa_flags, radix_tree_tagged(xa, 0), + radix_tree_tagged(xa, 1), radix_tree_tagged(xa, 2)); + if (xa_is_node(entry)) + shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; + xa_dump_entry(entry, 0, shift); +} +#endif diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index 0ad884452c5c..6935ef94e77a 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -70,6 +70,7 @@ #define BUG_ON(cond) assert(!(cond)) #endif #endif +#define BUG() BUG_ON(1) #if __BYTE_ORDER == __BIG_ENDIAN #define cpu_to_le16 bswap_16 diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore index d4706c0ffceb..3834899b6693 100644 --- a/tools/testing/radix-tree/.gitignore +++ b/tools/testing/radix-tree/.gitignore @@ -4,3 +4,4 @@ idr-test main multiorder radix-tree.c +xarray diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index c0cf1c79efd5..1379f1d78d0b 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -4,7 +4,7 @@ CFLAGS += -I. -I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \ -fsanitize=undefined LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu -TARGETS = main idr-test multiorder +TARGETS = main idr-test multiorder xarray CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o @@ -25,6 +25,8 @@ main: $(OFILES) idr-test.o: ../../../lib/test_ida.c idr-test: idr-test.o $(CORE_OFILES) +xarray: $(CORE_OFILES) + multiorder: multiorder.o $(CORE_OFILES) clean: @@ -45,7 +47,7 @@ radix-tree.c: ../../../lib/radix-tree.c idr.c: ../../../lib/idr.c sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ -xarray.o: ../../../lib/xarray.c +xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c generated/map-shift.h: @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 426f32f28547..5d06ac75a14d 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -14,6 +14,7 @@ #include "../../../include/linux/kconfig.h" #define printk printf +#define pr_info printk #define pr_debug printk #define pr_cont printk diff --git a/tools/testing/radix-tree/linux/rcupdate.h b/tools/testing/radix-tree/linux/rcupdate.h index 73ed33658203..fd280b070fdb 100644 --- a/tools/testing/radix-tree/linux/rcupdate.h +++ b/tools/testing/radix-tree/linux/rcupdate.h @@ -6,5 +6,7 @@ #define rcu_dereference_raw(p) rcu_dereference(p) #define rcu_dereference_protected(p, cond) rcu_dereference(p) +#define rcu_dereference_check(p, cond) rcu_dereference(p) +#define RCU_INIT_POINTER(p, v) (p) = (v) #endif diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index b741686e53d6..09deaf4f0959 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -365,6 +365,7 @@ int main(int argc, char **argv) rcu_register_thread(); radix_tree_init(); + xarray_tests(); regression1_test(); regression2_test(); regression3_test(); diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 92d901eacf49..e3dc7a16f09b 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -34,6 +34,7 @@ int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, unsigned iftag, unsigned thentag); unsigned long find_item(struct radix_tree_root *, void *item); +void xarray_tests(void); void tag_check(void); void multiorder_checks(void); void iteration_test(unsigned order, unsigned duration); diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index 9bbd667172a7..e61e43efe463 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -4,4 +4,32 @@ * Copyright (c) 2018 Matthew Wilcox */ +#define XA_DEBUG +#include "test.h" + +#define module_init(x) +#define module_exit(x) +#define MODULE_AUTHOR(x) +#define MODULE_LICENSE(x) +#define dump_stack() assert(0) + #include "../../../lib/xarray.c" +#undef XA_DEBUG +#include "../../../lib/test_xarray.c" + +void xarray_tests(void) +{ + xarray_checks(); + xarray_exit(); +} + +int __weak main(void) +{ + radix_tree_init(); + xarray_tests(); + radix_tree_cpu_dead(1); + rcu_barrier(); + if (nr_allocated) + printf("nr_allocated = %d\n", nr_allocated); + return 0; +} -- cgit v1.2.3 From 9b89a0355144685a787b0dc5bcf7bdd6f2d02968 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 10 Nov 2017 09:34:31 -0500 Subject: xarray: Add XArray marks XArray marks are like the radix tree tags, only slightly more strongly typed. They are renamed in order to distinguish them from tagged pointers. This commit adds the basic get/set/clear operations. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 63 +++++++ lib/test_xarray.c | 34 ++++ lib/xarray.c | 232 +++++++++++++++++++++++++- tools/include/asm-generic/bitops.h | 1 + tools/include/asm-generic/bitops/atomic.h | 9 - tools/include/asm-generic/bitops/non-atomic.h | 109 ++++++++++++ tools/include/linux/spinlock.h | 10 +- 7 files changed, 445 insertions(+), 13 deletions(-) create mode 100644 tools/include/asm-generic/bitops/non-atomic.h diff --git a/include/linux/xarray.h b/include/linux/xarray.h index a0df8217068c..2de504ae9ba4 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -197,6 +198,20 @@ static inline int xa_err(void *entry) return 0; } +typedef unsigned __bitwise xa_mark_t; +#define XA_MARK_0 ((__force xa_mark_t)0U) +#define XA_MARK_1 ((__force xa_mark_t)1U) +#define XA_MARK_2 ((__force xa_mark_t)2U) +#define XA_PRESENT ((__force xa_mark_t)8U) +#define XA_MARK_MAX XA_MARK_2 + +/* + * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, + * and we remain compatible with that. + */ +#define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ + (__force unsigned)(mark))) + /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. @@ -252,6 +267,9 @@ struct xarray { void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); +bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_init() - Initialise an empty XArray. @@ -278,6 +296,19 @@ static inline bool xa_empty(const struct xarray *xa) return xa->xa_head == NULL; } +/** + * xa_marked() - Inquire whether any entry in this array has a mark set + * @xa: Array + * @mark: Mark value + * + * Context: Any context. + * Return: %true if any entry has this mark set. + */ +static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) +{ + return xa->xa_flags & XA_FLAGS_MARK(mark); +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -290,6 +321,12 @@ static inline bool xa_empty(const struct xarray *xa) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) +/* + * Versions of the normal API which require the caller to hold the xa_lock. + */ +void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); +void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); + /* Everything below here is the Advanced API. Proceed with caution. */ /* @@ -388,6 +425,22 @@ static inline void *xa_entry_locked(const struct xarray *xa, lockdep_is_held(&xa->xa_lock)); } +/* Private */ +static inline struct xa_node *xa_parent(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_check(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + +/* Private */ +static inline struct xa_node *xa_parent_locked(const struct xarray *xa, + const struct xa_node *node) +{ + return rcu_dereference_protected(node->parent, + lockdep_is_held(&xa->xa_lock)); +} + /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { @@ -588,6 +641,12 @@ static inline bool xas_valid(const struct xa_state *xas) return !xas_invalid(xas); } +/* True if the pointer is something other than a node */ +static inline bool xas_not_node(struct xa_node *node) +{ + return ((unsigned long)node & 3) || !node; +} + /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. @@ -625,6 +684,10 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry) void *xas_load(struct xa_state *); +bool xas_get_mark(const struct xa_state *, xa_mark_t); +void xas_set_mark(const struct xa_state *, xa_mark_t); +void xas_clear_mark(const struct xa_state *, xa_mark_t); + /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. diff --git a/lib/test_xarray.c b/lib/test_xarray.c index a7248b87617f..f8b0e9ba80a4 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -67,11 +67,45 @@ static noinline void check_xa_load(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) +{ + /* NULL elements have no marks set */ + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + + /* Storing a pointer will not make a mark appear */ + XA_BUG_ON(xa, xa_store_index(xa, index, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, !xa_get_mark(xa, index, XA_MARK_0)); + + /* Setting one mark will not set another mark */ + XA_BUG_ON(xa, xa_get_mark(xa, index + 1, XA_MARK_0)); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_1)); + + /* Storing NULL clears marks, and they can't be set again */ + xa_erase_index(xa, index); + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + xa_set_mark(xa, index, XA_MARK_0); + XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); +} + +static noinline void check_xa_mark(struct xarray *xa) +{ + unsigned long index; + + for (index = 0; index < 16384; index += 4) + check_xa_mark_1(xa, index); +} + static RADIX_TREE(array, GFP_KERNEL); static int xarray_checks(void) { check_xa_load(&array); + check_xa_mark(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index 19cfcbc69a68..aa86c47e532f 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -5,6 +5,7 @@ * Author: Matthew Wilcox */ +#include #include #include @@ -24,6 +25,48 @@ * @entry refers to something stored in a slot in the xarray */ +static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) +{ + if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) + xa->xa_flags |= XA_FLAGS_MARK(mark); +} + +static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark) +{ + if (xa->xa_flags & XA_FLAGS_MARK(mark)) + xa->xa_flags &= ~(XA_FLAGS_MARK(mark)); +} + +static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark) +{ + return node->marks[(__force unsigned)mark]; +} + +static inline bool node_get_mark(struct xa_node *node, + unsigned int offset, xa_mark_t mark) +{ + return test_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_set_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_set_bit(offset, node_marks(node, mark)); +} + +/* returns true if the bit was set */ +static inline bool node_clear_mark(struct xa_node *node, unsigned int offset, + xa_mark_t mark) +{ + return __test_and_clear_bit(offset, node_marks(node, mark)); +} + +static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) +{ + return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); +} + /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { @@ -118,6 +161,85 @@ void *xas_load(struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_load); +/** + * xas_get_mark() - Returns the state of this mark. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Return: true if the mark is set, false if the mark is clear or @xas + * is in an error state. + */ +bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark) +{ + if (xas_invalid(xas)) + return false; + if (!xas->xa_node) + return xa_marked(xas->xa, mark); + return node_get_mark(xas->xa_node, xas->xa_offset, mark); +} +EXPORT_SYMBOL_GPL(xas_get_mark); + +/** + * xas_set_mark() - Sets the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Sets the specified mark on this entry, and walks up the tree setting it + * on all the ancestor entries. Does nothing if @xas has not been walked to + * an entry, or is in an error state. + */ +void xas_set_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (node_set_mark(node, offset, mark)) + return; + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (!xa_marked(xas->xa, mark)) + xa_mark_set(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_set_mark); + +/** + * xas_clear_mark() - Clears the mark on this entry and its parents. + * @xas: XArray operation state. + * @mark: Mark number. + * + * Clears the specified mark on this entry, and walks back to the head + * attempting to clear it on all the ancestor entries. Does nothing if + * @xas has not been walked to an entry, or is in an error state. + */ +void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset = xas->xa_offset; + + if (xas_invalid(xas)) + return; + + while (node) { + if (!node_clear_mark(node, offset, mark)) + return; + if (node_any_mark(node, mark)) + return; + + offset = node->offset; + node = xa_parent_locked(xas->xa, node); + } + + if (xa_marked(xas->xa, mark)) + xa_mark_clear(xas->xa, mark); +} +EXPORT_SYMBOL_GPL(xas_clear_mark); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. @@ -160,6 +282,112 @@ void *xa_load(struct xarray *xa, unsigned long index) } EXPORT_SYMBOL(xa_load); +/** + * __xa_set_mark() - Set this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a NULL entry does not succeed. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_set_mark(&xas, mark); +} +EXPORT_SYMBOL_GPL(__xa_set_mark); + +/** + * __xa_clear_mark() - Clear this mark on this entry while locked. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Context: Any context. Expects xa_lock to be held on entry. + */ +void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry = xas_load(&xas); + + if (entry) + xas_clear_mark(&xas, mark); +} +EXPORT_SYMBOL_GPL(__xa_clear_mark); + +/** + * xa_get_mark() - Inquire whether this mark is set on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * This function uses the RCU read lock, so the result may be out of date + * by the time it returns. If you need the result to be stable, use a lock. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: True if the entry at @index has this mark set, false if it doesn't. + */ +bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + XA_STATE(xas, xa, index); + void *entry; + + rcu_read_lock(); + entry = xas_start(&xas); + while (xas_get_mark(&xas, mark)) { + if (!xa_is_node(entry)) + goto found; + entry = xas_descend(&xas, xa_to_node(entry)); + } + rcu_read_unlock(); + return false; + found: + rcu_read_unlock(); + return true; +} +EXPORT_SYMBOL(xa_get_mark); + +/** + * xa_set_mark() - Set this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Attempting to set a mark on a NULL entry does not succeed. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_set_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_set_mark); + +/** + * xa_clear_mark() - Clear this mark on this entry. + * @xa: XArray. + * @index: Index of entry. + * @mark: Mark number. + * + * Clearing a mark always succeeds. + * + * Context: Process context. Takes and releases the xa_lock. + */ +void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) +{ + xa_lock(xa); + __xa_clear_mark(xa, index, mark); + xa_unlock(xa); +} +EXPORT_SYMBOL(xa_clear_mark); + #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { @@ -230,8 +458,8 @@ void xa_dump(const struct xarray *xa) unsigned int shift = 0; pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, entry, - xa->xa_flags, radix_tree_tagged(xa, 0), - radix_tree_tagged(xa, 1), radix_tree_tagged(xa, 2)); + xa->xa_flags, xa_marked(xa, XA_MARK_0), + xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2)); if (xa_is_node(entry)) shift = xa_to_node(entry)->shift + XA_CHUNK_SHIFT; xa_dump_entry(entry, 0, shift); diff --git a/tools/include/asm-generic/bitops.h b/tools/include/asm-generic/bitops.h index 9bce3b56b5e7..5d2ab38965cc 100644 --- a/tools/include/asm-generic/bitops.h +++ b/tools/include/asm-generic/bitops.h @@ -27,5 +27,6 @@ #include #include +#include #endif /* __TOOLS_ASM_GENERIC_BITOPS_H */ diff --git a/tools/include/asm-generic/bitops/atomic.h b/tools/include/asm-generic/bitops/atomic.h index 21c41ccd1266..2f6ea28764a7 100644 --- a/tools/include/asm-generic/bitops/atomic.h +++ b/tools/include/asm-generic/bitops/atomic.h @@ -15,13 +15,4 @@ static inline void clear_bit(int nr, unsigned long *addr) addr[nr / __BITS_PER_LONG] &= ~(1UL << (nr % __BITS_PER_LONG)); } -static __always_inline int test_bit(unsigned int nr, const unsigned long *addr) -{ - return ((1UL << (nr % __BITS_PER_LONG)) & - (((unsigned long *)addr)[nr / __BITS_PER_LONG])) != 0; -} - -#define __set_bit(nr, addr) set_bit(nr, addr) -#define __clear_bit(nr, addr) clear_bit(nr, addr) - #endif /* _TOOLS_LINUX_ASM_GENERIC_BITOPS_ATOMIC_H_ */ diff --git a/tools/include/asm-generic/bitops/non-atomic.h b/tools/include/asm-generic/bitops/non-atomic.h new file mode 100644 index 000000000000..7e10c4b50c5d --- /dev/null +++ b/tools/include/asm-generic/bitops/non-atomic.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ +#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ + +#include + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p |= mask; +} + +static inline void __clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p &= ~mask; +} + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __change_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p ^= mask; +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old | mask; + return (old & mask) != 0; +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old & ~mask; + return (old & mask) != 0; +} + +/* WARNING: non atomic and it can be reordered! */ +static inline int __test_and_change_bit(int nr, + volatile unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + unsigned long old = *p; + + *p = old ^ mask; + return (old & mask) != 0; +} + +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static inline int test_bit(int nr, const volatile unsigned long *addr) +{ + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); +} + +#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 1738c0391da4..622266b197d0 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -8,8 +8,14 @@ #define spinlock_t pthread_mutex_t #define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER #define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER -#define spin_lock_init(x) pthread_mutex_init(x, NULL) - +#define spin_lock_init(x) pthread_mutex_init(x, NULL) + +#define spin_lock(x) pthread_mutex_lock(x) +#define spin_unlock(x) pthread_mutex_unlock(x) +#define spin_lock_bh(x) pthread_mutex_lock(x) +#define spin_unlock_bh(x) pthread_mutex_unlock(x) +#define spin_lock_irq(x) pthread_mutex_lock(x) +#define spin_unlock_irq(x) pthread_mutex_unlock(x) #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) -- cgit v1.2.3 From 58d6ea3085f2e53714810a513c61629f6d2be0a6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 10 Nov 2017 15:15:08 -0500 Subject: xarray: Add XArray unconditional store operations xa_store() differs from radix_tree_insert() in that it will overwrite an existing element in the array rather than returning an error. This is the behaviour which most users want, and those that want more complex behaviour generally want to use the xas family of routines anyway. For memory allocation, xa_store() will first attempt to request memory from the slab allocator; if memory is not immediately available, it will drop the xa_lock and allocate memory, keeping a pointer in the xa_state. It does not use the per-CPU cache, although those will continue to exist until all radix tree users are converted to the xarray. This patch also includes xa_erase() and __xa_erase() for a streamlined way to store NULL. Since there is no need to allocate memory in order to store a NULL in the XArray, we do not need to trouble the user with deciding what memory allocation flags to use. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 147 ++++++- lib/radix-tree.c | 4 +- lib/test_xarray.c | 177 +++++++- lib/xarray.c | 693 +++++++++++++++++++++++++++++++ tools/include/linux/bitmap.h | 1 + tools/include/linux/spinlock.h | 2 + tools/testing/radix-tree/Makefile | 2 +- tools/testing/radix-tree/bitmap.c | 23 + tools/testing/radix-tree/linux/kernel.h | 4 + tools/testing/radix-tree/linux/lockdep.h | 11 + 10 files changed, 1055 insertions(+), 9 deletions(-) create mode 100644 tools/testing/radix-tree/bitmap.c create mode 100644 tools/testing/radix-tree/linux/lockdep.h diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 2de504ae9ba4..15eab63286ed 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -205,10 +205,17 @@ typedef unsigned __bitwise xa_mark_t; #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 +enum xa_lock_type { + XA_LOCK_IRQ = 1, + XA_LOCK_BH = 2, +}; + /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ +#define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) +#define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) @@ -267,6 +274,7 @@ struct xarray { void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); +void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -309,6 +317,23 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) return xa->xa_flags & XA_FLAGS_MARK(mark); } +/** + * xa_erase() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase(struct xarray *xa, unsigned long index) +{ + return xa_store(xa, index, NULL, 0); +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -322,11 +347,65 @@ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) spin_unlock_irqrestore(&(xa)->xa_lock, flags) /* - * Versions of the normal API which require the caller to hold the xa_lock. - */ + * Versions of the normal API which require the caller to hold the + * xa_lock. If the GFP flags allow it, they will drop the lock to + * allocate memory, then reacquire it afterwards. These functions + * may also re-enable interrupts if the XArray flags indicate the + * locking should be interrupt safe. + */ +void *__xa_erase(struct xarray *, unsigned long index); +void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); +/** + * xa_erase_bh() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling softirqs. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_bh(xa); + entry = __xa_erase(xa, index); + xa_unlock_bh(xa); + + return entry; +} + +/** + * xa_erase_irq() - Erase this entry from the XArray. + * @xa: XArray. + * @index: Index of entry. + * + * This function is the equivalent of calling xa_store() with %NULL as + * the third argument. The XArray does not need to allocate memory, so + * the user does not need to provide GFP flags. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. + * Return: The entry which used to be at this index. + */ +static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) +{ + void *entry; + + xa_lock_irq(xa); + entry = __xa_erase(xa, index); + xa_unlock_irq(xa); + + return entry; +} + /* Everything below here is the Advanced API. Proceed with caution. */ /* @@ -441,6 +520,12 @@ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, lockdep_is_held(&xa->xa_lock)); } +/* Private */ +static inline void *xa_mk_node(const struct xa_node *node) +{ + return (void *)((unsigned long)node | 2); +} + /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { @@ -647,6 +732,12 @@ static inline bool xas_not_node(struct xa_node *node) return ((unsigned long)node & 3) || !node; } +/* True if the node represents head-of-tree, RESTART or BOUNDS */ +static inline bool xas_top(struct xa_node *node) +{ + return node <= XAS_RESTART; +} + /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. @@ -683,10 +774,14 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry) } void *xas_load(struct xa_state *); +void *xas_store(struct xa_state *, void *entry); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); +void xas_init_marks(const struct xa_state *); + +bool xas_nomem(struct xa_state *, gfp_t); /** * xas_reload() - Refetch an entry from the xarray. @@ -711,4 +806,52 @@ static inline void *xas_reload(struct xa_state *xas) return xa_head(xas->xa); } +/** + * xas_set() - Set up XArray operation state for a different index. + * @xas: XArray operation state. + * @index: New index into the XArray. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see xas_next() + * to move to an adjacent index. + */ +static inline void xas_set(struct xa_state *xas, unsigned long index) +{ + xas->xa_index = index; + xas->xa_node = XAS_RESTART; +} + +/** + * xas_set_order() - Set up XArray operation state for a multislot entry. + * @xas: XArray operation state. + * @index: Target of the operation. + * @order: Entry occupies 2^@order indices. + */ +static inline void xas_set_order(struct xa_state *xas, unsigned long index, + unsigned int order) +{ +#ifdef CONFIG_XARRAY_MULTI + xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; + xas->xa_shift = order - (order % XA_CHUNK_SHIFT); + xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; + xas->xa_node = XAS_RESTART; +#else + BUG_ON(order > 0); + xas_set(xas, index); +#endif +} + +/** + * xas_set_update() - Set up XArray operation state for a callback. + * @xas: XArray operation state. + * @update: Function to call when updating a node. + * + * The XArray can notify a caller after it has updated an xa_node. + * This is advanced functionality and is only needed by the page cache. + */ +static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) +{ + xas->xa_update = update; +} + #endif /* _LINUX_XARRAY_H */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index b8e961428484..3479d93c32b9 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -47,7 +47,7 @@ static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; /* * Radix tree node cache. */ -static struct kmem_cache *radix_tree_node_cachep; +struct kmem_cache *radix_tree_node_cachep; /* * The radix tree is variable-height, so an insert operation not only has @@ -365,7 +365,7 @@ out: return ret; } -static void radix_tree_node_rcu_free(struct rcu_head *head) +void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); diff --git a/lib/test_xarray.c b/lib/test_xarray.c index f8b0e9ba80a4..b711030fbc32 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -30,13 +30,49 @@ void xa_dump(const struct xarray *xa) { } static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp) { - radix_tree_insert(xa, index, xa_mk_value(index)); - return NULL; + return xa_store(xa, index, xa_mk_value(index & LONG_MAX), gfp); } static void xa_erase_index(struct xarray *xa, unsigned long index) { - radix_tree_delete(xa, index); + XA_BUG_ON(xa, xa_erase(xa, index) != xa_mk_value(index & LONG_MAX)); + XA_BUG_ON(xa, xa_load(xa, index) != NULL); +} + +/* + * If anyone needs this, please move it to xarray.c. We have no current + * users outside the test suite because all current multislot users want + * to use the advanced API. + */ +static void *xa_store_order(struct xarray *xa, unsigned long index, + unsigned order, void *entry, gfp_t gfp) +{ + XA_STATE_ORDER(xas, xa, index, order); + void *curr; + + do { + xas_lock(&xas); + curr = xas_store(&xas, entry); + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return curr; +} + +static noinline void check_xa_err(struct xarray *xa) +{ + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 0, GFP_NOWAIT)) != 0); + XA_BUG_ON(xa, xa_err(xa_erase(xa, 0)) != 0); +#ifndef __KERNEL__ + /* The kernel does not fail GFP_NOWAIT allocations */ + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM); + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_NOWAIT)) != -ENOMEM); +#endif + XA_BUG_ON(xa, xa_err(xa_store_index(xa, 1, GFP_KERNEL)) != 0); + XA_BUG_ON(xa, xa_err(xa_store(xa, 1, xa_mk_value(0), GFP_KERNEL)) != 0); + XA_BUG_ON(xa, xa_err(xa_erase(xa, 1)) != 0); +// kills the test-suite :-( +// XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL); } static noinline void check_xa_load(struct xarray *xa) @@ -69,6 +105,9 @@ static noinline void check_xa_load(struct xarray *xa) static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) { + unsigned int order; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 8 : 1; + /* NULL elements have no marks set */ XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); xa_set_mark(xa, index, XA_MARK_0); @@ -90,6 +129,37 @@ static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); xa_set_mark(xa, index, XA_MARK_0); XA_BUG_ON(xa, xa_get_mark(xa, index, XA_MARK_0)); + + /* + * Storing a multi-index entry over entries with marks gives the + * entire entry the union of the marks + */ + BUG_ON((index % 4) != 0); + for (order = 2; order < max_order; order++) { + unsigned long base = round_down(index, 1UL << order); + unsigned long next = base + (1UL << order); + unsigned long i; + + XA_BUG_ON(xa, xa_store_index(xa, index + 1, GFP_KERNEL)); + xa_set_mark(xa, index + 1, XA_MARK_0); + XA_BUG_ON(xa, xa_store_index(xa, index + 2, GFP_KERNEL)); + xa_set_mark(xa, index + 2, XA_MARK_1); + XA_BUG_ON(xa, xa_store_index(xa, next, GFP_KERNEL)); + xa_store_order(xa, index, order, xa_mk_value(index), + GFP_KERNEL); + for (i = base; i < next; i++) { + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0)); + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_1)); + XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_2)); + } + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_0)); + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_1)); + XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_2)); + xa_erase_index(xa, index); + xa_erase_index(xa, next); + XA_BUG_ON(xa, !xa_empty(xa)); + } + XA_BUG_ON(xa, !xa_empty(xa)); } static noinline void check_xa_mark(struct xarray *xa) @@ -100,12 +170,111 @@ static noinline void check_xa_mark(struct xarray *xa) check_xa_mark_1(xa, index); } -static RADIX_TREE(array, GFP_KERNEL); +static noinline void check_xa_shrink(struct xarray *xa) +{ + XA_STATE(xas, xa, 1); + struct xa_node *node; + + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_store_index(xa, 0, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL); + + /* + * Check that erasing the entry at 1 shrinks the tree and properly + * marks the node as being deleted. + */ + xas_lock(&xas); + XA_BUG_ON(xa, xas_load(&xas) != xa_mk_value(1)); + node = xas.xa_node; + XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 1) != NULL); + XA_BUG_ON(xa, xas.xa_node != XAS_BOUNDS); + XA_BUG_ON(xa, xa_entry_locked(xa, node, 0) != XA_RETRY_ENTRY); + XA_BUG_ON(xa, xas_load(&xas) != NULL); + xas_unlock(&xas); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + xa_erase_index(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_multi_store(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned long i, j, k; + unsigned int max_order = (sizeof(long) == 4) ? 30 : 60; + + /* Loading from any position returns the same value */ + xa_store_order(xa, 0, 1, xa_mk_value(0), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 2) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 2); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2); + rcu_read_unlock(); + + /* Storing adjacent to the value does not alter the value */ + xa_store(xa, 3, xa, GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, 2) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 3); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 2); + rcu_read_unlock(); + + /* Overwriting multiple indexes works */ + xa_store_order(xa, 0, 2, xa_mk_value(1), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 1) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 2) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 3) != xa_mk_value(1)); + XA_BUG_ON(xa, xa_load(xa, 4) != NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->count != 4); + XA_BUG_ON(xa, xa_to_node(xa_head(xa))->nr_values != 4); + rcu_read_unlock(); + + /* We can erase multiple values with a single store */ + xa_store_order(xa, 0, 63, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Even when the first slot is empty but the others aren't */ + xa_store_index(xa, 1, GFP_KERNEL); + xa_store_index(xa, 2, GFP_KERNEL); + xa_store_order(xa, 0, 2, NULL, GFP_KERNEL); + XA_BUG_ON(xa, !xa_empty(xa)); + + for (i = 0; i < max_order; i++) { + for (j = 0; j < max_order; j++) { + xa_store_order(xa, 0, i, xa_mk_value(i), GFP_KERNEL); + xa_store_order(xa, 0, j, xa_mk_value(j), GFP_KERNEL); + + for (k = 0; k < max_order; k++) { + void *entry = xa_load(xa, (1UL << k) - 1); + if ((i < k) && (j < k)) + XA_BUG_ON(xa, entry != NULL); + else + XA_BUG_ON(xa, entry != xa_mk_value(j)); + } + + xa_erase(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + } + } +#endif +} + +static DEFINE_XARRAY(array); static int xarray_checks(void) { + check_xa_err(&array); check_xa_load(&array); check_xa_mark(&array); + check_xa_shrink(&array); + check_multi_store(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index aa86c47e532f..4596a95ed9cd 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -7,6 +7,8 @@ #include #include +#include +#include #include /* @@ -25,6 +27,31 @@ * @entry refers to something stored in a slot in the xarray */ +static inline unsigned int xa_lock_type(const struct xarray *xa) +{ + return (__force unsigned int)xa->xa_flags & 3; +} + +static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_lock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_lock_bh(xas); + else + xas_lock(xas); +} + +static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) +{ + if (lock_type == XA_LOCK_IRQ) + xas_unlock_irq(xas); + else if (lock_type == XA_LOCK_BH) + xas_unlock_bh(xas); + else + xas_unlock(xas); +} + static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) { if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) @@ -67,6 +94,34 @@ static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); } +#define mark_inc(mark) do { \ + mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ +} while (0) + +/* + * xas_squash_marks() - Merge all marks to the first entry + * @xas: Array operation state. + * + * Set a mark on the first entry if any entry has it set. Clear marks on + * all sibling entries. + */ +static void xas_squash_marks(const struct xa_state *xas) +{ + unsigned int mark = 0; + unsigned int limit = xas->xa_offset + xas->xa_sibs + 1; + + if (!xas->xa_sibs) + return; + + do { + unsigned long *marks = xas->xa_node->marks[mark]; + if (find_next_bit(marks, limit, xas->xa_offset + 1) == limit) + continue; + __set_bit(xas->xa_offset, marks); + bitmap_clear(marks, xas->xa_offset + 1, xas->xa_sibs); + } while (mark++ != (__force unsigned)XA_MARK_MAX); +} + /* extracts the offset within this node from the index */ static unsigned int get_offset(unsigned long index, struct xa_node *node) { @@ -161,6 +216,516 @@ void *xas_load(struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_load); +/* Move the radix tree node cache here */ +extern struct kmem_cache *radix_tree_node_cachep; +extern void radix_tree_node_rcu_free(struct rcu_head *head); + +#define XA_RCU_FREE ((struct xarray *)1) + +static void xa_node_free(struct xa_node *node) +{ + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->array = XA_RCU_FREE; + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); +} + +/* + * xas_destroy() - Free any resources allocated during the XArray operation. + * @xas: XArray operation state. + * + * This function is now internal-only. + */ +static void xas_destroy(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_alloc; + + if (!node) + return; + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + kmem_cache_free(radix_tree_node_cachep, node); + xas->xa_alloc = NULL; +} + +/** + * xas_nomem() - Allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * If we need to add new nodes to the XArray, we try to allocate memory + * with GFP_NOWAIT while holding the lock, which will usually succeed. + * If it fails, @xas is flagged as needing memory to continue. The caller + * should drop the lock and call xas_nomem(). If xas_nomem() succeeds, + * the caller should retry the operation. + * + * Forward progress is guaranteed as one node is allocated here and + * stored in the xa_state where it will be found by xas_alloc(). More + * nodes will likely be found in the slab allocator, but we do not tie + * them up here. + * + * Return: true if memory was needed, and was successfully allocated. + */ +bool xas_nomem(struct xa_state *xas, gfp_t gfp) +{ + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} +EXPORT_SYMBOL_GPL(xas_nomem); + +/* + * __xas_nomem() - Drop locks and allocate memory if needed. + * @xas: XArray operation state. + * @gfp: Memory allocation flags. + * + * Internal variant of xas_nomem(). + * + * Return: true if memory was needed, and was successfully allocated. + */ +static bool __xas_nomem(struct xa_state *xas, gfp_t gfp) + __must_hold(xas->xa->xa_lock) +{ + unsigned int lock_type = xa_lock_type(xas->xa); + + if (xas->xa_node != XA_ERROR(-ENOMEM)) { + xas_destroy(xas); + return false; + } + if (gfpflags_allow_blocking(gfp)) { + xas_unlock_type(xas, lock_type); + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + xas_lock_type(xas, lock_type); + } else { + xas->xa_alloc = kmem_cache_alloc(radix_tree_node_cachep, gfp); + } + if (!xas->xa_alloc) + return false; + XA_NODE_BUG_ON(xas->xa_alloc, !list_empty(&xas->xa_alloc->private_list)); + xas->xa_node = XAS_RESTART; + return true; +} + +static void xas_update(struct xa_state *xas, struct xa_node *node) +{ + if (xas->xa_update) + xas->xa_update(node); + else + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); +} + +static void *xas_alloc(struct xa_state *xas, unsigned int shift) +{ + struct xa_node *parent = xas->xa_node; + struct xa_node *node = xas->xa_alloc; + + if (xas_invalid(xas)) + return NULL; + + if (node) { + xas->xa_alloc = NULL; + } else { + node = kmem_cache_alloc(radix_tree_node_cachep, + GFP_NOWAIT | __GFP_NOWARN); + if (!node) { + xas_set_err(xas, -ENOMEM); + return NULL; + } + } + + if (parent) { + node->offset = xas->xa_offset; + parent->count++; + XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE); + xas_update(xas, parent); + } + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + XA_NODE_BUG_ON(node, !list_empty(&node->private_list)); + node->shift = shift; + node->count = 0; + node->nr_values = 0; + RCU_INIT_POINTER(node->parent, xas->xa_node); + node->array = xas->xa; + + return node; +} + +/* + * Use this to calculate the maximum index that will need to be created + * in order to add the entry described by @xas. Because we cannot store a + * multiple-index entry at index 0, the calculation is a little more complex + * than you might expect. + */ +static unsigned long xas_max(struct xa_state *xas) +{ + unsigned long max = xas->xa_index; + +#ifdef CONFIG_XARRAY_MULTI + if (xas->xa_shift || xas->xa_sibs) { + unsigned long mask; + mask = (((xas->xa_sibs + 1UL) << xas->xa_shift) - 1); + max |= mask; + if (mask == max) + max++; + } +#endif + + return max; +} + +/* The maximum index that can be contained in the array without expanding it */ +static unsigned long max_index(void *entry) +{ + if (!xa_is_node(entry)) + return 0; + return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1; +} + +static void xas_shrink(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = xas->xa_node; + + for (;;) { + void *entry; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count != 1) + break; + entry = xa_entry_locked(xa, node, 0); + if (!entry) + break; + if (!xa_is_node(entry) && node->shift) + break; + xas->xa_node = XAS_BOUNDS; + + RCU_INIT_POINTER(xa->xa_head, entry); + + node->count = 0; + node->nr_values = 0; + if (!xa_is_node(entry)) + RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY); + xas_update(xas, node); + xa_node_free(node); + if (!xa_is_node(entry)) + break; + node = xa_to_node(entry); + node->parent = NULL; + } +} + +/* + * xas_delete_node() - Attempt to delete an xa_node + * @xas: Array operation state. + * + * Attempts to delete the @xas->xa_node. This will fail if xa->node has + * a non-zero reference count. + */ +static void xas_delete_node(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + for (;;) { + struct xa_node *parent; + + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + if (node->count) + break; + + parent = xa_parent_locked(xas->xa, node); + xas->xa_node = parent; + xas->xa_offset = node->offset; + xa_node_free(node); + + if (!parent) { + xas->xa->xa_head = NULL; + xas->xa_node = XAS_BOUNDS; + return; + } + + parent->slots[xas->xa_offset] = NULL; + parent->count--; + XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE); + node = parent; + xas_update(xas, node); + } + + if (!node->parent) + xas_shrink(xas); +} + +/** + * xas_free_nodes() - Free this node and all nodes that it references + * @xas: Array operation state. + * @top: Node to free + * + * This node has been removed from the tree. We must now free it and all + * of its subnodes. There may be RCU walkers with references into the tree, + * so we must replace all entries with retry markers. + */ +static void xas_free_nodes(struct xa_state *xas, struct xa_node *top) +{ + unsigned int offset = 0; + struct xa_node *node = top; + + for (;;) { + void *entry = xa_entry_locked(xas->xa, node, offset); + + if (xa_is_node(entry)) { + node = xa_to_node(entry); + offset = 0; + continue; + } + if (entry) + RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY); + offset++; + while (offset == XA_CHUNK_SIZE) { + struct xa_node *parent; + + parent = xa_parent_locked(xas->xa, node); + offset = node->offset + 1; + node->count = 0; + node->nr_values = 0; + xas_update(xas, node); + xa_node_free(node); + if (node == top) + return; + node = parent; + } + } +} + +/* + * xas_expand adds nodes to the head of the tree until it has reached + * sufficient height to be able to contain @xas->xa_index + */ +static int xas_expand(struct xa_state *xas, void *head) +{ + struct xarray *xa = xas->xa; + struct xa_node *node = NULL; + unsigned int shift = 0; + unsigned long max = xas_max(xas); + + if (!head) { + if (max == 0) + return 0; + while ((max >> shift) >= XA_CHUNK_SIZE) + shift += XA_CHUNK_SHIFT; + return shift + XA_CHUNK_SHIFT; + } else if (xa_is_node(head)) { + node = xa_to_node(head); + shift = node->shift + XA_CHUNK_SHIFT; + } + xas->xa_node = NULL; + + while (max > max_index(head)) { + xa_mark_t mark = 0; + + XA_NODE_BUG_ON(node, shift > BITS_PER_LONG); + node = xas_alloc(xas, shift); + if (!node) + return -ENOMEM; + + node->count = 1; + if (xa_is_value(head)) + node->nr_values = 1; + RCU_INIT_POINTER(node->slots[0], head); + + /* Propagate the aggregated mark info to the new child */ + for (;;) { + if (xa_marked(xa, mark)) + node_set_mark(node, 0, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } + + /* + * Now that the new node is fully initialised, we can add + * it to the tree + */ + if (xa_is_node(head)) { + xa_to_node(head)->offset = 0; + rcu_assign_pointer(xa_to_node(head)->parent, node); + } + head = xa_mk_node(node); + rcu_assign_pointer(xa->xa_head, head); + xas_update(xas, node); + + shift += XA_CHUNK_SHIFT; + } + + xas->xa_node = node; + return shift; +} + +/* + * xas_create() - Create a slot to store an entry in. + * @xas: XArray operation state. + * + * Most users will not need to call this function directly, as it is called + * by xas_store(). It is useful for doing conditional store operations + * (see the xa_cmpxchg() implementation for an example). + * + * Return: If the slot already existed, returns the contents of this slot. + * If the slot was newly created, returns NULL. If it failed to create the + * slot, returns NULL and indicates the error in @xas. + */ +static void *xas_create(struct xa_state *xas) +{ + struct xarray *xa = xas->xa; + void *entry; + void __rcu **slot; + struct xa_node *node = xas->xa_node; + int shift; + unsigned int order = xas->xa_shift; + + if (xas_top(node)) { + entry = xa_head_locked(xa); + xas->xa_node = NULL; + shift = xas_expand(xas, entry); + if (shift < 0) + return NULL; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } else if (xas_error(xas)) { + return NULL; + } else if (node) { + unsigned int offset = xas->xa_offset; + + shift = node->shift; + entry = xa_entry_locked(xa, node, offset); + slot = &node->slots[offset]; + } else { + shift = 0; + entry = xa_head_locked(xa); + slot = &xa->xa_head; + } + + while (shift > order) { + shift -= XA_CHUNK_SHIFT; + if (!entry) { + node = xas_alloc(xas, shift); + if (!node) + break; + rcu_assign_pointer(*slot, xa_mk_node(node)); + } else if (xa_is_node(entry)) { + node = xa_to_node(entry); + } else { + break; + } + entry = xas_descend(xas, node); + slot = &node->slots[xas->xa_offset]; + } + + return entry; +} + +static void update_node(struct xa_state *xas, struct xa_node *node, + int count, int values) +{ + if (!node || (!count && !values)) + return; + + node->count += count; + node->nr_values += values; + XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE); + XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE); + xas_update(xas, node); + if (count < 0) + xas_delete_node(xas); +} + +/** + * xas_store() - Store this entry in the XArray. + * @xas: XArray operation state. + * @entry: New entry. + * + * If @xas is operating on a multi-index entry, the entry returned by this + * function is essentially meaningless (it may be an internal entry or it + * may be %NULL, even if there are non-NULL entries at some of the indices + * covered by the range). This is not a problem for any current users, + * and can be changed if needed. + * + * Return: The old entry at this index. + */ +void *xas_store(struct xa_state *xas, void *entry) +{ + struct xa_node *node; + void __rcu **slot = &xas->xa->xa_head; + unsigned int offset, max; + int count = 0; + int values = 0; + void *first, *next; + bool value = xa_is_value(entry); + + if (entry) + first = xas_create(xas); + else + first = xas_load(xas); + + if (xas_invalid(xas)) + return first; + node = xas->xa_node; + if (node && (xas->xa_shift < node->shift)) + xas->xa_sibs = 0; + if ((first == entry) && !xas->xa_sibs) + return first; + + next = first; + offset = xas->xa_offset; + max = xas->xa_offset + xas->xa_sibs; + if (node) { + slot = &node->slots[offset]; + if (xas->xa_sibs) + xas_squash_marks(xas); + } + if (!entry) + xas_init_marks(xas); + + for (;;) { + /* + * Must clear the marks before setting the entry to NULL, + * otherwise xas_for_each_marked may find a NULL entry and + * stop early. rcu_assign_pointer contains a release barrier + * so the mark clearing will appear to happen before the + * entry is set to NULL. + */ + rcu_assign_pointer(*slot, entry); + if (xa_is_node(next)) + xas_free_nodes(xas, xa_to_node(next)); + if (!node) + break; + count += !next - !entry; + values += !xa_is_value(first) - !value; + if (entry) { + if (offset == max) + break; + if (!xa_is_sibling(entry)) + entry = xa_mk_sibling(xas->xa_offset); + } else { + if (offset == XA_CHUNK_MASK) + break; + } + next = xa_entry_locked(xas->xa, node, ++offset); + if (!xa_is_sibling(next)) { + if (!entry && (offset > max)) + break; + first = next; + } + slot++; + } + + update_node(xas, node, count, values); + return first; +} +EXPORT_SYMBOL_GPL(xas_store); + /** * xas_get_mark() - Returns the state of this mark. * @xas: XArray operation state. @@ -240,6 +805,30 @@ void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark) } EXPORT_SYMBOL_GPL(xas_clear_mark); +/** + * xas_init_marks() - Initialise all marks for the entry + * @xas: Array operations state. + * + * Initialise all marks for the entry specified by @xas. If we're tracking + * free entries with a mark, we need to set it on all entries. All other + * marks are cleared. + * + * This implementation is not as efficient as it could be; we may walk + * up the tree multiple times. + */ +void xas_init_marks(const struct xa_state *xas) +{ + xa_mark_t mark = 0; + + for (;;) { + xas_clear_mark(xas, mark); + if (mark == XA_MARK_MAX) + break; + mark_inc(mark); + } +} +EXPORT_SYMBOL_GPL(xas_init_marks); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. @@ -253,9 +842,19 @@ EXPORT_SYMBOL_GPL(xas_clear_mark); */ void xa_init_flags(struct xarray *xa, gfp_t flags) { + unsigned int lock_type; + static struct lock_class_key xa_lock_irq; + static struct lock_class_key xa_lock_bh; + spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; + + lock_type = xa_lock_type(xa); + if (lock_type == XA_LOCK_IRQ) + lockdep_set_class(&xa->xa_lock, &xa_lock_irq); + else if (lock_type == XA_LOCK_BH) + lockdep_set_class(&xa->xa_lock, &xa_lock_bh); } EXPORT_SYMBOL(xa_init_flags); @@ -282,6 +881,100 @@ void *xa_load(struct xarray *xa, unsigned long index) } EXPORT_SYMBOL(xa_load); +static void *xas_result(struct xa_state *xas, void *curr) +{ + XA_NODE_BUG_ON(xas->xa_node, xa_is_internal(curr)); + if (xas_error(xas)) + curr = xas->xa_node; + return curr; +} + +/** + * __xa_erase() - Erase this entry from the XArray while locked. + * @xa: XArray. + * @index: Index into array. + * + * If the entry at this index is a multi-index entry then all indices will + * be erased, and the entry will no longer be a multi-index entry. + * This function expects the xa_lock to be held on entry. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index. + */ +void *__xa_erase(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + return xas_result(&xas, xas_store(&xas, NULL)); +} +EXPORT_SYMBOL_GPL(__xa_erase); + +/** + * xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from this index will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry + * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation + * failed. + */ +void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + curr = xas_store(&xas, entry); + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(xa_store); + +/** + * __xa_store() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_store(&xas, entry); + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_store); + /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index e63662db131b..05dca5c203f3 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -15,6 +15,7 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int bits); +void bitmap_clear(unsigned long *map, unsigned int start, int len); #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index 622266b197d0..c934572d935c 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -37,4 +37,6 @@ static inline bool arch_spin_is_locked(arch_spinlock_t *mutex) return true; } +#include + #endif diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index 1379f1d78d0b..acf1afa01c5b 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -5,7 +5,7 @@ CFLAGS += -I. -I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \ LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu TARGETS = main idr-test multiorder xarray -CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o +CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ tag_check.o multiorder.o idr-test.o iteration_check.o benchmark.o diff --git a/tools/testing/radix-tree/bitmap.c b/tools/testing/radix-tree/bitmap.c new file mode 100644 index 000000000000..66ec4a24a203 --- /dev/null +++ b/tools/testing/radix-tree/bitmap.c @@ -0,0 +1,23 @@ +/* lib/bitmap.c pulls in at least two other files. */ + +#include + +void bitmap_clear(unsigned long *map, unsigned int start, int len) +{ + unsigned long *p = map + BIT_WORD(start); + const unsigned int size = start + len; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + while (len - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (len) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 5d06ac75a14d..4568248222ae 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -18,4 +18,8 @@ #define pr_debug printk #define pr_cont printk +#define __acquires(x) +#define __releases(x) +#define __must_hold(x) + #endif /* _KERNEL_H */ diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h new file mode 100644 index 000000000000..565fccdfe6e9 --- /dev/null +++ b/tools/testing/radix-tree/linux/lockdep.h @@ -0,0 +1,11 @@ +#ifndef _LINUX_LOCKDEP_H +#define _LINUX_LOCKDEP_H +struct lock_class_key { + unsigned int a; +}; + +static inline void lockdep_set_class(spinlock_t *lock, + struct lock_class_key *key) +{ +} +#endif /* _LINUX_LOCKDEP_H */ -- cgit v1.2.3 From 41aec91f55985e7f14ee75fe2f6e7bcfff0d0fae Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 10 Nov 2017 15:34:55 -0500 Subject: xarray: Add XArray conditional store operations Like cmpxchg(), xa_cmpxchg will only store to the index if the current entry matches the old entry. It returns the current entry, which is usually more useful than the errno returned by radix_tree_insert(). For the users who really only want the errno, the xa_insert() wrapper provides a more convenient calling convention. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 60 ++++++++++++++++++++++++++++++++++++++++++ lib/test_xarray.c | 20 ++++++++++++++ lib/xarray.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 15eab63286ed..7f740f675b96 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -275,6 +275,8 @@ struct xarray { void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *xa_cmpxchg(struct xarray *, unsigned long index, + void *old, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -334,6 +336,34 @@ static inline void *xa_erase(struct xarray *xa, unsigned long index) return xa_store(xa, index, NULL, 0); } +/** + * xa_insert() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * If you would rather see the existing entry in the array, use xa_cmpxchg(). + * This function is for users who don't care what the entry is, only that + * one is present. + * + * Context: Process context. Takes and releases the xa_lock. + * May sleep if the @gfp flags permit. + * Return: 0 if the store succeeded. -EEXIST if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int xa_insert(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr = xa_cmpxchg(xa, index, NULL, entry, gfp); + if (!curr) + return 0; + if (xa_is_err(curr)) + return xa_err(curr); + return -EEXIST; +} + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -355,9 +385,39 @@ static inline void *xa_erase(struct xarray *xa, unsigned long index) */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); +void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, + void *entry, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); +/** + * __xa_insert() - Store this entry in the XArray unless another entry is + * already present. + * @xa: XArray. + * @index: Index into array. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * If you would rather see the existing entry in the array, use __xa_cmpxchg(). + * This function is for users who don't care what the entry is, only that + * one is present. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if the @gfp flags permit. + * Return: 0 if the store succeeded. -EEXIST if another entry was present. + * -ENOMEM if memory could not be allocated. + */ +static inline int __xa_insert(struct xarray *xa, unsigned long index, + void *entry, gfp_t gfp) +{ + void *curr = __xa_cmpxchg(xa, index, NULL, entry, gfp); + if (!curr) + return 0; + if (xa_is_err(curr)) + return xa_err(curr); + return -EEXIST; +} + /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. diff --git a/lib/test_xarray.c b/lib/test_xarray.c index b711030fbc32..fb472258b639 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -198,6 +198,25 @@ static noinline void check_xa_shrink(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_cmpxchg(struct xarray *xa) +{ + void *FIVE = xa_mk_value(5); + void *SIX = xa_mk_value(6); + void *LOTS = xa_mk_value(12345678); + + XA_BUG_ON(xa, !xa_empty(xa)); + XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_insert(xa, 12345678, xa, GFP_KERNEL) != -EEXIST); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, SIX, FIVE, GFP_KERNEL) != LOTS); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, LOTS, FIVE, GFP_KERNEL) != LOTS); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, FIVE, LOTS, GFP_KERNEL) != FIVE); + XA_BUG_ON(xa, xa_cmpxchg(xa, 5, FIVE, NULL, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, xa_cmpxchg(xa, 5, NULL, FIVE, GFP_KERNEL) != NULL); + xa_erase_index(xa, 12345678); + xa_erase_index(xa, 5); + XA_BUG_ON(xa, !xa_empty(xa)); +} + static noinline void check_multi_store(struct xarray *xa) { #ifdef CONFIG_XARRAY_MULTI @@ -274,6 +293,7 @@ static int xarray_checks(void) check_xa_load(&array); check_xa_mark(&array); check_xa_shrink(&array); + check_cmpxchg(&array); check_multi_store(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); diff --git a/lib/xarray.c b/lib/xarray.c index 4596a95ed9cd..2ba5a98ec618 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -975,6 +975,77 @@ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) } EXPORT_SYMBOL(__xa_store); +/** + * xa_cmpxchg() - Conditionally replace an entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New value to place in array. + * @gfp: Memory allocation flags. + * + * If the entry at @index is the same as @old, replace it with @entry. + * If the return value is equal to @old, then the exchange was successful. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: The old value at this index or xa_err() if an error happened. + */ +void *xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + curr = xas_load(&xas); + if (curr == old) + xas_store(&xas, entry); + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(xa_cmpxchg); + +/** + * __xa_cmpxchg() - Store this entry in the XArray. + * @xa: XArray. + * @index: Index into array. + * @old: Old value to test against. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * You must already be holding the xa_lock when calling this function. + * It will drop the lock if needed to allocate memory, and then reacquire + * it afterwards. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: The old entry at this index or xa_err() if an error happened. + */ +void *__xa_cmpxchg(struct xarray *xa, unsigned long index, + void *old, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + void *curr; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + + do { + curr = xas_load(&xas); + if (curr == old) + xas_store(&xas, entry); + } while (__xas_nomem(&xas, gfp)); + + return xas_result(&xas, curr); +} +EXPORT_SYMBOL(__xa_cmpxchg); + /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. -- cgit v1.2.3 From b803b42823d0d9e8b6deccf01ffc2aba5d0738df Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 14 Nov 2017 08:30:11 -0500 Subject: xarray: Add XArray iterators The xa_for_each iterator allows the user to efficiently walk a range of the array, executing the loop body once for each entry in that range that matches the filter. This commit also includes xa_find() and xa_find_after() which are helper functions for xa_for_each() but may also be useful in their own right. In the xas family of functions, we have xas_for_each(), xas_find(), xas_next_entry(), xas_for_each_tagged(), xas_find_tagged(), xas_next_tagged() and xas_pause(). Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 165 ++++++++++++++++++++++++++++ lib/test_xarray.c | 183 +++++++++++++++++++++++++++++++ lib/xarray.c | 292 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 640 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 7f740f675b96..66b10efa5a50 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -280,6 +280,10 @@ void *xa_cmpxchg(struct xarray *, unsigned long index, bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); +void *xa_find(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +void *xa_find_after(struct xarray *xa, unsigned long *index, + unsigned long max, xa_mark_t) __attribute__((nonnull(2))); /** * xa_init() - Initialise an empty XArray. @@ -364,6 +368,35 @@ static inline int xa_insert(struct xarray *xa, unsigned long index, return -EEXIST; } +/** + * xa_for_each() - Iterate over a portion of an XArray. + * @xa: XArray. + * @entry: Entry retrieved from array. + * @index: Index of @entry. + * @max: Maximum index to retrieve from array. + * @filter: Selection criterion. + * + * Initialise @index to the lowest index you want to retrieve from the + * array. During the iteration, @entry will have the value of the entry + * stored in @xa at @index. The iteration will skip all entries in the + * array which do not match @filter. You may modify @index during the + * iteration if you want to skip or reprocess indices. It is safe to modify + * the array during the iteration. At the end of the iteration, @entry will + * be set to NULL and @index will have a value less than or equal to max. + * + * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have + * to handle your own locking with xas_for_each(), and if you have to unlock + * after each iteration, it will also end up being O(n.log(n)). xa_for_each() + * will spin if it hits a retry entry; if you intend to see retry entries, + * you should use the xas_for_each() iterator instead. The xas_for_each() + * iterator will expand into more inline code than xa_for_each(). + * + * Context: Any context. Takes and releases the RCU lock. + */ +#define xa_for_each(xa, entry, index, max, filter) \ + for (entry = xa_find(xa, &index, max, filter); entry; \ + entry = xa_find_after(xa, &index, max, filter)) + #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) @@ -835,13 +868,16 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry) void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); +void *xas_find(struct xa_state *, unsigned long max); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); +void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); +void xas_pause(struct xa_state *); /** * xas_reload() - Refetch an entry from the xarray. @@ -914,4 +950,133 @@ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) xas->xa_update = update; } +/** + * xas_next_entry() - Advance iterator to next present entry. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * xas_next_entry() is an inline function to optimise xarray traversal for + * speed. It is equivalent to calling xas_find(), and will call xas_find() + * for all the hard cases. + * + * Return: The next present entry after the one currently referred to by @xas. + */ +static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) +{ + struct xa_node *node = xas->xa_node; + void *entry; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) + return xas_find(xas, max); + + do { + if (unlikely(xas->xa_index >= max)) + return xas_find(xas, max); + if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) + return xas_find(xas, max); + entry = xa_entry(xas->xa, node, xas->xa_offset + 1); + if (unlikely(xa_is_internal(entry))) + return xas_find(xas, max); + xas->xa_offset++; + xas->xa_index++; + } while (!entry); + + return entry; +} + +/* Private */ +static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, + xa_mark_t mark) +{ + unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; + unsigned int offset = xas->xa_offset; + + if (advance) + offset++; + if (XA_CHUNK_SIZE == BITS_PER_LONG) { + if (offset < XA_CHUNK_SIZE) { + unsigned long data = *addr & (~0UL << offset); + if (data) + return __ffs(data); + } + return XA_CHUNK_SIZE; + } + + return find_next_bit(addr, XA_CHUNK_SIZE, offset); +} + +/** + * xas_next_marked() - Advance iterator to next marked entry. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark to search for. + * + * xas_next_marked() is an inline function to optimise xarray traversal for + * speed. It is equivalent to calling xas_find_marked(), and will call + * xas_find_marked() for all the hard cases. + * + * Return: The next marked entry after the one currently referred to by @xas. + */ +static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct xa_node *node = xas->xa_node; + unsigned int offset; + + if (unlikely(xas_not_node(node) || node->shift)) + return xas_find_marked(xas, max, mark); + offset = xas_find_chunk(xas, true, mark); + xas->xa_offset = offset; + xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; + if (xas->xa_index > max) + return NULL; + if (offset == XA_CHUNK_SIZE) + return xas_find_marked(xas, max, mark); + return xa_entry(xas->xa, node, offset); +} + +/* + * If iterating while holding a lock, drop the lock and reschedule + * every %XA_CHECK_SCHED loops. + */ +enum { + XA_CHECK_SCHED = 4096, +}; + +/** + * xas_for_each() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * @max: Maximum index to retrieve from array. + * + * The loop body will be executed for each entry present in the xarray + * between the current xas position and @max. @entry will be set to + * the entry retrieved from the xarray. It is safe to delete entries + * from the array in the loop body. You should hold either the RCU lock + * or the xa_lock while iterating. If you need to drop the lock, call + * xas_pause() first. + */ +#define xas_for_each(xas, entry, max) \ + for (entry = xas_find(xas, max); entry; \ + entry = xas_next_entry(xas, max)) + +/** + * xas_for_each_marked() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * @max: Maximum index to retrieve from array. + * @mark: Mark to search for. + * + * The loop body will be executed for each marked entry in the xarray + * between the current xas position and @max. @entry will be set to + * the entry retrieved from the xarray. It is safe to delete entries + * from the array in the loop body. You should hold either the RCU lock + * or the xa_lock while iterating. If you need to drop the lock, call + * xas_pause() first. + */ +#define xas_for_each_marked(xas, entry, max, mark) \ + for (entry = xas_find_marked(xas, max, mark); entry; \ + entry = xas_next_marked(xas, max, mark)) + #endif /* _LINUX_XARRAY_H */ diff --git a/lib/test_xarray.c b/lib/test_xarray.c index fb472258b639..e3c2d4d00b15 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -75,6 +75,48 @@ static noinline void check_xa_err(struct xarray *xa) // XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL); } +static noinline void check_xas_retry(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + void *entry; + + xa_store_index(xa, 0, GFP_KERNEL); + xa_store_index(xa, 1, GFP_KERNEL); + + rcu_read_lock(); + XA_BUG_ON(xa, xas_find(&xas, ULONG_MAX) != xa_mk_value(0)); + xa_erase_index(xa, 1); + XA_BUG_ON(xa, !xa_is_retry(xas_reload(&xas))); + XA_BUG_ON(xa, xas_retry(&xas, NULL)); + XA_BUG_ON(xa, xas_retry(&xas, xa_mk_value(0))); + xas_reset(&xas); + XA_BUG_ON(xa, xas.xa_node != XAS_RESTART); + XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0)); + XA_BUG_ON(xa, xas.xa_node != NULL); + + XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL); + XA_BUG_ON(xa, !xa_is_internal(xas_reload(&xas))); + xas.xa_node = XAS_RESTART; + XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0)); + rcu_read_unlock(); + + /* Make sure we can iterate through retry entries */ + xas_lock(&xas); + xas_set(&xas, 0); + xas_store(&xas, XA_RETRY_ENTRY); + xas_set(&xas, 1); + xas_store(&xas, XA_RETRY_ENTRY); + + xas_set(&xas, 0); + xas_for_each(&xas, entry, ULONG_MAX) { + xas_store(&xas, xa_mk_value(xas.xa_index)); + } + xas_unlock(&xas); + + xa_erase_index(xa, 0); + xa_erase_index(xa, 1); +} + static noinline void check_xa_load(struct xarray *xa) { unsigned long i, j; @@ -217,6 +259,44 @@ static noinline void check_cmpxchg(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_xas_erase(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + void *entry; + unsigned long i, j; + + for (i = 0; i < 200; i++) { + for (j = i; j < 2 * i + 17; j++) { + xas_set(&xas, j); + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(j)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + } + + xas_set(&xas, ULONG_MAX); + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(0)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + xas_lock(&xas); + xas_store(&xas, NULL); + + xas_set(&xas, 0); + j = i; + xas_for_each(&xas, entry, ULONG_MAX) { + XA_BUG_ON(xa, entry != xa_mk_value(j)); + xas_store(&xas, NULL); + j++; + } + xas_unlock(&xas); + XA_BUG_ON(xa, !xa_empty(xa)); + } +} + static noinline void check_multi_store(struct xarray *xa) { #ifdef CONFIG_XARRAY_MULTI @@ -285,16 +365,119 @@ static noinline void check_multi_store(struct xarray *xa) #endif } +static noinline void check_multi_find(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned long index; + + xa_store_order(xa, 12, 2, xa_mk_value(12), GFP_KERNEL); + XA_BUG_ON(xa, xa_store_index(xa, 16, GFP_KERNEL) != NULL); + + index = 0; + XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) != + xa_mk_value(12)); + XA_BUG_ON(xa, index != 12); + index = 13; + XA_BUG_ON(xa, xa_find(xa, &index, ULONG_MAX, XA_PRESENT) != + xa_mk_value(12)); + XA_BUG_ON(xa, (index < 12) || (index >= 16)); + XA_BUG_ON(xa, xa_find_after(xa, &index, ULONG_MAX, XA_PRESENT) != + xa_mk_value(16)); + XA_BUG_ON(xa, index != 16); + + xa_erase_index(xa, 12); + xa_erase_index(xa, 16); + XA_BUG_ON(xa, !xa_empty(xa)); +#endif +} + +static noinline void check_multi_find_2(struct xarray *xa) +{ + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 10 : 1; + unsigned int i, j; + void *entry; + + for (i = 0; i < max_order; i++) { + unsigned long index = 1UL << i; + for (j = 0; j < index; j++) { + XA_STATE(xas, xa, j + index); + xa_store_index(xa, index - 1, GFP_KERNEL); + xa_store_order(xa, index, i, xa_mk_value(index), + GFP_KERNEL); + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + xa_erase_index(xa, index); + } + rcu_read_unlock(); + xa_erase_index(xa, index - 1); + XA_BUG_ON(xa, !xa_empty(xa)); + } + } +} + +static noinline void check_find(struct xarray *xa) +{ + unsigned long i, j, k; + + XA_BUG_ON(xa, !xa_empty(xa)); + + /* + * Check xa_find with all pairs between 0 and 99 inclusive, + * starting at every index between 0 and 99 + */ + for (i = 0; i < 100; i++) { + XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL); + xa_set_mark(xa, i, XA_MARK_0); + for (j = 0; j < i; j++) { + XA_BUG_ON(xa, xa_store_index(xa, j, GFP_KERNEL) != + NULL); + xa_set_mark(xa, j, XA_MARK_0); + for (k = 0; k < 100; k++) { + unsigned long index = k; + void *entry = xa_find(xa, &index, ULONG_MAX, + XA_PRESENT); + if (k <= j) + XA_BUG_ON(xa, index != j); + else if (k <= i) + XA_BUG_ON(xa, index != i); + else + XA_BUG_ON(xa, entry != NULL); + + index = k; + entry = xa_find(xa, &index, ULONG_MAX, + XA_MARK_0); + if (k <= j) + XA_BUG_ON(xa, index != j); + else if (k <= i) + XA_BUG_ON(xa, index != i); + else + XA_BUG_ON(xa, entry != NULL); + } + xa_erase_index(xa, j); + XA_BUG_ON(xa, xa_get_mark(xa, j, XA_MARK_0)); + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0)); + } + xa_erase_index(xa, i); + XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_0)); + } + XA_BUG_ON(xa, !xa_empty(xa)); + check_multi_find(xa); + check_multi_find_2(xa); +} + static DEFINE_XARRAY(array); static int xarray_checks(void) { check_xa_err(&array); + check_xas_retry(&array); check_xa_load(&array); check_xa_mark(&array); check_xa_shrink(&array); + check_xas_erase(&array); check_cmpxchg(&array); check_multi_store(&array); + check_find(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index 2ba5a98ec618..24494f42daa6 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -128,6 +128,11 @@ static unsigned int get_offset(unsigned long index, struct xa_node *node) return (index >> node->shift) & XA_CHUNK_MASK; } +static void xas_set_offset(struct xa_state *xas) +{ + xas->xa_offset = get_offset(xas->xa_index, xas->xa_node); +} + /* move the index either forwards (find) or backwards (sibling slot) */ static void xas_move_index(struct xa_state *xas, unsigned long offset) { @@ -136,6 +141,12 @@ static void xas_move_index(struct xa_state *xas, unsigned long offset) xas->xa_index += offset << shift; } +static void xas_advance(struct xa_state *xas) +{ + xas->xa_offset++; + xas_move_index(xas, xas->xa_offset); +} + static void *set_bounds(struct xa_state *xas) { xas->xa_node = XAS_BOUNDS; @@ -829,6 +840,202 @@ void xas_init_marks(const struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_init_marks); +/** + * xas_pause() - Pause a walk to drop a lock. + * @xas: XArray operation state. + * + * Some users need to pause a walk and drop the lock they're holding in + * order to yield to a higher priority thread or carry out an operation + * on an entry. Those users should call this function before they drop + * the lock. It resets the @xas to be suitable for the next iteration + * of the loop after the user has reacquired the lock. If most entries + * found during a walk require you to call xas_pause(), the xa_for_each() + * iterator may be more appropriate. + * + * Note that xas_pause() only works for forward iteration. If a user needs + * to pause a reverse iteration, we will need a xas_pause_rev(). + */ +void xas_pause(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (xas_invalid(xas)) + return; + + if (node) { + unsigned int offset = xas->xa_offset; + while (++offset < XA_CHUNK_SIZE) { + if (!xa_is_sibling(xa_entry(xas->xa, node, offset))) + break; + } + xas->xa_index += (offset - xas->xa_offset) << node->shift; + } else { + xas->xa_index++; + } + xas->xa_node = XAS_RESTART; +} +EXPORT_SYMBOL_GPL(xas_pause); + +/** + * xas_find() - Find the next present entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * + * If the @xas has not yet been walked to an entry, return the entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we move to the + * next entry. + * + * If no entry is found and the array is smaller than @max, the iterator + * is set to the smallest index not yet in the array. This allows @xas + * to be immediately passed to xas_store(). + * + * Return: The entry, if found, otherwise %NULL. + */ +void *xas_find(struct xa_state *xas, unsigned long max) +{ + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + return set_bounds(xas); + } else if (xas_top(xas->xa_node)) { + entry = xas_load(xas); + if (entry || xas_not_node(xas->xa_node)) + return entry; + } else if (!xas->xa_node->shift && + xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) { + xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1; + } + + xas_advance(xas); + + while (xas->xa_node && (xas->xa_index <= max)) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_node(entry)) { + xas->xa_node = xa_to_node(entry); + xas->xa_offset = 0; + continue; + } + if (entry && !xa_is_sibling(entry)) + return entry; + + xas_advance(xas); + } + + if (!xas->xa_node) + xas->xa_node = XAS_BOUNDS; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find); + +/** + * xas_find_marked() - Find the next marked entry in the XArray. + * @xas: XArray operation state. + * @max: Highest index to return. + * @mark: Mark number to search for. + * + * If the @xas has not yet been walked to an entry, return the marked entry + * which has an index >= xas.xa_index. If it has been walked, the entry + * currently being pointed at has been processed, and so we return the + * first marked entry with an index > xas.xa_index. + * + * If no marked entry is found and the array is smaller than @max, @xas is + * set to the bounds state and xas->xa_index is set to the smallest index + * not yet in the array. This allows @xas to be immediately passed to + * xas_store(). + * + * If no entry is found before @max is reached, @xas is set to the restart + * state. + * + * Return: The entry, if found, otherwise %NULL. + */ +void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) +{ + bool advance = true; + unsigned int offset; + void *entry; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) { + xas->xa_index = 1; + goto out; + } else if (xas_top(xas->xa_node)) { + advance = false; + entry = xa_head(xas->xa); + xas->xa_node = NULL; + if (xas->xa_index > max_index(entry)) + goto bounds; + if (!xa_is_node(entry)) { + if (xa_marked(xas->xa, mark)) + return entry; + xas->xa_index = 1; + goto out; + } + xas->xa_node = xa_to_node(entry); + xas->xa_offset = xas->xa_index >> xas->xa_node->shift; + } + + while (xas->xa_index <= max) { + if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + advance = false; + continue; + } + + if (!advance) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (xa_is_sibling(entry)) { + xas->xa_offset = xa_to_sibling(entry); + xas_move_index(xas, xas->xa_offset); + } + } + + offset = xas_find_chunk(xas, advance, mark); + if (offset > xas->xa_offset) { + advance = false; + xas_move_index(xas, offset); + /* Mind the wrap */ + if ((xas->xa_index - 1) >= max) + goto max; + xas->xa_offset = offset; + if (offset == XA_CHUNK_SIZE) + continue; + } + + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } + +out: + if (!max) + goto max; +bounds: + xas->xa_node = XAS_BOUNDS; + return NULL; +max: + xas->xa_node = XAS_RESTART; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_marked); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. @@ -1152,6 +1359,91 @@ void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark) } EXPORT_SYMBOL(xa_clear_mark); +/** + * xa_find() - Search the XArray for an entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. + * + * Finds the entry in @xa which matches the @filter, and has the lowest + * index that is at least @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may not find + * entries which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The entry, if found, otherwise %NULL. + */ +void *xa_find(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp); + void *entry; + + rcu_read_lock(); + do { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + } while (xas_retry(&xas, entry)); + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find); + +/** + * xa_find_after() - Search the XArray for a present entry. + * @xa: XArray. + * @indexp: Pointer to an index. + * @max: Maximum index to search to. + * @filter: Selection criterion. + * + * Finds the entry in @xa which matches the @filter and has the lowest + * index that is above @indexp and no more than @max. + * If an entry is found, @indexp is updated to be the index of the entry. + * This function is protected by the RCU read lock, so it may miss entries + * which are being simultaneously added. It will not return an + * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find(). + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The pointer, if found, otherwise %NULL. + */ +void *xa_find_after(struct xarray *xa, unsigned long *indexp, + unsigned long max, xa_mark_t filter) +{ + XA_STATE(xas, xa, *indexp + 1); + void *entry; + + rcu_read_lock(); + for (;;) { + if ((__force unsigned int)filter < XA_MAX_MARKS) + entry = xas_find_marked(&xas, max, filter); + else + entry = xas_find(&xas, max); + if (xas.xa_shift) { + if (xas.xa_index & ((1UL << xas.xa_shift) - 1)) + continue; + } else { + if (xas.xa_offset < (xas.xa_index & XA_CHUNK_MASK)) + continue; + } + if (!xas_retry(&xas, entry)) + break; + } + rcu_read_unlock(); + + if (entry) + *indexp = xas.xa_index; + return entry; +} +EXPORT_SYMBOL(xa_find_after); + #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { -- cgit v1.2.3 From 80a0a1a9a3cde9b23851e8eb7160e2786549306a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 14 Nov 2017 16:42:22 -0500 Subject: xarray: Extract entries from an XArray The xa_extract function combines the functionality of radix_tree_gang_lookup() and radix_tree_gang_lookup_tagged(). It extracts entries matching the specified filter into a normal array. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 2 ++ lib/xarray.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 66b10efa5a50..82ceed981924 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -284,6 +284,8 @@ void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); +unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t); /** * xa_init() - Initialise an empty XArray. diff --git a/lib/xarray.c b/lib/xarray.c index 24494f42daa6..70e04810c8c2 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1444,6 +1444,86 @@ void *xa_find_after(struct xarray *xa, unsigned long *indexp, } EXPORT_SYMBOL(xa_find_after); +static unsigned int xas_extract_present(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each(xas, entry, max) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +static unsigned int xas_extract_marked(struct xa_state *xas, void **dst, + unsigned long max, unsigned int n, xa_mark_t mark) +{ + void *entry; + unsigned int i = 0; + + rcu_read_lock(); + xas_for_each_marked(xas, entry, max, mark) { + if (xas_retry(xas, entry)) + continue; + dst[i++] = entry; + if (i == n) + break; + } + rcu_read_unlock(); + + return i; +} + +/** + * xa_extract() - Copy selected entries from the XArray into a normal array. + * @xa: The source XArray to copy from. + * @dst: The buffer to copy entries into. + * @start: The first index in the XArray eligible to be selected. + * @max: The last index in the XArray eligible to be selected. + * @n: The maximum number of entries to copy. + * @filter: Selection criterion. + * + * Copies up to @n entries that match @filter from the XArray. The + * copied entries will have indices between @start and @max, inclusive. + * + * The @filter may be an XArray mark value, in which case entries which are + * marked with that mark will be copied. It may also be %XA_PRESENT, in + * which case all entries which are not NULL will be copied. + * + * The entries returned may not represent a snapshot of the XArray at a + * moment in time. For example, if another thread stores to index 5, then + * index 10, calling xa_extract() may return the old contents of index 5 + * and the new contents of index 10. Indices not modified while this + * function is running will not be skipped. + * + * If you need stronger guarantees, holding the xa_lock across calls to this + * function will prevent concurrent modification. + * + * Context: Any context. Takes and releases the RCU lock. + * Return: The number of entries copied. + */ +unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, + unsigned long max, unsigned int n, xa_mark_t filter) +{ + XA_STATE(xas, xa, start); + + if (!n) + return 0; + + if ((__force unsigned int)filter < XA_MAX_MARKS) + return xas_extract_marked(&xas, dst, max, n, filter); + return xas_extract_present(&xas, dst, max, n); +} +EXPORT_SYMBOL(xa_extract); + #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { -- cgit v1.2.3 From 687149fca1f37c447e5d161e0a4a04cb2c880cb6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 17 Nov 2017 08:16:34 -0500 Subject: xarray: Destroy an XArray This function frees all the internal memory allocated to the xarray and reinitialises it to be empty. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 1 + lib/test_xarray.c | 34 ++++++++++++++++++++++++++++++++++ lib/xarray.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 82ceed981924..0a758fa3ed2c 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -286,6 +286,7 @@ void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); +void xa_destroy(struct xarray *); /** * xa_init() - Initialise an empty XArray. diff --git a/lib/test_xarray.c b/lib/test_xarray.c index e3c2d4d00b15..a96f67caa1c2 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -465,6 +465,39 @@ static noinline void check_find(struct xarray *xa) check_multi_find_2(xa); } +static noinline void check_destroy(struct xarray *xa) +{ + unsigned long index; + + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Destroying an empty array is a no-op */ + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Destroying an array with a single entry */ + for (index = 0; index < 1000; index++) { + xa_store_index(xa, index, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); + } + + /* Destroying an array with a single entry at ULONG_MAX */ + xa_store(xa, ULONG_MAX, xa, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); + +#ifdef CONFIG_XARRAY_MULTI + /* Destroying an array with a multi-index entry */ + xa_store_order(xa, 1 << 11, 11, xa, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + xa_destroy(xa); + XA_BUG_ON(xa, !xa_empty(xa)); +#endif +} + static DEFINE_XARRAY(array); static int xarray_checks(void) @@ -478,6 +511,7 @@ static int xarray_checks(void) check_cmpxchg(&array); check_multi_store(&array); check_find(&array); + check_destroy(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index 70e04810c8c2..057dbe0d3bf6 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1524,6 +1524,34 @@ unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start, } EXPORT_SYMBOL(xa_extract); +/** + * xa_destroy() - Free all internal data structures. + * @xa: XArray. + * + * After calling this function, the XArray is empty and has freed all memory + * allocated for its internal data structures. You are responsible for + * freeing the objects referenced by the XArray. + * + * Context: Any context. Takes and releases the xa_lock, interrupt-safe. + */ +void xa_destroy(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + unsigned long flags; + void *entry; + + xas.xa_node = NULL; + xas_lock_irqsave(&xas, flags); + entry = xa_head_locked(xa); + RCU_INIT_POINTER(xa->xa_head, NULL); + xas_init_marks(&xas); + /* lockdep checks we're still holding the lock in xas_free_nodes() */ + if (xa_is_node(entry)) + xas_free_nodes(&xas, xa_to_node(entry)); + xas_unlock_irqrestore(&xas, flags); +} +EXPORT_SYMBOL(xa_destroy); + #ifdef XA_DEBUG void xa_dump_node(const struct xa_node *node) { -- cgit v1.2.3 From 64d3e9a9e0cc51957d243dd2b0adc5d74ff5e128 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 1 Dec 2017 00:06:52 -0500 Subject: xarray: Step through an XArray The xas_next and xas_prev functions move the xas index by one position, and adjust the rest of the iterator state to match it. This is more efficient than calling xas_set() as it keeps the iterator at the leaves of the tree instead of walking the iterator from the root each time. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 67 ++++++++++++++++++++++++++++ lib/test_xarray.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/xarray.c | 74 +++++++++++++++++++++++++++++++ 3 files changed, 256 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 0a758fa3ed2c..381f94a66762 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -828,6 +828,12 @@ static inline bool xas_not_node(struct xa_node *node) return ((unsigned long)node & 3) || !node; } +/* True if the node represents RESTART or an error */ +static inline bool xas_frozen(struct xa_node *node) +{ + return (unsigned long)node & 2; +} + /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { @@ -1082,4 +1088,65 @@ enum { for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) +void *__xas_next(struct xa_state *); +void *__xas_prev(struct xa_state *); + +/** + * xas_prev() - Move iterator to previous index. + * @xas: XArray operation state. + * + * If the @xas was in an error state, it will remain in an error state + * and this function will return %NULL. If the @xas has never been walked, + * it will have the effect of calling xas_load(). Otherwise one will be + * subtracted from the index and the state will be walked to the correct + * location in the array for the next operation. + * + * If the iterator was referencing index 0, this function wraps + * around to %ULONG_MAX. + * + * Return: The entry at the new index. This may be %NULL or an internal + * entry. + */ +static inline void *xas_prev(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset == 0)) + return __xas_prev(xas); + + xas->xa_index--; + xas->xa_offset--; + return xa_entry(xas->xa, node, xas->xa_offset); +} + +/** + * xas_next() - Move state to next index. + * @xas: XArray operation state. + * + * If the @xas was in an error state, it will remain in an error state + * and this function will return %NULL. If the @xas has never been walked, + * it will have the effect of calling xas_load(). Otherwise one will be + * added to the index and the state will be walked to the correct + * location in the array for the next operation. + * + * If the iterator was referencing index %ULONG_MAX, this function wraps + * around to 0. + * + * Return: The entry at the new index. This may be %NULL or an internal + * entry. + */ +static inline void *xas_next(struct xa_state *xas) +{ + struct xa_node *node = xas->xa_node; + + if (unlikely(xas_not_node(node) || node->shift || + xas->xa_offset == XA_CHUNK_MASK)) + return __xas_next(xas); + + xas->xa_index++; + xas->xa_offset++; + return xa_entry(xas->xa, node, xas->xa_offset); +} + #endif /* _LINUX_XARRAY_H */ diff --git a/lib/test_xarray.c b/lib/test_xarray.c index a96f67caa1c2..85ef1882dd3c 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -465,6 +465,120 @@ static noinline void check_find(struct xarray *xa) check_multi_find_2(xa); } +static noinline void check_move_small(struct xarray *xa, unsigned long idx) +{ + XA_STATE(xas, xa, 0); + unsigned long i; + + xa_store_index(xa, 0, GFP_KERNEL); + xa_store_index(xa, idx, GFP_KERNEL); + + rcu_read_lock(); + for (i = 0; i < idx * 4; i++) { + void *entry = xas_next(&xas); + if (i <= idx) + XA_BUG_ON(xa, xas.xa_node == XAS_RESTART); + XA_BUG_ON(xa, xas.xa_index != i); + if (i == 0 || i == idx) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + } + xas_next(&xas); + XA_BUG_ON(xa, xas.xa_index != i); + + do { + void *entry = xas_prev(&xas); + i--; + if (i <= idx) + XA_BUG_ON(xa, xas.xa_node == XAS_RESTART); + XA_BUG_ON(xa, xas.xa_index != i); + if (i == 0 || i == idx) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + } while (i > 0); + + xas_set(&xas, ULONG_MAX); + XA_BUG_ON(xa, xas_next(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + XA_BUG_ON(xa, xas_next(&xas) != xa_mk_value(0)); + XA_BUG_ON(xa, xas.xa_index != 0); + XA_BUG_ON(xa, xas_prev(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + rcu_read_unlock(); + + xa_erase_index(xa, 0); + xa_erase_index(xa, idx); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_move(struct xarray *xa) +{ + XA_STATE(xas, xa, (1 << 16) - 1); + unsigned long i; + + for (i = 0; i < (1 << 16); i++) + XA_BUG_ON(xa, xa_store_index(xa, i, GFP_KERNEL) != NULL); + + rcu_read_lock(); + do { + void *entry = xas_prev(&xas); + i--; + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, i != xas.xa_index); + } while (i != 0); + + XA_BUG_ON(xa, xas_prev(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + + do { + void *entry = xas_next(&xas); + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, i != xas.xa_index); + i++; + } while (i < (1 << 16)); + rcu_read_unlock(); + + for (i = (1 << 8); i < (1 << 15); i++) + xa_erase_index(xa, i); + + i = xas.xa_index; + + rcu_read_lock(); + do { + void *entry = xas_prev(&xas); + i--; + if ((i < (1 << 8)) || (i >= (1 << 15))) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + XA_BUG_ON(xa, i != xas.xa_index); + } while (i != 0); + + XA_BUG_ON(xa, xas_prev(&xas) != NULL); + XA_BUG_ON(xa, xas.xa_index != ULONG_MAX); + + do { + void *entry = xas_next(&xas); + if ((i < (1 << 8)) || (i >= (1 << 15))) + XA_BUG_ON(xa, entry != xa_mk_value(i)); + else + XA_BUG_ON(xa, entry != NULL); + XA_BUG_ON(xa, i != xas.xa_index); + i++; + } while (i < (1 << 16)); + rcu_read_unlock(); + + xa_destroy(xa); + + for (i = 0; i < 16; i++) + check_move_small(xa, 1UL << i); + + for (i = 2; i < 16; i++) + check_move_small(xa, (1UL << i) - 1); +} + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -512,6 +626,7 @@ static int xarray_checks(void) check_multi_store(&array); check_find(&array); check_destroy(&array); + check_move(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index 057dbe0d3bf6..303c46579598 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -876,6 +876,80 @@ void xas_pause(struct xa_state *xas) } EXPORT_SYMBOL_GPL(xas_pause); +/* + * __xas_prev() - Find the previous entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_prev() which handles all the complex cases + * out of line. + */ +void *__xas_prev(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index--; + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset--; + + while (xas->xa_offset == 255) { + xas->xa_offset = xas->xa_node->offset - 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_prev); + +/* + * __xas_next() - Find the next entry in the XArray. + * @xas: XArray operation state. + * + * Helper function for xas_next() which handles all the complex cases + * out of line. + */ +void *__xas_next(struct xa_state *xas) +{ + void *entry; + + if (!xas_frozen(xas->xa_node)) + xas->xa_index++; + if (xas_not_node(xas->xa_node)) + return xas_load(xas); + + if (xas->xa_offset != get_offset(xas->xa_index, xas->xa_node)) + xas->xa_offset++; + + while (xas->xa_offset == XA_CHUNK_SIZE) { + xas->xa_offset = xas->xa_node->offset + 1; + xas->xa_node = xa_parent(xas->xa, xas->xa_node); + if (!xas->xa_node) + return set_bounds(xas); + } + + for (;;) { + entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset); + if (!xa_is_node(entry)) + return entry; + + xas->xa_node = xa_to_node(entry); + xas_set_offset(xas); + } +} +EXPORT_SYMBOL_GPL(__xas_next); + /** * xas_find() - Find the next present entry in the XArray. * @xas: XArray operation state. -- cgit v1.2.3 From 4e99d4e9579d3b950bf4b38d0d64eb1b9be78761 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 1 Jun 2018 22:46:02 -0400 Subject: xarray: Add xas_for_each_conflict This iterator iterates over each entry that is stored in the index or indices specified by the xa_state. This is intended for use for a conditional store of a multiindex entry, or to allow entries which are about to be removed from the xarray to be disposed of properly. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 17 +++++++++++++ lib/test_xarray.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++ lib/xarray.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 381f94a66762..2664e718dbd3 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -878,6 +878,7 @@ static inline bool xas_retry(struct xa_state *xas, const void *entry) void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); +void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); @@ -1088,6 +1089,22 @@ enum { for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) +/** + * xas_for_each_conflict() - Iterate over a range of an XArray. + * @xas: XArray operation state. + * @entry: Entry retrieved from the array. + * + * The loop body will be executed for each entry in the XArray that lies + * within the range specified by @xas. If the loop completes successfully, + * any entries that lie in this range will be replaced by @entry. The caller + * may break out of the loop; if they do so, the contents of the XArray will + * be unchanged. The operation may fail due to an out of memory condition. + * The caller may also call xa_set_err() to exit the loop while setting an + * error to record the reason. + */ +#define xas_for_each_conflict(xas, entry) \ + while ((entry = xas_find_conflict(xas))) + void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 85ef1882dd3c..8eba3de1baea 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -365,6 +365,73 @@ static noinline void check_multi_store(struct xarray *xa) #endif } +static noinline void __check_store_iter(struct xarray *xa, unsigned long start, + unsigned int order, unsigned int present) +{ + XA_STATE_ORDER(xas, xa, start, order); + void *entry; + unsigned int count = 0; + +retry: + xas_lock(&xas); + xas_for_each_conflict(&xas, entry) { + XA_BUG_ON(xa, !xa_is_value(entry)); + XA_BUG_ON(xa, entry < xa_mk_value(start)); + XA_BUG_ON(xa, entry > xa_mk_value(start + (1UL << order) - 1)); + count++; + } + xas_store(&xas, xa_mk_value(start)); + xas_unlock(&xas); + if (xas_nomem(&xas, GFP_KERNEL)) { + count = 0; + goto retry; + } + XA_BUG_ON(xa, xas_error(&xas)); + XA_BUG_ON(xa, count != present); + XA_BUG_ON(xa, xa_load(xa, start) != xa_mk_value(start)); + XA_BUG_ON(xa, xa_load(xa, start + (1UL << order) - 1) != + xa_mk_value(start)); + xa_erase_index(xa, start); +} + +static noinline void check_store_iter(struct xarray *xa) +{ + unsigned int i, j; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + + for (i = 0; i < max_order; i++) { + unsigned int min = 1 << i; + unsigned int max = (2 << i) - 1; + __check_store_iter(xa, 0, i, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + __check_store_iter(xa, min, i, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + + xa_store_index(xa, min, GFP_KERNEL); + __check_store_iter(xa, min, i, 1); + XA_BUG_ON(xa, !xa_empty(xa)); + xa_store_index(xa, max, GFP_KERNEL); + __check_store_iter(xa, min, i, 1); + XA_BUG_ON(xa, !xa_empty(xa)); + + for (j = 0; j < min; j++) + xa_store_index(xa, j, GFP_KERNEL); + __check_store_iter(xa, 0, i, min); + XA_BUG_ON(xa, !xa_empty(xa)); + for (j = 0; j < min; j++) + xa_store_index(xa, min + j, GFP_KERNEL); + __check_store_iter(xa, min, i, min); + XA_BUG_ON(xa, !xa_empty(xa)); + } +#ifdef CONFIG_XARRAY_MULTI + xa_store_index(xa, 63, GFP_KERNEL); + xa_store_index(xa, 65, GFP_KERNEL); + __check_store_iter(xa, 64, 2, 1); + xa_erase_index(xa, 63); +#endif + XA_BUG_ON(xa, !xa_empty(xa)); +} + static noinline void check_multi_find(struct xarray *xa) { #ifdef CONFIG_XARRAY_MULTI @@ -627,6 +694,7 @@ static int xarray_checks(void) check_find(&array); check_destroy(&array); check_move(&array); + check_store_iter(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; diff --git a/lib/xarray.c b/lib/xarray.c index 303c46579598..41f8ebc651f5 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1110,6 +1110,67 @@ max: } EXPORT_SYMBOL_GPL(xas_find_marked); +/** + * xas_find_conflict() - Find the next present entry in a range. + * @xas: XArray operation state. + * + * The @xas describes both a range and a position within that range. + * + * Context: Any context. Expects xa_lock to be held. + * Return: The next entry in the range covered by @xas or %NULL. + */ +void *xas_find_conflict(struct xa_state *xas) +{ + void *curr; + + if (xas_error(xas)) + return NULL; + + if (!xas->xa_node) + return NULL; + + if (xas_top(xas->xa_node)) { + curr = xas_start(xas); + if (!curr) + return NULL; + while (xa_is_node(curr)) { + struct xa_node *node = xa_to_node(curr); + curr = xas_descend(xas, node); + } + if (curr) + return curr; + } + + if (xas->xa_node->shift > xas->xa_shift) + return NULL; + + for (;;) { + if (xas->xa_node->shift == xas->xa_shift) { + if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs) + break; + } else if (xas->xa_offset == XA_CHUNK_MASK) { + xas->xa_offset = xas->xa_node->offset; + xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node); + if (!xas->xa_node) + break; + continue; + } + curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset); + if (xa_is_sibling(curr)) + continue; + while (xa_is_node(curr)) { + xas->xa_node = xa_to_node(curr); + xas->xa_offset = 0; + curr = xa_entry_locked(xas->xa, xas->xa_node, 0); + } + if (curr) + return curr; + } + xas->xa_offset -= xas->xa_sibs; + return NULL; +} +EXPORT_SYMBOL_GPL(xas_find_conflict); + /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. -- cgit v1.2.3 From 2264f5132fe45571139727ebdeb78696b35d1506 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 00:11:48 -0500 Subject: xarray: Add xas_create_range This hopefully temporary function is useful for users who have not yet been converted to multi-index entries. Signed-off-by: Matthew Wilcox --- include/linux/xarray.h | 13 ++++++ lib/test_xarray.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/xarray.c | 50 +++++++++++++++++++++ 3 files changed, 182 insertions(+) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 2664e718dbd3..deae17aa0ed7 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -822,6 +822,17 @@ static inline bool xas_valid(const struct xa_state *xas) return !xas_invalid(xas); } +/** + * xas_is_node() - Does the xas point to a node? + * @xas: XArray operation state. + * + * Return: %true if the xas currently references a node. + */ +static inline bool xas_is_node(const struct xa_state *xas) +{ + return xas_valid(xas) && xas->xa_node; +} + /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { @@ -889,6 +900,8 @@ void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_pause(struct xa_state *); +void xas_create_range(struct xa_state *); + /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 8eba3de1baea..703370015d10 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -646,6 +646,124 @@ static noinline void check_move(struct xarray *xa) check_move_small(xa, (1UL << i) - 1); } +static noinline void xa_store_many_order(struct xarray *xa, + unsigned long index, unsigned order) +{ + XA_STATE_ORDER(xas, xa, index, order); + unsigned int i = 0; + + do { + xas_lock(&xas); + XA_BUG_ON(xa, xas_find_conflict(&xas)); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < (1U << order); i++) { + XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(index + i))); + xas_next(&xas); + } +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, xas_error(&xas)); +} + +static noinline void check_create_range_1(struct xarray *xa, + unsigned long index, unsigned order) +{ + unsigned long i; + + xa_store_many_order(xa, index, order); + for (i = index; i < index + (1UL << order); i++) + xa_erase_index(xa, i); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_create_range_2(struct xarray *xa, unsigned order) +{ + unsigned long i; + unsigned long nr = 1UL << order; + + for (i = 0; i < nr * nr; i += nr) + xa_store_many_order(xa, i, order); + for (i = 0; i < nr * nr; i++) + xa_erase_index(xa, i); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_create_range_3(void) +{ + XA_STATE(xas, NULL, 0); + xas_set_err(&xas, -EEXIST); + xas_create_range(&xas); + XA_BUG_ON(NULL, xas_error(&xas) != -EEXIST); +} + +static noinline void check_create_range_4(struct xarray *xa, + unsigned long index, unsigned order) +{ + XA_STATE_ORDER(xas, xa, index, order); + unsigned long base = xas.xa_index; + unsigned long i = 0; + + xa_store_index(xa, index, GFP_KERNEL); + do { + xas_lock(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < (1UL << order); i++) { + void *old = xas_store(&xas, xa_mk_value(base + i)); + if (xas.xa_index == index) + XA_BUG_ON(xa, old != xa_mk_value(base + i)); + else + XA_BUG_ON(xa, old != NULL); + xas_next(&xas); + } +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, xas_error(&xas)); + + for (i = base; i < base + (1UL << order); i++) + xa_erase_index(xa, i); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_create_range(struct xarray *xa) +{ + unsigned int order; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 12 : 1; + + for (order = 0; order < max_order; order++) { + check_create_range_1(xa, 0, order); + check_create_range_1(xa, 1U << order, order); + check_create_range_1(xa, 2U << order, order); + check_create_range_1(xa, 3U << order, order); + check_create_range_1(xa, 1U << 24, order); + if (order < 10) + check_create_range_2(xa, order); + + check_create_range_4(xa, 0, order); + check_create_range_4(xa, 1U << order, order); + check_create_range_4(xa, 2U << order, order); + check_create_range_4(xa, 3U << order, order); + check_create_range_4(xa, 1U << 24, order); + + check_create_range_4(xa, 1, order); + check_create_range_4(xa, (1U << order) + 1, order); + check_create_range_4(xa, (2U << order) + 1, order); + check_create_range_4(xa, (2U << order) - 1, order); + check_create_range_4(xa, (3U << order) + 1, order); + check_create_range_4(xa, (3U << order) - 1, order); + check_create_range_4(xa, (1U << 24) + 1, order); + } + + check_create_range_3(); +} + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -694,6 +812,7 @@ static int xarray_checks(void) check_find(&array); check_destroy(&array); check_move(&array); + check_create_range(&array); check_store_iter(&array); printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); diff --git a/lib/xarray.c b/lib/xarray.c index 41f8ebc651f5..ff37516fe832 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -637,6 +637,56 @@ static void *xas_create(struct xa_state *xas) return entry; } +/** + * xas_create_range() - Ensure that stores to this range will succeed + * @xas: XArray operation state. + * + * Creates all of the slots in the range covered by @xas. Sets @xas to + * create single-index entries and positions it at the beginning of the + * range. This is for the benefit of users which have not yet been + * converted to use multi-index entries. + */ +void xas_create_range(struct xa_state *xas) +{ + unsigned long index = xas->xa_index; + unsigned char shift = xas->xa_shift; + unsigned char sibs = xas->xa_sibs; + + xas->xa_index |= ((sibs + 1) << shift) - 1; + if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift) + xas->xa_offset |= sibs; + xas->xa_shift = 0; + xas->xa_sibs = 0; + + for (;;) { + xas_create(xas); + if (xas_error(xas)) + goto restore; + if (xas->xa_index <= (index | XA_CHUNK_MASK)) + goto success; + xas->xa_index -= XA_CHUNK_SIZE; + + for (;;) { + struct xa_node *node = xas->xa_node; + xas->xa_node = xa_parent_locked(xas->xa, node); + xas->xa_offset = node->offset - 1; + if (node->offset != 0) + break; + } + } + +restore: + xas->xa_shift = shift; + xas->xa_sibs = sibs; + xas->xa_index = index; + return; +success: + xas->xa_index = index; + if (xas->xa_node) + xas_set_offset(xas); +} +EXPORT_SYMBOL_GPL(xas_create_range); + static void update_node(struct xa_state *xas, struct xa_node *node, int count, int values) { -- cgit v1.2.3 From 9f14d4f1f1045f161fd4db8a8e194b7825c2874a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 1 Oct 2018 14:54:59 -0400 Subject: xarray: Add xa_reserve and xa_release This function reserves a slot in the XArray for users which need to acquire multiple locks before storing their entry in the tree and so cannot use a plain xa_store(). Signed-off-by: Matthew Wilcox --- Documentation/core-api/xarray.rst | 6 +++++ include/linux/xarray.h | 34 ++++++++++++++++++++++++++-- lib/test_xarray.c | 40 +++++++++++++++++++++++++++++++++ lib/xarray.c | 47 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 2 deletions(-) diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 4b101b4f3aa1..2b397da84bcd 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -293,6 +293,12 @@ to :c:func:`xas_retry`, and retry the operation if it returns ``true``. of this RCU period. You should restart the lookup from the head of the array. + * - Zero + - :c:func:`xa_is_zero` + - Zero entries appear as ``NULL`` through the Normal API, but occupy + an entry in the XArray which can be used to reserve the index for + future use. + Other internal entries may be added in the future. As far as possible, they will be handled by :c:func:`xas_retry`. diff --git a/include/linux/xarray.h b/include/linux/xarray.h index deae17aa0ed7..bed63d3cfea8 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -32,7 +32,8 @@ * The following internal entries have a special meaning: * * 0-62: Sibling entries - * 256: Retry entry + * 256: Zero entry + * 257: Retry entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only @@ -277,6 +278,7 @@ void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); +int xa_reserve(struct xarray *, unsigned long index, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -371,6 +373,20 @@ static inline int xa_insert(struct xarray *xa, unsigned long index, return -EEXIST; } +/** + * xa_release() - Release a reserved entry. + * @xa: XArray. + * @index: Index of entry. + * + * After calling xa_reserve(), you can call this function to release the + * reservation. If the entry at @index has been stored to, this function + * will do nothing. + */ +static inline void xa_release(struct xarray *xa, unsigned long index) +{ + xa_cmpxchg(xa, index, NULL, NULL, 0); +} + /** * xa_for_each() - Iterate over a portion of an XArray. * @xa: XArray. @@ -658,7 +674,19 @@ static inline bool xa_is_sibling(const void *entry) (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } -#define XA_RETRY_ENTRY xa_mk_internal(256) +#define XA_ZERO_ENTRY xa_mk_internal(256) +#define XA_RETRY_ENTRY xa_mk_internal(257) + +/** + * xa_is_zero() - Is the entry a zero entry? + * @entry: Entry retrieved from the XArray + * + * Return: %true if the entry is a zero entry. + */ +static inline bool xa_is_zero(const void *entry) +{ + return unlikely(entry == XA_ZERO_ENTRY); +} /** * xa_is_retry() - Is the entry a retry entry? @@ -880,6 +908,8 @@ static inline void xas_reset(struct xa_state *xas) */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { + if (xa_is_zero(entry)) + return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 703370015d10..6aafd411a5c3 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -259,6 +259,45 @@ static noinline void check_cmpxchg(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_reserve(struct xarray *xa) +{ + void *entry; + unsigned long index = 0; + + /* An array with a reserved entry is not empty */ + XA_BUG_ON(xa, !xa_empty(xa)); + xa_reserve(xa, 12345678, GFP_KERNEL); + XA_BUG_ON(xa, xa_empty(xa)); + XA_BUG_ON(xa, xa_load(xa, 12345678)); + xa_release(xa, 12345678); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Releasing a used entry does nothing */ + xa_reserve(xa, 12345678, GFP_KERNEL); + XA_BUG_ON(xa, xa_store_index(xa, 12345678, GFP_NOWAIT) != NULL); + xa_release(xa, 12345678); + xa_erase_index(xa, 12345678); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* cmpxchg sees a reserved entry as NULL */ + xa_reserve(xa, 12345678, GFP_KERNEL); + XA_BUG_ON(xa, xa_cmpxchg(xa, 12345678, NULL, xa_mk_value(12345678), + GFP_NOWAIT) != NULL); + xa_release(xa, 12345678); + xa_erase_index(xa, 12345678); + XA_BUG_ON(xa, !xa_empty(xa)); + + /* Can iterate through a reserved entry */ + xa_store_index(xa, 5, GFP_KERNEL); + xa_reserve(xa, 6, GFP_KERNEL); + xa_store_index(xa, 7, GFP_KERNEL); + + xa_for_each(xa, entry, index, ULONG_MAX, XA_PRESENT) { + XA_BUG_ON(xa, index != 5 && index != 7); + } + xa_destroy(xa); +} + static noinline void check_xas_erase(struct xarray *xa) { XA_STATE(xas, xa, 0); @@ -808,6 +847,7 @@ static int xarray_checks(void) check_xa_shrink(&array); check_xas_erase(&array); check_cmpxchg(&array); + check_reserve(&array); check_multi_store(&array); check_find(&array); check_destroy(&array); diff --git a/lib/xarray.c b/lib/xarray.c index ff37516fe832..546461914282 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1266,6 +1266,8 @@ void *xa_load(struct xarray *xa, unsigned long index) rcu_read_lock(); do { entry = xas_load(&xas); + if (xa_is_zero(entry)) + entry = NULL; } while (xas_retry(&xas, entry)); rcu_read_unlock(); @@ -1275,6 +1277,8 @@ EXPORT_SYMBOL(xa_load); static void *xas_result(struct xa_state *xas, void *curr) { + if (xa_is_zero(curr)) + return NULL; XA_NODE_BUG_ON(xas->xa_node, xa_is_internal(curr)); if (xas_error(xas)) curr = xas->xa_node; @@ -1394,6 +1398,8 @@ void *xa_cmpxchg(struct xarray *xa, unsigned long index, do { xas_lock(&xas); curr = xas_load(&xas); + if (curr == XA_ZERO_ENTRY) + curr = NULL; if (curr == old) xas_store(&xas, entry); xas_unlock(&xas); @@ -1430,6 +1436,8 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, do { curr = xas_load(&xas); + if (curr == XA_ZERO_ENTRY) + curr = NULL; if (curr == old) xas_store(&xas, entry); } while (__xas_nomem(&xas, gfp)); @@ -1438,6 +1446,43 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, } EXPORT_SYMBOL(__xa_cmpxchg); +/** + * xa_reserve() - Reserve this index in the XArray. + * @xa: XArray. + * @index: Index into array. + * @gfp: Memory allocation flags. + * + * Ensures there is somewhere to store an entry at @index in the array. + * If there is already something stored at @index, this function does + * nothing. If there was nothing there, the entry is marked as reserved. + * Loads from @index will continue to see a %NULL pointer until a + * subsequent store to @index. + * + * If you do not use the entry that you have reserved, call xa_release() + * or xa_erase() to free any unnecessary memory. + * + * Context: Process context. Takes and releases the xa_lock, IRQ or BH safe + * if specified in XArray flags. May sleep if the @gfp flags permit. + * Return: 0 if the reservation succeeded or -ENOMEM if it failed. + */ +int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + XA_STATE(xas, xa, index); + unsigned int lock_type = xa_lock_type(xa); + void *curr; + + do { + xas_lock_type(&xas, lock_type); + curr = xas_load(&xas); + if (!curr) + xas_store(&xas, XA_ZERO_ENTRY); + xas_unlock_type(&xas, lock_type); + } while (xas_nomem(&xas, gfp)); + + return xas_error(&xas); +} +EXPORT_SYMBOL(xa_reserve); + /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. @@ -1797,6 +1842,8 @@ void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift) pr_cont("retry (%ld)\n", xa_to_internal(entry)); else if (xa_is_sibling(entry)) pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry)); + else if (xa_is_zero(entry)) + pr_cont("zero (%ld)\n", xa_to_internal(entry)); else pr_cont("UNKNOWN ENTRY (%px)\n", entry); } -- cgit v1.2.3 From 3d5bd6e1a04ae779bc5e0de9ba2e92aa55c40fe8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 27 Nov 2017 15:32:05 -0500 Subject: xarray: Add MAINTAINERS entry Add myself as XArray and IDR maintainer. Signed-off-by: Matthew Wilcox --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 612e5dbf3431..e54349ea090b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15973,6 +15973,17 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso S: Maintained F: arch/x86/entry/vdso/ +XARRAY +M: Matthew Wilcox +L: linux-fsdevel@vger.kernel.org +S: Supported +F: Documentation/core-api/xarray.rst +F: lib/idr.c +F: lib/xarray.c +F: include/linux/idr.h +F: include/linux/xarray.h +F: tools/testing/radix-tree + XC2028/3028 TUNER DRIVER M: Mauro Carvalho Chehab L: linux-media@vger.kernel.org -- cgit v1.2.3 From 371c752dc66948714ee3b66c3306f3ff1ff71d2e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jul 2018 10:50:12 -0400 Subject: xarray: Track free entries in an XArray Add the optional ability to track which entries in an XArray are free and provide xa_alloc() to replace most of the functionality of the IDR. Signed-off-by: Matthew Wilcox --- Documentation/core-api/xarray.rst | 23 +++++++-- include/linux/xarray.h | 101 ++++++++++++++++++++++++++++++++++++++ lib/test_xarray.c | 61 +++++++++++++++++++++++ lib/xarray.c | 88 +++++++++++++++++++++++++++++++-- 4 files changed, 266 insertions(+), 7 deletions(-) diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 2b397da84bcd..463e4c798ec1 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -103,12 +103,25 @@ Finally, you can remove all entries from an XArray by calling to free the entries first. You can do this by iterating over all present entries in the XArray using the :c:func:`xa_for_each` iterator. +ID assignment +------------- + +You can call :c:func:`xa_alloc` to store the entry at any unused index +in the XArray. If you need to modify the array from interrupt context, +you can use :c:func:`xa_alloc_bh` or :c:func:`xa_alloc_irq` to disable +interrupts while allocating the ID. Unlike :c:func:`xa_store`, allocating +a ``NULL`` pointer does not delete an entry. Instead it reserves an +entry like :c:func:`xa_reserve` and you can release it using either +:c:func:`xa_erase` or :c:func:`xa_release`. To use ID assignment, the +XArray must be defined with :c:func:`DEFINE_XARRAY_ALLOC`, or initialised +by passing ``XA_FLAGS_ALLOC`` to :c:func:`xa_init_flags`, + Memory allocation ----------------- -The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_reserve` -and :c:func:`xa_insert` functions take a gfp_t parameter in case -the XArray needs to allocate memory to store this entry. +The :c:func:`xa_store`, :c:func:`xa_cmpxchg`, :c:func:`xa_alloc`, +:c:func:`xa_reserve` and :c:func:`xa_insert` functions take a gfp_t +parameter in case the XArray needs to allocate memory to store this entry. If the entry is being deleted, no memory allocation needs to be performed, and the GFP flags specified will be ignored. @@ -143,6 +156,9 @@ Takes xa_lock internally: * :c:func:`xa_erase_bh` * :c:func:`xa_erase_irq` * :c:func:`xa_cmpxchg` + * :c:func:`xa_alloc` + * :c:func:`xa_alloc_bh` + * :c:func:`xa_alloc_irq` * :c:func:`xa_destroy` * :c:func:`xa_set_mark` * :c:func:`xa_clear_mark` @@ -152,6 +168,7 @@ Assumes xa_lock held on entry: * :c:func:`__xa_insert` * :c:func:`__xa_erase` * :c:func:`__xa_cmpxchg` + * :c:func:`__xa_alloc` * :c:func:`__xa_set_mark` * :c:func:`__xa_clear_mark` diff --git a/include/linux/xarray.h b/include/linux/xarray.h index bed63d3cfea8..e0b57af52e9b 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -205,6 +205,7 @@ typedef unsigned __bitwise xa_mark_t; #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 +#define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, @@ -217,9 +218,12 @@ enum xa_lock_type { */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) +#define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) +#define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) + /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. @@ -273,6 +277,15 @@ struct xarray { */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) +/** + * DEFINE_XARRAY_ALLOC() - Define an XArray which can allocate IDs. + * @name: A string that names your XArray. + * + * This is intended for file scope definitions of allocating XArrays. + * See also DEFINE_XARRAY(). + */ +#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) + void xa_init_flags(struct xarray *, gfp_t flags); void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); @@ -439,6 +452,7 @@ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); +int __xa_alloc(struct xarray *, u32 *id, u32 max, void *entry, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); @@ -518,6 +532,93 @@ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) return entry; } +/** + * xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock. May sleep if + * the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +static inline int xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, + gfp_t gfp) +{ + int err; + + xa_lock(xa); + err = __xa_alloc(xa, id, max, entry, gfp); + xa_unlock(xa); + + return err; +} + +/** + * xa_alloc_bh() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling softirqs. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +static inline int xa_alloc_bh(struct xarray *xa, u32 *id, u32 max, void *entry, + gfp_t gfp) +{ + int err; + + xa_lock_bh(xa); + err = __xa_alloc(xa, id, max, entry, gfp); + xa_unlock_bh(xa); + + return err; +} + +/** + * xa_alloc_irq() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Process context. Takes and releases the xa_lock while + * disabling interrupts. May sleep if the @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +static inline int xa_alloc_irq(struct xarray *xa, u32 *id, u32 max, void *entry, + gfp_t gfp) +{ + int err; + + xa_lock_irq(xa); + err = __xa_alloc(xa, id, max, entry, gfp); + xa_unlock_irq(xa); + + return err; +} + /* Everything below here is the Advanced API. Proceed with caution. */ /* diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 6aafd411a5c3..a752e6a37e6f 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -33,6 +33,15 @@ static void *xa_store_index(struct xarray *xa, unsigned long index, gfp_t gfp) return xa_store(xa, index, xa_mk_value(index & LONG_MAX), gfp); } +static void xa_alloc_index(struct xarray *xa, unsigned long index, gfp_t gfp) +{ + u32 id = 0; + + XA_BUG_ON(xa, xa_alloc(xa, &id, UINT_MAX, xa_mk_value(index & LONG_MAX), + gfp) != 0); + XA_BUG_ON(xa, id != index); +} + static void xa_erase_index(struct xarray *xa, unsigned long index) { XA_BUG_ON(xa, xa_erase(xa, index) != xa_mk_value(index & LONG_MAX)); @@ -404,6 +413,57 @@ static noinline void check_multi_store(struct xarray *xa) #endif } +static DEFINE_XARRAY_ALLOC(xa0); + +static noinline void check_xa_alloc(void) +{ + int i; + u32 id; + + /* An empty array should assign 0 to the first alloc */ + xa_alloc_index(&xa0, 0, GFP_KERNEL); + + /* Erasing it should make the array empty again */ + xa_erase_index(&xa0, 0); + XA_BUG_ON(&xa0, !xa_empty(&xa0)); + + /* And it should assign 0 again */ + xa_alloc_index(&xa0, 0, GFP_KERNEL); + + /* The next assigned ID should be 1 */ + xa_alloc_index(&xa0, 1, GFP_KERNEL); + xa_erase_index(&xa0, 1); + + /* Storing a value should mark it used */ + xa_store_index(&xa0, 1, GFP_KERNEL); + xa_alloc_index(&xa0, 2, GFP_KERNEL); + + /* If we then erase 0, it should be free */ + xa_erase_index(&xa0, 0); + xa_alloc_index(&xa0, 0, GFP_KERNEL); + + xa_erase_index(&xa0, 1); + xa_erase_index(&xa0, 2); + + for (i = 1; i < 5000; i++) { + xa_alloc_index(&xa0, i, GFP_KERNEL); + } + + xa_destroy(&xa0); + + id = 0xfffffffeU; + XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0), + GFP_KERNEL) != 0); + XA_BUG_ON(&xa0, id != 0xfffffffeU); + XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0), + GFP_KERNEL) != 0); + XA_BUG_ON(&xa0, id != 0xffffffffU); + XA_BUG_ON(&xa0, xa_alloc(&xa0, &id, UINT_MAX, xa_mk_value(0), + GFP_KERNEL) != -ENOSPC); + XA_BUG_ON(&xa0, id != 0xffffffffU); + xa_destroy(&xa0); +} + static noinline void __check_store_iter(struct xarray *xa, unsigned long start, unsigned int order, unsigned int present) { @@ -849,6 +909,7 @@ static int xarray_checks(void) check_cmpxchg(&array); check_reserve(&array); check_multi_store(&array); + check_xa_alloc(); check_find(&array); check_destroy(&array); check_move(&array); diff --git a/lib/xarray.c b/lib/xarray.c index 546461914282..9a0d49d4b5f0 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -52,6 +52,11 @@ static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type) xas_unlock(xas); } +static inline bool xa_track_free(const struct xarray *xa) +{ + return xa->xa_flags & XA_FLAGS_TRACK_FREE; +} + static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark) { if (!(xa->xa_flags & XA_FLAGS_MARK(mark))) @@ -94,6 +99,11 @@ static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark) return !bitmap_empty(node_marks(node, mark), XA_CHUNK_SIZE); } +static inline void node_mark_all(struct xa_node *node, xa_mark_t mark) +{ + bitmap_fill(node_marks(node, mark), XA_CHUNK_SIZE); +} + #define mark_inc(mark) do { \ mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \ } while (0) @@ -416,6 +426,8 @@ static void xas_shrink(struct xa_state *xas) xas->xa_node = XAS_BOUNDS; RCU_INIT_POINTER(xa->xa_head, entry); + if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK)) + xa_mark_clear(xa, XA_FREE_MARK); node->count = 0; node->nr_values = 0; @@ -549,8 +561,15 @@ static int xas_expand(struct xa_state *xas, void *head) /* Propagate the aggregated mark info to the new child */ for (;;) { - if (xa_marked(xa, mark)) + if (xa_track_free(xa) && mark == XA_FREE_MARK) { + node_mark_all(node, XA_FREE_MARK); + if (!xa_marked(xa, XA_FREE_MARK)) { + node_clear_mark(node, 0, XA_FREE_MARK); + xa_mark_set(xa, XA_FREE_MARK); + } + } else if (xa_marked(xa, mark)) { node_set_mark(node, 0, mark); + } if (mark == XA_MARK_MAX) break; mark_inc(mark); @@ -624,6 +643,8 @@ static void *xas_create(struct xa_state *xas) node = xas_alloc(xas, shift); if (!node) break; + if (xa_track_free(xa)) + node_mark_all(node, XA_FREE_MARK); rcu_assign_pointer(*slot, xa_mk_node(node)); } else if (xa_is_node(entry)) { node = xa_to_node(entry); @@ -882,7 +903,10 @@ void xas_init_marks(const struct xa_state *xas) xa_mark_t mark = 0; for (;;) { - xas_clear_mark(xas, mark); + if (xa_track_free(xas->xa) && mark == XA_FREE_MARK) + xas_set_mark(xas, mark); + else + xas_clear_mark(xas, mark); if (mark == XA_MARK_MAX) break; mark_inc(mark); @@ -1333,6 +1357,8 @@ void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) do { xas_lock(&xas); curr = xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); xas_unlock(&xas); } while (xas_nomem(&xas, gfp)); @@ -1365,6 +1391,8 @@ void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) do { curr = xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); @@ -1400,8 +1428,11 @@ void *xa_cmpxchg(struct xarray *xa, unsigned long index, curr = xas_load(&xas); if (curr == XA_ZERO_ENTRY) curr = NULL; - if (curr == old) + if (curr == old) { xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); + } xas_unlock(&xas); } while (xas_nomem(&xas, gfp)); @@ -1438,8 +1469,11 @@ void *__xa_cmpxchg(struct xarray *xa, unsigned long index, curr = xas_load(&xas); if (curr == XA_ZERO_ENTRY) curr = NULL; - if (curr == old) + if (curr == old) { xas_store(&xas, entry); + if (xa_track_free(xa) && entry) + xas_clear_mark(&xas, XA_FREE_MARK); + } } while (__xas_nomem(&xas, gfp)); return xas_result(&xas, curr); @@ -1483,6 +1517,52 @@ int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) } EXPORT_SYMBOL(xa_reserve); +/** + * __xa_alloc() - Find somewhere to store this entry in the XArray. + * @xa: XArray. + * @id: Pointer to ID. + * @max: Maximum ID to allocate (inclusive). + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * Allocates an unused ID in the range specified by @id and @max. + * Updates the @id pointer with the index, then stores the entry at that + * index. A concurrent lookup will not see an uninitialised @id. + * + * Context: Any context. Expects xa_lock to be held on entry. May + * release and reacquire xa_lock if @gfp flags permit. + * Return: 0 on success, -ENOMEM if memory allocation fails or -ENOSPC if + * there is no more space in the XArray. + */ +int __xa_alloc(struct xarray *xa, u32 *id, u32 max, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + int err; + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return -EINVAL; + if (WARN_ON_ONCE(!xa_track_free(xa))) + return -EINVAL; + + if (!entry) + entry = XA_ZERO_ENTRY; + + do { + xas.xa_index = *id; + xas_find_marked(&xas, max, XA_FREE_MARK); + if (xas.xa_node == XAS_RESTART) + xas_set_err(&xas, -ENOSPC); + xas_store(&xas, entry); + xas_clear_mark(&xas, XA_FREE_MARK); + } while (__xas_nomem(&xas, gfp)); + + err = xas_error(&xas); + if (!err) + *id = xas.xa_index; + return err; +} +EXPORT_SYMBOL(__xa_alloc); + /** * __xa_set_mark() - Set this mark on this entry while locked. * @xa: XArray. -- cgit v1.2.3 From f32f004cddf86d63a9c42994bbce9f4e2f07c9fa Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jul 2018 15:42:46 -0400 Subject: ida: Convert to XArray Use the XA_TRACK_FREE ability to track which entries have a free bit, similarly to how it uses the radix tree's IDR_FREE tag. This eliminates the per-cpu ida_bitmap preload, and fixes the memory consumption regression I introduced when making the IDR able to store any pointer. Signed-off-by: Matthew Wilcox --- include/linux/idr.h | 18 +- lib/idr.c | 379 +++++++++++++++++++----------------- lib/radix-tree.c | 71 ------- tools/testing/radix-tree/idr-test.c | 8 +- 4 files changed, 213 insertions(+), 263 deletions(-) diff --git a/include/linux/idr.h b/include/linux/idr.h index 3ec8628ce17f..60daf34b625d 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -214,8 +214,7 @@ static inline void idr_preload_end(void) ++id, (entry) = idr_get_next((idr), &(id))) /* - * IDA - IDR based id allocator, use when translation from id to - * pointer isn't necessary. + * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) @@ -225,14 +224,14 @@ struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; -DECLARE_PER_CPU(struct ida_bitmap *, ida_bitmap); - struct ida { - struct radix_tree_root ida_rt; + struct xarray xa; }; +#define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) + #define IDA_INIT(name) { \ - .ida_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER | GFP_NOWAIT), \ + .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) @@ -292,7 +291,7 @@ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) static inline void ida_init(struct ida *ida) { - INIT_RADIX_TREE(&ida->ida_rt, IDR_RT_MARKER | GFP_NOWAIT); + xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } #define ida_simple_get(ida, start, end, gfp) \ @@ -301,9 +300,6 @@ static inline void ida_init(struct ida *ida) static inline bool ida_is_empty(const struct ida *ida) { - return radix_tree_empty(&ida->ida_rt); + return xa_empty(&ida->xa); } - -/* in lib/radix-tree.c */ -int ida_pre_get(struct ida *ida, gfp_t gfp_mask); #endif /* __IDR_H__ */ diff --git a/lib/idr.c b/lib/idr.c index 9a366b5be2c2..3c20ad9b0463 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -6,8 +6,6 @@ #include #include -DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap); - /** * idr_alloc_u32() - Allocate an ID. * @idr: IDR handle. @@ -320,6 +318,9 @@ EXPORT_SYMBOL(idr_replace); * free the individual IDs in it. You can use ida_is_empty() to find * out whether the IDA has any IDs currently allocated. * + * The IDA handles its own locking. It is safe to call any of the IDA + * functions without synchronisation in your code. + * * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward * limitation, it should be quite straightforward to raise the maximum. */ @@ -327,151 +328,197 @@ EXPORT_SYMBOL(idr_replace); /* * Developer's notes: * - * The IDA uses the functionality provided by the IDR & radix tree to store - * bitmaps in each entry. The IDR_FREE tag means there is at least one bit - * free, unlike the IDR where it means at least one entry is free. + * The IDA uses the functionality provided by the XArray to store bitmaps in + * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap + * have been set. * - * I considered telling the radix tree that each slot is an order-10 node - * and storing the bit numbers in the radix tree, but the radix tree can't - * allow a single multiorder entry at index 0, which would significantly - * increase memory consumption for the IDA. So instead we divide the index - * by the number of bits in the leaf bitmap before doing a radix tree lookup. + * I considered telling the XArray that each slot is an order-10 node + * and indexing by bit number, but the XArray can't allow a single multi-index + * entry in the head, which would significantly increase memory consumption + * for the IDA. So instead we divide the index by the number of bits in the + * leaf bitmap before doing a radix tree lookup. * * As an optimisation, if there are only a few low bits set in any given * leaf, instead of allocating a 128-byte bitmap, we store the bits - * directly in the entry. - * - * We allow the radix tree 'exceptional' count to get out of date. Nothing - * in the IDA nor the radix tree code checks it. If it becomes important - * to maintain an accurate exceptional count, switch the rcu_assign_pointer() - * calls to radix_tree_iter_replace() which will correct the exceptional - * count. - * - * The IDA always requires a lock to alloc/free. If we add a 'test_bit' + * as a value entry. Value entries never have the XA_FREE_MARK cleared + * because we can always convert them into a bitmap entry. + * + * It would be possible to optimise further; once we've run out of a + * single 128-byte bitmap, we currently switch to a 576-byte node, put + * the 128-byte bitmap in the first entry and then start allocating extra + * 128-byte entries. We could instead use the 512 bytes of the node's + * data as a bitmap before moving to that scheme. I do not believe this + * is a worthwhile optimisation; Rasmus Villemoes surveyed the current + * users of the IDA and almost none of them use more than 1024 entries. + * Those that do use more than the 8192 IDs that the 512 bytes would + * provide. + * + * The IDA always uses a lock to alloc/free. If we add a 'test_bit' * equivalent, it will still need locking. Going to RCU lookup would require * using RCU to free bitmaps, and that's not trivial without embedding an * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte * bitmap, which is excessive. */ -#define IDA_MAX (0x80000000U / IDA_BITMAP_BITS - 1) - -static int ida_get_new_above(struct ida *ida, int start) +/** + * ida_alloc_range() - Allocate an unused ID. + * @ida: IDA handle. + * @min: Lowest ID to allocate. + * @max: Highest ID to allocate. + * @gfp: Memory allocation flags. + * + * Allocate an ID between @min and @max, inclusive. The allocated ID will + * not exceed %INT_MAX, even if @max is larger. + * + * Context: Any context. + * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, + * or %-ENOSPC if there are no free IDs. + */ +int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, + gfp_t gfp) { - struct radix_tree_root *root = &ida->ida_rt; - void __rcu **slot; - struct radix_tree_iter iter; - struct ida_bitmap *bitmap; - unsigned long index; - unsigned bit; - int new; - - index = start / IDA_BITMAP_BITS; - bit = start % IDA_BITMAP_BITS; - - slot = radix_tree_iter_init(&iter, index); - for (;;) { - if (slot) - slot = radix_tree_next_slot(slot, &iter, - RADIX_TREE_ITER_TAGGED); - if (!slot) { - slot = idr_get_free(root, &iter, GFP_NOWAIT, IDA_MAX); - if (IS_ERR(slot)) { - if (slot == ERR_PTR(-ENOMEM)) - return -EAGAIN; - return PTR_ERR(slot); + XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS); + unsigned bit = min % IDA_BITMAP_BITS; + unsigned long flags; + struct ida_bitmap *bitmap, *alloc = NULL; + + if ((int)min < 0) + return -ENOSPC; + + if ((int)max < 0) + max = INT_MAX; + +retry: + xas_lock_irqsave(&xas, flags); +next: + bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK); + if (xas.xa_index > min / IDA_BITMAP_BITS) + bit = 0; + if (xas.xa_index * IDA_BITMAP_BITS + bit > max) + goto nospc; + + if (xa_is_value(bitmap)) { + unsigned long tmp = xa_to_value(bitmap); + + if (bit < BITS_PER_XA_VALUE) { + bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit); + if (xas.xa_index * IDA_BITMAP_BITS + bit > max) + goto nospc; + if (bit < BITS_PER_XA_VALUE) { + tmp |= 1UL << bit; + xas_store(&xas, xa_mk_value(tmp)); + goto out; } } - if (iter.index > index) - bit = 0; - new = iter.index * IDA_BITMAP_BITS; - bitmap = rcu_dereference_raw(*slot); - if (xa_is_value(bitmap)) { - unsigned long tmp = xa_to_value(bitmap); - int vbit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, - bit); - if (vbit < BITS_PER_XA_VALUE) { - tmp |= 1UL << vbit; - rcu_assign_pointer(*slot, xa_mk_value(tmp)); - return new + vbit; - } - bitmap = this_cpu_xchg(ida_bitmap, NULL); - if (!bitmap) - return -EAGAIN; - bitmap->bitmap[0] = tmp; - rcu_assign_pointer(*slot, bitmap); + bitmap = alloc; + if (!bitmap) + bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); + if (!bitmap) + goto alloc; + bitmap->bitmap[0] = tmp; + xas_store(&xas, bitmap); + if (xas_error(&xas)) { + bitmap->bitmap[0] = 0; + goto out; } + } - if (bitmap) { - bit = find_next_zero_bit(bitmap->bitmap, - IDA_BITMAP_BITS, bit); - new += bit; - if (new < 0) - return -ENOSPC; - if (bit == IDA_BITMAP_BITS) - continue; + if (bitmap) { + bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit); + if (xas.xa_index * IDA_BITMAP_BITS + bit > max) + goto nospc; + if (bit == IDA_BITMAP_BITS) + goto next; - __set_bit(bit, bitmap->bitmap); - if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) - radix_tree_iter_tag_clear(root, &iter, - IDR_FREE); + __set_bit(bit, bitmap->bitmap); + if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) + xas_clear_mark(&xas, XA_FREE_MARK); + } else { + if (bit < BITS_PER_XA_VALUE) { + bitmap = xa_mk_value(1UL << bit); } else { - new += bit; - if (new < 0) - return -ENOSPC; - if (bit < BITS_PER_XA_VALUE) { - bitmap = xa_mk_value(1UL << bit); - } else { - bitmap = this_cpu_xchg(ida_bitmap, NULL); - if (!bitmap) - return -EAGAIN; - __set_bit(bit, bitmap->bitmap); - } - radix_tree_iter_replace(root, &iter, slot, bitmap); + bitmap = alloc; + if (!bitmap) + bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); + if (!bitmap) + goto alloc; + __set_bit(bit, bitmap->bitmap); } - - return new; + xas_store(&xas, bitmap); + } +out: + xas_unlock_irqrestore(&xas, flags); + if (xas_nomem(&xas, gfp)) { + xas.xa_index = min / IDA_BITMAP_BITS; + bit = min % IDA_BITMAP_BITS; + goto retry; } + if (bitmap != alloc) + kfree(alloc); + if (xas_error(&xas)) + return xas_error(&xas); + return xas.xa_index * IDA_BITMAP_BITS + bit; +alloc: + xas_unlock_irqrestore(&xas, flags); + alloc = kzalloc(sizeof(*bitmap), gfp); + if (!alloc) + return -ENOMEM; + xas_set(&xas, min / IDA_BITMAP_BITS); + bit = min % IDA_BITMAP_BITS; + goto retry; +nospc: + xas_unlock_irqrestore(&xas, flags); + return -ENOSPC; } +EXPORT_SYMBOL(ida_alloc_range); -static void ida_remove(struct ida *ida, int id) +/** + * ida_free() - Release an allocated ID. + * @ida: IDA handle. + * @id: Previously allocated ID. + * + * Context: Any context. + */ +void ida_free(struct ida *ida, unsigned int id) { - unsigned long index = id / IDA_BITMAP_BITS; - unsigned offset = id % IDA_BITMAP_BITS; + XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS); + unsigned bit = id % IDA_BITMAP_BITS; struct ida_bitmap *bitmap; - unsigned long *btmp; - struct radix_tree_iter iter; - void __rcu **slot; + unsigned long flags; - slot = radix_tree_iter_lookup(&ida->ida_rt, &iter, index); - if (!slot) - goto err; + BUG_ON((int)id < 0); + + xas_lock_irqsave(&xas, flags); + bitmap = xas_load(&xas); - bitmap = rcu_dereference_raw(*slot); if (xa_is_value(bitmap)) { - btmp = (unsigned long *)slot; - offset += 1; /* Intimate knowledge of the value encoding */ - if (offset >= BITS_PER_LONG) + unsigned long v = xa_to_value(bitmap); + if (bit >= BITS_PER_XA_VALUE) + goto err; + if (!(v & (1UL << bit))) goto err; + v &= ~(1UL << bit); + if (!v) + goto delete; + xas_store(&xas, xa_mk_value(v)); } else { - btmp = bitmap->bitmap; - } - if (!test_bit(offset, btmp)) - goto err; - - __clear_bit(offset, btmp); - radix_tree_iter_tag_set(&ida->ida_rt, &iter, IDR_FREE); - if (xa_is_value(bitmap)) { - if (xa_to_value(rcu_dereference_raw(*slot)) == 0) - radix_tree_iter_delete(&ida->ida_rt, &iter, slot); - } else if (bitmap_empty(btmp, IDA_BITMAP_BITS)) { - kfree(bitmap); - radix_tree_iter_delete(&ida->ida_rt, &iter, slot); + if (!test_bit(bit, bitmap->bitmap)) + goto err; + __clear_bit(bit, bitmap->bitmap); + xas_set_mark(&xas, XA_FREE_MARK); + if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { + kfree(bitmap); +delete: + xas_store(&xas, NULL); + } } + xas_unlock_irqrestore(&xas, flags); return; err: + xas_unlock_irqrestore(&xas, flags); WARN(1, "ida_free called for id=%d which is not allocated.\n", id); } +EXPORT_SYMBOL(ida_free); /** * ida_destroy() - Free all IDs. @@ -486,80 +533,60 @@ static void ida_remove(struct ida *ida, int id) */ void ida_destroy(struct ida *ida) { + XA_STATE(xas, &ida->xa, 0); + struct ida_bitmap *bitmap; unsigned long flags; - struct radix_tree_iter iter; - void __rcu **slot; - xa_lock_irqsave(&ida->ida_rt, flags); - radix_tree_for_each_slot(slot, &ida->ida_rt, &iter, 0) { - struct ida_bitmap *bitmap = rcu_dereference_raw(*slot); + xas_lock_irqsave(&xas, flags); + xas_for_each(&xas, bitmap, ULONG_MAX) { if (!xa_is_value(bitmap)) kfree(bitmap); - radix_tree_iter_delete(&ida->ida_rt, &iter, slot); + xas_store(&xas, NULL); } - xa_unlock_irqrestore(&ida->ida_rt, flags); + xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(ida_destroy); -/** - * ida_alloc_range() - Allocate an unused ID. - * @ida: IDA handle. - * @min: Lowest ID to allocate. - * @max: Highest ID to allocate. - * @gfp: Memory allocation flags. - * - * Allocate an ID between @min and @max, inclusive. The allocated ID will - * not exceed %INT_MAX, even if @max is larger. - * - * Context: Any context. - * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, - * or %-ENOSPC if there are no free IDs. - */ -int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, - gfp_t gfp) -{ - int id = 0; - unsigned long flags; - - if ((int)min < 0) - return -ENOSPC; +#ifndef __KERNEL__ +extern void xa_dump_index(unsigned long index, unsigned int shift); +#define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS) - if ((int)max < 0) - max = INT_MAX; - -again: - xa_lock_irqsave(&ida->ida_rt, flags); - id = ida_get_new_above(ida, min); - if (id > (int)max) { - ida_remove(ida, id); - id = -ENOSPC; - } - xa_unlock_irqrestore(&ida->ida_rt, flags); +static void ida_dump_entry(void *entry, unsigned long index) +{ + unsigned long i; + + if (!entry) + return; + + if (xa_is_node(entry)) { + struct xa_node *node = xa_to_node(entry); + unsigned int shift = node->shift + IDA_CHUNK_SHIFT + + XA_CHUNK_SHIFT; + + xa_dump_index(index * IDA_BITMAP_BITS, shift); + xa_dump_node(node); + for (i = 0; i < XA_CHUNK_SIZE; i++) + ida_dump_entry(node->slots[i], + index | (i << node->shift)); + } else if (xa_is_value(entry)) { + xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG)); + pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry); + } else { + struct ida_bitmap *bitmap = entry; - if (unlikely(id == -EAGAIN)) { - if (!ida_pre_get(ida, gfp)) - return -ENOMEM; - goto again; + xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT); + pr_cont("bitmap: %p data", bitmap); + for (i = 0; i < IDA_BITMAP_LONGS; i++) + pr_cont(" %lx", bitmap->bitmap[i]); + pr_cont("\n"); } - - return id; } -EXPORT_SYMBOL(ida_alloc_range); -/** - * ida_free() - Release an allocated ID. - * @ida: IDA handle. - * @id: Previously allocated ID. - * - * Context: Any context. - */ -void ida_free(struct ida *ida, unsigned int id) +static void ida_dump(struct ida *ida) { - unsigned long flags; - - BUG_ON((int)id < 0); - xa_lock_irqsave(&ida->ida_rt, flags); - ida_remove(ida, id); - xa_unlock_irqrestore(&ida->ida_rt, flags); + struct xarray *xa = &ida->xa; + pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head, + xa->xa_flags >> ROOT_TAG_SHIFT); + ida_dump_entry(xa->xa_head, 0); } -EXPORT_SYMBOL(ida_free); +#endif diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 3479d93c32b9..68702061464f 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -255,54 +255,6 @@ static unsigned long next_index(unsigned long index, return (index & ~node_maxindex(node)) + (offset << node->shift); } -#ifndef __KERNEL__ -static void dump_ida_node(void *entry, unsigned long index) -{ - unsigned long i; - - if (!entry) - return; - - if (radix_tree_is_internal_node(entry)) { - struct radix_tree_node *node = entry_to_node(entry); - - pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n", - node, node->offset, index * IDA_BITMAP_BITS, - ((index | node_maxindex(node)) + 1) * - IDA_BITMAP_BITS - 1, - node->parent, node->tags[0][0], node->shift, - node->count); - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) - dump_ida_node(node->slots[i], - index | (i << node->shift)); - } else if (xa_is_value(entry)) { - pr_debug("ida excp: %p offset %d indices %lu-%lu data %lx\n", - entry, (int)(index & RADIX_TREE_MAP_MASK), - index * IDA_BITMAP_BITS, - index * IDA_BITMAP_BITS + BITS_PER_XA_VALUE, - xa_to_value(entry)); - } else { - struct ida_bitmap *bitmap = entry; - - pr_debug("ida btmp: %p offset %d indices %lu-%lu data", bitmap, - (int)(index & RADIX_TREE_MAP_MASK), - index * IDA_BITMAP_BITS, - (index + 1) * IDA_BITMAP_BITS - 1); - for (i = 0; i < IDA_BITMAP_LONGS; i++) - pr_cont(" %lx", bitmap->bitmap[i]); - pr_cont("\n"); - } -} - -static void ida_dump(struct ida *ida) -{ - struct radix_tree_root *root = &ida->ida_rt; - pr_debug("ida: %p node %p free %d\n", ida, root->xa_head, - root->xa_flags >> ROOT_TAG_SHIFT); - dump_ida_node(root->xa_head, 0); -} -#endif - /* * This assumes that the caller has performed appropriate preallocation, and * that the caller has pinned this thread of control to the current CPU. @@ -2039,27 +1991,6 @@ void idr_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(idr_preload); -int ida_pre_get(struct ida *ida, gfp_t gfp) -{ - /* - * The IDA API has no preload_end() equivalent. Instead, - * ida_get_new() can return -EAGAIN, prompting the caller - * to return to the ida_pre_get() step. - */ - if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE)) - preempt_enable(); - - if (!this_cpu_read(ida_bitmap)) { - struct ida_bitmap *bitmap = kzalloc(sizeof(*bitmap), gfp); - if (!bitmap) - return 0; - if (this_cpu_cmpxchg(ida_bitmap, NULL, bitmap)) - kfree(bitmap); - } - - return 1; -} - void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max) @@ -2201,8 +2132,6 @@ static int radix_tree_cpu_dead(unsigned int cpu) kmem_cache_free(radix_tree_node_cachep, node); rtp->nr--; } - kfree(per_cpu(ida_bitmap, cpu)); - per_cpu(ida_bitmap, cpu) = NULL; return 0; } diff --git a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index a5a4494922a6..1b63bdb7688f 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c @@ -402,16 +402,15 @@ void ida_check_nomem(void) */ void ida_check_conv_user(void) { -#if 0 DEFINE_IDA(ida); unsigned long i; - radix_tree_cpu_dead(1); for (i = 0; i < 1000000; i++) { int id = ida_alloc(&ida, GFP_NOWAIT); if (id == -ENOMEM) { - IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) != - BITS_PER_XA_VALUE); + IDA_BUG_ON(&ida, ((i % IDA_BITMAP_BITS) != + BITS_PER_XA_VALUE) && + ((i % IDA_BITMAP_BITS) != 0)); id = ida_alloc(&ida, GFP_KERNEL); } else { IDA_BUG_ON(&ida, (i % IDA_BITMAP_BITS) == @@ -420,7 +419,6 @@ void ida_check_conv_user(void) IDA_BUG_ON(&ida, id != i); } ida_destroy(&ida); -#endif } void ida_check_random(void) -- cgit v1.2.3 From eb797a8ee0ab4cd03df556980ce7bf167cadaa50 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 5 Mar 2018 22:46:03 -0500 Subject: page cache: Rearrange address_space Change i_pages from a radix_tree_root to an xarray, convert the documentation into kernel-doc format and change the order of the elements to pack them better on 64-bit systems. Signed-off-by: Matthew Wilcox --- include/linux/fs.h | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c0b4a1c22ff..d126cad0f621 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -403,24 +403,40 @@ int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); +/** + * struct address_space - Contents of a cacheable, mappable object. + * @host: Owner, either the inode or the block_device. + * @i_pages: Cached pages. + * @gfp_mask: Memory allocation flags to use for allocating pages. + * @i_mmap_writable: Number of VM_SHARED mappings. + * @i_mmap: Tree of private and shared mappings. + * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. + * @nrpages: Number of page entries, protected by the i_pages lock. + * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. + * @writeback_index: Writeback starts here. + * @a_ops: Methods. + * @flags: Error bits and flags (AS_*). + * @wb_err: The most recent error which has occurred. + * @private_lock: For use by the owner of the address_space. + * @private_list: For use by the owner of the address_space. + * @private_data: For use by the owner of the address_space. + */ struct address_space { - struct inode *host; /* owner: inode, block_device */ - struct radix_tree_root i_pages; /* cached pages */ - atomic_t i_mmap_writable;/* count VM_SHARED mappings */ - struct rb_root_cached i_mmap; /* tree of private and shared mappings */ - struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ - /* Protected by the i_pages lock */ - unsigned long nrpages; /* number of total pages */ - /* number of shadow or DAX exceptional entries */ + struct inode *host; + struct xarray i_pages; + gfp_t gfp_mask; + atomic_t i_mmap_writable; + struct rb_root_cached i_mmap; + struct rw_semaphore i_mmap_rwsem; + unsigned long nrpages; unsigned long nrexceptional; - pgoff_t writeback_index;/* writeback starts here */ - const struct address_space_operations *a_ops; /* methods */ - unsigned long flags; /* error bits */ - spinlock_t private_lock; /* for use by the address_space */ - gfp_t gfp_mask; /* implicit gfp mask for allocations */ - struct list_head private_list; /* for use by the address_space */ - void *private_data; /* ditto */ + pgoff_t writeback_index; + const struct address_space_operations *a_ops; + unsigned long flags; errseq_t wb_err; + spinlock_t private_lock; + struct list_head private_list; + void *private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but -- cgit v1.2.3 From 0d3f92966629e536b0c5c2355c1ada8e21c245f6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 21 Nov 2017 14:07:06 -0500 Subject: page cache: Convert hole search to XArray The page cache offers the ability to search for a miss in the previous or next N locations. Rather than teach the XArray about the page cache's definition of a miss, use xas_prev() and xas_next() to search the page array. This should be more efficient as it does not have to start the lookup from the top for each index. Signed-off-by: Matthew Wilcox --- fs/nfs/blocklayout/blocklayout.c | 2 +- include/linux/pagemap.h | 4 +- mm/filemap.c | 110 ++++++++++++++++++--------------------- mm/readahead.c | 4 +- 4 files changed, 55 insertions(+), 65 deletions(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 06cb0c1d9aee..d3781cd983f6 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -896,7 +896,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (end != inode->i_mapping->nrpages) { rcu_read_lock(); - end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); + end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b1bd2186e6d2..cf9ad413eee9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -241,9 +241,9 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x) typedef int filler_t(void *, struct page *); -pgoff_t page_cache_next_hole(struct address_space *mapping, +pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); -pgoff_t page_cache_prev_hole(struct address_space *mapping, +pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); #define FGP_ACCESSED 0x00000001 diff --git a/mm/filemap.c b/mm/filemap.c index 4de14e75c4ec..714d3d0f60f5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1326,86 +1326,76 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * page_cache_next_hole - find the next hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the - * lowest indexed hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'return - index >= - * max_scan' will be true). In rare cases of index wrap-around, 0 will - * be returned. - * - * page_cache_next_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 5, then subsequently a hole is created at - * index 10, page_cache_next_hole covering both indexes may return 10 - * if called under rcu_read_lock. + * page_cache_next_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the + * gap with the lowest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 5, then subsequently a gap is + * created at index 10, page_cache_next_miss covering both indices may + * return 10 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'return - index >= max_scan' will be true). + * In the rare case of index wrap-around, 0 will be returned. */ -pgoff_t page_cache_next_hole(struct address_space *mapping, +pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; + XA_STATE(xas, &mapping->i_pages, index); - for (i = 0; i < max_scan; i++) { - struct page *page; - - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || xa_is_value(page)) + while (max_scan--) { + void *entry = xas_next(&xas); + if (!entry || xa_is_value(entry)) break; - index++; - if (index == 0) + if (xas.xa_index == 0) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_next_hole); +EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_hole - find the prev hole (not-present entry) - * @mapping: mapping - * @index: index - * @max_scan: maximum range to search - * - * Search backwards in the range [max(index-max_scan+1, 0), index] for - * the first hole. - * - * Returns: the index of the hole if found, otherwise returns an index - * outside of the set specified (in which case 'index - return >= - * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX - * will be returned. - * - * page_cache_prev_hole may be called under rcu_read_lock. However, - * like radix_tree_gang_lookup, this will not atomically search a - * snapshot of the tree at a single point in time. For example, if a - * hole is created at index 10, then subsequently a hole is created at - * index 5, page_cache_prev_hole covering both indexes may return 5 if - * called under rcu_read_lock. + * page_cache_prev_miss() - Find the next gap in the page cache. + * @mapping: Mapping. + * @index: Index. + * @max_scan: Maximum range to search. + * + * Search the range [max(index - max_scan + 1, 0), index] for the + * gap with the highest index. + * + * This function may be called under the rcu_read_lock. However, this will + * not atomically search a snapshot of the cache at a single point in time. + * For example, if a gap is created at index 10, then subsequently a gap is + * created at index 5, page_cache_prev_miss() covering both indices may + * return 5 if called under the rcu_read_lock. + * + * Return: The index of the gap if found, otherwise an index outside the + * range specified (in which case 'index - return >= max_scan' will be true). + * In the rare case of wrap-around, ULONG_MAX will be returned. */ -pgoff_t page_cache_prev_hole(struct address_space *mapping, +pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan) { - unsigned long i; - - for (i = 0; i < max_scan; i++) { - struct page *page; + XA_STATE(xas, &mapping->i_pages, index); - page = radix_tree_lookup(&mapping->i_pages, index); - if (!page || xa_is_value(page)) + while (max_scan--) { + void *entry = xas_prev(&xas); + if (!entry || xa_is_value(entry)) break; - index--; - if (index == ULONG_MAX) + if (xas.xa_index == ULONG_MAX) break; } - return index; + return xas.xa_index; } -EXPORT_SYMBOL(page_cache_prev_hole); +EXPORT_SYMBOL(page_cache_prev_miss); /** * find_get_entry - find and get a page cache entry diff --git a/mm/readahead.c b/mm/readahead.c index de657077d41d..fc4dd364b37a 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -336,7 +336,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = page_cache_prev_hole(mapping, offset - 1, max); + head = page_cache_prev_miss(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -425,7 +425,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max_pages); + start = page_cache_next_miss(mapping, offset + 1, max_pages); rcu_read_unlock(); if (!start || start - offset > max_pages) -- cgit v1.2.3 From 74d609585d8bd6083bd9d75bc1fd2c0d3851bcc5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 17 Nov 2017 10:01:45 -0500 Subject: page cache: Add and replace pages using the XArray Use the XArray APIs to add and replace pages in the page cache. This removes two uses of the radix tree preload API and is significantly shorter code. It also removes the last user of __radix_tree_create() outside radix-tree.c itself, so make it static. Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 3 - include/linux/swap.h | 8 ++- lib/radix-tree.c | 6 +- mm/filemap.c | 139 +++++++++++++++++++-------------------------- 4 files changed, 66 insertions(+), 90 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 15388b7e38b9..d55de428a589 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -231,9 +231,6 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -int __radix_tree_create(struct radix_tree_root *, unsigned long index, - unsigned order, struct radix_tree_node **nodep, - void __rcu ***slotp); int __radix_tree_insert(struct radix_tree_root *, unsigned long index, unsigned order, void *); static inline int radix_tree_insert(struct radix_tree_root *root, diff --git a/include/linux/swap.h b/include/linux/swap.h index 8e2c11e692ba..6ea50cf41b4c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -299,8 +299,12 @@ void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -/* Do not use directly, use workingset_lookup_update */ -void workingset_update_node(struct radix_tree_node *node); +/* Only track the nodes of mappings with shadow entries */ +void workingset_update_node(struct xa_node *node); +#define mapping_set_update(xas, mapping) do { \ + if (!dax_mapping(mapping) && !shmem_mapping(mapping)) \ + xas_set_update(xas, workingset_update_node); \ +} while (0) /* Returns workingset_update_node() if the mapping has shadow entries. */ #define workingset_lookup_update(mapping) \ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 68702061464f..8a58051eb5b3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -700,9 +700,9 @@ static bool delete_node(struct radix_tree_root *root, * * Returns -ENOMEM, or 0 for success. */ -int __radix_tree_create(struct radix_tree_root *root, unsigned long index, - unsigned order, struct radix_tree_node **nodep, - void __rcu ***slotp) +static int __radix_tree_create(struct radix_tree_root *root, + unsigned long index, unsigned order, + struct radix_tree_node **nodep, void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; diff --git a/mm/filemap.c b/mm/filemap.c index 714d3d0f60f5..b63caebd1367 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -111,35 +111,6 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->i_pages, page->index, 0, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (!xa_is_value(p)) - return -EEXIST; - - mapping->nrexceptional--; - if (shadowp) - *shadowp = p; - } - __radix_tree_replace(&mapping->i_pages, node, slot, page, - workingset_lookup_update(mapping)); - mapping->nrpages++; - return 0; -} - static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { @@ -775,51 +746,44 @@ EXPORT_SYMBOL(file_write_and_wait_range); * locked. This function does not add the new page to the LRU, the * caller must do that. * - * The remove + add is atomic. The only way this function can fail is - * memory allocation failure. + * The remove + add is atomic. This function cannot fail. */ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) { - int error; + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *) = mapping->a_ops->freepage; + pgoff_t offset = old->index; + XA_STATE(xas, &mapping->i_pages, offset); + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(old), old); VM_BUG_ON_PAGE(!PageLocked(new), new); VM_BUG_ON_PAGE(new->mapping, new); - error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK); - if (!error) { - struct address_space *mapping = old->mapping; - void (*freepage)(struct page *); - unsigned long flags; - - pgoff_t offset = old->index; - freepage = mapping->a_ops->freepage; + get_page(new); + new->mapping = mapping; + new->index = offset; - get_page(new); - new->mapping = mapping; - new->index = offset; + xas_lock_irqsave(&xas, flags); + xas_store(&xas, new); - xa_lock_irqsave(&mapping->i_pages, flags); - __delete_from_page_cache(old, NULL); - error = page_cache_tree_insert(mapping, new, NULL); - BUG_ON(error); - - /* - * hugetlb pages do not participate in page cache accounting. - */ - if (!PageHuge(new)) - __inc_node_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(new)) - __inc_node_page_state(new, NR_SHMEM); - xa_unlock_irqrestore(&mapping->i_pages, flags); - mem_cgroup_migrate(old, new); - radix_tree_preload_end(); - if (freepage) - freepage(old); - put_page(old); - } + old->mapping = NULL; + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(old)) + __dec_node_page_state(new, NR_FILE_PAGES); + if (!PageHuge(new)) + __inc_node_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(old)) + __dec_node_page_state(new, NR_SHMEM); + if (PageSwapBacked(new)) + __inc_node_page_state(new, NR_SHMEM); + xas_unlock_irqrestore(&xas, flags); + mem_cgroup_migrate(old, new); + if (freepage) + freepage(old); + put_page(old); - return error; + return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -828,12 +792,15 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); struct mem_cgroup *memcg; int error; + void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); + mapping_set_update(&xas, mapping); if (!huge) { error = mem_cgroup_try_charge(page, current->mm, @@ -842,39 +809,47 @@ static int __add_to_page_cache_locked(struct page *page, return error; } - error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); - if (error) { - if (!huge) - mem_cgroup_cancel_charge(page, memcg, false); - return error; - } - get_page(page); page->mapping = mapping; page->index = offset; - xa_lock_irq(&mapping->i_pages); - error = page_cache_tree_insert(mapping, page, shadowp); - radix_tree_preload_end(); - if (unlikely(error)) - goto err_insert; + do { + xas_lock_irq(&xas); + old = xas_load(&xas); + if (old && !xa_is_value(old)) + xas_set_err(&xas, -EEXIST); + xas_store(&xas, page); + if (xas_error(&xas)) + goto unlock; + + if (xa_is_value(old)) { + mapping->nrexceptional--; + if (shadowp) + *shadowp = old; + } + mapping->nrpages++; + + /* hugetlb pages do not participate in page cache accounting */ + if (!huge) + __inc_node_page_state(page, NR_FILE_PAGES); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); + + if (xas_error(&xas)) + goto error; - /* hugetlb pages do not participate in page cache accounting. */ - if (!huge) - __inc_node_page_state(page, NR_FILE_PAGES); - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; -err_insert: +error: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return error; + return xas_error(&xas); } /** -- cgit v1.2.3 From 5c024e6a4ebc1740db9f0f075aaa476210108a97 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 21 Nov 2017 09:17:59 -0500 Subject: page cache: Convert page deletion to XArray The code is slightly shorter and simpler. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index b63caebd1367..414efbdc95df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -111,31 +111,26 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ -static void page_cache_tree_delete(struct address_space *mapping, +static void page_cache_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr; + XA_STATE(xas, &mapping->i_pages, page->index); + unsigned int nr = 1; - /* hugetlb pages are represented by one entry in the radix tree */ - nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + mapping_set_update(&xas, mapping); + + /* hugetlb pages are represented by a single entry in the xarray */ + if (!PageHuge(page)) { + xas_set_order(&xas, page->index, compound_order(page)); + nr = 1U << compound_order(page); + } VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(nr != 1 && shadow, page); - for (i = 0; i < nr; i++) { - struct radix_tree_node *node; - void **slot; - - __radix_tree_lookup(&mapping->i_pages, page->index + i, - &node, &slot); - - VM_BUG_ON_PAGE(!node && nr != 1, page); - - radix_tree_clear_tags(&mapping->i_pages, node, slot); - __radix_tree_replace(&mapping->i_pages, node, slot, shadow, - workingset_lookup_update(mapping)); - } + xas_store(&xas, shadow); + xas_init_marks(&xas); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ @@ -234,7 +229,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) trace_mm_filemap_delete_from_page_cache(page); unaccount_page_cache_page(mapping, page); - page_cache_tree_delete(mapping, page, shadow); + page_cache_delete(mapping, page, shadow); } static void page_cache_free_page(struct address_space *mapping, -- cgit v1.2.3 From 4c7472c0df2f889df417a37571e622e02b5058fe Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 16:12:50 -0400 Subject: page cache: Convert find_get_entry to XArray Slightly shorter and simpler code. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 63 +++++++++++++++++++++++++++--------------------------------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 414efbdc95df..2bf9f0742082 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1382,47 +1382,40 @@ EXPORT_SYMBOL(page_cache_prev_miss); */ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { - void **pagep; + XA_STATE(xas, &mapping->i_pages, offset); struct page *head, *page; rcu_read_lock(); repeat: - page = NULL; - pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); - if (pagep) { - page = radix_tree_deref_slot(pagep); - if (unlikely(!page)) - goto out; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto repeat; - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. - */ - goto out; - } + xas_reset(&xas); + page = xas_load(&xas); + if (xas_retry(&xas, page)) + goto repeat; + /* + * A shadow entry of a recently evicted page, or a swap entry from + * shmem/tmpfs. Return it without attempting to raise page count. + */ + if (!page || xa_is_value(page)) + goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto repeat; + head = compound_head(page); + if (!page_cache_get_speculative(head)) + goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + /* The page was split under us? */ + if (compound_head(page) != head) { + put_page(head); + goto repeat; + } - /* - * Has the page moved? - * This is part of the lockless pagecache protocol. See - * include/linux/pagemap.h for details. - */ - if (unlikely(page != *pagep)) { - put_page(head); - goto repeat; - } + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != xas_reload(&xas))) { + put_page(head); + goto repeat; } out: rcu_read_unlock(); @@ -1453,7 +1446,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_entry(mapping, offset); - if (page && !radix_tree_exception(page)) { + if (page && !xa_is_value(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page_mapping(page) != mapping)) { -- cgit v1.2.3 From f280bf092d48ff2db12b4a01127cd855c9e0ffb0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 17:20:45 -0400 Subject: page cache: Convert find_get_entries to XArray Slightly shorter and simpler code. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 2bf9f0742082..4707156b9fbd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1578,53 +1578,48 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, ULONG_MAX) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; -- cgit v1.2.3 From fd1b3cee2a867868d39bb8cbcc4b00c36d07cc01 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 17:38:56 -0400 Subject: page cache: Convert find_get_pages_range to XArray The 'end' parameter of the xas_for_each iterator avoids a useless iteration at the end of the range. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 52 +++++++++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 4707156b9fbd..b72c39fe61c2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1650,64 +1650,50 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *start); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each(&xas, page, end) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Skip - * over it. - */ + /* Skip over shadow, swap and DAX entries */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *start = pages[ret - 1]->index + 1; + *start = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* * We come here when there is no page beyond @end. We take care to not * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is page at index -1 but that is + * breaks the iteration when there is a page at index -1 but that is * already broken anyway. */ if (end == (pgoff_t)-1) -- cgit v1.2.3 From 3ece58a270cd1e5026282abe778bd50db7a11d08 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 18:00:33 -0400 Subject: page cache: Convert find_get_pages_contig to XArray There's no direct replacement for radix_tree_for_each_contig() in the XArray API as it's an unusual thing to do. Instead, open-code a loop using xas_next(). This removes the only user of radix_tree_for_each_contig() so delete the iterator from the API and the test suite code for it. Signed-off-by: Matthew Wilcox --- .clang-format | 1 - include/linux/radix-tree.h | 17 ----------- mm/filemap.c | 53 ++++++++++++++-------------------- tools/testing/radix-tree/regression3.c | 23 --------------- 4 files changed, 22 insertions(+), 72 deletions(-) diff --git a/.clang-format b/.clang-format index 1d5da22e0ba5..e6080f5834a3 100644 --- a/.clang-format +++ b/.clang-format @@ -323,7 +323,6 @@ ForEachMacros: - 'protocol_for_each_card' - 'protocol_for_each_dev' - 'queue_for_each_hw_ctx' - - 'radix_tree_for_each_contig' - 'radix_tree_for_each_slot' - 'radix_tree_for_each_tagged' - 'rbtree_postorder_for_each_entry_safe' diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index d55de428a589..023e888e1163 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -522,23 +522,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, slot || (slot = radix_tree_next_chunk(root, iter, 0)) ; \ slot = radix_tree_next_slot(slot, iter, 0)) -/** - * radix_tree_for_each_contig - iterate over contiguous slots - * - * @slot: the void** variable for pointer to slot - * @root: the struct radix_tree_root pointer - * @iter: the struct radix_tree_iter pointer - * @start: iteration starting index - * - * @slot points to radix tree slot, @iter->index contains its index. - */ -#define radix_tree_for_each_contig(slot, root, iter, start) \ - for (slot = radix_tree_iter_init(iter, start) ; \ - slot || (slot = radix_tree_next_chunk(root, iter, \ - RADIX_TREE_ITER_CONTIG)) ; \ - slot = radix_tree_next_slot(slot, iter, \ - RADIX_TREE_ITER_CONTIG)) - /** * radix_tree_for_each_tagged - iterate over tagged slots * diff --git a/mm/filemap.c b/mm/filemap.c index b72c39fe61c2..089b67598100 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1721,57 +1721,43 @@ out: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); + struct page *page; unsigned int ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - /* The hole, there no reason to continue */ - if (unlikely(!page)) - break; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Stop - * looking for contiguous pages. - */ + for (page = xas_load(&xas); page; page = xas_next(&xas)) { + struct page *head; + if (xas_retry(&xas, page)) + continue; + /* + * If the entry has been swapped out, we can stop looking. + * No current caller is looking for DAX entries. + */ + if (xa_is_value(page)) break; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; /* * must check mapping and index after taking the ref. * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { + if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { put_page(page); break; } @@ -1779,6 +1765,11 @@ repeat: pages[ret] = page; if (++ret == nr_pages) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c index ace2543c3eda..9f9a3b280f56 100644 --- a/tools/testing/radix-tree/regression3.c +++ b/tools/testing/radix-tree/regression3.c @@ -69,21 +69,6 @@ void regression3_test(void) continue; } } - radix_tree_delete(&root, 1); - - first = true; - radix_tree_for_each_contig(slot, &root, &iter, 0) { - printv(2, "contig %ld %p\n", iter.index, *slot); - if (first) { - radix_tree_insert(&root, 1, ptr); - first = false; - } - if (radix_tree_deref_retry(*slot)) { - printv(2, "retry at %ld\n", iter.index); - slot = radix_tree_iter_retry(&iter); - continue; - } - } radix_tree_for_each_slot(slot, &root, &iter, 0) { printv(2, "slot %ld %p\n", iter.index, *slot); @@ -93,14 +78,6 @@ void regression3_test(void) } } - radix_tree_for_each_contig(slot, &root, &iter, 0) { - printv(2, "contig %ld %p\n", iter.index, *slot); - if (!iter.index) { - printv(2, "next at %ld\n", iter.index); - slot = radix_tree_iter_resume(slot, &iter); - } - } - radix_tree_tag_set(&root, 0, 0); radix_tree_tag_set(&root, 1, 0); radix_tree_for_each_tagged(slot, &root, &iter, 0, 0) { -- cgit v1.2.3 From a6906972fe67fb6f73ba04088f7897227fd1cd8f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 18:12:54 -0400 Subject: page cache; Convert find_get_pages_range_tag to XArray The 'end' parameter of the xas_for_each iterator avoids a useless iteration at the end of the range. Signed-off-by: Matthew Wilcox --- include/linux/pagemap.h | 4 +-- mm/filemap.c | 68 +++++++++++++++++++------------------------------ 2 files changed, 28 insertions(+), 44 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cf9ad413eee9..c9bbb9a05764 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -363,10 +363,10 @@ static inline unsigned find_get_pages(struct address_space *mapping, unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); static inline unsigned find_get_pages_tag(struct address_space *mapping, - pgoff_t *index, int tag, unsigned int nr_pages, + pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, diff --git a/mm/filemap.c b/mm/filemap.c index 089b67598100..0df28aa6411c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1789,74 +1789,58 @@ EXPORT_SYMBOL(find_get_pages_contig); * @tag. We update @index to index the next page for the traversal. */ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, - pgoff_t end, int tag, unsigned int nr_pages, + pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, *index); + struct page *page; unsigned ret = 0; if (unlikely(!nr_pages)) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { - struct page *head, *page; - - if (iter.index > end) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, end, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - /* - * A shadow entry of a recently evicted page. - * - * Those entries should never be tagged, but - * this tree walk is lockless and the tags are - * looked up in bulk, one radix tree node at a - * time, so there is a sizable window for page - * reclaim to evict a page we saw tagged. - * - * Skip over it. - */ + /* + * Shadow entries should never be tagged, but this iteration + * is lockless so there is a window for page reclaim to evict + * a page we saw tagged. Skip over it. + */ + if (xa_is_value(page)) continue; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; if (++ret == nr_pages) { - *index = pages[ret - 1]->index + 1; + *index = page->index + 1; goto out; } + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } /* - * We come here when we got at @end. We take care to not overflow the + * We come here when we got to @end. We take care to not overflow the * index @index as it confuses some of the callers. This breaks the - * iteration when there is page at index -1 but that is already broken - * anyway. + * iteration when there is a page at index -1 but that is already + * broken anyway. */ if (end == (pgoff_t)-1) *index = (pgoff_t)-1; -- cgit v1.2.3 From c1901cd33cf407d77a181f8dd4ffff98041ef480 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 16 May 2018 23:56:04 -0400 Subject: page cache: Convert find_get_entries_tag to XArray Slightly shorter and simpler code. Signed-off-by: Matthew Wilcox --- include/linux/pagemap.h | 2 +- mm/filemap.c | 54 ++++++++++++++++++++++--------------------------- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c9bbb9a05764..226f96f0dee0 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -373,7 +373,7 @@ static inline unsigned find_get_pages_tag(struct address_space *mapping, nr_pages, pages); } unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 0df28aa6411c..35ae011971e1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1866,57 +1866,51 @@ EXPORT_SYMBOL(find_get_pages_range_tag); * @tag. */ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - int tag, unsigned int nr_entries, + xa_mark_t tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices) { - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + struct page *page; unsigned int ret = 0; - struct radix_tree_iter iter; if (!nr_entries) return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { - struct page *head, *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) + xas_for_each_marked(&xas, page, ULONG_MAX, tag) { + struct page *head; + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + if (xa_is_value(page)) goto export; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto retry; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto put_page; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; + export: - indices[ret] = iter.index; + indices[ret] = xas.xa_index; entries[ret] = page; if (++ret == nr_entries) break; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); } rcu_read_unlock(); return ret; -- cgit v1.2.3 From 070e807c690bf9a648d4a878f3c68ea9f5f5ce14 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 May 2018 00:08:30 -0400 Subject: page cache: Convert filemap_map_pages to XArray Slight change of strategy here; if we have trouble getting hold of a page for whatever reason (eg a compound page is split underneath us), don't spin to stabilise the page, just continue the iteration, like we would if we failed to trylock the page. Since this is a speculative optimisation, it feels like we should allow the process to take an extra fault if it turns out to need this page instead of spending time to pin down a page it may not need. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 35ae011971e1..ae1fcaa24f97 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2516,45 +2516,31 @@ EXPORT_SYMBOL(filemap_fault); void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { - struct radix_tree_iter iter; - void **slot; struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; + XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { - if (iter.index > end_pgoff) - break; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - goto next; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } + xas_for_each(&xas, page, end_pgoff) { + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) goto next; - } head = compound_head(page); if (!page_cache_get_speculative(head)) - goto repeat; + goto next; /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } + if (compound_head(page) != head) + goto skip; /* Has the page moved? */ - if (unlikely(page != *slot)) { - put_page(head); - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto skip; if (!PageUptodate(page) || PageReadahead(page) || @@ -2573,10 +2559,10 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; if (vmf->pte) - vmf->pte += iter.index - last_pgoff; - last_pgoff = iter.index; + vmf->pte += xas.xa_index - last_pgoff; + last_pgoff = xas.xa_index; if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); @@ -2589,8 +2575,6 @@ next: /* Huge page is mapped? No need to proceed. */ if (pmd_trans_huge(*vmf->pmd)) break; - if (iter.index == end_pgoff) - break; } rcu_read_unlock(); } -- cgit v1.2.3 From a332125fc3223e1092f765db442b7afb9fd4ecde Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 May 2018 00:13:27 -0400 Subject: radix tree test suite: Convert regression1 to XArray Now the page cache lookup is using the XArray, let's convert this regression test from the radix tree API to the XArray so it's testing roughly the same thing it was testing before. Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/regression1.c | 58 +++++++++++----------------------- 1 file changed, 19 insertions(+), 39 deletions(-) diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c index 0aece092f40e..b4a4a7168986 100644 --- a/tools/testing/radix-tree/regression1.c +++ b/tools/testing/radix-tree/regression1.c @@ -53,12 +53,12 @@ struct page { unsigned long index; }; -static struct page *page_alloc(void) +static struct page *page_alloc(int index) { struct page *p; p = malloc(sizeof(struct page)); p->count = 1; - p->index = 1; + p->index = index; pthread_mutex_init(&p->lock, NULL); return p; @@ -80,53 +80,33 @@ static void page_free(struct page *p) static unsigned find_get_pages(unsigned long start, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + XA_STATE(xas, &mt_tree, start); + struct page *page; + unsigned int ret = 0; rcu_read_lock(); -restart: - nr_found = radix_tree_gang_lookup_slot(&mt_tree, - (void ***)pages, NULL, start, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { - struct page *page; -repeat: - page = radix_tree_deref_slot((void **)pages[i]); - if (unlikely(!page)) + xas_for_each(&xas, page, ULONG_MAX) { + if (xas_retry(&xas, page)) continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - assert((start | i) == 0); - goto restart; - } - /* - * No exceptional entries are inserted in this test. - */ - assert(0); - } - pthread_mutex_lock(&page->lock); - if (!page->count) { - pthread_mutex_unlock(&page->lock); - goto repeat; - } + if (!page->count) + goto unlock; + /* don't actually update page refcount */ pthread_mutex_unlock(&page->lock); /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { - goto repeat; - } + if (unlikely(page != xas_reload(&xas))) + goto put_page; pages[ret] = page; ret++; + continue; +unlock: + pthread_mutex_unlock(&page->lock); +put_page: + xas_reset(&xas); } rcu_read_unlock(); return ret; @@ -145,12 +125,12 @@ static void *regression1_fn(void *arg) for (j = 0; j < 1000000; j++) { struct page *p; - p = page_alloc(); + p = page_alloc(0); pthread_mutex_lock(&mt_lock); radix_tree_insert(&mt_tree, 0, p); pthread_mutex_unlock(&mt_lock); - p = page_alloc(); + p = page_alloc(1); pthread_mutex_lock(&mt_lock); radix_tree_insert(&mt_tree, 1, p); pthread_mutex_unlock(&mt_lock); -- cgit v1.2.3 From ef8e5717db01bfa4f425d2cda551a6fccc85529d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 03:59:45 -0500 Subject: page cache: Convert delete_batch to XArray Rename the function from page_cache_tree_delete_batch to just page_cache_delete_batch. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ae1fcaa24f97..f7f9af1d98b0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -272,7 +272,7 @@ void delete_from_page_cache(struct page *page) EXPORT_SYMBOL(delete_from_page_cache); /* - * page_cache_tree_delete_batch - delete several pages from page cache + * page_cache_delete_batch - delete several pages from page cache * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * @@ -285,23 +285,18 @@ EXPORT_SYMBOL(delete_from_page_cache); * * The function expects the i_pages lock to be held. */ -static void -page_cache_tree_delete_batch(struct address_space *mapping, +static void page_cache_delete_batch(struct address_space *mapping, struct pagevec *pvec) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; int i = 0, tail_pages = 0; struct page *page; - pgoff_t start; - start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + mapping_set_update(&xas, mapping); + xas_for_each(&xas, page, ULONG_MAX) { if (i >= pagevec_count(pvec) && !tail_pages) break; - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); if (xa_is_value(page)) continue; if (!tail_pages) { @@ -310,8 +305,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, * have our pages locked so they are protected from * being removed. */ - if (page != pvec->pages[i]) + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > + pvec->pages[i]->index, page); continue; + } WARN_ON_ONCE(!PageLocked(page)); if (PageTransHuge(page) && !PageHuge(page)) tail_pages = HPAGE_PMD_NR - 1; @@ -322,11 +320,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, */ i++; } else { + VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages + != pvec->pages[i]->index, page); tail_pages--; } - radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); - __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, - workingset_lookup_update(mapping)); + xas_store(&xas, NULL); total_pages++; } mapping->nrpages -= total_pages; @@ -347,7 +345,7 @@ void delete_from_page_cache_batch(struct address_space *mapping, unaccount_page_cache_page(mapping, pvec->pages[i]); } - page_cache_tree_delete_batch(mapping, pvec); + page_cache_delete_batch(mapping, pvec); xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) -- cgit v1.2.3 From 22ecdb4f8b7dbfefa3deaf6b144093b5c4cd2aff Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 04:02:00 -0500 Subject: page cache: Remove stray radix comment Signed-off-by: Matthew Wilcox --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index f7f9af1d98b0..bedd8680f789 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2682,7 +2682,7 @@ repeat: put_page(page); if (err == -EEXIST) goto repeat; - /* Presumably ENOMEM for radix tree node */ + /* Presumably ENOMEM for xarray node */ return ERR_PTR(err); } -- cgit v1.2.3 From 8fa8e538e4be359e9042573737632dda35a8700d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 Jan 2018 06:26:49 -0500 Subject: page cache: Convert filemap_range_has_page to XArray Instead of calling find_get_pages_range() and putting any reference, use xas_find() to iterate over any entries in the range, skipping the shadow/swap entries. Signed-off-by: Matthew Wilcox --- mm/filemap.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index bedd8680f789..6b36516bc31d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -455,20 +455,31 @@ EXPORT_SYMBOL(filemap_flush); bool filemap_range_has_page(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - pgoff_t index = start_byte >> PAGE_SHIFT; - pgoff_t end = end_byte >> PAGE_SHIFT; struct page *page; + XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); + pgoff_t max = end_byte >> PAGE_SHIFT; if (end_byte < start_byte) return false; - if (mapping->nrpages == 0) - return false; + rcu_read_lock(); + for (;;) { + page = xas_find(&xas, max); + if (xas_retry(&xas, page)) + continue; + /* Shadow entries don't count */ + if (xa_is_value(page)) + continue; + /* + * We don't need to try to pin this page; we're about to + * release the RCU lock anyway. It is enough to know that + * there was a page here recently. + */ + break; + } + rcu_read_unlock(); - if (!find_get_pages_range(mapping, &index, end, 1, &page)) - return false; - put_page(page); - return true; + return page != NULL; } EXPORT_SYMBOL(filemap_range_has_page); -- cgit v1.2.3 From ff9c745b81ff1e482167fd73558450e66ad43a33 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 22 Nov 2017 11:41:23 -0500 Subject: mm: Convert page-writeback to XArray Includes moving mapping_tagged() to fs.h as a static inline, and changing it to return bool. Signed-off-by: Matthew Wilcox --- include/linux/fs.h | 17 +++++++------ mm/page-writeback.c | 72 +++++++++++++++++++---------------------------------- 2 files changed, 36 insertions(+), 53 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index d126cad0f621..e10278e4db66 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -483,15 +483,18 @@ struct block_device { struct mutex bd_fsfreeze_mutex; } __randomize_layout; +/* XArray tags, for tagging dirty and writeback pages in the pagecache. */ +#define PAGECACHE_TAG_DIRTY XA_MARK_0 +#define PAGECACHE_TAG_WRITEBACK XA_MARK_1 +#define PAGECACHE_TAG_TOWRITE XA_MARK_2 + /* - * Radix-tree tags, for tagging dirty and writeback pages within the pagecache - * radix trees + * Returns true if any of the pages in the mapping are marked with the tag. */ -#define PAGECACHE_TAG_DIRTY 0 -#define PAGECACHE_TAG_WRITEBACK 1 -#define PAGECACHE_TAG_TOWRITE 2 - -int mapping_tagged(struct address_space *mapping, int tag); +static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) +{ + return xa_marked(&mapping->i_pages, tag); +} static inline void i_mmap_lock_write(struct address_space *mapping) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 84ae9bf5858a..fc6e5743b0bf 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2097,34 +2097,25 @@ void __init page_writeback_init(void) * dirty pages in the file (thus it is important for this function to be quick * so that it can tag pages faster than a dirtying process can create them). */ -/* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock - * latency. - */ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { -#define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); + unsigned int tagged = 0; + void *page; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, - PAGECACHE_TAG_DIRTY) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(&mapping->i_pages, &iter, - PAGECACHE_TAG_TOWRITE); - tagged++; - if ((tagged % WRITEBACK_TAG_BATCH) != 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) continue; - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + + xas_pause(&xas); + xas_unlock_irq(&xas); cond_resched(); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2164,7 +2155,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@ -2445,7 +2436,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, /* * For address_spaces which do not use buffers. Just tag the page as dirty in - * its radix tree. + * the xarray. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" @@ -2471,7 +2462,7 @@ int __set_page_dirty_nobuffers(struct page *page) BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, page_index(page), + __xa_set_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); @@ -2634,13 +2625,13 @@ EXPORT_SYMBOL(__cancel_dirty_page); * Returns true if the page was previously dirty. * * This is for preparing to put the page under writeout. We leave the page - * tagged as dirty in the radix tree so that a concurrent write-for-sync + * tagged as dirty in the xarray so that a concurrent write-for-sync * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage * implementation will run either set_page_writeback() or set_page_dirty(), - * at which stage we bring the page's dirty flag and radix-tree dirty tag + * at which stage we bring the page's dirty flag and xarray dirty tag * back into sync. * - * This incoherency between the page's dirty flag and radix-tree tag is + * This incoherency between the page's dirty flag and xarray tag is * unfortunate, but it only exists while the page is locked. */ int clear_page_dirty_for_io(struct page *page) @@ -2721,7 +2712,7 @@ int test_clear_page_writeback(struct page *page) xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2761,11 +2752,13 @@ int __test_set_page_writeback(struct page *page, bool keep_write) lock_page_memcg(page); if (mapping && mapping_use_writeback_tags(mapping)) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - xa_lock_irqsave(&mapping->i_pages, flags); + xas_lock_irqsave(&xas, flags); + xas_load(&xas); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2773,8 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_WRITEBACK); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2787,12 +2779,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_TOWRITE); - xa_unlock_irqrestore(&mapping->i_pages, flags); + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irqrestore(&xas, flags); } else { ret = TestSetPageWriteback(page); } @@ -2806,16 +2796,6 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); -/* - * Return true if any of the pages in the mapping are marked with the - * passed tag. - */ -int mapping_tagged(struct address_space *mapping, int tag) -{ - return radix_tree_tagged(&mapping->i_pages, tag); -} -EXPORT_SYMBOL(mapping_tagged); - /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. -- cgit v1.2.3 From a97e7904c0806309fd77103005bb7820c3f1c5e4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Nov 2017 14:24:59 -0500 Subject: mm: Convert workingset to XArray We construct an XA_STATE and use it to delete the node with xas_store() rather than adding a special function for this unique use case. Includes a test that simulates this usage for the test suite. Signed-off-by: Matthew Wilcox --- include/linux/swap.h | 9 -------- lib/test_xarray.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/workingset.c | 51 +++++++++++++++++------------------------ 3 files changed, 86 insertions(+), 39 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6ea50cf41b4c..112cebcf0402 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -306,15 +306,6 @@ void workingset_update_node(struct xa_node *node); xas_set_update(xas, workingset_update_node); \ } while (0) -/* Returns workingset_update_node() if the mapping has shadow entries. */ -#define workingset_lookup_update(mapping) \ -({ \ - radix_tree_update_node_t __helper = workingset_update_node; \ - if (dax_mapping(mapping) || shmem_mapping(mapping)) \ - __helper = NULL; \ - __helper; \ -}) - /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; diff --git a/lib/test_xarray.c b/lib/test_xarray.c index a752e6a37e6f..128c6489082f 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -863,6 +863,67 @@ static noinline void check_create_range(struct xarray *xa) check_create_range_3(); } +static LIST_HEAD(shadow_nodes); + +static void test_update_node(struct xa_node *node) +{ + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) + list_add(&shadow_nodes, &node->private_list); + } else { + if (!list_empty(&node->private_list)) + list_del_init(&node->private_list); + } +} + +static noinline void shadow_remove(struct xarray *xa) +{ + struct xa_node *node; + + xa_lock(xa); + while ((node = list_first_entry_or_null(&shadow_nodes, + struct xa_node, private_list))) { + XA_STATE(xas, node->array, 0); + XA_BUG_ON(xa, node->array != xa); + list_del_init(&node->private_list); + xas.xa_node = xa_parent_locked(node->array, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, test_update_node); + xas_store(&xas, NULL); + } + xa_unlock(xa); +} + +static noinline void check_workingset(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + xas_set_update(&xas, test_update_node); + + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(0)); + xas_next(&xas); + xas_store(&xas, xa_mk_value(1)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + XA_BUG_ON(xa, list_empty(&shadow_nodes)); + + xas_lock(&xas); + xas_next(&xas); + xas_store(&xas, &xas); + XA_BUG_ON(xa, !list_empty(&shadow_nodes)); + + xas_store(&xas, xa_mk_value(2)); + xas_unlock(&xas); + XA_BUG_ON(xa, list_empty(&shadow_nodes)); + + shadow_remove(xa); + XA_BUG_ON(xa, !list_empty(&shadow_nodes)); + XA_BUG_ON(xa, !xa_empty(xa)); +} + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -916,6 +977,10 @@ static int xarray_checks(void) check_create_range(&array); check_store_iter(&array); + check_workingset(&array, 0); + check_workingset(&array, 64); + check_workingset(&array, 4096); + printk("XArray: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run == tests_passed) ? 0 : -EINVAL; } diff --git a/mm/workingset.c b/mm/workingset.c index c201cbb8c00f..5cfb29ec3fd9 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -148,7 +148,7 @@ * and activations is maintained (node->inactive_age). * * On eviction, a snapshot of this counter (along with some bits to - * identify the node) is stored in the now empty page cache radix tree + * identify the node) is stored in the now empty page cache * slot of the evicted page. This is called a shadow entry. * * On cache misses for which there are shadow entries, an eligible @@ -162,7 +162,7 @@ /* * Eviction timestamps need to be able to cover the full range of - * actionable refaults. However, bits are tight in the radix tree + * actionable refaults. However, bits are tight in the xarray * entry, and after storing the identifier for the lruvec there might * not be enough left to represent every single actionable refault. In * that case, we have to sacrifice granularity for distance, and group @@ -339,7 +339,7 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node) +void workingset_update_node(struct xa_node *node) { /* * Track non-empty nodes that contain only shadow entries; @@ -368,7 +368,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, nodes = list_lru_shrink_count(&shadow_nodes, sc); /* - * Approximate a reasonable limit for the radix tree nodes + * Approximate a reasonable limit for the nodes * containing shadow entries. We don't need to keep more * shadow entries than possible pages on the active list, * since refault distances bigger than that are dismissed. @@ -383,11 +383,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * worst-case density of 1/8th. Below that, not all eligible * refaults can be detected anymore. * - * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * On 64-bit with 7 xa_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE + * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ if (sc->memcg) { cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, @@ -396,7 +396,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); } - max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); + max_nodes = cache >> (XA_CHUNK_SHIFT - 3); if (!nodes) return SHRINK_EMPTY; @@ -409,11 +409,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, - void *arg) + void *arg) __must_hold(lru_lock) { + struct xa_node *node = container_of(item, struct xa_node, private_list); + XA_STATE(xas, node->array, 0); struct address_space *mapping; - struct radix_tree_node *node; - unsigned int i; int ret; /* @@ -421,14 +421,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any - * address_space that has radix tree nodes on the LRU. + * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock. */ - node = container_of(item, struct xa_node, private_list); mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ @@ -450,25 +449,17 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; if (WARN_ON_ONCE(node->count != node->nr_values)) goto out_invalid; - for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[i]) { - if (WARN_ON_ONCE(!xa_is_value(node->slots[i]))) - goto out_invalid; - if (WARN_ON_ONCE(!node->nr_values)) - goto out_invalid; - if (WARN_ON_ONCE(!mapping->nrexceptional)) - goto out_invalid; - node->slots[i] = NULL; - node->nr_values--; - node->count--; - mapping->nrexceptional--; - } - } - if (WARN_ON_ONCE(node->nr_values)) - goto out_invalid; + mapping->nrexceptional -= node->nr_values; + xas.xa_node = xa_parent_locked(&mapping->i_pages, node); + xas.xa_offset = node->offset; + xas.xa_shift = node->shift + XA_CHUNK_SHIFT; + xas_set_update(&xas, workingset_update_node); + /* + * We could store a shadow entry here which was the minimum of the + * shadow entries we were tracking ... + */ + xas_store(&xas, NULL); inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->i_pages, node, - workingset_lookup_update(mapping)); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3 From 69b6c1319b6588b65d853055d5336f662f6cb1a2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 25 Nov 2017 22:52:46 -0500 Subject: mm: Convert truncate to XArray This is essentially xa_cmpxchg() with the locking handled above us, and it doesn't have to handle replacing a NULL entry. Signed-off-by: Matthew Wilcox --- mm/truncate.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index ed778555c9f3..45d68e90b703 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -33,15 +33,12 @@ static inline void __clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - struct radix_tree_node *node; - void **slot; + XA_STATE(xas, &mapping->i_pages, index); - if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) + xas_set_update(&xas, workingset_update_node); + if (xas_load(&xas) != entry) return; - if (*slot != entry) - return; - __radix_tree_replace(&mapping->i_pages, node, slot, NULL, - workingset_update_node); + xas_store(&xas, NULL); mapping->nrexceptional--; } @@ -738,10 +735,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping, index++; } /* - * For DAX we invalidate page tables after invalidating radix tree. We + * For DAX we invalidate page tables after invalidating page cache. We * could invalidate page tables while invalidating each entry however * that would be expensive. And doing range unmapping before doesn't - * work as we have no cheap way to find whether radix tree entry didn't + * work as we have no cheap way to find whether page cache entry didn't * get remapped later. */ if (dax_mapping(mapping)) { -- cgit v1.2.3 From 8d93b41c09d1b74f376e3b9779eb3badeb550301 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 27 Nov 2017 15:46:54 -0500 Subject: mm: Convert add_to_swap_cache to XArray Combine __add_to_swap_cache and add_to_swap_cache into one function since there is no more need to preload. Signed-off-by: Matthew Wilcox --- mm/swap_state.c | 93 ++++++++++++++++++--------------------------------------- 1 file changed, 29 insertions(+), 64 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index ecee9c6c4cc1..850314c2c3ab 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -107,14 +107,15 @@ void show_swap_cache_info(void) } /* - * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int __add_to_swap_cache(struct page *page, swp_entry_t entry) +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) { - int error, i, nr = hpage_nr_pages(page); - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); + unsigned long i, nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); @@ -123,50 +124,30 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) page_ref_add(page, nr); SetPageSwapCache(page); - address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - for (i = 0; i < nr; i++) { - set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->i_pages, - idx + i, page + i); - if (unlikely(error)) - break; - } - if (likely(!error)) { + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; + for (i = 0; i < nr; i++) { + VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + set_page_private(page + i, entry.val + i); + xas_store(&xas, page + i); + xas_next(&xas); + } address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); ADD_CACHE_INFO(add_total, nr); - } else { - /* - * Only the context which have set SWAP_HAS_CACHE flag - * would call add_to_swap_cache(). - * So add_to_swap_cache() doesn't returns -EEXIST. - */ - VM_BUG_ON(error == -EEXIST); - set_page_private(page + i, 0UL); - while (i--) { - radix_tree_delete(&address_space->i_pages, idx + i); - set_page_private(page + i, 0UL); - } - ClearPageSwapCache(page); - page_ref_sub(page, nr); - } - xa_unlock_irq(&address_space->i_pages); +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); - return error; -} - - -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) -{ - int error; + if (!xas_error(&xas)) + return 0; - error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); - if (!error) { - error = __add_to_swap_cache(page, entry); - radix_tree_preload_end(); - } - return error; + ClearPageSwapCache(page); + page_ref_sub(page, nr); + return xas_error(&xas); } /* @@ -217,7 +198,7 @@ int add_to_swap(struct page *page) return 0; /* - * Radix-tree node allocations from PF_MEMALLOC contexts could + * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC * stops emergency reserves from being allocated. * @@ -229,7 +210,6 @@ int add_to_swap(struct page *page) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - /* -ENOMEM radix-tree allocation failure */ if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely @@ -413,19 +393,11 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, break; /* Out of memory */ } - /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); - if (err) - break; - /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (err == -EEXIST) { - radix_tree_preload_end(); /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page @@ -433,26 +405,19 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ cond_resched(); continue; - } - if (err) { /* swp entry is obsolete ? */ - radix_tree_preload_end(); + } else if (err) /* swp entry is obsolete ? */ break; - } - /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + /* May fail (-ENOMEM) if XArray node allocation failed. */ __SetPageLocked(new_page); __SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); + err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { - radix_tree_preload_end(); - /* - * Initiate read into locked page and return. - */ + /* Initiate read into locked page */ lru_cache_add_anon(new_page); *new_page_allocated = true; return new_page; } - radix_tree_preload_end(); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely -- cgit v1.2.3 From 4e17ec250fce0eba9b70a91c9622da2748a3ec50 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 29 Nov 2017 08:32:39 -0500 Subject: mm: Convert delete_from_swap_cache to XArray Both callers of __delete_from_swap_cache have the swp_entry_t already, so pass that in to make constructing the XA_STATE easier. Signed-off-by: Matthew Wilcox --- include/linux/swap.h | 5 +++-- mm/swap_state.c | 24 ++++++++++-------------- mm/vmscan.c | 2 +- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 112cebcf0402..cb479bf5842e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -403,7 +403,7 @@ extern void show_swap_cache_info(void); extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); -extern void __delete_from_swap_cache(struct page *); +extern void __delete_from_swap_cache(struct page *, swp_entry_t entry); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); @@ -557,7 +557,8 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, return -1; } -static inline void __delete_from_swap_cache(struct page *page) +static inline void __delete_from_swap_cache(struct page *page, + swp_entry_t entry) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 850314c2c3ab..f393c994cc60 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -154,23 +154,22 @@ unlock: * This must be called only on pages that have * been verified to be in the swap cache. */ -void __delete_from_swap_cache(struct page *page) +void __delete_from_swap_cache(struct page *page, swp_entry_t entry) { - struct address_space *address_space; + struct address_space *address_space = swap_address_space(entry); int i, nr = hpage_nr_pages(page); - swp_entry_t entry; - pgoff_t idx; + pgoff_t idx = swp_offset(entry); + XA_STATE(xas, &address_space->i_pages, idx); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(PageWriteback(page), page); - entry.val = page_private(page); - address_space = swap_address_space(entry); - idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->i_pages, idx + i); + void *entry = xas_store(&xas, NULL); + VM_BUG_ON_PAGE(entry != page + i, entry); set_page_private(page + i, 0); + xas_next(&xas); } ClearPageSwapCache(page); address_space->nrpages -= nr; @@ -243,14 +242,11 @@ fail: */ void delete_from_swap_cache(struct page *page) { - swp_entry_t entry; - struct address_space *address_space; - - entry.val = page_private(page); + swp_entry_t entry = { .val = page_private(page) }; + struct address_space *address_space = swap_address_space(entry); - address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, entry); xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); diff --git a/mm/vmscan.c b/mm/vmscan.c index c7ce2c161225..80f731cf974e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -923,7 +923,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); - __delete_from_swap_cache(page); + __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { -- cgit v1.2.3 From 560d454bae08b5d5a132c5520177dede066334b7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 04:30:18 -0500 Subject: mm: Convert __do_page_cache_readahead to XArray This one is trivial. Signed-off-by: Matthew Wilcox --- mm/readahead.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index fc4dd364b37a..f3d6f9656a3c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -176,9 +176,7 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping, if (page_offset > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, page_offset); - rcu_read_unlock(); + page = xa_load(&mapping->i_pages, page_offset); if (page && !xa_is_value(page)) { /* * Page already present? Kick off the current batch of -- cgit v1.2.3 From 89eb946a7432be639b452fac295c0c2e5186c4a4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 04:35:16 -0500 Subject: mm: Convert page migration to XArray Signed-off-by: Matthew Wilcox --- mm/migrate.c | 48 ++++++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d6a2e89b086a..b3cde3fd094a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -323,7 +323,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, page = migration_entry_to_page(entry); /* - * Once radix-tree replacement of page migration started, page_count + * Once page cache replacement of page migration started, page_count * *must* be zero. And, we don't want to call wait_on_page_locked() * against a page without get_page(). * So, we use get_page_unless_zero(), here. Even failed, page fault @@ -438,10 +438,10 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; int dirty; int expected_count = 1 + extra_count; - void **pslot; /* * Device public or private pages have an extra refcount as they are @@ -467,21 +467,16 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(page)); + xas_lock_irq(&xas); expected_count += hpage_nr_pages(page) + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -495,7 +490,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -523,16 +518,13 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); if (PageTransHuge(page)) { int i; - int index = page_index(page); for (i = 1; i < HPAGE_PMD_NR; i++) { - pslot = radix_tree_lookup_slot(&mapping->i_pages, - index + i); - radix_tree_replace_slot(&mapping->i_pages, pslot, - newpage + i); + xas_next(&xas); + xas_store(&xas, newpage + i); } } @@ -543,7 +535,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - hpage_nr_pages(page)); - xa_unlock(&mapping->i_pages); + xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -583,22 +575,18 @@ EXPORT_SYMBOL(migrate_page_move_mapping); int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + XA_STATE(xas, &mapping->i_pages, page_index(page)); int expected_count; - void **pslot; - - xa_lock_irq(&mapping->i_pages); - - pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); + xas_lock_irq(&xas); expected_count = 2 + page_has_private(page); - if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { - xa_unlock_irq(&mapping->i_pages); + if (page_count(page) != expected_count || xas_load(&xas) != page) { + xas_unlock_irq(&xas); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return -EAGAIN; } @@ -607,11 +595,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); + xas_store(&xas, newpage); page_ref_unfreeze(page, expected_count - 1); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); return MIGRATEPAGE_SUCCESS; } -- cgit v1.2.3 From aa5dc07f70c50a3c56bf3fa07bde0b22a95aa381 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 10:16:10 -0500 Subject: mm: Convert huge_memory to XArray Quite a straightforward conversion. Signed-off-by: Matthew Wilcox --- mm/huge_memory.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 533f9b00147d..9eb79c384616 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2443,13 +2443,13 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - /* Additional pin to radix tree of swap cache */ + /* Additional pin to swap cache */ if (PageSwapCache(head)) page_ref_add(head, 2); else page_ref_inc(head); } else { - /* Additional pin to radix tree */ + /* Additional pin to page cache */ page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } @@ -2561,7 +2561,7 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) { int extra_pins; - /* Additional pins from radix tree */ + /* Additional pins from page cache */ if (PageAnon(page)) extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; else @@ -2657,17 +2657,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); if (mapping) { - void **pslot; + XA_STATE(xas, &mapping->i_pages, page_index(head)); - xa_lock(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->i_pages, - page_index(head)); /* - * Check if the head page is present in radix tree. + * Check if the head page is present in page cache. * We assume all tail are present too, if head is there. */ - if (radix_tree_deref_slot_protected(pslot, - &mapping->i_pages.xa_lock) != head) + xa_lock(&mapping->i_pages); + if (xas_load(&xas) != head) goto fail; } -- cgit v1.2.3 From 77da9389b9d5f07d54fda092d1ab56002ec0019a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 14:56:08 -0500 Subject: mm: Convert collapse_shmem to XArray I found another victim of the radix tree being hard to use. Because there was no call to radix_tree_preload(), khugepaged was allocating radix_tree_nodes using GFP_ATOMIC. I also converted a local_irq_save()/restore() pair to disable()/enable(). Signed-off-by: Matthew Wilcox --- mm/khugepaged.c | 159 +++++++++++++++++++++++--------------------------------- 1 file changed, 66 insertions(+), 93 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4820e4adf853..9610e8cba545 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1288,17 +1288,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * * Basic scheme is simple, details are more complex: * - allocate and freeze a new huge page; - * - scan over radix tree replacing old pages the new one + * - scan page cache replacing old pages with the new one * + swap in pages if necessary; * + fill in gaps; - * + keep old pages around in case if rollback is required; - * - if replacing succeed: + * + keep old pages around in case rollback is required; + * - if replacing succeeds: * + copy data over; * + free old pages; * + unfreeze huge page; * - if replacing failed; * + put all pages back and unfreeze them; - * + restore gaps in the radix-tree; + * + restore gaps in the page cache; * + free huge page; */ static void collapse_shmem(struct mm_struct *mm, @@ -1306,12 +1306,11 @@ static void collapse_shmem(struct mm_struct *mm, struct page **hpage, int node) { gfp_t gfp; - struct page *page, *new_page, *tmp; + struct page *new_page; struct mem_cgroup *memcg; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); - struct radix_tree_iter iter; - void **slot; + XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1336,48 +1335,49 @@ static void collapse_shmem(struct mm_struct *mm, __SetPageLocked(new_page); BUG_ON(!page_ref_freeze(new_page, 1)); - /* - * At this point the new_page is 'frozen' (page_count() is zero), locked - * and not up-to-date. It's safe to insert it into radix tree, because - * nobody would be able to map it or use it in other way until we - * unfreeze it. + * At this point the new_page is 'frozen' (page_count() is zero), + * locked and not up-to-date. It's safe to insert it into the page + * cache, because nobody would be able to map it or use it in other + * way until we unfreeze it. */ - index = start; - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - int n = min(iter.index, end) - index; - - /* - * Handle holes in the radix tree: charge it from shmem and - * insert relevant subpage of new_page into the radix-tree. - */ - if (n && !shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; + /* This will be less messy when we use multi-index entries */ + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (!xas_error(&xas)) break; - } - nr_none += n; - for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); - } + xas_unlock_irq(&xas); + if (!xas_nomem(&xas, GFP_KERNEL)) + goto out; + } while (1); - /* We are done. */ - if (index >= end) - break; + xas_set(&xas, start); + for (index = start; index < end; index++) { + struct page *page = xas_next(&xas); + + VM_BUG_ON(index != xas.xa_index); + if (!page) { + if (!shmem_charge(mapping->host, 1)) { + result = SCAN_FAIL; + break; + } + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + nr_none++; + continue; + } - page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); if (xa_is_value(page) || !PageUptodate(page)) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; - goto tree_unlocked; + goto xa_unlocked; } - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + xas_set(&xas, index); } else if (trylock_page(page)) { get_page(page); } else { @@ -1397,7 +1397,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1407,17 +1407,16 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); + xas_set(&xas, index); - slot = radix_tree_lookup_slot(&mapping->i_pages, index); - VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock), page); + VM_BUG_ON_PAGE(page != xas_load(&xas), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* * The page is expected to have page_count() == 3: * - we hold a pin on it; - * - one reference from radix tree; + * - one reference from page cache; * - one from isolate_lru_page; */ if (!page_ref_freeze(page, 3)) { @@ -1432,56 +1431,30 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->i_pages, slot, - new_page + (index % HPAGE_PMD_NR)); - - slot = radix_tree_iter_resume(slot, &iter); - index++; + xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); continue; out_lru: - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); putback_lru_page(page); out_isolate_failed: unlock_page(page); put_page(page); - goto tree_unlocked; + goto xa_unlocked; out_unlock: unlock_page(page); put_page(page); break; } + xas_unlock_irq(&xas); - /* - * Handle hole in radix tree at the end of the range. - * This code only triggers if there's nothing in radix tree - * beyond 'end'. - */ - if (result == SCAN_SUCCEED && index < end) { - int n = end - index; - - if (!shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; - goto tree_locked; - } - - for (; index < end; index++) { - radix_tree_insert(&mapping->i_pages, index, - new_page + (index % HPAGE_PMD_NR)); - } - nr_none += n; - } - -tree_locked: - xa_unlock_irq(&mapping->i_pages); -tree_unlocked: - +xa_unlocked: if (result == SCAN_SUCCEED) { - unsigned long flags; + struct page *page, *tmp; struct zone *zone = page_zone(new_page); /* - * Replacing old pages with new one has succeed, now we need to - * copy the content and free old pages. + * Replacing old pages with new one has succeeded, now we + * need to copy the content and free the old pages. */ list_for_each_entry_safe(page, tmp, &pagelist, lru) { copy_highpage(new_page + (page->index % HPAGE_PMD_NR), @@ -1495,16 +1468,16 @@ tree_unlocked: put_page(page); } - local_irq_save(flags); + local_irq_disable(); __inc_node_page_state(new_page, NR_SHMEM_THPS); if (nr_none) { __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); } - local_irq_restore(flags); + local_irq_enable(); /* - * Remove pte page tables, so we can re-faulti + * Remove pte page tables, so we can re-fault * the page as huge. */ retract_page_tables(mapping, start); @@ -1521,37 +1494,37 @@ tree_unlocked: khugepaged_pages_collapsed++; } else { - /* Something went wrong: rollback changes to the radix-tree */ + struct page *page; + /* Something went wrong: roll back page cache changes */ shmem_uncharge(mapping->host, nr_none); - xa_lock_irq(&mapping->i_pages); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; + xas_lock_irq(&xas); + xas_set(&xas, start); + xas_for_each(&xas, page, end - 1) { page = list_first_entry_or_null(&pagelist, struct page, lru); - if (!page || iter.index < page->index) { + if (!page || xas.xa_index < page->index) { if (!nr_none) break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->i_pages, iter.index); + xas_store(&xas, NULL); continue; } - VM_BUG_ON_PAGE(page->index != iter.index, page); + VM_BUG_ON_PAGE(page->index != xas.xa_index, page); /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->i_pages, slot, page); - slot = radix_tree_iter_resume(slot, &iter); - xa_unlock_irq(&mapping->i_pages); + xas_store(&xas, page); + xas_pause(&xas); + xas_unlock_irq(&xas); putback_lru_page(page); unlock_page(page); - xa_lock_irq(&mapping->i_pages); + xas_lock_irq(&xas); } VM_BUG_ON(nr_none); - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); -- cgit v1.2.3 From 85b392dbace77bf91050e6e07f3dd0f262e4babf Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 15:06:23 -0500 Subject: mm: Convert khugepaged_scan_shmem to XArray Slightly shorter and easier to read code. Signed-off-by: Matthew Wilcox --- mm/khugepaged.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9610e8cba545..c13625c1ad5e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1542,8 +1542,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, pgoff_t start, struct page **hpage) { struct page *page = NULL; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; @@ -1552,17 +1551,11 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= start + HPAGE_PMD_NR) - break; - - page = radix_tree_deref_slot(slot); - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { + if (xas_retry(&xas, page)) continue; - } - if (radix_tree_exception(page)) { + if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; break; @@ -1601,7 +1594,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } -- cgit v1.2.3 From 67891ffff2f5cf10e89e348207f687a05c8bd2d6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 10 Jun 2018 07:34:39 -0400 Subject: mm: Convert is_page_cache_freeable to XArray This is just a variable rename and comment change. Signed-off-by: Matthew Wilcox --- mm/vmscan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 80f731cf974e..f9cc86e91812 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -742,12 +742,12 @@ static inline int is_page_cache_freeable(struct page *page) { /* * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache radix tree and - * optional buffer heads at page->private. + * that isolated the page, the page cache and optional buffer + * heads at page->private. */ - int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? + int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ? HPAGE_PMD_NR : 1; - return page_count(page) - page_has_private(page) == 1 + radix_pins; + return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) -- cgit v1.2.3 From 10bbd235859bf483f9a8a4ebe95463d700bae394 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Dec 2017 17:30:38 -0500 Subject: pagevec: Use xa_mark_t Removes sparse warnings. Signed-off-by: Matthew Wilcox --- fs/btrfs/extent_io.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/f2fs/data.c | 2 +- fs/gfs2/aops.c | 2 +- include/linux/pagevec.h | 8 +++++--- mm/swap.c | 4 ++-- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dd6faab02bb..fc7ca7d991ad 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3778,7 +3778,7 @@ int btree_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { @@ -3903,7 +3903,7 @@ static int extent_write_cache_pages(struct address_space *mapping, pgoff_t done_index; int range_whole = 0; int scanned = 0; - int tag; + xa_mark_t tag; /* * We have to hold onto the inode so that ordered extents can do their diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d767e993591d..57bad3edfbed 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2613,7 +2613,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; - int tag; + xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 382c1ef9a9e4..5b760809eecc 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2003,7 +2003,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 31e8270d0b26..8afbb35559b9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -366,7 +366,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - int tag; + xa_mark_t tag; pagevec_init(&pvec); if (wbc->range_cyclic) { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 6dc456ac6136..081d934eda64 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,6 +9,8 @@ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H +#include + /* 15 pointers + header align the pagevec structure to a power of two */ #define PAGEVEC_SIZE 15 @@ -40,12 +42,12 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag); + xa_mark_t tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned max_pages); + xa_mark_t tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag) + struct address_space *mapping, pgoff_t *index, xa_mark_t tag) { return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } diff --git a/mm/swap.c b/mm/swap.c index 4c5c7fcc6e46..6861f3140a13 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1002,7 +1002,7 @@ EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag) + xa_mark_t tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, PAGEVEC_SIZE, pvec->pages); @@ -1012,7 +1012,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned max_pages) + xa_mark_t tag, unsigned max_pages) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); -- cgit v1.2.3 From 62f945b6a7b8cda6d1f35941eb374276f7b8749a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 17 Nov 2017 10:22:37 -0500 Subject: shmem: Convert shmem_radix_tree_replace to XArray Rename shmem_radix_tree_replace() to shmem_replace_entry() and convert it to use the XArray API. Signed-off-by: Matthew Wilcox --- mm/shmem.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c1062760fe41..5697c8fecdfc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -322,24 +322,20 @@ void shmem_uncharge(struct inode *inode, long pages) } /* - * Replace item expected in radix tree by a new item, while holding tree lock. + * Replace item expected in xarray by a new item, while holding xa_lock. */ -static int shmem_radix_tree_replace(struct address_space *mapping, +static int shmem_replace_entry(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { - struct radix_tree_node *node; - void __rcu **pslot; + XA_STATE(xas, &mapping->i_pages, index); void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); - if (!item) - return -ENOENT; + item = xas_load(&xas); if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->i_pages, node, pslot, - replacement, NULL); + xas_store(&xas, replacement); return 0; } @@ -624,8 +620,7 @@ static int shmem_add_to_page_cache(struct page *page, } else if (!expected) { error = radix_tree_insert(&mapping->i_pages, index, page); } else { - error = shmem_radix_tree_replace(mapping, index, expected, - page); + error = shmem_replace_entry(mapping, index, expected, page); } if (!error) { @@ -654,7 +649,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); xa_lock_irq(&mapping->i_pages); - error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + error = shmem_replace_entry(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); @@ -1578,8 +1573,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * a nice clean interface for us to replace oldpage by newpage there. */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, - newpage); + error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); -- cgit v1.2.3 From a12831bf4293d38518e41b80dd897af0122bb268 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 22 Nov 2017 08:34:58 -0500 Subject: shmem: Convert shmem_confirm_swap to XArray xa_load has its own RCU locking, so we can eliminate it here. Signed-off-by: Matthew Wilcox --- mm/shmem.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 5697c8fecdfc..ce91569426f3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -349,12 +349,7 @@ static int shmem_replace_entry(struct address_space *mapping, static bool shmem_confirm_swap(struct address_space *mapping, pgoff_t index, swp_entry_t swap) { - void *item; - - rcu_read_lock(); - item = radix_tree_lookup(&mapping->i_pages, index); - rcu_read_unlock(); - return item == swp_to_radix_entry(swap); + return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap); } /* -- cgit v1.2.3 From e21a29552fa3f44ea41c53488875015ae70fd7f8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 22 Nov 2017 08:36:00 -0500 Subject: shmem: Convert find_swap_entry to XArray This is a 1:1 conversion. The major part of this patch is converting the test framework from userspace to kernel space and mirroring the algorithm now used in find_swap_entry(). Signed-off-by: Matthew Wilcox --- lib/test_xarray.c | 56 +++++++++++++++++++++++++++++++++++++ mm/shmem.c | 27 +++++++----------- tools/testing/radix-tree/main.c | 61 ----------------------------------------- tools/testing/radix-tree/test.c | 22 --------------- tools/testing/radix-tree/test.h | 1 - 5 files changed, 66 insertions(+), 101 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 128c6489082f..815daffdd8c9 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -631,6 +631,61 @@ static noinline void check_find(struct xarray *xa) check_multi_find_2(xa); } +/* See find_swap_entry() in mm/shmem.c */ +static noinline unsigned long xa_find_entry(struct xarray *xa, void *item) +{ + XA_STATE(xas, xa, 0); + unsigned int checked = 0; + void *entry; + + rcu_read_lock(); + xas_for_each(&xas, entry, ULONG_MAX) { + if (xas_retry(&xas, entry)) + continue; + if (entry == item) + break; + checked++; + if ((checked % 4) != 0) + continue; + xas_pause(&xas); + } + rcu_read_unlock(); + + return entry ? xas.xa_index : -1; +} + +static noinline void check_find_entry(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned int order; + unsigned long offset, index; + + for (order = 0; order < 20; order++) { + for (offset = 0; offset < (1UL << (order + 3)); + offset += (1UL << order)) { + for (index = 0; index < (1UL << (order + 5)); + index += (1UL << order)) { + xa_store_order(xa, index, order, + xa_mk_value(index), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, index) != + xa_mk_value(index)); + XA_BUG_ON(xa, xa_find_entry(xa, + xa_mk_value(index)) != index); + } + XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1); + xa_destroy(xa); + } + } +#endif + + XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1); + xa_store_index(xa, ULONG_MAX, GFP_KERNEL); + XA_BUG_ON(xa, xa_find_entry(xa, xa) != -1); + XA_BUG_ON(xa, xa_find_entry(xa, xa_mk_value(LONG_MAX)) != -1); + xa_erase_index(xa, ULONG_MAX); + XA_BUG_ON(xa, !xa_empty(xa)); +} + static noinline void check_move_small(struct xarray *xa, unsigned long idx) { XA_STATE(xas, xa, 0); @@ -972,6 +1027,7 @@ static int xarray_checks(void) check_multi_store(&array); check_xa_alloc(); check_find(&array); + check_find_entry(&array); check_destroy(&array); check_move(&array); check_create_range(&array); diff --git a/mm/shmem.c b/mm/shmem.c index ce91569426f3..a305529d6b89 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1100,34 +1100,27 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +static unsigned long find_swap_entry(struct xarray *xa, void *item) { - struct radix_tree_iter iter; - void __rcu **slot; - unsigned long found = -1; + XA_STATE(xas, xa, 0); unsigned int checked = 0; + void *entry; rcu_read_lock(); - radix_tree_for_each_slot(slot, root, &iter, 0) { - void *entry = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(entry)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, entry, ULONG_MAX) { + if (xas_retry(&xas, entry)) continue; - } - if (entry == item) { - found = iter.index; + if (entry == item) break; - } checked++; - if ((checked % 4096) != 0) + if ((checked % XA_CHECK_SCHED) != 0) continue; - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } - rcu_read_unlock(); - return found; + + return entry ? xas.xa_index : -1; } /* diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index 09deaf4f0959..79589ea570ab 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -236,63 +236,6 @@ void copy_tag_check(void) item_kill_tree(&tree); } -static void __locate_check(struct radix_tree_root *tree, unsigned long index, - unsigned order) -{ - struct item *item; - unsigned long index2; - - item_insert_order(tree, index, order); - item = item_lookup(tree, index); - index2 = find_item(tree, item); - if (index != index2) { - printv(2, "index %ld order %d inserted; found %ld\n", - index, order, index2); - abort(); - } -} - -static void __order_0_locate_check(void) -{ - RADIX_TREE(tree, GFP_KERNEL); - int i; - - for (i = 0; i < 50; i++) - __locate_check(&tree, rand() % INT_MAX, 0); - - item_kill_tree(&tree); -} - -static void locate_check(void) -{ - RADIX_TREE(tree, GFP_KERNEL); - unsigned order; - unsigned long offset, index; - - __order_0_locate_check(); - - for (order = 0; order < 20; order++) { - for (offset = 0; offset < (1 << (order + 3)); - offset += (1UL << order)) { - for (index = 0; index < (1UL << (order + 5)); - index += (1UL << order)) { - __locate_check(&tree, index + offset, order); - } - if (find_item(&tree, &tree) != -1) - abort(); - - item_kill_tree(&tree); - } - } - - if (find_item(&tree, &tree) != -1) - abort(); - __locate_check(&tree, -1, 0); - if (find_item(&tree, &tree) != -1) - abort(); - item_kill_tree(&tree); -} - static void single_thread_tests(bool long_run) { int i; @@ -303,10 +246,6 @@ static void single_thread_tests(bool long_run) rcu_barrier(); printv(2, "after multiorder_check: %d allocated, preempt %d\n", nr_allocated, preempt_count); - locate_check(); - rcu_barrier(); - printv(2, "after locate_check: %d allocated, preempt %d\n", - nr_allocated, preempt_count); tag_check(); rcu_barrier(); printv(2, "after tag_check: %d allocated, preempt %d\n", diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 70ddf964d51c..470419bfd49d 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -209,28 +209,6 @@ int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock, return tagged; } -/* Use the same pattern as find_swap_entry() in mm/shmem.c */ -unsigned long find_item(struct radix_tree_root *root, void *item) -{ - struct radix_tree_iter iter; - void **slot; - unsigned long found = -1; - unsigned long checked = 0; - - radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { - found = iter.index; - break; - } - checked++; - if ((checked % 4) != 0) - continue; - slot = radix_tree_iter_resume(slot, &iter); - } - - return found; -} - static int verify_node(struct radix_tree_node *slot, unsigned int tag, int tagged) { diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index e3dc7a16f09b..9532c18c6cb1 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -32,7 +32,6 @@ void item_kill_tree(struct radix_tree_root *root); int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, unsigned long start, unsigned long end, unsigned batch, unsigned iftag, unsigned thentag); -unsigned long find_item(struct radix_tree_root *, void *item); void xarray_tests(void); void tag_check(void); -- cgit v1.2.3 From 552446a4166189a8c2515571dd6e25fd64a2dc78 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 1 Dec 2017 13:25:14 -0500 Subject: shmem: Convert shmem_add_to_page_cache to XArray We can use xas_find_conflict() instead of radix_tree_gang_lookup_slot() to find any conflicting entry and combine the three paths through this function into one. Signed-off-by: Matthew Wilcox --- mm/shmem.c | 81 ++++++++++++++++++++++++++------------------------------------ 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index a305529d6b89..8633bd3dc433 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -577,9 +577,11 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, void *expected) + pgoff_t index, void *expected, gfp_t gfp) { - int error, nr = hpage_nr_pages(page); + XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); + unsigned long i = 0; + unsigned long nr = 1UL << compound_order(page); VM_BUG_ON_PAGE(PageTail(page), page); VM_BUG_ON_PAGE(index != round_down(index, nr), page); @@ -591,46 +593,39 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - xa_lock_irq(&mapping->i_pages); - if (PageTransHuge(page)) { - void __rcu **results; - pgoff_t idx; - int i; - - error = 0; - if (radix_tree_gang_lookup_slot(&mapping->i_pages, - &results, &idx, index, 1) && - idx < index + HPAGE_PMD_NR) { - error = -EEXIST; + do { + void *entry; + xas_lock_irq(&xas); + entry = xas_find_conflict(&xas); + if (entry != expected) + xas_set_err(&xas, -EEXIST); + xas_create_range(&xas); + if (xas_error(&xas)) + goto unlock; +next: + xas_store(&xas, page + i); + if (++i < nr) { + xas_next(&xas); + goto next; } - - if (!error) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->i_pages, - index + i, page + i); - VM_BUG_ON(error); - } + if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); + __inc_node_page_state(page, NR_SHMEM_THPS); } - } else if (!expected) { - error = radix_tree_insert(&mapping->i_pages, index, page); - } else { - error = shmem_replace_entry(mapping, index, expected, page); - } - - if (!error) { mapping->nrpages += nr; - if (PageTransHuge(page)) - __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - xa_unlock_irq(&mapping->i_pages); - } else { +unlock: + xas_unlock_irq(&xas); + } while (xas_nomem(&xas, gfp)); + + if (xas_error(&xas)) { page->mapping = NULL; - xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); + return xas_error(&xas); } - return error; + + return 0; } /* @@ -1183,7 +1178,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - radswap); + radswap, gfp); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -1700,7 +1695,7 @@ repeat: false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap)); + swp_to_radix_entry(swap), gfp); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1806,13 +1801,8 @@ alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, PageTransHuge(page)); if (error) goto unacct; - error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, - compound_order(page)); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL); - radix_tree_preload_end(); - } + error = shmem_add_to_page_cache(page, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK); if (error) { mem_cgroup_cancel_charge(page, memcg, PageTransHuge(page)); @@ -2281,11 +2271,8 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); - if (!ret) { - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL); - radix_tree_preload_end(); - } + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK); if (ret) goto out_release_uncharge; -- cgit v1.2.3 From 7b8d046fba91d62beb8a8f78244aaa3c23a60cdd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 1 Dec 2017 22:13:06 -0500 Subject: shmem: Convert shmem_alloc_hugepage to XArray xa_find() is a slightly easier API to use than radix_tree_gang_lookup_slot() because it contains its own RCU locking. This commit removes the last user of radix_tree_gang_lookup_slot() so remove the function too. Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 6 +----- lib/radix-tree.c | 44 +------------------------------------------- mm/shmem.c | 14 ++++---------- 3 files changed, 6 insertions(+), 58 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 023e888e1163..8a4280bc350f 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -147,12 +147,11 @@ static inline unsigned int iter_shift(const struct radix_tree_iter *iter) * radix_tree_lookup_slot * radix_tree_tag_get * radix_tree_gang_lookup - * radix_tree_gang_lookup_slot * radix_tree_gang_lookup_tag * radix_tree_gang_lookup_tag_slot * radix_tree_tagged * - * The first 8 functions are able to be called locklessly, using RCU. The + * The first 7 functions are able to be called locklessly, using RCU. The * caller must ensure calls to these functions are made within rcu_read_lock() * regions. Other readers (lock-free or otherwise) and modifications may be * running concurrently. @@ -263,9 +262,6 @@ void radix_tree_clear_tags(struct radix_tree_root *, struct radix_tree_node *, unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items); -unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *, - void __rcu ***results, unsigned long *indices, - unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8a58051eb5b3..2f9c0e45eeb7 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1097,7 +1097,7 @@ void __radix_tree_replace(struct radix_tree_root *root, * @slot: pointer to slot * @item: new item to store in the slot. * - * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * For use with radix_tree_lookup_slot() and * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked * across slot lookup and replacement. * @@ -1731,48 +1731,6 @@ radix_tree_gang_lookup(const struct radix_tree_root *root, void **results, } EXPORT_SYMBOL(radix_tree_gang_lookup); -/** - * radix_tree_gang_lookup_slot - perform multiple slot lookup on radix tree - * @root: radix tree root - * @results: where the results of the lookup are placed - * @indices: where their indices should be placed (but usually NULL) - * @first_index: start the lookup from this key - * @max_items: place up to this many items at *results - * - * Performs an index-ascending scan of the tree for present items. Places - * their slots at *@results and returns the number of items which were - * placed at *@results. - * - * The implementation is naive. - * - * Like radix_tree_gang_lookup as far as RCU and locking goes. Slots must - * be dereferenced with radix_tree_deref_slot, and if using only RCU - * protection, radix_tree_deref_slot may fail requiring a retry. - */ -unsigned int -radix_tree_gang_lookup_slot(const struct radix_tree_root *root, - void __rcu ***results, unsigned long *indices, - unsigned long first_index, unsigned int max_items) -{ - struct radix_tree_iter iter; - void __rcu **slot; - unsigned int ret = 0; - - if (unlikely(!max_items)) - return 0; - - radix_tree_for_each_slot(slot, root, &iter, first_index) { - results[ret] = slot; - if (indices) - indices[ret] = iter.index; - if (++ret == max_items) - break; - } - - return ret; -} -EXPORT_SYMBOL(radix_tree_gang_lookup_slot); - /** * radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree * based on a tag diff --git a/mm/shmem.c b/mm/shmem.c index 8633bd3dc433..608c9252248f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1431,23 +1431,17 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct inode *inode = &info->vfs_inode; - struct address_space *mapping = inode->i_mapping; - pgoff_t idx, hindex; - void __rcu **results; + struct address_space *mapping = info->vfs_inode.i_mapping; + pgoff_t hindex; struct page *page; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) return NULL; hindex = round_down(index, HPAGE_PMD_NR); - rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, - hindex, 1) && idx < hindex + HPAGE_PMD_NR) { - rcu_read_unlock(); + if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, + XA_PRESENT)) return NULL; - } - rcu_read_unlock(); shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, -- cgit v1.2.3 From c121d3bb717ee932caf031c6a7923547f7f83163 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 03:13:54 -0500 Subject: shmem: Convert shmem_free_swap to XArray Since we are conditionally storing NULL in the XArray, we do not need to allocate memory and the GFP flags will be unused. Signed-off-by: Matthew Wilcox --- mm/shmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 608c9252248f..8e9bbfb7ed15 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -650,7 +650,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Remove swap entry from radix tree, free the swap and its page cache. + * Remove swap entry from page cache, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) @@ -658,7 +658,7 @@ static int shmem_free_swap(struct address_space *mapping, void *old; xa_lock_irq(&mapping->i_pages); - old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + old = __xa_cmpxchg(&mapping->i_pages, index, radswap, NULL, 0); xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; -- cgit v1.2.3 From 7ae3424fb4b5587d9b2db12a4b1040c52073dd5c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 03:28:00 -0500 Subject: shmem: Convert shmem_partial_swap_usage to XArray Simpler code because the xarray takes care of things like the limit and dereferencing the slot. Signed-off-by: Matthew Wilcox --- mm/shmem.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8e9bbfb7ed15..5588205f4259 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -676,29 +676,19 @@ static int shmem_free_swap(struct address_space *mapping, unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { - struct radix_tree_iter iter; - void __rcu **slot; + XA_STATE(xas, &mapping->i_pages, start); struct page *page; unsigned long swapped = 0; rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - if (iter.index >= end) - break; - - page = radix_tree_deref_slot(slot); - - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, page, end - 1) { + if (xas_retry(&xas, page)) continue; - } - if (xa_is_value(page)) swapped++; if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } -- cgit v1.2.3 From 2313216f861f955c2091f1593ea95d5dc88493f4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 22 Nov 2017 11:11:31 -0500 Subject: memfd: Convert memfd_wait_for_pins to XArray Simplify the locking by taking the spinlock while we walk the tree on the assumption that many acquires and releases of the lock will be worse than holding the lock while we process an entire batch of pages. Signed-off-by: Matthew Wilcox Reviewed-by: Mike Kravetz --- mm/memfd.c | 61 +++++++++++++++++++++++++------------------------------------ 1 file changed, 25 insertions(+), 36 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 2bb5e257080e..cf0ddd5fc859 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -21,7 +21,7 @@ #include /* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * We need a tag: a new tag would expand every xa_node by 8 bytes, * so reuse a tag which we firmly believe is never set or cleared on tmpfs * or hugetlbfs because they are memory only filesystems. */ @@ -72,9 +72,7 @@ static void memfd_tag_pins(struct address_space *mapping) */ static int memfd_wait_for_pins(struct address_space *mapping) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; + XA_STATE(xas, &mapping->i_pages, 0); struct page *page; int error, scan; @@ -82,7 +80,9 @@ static int memfd_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + unsigned int tagged = 0; + + if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; if (!scan) @@ -90,45 +90,34 @@ static int memfd_wait_for_pins(struct address_space *mapping) else if (schedule_timeout_killable((HZ << scan) / 200)) scan = LAST_SCAN; - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, MEMFD_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - + xas_set(&xas, 0); + xas_lock_irq(&xas); + xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + bool clear = true; + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still * found pages pinned. */ - error = -EBUSY; + if (scan == LAST_SCAN) + error = -EBUSY; + else + clear = false; } + if (clear) + xas_clear_mark(&xas, MEMFD_TAG_PINNED); + if (++tagged % XA_CHECK_SCHED) + continue; - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } - rcu_read_unlock(); + xas_unlock_irq(&xas); } return error; -- cgit v1.2.3 From ef3038a573aa8bf2f3797b110f7244b55a0e519c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 22 Nov 2017 08:37:38 -0500 Subject: memfd: Convert memfd_tag_pins to XArray Switch to a batch-processing model like memfd_wait_for_pins() and use the xa_state previously set up by memfd_wait_for_pins(). Signed-off-by: Matthew Wilcox Reviewed-by: Mike Kravetz --- mm/memfd.c | 44 ++++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index cf0ddd5fc859..97264c79d2cd 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -28,37 +28,29 @@ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ -static void memfd_tag_pins(struct address_space *mapping) +static void memfd_tag_pins(struct xa_state *xas) { - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; struct page *page; + unsigned int tagged = 0; lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } + xas_lock_irq(xas); + xas_for_each(xas, page, ULONG_MAX) { + if (xa_is_value(page)) + continue; + if (page_count(page) - page_mapcount(page) > 1) + xas_set_mark(xas, MEMFD_TAG_PINNED); + + if (++tagged % XA_CHECK_SCHED) + continue; + + xas_pause(xas); + xas_unlock_irq(xas); + cond_resched(); + xas_lock_irq(xas); } - rcu_read_unlock(); + xas_unlock_irq(xas); } /* @@ -76,7 +68,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) struct page *page; int error, scan; - memfd_tag_pins(mapping); + memfd_tag_pins(&xas); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { -- cgit v1.2.3 From 7f4446eefe9fbbe68e9543946c9a20f67897ff9b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 03:31:13 -0500 Subject: shmem: Comment fixups Remove the last mentions of radix tree from various comments. Signed-off-by: Matthew Wilcox --- mm/shmem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 5588205f4259..56bf122e0bb4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -763,7 +763,7 @@ void shmem_unlock_mapping(struct address_space *mapping) } /* - * Remove range of pages and swap entries from radix tree, and free them. + * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, @@ -1143,10 +1143,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, * We needed to drop mutex to make that restrictive page * allocation, but the inode might have been freed while we * dropped it: although a racing shmem_evict_inode() cannot - * complete without emptying the radix_tree, our page lock + * complete without emptying the page cache, our page lock * on this swapcache page is not enough to prevent that - * free_swap_and_cache() of our swap entry will only - * trylock_page(), removing swap from radix_tree whatever. + * trylock_page(), removing swap from page cache whatever. * * We must not proceed to shmem_add_to_page_cache() if the * inode has been freed, but of course we cannot rely on @@ -1212,7 +1212,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) &memcg, false); if (error) goto out; - /* No radix_tree_preload: swap entry keeps a place for page in tree */ + /* No memory allocation: swap entry occupies the slot for the page */ error = -EAGAIN; mutex_lock(&shmem_swaplist_mutex); @@ -1887,7 +1887,7 @@ unlock: spin_unlock_irq(&info->lock); goto repeat; } - if (error == -EEXIST) /* from above or from radix_tree_insert */ + if (error == -EEXIST) goto repeat; return error; } @@ -2501,7 +2501,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. + * llseek SEEK_DATA or SEEK_HOLE through the page cache. */ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pgoff_t index, pgoff_t end, int whence) -- cgit v1.2.3 From 0a943c65e7d7207dce2a63e9d14a925c0d9a7d96 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 10:37:22 -0500 Subject: btrfs: Convert page cache to XArray Signed-off-by: Matthew Wilcox Acked-by: David Sterba --- fs/btrfs/compression.c | 4 +--- fs/btrfs/extent_io.c | 8 +++----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index fd25e125303c..a65d144da00c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -437,9 +437,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (pg_index > end_index) break; - rcu_read_lock(); - page = radix_tree_lookup(&mapping->i_pages, pg_index); - rcu_read_unlock(); + page = xa_load(&mapping->i_pages, pg_index); if (page && !xa_is_value(page)) { misses++; if (misses > 4) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fc7ca7d991ad..d4ad015e4485 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5153,11 +5153,9 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) clear_page_dirty_for_io(page); xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) { - radix_tree_tag_clear(&page->mapping->i_pages, - page_index(page), - PAGECACHE_TAG_DIRTY); - } + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&page->mapping->i_pages); ClearPageError(page); unlock_page(page); -- cgit v1.2.3 From ec82e1c1c89ce02d7703b841e6ee7d429feed609 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 10:40:41 -0500 Subject: fs: Convert buffer to XArray Mostly comment fixes, but one use of __xa_set_mark. Signed-off-by: Matthew Wilcox --- fs/buffer.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 6f1ae3ac9789..1286c2b95498 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -562,7 +562,7 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) EXPORT_SYMBOL(mark_buffer_dirty_inode); /* - * Mark the page dirty, and set it dirty in the radix tree, and mark the inode + * Mark the page dirty, and set it dirty in the page cache, and mark the inode * dirty. * * If warn is true, then emit a warning if the page is not uptodate and has @@ -579,8 +579,8 @@ void __set_page_dirty(struct page *page, struct address_space *mapping, if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -1050,7 +1050,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and - * the page is tagged dirty in its radix tree. + * the page is tagged dirty in the page cache. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is @@ -1073,9 +1073,9 @@ __getblk_slow(struct block_device *bdev, sector_t block, * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * - * mark_buffer_dirty() will set the dirty bit against the buffer, then set its - * backing page dirty, then tag the page as dirty in its address_space's radix - * tree and then attach the address_space's inode to its superblock's dirty + * mark_buffer_dirty() will set the dirty bit against the buffer, then set + * its backing page dirty, then tag the page as dirty in the page cache + * and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, -- cgit v1.2.3 From 04edf02cdd37a7e2e23ce8de77cb06d22ef2f503 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 10:46:23 -0500 Subject: fs: Convert writeback to XArray A couple of short loops. Signed-off-by: Matthew Wilcox --- fs/fs-writeback.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 471d863958bc..b40168fcc94a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -339,9 +339,9 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; - struct radix_tree_iter iter; + XA_STATE(xas, &mapping->i_pages, 0); + struct page *page; bool switched = false; - void **slot; /* * By the time control reaches here, RCU grace period has passed @@ -375,25 +375,18 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to * pages actually under writeback. */ - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_DIRTY) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page) && PageDirty(page)) { + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { + if (PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, 0, - PAGECACHE_TAG_WRITEBACK) { - struct page *page = radix_tree_deref_slot_protected(slot, - &mapping->i_pages.xa_lock); - if (likely(page)) { - WARN_ON_ONCE(!PageWriteback(page)); - dec_wb_stat(old_wb, WB_WRITEBACK); - inc_wb_stat(new_wb, WB_WRITEBACK); - } + xas_set(&xas, 0); + xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { + WARN_ON_ONCE(!PageWriteback(page)); + dec_wb_stat(old_wb, WB_WRITEBACK); + inc_wb_stat(new_wb, WB_WRITEBACK); } wb_get(new_wb); -- cgit v1.2.3 From f611ff63751afa421edc1eddf4281de13e082c51 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 19:33:30 -0500 Subject: nilfs2: Convert to XArray This is close to a 1:1 replacement of radix tree APIs with their XArray equivalents. It would be possible to optimise nilfs_copy_back_pages(), but that doesn't seem to be in the performance path. Also, I think it has a pre-existing bug, and I've added a note to that effect in the source code. Signed-off-by: Matthew Wilcox --- fs/nilfs2/btnode.c | 26 +++++++++----------------- fs/nilfs2/page.c | 29 +++++++++++++---------------- 2 files changed, 22 insertions(+), 33 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index ebb24a314f43..de99db518571 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -168,24 +168,18 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, ctxt->newbh = NULL; if (inode->i_blkbits == PAGE_SHIFT) { - lock_page(obh->b_page); - /* - * We cannot call radix_tree_preload for the kernels older - * than 2.6.23, because it is not exported for modules. - */ + struct page *opage = obh->b_page; + lock_page(opage); retry: - err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); - if (err) - goto failed_unlock; /* BUG_ON(oldkey != obh->b_page->index); */ - if (unlikely(oldkey != obh->b_page->index)) - NILFS_PAGE_BUG(obh->b_page, + if (unlikely(oldkey != opage->index)) + NILFS_PAGE_BUG(opage, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); xa_lock_irq(&btnc->i_pages); - err = radix_tree_insert(&btnc->i_pages, newkey, obh->b_page); + err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS); xa_unlock_irq(&btnc->i_pages); /* * Note: page->index will not change to newkey until @@ -193,7 +187,6 @@ retry: * To protect the page in intermediate state, the page lock * is held. */ - radix_tree_preload_end(); if (!err) return 0; else if (err != -EEXIST) @@ -203,7 +196,7 @@ retry: if (!err) goto retry; /* fallback to copy mode */ - unlock_page(obh->b_page); + unlock_page(opage); } nbh = nilfs_btnode_create_block(btnc, newkey); @@ -243,9 +236,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, mark_buffer_dirty(obh); xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, oldkey); - radix_tree_tag_set(&btnc->i_pages, newkey, - PAGECACHE_TAG_DIRTY); + __xa_erase(&btnc->i_pages, oldkey); + __xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY); xa_unlock_irq(&btnc->i_pages); opage->index = obh->b_blocknr = newkey; @@ -275,7 +267,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc, if (nbh == NULL) { /* blocksize == pagesize */ xa_lock_irq(&btnc->i_pages); - radix_tree_delete(&btnc->i_pages, newkey); + __xa_erase(&btnc->i_pages, newkey); xa_unlock_irq(&btnc->i_pages); unlock_page(ctxt->bh->b_page); } else diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 329a056b73b1..d7fc8d369d89 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -289,7 +289,7 @@ repeat: * @dmap: destination page cache * @smap: source page cache * - * No pages must no be added to the cache during this process. + * No pages must be added to the cache during this process. * This must be ensured by the caller. */ void nilfs_copy_back_pages(struct address_space *dmap, @@ -298,7 +298,6 @@ void nilfs_copy_back_pages(struct address_space *dmap, struct pagevec pvec; unsigned int i, n; pgoff_t index = 0; - int err; pagevec_init(&pvec); repeat: @@ -313,35 +312,34 @@ repeat: lock_page(page); dpage = find_lock_page(dmap, offset); if (dpage) { - /* override existing page on the destination cache */ + /* overwrite existing page in the destination cache */ WARN_ON(PageDirty(dpage)); nilfs_copy_page(dpage, page, 0); unlock_page(dpage); put_page(dpage); + /* Do we not need to remove page from smap here? */ } else { - struct page *page2; + struct page *p; /* move the page to the destination cache */ xa_lock_irq(&smap->i_pages); - page2 = radix_tree_delete(&smap->i_pages, offset); - WARN_ON(page2 != page); - + p = __xa_erase(&smap->i_pages, offset); + WARN_ON(page != p); smap->nrpages--; xa_unlock_irq(&smap->i_pages); xa_lock_irq(&dmap->i_pages); - err = radix_tree_insert(&dmap->i_pages, offset, page); - if (unlikely(err < 0)) { - WARN_ON(err == -EEXIST); + p = __xa_store(&dmap->i_pages, offset, page, GFP_NOFS); + if (unlikely(p)) { + /* Probably -ENOMEM */ page->mapping = NULL; - put_page(page); /* for cache */ + put_page(page); } else { page->mapping = dmap; dmap->nrpages++; if (PageDirty(page)) - radix_tree_tag_set(&dmap->i_pages, - offset, - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&dmap->i_pages, offset, + PAGECACHE_TAG_DIRTY); } xa_unlock_irq(&dmap->i_pages); } @@ -467,8 +465,7 @@ int __nilfs_clear_page_dirty(struct page *page) if (mapping) { xa_lock_irq(&mapping->i_pages); if (test_bit(PG_dirty, &page->flags)) { - radix_tree_tag_clear(&mapping->i_pages, - page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irq(&mapping->i_pages); return clear_page_dirty_for_io(page); -- cgit v1.2.3 From 5ec2d99de7427c84bb7250d23f5acf49a3670a63 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 4 Dec 2017 20:25:25 -0500 Subject: f2fs: Convert to XArray This is a straightforward conversion. Signed-off-by: Matthew Wilcox --- fs/f2fs/data.c | 4 ++-- fs/f2fs/dir.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 6 ++---- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5b760809eecc..6962491172a5 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2697,13 +2697,13 @@ const struct address_space_operations f2fs_dblock_aops = { #endif }; -void f2fs_clear_radix_tree_dirty_tag(struct page *page) +void f2fs_clear_page_cache_dirty_tag(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - radix_tree_tag_clear(&mapping->i_pages, page_index(page), + __xa_clear_mark(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); xa_unlock_irqrestore(&mapping->i_pages, flags); } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index ecc3a4e2be96..01006085904a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -729,7 +729,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, page->index, page->index + 1)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index abf925664d9c..3ccb8ed84fae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3039,7 +3039,7 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode); #endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); -void f2fs_clear_radix_tree_dirty_tag(struct page *page); +void f2fs_clear_page_cache_dirty_tag(struct page *page); /* * gc.c diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 115dc219344b..3e0a630a0168 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -245,7 +245,7 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index dd2e45a661aa..0bae5eda056a 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -104,7 +104,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { if (PageDirty(page)) { - f2fs_clear_radix_tree_dirty_tag(page); + f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); } @@ -1307,9 +1307,7 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (f2fs_check_nid_range(sbi, nid)) return; - rcu_read_lock(); - apage = radix_tree_lookup(&NODE_MAPPING(sbi)->i_pages, nid); - rcu_read_unlock(); + apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); if (apage) return; -- cgit v1.2.3 From a77d19f46a37c0fed48676739e8e0277f43d74e6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 27 Mar 2018 13:39:38 -0400 Subject: dax: Rename some functions Remove mentions of 'radix' and 'radix tree'. Simplify some names by dropping the word 'mapping'. Signed-off-by: Matthew Wilcox --- fs/dax.c | 86 +++++++++++++++++++++++++++++++--------------------------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index ebcec36335eb..acdb996efad2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -74,18 +74,18 @@ fs_initcall(init_dax_wait_table); #define DAX_ZERO_PAGE (1UL << 2) #define DAX_EMPTY (1UL << 3) -static unsigned long dax_radix_pfn(void *entry) +static unsigned long dax_to_pfn(void *entry) { return xa_to_value(entry) >> DAX_SHIFT; } -static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) +static void *dax_make_locked(unsigned long pfn, unsigned long flags) { return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) | DAX_LOCKED); } -static unsigned int dax_radix_order(void *entry) +static unsigned int dax_entry_order(void *entry) { if (xa_to_value(entry) & DAX_PMD) return PMD_SHIFT - PAGE_SHIFT; @@ -113,7 +113,7 @@ static int dax_is_empty_entry(void *entry) } /* - * DAX radix tree locking + * DAX page cache entry locking */ struct exceptional_entry_key { struct address_space *mapping; @@ -217,7 +217,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) } /* - * Lookup entry in radix tree, wait for it to become unlocked if it is + * Lookup entry in page cache, wait for it to become unlocked if it is * a DAX entry and return it. The caller must call * put_unlocked_mapping_entry() when he decided not to lock the entry or * put_locked_mapping_entry() when he locked the entry and now wants to @@ -299,7 +299,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, } /* - * Called when we are done with radix tree entry we looked up via + * Called when we are done with page cache entry we looked up via * get_unlocked_mapping_entry() and which we didn't lock in the end. */ static void put_unlocked_mapping_entry(struct address_space *mapping, @@ -308,7 +308,7 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, if (!entry) return; - /* We have to wake up next waiter for the radix tree entry lock */ + /* We have to wake up next waiter for the page cache entry lock */ dax_wake_mapping_entry_waiter(mapping, index, entry, false); } @@ -324,9 +324,9 @@ static unsigned long dax_entry_size(void *entry) return PAGE_SIZE; } -static unsigned long dax_radix_end_pfn(void *entry) +static unsigned long dax_end_pfn(void *entry) { - return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; + return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; } /* @@ -334,8 +334,8 @@ static unsigned long dax_radix_end_pfn(void *entry) * 'empty' and 'zero' entries. */ #define for_each_mapped_pfn(entry, pfn) \ - for (pfn = dax_radix_pfn(entry); \ - pfn < dax_radix_end_pfn(entry); pfn++) + for (pfn = dax_to_pfn(entry); \ + pfn < dax_end_pfn(entry); pfn++) /* * TODO: for reflink+dax we need a way to associate a single page with @@ -471,9 +471,9 @@ void dax_unlock_mapping_entry(struct page *page) } /* - * Find radix tree entry at given index. If it is a DAX entry, return it - * with the radix tree entry locked. If the radix tree doesn't contain the - * given index, create an empty entry for the index and return with it locked. + * Find page cache entry at given index. If it is a DAX entry, return it + * with the entry locked. If the page cache doesn't contain an entry at + * that index, add a locked empty entry. * * When requesting an entry with size DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. This error will @@ -582,10 +582,10 @@ restart: true); } - entry = dax_radix_locked_entry(0, size_flag | DAX_EMPTY); + entry = dax_make_locked(0, size_flag | DAX_EMPTY); err = __radix_tree_insert(&mapping->i_pages, index, - dax_radix_order(entry), entry); + dax_entry_order(entry), entry); radix_tree_preload_end(); if (err) { xa_unlock_irq(&mapping->i_pages); @@ -701,7 +701,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping) } EXPORT_SYMBOL_GPL(dax_layout_busy_page); -static int __dax_invalidate_mapping_entry(struct address_space *mapping, +static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { int ret = 0; @@ -731,12 +731,12 @@ out: */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { - int ret = __dax_invalidate_mapping_entry(mapping, index, true); + int ret = __dax_invalidate_entry(mapping, index, true); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the - * radix tree (usually fs-private i_mmap_sem for writing). Since the + * page cache (usually fs-private i_mmap_sem for writing). Since the * caller has seen a DAX entry for this index, we better find it * at that index as well... */ @@ -750,7 +750,7 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index) { - return __dax_invalidate_mapping_entry(mapping, index, false); + return __dax_invalidate_entry(mapping, index, false); } static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, @@ -786,10 +786,9 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. */ -static void *dax_insert_mapping_entry(struct address_space *mapping, - struct vm_fault *vmf, - void *entry, pfn_t pfn_t, - unsigned long flags, bool dirty) +static void *dax_insert_entry(struct address_space *mapping, + struct vm_fault *vmf, + void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) { struct radix_tree_root *pages = &mapping->i_pages; unsigned long pfn = pfn_t_to_pfn(pfn_t); @@ -809,7 +808,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, } xa_lock_irq(pages); - new_entry = dax_radix_locked_entry(pfn, flags); + new_entry = dax_make_locked(pfn, flags); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); @@ -817,9 +816,9 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* - * Only swap our new entry into the radix tree if the current + * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or - * PMD entry is already in the tree, we leave it alone. This + * PMD entry is already in the cache, we leave it alone. This * means that if we are trying to insert a PTE and the * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. @@ -842,8 +841,8 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, return entry; } -static inline unsigned long -pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +static inline +unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) { unsigned long address; @@ -853,8 +852,8 @@ pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) } /* Walk all mappings of a given index of a file and writeprotect them */ -static void dax_mapping_entry_mkclean(struct address_space *mapping, - pgoff_t index, unsigned long pfn) +static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, + unsigned long pfn) { struct vm_area_struct *vma; pte_t pte, *ptep = NULL; @@ -950,7 +949,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, * compare pfns as we must not bail out due to difference in lockbit * or entry type. */ - if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) + if (dax_to_pfn(entry2) != dax_to_pfn(entry)) goto put_unlocked; if (WARN_ON_ONCE(dax_is_empty_entry(entry) || dax_is_zero_entry(entry))) { @@ -980,10 +979,10 @@ static int dax_writeback_one(struct dax_device *dax_dev, * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ - pfn = dax_radix_pfn(entry); - size = PAGE_SIZE << dax_radix_order(entry); + pfn = dax_to_pfn(entry); + size = PAGE_SIZE << dax_entry_order(entry); - dax_mapping_entry_mkclean(mapping, index, pfn); + dax_entry_mkclean(mapping, index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. There @@ -1120,7 +1119,7 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - dax_insert_mapping_entry(mapping, vmf, entry, pfn, + dax_insert_entry(mapping, vmf, entry, pfn, DAX_ZERO_PAGE, false); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); @@ -1431,7 +1430,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto error_finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, + entry = dax_insert_entry(mapping, vmf, entry, pfn, 0, write && !sync); /* @@ -1511,7 +1510,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; pfn = page_to_pfn_t(zero_page); - ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, + ret = dax_insert_entry(mapping, vmf, entry, pfn, DAX_PMD | DAX_ZERO_PAGE, false); ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); @@ -1564,7 +1563,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * Make sure that the faulting address's PMD offset (color) matches * the PMD offset from the start of the file. This is necessary so * that a PMD range in the page table overlaps exactly with a PMD - * range in the radix tree. + * range in the page cache. */ if ((vmf->pgoff & PG_PMD_COLOUR) != ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) @@ -1632,7 +1631,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto finish_iomap; - entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, + entry = dax_insert_entry(mapping, vmf, entry, pfn, DAX_PMD, write && !sync); /* @@ -1725,15 +1724,14 @@ vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, } EXPORT_SYMBOL_GPL(dax_iomap_fault); -/** +/* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault * @pe_size: Size of entry to be inserted * @pfn: PFN to insert * - * This function inserts writeable PTE or PMD entry into page tables for mmaped - * DAX file. It takes care of marking corresponding radix tree entry as dirty - * as well. + * This function inserts a writeable PTE or PMD entry into the page tables + * for an mmaped DAX file. It also marks the page cache entry as dirty. */ static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, enum page_entry_size pe_size, -- cgit v1.2.3 From ec4907ff69fb16161d9d9370260303a73dd5acde Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 28 Mar 2018 11:01:43 -0400 Subject: dax: Hash on XArray instead of mapping Since the XArray is embedded in the struct address_space, its address contains exactly as much entropy as the address of the mapping. This patch is purely preparatory for later patches which will simplify the wait/wake interfaces. Signed-off-by: Matthew Wilcox --- fs/dax.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index acdb996efad2..fd111ea1da3b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -116,7 +116,7 @@ static int dax_is_empty_entry(void *entry) * DAX page cache entry locking */ struct exceptional_entry_key { - struct address_space *mapping; + struct xarray *xa; pgoff_t entry_start; }; @@ -125,7 +125,7 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, +static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa, pgoff_t index, void *entry, struct exceptional_entry_key *key) { unsigned long hash; @@ -138,21 +138,21 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; - key->mapping = mapping; + key->xa = xa; key->entry_start = index; - hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + hash = hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } -static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mode, - int sync, void *keyp) +static int wake_exceptional_entry_func(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *keyp) { struct exceptional_entry_key *key = keyp; struct wait_exceptional_entry_queue *ewait = container_of(wait, struct wait_exceptional_entry_queue, wait); - if (key->mapping != ewait->key.mapping || + if (key->xa != ewait->key.xa || key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); @@ -163,13 +163,13 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_mapping_entry_waiter(struct address_space *mapping, +static void dax_wake_mapping_entry_waiter(struct xarray *xa, pgoff_t index, void *entry, bool wake_all) { struct exceptional_entry_key key; wait_queue_head_t *wq; - wq = dax_entry_waitqueue(mapping, index, entry, &key); + wq = dax_entry_waitqueue(xa, index, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -248,7 +248,8 @@ static void *__get_unlocked_mapping_entry(struct address_space *mapping, return entry; } - wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); + wq = dax_entry_waitqueue(&mapping->i_pages, index, entry, + &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xa_unlock_irq(&mapping->i_pages); @@ -289,7 +290,7 @@ static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) } unlock_slot(mapping, slot); xa_unlock_irq(&mapping->i_pages); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false); } static void put_locked_mapping_entry(struct address_space *mapping, @@ -309,7 +310,7 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, return; /* We have to wake up next waiter for the page cache entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, entry, false); + dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false); } static unsigned long dax_entry_size(void *entry) @@ -578,8 +579,8 @@ restart: dax_disassociate_entry(entry, mapping, false); radix_tree_delete(&mapping->i_pages, index); mapping->nrexceptional--; - dax_wake_mapping_entry_waiter(mapping, index, entry, - true); + dax_wake_mapping_entry_waiter(&mapping->i_pages, + index, entry, true); } entry = dax_make_locked(0, size_flag | DAX_EMPTY); -- cgit v1.2.3 From cfc93c6c6c963d96ef7081c17fb95d2738a75f55 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 28 Mar 2018 11:48:03 -0400 Subject: dax: Convert dax_insert_pfn_mkwrite to XArray Add some XArray-based helper functions to replace the radix tree based metaphors currently in use. The biggest change is that converted code doesn't see its own lock bit; get_unlocked_entry() always returns an entry with the lock bit clear. So we don't have to mess around loading the current entry and clearing the lock bit; we can just store the unlocked entry that we already have. Signed-off-by: Matthew Wilcox --- fs/dax.c | 149 +++++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 117 insertions(+), 32 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index fd111ea1da3b..8873e4b0bc98 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -38,6 +38,17 @@ #define CREATE_TRACE_POINTS #include +static inline unsigned int pe_order(enum page_entry_size pe_size) +{ + if (pe_size == PE_SIZE_PTE) + return PAGE_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PMD) + return PMD_SHIFT - PAGE_SHIFT; + if (pe_size == PE_SIZE_PUD) + return PUD_SHIFT - PAGE_SHIFT; + return ~0; +} + /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -46,6 +57,9 @@ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) +/* The order of a PMD entry */ +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -85,10 +99,15 @@ static void *dax_make_locked(unsigned long pfn, unsigned long flags) DAX_LOCKED); } +static bool dax_is_locked(void *entry) +{ + return xa_to_value(entry) & DAX_LOCKED; +} + static unsigned int dax_entry_order(void *entry) { if (xa_to_value(entry) & DAX_PMD) - return PMD_SHIFT - PAGE_SHIFT; + return PMD_ORDER; return 0; } @@ -181,6 +200,81 @@ static void dax_wake_mapping_entry_waiter(struct xarray *xa, __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); } +static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) +{ + return dax_wake_mapping_entry_waiter(xas->xa, xas->xa_index, entry, + wake_all); +} + +/* + * Look up entry in page cache, wait for it to become unlocked if it + * is a DAX entry and return it. The caller must subsequently call + * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() + * if it did. + * + * Must be called with the i_pages lock held. + */ +static void *get_unlocked_entry(struct xa_state *xas) +{ + void *entry; + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq; + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + + for (;;) { + entry = xas_load(xas); + if (!entry || xa_is_internal(entry) || + WARN_ON_ONCE(!xa_is_value(entry)) || + !dax_is_locked(entry)) + return entry; + + wq = dax_entry_waitqueue(xas->xa, xas->xa_index, entry, + &ewait.key); + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + xas_unlock_irq(xas); + xas_reset(xas); + schedule(); + finish_wait(wq, &ewait.wait); + xas_lock_irq(xas); + } +} + +static void put_unlocked_entry(struct xa_state *xas, void *entry) +{ + /* If we were the only waiter woken, wake the next one */ + if (entry) + dax_wake_entry(xas, entry, false); +} + +/* + * We used the xa_state to get the entry, but then we locked the entry and + * dropped the xa_lock, so we know the xa_state is stale and must be reset + * before use. + */ +static void dax_unlock_entry(struct xa_state *xas, void *entry) +{ + void *old; + + xas_reset(xas); + xas_lock_irq(xas); + old = xas_store(xas, entry); + xas_unlock_irq(xas); + BUG_ON(!dax_is_locked(old)); + dax_wake_entry(xas, entry, false); +} + +/* + * Return: The entry stored at this location before it was locked. + */ +static void *dax_lock_entry(struct xa_state *xas, void *entry) +{ + unsigned long v = xa_to_value(entry); + return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); +} + /* * Check whether the given slot is locked. Must be called with the i_pages * lock held. @@ -1728,50 +1822,46 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); /* * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted * @pfn: PFN to insert + * @order: Order of entry to insert. * * This function inserts a writeable PTE or PMD entry into the page tables * for an mmaped DAX file. It also marks the page cache entry as dirty. */ -static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, - enum page_entry_size pe_size, - pfn_t pfn) +static vm_fault_t +dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - void *entry, **slot; - pgoff_t index = vmf->pgoff; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); + void *entry; vm_fault_t ret; - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); /* Did we race with someone splitting entry or so? */ if (!entry || - (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || - (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); + (order == 0 && !dax_is_pte_entry(entry)) || + (order == PMD_ORDER && (xa_is_internal(entry) || + !dax_is_pmd_entry(entry)))) { + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } - radix_tree_tag_set(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - switch (pe_size) { - case PE_SIZE_PTE: + xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + if (order == 0) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - break; #ifdef CONFIG_FS_DAX_PMD - case PE_SIZE_PMD: + else if (order == PMD_ORDER) ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); - break; #endif - default: + else ret = VM_FAULT_FALLBACK; - } - put_locked_mapping_entry(mapping, index); + dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; } @@ -1791,17 +1881,12 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - size_t len = 0; + unsigned int order = pe_order(pe_size); + size_t len = PAGE_SIZE << order; - if (pe_size == PE_SIZE_PTE) - len = PAGE_SIZE; - else if (pe_size == PE_SIZE_PMD) - len = PMD_SIZE; - else - WARN_ON_ONCE(1); err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); if (err) return VM_FAULT_SIGBUS; - return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); + return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); -- cgit v1.2.3 From 084a899008cec6db91dbf06453b2e3a36ceed02d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 17 May 2018 13:03:48 -0400 Subject: dax: Convert dax_layout_busy_page to XArray Instead of using a pagevec, just use the XArray iterators. Add a conditional rescheduling point which probably should have been there in the original. Signed-off-by: Matthew Wilcox --- fs/dax.c | 64 +++++++++++++++++++++------------------------------------------- 1 file changed, 21 insertions(+), 43 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 8873e4b0bc98..28a51f2e8e50 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -722,11 +722,10 @@ restart: */ struct page *dax_layout_busy_page(struct address_space *mapping) { - pgoff_t indices[PAGEVEC_SIZE]; + XA_STATE(xas, &mapping->i_pages, 0); + void *entry; + unsigned int scanned = 0; struct page *page = NULL; - struct pagevec pvec; - pgoff_t index, end; - unsigned i; /* * In the 'limited' case get_user_pages() for dax is disabled. @@ -737,13 +736,9 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; - pagevec_init(&pvec); - index = 0; - end = -1; - /* * If we race get_user_pages_fast() here either we'll see the - * elevated page count in the pagevec_lookup and wait, or + * elevated page count in the iteration and wait, or * get_user_pages_fast() will see that the page it took a reference * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by @@ -755,43 +750,26 @@ struct page *dax_layout_busy_page(struct address_space *mapping) */ unmap_mapping_range(mapping, 0, 0, 1); - while (index < end && pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *pvec_ent = pvec.pages[i]; - void *entry; - - index = indices[i]; - if (index >= end) - break; - - if (WARN_ON_ONCE(!xa_is_value(pvec_ent))) - continue; - - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (entry) - page = dax_busy_page(entry); - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(&mapping->i_pages); - if (page) - break; - } - - /* - * We don't expect normal struct page entries to exist in our - * tree, but we keep these pagevec calls so that this code is - * consistent with the common pattern for handling pagevecs - * throughout the kernel. - */ - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - index++; - + xas_lock_irq(&xas); + xas_for_each(&xas, entry, ULONG_MAX) { + if (WARN_ON_ONCE(!xa_is_value(entry))) + continue; + if (unlikely(dax_is_locked(entry))) + entry = get_unlocked_entry(&xas); + if (entry) + page = dax_busy_page(entry); + put_unlocked_entry(&xas, entry); if (page) break; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } + xas_unlock_irq(&xas); return page; } EXPORT_SYMBOL_GPL(dax_layout_busy_page); -- cgit v1.2.3 From 07f2d89cc270936ac314e7cc4ac57077d7f08aef Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 28 Mar 2018 15:40:41 -0400 Subject: dax: Convert __dax_invalidate_entry to XArray Avoids walking the radix tree multiple times looking for tags. Signed-off-by: Matthew Wilcox --- fs/dax.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 28a51f2e8e50..83a510068b95 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -777,27 +777,28 @@ EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, pgoff_t index, bool trunc) { + XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; - struct radix_tree_root *pages = &mapping->i_pages; - xa_lock_irq(pages); - entry = get_unlocked_mapping_entry(mapping, index, NULL); + xas_lock_irq(&xas); + entry = get_unlocked_entry(&xas); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && - (radix_tree_tag_get(pages, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE))) + (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || + xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) goto out; dax_disassociate_entry(entry, mapping, trunc); - radix_tree_delete(pages, index); + xas_store(&xas, NULL); mapping->nrexceptional--; ret = 1; out: - put_unlocked_mapping_entry(mapping, index, entry); - xa_unlock_irq(pages); + put_unlocked_entry(&xas, entry); + xas_unlock_irq(&xas); return ret; } + /* * Delete DAX entry at @index from @mapping. Wait for it * to be unlocked before deleting it. -- cgit v1.2.3 From 9fc747f68d49f4b63029e3a1e87c49d23771a199 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 28 Mar 2018 16:03:45 -0400 Subject: dax: Convert dax writeback to XArray Use XArray iteration instead of a pagevec. Signed-off-by: Matthew Wilcox --- fs/dax.c | 130 ++++++++++++++++++++++++++++++--------------------------------- 1 file changed, 62 insertions(+), 68 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 83a510068b95..de3ba829a3f4 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -997,11 +997,9 @@ unlock_pte: i_mmap_unlock_read(mapping); } -static int dax_writeback_one(struct dax_device *dax_dev, - struct address_space *mapping, pgoff_t index, void *entry) +static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, + struct address_space *mapping, void *entry) { - struct radix_tree_root *pages = &mapping->i_pages; - void *entry2, **slot; unsigned long pfn; long ret = 0; size_t size; @@ -1013,29 +1011,35 @@ static int dax_writeback_one(struct dax_device *dax_dev, if (WARN_ON(!xa_is_value(entry))) return -EIO; - xa_lock_irq(pages); - entry2 = get_unlocked_mapping_entry(mapping, index, &slot); - /* Entry got punched out / reallocated? */ - if (!entry2 || WARN_ON_ONCE(!xa_is_value(entry2))) - goto put_unlocked; - /* - * Entry got reallocated elsewhere? No need to writeback. We have to - * compare pfns as we must not bail out due to difference in lockbit - * or entry type. - */ - if (dax_to_pfn(entry2) != dax_to_pfn(entry)) - goto put_unlocked; - if (WARN_ON_ONCE(dax_is_empty_entry(entry) || - dax_is_zero_entry(entry))) { - ret = -EIO; - goto put_unlocked; + if (unlikely(dax_is_locked(entry))) { + void *old_entry = entry; + + entry = get_unlocked_entry(xas); + + /* Entry got punched out / reallocated? */ + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. + * We have to compare pfns as we must not bail out due to + * difference in lockbit or entry type. + */ + if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) + goto put_unlocked; + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { + ret = -EIO; + goto put_unlocked; + } + + /* Another fsync thread may have already done this entry */ + if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; } - /* Another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(pages, index, PAGECACHE_TAG_TOWRITE)) - goto put_unlocked; /* Lock the entry to serialize with page faults */ - entry = lock_slot(mapping, slot); + dax_lock_entry(xas, entry); + /* * We can clear the tag now but we have to be careful so that concurrent * dax_writeback_one() calls for the same index cannot finish before we @@ -1043,8 +1047,8 @@ static int dax_writeback_one(struct dax_device *dax_dev, * at the entry only under the i_pages lock and once they do that * they will see the entry locked and wait for it to unlock. */ - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_TOWRITE); - xa_unlock_irq(pages); + xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irq(xas); /* * Even if dax_writeback_mapping_range() was given a wbc->range_start @@ -1056,7 +1060,7 @@ static int dax_writeback_one(struct dax_device *dax_dev, pfn = dax_to_pfn(entry); size = PAGE_SIZE << dax_entry_order(entry); - dax_entry_mkclean(mapping, index, pfn); + dax_entry_mkclean(mapping, xas->xa_index, pfn); dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); /* * After we have flushed the cache, we can clear the dirty tag. There @@ -1064,16 +1068,18 @@ static int dax_writeback_one(struct dax_device *dax_dev, * the pfn mappings are writeprotected and fault waits for mapping * entry lock. */ - xa_lock_irq(pages); - radix_tree_tag_clear(pages, index, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); - trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); - put_locked_mapping_entry(mapping, index); + xas_reset(xas); + xas_lock_irq(xas); + xas_store(xas, entry); + xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); + dax_wake_entry(xas, entry, false); + + trace_dax_writeback_one(mapping->host, xas->xa_index, + size >> PAGE_SHIFT); return ret; put_unlocked: - put_unlocked_mapping_entry(mapping, index, entry2); - xa_unlock_irq(pages); + put_unlocked_entry(xas, entry); return ret; } @@ -1085,13 +1091,13 @@ static int dax_writeback_one(struct dax_device *dax_dev, int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { + XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); struct inode *inode = mapping->host; - pgoff_t start_index, end_index; - pgoff_t indices[PAGEVEC_SIZE]; + pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; struct dax_device *dax_dev; - struct pagevec pvec; - bool done = false; - int i, ret = 0; + void *entry; + int ret = 0; + unsigned int scanned = 0; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -1103,41 +1109,29 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!dax_dev) return -EIO; - start_index = wbc->range_start >> PAGE_SHIFT; - end_index = wbc->range_end >> PAGE_SHIFT; - - trace_dax_writeback_range(inode, start_index, end_index); - - tag_pages_for_writeback(mapping, start_index, end_index); + trace_dax_writeback_range(inode, xas.xa_index, end_index); - pagevec_init(&pvec); - while (!done) { - pvec.nr = find_get_entries_tag(mapping, start_index, - PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, - pvec.pages, indices); + tag_pages_for_writeback(mapping, xas.xa_index, end_index); - if (pvec.nr == 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { + ret = dax_writeback_one(&xas, dax_dev, mapping, entry); + if (ret < 0) { + mapping_set_error(mapping, ret); break; - - for (i = 0; i < pvec.nr; i++) { - if (indices[i] > end_index) { - done = true; - break; - } - - ret = dax_writeback_one(dax_dev, mapping, indices[i], - pvec.pages[i]); - if (ret < 0) { - mapping_set_error(mapping, ret); - goto out; - } } - start_index = indices[pvec.nr - 1] + 1; + if (++scanned % XA_CHECK_SCHED) + continue; + + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); } -out: + xas_unlock_irq(&xas); put_dax(dax_dev); - trace_dax_writeback_range_done(inode, start_index, end_index); - return (ret < 0 ? ret : 0); + trace_dax_writeback_range_done(inode, xas.xa_index, end_index); + return ret; } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -- cgit v1.2.3 From 9f32d221301c3e754b24c77ab11bf793b19f51b5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 12 Jun 2018 09:46:30 -0400 Subject: dax: Convert dax_lock_mapping_entry to XArray Instead of always retrying when we slept, only retry if the page has moved. Signed-off-by: Matthew Wilcox --- fs/dax.c | 83 +++++++++++++++++++++++++++------------------------------------- 1 file changed, 35 insertions(+), 48 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index de3ba829a3f4..fc2745ca3308 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -99,6 +99,17 @@ static void *dax_make_locked(unsigned long pfn, unsigned long flags) DAX_LOCKED); } +static void *dax_make_entry(pfn_t pfn, unsigned long flags) +{ + return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); +} + +static void *dax_make_page_entry(struct page *page) +{ + pfn_t pfn = page_to_pfn_t(page); + return dax_make_entry(pfn, PageHead(page) ? DAX_PMD : 0); +} + static bool dax_is_locked(void *entry) { return xa_to_value(entry) & DAX_LOCKED; @@ -487,33 +498,16 @@ static struct page *dax_busy_page(void *entry) return NULL; } -static bool entry_wait_revalidate(void) -{ - rcu_read_unlock(); - schedule(); - rcu_read_lock(); - - /* - * Tell __get_unlocked_mapping_entry() to take a break, we need - * to revalidate page->mapping after dropping locks - */ - return true; -} - bool dax_lock_mapping_entry(struct page *page) { - pgoff_t index; - struct inode *inode; - bool did_lock = false; - void *entry = NULL, **slot; - struct address_space *mapping; + XA_STATE(xas, NULL, 0); + void *entry; - rcu_read_lock(); for (;;) { - mapping = READ_ONCE(page->mapping); + struct address_space *mapping = READ_ONCE(page->mapping); if (!dax_mapping(mapping)) - break; + return false; /* * In the device-dax case there's no need to lock, a @@ -522,47 +516,40 @@ bool dax_lock_mapping_entry(struct page *page) * otherwise we would not have a valid pfn_to_page() * translation. */ - inode = mapping->host; - if (S_ISCHR(inode->i_mode)) { - did_lock = true; - break; - } + if (S_ISCHR(mapping->host->i_mode)) + return true; - xa_lock_irq(&mapping->i_pages); + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); if (mapping != page->mapping) { - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); continue; } - index = page->index; - - entry = __get_unlocked_mapping_entry(mapping, index, &slot, - entry_wait_revalidate); - if (!entry) { - xa_unlock_irq(&mapping->i_pages); - break; - } else if (IS_ERR(entry)) { - WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN); - continue; + xas_set(&xas, page->index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + entry = get_unlocked_entry(&xas); + /* Did the page move while we slept? */ + if (dax_to_pfn(entry) != page_to_pfn(page)) { + xas_unlock_irq(&xas); + continue; + } } - lock_slot(mapping, slot); - did_lock = true; - xa_unlock_irq(&mapping->i_pages); - break; + dax_lock_entry(&xas, entry); + xas_unlock_irq(&xas); + return true; } - rcu_read_unlock(); - - return did_lock; } void dax_unlock_mapping_entry(struct page *page) { struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + XA_STATE(xas, &mapping->i_pages, page->index); - if (S_ISCHR(inode->i_mode)) + if (S_ISCHR(mapping->host->i_mode)) return; - unlock_mapping_entry(mapping, page->index); + dax_unlock_entry(&xas, dax_make_page_entry(page)); } /* -- cgit v1.2.3 From b15cd800682fcaf27048b05e42f5c208e4c756c0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 29 Mar 2018 22:58:27 -0400 Subject: dax: Convert page fault handlers to XArray This is the last part of DAX to be converted to the XArray so remove all the old helper functions. Signed-off-by: Matthew Wilcox --- fs/dax.c | 429 +++++++++++++++++++-------------------------------------------- 1 file changed, 128 insertions(+), 301 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index fc2745ca3308..616e36ea6aaa 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -93,12 +93,6 @@ static unsigned long dax_to_pfn(void *entry) return xa_to_value(entry) >> DAX_SHIFT; } -static void *dax_make_locked(unsigned long pfn, unsigned long flags) -{ - return xa_mk_value(flags | ((unsigned long)pfn << DAX_SHIFT) | - DAX_LOCKED); -} - static void *dax_make_entry(pfn_t pfn, unsigned long flags) { return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); @@ -155,10 +149,11 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; -static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa, - pgoff_t index, void *entry, struct exceptional_entry_key *key) +static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, + void *entry, struct exceptional_entry_key *key) { unsigned long hash; + unsigned long index = xas->xa_index; /* * If 'entry' is a PMD, align the 'index' that we use for the wait @@ -167,11 +162,10 @@ static wait_queue_head_t *dax_entry_waitqueue(struct xarray *xa, */ if (dax_is_pmd_entry(entry)) index &= ~PG_PMD_COLOUR; - - key->xa = xa; + key->xa = xas->xa; key->entry_start = index; - hash = hash_long((unsigned long)xa ^ index, DAX_WAIT_TABLE_BITS); + hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); return wait_table + hash; } @@ -193,13 +187,12 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_mapping_entry_waiter(struct xarray *xa, - pgoff_t index, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) { struct exceptional_entry_key key; wait_queue_head_t *wq; - wq = dax_entry_waitqueue(xa, index, entry, &key); + wq = dax_entry_waitqueue(xas, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -211,12 +204,6 @@ static void dax_wake_mapping_entry_waiter(struct xarray *xa, __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); } -static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) -{ - return dax_wake_mapping_entry_waiter(xas->xa, xas->xa_index, entry, - wake_all); -} - /* * Look up entry in page cache, wait for it to become unlocked if it * is a DAX entry and return it. The caller must subsequently call @@ -241,8 +228,7 @@ static void *get_unlocked_entry(struct xa_state *xas) !dax_is_locked(entry)) return entry; - wq = dax_entry_waitqueue(xas->xa, xas->xa_index, entry, - &ewait.key); + wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); xas_unlock_irq(xas); @@ -286,138 +272,6 @@ static void *dax_lock_entry(struct xa_state *xas, void *entry) return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); } -/* - * Check whether the given slot is locked. Must be called with the i_pages - * lock held. - */ -static inline int slot_locked(struct address_space *mapping, void **slot) -{ - unsigned long entry = xa_to_value( - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); - return entry & DAX_LOCKED; -} - -/* - * Mark the given slot as locked. Must be called with the i_pages lock held. - */ -static inline void *lock_slot(struct address_space *mapping, void **slot) -{ - unsigned long v = xa_to_value( - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); - void *entry = xa_mk_value(v | DAX_LOCKED); - radix_tree_replace_slot(&mapping->i_pages, slot, entry); - return entry; -} - -/* - * Mark the given slot as unlocked. Must be called with the i_pages lock held. - */ -static inline void *unlock_slot(struct address_space *mapping, void **slot) -{ - unsigned long v = xa_to_value( - radix_tree_deref_slot_protected(slot, &mapping->i_pages.xa_lock)); - void *entry = xa_mk_value(v & ~DAX_LOCKED); - radix_tree_replace_slot(&mapping->i_pages, slot, entry); - return entry; -} - -/* - * Lookup entry in page cache, wait for it to become unlocked if it is - * a DAX entry and return it. The caller must call - * put_unlocked_mapping_entry() when he decided not to lock the entry or - * put_locked_mapping_entry() when he locked the entry and now wants to - * unlock it. - * - * Must be called with the i_pages lock held. - */ -static void *__get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp, bool (*wait_fn)(void)) -{ - void *entry, **slot; - struct wait_exceptional_entry_queue ewait; - wait_queue_head_t *wq; - - init_wait(&ewait.wait); - ewait.wait.func = wake_exceptional_entry_func; - - for (;;) { - bool revalidate; - - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, - &slot); - if (!entry || - WARN_ON_ONCE(!xa_is_value(entry)) || - !slot_locked(mapping, slot)) { - if (slotp) - *slotp = slot; - return entry; - } - - wq = dax_entry_waitqueue(&mapping->i_pages, index, entry, - &ewait.key); - prepare_to_wait_exclusive(wq, &ewait.wait, - TASK_UNINTERRUPTIBLE); - xa_unlock_irq(&mapping->i_pages); - revalidate = wait_fn(); - finish_wait(wq, &ewait.wait); - xa_lock_irq(&mapping->i_pages); - if (revalidate) - return ERR_PTR(-EAGAIN); - } -} - -static bool entry_wait(void) -{ - schedule(); - /* - * Never return an ERR_PTR() from - * __get_unlocked_mapping_entry(), just keep looping. - */ - return false; -} - -static void *get_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void ***slotp) -{ - return __get_unlocked_mapping_entry(mapping, index, slotp, entry_wait); -} - -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - xa_lock_irq(&mapping->i_pages); - entry = __radix_tree_lookup(&mapping->i_pages, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !xa_is_value(entry) || - !slot_locked(mapping, slot))) { - xa_unlock_irq(&mapping->i_pages); - return; - } - unlock_slot(mapping, slot); - xa_unlock_irq(&mapping->i_pages); - dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false); -} - -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index) -{ - unlock_mapping_entry(mapping, index); -} - -/* - * Called when we are done with page cache entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. - */ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!entry) - return; - - /* We have to wake up next waiter for the page cache entry lock */ - dax_wake_mapping_entry_waiter(&mapping->i_pages, index, entry, false); -} - static unsigned long dax_entry_size(void *entry) { if (dax_is_zero_entry(entry)) @@ -558,47 +412,52 @@ void dax_unlock_mapping_entry(struct page *page) * that index, add a locked empty entry. * * When requesting an entry with size DAX_PMD, grab_mapping_entry() will - * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries within the 2MiB range that we are - * requesting. + * either return that locked entry or will return VM_FAULT_FALLBACK. + * This will happen if there are any PTE entries within the PMD range + * that we are requesting. * - * We always favor 4k entries over 2MiB entries. There isn't a flow where we - * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB - * insertion will fail if it finds any 4k entries already in the tree, and a - * 4k insertion will cause an existing 2MiB entry to be unmapped and - * downgraded to 4k entries. This happens for both 2MiB huge zero pages as - * well as 2MiB empty entries. + * We always favor PTE entries over PMD entries. There isn't a flow where we + * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD + * insertion will fail if it finds any PTE entries already in the tree, and a + * PTE insertion will cause an existing PMD entry to be unmapped and + * downgraded to PTE entries. This happens for both PMD zero pages as + * well as PMD empty entries. * - * The exception to this downgrade path is for 2MiB DAX PMD entries that have - * real storage backing them. We will leave these real 2MiB DAX entries in - * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * The exception to this downgrade path is for PMD entries that have + * real storage backing them. We will leave these real PMD entries in + * the tree, and PTE writes will simply dirty the entire PMD entry. * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. + * + * On error, this function does not return an ERR_PTR. Instead it returns + * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values + * overlap with xarray value entries. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, - unsigned long size_flag) +static void *grab_mapping_entry(struct xa_state *xas, + struct address_space *mapping, unsigned long size_flag) { - bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ - void *entry, **slot; - -restart: - xa_lock_irq(&mapping->i_pages); - entry = get_unlocked_mapping_entry(mapping, index, &slot); + unsigned long index = xas->xa_index; + bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ + void *entry; - if (WARN_ON_ONCE(entry && !xa_is_value(entry))) { - entry = ERR_PTR(-EIO); - goto out_unlock; - } +retry: + xas_lock_irq(xas); + entry = get_unlocked_entry(xas); + if (xa_is_internal(entry)) + goto fallback; if (entry) { + if (WARN_ON_ONCE(!xa_is_value(entry))) { + xas_set_err(xas, EIO); + goto out_unlock; + } + if (size_flag & DAX_PMD) { if (dax_is_pte_entry(entry)) { - put_unlocked_mapping_entry(mapping, index, - entry); - entry = ERR_PTR(-EEXIST); - goto out_unlock; + put_unlocked_entry(xas, entry); + goto fallback; } } else { /* trying to grab a PTE entry */ if (dax_is_pmd_entry(entry) && @@ -609,87 +468,57 @@ restart: } } - /* No entry for given index? Make sure radix tree is big enough. */ - if (!entry || pmd_downgrade) { - int err; - - if (pmd_downgrade) { - /* - * Make sure 'entry' remains valid while we drop - * the i_pages lock. - */ - entry = lock_slot(mapping, slot); - } + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * the i_pages lock. + */ + dax_lock_entry(xas, entry); - xa_unlock_irq(&mapping->i_pages); /* * Besides huge zero pages the only other thing that gets * downgraded are empty entries which don't need to be * unmapped. */ - if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); - - err = radix_tree_preload( - mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) { - if (pmd_downgrade) - put_locked_mapping_entry(mapping, index); - return ERR_PTR(err); - } - xa_lock_irq(&mapping->i_pages); - - if (!entry) { - /* - * We needed to drop the i_pages lock while calling - * radix_tree_preload() and we didn't have an entry to - * lock. See if another thread inserted an entry at - * our index during this time. - */ - entry = __radix_tree_lookup(&mapping->i_pages, index, - NULL, &slot); - if (entry) { - radix_tree_preload_end(); - xa_unlock_irq(&mapping->i_pages); - goto restart; - } + if (dax_is_zero_entry(entry)) { + xas_unlock_irq(xas); + unmap_mapping_pages(mapping, + xas->xa_index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); + xas_reset(xas); + xas_lock_irq(xas); } - if (pmd_downgrade) { - dax_disassociate_entry(entry, mapping, false); - radix_tree_delete(&mapping->i_pages, index); - mapping->nrexceptional--; - dax_wake_mapping_entry_waiter(&mapping->i_pages, - index, entry, true); - } + dax_disassociate_entry(entry, mapping, false); + xas_store(xas, NULL); /* undo the PMD join */ + dax_wake_entry(xas, entry, true); + mapping->nrexceptional--; + entry = NULL; + xas_set(xas, index); + } - entry = dax_make_locked(0, size_flag | DAX_EMPTY); - - err = __radix_tree_insert(&mapping->i_pages, index, - dax_entry_order(entry), entry); - radix_tree_preload_end(); - if (err) { - xa_unlock_irq(&mapping->i_pages); - /* - * Our insertion of a DAX entry failed, most likely - * because we were inserting a PMD entry and it - * collided with a PTE sized entry at a different - * index in the PMD range. We haven't inserted - * anything into the radix tree and have no waiters to - * wake. - */ - return ERR_PTR(err); - } - /* Good, we have inserted empty locked entry into the tree. */ + if (entry) { + dax_lock_entry(xas, entry); + } else { + entry = dax_make_entry(pfn_to_pfn_t(0), size_flag | DAX_EMPTY); + dax_lock_entry(xas, entry); + if (xas_error(xas)) + goto out_unlock; mapping->nrexceptional++; - xa_unlock_irq(&mapping->i_pages); - return entry; } - entry = lock_slot(mapping, slot); - out_unlock: - xa_unlock_irq(&mapping->i_pages); + +out_unlock: + xas_unlock_irq(xas); + if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) + goto retry; + if (xas->xa_node == XA_ERROR(-ENOMEM)) + return xa_mk_internal(VM_FAULT_OOM); + if (xas_error(xas)) + return xa_mk_internal(VM_FAULT_SIGBUS); return entry; +fallback: + xas_unlock_irq(xas); + return xa_mk_internal(VM_FAULT_FALLBACK); } /** @@ -847,29 +676,27 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. */ -static void *dax_insert_entry(struct address_space *mapping, - struct vm_fault *vmf, - void *entry, pfn_t pfn_t, unsigned long flags, bool dirty) +static void *dax_insert_entry(struct xa_state *xas, + struct address_space *mapping, struct vm_fault *vmf, + void *entry, pfn_t pfn, unsigned long flags, bool dirty) { - struct radix_tree_root *pages = &mapping->i_pages; - unsigned long pfn = pfn_t_to_pfn(pfn_t); - pgoff_t index = vmf->pgoff; - void *new_entry; + void *new_entry = dax_make_entry(pfn, flags); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { + unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, - PG_PMD_NR, false); + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_pages(mapping, vmf->pgoff, 1, false); + unmap_mapping_pages(mapping, index, 1, false); } - xa_lock_irq(pages); - new_entry = dax_make_locked(pfn, flags); + xas_reset(xas); + xas_lock_irq(xas); if (dax_entry_size(entry) != dax_entry_size(new_entry)) { dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); @@ -884,21 +711,18 @@ static void *dax_insert_entry(struct address_space *mapping, * existing entry is a PMD, we will just leave the PMD in the * tree and dirty it if necessary. */ - struct radix_tree_node *node; - void **slot; - void *ret; - - ret = __radix_tree_lookup(pages, index, &node, &slot); - WARN_ON_ONCE(ret != entry); - __radix_tree_replace(pages, node, slot, - new_entry, NULL); + void *old = dax_lock_entry(xas, new_entry); + WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | + DAX_LOCKED)); entry = new_entry; + } else { + xas_load(xas); /* Walk the xa_state */ } if (dirty) - radix_tree_tag_set(pages, index, PAGECACHE_TAG_DIRTY); + xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - xa_unlock_irq(pages); + xas_unlock_irq(xas); return entry; } @@ -1166,15 +990,16 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. */ -static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, - struct vm_fault *vmf) +static vm_fault_t dax_load_hole(struct xa_state *xas, + struct address_space *mapping, void **entry, + struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - dax_insert_entry(mapping, vmf, entry, pfn, + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, DAX_ZERO_PAGE, false); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); @@ -1384,6 +1209,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE(xas, &mapping->i_pages, vmf->pgoff); struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; @@ -1410,9 +1236,9 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (write && !vmf->cow_page) flags |= IOMAP_WRITE; - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - ret = dax_fault_return(PTR_ERR(entry)); + entry = grab_mapping_entry(&xas, mapping, 0); + if (xa_is_internal(entry)) { + ret = xa_to_internal(entry); goto out; } @@ -1485,7 +1311,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto error_finish_iomap; - entry = dax_insert_entry(mapping, vmf, entry, pfn, + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, 0, write && !sync); /* @@ -1513,7 +1339,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(&xas, mapping, &entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1540,21 +1366,20 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff); + dax_unlock_entry(&xas, entry); out: trace_dax_pte_fault_done(inode, vmf, ret); return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void *entry) +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + struct iomap *iomap, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; struct inode *inode = mapping->host; struct page *zero_page; - void *ret = NULL; spinlock_t *ptl; pmd_t pmd_entry; pfn_t pfn; @@ -1565,7 +1390,7 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; pfn = page_to_pfn_t(zero_page); - ret = dax_insert_entry(mapping, vmf, entry, pfn, + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, DAX_PMD | DAX_ZERO_PAGE, false); ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); @@ -1578,11 +1403,11 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pmd_entry = pmd_mkhuge(pmd_entry); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); return VM_FAULT_NOPAGE; fallback: - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); return VM_FAULT_FALLBACK; } @@ -1591,6 +1416,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; + XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; @@ -1598,7 +1424,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; - pgoff_t max_pgoff, pgoff; + pgoff_t max_pgoff; void *entry; loff_t pos; int error; @@ -1609,7 +1435,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ - pgoff = linear_page_index(vma, pmd_addr); max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1634,24 +1459,26 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff >= max_pgoff) { + if (xas.xa_index >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) + if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* - * grab_mapping_entry() will make sure we get a 2MiB empty entry, a - * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page - * is already in the tree, for instance), it will return -EEXIST and - * we just fall back to 4k entries. + * grab_mapping_entry() will make sure we get an empty PMD entry, + * a zero PMD entry or a DAX PMD. If it can't (because a PTE + * entry is already in the array, for instance), it will return + * VM_FAULT_FALLBACK. */ - entry = grab_mapping_entry(mapping, pgoff, DAX_PMD); - if (IS_ERR(entry)) + entry = grab_mapping_entry(&xas, mapping, DAX_PMD); + if (xa_is_internal(entry)) { + result = xa_to_internal(entry); goto fallback; + } /* * It is possible, particularly with mixed reads & writes to private @@ -1670,7 +1497,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * setting up a mapping, so really we're using iomap_begin() as a way * to look up our filesystem block. */ - pos = (loff_t)pgoff << PAGE_SHIFT; + pos = (loff_t)xas.xa_index << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) goto unlock_entry; @@ -1686,7 +1513,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, if (error < 0) goto finish_iomap; - entry = dax_insert_entry(mapping, vmf, entry, pfn, + entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, DAX_PMD, write && !sync); /* @@ -1711,7 +1538,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, entry); + result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); break; default: WARN_ON_ONCE(1); @@ -1734,7 +1561,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff); + dax_unlock_entry(&xas, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); -- cgit v1.2.3 From a28334862993b5c6a8766f6963ee69048403817c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 5 Dec 2017 19:04:20 -0500 Subject: page cache: Finish XArray conversion With no more radix tree API users left, we can drop the GFP flags and use xa_init() instead of INIT_RADIX_TREE(). Signed-off-by: Matthew Wilcox --- fs/inode.c | 2 +- mm/swap_state.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 42f6d25f32a5..9b808986d440 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -349,7 +349,7 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { - INIT_RADIX_TREE(&mapping->i_pages, GFP_ATOMIC | __GFP_ACCOUNT); + xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); diff --git a/mm/swap_state.c b/mm/swap_state.c index f393c994cc60..31c45a25b2d3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -586,7 +586,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); + xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ -- cgit v1.2.3 From 1cf56f9d670b88b2e947a7ccdb8ba32e6477915d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 9 Apr 2018 16:24:45 -0400 Subject: radix tree: Remove radix_tree_update_node_t The only user of this functionality was the workingset code, and it's now been converted to the XArray. Remove __radix_tree_delete_node() entirely as it was also only used by the workingset code. Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 7 +----- lib/idr.c | 2 +- lib/radix-tree.c | 42 +++++++---------------------------- tools/testing/radix-tree/multiorder.c | 2 +- 4 files changed, 11 insertions(+), 42 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 8a4280bc350f..7513d0df984b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -242,17 +242,12 @@ void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); -typedef void (*radix_tree_update_node_t)(struct radix_tree_node *); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, - void __rcu **slot, void *entry, - radix_tree_update_node_t update_node); + void __rcu **slot, void *entry); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); -void __radix_tree_delete_node(struct radix_tree_root *, - struct radix_tree_node *, - radix_tree_update_node_t update_node); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/lib/idr.c b/lib/idr.c index 3c20ad9b0463..cb1db9b8d3f6 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -297,7 +297,7 @@ void *idr_replace(struct idr *idr, void *ptr, unsigned long id) if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); - __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL); + __radix_tree_replace(&idr->idr_rt, node, slot, ptr); return entry; } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 2f9c0e45eeb7..c4c252185734 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -562,8 +562,7 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root, - radix_tree_update_node_t update_node) +static inline bool radix_tree_shrink(struct radix_tree_root *root) { bool shrunk = false; @@ -631,8 +630,6 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, node->count = 0; if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; - if (update_node) - update_node(node); } WARN_ON_ONCE(!list_empty(&node->private_list)); @@ -644,8 +641,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, } static bool delete_node(struct radix_tree_root *root, - struct radix_tree_node *node, - radix_tree_update_node_t update_node) + struct radix_tree_node *node) { bool deleted = false; @@ -655,7 +651,7 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->xa_head)) - deleted |= radix_tree_shrink(root, update_node); + deleted |= radix_tree_shrink(root); return deleted; } @@ -1059,15 +1055,13 @@ static int calculate_count(struct radix_tree_root *root, * @node: pointer to tree node * @slot: pointer to slot in @node * @item: new item to store in the slot. - * @update_node: callback for changing leaf nodes * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void __rcu **slot, void *item, - radix_tree_update_node_t update_node) + void __rcu **slot, void *item) { void *old = rcu_dereference_raw(*slot); int values = !!xa_is_value(item) - !!xa_is_value(old); @@ -1085,10 +1079,7 @@ void __radix_tree_replace(struct radix_tree_root *root, if (!node) return; - if (update_node) - update_node(node); - - delete_node(root, node, update_node); + delete_node(root, node); } /** @@ -1110,7 +1101,7 @@ void __radix_tree_replace(struct radix_tree_root *root, void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { - __radix_tree_replace(root, NULL, slot, item, NULL); + __radix_tree_replace(root, NULL, slot, item); } EXPORT_SYMBOL(radix_tree_replace_slot); @@ -1127,7 +1118,7 @@ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { - __radix_tree_replace(root, iter->node, slot, item, NULL); + __radix_tree_replace(root, iter->node, slot, item); } #ifdef CONFIG_RADIX_TREE_MULTIORDER @@ -1807,23 +1798,6 @@ radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); -/** - * __radix_tree_delete_node - try to free node after clearing a slot - * @root: radix tree root - * @node: node containing @index - * @update_node: callback for changing leaf nodes - * - * After clearing the slot at @index in @node from radix tree - * rooted at @root, call this function to attempt freeing the - * node and shrinking the tree. - */ -void __radix_tree_delete_node(struct radix_tree_root *root, - struct radix_tree_node *node, - radix_tree_update_node_t update_node) -{ - delete_node(root, node, update_node); -} - static bool __radix_tree_delete(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot) { @@ -1839,7 +1813,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, values); - return node && delete_node(root, node, NULL); + return node && delete_node(root, node); } /** diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 60786fa55302..0e0ff26c9bcb 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -618,7 +618,7 @@ static void multiorder_account(void) __radix_tree_insert(&tree, 1 << 5, 5, xa_mk_value(5)); __radix_tree_lookup(&tree, 1 << 5, &node, &slot); assert(node->count == node->nr_values * 2); - __radix_tree_replace(&tree, node, slot, NULL, NULL); + __radix_tree_replace(&tree, node, slot, NULL); assert(node->nr_values == 0); item_kill_tree(&tree); -- cgit v1.2.3 From 2956c6644bfd9aab9f6b21a12e1bd75876d9dd73 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 19 May 2018 16:47:47 -0400 Subject: radix tree: Remove split/join code radix_tree_split and radix_tree_join were never used upstream. Remove them; if they're needed in future they will be replaced by XArray equivalents. Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 6 - lib/radix-tree.c | 171 +---------------------- tools/testing/radix-tree/benchmark.c | 91 ------------- tools/testing/radix-tree/multiorder.c | 247 ---------------------------------- 4 files changed, 2 insertions(+), 513 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7513d0df984b..44f9abdcb5ab 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -284,12 +284,6 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } -int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t); -int radix_tree_split(struct radix_tree_root *, unsigned long index, - unsigned new_order); -int radix_tree_join(struct radix_tree_root *, unsigned long index, - unsigned new_order, void *); - void __rcu **idr_get_free(struct radix_tree_root *root, struct radix_tree_iter *iter, gfp_t gfp, unsigned long max); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index c4c252185734..c52e0bef5264 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -415,28 +415,6 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(radix_tree_maybe_preload); -#ifdef CONFIG_RADIX_TREE_MULTIORDER -/* - * Preload with enough objects to ensure that we can split a single entry - * of order @old_order into many entries of size @new_order - */ -int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, - gfp_t gfp_mask) -{ - unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); - unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - - (new_order / RADIX_TREE_MAP_SHIFT); - unsigned nr = 0; - - WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); - BUG_ON(new_order >= old_order); - - while (layers--) - nr = nr * RADIX_TREE_MAP_SIZE + 1; - return __radix_tree_preload(gfp_mask, top * nr); -} -#endif - /* * The same as function above, but preload number of nodes required to insert * (1 << order) continuous naturally-aligned elements. @@ -1111,8 +1089,8 @@ EXPORT_SYMBOL(radix_tree_replace_slot); * @slot: pointer to slot * @item: new item to store in the slot. * - * For use with radix_tree_split() and radix_tree_for_each_slot(). - * Caller must hold tree write locked across split and replacement. + * For use with radix_tree_for_each_slot(). + * Caller must hold tree write locked. */ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, @@ -1121,151 +1099,6 @@ void radix_tree_iter_replace(struct radix_tree_root *root, __radix_tree_replace(root, iter->node, slot, item); } -#ifdef CONFIG_RADIX_TREE_MULTIORDER -/** - * radix_tree_join - replace multiple entries with one multiorder entry - * @root: radix tree root - * @index: an index inside the new entry - * @order: order of the new entry - * @item: new entry - * - * Call this function to replace several entries with one larger entry. - * The existing entries are presumed to not need freeing as a result of - * this call. - * - * The replacement entry will have all the tags set on it that were set - * on any of the entries it is replacing. - */ -int radix_tree_join(struct radix_tree_root *root, unsigned long index, - unsigned order, void *item) -{ - struct radix_tree_node *node; - void __rcu **slot; - int error; - - BUG_ON(radix_tree_is_internal_node(item)); - - error = __radix_tree_create(root, index, order, &node, &slot); - if (!error) - error = insert_entries(node, slot, item, order, true); - if (error > 0) - error = 0; - - return error; -} - -/** - * radix_tree_split - Split an entry into smaller entries - * @root: radix tree root - * @index: An index within the large entry - * @order: Order of new entries - * - * Call this function as the first step in replacing a multiorder entry - * with several entries of lower order. After this function returns, - * loop over the relevant portion of the tree using radix_tree_for_each_slot() - * and call radix_tree_iter_replace() to set up each new entry. - * - * The tags from this entry are replicated to all the new entries. - * - * The radix tree should be locked against modification during the entire - * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which - * should prompt RCU walkers to restart the lookup from the root. - */ -int radix_tree_split(struct radix_tree_root *root, unsigned long index, - unsigned order) -{ - struct radix_tree_node *parent, *node, *child; - void __rcu **slot; - unsigned int offset, end; - unsigned n, tag, tags = 0; - gfp_t gfp = root_gfp_mask(root); - - if (!__radix_tree_lookup(root, index, &parent, &slot)) - return -ENOENT; - if (!parent) - return -ENOENT; - - offset = get_slot_offset(parent, slot); - - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tag_get(parent, tag, offset)) - tags |= 1 << tag; - - for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { - if (!xa_is_sibling(rcu_dereference_raw(parent->slots[end]))) - break; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(parent, tag, end); - /* rcu_assign_pointer ensures tags are set before RETRY */ - rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); - } - rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); - parent->nr_values -= (end - offset); - - if (order == parent->shift) - return 0; - if (order > parent->shift) { - while (offset < end) - offset += insert_entries(parent, &parent->slots[offset], - RADIX_TREE_RETRY, order, true); - return 0; - } - - node = parent; - - for (;;) { - if (node->shift > order) { - child = radix_tree_node_alloc(gfp, node, root, - node->shift - RADIX_TREE_MAP_SHIFT, - offset, 0, 0); - if (!child) - goto nomem; - if (node != parent) { - node->count++; - rcu_assign_pointer(node->slots[offset], - node_to_entry(child)); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(node, tag, offset); - } - - node = child; - offset = 0; - continue; - } - - n = insert_entries(node, &node->slots[offset], - RADIX_TREE_RETRY, order, false); - BUG_ON(n > RADIX_TREE_MAP_SIZE); - - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(node, tag, offset); - offset += n; - - while (offset == RADIX_TREE_MAP_SIZE) { - if (node == parent) - break; - offset = node->offset; - child = node; - node = node->parent; - rcu_assign_pointer(node->slots[offset], - node_to_entry(child)); - offset++; - } - if ((node == parent) && (offset == end)) - return 0; - } - - nomem: - /* Shouldn't happen; did user forget to preload? */ - /* TODO: free all the allocated nodes */ - WARN_ON(1); - return -ENOMEM; -} -#endif - static void node_tag_set(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c index 99c40f3ed133..35741b9c2a4a 100644 --- a/tools/testing/radix-tree/benchmark.c +++ b/tools/testing/radix-tree/benchmark.c @@ -146,90 +146,6 @@ static void benchmark_size(unsigned long size, unsigned long step, int order) rcu_barrier(); } -static long long __benchmark_split(unsigned long index, - int old_order, int new_order) -{ - struct timespec start, finish; - long long nsec; - RADIX_TREE(tree, GFP_ATOMIC); - - item_insert_order(&tree, index, old_order); - - clock_gettime(CLOCK_MONOTONIC, &start); - radix_tree_split(&tree, index, new_order); - clock_gettime(CLOCK_MONOTONIC, &finish); - nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + - (finish.tv_nsec - start.tv_nsec); - - item_kill_tree(&tree); - - return nsec; - -} - -static void benchmark_split(unsigned long size, unsigned long step) -{ - int i, j, idx; - long long nsec = 0; - - - for (idx = 0; idx < size; idx += step) { - for (i = 3; i < 11; i++) { - for (j = 0; j < i; j++) { - nsec += __benchmark_split(idx, i, j); - } - } - } - - printv(2, "Size %8ld, step %8ld, split time %10lld ns\n", - size, step, nsec); - -} - -static long long __benchmark_join(unsigned long index, - unsigned order1, unsigned order2) -{ - unsigned long loc; - struct timespec start, finish; - long long nsec; - void *item, *item2 = item_create(index + 1, order1); - RADIX_TREE(tree, GFP_KERNEL); - - item_insert_order(&tree, index, order2); - item = radix_tree_lookup(&tree, index); - - clock_gettime(CLOCK_MONOTONIC, &start); - radix_tree_join(&tree, index + 1, order1, item2); - clock_gettime(CLOCK_MONOTONIC, &finish); - nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + - (finish.tv_nsec - start.tv_nsec); - - loc = find_item(&tree, item); - if (loc == -1) - free(item); - - item_kill_tree(&tree); - - return nsec; -} - -static void benchmark_join(unsigned long step) -{ - int i, j, idx; - long long nsec = 0; - - for (idx = 0; idx < 1 << 10; idx += step) { - for (i = 1; i < 15; i++) { - for (j = 0; j < i; j++) { - nsec += __benchmark_join(idx, i, j); - } - } - } - - printv(2, "Size %8d, step %8ld, join time %10lld ns\n", - 1 << 10, step, nsec); -} - void benchmark(void) { unsigned long size[] = {1 << 10, 1 << 20, 0}; @@ -247,11 +163,4 @@ void benchmark(void) for (c = 0; size[c]; c++) for (s = 0; step[s]; s++) benchmark_size(size[c], step[s] << 9, 9); - - for (c = 0; size[c]; c++) - for (s = 0; step[s]; s++) - benchmark_split(size[c], step[s]); - - for (s = 0; step[s]; s++) - benchmark_join(step[s]); } diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 0e0ff26c9bcb..221e042d1b89 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -356,251 +356,6 @@ void multiorder_tagged_iteration(void) item_kill_tree(&tree); } -/* - * Basic join checks: make sure we can't find an entry in the tree after - * a larger entry has replaced it - */ -static void multiorder_join1(unsigned long index, - unsigned order1, unsigned order2) -{ - unsigned long loc; - void *item, *item2 = item_create(index + 1, order1); - RADIX_TREE(tree, GFP_KERNEL); - - item_insert_order(&tree, index, order2); - item = radix_tree_lookup(&tree, index); - radix_tree_join(&tree, index + 1, order1, item2); - loc = find_item(&tree, item); - if (loc == -1) - free(item); - item = radix_tree_lookup(&tree, index + 1); - assert(item == item2); - item_kill_tree(&tree); -} - -/* - * Check that the accounting of value entries is handled correctly - * by joining a value entry to a normal pointer. - */ -static void multiorder_join2(unsigned order1, unsigned order2) -{ - RADIX_TREE(tree, GFP_KERNEL); - struct radix_tree_node *node; - void *item1 = item_create(0, order1); - void *item2; - - item_insert_order(&tree, 0, order2); - radix_tree_insert(&tree, 1 << order2, xa_mk_value(5)); - item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); - assert(item2 == xa_mk_value(5)); - assert(node->nr_values == 1); - - item2 = radix_tree_lookup(&tree, 0); - free(item2); - - radix_tree_join(&tree, 0, order1, item1); - item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); - assert(item2 == item1); - assert(node->nr_values == 0); - item_kill_tree(&tree); -} - -/* - * This test revealed an accounting bug for value entries at one point. - * Nodes were being freed back into the pool with an elevated exception count - * by radix_tree_join() and then radix_tree_split() was failing to zero the - * count of value entries. - */ -static void multiorder_join3(unsigned int order) -{ - RADIX_TREE(tree, GFP_KERNEL); - struct radix_tree_node *node; - void **slot; - struct radix_tree_iter iter; - unsigned long i; - - for (i = 0; i < (1 << order); i++) { - radix_tree_insert(&tree, i, xa_mk_value(5)); - } - - radix_tree_join(&tree, 0, order, xa_mk_value(7)); - rcu_barrier(); - - radix_tree_split(&tree, 0, 0); - - radix_tree_for_each_slot(slot, &tree, &iter, 0) { - radix_tree_iter_replace(&tree, &iter, slot, xa_mk_value(5)); - } - - __radix_tree_lookup(&tree, 0, &node, NULL); - assert(node->nr_values == node->count); - - item_kill_tree(&tree); -} - -static void multiorder_join(void) -{ - int i, j, idx; - - for (idx = 0; idx < 1024; idx = idx * 2 + 3) { - for (i = 1; i < 15; i++) { - for (j = 0; j < i; j++) { - multiorder_join1(idx, i, j); - } - } - } - - for (i = 1; i < 15; i++) { - for (j = 0; j < i; j++) { - multiorder_join2(i, j); - } - } - - for (i = 3; i < 10; i++) { - multiorder_join3(i); - } -} - -static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc) -{ - struct radix_tree_preload *rtp = &radix_tree_preloads; - if (rtp->nr != 0) - printv(2, "split(%u %u) remaining %u\n", old_order, new_order, - rtp->nr); - /* - * Can't check for equality here as some nodes may have been - * RCU-freed while we ran. But we should never finish with more - * nodes allocated since they should have all been preloaded. - */ - if (nr_allocated > alloc) - printv(2, "split(%u %u) allocated %u %u\n", old_order, new_order, - alloc, nr_allocated); -} - -static void __multiorder_split(int old_order, int new_order) -{ - RADIX_TREE(tree, GFP_ATOMIC); - void **slot; - struct radix_tree_iter iter; - unsigned alloc; - struct item *item; - - radix_tree_preload(GFP_KERNEL); - assert(item_insert_order(&tree, 0, old_order) == 0); - radix_tree_preload_end(); - - /* Wipe out the preloaded cache or it'll confuse check_mem() */ - radix_tree_cpu_dead(0); - - item = radix_tree_tag_set(&tree, 0, 2); - - radix_tree_split_preload(old_order, new_order, GFP_KERNEL); - alloc = nr_allocated; - radix_tree_split(&tree, 0, new_order); - check_mem(old_order, new_order, alloc); - radix_tree_for_each_slot(slot, &tree, &iter, 0) { - radix_tree_iter_replace(&tree, &iter, slot, - item_create(iter.index, new_order)); - } - radix_tree_preload_end(); - - item_kill_tree(&tree); - free(item); -} - -static void __multiorder_split2(int old_order, int new_order) -{ - RADIX_TREE(tree, GFP_KERNEL); - void **slot; - struct radix_tree_iter iter; - struct radix_tree_node *node; - void *item; - - __radix_tree_insert(&tree, 0, old_order, xa_mk_value(5)); - - item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == xa_mk_value(5)); - assert(node->nr_values > 0); - - radix_tree_split(&tree, 0, new_order); - radix_tree_for_each_slot(slot, &tree, &iter, 0) { - radix_tree_iter_replace(&tree, &iter, slot, - item_create(iter.index, new_order)); - } - - item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item != xa_mk_value(5)); - assert(node->nr_values == 0); - - item_kill_tree(&tree); -} - -static void __multiorder_split3(int old_order, int new_order) -{ - RADIX_TREE(tree, GFP_KERNEL); - void **slot; - struct radix_tree_iter iter; - struct radix_tree_node *node; - void *item; - - __radix_tree_insert(&tree, 0, old_order, xa_mk_value(5)); - - item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == xa_mk_value(5)); - assert(node->nr_values > 0); - - radix_tree_split(&tree, 0, new_order); - radix_tree_for_each_slot(slot, &tree, &iter, 0) { - radix_tree_iter_replace(&tree, &iter, slot, xa_mk_value(7)); - } - - item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == xa_mk_value(7)); - assert(node->nr_values > 0); - - item_kill_tree(&tree); - - __radix_tree_insert(&tree, 0, old_order, xa_mk_value(5)); - - item = __radix_tree_lookup(&tree, 0, &node, NULL); - assert(item == xa_mk_value(5)); - assert(node->nr_values > 0); - - radix_tree_split(&tree, 0, new_order); - radix_tree_for_each_slot(slot, &tree, &iter, 0) { - if (iter.index == (1 << new_order)) - radix_tree_iter_replace(&tree, &iter, slot, - xa_mk_value(7)); - else - radix_tree_iter_replace(&tree, &iter, slot, NULL); - } - - item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL); - assert(item == xa_mk_value(7)); - assert(node->count == node->nr_values); - do { - node = node->parent; - if (!node) - break; - assert(node->count == 1); - assert(node->nr_values == 0); - } while (1); - - item_kill_tree(&tree); -} - -static void multiorder_split(void) -{ - int i, j; - - for (i = 3; i < 11; i++) - for (j = 0; j < i; j++) { - __multiorder_split(i, j); - __multiorder_split2(i, j); - __multiorder_split3(i, j); - } -} - static void multiorder_account(void) { RADIX_TREE(tree, GFP_KERNEL); @@ -702,8 +457,6 @@ void multiorder_checks(void) multiorder_tag_tests(); multiorder_iteration(); multiorder_tagged_iteration(); - multiorder_join(); - multiorder_split(); multiorder_account(); multiorder_iteration_race(); -- cgit v1.2.3 From 8cf2f98411e3a0865026a1061af637161b16d32b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 19 May 2018 16:47:47 -0400 Subject: radix tree: Remove radix_tree_maybe_preload_order This function was only used by the page cache which is now converted to the XArray. Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 1 - lib/radix-tree.c | 74 ---------------------------------------------- 2 files changed, 75 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 44f9abdcb5ab..664d50c5c2ff 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -259,7 +259,6 @@ unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); int radix_tree_maybe_preload(gfp_t gfp_mask); -int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *, unsigned long index, unsigned int tag); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index c52e0bef5264..b57ddc3dbbbd 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -41,9 +41,6 @@ #include -/* Number of nodes in fully populated tree of given height */ -static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; - /* * Radix tree node cache. */ @@ -415,51 +412,6 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(radix_tree_maybe_preload); -/* - * The same as function above, but preload number of nodes required to insert - * (1 << order) continuous naturally-aligned elements. - */ -int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) -{ - unsigned long nr_subtrees; - int nr_nodes, subtree_height; - - /* Preloading doesn't help anything with this gfp mask, skip it */ - if (!gfpflags_allow_blocking(gfp_mask)) { - preempt_disable(); - return 0; - } - - /* - * Calculate number and height of fully populated subtrees it takes to - * store (1 << order) elements. - */ - nr_subtrees = 1 << order; - for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; - subtree_height++) - nr_subtrees >>= RADIX_TREE_MAP_SHIFT; - - /* - * The worst case is zero height tree with a single item at index 0 and - * then inserting items starting at ULONG_MAX - (1 << order). - * - * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to - * 0-index item. - */ - nr_nodes = RADIX_TREE_MAX_PATH; - - /* Plus branch to fully populated subtrees. */ - nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; - - /* Root node is shared. */ - nr_nodes--; - - /* Plus nodes required to build subtrees. */ - nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; - - return __radix_tree_preload(gfp_mask, nr_nodes); -} - static unsigned radix_tree_load_root(const struct radix_tree_root *root, struct radix_tree_node **nodep, unsigned long *maxindex) { @@ -1859,31 +1811,6 @@ radix_tree_node_ctor(void *arg) INIT_LIST_HEAD(&node->private_list); } -static __init unsigned long __maxindex(unsigned int height) -{ - unsigned int width = height * RADIX_TREE_MAP_SHIFT; - int shift = RADIX_TREE_INDEX_BITS - width; - - if (shift < 0) - return ~0UL; - if (shift >= BITS_PER_LONG) - return 0UL; - return ~0UL >> shift; -} - -static __init void radix_tree_init_maxnodes(void) -{ - unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; - unsigned int i, j; - - for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) - height_to_maxindex[i] = __maxindex(i); - for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { - for (j = i; j > 0; j--) - height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; - } -} - static int radix_tree_cpu_dead(unsigned int cpu) { struct radix_tree_preload *rtp; @@ -1911,7 +1838,6 @@ void __init radix_tree_init(void) sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, radix_tree_node_ctor); - radix_tree_init_maxnodes(); ret = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead", NULL, radix_tree_cpu_dead); WARN_ON(ret < 0); -- cgit v1.2.3 From adb9d9c4ccb1ff0bf1d376e65f36aa5573c75c1a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 9 Apr 2018 16:52:21 -0400 Subject: radix tree: Remove radix_tree_clear_tags The page cache was the only user of this interface and it has now been converted to the XArray. Transform the test into a test of xas_init_marks(). Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 2 -- lib/radix-tree.c | 13 ------------ lib/test_xarray.c | 40 ++++++++++++++++++++++++++++++++++++ tools/testing/radix-tree/tag_check.c | 29 -------------------------- 4 files changed, 40 insertions(+), 44 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 664d50c5c2ff..7b009af3bb1e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -252,8 +252,6 @@ void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); -void radix_tree_clear_tags(struct radix_tree_root *, struct radix_tree_node *, - void __rcu **slot); unsigned int radix_tree_gang_lookup(const struct radix_tree_root *, void **results, unsigned long first_index, unsigned int max_items); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index b57ddc3dbbbd..e92f2b6ae9c5 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1670,19 +1670,6 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); -void radix_tree_clear_tags(struct radix_tree_root *root, - struct radix_tree_node *node, - void __rcu **slot) -{ - if (node) { - unsigned int tag, offset = get_slot_offset(node, slot); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - node_tag_clear(root, node, tag, offset); - } else { - root_tag_clear_all(root); - } -} - /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 815daffdd8c9..ca86141641cb 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -213,12 +213,52 @@ static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) XA_BUG_ON(xa, !xa_empty(xa)); } +static noinline void check_xa_mark_2(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + unsigned long index; + unsigned int count = 0; + void *entry; + + xa_store_index(xa, 0, GFP_KERNEL); + xa_set_mark(xa, 0, XA_MARK_0); + xas_lock(&xas); + xas_load(&xas); + xas_init_marks(&xas); + xas_unlock(&xas); + XA_BUG_ON(xa, !xa_get_mark(xa, 0, XA_MARK_0) == 0); + + for (index = 3500; index < 4500; index++) { + xa_store_index(xa, index, GFP_KERNEL); + xa_set_mark(xa, index, XA_MARK_0); + } + + xas_reset(&xas); + rcu_read_lock(); + xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0) + count++; + rcu_read_unlock(); + XA_BUG_ON(xa, count != 1000); + + xas_lock(&xas); + xas_for_each(&xas, entry, ULONG_MAX) { + xas_init_marks(&xas); + XA_BUG_ON(xa, !xa_get_mark(xa, xas.xa_index, XA_MARK_0)); + XA_BUG_ON(xa, !xas_get_mark(&xas, XA_MARK_0)); + } + xas_unlock(&xas); + + xa_destroy(xa); +} + static noinline void check_xa_mark(struct xarray *xa) { unsigned long index; for (index = 0; index < 16384; index += 4) check_xa_mark_1(xa, index); + + check_xa_mark_2(xa); } static noinline void check_xa_shrink(struct xarray *xa) diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c index 543181e4847b..56a42f1c5ab0 100644 --- a/tools/testing/radix-tree/tag_check.c +++ b/tools/testing/radix-tree/tag_check.c @@ -331,34 +331,6 @@ static void single_check(void) item_kill_tree(&tree); } -void radix_tree_clear_tags_test(void) -{ - unsigned long index; - struct radix_tree_node *node; - struct radix_tree_iter iter; - void **slot; - - RADIX_TREE(tree, GFP_KERNEL); - - item_insert(&tree, 0); - item_tag_set(&tree, 0, 0); - __radix_tree_lookup(&tree, 0, &node, &slot); - radix_tree_clear_tags(&tree, node, slot); - assert(item_tag_get(&tree, 0, 0) == 0); - - for (index = 0; index < 1000; index++) { - item_insert(&tree, index); - item_tag_set(&tree, index, 0); - } - - radix_tree_for_each_slot(slot, &tree, &iter, 0) { - radix_tree_clear_tags(&tree, iter.node, slot); - assert(item_tag_get(&tree, iter.index, 0) == 0); - } - - item_kill_tree(&tree); -} - void tag_check(void) { single_check(); @@ -376,5 +348,4 @@ void tag_check(void) thrash_tags(); rcu_barrier(); printv(2, "after thrash_tags: %d allocated\n", nr_allocated); - radix_tree_clear_tags_test(); } -- cgit v1.2.3 From 372266ba0267803564824b1c09f1bb7f3f3fc761 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 18 Aug 2018 07:09:22 -0400 Subject: radix tree test suite: Convert tag_tagged_items to XArray The tag_tagged_items() function is supposed to test the page-writeback tagging code. Since that has been converted to the XArray, there's not much point in testing the radix tree's tagging code. This requires using the pthread mutex embedded in the xarray instead of an external lock, so remove the pthread mutexes which protect xarrays/radix trees. Also remove radix_tree_iter_tag_set() as this was the last user. Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 2 -- lib/radix-tree.c | 12 ---------- tools/testing/radix-tree/iteration_check.c | 16 ++++++------- tools/testing/radix-tree/main.c | 4 ++-- tools/testing/radix-tree/multiorder.c | 12 +++++----- tools/testing/radix-tree/regression1.c | 17 +++++++------- tools/testing/radix-tree/regression2.c | 8 +++---- tools/testing/radix-tree/tag_check.c | 4 ++-- tools/testing/radix-tree/test.c | 37 ++++++++++++------------------ tools/testing/radix-tree/test.h | 5 ++-- 10 files changed, 46 insertions(+), 71 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7b009af3bb1e..9a1460488163 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -264,8 +264,6 @@ void *radix_tree_tag_clear(struct radix_tree_root *, unsigned long index, unsigned int tag); int radix_tree_tag_get(const struct radix_tree_root *, unsigned long index, unsigned int tag); -void radix_tree_iter_tag_set(struct radix_tree_root *, - const struct radix_tree_iter *iter, unsigned int tag); void radix_tree_iter_tag_clear(struct radix_tree_root *, const struct radix_tree_iter *iter, unsigned int tag); unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e92f2b6ae9c5..f107dd2698e3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1108,18 +1108,6 @@ void *radix_tree_tag_set(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_set); -/** - * radix_tree_iter_tag_set - set a tag on the current iterator entry - * @root: radix tree root - * @iter: iterator state - * @tag: tag to set - */ -void radix_tree_iter_tag_set(struct radix_tree_root *root, - const struct radix_tree_iter *iter, unsigned int tag) -{ - node_tag_set(root, iter->node, tag, iter_offset(iter)); -} - static void node_tag_clear(struct radix_tree_root *root, struct radix_tree_node *node, unsigned int tag, unsigned int offset) diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c index a92bab513701..d047327bb9ef 100644 --- a/tools/testing/radix-tree/iteration_check.c +++ b/tools/testing/radix-tree/iteration_check.c @@ -18,10 +18,9 @@ #define NUM_THREADS 5 #define MAX_IDX 100 -#define TAG 0 -#define NEW_TAG 1 +#define TAG XA_MARK_0 +#define NEW_TAG XA_MARK_1 -static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_t threads[NUM_THREADS]; static unsigned int seeds[3]; static RADIX_TREE(tree, GFP_KERNEL); @@ -38,7 +37,7 @@ static void *add_entries_fn(void *arg) int order; for (pgoff = 0; pgoff < MAX_IDX; pgoff++) { - pthread_mutex_lock(&tree_lock); + xa_lock(&tree); for (order = max_order; order >= 0; order--) { if (item_insert_order(&tree, pgoff, order) == 0) { @@ -46,7 +45,7 @@ static void *add_entries_fn(void *arg) break; } } - pthread_mutex_unlock(&tree_lock); + xa_unlock(&tree); } } @@ -150,9 +149,9 @@ static void *remove_entries_fn(void *arg) pgoff = rand_r(&seeds[2]) % MAX_IDX; - pthread_mutex_lock(&tree_lock); + xa_lock(&tree); item_delete(&tree, pgoff); - pthread_mutex_unlock(&tree_lock); + xa_unlock(&tree); } rcu_unregister_thread(); @@ -165,8 +164,7 @@ static void *tag_entries_fn(void *arg) rcu_register_thread(); while (!test_complete) { - tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG, - NEW_TAG); + tag_tagged_items(&tree, 0, MAX_IDX, 10, TAG, NEW_TAG); } rcu_unregister_thread(); return NULL; diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index 79589ea570ab..77a44c54998f 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -214,7 +214,7 @@ void copy_tag_check(void) } // printf("\ncopying tags...\n"); - tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1); + tagged = tag_tagged_items(&tree, start, end, ITEMS, XA_MARK_0, XA_MARK_1); // printf("checking copied tags\n"); assert(tagged == count); @@ -223,7 +223,7 @@ void copy_tag_check(void) /* Copy tags in several rounds */ // printf("\ncopying tags...\n"); tmp = rand() % (count / 10 + 2); - tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2); + tagged = tag_tagged_items(&tree, start, end, tmp, XA_MARK_0, XA_MARK_2); assert(tagged == count); // printf("%lu %lu %lu\n", tagged, tmp, count); diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 221e042d1b89..0436554a099a 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -59,7 +59,7 @@ static void __multiorder_tag_test(int index, int order) assert(!radix_tree_tag_get(&tree, i, 1)); } - assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1); + assert(tag_tagged_items(&tree, 0, ~0UL, 10, XA_MARK_0, XA_MARK_1) == 1); assert(radix_tree_tag_clear(&tree, index, 0)); for_each_index(i, base, order) { @@ -87,7 +87,7 @@ static void __multiorder_tag_test2(unsigned order, unsigned long index2) assert(radix_tree_tag_set(&tree, 0, 0)); assert(radix_tree_tag_set(&tree, index2, 0)); - assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2); + assert(tag_tagged_items(&tree, 0, ~0UL, 10, XA_MARK_0, XA_MARK_1) == 2); item_kill_tree(&tree); } @@ -318,8 +318,8 @@ void multiorder_tagged_iteration(void) } } - assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) == - TAG_ENTRIES); + assert(tag_tagged_items(&tree, 0, ~0UL, TAG_ENTRIES, XA_MARK_1, + XA_MARK_2) == TAG_ENTRIES); for (j = 0; j < 256; j++) { int mask, k; @@ -345,8 +345,8 @@ void multiorder_tagged_iteration(void) } } - assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0) - == TAG_ENTRIES); + assert(tag_tagged_items(&tree, 1, ~0UL, MT_NUM_ENTRIES * 2, XA_MARK_1, + XA_MARK_0) == TAG_ENTRIES); i = 0; radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { assert(iter.index == tag_index[i]); diff --git a/tools/testing/radix-tree/regression1.c b/tools/testing/radix-tree/regression1.c index b4a4a7168986..a61c7bcbc72d 100644 --- a/tools/testing/radix-tree/regression1.c +++ b/tools/testing/radix-tree/regression1.c @@ -44,7 +44,6 @@ #include "regression.h" static RADIX_TREE(mt_tree, GFP_KERNEL); -static pthread_mutex_t mt_lock = PTHREAD_MUTEX_INITIALIZER; struct page { pthread_mutex_t lock; @@ -126,29 +125,29 @@ static void *regression1_fn(void *arg) struct page *p; p = page_alloc(0); - pthread_mutex_lock(&mt_lock); + xa_lock(&mt_tree); radix_tree_insert(&mt_tree, 0, p); - pthread_mutex_unlock(&mt_lock); + xa_unlock(&mt_tree); p = page_alloc(1); - pthread_mutex_lock(&mt_lock); + xa_lock(&mt_tree); radix_tree_insert(&mt_tree, 1, p); - pthread_mutex_unlock(&mt_lock); + xa_unlock(&mt_tree); - pthread_mutex_lock(&mt_lock); + xa_lock(&mt_tree); p = radix_tree_delete(&mt_tree, 1); pthread_mutex_lock(&p->lock); p->count--; pthread_mutex_unlock(&p->lock); - pthread_mutex_unlock(&mt_lock); + xa_unlock(&mt_tree); page_free(p); - pthread_mutex_lock(&mt_lock); + xa_lock(&mt_tree); p = radix_tree_delete(&mt_tree, 0); pthread_mutex_lock(&p->lock); p->count--; pthread_mutex_unlock(&p->lock); - pthread_mutex_unlock(&mt_lock); + xa_unlock(&mt_tree); page_free(p); } } else { diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 424b91c77831..f2c7e640a919 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c @@ -53,9 +53,9 @@ #include "regression.h" #include "test.h" -#define PAGECACHE_TAG_DIRTY 0 -#define PAGECACHE_TAG_WRITEBACK 1 -#define PAGECACHE_TAG_TOWRITE 2 +#define PAGECACHE_TAG_DIRTY XA_MARK_0 +#define PAGECACHE_TAG_WRITEBACK XA_MARK_1 +#define PAGECACHE_TAG_TOWRITE XA_MARK_2 static RADIX_TREE(mt_tree, GFP_KERNEL); unsigned long page_count = 0; @@ -92,7 +92,7 @@ void regression2_test(void) /* 1. */ start = 0; end = max_slots - 2; - tag_tagged_items(&mt_tree, NULL, start, end, 1, + tag_tagged_items(&mt_tree, start, end, 1, PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); /* 2. */ diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c index 56a42f1c5ab0..f898957b1a19 100644 --- a/tools/testing/radix-tree/tag_check.c +++ b/tools/testing/radix-tree/tag_check.c @@ -24,7 +24,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag) item_tag_set(tree, index, tag); ret = item_tag_get(tree, index, tag); assert(ret != 0); - ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag); + ret = tag_tagged_items(tree, first, ~0UL, 10, tag, !tag); assert(ret == 1); ret = item_tag_get(tree, index, !tag); assert(ret != 0); @@ -321,7 +321,7 @@ static void single_check(void) assert(ret == 0); verify_tag_consistency(&tree, 0); verify_tag_consistency(&tree, 1); - ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1); + ret = tag_tagged_items(&tree, first, 10, 10, XA_MARK_0, XA_MARK_1); assert(ret == 1); ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); assert(ret == 1); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 470419bfd49d..d70adcd03d35 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -176,35 +176,28 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, } /* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */ -int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock, - unsigned long start, unsigned long end, unsigned batch, - unsigned iftag, unsigned thentag) +int tag_tagged_items(struct xarray *xa, unsigned long start, unsigned long end, + unsigned batch, xa_mark_t iftag, xa_mark_t thentag) { - unsigned long tagged = 0; - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, xa, start); + unsigned int tagged = 0; + struct item *item; if (batch == 0) batch = 1; - if (lock) - pthread_mutex_lock(lock); - radix_tree_for_each_tagged(slot, root, &iter, start, iftag) { - if (iter.index > end) - break; - radix_tree_iter_tag_set(root, &iter, thentag); - tagged++; - if ((tagged % batch) != 0) + xas_lock_irq(&xas); + xas_for_each_marked(&xas, item, end, iftag) { + xas_set_mark(&xas, thentag); + if (++tagged % batch) continue; - slot = radix_tree_iter_resume(slot, &iter); - if (lock) { - pthread_mutex_unlock(lock); - rcu_barrier(); - pthread_mutex_lock(lock); - } + + xas_pause(&xas); + xas_unlock_irq(&xas); + rcu_barrier(); + xas_lock_irq(&xas); } - if (lock) - pthread_mutex_unlock(lock); + xas_unlock_irq(&xas); return tagged; } diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 9532c18c6cb1..100e9a456f91 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -29,9 +29,8 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, unsigned long nr, int chunk); void item_kill_tree(struct radix_tree_root *root); -int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, - unsigned long start, unsigned long end, unsigned batch, - unsigned iftag, unsigned thentag); +int tag_tagged_items(struct xarray *, unsigned long start, unsigned long end, + unsigned batch, xa_mark_t iftag, xa_mark_t thentag); void xarray_tests(void); void tag_check(void); -- cgit v1.2.3 From 47e0fab2b15155e33fdff777c791bebfd5855bbc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 20 Aug 2018 15:48:46 -0400 Subject: radix tree test suite: Convert iteration test to XArray With no code left in the kernel using the multiorder radix tree, convert the iteration test from the radix tree to the XArray. It's unlikely to suffer the same bug as the radix tree, but this test will prevent that bug from ever creeping into the XArray implementation. Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/iteration_check.c | 103 ++++++++++++++--------------- tools/testing/radix-tree/test.c | 17 +++-- tools/testing/radix-tree/test.h | 1 + 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c index d047327bb9ef..238db187aa15 100644 --- a/tools/testing/radix-tree/iteration_check.c +++ b/tools/testing/radix-tree/iteration_check.c @@ -1,5 +1,5 @@ /* - * iteration_check.c: test races having to do with radix tree iteration + * iteration_check.c: test races having to do with xarray iteration * Copyright (c) 2016 Intel Corporation * Author: Ross Zwisler * @@ -12,7 +12,6 @@ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. */ -#include #include #include "test.h" @@ -23,29 +22,44 @@ static pthread_t threads[NUM_THREADS]; static unsigned int seeds[3]; -static RADIX_TREE(tree, GFP_KERNEL); +static DEFINE_XARRAY(array); static bool test_complete; static int max_order; -/* relentlessly fill the tree with tagged entries */ +void my_item_insert(struct xarray *xa, unsigned long index) +{ + XA_STATE(xas, xa, index); + struct item *item = item_create(index, 0); + int order; + +retry: + xas_lock(&xas); + for (order = max_order; order >= 0; order--) { + xas_set_order(&xas, index, order); + item->order = order; + if (xas_find_conflict(&xas)) + continue; + xas_store(&xas, item); + xas_set_mark(&xas, TAG); + break; + } + xas_unlock(&xas); + if (xas_nomem(&xas, GFP_KERNEL)) + goto retry; + if (order < 0) + free(item); +} + +/* relentlessly fill the array with tagged entries */ static void *add_entries_fn(void *arg) { rcu_register_thread(); while (!test_complete) { unsigned long pgoff; - int order; for (pgoff = 0; pgoff < MAX_IDX; pgoff++) { - xa_lock(&tree); - for (order = max_order; order >= 0; order--) { - if (item_insert_order(&tree, pgoff, order) - == 0) { - item_tag_set(&tree, pgoff, TAG); - break; - } - } - xa_unlock(&tree); + my_item_insert(&array, pgoff); } } @@ -55,33 +69,25 @@ static void *add_entries_fn(void *arg) } /* - * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find - * things that have been removed and randomly resetting our iteration to the - * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and - * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a - * NULL 'slot' variable. + * Iterate over tagged entries, retrying when we find ourselves in a deleted + * node and randomly pausing the iteration. */ static void *tagged_iteration_fn(void *arg) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &array, 0); + void *entry; rcu_register_thread(); while (!test_complete) { + xas_set(&xas, 0); rcu_read_lock(); - radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { - void *entry = radix_tree_deref_slot(slot); - if (unlikely(!entry)) + xas_for_each_marked(&xas, entry, ULONG_MAX, TAG) { + if (xas_retry(&xas, entry)) continue; - if (radix_tree_deref_retry(entry)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - if (rand_r(&seeds[0]) % 50 == 0) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); rcu_read_unlock(); rcu_barrier(); rcu_read_lock(); @@ -96,33 +102,25 @@ static void *tagged_iteration_fn(void *arg) } /* - * Iterate over the entries, doing a radix_tree_iter_retry() as we find things - * that have been removed and randomly resetting our iteration to the next - * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and - * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a - * NULL 'slot' variable. + * Iterate over the entries, retrying when we find ourselves in a deleted + * node and randomly pausing the iteration. */ static void *untagged_iteration_fn(void *arg) { - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, &array, 0); + void *entry; rcu_register_thread(); while (!test_complete) { + xas_set(&xas, 0); rcu_read_lock(); - radix_tree_for_each_slot(slot, &tree, &iter, 0) { - void *entry = radix_tree_deref_slot(slot); - if (unlikely(!entry)) + xas_for_each(&xas, entry, ULONG_MAX) { + if (xas_retry(&xas, entry)) continue; - if (radix_tree_deref_retry(entry)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - if (rand_r(&seeds[1]) % 50 == 0) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); rcu_read_unlock(); rcu_barrier(); rcu_read_lock(); @@ -137,7 +135,7 @@ static void *untagged_iteration_fn(void *arg) } /* - * Randomly remove entries to help induce radix_tree_iter_retry() calls in the + * Randomly remove entries to help induce retries in the * two iteration functions. */ static void *remove_entries_fn(void *arg) @@ -146,12 +144,13 @@ static void *remove_entries_fn(void *arg) while (!test_complete) { int pgoff; + struct item *item; pgoff = rand_r(&seeds[2]) % MAX_IDX; - xa_lock(&tree); - item_delete(&tree, pgoff); - xa_unlock(&tree); + item = xa_erase(&array, pgoff); + if (item) + item_free(item, pgoff); } rcu_unregister_thread(); @@ -164,7 +163,7 @@ static void *tag_entries_fn(void *arg) rcu_register_thread(); while (!test_complete) { - tag_tagged_items(&tree, 0, MAX_IDX, 10, TAG, NEW_TAG); + tag_tagged_items(&array, 0, MAX_IDX, 10, TAG, NEW_TAG); } rcu_unregister_thread(); return NULL; @@ -215,5 +214,5 @@ void iteration_test(unsigned order, unsigned test_duration) } } - item_kill_tree(&tree); + item_kill_tree(&array); } diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index d70adcd03d35..32973dd51ec5 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -63,16 +63,21 @@ void item_sanity(struct item *item, unsigned long index) assert((item->index | mask) == (index | mask)); } +void item_free(struct item *item, unsigned long index) +{ + item_sanity(item, index); + free(item); +} + int item_delete(struct radix_tree_root *root, unsigned long index) { struct item *item = radix_tree_delete(root, index); - if (item) { - item_sanity(item, index); - free(item); - return 1; - } - return 0; + if (!item) + return 0; + + item_free(item, index); + return 1; } static void item_free_rcu(struct rcu_head *head) diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 100e9a456f91..a2f53c889a31 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -14,6 +14,7 @@ struct item *item_create(unsigned long index, unsigned int order); int __item_insert(struct radix_tree_root *root, struct item *item); int item_insert(struct radix_tree_root *root, unsigned long index); void item_sanity(struct item *item, unsigned long index); +void item_free(struct item *item, unsigned long index); int item_insert_order(struct radix_tree_root *root, unsigned long index, unsigned order); int item_delete(struct radix_tree_root *root, unsigned long index); -- cgit v1.2.3 From d6427f8179b5dd65eb468c61fc8cc24657c336c9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 28 Aug 2018 16:13:16 -0400 Subject: xarray: Move multiorder account test in-kernel Move this test to the in-kernel test suite, and enhance it to test several different orders. Signed-off-by: Matthew Wilcox --- lib/test_xarray.c | 32 ++++++++++++++++++++++++++++++++ tools/testing/radix-tree/multiorder.c | 24 ------------------------ 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index ca86141641cb..38cab4ccb24e 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1019,6 +1019,37 @@ static noinline void check_workingset(struct xarray *xa, unsigned long index) XA_BUG_ON(xa, !xa_empty(xa)); } +/* + * Check that the pointer / value / sibling entries are accounted the + * way we expect them to be. + */ +static noinline void check_account(struct xarray *xa) +{ +#ifdef CONFIG_XARRAY_MULTI + unsigned int order; + + for (order = 1; order < 12; order++) { + XA_STATE(xas, xa, 1 << order); + + xa_store_order(xa, 0, order, xa, GFP_KERNEL); + xas_load(&xas); + XA_BUG_ON(xa, xas.xa_node->count == 0); + XA_BUG_ON(xa, xas.xa_node->count > (1 << order)); + XA_BUG_ON(xa, xas.xa_node->nr_values != 0); + + xa_store_order(xa, 1 << order, order, xa_mk_value(1 << order), + GFP_KERNEL); + XA_BUG_ON(xa, xas.xa_node->count != xas.xa_node->nr_values * 2); + + xa_erase(xa, 1 << order); + XA_BUG_ON(xa, xas.xa_node->nr_values != 0); + + xa_erase(xa, 0); + XA_BUG_ON(xa, !xa_empty(xa)); + } +#endif +} + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -1068,6 +1099,7 @@ static int xarray_checks(void) check_xa_alloc(); check_find(&array); check_find_entry(&array); + check_account(&array); check_destroy(&array); check_move(&array); check_create_range(&array); diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 0436554a099a..dc27a3da210a 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -356,29 +356,6 @@ void multiorder_tagged_iteration(void) item_kill_tree(&tree); } -static void multiorder_account(void) -{ - RADIX_TREE(tree, GFP_KERNEL); - struct radix_tree_node *node; - void **slot; - - item_insert_order(&tree, 0, 5); - - __radix_tree_insert(&tree, 1 << 5, 5, xa_mk_value(5)); - __radix_tree_lookup(&tree, 0, &node, NULL); - assert(node->count == node->nr_values * 2); - radix_tree_delete(&tree, 1 << 5); - assert(node->nr_values == 0); - - __radix_tree_insert(&tree, 1 << 5, 5, xa_mk_value(5)); - __radix_tree_lookup(&tree, 1 << 5, &node, &slot); - assert(node->count == node->nr_values * 2); - __radix_tree_replace(&tree, node, slot, NULL); - assert(node->nr_values == 0); - - item_kill_tree(&tree); -} - bool stop_iteration = false; static void *creator_func(void *ptr) @@ -457,7 +434,6 @@ void multiorder_checks(void) multiorder_tag_tests(); multiorder_iteration(); multiorder_tagged_iteration(); - multiorder_account(); multiorder_iteration_race(); radix_tree_cpu_dead(0); -- cgit v1.2.3 From 93eb07f72c8d86f8fe5e90907df1cc037f6ffbb7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 8 Sep 2018 12:09:52 -0400 Subject: xarray: Move multiorder_shrink to kernel tests Test this functionality inside the kernel as well as in userspace. Also remove insert_bug() as there's no comparable thing to test in the XArray code. Signed-off-by: Matthew Wilcox --- lib/test_xarray.c | 37 ++++++++ tools/testing/radix-tree/multiorder.c | 173 ---------------------------------- 2 files changed, 37 insertions(+), 173 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 38cab4ccb24e..ff94b54a926d 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -199,9 +199,25 @@ static noinline void check_xa_mark_1(struct xarray *xa, unsigned long index) xa_store_order(xa, index, order, xa_mk_value(index), GFP_KERNEL); for (i = base; i < next; i++) { + XA_STATE(xas, xa, i); + unsigned int seen = 0; + void *entry; + XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_0)); XA_BUG_ON(xa, !xa_get_mark(xa, i, XA_MARK_1)); XA_BUG_ON(xa, xa_get_mark(xa, i, XA_MARK_2)); + + /* We should see two elements in the array */ + xas_for_each(&xas, entry, ULONG_MAX) + seen++; + XA_BUG_ON(xa, seen != 2); + + /* One of which is marked */ + xas_set(&xas, 0); + seen = 0; + xas_for_each_marked(&xas, entry, ULONG_MAX, XA_MARK_0) + seen++; + XA_BUG_ON(xa, seen != 1); } XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_0)); XA_BUG_ON(xa, xa_get_mark(xa, next, XA_MARK_1)); @@ -265,6 +281,8 @@ static noinline void check_xa_shrink(struct xarray *xa) { XA_STATE(xas, xa, 1); struct xa_node *node; + unsigned int order; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 15 : 1; XA_BUG_ON(xa, !xa_empty(xa)); XA_BUG_ON(xa, xa_store_index(xa, 0, GFP_KERNEL) != NULL); @@ -287,6 +305,25 @@ static noinline void check_xa_shrink(struct xarray *xa) XA_BUG_ON(xa, xa_load(xa, 0) != xa_mk_value(0)); xa_erase_index(xa, 0); XA_BUG_ON(xa, !xa_empty(xa)); + + for (order = 0; order < max_order; order++) { + unsigned long max = (1UL << order) - 1; + xa_store_order(xa, 0, order, xa_mk_value(0), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, max) != xa_mk_value(0)); + XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL); + rcu_read_lock(); + node = xa_head(xa); + rcu_read_unlock(); + XA_BUG_ON(xa, xa_store_index(xa, ULONG_MAX, GFP_KERNEL) != + NULL); + rcu_read_lock(); + XA_BUG_ON(xa, xa_head(xa) == node); + rcu_read_unlock(); + XA_BUG_ON(xa, xa_load(xa, max + 1) != NULL); + xa_erase_index(xa, ULONG_MAX); + XA_BUG_ON(xa, xa->xa_head != node); + xa_erase_index(xa, 0); + } } static noinline void check_cmpxchg(struct xarray *xa) diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index dc27a3da210a..a60c03287e9c 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -20,127 +20,6 @@ #include "test.h" -#define for_each_index(i, base, order) \ - for (i = base; i < base + (1 << order); i++) - -static void __multiorder_tag_test(int index, int order) -{ - RADIX_TREE(tree, GFP_KERNEL); - int base, err, i; - - /* our canonical entry */ - base = index & ~((1 << order) - 1); - - printv(2, "Multiorder tag test with index %d, canonical entry %d\n", - index, base); - - err = item_insert_order(&tree, index, order); - assert(!err); - - /* - * Verify we get collisions for covered indices. We try and fail to - * insert a value entry so we don't leak memory via - * item_insert_order(). - */ - for_each_index(i, base, order) { - err = __radix_tree_insert(&tree, i, order, xa_mk_value(0xA0)); - assert(err == -EEXIST); - } - - for_each_index(i, base, order) { - assert(!radix_tree_tag_get(&tree, i, 0)); - assert(!radix_tree_tag_get(&tree, i, 1)); - } - - assert(radix_tree_tag_set(&tree, index, 0)); - - for_each_index(i, base, order) { - assert(radix_tree_tag_get(&tree, i, 0)); - assert(!radix_tree_tag_get(&tree, i, 1)); - } - - assert(tag_tagged_items(&tree, 0, ~0UL, 10, XA_MARK_0, XA_MARK_1) == 1); - assert(radix_tree_tag_clear(&tree, index, 0)); - - for_each_index(i, base, order) { - assert(!radix_tree_tag_get(&tree, i, 0)); - assert(radix_tree_tag_get(&tree, i, 1)); - } - - assert(radix_tree_tag_clear(&tree, index, 1)); - - assert(!radix_tree_tagged(&tree, 0)); - assert(!radix_tree_tagged(&tree, 1)); - - item_kill_tree(&tree); -} - -static void __multiorder_tag_test2(unsigned order, unsigned long index2) -{ - RADIX_TREE(tree, GFP_KERNEL); - unsigned long index = (1 << order); - index2 += index; - - assert(item_insert_order(&tree, 0, order) == 0); - assert(item_insert(&tree, index2) == 0); - - assert(radix_tree_tag_set(&tree, 0, 0)); - assert(radix_tree_tag_set(&tree, index2, 0)); - - assert(tag_tagged_items(&tree, 0, ~0UL, 10, XA_MARK_0, XA_MARK_1) == 2); - - item_kill_tree(&tree); -} - -static void multiorder_tag_tests(void) -{ - int i, j; - - /* test multi-order entry for indices 0-7 with no sibling pointers */ - __multiorder_tag_test(0, 3); - __multiorder_tag_test(5, 3); - - /* test multi-order entry for indices 8-15 with no sibling pointers */ - __multiorder_tag_test(8, 3); - __multiorder_tag_test(15, 3); - - /* - * Our order 5 entry covers indices 0-31 in a tree with height=2. - * This is broken up as follows: - * 0-7: canonical entry - * 8-15: sibling 1 - * 16-23: sibling 2 - * 24-31: sibling 3 - */ - __multiorder_tag_test(0, 5); - __multiorder_tag_test(29, 5); - - /* same test, but with indices 32-63 */ - __multiorder_tag_test(32, 5); - __multiorder_tag_test(44, 5); - - /* - * Our order 8 entry covers indices 0-255 in a tree with height=3. - * This is broken up as follows: - * 0-63: canonical entry - * 64-127: sibling 1 - * 128-191: sibling 2 - * 192-255: sibling 3 - */ - __multiorder_tag_test(0, 8); - __multiorder_tag_test(190, 8); - - /* same test, but with indices 256-511 */ - __multiorder_tag_test(256, 8); - __multiorder_tag_test(300, 8); - - __multiorder_tag_test(0x12345678UL, 8); - - for (i = 1; i < 10; i++) - for (j = 0; j < (10 << i); j++) - __multiorder_tag_test2(i, j); -} - static void multiorder_check(unsigned long index, int order) { unsigned long i; @@ -181,53 +60,6 @@ static void multiorder_check(unsigned long index, int order) item_check_absent(&tree, i); } -static void multiorder_shrink(unsigned long index, int order) -{ - unsigned long i; - unsigned long max = 1 << order; - RADIX_TREE(tree, GFP_KERNEL); - struct radix_tree_node *node; - - printv(2, "Multiorder shrink index %ld, order %d\n", index, order); - - assert(item_insert_order(&tree, 0, order) == 0); - - node = tree.xa_head; - - assert(item_insert(&tree, index) == 0); - assert(node != tree.xa_head); - - assert(item_delete(&tree, index) != 0); - assert(node == tree.xa_head); - - for (i = 0; i < max; i++) { - struct item *item = item_lookup(&tree, i); - assert(item != 0); - assert(item->index == 0); - } - for (i = max; i < 2*max; i++) - item_check_absent(&tree, i); - - if (!item_delete(&tree, 0)) { - printv(2, "failed to delete index %ld (order %d)\n", index, order); - abort(); - } - - for (i = 0; i < 2*max; i++) - item_check_absent(&tree, i); -} - -static void multiorder_insert_bug(void) -{ - RADIX_TREE(tree, GFP_KERNEL); - - item_insert(&tree, 0); - radix_tree_tag_set(&tree, 0, 0); - item_insert_order(&tree, 3 << 6, 6); - - item_kill_tree(&tree); -} - void multiorder_iteration(void) { RADIX_TREE(tree, GFP_KERNEL); @@ -427,11 +259,6 @@ void multiorder_checks(void) multiorder_check((1UL << i) + 1, i); } - for (i = 0; i < 15; i++) - multiorder_shrink((1UL << (i + RADIX_TREE_MAP_SHIFT)), i); - - multiorder_insert_bug(); - multiorder_tag_tests(); multiorder_iteration(); multiorder_tagged_iteration(); multiorder_iteration_race(); -- cgit v1.2.3 From 4f06d6302da682157890f72c0573e12a73536814 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 9 Sep 2018 01:52:17 -0400 Subject: xarray: Move multiorder_check to in-kernel tests This version is a little less thorough in order to be a little quicker, but tests the important edge cases. Also test adding a multiorder entry at a non-canonical index, and erasing it. Signed-off-by: Matthew Wilcox --- lib/test_xarray.c | 44 ++++++++++++++++++++++++++++++++ tools/testing/radix-tree/multiorder.c | 48 ----------------------------------- 2 files changed, 44 insertions(+), 48 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index ff94b54a926d..0f06a93b4d0e 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -422,6 +422,43 @@ static noinline void check_xas_erase(struct xarray *xa) } } +#ifdef CONFIG_XARRAY_MULTI +static noinline void check_multi_store_1(struct xarray *xa, unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + unsigned long min = index & ~((1UL << order) - 1); + unsigned long max = min + (1UL << order); + + xa_store_order(xa, index, order, xa_mk_value(index), GFP_KERNEL); + XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(index)); + XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(index)); + XA_BUG_ON(xa, xa_load(xa, max) != NULL); + XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL); + + XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(min)) != xa_mk_value(index)); + XA_BUG_ON(xa, xa_load(xa, min) != xa_mk_value(min)); + XA_BUG_ON(xa, xa_load(xa, max - 1) != xa_mk_value(min)); + XA_BUG_ON(xa, xa_load(xa, max) != NULL); + XA_BUG_ON(xa, xa_load(xa, min - 1) != NULL); + + xa_erase_index(xa, min); + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_multi_store_2(struct xarray *xa, unsigned long index, + unsigned int order) +{ + XA_STATE(xas, xa, index); + xa_store_order(xa, index, order, xa_mk_value(0), GFP_KERNEL); + + XA_BUG_ON(xa, xas_store(&xas, xa_mk_value(1)) != xa_mk_value(0)); + XA_BUG_ON(xa, xas.xa_index != index); + XA_BUG_ON(xa, xas_store(&xas, NULL) != xa_mk_value(1)); + XA_BUG_ON(xa, !xa_empty(xa)); +} +#endif + static noinline void check_multi_store(struct xarray *xa) { #ifdef CONFIG_XARRAY_MULTI @@ -487,6 +524,13 @@ static noinline void check_multi_store(struct xarray *xa) XA_BUG_ON(xa, !xa_empty(xa)); } } + + for (i = 0; i < 20; i++) { + check_multi_store_1(xa, 200, i); + check_multi_store_1(xa, 0, i); + check_multi_store_1(xa, (1UL << i) + 1, i); + } + check_multi_store_2(xa, 4095, 9); #endif } diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index a60c03287e9c..6e8d66c2aa89 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -20,46 +20,6 @@ #include "test.h" -static void multiorder_check(unsigned long index, int order) -{ - unsigned long i; - unsigned long min = index & ~((1UL << order) - 1); - unsigned long max = min + (1UL << order); - void **slot; - struct item *item2 = item_create(min, order); - RADIX_TREE(tree, GFP_KERNEL); - - printv(2, "Multiorder index %ld, order %d\n", index, order); - - assert(item_insert_order(&tree, index, order) == 0); - - for (i = min; i < max; i++) { - struct item *item = item_lookup(&tree, i); - assert(item != 0); - assert(item->index == index); - } - for (i = 0; i < min; i++) - item_check_absent(&tree, i); - for (i = max; i < 2*max; i++) - item_check_absent(&tree, i); - for (i = min; i < max; i++) - assert(radix_tree_insert(&tree, i, item2) == -EEXIST); - - slot = radix_tree_lookup_slot(&tree, index); - free(*slot); - radix_tree_replace_slot(&tree, slot, item2); - for (i = min; i < max; i++) { - struct item *item = item_lookup(&tree, i); - assert(item != 0); - assert(item->index == min); - } - - assert(item_delete(&tree, min) != 0); - - for (i = 0; i < 2*max; i++) - item_check_absent(&tree, i); -} - void multiorder_iteration(void) { RADIX_TREE(tree, GFP_KERNEL); @@ -251,14 +211,6 @@ static void multiorder_iteration_race(void) void multiorder_checks(void) { - int i; - - for (i = 0; i < 20; i++) { - multiorder_check(200, i); - multiorder_check(0, i); - multiorder_check((1UL << i) + 1, i); - } - multiorder_iteration(); multiorder_tagged_iteration(); multiorder_iteration_race(); -- cgit v1.2.3 From 0e9446c35a80931044b6d8d2d74a9cabd248539f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 15 Aug 2018 14:13:29 -0400 Subject: xarray: Add range store functionality This version of xa_store_range() really only supports load and store. Our only user only needs basic load and store functionality, so there's no need to do the extra work to support marking and overlapping stores correctly yet. Signed-off-by: Matthew Wilcox --- Documentation/core-api/xarray.rst | 8 ++++ include/linux/xarray.h | 2 + lib/test_xarray.c | 34 ++++++++++++++ lib/xarray.c | 97 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 139 insertions(+), 2 deletions(-) diff --git a/Documentation/core-api/xarray.rst b/Documentation/core-api/xarray.rst index 463e4c798ec1..a4e705108f42 100644 --- a/Documentation/core-api/xarray.rst +++ b/Documentation/core-api/xarray.rst @@ -98,6 +98,13 @@ the XArray by calling :c:func:`xa_for_each`. You may prefer to use :c:func:`xa_find` or :c:func:`xa_find_after` to move to the next present entry in the XArray. +Calling :c:func:`xa_store_range` stores the same entry in a range +of indices. If you do this, some of the other operations will behave +in a slightly odd way. For example, marking the entry at one index +may result in the entry being marked at some, but not all of the other +indices. Storing into one index may result in the entry retrieved by +some, but not all of the other indices changing. + Finally, you can remove all entries from an XArray by calling :c:func:`xa_destroy`. If the XArray entries are pointers, you may wish to free the entries first. You can do this by iterating over all present @@ -156,6 +163,7 @@ Takes xa_lock internally: * :c:func:`xa_erase_bh` * :c:func:`xa_erase_irq` * :c:func:`xa_cmpxchg` + * :c:func:`xa_store_range` * :c:func:`xa_alloc` * :c:func:`xa_alloc_bh` * :c:func:`xa_alloc_irq` diff --git a/include/linux/xarray.h b/include/linux/xarray.h index e0b57af52e9b..d9514928ddac 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -292,6 +292,8 @@ void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int xa_reserve(struct xarray *, unsigned long index, gfp_t); +void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, + void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 0f06a93b4d0e..aa47754150ce 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1039,6 +1039,39 @@ static noinline void check_create_range(struct xarray *xa) check_create_range_3(); } +static noinline void __check_store_range(struct xarray *xa, unsigned long first, + unsigned long last) +{ +#ifdef CONFIG_XARRAY_MULTI + xa_store_range(xa, first, last, xa_mk_value(first), GFP_KERNEL); + + XA_BUG_ON(xa, xa_load(xa, first) != xa_mk_value(first)); + XA_BUG_ON(xa, xa_load(xa, last) != xa_mk_value(first)); + XA_BUG_ON(xa, xa_load(xa, first - 1) != NULL); + XA_BUG_ON(xa, xa_load(xa, last + 1) != NULL); + + xa_store_range(xa, first, last, NULL, GFP_KERNEL); +#endif + + XA_BUG_ON(xa, !xa_empty(xa)); +} + +static noinline void check_store_range(struct xarray *xa) +{ + unsigned long i, j; + + for (i = 0; i < 128; i++) { + for (j = i; j < 128; j++) { + __check_store_range(xa, i, j); + __check_store_range(xa, 128 + i, 128 + j); + __check_store_range(xa, 4095 + i, 4095 + j); + __check_store_range(xa, 4096 + i, 4096 + j); + __check_store_range(xa, 123456 + i, 123456 + j); + __check_store_range(xa, UINT_MAX + i, UINT_MAX + j); + } + } +} + static LIST_HEAD(shadow_nodes); static void test_update_node(struct xa_node *node) @@ -1184,6 +1217,7 @@ static int xarray_checks(void) check_destroy(&array); check_move(&array); check_create_range(&array); + check_store_range(&array); check_store_iter(&array); check_workingset(&array, 0); diff --git a/lib/xarray.c b/lib/xarray.c index 9a0d49d4b5f0..8b176f009c08 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -376,6 +376,14 @@ static void *xas_alloc(struct xa_state *xas, unsigned int shift) return node; } +#ifdef CONFIG_XARRAY_MULTI +/* Returns the number of indices covered by a given xa_state */ +static unsigned long xas_size(const struct xa_state *xas) +{ + return (xas->xa_sibs + 1UL) << xas->xa_shift; +} +#endif + /* * Use this to calculate the maximum index that will need to be created * in order to add the entry described by @xas. Because we cannot store a @@ -388,8 +396,7 @@ static unsigned long xas_max(struct xa_state *xas) #ifdef CONFIG_XARRAY_MULTI if (xas->xa_shift || xas->xa_sibs) { - unsigned long mask; - mask = (((xas->xa_sibs + 1UL) << xas->xa_shift) - 1); + unsigned long mask = xas_size(xas) - 1; max |= mask; if (mask == max) max++; @@ -1517,6 +1524,92 @@ int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) } EXPORT_SYMBOL(xa_reserve); +#ifdef CONFIG_XARRAY_MULTI +static void xas_set_range(struct xa_state *xas, unsigned long first, + unsigned long last) +{ + unsigned int shift = 0; + unsigned long sibs = last - first; + unsigned int offset = XA_CHUNK_MASK; + + xas_set(xas, first); + + while ((first & XA_CHUNK_MASK) == 0) { + if (sibs < XA_CHUNK_MASK) + break; + if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK)) + break; + shift += XA_CHUNK_SHIFT; + if (offset == XA_CHUNK_MASK) + offset = sibs & XA_CHUNK_MASK; + sibs >>= XA_CHUNK_SHIFT; + first >>= XA_CHUNK_SHIFT; + } + + offset = first & XA_CHUNK_MASK; + if (offset + sibs > XA_CHUNK_MASK) + sibs = XA_CHUNK_MASK - offset; + if ((((first + sibs + 1) << shift) - 1) > last) + sibs -= 1; + + xas->xa_shift = shift; + xas->xa_sibs = sibs; +} + +/** + * xa_store_range() - Store this entry at a range of indices in the XArray. + * @xa: XArray. + * @first: First index to affect. + * @last: Last index to affect. + * @entry: New entry. + * @gfp: Memory allocation flags. + * + * After this function returns, loads from any index between @first and @last, + * inclusive will return @entry. + * Storing into an existing multislot entry updates the entry of every index. + * The marks associated with @index are unaffected unless @entry is %NULL. + * + * Context: Process context. Takes and releases the xa_lock. May sleep + * if the @gfp flags permit. + * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in + * an XArray, or xa_err(-ENOMEM) if memory allocation failed. + */ +void *xa_store_range(struct xarray *xa, unsigned long first, + unsigned long last, void *entry, gfp_t gfp) +{ + XA_STATE(xas, xa, 0); + + if (WARN_ON_ONCE(xa_is_internal(entry))) + return XA_ERROR(-EINVAL); + if (last < first) + return XA_ERROR(-EINVAL); + + do { + xas_lock(&xas); + if (entry) { + unsigned int order = (last == ~0UL) ? 64 : + ilog2(last + 1); + xas_set_order(&xas, last, order); + xas_create(&xas); + if (xas_error(&xas)) + goto unlock; + } + do { + xas_set_range(&xas, first, last); + xas_store(&xas, entry); + if (xas_error(&xas)) + goto unlock; + first += xas_size(&xas); + } while (first <= last); +unlock: + xas_unlock(&xas); + } while (xas_nomem(&xas, gfp)); + + return xas_result(&xas, NULL); +} +EXPORT_SYMBOL(xa_store_range); +#endif /* CONFIG_XARRAY_MULTI */ + /** * __xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. -- cgit v1.2.3 From bcfa4b72111c9a4d483024cb1f877803b354aa11 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 15 Aug 2018 14:22:16 -0400 Subject: memremap: Convert to XArray Use the new xa_store_range function instead of the radix tree. Signed-off-by: Matthew Wilcox --- kernel/memremap.c | 76 ++++++++++++------------------------------------------- 1 file changed, 16 insertions(+), 60 deletions(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 5b8600d39931..e842fab9f184 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,47 +1,21 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright(c) 2015 Intel Corporation. All rights reserved. */ -#include #include -#include -#include #include #include -#include #include +#include +#include #include #include +#include #include +#include -static DEFINE_MUTEX(pgmap_lock); -static RADIX_TREE(pgmap_radix, GFP_KERNEL); +static DEFINE_XARRAY(pgmap_array); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -static unsigned long order_at(struct resource *res, unsigned long pgoff) -{ - unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; - unsigned long nr_pages, mask; - - nr_pages = PHYS_PFN(resource_size(res)); - if (nr_pages == pgoff) - return ULONG_MAX; - - /* - * What is the largest aligned power-of-2 range available from - * this resource pgoff to the end of the resource range, - * considering the alignment of the current pgoff? - */ - mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); - if (!mask) - return ULONG_MAX; - - return find_first_bit(&mask, BITS_PER_LONG); -} - -#define foreach_order_pgoff(res, order, pgoff) \ - for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ - pgoff += 1UL << order, order = order_at((res), pgoff)) - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, unsigned long addr, @@ -70,18 +44,10 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) +static void pgmap_array_delete(struct resource *res) { - unsigned long pgoff, order; - - mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) { - if (pgoff >= end_pgoff) - break; - radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); - } - mutex_unlock(&pgmap_lock); - + xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), + NULL, GFP_KERNEL); synchronize_rcu(); } @@ -142,7 +108,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res, -1); + pgmap_array_delete(res); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -175,7 +141,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; struct resource *res = &pgmap->res; - unsigned long pfn, pgoff, order; + unsigned long pfn; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; struct dev_pagemap *conflict_pgmap; @@ -216,20 +182,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) pgmap->dev = dev; - mutex_lock(&pgmap_lock); - error = 0; - - foreach_order_pgoff(res, order, pgoff) { - error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, pgmap); - if (error) { - dev_err(dev, "%s: failed: %d\n", __func__, error); - break; - } - } - mutex_unlock(&pgmap_lock); + error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), + PHYS_PFN(res->end), pgmap, GFP_KERNEL)); if (error) - goto err_radix; + goto err_array; nid = dev_to_node(dev); if (nid < 0) @@ -279,8 +235,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_kasan: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: - err_radix: - pgmap_radix_release(res, pgoff); + pgmap_array_delete(res); + err_array: return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -320,7 +276,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); + pgmap = xa_load(&pgmap_array, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) pgmap = NULL; rcu_read_unlock(); -- cgit v1.2.3 From 9076b33d7dad8e316c90918845417e85347f682c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 12 Sep 2018 12:52:45 -0400 Subject: radix tree test suite: Remove __item_insert Inline it into its one caller Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/test.c | 7 +------ tools/testing/radix-tree/test.h | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 32973dd51ec5..5991cfd34f2b 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -25,11 +25,6 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag) return radix_tree_tag_get(root, index, tag); } -int __item_insert(struct radix_tree_root *root, struct item *item) -{ - return __radix_tree_insert(root, item->index, item->order, item); -} - struct item *item_create(unsigned long index, unsigned int order) { struct item *ret = malloc(sizeof(*ret)); @@ -43,7 +38,7 @@ int item_insert_order(struct radix_tree_root *root, unsigned long index, unsigned order) { struct item *item = item_create(index, order); - int err = __item_insert(root, item); + int err = __radix_tree_insert(root, item->index, item->order, item); if (err) free(item); return err; diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index a2f53c889a31..28961a08828e 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -11,7 +11,6 @@ struct item { }; struct item *item_create(unsigned long index, unsigned int order); -int __item_insert(struct radix_tree_root *root, struct item *item); int item_insert(struct radix_tree_root *root, unsigned long index); void item_sanity(struct item *item, unsigned long index); void item_free(struct item *item, unsigned long index); -- cgit v1.2.3 From 879a9ae7b5bc046f195a725d62bbc96258e5d0c8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 12 Sep 2018 23:12:47 -0400 Subject: radix tree test suite: Remove multiorder benchmarking The multiorder radix tree code is being removed, so remove the benchmarking of its performance. Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/benchmark.c | 50 +++++++++++++++--------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c index 35741b9c2a4a..7e195ed8e92d 100644 --- a/tools/testing/radix-tree/benchmark.c +++ b/tools/testing/radix-tree/benchmark.c @@ -17,9 +17,6 @@ #include #include "test.h" -#define for_each_index(i, base, order) \ - for (i = base; i < base + (1 << order); i++) - #define NSEC_PER_SEC 1000000000L static long long benchmark_iter(struct radix_tree_root *root, bool tagged) @@ -61,7 +58,7 @@ again: } static void benchmark_insert(struct radix_tree_root *root, - unsigned long size, unsigned long step, int order) + unsigned long size, unsigned long step) { struct timespec start, finish; unsigned long index; @@ -70,19 +67,19 @@ static void benchmark_insert(struct radix_tree_root *root, clock_gettime(CLOCK_MONOTONIC, &start); for (index = 0 ; index < size ; index += step) - item_insert_order(root, index, order); + item_insert(root, index); clock_gettime(CLOCK_MONOTONIC, &finish); nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + (finish.tv_nsec - start.tv_nsec); - printv(2, "Size: %8ld, step: %8ld, order: %d, insertion: %15lld ns\n", - size, step, order, nsec); + printv(2, "Size: %8ld, step: %8ld, insertion: %15lld ns\n", + size, step, nsec); } static void benchmark_tagging(struct radix_tree_root *root, - unsigned long size, unsigned long step, int order) + unsigned long size, unsigned long step) { struct timespec start, finish; unsigned long index; @@ -98,49 +95,48 @@ static void benchmark_tagging(struct radix_tree_root *root, nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + (finish.tv_nsec - start.tv_nsec); - printv(2, "Size: %8ld, step: %8ld, order: %d, tagging: %17lld ns\n", - size, step, order, nsec); + printv(2, "Size: %8ld, step: %8ld, tagging: %17lld ns\n", + size, step, nsec); } static void benchmark_delete(struct radix_tree_root *root, - unsigned long size, unsigned long step, int order) + unsigned long size, unsigned long step) { struct timespec start, finish; - unsigned long index, i; + unsigned long index; long long nsec; clock_gettime(CLOCK_MONOTONIC, &start); for (index = 0 ; index < size ; index += step) - for_each_index(i, index, order) - item_delete(root, i); + item_delete(root, index); clock_gettime(CLOCK_MONOTONIC, &finish); nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + (finish.tv_nsec - start.tv_nsec); - printv(2, "Size: %8ld, step: %8ld, order: %d, deletion: %16lld ns\n", - size, step, order, nsec); + printv(2, "Size: %8ld, step: %8ld, deletion: %16lld ns\n", + size, step, nsec); } -static void benchmark_size(unsigned long size, unsigned long step, int order) +static void benchmark_size(unsigned long size, unsigned long step) { RADIX_TREE(tree, GFP_KERNEL); long long normal, tagged; - benchmark_insert(&tree, size, step, order); - benchmark_tagging(&tree, size, step, order); + benchmark_insert(&tree, size, step); + benchmark_tagging(&tree, size, step); tagged = benchmark_iter(&tree, true); normal = benchmark_iter(&tree, false); - printv(2, "Size: %8ld, step: %8ld, order: %d, tagged iteration: %8lld ns\n", - size, step, order, tagged); - printv(2, "Size: %8ld, step: %8ld, order: %d, normal iteration: %8lld ns\n", - size, step, order, normal); + printv(2, "Size: %8ld, step: %8ld, tagged iteration: %8lld ns\n", + size, step, tagged); + printv(2, "Size: %8ld, step: %8ld, normal iteration: %8lld ns\n", + size, step, normal); - benchmark_delete(&tree, size, step, order); + benchmark_delete(&tree, size, step); item_kill_tree(&tree); rcu_barrier(); @@ -158,9 +154,5 @@ void benchmark(void) for (c = 0; size[c]; c++) for (s = 0; step[s]; s++) - benchmark_size(size[c], step[s], 0); - - for (c = 0; size[c]; c++) - for (s = 0; step[s]; s++) - benchmark_size(size[c], step[s] << 9, 9); + benchmark_size(size[c], step[s]); } -- cgit v1.2.3 From 4bb53bdda0d1e061035774ed4868bdeb4d889044 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 12 Sep 2018 23:29:32 -0400 Subject: radix tree tests: Move item_insert_order The remaining tests are not suitable for moving in-kernel, so move item_insert_order() into multiorder.c, make it static and make it use the XArray. Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/multiorder.c | 19 +++++++++++++++++++ tools/testing/radix-tree/test.c | 12 +++--------- tools/testing/radix-tree/test.h | 2 -- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 6e8d66c2aa89..8c41dca272b1 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -20,6 +20,25 @@ #include "test.h" +static int item_insert_order(struct xarray *xa, unsigned long index, + unsigned order) +{ + XA_STATE_ORDER(xas, xa, index, order); + struct item *item = item_create(index, order); + + do { + xas_lock(&xas); + xas_store(&xas, item); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + if (!xas_error(&xas)) + return 0; + + free(item); + return xas_error(&xas); +} + void multiorder_iteration(void) { RADIX_TREE(tree, GFP_KERNEL); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 5991cfd34f2b..5376b8c5d8d6 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -34,21 +34,15 @@ struct item *item_create(unsigned long index, unsigned int order) return ret; } -int item_insert_order(struct radix_tree_root *root, unsigned long index, - unsigned order) +int item_insert(struct radix_tree_root *root, unsigned long index) { - struct item *item = item_create(index, order); - int err = __radix_tree_insert(root, item->index, item->order, item); + struct item *item = item_create(index, 0); + int err = radix_tree_insert(root, item->index, item); if (err) free(item); return err; } -int item_insert(struct radix_tree_root *root, unsigned long index) -{ - return item_insert_order(root, index, 0); -} - void item_sanity(struct item *item, unsigned long index) { unsigned long mask; diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 28961a08828e..e259c0839d5d 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -14,8 +14,6 @@ struct item *item_create(unsigned long index, unsigned int order); int item_insert(struct radix_tree_root *root, unsigned long index); void item_sanity(struct item *item, unsigned long index); void item_free(struct item *item, unsigned long index); -int item_insert_order(struct radix_tree_root *root, unsigned long index, - unsigned order); int item_delete(struct radix_tree_root *root, unsigned long index); int item_delete_rcu(struct radix_tree_root *root, unsigned long index); struct item *item_lookup(struct radix_tree_root *root, unsigned long index); -- cgit v1.2.3 From ccc89e30fac790e3a175dbfc863c67286fce96b0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 13 Sep 2018 10:15:41 -0400 Subject: radix tree tests: Convert item_kill_tree to XArray In preparation for the removal of the multiorder radix tree code, convert item_kill_tree() to use the XArray so it can still be called for XArrays containing multi-index entries. Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/test.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 5376b8c5d8d6..19045ce3bd23 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -252,31 +252,19 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) verify_node(node, tag, !!root_tag_get(root, tag)); } -void item_kill_tree(struct radix_tree_root *root) +void item_kill_tree(struct xarray *xa) { - struct radix_tree_iter iter; - void **slot; - struct item *items[32]; - int nfound; - - radix_tree_for_each_slot(slot, root, &iter, 0) { - if (xa_is_value(*slot)) - radix_tree_delete(root, iter.index); - } - - while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { - int i; - - for (i = 0; i < nfound; i++) { - void *ret; + XA_STATE(xas, xa, 0); + void *entry; - ret = radix_tree_delete(root, items[i]->index); - assert(ret == items[i]); - free(items[i]); + xas_for_each(&xas, entry, ULONG_MAX) { + if (!xa_is_value(entry)) { + item_free(entry, xas.xa_index); } + xas_store(&xas, NULL); } - assert(radix_tree_gang_lookup(root, (void **)items, 0, 32) == 0); - assert(root->xa_head == NULL); + + assert(xa_empty(xa)); } void tree_verify_min_height(struct radix_tree_root *root, int maxindex) -- cgit v1.2.3 From b66b5a48b8a0e43dc114573da11c1a9c586a2d4f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 22 Sep 2018 15:34:28 -0400 Subject: radix tree tests: Convert item_delete_rcu to XArray In preparation for the removal of the multiorder radix tree code, convert item_delete_rcu() to use the XArray so it can still be called for XArrays containing multi-index entries. Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/test.c | 4 ++-- tools/testing/radix-tree/test.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 19045ce3bd23..a15d0512e633 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -76,9 +76,9 @@ static void item_free_rcu(struct rcu_head *head) free(item); } -int item_delete_rcu(struct radix_tree_root *root, unsigned long index) +int item_delete_rcu(struct xarray *xa, unsigned long index) { - struct item *item = radix_tree_delete(root, index); + struct item *item = xa_erase(xa, index); if (item) { item_sanity(item, index); diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index e259c0839d5d..1ee4b2c0ad10 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -15,7 +15,7 @@ int item_insert(struct radix_tree_root *root, unsigned long index); void item_sanity(struct item *item, unsigned long index); void item_free(struct item *item, unsigned long index); int item_delete(struct radix_tree_root *root, unsigned long index); -int item_delete_rcu(struct radix_tree_root *root, unsigned long index); +int item_delete_rcu(struct xarray *xa, unsigned long index); struct item *item_lookup(struct radix_tree_root *root, unsigned long index); void item_check_present(struct radix_tree_root *root, unsigned long index); -- cgit v1.2.3 From 542980aa9318edcfb68aa7bf6eacf2814dc137dd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 22 Sep 2018 16:12:41 -0400 Subject: radix tree test: Convert multiorder tests to XArray This is the last remaining user of the multiorder functionality of the radix tree. Test the XArray instead. Signed-off-by: Matthew Wilcox --- tools/testing/radix-tree/multiorder.c | 105 ++++++++++++++++------------------ 1 file changed, 49 insertions(+), 56 deletions(-) diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 8c41dca272b1..ff27a74d9762 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -39,21 +39,20 @@ static int item_insert_order(struct xarray *xa, unsigned long index, return xas_error(&xas); } -void multiorder_iteration(void) +void multiorder_iteration(struct xarray *xa) { - RADIX_TREE(tree, GFP_KERNEL); - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, xa, 0); + struct item *item; int i, j, err; - printv(1, "Multiorder iteration test\n"); - #define NUM_ENTRIES 11 int index[NUM_ENTRIES] = {0, 2, 4, 8, 16, 32, 34, 36, 64, 72, 128}; int order[NUM_ENTRIES] = {1, 1, 2, 3, 4, 1, 0, 1, 3, 0, 7}; + printv(1, "Multiorder iteration test\n"); + for (i = 0; i < NUM_ENTRIES; i++) { - err = item_insert_order(&tree, index[i], order[i]); + err = item_insert_order(xa, index[i], order[i]); assert(!err); } @@ -62,14 +61,14 @@ void multiorder_iteration(void) if (j <= (index[i] | ((1 << order[i]) - 1))) break; - radix_tree_for_each_slot(slot, &tree, &iter, j) { - int height = order[i] / RADIX_TREE_MAP_SHIFT; - int shift = height * RADIX_TREE_MAP_SHIFT; + xas_set(&xas, j); + xas_for_each(&xas, item, ULONG_MAX) { + int height = order[i] / XA_CHUNK_SHIFT; + int shift = height * XA_CHUNK_SHIFT; unsigned long mask = (1UL << order[i]) - 1; - struct item *item = *slot; - assert((iter.index | mask) == (index[i] | mask)); - assert(iter.shift == shift); + assert((xas.xa_index | mask) == (index[i] | mask)); + assert(xas.xa_node->shift == shift); assert(!radix_tree_is_internal_node(item)); assert((item->index | mask) == (index[i] | mask)); assert(item->order == order[i]); @@ -77,18 +76,15 @@ void multiorder_iteration(void) } } - item_kill_tree(&tree); + item_kill_tree(xa); } -void multiorder_tagged_iteration(void) +void multiorder_tagged_iteration(struct xarray *xa) { - RADIX_TREE(tree, GFP_KERNEL); - struct radix_tree_iter iter; - void **slot; + XA_STATE(xas, xa, 0); + struct item *item; int i, j; - printv(1, "Multiorder tagged iteration test\n"); - #define MT_NUM_ENTRIES 9 int index[MT_NUM_ENTRIES] = {0, 2, 4, 16, 32, 40, 64, 72, 128}; int order[MT_NUM_ENTRIES] = {1, 0, 2, 4, 3, 1, 3, 0, 7}; @@ -96,13 +92,15 @@ void multiorder_tagged_iteration(void) #define TAG_ENTRIES 7 int tag_index[TAG_ENTRIES] = {0, 4, 16, 40, 64, 72, 128}; + printv(1, "Multiorder tagged iteration test\n"); + for (i = 0; i < MT_NUM_ENTRIES; i++) - assert(!item_insert_order(&tree, index[i], order[i])); + assert(!item_insert_order(xa, index[i], order[i])); - assert(!radix_tree_tagged(&tree, 1)); + assert(!xa_marked(xa, XA_MARK_1)); for (i = 0; i < TAG_ENTRIES; i++) - assert(radix_tree_tag_set(&tree, tag_index[i], 1)); + xa_set_mark(xa, tag_index[i], XA_MARK_1); for (j = 0; j < 256; j++) { int k; @@ -114,22 +112,22 @@ void multiorder_tagged_iteration(void) break; } - radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { + xas_set(&xas, j); + xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_1) { unsigned long mask; - struct item *item = *slot; for (k = i; index[k] < tag_index[i]; k++) ; mask = (1UL << order[k]) - 1; - assert((iter.index | mask) == (tag_index[i] | mask)); - assert(!radix_tree_is_internal_node(item)); + assert((xas.xa_index | mask) == (tag_index[i] | mask)); + assert(!xa_is_internal(item)); assert((item->index | mask) == (tag_index[i] | mask)); assert(item->order == order[k]); i++; } } - assert(tag_tagged_items(&tree, 0, ~0UL, TAG_ENTRIES, XA_MARK_1, + assert(tag_tagged_items(xa, 0, ULONG_MAX, TAG_ENTRIES, XA_MARK_1, XA_MARK_2) == TAG_ENTRIES); for (j = 0; j < 256; j++) { @@ -142,29 +140,31 @@ void multiorder_tagged_iteration(void) break; } - radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { - struct item *item = *slot; + xas_set(&xas, j); + xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_2) { for (k = i; index[k] < tag_index[i]; k++) ; mask = (1 << order[k]) - 1; - assert((iter.index | mask) == (tag_index[i] | mask)); - assert(!radix_tree_is_internal_node(item)); + assert((xas.xa_index | mask) == (tag_index[i] | mask)); + assert(!xa_is_internal(item)); assert((item->index | mask) == (tag_index[i] | mask)); assert(item->order == order[k]); i++; } } - assert(tag_tagged_items(&tree, 1, ~0UL, MT_NUM_ENTRIES * 2, XA_MARK_1, + assert(tag_tagged_items(xa, 1, ULONG_MAX, MT_NUM_ENTRIES * 2, XA_MARK_1, XA_MARK_0) == TAG_ENTRIES); i = 0; - radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { - assert(iter.index == tag_index[i]); + xas_set(&xas, 0); + xas_for_each_marked(&xas, item, ULONG_MAX, XA_MARK_0) { + assert(xas.xa_index == tag_index[i]); i++; } + assert(i == TAG_ENTRIES); - item_kill_tree(&tree); + item_kill_tree(xa); } bool stop_iteration = false; @@ -187,52 +187,45 @@ static void *creator_func(void *ptr) static void *iterator_func(void *ptr) { - struct radix_tree_root *tree = ptr; - struct radix_tree_iter iter; + XA_STATE(xas, ptr, 0); struct item *item; - void **slot; while (!stop_iteration) { rcu_read_lock(); - radix_tree_for_each_slot(slot, tree, &iter, 0) { - item = radix_tree_deref_slot(slot); - - if (!item) + xas_for_each(&xas, item, ULONG_MAX) { + if (xas_retry(&xas, item)) continue; - if (radix_tree_deref_retry(item)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - item_sanity(item, iter.index); + item_sanity(item, xas.xa_index); } rcu_read_unlock(); } return NULL; } -static void multiorder_iteration_race(void) +static void multiorder_iteration_race(struct xarray *xa) { const int num_threads = sysconf(_SC_NPROCESSORS_ONLN); pthread_t worker_thread[num_threads]; - RADIX_TREE(tree, GFP_KERNEL); int i; - pthread_create(&worker_thread[0], NULL, &creator_func, &tree); + pthread_create(&worker_thread[0], NULL, &creator_func, xa); for (i = 1; i < num_threads; i++) - pthread_create(&worker_thread[i], NULL, &iterator_func, &tree); + pthread_create(&worker_thread[i], NULL, &iterator_func, xa); for (i = 0; i < num_threads; i++) pthread_join(worker_thread[i], NULL); - item_kill_tree(&tree); + item_kill_tree(xa); } +static DEFINE_XARRAY(array); + void multiorder_checks(void) { - multiorder_iteration(); - multiorder_tagged_iteration(); - multiorder_iteration_race(); + multiorder_iteration(&array); + multiorder_tagged_iteration(&array); + multiorder_iteration_race(&array); radix_tree_cpu_dead(0); } -- cgit v1.2.3 From 3a08cd52c37c793ffc199f6fc2ecfc368e284b2d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 22 Sep 2018 16:14:30 -0400 Subject: radix tree: Remove multiorder support All users have now been converted to the XArray. Removing the support reduces code size and ensures new users will use the XArray instead. Signed-off-by: Matthew Wilcox --- include/linux/radix-tree.h | 40 +---- lib/Kconfig | 4 - lib/radix-tree.c | 215 ++------------------------ mm/Kconfig | 4 +- tools/testing/radix-tree/generated/autoconf.h | 1 - 5 files changed, 19 insertions(+), 245 deletions(-) diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 9a1460488163..06c4c7a6c09c 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -96,7 +96,6 @@ static inline bool radix_tree_empty(const struct radix_tree_root *root) * @next_index: one beyond the last index for this chunk * @tags: bit-mask for tag-iterating * @node: node that contains current slot - * @shift: shift for the node that holds our slots * * This radix tree iterator works in terms of "chunks" of slots. A chunk is a * subinterval of slots contained within one radix tree leaf node. It is @@ -110,20 +109,8 @@ struct radix_tree_iter { unsigned long next_index; unsigned long tags; struct radix_tree_node *node; -#ifdef CONFIG_RADIX_TREE_MULTIORDER - unsigned int shift; -#endif }; -static inline unsigned int iter_shift(const struct radix_tree_iter *iter) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - return iter->shift; -#else - return 0; -#endif -} - /** * Radix-tree synchronization * @@ -230,13 +217,8 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -int __radix_tree_insert(struct radix_tree_root *, unsigned long index, - unsigned order, void *); -static inline int radix_tree_insert(struct radix_tree_root *root, - unsigned long index, void *entry) -{ - return __radix_tree_insert(root, index, 0, entry); -} +int radix_tree_insert(struct radix_tree_root *, unsigned long index, + void *); void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, struct radix_tree_node **nodep, void __rcu ***slotp); void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); @@ -384,7 +366,7 @@ void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter) static inline unsigned long __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) { - return iter->index + (slots << iter_shift(iter)); + return iter->index + slots; } /** @@ -409,20 +391,8 @@ void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot, static __always_inline long radix_tree_chunk_size(struct radix_tree_iter *iter) { - return (iter->next_index - iter->index) >> iter_shift(iter); -} - -#ifdef CONFIG_RADIX_TREE_MULTIORDER -void __rcu **__radix_tree_next_slot(void __rcu **slot, - struct radix_tree_iter *iter, unsigned flags); -#else -/* Can't happen without sibling entries, but the compiler can't tell that */ -static inline void __rcu **__radix_tree_next_slot(void __rcu **slot, - struct radix_tree_iter *iter, unsigned flags) -{ - return slot; + return iter->next_index - iter->index; } -#endif /** * radix_tree_next_slot - find next slot in chunk @@ -482,8 +452,6 @@ static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot, return NULL; found: - if (unlikely(radix_tree_is_internal_node(rcu_dereference_raw(*slot)))) - return __radix_tree_next_slot(slot, iter, flags); return slot; } diff --git a/lib/Kconfig b/lib/Kconfig index 40bfa6ccd294..a9965f4af4dd 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -405,10 +405,6 @@ config XARRAY_MULTI Support entries which occupy multiple consecutive indices in the XArray. -config RADIX_TREE_MULTIORDER - bool - select XARRAY_MULTI - config ASSOCIATIVE_ARRAY bool help diff --git a/lib/radix-tree.c b/lib/radix-tree.c index f107dd2698e3..1106bb6aa01e 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -110,11 +110,6 @@ static unsigned int radix_tree_descend(const struct radix_tree_node *parent, unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK; void __rcu **entry = rcu_dereference_raw(parent->slots[offset]); - if (xa_is_sibling(entry)) { - offset = xa_to_sibling(entry); - entry = rcu_dereference_raw(parent->slots[offset]); - } - *nodep = (void *)entry; return offset; } @@ -229,7 +224,7 @@ radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, static unsigned int iter_offset(const struct radix_tree_iter *iter) { - return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; + return iter->index & RADIX_TREE_MAP_MASK; } /* @@ -506,16 +501,13 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) /* * The candidate node has more than one child, or its child - * is not at the leftmost slot, or the child is a multiorder - * entry, we cannot shrink. + * is not at the leftmost slot, we cannot shrink. */ if (node->count != 1) break; child = rcu_dereference_raw(node->slots[0]); if (!child) break; - if (!radix_tree_is_internal_node(child) && node->shift) - break; /* * For an IDR, we must not shrink entry 0 into the root in @@ -613,7 +605,6 @@ static bool delete_node(struct radix_tree_root *root, * __radix_tree_create - create a slot in a radix tree * @root: radix tree root * @index: index key - * @order: index occupies 2^order aligned slots * @nodep: returns node * @slotp: returns slot * @@ -627,21 +618,19 @@ static bool delete_node(struct radix_tree_root *root, * Returns -ENOMEM, or 0 for success. */ static int __radix_tree_create(struct radix_tree_root *root, - unsigned long index, unsigned order, - struct radix_tree_node **nodep, void __rcu ***slotp) + unsigned long index, struct radix_tree_node **nodep, + void __rcu ***slotp) { struct radix_tree_node *node = NULL, *child; void __rcu **slot = (void __rcu **)&root->xa_head; unsigned long maxindex; unsigned int shift, offset = 0; - unsigned long max = index | ((1UL << order) - 1); + unsigned long max = index; gfp_t gfp = root_gfp_mask(root); shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ - if (order > 0 && max == ((1UL << order) - 1)) - max++; if (max > maxindex) { int error = radix_tree_extend(root, gfp, max, shift); if (error < 0) @@ -650,7 +639,7 @@ static int __radix_tree_create(struct radix_tree_root *root, child = rcu_dereference_raw(root->xa_head); } - while (shift > order) { + while (shift > 0) { shift -= RADIX_TREE_MAP_SHIFT; if (child == NULL) { /* Have to add a child node. */ @@ -711,70 +700,8 @@ static void radix_tree_free_nodes(struct radix_tree_node *node) } } -#ifdef CONFIG_RADIX_TREE_MULTIORDER -static inline int insert_entries(struct radix_tree_node *node, - void __rcu **slot, void *item, unsigned order, bool replace) -{ - void *sibling; - unsigned i, n, tag, offset, tags = 0; - - if (node) { - if (order > node->shift) - n = 1 << (order - node->shift); - else - n = 1; - offset = get_slot_offset(node, slot); - } else { - n = 1; - offset = 0; - } - - if (n > 1) { - offset = offset & ~(n - 1); - slot = &node->slots[offset]; - } - sibling = xa_mk_sibling(offset); - - for (i = 0; i < n; i++) { - if (slot[i]) { - if (replace) { - node->count--; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tag_get(node, tag, offset + i)) - tags |= 1 << tag; - } else - return -EEXIST; - } - } - - for (i = 0; i < n; i++) { - struct radix_tree_node *old = rcu_dereference_raw(slot[i]); - if (i) { - rcu_assign_pointer(slot[i], sibling); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_clear(node, tag, offset + i); - } else { - rcu_assign_pointer(slot[i], item); - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) - if (tags & (1 << tag)) - tag_set(node, tag, offset); - } - if (xa_is_node(old)) - radix_tree_free_nodes(old); - if (xa_is_value(old)) - node->nr_values--; - } - if (node) { - node->count += n; - if (xa_is_value(item)) - node->nr_values += n; - } - return n; -} -#else static inline int insert_entries(struct radix_tree_node *node, - void __rcu **slot, void *item, unsigned order, bool replace) + void __rcu **slot, void *item, bool replace) { if (*slot) return -EEXIST; @@ -786,19 +713,17 @@ static inline int insert_entries(struct radix_tree_node *node, } return 1; } -#endif /** * __radix_tree_insert - insert into a radix tree * @root: radix tree root * @index: index key - * @order: key covers the 2^order indices around index * @item: item to insert * * Insert an item into the radix tree at position @index. */ -int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, - unsigned order, void *item) +int radix_tree_insert(struct radix_tree_root *root, unsigned long index, + void *item) { struct radix_tree_node *node; void __rcu **slot; @@ -806,11 +731,11 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, BUG_ON(radix_tree_is_internal_node(item)); - error = __radix_tree_create(root, index, order, &node, &slot); + error = __radix_tree_create(root, index, &node, &slot); if (error) return error; - error = insert_entries(node, slot, item, order, false); + error = insert_entries(node, slot, item, false); if (error < 0) return error; @@ -825,7 +750,7 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, return 0; } -EXPORT_SYMBOL(__radix_tree_insert); +EXPORT_SYMBOL(radix_tree_insert); /** * __radix_tree_lookup - lookup an item in a radix tree @@ -917,32 +842,12 @@ void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); -static inline void replace_sibling_entries(struct radix_tree_node *node, - void __rcu **slot, int count, int values) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - unsigned offset = get_slot_offset(node, slot); - void *ptr = xa_mk_sibling(offset); - - while (++offset < RADIX_TREE_MAP_SIZE) { - if (rcu_dereference_raw(node->slots[offset]) != ptr) - break; - if (count < 0) { - node->slots[offset] = NULL; - node->count--; - } - node->nr_values += values; - } -#endif -} - static void replace_slot(void __rcu **slot, void *item, struct radix_tree_node *node, int count, int values) { if (node && (count || values)) { node->count += count; node->nr_values += values; - replace_sibling_entries(node, slot, count, values); } rcu_assign_pointer(*slot, item); @@ -1223,14 +1128,6 @@ int radix_tree_tag_get(const struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_get); -static inline void __set_iter_shift(struct radix_tree_iter *iter, - unsigned int shift) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - iter->shift = shift; -#endif -} - /* Construct iter->tags bit-mask from node->tags[tag] array */ static void set_iter_tags(struct radix_tree_iter *iter, struct radix_tree_node *node, unsigned offset, @@ -1257,92 +1154,11 @@ static void set_iter_tags(struct radix_tree_iter *iter, } } -#ifdef CONFIG_RADIX_TREE_MULTIORDER -static void __rcu **skip_siblings(struct radix_tree_node **nodep, - void __rcu **slot, struct radix_tree_iter *iter) -{ - while (iter->index < iter->next_index) { - *nodep = rcu_dereference_raw(*slot); - if (*nodep && !xa_is_sibling(*nodep)) - return slot; - slot++; - iter->index = __radix_tree_iter_add(iter, 1); - iter->tags >>= 1; - } - - *nodep = NULL; - return NULL; -} - -void __rcu **__radix_tree_next_slot(void __rcu **slot, - struct radix_tree_iter *iter, unsigned flags) -{ - unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; - struct radix_tree_node *node; - - slot = skip_siblings(&node, slot, iter); - - while (radix_tree_is_internal_node(node)) { - unsigned offset; - unsigned long next_index; - - if (node == RADIX_TREE_RETRY) - return slot; - node = entry_to_node(node); - iter->node = node; - iter->shift = node->shift; - - if (flags & RADIX_TREE_ITER_TAGGED) { - offset = radix_tree_find_next_bit(node, tag, 0); - if (offset == RADIX_TREE_MAP_SIZE) - return NULL; - slot = &node->slots[offset]; - iter->index = __radix_tree_iter_add(iter, offset); - set_iter_tags(iter, node, offset, tag); - node = rcu_dereference_raw(*slot); - } else { - offset = 0; - slot = &node->slots[0]; - for (;;) { - node = rcu_dereference_raw(*slot); - if (node) - break; - slot++; - offset++; - if (offset == RADIX_TREE_MAP_SIZE) - return NULL; - } - iter->index = __radix_tree_iter_add(iter, offset); - } - if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) - goto none; - next_index = (iter->index | shift_maxindex(iter->shift)) + 1; - if (next_index < iter->next_index) - iter->next_index = next_index; - } - - return slot; - none: - iter->next_index = 0; - return NULL; -} -EXPORT_SYMBOL(__radix_tree_next_slot); -#else -static void __rcu **skip_siblings(struct radix_tree_node **nodep, - void __rcu **slot, struct radix_tree_iter *iter) -{ - return slot; -} -#endif - void __rcu **radix_tree_iter_resume(void __rcu **slot, struct radix_tree_iter *iter) { - struct radix_tree_node *node; - slot++; iter->index = __radix_tree_iter_add(iter, 1); - skip_siblings(&node, slot, iter); iter->next_index = iter->index; iter->tags = 0; return NULL; @@ -1393,7 +1209,6 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, iter->next_index = maxindex + 1; iter->tags = 1; iter->node = NULL; - __set_iter_shift(iter, 0); return (void __rcu **)&root->xa_head; } @@ -1414,8 +1229,6 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, while (++offset < RADIX_TREE_MAP_SIZE) { void *slot = rcu_dereference_raw( node->slots[offset]); - if (xa_is_sibling(slot)) - continue; if (slot) break; } @@ -1436,10 +1249,9 @@ void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root, } while (node->shift && radix_tree_is_internal_node(child)); /* Update the iterator state */ - iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); + iter->index = (index &~ node_maxindex(node)) | offset; iter->next_index = (index | node_maxindex(node)) + 1; iter->node = node; - __set_iter_shift(iter, node->shift); if (flags & RADIX_TREE_ITER_TAGGED) set_iter_tags(iter, node, offset, tag); @@ -1750,7 +1562,6 @@ void __rcu **idr_get_free(struct radix_tree_root *root, else iter->next_index = 1; iter->node = node; - __set_iter_shift(iter, shift); set_iter_tags(iter, node, offset, IDR_FREE); return slot; diff --git a/mm/Kconfig b/mm/Kconfig index de64ea658716..02301a89089e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -379,7 +379,7 @@ config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -671,7 +671,7 @@ config ZONE_DEVICE depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE - select RADIX_TREE_MULTIORDER + select XARRAY_MULTI help Device memory hotplug support allows for establishing pmem, diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h index ca8e03ad19ac..2218b3cc184e 100644 --- a/tools/testing/radix-tree/generated/autoconf.h +++ b/tools/testing/radix-tree/generated/autoconf.h @@ -1,2 +1 @@ -#define CONFIG_RADIX_TREE_MULTIORDER 1 #define CONFIG_XARRAY_MULTI 1 -- cgit v1.2.3