diff options
author | David Chinner <dgc@sgi.com> | 2007-08-28 14:00:13 +1000 |
---|---|---|
committer | Tim Shimmin <tes@chook.melbourne.sgi.com> | 2007-10-15 16:50:50 +1000 |
commit | da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb (patch) | |
tree | 84454023d649df67cc6b125c73746ddb341ac34e /fs/xfs/xfs_iget.c | |
parent | 39cd9f877e63ce7e02cdc7f5dbf1b908451c9532 (diff) | |
download | linux-stable-da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb.tar.gz linux-stable-da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb.tar.bz2 linux-stable-da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb.zip |
[XFS] Radix tree based inode caching
One of the perpetual scaling problems XFS has is indexing it's incore
inodes. We currently uses hashes and the default hash sizes chosen can
only ever be a tradeoff between memory consumption and the maximum
realistic size of the cache.
As a result, anyone who has millions of inodes cached on a filesystem
needs to tunes the size of the cache via the ihashsize mount option to
allow decent scalability with inode cache operations.
A further problem is the separate inode cluster hash, whose size is based
on the ihashsize but is smaller, and so under certain conditions (sparse
cluster cache population) this can become a limitation long before the
inode hash is causing issues.
The following patchset removes the inode hash and cluster hash and
replaces them with radix trees to avoid the scalability limitations of the
hashes. It also reduces the size of the inodes by 3 pointers....
SGI-PV: 969561
SGI-Modid: xfs-linux-melb:xfs-kern:29481a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_iget.c')
-rw-r--r-- | fs/xfs/xfs_iget.c | 585 |
1 files changed, 195 insertions, 390 deletions
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 114433a22baa..e07dcc1b70a6 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -40,131 +40,13 @@ #include "xfs_utils.h" /* - * Initialize the inode hash table for the newly mounted file system. - * Choose an initial table size based on user specified value, else - * use a simple algorithm using the maximum number of inodes as an - * indicator for table size, and clamp it between one and some large - * number of pages. - */ -void -xfs_ihash_init(xfs_mount_t *mp) -{ - __uint64_t icount; - uint i; - - if (!mp->m_ihsize) { - icount = mp->m_maxicount ? mp->m_maxicount : - (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog); - mp->m_ihsize = 1 << max_t(uint, 8, - (xfs_highbit64(icount) + 1) / 2); - mp->m_ihsize = min_t(uint, mp->m_ihsize, - (64 * NBPP) / sizeof(xfs_ihash_t)); - } - - mp->m_ihash = kmem_zalloc_greedy(&mp->m_ihsize, - NBPC * sizeof(xfs_ihash_t), - mp->m_ihsize * sizeof(xfs_ihash_t), - KM_SLEEP | KM_MAYFAIL | KM_LARGE); - mp->m_ihsize /= sizeof(xfs_ihash_t); - for (i = 0; i < mp->m_ihsize; i++) - rwlock_init(&(mp->m_ihash[i].ih_lock)); -} - -/* - * Free up structures allocated by xfs_ihash_init, at unmount time. - */ -void -xfs_ihash_free(xfs_mount_t *mp) -{ - kmem_free(mp->m_ihash, mp->m_ihsize * sizeof(xfs_ihash_t)); - mp->m_ihash = NULL; -} - -/* - * Initialize the inode cluster hash table for the newly mounted file system. - * Its size is derived from the ihash table size. - */ -void -xfs_chash_init(xfs_mount_t *mp) -{ - uint i; - - mp->m_chsize = max_t(uint, 1, mp->m_ihsize / - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)); - mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize); - mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize - * sizeof(xfs_chash_t), - KM_SLEEP | KM_LARGE); - for (i = 0; i < mp->m_chsize; i++) { - spinlock_init(&mp->m_chash[i].ch_lock,"xfshash"); - } -} - -/* - * Free up structures allocated by xfs_chash_init, at unmount time. - */ -void -xfs_chash_free(xfs_mount_t *mp) -{ - int i; - - for (i = 0; i < mp->m_chsize; i++) { - spinlock_destroy(&mp->m_chash[i].ch_lock); - } - - kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t)); - mp->m_chash = NULL; -} - -/* - * Try to move an inode to the front of its hash list if possible - * (and if its not there already). Called right after obtaining - * the list version number and then dropping the read_lock on the - * hash list in question (which is done right after looking up the - * inode in question...). - */ -STATIC void -xfs_ihash_promote( - xfs_ihash_t *ih, - xfs_inode_t *ip, - ulong version) -{ - xfs_inode_t *iq; - - if ((ip->i_prevp != &ih->ih_next) && write_trylock(&ih->ih_lock)) { - if (likely(version == ih->ih_version)) { - /* remove from list */ - if ((iq = ip->i_next)) { - iq->i_prevp = ip->i_prevp; - } - *ip->i_prevp = iq; - - /* insert at list head */ - iq = ih->ih_next; - iq->i_prevp = &ip->i_next; - ip->i_next = iq; - ip->i_prevp = &ih->ih_next; - ih->ih_next = ip; - } - write_unlock(&ih->ih_lock); - } -} - -/* * Look up an inode by number in the given file system. - * The inode is looked up in the hash table for the file system - * represented by the mount point parameter mp. Each bucket of - * the hash table is guarded by an individual semaphore. - * - * If the inode is found in the hash table, its corresponding vnode - * is obtained with a call to vn_get(). This call takes care of - * coordination with the reclamation of the inode and vnode. Note - * that the vmap structure is filled in while holding the hash lock. - * This gives us the state of the inode/vnode when we found it and - * is used for coordination in vn_get(). + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, attach it to the provided + * vnode. * - * If it is not in core, read it in from the file system's device and - * add the inode into the hash table. + * If it is not in core, read it in from the file system's device, + * add it to the cache and attach the provided vnode. * * The inode is locked according to the value of the lock_flags parameter. * This flag parameter indicates how and if the inode's IO lock and inode lock @@ -192,274 +74,241 @@ xfs_iget_core( xfs_inode_t **ipp, xfs_daddr_t bno) { - xfs_ihash_t *ih; xfs_inode_t *ip; xfs_inode_t *iq; bhv_vnode_t *inode_vp; - ulong version; int error; - /* REFERENCED */ - xfs_chash_t *ch; - xfs_chashlist_t *chl, *chlnew; - SPLDECL(s); + xfs_icluster_t *icl, *new_icl = NULL; + unsigned long first_index, mask; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* the radix tree exists only in inode capable AGs */ + if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_get_perag(mp, ino); + if (!pag->pagi_inodeok) + return EINVAL; + ASSERT(pag->pag_ici_init); + agino = XFS_INO_TO_AGINO(mp, ino); +again: + read_lock(&pag->pag_ici_lock); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); - ih = XFS_IHASH(mp, ino); + if (ip != NULL) { + /* + * If INEW is set this inode is being set up + * we need to pause and try again. + */ + if (xfs_iflags_test(ip, XFS_INEW)) { + read_unlock(&pag->pag_ici_lock); + delay(1); + XFS_STATS_INC(xs_ig_frecycle); -again: - read_lock(&ih->ih_lock); + goto again; + } - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == ino) { + inode_vp = XFS_ITOV_NULL(ip); + if (inode_vp == NULL) { /* - * If INEW is set this inode is being set up + * If IRECLAIM is set this inode is + * on its way out of the system, * we need to pause and try again. */ - if (xfs_iflags_test(ip, XFS_INEW)) { - read_unlock(&ih->ih_lock); + if (xfs_iflags_test(ip, XFS_IRECLAIM)) { + read_unlock(&pag->pag_ici_lock); delay(1); XFS_STATS_INC(xs_ig_frecycle); goto again; } + ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - inode_vp = XFS_ITOV_NULL(ip); - if (inode_vp == NULL) { - /* - * If IRECLAIM is set this inode is - * on its way out of the system, - * we need to pause and try again. - */ - if (xfs_iflags_test(ip, XFS_IRECLAIM)) { - read_unlock(&ih->ih_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } - ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - - /* - * If lookup is racing with unlink, then we - * should return an error immediately so we - * don't remove it from the reclaim list and - * potentially leak the inode. - */ - if ((ip->i_d.di_mode == 0) && - !(flags & XFS_IGET_CREATE)) { - read_unlock(&ih->ih_lock); - return ENOENT; - } - - /* - * There may be transactions sitting in the - * incore log buffers or being flushed to disk - * at this time. We can't clear the - * XFS_IRECLAIMABLE flag until these - * transactions have hit the disk, otherwise we - * will void the guarantee the flag provides - * xfs_iunpin() - */ - if (xfs_ipincount(ip)) { - read_unlock(&ih->ih_lock); - xfs_log_force(mp, 0, - XFS_LOG_FORCE|XFS_LOG_SYNC); - XFS_STATS_INC(xs_ig_frecycle); - goto again; - } - - vn_trace_exit(vp, "xfs_iget.alloc", - (inst_t *)__return_address); + /* + * If lookup is racing with unlink, then we + * should return an error immediately so we + * don't remove it from the reclaim list and + * potentially leak the inode. + */ + if ((ip->i_d.di_mode == 0) && + !(flags & XFS_IGET_CREATE)) { + read_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + return ENOENT; + } - XFS_STATS_INC(xs_ig_found); + /* + * There may be transactions sitting in the + * incore log buffers or being flushed to disk + * at this time. We can't clear the + * XFS_IRECLAIMABLE flag until these + * transactions have hit the disk, otherwise we + * will void the guarantee the flag provides + * xfs_iunpin() + */ + if (xfs_ipincount(ip)) { + read_unlock(&pag->pag_ici_lock); + xfs_log_force(mp, 0, + XFS_LOG_FORCE|XFS_LOG_SYNC); + XFS_STATS_INC(xs_ig_frecycle); + goto again; + } - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); + vn_trace_exit(vp, "xfs_iget.alloc", + (inst_t *)__return_address); - XFS_MOUNT_ILOCK(mp); - list_del_init(&ip->i_reclaim); - XFS_MOUNT_IUNLOCK(mp); + XFS_STATS_INC(xs_ig_found); - goto finish_inode; + xfs_iflags_clear(ip, XFS_IRECLAIMABLE); + read_unlock(&pag->pag_ici_lock); - } else if (vp != inode_vp) { - struct inode *inode = vn_to_inode(inode_vp); + XFS_MOUNT_ILOCK(mp); + list_del_init(&ip->i_reclaim); + XFS_MOUNT_IUNLOCK(mp); - /* The inode is being torn down, pause and - * try again. - */ - if (inode->i_state & (I_FREEING | I_CLEAR)) { - read_unlock(&ih->ih_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); + goto finish_inode; - goto again; - } -/* Chances are the other vnode (the one in the inode) is being torn - * down right now, and we landed on top of it. Question is, what do - * we do? Unhook the old inode and hook up the new one? - */ - cmn_err(CE_PANIC, - "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p", - inode_vp, vp); - } + } else if (vp != inode_vp) { + struct inode *inode = vn_to_inode(inode_vp); - /* - * Inode cache hit: if ip is not at the front of - * its hash chain, move it there now. - * Do this with the lock held for update, but - * do statistics after releasing the lock. + /* The inode is being torn down, pause and + * try again. */ - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); - XFS_STATS_INC(xs_ig_found); + if (inode->i_state & (I_FREEING | I_CLEAR)) { + read_unlock(&pag->pag_ici_lock); + delay(1); + XFS_STATS_INC(xs_ig_frecycle); -finish_inode: - if (ip->i_d.di_mode == 0) { - if (!(flags & XFS_IGET_CREATE)) - return ENOENT; - xfs_iocore_inode_reinit(ip); + goto again; } +/* Chances are the other vnode (the one in the inode) is being torn +* down right now, and we landed on top of it. Question is, what do +* we do? Unhook the old inode and hook up the new one? +*/ + cmn_err(CE_PANIC, + "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p", + inode_vp, vp); + } - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); + /* + * Inode cache hit + */ + read_unlock(&pag->pag_ici_lock); + XFS_STATS_INC(xs_ig_found); - xfs_iflags_clear(ip, XFS_ISTALE); - vn_trace_exit(vp, "xfs_iget.found", - (inst_t *)__return_address); - goto return_ip; +finish_inode: + if (ip->i_d.di_mode == 0) { + if (!(flags & XFS_IGET_CREATE)) { + xfs_put_perag(mp, pag); + return ENOENT; + } + xfs_iocore_inode_reinit(ip); } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE); + vn_trace_exit(vp, "xfs_iget.found", + (inst_t *)__return_address); + goto return_ip; } /* - * Inode cache miss: save the hash chain version stamp and unlock - * the chain, so we don't deadlock in vn_alloc. + * Inode cache miss */ + read_unlock(&pag->pag_ici_lock); XFS_STATS_INC(xs_ig_missed); - version = ih->ih_version; - - read_unlock(&ih->ih_lock); - /* * Read the disk inode attributes into a new inode structure and get * a new vnode for it. This should also initialize i_ino and i_mount. */ error = xfs_iread(mp, tp, ino, &ip, bno, (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0); - if (error) + if (error) { + xfs_put_perag(mp, pag); return error; + } vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); xfs_inode_lock_init(ip, vp); xfs_iocore_inode_init(ip); - if (lock_flags) xfs_ilock(ip, lock_flags); if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { xfs_idestroy(ip); + xfs_put_perag(mp, pag); return ENOENT; } /* - * Put ip on its hash chain, unless someone else hashed a duplicate - * after we released the hash lock. + * This is a bit messy - we preallocate everything we _might_ + * need before we pick up the ici lock. That way we don't have to + * juggle locks and go all the way back to the start. */ - write_lock(&ih->ih_lock); + new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP); + if (radix_tree_preload(GFP_KERNEL)) { + delay(1); + goto again; + } + mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); + first_index = agino & mask; + write_lock(&pag->pag_ici_lock); - if (ih->ih_version != version) { - for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) { - if (iq->i_ino == ino) { - write_unlock(&ih->ih_lock); - xfs_idestroy(ip); + /* + * Find the cluster if it exists + */ + icl = NULL; + if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq, + first_index, 1)) { + if ((iq->i_ino & mask) == first_index) + icl = iq->i_cluster; + } - XFS_STATS_INC(xs_ig_dup); - goto again; - } - } + /* + * insert the new inode + */ + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + BUG_ON(error != -EEXIST); + write_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + xfs_idestroy(ip); + XFS_STATS_INC(xs_ig_dup); + goto again; } /* * These values _must_ be set before releasing ihlock! */ - ip->i_hash = ih; - if ((iq = ih->ih_next)) { - iq->i_prevp = &ip->i_next; - } - ip->i_next = iq; - ip->i_prevp = &ih->ih_next; - ih->ih_next = ip; ip->i_udquot = ip->i_gdquot = NULL; - ih->ih_version++; xfs_iflags_set(ip, XFS_INEW); - write_unlock(&ih->ih_lock); - /* - * put ip on its cluster's hash chain - */ - ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL && - ip->i_cnext == NULL); - - chlnew = NULL; - ch = XFS_CHASH(mp, ip->i_blkno); - chlredo: - s = mutex_spinlock(&ch->ch_lock); - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - - /* insert this inode into the doubly-linked list - * where chl points */ - if ((iq = chl->chl_ip)) { - ip->i_cprev = iq->i_cprev; - iq->i_cprev->i_cnext = ip; - iq->i_cprev = ip; - ip->i_cnext = iq; - } else { - ip->i_cnext = ip; - ip->i_cprev = ip; - } - chl->chl_ip = ip; - ip->i_chash = chl; - break; - } - } + ASSERT(ip->i_cluster == NULL); - /* no hash list found for this block; add a new hash list */ - if (chl == NULL) { - if (chlnew == NULL) { - mutex_spinunlock(&ch->ch_lock, s); - ASSERT(xfs_chashlist_zone != NULL); - chlnew = (xfs_chashlist_t *) - kmem_zone_alloc(xfs_chashlist_zone, - KM_SLEEP); - ASSERT(chlnew != NULL); - goto chlredo; - } else { - ip->i_cnext = ip; - ip->i_cprev = ip; - ip->i_chash = chlnew; - chlnew->chl_ip = ip; - chlnew->chl_blkno = ip->i_blkno; - if (ch->ch_list) - ch->ch_list->chl_prev = chlnew; - chlnew->chl_next = ch->ch_list; - chlnew->chl_prev = NULL; - ch->ch_list = chlnew; - chlnew = NULL; - } + if (!icl) { + spin_lock_init(&new_icl->icl_lock); + INIT_HLIST_HEAD(&new_icl->icl_inodes); + icl = new_icl; + new_icl = NULL; } else { - if (chlnew != NULL) { - kmem_zone_free(xfs_chashlist_zone, chlnew); - } + ASSERT(!hlist_empty(&icl->icl_inodes)); } + spin_lock(&icl->icl_lock); + hlist_add_head(&ip->i_cnode, &icl->icl_inodes); + ip->i_cluster = icl; + spin_unlock(&icl->icl_lock); - mutex_spinunlock(&ch->ch_lock, s); - + write_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (new_icl) + kmem_zone_free(xfs_icluster_zone, new_icl); /* * Link ip to its mount and thread it on the mount's inode list. @@ -478,6 +327,7 @@ finish_inode: mp->m_inodes = ip; XFS_MOUNT_IUNLOCK(mp); + xfs_put_perag(mp, pag); return_ip: ASSERT(ip->i_df.if_ext_max == @@ -587,32 +437,19 @@ xfs_inode_incore(xfs_mount_t *mp, xfs_ino_t ino, xfs_trans_t *tp) { - xfs_ihash_t *ih; xfs_inode_t *ip; - ulong version; - - ih = XFS_IHASH(mp, ino); - read_lock(&ih->ih_lock); - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == ino) { - /* - * If we find it and tp matches, return it. - * Also move it to the front of the hash list - * if we find it and it is not already there. - * Otherwise break from the loop and return - * NULL. - */ - if (ip->i_transp == tp) { - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); - return (ip); - } - break; - } - } - read_unlock(&ih->ih_lock); - return (NULL); + xfs_perag_t *pag; + + pag = xfs_get_perag(mp, ino); + read_lock(&pag->pag_ici_lock); + ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino)); + read_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); + + /* the returned inode must match the transaction */ + if (ip && (ip->i_transp != tp)) + return NULL; + return ip; } /* @@ -718,58 +555,26 @@ void xfs_iextract( xfs_inode_t *ip) { - xfs_ihash_t *ih; + xfs_mount_t *mp = ip->i_mount; + xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); xfs_inode_t *iq; - xfs_mount_t *mp; - xfs_chash_t *ch; - xfs_chashlist_t *chl, *chm; - SPLDECL(s); - - ih = ip->i_hash; - write_lock(&ih->ih_lock); - if ((iq = ip->i_next)) { - iq->i_prevp = ip->i_prevp; - } - *ip->i_prevp = iq; - ih->ih_version++; - write_unlock(&ih->ih_lock); + + write_lock(&pag->pag_ici_lock); + radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino)); + write_unlock(&pag->pag_ici_lock); + xfs_put_perag(mp, pag); /* - * Remove from cluster hash list - * 1) delete the chashlist if this is the last inode on the chashlist - * 2) unchain from list of inodes - * 3) point chashlist->chl_ip to 'chl_next' if to this inode. + * Remove from cluster list */ mp = ip->i_mount; - ch = XFS_CHASH(mp, ip->i_blkno); - s = mutex_spinlock(&ch->ch_lock); - - if (ip->i_cnext == ip) { - /* Last inode on chashlist */ - ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); - ASSERT(ip->i_chash != NULL); - chm=NULL; - chl = ip->i_chash; - if (chl->chl_prev) - chl->chl_prev->chl_next = chl->chl_next; - else - ch->ch_list = chl->chl_next; - if (chl->chl_next) - chl->chl_next->chl_prev = chl->chl_prev; - kmem_zone_free(xfs_chashlist_zone, chl); - } else { - /* delete one inode from a non-empty list */ - iq = ip->i_cnext; - iq->i_cprev = ip->i_cprev; - ip->i_cprev->i_cnext = iq; - if (ip->i_chash->chl_ip == ip) { - ip->i_chash->chl_ip = iq; - } - ip->i_chash = __return_address; - ip->i_cprev = __return_address; - ip->i_cnext = __return_address; - } - mutex_spinunlock(&ch->ch_lock, s); + spin_lock(&ip->i_cluster->icl_lock); + hlist_del(&ip->i_cnode); + spin_unlock(&ip->i_cluster->icl_lock); + + /* was last inode in cluster? */ + if (hlist_empty(&ip->i_cluster->icl_inodes)) + kmem_zone_free(xfs_icluster_zone, ip->i_cluster); /* * Remove from mount's inode list. |