From 9c12a831d73dd938a22418d70b39aed4feb4bdf2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 16 Sep 2013 08:24:26 -0400 Subject: ext4: fix performance regression in writeback of random writes The Linux Kernel Performance project guys have reported that commit 4e7ea81db5 introduces a performance regression for the following fio workload: [global] direct=0 ioengine=mmap size=1500M bs=4k pre_read=1 numjobs=1 overwrite=1 loops=5 runtime=300 group_reporting invalidate=0 directory=/mnt/ file_service_type=random:36 file_service_type=random:36 [job0] startdelay=0 rw=randrw filename=data0/f1:data0/f2 [job1] startdelay=0 rw=randrw filename=data0/f2:data0/f1 ... [job7] startdelay=0 rw=randrw filename=data0/f2:data0/f1 The culprit of the problem is that after the commit ext4_writepages() are more aggressive in writing back pages. Thus we have less consecutive dirty pages resulting in more seeking. This increased aggressivity is caused by a bug in the condition terminating ext4_writepages(). We start writing from the beginning of the file even if we should have terminated ext4_writepages() because wbc->nr_to_write <= 0. After fixing the condition the throughput of the fio workload is about 20% better than before writeback reorganization. Reported-by: "Yan, Zheng" Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9115f2807515..4cf2619f007c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2559,7 +2559,7 @@ retry: break; } blk_finish_plug(&plug); - if (!ret && !cycled) { + if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd.last_page = writeback_index - 1; mpd.first_page = 0; -- cgit v1.2.3 From 8660998608cfa1077e560034db81885af8e1e885 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Fri, 6 Sep 2013 21:49:56 -0500 Subject: jfs: fix error path in ialloc If insert_inode_locked() fails, we shouldn't be calling unlock_new_inode(). Signed-off-by: Dave Kleikamp Tested-by: Michael L. Semon Cc: stable@vger.kernel.org --- fs/jfs/jfs_inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index c1a3e603279c..7f464c513ba0 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -95,7 +95,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) if (insert_inode_locked(inode) < 0) { rc = -EINVAL; - goto fail_unlock; + goto fail_put; } inode_init_owner(inode, parent, mode); @@ -156,7 +156,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode) fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; -fail_unlock: clear_nlink(inode); unlock_new_inode(inode); fail_put: -- cgit v1.2.3 From bde52788bdb755b9e4b75db6c434f30e32a0ca0b Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Fri, 13 Sep 2013 19:19:54 +0400 Subject: fuse: wait for writeback in fuse_file_fallocate() The patch fixes a race between mmap-ed write and fallocate(PUNCH_HOLE): 1) An user makes a page dirty via mmap-ed write. 2) The user performs fallocate(2) with mode == PUNCH_HOLE|KEEP_SIZE and covering the page. 3) Before truncate_pagecache_range call from fuse_file_fallocate, the page goes to write-back. The page is fully processed by fuse_writepage (including end_page_writeback on the page), but fuse_flush_writepages did nothing because fi->writectr < 0. 4) truncate_pagecache_range is called and fuse_file_fallocate is finishing by calling fuse_release_nowrite. The latter triggers processing queued write-back request which will write stale data to the hole soon. Changed in v2 (thanks to Brian for suggestion): - Do not truncate page cache until FUSE_FALLOCATE succeeded. Otherwise, we can end up in returning -ENOTSUPP while user data is already punched from page cache. Use filemap_write_and_wait_range() instead. Changed in v3 (thanks to Miklos for suggestion): - fuse_wait_on_writeback() is prone to livelocks; use fuse_set_nowrite() instead. So far as we need a dirty-page barrier only, fuse_sync_writes() should be enough. - rebased to for-linus branch of fuse.git Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/file.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d409deafc67b..f9f07c4fa517 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2484,8 +2484,15 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (lock_inode) { mutex_lock(&inode->i_mutex); - if (mode & FALLOC_FL_PUNCH_HOLE) - fuse_set_nowrite(inode); + if (mode & FALLOC_FL_PUNCH_HOLE) { + loff_t endbyte = offset + length - 1; + err = filemap_write_and_wait_range(inode->i_mapping, + offset, endbyte); + if (err) + goto out; + + fuse_sync_writes(inode); + } } req = fuse_get_req_nopages(fc); @@ -2520,11 +2527,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, fuse_invalidate_attr(inode); out: - if (lock_inode) { - if (mode & FALLOC_FL_PUNCH_HOLE) - fuse_release_nowrite(inode); + if (lock_inode) mutex_unlock(&inode->i_mutex); - } return err; } -- cgit v1.2.3 From 0ab08f576b9e6a6b689fc6b4e632079b978e619b Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Fri, 13 Sep 2013 19:20:16 +0400 Subject: fuse: fix fallocate vs. ftruncate race A former patch introducing FUSE_I_SIZE_UNSTABLE flag provided detailed description of races between ftruncate and anyone who can extend i_size: > 1. As in the previous scenario fuse_dentry_revalidate() discovered that i_size > changed (due to our own fuse_do_setattr()) and is going to call > truncate_pagecache() for some 'new_size' it believes valid right now. But by > the time that particular truncate_pagecache() is called ... > 2. fuse_do_setattr() returns (either having called truncate_pagecache() or > not -- it doesn't matter). > 3. The file is extended either by write(2) or ftruncate(2) or fallocate(2). > 4. mmap-ed write makes a page in the extended region dirty. This patch adds necessary bits to fuse_file_fallocate() to protect from that race. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/file.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f9f07c4fa517..4598345ab87d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2467,6 +2467,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, { struct fuse_file *ff = file->private_data; struct inode *inode = file->f_inode; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = ff->fc; struct fuse_req *req; struct fuse_fallocate_in inarg = { @@ -2495,6 +2496,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } } + if (!(mode & FALLOC_FL_KEEP_SIZE)) + set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -2527,6 +2531,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, fuse_invalidate_attr(inode); out: + if (!(mode & FALLOC_FL_KEEP_SIZE)) + clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (lock_inode) mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 54afa99057ee2ffd3df0f5e891298bbbb65ea63c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Sep 2013 17:10:39 +0000 Subject: CIFS: FS-Cache: Uncache unread pages in cifs_readpages() before freeing them In cifs_readpages(), we may decide we don't want to read a page after all - but the page may already have passed through fscache_read_or_alloc_pages() and thus have marks and reservations set. Thus we have to call fscache_readpages_cancel() or fscache_uncache_page() on the pages we're returning to clear the marks. NFS, AFS and 9P should be unaffected by this as they call read_cache_pages() which does the cleanup for you. Signed-off-by: David Howells Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 8 ++++++++ fs/cifs/fscache.c | 7 +++++++ fs/cifs/fscache.h | 13 +++++++++++++ 3 files changed, 28 insertions(+) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index eb955b525e55..7ddddf2e2504 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3254,6 +3254,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, /* * Reads as many pages as possible from fscache. Returns -ENOBUFS * immediately if the cookie is negative + * + * After this point, every page in the list might have PG_fscache set, + * so we will need to clean that up off of every page we don't use. */ rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list, &num_pages); @@ -3376,6 +3379,11 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, kref_put(&rdata->refcount, cifs_readdata_release); } + /* Any pages that have been shown to fscache but didn't get added to + * the pagecache must be uncached before they get returned to the + * allocator. + */ + cifs_fscache_readpages_cancel(mapping->host, page_list); return rc; } diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 2f4bc5a58054..b3258f35e88a 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -223,6 +223,13 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) fscache_uncache_page(CIFS_I(inode)->fscache, page); } +void __cifs_fscache_readpages_cancel(struct inode *inode, struct list_head *pages) +{ + cifs_dbg(FYI, "%s: (fsc: %p, i: %p)\n", + __func__, CIFS_I(inode)->fscache, inode); + fscache_readpages_cancel(CIFS_I(inode)->fscache, pages); +} + void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) { struct cifsInodeInfo *cifsi = CIFS_I(inode); diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 63539323e0b9..24794b6cd8ec 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -54,6 +54,7 @@ extern int __cifs_readpages_from_fscache(struct inode *, struct address_space *, struct list_head *, unsigned *); +extern void __cifs_fscache_readpages_cancel(struct inode *, struct list_head *); extern void __cifs_readpage_to_fscache(struct inode *, struct page *); @@ -91,6 +92,13 @@ static inline void cifs_readpage_to_fscache(struct inode *inode, __cifs_readpage_to_fscache(inode, page); } +static inline void cifs_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ + if (CIFS_I(inode)->fscache) + return __cifs_fscache_readpages_cancel(inode, pages); +} + #else /* CONFIG_CIFS_FSCACHE */ static inline int cifs_fscache_register(void) { return 0; } static inline void cifs_fscache_unregister(void) {} @@ -131,6 +139,11 @@ static inline int cifs_readpages_from_fscache(struct inode *inode, static inline void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {} +static inline void cifs_fscache_readpages_cancel(struct inode *inode, + struct list_head *pages) +{ +} + #endif /* CONFIG_CIFS_FSCACHE */ #endif /* _CIFS_FSCACHE_H */ -- cgit v1.2.3 From 9ae6cf606a33b0a762798df0fb742848bcc685b5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 16 Sep 2013 11:23:45 -0400 Subject: cifs: stop trying to use virtual circuits Currently, we try to ensure that we use vcnum of 0 on the first established session on a connection and then try to use a different vcnum on each session after that. This is a little odd, since there's no real reason to use a different vcnum for each SMB session. I can only assume there was some confusion between SMB sessions and VCs. That's somewhat understandable since they both get created during SESSION_SETUP, but the documentation indicates that they are really orthogonal. The comment on max_vcs in particular looks quite misguided. An SMB session is already uniquely identified by the SMB UID value -- there's no need to again uniquely ID with a VC. Furthermore, a vcnum of 0 is a cue to the server that it should release any resources that were previously held by the client. This sounds like a good thing, until you consider that: a) it totally ignores the fact that other programs on the box (e.g. smbclient) might have connections established to the server. Using a vcnum of 0 causes them to get kicked off. b) it causes problems with NAT. If several clients are connected to the same server via the same NAT'ed address, whenever one connects to the server it kicks off all the others, which then reconnect and kick off the first one...ad nauseum. I don't see any reason to ignore the advice in "Implementing CIFS" which has a comprehensive treatment of virtual circuits. In there, it states "...and contrary to the specs the client should always use a VcNumber of one, never zero." Have the client just use a hardcoded vcnum of 1, and stop abusing the special behavior of vcnum 0. Reported-by: Sauron99@gmx.de Signed-off-by: Jeff Layton Reviewed-by: Volker Lendecke Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 4 --- fs/cifs/cifssmb.c | 1 - fs/cifs/sess.c | 84 +----------------------------------------------------- 3 files changed, 1 insertion(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index cfa14c80ef3b..9c72be6fb0df 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -547,9 +547,6 @@ struct TCP_Server_Info { unsigned int max_rw; /* maxRw specifies the maximum */ /* message size the server can send or receive for */ /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ - unsigned int max_vcs; /* maximum number of smb sessions, at least - those that can be specified uniquely with - vcnumbers */ unsigned int capabilities; /* selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ __u64 CurrentMid; /* multiplex id - rotating counter */ @@ -715,7 +712,6 @@ struct cifs_ses { enum statusEnum status; unsigned overrideSecFlg; /* if non-zero override global sec flags */ __u16 ipc_tid; /* special tid for connection to IPC share */ - __u16 vcnum; char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a3d74fea1623..4baf35949b51 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -463,7 +463,6 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) cifs_max_pending); set_credits(server, server->maxReq); server->maxBuf = le16_to_cpu(rsp->MaxBufSize); - server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 5f99b7f19e78..352358de1d7e 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -32,88 +32,6 @@ #include #include "cifs_spnego.h" -/* - * Checks if this is the first smb session to be reconnected after - * the socket has been reestablished (so we know whether to use vc 0). - * Called while holding the cifs_tcp_ses_lock, so do not block - */ -static bool is_first_ses_reconnect(struct cifs_ses *ses) -{ - struct list_head *tmp; - struct cifs_ses *tmp_ses; - - list_for_each(tmp, &ses->server->smb_ses_list) { - tmp_ses = list_entry(tmp, struct cifs_ses, - smb_ses_list); - if (tmp_ses->need_reconnect == false) - return false; - } - /* could not find a session that was already connected, - this must be the first one we are reconnecting */ - return true; -} - -/* - * vc number 0 is treated specially by some servers, and should be the - * first one we request. After that we can use vcnumbers up to maxvcs, - * one for each smb session (some Windows versions set maxvcs incorrectly - * so maxvc=1 can be ignored). If we have too many vcs, we can reuse - * any vc but zero (some servers reset the connection on vcnum zero) - * - */ -static __le16 get_next_vcnum(struct cifs_ses *ses) -{ - __u16 vcnum = 0; - struct list_head *tmp; - struct cifs_ses *tmp_ses; - __u16 max_vcs = ses->server->max_vcs; - __u16 i; - int free_vc_found = 0; - - /* Quoting the MS-SMB specification: "Windows-based SMB servers set this - field to one but do not enforce this limit, which allows an SMB client - to establish more virtual circuits than allowed by this value ... but - other server implementations can enforce this limit." */ - if (max_vcs < 2) - max_vcs = 0xFFFF; - - spin_lock(&cifs_tcp_ses_lock); - if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) - goto get_vc_num_exit; /* vcnum will be zero */ - for (i = ses->server->srv_count - 1; i < max_vcs; i++) { - if (i == 0) /* this is the only connection, use vc 0 */ - break; - - free_vc_found = 1; - - list_for_each(tmp, &ses->server->smb_ses_list) { - tmp_ses = list_entry(tmp, struct cifs_ses, - smb_ses_list); - if (tmp_ses->vcnum == i) { - free_vc_found = 0; - break; /* found duplicate, try next vcnum */ - } - } - if (free_vc_found) - break; /* we found a vcnumber that will work - use it */ - } - - if (i == 0) - vcnum = 0; /* for most common case, ie if one smb session, use - vc zero. Also for case when no free vcnum, zero - is safest to send (some clients only send zero) */ - else if (free_vc_found == 0) - vcnum = 1; /* we can not reuse vc=0 safely, since some servers - reset all uids on that, but 1 is ok. */ - else - vcnum = i; - ses->vcnum = vcnum; -get_vc_num_exit: - spin_unlock(&cifs_tcp_ses_lock); - - return cpu_to_le16(vcnum); -} - static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) { __u32 capabilities = 0; @@ -128,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = get_next_vcnum(ses); + pSMB->req.VcNumber = __constant_cpu_to_le16(1); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ -- cgit v1.2.3 From 74d290da476f672ad756634d12aa707375d3564d Mon Sep 17 00:00:00 2001 From: Jim McDonough Date: Sat, 21 Sep 2013 10:36:10 -0500 Subject: [CIFS] Provide sane values for nlink Since we don't get info about the number of links from the readdir linfo levels, stat() will return 0 for st_nlink, and in particular, samba re-exported shares will show directories as files (as samba is keying off st_nlink before evaluating how to set the dos modebits) when doing a dir or ls. Copy nlink to the inode, unless it wasn't provided. Provide sane values if we don't have an existing one and none was provided. Signed-off-by: Jim McDonough Reviewed-by: Jeff Layton Reviewed-by: David Disseldorp Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 1 + fs/cifs/inode.c | 45 +++++++++++++++++++++++++++++++++++++++------ fs/cifs/readdir.c | 3 +++ 3 files changed, 43 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 9c72be6fb0df..52b6f6c26bfc 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1268,6 +1268,7 @@ struct dfs_info3_param { #define CIFS_FATTR_DELETE_PENDING 0x2 #define CIFS_FATTR_NEED_REVAL 0x4 #define CIFS_FATTR_INO_COLLISION 0x8 +#define CIFS_FATTR_UNKNOWN_NLINK 0x10 struct cifs_fattr { u32 cf_flags; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f9ff9c173f78..867b7cdc794a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -120,6 +120,33 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) cifs_i->invalid_mapping = true; } +/* + * copy nlink to the inode, unless it wasn't provided. Provide + * sane values if we don't have an existing one and none was provided + */ +static void +cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) +{ + /* + * if we're in a situation where we can't trust what we + * got from the server (readdir, some non-unix cases) + * fake reasonable values + */ + if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) { + /* only provide fake values on a new inode */ + if (inode->i_state & I_NEW) { + if (fattr->cf_cifsattrs & ATTR_DIRECTORY) + set_nlink(inode, 2); + else + set_nlink(inode, 1); + } + return; + } + + /* we trust the server, so update it */ + set_nlink(inode, fattr->cf_nlink); +} + /* populate an inode with info from a cifs_fattr struct */ void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) @@ -134,7 +161,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_mtime = fattr->cf_mtime; inode->i_ctime = fattr->cf_ctime; inode->i_rdev = fattr->cf_rdev; - set_nlink(inode, fattr->cf_nlink); + cifs_nlink_fattr_to_inode(inode, fattr); inode->i_uid = fattr->cf_uid; inode->i_gid = fattr->cf_gid; @@ -541,6 +568,7 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, fattr->cf_bytes = le64_to_cpu(info->AllocationSize); fattr->cf_createtime = le64_to_cpu(info->CreationTime); + fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); if (fattr->cf_cifsattrs & ATTR_DIRECTORY) { fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode; fattr->cf_dtype = DT_DIR; @@ -548,7 +576,8 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, * Server can return wrong NumberOfLinks value for directories * when Unix extensions are disabled - fake it. */ - fattr->cf_nlink = 2; + if (!tcon->unix_ext) + fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; } else if (fattr->cf_cifsattrs & ATTR_REPARSE) { fattr->cf_mode = S_IFLNK; fattr->cf_dtype = DT_LNK; @@ -561,11 +590,15 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (fattr->cf_cifsattrs & ATTR_READONLY) fattr->cf_mode &= ~(S_IWUGO); - fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks); - if (fattr->cf_nlink < 1) { - cifs_dbg(1, "replacing bogus file nlink value %u\n", + /* + * Don't accept zero nlink from non-unix servers unless + * delete is pending. Instead mark it as unknown. + */ + if ((fattr->cf_nlink < 1) && !tcon->unix_ext && + !info->DeletePending) { + cifs_dbg(1, "bogus file nlink value %u\n", fattr->cf_nlink); - fattr->cf_nlink = 1; + fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; } } diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 42ef03be089f..53a75f3d0179 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -180,6 +180,9 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) fattr->cf_dtype = DT_REG; } + /* non-unix readdir doesn't provide nlink */ + fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK; + if (fattr->cf_cifsattrs & ATTR_READONLY) fattr->cf_mode &= ~S_IWUGO; -- cgit v1.2.3 From 05c715f2a924dae2b2f6396b0109013a24502f2c Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 25 Sep 2013 18:58:13 -0500 Subject: [CIFS] Remove ext2 flags that have been moved to fs.h These flags were unused by cifs and since the EXT flags have been moved to common code in uapi/linux/fs.h we won't need to have a cifs specific copy. Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 948676db8e2e..a630475e421c 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2652,26 +2652,7 @@ typedef struct file_xattr_info { } __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info level 0x205 */ - -/* flags for chattr command */ -#define EXT_SECURE_DELETE 0x00000001 /* EXT3_SECRM_FL */ -#define EXT_ENABLE_UNDELETE 0x00000002 /* EXT3_UNRM_FL */ -/* Reserved for compress file 0x4 */ -#define EXT_SYNCHRONOUS 0x00000008 /* EXT3_SYNC_FL */ -#define EXT_IMMUTABLE_FL 0x00000010 /* EXT3_IMMUTABLE_FL */ -#define EXT_OPEN_APPEND_ONLY 0x00000020 /* EXT3_APPEND_FL */ -#define EXT_DO_NOT_BACKUP 0x00000040 /* EXT3_NODUMP_FL */ -#define EXT_NO_UPDATE_ATIME 0x00000080 /* EXT3_NOATIME_FL */ -/* 0x100 through 0x800 reserved for compression flags and are GET-ONLY */ -#define EXT_HASH_TREE_INDEXED_DIR 0x00001000 /* GET-ONLY EXT3_INDEX_FL */ -/* 0x2000 reserved for IMAGIC_FL */ -#define EXT_JOURNAL_THIS_FILE 0x00004000 /* GET-ONLY EXT3_JOURNAL_DATA_FL */ -/* 0x8000 reserved for EXT3_NOTAIL_FL */ -#define EXT_SYNCHRONOUS_DIR 0x00010000 /* EXT3_DIRSYNC_FL */ -#define EXT_TOPDIR 0x00020000 /* EXT3_TOPDIR_FL */ - -#define EXT_SET_MASK 0x000300FF -#define EXT_GET_MASK 0x0003DFFF +/* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */ typedef struct file_chattr_info { __le64 mask; /* list of all possible attribute bits */ -- cgit v1.2.3 From ffe67b58595eda992f369de7ac4d0a9b165074fd Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 25 Sep 2013 19:01:27 -0500 Subject: [CIFS] update cifs.ko version To 2.02 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index ea723a5e8226..6d0b07217ac9 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -132,5 +132,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.01" +#define CIFS_VERSION "2.02" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From 5e9ae2e5da0beb93f8557fc92a8f4fbc05ea448f Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Thu, 26 Sep 2013 20:34:51 -0400 Subject: aio: fix use-after-free in aio_migratepage Dmitry Vyukov managed to trigger a case where aio_migratepage can cause a use-after-free during teardown of the aio ring buffer's mapping. This turns out to be caused by access to the ioctx's ring_pages via the migratepage operation which was not being protected by any locks during ioctx freeing. Use the address_space's private_lock to protect use and updates of the mapping's private_data, and make ioctx teardown unlink the ioctx from the address space. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 52 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 6b868f0e0c4c..067e3d340c35 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -167,10 +167,25 @@ static int __init aio_setup(void) } __initcall(aio_setup); +static void put_aio_ring_file(struct kioctx *ctx) +{ + struct file *aio_ring_file = ctx->aio_ring_file; + if (aio_ring_file) { + truncate_setsize(aio_ring_file->f_inode, 0); + + /* Prevent further access to the kioctx from migratepages */ + spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock); + aio_ring_file->f_inode->i_mapping->private_data = NULL; + ctx->aio_ring_file = NULL; + spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock); + + fput(aio_ring_file); + } +} + static void aio_free_ring(struct kioctx *ctx) { int i; - struct file *aio_ring_file = ctx->aio_ring_file; for (i = 0; i < ctx->nr_pages; i++) { pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, @@ -178,14 +193,10 @@ static void aio_free_ring(struct kioctx *ctx) put_page(ctx->ring_pages[i]); } + put_aio_ring_file(ctx); + if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); - - if (aio_ring_file) { - truncate_setsize(aio_ring_file->f_inode, 0); - fput(aio_ring_file); - ctx->aio_ring_file = NULL; - } } static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) @@ -207,9 +218,8 @@ static int aio_set_page_dirty(struct page *page) static int aio_migratepage(struct address_space *mapping, struct page *new, struct page *old, enum migrate_mode mode) { - struct kioctx *ctx = mapping->private_data; + struct kioctx *ctx; unsigned long flags; - unsigned idx = old->index; int rc; /* Writeback must be complete */ @@ -224,10 +234,23 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, get_page(new); - spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - ctx->ring_pages[idx] = new; - spin_unlock_irqrestore(&ctx->completion_lock, flags); + /* We can potentially race against kioctx teardown here. Use the + * address_space's private data lock to protect the mapping's + * private_data. + */ + spin_lock(&mapping->private_lock); + ctx = mapping->private_data; + if (ctx) { + pgoff_t idx; + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + idx = old->index; + if (idx < (pgoff_t)ctx->nr_pages) + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else + rc = -EBUSY; + spin_unlock(&mapping->private_lock); return rc; } @@ -617,8 +640,7 @@ out_freepcpu: out_freeref: free_percpu(ctx->users.pcpu_count); out_freectx: - if (ctx->aio_ring_file) - fput(ctx->aio_ring_file); + put_aio_ring_file(ctx); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); -- cgit v1.2.3 From 3c70b8eeda596069258772afabf2ab0b1aa017f0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Oct 2013 16:41:22 +0200 Subject: fuse: don't check_submounts_and_drop() in RCU walk If revalidate finds an invalid dentry in RCU walk mode, let the VFS deal with it instead of calling check_submounts_and_drop() which is not prepared for being called from RCU walk. Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 62b43b577bfc..9b16806f11da 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -259,7 +259,8 @@ out: invalid: ret = 0; - if (check_submounts_and_drop(entry) != 0) + + if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0) ret = 1; goto out; } -- cgit v1.2.3 From 6314efee3cfeea2da12dbc05edfa20e5a42391bd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Oct 2013 16:41:22 +0200 Subject: fuse: readdirplus: fix RCU walk Doing dput(parent) is not valid in RCU walk mode. In RCU mode it would probably be okay to update the parent flags, but it's actually not necessary most of the time... So only set the FUSE_I_ADVISE_RDPLUS flag on the parent when the entry was recently initialized by READDIRPLUS. This is achieved by setting FUSE_I_INIT_RDPLUS on entries added by READDIRPLUS and only dropping out of RCU mode if this flag is set. FUSE_I_INIT_RDPLUS is cleared once the FUSE_I_ADVISE_RDPLUS flag is set in the parent. Reported-by: Al Viro Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org --- fs/fuse/dir.c | 12 +++++++++--- fs/fuse/fuse_i.h | 2 ++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 9b16806f11da..a75159846a75 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -182,6 +182,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) struct inode *inode; struct dentry *parent; struct fuse_conn *fc; + struct fuse_inode *fi; int ret; inode = ACCESS_ONCE(entry->d_inode); @@ -228,7 +229,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (!err && !outarg.nodeid) err = -ENOENT; if (!err) { - struct fuse_inode *fi = get_fuse_inode(inode); + fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { fuse_queue_forget(fc, forget, outarg.nodeid, 1); goto invalid; @@ -246,8 +247,11 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { - fc = get_fuse_conn(inode); - if (fc->readdirplus_auto) { + fi = get_fuse_inode(inode); + if (flags & LOOKUP_RCU) { + if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) + return -ECHILD; + } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { parent = dget_parent(entry); fuse_advise_use_readdirplus(parent->d_inode); dput(parent); @@ -1292,6 +1296,8 @@ static int fuse_direntplus_link(struct file *file, } found: + if (fc->readdirplus_auto) + set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); fuse_change_entry_timeout(dentry, o); err = 0; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 5ced199b50bb..5b9e6f3b6aef 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -115,6 +115,8 @@ struct fuse_inode { enum { /** Advise readdirplus */ FUSE_I_ADVISE_RDPLUS, + /** Initialized with readdirplus */ + FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, }; -- cgit v1.2.3 From 698fa1d163c560343b8012a0a916440d076b6c8a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 1 Oct 2013 16:41:23 +0200 Subject: fuse: no RCU mode in fuse_access() fuse_access() is never called in RCU walk, only on the final component of access(2) and chdir(2)... Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index a75159846a75..b7989f2ab4c4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1068,6 +1068,8 @@ static int fuse_access(struct inode *inode, int mask) struct fuse_access_in inarg; int err; + BUG_ON(mask & MAY_NOT_BLOCK); + if (fc->no_access) return 0; @@ -1155,9 +1157,6 @@ static int fuse_permission(struct inode *inode, int mask) noticed immediately, only after the attribute timeout has expired */ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { -- cgit v1.2.3 From 89c6c89af2ef41cb127c9694ef7783e585e96337 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 30 Sep 2013 09:37:03 +1000 Subject: xfs: lockdep needs to know about 3 dquot-deep nesting Michael Semon reported that xfs/299 generated this lockdep warning: ============================================= [ INFO: possible recursive locking detected ] 3.12.0-rc2+ #2 Not tainted --------------------------------------------- touch/21072 is trying to acquire lock: (&xfs_dquot_other_class){+.+...}, at: [] xfs_trans_dqlockedjoin+0x57/0x64 but task is already holding lock: (&xfs_dquot_other_class){+.+...}, at: [] xfs_trans_dqlockedjoin+0x57/0x64 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&xfs_dquot_other_class); lock(&xfs_dquot_other_class); *** DEADLOCK *** May be due to missing lock nesting notation 7 locks held by touch/21072: #0: (sb_writers#10){++++.+}, at: [] mnt_want_write+0x1e/0x3e #1: (&type->i_mutex_dir_key#4){+.+.+.}, at: [] do_last+0x245/0xe40 #2: (sb_internal#2){++++.+}, at: [] xfs_trans_alloc+0x1f/0x35 #3: (&(&ip->i_lock)->mr_lock/1){+.+...}, at: [] xfs_ilock+0x100/0x1f1 #4: (&(&ip->i_lock)->mr_lock){++++-.}, at: [] xfs_ilock_nowait+0x105/0x22f #5: (&dqp->q_qlock){+.+...}, at: [] xfs_trans_dqlockedjoin+0x57/0x64 #6: (&xfs_dquot_other_class){+.+...}, at: [] xfs_trans_dqlockedjoin+0x57/0x64 The lockdep annotation for dquot lock nesting only understands locking for user and "other" dquots, not user, group and quota dquots. Fix the annotations to match the locking heirarchy we now have. Reported-by: Michael L. Semon Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit f112a049712a5c07de25d511c3c6587a2b1a015e) --- fs/xfs/xfs_dquot.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 71520e6e5d65..1ee776d477c3 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -64,7 +64,8 @@ int xfs_dqerror_mod = 33; struct kmem_zone *xfs_qm_dqtrxzone; static struct kmem_zone *xfs_qm_dqzone; -static struct lock_class_key xfs_dquot_other_class; +static struct lock_class_key xfs_dquot_group_class; +static struct lock_class_key xfs_dquot_project_class; /* * This is called to free all the memory associated with a dquot @@ -703,8 +704,20 @@ xfs_qm_dqread( * Make sure group quotas have a different lock class than user * quotas. */ - if (!(type & XFS_DQ_USER)) - lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class); + switch (type) { + case XFS_DQ_USER: + /* uses the default lock class */ + break; + case XFS_DQ_GROUP: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); + break; + case XFS_DQ_PROJ: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); + break; + default: + ASSERT(0); + break; + } XFS_STATS_INC(xs_qm_dquot); -- cgit v1.2.3 From 6d313498f035abc9d8ad3a1b3295f133bfab9638 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 30 Sep 2013 09:37:04 +1000 Subject: xfs: dirent dtype presence is dependent on directory magic numbers The determination of whether a directory entry contains a dtype field originally was dependent on the filesystem having CRCs enabled. This meant that the format for dtype beign enabled could be determined by checking the directory block magic number rather than doing a feature bit check. This was useful in that it meant that we didn't need to pass a struct xfs_mount around to functions that were already supplied with a directory block header. Unfortunately, the introduction of dtype fields into the v4 structure via a feature bit meant this "use the directory block magic number" method of discriminating the dirent entry sizes is broken. Hence we need to convert the places that use magic number checks to use feature bit checks so that they work correctly and not by chance. The current code works on v4 filesystems only because the dirent size roundup covers the extra byte needed by the dtype field in the places where this problem occurs. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers (cherry picked from commit 367993e7c6428cb7617ab7653d61dca54e2fdede) --- fs/xfs/xfs_dir2_block.c | 6 +++--- fs/xfs/xfs_dir2_format.h | 51 +++++++++++++++++++---------------------------- fs/xfs/xfs_dir2_readdir.c | 4 ++-- fs/xfs/xfs_dir2_sf.c | 6 +++--- 4 files changed, 28 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 0957aa98b6c0..12dad188939d 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1158,7 +1158,7 @@ xfs_dir2_sf_to_block( /* * Create entry for . */ - dep = xfs_dir3_data_dot_entry_p(hdr); + dep = xfs_dir3_data_dot_entry_p(mp, hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; @@ -1172,7 +1172,7 @@ xfs_dir2_sf_to_block( /* * Create entry for .. */ - dep = xfs_dir3_data_dotdot_entry_p(hdr); + dep = xfs_dir3_data_dotdot_entry_p(mp, hdr); dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; @@ -1183,7 +1183,7 @@ xfs_dir2_sf_to_block( blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp, (char *)dep - (char *)hdr)); - offset = xfs_dir3_data_first_offset(hdr); + offset = xfs_dir3_data_first_offset(mp); /* * Loop over existing entries, stuff them in. */ diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index a0961a61ac1a..9cf67381adf6 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -497,69 +497,58 @@ xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) /* * Offsets of . and .. in data space (always block 0) * - * The macros are used for shortform directories as they have no headers to read - * the magic number out of. Shortform directories need to know the size of the - * data block header because the sfe embeds the block offset of the entry into - * it so that it doesn't change when format conversion occurs. Bad Things Happen - * if we don't follow this rule. - * * XXX: there is scope for significant optimisation of the logic here. Right * now we are checking for "dir3 format" over and over again. Ideally we should * only do it once for each operation. */ -#define XFS_DIR3_DATA_DOT_OFFSET(mp) \ - xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&(mp)->m_sb)) -#define XFS_DIR3_DATA_DOTDOT_OFFSET(mp) \ - (XFS_DIR3_DATA_DOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 1)) -#define XFS_DIR3_DATA_FIRST_OFFSET(mp) \ - (XFS_DIR3_DATA_DOTDOT_OFFSET(mp) + xfs_dir3_data_entsize(mp, 2)) - static inline xfs_dir2_data_aoff_t -xfs_dir3_data_dot_offset(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dot_offset(struct xfs_mount *mp) { - return xfs_dir3_data_entry_offset(hdr); + return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb)); } static inline xfs_dir2_data_aoff_t -xfs_dir3_data_dotdot_offset(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dotdot_offset(struct xfs_mount *mp) { - bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - return xfs_dir3_data_dot_offset(hdr) + - __xfs_dir3_data_entsize(dir3, 1); + return xfs_dir3_data_dot_offset(mp) + + xfs_dir3_data_entsize(mp, 1); } static inline xfs_dir2_data_aoff_t -xfs_dir3_data_first_offset(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_first_offset(struct xfs_mount *mp) { - bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - return xfs_dir3_data_dotdot_offset(hdr) + - __xfs_dir3_data_entsize(dir3, 2); + return xfs_dir3_data_dotdot_offset(mp) + + xfs_dir3_data_entsize(mp, 2); } /* * location of . and .. in data space (always block 0) */ static inline struct xfs_dir2_data_entry * -xfs_dir3_data_dot_entry_p(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dot_entry_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) { return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_dot_offset(hdr)); + ((char *)hdr + xfs_dir3_data_dot_offset(mp)); } static inline struct xfs_dir2_data_entry * -xfs_dir3_data_dotdot_entry_p(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_dotdot_entry_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) { return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_dotdot_offset(hdr)); + ((char *)hdr + xfs_dir3_data_dotdot_offset(mp)); } static inline struct xfs_dir2_data_entry * -xfs_dir3_data_first_entry_p(struct xfs_dir2_data_hdr *hdr) +xfs_dir3_data_first_entry_p( + struct xfs_mount *mp, + struct xfs_dir2_data_hdr *hdr) { return (struct xfs_dir2_data_entry *) - ((char *)hdr + xfs_dir3_data_first_offset(hdr)); + ((char *)hdr + xfs_dir3_data_first_offset(mp)); } /* diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 8993ec17452c..8f84153e98a8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -119,9 +119,9 @@ xfs_dir2_sf_getdents( * mp->m_dirdatablk. */ dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR3_DATA_DOT_OFFSET(mp)); + xfs_dir3_data_dot_offset(mp)); dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk, - XFS_DIR3_DATA_DOTDOT_OFFSET(mp)); + xfs_dir3_data_dotdot_offset(mp)); /* * Put . entry unless we're starting past it. diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index bb6e2848f473..3ef6d402084c 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -557,7 +557,7 @@ xfs_dir2_sf_addname_hard( * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = XFS_DIR3_DATA_FIRST_OFFSET(mp), + for (offset = xfs_dir3_data_first_offset(mp), oldsfep = xfs_dir2_sf_firstentry(oldsfp), add_datasize = xfs_dir3_data_entsize(mp, args->namelen), eof = (char *)oldsfep == &buf[old_isize]; @@ -640,7 +640,7 @@ xfs_dir2_sf_addname_pick( sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir3_data_entsize(mp, args->namelen); - offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); + offset = xfs_dir3_data_first_offset(mp); sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* @@ -713,7 +713,7 @@ xfs_dir2_sf_check( mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR3_DATA_FIRST_OFFSET(mp); + offset = xfs_dir3_data_first_offset(mp); ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; -- cgit v1.2.3 From 9b3b77fe661875f19ed748b67fb1eeb57d602b7e Mon Sep 17 00:00:00 2001 From: "tinguely@sgi.com" Date: Fri, 27 Sep 2013 09:00:55 -0500 Subject: xfs: fix memory leak in xlog_recover_add_to_trans Free the memory in error path of xlog_recover_add_to_trans(). Normally this memory is freed in recovery pass2, but is leaked in the error path. Signed-off-by: Mark Tinguely Reviewed-by: Eric Sandeen Signed-off-by: Ben Myers (cherry picked from commit 519ccb81ac1c8e3e4eed294acf93be00b43dcad6) --- fs/xfs/xfs_log_recover.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index cc179878fe41..43240583fd5b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1585,6 +1585,7 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); + free(ptr); return XFS_ERROR(EIO); } -- cgit v1.2.3 From b2a42f78ab475f4730300b0e9568bc3b2587d112 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Tue, 1 Oct 2013 16:47:53 +0200 Subject: xfs: Use kmem_free() instead of free() This fixes a build failure caused by calling the free() function which does not exist in the Linux kernel. Signed-off-by: Thierry Reding Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers (cherry picked from commit aaaae98022efa4f3c31042f1fdf9e7a0c5f04663) --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 43240583fd5b..39797490a1f1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1585,7 +1585,7 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); - free(ptr); + kmem_free(ptr); return XFS_ERROR(EIO); } -- cgit v1.2.3 From 60e7cd3a4ba6049ef590921e84454e6cfd9e2589 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2013 14:10:43 -0400 Subject: Btrfs: fix transid verify errors when recovering log tree If we crash with a log, remount and recover that log, and then crash before we can commit another transaction we will get transid verify errors on the next mount. This is because we were not zero'ing out the log when we committed the transaction after recovery. This is ok as long as we commit another transaction at some point in the future, but if you abort or something else goes wrong you can end up in this weird state because the recovery stuff says that the tree log should have a generation+1 of the super generation, which won't be the case of the transaction that was started for recovery. Fix this by removing the check and _always_ zero out the log portion of the super when we commit a transaction. This fixes the transid verify issues I was seeing with my force errors tests. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index e7a95356df83..8c81bdc1ef9b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1838,11 +1838,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); update_super_roots(root); - if (!root->fs_info->log_root_recovering) { - btrfs_set_super_log_root(root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); - } - + btrfs_set_super_log_root(root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); -- cgit v1.2.3 From 385fe0bede52a45cd960f30c7eb8d20ad8e1e05b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 1 Oct 2013 23:49:49 +0800 Subject: Btrfs: fix crash of compressed writes The crash[1] is found by xfstests/generic/208 with "-o compress", it's not reproduced everytime, but it does panic. The bug is quite interesting, it's actually introduced by a recent commit (573aecafca1cf7a974231b759197a1aebcf39c2a, Btrfs: actually limit the size of delalloc range). Btrfs implements delay allocation, so during writeback, we (1) get a page A and lock it (2) search the state tree for delalloc bytes and lock all pages within the range (3) process the delalloc range, including find disk space and create ordered extent and so on. (4) submit the page A. It runs well in normal cases, but if we're in a racy case, eg. buffered compressed writes and aio-dio writes, sometimes we may fail to lock all pages in the 'delalloc' range, in which case, we need to fall back to search the state tree again with a smaller range limit(max_bytes = PAGE_CACHE_SIZE - offset). The mentioned commit has a side effect, that is, in the fallback case, we can find delalloc bytes before the index of the page we already have locked, so we're in the case of (delalloc_end <= *start) and return with (found > 0). This ends with not locking delalloc pages but making ->writepage still process them, and the crash happens. This fixes it by just thinking that we find nothing and returning to caller as the caller knows how to deal with it properly. [1]: ------------[ cut here ]------------ kernel BUG at mm/page-writeback.c:2170! [...] CPU: 2 PID: 11755 Comm: btrfs-delalloc- Tainted: G O 3.11.0+ #8 [...] RIP: 0010:[] [] clear_page_dirty_for_io+0x1e/0x83 [...] [ 4934.248731] Stack: [ 4934.248731] ffff8801477e5dc8 ffffea00049b9f00 ffff8801869f9ce8 ffffffffa02b841a [ 4934.248731] 0000000000000000 0000000000000000 0000000000000fff 0000000000000620 [ 4934.248731] ffff88018db59c78 ffffea0005da8d40 ffffffffa02ff860 00000001810016c0 [ 4934.248731] Call Trace: [ 4934.248731] [] extent_range_clear_dirty_for_io+0xcf/0xf5 [btrfs] [ 4934.248731] [] compress_file_range+0x1dc/0x4cb [btrfs] [ 4934.248731] [] ? detach_if_pending+0x22/0x4b [ 4934.248731] [] async_cow_start+0x35/0x53 [btrfs] [ 4934.248731] [] worker_loop+0x14b/0x48c [btrfs] [ 4934.248731] [] ? btrfs_queue_worker+0x25c/0x25c [btrfs] [ 4934.248731] [] kthread+0x8d/0x95 [ 4934.248731] [] ? kthread_freezable_should_stop+0x43/0x43 [ 4934.248731] [] ret_from_fork+0x7c/0xb0 [ 4934.248731] [] ? kthread_freezable_should_stop+0x43/0x43 [ 4934.248731] Code: ff 85 c0 0f 94 c0 0f b6 c0 59 5b 5d c3 0f 1f 44 00 00 55 48 89 e5 41 54 53 48 89 fb e8 2c de 00 00 49 89 c4 48 8b 03 a8 01 75 02 <0f> 0b 4d 85 e4 74 52 49 8b 84 24 80 00 00 00 f6 40 20 01 75 44 [ 4934.248731] RIP [] clear_page_dirty_for_io+0x1e/0x83 [ 4934.248731] RSP [ 4934.280307] ---[ end trace 36f06d3f8750236a ]--- Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c09a40db53db..43feb4663f5b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1614,7 +1614,7 @@ again: *start = delalloc_start; *end = delalloc_end; free_extent_state(cached_state); - return found; + return 0; } /* -- cgit v1.2.3 From 964fb15acfcd672ac691f04879b71f07ccc21e0c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 2 Oct 2013 19:39:50 +0300 Subject: Btrfs: eliminate races in worker stopping code The current implementation of worker threads in Btrfs has races in worker stopping code, which cause all kinds of panics and lockups when running btrfs/011 xfstest in a loop. The problem is that btrfs_stop_workers is unsynchronized with respect to check_idle_worker, check_busy_worker and __btrfs_start_workers. E.g., check_idle_worker race flow: btrfs_stop_workers(): check_idle_worker(aworker): - grabs the lock - splices the idle list into the working list - removes the first worker from the working list - releases the lock to wait for its kthread's completion - grabs the lock - if aworker is on the working list, moves aworker from the working list to the idle list - releases the lock - grabs the lock - puts the worker - removes the second worker from the working list ...... btrfs_stop_workers returns, aworker is on the idle list FS is umounted, memory is freed ...... aworker is waken up, fireworks ensue With this applied, I wasn't able to trigger the problem in 48 hours, whereas previously I could reliably reproduce at least one of these races within an hour. Reported-by: David Sterba Signed-off-by: Ilya Dryomov Signed-off-by: Josef Bacik --- fs/btrfs/async-thread.c | 25 +++++++++++++++++++------ fs/btrfs/async-thread.h | 2 ++ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 58b7d14b08ee..08cc08f037a6 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -107,7 +107,8 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) worker->idle = 1; /* the list may be empty if the worker is just starting */ - if (!list_empty(&worker->worker_list)) { + if (!list_empty(&worker->worker_list) && + !worker->workers->stopping) { list_move(&worker->worker_list, &worker->workers->idle_list); } @@ -127,7 +128,8 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) spin_lock_irqsave(&worker->workers->lock, flags); worker->idle = 0; - if (!list_empty(&worker->worker_list)) { + if (!list_empty(&worker->worker_list) && + !worker->workers->stopping) { list_move_tail(&worker->worker_list, &worker->workers->worker_list); } @@ -412,6 +414,7 @@ void btrfs_stop_workers(struct btrfs_workers *workers) int can_stop; spin_lock_irq(&workers->lock); + workers->stopping = 1; list_splice_init(&workers->idle_list, &workers->worker_list); while (!list_empty(&workers->worker_list)) { cur = workers->worker_list.next; @@ -455,6 +458,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, workers->ordered = 0; workers->atomic_start_pending = 0; workers->atomic_worker_start = async_helper; + workers->stopping = 0; } /* @@ -480,15 +484,19 @@ static int __btrfs_start_workers(struct btrfs_workers *workers) atomic_set(&worker->num_pending, 0); atomic_set(&worker->refs, 1); worker->workers = workers; - worker->task = kthread_run(worker_loop, worker, - "btrfs-%s-%d", workers->name, - workers->num_workers + 1); + worker->task = kthread_create(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + 1); if (IS_ERR(worker->task)) { ret = PTR_ERR(worker->task); - kfree(worker); goto fail; } + spin_lock_irq(&workers->lock); + if (workers->stopping) { + spin_unlock_irq(&workers->lock); + goto fail_kthread; + } list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; @@ -496,8 +504,13 @@ static int __btrfs_start_workers(struct btrfs_workers *workers) WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); + wake_up_process(worker->task); return 0; + +fail_kthread: + kthread_stop(worker->task); fail: + kfree(worker); spin_lock_irq(&workers->lock); workers->num_workers_starting--; spin_unlock_irq(&workers->lock); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 063698b90ce2..1f26792683ed 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -107,6 +107,8 @@ struct btrfs_workers { /* extra name for this worker, used for current->name */ char *name; + + int stopping; }; void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); -- cgit v1.2.3 From 1357272fc7deeebb7b3c5d1a071562edc273cdaf Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 2 Oct 2013 20:41:01 +0300 Subject: Btrfs: fix a use-after-free bug in btrfs_dev_replace_finishing free_device rcu callback, scheduled from btrfs_rm_dev_replace_srcdev, can be processed before btrfs_scratch_superblock is called, which would result in a use-after-free on btrfs_device contents. Fix this by zeroing the superblock before the rcu callback is registered. Cc: Stefan Behrens Signed-off-by: Ilya Dryomov Signed-off-by: Josef Bacik --- fs/btrfs/dev-replace.c | 5 +---- fs/btrfs/volumes.c | 7 ++++++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5d844438b2d4..2a9bd5bd24c3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -535,10 +535,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); btrfs_rm_dev_replace_srcdev(fs_info, src_device); - if (src_device->bdev) { - /* zero out the old super */ - btrfs_scratch_superblock(src_device); - } + /* * this is again a consistent state where no dev_replace procedure * is running, the target device is part of the filesystem, the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 043114749516..b0203b1322ac 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1716,6 +1716,7 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev) { WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + list_del_rcu(&srcdev->dev_list); list_del_rcu(&srcdev->dev_alloc_list); fs_info->fs_devices->num_devices--; @@ -1725,9 +1726,13 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, } if (srcdev->can_discard) fs_info->fs_devices->num_can_discard--; - if (srcdev->bdev) + if (srcdev->bdev) { fs_info->fs_devices->open_devices--; + /* zero out the old super */ + btrfs_scratch_superblock(srcdev); + } + call_rcu(&srcdev->rcu, free_device); } -- cgit v1.2.3 From b208c2f7ceafacbc44f13d1b5a9fbada98226183 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 19 Sep 2013 20:37:07 -0700 Subject: btrfs: Fix crash due to not allocating integrity data for a bioset When btrfs creates a bioset, we must also allocate the integrity data pool. Otherwise btrfs will crash when it tries to submit a bio to a checksumming disk: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: [] mempool_alloc+0x4a/0x150 PGD 2305e4067 PUD 23063d067 PMD 0 Oops: 0000 [#1] PREEMPT SMP Modules linked in: btrfs scsi_debug xfs ext4 jbd2 ext3 jbd mbcache sch_fq_codel eeprom lpc_ich mfd_core nfsd exportfs auth_rpcgss af_packet raid6_pq xor zlib_deflate libcrc32c [last unloaded: scsi_debug] CPU: 1 PID: 4486 Comm: mount Not tainted 3.12.0-rc1-mcsum #2 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8802451c9720 ti: ffff880230698000 task.ti: ffff880230698000 RIP: 0010:[] [] mempool_alloc+0x4a/0x150 RSP: 0018:ffff880230699688 EFLAGS: 00010286 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 00000000005f8445 RDX: 0000000000000001 RSI: 0000000000000010 RDI: 0000000000000000 RBP: ffff8802306996f8 R08: 0000000000011200 R09: 0000000000000008 R10: 0000000000000020 R11: ffff88009d6e8000 R12: 0000000000011210 R13: 0000000000000030 R14: ffff8802306996b8 R15: ffff8802451c9720 FS: 00007f25b8a16800(0000) GS:ffff88024fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000018 CR3: 0000000230576000 CR4: 00000000000007e0 Stack: ffff8802451c9720 0000000000000002 ffffffff81a97100 0000000000281250 ffffffff81a96480 ffff88024fc99150 ffff880228d18200 0000000000000000 0000000000000000 0000000000000040 ffff880230e8c2e8 ffff8802459dc900 Call Trace: [] bio_integrity_alloc+0x48/0x1b0 [] bio_integrity_prep+0xac/0x360 [] ? mempool_alloc+0x58/0x150 [] ? alloc_extent_state+0x31/0x110 [btrfs] [] blk_queue_bio+0x1c9/0x460 [] generic_make_request+0xca/0x100 [] submit_bio+0x79/0x160 [] btrfs_map_bio+0x48e/0x5b0 [btrfs] [] btree_submit_bio_hook+0xda/0x110 [btrfs] [] submit_one_bio+0x6a/0xa0 [btrfs] [] read_extent_buffer_pages+0x250/0x310 [btrfs] [] ? __radix_tree_preload+0x66/0xf0 [] ? radix_tree_insert+0x95/0x260 [] btree_read_extent_buffer_pages.constprop.128+0xb6/0x120 [btrfs] [] read_tree_block+0x3a/0x60 [btrfs] [] open_ctree+0x139d/0x2030 [btrfs] [] btrfs_mount+0x53a/0x7d0 [btrfs] [] ? pcpu_alloc+0x8eb/0x9f0 [] ? __kmalloc_track_caller+0x35/0x1e0 [] mount_fs+0x20/0xd0 [] vfs_kern_mount+0x76/0x120 [] do_mount+0x200/0xa40 [] ? strndup_user+0x5b/0x80 [] SyS_mount+0x90/0xe0 [] system_call_fastpath+0x1a/0x1f Code: 4c 8d 75 a8 4c 89 6d e8 45 89 e0 4c 8d 6f 30 48 89 5d d8 41 83 e0 af 48 89 fb 49 83 c6 18 4c 89 7d f8 65 4c 8b 3c 25 c0 b8 00 00 <48> 8b 73 18 44 89 c7 44 89 45 98 ff 53 20 48 85 c0 48 89 c2 74 RIP [] mempool_alloc+0x4a/0x150 RSP CR2: 0000000000000018 ---[ end trace 7a96042017ed21e2 ]--- Signed-off-by: Darrick J. Wong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43feb4663f5b..22bda32acb89 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -145,8 +145,16 @@ int __init extent_io_init(void) offsetof(struct btrfs_io_bio, bio)); if (!btrfs_bioset) goto free_buffer_cache; + + if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE)) + goto free_bioset; + return 0; +free_bioset: + bioset_free(btrfs_bioset); + btrfs_bioset = NULL; + free_buffer_cache: kmem_cache_destroy(extent_buffer_cache); extent_buffer_cache = NULL; -- cgit v1.2.3 From c31f330719b7331b2112a5525fe5941a99ac223d Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 28 Sep 2013 18:24:12 -0500 Subject: do not treat non-symlink reparse points as valid symlinks Windows 8 and later can create NFS symlinks (within reparse points) which we were assuming were normal NTFS symlinks and thus reporting corrupt paths for. Add check for reparse points to make sure that they really are normal symlinks before we try to parse the pathname. We also should not be parsing other types of reparse points (DFS junctions etc) as if they were a symlink so return EOPNOTSUPP on those. Also fix endian errors (we were not parsing symlink lengths as little endian). This fixes commit d244bf2dfbebfded05f494ffd53659fa7b1e32c1 which implemented follow link for non-Unix CIFS mounts CC: Stable Reviewed-by: Andrew Bartlett Signed-off-by: Steve French --- fs/cifs/cifspdu.h | 31 +++++++++++++++++++++++-------- fs/cifs/cifssmb.c | 40 ++++++++++++++++++++++++++++++++++------ fs/cifs/smbfsctl.h | 14 ++++++++++++++ 3 files changed, 71 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index a630475e421c..08f9dfb1a894 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1491,15 +1491,30 @@ struct file_notify_information { __u8 FileName[0]; } __attribute__((packed)); -struct reparse_data { - __u32 ReparseTag; - __u16 ReparseDataLength; +/* For IO_REPARSE_TAG_SYMLINK */ +struct reparse_symlink_data { + __le32 ReparseTag; + __le16 ReparseDataLength; __u16 Reserved; - __u16 SubstituteNameOffset; - __u16 SubstituteNameLength; - __u16 PrintNameOffset; - __u16 PrintNameLength; - __u32 Flags; + __le16 SubstituteNameOffset; + __le16 SubstituteNameLength; + __le16 PrintNameOffset; + __le16 PrintNameLength; + __le32 Flags; + char PathBuffer[0]; +} __attribute__((packed)); + +/* For IO_REPARSE_TAG_NFS */ +#define NFS_SPECFILE_LNK 0x00000000014B4E4C +#define NFS_SPECFILE_CHR 0x0000000000524843 +#define NFS_SPECFILE_BLK 0x00000000004B4C42 +#define NFS_SPECFILE_FIFO 0x000000004F464946 +#define NFS_SPECFILE_SOCK 0x000000004B434F53 +struct reparse_posix_data { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __le64 InodeType; /* LNK, FIFO, CHR etc. */ char PathBuffer[0]; } __attribute__((packed)); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4baf35949b51..ccd31ab815d4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3088,7 +3088,8 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, bool is_unicode; unsigned int sub_len; char *sub_start; - struct reparse_data *reparse_buf; + struct reparse_symlink_data *reparse_buf; + struct reparse_posix_data *posix_buf; __u32 data_offset, data_count; char *end_of_smb; @@ -3137,20 +3138,47 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, goto qreparse_out; } end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; - reparse_buf = (struct reparse_data *) + reparse_buf = (struct reparse_symlink_data *) ((char *)&pSMBr->hdr.Protocol + data_offset); if ((char *)reparse_buf >= end_of_smb) { rc = -EIO; goto qreparse_out; } - if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset + - reparse_buf->PrintNameLength) > end_of_smb) { + if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) { + cifs_dbg(FYI, "NFS style reparse tag\n"); + posix_buf = (struct reparse_posix_data *)reparse_buf; + + if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) { + cifs_dbg(FYI, "unsupported file type 0x%llx\n", + le64_to_cpu(posix_buf->InodeType)); + rc = -EOPNOTSUPP; + goto qreparse_out; + } + is_unicode = true; + sub_len = le16_to_cpu(reparse_buf->ReparseDataLength); + if (posix_buf->PathBuffer + sub_len > end_of_smb) { + cifs_dbg(FYI, "reparse buf beyond SMB\n"); + rc = -EIO; + goto qreparse_out; + } + *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer, + sub_len, is_unicode, nls_codepage); + goto qreparse_out; + } else if (reparse_buf->ReparseTag != + cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) { + rc = -EOPNOTSUPP; + goto qreparse_out; + } + + /* Reparse tag is NTFS symlink */ + sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) + + reparse_buf->PathBuffer; + sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength); + if (sub_start + sub_len > end_of_smb) { cifs_dbg(FYI, "reparse buf beyond SMB\n"); rc = -EIO; goto qreparse_out; } - sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer; - sub_len = reparse_buf->SubstituteNameLength; if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) is_unicode = true; else diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index d952ee48f4dc..a4b2391fe66e 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -97,9 +97,23 @@ #define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */ #define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */ +/* See FSCC 2.1.2.5 */ #define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 #define IO_REPARSE_TAG_HSM 0xC0000004 #define IO_REPARSE_TAG_SIS 0x80000007 +#define IO_REPARSE_TAG_HSM2 0x80000006 +#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005 +/* Used by the DFS filter. See MS-DFSC */ +#define IO_REPARSE_TAG_DFS 0x8000000A +/* Used by the DFS filter See MS-DFSC */ +#define IO_REPARSE_TAG_DFSR 0x80000012 +#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B +/* See section MS-FSCC 2.1.2.4 */ +#define IO_REPARSE_TAG_SYMLINK 0xA000000C +#define IO_REPARSE_TAG_DEDUP 0x80000013 +#define IO_REPARSE_APPXSTREAM 0xC0000014 +/* NFS symlinks, Win 8/SMB3 and later */ +#define IO_REPARSE_TAG_NFS 0x80000014 /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ -- cgit v1.2.3 From eb4c7df6c20b407ecbf1a985edc33d967371c2e8 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Thu, 3 Oct 2013 05:44:45 -0500 Subject: cifs: Avoid umount hangs with smb2 when server is unresponsive Do not send SMB2 Logoff command when reconnecting, the way smb1 code base works. Also, no need to wait for a credit for an echo command when one is already in flight. Without these changes, umount command hangs if the server is unresponsive e.g. hibernating. Signed-off-by: Shirish Pargaonkar Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 6 ++++++ fs/cifs/transport.c | 9 +++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index eba0efde66d7..edccb5252462 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -687,6 +687,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) else return -EIO; + /* no need to send SMB logoff if uid already closed due to reconnect */ + if (ses->need_reconnect) + goto smb2_session_already_dead; + rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req); if (rc) return rc; @@ -701,6 +705,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ + +smb2_session_already_dead: return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 6fdcb1b4a106..800b938e4061 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -410,8 +410,13 @@ static int wait_for_free_request(struct TCP_Server_Info *server, const int timeout, const int optype) { - return wait_for_free_credits(server, timeout, - server->ops->get_credits_field(server, optype)); + int *val; + + val = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*val <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; + return wait_for_free_credits(server, timeout, val); } static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, -- cgit v1.2.3 From 2f6c9479633780ba4a3484bba7eba5a721a5cf20 Mon Sep 17 00:00:00 2001 From: Jan Klos Date: Sun, 6 Oct 2013 21:08:20 +0200 Subject: cifs: Fix inability to write files >2GB to SMB2/3 shares When connecting to SMB2/3 shares, maximum file size is set to non-LFS maximum in superblock. This is due to cap_large_files bit being different for SMB1 and SMB2/3 (where it is just an internal flag that is not negotiated and the SMB1 one corresponds to multichannel capability, so maybe LFS works correctly if server sends 0x08 flag) while capabilities are checked always for the SMB1 bit in cifs_read_super(). The patch fixes this by checking for the correct bit according to the protocol version. CC: Stable Signed-off-by: Jan Klos Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a16b4e58bcc6..77fc5e181077 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -120,14 +120,16 @@ cifs_read_super(struct super_block *sb) { struct inode *inode; struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; int rc = 0; cifs_sb = CIFS_SB(sb); + tcon = cifs_sb_master_tcon(cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) sb->s_flags |= MS_POSIXACL; - if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) sb->s_maxbytes = MAX_LFS_FILESIZE; else sb->s_maxbytes = MAX_NON_LFS; @@ -147,7 +149,7 @@ cifs_read_super(struct super_block *sb) goto out_no_root; } - if (cifs_sb_master_tcon(cifs_sb)->nocase) + if (tcon->nocase) sb->s_d_op = &cifs_ci_dentry_ops; else sb->s_d_op = &cifs_dentry_ops; -- cgit v1.2.3 From dde2356c8466298bd77fa699e0ea296372eed47b Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Fri, 27 Sep 2013 18:35:42 +0100 Subject: cifs: Allow LANMAN auth method for servers supporting unencapsulated authentication methods This allows users to use LANMAN authentication on servers which support unencapsulated authentication. The patch fixes a regression where users using plaintext authentication were no longer able to do so because of changed bought in by patch 3f618223dc0bdcbc8d510350e78ee2195ff93768 https://bugzilla.redhat.com/show_bug.cgi?id=1011621 Reported-by: Panos Kavalagios Reviewed-by: Jeff Layton Signed-off-by: Sachin Prabhu Signed-off-by: Steve French --- fs/cifs/sess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 352358de1d7e..e87387dbf39f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -500,9 +500,9 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) return NTLMv2; if (global_secflags & CIFSSEC_MAY_NTLM) return NTLM; - /* Fallthrough */ default: - return Unspecified; + /* Fallthrough to attempt LANMAN authentication next */ + break; } case CIFS_NEGFLAVOR_LANMAN: switch (requested) { -- cgit v1.2.3 From 4871c1588f92c6c13f4713a7009f25f217055807 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 9 Oct 2013 12:24:04 -0400 Subject: Btrfs: use right root when checking for hash collision btrfs_rename was using the root of the old dir instead of the root of the new dir when checking for a hash collision, so if you tried to move a file into a subvol it would freak out because it would see the file you are trying to move in its current root. This fixes the bug where this would fail btrfs subvol create test1 btrfs subvol create test2 mv test1 test2. Thanks to Chris Murphy for catching this, Cc: stable@vger.kernel.org Reported-by: Chris Murphy Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8af6c03953ad..3b4ffaf0cd52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7986,7 +7986,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(root, new_dir->i_ino, + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, new_dentry->d_name.name, new_dentry->d_name.len); -- cgit v1.2.3 From 7bf811a595a895b7a886dcf218d0d34f97df76dc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Oct 2013 22:11:09 -0400 Subject: Btrfs: limit delalloc pages outside of find_delalloc_range Liu fixed part of this problem and unfortunately I steered him in slightly the wrong direction and so didn't completely fix the problem. The problem is we limit the size of the delalloc range we are looking for to max bytes and then we try to lock that range. If we fail to lock the pages in that range we will shrink the max bytes to a single page and re loop. However if our first page is inside of the delalloc range then we will end up limiting the end of the range to a period before our first page. This is illustrated below [0 -------- delalloc range --------- 256mb] [page] So find_delalloc_range will return with delalloc_start as 0 and end as 128mb, and then we will notice that delalloc_start < *start and adjust it up, but not adjust delalloc_end up, so things go sideways. To fix this we need to not limit the max bytes in find_delalloc_range, but in find_lock_delalloc_range and that way we don't end up with this confusion. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 43feb4663f5b..d8ea0cb200b4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1482,10 +1482,8 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, cur_start = state->end + 1; node = rb_next(node); total_bytes += state->end - state->start + 1; - if (total_bytes >= max_bytes) { - *end = *start + max_bytes - 1; + if (total_bytes >= max_bytes) break; - } if (!node) break; } @@ -1627,10 +1625,9 @@ again: /* * make sure to limit the number of pages we try to lock down - * if we're looping. */ - if (delalloc_end + 1 - delalloc_start > max_bytes && loops) - delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + if (delalloc_end + 1 - delalloc_start > max_bytes) + delalloc_end = delalloc_start + max_bytes - 1; /* step two, lock all the pages after the page that has start */ ret = lock_delalloc_pages(inode, locked_page, @@ -1641,8 +1638,7 @@ again: */ free_extent_state(cached_state); if (!loops) { - unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); - max_bytes = PAGE_CACHE_SIZE - offset; + max_bytes = PAGE_CACHE_SIZE; loops = 1; goto again; } else { -- cgit v1.2.3 From 14927d95464956ffe8af4278331a6bfea94ab780 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 25 Sep 2013 21:47:43 +0800 Subject: Btrfs: insert orphan roots into fs radix tree Now we don't drop all the deleted snapshots/subvolumes before the space balance. It means we have to relocate the space which is held by the dead snapshots/subvolumes. So we must into them into fs radix tree, or we would forget to commit the change of them when doing transaction commit, and it would corrupt the metadata. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/root-tree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0b1f4ef8db98..ec71ea44d2b4 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -299,11 +299,6 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) continue; } - if (btrfs_root_refs(&root->root_item) == 0) { - btrfs_add_dead_root(root); - continue; - } - err = btrfs_init_fs_root(root); if (err) { btrfs_free_fs_root(root); @@ -318,6 +313,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) btrfs_free_fs_root(root); break; } + + if (btrfs_root_refs(&root->root_item) == 0) + btrfs_add_dead_root(root); } btrfs_free_path(path); -- cgit v1.2.3 From c00869f1ae6a8fa49802d5e60d843b7051a112ec Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 25 Sep 2013 21:47:44 +0800 Subject: Btrfs: fix oops caused by the space balance and dead roots When doing space balance and subvolume destroy at the same time, we met the following oops: kernel BUG at fs/btrfs/relocation.c:2247! RIP: 0010: [] prepare_to_merge+0x154/0x1f0 [btrfs] Call Trace: [] relocate_block_group+0x466/0x4e6 [btrfs] [] btrfs_relocate_block_group+0x143/0x275 [btrfs] [] btrfs_relocate_chunk.isra.27+0x5c/0x5a2 [btrfs] [] ? btrfs_item_key_to_cpu+0x15/0x31 [btrfs] [] ? btrfs_get_token_64+0x7e/0xcd [btrfs] [] ? btrfs_tree_read_unlock_blocking+0xb2/0xb7 [btrfs] [] btrfs_balance+0x9c7/0xb6f [btrfs] [] btrfs_ioctl_balance+0x234/0x2ac [btrfs] [] btrfs_ioctl+0xd87/0x1ef9 [btrfs] [] ? path_openat+0x234/0x4db [] ? __do_page_fault+0x31d/0x391 [] ? vma_link+0x74/0x94 [] vfs_ioctl+0x1d/0x39 [] do_vfs_ioctl+0x32d/0x3e2 [] SyS_ioctl+0x57/0x83 [] ? do_page_fault+0xe/0x10 [] system_call_fastpath+0x16/0x1b It is because we returned the error number if the reference of the root was 0 when doing space relocation. It was not right here, because though the root was dead(refs == 0), but the space it held still need be relocated, or we could not remove the block group. So in this case, we should return the root no matter it is dead or not. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 9 +++++---- fs/btrfs/disk-io.h | 13 +++++++++++-- fs/btrfs/relocation.c | 2 +- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ae17ed13b32..62176ad89846 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1561,8 +1561,9 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, return ret; } -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location) +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + bool check_ref) { struct btrfs_root *root; int ret; @@ -1586,7 +1587,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { - if (btrfs_root_refs(&root->root_item) == 0) + if (check_ref && btrfs_root_refs(&root->root_item) == 0) return ERR_PTR(-ENOENT); return root; } @@ -1595,7 +1596,7 @@ again: if (IS_ERR(root)) return root; - if (btrfs_root_refs(&root->root_item) == 0) { + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; goto fail; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b71acd6e1e5b..5ce2a7da8b11 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -68,8 +68,17 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, int btrfs_init_fs_root(struct btrfs_root *root); int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, - struct btrfs_key *location); + +struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *key, + bool check_ref); +static inline struct btrfs_root * +btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + return btrfs_get_fs_root(fs_info, location, true); +} + int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_root *root); void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a5a26320503f..4a355726151e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -588,7 +588,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, else key.offset = (u64)-1; - return btrfs_read_fs_root_no_name(fs_info, &key); + return btrfs_get_fs_root(fs_info, &key, false); } #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 -- cgit v1.2.3 From 6e4ea8e33b2057b85d75175dd89b93f5e26de3bc Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 10 Oct 2013 20:05:35 -0400 Subject: ext4: fix memory leak in xattr If we take the 2nd retry path in ext4_expand_extra_isize_ea, we potentionally return from the function without having freed these allocations. If we don't do the return, we over-write the previous allocation pointers, so we leak either way. Spotted with Coverity. [ Fixed by tytso to set is and bs to NULL after freeing these pointers, in case in the retry loop we later end up triggering an error causing a jump to cleanup, at which point we could have a double free bug. -- Ted ] Signed-off-by: Dave Jones Signed-off-by: "Theodore Ts'o" Reviewed-by: Eric Sandeen Cc: stable@vger.kernel.org --- fs/ext4/xattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c081e34f717f..03e9bebba198 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1350,6 +1350,8 @@ retry: s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; + kfree(is); is = NULL; + kfree(bs); bs = NULL; goto retry; } error = -1; -- cgit v1.2.3 From 9d05746e7b16d8565dddbe3200faa1e669d23bbf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 30 Sep 2013 08:35:10 -0700 Subject: vfs: allow O_PATH file descriptors for fstatfs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Olga reported that file descriptors opened with O_PATH do not work with fstatfs(), found during further development of ksh93's thread support. There is no reason to not allow O_PATH file descriptors here (fstatfs is very much a path operation), so use "fdget_raw()". See commit 55815f70147d ("vfs: make O_PATH file descriptors usable for 'fstat()'") for a very similar issue reported for fstat() by the same team. Reported-and-tested-by: ольга крыжановская Acked-by: Al Viro Cc: stable@kernel.org # O_PATH introduced in 3.0+ Signed-off-by: Linus Torvalds --- fs/statfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/statfs.c b/fs/statfs.c index c219e733f553..083dc0ac9140 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -94,7 +94,7 @@ retry: int fd_statfs(int fd, struct kstatfs *st) { - struct fd f = fdget(fd); + struct fd f = fdget_raw(fd); int error = -EBADF; if (f.file) { error = vfs_statfs(&f.file->f_path, st); -- cgit v1.2.3 From 0c26606cbe4937f2228a27bb0c2cad19855be87a Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Sun, 13 Oct 2013 13:29:03 -0600 Subject: cifs: ntstatus_to_dos_map[] is not terminated Functions that walk the ntstatus_to_dos_map[] array could run off the end. For example, ntstatus_to_dos() loops while ntstatus_to_dos_map[].ntstatus is not 0. Granted, this is mostly theoretical, but could be used as a DOS attack if the error code in the SMB header is bogus. [Might consider adding to stable, as this patch is low risk - Steve] Reviewed-by: Jeff Layton Signed-off-by: Tim Gardner Signed-off-by: Steve French --- fs/cifs/netmisc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index af847e1cf1c1..651a5279607b 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -780,7 +780,9 @@ static const struct { ERRDOS, ERRnoaccess, 0xc0000290}, { ERRDOS, ERRbadfunc, 0xc000029c}, { ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, { - ERRDOS, ERRinvlevel, 0x007c0001}, }; + ERRDOS, ERRinvlevel, 0x007c0001}, { + 0, 0, 0 } +}; /***************************************************************************** Print an error message from the status code -- cgit v1.2.3 From 43ae9e3fc70ca0057ae0a24ef5eedff05e3fae06 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 10 Oct 2013 16:48:19 +0200 Subject: ext[34]: fix double put in tmpfile d_tmpfile() already swallowed the inode ref. Signed-off-by: Miklos Szeredi Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/ext3/namei.c | 5 ++--- fs/ext4/namei.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 1194b1f0f839..f8cde46de9cd 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1783,7 +1783,7 @@ retry: d_tmpfile(dentry, inode); err = ext3_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -1791,10 +1791,9 @@ retry: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext3_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1bec5a5c1e45..5a0408d7b114 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2319,7 +2319,7 @@ retry: d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); if (err) - goto err_drop_inode; + goto err_unlock_inode; mark_inode_dirty(inode); unlock_new_inode(inode); } @@ -2328,10 +2328,9 @@ retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; -err_drop_inode: +err_unlock_inode: ext4_journal_stop(handle); unlock_new_inode(inode); - iput(inode); return err; } -- cgit v1.2.3 From e9cdd6e771580e6ff872e5c64e8b766972c7d1bc Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 16 Oct 2013 13:46:53 -0700 Subject: mm: /proc/pid/pagemap: inspect _PAGE_SOFT_DIRTY only on present pages If a page we are inspecting is in swap we may occasionally report it as having soft dirty bit (even if it is clean). The pte_soft_dirty helper should be called on present pte only. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Reviewed-by: Naoya Horiguchi Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d63cee..390bdab01c3c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -941,6 +941,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, frame = pte_pfn(pte); flags = PM_PRESENT; page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) + flags2 |= __PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { swp_entry_t entry; if (pte_swp_soft_dirty(pte)) @@ -960,7 +962,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (page && !PageAnon(page)) flags |= PM_FILE; - if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); -- cgit v1.2.3 From 84235de394d9775bfaa7fa9762a59d91fef0c1fc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Oct 2013 13:47:00 -0700 Subject: fs: buffer: move allocation failure loop into the allocator Buffer allocation has a very crude indefinite loop around waking the flusher threads and performing global NOFS direct reclaim because it can not handle allocation failures. The most immediate problem with this is that the allocation may fail due to a memory cgroup limit, where flushers + direct reclaim might not make any progress towards resolving the situation at all. Because unlike the global case, a memory cgroup may not have any cache at all, only anonymous pages but no swap. This situation will lead to a reclaim livelock with insane IO from waking the flushers and thrashing unrelated filesystem cache in a tight loop. Use __GFP_NOFAIL allocations for buffers for now. This makes sure that any looping happens in the page allocator, which knows how to orchestrate kswapd, direct reclaim, and the flushers sensibly. It also allows memory cgroups to detect allocations that can't handle failure and will allow them to ultimately bypass the limit if reclaim can not make progress. Reported-by: azurIt Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 4d7433534f5c..6024877335ca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; /* Will call free_more_memory() */ + gfp_t gfp_mask; - page = find_or_create_page(inode->i_mapping, index, - (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); + gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; + gfp_mask |= __GFP_MOVABLE; + /* + * XXX: __getblk_slow() can not really deal with failure and + * will endlessly loop on improvised global reclaim. Prefer + * looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp_mask |= __GFP_NOFAIL; + + page = find_or_create_page(inode->i_mapping, index, gfp_mask); if (!page) return ret; -- cgit v1.2.3 From 2cbe3b0af82279f14cfb3195f2406651f28ee9b8 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 16 Oct 2013 13:47:04 -0700 Subject: procfs: fix unintended truncation of returned mapped address Currently, proc_reg_get_unmapped_area truncates upper 32-bit of the mapped virtual address returned from get_unmapped_area method in pde->proc_fops due to the variable rv of signed integer on x86_64. This is too small to have vitual address of unsigned long on x86_64 since on x86_64, signed integer is of 4 bytes while unsigned long is of 8 bytes. To fix this issue, use unsigned long instead. Fixes a regression added in commit c4fe24485729 ("sparc: fix PCI device proc file mmap(2)"). Signed-off-by: HATAYAMA Daisuke Cc: Alexey Dobriyan Cc: David S. Miller Tested-by: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9f8ef9b7674d..6c501c4d996d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -288,7 +288,7 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct proc_dir_entry *pde = PDE(file_inode(file)); - int rv = -EIO; + unsigned long rv = -EIO; unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); if (use_pde(pde)) { get_unmapped_area = pde->proc_fops->get_unmapped_area; -- cgit v1.2.3 From fad1a86e25e0a1f85635ed06ef62ddadd5b8fa4c Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 16 Oct 2013 13:47:05 -0700 Subject: procfs: call default get_unmapped_area on MMU-present architectures Commit c4fe24485729 ("sparc: fix PCI device proc file mmap(2)") added proc_reg_get_unmapped_area in proc_reg_file_ops and proc_reg_file_ops_no_compat, by which now mmap always returns EIO if get_unmapped_area method is not defined for the target procfs file, which causes regression of mmap on /proc/vmcore. To address this issue, like get_unmapped_area(), call default current->mm->get_unmapped_area on MMU-present architectures if pde->proc_fops->get_unmapped_area, i.e. the one in actual file operation in the procfs file, is not defined. Reported-by: Michael Holzheu Signed-off-by: HATAYAMA Daisuke Cc: Alexey Dobriyan Cc: David S. Miller Tested-by: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6c501c4d996d..8eaa1ba793fc 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -289,9 +289,13 @@ static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long { struct proc_dir_entry *pde = PDE(file_inode(file)); unsigned long rv = -EIO; - unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; if (use_pde(pde)) { - get_unmapped_area = pde->proc_fops->get_unmapped_area; +#ifdef CONFIG_MMU + get_unmapped_area = current->mm->get_unmapped_area; +#endif + if (pde->proc_fops->get_unmapped_area) + get_unmapped_area = pde->proc_fops->get_unmapped_area; if (get_unmapped_area) rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); unuse_pde(pde); -- cgit v1.2.3 From 1bda19eb73d68b304148e67253e47cef049a419d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Oct 2013 12:10:36 -0400 Subject: Btrfs: release path before starting transaction in can_nocow_extent We can't be holding tree locks while we try to start a transaction, we will deadlock. Thanks, Reported-by: Sage Weil Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b4ffaf0cd52..f4a6851e6c88 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6437,6 +6437,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (btrfs_extent_readonly(root, disk_bytenr)) goto out; + btrfs_release_path(path); /* * look for other files referencing this extent, if we -- cgit v1.2.3 From 606d6fe3ffdb5190d4c8e4d6cd23aa6c1f9cb6ad Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 19 Oct 2013 14:56:55 -0700 Subject: fs/namei.c: fix new kernel-doc warning Add @path parameter to fix kernel-doc warning. Also fix a spello/typo. Warning(fs/namei.c:2304): No description found for parameter 'path' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 645268f23eb6..caa28051e197 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2294,10 +2294,11 @@ out: * path_mountpoint - look up a path to be umounted * @dfd: directory file descriptor to start walk from * @name: full pathname to walk + * @path: pointer to container for result * @flags: lookup flags * * Look up the given name, but don't attempt to revalidate the last component. - * Returns 0 and "path" will be valid on success; Retuns error otherwise. + * Returns 0 and "path" will be valid on success; Returns error otherwise. */ static int path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) -- cgit v1.2.3 From 69c88dc7d9f1a6c3eceb7058111677c640811c94 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 19 Oct 2013 14:57:07 -0700 Subject: vfs: fix new kernel-doc warnings Move kernel-doc notation to immediately before its function to eliminate kernel-doc warnings introduced by commit db14fc3abcd5 ("vfs: add d_walk()") Warning(fs/dcache.c:1343): No description found for parameter 'data' Warning(fs/dcache.c:1343): No description found for parameter 'dentry' Warning(fs/dcache.c:1343): Excess function parameter 'parent' description in 'check_mount' Signed-off-by: Randy Dunlap Cc: Miklos Szeredi Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/dcache.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 41000305d716..20532cb0b06e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1331,14 +1331,6 @@ rename_retry: * list is non-empty and continue searching. */ -/** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. - * - * Return true if the parent or its subdirectories contain - * a mount point - */ - static enum d_walk_ret check_mount(void *data, struct dentry *dentry) { int *ret = data; @@ -1349,6 +1341,13 @@ static enum d_walk_ret check_mount(void *data, struct dentry *dentry) return D_WALK_CONTINUE; } +/** + * have_submounts - check for mounts over a dentry + * @parent: dentry to check. + * + * Return true if the parent or its subdirectories contain + * a mount point + */ int have_submounts(struct dentry *parent) { int ret = 0; -- cgit v1.2.3