-rw-r--r--  fs/ceph/addr.c                      |  21
-rw-r--r--  fs/ceph/cache.c                     |  92
-rw-r--r--  fs/ceph/caps.c                      |  40
-rw-r--r--  fs/ceph/file.c                      |   2
-rw-r--r--  fs/ceph/inode.c                     |  18
-rw-r--r--  fs/ceph/locks.c                     |  25
-rw-r--r--  fs/ceph/mds_client.c                |   4
-rw-r--r--  fs/ceph/super.c                     |  47
-rw-r--r--  fs/ceph/super.h                     |   4
-rw-r--r--  fs/ceph/xattr.c                     |   3
-rw-r--r--  include/linux/ceph/ceph_features.h  | 264
-rw-r--r--  include/linux/ceph/ceph_fs.h        |   1
-rw-r--r--  include/linux/ceph/decode.h         |  60
-rw-r--r--  include/linux/ceph/libceph.h        |  49
-rw-r--r--  include/linux/ceph/messenger.h      |   2
-rw-r--r--  include/linux/ceph/osd_client.h     |  70
-rw-r--r--  include/linux/ceph/osdmap.h         |  41
-rw-r--r--  include/linux/ceph/rados.h          |   6
-rw-r--r--  include/linux/crush/crush.h         |  66
-rw-r--r--  include/linux/crush/mapper.h        |   9
-rw-r--r--  net/ceph/ceph_common.c              |   1
-rw-r--r--  net/ceph/crush/crush.c              |   3
-rw-r--r--  net/ceph/crush/mapper.c             |  81
-rw-r--r--  net/ceph/debugfs.c                  | 112
-rw-r--r--  net/ceph/messenger.c                |  10
-rw-r--r--  net/ceph/mon_client.c               |   8
-rw-r--r--  net/ceph/osd_client.c               | 905
-rw-r--r--  net/ceph/osdmap.c                   | 840
28 files changed, 2308 insertions(+), 476 deletions(-)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1e71e6ca5ddf..50836280a6f8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -530,14 +530,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	long writeback_stat;
 	u64 truncate_size;
 	u32 truncate_seq;
-	int err = 0, len = PAGE_SIZE;
+	int err, len = PAGE_SIZE;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
-	if (!page->mapping || !page->mapping->host) {
-		dout("writepage %p - no mapping\n", page);
-		return -EFAULT;
-	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
 	fsc = ceph_inode_to_client(inode);
@@ -547,7 +543,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	snapc = page_snap_context(page);
 	if (snapc == NULL) {
 		dout("writepage %p page %p not dirty?\n", inode, page);
-		goto out;
+		return 0;
 	}
 	oldest = get_oldest_context(inode, &snap_size,
 				    &truncate_size, &truncate_seq);
@@ -555,9 +551,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage %p page %p snapc %p not writeable - noop\n",
 		     inode, page, snapc);
 		/* we should only noop if called by kswapd */
-		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		WARN_ON(!(current->flags & PF_MEMALLOC));
 		ceph_put_snap_context(oldest);
-		goto out;
+		redirty_page_for_writepage(wbc, page);
+		return 0;
 	}
 	ceph_put_snap_context(oldest);
 
@@ -567,8 +564,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	/* is this a partial page at end of file? */
 	if (page_off >= snap_size) {
 		dout("%p page eof %llu\n", page, snap_size);
-		goto out;
+		return 0;
 	}
+
 	if (snap_size < page_off + len)
 		len = snap_size - page_off;
 
@@ -595,7 +593,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 			dout("writepage interrupted page %p\n", page);
 			redirty_page_for_writepage(wbc, page);
 			end_page_writeback(page);
-			goto out;
+			return err;
 		}
 		dout("writepage setting page/mapping error %d %p\n",
 		     err, page);
@@ -611,7 +609,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	end_page_writeback(page);
 	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 	ceph_put_snap_context(snapc);	/* page's reference */
-out:
 	return err;
 }
 
@@ -1318,7 +1315,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file_inode(file);
-	int check_cap = 0;
+	bool check_cap = false;
 
 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
 	     inode, page, (int)pos, (int)copied, (int)len);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 4e7421caf380..fd1172823f86 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -35,18 +35,34 @@ struct fscache_netfs ceph_cache_netfs = {
 	.version	= 0,
 };
 
+static DEFINE_MUTEX(ceph_fscache_lock);
+static LIST_HEAD(ceph_fscache_list);
+
+struct ceph_fscache_entry {
+	struct list_head list;
+	struct fscache_cookie *fscache;
+	struct ceph_fsid fsid;
+	size_t uniq_len;
+	char uniquifier[0];
+};
+
 static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
 					     void *buffer, uint16_t maxbuf)
 {
 	const struct ceph_fs_client* fsc = cookie_netfs_data;
-	uint16_t klen;
+	const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+	uint16_t fsid_len, uniq_len;
 
-	klen = sizeof(fsc->client->fsid);
-	if (klen > maxbuf)
+	fsid_len = sizeof(fsc->client->fsid);
+	uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+	if (fsid_len + uniq_len > maxbuf)
 		return 0;
 
-	memcpy(buffer, &fsc->client->fsid, klen);
-	return klen;
+	memcpy(buffer, &fsc->client->fsid, fsid_len);
+	if (uniq_len)
+		memcpy(buffer + fsid_len, fscache_uniq, uniq_len);
+
+	return fsid_len + uniq_len;
 }
 
 static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
@@ -67,13 +83,54 @@ void ceph_fscache_unregister(void)
 
 int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
 {
+	const struct ceph_fsid *fsid = &fsc->client->fsid;
+	const char *fscache_uniq = fsc->mount_options->fscache_uniq;
+	size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
+	struct ceph_fscache_entry *ent;
+	int err = 0;
+
+	mutex_lock(&ceph_fscache_lock);
+	list_for_each_entry(ent, &ceph_fscache_list, list) {
+		if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
+			continue;
+		if (ent->uniq_len != uniq_len)
+			continue;
+		if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
+			continue;
+
+		pr_err("fscache cookie already registered for fsid %pU\n",
+		       fsid);
+		pr_err("  use fsc=%%s mount option to specify a uniquifier\n");
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
+	if (!ent) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
 	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
 					      &ceph_fscache_fsid_object_def,
 					      fsc, true);
-	if (!fsc->fscache)
-		pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
-
-	return 0;
+	if (fsc->fscache) {
+		memcpy(&ent->fsid, fsid, sizeof(*fsid));
+		if (uniq_len > 0) {
+			memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
+			ent->uniq_len = uniq_len;
+		}
+		ent->fscache = fsc->fscache;
+		list_add_tail(&ent->list, &ceph_fscache_list);
+	} else {
+		kfree(ent);
+		pr_err("unable to register fscache cookie for fsid %pU\n",
+		       fsid);
+		/* all other fs ignore this error */
+	}
+out_unlock:
+	mutex_unlock(&ceph_fscache_lock);
+	return err;
 }
 
 static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
@@ -349,7 +406,24 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
 
 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
 {
-	fscache_relinquish_cookie(fsc->fscache, 0);
+	if (fscache_cookie_valid(fsc->fscache)) {
+		struct ceph_fscache_entry *ent;
+		bool found = false;
+
+		mutex_lock(&ceph_fscache_lock);
+		list_for_each_entry(ent, &ceph_fscache_list, list) {
+			if (ent->fscache == fsc->fscache) {
+				list_del(&ent->list);
+				kfree(ent);
+				found = true;
+				break;
+			}
+		}
+		WARN_ON_ONCE(!found);
+		mutex_unlock(&ceph_fscache_lock);
+
+		__fscache_relinquish_cookie(fsc->fscache, 0);
+	}
 	fsc->fscache = NULL;
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a3ebb632294e..7007ae2a5ad2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1653,6 +1653,21 @@ static int try_nonblocking_invalidate(struct inode *inode)
 	return -1;
 }
 
+bool __ceph_should_report_size(struct ceph_inode_info *ci)
+{
+	loff_t size = ci->vfs_inode.i_size;
+	/* mds will adjust max size according to the reported size */
+	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
+		return false;
+	if (size >= ci->i_max_size)
+		return true;
+	/* half of previous max_size increment has been used */
+	if (ci->i_max_size > ci->i_reported_size &&
+	    (size << 1) >= ci->i_max_size + ci->i_reported_size)
+		return true;
+	return false;
+}
+
 /*
  * Swiss army knife function to examine currently used and wanted
  * versus held caps. Release, flush, ack revoked caps to mds as
@@ -1806,8 +1821,7 @@ retry_locked:
 	}
 
 	/* approaching file_max? */
-	if ((inode->i_size << 1) >= ci->i_max_size &&
-	    (ci->i_reported_size << 1) < ci->i_max_size) {
+	if (__ceph_should_report_size(ci)) {
 		dout("i_size approaching max_size\n");
 		goto ack;
 	}
@@ -3027,8 +3041,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 				le32_to_cpu(grant->truncate_seq),
 				le64_to_cpu(grant->truncate_size),
 				size);
-		/* max size increase? */
-		if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+	}
+
+	if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
+		if (max_size != ci->i_max_size) {
 			dout("max_size %lld -> %llu\n", ci->i_max_size,
 			     max_size);
 			ci->i_max_size = max_size;
@@ -3037,6 +3053,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 				ci->i_requested_max_size = 0;
 			}
 			wake = true;
+		} else if (ci->i_wanted_max_size > ci->i_max_size &&
+			   ci->i_wanted_max_size > ci->i_requested_max_size) {
+			/* CEPH_CAP_OP_IMPORT */
+			wake = true;
 		}
 	}
 
@@ -3554,7 +3574,6 @@ retry:
 	}
 
 	/* make sure we re-request max_size, if necessary */
-	ci->i_wanted_max_size = 0;
 	ci->i_requested_max_size = 0;
 
 	*old_issued = issued;
@@ -3790,6 +3809,7 @@ bad:
  */
 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 {
+	struct inode *inode;
 	struct ceph_inode_info *ci;
 	int flags = CHECK_CAPS_NODELAY;
 
@@ -3805,9 +3825,15 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 		    time_before(jiffies, ci->i_hold_caps_max))
 			break;
 		list_del_init(&ci->i_cap_delay_list);
+
+		inode = igrab(&ci->vfs_inode);
 		spin_unlock(&mdsc->cap_delay_lock);
-		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
-		ceph_check_caps(ci, flags, NULL);
+
+		if (inode) {
+			dout("check_delayed_caps on %p\n", inode);
+			ceph_check_caps(ci, flags, NULL);
+			iput(inode);
+		}
 	}
 	spin_unlock(&mdsc->cap_delay_lock);
 }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 29308a80d66f..3d48c415f3cb 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1040,8 +1040,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	int num_pages;
 	int written = 0;
 	int flags;
-	int check_caps = 0;
 	int ret;
+	bool check_caps = false;
 	struct timespec mtime = current_time(inode);
 	size_t count = iov_iter_count(from);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4de6cdddf059..220dfd87cbfa 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1016,6 +1016,7 @@ static void update_dentry_lease(struct dentry *dentry,
 	long unsigned ttl = from_time + (duration * HZ) / 1000;
 	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
 	struct inode *dir;
+	struct ceph_mds_session *old_lease_session = NULL;
 
 	/*
 	 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
@@ -1051,8 +1052,10 @@ static void update_dentry_lease(struct dentry *dentry,
 	    time_before(ttl, di->time))
 		goto out_unlock;  /* we already have a newer lease. */
 
-	if (di->lease_session && di->lease_session != session)
-		goto out_unlock;
+	if (di->lease_session && di->lease_session != session) {
+		old_lease_session = di->lease_session;
+		di->lease_session = NULL;
+	}
 
 	ceph_dentry_lru_touch(dentry);
 
@@ -1065,6 +1068,8 @@ static void update_dentry_lease(struct dentry *dentry,
 	di->time = ttl;
 out_unlock:
 	spin_unlock(&dentry->d_lock);
+	if (old_lease_session)
+		ceph_put_mds_session(old_lease_session);
 	return;
 }
 
@@ -1653,20 +1658,17 @@ out:
 	return err;
 }
 
-int ceph_inode_set_size(struct inode *inode, loff_t size)
+bool ceph_inode_set_size(struct inode *inode, loff_t size)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	int ret = 0;
+	bool ret;
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
 	i_size_write(inode, size);
 	inode->i_blocks = calc_inode_blocks(size);
 
-	/* tell the MDS if we are approaching max_size */
-	if ((size << 1) >= ci->i_max_size &&
-	    (ci->i_reported_size << 1) < ci->i_max_size)
-		ret = 1;
+	ret = __ceph_should_report_size(ci);
 
 	spin_unlock(&ci->i_ceph_lock);
 	return ret;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 6806dbeaee19..64ae74472046 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -127,6 +127,29 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 	dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
 	     req->r_tid);
 
+	mutex_lock(&mdsc->mutex);
+	if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
+		err = 0;
+	} else {
+		/*
+		 * ensure we aren't running concurrently with
+		 * ceph_fill_trace or ceph_readdir_prepopulate, which
+		 * rely on locks (dir mutex) held by our caller.
+		 */
+		mutex_lock(&req->r_fill_mutex);
+		req->r_err = err;
+		set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+		mutex_unlock(&req->r_fill_mutex);
+
+		if (!req->r_session) {
+			// haven't sent the request
+			err = 0;
+		}
+	}
+	mutex_unlock(&mdsc->mutex);
+	if (!err)
+		return 0;
+
 	intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
 					    USE_AUTH_MDS);
 	if (IS_ERR(intr_req))
@@ -146,7 +169,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 	if (err && err != -ERESTARTSYS)
 		return err;
 
-	wait_for_completion(&req->r_completion);
+	wait_for_completion_killable(&req->r_safe_completion);
 	return 0;
 }
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 0c05df44cc6c..666a9f274832 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3769,13 +3769,13 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
 {
 	struct ceph_mds_client *mdsc = fsc->mdsc;
-
 	dout("mdsc_destroy %p\n", mdsc);
-	ceph_mdsc_stop(mdsc);
 
 	/* flush out any connection work with references to us */
 	ceph_msgr_flush();
 
+	ceph_mdsc_stop(mdsc);
+
 	fsc->mdsc = NULL;
 	kfree(mdsc);
 	dout("mdsc_destroy %p done\n", mdsc);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 8d7918ce694a..aa06a8c24792 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -121,6 +121,7 @@ enum {
 	/* int args above */
 	Opt_snapdirname,
 	Opt_mds_namespace,
+	Opt_fscache_uniq,
 	Opt_last_string,
 	/* string args above */
 	Opt_dirstat,
@@ -158,6 +159,7 @@ static match_table_t fsopt_tokens = {
 	/* int args above */
 	{Opt_snapdirname, "snapdirname=%s"},
 	{Opt_mds_namespace, "mds_namespace=%s"},
+	{Opt_fscache_uniq, "fsc=%s"},
 	/* string args above */
 	{Opt_dirstat, "dirstat"},
 	{Opt_nodirstat, "nodirstat"},
@@ -223,6 +225,14 @@ static int parse_fsopt_token(char *c, void *private)
 		if (!fsopt->mds_namespace)
 			return -ENOMEM;
 		break;
+	case Opt_fscache_uniq:
+		fsopt->fscache_uniq = kstrndup(argstr[0].from,
+					       argstr[0].to-argstr[0].from,
+					       GFP_KERNEL);
+		if (!fsopt->fscache_uniq)
+			return -ENOMEM;
+		fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
+		break;
 	/* misc */
 	case Opt_wsize:
 		fsopt->wsize = intval;
 		break;
@@ -317,6 +327,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
 	kfree(args->snapdir_name);
 	kfree(args->mds_namespace);
 	kfree(args->server_path);
+	kfree(args->fscache_uniq);
 	kfree(args);
 }
 
@@ -350,10 +361,12 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
 	ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
 	if (ret)
 		return ret;
-
 	ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
 	if (ret)
 		return ret;
+	ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
+	if (ret)
+		return ret;
 
 	return ceph_compare_options(new_opt, fsc->client);
 }
@@ -475,8 +488,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",noasyncreaddir");
 	if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
 		seq_puts(m, ",nodcache");
-	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
-		seq_puts(m, ",fsc");
+	if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
+		if (fsopt->fscache_uniq)
+			seq_printf(m, ",fsc=%s", fsopt->fscache_uniq);
+		else
+			seq_puts(m, ",fsc");
+	}
 	if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
 		seq_puts(m, ",nopoolperm");
 
@@ -597,18 +614,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 	if (!fsc->wb_pagevec_pool)
 		goto fail_trunc_wq;
 
-	/* setup fscache */
-	if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
-	    (ceph_fscache_register_fs(fsc) != 0))
-		goto fail_fscache;
-
 	/* caps */
 	fsc->min_caps = fsopt->max_readdir;
 
 	return fsc;
 
-fail_fscache:
-	ceph_fscache_unregister_fs(fsc);
 fail_trunc_wq:
 	destroy_workqueue(fsc->trunc_wq);
 fail_pg_inv_wq:
@@ -626,8 +636,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
 {
 	dout("destroy_fs_client %p\n", fsc);
 
-	ceph_fscache_unregister_fs(fsc);
-
 	destroy_workqueue(fsc->wb_wq);
 	destroy_workqueue(fsc->pg_inv_wq);
 	destroy_workqueue(fsc->trunc_wq);
@@ -636,8 +644,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
 
 	destroy_mount_options(fsc->mount_options);
 
-	ceph_fs_debugfs_cleanup(fsc);
-
 	ceph_destroy_client(fsc->client);
 
 	kfree(fsc);
@@ -822,6 +828,13 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
 		if (err < 0)
 			goto out;
 
+		/* setup fscache */
+		if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
+			err = ceph_fscache_register_fs(fsc);
+			if (err < 0)
+				goto out;
+		}
+
 		if (!fsc->mount_options->server_path) {
 			path = "";
 			dout("mount opening path \\t\n");
@@ -1040,6 +1053,12 @@ static void ceph_kill_sb(struct super_block *s)
 	ceph_mdsc_pre_umount(fsc->mdsc);
 	generic_shutdown_super(s);
+
+	fsc->client->extra_mon_dispatch = NULL;
+	ceph_fs_debugfs_cleanup(fsc);
+
+	ceph_fscache_unregister_fs(fsc);
+
 	ceph_mdsc_destroy(fsc);
 
 	destroy_fs_client(fsc);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index a973acd8beaf..f02a2225fe42 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -73,6 +73,7 @@ struct ceph_mount_options {
 	char *snapdir_name;   /* default ".snap" */
 	char *mds_namespace;  /* default NULL */
 	char *server_path;    /* default "/" */
+	char *fscache_uniq;   /* default NULL */
 };
 
 struct ceph_fs_client {
@@ -793,7 +794,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 
 extern int ceph_inode_holds_cap(struct inode *inode, int mask);
 
-extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
 extern void __ceph_do_pending_vmtruncate(struct inode *inode);
 extern void ceph_queue_vmtruncate(struct inode *inode);
@@ -918,6 +919,7 @@ extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 				       struct ceph_snap_context *snapc);
 extern void ceph_flush_snaps(struct ceph_inode_info *ci,
 			     struct ceph_mds_session **psession);
+extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 			    struct ceph_mds_session *session);
 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 75267cdd5dfd..11263f102e4c 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -756,6 +756,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr) {
+		err = ceph_do_getattr(inode, 0, true);
+		if (err)
+			return err;
 		err = -ENODATA;
 		if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
 			err = vxattr->getxattr_cb(ci, value, size);
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index fd8b2953c78f..f0f6c537b64c 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -2,103 +2,174 @@
 #define __CEPH_FEATURES
 
 /*
- * feature bits
+ * Each time we reclaim bits for reuse we need to specify another bit
+ * that, if present, indicates we have the new incarnation of that
+ * feature.  Base case is 1 (first use).
  */
-#define CEPH_FEATURE_UID            (1ULL<<0)
-#define CEPH_FEATURE_NOSRCADDR      (1ULL<<1)
-#define CEPH_FEATURE_MONCLOCKCHECK  (1ULL<<2)
-#define CEPH_FEATURE_FLOCK          (1ULL<<3)
-#define CEPH_FEATURE_SUBSCRIBE2     (1ULL<<4)
-#define CEPH_FEATURE_MONNAMES       (1ULL<<5)
-#define CEPH_FEATURE_RECONNECT_SEQ  (1ULL<<6)
-#define CEPH_FEATURE_DIRLAYOUTHASH  (1ULL<<7)
-#define CEPH_FEATURE_OBJECTLOCATOR  (1ULL<<8)
-#define CEPH_FEATURE_PGID64         (1ULL<<9)
-#define CEPH_FEATURE_INCSUBOSDMAP   (1ULL<<10)
-#define CEPH_FEATURE_PGPOOL3        (1ULL<<11)
-#define CEPH_FEATURE_OSDREPLYMUX    (1ULL<<12)
-#define CEPH_FEATURE_OSDENC         (1ULL<<13)
-#define CEPH_FEATURE_OMAP           (1ULL<<14)
-#define CEPH_FEATURE_MONENC         (1ULL<<15)
-#define CEPH_FEATURE_QUERY_T        (1ULL<<16)
-#define CEPH_FEATURE_INDEP_PG_MAP   (1ULL<<17)
-#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
-#define CEPH_FEATURE_CHUNKY_SCRUB   (1ULL<<19)
-#define CEPH_FEATURE_MON_NULLROUTE  (1ULL<<20)
-#define CEPH_FEATURE_MON_GV         (1ULL<<21)
-#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
-#define CEPH_FEATURE_MSG_AUTH       (1ULL<<23)
-#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
-#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
-#define CEPH_FEATURE_CREATEPOOLID   (1ULL<<26)
-#define CEPH_FEATURE_REPLY_CREATE_INODE   (1ULL<<27)
-#define CEPH_FEATURE_OSD_HBMSGS     (1ULL<<28)
-#define CEPH_FEATURE_MDSENC         (1ULL<<29)
-#define CEPH_FEATURE_OSDHASHPSPOOL  (1ULL<<30)
-#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
-#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
-#define CEPH_FEATURE_MON_SCRUB      (1ULL<<33)
-#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
-#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
-#define CEPH_FEATURE_CRUSH_V2      (1ULL<<36)  /* new indep; SET_* steps */
-#define CEPH_FEATURE_EXPORT_PEER   (1ULL<<37)
-#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
-#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38)   /* overlap with EC */
-/* The process supports new-style OSDMap encoding. Monitors also use
-   this bit to determine if peers support NAK messages. */
-#define CEPH_FEATURE_OSDMAP_ENC   (1ULL<<39)
-#define CEPH_FEATURE_MDS_INLINE_DATA     (1ULL<<40)
-#define CEPH_FEATURE_CRUSH_TUNABLES3     (1ULL<<41)
-#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41)  /* overlap w/ tunables3 */
-#define CEPH_FEATURE_MSGR_KEEPALIVE2   (1ULL<<42)
-#define CEPH_FEATURE_OSD_POOLRESEND    (1ULL<<43)
-#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44)
-#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
-#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46)
-#define CEPH_FEATURE_OSD_REPOP         (1ULL<<46)   /* overlap with fadvise */
-#define CEPH_FEATURE_OSD_OBJECT_DIGEST  (1ULL<<46)  /* overlap with fadvise */
-#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
-#define CEPH_FEATURE_MDS_QUOTA      (1ULL<<47)
-#define CEPH_FEATURE_CRUSH_V4      (1ULL<<48)  /* straw2 buckets */
-#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
-// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
-#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49)  /* overlap w/ above */
-#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
-#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
-#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
-#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
-#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
-#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
-#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */
-#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
-#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
-#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */
-// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
-#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */
-#define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */
+#define CEPH_FEATURE_INCARNATION_1 (0ull)
+#define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
+
+#define DEFINE_CEPH_FEATURE(bit, incarnation, name)			\
+	const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit);	\
+	const static uint64_t CEPH_FEATUREMASK_##name =			\
+		(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
+
+/* this bit is ignored but still advertised by release *when* */
+#define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when)	\
+	const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
+	const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name =	\
+		(1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
 
 /*
- * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
- * vector to evaluate to 64 bit ~0.  To cope, we designate 1ULL << 63
- * to mean 33 bit ~0, and introduce a helper below to do the
- * translation.
+ * this bit is ignored by release *unused* and not advertised by
+ * release *unadvertised*
+ */
+#define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
+
+
+/*
+ * test for a feature.  this test is safer than a typical mask against
+ * the bit because it ensures that we have the bit AND the marker for the
+ * bit's incarnation.  this must be used in any case where the features
+ * bits may include an old meaning of the bit.
+ */
+#define CEPH_HAVE_FEATURE(x, name)			\
+	(((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
+
+
+/*
+ * Notes on deprecation:
+ *
+ * A *major* release is a release through which all upgrades must pass
+ * (e.g., jewel).  For example, no pre-jewel server will ever talk to
+ * a post-jewel server (mon, osd, etc).
+ *
+ * For feature bits used *only* on the server-side:
+ *
+ * - In the first phase we indicate that a feature is DEPRECATED as of
+ *   a particular release.  This is the first major release X (say,
+ *   jewel) that does not depend on its peers advertising the feature.
+ *   That is, it safely assumes its peers all have the feature.  We
+ *   indicate this with the DEPRECATED macro.  For example,
+ *
+ *     DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
+ *
+ *   because 10.2.z (jewel) did not care if its peers advertised this
+ *   feature bit.
+ *
+ * - In the second phase we stop advertising the the bit and call it
+ *   RETIRED.  This can normally be done in the *next* major release
+ *   following the one in which we marked the feature DEPRECATED.  In
+ *   the above example, for 12.0.z (luminous) we can say:
+ *
+ *     DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
  *
- * This was introduced by ceph.git commit
- *   9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8
- * and fixed by ceph.git commit
- *   4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c
+ * - The bit can be reused in the first post-luminous release, 13.0.z
+ *   (m).
+ *
+ * This ensures that no two versions who have different meanings for
+ * the bit ever speak to each other.
  */
-#define CEPH_FEATURE_RESERVED (1ULL<<63)
-
-static inline u64 ceph_sanitize_features(u64 features)
-{
-	if (features & CEPH_FEATURE_RESERVED) {
-		/* everything through OSD_SNAPMAPPER */
-		return 0x1ffffffffull;
-	} else {
-		return features;
-	}
-}
+
+DEFINE_CEPH_FEATURE( 0, 1, UID)
+DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
+DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
+DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
+DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
+DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)
+DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)
+DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
+DEFINE_CEPH_FEATURE( 9, 1, PGID64)
+DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
+DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)
+DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
+DEFINE_CEPH_FEATURE(13, 1, OSDENC)
+DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
+DEFINE_CEPH_FEATURE(15, 1, MONENC)
+DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
+DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
+DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT)	// overlap
+DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF)	// overlap
+DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP)	// overlap
+DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS)	// overlap
+DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
+DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS)
+
+DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
+DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
+DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
+DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
+DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
+DEFINE_CEPH_FEATURE(29, 1, MDSENC)
+DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
+DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS)	// deprecate me
+DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)
+DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)
+DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)
+DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES)
+DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP)	// overlap
+DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)
+DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)
+DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)
+DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY)	// overlap
+DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)
+DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)
+DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2)
+DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
+
+DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS)	// overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
+DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)
+DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
+DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
+
+DEFINE_CEPH_FEATURE(50, 1, MON_METADATA)
+DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT)
+DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES)
+DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3)
+DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT)
+DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4)
+DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING)
+DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB)
+DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP)	// overlap
+DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT)	// overlap
+DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL)	// overlap
+DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)
+DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING)	// overlap
+DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2)	// overlap
+DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
+DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR)	// overlap
+DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2)	// overlap
+DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING)	// *do not share this bit*
+
+DEFINE_CEPH_FEATURE(61, 1, RESERVED2)	// unused, but slow down!
+DEFINE_CEPH_FEATURE(62, 1, RESERVED)	// do not use; used as a sentinal
+DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
+
 
 /*
  * Features supported.
@@ -113,6 +184,11 @@ static inline u64 ceph_sanitize_features(u64 features)
 	 CEPH_FEATURE_PGPOOL3 |			\
 	 CEPH_FEATURE_OSDENC |			\
 	 CEPH_FEATURE_CRUSH_TUNABLES |		\
+	 CEPH_FEATURE_SERVER_LUMINOUS |		\
+	 CEPH_FEATURE_RESEND_ON_SPLIT |		\
+	 CEPH_FEATURE_RADOS_BACKOFF |		\
+	 CEPH_FEATURE_OSDMAP_PG_UPMAP |		\
+	 CEPH_FEATURE_CRUSH_CHOOSE_ARGS |	\
 	 CEPH_FEATURE_MSG_AUTH |		\
	 CEPH_FEATURE_CRUSH_TUNABLES2 |		\
 	 CEPH_FEATURE_REPLY_CREATE_INODE |	\
@@ -126,7 +202,11 @@ static inline u64 ceph_sanitize_features(u64 features)
 	 CEPH_FEATURE_CRUSH_TUNABLES3 |		\
 	 CEPH_FEATURE_OSD_PRIMARY_AFFINITY |	\
 	 CEPH_FEATURE_MSGR_KEEPALIVE2 |		\
+	 CEPH_FEATURE_OSD_POOLRESEND |		\
 	 CEPH_FEATURE_CRUSH_V4 |		\
+	 CEPH_FEATURE_NEW_OSDOP_ENCODING |	\
+	 CEPH_FEATURE_SERVER_JEWEL |		\
+	 CEPH_FEATURE_MON_STATEFUL_SUB |	\
 	 CEPH_FEATURE_CRUSH_TUNABLES5 |		\
 	 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ad078ebe25d6..edf5b04b918a 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -147,6 +147,7 @@ struct ceph_dir_layout {
 #define CEPH_MSG_OSD_OP                 42
 #define CEPH_MSG_OSD_OPREPLY            43
 #define CEPH_MSG_WATCH_NOTIFY           44
+#define CEPH_MSG_OSD_BACKOFF            61
 
 
 /* watch-notify operations */
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index f990f2cc907a..14af9b70d301 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -133,6 +133,66 @@ bad:
 }
 
 /*
+ * skip helpers
+ */
+#define ceph_decode_skip_n(p, end, n, bad)			\
+	do {							\
+		ceph_decode_need(p, end, n, bad);		\
+		*p += n;					\
+	} while (0)
+
+#define ceph_decode_skip_64(p, end, bad)			\
+ceph_decode_skip_n(p, end, sizeof(u64), bad)
+
+#define ceph_decode_skip_32(p, end, bad)			\
+ceph_decode_skip_n(p, end, sizeof(u32), bad)
+
+#define ceph_decode_skip_16(p, end, bad)			\
+ceph_decode_skip_n(p, end, sizeof(u16), bad)
+
+#define ceph_decode_skip_8(p, end, bad)				\
+ceph_decode_skip_n(p, end, sizeof(u8), bad)
+
+#define ceph_decode_skip_string(p, end, bad)			\
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		ceph_decode_skip_n(p, end, len, bad);		\
+	} while (0)
+
+#define ceph_decode_skip_set(p, end, type, bad)			\
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		while (len--)					\
+			ceph_decode_skip_##type(p, end, bad);	\
+	} while (0)
+
+#define ceph_decode_skip_map(p, end, ktype, vtype, bad)		\
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		while (len--) {					\
+			ceph_decode_skip_##ktype(p, end, bad);	\
+			ceph_decode_skip_##vtype(p, end, bad);	\
+		}						\
+	} while (0)
+
+#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \
+	do {							\
+		u32 len;					\
+								\
+		ceph_decode_32_safe(p, end, len, bad);		\
+		while (len--) {					\
+			ceph_decode_skip_##ktype1(p, end, bad);	\
+			ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \
+		}						\
+	} while (0)
+
+/*
  * struct ceph_timespec <-> struct timespec
  */
 static inline void ceph_decode_timespec(struct timespec *ts,
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 3229ae6c7846..8a79587e1317 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -184,10 +184,11 @@ static inline int calc_pages_for(u64 off, u64 len)
 	    (off >> PAGE_SHIFT);
 }
 
-/*
- * These are not meant to be generic - an integer key is assumed.
- */
-#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)		\
+#define RB_BYVAL(a)      (a)
+#define RB_BYPTR(a)      (&(a))
+#define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
+
+#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
 static void insert_##name(struct rb_root *root, type *t)		\
 {									\
 	struct rb_node **n = &root->rb_node;				\
@@ -197,11 +198,13 @@ static void insert_##name(struct rb_root *root, type *t)		\
 									\
 	while (*n) {							\
 		type *cur = rb_entry(*n, type, nodefld);		\
+		int cmp;						\
 									\
 		parent = *n;						\
-		if (t->keyfld < cur->keyfld)				\
+		cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld));	\
+		if (cmp < 0)						\
 			n = &(*n)->rb_left;				\
-		else if (t->keyfld > cur->keyfld)			\
+		else if (cmp > 0)					\
 			n = &(*n)->rb_right;				\
 		else							\
 			BUG();						\
@@ -217,19 +220,24 @@ static void erase_##name(struct rb_root *root, type *t)		\
 	RB_CLEAR_NODE(&t->nodefld);					\
 }
 
-#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)		\
-extern type __lookup_##name##_key;					\
-static type *lookup_##name(struct rb_root *root,			\
-			   typeof(__lookup_##name##_key.keyfld) key)	\
+/*
+ * @lookup_param_type is a parameter and not constructed from (@type,
+ * @keyfld) with typeof() because adding const is too unwieldy.
+ */
+#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp,	\
+			       lookup_param_type, nodefld)		\
+static type *lookup_##name(struct rb_root *root, lookup_param_type key) \
 {									\
 	struct rb_node *n = root->rb_node;				\
 									\
 	while (n) {							\
 		type *cur = rb_entry(n, type, nodefld);			\
+		int cmp;						\
 									\
-		if (key < cur->keyfld)					\
+		cmp = cmpexp(key, keyexp(cur->keyfld));			\
+		if (cmp < 0)						\
 			n = n->rb_left;					\
-		else if (key > cur->keyfld)				\
+		else if (cmp > 0)					\
 			n = n->rb_right;				\
 		else							\
 			return cur;					\
@@ -238,6 +246,23 @@ static type *lookup_##name(struct rb_root *root,			\
 	return NULL;							\
 }
 
+#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp,		\
+			 lookup_param_type, nodefld)			\
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld)	\
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp,		\
+		       lookup_param_type, nodefld)
+
+/*
+ * Shorthands for integer keys.
+ */
+#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)		\
+DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld)
+
+#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)		\
+extern type __lookup_##name##_key;					\
+DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL,	\
+		       typeof(__lookup_##name##_key.keyfld), nodefld)
+
 #define DEFINE_RB_FUNCS(name, type, keyfld, nodefld)		\
 DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld)		\
 DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index c5c4c713e00f..fbd94d9fa5dd 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -44,6 +44,8 @@ struct ceph_connection_operations {
 					struct ceph_msg_header *hdr,
 					int *skip);
 
+	void (*reencode_message) (struct ceph_msg *msg);
+
 	int (*sign_message) (struct ceph_msg *msg);
 	int (*check_message_signature) (struct ceph_msg *msg);
 };
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 85650b415e73..c6d96a5f46fd 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -1,6 +1,7 @@
 #ifndef _FS_CEPH_OSD_CLIENT_H
 #define _FS_CEPH_OSD_CLIENT_H
 
+#include <linux/bitrev.h>
 #include <linux/completion.h>
 #include <linux/kref.h>
 #include <linux/mempool.h>
@@ -36,6 +37,8 @@ struct ceph_osd {
 	struct ceph_connection o_con;
 	struct rb_root o_requests;
 	struct rb_root o_linger_requests;
+	struct rb_root o_backoff_mappings;
+	struct rb_root o_backoffs_by_id;
 	struct list_head o_osd_lru;
 	struct ceph_auth_handshake o_auth;
 	unsigned long lru_ttl;
@@ -136,7 +139,8 @@ struct ceph_osd_request_target {
 	struct ceph_object_id target_oid;
 	struct ceph_object_locator target_oloc;
 
-	struct ceph_pg pgid;
+	struct ceph_pg pgid;		/* last raw pg we mapped to */
+	struct ceph_spg spgid;		/* last actual spg we mapped to */
 	u32 pg_num;
 	u32 pg_num_mask;
 	struct ceph_osds acting;
@@ -148,6 +152,9 @@ struct ceph_osd_request_target {
 	unsigned int flags;		/* CEPH_OSD_FLAG_* */
 	bool paused;
 
+	u32 epoch;
+	u32 last_force_resend;
+
 	int osd;
 };
 
@@ -193,7 +200,6 @@ struct ceph_osd_request {
 	unsigned long r_stamp;		/* jiffies, send or check time */
 	unsigned long r_start_stamp;	/* jiffies */
 	int r_attempts;
-	u32 r_last_force_resend;
 	u32 r_map_dne_bound;
 
 	struct ceph_osd_req_op r_ops[];
@@ -203,6 +209,23 @@ struct ceph_request_redirect {
 	struct ceph_object_locator oloc;
 };
 
+/*
+ * osd request identifier
+ *
+ * caller name + incarnation# + tid to unique identify this request
+ */
+struct ceph_osd_reqid {
+	struct ceph_entity_name name;
+	__le64 tid;
+	__le32 inc;
+} __packed;
+
+struct ceph_blkin_trace_info {
+	__le64 trace_id;
+	__le64 span_id;
+	__le64 parent_span_id;
+} __packed;
+
 typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
 				 u64 notifier_id, void *data, size_t data_len);
 typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
@@ -221,7 +244,6 @@ struct ceph_osd_linger_request {
 	struct list_head pending_lworks;
 
 	struct ceph_osd_request_target t;
-	u32 last_force_resend;
 	u32 map_dne_bound;
 
 	struct timespec mtime;
@@ -256,6 +278,48 @@ struct ceph_watch_item {
 	struct ceph_entity_addr addr;
 };
 
+struct ceph_spg_mapping {
+	struct rb_node node;
+	struct ceph_spg spgid;
+
+	struct rb_root backoffs;
+};
+
+struct ceph_hobject_id {
+	void *key;
+	size_t key_len;
+	void *oid;
+	size_t oid_len;
+	u64 snapid;
+	u32 hash;
+	u8 is_max;
+	void *nspace;
+	size_t nspace_len;
+	s64 pool;
+
+	/* cache */
+	u32 hash_reverse_bits;
+};
+
+static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
+{
+	hoid->hash_reverse_bits = bitrev32(hoid->hash);
+}
+
+/*
+ * PG-wide backoff: [begin, end)
+ * per-object backoff: begin == end
+ */
+struct ceph_osd_backoff {
+	struct rb_node spg_node;
+	struct rb_node id_node;
+
+	struct ceph_spg spgid;
+	u64 id;
+	struct ceph_hobject_id *begin;
+	struct ceph_hobject_id *end;
+};
+
 #define CEPH_LINGER_ID_START	0xffff000000000000ULL
 
 struct ceph_osd_client {
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 938656f70807..a0996cb9faed 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -24,7 +24,15 @@ struct ceph_pg {
 	uint32_t seed;
 };
 
+#define CEPH_SPG_NOSHARD	-1
+
+struct ceph_spg {
+	struct ceph_pg pgid;
+	s8 shard;
+};
+
 int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
+int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
 
 #define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0) /* hash pg seed and pool id
						       together */
@@ -135,10 +143,14 @@ struct ceph_pg_mapping {
 		struct {
 			int len;
 			int osds[];
-		} pg_temp;
+		} pg_temp, pg_upmap;
 		struct {
 			int osd;
 		} primary_temp;
+		struct {
+			int len;
+			int from_to[][2];
+		} pg_upmap_items;
 	};
 };
 
@@ -150,13 +162,17 @@ struct ceph_osdmap {
 	u32 flags;         /* CEPH_OSDMAP_* */
 
 	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
-	u8 *osd_state;     /* CEPH_OSD_* */
+	u32 *osd_state;    /* CEPH_OSD_* */
 	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
 	struct ceph_entity_addr *osd_addr;
 
 	struct rb_root pg_temp;
 	struct rb_root primary_temp;
 
+	/* remap (post-CRUSH, pre-up) */
+	struct rb_root pg_upmap;	/* PG := raw set */
+	struct rb_root pg_upmap_items;	/* from -> to within raw set */
+
 	u32 *osd_primary_affinity;
 
 	struct rb_root pg_pools;
@@ -187,7 +203,7 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
 	return !ceph_osd_is_up(map, osd);
 }
 
-extern char *ceph_osdmap_state_str(char *str, int len, int state);
+char *ceph_osdmap_state_str(char *str, int len, u32 state);
 extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
 
 static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
@@ -198,11 +214,13 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
 	return &map->osd_addr[osd];
 }
 
+#define CEPH_PGID_ENCODING_LEN		(1 + 8 + 4 + 4)
+
 static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
 {
 	__u8 version;
 
-	if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) {
+	if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
 		pr_warn("incomplete pg encoding\n");
 		return -EINVAL;
 	}
@@ -240,6 +258,8 @@ static inline void ceph_osds_init(struct ceph_osds *set)
 
 void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
 
+bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
+		      u32 new_pg_num);
 bool ceph_is_new_interval(const struct ceph_osds *old_acting,
 			  const struct ceph_osds *new_acting,
 			  const struct ceph_osds *old_up,
@@ -262,15 +282,24 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 					 u64 off, u64 len,
 					 u64 *bno, u64 *oxoff, u64 *oxlen);
 
+int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+				const struct ceph_object_id *oid,
+				const struct ceph_object_locator *oloc,
+				struct ceph_pg *raw_pgid);
 int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
-			      struct ceph_object_id *oid,
-			      struct ceph_object_locator *oloc,
+			      const struct ceph_object_id *oid,
+			      const struct ceph_object_locator *oloc,
 			      struct ceph_pg *raw_pgid);
 
 void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       struct ceph_pg_pool_info *pi,
 			       const struct ceph_pg *raw_pgid,
 			       struct ceph_osds *up,
 			       struct ceph_osds *acting);
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+			      struct ceph_pg_pool_info *pi,
+			      const struct ceph_pg *raw_pgid,
+			      struct ceph_spg *spgid);
 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 			      const struct ceph_pg *raw_pgid);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index 5d0018782d50..385db08bb8b2 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -439,6 +439,12 @@ enum {
 
 const char *ceph_osd_watch_op_name(int o);
 
+enum {
+	CEPH_OSD_BACKOFF_OP_BLOCK = 1,
+	CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
+	CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
+};
+
 /*
  * an individual object operation.  each may be accompanied by some data
  * payload
diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index fbecbd089d75..92e165d417a6 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -2,6 +2,7 @@
 #define CEPH_CRUSH_CRUSH_H
 
 #ifdef __KERNEL__
+# include <linux/rbtree.h>
 # include <linux/types.h>
 #else
 # include "crush_compat.h"
@@ -137,6 +138,68 @@ struct crush_bucket {
 
 };
 
+/** @ingroup API
+ *
+ * Replacement weights for each item in a bucket. The size of the
+ * array must be exactly the size of the straw2 bucket, just as the
+ * item_weights array.
+ *
+ */
+struct crush_weight_set {
+	__u32 *weights; /*!< 16.16 fixed point weights
+			     in the same order as items */
+	__u32 size;     /*!< size of the __weights__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for a given straw2 bucket, for
+ * placement purposes.
+ *
+ * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
+ * replacement weights found at __weight_set[N]__ are used instead of
+ * the weights from __item_weights__. If __N__ is greater than
+ * __weight_set_size__, the weights found at __weight_set_size-1__ are
+ * used instead. For instance if __weight_set__ is:
+ *
+ *    [ [ 0x10000, 0x20000 ],   // position 0
+ *      [ 0x20000, 0x40000 ] ]  // position 1
+ *
+ * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
+ * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
+ * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
+ * etc.
+ *
+ */
+struct crush_choose_arg {
+	__s32 *ids;            /*!< values to use instead of items */
+	__u32 ids_size;        /*!< size of the __ids__ array */
+	struct crush_weight_set *weight_set; /*!< weight replacements for
+						  a given position */
+	__u32 weight_set_size; /*!< size of the __weight_set__ array */
+};
+
+/** @ingroup API
+ *
+ * Replacement weights and ids for each bucket in the crushmap. The
+ * __size__ of the __args__ array must be exactly the same as the
+ * __map->max_buckets__.
+ *
+ * The __crush_choose_arg__ at index N will be used when choosing
+ * an item from the bucket __map->buckets[N]__ bucket, provided it
+ * is a straw2 bucket.
+ *
+ */
+struct crush_choose_arg_map {
+#ifdef __KERNEL__
+	struct rb_node node;
+	u64 choose_args_index;
+#endif
+	struct crush_choose_arg *args; /*!< replacement for each bucket
+					    in the crushmap */
+	__u32 size;                    /*!< size of the __args__ array */
+};
+
 struct crush_bucket_uniform {
 	struct crush_bucket h;
 	__u32 item_weight;  /* 16-bit fixed point; all items equally weighted */
@@ -236,6 +299,9 @@ struct crush_map {
 	__u32 allowed_bucket_algs;
 
 	__u32 *choose_tries;
+#else
+	/* CrushWrapper::choose_args */
+	struct rb_root choose_args;
 #endif
 };
 
diff --git a/include/linux/crush/mapper.h b/include/linux/crush/mapper.h
index c95e19e1ff11..141edabb947e 100644
--- a/include/linux/crush/mapper.h
+++ b/include/linux/crush/mapper.h
@@ -11,11 +11,10 @@
 #include "crush.h"
 
 extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
-extern int crush_do_rule(const struct crush_map *map,
-			 int ruleno,
-			 int x, int *result, int result_max,
-			 const __u32 *weights, int weight_max,
-			 void *cwin);
+int crush_do_rule(const struct crush_map *map,
+		  int ruleno, int x, int *result, int result_max,
+		  const __u32 *weight, int weight_max,
+		  void *cwin, const struct crush_choose_arg *choose_args);
 
 /*
  * Returns the exact amount of workspace that will need to be used
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 47e94b560ba0..3d265c5cb6d0 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type)
 	case CEPH_MSG_OSD_OP: return "osd_op";
 	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
 	case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
+	case CEPH_MSG_OSD_BACKOFF: return "osd_backoff";
 	default: return "unknown";
 	}
 }
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 5bf94c04f645..4b428f46a8ca 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -1,6 +1,7 @@
 #ifdef __KERNEL__
 # include <linux/slab.h>
 # include <linux/crush/crush.h>
+void clear_choose_args(struct crush_map *c);
 #else
 # include "crush_compat.h"
 # include "crush.h"
@@ -127,6 +128,8 @@ void crush_destroy(struct crush_map *map)
 
 #ifndef __KERNEL__
 	kfree(map->choose_tries);
+#else
+	clear_choose_args(map);
 #endif
 	kfree(map);
 }
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b5cd8c21bfdf..746b145bfd11 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -302,19 +302,42 @@ static __u64 crush_ln(unsigned int xin)
  *
  */
 
+static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
+				     const struct crush_choose_arg *arg,
+				     int position)
+{
+	if (!arg || !arg->weight_set || arg->weight_set_size == 0)
+		return bucket->item_weights;
+
+	if (position >= arg->weight_set_size)
+		position = arg->weight_set_size - 1;
+	return arg->weight_set[position].weights;
+}
+
+static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
+				 const struct crush_choose_arg *arg)
+{
+	if (!arg || !arg->ids)
+		return bucket->h.items;
+
+	return arg->ids;
+}
+
 static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
-				int x, int r)
+				int x, int r,
+				const struct crush_choose_arg *arg,
+				int position)
 {
 	unsigned int i, high = 0;
 	unsigned int u;
-	unsigned int w;
 	__s64 ln, draw, high_draw = 0;
+	__u32 *weights = get_choose_arg_weights(bucket, arg, position);
+	__s32 *ids = get_choose_arg_ids(bucket, arg);
 
 	for (i = 0; i < bucket->h.size; i++) {
-		w = bucket->item_weights[i];
-		if (w) {
-			u = crush_hash32_3(bucket->h.hash, x,
-					   bucket->h.items[i], r);
+		dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
+		if (weights[i]) {
+			u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
 			u &= 0xffff;
 
 			/*
@@ -335,7 +358,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
 			 * weight means a larger (less negative) value
 			 * for draw.
 			 */
-			draw = div64_s64(ln, w);
+			draw = div64_s64(ln, weights[i]);
 		} else {
 			draw = S64_MIN;
 		}
@@ -352,7 +375,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
 
 static int crush_bucket_choose(const struct crush_bucket *in,
 			       struct crush_work_bucket *work,
-			       int x, int r)
+			       int x, int r,
+			       const struct crush_choose_arg *arg,
+			       int position)
 {
 	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 	BUG_ON(in->size == 0);
@@ -374,7 +399,7 @@ static int crush_bucket_choose(const struct crush_bucket *in,
 	case CRUSH_BUCKET_STRAW2:
 		return bucket_straw2_choose(
 			(const struct crush_bucket_straw2 *)in,
-			x, r);
+			x, r, arg, position);
 	default:
 		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
 		return in->items[0];
@@ -436,7 +461,8 @@ static int crush_choose_firstn(const struct crush_map *map,
 			       unsigned int vary_r,
 			       unsigned int stable,
 			       int *out2,
-			       int parent_r)
+			       int parent_r,
+			       const struct crush_choose_arg *choose_args)
 {
 	int rep;
 	unsigned int ftotal, flocal;
@@ -486,7 +512,10 @@ static int crush_choose_firstn(const struct crush_map *map,
 				else
 					item = crush_bucket_choose(
 						in, work->work[-1-in->id],
-						x, r);
+						x, r,
+						(choose_args ?
+						 &choose_args[-1-in->id] : 0),
+						outpos);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					skip_rep = 1;
@@ -543,7 +572,8 @@ static int crush_choose_firstn(const struct crush_map *map,
 							vary_r,
 							stable,
 							NULL,
-							sub_r) <= outpos)
+							sub_r,
+							choose_args) <= outpos)
 						/* didn't get leaf */
 						reject = 1;
 				} else {
@@ -620,7 +650,8 @@ static void crush_choose_indep(const struct crush_map *map,
 			       unsigned int recurse_tries,
 			       int recurse_to_leaf,
 			       int *out2,
-			       int parent_r)
+			       int parent_r,
+			       const struct crush_choose_arg *choose_args)
 {
 	const struct crush_bucket *in = bucket;
 	int endpos = outpos + left;
@@ -692,7 +723,10 @@ static void crush_choose_indep(const struct crush_map *map,
 
 				item = crush_bucket_choose(
 					in, work->work[-1-in->id],
-					x, r);
+					x, r,
+					(choose_args ?
+					 &choose_args[-1-in->id] : 0),
+					outpos);
 				if (item >= map->max_devices) {
 					dprintk("   bad item %d\n", item);
 					out[rep] = CRUSH_ITEM_NONE;
@@ -746,7 +780,8 @@ static void crush_choose_indep(const struct crush_map *map,
 						x, 1, numrep, 0,
 						out2, rep,
 						recurse_tries, 0,
-						0, NULL, r);
+						0, NULL, r,
+						choose_args);
 					if (out2[rep] == CRUSH_ITEM_NONE) {
 						/* placed nothing; no leaf */
 						break;
@@ -823,7 +858,7 @@ void crush_init_workspace(const struct crush_map *map, void *v)
 	 * set the pointer first and then reserve the space for it to
 	 * point to by incrementing the point.
 	 */
-	v += sizeof(struct crush_work *);
+	v += sizeof(struct crush_work);
 	w->work = v;
 	v += map->max_buckets * sizeof(struct crush_work_bucket *);
 	for (b = 0; b < map->max_buckets; ++b) {
@@ -854,11 +889,12 @@ void crush_init_workspace(const struct crush_map *map, void *v)
 * @weight: weight vector (for map leaves)
 * @weight_max: size of weight vector
 * @cwin: pointer to at least crush_work_size() bytes of memory
+ * @choose_args: weights and ids for each known bucket
 */
 int crush_do_rule(const struct crush_map *map,
		  int ruleno, int x, int *result, int result_max,
		  const __u32 *weight, int weight_max,
-		  void *cwin)
+		  void *cwin, const struct crush_choose_arg *choose_args)
 {
 	int result_len;
 	struct crush_work *cw = cwin;
@@ -968,11 +1004,6 @@ int crush_do_rule(const struct crush_map *map,
 			for (i = 0; i < wsize; i++) {
 				int bno;
 
-				/*
-				 * see CRUSH_N, CRUSH_N_MINUS macros.
-				 * basically, numrep <= 0 means relative to
-				 * the provided result_max
-				 */
 				numrep = curstep->arg1;
 				if (numrep <= 0) {
 					numrep += result_max;
@@ -1013,7 +1044,8 @@ int crush_do_rule(const struct crush_map *map,
 						vary_r,
 						stable,
 						c+osize,
-						0);
+						0,
+						choose_args);
 				} else {
 					out_size = ((numrep < (result_max-osize)) ?
 						    numrep : (result_max-osize));
@@ -1030,7 +1062,8 @@ int crush_do_rule(const struct crush_map *map,
 						choose_leaf_tries : 1,
 						recurse_to_leaf,
 						c+osize,
-						0);
+						0,
+						choose_args);
 					osize += out_size;
 				}
 			}
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 71ba13927b3d..fa5233e0d01c 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 	}
 	for (i = 0; i < map->max_osd; i++) {
 		struct ceph_entity_addr *addr = &map->osd_addr[i];
-		int state = map->osd_state[i];
+		u32 state = map->osd_state[i];
 		char sb[64];
 
 		seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
@@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p)
 		seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
 			   pg->pgid.seed, pg->primary_temp.osd);
 	}
+	for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(n, struct ceph_pg_mapping, node);
+
+		seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool,
+			   pg->pgid.seed);
+		for (i = 0; i < pg->pg_upmap.len; i++)
+			seq_printf(s, "%s%d", (i == 0 ? "" : ","),
+				   pg->pg_upmap.osds[i]);
+		seq_printf(s, "]\n");
+	}
+	for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) {
+		struct ceph_pg_mapping *pg =
+			rb_entry(n, struct ceph_pg_mapping, node);
+
+		seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool,
+			   pg->pgid.seed);
+		for (i = 0; i < pg->pg_upmap_items.len; i++)
+			seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","),
+				   pg->pg_upmap_items.from_to[i][0],
+				   pg->pg_upmap_items.from_to[i][1]);
+		seq_printf(s, "]\n");
+	}
 
 	up_read(&osdc->lock);
 	return 0;
@@ -147,17 +170,26 @@ static int monc_show(struct seq_file *s, void *p)
 	return 0;
 }
 
+static void dump_spgid(struct seq_file *s, const struct ceph_spg *spgid)
+{
+	seq_printf(s, "%llu.%x", spgid->pgid.pool, spgid->pgid.seed);
+	if (spgid->shard != CEPH_SPG_NOSHARD)
+		seq_printf(s, "s%d", spgid->shard);
+}
+
 static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
 {
 	int i;
 
-	seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed);
+	seq_printf(s, "osd%d\t%llu.%x\t", t->osd, t->pgid.pool, t->pgid.seed);
+	dump_spgid(s, &t->spgid);
+	seq_puts(s, "\t[");
 	for (i = 0; i < t->up.size; i++)
 		seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
 	seq_printf(s, "]/%d\t[", t->up.primary);
 	for (i = 0; i < t->acting.size; i++)
 		seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
-	seq_printf(s, "]/%d\t", t->acting.primary);
+	seq_printf(s, "]/%d\te%u\t", t->acting.primary, t->epoch);
 	if (t->target_oloc.pool_ns) {
 		seq_printf(s, "%*pE/%*pE\t0x%x",
 			(int)t->target_oloc.pool_ns->len,
@@ -234,6 +266,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
 	mutex_unlock(&osd->lock);
 }
 
+static void dump_snapid(struct seq_file *s, u64 snapid)
+{
+	if (snapid == CEPH_NOSNAP)
+		seq_puts(s, "head");
+	else if (snapid == CEPH_SNAPDIR)
+		seq_puts(s, "snapdir");
+	else
+		seq_printf(s, "%llx", snapid);
+}
+
+static void dump_name_escaped(struct seq_file *s, unsigned char *name,
+			      size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (name[i] == '%' || name[i] == ':' || name[i] == '/' ||
+		    name[i] < 32 || name[i] >= 127) {
+			seq_printf(s, "%%%02x", name[i]);
+		} else {
+			seq_putc(s, name[i]);
+		}
+	}
+}
+
+static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid)
+{
+	if (hoid->snapid == 0 && hoid->hash == 0 && !hoid->is_max &&
+	    hoid->pool == S64_MIN) {
+		seq_puts(s, "MIN");
+		return;
+	}
+	if (hoid->is_max) {
+		seq_puts(s, "MAX");
+		return;
+	}
+	seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits);
+	dump_name_escaped(s, hoid->nspace, hoid->nspace_len);
+	seq_putc(s, ':');
+	dump_name_escaped(s, hoid->key, hoid->key_len);
+	seq_putc(s, ':');
+	dump_name_escaped(s, hoid->oid, hoid->oid_len);
+	seq_putc(s, ':');
+	dump_snapid(s, hoid->snapid);
+}
+
+static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
+{
+	struct rb_node *n;
+
+	mutex_lock(&osd->lock);
+	for (n = rb_first(&osd->o_backoffs_by_id); n; n = rb_next(n)) {
+		struct ceph_osd_backoff *backoff =
+			rb_entry(n, struct ceph_osd_backoff, id_node);
+
+		seq_printf(s, "osd%d\t", osd->o_osd);
+		dump_spgid(s, &backoff->spgid);
+		seq_printf(s, "\t%llu\t", backoff->id);
+		dump_hoid(s, backoff->begin);
+		seq_putc(s, '\t');
+		dump_hoid(s, backoff->end);
+		seq_putc(s, '\n');
+	}
+
+	mutex_unlock(&osd->lock);
+}
+
 static int osdc_show(struct seq_file *s, void *pp)
 {
 	struct ceph_client *client = s->private;
@@ -259,6 +358,13 @@ static int osdc_show(struct seq_file *s, void *pp)
 	}
 	dump_linger_requests(s, &osdc->homeless_osd);
 
+	seq_puts(s, "BACKOFFS\n");
+	for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
+		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+		dump_backoffs(s, osd);
+	}
+
 	up_read(&osdc->lock);
 	return 0;
 }
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 588a91930051..0c31035bbfee 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1288,13 +1288,16 @@ static void prepare_write_message(struct ceph_connection *con)
 		m->hdr.seq = cpu_to_le64(++con->out_seq);
 		m->needs_out_seq = false;
 	}
-	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+	if (con->ops->reencode_message)
+		con->ops->reencode_message(m);
 
 	dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
 	     m, con->out_seq, le16_to_cpu(m->hdr.type),
 	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
 	     m->data_length);
-	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+	WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+	WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
 
 	/* tag + hdr + front + middle */
 	con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
@@ -2033,8 +2036,7 @@ static int process_connect(struct ceph_connection *con)
 {
 	u64 sup_feat = from_msgr(con->msgr)->supported_features;
 	u64 req_feat = from_msgr(con->msgr)->required_features;
-	u64 server_feat = ceph_sanitize_features(
-				le64_to_cpu(con->in_reply.features));
+	u64 server_feat = le64_to_cpu(con->in_reply.features);
 	int ret;
 
 	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 250f11f78609..875675765531 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -6,6 +6,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/debugfs.h>
@@ -297,6 +298,10 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
 
 	mutex_lock(&monc->mutex);
 	if (monc->sub_renew_sent) {
+		/*
+		 * This is only needed for legacy (infernalis or older)
+		 * MONs -- see delayed_work().
+		 */
 		monc->sub_renew_after = monc->sub_renew_sent +
 					(seconds >> 1) * HZ - 1;
 		dout("%s sent %lu duration %d renew after %lu\n", __func__,
@@ -955,7 +960,8 @@ static void delayed_work(struct work_struct *work)
 			__validate_auth(monc);
 		}
 
-		if (is_auth) {
+		if (is_auth &&
+		    !(monc->con.peer_features & CEPH_FEATURE_MON_STATEFUL_SUB)) {
 			unsigned long now = jiffies;
 
 			dout("%s renew subs? now %lu renew after %lu\n",
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 924f07c36ddb..86a9737d8e3f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -12,6 +12,7 @@
 #include <linux/bio.h>
 #endif
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/messenger.h>
@@ -49,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
 			struct ceph_osd_linger_request *lreq);
 static void unlink_linger(struct ceph_osd *osd,
 			  struct ceph_osd_linger_request *lreq);
+static void clear_backoffs(struct ceph_osd *osd);
 
 #if 1
 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
@@ -373,6 +375,7 @@ static void target_copy(struct ceph_osd_request_target *dest,
 	ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
 
 	dest->pgid = src->pgid; /* struct */
+	dest->spgid = src->spgid; /* struct */
 	dest->pg_num = src->pg_num;
 	dest->pg_num_mask = src->pg_num_mask;
 	ceph_osds_copy(&dest->acting, &src->acting);
@@ -384,6 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
 	dest->flags = src->flags;
 	dest->paused = src->paused;
 
+	dest->epoch = src->epoch;
+	dest->last_force_resend = src->last_force_resend;
+
 	dest->osd = src->osd;
 }
 
@@ -537,7 +543,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 }
 EXPORT_SYMBOL(ceph_osdc_alloc_request);
 
-static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
 {
 	return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
 }
@@ -552,17 +558,21 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
 	WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
 
 	/* create request message */
-	msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
-	msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
+	msg_size = CEPH_ENCODING_START_BLK_LEN +
+			CEPH_PGID_ENCODING_LEN + 1; /* spgid */
+	msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
+	msg_size += CEPH_ENCODING_START_BLK_LEN +
+			sizeof(struct ceph_osd_reqid); /* reqid */
+	msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
+	msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
 	msg_size += CEPH_ENCODING_START_BLK_LEN +
 			ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
-	msg_size += 1 + 8 + 4 + 4; /* pgid */
 	msg_size += 4 + req->r_base_oid.name_len; /* oid */
 	msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
 	msg_size += 8; /* snapid */
 	msg_size += 8; /* snap_seq */
 	msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
-	msg_size += 4; /* retry_attempt */
+	msg_size += 4 + 8; /* retry_attempt, features */
 
 	if (req->r_mempool)
 		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -1010,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd)
 	RB_CLEAR_NODE(&osd->o_node);
 	osd->o_requests = RB_ROOT;
 	osd->o_linger_requests = RB_ROOT;
+	osd->o_backoff_mappings = RB_ROOT;
+	osd->o_backoffs_by_id = RB_ROOT;
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	osd->o_incarnation = 1;
@@ -1021,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd)
 	WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
 	WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
 	WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
+	WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
 	WARN_ON(!list_empty(&osd->o_osd_lru));
 	WARN_ON(!list_empty(&osd->o_keepalive_item));
 
@@ -1141,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd)
 		unlink_linger(osd, lreq);
 		link_linger(&osdc->homeless_osd, lreq);
 	}
+	clear_backoffs(osd);
 
 	__remove_osd_from_lru(osd);
 	erase_osd(&osdc->osds, osd);
@@ -1297,7 +1312,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
 		       ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
 		       __pool_full(pi);
 
-	WARN_ON(pi->id != t->base_oloc.pool);
+	WARN_ON(pi->id != t->target_oloc.pool);
 	return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
 	       ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
 	       (osdc->osdmap->epoch < osdc->epoch_barrier);
@@ -1311,19 +1326,21 @@ enum calc_target_result {
 
 static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 					   struct ceph_osd_request_target *t,
-					   u32 *last_force_resend,
+					   struct ceph_connection *con,
 					   bool any_change)
 {
 	struct ceph_pg_pool_info *pi;
 	struct ceph_pg pgid, last_pgid;
 	struct ceph_osds up, acting;
 	bool force_resend = false;
-	bool need_check_tiering = false;
-	bool need_resend = false;
+	bool unpaused = false;
+	bool legacy_change;
+	bool split = false;
 	bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
 	enum calc_target_result ct_res;
 	int ret;
 
+	t->epoch = osdc->osdmap->epoch;
 	pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
 	if (!pi) {
 		t->osd = CEPH_HOMELESS_OSD;
@@ -1332,33 +1349,33 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	}
 
 	if (osdc->osdmap->epoch == pi->last_force_request_resend) {
-		if (last_force_resend &&
-		    *last_force_resend < pi->last_force_request_resend) {
-			*last_force_resend = pi->last_force_request_resend;
+		if
(t->last_force_resend < pi->last_force_request_resend) { + t->last_force_resend = pi->last_force_request_resend; force_resend = true; - } else if (!last_force_resend) { + } else if (t->last_force_resend == 0) { force_resend = true; } } - if (ceph_oid_empty(&t->target_oid) || force_resend) { - ceph_oid_copy(&t->target_oid, &t->base_oid); - need_check_tiering = true; - } - if (ceph_oloc_empty(&t->target_oloc) || force_resend) { - ceph_oloc_copy(&t->target_oloc, &t->base_oloc); - need_check_tiering = true; - } - if (need_check_tiering && - (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + /* apply tiering */ + ceph_oid_copy(&t->target_oid, &t->base_oid); + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) t->target_oloc.pool = pi->read_tier; if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) t->target_oloc.pool = pi->write_tier; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } } - ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, - &t->target_oloc, &pgid); + ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, + &pgid); if (ret) { WARN_ON(ret != -ENOENT); t->osd = CEPH_HOMELESS_OSD; @@ -1368,7 +1385,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, last_pgid.pool = pgid.pool; last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); - ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting); if (any_change && ceph_is_new_interval(&t->acting, &acting, @@ -1387,13 +1404,16 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, if (t->paused && !target_should_be_paused(osdc, t, pi)) { t->paused = false; - need_resend = true; + unpaused = true; } + legacy_change = ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change); + if (t->pg_num) + split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); - if (ceph_pg_compare(&t->pgid, &pgid) || - ceph_osds_changed(&t->acting, &acting, any_change) || - force_resend) { + if (legacy_change || force_resend || split) { t->pgid = pgid; /* struct */ + ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid); ceph_osds_copy(&t->acting, &acting); ceph_osds_copy(&t->up, &up); t->size = pi->size; @@ -1403,15 +1423,342 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, t->sort_bitwise = sort_bitwise; t->osd = acting.primary; - need_resend = true; } - ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; + if (unpaused || legacy_change || force_resend || + (split && con && CEPH_HAVE_FEATURE(con->peer_features, + RESEND_ON_SPLIT))) + ct_res = CALC_TARGET_NEED_RESEND; + else + ct_res = CALC_TARGET_NO_ACTION; + out: dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); return ct_res; } +static struct ceph_spg_mapping *alloc_spg_mapping(void) +{ + struct ceph_spg_mapping *spg; + + spg = kmalloc(sizeof(*spg), GFP_NOIO); + if (!spg) + return NULL; + + RB_CLEAR_NODE(&spg->node); + spg->backoffs = RB_ROOT; + return spg; +} + +static void free_spg_mapping(struct ceph_spg_mapping *spg) +{ + WARN_ON(!RB_EMPTY_NODE(&spg->node)); + WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs)); + + kfree(spg); +} + +/* + * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to + * ceph_pg_mapping. 
Used to track OSD backoffs -- a backoff [range] is + * defined only within a specific spgid; it does not pass anything to + * children on split, or to another primary. + */ +DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare, + RB_BYPTR, const struct ceph_spg *, node) + +static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid) +{ + return hoid->is_max ? 0x100000000ull : hoid->hash_reverse_bits; +} + +static void hoid_get_effective_key(const struct ceph_hobject_id *hoid, + void **pkey, size_t *pkey_len) +{ + if (hoid->key_len) { + *pkey = hoid->key; + *pkey_len = hoid->key_len; + } else { + *pkey = hoid->oid; + *pkey_len = hoid->oid_len; + } +} + +static int compare_names(const void *name1, size_t name1_len, + const void *name2, size_t name2_len) +{ + int ret; + + ret = memcmp(name1, name2, min(name1_len, name2_len)); + if (!ret) { + if (name1_len < name2_len) + ret = -1; + else if (name1_len > name2_len) + ret = 1; + } + return ret; +} + +static int hoid_compare(const struct ceph_hobject_id *lhs, + const struct ceph_hobject_id *rhs) +{ + void *effective_key1, *effective_key2; + size_t effective_key1_len, effective_key2_len; + int ret; + + if (lhs->is_max < rhs->is_max) + return -1; + if (lhs->is_max > rhs->is_max) + return 1; + + if (lhs->pool < rhs->pool) + return -1; + if (lhs->pool > rhs->pool) + return 1; + + if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs)) + return -1; + if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs)) + return 1; + + ret = compare_names(lhs->nspace, lhs->nspace_len, + rhs->nspace, rhs->nspace_len); + if (ret) + return ret; + + hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len); + hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len); + ret = compare_names(effective_key1, effective_key1_len, + effective_key2, effective_key2_len); + if (ret) + return ret; + + ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len); + if (ret) + return ret; + + if (lhs->snapid < rhs->snapid) + return -1; + if (lhs->snapid > rhs->snapid) + return 1; + + return 0; +} + +/* + * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX + * compat stuff here. + * + * Assumes @hoid is zero-initialized. 
+ */ +static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid) +{ + u8 struct_v; + u32 struct_len; + int ret; + + ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v, + &struct_len); + if (ret) + return ret; + + if (struct_v < 4) { + pr_err("got struct_v %d < 4 of hobject_t\n", struct_v); + goto e_inval; + } + + hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len, + GFP_NOIO); + if (IS_ERR(hoid->key)) { + ret = PTR_ERR(hoid->key); + hoid->key = NULL; + return ret; + } + + hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len, + GFP_NOIO); + if (IS_ERR(hoid->oid)) { + ret = PTR_ERR(hoid->oid); + hoid->oid = NULL; + return ret; + } + + ceph_decode_64_safe(p, end, hoid->snapid, e_inval); + ceph_decode_32_safe(p, end, hoid->hash, e_inval); + ceph_decode_8_safe(p, end, hoid->is_max, e_inval); + + hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len, + GFP_NOIO); + if (IS_ERR(hoid->nspace)) { + ret = PTR_ERR(hoid->nspace); + hoid->nspace = NULL; + return ret; + } + + ceph_decode_64_safe(p, end, hoid->pool, e_inval); + + ceph_hoid_build_hash_cache(hoid); + return 0; + +e_inval: + return -EINVAL; +} + +static int hoid_encoding_size(const struct ceph_hobject_id *hoid) +{ + return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */ + 4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len; +} + +static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid) +{ + ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid)); + ceph_encode_string(p, end, hoid->key, hoid->key_len); + ceph_encode_string(p, end, hoid->oid, hoid->oid_len); + ceph_encode_64(p, hoid->snapid); + ceph_encode_32(p, hoid->hash); + ceph_encode_8(p, hoid->is_max); + ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len); + ceph_encode_64(p, hoid->pool); +} + +static void free_hoid(struct ceph_hobject_id *hoid) +{ + if (hoid) { + kfree(hoid->key); + kfree(hoid->oid); + kfree(hoid->nspace); + kfree(hoid); + } +} + +static struct ceph_osd_backoff *alloc_backoff(void) +{ + struct ceph_osd_backoff *backoff; + + backoff = kzalloc(sizeof(*backoff), GFP_NOIO); + if (!backoff) + return NULL; + + RB_CLEAR_NODE(&backoff->spg_node); + RB_CLEAR_NODE(&backoff->id_node); + return backoff; +} + +static void free_backoff(struct ceph_osd_backoff *backoff) +{ + WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node)); + WARN_ON(!RB_EMPTY_NODE(&backoff->id_node)); + + free_hoid(backoff->begin); + free_hoid(backoff->end); + kfree(backoff); +} + +/* + * Within a specific spgid, backoffs are managed by ->begin hoid. + */ +DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare, + RB_BYVAL, spg_node); + +static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root, + const struct ceph_hobject_id *hoid) +{ + struct rb_node *n = root->rb_node; + + while (n) { + struct ceph_osd_backoff *cur = + rb_entry(n, struct ceph_osd_backoff, spg_node); + int cmp; + + cmp = hoid_compare(hoid, cur->begin); + if (cmp < 0) { + n = n->rb_left; + } else if (cmp > 0) { + if (hoid_compare(hoid, cur->end) < 0) + return cur; + + n = n->rb_right; + } else { + return cur; + } + } + + return NULL; +} + +/* + * Each backoff has a unique id within its OSD session. 
+ */ +DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node) + +static void clear_backoffs(struct ceph_osd *osd) +{ + while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) { + struct ceph_spg_mapping *spg = + rb_entry(rb_first(&osd->o_backoff_mappings), + struct ceph_spg_mapping, node); + + while (!RB_EMPTY_ROOT(&spg->backoffs)) { + struct ceph_osd_backoff *backoff = + rb_entry(rb_first(&spg->backoffs), + struct ceph_osd_backoff, spg_node); + + erase_backoff(&spg->backoffs, backoff); + erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); + free_backoff(backoff); + } + erase_spg_mapping(&osd->o_backoff_mappings, spg); + free_spg_mapping(spg); + } +} + +/* + * Set up a temporary, non-owning view into @t. + */ +static void hoid_fill_from_target(struct ceph_hobject_id *hoid, + const struct ceph_osd_request_target *t) +{ + hoid->key = NULL; + hoid->key_len = 0; + hoid->oid = t->target_oid.name; + hoid->oid_len = t->target_oid.name_len; + hoid->snapid = CEPH_NOSNAP; + hoid->hash = t->pgid.seed; + hoid->is_max = false; + if (t->target_oloc.pool_ns) { + hoid->nspace = t->target_oloc.pool_ns->str; + hoid->nspace_len = t->target_oloc.pool_ns->len; + } else { + hoid->nspace = NULL; + hoid->nspace_len = 0; + } + hoid->pool = t->target_oloc.pool; + ceph_hoid_build_hash_cache(hoid); +} + +static bool should_plug_request(struct ceph_osd_request *req) +{ + struct ceph_osd *osd = req->r_osd; + struct ceph_spg_mapping *spg; + struct ceph_osd_backoff *backoff; + struct ceph_hobject_id hoid; + + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid); + if (!spg) + return false; + + hoid_fill_from_target(&hoid, &req->r_t); + backoff = lookup_containing_backoff(&spg->backoffs, &hoid); + if (!backoff) + return false; + + dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n", + __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool, + backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id); + return true; +} + static void setup_request_data(struct ceph_osd_request *req, struct ceph_msg *msg) { @@ -1483,7 +1830,37 @@ static void setup_request_data(struct ceph_osd_request *req, WARN_ON(data_len != msg->data_length); } -static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) +static void encode_pgid(void **p, const struct ceph_pg *pgid) +{ + ceph_encode_8(p, 1); + ceph_encode_64(p, pgid->pool); + ceph_encode_32(p, pgid->seed); + ceph_encode_32(p, -1); /* preferred */ +} + +static void encode_spgid(void **p, const struct ceph_spg *spgid) +{ + ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1); + encode_pgid(p, &spgid->pgid); + ceph_encode_8(p, spgid->shard); +} + +static void encode_oloc(void **p, void *end, + const struct ceph_object_locator *oloc) +{ + ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc)); + ceph_encode_64(p, oloc->pool); + ceph_encode_32(p, -1); /* preferred */ + ceph_encode_32(p, 0); /* key len */ + if (oloc->pool_ns) + ceph_encode_string(p, end, oloc->pool_ns->str, + oloc->pool_ns->len); + else + ceph_encode_32(p, 0); +} + +static void encode_request_partial(struct ceph_osd_request *req, + struct ceph_msg *msg) { void *p = msg->front.iov_base; void *const end = p + msg->front_alloc_len; @@ -1500,38 +1877,27 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) setup_request_data(req, msg); - ceph_encode_32(&p, 1); /* client_inc, always 1 */ + encode_spgid(&p, &req->r_t.spgid); /* actual spg */ + ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ ceph_encode_32(&p, 
req->r_osdc->osdmap->epoch); ceph_encode_32(&p, req->r_flags); - ceph_encode_timespec(p, &req->r_mtime); - p += sizeof(struct ceph_timespec); - /* reassert_version */ - memset(p, 0, sizeof(struct ceph_eversion)); - p += sizeof(struct ceph_eversion); - - /* oloc */ - ceph_start_encoding(&p, 5, 4, - ceph_oloc_encoding_size(&req->r_t.target_oloc)); - ceph_encode_64(&p, req->r_t.target_oloc.pool); - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ - if (req->r_t.target_oloc.pool_ns) - ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str, - req->r_t.target_oloc.pool_ns->len); - else - ceph_encode_32(&p, 0); + /* reqid */ + ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid)); + memset(p, 0, sizeof(struct ceph_osd_reqid)); + p += sizeof(struct ceph_osd_reqid); - /* pgid */ - ceph_encode_8(&p, 1); - ceph_encode_64(&p, req->r_t.pgid.pool); - ceph_encode_32(&p, req->r_t.pgid.seed); - ceph_encode_32(&p, -1); /* preferred */ + /* trace */ + memset(p, 0, sizeof(struct ceph_blkin_trace_info)); + p += sizeof(struct ceph_blkin_trace_info); + + ceph_encode_32(&p, 0); /* client_inc, always 0 */ + ceph_encode_timespec(p, &req->r_mtime); + p += sizeof(struct ceph_timespec); - /* oid */ - ceph_encode_32(&p, req->r_t.target_oid.name_len); - memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); - p += req->r_t.target_oid.name_len; + encode_oloc(&p, end, &req->r_t.target_oloc); + ceph_encode_string(&p, end, req->r_t.target_oid.name, + req->r_t.target_oid.name_len); /* ops, can imply data */ ceph_encode_16(&p, req->r_num_ops); @@ -1552,11 +1918,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) } ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ + BUG_ON(p != end - 8); /* space for features */ - BUG_ON(p > end); - msg->front.iov_len = p - msg->front.iov_base; - msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */ + /* front_len is finalized in encode_request_finish() */ msg->hdr.data_len = cpu_to_le32(data_len); /* * The header "data_off" is a hint to the receiver allowing it @@ -1565,9 +1930,99 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) */ msg->hdr.data_off = cpu_to_le16(req->r_data_offset); - dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__, - req, req->r_t.target_oid.name, req->r_t.target_oid.name_len, - msg->front.iov_len, data_len); + dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg, + req->r_t.target_oid.name, req->r_t.target_oid.name_len); +} + +static void encode_request_finish(struct ceph_msg *msg) +{ + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + + if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) { + /* luminous OSD -- encode features and be done */ + p = end - 8; + ceph_encode_64(&p, msg->con->peer_features); + } else { + struct { + char spgid[CEPH_ENCODING_START_BLK_LEN + + CEPH_PGID_ENCODING_LEN + 1]; + __le32 hash; + __le32 epoch; + __le32 flags; + char reqid[CEPH_ENCODING_START_BLK_LEN + + sizeof(struct ceph_osd_reqid)]; + char trace[sizeof(struct ceph_blkin_trace_info)]; + __le32 client_inc; + struct ceph_timespec mtime; + } __packed head; + struct ceph_pg pgid; + void *oloc, *oid, *tail; + int oloc_len, oid_len, tail_len; + int len; + + /* + * Pre-luminous OSD -- reencode v8 into v4 using @head + * as a temporary buffer. 
Encode the raw PG; the rest + * is just a matter of moving oloc, oid and tail blobs + * around. + */ + memcpy(&head, p, sizeof(head)); + p += sizeof(head); + + oloc = p; + p += CEPH_ENCODING_START_BLK_LEN; + pgid.pool = ceph_decode_64(&p); + p += 4 + 4; /* preferred, key len */ + len = ceph_decode_32(&p); + p += len; /* nspace */ + oloc_len = p - oloc; + + oid = p; + len = ceph_decode_32(&p); + p += len; + oid_len = p - oid; + + tail = p; + tail_len = (end - p) - 8; + + p = msg->front.iov_base; + ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc)); + ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch)); + ceph_encode_copy(&p, &head.flags, sizeof(head.flags)); + ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime)); + + /* reassert_version */ + memset(p, 0, sizeof(struct ceph_eversion)); + p += sizeof(struct ceph_eversion); + + BUG_ON(p >= oloc); + memmove(p, oloc, oloc_len); + p += oloc_len; + + pgid.seed = le32_to_cpu(head.hash); + encode_pgid(&p, &pgid); /* raw pg */ + + BUG_ON(p >= oid); + memmove(p, oid, oid_len); + p += oid_len; + + /* tail -- ops, snapid, snapc, retry_attempt */ + BUG_ON(p >= tail); + memmove(p, tail, tail_len); + p += tail_len; + + msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ + } + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg, + le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), + le16_to_cpu(msg->hdr.version)); } /* @@ -1580,6 +2035,10 @@ static void send_request(struct ceph_osd_request *req) verify_osd_locked(osd); WARN_ON(osd->o_osd != req->r_t.osd); + /* backoff? */ + if (should_plug_request(req)) + return; + /* * We may have a previously queued request message hanging * around. Cancel it to avoid corrupting the msgr. 
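
A note on encode_request_finish() above: the v4 front is strictly shorter than the v8 front it is carved out of, which is what the BUG_ON(p >= oloc)/BUG_ON(p >= oid)/BUG_ON(p >= tail) assertions enforce, so each blob can be slid left in order with memmove() without overwriting bytes that have not been read yet. Below is a minimal userspace sketch of that invariant -- toy code, not part of the patch; the names and the single-memmove simplification are mine, since the real function moves the oloc, oid and tail blobs separately because the raw pgid is re-encoded between them:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/*
 * Shrink an in-place header from v8_len to v4_len bytes; the payload
 * after it slides left.  memmove() is required: the regions overlap.
 */
static void shrink_header_in_place(char *buf, size_t v8_len, size_t v4_len,
                                   size_t payload_len)
{
        /* dst stays behind src, mirroring BUG_ON(p >= oloc) */
        assert(v4_len < v8_len);
        memmove(buf + v4_len, buf + v8_len, payload_len);
}

int main(void)
{
        char buf[32] = "88888888oloc-oid-tail"; /* 8-byte "v8 header" + blobs */

        shrink_header_in_place(buf, 8, 4, strlen("oloc-oid-tail") + 1);
        memcpy(buf, "4444", 4);         /* write the shorter "v4 header" */
        printf("%s\n", buf + 4);        /* prints: oloc-oid-tail */
        return 0;
}
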
@@ -1593,11 +2052,13 @@ static void send_request(struct ceph_osd_request *req) else WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); - encode_request(req, req->r_request); + encode_request_partial(req, req->r_request); - dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", + dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n", __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, - req->r_t.osd, req->r_flags, req->r_attempts); + req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed, + req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags, + req->r_attempts); req->r_t.paused = false; req->r_stamp = jiffies; @@ -1645,7 +2106,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); again: - ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + ct_res = calc_target(osdc, &req->r_t, NULL, false); if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) goto promote; @@ -1737,13 +2198,12 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked) static void finish_request(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; - struct ceph_osd *osd = req->r_osd; - verify_osd_locked(osd); + WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); - unlink_request(osd, req); + if (req->r_osd) + unlink_request(req->r_osd, req); atomic_dec(&osdc->num_requests); /* @@ -2441,7 +2901,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd *osd; - calc_target(osdc, &lreq->t, &lreq->last_force_resend, false); + calc_target(osdc, &lreq->t, NULL, false); osd = lookup_create_osd(osdc, lreq->t.osd, true); link_linger(osd, lreq); @@ -3059,7 +3519,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) struct ceph_osd_client *osdc = lreq->osdc; enum calc_target_result ct_res; - ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); + ct_res = calc_target(osdc, &lreq->t, NULL, true); if (ct_res == CALC_TARGET_NEED_RESEND) { struct ceph_osd *osd; @@ -3117,6 +3577,7 @@ static void scan_requests(struct ceph_osd *osd, list_add_tail(&lreq->scan_item, need_resend_linger); break; case CALC_TARGET_POOL_DNE: + list_del_init(&lreq->scan_item); check_linger_pool_dne(lreq); break; } @@ -3130,8 +3591,8 @@ static void scan_requests(struct ceph_osd *osd, n = rb_next(n); /* unlink_request(), check_pool_dne() */ dout("%s req %p tid %llu\n", __func__, req, req->r_tid); - ct_res = calc_target(osdc, &req->r_t, - &req->r_last_force_resend, false); + ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, + false); switch (ct_res) { case CALC_TARGET_NO_ACTION: force_resend_writes = cleared_full || @@ -3229,8 +3690,25 @@ static void kick_requests(struct ceph_osd_client *osdc, struct list_head *need_resend_linger) { struct ceph_osd_linger_request *lreq, *nlreq; + enum calc_target_result ct_res; struct rb_node *n; + /* make sure need_resend targets reflect latest map */ + for (n = rb_first(need_resend); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + n = rb_next(n); + + if (req->r_t.epoch < osdc->osdmap->epoch) { + ct_res = calc_target(osdc, &req->r_t, NULL, false); + if (ct_res == CALC_TARGET_POOL_DNE) { + erase_request(need_resend, req); + check_pool_dne(req); + } + } + } + for 
(n = rb_first(need_resend); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); @@ -3239,8 +3717,6 @@ static void kick_requests(struct ceph_osd_client *osdc, n = rb_next(n); erase_request(need_resend, req); /* before link_request() */ - WARN_ON(req->r_osd); - calc_target(osdc, &req->r_t, NULL, false); osd = lookup_create_osd(osdc, req->r_t.osd, true); link_request(osd, req); if (!req->r_linger) { @@ -3383,6 +3859,8 @@ static void kick_osd_requests(struct ceph_osd *osd) { struct rb_node *n; + clear_backoffs(osd); + for (n = rb_first(&osd->o_requests); n; ) { struct ceph_osd_request *req = rb_entry(n, struct ceph_osd_request, r_node); @@ -3428,6 +3906,261 @@ out_unlock: up_write(&osdc->lock); } +struct MOSDBackoff { + struct ceph_spg spgid; + u32 map_epoch; + u8 op; + u64 id; + struct ceph_hobject_id *begin; + struct ceph_hobject_id *end; +}; + +static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m) +{ + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + u8 struct_v; + u32 struct_len; + int ret; + + ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len); + if (ret) + return ret; + + ret = ceph_decode_pgid(&p, end, &m->spgid.pgid); + if (ret) + return ret; + + ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval); + ceph_decode_32_safe(&p, end, m->map_epoch, e_inval); + ceph_decode_8_safe(&p, end, m->op, e_inval); + ceph_decode_64_safe(&p, end, m->id, e_inval); + + m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO); + if (!m->begin) + return -ENOMEM; + + ret = decode_hoid(&p, end, m->begin); + if (ret) { + free_hoid(m->begin); + return ret; + } + + m->end = kzalloc(sizeof(*m->end), GFP_NOIO); + if (!m->end) { + free_hoid(m->begin); + return -ENOMEM; + } + + ret = decode_hoid(&p, end, m->end); + if (ret) { + free_hoid(m->begin); + free_hoid(m->end); + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static struct ceph_msg *create_backoff_message( + const struct ceph_osd_backoff *backoff, + u32 map_epoch) +{ + struct ceph_msg *msg; + void *p, *end; + int msg_size; + + msg_size = CEPH_ENCODING_START_BLK_LEN + + CEPH_PGID_ENCODING_LEN + 1; /* spgid */ + msg_size += 4 + 1 + 8; /* map_epoch, op, id */ + msg_size += CEPH_ENCODING_START_BLK_LEN + + hoid_encoding_size(backoff->begin); + msg_size += CEPH_ENCODING_START_BLK_LEN + + hoid_encoding_size(backoff->end); + + msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true); + if (!msg) + return NULL; + + p = msg->front.iov_base; + end = p + msg->front_alloc_len; + + encode_spgid(&p, &backoff->spgid); + ceph_encode_32(&p, map_epoch); + ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK); + ceph_encode_64(&p, backoff->id); + encode_hoid(&p, end, backoff->begin); + encode_hoid(&p, end, backoff->end); + BUG_ON(p != end); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */ + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + + return msg; +} + +static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m) +{ + struct ceph_spg_mapping *spg; + struct ceph_osd_backoff *backoff; + struct ceph_msg *msg; + + dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, + m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); + + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid); + if (!spg) { + spg = alloc_spg_mapping(); + if (!spg) { + pr_err("%s failed to allocate spg\n", __func__); + return; + } + spg->spgid = m->spgid; /* struct */ + 
insert_spg_mapping(&osd->o_backoff_mappings, spg); + } + + backoff = alloc_backoff(); + if (!backoff) { + pr_err("%s failed to allocate backoff\n", __func__); + return; + } + backoff->spgid = m->spgid; /* struct */ + backoff->id = m->id; + backoff->begin = m->begin; + m->begin = NULL; /* backoff now owns this */ + backoff->end = m->end; + m->end = NULL; /* ditto */ + + insert_backoff(&spg->backoffs, backoff); + insert_backoff_by_id(&osd->o_backoffs_by_id, backoff); + + /* + * Ack with original backoff's epoch so that the OSD can + * discard this if there was a PG split. + */ + msg = create_backoff_message(backoff, m->map_epoch); + if (!msg) { + pr_err("%s failed to allocate msg\n", __func__); + return; + } + ceph_con_send(&osd->o_con, msg); +} + +static bool target_contained_by(const struct ceph_osd_request_target *t, + const struct ceph_hobject_id *begin, + const struct ceph_hobject_id *end) +{ + struct ceph_hobject_id hoid; + int cmp; + + hoid_fill_from_target(&hoid, t); + cmp = hoid_compare(&hoid, begin); + return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0); +} + +static void handle_backoff_unblock(struct ceph_osd *osd, + const struct MOSDBackoff *m) +{ + struct ceph_spg_mapping *spg; + struct ceph_osd_backoff *backoff; + struct rb_node *n; + + dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, + m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); + + backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id); + if (!backoff) { + pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n", + __func__, osd->o_osd, m->spgid.pgid.pool, + m->spgid.pgid.seed, m->spgid.shard, m->id); + return; + } + + if (hoid_compare(backoff->begin, m->begin) && + hoid_compare(backoff->end, m->end)) { + pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n", + __func__, osd->o_osd, m->spgid.pgid.pool, + m->spgid.pgid.seed, m->spgid.shard, m->id); + /* unblock it anyway... */ + } + + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid); + BUG_ON(!spg); + + erase_backoff(&spg->backoffs, backoff); + erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); + free_backoff(backoff); + + if (RB_EMPTY_ROOT(&spg->backoffs)) { + erase_spg_mapping(&osd->o_backoff_mappings, spg); + free_spg_mapping(spg); + } + + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) { + /* + * Match against @m, not @backoff -- the PG may + * have split on the OSD. + */ + if (target_contained_by(&req->r_t, m->begin, m->end)) { + /* + * If no other installed backoff applies, + * resend. 
+ */ + send_request(req); + } + } + } +} + +static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + struct MOSDBackoff m; + int ret; + + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + up_read(&osdc->lock); + return; + } + WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); + + mutex_lock(&osd->lock); + ret = decode_MOSDBackoff(msg, &m); + if (ret) { + pr_err("failed to decode MOSDBackoff: %d\n", ret); + ceph_msg_dump(msg); + goto out_unlock; + } + + switch (m.op) { + case CEPH_OSD_BACKOFF_OP_BLOCK: + handle_backoff_block(osd, &m); + break; + case CEPH_OSD_BACKOFF_OP_UNBLOCK: + handle_backoff_unblock(osd, &m); + break; + default: + pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op); + } + + free_hoid(m.begin); + free_hoid(m.end); + +out_unlock: + mutex_unlock(&osd->lock); + up_read(&osdc->lock); +} + /* * Process osd watch notifications */ @@ -4365,6 +5098,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) case CEPH_MSG_OSD_OPREPLY: handle_reply(osd, msg); break; + case CEPH_MSG_OSD_BACKOFF: + handle_backoff(osd, msg); + break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); break; @@ -4487,6 +5223,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: + case CEPH_MSG_OSD_BACKOFF: case CEPH_MSG_WATCH_NOTIFY: return alloc_msg_with_page_vector(hdr); case CEPH_MSG_OSD_OPREPLY: @@ -4571,6 +5308,11 @@ static int invalidate_authorizer(struct ceph_connection *con) return ceph_monc_validate_auth(&osdc->client->monc); } +static void osd_reencode_message(struct ceph_msg *msg) +{ + encode_request_finish(msg); +} + static int osd_sign_message(struct ceph_msg *msg) { struct ceph_osd *o = msg->con->private; @@ -4595,6 +5337,7 @@ static const struct ceph_connection_operations osd_con_ops = { .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .alloc_msg = alloc_msg, + .reencode_message = osd_reencode_message, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, .fault = osd_fault, diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 55e3a477f92d..864789c5974e 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -11,7 +11,7 @@ #include <linux/crush/hash.h> #include <linux/crush/mapper.h> -char *ceph_osdmap_state_str(char *str, int len, int state) +char *ceph_osdmap_state_str(char *str, int len, u32 state) { if (!len) return str; @@ -138,19 +138,175 @@ bad: return -EINVAL; } -static int skip_name_map(void **p, void *end) +static struct crush_choose_arg_map *alloc_choose_arg_map(void) { - int len; - ceph_decode_32_safe(p, end, len ,bad); - while (len--) { - int strlen; - *p += sizeof(u32); - ceph_decode_32_safe(p, end, strlen, bad); - *p += strlen; + struct crush_choose_arg_map *arg_map; + + arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO); + if (!arg_map) + return NULL; + + RB_CLEAR_NODE(&arg_map->node); + return arg_map; } - return 0; -bad: - return -EINVAL; + +static void free_choose_arg_map(struct crush_choose_arg_map *arg_map) +{ + if (arg_map) { + int i, j; + + WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); + + for (i = 0; i < arg_map->size; i++) { + struct crush_choose_arg *arg = &arg_map->args[i]; + + for (j = 0; j < arg->weight_set_size; j++) + kfree(arg->weight_set[j].weights); + kfree(arg->weight_set); + kfree(arg->ids); + } + kfree(arg_map->args); + kfree(arg_map); + } +} + 
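
The choose_args decoded by decode_choose_args() below feed crush_do_rule(), whose signature gains a choose_args parameter earlier in this diff: each crush_choose_arg can carry one weight vector per draw position, letting the map override a bucket's stored weights without rebuilding the bucket. A simplified userspace model of the selection rule follows -- the struct definitions are local stand-ins for the real ones in include/linux/crush/crush.h, and it assumes the mapper's behavior of clamping the draw position to the last available weight set:

#include <stdint.h>

struct crush_weight_set { uint32_t *weights; uint32_t size; };
struct crush_choose_arg {
        int32_t *ids;
        uint32_t ids_size;
        struct crush_weight_set *weight_set;
        uint32_t weight_set_size;
};

/*
 * Pick the weight vector for draw @position: prefer the map-supplied
 * weight_set (clamped to its last entry), otherwise fall back to the
 * bucket's own item weights.
 */
static const uint32_t *weights_for_position(const struct crush_choose_arg *arg,
                                            const uint32_t *item_weights,
                                            uint32_t position)
{
        if (!arg || !arg->weight_set || !arg->weight_set_size)
                return item_weights;
        if (position >= arg->weight_set_size)
                position = arg->weight_set_size - 1;
        return arg->weight_set[position].weights;
}
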
+DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index, + node); + +void clear_choose_args(struct crush_map *c) +{ + while (!RB_EMPTY_ROOT(&c->choose_args)) { + struct crush_choose_arg_map *arg_map = + rb_entry(rb_first(&c->choose_args), + struct crush_choose_arg_map, node); + + erase_choose_arg_map(&c->choose_args, arg_map); + free_choose_arg_map(arg_map); + } +} + +static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen) +{ + u32 *a = NULL; + u32 len; + int ret; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len) { + u32 i; + + a = kmalloc_array(len, sizeof(u32), GFP_NOIO); + if (!a) { + ret = -ENOMEM; + goto fail; + } + + ceph_decode_need(p, end, len * sizeof(u32), e_inval); + for (i = 0; i < len; i++) + a[i] = ceph_decode_32(p); + } + + *plen = len; + return a; + +e_inval: + ret = -EINVAL; +fail: + kfree(a); + return ERR_PTR(ret); +} + +/* + * Assumes @arg is zero-initialized. + */ +static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg) +{ + int ret; + + ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval); + if (arg->weight_set_size) { + u32 i; + + arg->weight_set = kmalloc_array(arg->weight_set_size, + sizeof(*arg->weight_set), + GFP_NOIO); + if (!arg->weight_set) + return -ENOMEM; + + for (i = 0; i < arg->weight_set_size; i++) { + struct crush_weight_set *w = &arg->weight_set[i]; + + w->weights = decode_array_32_alloc(p, end, &w->size); + if (IS_ERR(w->weights)) { + ret = PTR_ERR(w->weights); + w->weights = NULL; + return ret; + } + } + } + + arg->ids = decode_array_32_alloc(p, end, &arg->ids_size); + if (IS_ERR(arg->ids)) { + ret = PTR_ERR(arg->ids); + arg->ids = NULL; + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_choose_args(void **p, void *end, struct crush_map *c) +{ + struct crush_choose_arg_map *arg_map = NULL; + u32 num_choose_arg_maps, num_buckets; + int ret; + + ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval); + while (num_choose_arg_maps--) { + arg_map = alloc_choose_arg_map(); + if (!arg_map) { + ret = -ENOMEM; + goto fail; + } + + ceph_decode_64_safe(p, end, arg_map->choose_args_index, + e_inval); + arg_map->size = c->max_buckets; + arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args), + GFP_NOIO); + if (!arg_map->args) { + ret = -ENOMEM; + goto fail; + } + + ceph_decode_32_safe(p, end, num_buckets, e_inval); + while (num_buckets--) { + struct crush_choose_arg *arg; + u32 bucket_index; + + ceph_decode_32_safe(p, end, bucket_index, e_inval); + if (bucket_index >= arg_map->size) + goto e_inval; + + arg = &arg_map->args[bucket_index]; + ret = decode_choose_arg(p, end, arg); + if (ret) + goto fail; + } + + insert_choose_arg_map(&c->choose_args, arg_map); + } + + return 0; + +e_inval: + ret = -EINVAL; +fail: + free_choose_arg_map(arg_map); + return ret; } static void crush_finalize(struct crush_map *c) @@ -187,7 +343,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) void **p = &pbyval; void *start = pbyval; u32 magic; - u32 num_name_maps; dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); @@ -195,6 +350,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) if (c == NULL) return ERR_PTR(-ENOMEM); + c->choose_args = RB_ROOT; + /* set tunables to default values */ c->choose_local_tries = 2; c->choose_local_fallback_tries = 5; @@ -353,12 +510,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) } } - /* ignore trailing name maps. 
*/ - for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { - err = skip_name_map(p, end); - if (err < 0) - goto done; - } + ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */ + ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */ + ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ /* tunables */ ceph_decode_need(p, end, 3*sizeof(u32), done); @@ -391,6 +545,21 @@ static struct crush_map *crush_decode(void *pbyval, void *end) dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); + if (*p != end) { + /* class_map */ + ceph_decode_skip_map(p, end, 32, 32, bad); + /* class_name */ + ceph_decode_skip_map(p, end, 32, string, bad); + /* class_bucket */ + ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad); + } + + if (*p != end) { + err = decode_choose_args(p, end, c); + if (err) + goto bad; + } + done: crush_finalize(c); dout("crush_decode success\n"); @@ -418,75 +587,49 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) return 0; } -/* - * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) and primary_temp (explicit primary setting) - */ -static int __insert_pg_mapping(struct ceph_pg_mapping *new, - struct rb_root *root) +int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct ceph_pg_mapping *pg = NULL; - int c; + int ret; - dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new); - while (*p) { - parent = *p; - pg = rb_entry(parent, struct ceph_pg_mapping, node); - c = ceph_pg_compare(&new->pgid, &pg->pgid); - if (c < 0) - p = &(*p)->rb_left; - else if (c > 0) - p = &(*p)->rb_right; - else - return -EEXIST; - } + ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid); + if (ret) + return ret; + + if (lhs->shard < rhs->shard) + return -1; + if (lhs->shard > rhs->shard) + return 1; - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, root); return 0; } -static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, - struct ceph_pg pgid) +static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len) { - struct rb_node *n = root->rb_node; struct ceph_pg_mapping *pg; - int c; - while (n) { - pg = rb_entry(n, struct ceph_pg_mapping, node); - c = ceph_pg_compare(&pgid, &pg->pgid); - if (c < 0) { - n = n->rb_left; - } else if (c > 0) { - n = n->rb_right; - } else { - dout("__lookup_pg_mapping %lld.%x got %p\n", - pgid.pool, pgid.seed, pg); - return pg; - } - } - return NULL; + pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO); + if (!pg) + return NULL; + + RB_CLEAR_NODE(&pg->node); + return pg; } -static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) +static void free_pg_mapping(struct ceph_pg_mapping *pg) { - struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); + WARN_ON(!RB_EMPTY_NODE(&pg->node)); - if (pg) { - dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, - pg); - rb_erase(&pg->node, root); - kfree(pg); - return 0; - } - dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); - return -ENOENT; + kfree(pg); } /* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) and primary_temp (explicit primary setting) + */ +DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, + RB_BYPTR, const struct ceph_pg *, node) + +/* * rbtree of pg pool info */ static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) 
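
The open-coded __insert_pg_mapping()/__lookup_pg_mapping()/__remove_pg_mapping() helpers removed above are replaced by the DEFINE_RB_FUNCS2() generator from include/linux/ceph/libceph.h (also touched by this series). Roughly, the RB_BYPTR variant expands to lookup/insert/erase functions of the following shape -- an approximation for illustration, not the macro's literal output:

static struct ceph_pg_mapping *lookup_pg_mapping(struct rb_root *root,
                                                 const struct ceph_pg *pgid)
{
        struct rb_node *n = root->rb_node;

        while (n) {
                struct ceph_pg_mapping *cur =
                        rb_entry(n, struct ceph_pg_mapping, node);
                int cmp = ceph_pg_compare(pgid, &cur->pgid);

                if (cmp < 0)
                        n = n->rb_left;
                else if (cmp > 0)
                        n = n->rb_right;
                else
                        return cur;
        }
        return NULL;
}

insert_pg_mapping() and erase_pg_mapping() are generated alongside from the same compare function, which is why the pg_temp, primary_temp, pg_upmap and pg_upmap_items trees in this file can all share one set of helpers.
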
@@ -682,11 +825,48 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += len; } + /* + * last_force_op_resend_preluminous, will be overridden if the + * map was encoded with RESEND_ON_SPLIT + */ if (ev >= 15) pi->last_force_request_resend = ceph_decode_32(p); else pi->last_force_request_resend = 0; + if (ev >= 16) + *p += 4; /* skip min_read_recency_for_promote */ + + if (ev >= 17) + *p += 8; /* skip expected_num_objects */ + + if (ev >= 19) + *p += 4; /* skip cache_target_dirty_high_ratio_micro */ + + if (ev >= 20) + *p += 4; /* skip min_write_recency_for_promote */ + + if (ev >= 21) + *p += 1; /* skip use_gmt_hitset */ + + if (ev >= 22) + *p += 1; /* skip fast_read */ + + if (ev >= 23) { + *p += 4; /* skip hit_set_grade_decay_rate */ + *p += 4; /* skip hit_set_search_last_n */ + } + + if (ev >= 24) { + /* skip opts */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; + } + + if (ev >= 25) + pi->last_force_request_resend = ceph_decode_32(p); + /* ignore the rest */ *p = pool_end; @@ -743,6 +923,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; + map->pg_upmap = RB_ROOT; + map->pg_upmap_items = RB_ROOT; mutex_init(&map->crush_workspace_mutex); return map; @@ -757,14 +939,28 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_temp), struct ceph_pg_mapping, node); - rb_erase(&pg->node, &map->pg_temp); - kfree(pg); + erase_pg_mapping(&map->pg_temp, pg); + free_pg_mapping(pg); } while (!RB_EMPTY_ROOT(&map->primary_temp)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->primary_temp), struct ceph_pg_mapping, node); - rb_erase(&pg->node, &map->primary_temp); + erase_pg_mapping(&map->primary_temp, pg); + free_pg_mapping(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_upmap)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_upmap), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_upmap); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_upmap_items), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_upmap_items); kfree(pg); } while (!RB_EMPTY_ROOT(&map->pg_pools)) { @@ -788,7 +984,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) */ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) { - u8 *state; + u32 *state; u32 *weight; struct ceph_entity_addr *addr; int i; @@ -964,47 +1160,40 @@ static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) return __decode_pools(p, end, map, true); } -static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, - bool incremental) +typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool); + +static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root, + decode_mapping_fn_t fn, bool incremental) { u32 n; + WARN_ON(!incremental && !fn); + ceph_decode_32_safe(p, end, n, e_inval); while (n--) { + struct ceph_pg_mapping *pg; struct ceph_pg pgid; - u32 len, i; int ret; ret = ceph_decode_pgid(p, end, &pgid); if (ret) return ret; - ceph_decode_32_safe(p, end, len, e_inval); - - ret = __remove_pg_mapping(&map->pg_temp, pgid); - BUG_ON(!incremental && ret != -ENOENT); - - if (!incremental || len > 0) { - struct ceph_pg_mapping *pg; - - ceph_decode_need(p, end, len*sizeof(u32), e_inval); - - if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) - return -EINVAL; - - pg = kzalloc(sizeof(*pg) + 
len*sizeof(u32), GFP_NOFS); - if (!pg) - return -ENOMEM; + pg = lookup_pg_mapping(mapping_root, &pgid); + if (pg) { + WARN_ON(!incremental); + erase_pg_mapping(mapping_root, pg); + free_pg_mapping(pg); + } - pg->pgid = pgid; - pg->pg_temp.len = len; - for (i = 0; i < len; i++) - pg->pg_temp.osds[i] = ceph_decode_32(p); + if (fn) { + pg = fn(p, end, incremental); + if (IS_ERR(pg)) + return PTR_ERR(pg); - ret = __insert_pg_mapping(pg, &map->pg_temp); - if (ret) { - kfree(pg); - return ret; + if (pg) { + pg->pgid = pgid; /* struct */ + insert_pg_mapping(mapping_root, pg); } } } @@ -1015,69 +1204,77 @@ e_inval: return -EINVAL; } +static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, + bool incremental) +{ + struct ceph_pg_mapping *pg; + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len == 0 && incremental) + return NULL; /* new_pg_temp: [] to remove */ + if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) + return ERR_PTR(-EINVAL); + + ceph_decode_need(p, end, len * sizeof(u32), e_inval); + pg = alloc_pg_mapping(len * sizeof(u32)); + if (!pg) + return ERR_PTR(-ENOMEM); + + pg->pg_temp.len = len; + for (i = 0; i < len; i++) + pg->pg_temp.osds[i] = ceph_decode_32(p); + + return pg; + +e_inval: + return ERR_PTR(-EINVAL); +} + static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_pg_temp(p, end, map, false); + return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, + false); } static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_pg_temp(p, end, map, true); + return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, + true); } -static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, - bool incremental) +static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end, + bool incremental) { - u32 n; - - ceph_decode_32_safe(p, end, n, e_inval); - while (n--) { - struct ceph_pg pgid; - u32 osd; - int ret; - - ret = ceph_decode_pgid(p, end, &pgid); - if (ret) - return ret; - - ceph_decode_32_safe(p, end, osd, e_inval); - - ret = __remove_pg_mapping(&map->primary_temp, pgid); - BUG_ON(!incremental && ret != -ENOENT); - - if (!incremental || osd != (u32)-1) { - struct ceph_pg_mapping *pg; - - pg = kzalloc(sizeof(*pg), GFP_NOFS); - if (!pg) - return -ENOMEM; + struct ceph_pg_mapping *pg; + u32 osd; - pg->pgid = pgid; - pg->primary_temp.osd = osd; + ceph_decode_32_safe(p, end, osd, e_inval); + if (osd == (u32)-1 && incremental) + return NULL; /* new_primary_temp: -1 to remove */ - ret = __insert_pg_mapping(pg, &map->primary_temp); - if (ret) { - kfree(pg); - return ret; - } - } - } + pg = alloc_pg_mapping(0); + if (!pg) + return ERR_PTR(-ENOMEM); - return 0; + pg->primary_temp.osd = osd; + return pg; e_inval: - return -EINVAL; + return ERR_PTR(-EINVAL); } static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_primary_temp(p, end, map, false); + return decode_pg_mapping(p, end, &map->primary_temp, + __decode_primary_temp, false); } static int decode_new_primary_temp(void **p, void *end, struct ceph_osdmap *map) { - return __decode_primary_temp(p, end, map, true); + return decode_pg_mapping(p, end, &map->primary_temp, + __decode_primary_temp, true); } u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) @@ -1168,6 +1365,75 @@ e_inval: return -EINVAL; } +static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end, + bool __unused) +{ + return __decode_pg_temp(p, end, false); +} + +static 
int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, + false); +} + +static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, + true); +} + +static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true); +} + +static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, + bool __unused) +{ + struct ceph_pg_mapping *pg; + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) + return ERR_PTR(-EINVAL); + + ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); + pg = kzalloc(sizeof(*pg) + 2 * len * sizeof(u32), GFP_NOIO); + if (!pg) + return ERR_PTR(-ENOMEM); + + pg->pg_upmap_items.len = len; + for (i = 0; i < len; i++) { + pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p); + pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p); + } + + return pg; + +e_inval: + return ERR_PTR(-EINVAL); +} + +static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, + __decode_pg_upmap_items, false); +} + +static int decode_new_pg_upmap_items(void **p, void *end, + struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, + __decode_pg_upmap_items, true); +} + +static int decode_old_pg_upmap_items(void **p, void *end, + struct ceph_osdmap *map) +{ + return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true); +} + /* * decode a full map. */ @@ -1218,13 +1484,21 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + - map->max_osd*(1 + sizeof(*map->osd_weight) + + map->max_osd*((struct_v >= 5 ? sizeof(u32) : + sizeof(u8)) + + sizeof(*map->osd_weight) + sizeof(*map->osd_addr)), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; - ceph_decode_copy(p, map->osd_state, map->max_osd); + if (struct_v >= 5) { + for (i = 0; i < map->max_osd; i++) + map->osd_state[i] = ceph_decode_32(p); + } else { + for (i = 0; i < map->max_osd; i++) + map->osd_state[i] = ceph_decode_8(p); + } if (ceph_decode_32(p) != map->max_osd) goto e_inval; @@ -1257,9 +1531,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; } else { - /* XXX can this happen? */ - kfree(map->osd_primary_affinity); - map->osd_primary_affinity = NULL; + WARN_ON(map->osd_primary_affinity); } /* crush */ @@ -1268,6 +1540,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) if (err) goto bad; + *p += len; + if (struct_v >= 3) { + /* erasure_code_profiles */ + ceph_decode_skip_map_of_map(p, end, string, string, string, + bad); + } + + if (struct_v >= 4) { + err = decode_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_pg_upmap_items(p, end, map); + if (err) + goto bad; + } else { + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap)); + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items)); + } + /* ignore the rest */ *p = end; @@ -1314,7 +1606,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) * new_up_client: { osd=6, addr=... 
} # set osd_state and addr * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ -static int decode_new_up_state_weight(void **p, void *end, +static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_osdmap *map) { void *new_up_client; @@ -1330,7 +1622,7 @@ static int decode_new_up_state_weight(void **p, void *end, new_state = *p; ceph_decode_32_safe(p, end, len, e_inval); - len *= sizeof(u32) + sizeof(u8); + len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8)); ceph_decode_need(p, end, len, e_inval); *p += len; @@ -1366,11 +1658,14 @@ static int decode_new_up_state_weight(void **p, void *end, len = ceph_decode_32(p); while (len--) { s32 osd; - u8 xorstate; + u32 xorstate; int ret; osd = ceph_decode_32(p); - xorstate = ceph_decode_8(p); + if (struct_v >= 5) + xorstate = ceph_decode_32(p); + else + xorstate = ceph_decode_8(p); if (xorstate == 0) xorstate = CEPH_OSD_UP; BUG_ON(osd >= map->max_osd); @@ -1504,7 +1799,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } /* new_up_client, new_state, new_weight */ - err = decode_new_up_state_weight(p, end, map); + err = decode_new_up_state_weight(p, end, struct_v, map); if (err) goto bad; @@ -1527,6 +1822,32 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, goto bad; } + if (struct_v >= 3) { + /* new_erasure_code_profiles */ + ceph_decode_skip_map_of_map(p, end, string, string, string, + bad); + /* old_erasure_code_profiles */ + ceph_decode_skip_set(p, end, string, bad); + } + + if (struct_v >= 4) { + err = decode_new_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_old_pg_upmap(p, end, map); + if (err) + goto bad; + + err = decode_new_pg_upmap_items(p, end, map); + if (err) + goto bad; + + err = decode_old_pg_upmap_items(p, end, map); + if (err) + goto bad; + } + /* ignore the rest */ *p = end; @@ -1547,12 +1868,13 @@ bad: void ceph_oloc_copy(struct ceph_object_locator *dest, const struct ceph_object_locator *src) { - WARN_ON(!ceph_oloc_empty(dest)); - WARN_ON(dest->pool_ns); /* empty() only covers ->pool */ + ceph_oloc_destroy(dest); dest->pool = src->pool; if (src->pool_ns) dest->pool_ns = ceph_get_string(src->pool_ns); + else + dest->pool_ns = NULL; } EXPORT_SYMBOL(ceph_oloc_copy); @@ -1565,14 +1887,15 @@ EXPORT_SYMBOL(ceph_oloc_destroy); void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { - WARN_ON(!ceph_oid_empty(dest)); + ceph_oid_destroy(dest); if (src->name != src->inline_name) { /* very rare, see ceph_object_id definition */ dest->name = kmalloc(src->name_len + 1, GFP_NOIO | __GFP_NOFAIL); + } else { + dest->name = dest->inline_name; } - memcpy(dest->name, src->name, src->name_len + 1); dest->name_len = src->name_len; } @@ -1714,9 +2037,8 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) dest->primary = src->primary; } -static bool is_split(const struct ceph_pg *pgid, - u32 old_pg_num, - u32 new_pg_num) +bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, + u32 new_pg_num) { int old_bits = calc_bits_of(old_pg_num); int old_mask = (1 << old_bits) - 1; @@ -1761,7 +2083,7 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, !osds_equal(old_up, new_up) || old_size != new_size || old_min_size != new_min_size || - is_split(pgid, old_pg_num, new_pg_num) || + ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || old_sort_bitwise != new_sort_bitwise; } @@ -1885,16 +2207,12 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); * Should only be called with target_oid and 
@@ -1885,16 +2207,12 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
  * Should only be called with target_oid and target_oloc (as opposed to
  * base_oid and base_oloc), since tiering isn't taken into account.
  */
-int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
-			      struct ceph_object_id *oid,
-			      struct ceph_object_locator *oloc,
-			      struct ceph_pg *raw_pgid)
+int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
+				const struct ceph_object_id *oid,
+				const struct ceph_object_locator *oloc,
+				struct ceph_pg *raw_pgid)
 {
-	struct ceph_pg_pool_info *pi;
-
-	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
-	if (!pi)
-		return -ENOENT;
+	WARN_ON(pi->id != oloc->pool);
 
 	if (!oloc->pool_ns) {
 		raw_pgid->pool = oloc->pool;
@@ -1926,6 +2244,20 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
 	}
 	return 0;
 }
+
+int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
+			      const struct ceph_object_id *oid,
+			      const struct ceph_object_locator *oloc,
+			      struct ceph_pg *raw_pgid)
+{
+	struct ceph_pg_pool_info *pi;
+
+	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
+	if (!pi)
+		return -ENOENT;
+
+	return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid);
+}
 EXPORT_SYMBOL(ceph_object_locator_to_pg);
 
 /*
@@ -1970,23 +2302,57 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
 static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
 		    int *result, int result_max,
-		    const __u32 *weight, int weight_max)
+		    const __u32 *weight, int weight_max,
+		    u64 choose_args_index)
 {
+	struct crush_choose_arg_map *arg_map;
 	int r;
 
	BUG_ON(result_max > CEPH_PG_MAX_SIZE);
 
+	arg_map = lookup_choose_arg_map(&map->crush->choose_args,
+					choose_args_index);
+
 	mutex_lock(&map->crush_workspace_mutex);
 	r = crush_do_rule(map->crush, ruleno, x, result, result_max,
-			  weight, weight_max, map->crush_workspace);
+			  weight, weight_max, map->crush_workspace,
+			  arg_map ? arg_map->args : NULL);
 	mutex_unlock(&map->crush_workspace_mutex);
 
 	return r;
 }
 
+static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
+				    struct ceph_pg_pool_info *pi,
+				    struct ceph_osds *set)
+{
+	int i;
+
+	if (ceph_can_shift_osds(pi)) {
+		int removed = 0;
+
+		/* shift left */
+		for (i = 0; i < set->size; i++) {
+			if (!ceph_osd_exists(osdmap, set->osds[i])) {
+				removed++;
+				continue;
+			}
+			if (removed)
+				set->osds[i - removed] = set->osds[i];
+		}
+		set->size -= removed;
+	} else {
+		/* set dne devices to NONE */
+		for (i = 0; i < set->size; i++) {
+			if (!ceph_osd_exists(osdmap, set->osds[i]))
+				set->osds[i] = CRUSH_ITEM_NONE;
+		}
+	}
+}
+
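To make the two removal strategies in remove_nonexistent_osds() concrete: replicated pools may compact the set, but in erasure-coded pools the slot position encodes the shard id, so holes must be kept. A small userspace rendering (plain ints and a hard-coded existence predicate standing in for ceph_osd_exists(); all names here are hypothetical):

#include <stdbool.h>
#include <stdio.h>

#define ITEM_NONE (-1)		/* stand-in for CRUSH_ITEM_NONE */

static bool osd_exists(int osd)
{
	return osd != 7;	/* pretend osd.7 has been destroyed */
}

static void remove_nonexistent(int *osds, int *size, bool can_shift)
{
	int i, removed = 0;

	if (can_shift) {
		/* replicated: compact, relative order preserved */
		for (i = 0; i < *size; i++) {
			if (!osd_exists(osds[i])) {
				removed++;
				continue;
			}
			if (removed)
				osds[i - removed] = osds[i];
		}
		*size -= removed;
	} else {
		/* erasure-coded: keep the hole, position = shard */
		for (i = 0; i < *size; i++)
			if (!osd_exists(osds[i]))
				osds[i] = ITEM_NONE;
	}
}

int main(void)
{
	int a[] = { 1, 7, 3 }, an = 3;
	int b[] = { 1, 7, 3 }, bn = 3;

	remove_nonexistent(a, &an, true);	/* -> [1, 3], size 2 */
	remove_nonexistent(b, &bn, false);	/* -> [1, NONE, 3] */
	printf("shifted: %d %d (n=%d)\n", a[0], a[1], an);
	printf("ec:      %d %d %d\n", b[0], b[1], b[2]);
	return 0;
}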
 /*
- * Calculate raw set (CRUSH output) for given PG.  The result may
- * contain nonexistent OSDs.  ->primary is undefined for a raw set.
+ * Calculate raw set (CRUSH output) for given PG and filter out
+ * nonexistent OSDs.  ->primary is undefined for a raw set.
  *
  * Placement seed (CRUSH input) is returned through @ppps.
  */
@@ -2020,7 +2386,7 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
 	}
 
 	len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
-		       osdmap->osd_weight, osdmap->max_osd);
+		       osdmap->osd_weight, osdmap->max_osd, pi->id);
 	if (len < 0) {
 		pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
 		       len, ruleno, pi->id, pi->crush_ruleset, pi->type,
@@ -2029,6 +2395,70 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
 	}
 
 	raw->size = len;
+	remove_nonexistent_osds(osdmap, pi, raw);
+}
+
+/* apply pg_upmap[_items] mappings */
+static void apply_upmap(struct ceph_osdmap *osdmap,
+			const struct ceph_pg *pgid,
+			struct ceph_osds *raw)
+{
+	struct ceph_pg_mapping *pg;
+	int i, j;
+
+	pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
+	if (pg) {
+		/* make sure targets aren't marked out */
+		for (i = 0; i < pg->pg_upmap.len; i++) {
+			int osd = pg->pg_upmap.osds[i];
+
+			if (osd != CRUSH_ITEM_NONE &&
+			    osd < osdmap->max_osd &&
+			    osdmap->osd_weight[osd] == 0) {
+				/* reject/ignore explicit mapping */
+				return;
+			}
+		}
+		for (i = 0; i < pg->pg_upmap.len; i++)
+			raw->osds[i] = pg->pg_upmap.osds[i];
+		raw->size = pg->pg_upmap.len;
+		return;
+	}
+
+	pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
+	if (pg) {
+		/*
+		 * Note: this approach does not allow a bidirectional swap,
+		 * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
+		 */
+		for (i = 0; i < pg->pg_upmap_items.len; i++) {
+			int from = pg->pg_upmap_items.from_to[i][0];
+			int to = pg->pg_upmap_items.from_to[i][1];
+			int pos = -1;
+			bool exists = false;
+
+			/* make sure replacement doesn't already appear */
+			for (j = 0; j < raw->size; j++) {
+				int osd = raw->osds[j];
+
+				if (osd == to) {
+					exists = true;
+					break;
+				}
+				/* ignore mapping if target is marked out */
+				if (osd == from && pos < 0 &&
+				    !(to != CRUSH_ITEM_NONE &&
+				      to < osdmap->max_osd &&
+				      osdmap->osd_weight[to] == 0)) {
+					pos = j;
+				}
+			}
+			if (!exists && pos >= 0) {
+				raw->osds[pos] = to;
+				return;
+			}
+		}
+	}
 }
 
 /*
@@ -2151,18 +2581,16 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap,
  */
 static void get_temp_osds(struct ceph_osdmap *osdmap,
 			  struct ceph_pg_pool_info *pi,
-			  const struct ceph_pg *raw_pgid,
+			  const struct ceph_pg *pgid,
 			  struct ceph_osds *temp)
 {
-	struct ceph_pg pgid;
 	struct ceph_pg_mapping *pg;
 	int i;
 
-	raw_pg_to_pg(pi, raw_pgid, &pgid);
 	ceph_osds_init(temp);
 
 	/* pg_temp? */
-	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+	pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
 	if (pg) {
 		for (i = 0; i < pg->pg_temp.len; i++) {
 			if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
@@ -2185,7 +2613,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
 	}
 
 	/* primary_temp? */
-	pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
+	pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
 	if (pg)
 		temp->primary = pg->primary_temp.osd;
 }
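A quick illustration of the pg_upmap_items pass in apply_upmap() above, as a userspace sketch (the "marked out" weight check is omitted; names are illustrative). Each [from, to] pair replaces from with to only when to is absent from the set, and, matching the code above, the scan returns after the first pair that applies, which is why [[1,2],[2,1]] cannot express a swap:

#include <stdbool.h>
#include <stdio.h>

static void apply_items(int *osds, int size, const int (*items)[2],
			int nitems)
{
	int i, j;

	for (i = 0; i < nitems; i++) {
		int from = items[i][0], to = items[i][1];
		int pos = -1;
		bool exists = false;

		for (j = 0; j < size; j++) {
			if (osds[j] == to) {
				exists = true;	/* target already present */
				break;
			}
			if (osds[j] == from && pos < 0)
				pos = j;
		}
		if (!exists && pos >= 0) {
			osds[pos] = to;
			return;		/* first applicable pair wins */
		}
	}
}

int main(void)
{
	int a[] = { 0, 1, 2 };		/* remap: 1 -> 4 */
	int b[] = { 0, 1, 2 };		/* attempted swap of 1 and 2 */
	const int remap[][2] = { { 1, 4 } };
	const int swap[][2] = { { 1, 2 }, { 2, 1 } };

	apply_items(a, 3, remap, 1);	/* applied -> [0, 4, 2] */
	apply_items(b, 3, swap, 2);	/* both rejected -> [0, 1, 2] */
	printf("%d %d %d / %d %d %d\n", a[0], a[1], a[2],
	       b[0], b[1], b[2]);
	return 0;
}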
@@ -2198,43 +2626,75 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
  * resend a request.
  */
 void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
+			       struct ceph_pg_pool_info *pi,
 			       const struct ceph_pg *raw_pgid,
 			       struct ceph_osds *up,
 			       struct ceph_osds *acting)
 {
-	struct ceph_pg_pool_info *pi;
+	struct ceph_pg pgid;
 	u32 pps;
 
-	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
-	if (!pi) {
-		ceph_osds_init(up);
-		ceph_osds_init(acting);
-		goto out;
-	}
+	WARN_ON(pi->id != raw_pgid->pool);
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
 
 	pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
+	apply_upmap(osdmap, &pgid, up);
 	raw_to_up_osds(osdmap, pi, up);
 	apply_primary_affinity(osdmap, pi, pps, up);
-	get_temp_osds(osdmap, pi, raw_pgid, acting);
+	get_temp_osds(osdmap, pi, &pgid, acting);
 	if (!acting->size) {
 		memcpy(acting->osds, up->osds,
 		       up->size * sizeof(up->osds[0]));
 		acting->size = up->size;
 		if (acting->primary == -1)
 			acting->primary = up->primary;
 	}
-out:
 	WARN_ON(!osds_valid(up) || !osds_valid(acting));
 }
 
+bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
+			      struct ceph_pg_pool_info *pi,
+			      const struct ceph_pg *raw_pgid,
+			      struct ceph_spg *spgid)
+{
+	struct ceph_pg pgid;
+	struct ceph_osds up, acting;
+	int i;
+
+	WARN_ON(pi->id != raw_pgid->pool);
+	raw_pg_to_pg(pi, raw_pgid, &pgid);
+
+	if (ceph_can_shift_osds(pi)) {
+		spgid->pgid = pgid; /* struct */
+		spgid->shard = CEPH_SPG_NOSHARD;
+		return true;
+	}
+
+	ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);
+	for (i = 0; i < acting.size; i++) {
+		if (acting.osds[i] == acting.primary) {
+			spgid->pgid = pgid; /* struct */
+			spgid->shard = i;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 /*
  * Return acting primary for given PG, or -1 if none.
  */
 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 			      const struct ceph_pg *raw_pgid)
 {
+	struct ceph_pg_pool_info *pi;
 	struct ceph_osds up, acting;
 
-	ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting);
+	pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
+	if (!pi)
+		return -1;
+
+	ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
 	return acting.primary;
 }
 EXPORT_SYMBOL(ceph_pg_to_acting_primary);
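Tying the reworked signatures together: the pool info is now resolved once by the caller and threaded through, instead of each helper re-resolving oloc->pool. A hypothetical caller might look like the sketch below; locate() is illustrative only and not a function in this file, while the called helpers are the ones defined above.

/* illustrative caller -- not part of the patch */
static int locate(struct ceph_osdmap *osdmap,
		  const struct ceph_object_id *oid,
		  const struct ceph_object_locator *oloc)
{
	struct ceph_pg_pool_info *pi;
	struct ceph_pg raw_pgid;
	struct ceph_osds up, acting;

	pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
	if (!pi)
		return -ENOENT;			/* pool doesn't exist */

	/* oid+oloc -> raw PG (hash), then raw PG -> up/acting sets */
	__ceph_object_locator_to_pg(pi, oid, oloc, &raw_pgid);
	ceph_pg_to_up_acting_osds(osdmap, pi, &raw_pgid, &up, &acting);

	return acting.primary;			/* -1 if none */
}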