summaryrefslogtreecommitdiffstats
path: root/net/ceph
diff options
context:
space:
mode:
authorIlya Dryomov <ilya.dryomov@inktank.com>2014-03-24 17:12:49 +0200
committerSage Weil <sage@inktank.com>2014-04-04 21:08:17 -0700
commit47ec1f3cc46dde00deb34922dbffdeda254ad76d (patch)
tree1dbfa1b18638f70684a531ba53c16e40a7e3f320 /net/ceph
parent5e8d4d36bf23bb7baf027c479d54395840219928 (diff)
downloadlinux-47ec1f3cc46dde00deb34922dbffdeda254ad76d.tar.gz
linux-47ec1f3cc46dde00deb34922dbffdeda254ad76d.tar.bz2
linux-47ec1f3cc46dde00deb34922dbffdeda254ad76d.zip
libceph: add support for osd primary affinity
Respond to non-default primary_affinity values accordingly. (Primary affinity allows the admin to shift 'primary responsibility' away from specific osds, effectively shifting around the read side of the workload and whatever overhead is incurred by peering and writes by virtue of being the primary). Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com> Reviewed-by: Alex Elder <elder@linaro.org>
Diffstat (limited to 'net/ceph')
-rw-r--r--net/ceph/osdmap.c68
1 files changed, 68 insertions, 0 deletions
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 20a38a37794c..ae8f367c5291 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1596,6 +1596,72 @@ static int raw_to_up_osds(struct ceph_osdmap *osdmap,
return len;
}
+static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
+ struct ceph_pg_pool_info *pool,
+ int *osds, int len, int *primary)
+{
+ int i;
+ int pos = -1;
+
+ /*
+ * Do we have any non-default primary_affinity values for these
+ * osds?
+ */
+ if (!osdmap->osd_primary_affinity)
+ return;
+
+ for (i = 0; i < len; i++) {
+ if (osds[i] != CRUSH_ITEM_NONE &&
+ osdmap->osd_primary_affinity[i] !=
+ CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
+ break;
+ }
+ }
+ if (i == len)
+ return;
+
+ /*
+ * Pick the primary. Feed both the seed (for the pg) and the
+ * osd into the hash/rng so that a proportional fraction of an
+ * osd's pgs get rejected as primary.
+ */
+ for (i = 0; i < len; i++) {
+ int osd;
+ u32 aff;
+
+ osd = osds[i];
+ if (osd == CRUSH_ITEM_NONE)
+ continue;
+
+ aff = osdmap->osd_primary_affinity[osd];
+ if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
+ (crush_hash32_2(CRUSH_HASH_RJENKINS1,
+ pps, osd) >> 16) >= aff) {
+ /*
+ * We chose not to use this primary. Note it
+ * anyway as a fallback in case we don't pick
+ * anyone else, but keep looking.
+ */
+ if (pos < 0)
+ pos = i;
+ } else {
+ pos = i;
+ break;
+ }
+ }
+ if (pos < 0)
+ return;
+
+ *primary = osds[pos];
+
+ if (ceph_can_shift_osds(pool) && pos > 0) {
+ /* move the new primary to the front */
+ for (i = pos; i > 0; i--)
+ osds[i] = osds[i - 1];
+ osds[0] = *primary;
+ }
+}
+
/*
* Given up set, apply pg_temp and primary_temp mappings.
*
@@ -1698,6 +1764,8 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
len = raw_to_up_osds(osdmap, pool, osds, len, primary);
+ apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
+
len = apply_temps(osdmap, pool, pgid, osds, len, primary);
return len;