diff options
Diffstat (limited to 'drivers/staging/zcache/tmem.c')
-rw-r--r-- | drivers/staging/zcache/tmem.c | 894 |
1 files changed, 894 insertions, 0 deletions
diff --git a/drivers/staging/zcache/tmem.c b/drivers/staging/zcache/tmem.c new file mode 100644 index 000000000000..a2b7e03b6062 --- /dev/null +++ b/drivers/staging/zcache/tmem.c @@ -0,0 +1,894 @@ +/* + * In-kernel transcendent memory (generic implementation) + * + * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp. + * + * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented + * "handles" (triples containing a pool id, an object id, and an index), to + * pages in a page-accessible memory (PAM). Tmem references the PAM pages via + * an abstract "pampd" (PAM page-descriptor), which can be operated on by a + * set of functions (pamops). Each pampd contains some representation of + * PAGE_SIZE bytes worth of data. For those familiar with key-value stores, + * the tmem handle is a three-level hierarchical key, and the value is always + * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is + * referenced in the datastore by the pampd. The hierarchy is required + * to ensure that certain invalidation functions can be performed efficiently + * (i.e. flush all indexes associated with this object_id, or + * flush all objects associated with this pool). + * + * Tmem must support potentially millions of pages and must be able to insert, + * find, and delete these pages at a potential frequency of thousands per + * second concurrently across many CPUs, (and, if used with KVM, across many + * vcpus across many guests). Tmem is tracked with a hierarchy of data + * structures, organized by the elements in the handle-tuple: pool_id, + * object_id, and page index. One or more "clients" (e.g. guests) each + * provide one or more tmem_pools. Each pool contains a hash table of + * rb_trees of tmem_objs. Each tmem_obj contains a radix-tree-like tree + * of pointers, with intermediate nodes called tmem_objnodes. 
Each leaf + * pointer in this tree points to a pampd, which is accessible only through + * a small set of callbacks registered by the PAM implementation (see + * tmem_register_pamops). Tmem only needs to memory allocation for objs + * and objnodes and this is done via a set of callbacks that must be + * registered by the tmem host implementation (e.g. see tmem_register_hostops). + */ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#ifdef CONFIG_RAMSTER +#include <linux/delay.h> +#endif + +#include "tmem.h" + +/* data structure sentinels used for debugging... see tmem.h */ +#define POOL_SENTINEL 0x87658765 +#define OBJ_SENTINEL 0x12345678 +#define OBJNODE_SENTINEL 0xfedcba09 + +/* + * A tmem host implementation must use this function to register callbacks + * for memory allocation. + */ +static struct tmem_hostops tmem_hostops; + +static void tmem_objnode_tree_init(void); + +void tmem_register_hostops(struct tmem_hostops *m) +{ + tmem_objnode_tree_init(); + tmem_hostops = *m; +} + +/* + * A tmem host implementation must use this function to register + * callbacks for a page-accessible memory (PAM) implementation. + */ +static struct tmem_pamops tmem_pamops; + +void tmem_register_pamops(struct tmem_pamops *m) +{ + tmem_pamops = *m; +} + +/* + * Oid's are potentially very sparse and tmem_objs may have an indeterminately + * short life, being added and deleted at a relatively high frequency. + * So an rb_tree is an ideal data structure to manage tmem_objs. But because + * of the potentially huge number of tmem_objs, each pool manages a hashtable + * of rb_trees to reduce search, insert, delete, and rebalancing time. + * Each hashbucket also has a lock to manage concurrent access and no + * searches, inserts, or deletions can be performed unless the lock is held. 
+ * As a result, care must be taken to ensure tmem routines are not called + * recursively; the vast majority of the time, a recursive call may work + * but a deadlock will occur a small fraction of the time due to the + * hashbucket lock. + * + * The following routines manage tmem_objs. In all of these routines, + * the hashbucket lock is already held. + */ + +/* Search for object==oid in pool, returns object if found. */ +static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb, + struct tmem_oid *oidp, + struct rb_node **parent, + struct rb_node ***link) +{ + struct rb_node *_parent = NULL, **rbnode; + struct tmem_obj *obj = NULL; + + rbnode = &hb->obj_rb_root.rb_node; + while (*rbnode) { + BUG_ON(RB_EMPTY_NODE(*rbnode)); + _parent = *rbnode; + obj = rb_entry(*rbnode, struct tmem_obj, + rb_tree_node); + switch (tmem_oid_compare(oidp, &obj->oid)) { + case 0: /* equal */ + goto out; + case -1: + rbnode = &(*rbnode)->rb_left; + break; + case 1: + rbnode = &(*rbnode)->rb_right; + break; + } + } + + if (parent) + *parent = _parent; + if (link) + *link = rbnode; + obj = NULL; +out: + return obj; +} + +static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb, + struct tmem_oid *oidp) +{ + return __tmem_obj_find(hb, oidp, NULL, NULL); +} + +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool); + +/* Free an object that has no more pampds in it. 
*/ +static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb) +{ + struct tmem_pool *pool; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pampd_count > 0); + pool = obj->pool; + BUG_ON(pool == NULL); + if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */ + tmem_pampd_destroy_all_in_obj(obj, false); + BUG_ON(obj->objnode_tree_root != NULL); + BUG_ON((long)obj->objnode_count != 0); + atomic_dec(&pool->obj_count); + BUG_ON(atomic_read(&pool->obj_count) < 0); + INVERT_SENTINEL(obj, OBJ); + obj->pool = NULL; + tmem_oid_set_invalid(&obj->oid); + rb_erase(&obj->rb_tree_node, &hb->obj_rb_root); +} + +/* + * Initialize, and insert an tmem_object_root (called only if find failed). + */ +static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb, + struct tmem_pool *pool, + struct tmem_oid *oidp) +{ + struct rb_root *root = &hb->obj_rb_root; + struct rb_node **new = NULL, *parent = NULL; + + BUG_ON(pool == NULL); + atomic_inc(&pool->obj_count); + obj->objnode_tree_height = 0; + obj->objnode_tree_root = NULL; + obj->pool = pool; + obj->oid = *oidp; + obj->objnode_count = 0; + obj->pampd_count = 0; +#ifdef CONFIG_RAMSTER + if (tmem_pamops.new_obj != NULL) + (*tmem_pamops.new_obj)(obj); +#endif + SET_SENTINEL(obj, OBJ); + + if (__tmem_obj_find(hb, oidp, &parent, &new)) + BUG(); + + rb_link_node(&obj->rb_tree_node, parent, new); + rb_insert_color(&obj->rb_tree_node, root); +} + +/* + * Tmem is managed as a set of tmem_pools with certain attributes, such as + * "ephemeral" vs "persistent". These attributes apply to all tmem_objs + * and all pampds that belong to a tmem_pool. A tmem_pool is created + * or deleted relatively rarely (for example, when a filesystem is + * mounted or unmounted). 
+ */ + +/* flush all data from a pool and, optionally, free it */ +static void tmem_pool_flush(struct tmem_pool *pool, bool destroy) +{ + struct rb_node *rbnode; + struct tmem_obj *obj; + struct tmem_hashbucket *hb = &pool->hashbucket[0]; + int i; + + BUG_ON(pool == NULL); + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { + spin_lock(&hb->lock); + rbnode = rb_first(&hb->obj_rb_root); + while (rbnode != NULL) { + obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node); + rbnode = rb_next(rbnode); + tmem_pampd_destroy_all_in_obj(obj, true); + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + } + spin_unlock(&hb->lock); + } + if (destroy) + list_del(&pool->pool_list); +} + +/* + * A tmem_obj contains a radix-tree-like tree in which the intermediate + * nodes are called tmem_objnodes. (The kernel lib/radix-tree.c implementation + * is very specialized and tuned for specific uses and is not particularly + * suited for use from this code, though some code from the core algorithms has + * been reused, thus the copyright notices below). Each tmem_objnode contains + * a set of pointers which point to either a set of intermediate tmem_objnodes + * or a set of of pampds. 
+ * + * Portions Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com> + */ + +struct tmem_objnode_tree_path { + struct tmem_objnode *objnode; + int offset; +}; + +/* objnode height_to_maxindex translation */ +static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1]; + +static void tmem_objnode_tree_init(void) +{ + unsigned int ht, tmp; + + for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) { + tmp = ht * OBJNODE_TREE_MAP_SHIFT; + if (tmp >= OBJNODE_TREE_INDEX_BITS) + tmem_objnode_tree_h2max[ht] = ~0UL; + else + tmem_objnode_tree_h2max[ht] = + (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1; + } +} + +static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj) +{ + struct tmem_objnode *objnode; + + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + objnode = (*tmem_hostops.objnode_alloc)(obj->pool); + if (unlikely(objnode == NULL)) + goto out; + objnode->obj = obj; + SET_SENTINEL(objnode, OBJNODE); + memset(&objnode->slots, 0, sizeof(objnode->slots)); + objnode->slots_in_use = 0; + obj->objnode_count++; +out: + return objnode; +} + +static void tmem_objnode_free(struct tmem_objnode *objnode) +{ + struct tmem_pool *pool; + int i; + + BUG_ON(objnode == NULL); + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) + BUG_ON(objnode->slots[i] != NULL); + ASSERT_SENTINEL(objnode, OBJNODE); + INVERT_SENTINEL(objnode, OBJNODE); + BUG_ON(objnode->obj == NULL); + ASSERT_SENTINEL(objnode->obj, OBJ); + pool = objnode->obj->pool; + BUG_ON(pool == NULL); + ASSERT_SENTINEL(pool, POOL); + objnode->obj->objnode_count--; + objnode->obj = NULL; + (*tmem_hostops.objnode_free)(objnode, pool); +} + +/* + * Lookup index in object and return associated pampd (or NULL if not found). 
+ */ +static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) +{ + unsigned int height, shift; + struct tmem_objnode **slot = NULL; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + + height = obj->objnode_tree_height; + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) + goto out; + if (height == 0 && obj->objnode_tree_root) { + slot = &obj->objnode_tree_root; + goto out; + } + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; + slot = &obj->objnode_tree_root; + while (height > 0) { + if (*slot == NULL) + goto out; + slot = (struct tmem_objnode **) + ((*slot)->slots + + ((index >> shift) & OBJNODE_TREE_MAP_MASK)); + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } +out: + return slot != NULL ? (void **)slot : NULL; +} + +static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index) +{ + struct tmem_objnode **slot; + + slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); + return slot != NULL ? 
*slot : NULL; +} + +#ifdef CONFIG_RAMSTER +static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index, + void *new_pampd, bool no_free) +{ + struct tmem_objnode **slot; + void *ret = NULL; + + slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index); + if ((slot != NULL) && (*slot != NULL)) { + void *old_pampd = *(void **)slot; + *(void **)slot = new_pampd; + if (!no_free) + (*tmem_pamops.free)(old_pampd, obj->pool, + NULL, 0, false); + ret = new_pampd; + } + return ret; +} +#endif + +static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index, + void *pampd) +{ + int ret = 0; + struct tmem_objnode *objnode = NULL, *newnode, *slot; + unsigned int height, shift; + int offset = 0; + + /* if necessary, extend the tree to be higher */ + if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) { + height = obj->objnode_tree_height + 1; + if (index > tmem_objnode_tree_h2max[height]) + while (index > tmem_objnode_tree_h2max[height]) + height++; + if (obj->objnode_tree_root == NULL) { + obj->objnode_tree_height = height; + goto insert; + } + do { + newnode = tmem_objnode_alloc(obj); + if (!newnode) { + ret = -ENOMEM; + goto out; + } + newnode->slots[0] = obj->objnode_tree_root; + newnode->slots_in_use = 1; + obj->objnode_tree_root = newnode; + obj->objnode_tree_height++; + } while (height > obj->objnode_tree_height); + } +insert: + slot = obj->objnode_tree_root; + height = obj->objnode_tree_height; + shift = (height-1) * OBJNODE_TREE_MAP_SHIFT; + while (height > 0) { + if (slot == NULL) { + /* add a child objnode. 
*/ + slot = tmem_objnode_alloc(obj); + if (!slot) { + ret = -ENOMEM; + goto out; + } + if (objnode) { + + objnode->slots[offset] = slot; + objnode->slots_in_use++; + } else + obj->objnode_tree_root = slot; + } + /* go down a level */ + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; + objnode = slot; + slot = objnode->slots[offset]; + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } + BUG_ON(slot != NULL); + if (objnode) { + objnode->slots_in_use++; + objnode->slots[offset] = pampd; + } else + obj->objnode_tree_root = pampd; + obj->pampd_count++; +out: + return ret; +} + +static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index) +{ + struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1]; + struct tmem_objnode_tree_path *pathp = path; + struct tmem_objnode *slot = NULL; + unsigned int height, shift; + int offset; + + BUG_ON(obj == NULL); + ASSERT_SENTINEL(obj, OBJ); + BUG_ON(obj->pool == NULL); + ASSERT_SENTINEL(obj->pool, POOL); + height = obj->objnode_tree_height; + if (index > tmem_objnode_tree_h2max[height]) + goto out; + slot = obj->objnode_tree_root; + if (height == 0 && obj->objnode_tree_root) { + obj->objnode_tree_root = NULL; + goto out; + } + shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT; + pathp->objnode = NULL; + do { + if (slot == NULL) + goto out; + pathp++; + offset = (index >> shift) & OBJNODE_TREE_MAP_MASK; + pathp->offset = offset; + pathp->objnode = slot; + slot = slot->slots[offset]; + shift -= OBJNODE_TREE_MAP_SHIFT; + height--; + } while (height > 0); + if (slot == NULL) + goto out; + while (pathp->objnode) { + pathp->objnode->slots[pathp->offset] = NULL; + pathp->objnode->slots_in_use--; + if (pathp->objnode->slots_in_use) { + if (pathp->objnode == obj->objnode_tree_root) { + while (obj->objnode_tree_height > 0 && + obj->objnode_tree_root->slots_in_use == 1 && + obj->objnode_tree_root->slots[0]) { + struct tmem_objnode *to_free = + obj->objnode_tree_root; + + obj->objnode_tree_root = + to_free->slots[0]; + 
obj->objnode_tree_height--; + to_free->slots[0] = NULL; + to_free->slots_in_use = 0; + tmem_objnode_free(to_free); + } + } + goto out; + } + tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */ + pathp--; + } + obj->objnode_tree_height = 0; + obj->objnode_tree_root = NULL; + +out: + if (slot != NULL) + obj->pampd_count--; + BUG_ON(obj->pampd_count < 0); + return slot; +} + +/* Recursively walk the objnode_tree destroying pampds and objnodes. */ +static void tmem_objnode_node_destroy(struct tmem_obj *obj, + struct tmem_objnode *objnode, + unsigned int ht) +{ + int i; + + if (ht == 0) + return; + for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) { + if (objnode->slots[i]) { + if (ht == 1) { + obj->pampd_count--; + (*tmem_pamops.free)(objnode->slots[i], + obj->pool, NULL, 0, true); + objnode->slots[i] = NULL; + continue; + } + tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1); + tmem_objnode_free(objnode->slots[i]); + objnode->slots[i] = NULL; + } + } +} + +static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj, + bool pool_destroy) +{ + if (obj->objnode_tree_root == NULL) + return; + if (obj->objnode_tree_height == 0) { + obj->pampd_count--; + (*tmem_pamops.free)(obj->objnode_tree_root, + obj->pool, NULL, 0, true); + } else { + tmem_objnode_node_destroy(obj, obj->objnode_tree_root, + obj->objnode_tree_height); + tmem_objnode_free(obj->objnode_tree_root); + obj->objnode_tree_height = 0; + } + obj->objnode_tree_root = NULL; +#ifdef CONFIG_RAMSTER + if (tmem_pamops.free_obj != NULL) + (*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy); +#endif +} + +/* + * Tmem is operated on by a set of well-defined actions: + * "put", "get", "flush", "flush_object", "new pool" and "destroy pool". + * (The tmem ABI allows for subpages and exchanges but these operations + * are not included in this implementation.) + * + * These "tmem core" operations are implemented in the following functions. + */ + +/* + * "Put" a page, e.g. 
associate the passed pampd with the passed handle. + * Tmem_put is complicated by a corner case: What if a page with matching + * handle already exists in tmem? To guarantee coherency, one of two + * actions is necessary: Either the data for the page must be overwritten, + * or the page must be "flushed" so that the data is not accessible to a + * subsequent "get". Since these "duplicate puts" are relatively rare, + * this implementation always flushes for simplicity. + */ +int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, + bool raw, void *pampd_to_use) +{ + struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL; + void *pampd = NULL, *pampd_del = NULL; + int ret = -ENOMEM; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = objfound = tmem_obj_find(hb, oidp); + if (obj != NULL) { + pampd = tmem_pampd_lookup_in_obj(objfound, index); + if (pampd != NULL) { + /* if found, is a dup put, flush the old one */ + pampd_del = tmem_pampd_delete_from_obj(obj, index); + BUG_ON(pampd_del != pampd); + (*tmem_pamops.free)(pampd, pool, oidp, index, true); + if (obj->pampd_count == 0) { + objnew = obj; + objfound = NULL; + } + pampd = NULL; + } + } else { + obj = objnew = (*tmem_hostops.obj_alloc)(pool); + if (unlikely(obj == NULL)) { + ret = -ENOMEM; + goto out; + } + tmem_obj_init(obj, hb, pool, oidp); + } + BUG_ON(obj == NULL); + BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound)); + pampd = pampd_to_use; + BUG_ON(pampd_to_use == NULL); + ret = tmem_pampd_add_to_obj(obj, index, pampd); + if (unlikely(ret == -ENOMEM)) + /* may have partially built objnode tree ("stump") */ + goto delete_and_free; + (*tmem_pamops.create_finish)(pampd, is_ephemeral(pool)); + goto out; + +delete_and_free: + (void)tmem_pampd_delete_from_obj(obj, index); + if (pampd) + (*tmem_pamops.free)(pampd, pool, NULL, 0, true); + if (objnew) { + tmem_obj_free(objnew, hb); + 
(*tmem_hostops.obj_free)(objnew, pool); + } +out: + spin_unlock(&hb->lock); + return ret; +} + +#ifdef CONFIG_RAMSTER +/* + * For ramster only: The following routines provide a two-step sequence + * to allow the caller to replace a pampd in the tmem data structures with + * another pampd. Here, we lookup the passed handle and, if found, return the + * associated pampd and object, leaving the hashbucket locked and returning + * a reference to it. The caller is expected to immediately call the + * matching tmem_localify_finish routine which will handles the replacement + * and unlocks the hashbucket. + */ +void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, struct tmem_obj **ret_obj, + void **saved_hb) +{ + struct tmem_hashbucket *hb; + struct tmem_obj *obj = NULL; + void *pampd = NULL; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (likely(obj != NULL)) + pampd = tmem_pampd_lookup_in_obj(obj, index); + *ret_obj = obj; + *saved_hb = (void *)hb; + /* note, hashbucket remains locked */ + return pampd; +} + +void tmem_localify_finish(struct tmem_obj *obj, uint32_t index, + void *pampd, void *saved_hb, bool delete) +{ + struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb; + + BUG_ON(!spin_is_locked(&hb->lock)); + if (pampd != NULL) { + BUG_ON(obj == NULL); + (void)tmem_pampd_replace_in_obj(obj, index, pampd, 1); + (*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool)); + } else if (delete) { + BUG_ON(obj == NULL); + (void)tmem_pampd_delete_from_obj(obj, index); + } + spin_unlock(&hb->lock); +} + +/* + * For ramster only. Helper function to support asynchronous tmem_get. 
+ */ +static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb, + struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, bool free, char *data) +{ + void *old_pampd = *ppampd, *new_pampd = NULL; + bool intransit = false; + int ret = 0; + + if (!is_ephemeral(pool)) + new_pampd = (*tmem_pamops.repatriate_preload)( + old_pampd, pool, oidp, index, &intransit); + if (intransit) + ret = -EAGAIN; + else if (new_pampd != NULL) + *ppampd = new_pampd; + /* must release the hb->lock else repatriate can't sleep */ + spin_unlock(&hb->lock); + if (!intransit) + ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool, + oidp, index, free, data); + if (ret == -EAGAIN) { + /* rare I think, but should cond_resched()??? */ + usleep_range(10, 1000); + } else if (ret == -ENOTCONN || ret == -EHOSTDOWN) { + ret = -1; + } else if (ret != 0 && ret != -ENOENT) { + ret = -1; + } + /* note hb->lock has now been unlocked */ + return ret; +} + +/* + * For ramster only. If a page in tmem matches the handle, replace the + * page so that any subsequent "get" gets the new page. Returns 0 if + * there was a page to replace, else returns -1. + */ +int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp, + uint32_t index, void *new_pampd) +{ + struct tmem_obj *obj; + int ret = -1; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0); + /* if we bug here, pamops wasn't properly set up for ramster */ + BUG_ON(tmem_pamops.replace_in_obj == NULL); + ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj); +out: + spin_unlock(&hb->lock); + return ret; +} +#endif + +/* + * "Get" a page, e.g. if a pampd can be found matching the passed handle, + * use a pamops callback to recreated the page from the pampd with the + * matching handle. 
By tmem definition, when a "get" is successful on + * an ephemeral page, the page is "flushed", and when a "get" is successful + * on a persistent page, the page is retained in tmem. Note that to preserve + * coherency, "get" can never be skipped if tmem contains the data. + * That is, if a get is done with a certain handle and fails, any + * subsequent "get" must also fail (unless of course there is a + * "put" done with the same handle). + */ +int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index, + char *data, size_t *sizep, bool raw, int get_and_free) +{ + struct tmem_obj *obj; + void *pampd = NULL; + bool ephemeral = is_ephemeral(pool); + int ret = -1; + struct tmem_hashbucket *hb; + bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral); + bool lock_held = false; + void **ppampd; + + do { + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + lock_held = true; + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + ppampd = __tmem_pampd_lookup_in_obj(obj, index); + if (ppampd == NULL) + goto out; +#ifdef CONFIG_RAMSTER + if ((tmem_pamops.is_remote != NULL) && + tmem_pamops.is_remote(*ppampd)) { + ret = tmem_repatriate(ppampd, hb, pool, oidp, + index, free, data); + /* tmem_repatriate releases hb->lock */ + lock_held = false; + *sizep = PAGE_SIZE; + if (ret != -EAGAIN) + goto out; + } +#endif + } while (ret == -EAGAIN); + if (free) + pampd = tmem_pampd_delete_from_obj(obj, index); + else + pampd = tmem_pampd_lookup_in_obj(obj, index); + if (pampd == NULL) + goto out; + if (free) { + if (obj->pampd_count == 0) { + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + obj = NULL; + } + } + if (free) + ret = (*tmem_pamops.get_data_and_free)( + data, sizep, raw, pampd, pool, oidp, index); + else + ret = (*tmem_pamops.get_data)( + data, sizep, raw, pampd, pool, oidp, index); + if (ret < 0) + goto out; + ret = 0; +out: + if (lock_held) + spin_unlock(&hb->lock); + return ret; +} + +/* + * If a 
page in tmem matches the handle, "flush" this page from tmem such + * that any subsequent "get" does not succeed (unless, of course, there + * was another "put" with the same handle). + */ +int tmem_flush_page(struct tmem_pool *pool, + struct tmem_oid *oidp, uint32_t index) +{ + struct tmem_obj *obj; + void *pampd; + int ret = -1; + struct tmem_hashbucket *hb; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + pampd = tmem_pampd_delete_from_obj(obj, index); + if (pampd == NULL) + goto out; + (*tmem_pamops.free)(pampd, pool, oidp, index, true); + if (obj->pampd_count == 0) { + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + } + ret = 0; + +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Flush" all pages in tmem matching this oid. + */ +int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp) +{ + struct tmem_obj *obj; + struct tmem_hashbucket *hb; + int ret = -1; + + hb = &pool->hashbucket[tmem_oid_hash(oidp)]; + spin_lock(&hb->lock); + obj = tmem_obj_find(hb, oidp); + if (obj == NULL) + goto out; + tmem_pampd_destroy_all_in_obj(obj, false); + tmem_obj_free(obj, hb); + (*tmem_hostops.obj_free)(obj, pool); + ret = 0; + +out: + spin_unlock(&hb->lock); + return ret; +} + +/* + * "Flush" all pages (and tmem_objs) from this tmem_pool and disable + * all subsequent access to this tmem_pool. + */ +int tmem_destroy_pool(struct tmem_pool *pool) +{ + int ret = -1; + + if (pool == NULL) + goto out; + tmem_pool_flush(pool, 1); + ret = 0; +out: + return ret; +} + +static LIST_HEAD(tmem_global_pool_list); + +/* + * Create a new tmem_pool with the provided flag and return + * a pool id provided by the tmem host implementation. 
+ */ +void tmem_new_pool(struct tmem_pool *pool, uint32_t flags) +{ + int persistent = flags & TMEM_POOL_PERSIST; + int shared = flags & TMEM_POOL_SHARED; + struct tmem_hashbucket *hb = &pool->hashbucket[0]; + int i; + + for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) { + hb->obj_rb_root = RB_ROOT; + spin_lock_init(&hb->lock); + } + INIT_LIST_HEAD(&pool->pool_list); + atomic_set(&pool->obj_count, 0); + SET_SENTINEL(pool, POOL); + list_add_tail(&pool->pool_list, &tmem_global_pool_list); + pool->persistent = persistent; + pool->shared = shared; +} |