/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ #include #include #include #include static LIST_HEAD(bpf_map_types); static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { struct bpf_map_type_list *tl; struct bpf_map *map; list_for_each_entry(tl, &bpf_map_types, list_node) { if (tl->type == attr->map_type) { map = tl->ops->map_alloc(attr); if (IS_ERR(map)) return map; map->ops = tl->ops; map->map_type = attr->map_type; return map; } } return ERR_PTR(-EINVAL); } /* boot time registration of different map implementations */ void bpf_register_map_type(struct bpf_map_type_list *tl) { list_add(&tl->list_node, &bpf_map_types); } /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); /* implementation dependent freeing */ map->ops->map_free(map); } /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ void bpf_map_put(struct bpf_map *map) { if (atomic_dec_and_test(&map->refcnt)) { INIT_WORK(&map->work, bpf_map_free_deferred); schedule_work(&map->work); } } static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; bpf_map_put(map); return 0; } static const struct file_operations bpf_map_fops = { .release = bpf_map_release, }; /* helper macro to check that unused fields 'union bpf_attr' are zero */ #define CHECK_ATTR(CMD) \ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ sizeof(attr->CMD##_LAST_FIELD), 0, \ sizeof(*attr) - \ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL #define BPF_MAP_CREATE_LAST_FIELD max_entries /* called via syscall */ static int map_create(union bpf_attr *attr) { struct bpf_map *map; int err; err = CHECK_ATTR(BPF_MAP_CREATE); if (err) return -EINVAL; /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map = find_and_alloc_map(attr); if (IS_ERR(map)) return PTR_ERR(map); atomic_set(&map->refcnt, 1); err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); if (err < 0) /* failed to allocate fd */ goto free_map; return err; free_map: map->ops->map_free(map); return err; } SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; int err; /* the syscall is limited to root temporarily. This restriction will be * lifted when security audit is clean. Note that eBPF+tracing must have * this restriction, since it may pass kernel data to user space */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!access_ok(VERIFY_READ, uattr, 1)) return -EFAULT; if (size > PAGE_SIZE) /* silly large */ return -E2BIG; /* If we're handed a bigger struct than we know of, * ensure all the unknown bits are 0 - i.e. new * user-space does not rely on any kernel feature * extensions we dont know about yet. */ if (size > sizeof(attr)) { unsigned char __user *addr; unsigned char __user *end; unsigned char val; addr = (void __user *)uattr + sizeof(attr); end = (void __user *)uattr + size; for (; addr < end; addr++) { err = get_user(val, addr); if (err) return err; if (val) return -E2BIG; } size = sizeof(attr); } /* copy attributes from user space, may be less than sizeof(bpf_attr) */ if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; switch (cmd) { case BPF_MAP_CREATE: err = map_create(&attr); break; default: err = -EINVAL; break; } return err; }