diff options
Diffstat (limited to 'kernel')
41 files changed, 568 insertions, 300 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cde93d54c571..6aa7543bcdb2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -122,7 +122,11 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") +filechk_ikconfiggz = \ + echo "static const char kernel_config_data[] __used = MAGIC_START"; \ + cat $< | scripts/bin2c; \ + echo "MAGIC_END;" + targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 715f9fcf4712..befe570be5ba 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -467,7 +467,7 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) return kind_ops[BTF_INFO_KIND(t->info)]; } -bool btf_name_offset_valid(const struct btf *btf, u32 offset) +static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { return BTF_STR_OFFSET_VALID(offset) && offset < btf->hdr.str_len; @@ -1219,8 +1219,6 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset, u8 nr_copy_bits; u64 print_num; - data += BITS_ROUNDDOWN_BYTES(bits_offset); - bits_offset = BITS_PER_BYTE_MASKED(bits_offset); nr_copy_bits = nr_bits + bits_offset; nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); @@ -1255,7 +1253,9 @@ static void btf_int_bits_seq_show(const struct btf *btf, * BTF_INT_OFFSET() cannot exceed 64 bits. */ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); - btf_bitfield_seq_show(data, total_bits_offset, nr_bits, m); + data += BITS_ROUNDDOWN_BYTES(total_bits_offset); + bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); + btf_bitfield_seq_show(data, bits_offset, nr_bits, m); } static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, @@ -2001,12 +2001,12 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, member_offset = btf_member_bit_offset(t, member); bitfield_size = btf_member_bitfield_size(t, member); + bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); + bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { - btf_bitfield_seq_show(data, member_offset, + btf_bitfield_seq_show(data + bytes_offset, bits8_offset, bitfield_size, m); } else { - bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); - bits8_offset = BITS_PER_BYTE_MASKED(member_offset); ops = btf_type_ops(member_type); ops->seq_show(btf, member_type, member->type, data + bytes_offset, bits8_offset, m); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9425c2fb872f..ab612fe9862f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -718,6 +718,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + /* fall through */ default: return NULL; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 38de580abcc2..f908b9356025 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -54,6 +54,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -857,6 +858,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! + */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { @@ -1188,7 +1209,6 @@ bool bpf_opcode_in_insntable(u8 code) */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { - u64 tmp; #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { @@ -1268,36 +1288,36 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1553,7 +1573,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ @@ -1566,7 +1586,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 99d243e1ad6e..52378d3e34b3 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; + u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); @@ -36,7 +37,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } - inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + inner_map_meta_size = sizeof(*inner_map_meta); + /* In some cases verifier needs to access beyond just base map. */ + if (inner_map->ops == &array_map_ops) + inner_map_meta_size = sizeof(struct bpf_array); + + inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); @@ -46,9 +52,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; - inner_map_meta->ops = inner_map->ops; inner_map_meta->max_entries = inner_map->max_entries; + /* Misc members not needed in bpf_map_meta_equal() check. */ + inner_map_meta->ops = inner_map->ops; + if (inner_map->ops == &array_map_ops) { + inner_map_meta->unpriv_array = inner_map->unpriv_array; + container_of(inner_map_meta, struct bpf_array, map)->index_mask = + container_of(inner_map, struct bpf_array, map)->index_mask; + } + fdput(f); return inner_map_meta; } diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 90daf285de03..d43b14535827 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -180,11 +180,14 @@ static inline int stack_map_parse_build_id(void *page_addr, if (nhdr->n_type == BPF_BUILD_ID && nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz == BPF_BUILD_ID_SIZE) { + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { memcpy(build_id, note_start + note_offs + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - BPF_BUILD_ID_SIZE); + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BPF_BUILD_ID_SIZE - nhdr->n_descsz); return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + @@ -260,7 +263,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, return -EFAULT; /* page not mapped */ ret = -EINVAL; - page_addr = page_address(page); + page_addr = kmap_atomic(page); ehdr = (Elf32_Ehdr *)page_addr; /* compare magic x7f "ELF" */ @@ -276,6 +279,7 @@ static int stack_map_get_build_id(struct vm_area_struct *vma, else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) ret = stack_map_get_build_id_64(page_addr, build_id); out: + kunmap_atomic(page_addr); put_page(page); return ret; } @@ -310,6 +314,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); } return; } @@ -320,6 +325,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; + memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0607db304def..b155cd17c1bd 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -79,7 +79,7 @@ int bpf_check_uarg_tail_zero(void __user *uaddr, if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ return -E2BIG; - if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size))) + if (unlikely(!access_ok(uaddr, actual_size))) return -EFAULT; if (actual_size <= expected_size) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71d86e3024ae..56674a7c3778 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -710,6 +710,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, free_func_state(dst_state->frame[i]); dst_state->frame[i] = NULL; } + dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; @@ -754,7 +755,8 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_state *cur = env->cur_state; struct bpf_verifier_stack_elem *elem; @@ -772,6 +774,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, err = copy_verifier_state(&elem->st, cur); if (err) goto err; + elem->st.speculative |= speculative; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose(env, "BPF program is too complex\n"); goto err; @@ -1387,6 +1390,31 @@ static int check_stack_read(struct bpf_verifier_env *env, } } +static int check_stack_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + /* Stack accesses must be at a fixed offset, so that we + * can determine what type of data were returned. See + * check_stack_read(). + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) @@ -1418,13 +1446,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (env->log.level) print_verifier_state(env, state); + /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0) { + if (reg->smin_value < 0 && + (reg->smin_value == S64_MIN || + (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || + reg->smin_value + off < 0)) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -1954,24 +1986,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - /* stack accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - * See check_stack_read(). - */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } off += reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, - size); - return -EACCES; - } + err = check_stack_access(env, reg, off, size); + if (err) + return err; state = func(env, reg); err = update_stack_depth(env, state, off); @@ -3052,6 +3070,125 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; +} + +static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, + u32 alu_state, u32 alu_limit) +{ + /* If we arrived here from different branches with different + * state or limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + return 0; +} + +static int sanitize_val_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + if (update_alu_sanitation_state(aux, alu_state, alu_limit)) + return -EACCES; +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -3070,8 +3207,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg; + int ret; dst_reg = ®s[dst]; @@ -3104,6 +3242,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; + case PTR_TO_MAP_VALUE: + if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", + off_reg == dst_reg ? dst : src); + return -EACCES; + } + /* fall-through */ default: break; } @@ -3120,6 +3265,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -3170,6 +3320,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose(env, "R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ verbose(env, "R%d tried to subtract pointer from scalar\n", @@ -3249,6 +3404,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (!env->allow_ptr_leaks) { + if (dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1, false)) { + verbose(env, "R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } else if (dst_reg->type == PTR_TO_STACK && + check_stack_access(env, dst_reg, dst_reg->off + + dst_reg->var_off.value, 1)) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } + } + return 0; } @@ -3267,6 +3441,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, s64 smin_val, smax_val; u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + u32 dst = insn->dst_reg; + int ret; if (insn_bitness == 32) { /* Relevant for 32-bit RSH: Information can propagate towards @@ -3301,6 +3477,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to add from different pointers or scalars\n", dst); + return ret; + } if (signed_add_overflows(dst_reg->smin_value, smin_val) || signed_add_overflows(dst_reg->smax_value, smax_val)) { dst_reg->smin_value = S64_MIN; @@ -3320,6 +3501,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose(env, "R%d tried to sub from different pointers or scalars\n", dst); + return ret; + } if (signed_sub_overflows(dst_reg->smin_value, smax_val) || signed_sub_overflows(dst_reg->smax_value, smin_val)) { /* Overflow possible, we know nothing */ @@ -4348,7 +4534,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; other_branch_regs = other_branch->frame[other_branch->curframe]->regs; @@ -5458,6 +5645,12 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -5650,7 +5843,6 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; int insn_cnt = env->prog->len, i; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -5660,6 +5852,7 @@ static int do_check(struct bpf_verifier_env *env) if (!state) return -ENOMEM; state->curframe = 0; + state->speculative = false; state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); if (!state->frame[0]) { kfree(state); @@ -5670,19 +5863,19 @@ static int do_check(struct bpf_verifier_env *env) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, 0 /* subprogno, zero == main subprog */); - insn_idx = 0; + for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose(env, "invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -5692,17 +5885,19 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ if (env->log.level) { if (do_print_state) - verbose(env, "\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else - verbose(env, "%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -5715,10 +5910,12 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level > 1 || (env->log.level && do_print_state)) { if (env->log.level > 1) - verbose(env, "%d:", insn_idx); + verbose(env, "%d:", env->insn_idx); else - verbose(env, "\nfrom %d to %d:", - prev_insn_idx, insn_idx); + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -5729,20 +5926,20 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; - verbose_linfo(env, insn_idx, "; "); - verbose(env, "%d: ", insn_idx); + verbose_linfo(env, env->insn_idx, "; "); + verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { - err = bpf_prog_offload_verify_insn(env, insn_idx, - prev_insn_idx); + err = bpf_prog_offload_verify_insn(env, env->insn_idx, + env->prev_insn_idx); if (err) return err; } regs = cur_regs(env); - env->insn_aux_data[insn_idx].seen = true; + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); @@ -5768,13 +5965,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -5799,10 +5996,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -5818,13 +6015,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -5852,9 +6049,9 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; @@ -5872,9 +6069,9 @@ static int do_check(struct bpf_verifier_env *env) } if (insn->src_reg == BPF_PSEUDO_CALL) - err = check_func_call(env, insn, &insn_idx); + err = check_func_call(env, insn, &env->insn_idx); else - err = check_helper_call(env, insn->imm, insn_idx); + err = check_helper_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -5887,7 +6084,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { @@ -5901,8 +6098,8 @@ static int do_check(struct bpf_verifier_env *env) if (state->curframe) { /* exit from nested function */ - prev_insn_idx = insn_idx; - err = prepare_func_exit(env, &insn_idx); + env->prev_insn_idx = env->insn_idx; + err = prepare_func_exit(env, &env->insn_idx); if (err) return err; do_print_state = true; @@ -5932,7 +6129,8 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: - err = pop_stack(env, &prev_insn_idx, &insn_idx); + err = pop_stack(env, &env->prev_insn_idx, + &env->insn_idx); if (err < 0) { if (err != -ENOENT) return err; @@ -5942,7 +6140,7 @@ process_bpf_exit: continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -5959,8 +6157,8 @@ process_bpf_exit: if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -5970,7 +6168,7 @@ process_bpf_exit: return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose(env, "processed %d insns (limit %d), stack depth ", @@ -6709,6 +6907,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; if (insn->src_reg == BPF_PSEUDO_CALL) diff --git a/kernel/compat.c b/kernel/compat.c index 089d00d0da9c..f01affa17e22 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -95,28 +95,28 @@ int compat_put_timex(struct compat_timex __user *utp, const struct timex *txc) static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv) { - return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || + return (!access_ok(ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv) { - return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || + return (!access_ok(ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts) { - return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || + return (!access_ok(cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts) { - return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || + return (!access_ok(cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } @@ -335,7 +335,7 @@ int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event) { memset(event, 0, sizeof(*event)); - return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) || + return (!access_ok(u_event, sizeof(*u_event)) || __get_user(event->sigev_value.sival_int, &u_event->sigev_value.sival_int) || __get_user(event->sigev_signo, &u_event->sigev_signo) || @@ -354,10 +354,9 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!access_ok(VERIFY_READ, umask, bitmap_size / 8)) + if (!user_access_begin(umask, bitmap_size / 8)) return -EFAULT; - user_access_begin(); while (nr_compat_longs > 1) { compat_ulong_t l1, l2; unsafe_get_user(l1, umask++, Efault); @@ -384,10 +383,9 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG); nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size); - if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8)) + if (!user_access_begin(umask, bitmap_size / 8)) return -EFAULT; - user_access_begin(); while (nr_compat_longs > 1) { unsigned long m = *mask++; unsafe_put_user((compat_ulong_t)m, umask++, Efault); @@ -438,7 +436,7 @@ void __user *compat_alloc_user_space(unsigned long len) ptr = arch_compat_alloc_user_space(len); - if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) + if (unlikely(!access_ok(ptr, len))) return NULL; return ptr; diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 597d40893862..66f0fb7e9a3a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -223,7 +223,6 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, */ return mem->flags & DMA_MEMORY_EXCLUSIVE; } -EXPORT_SYMBOL(dma_alloc_from_dev_coherent); void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) { @@ -268,7 +267,6 @@ int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr) return __dma_release_from_coherent(mem, order, vaddr); } -EXPORT_SYMBOL(dma_release_from_dev_coherent); int dma_release_from_global_coherent(int order, void *vaddr) { diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 164706da2a73..23cf5361bcf1 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -49,7 +49,6 @@ enum { dma_debug_single, - dma_debug_page, dma_debug_sg, dma_debug_coherent, dma_debug_resource, @@ -1300,8 +1299,7 @@ void debug_dma_map_single(struct device *dev, const void *addr, EXPORT_SYMBOL(debug_dma_map_single); void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr, - bool map_single) + size_t size, int direction, dma_addr_t dma_addr) { struct dma_debug_entry *entry; @@ -1316,7 +1314,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, return; entry->dev = dev; - entry->type = dma_debug_page; + entry->type = dma_debug_single; entry->pfn = page_to_pfn(page); entry->offset = offset, entry->dev_addr = dma_addr; @@ -1324,9 +1322,6 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - if (map_single) - entry->type = dma_debug_single; - check_for_stack(dev, page, offset); if (!PageHighMem(page)) { @@ -1378,10 +1373,10 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) EXPORT_SYMBOL(debug_dma_mapping_error); void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, int direction, bool map_single) + size_t size, int direction) { struct dma_debug_entry ref = { - .type = dma_debug_page, + .type = dma_debug_single, .dev = dev, .dev_addr = addr, .size = size, @@ -1390,10 +1385,6 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, if (unlikely(dma_debug_disabled())) return; - - if (map_single) - ref.type = dma_debug_single; - check_unmap(&ref); } EXPORT_SYMBOL(debug_dma_unmap_page); @@ -1521,7 +1512,6 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, add_dma_entry(entry); } -EXPORT_SYMBOL(debug_dma_alloc_coherent); void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr) @@ -1549,7 +1539,6 @@ void debug_dma_free_coherent(struct device *dev, size_t size, check_unmap(&ref); } -EXPORT_SYMBOL(debug_dma_free_coherent); void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, int direction, dma_addr_t dma_addr) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d7c34d2d1ba5..a11006b6d8e8 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -46,45 +46,6 @@ static int dmam_match(struct device *dev, void *res, void *match_data) } /** - * dmam_alloc_coherent - Managed dma_alloc_coherent() - * @dev: Device to allocate coherent memory for - * @size: Size of allocation - * @dma_handle: Out argument for allocated DMA handle - * @gfp: Allocation flags - * - * Managed dma_alloc_coherent(). Memory allocated using this function - * will be automatically released on driver detach. - * - * RETURNS: - * Pointer to allocated memory on success, NULL on failure. - */ -void *dmam_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - struct dma_devres *dr; - void *vaddr; - - dr = devres_alloc(dmam_release, sizeof(*dr), gfp); - if (!dr) - return NULL; - - vaddr = dma_alloc_coherent(dev, size, dma_handle, gfp); - if (!vaddr) { - devres_free(dr); - return NULL; - } - - dr->vaddr = vaddr; - dr->dma_handle = *dma_handle; - dr->size = size; - - devres_add(dev, dr); - - return vaddr; -} -EXPORT_SYMBOL(dmam_alloc_coherent); - -/** * dmam_free_coherent - Managed dma_free_coherent() * @dev: Device to free coherent memory for * @size: Size of allocation @@ -144,61 +105,6 @@ void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, } EXPORT_SYMBOL(dmam_alloc_attrs); -#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT - -static void dmam_coherent_decl_release(struct device *dev, void *res) -{ - dma_release_declared_memory(dev); -} - -/** - * dmam_declare_coherent_memory - Managed dma_declare_coherent_memory() - * @dev: Device to declare coherent memory for - * @phys_addr: Physical address of coherent memory to be declared - * @device_addr: Device address of coherent memory to be declared - * @size: Size of coherent memory to be declared - * @flags: Flags - * - * Managed dma_declare_coherent_memory(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void *res; - int rc; - - res = devres_alloc(dmam_coherent_decl_release, 0, GFP_KERNEL); - if (!res) - return -ENOMEM; - - rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, - flags); - if (!rc) - devres_add(dev, res); - else - devres_free(res); - - return rc; -} -EXPORT_SYMBOL(dmam_declare_coherent_memory); - -/** - * dmam_release_declared_memory - Managed dma_release_declared_memory(). - * @dev: Device to release declared coherent memory for - * - * Managed dmam_release_declared_memory(). - */ -void dmam_release_declared_memory(struct device *dev) -{ - WARN_ON(devres_destroy(dev, dmam_coherent_decl_release, NULL, NULL)); -} -EXPORT_SYMBOL(dmam_release_declared_memory); - -#endif - /* * Create scatter-list for the already allocated DMA buffer. */ diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 18cc09fc27b9..7a723194ecbe 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -204,8 +204,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, ret = dma_alloc_from_pool(size, &page, flags); if (!ret) return NULL; - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; + goto done; } page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); @@ -215,8 +214,10 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); - if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) - return page; /* opaque cookie */ + if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { + ret = page; /* opaque cookie */ + goto done; + } /* create a coherent mapping */ ret = dma_common_contiguous_remap(page, size, VM_USERMAP, @@ -227,9 +228,9 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, return ret; } - *dma_handle = phys_to_dma(dev, page_to_phys(page)); memset(ret, 0, size); - +done: + *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d6361776dc5c..1fb6fd68b9c7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -378,6 +378,8 @@ void __init swiotlb_exit(void) memblock_free_late(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); } + io_tlb_start = 0; + io_tlb_end = 0; io_tlb_nslabs = 0; max_segment = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 67ecac337374..3cd13a30f732 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10135,7 +10135,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, u32 size; int ret; - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + if (!access_ok(uattr, PERF_ATTR_SIZE_VER0)) return -EFAULT; /* diff --git a/kernel/exit.c b/kernel/exit.c index 0e21e6d21f35..284f2fe9a293 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -866,6 +866,7 @@ void __noreturn do_exit(long code) exit_task_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); + exit_umh(tsk); /* * Flush inherited counters to the parent - before the parent @@ -1604,10 +1605,9 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, if (!infop) return err; - if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + if (!user_access_begin(infop, sizeof(*infop))) return -EFAULT; - user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); @@ -1732,10 +1732,9 @@ COMPAT_SYSCALL_DEFINE5(waitid, if (!infop) return err; - if (!access_ok(VERIFY_WRITE, infop, sizeof(*infop))) + if (!user_access_begin(infop, sizeof(*infop))) return -EFAULT; - user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); unsafe_put_user(info.cause, &infop->si_code, Efault); diff --git a/kernel/fork.c b/kernel/fork.c index d439c48ecf18..b69248e6f0e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -164,10 +164,6 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_stack(unsigned long *stack) -{ -} - #ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* @@ -221,6 +217,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) memset(s->addr, 0, THREAD_SIZE); tsk->stack_vm_area = s; + tsk->stack = s->addr; return s->addr; } @@ -422,7 +419,6 @@ static void release_task_stack(struct task_struct *tsk) return; /* Better to leak the stack than to free prematurely */ account_kernel_stack(tsk, -1); - arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); tsk->stack = NULL; #ifdef CONFIG_VMAP_STACK @@ -1838,8 +1834,6 @@ static __latent_entropy struct task_struct *copy_process( posix_cpu_timers_init(p); - p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); @@ -2006,6 +2000,17 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_free_pid; /* + * From this point on we must avoid any synchronous user-space + * communication until we take the tasklist-lock. In particular, we do + * not want user-space to be able to predict the process start-time by + * stalling fork(2) after we recorded the start_time but before it is + * visible to the system. + */ + + p->start_time = ktime_get_ns(); + p->real_start_time = ktime_get_boot_ns(); + + /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ diff --git a/kernel/futex.c b/kernel/futex.c index 054105854e0e..be3bff2315ff 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -481,13 +481,18 @@ static void drop_futex_key_refs(union futex_key *key) } } +enum futex_access { + FUTEX_READ, + FUTEX_WRITE +}; + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED * @key: address where result is stored. - * @rw: mapping needs to be read/write (values: VERIFY_READ, - * VERIFY_WRITE) + * @rw: mapping needs to be read/write (values: FUTEX_READ, + * FUTEX_WRITE) * * Return: a negative error code or 0 * @@ -500,7 +505,7 @@ static void drop_futex_key_refs(union futex_key *key) * lock_page() might sleep, the caller should not hold a spinlock. */ static int -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -516,7 +521,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) return -EINVAL; address -= key->both.offset; - if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) + if (unlikely(!access_ok(uaddr, sizeof(u32)))) return -EFAULT; if (unlikely(should_fail_futex(fshared))) @@ -546,7 +551,7 @@ again: * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. */ - if (err == -EFAULT && rw == VERIFY_READ) { + if (err == -EFAULT && rw == FUTEX_READ) { err = get_user_pages_fast(address, 1, 0, &page); ro = 1; } @@ -1583,7 +1588,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!bitset) return -EINVAL; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ); if (unlikely(ret != 0)) goto out; @@ -1642,7 +1647,7 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) oparg = 1 << oparg; } - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); @@ -1682,10 +1687,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, DEFINE_WAKE_Q(wake_q); retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) goto out; - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out_put_key1; @@ -1961,11 +1966,11 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, } retry: - ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); + ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ); if (unlikely(ret != 0)) goto out; ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, - requeue_pi ? VERIFY_WRITE : VERIFY_READ); + requeue_pi ? FUTEX_WRITE : FUTEX_READ); if (unlikely(ret != 0)) goto out_put_key1; @@ -2634,7 +2639,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, * while the syscall executes. */ retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; @@ -2793,7 +2798,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, } retry: - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; @@ -2972,7 +2977,7 @@ retry: if ((uval & FUTEX_TID_MASK) != vpid) return -EPERM; - ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE); if (ret) return ret; @@ -3199,7 +3204,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ rt_mutex_init_waiter(&rt_waiter); - ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) goto out; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cb8e3e8ac7b9..4a9191617076 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -34,7 +34,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ -#define HUNG_TASK_BATCHING 1024 +#define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: @@ -112,8 +112,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) - return; + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_show_lock = true; + hung_task_call_panic = true; + } /* * Ok, the task did not get scheduled for more than 2 minutes, @@ -135,11 +138,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) } touch_nmi_watchdog(); - - if (sysctl_hung_task_panic) { - hung_task_show_lock = true; - hung_task_call_panic = true; - } } /* @@ -173,7 +171,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; - int batch_count = HUNG_TASK_BATCHING; + unsigned long last_break = jiffies; struct task_struct *g, *t; /* @@ -188,10 +186,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) for_each_process_thread(g, t) { if (!max_count--) goto unlock; - if (!--batch_count) { - batch_count = HUNG_TASK_BATCHING; + if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; + last_break = jiffies; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b28028b08d44..bad96b476eb6 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -18,8 +18,6 @@ #include <linux/cpu.h> #include <asm/sections.h> -#ifdef HAVE_JUMP_LABEL - /* mutex to protect coming/going of the the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); @@ -80,13 +78,13 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) static void jump_label_update(struct static_key *key); /* - * There are similar definitions for the !HAVE_JUMP_LABEL case in jump_label.h. + * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and its problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not - * used in the branch statements as it is for the !HAVE_JUMP_LABEL case its ok + * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h - * to be included from most/all places for HAVE_JUMP_LABEL. + * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { @@ -791,5 +789,3 @@ static __init int jump_label_test(void) } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ - -#endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/kcov.c b/kernel/kcov.c index 97959d7b77e2..c2277dbdbfb1 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -112,7 +112,7 @@ void notrace __sanitizer_cov_trace_pc(void) EXPORT_SYMBOL(__sanitizer_cov_trace_pc); #ifdef CONFIG_KCOV_ENABLE_COMPARISONS -static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +static void notrace write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) { struct task_struct *t; u64 *area; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 3f8a35104285..db578783dd36 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -987,7 +987,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * wait_lock. This ensures the lock cancellation is ordered * against mutex_unlock() and wake-ups do not go missing. */ - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { ret = -EINTR; goto err; } diff --git a/kernel/module.c b/kernel/module.c index fcbc0128810b..2ad1b5239910 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3102,7 +3102,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->bpf_raw_events), &mod->num_bpf_raw_events); #endif -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), &mod->num_jump_entries); diff --git a/kernel/panic.c b/kernel/panic.c index d10c340c43b0..f121e6ba7e11 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -46,6 +46,13 @@ int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); +#define PANIC_PRINT_TASK_INFO 0x00000001 +#define PANIC_PRINT_MEM_INFO 0x00000002 +#define PANIC_PRINT_TIMER_INFO 0x00000004 +#define PANIC_PRINT_LOCK_INFO 0x00000008 +#define PANIC_PRINT_FTRACE_INFO 0x00000010 +unsigned long panic_print; + ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); @@ -125,6 +132,24 @@ void nmi_panic(struct pt_regs *regs, const char *msg) } EXPORT_SYMBOL(nmi_panic); +static void panic_print_sys_info(void) +{ + if (panic_print & PANIC_PRINT_TASK_INFO) + show_state(); + + if (panic_print & PANIC_PRINT_MEM_INFO) + show_mem(0, NULL); + + if (panic_print & PANIC_PRINT_TIMER_INFO) + sysrq_timer_list_show(); + + if (panic_print & PANIC_PRINT_LOCK_INFO) + debug_show_all_locks(); + + if (panic_print & PANIC_PRINT_FTRACE_INFO) + ftrace_dump(DUMP_ALL); +} + /** * panic - halt the system * @fmt: The text string to print @@ -254,6 +279,8 @@ void panic(const char *fmt, ...) debug_locks_off(); console_flush_on_panic(); + panic_print_sys_info(); + if (!panic_blink) panic_blink = no_blink; @@ -658,6 +685,7 @@ void refcount_error_report(struct pt_regs *regs, const char *err) #endif core_param(panic, panic_timeout, int, 0644); +core_param(panic_print, panic_print, ulong, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); core_param(panic_on_warn, panic_on_warn, int, 0644); core_param(crash_kexec_post_notifiers, crash_kexec_post_notifiers, bool, 0644); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1306fe0c1dc6..d3d170374ceb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1466,7 +1466,7 @@ int do_syslog(int type, char __user *buf, int len, int source) return -EINVAL; if (!len) return 0; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); @@ -1484,7 +1484,7 @@ int do_syslog(int type, char __user *buf, int len, int source) return -EINVAL; if (!len) return 0; - if (!access_ok(VERIFY_WRITE, buf, len)) + if (!access_ok(buf, len)) return -EFAULT; error = syslog_print_all(buf, len, clear); break; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c2cee9db5204..771e93f9c43f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1073,7 +1073,7 @@ int ptrace_request(struct task_struct *child, long request, struct iovec kiov; struct iovec __user *uiov = datavp; - if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(kiov.iov_base, &uiov->iov_base) || @@ -1229,7 +1229,7 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_uptr_t ptr; compat_size_t len; - if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(ptr, &uiov->iov_base) || diff --git a/kernel/rseq.c b/kernel/rseq.c index c6242d8594dc..25e9a7b60eba 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -267,7 +267,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - if (unlikely(!access_ok(VERIFY_WRITE, t->rseq, sizeof(*t->rseq)))) + if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq)))) goto error; ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) @@ -295,7 +295,7 @@ void rseq_syscall(struct pt_regs *regs) if (!t->rseq) return; - if (!access_ok(VERIFY_READ, t->rseq, sizeof(*t->rseq)) || + if (!access_ok(t->rseq, sizeof(*t->rseq)) || rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV, t); } @@ -351,7 +351,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || rseq_len != sizeof(*rseq)) return -EINVAL; - if (!access_ok(VERIFY_WRITE, rseq, rseq_len)) + if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; current->rseq_len = rseq_len; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f66920173370..a674c7db2f29 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -24,7 +24,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) /* * Debugging: various feature bits * @@ -3416,7 +3416,7 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nivcsw; if (!preempt && prev->state) { - if (unlikely(signal_pending_state(prev->state, prev))) { + if (signal_pending_state(prev->state, prev)) { prev->state = TASK_RUNNING; } else { deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); @@ -4450,7 +4450,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a u32 size; int ret; - if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) + if (!access_ok(uattr, SCHED_ATTR_SIZE_VER0)) return -EFAULT; /* Zero the full structure, so that a short copy will be nice: */ @@ -4650,7 +4650,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, { int ret; - if (!access_ok(VERIFY_WRITE, uattr, usize)) + if (!access_ok(uattr, usize)) return -EFAULT; /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02bd5f969b21..de3de997e245 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -73,7 +73,7 @@ static int sched_feat_show(struct seq_file *m, void *v) return 0; } -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL #define jump_label_key__true STATIC_KEY_INIT_TRUE #define jump_label_key__false STATIC_KEY_INIT_FALSE @@ -99,7 +99,7 @@ static void sched_feat_enable(int i) #else static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6483834f1278..50aa2aba69bd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4217,7 +4217,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH -#ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_JUMP_LABEL static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) @@ -4234,7 +4234,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* HAVE_JUMP_LABEL */ +#else /* CONFIG_JUMP_LABEL */ static bool cfs_bandwidth_used(void) { return true; @@ -4242,7 +4242,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* HAVE_JUMP_LABEL */ +#endif /* CONFIG_JUMP_LABEL */ /* * default period for cfs group bandwidth. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0ba08924e017..d04530bf251f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1488,7 +1488,7 @@ enum { #undef SCHED_FEAT -#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) /* * To support run-time toggling of sched features, all the translation units @@ -1508,7 +1508,7 @@ static __always_inline bool static_branch_##name(struct static_key *key) \ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) -#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */ /* * Each translation unit has its own copy of sysctl_sched_features to allow @@ -1524,7 +1524,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) -#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ +#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 66b59ac77c22..e83a3f8449f6 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -93,7 +93,7 @@ long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait long ret = 0; raw_spin_lock_irqsave(&q->lock, flags); - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { /* * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one() * must not see us. diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 5dd47f1103d1..6eb1f8efd221 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -264,7 +264,7 @@ long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_en long ret = 0; spin_lock_irqsave(&wq_head->lock, flags); - if (unlikely(signal_pending_state(state, current))) { + if (signal_pending_state(state, current)) { /* * Exclusive waiter must not fail if it was selected by wakeup, * it should "consume" the condition we were waiting for. diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d7f538847b84..e815781ed751 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -976,6 +976,9 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) struct seccomp_filter *filter = file->private_data; struct seccomp_knotif *knotif; + if (!filter) + return 0; + mutex_lock(&filter->notify_lock); /* @@ -1300,6 +1303,7 @@ out: out_put_fd: if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) { if (ret < 0) { + listener_f->private_data = NULL; fput(listener_f); put_unused_fd(listener); } else { diff --git a/kernel/signal.c b/kernel/signal.c index 53e07d97ffe0..e1d7ad8e6ab1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3997,7 +3997,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, if (act) { old_sigset_t mask; - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + if (!access_ok(act, sizeof(*act)) || __get_user(new_ka.sa.sa_handler, &act->sa_handler) || __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || @@ -4012,7 +4012,7 @@ SYSCALL_DEFINE3(sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + if (!access_ok(oact, sizeof(*oact)) || __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || @@ -4034,7 +4034,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, compat_uptr_t handler, restorer; if (act) { - if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + if (!access_ok(act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(restorer, &act->sa_restorer) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || @@ -4052,7 +4052,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + if (!access_ok(oact, sizeof(*oact)) || __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || __put_user(ptr_to_compat(old_ka.sa.sa_restorer), diff --git a/kernel/sys.c b/kernel/sys.c index 64b5a230f38d..f7eb62eceb24 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1207,7 +1207,8 @@ DECLARE_RWSEM(uts_sem); /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 - * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. + * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be + * 2.6.60. */ static int override_release(char __user *release, size_t len) { @@ -2627,7 +2628,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) s.freehigh >>= bitcount; } - if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || + if (!access_ok(info, sizeof(struct compat_sysinfo)) || __put_user(s.uptime, &info->uptime) || __put_user(s.loads[0], &info->loads[0]) || __put_user(s.loads[1], &info->loads[1]) || diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1825f712e73b..ba4d9e85feb8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -807,6 +807,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "panic_print", + .data = &panic_print, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, #if defined CONFIG_PRINTK { .procname = "printk", @@ -2787,6 +2794,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int bool neg; left -= proc_skip_spaces(&p); + if (!left) + break; err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 07148b497451..73c132095a7b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -140,6 +140,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, + { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" }, {} }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 9ddb6fddb4e0..8b068adb9da1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -170,7 +170,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, return -EPERM; if (unlikely(uaccess_kernel())) return -EPERM; - if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) + if (!access_ok(unsafe_ptr, size)) return -EPERM; return probe_kernel_write(unsafe_ptr, src, size); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5c19b8c41c7e..d5fb09ebba8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -607,11 +607,17 @@ static int trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; - /* argc must be >= 1 */ - if (argv[0][0] == 'r') { + switch (argv[0][0]) { + case 'r': is_return = true; flags |= TPARG_FL_RETURN; - } else if (argv[0][0] != 'p' || argc < 2) + break; + case 'p': + break; + default: + return -ECANCELED; + } + if (argc < 2) return -ECANCELED; event = strchr(&argv[0][1], ':'); diff --git a/kernel/umh.c b/kernel/umh.c index 0baa672e023c..d937cbad903a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -37,6 +37,8 @@ static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; static DEFINE_SPINLOCK(umh_sysctl_lock); static DECLARE_RWSEM(umhelper_sem); +static LIST_HEAD(umh_list); +static DEFINE_MUTEX(umh_list_lock); static void call_usermodehelper_freeinfo(struct subprocess_info *info) { @@ -100,10 +102,12 @@ static int call_usermodehelper_exec_async(void *data) commit_creds(new); sub_info->pid = task_pid_nr(current); - if (sub_info->file) + if (sub_info->file) { retval = do_execve_file(sub_info->file, sub_info->argv, sub_info->envp); - else + if (!retval) + current->flags |= PF_UMH; + } else retval = do_execve(getname_kernel(sub_info->path), (const char __user *const __user *)sub_info->argv, (const char __user *const __user *)sub_info->envp); @@ -517,6 +521,11 @@ int fork_usermode_blob(void *data, size_t len, struct umh_info *info) goto out; err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); + if (!err) { + mutex_lock(&umh_list_lock); + list_add(&info->list, &umh_list); + mutex_unlock(&umh_list_lock); + } out: fput(file); return err; @@ -679,6 +688,26 @@ static int proc_cap_handler(struct ctl_table *table, int write, return 0; } +void __exit_umh(struct task_struct *tsk) +{ + struct umh_info *info; + pid_t pid = tsk->pid; + + mutex_lock(&umh_list_lock); + list_for_each_entry(info, &umh_list, list) { + if (info->pid == pid) { + list_del(&info->list); + mutex_unlock(&umh_list_lock); + goto out; + } + } + mutex_unlock(&umh_list_lock); + return; +out: + if (info->cleanup) + info->cleanup(info); +} + struct ctl_table usermodehelper_table[] = { { .procname = "bset", |