author     Linus Torvalds <torvalds@linux-foundation.org>   2015-06-24 16:49:49 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-06-24 16:49:49 -0700
commit     e0456717e483bb8a9431b80a5bdc99a928b9b003 (patch)
tree       5eb5add2bafd1f20326d70f5cb3b711d00a40b10 /arch
parent     98ec21a01896751b673b6c731ca8881daa8b2c6d (diff)
parent     1ea2d020ba477cb7011a7174e8501a9e04a325d4 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Add TX fast path in mac80211, from Johannes Berg.
2) Add TSO/GRO support to ibmveth, from Thomas Falcon.
3) Move away from cached routes in ipv6, just like ipv4, from Martin
KaFai Lau.
4) Lots of new rhashtable tests, from Thomas Graf.
5) Run ingress qdisc lockless, from Alexei Starovoitov.
6) Allow servers to fetch TCP packet headers for SYN packets of new
   connections, for fingerprinting. From Eric Dumazet. (A usage sketch
   follows this list.)
7) Add mode parameter to pktgen, for testing receive. From Alexei
Starovoitov.
8) Cache access optimizations via simplifications of build_skb(), from
Alexander Duyck.
9) Move page frag allocator under mm/, also from Alexander.
10) Add xmit_more support to hv_netvsc, from KY Srinivasan.
11) Add a counter guard in case we try to perform endless reclassify
loops in the packet scheduler.
12) Extend the flow dissector to be programmable and use it in the new
    "Flower" classifier. From Jiri Pirko.
13) AF_PACKET fanout rollover fixes, performance improvements, and new
statistics. From Willem de Bruijn.
14) Add netdev driver for GENEVE tunnels, from John W Linville.
15) Add ingress netfilter hooks and filtering, from Pablo Neira Ayuso.
16) Fix handling of epoll edge triggers in TCP, from Eric Dumazet.
17) Add an ECN retry fallback for the initial TCP handshake, from Daniel
Borkmann.
18) Add tail call support to BPF, from Alexei Starovoitov. (A
    program-side sketch follows this list; the JIT changes are in the
    diff below.)
19) Add several pktgen helper scripts, from Jesper Dangaard Brouer.
20) Add zerocopy support to AF_UNIX, from Hannes Frederic Sowa.
21) Favor even port numbers for allocation to connect() requests, and
odd port numbers for bind(0), in an effort to help avoid
ip_local_port_range exhaustion. From Eric Dumazet.
22) Add Cavium ThunderX driver, from Sunil Goutham.
23) Allow bpf programs to access skb_iif and dev->ifindex SKB metadata,
from Alexei Starovoitov.
24) Add support for T6 chips in cxgb4vf driver, from Hariprasad Shenai.
25) Double TCP Small Queues default to 256K to accommodate situations
like the XEN driver and wireless aggregation. From Wei Liu.
26) Add more entropy inputs to flow dissector, from Tom Herbert.
27) Add CDG congestion control algorithm to TCP, from Kenneth Klette
Jonassen.
28) Convert ipset over to RCU locking, from Jozsef Kadlecsik.
29) Track and act upon link status of ipv4 route nexthops, from Andy
Gospodarek.
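
Two of the items above lend themselves to short usage sketches. For
item 6, the interface is the TCP_SAVE_SYN / TCP_SAVED_SYN socket option
pair: a minimal sketch, assuming a kernel carrying this series, with
error handling omitted:

    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdio.h>
    #include <sys/socket.h>

    static void dump_syn_headers(int listen_fd)
    {
            unsigned char syn[512];         /* saved network + TCP headers */
            socklen_t len = sizeof(syn);
            int one = 1, conn_fd;

            /* ask the kernel to retain the SYN headers of new connections;
             * must be set on the listener before connections arrive
             */
            setsockopt(listen_fd, IPPROTO_TCP, TCP_SAVE_SYN, &one, sizeof(one));
            listen(listen_fd, 128);

            conn_fd = accept(listen_fd, NULL, NULL);
            /* the saved headers can be read once, then the kernel drops them */
            if (getsockopt(conn_fd, IPPROTO_TCP, TCP_SAVED_SYN, syn, &len) == 0)
                    printf("saved %u header bytes for fingerprinting\n",
                           (unsigned int) len);
    }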
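
For item 18, tail calls are driven from the program side through the
BPF_FUNC_tail_call helper and a BPF_MAP_TYPE_PROG_ARRAY map. A minimal
sketch in the samples/bpf style; the SEC() macro, the helper
declaration, and struct bpf_map_def are loader conventions assumed here,
not kernel API:

    #include <linux/bpf.h>

    #define SEC(NAME) __attribute__((section(NAME), used))

    /* helper declaration, samples/bpf style */
    static void (*bpf_tail_call)(void *ctx, void *map, int index) =
            (void *) BPF_FUNC_tail_call;

    /* minimal map definition; the loader fills the slots with prog fds */
    struct bpf_map_def {
            unsigned int type;
            unsigned int key_size;
            unsigned int value_size;
            unsigned int max_entries;
    };

    struct bpf_map_def SEC("maps") jmp_table = {
            .type        = BPF_MAP_TYPE_PROG_ARRAY,
            .key_size    = sizeof(unsigned int),
            .value_size  = sizeof(unsigned int),
            .max_entries = 8,
    };

    SEC("socket")
    int main_prog(struct __sk_buff *skb)
    {
            /* On success this never returns: the callee runs on the
             * caller's stack frame (the JITed code in the diff below
             * jumps past the callee's prologue).  An empty slot, an
             * out-of-range index, or more than MAX_TAIL_CALL_CNT chained
             * calls falls through instead.
             */
            bpf_tail_call(skb, &jmp_table, 2);
            return 0;       /* fall-through path */
    }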
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1670 commits)
bridge: vlan: flush the dynamically learned entries on port vlan delete
bridge: multicast: add a comment to br_port_state_selection about blocking state
net: inet_diag: export IPV6_V6ONLY sockopt
stmmac: troubleshoot unexpected bits in des0 & des1
net: ipv4 sysctl option to ignore routes when nexthop link is down
net: track link-status of ipv4 nexthops
net: switchdev: ignore unsupported bridge flags
net: Cavium: Fix MAC address setting in shutdown state
drivers: net: xgene: fix for ACPI support without ACPI
ip: report the original address of ICMP messages
net/mlx5e: Prefetch skb data on RX
net/mlx5e: Pop cq outside mlx5e_get_cqe
net/mlx5e: Remove mlx5e_cq.sqrq back-pointer
net/mlx5e: Remove extra spaces
net/mlx5e: Avoid TX CQE generation if more xmit packets expected
net/mlx5e: Avoid redundant dev_kfree_skb() upon NOP completion
net/mlx5e: Remove re-assignment of wq type in mlx5e_enable_rq()
net/mlx5e: Use skb_shinfo(skb)->gso_segs rather than counting them
net/mlx5e: Static mapping of netdev priv resources to/from netdev TX queues
net/mlx4_en: Use HW counters for rx/tx bytes/packets in PF device
...
Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/net/bpf_jit_32.c    |  10
-rw-r--r--  arch/s390/net/bpf_jit.h      |  10
-rw-r--r--  arch/s390/net/bpf_jit_comp.c | 106
-rw-r--r--  arch/x86/net/bpf_jit_comp.c  | 150
4 files changed, 248 insertions, 28 deletions
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index e0e23582c8b4..4550d247e308 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -873,6 +873,16 @@ b_epilogue:
 		off = offsetof(struct sk_buff, queue_mapping);
 		emit(ARM_LDRH_I(r_A, r_skb, off), ctx);
 		break;
+	case BPF_LDX | BPF_W | BPF_ABS:
+		/*
+		 * load a 32bit word from struct seccomp_data.
+		 * seccomp_check_filter() will already have checked
+		 * that k is 32bit aligned and lies within the
+		 * struct seccomp_data.
+		 */
+		ctx->seen |= SEEN_SKB;
+		emit(ARM_LDR_I(r_A, r_skb, k), ctx);
+		break;
 	default:
 		return -1;
 	}
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h
index de156ba3bd71..f6498eec9ee1 100644
--- a/arch/s390/net/bpf_jit.h
+++ b/arch/s390/net/bpf_jit.h
@@ -28,6 +28,9 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
  *	  | old backchain |     |
  *	  +---------------+     |
  *	  | r15 - r6      |     |
+ *	  +---------------+     |
+ *	  | 4 byte align  |     |
+ *	  | tail_call_cnt |     |
  * BFP -> +===============+     |
  *	  |               |     |
  *	  | BPF stack     |     |
@@ -46,14 +49,17 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
  * R15 -> +---------------+     + low
  *
  * We get 160 bytes stack space from calling function, but only use
- * 11 * 8 byte (old backchain + r15 - r6) for storing registers.
+ * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt.
  */
 #define STK_SPACE	(MAX_BPF_STACK + 8 + 4 + 4 + 160)
-#define STK_160_UNUSED	(160 - 11 * 8)
+#define STK_160_UNUSED	(160 - 12 * 8)
 #define STK_OFF		(STK_SPACE - STK_160_UNUSED)
 #define STK_OFF_TMP	160	/* Offset of tmp buffer on stack */
 #define STK_OFF_HLEN	168	/* Offset of SKB header length on stack */
 
+#define STK_OFF_R6	(160 - 11 * 8)	/* Offset of r6 on stack */
+#define STK_OFF_TCCNT	(160 - 12 * 8)	/* Offset of tail_call_cnt on stack */
+
 /* Offset to skip condition code check */
 #define OFF_OK		4
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 55423d8be580..d3766dd67e23 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -21,6 +21,7 @@
 #include <linux/netdevice.h>
 #include <linux/filter.h>
 #include <linux/init.h>
+#include <linux/bpf.h>
 #include <asm/cacheflush.h>
 #include <asm/dis.h>
 #include "bpf_jit.h"
@@ -40,6 +41,8 @@ struct bpf_jit {
 	int base_ip;		/* Base address for literal pool */
 	int ret0_ip;		/* Address of return 0 */
 	int exit_ip;		/* Address of exit */
+	int tail_call_start;	/* Tail call start offset */
+	int labels[1];		/* Labels for local jumps */
 };
 
 #define BPF_SIZE_MAX	4096	/* Max size for program */
@@ -49,6 +52,7 @@ struct bpf_jit {
 #define SEEN_RET0	4	/* ret0_ip points to a valid return 0 */
 #define SEEN_LITERAL	8	/* code uses literals */
 #define SEEN_FUNC	16	/* calls C functions */
+#define SEEN_TAIL_CALL	32	/* code uses tail calls */
 #define SEEN_STACK	(SEEN_FUNC | SEEN_MEM | SEEN_SKB)
 
 /*
@@ -60,6 +64,7 @@ struct bpf_jit {
 #define REG_L		(__MAX_BPF_REG+3)	/* Literal pool register */
 #define REG_15		(__MAX_BPF_REG+4)	/* Register 15 */
 #define REG_0		REG_W0			/* Register 0 */
+#define REG_1		REG_W1			/* Register 1 */
 #define REG_2		BPF_REG_1		/* Register 2 */
 #define REG_14		BPF_REG_0		/* Register 14 */
 
@@ -223,6 +228,24 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
 	REG_SET_SEEN(b3);				\
 })
 
+#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask)	\
+({								\
+	int rel = (jit->labels[label] - jit->prg) >> 1;		\
+	_EMIT6(op1 | reg(b1, b2) << 16 | (rel & 0xffff),	\
+	       op2 | mask << 12);				\
+	REG_SET_SEEN(b1);					\
+	REG_SET_SEEN(b2);					\
+})
+
+#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask)	\
+({								\
+	int rel = (jit->labels[label] - jit->prg) >> 1;		\
+	_EMIT6(op1 | (reg_high(b1) | mask) << 16 |		\
+	       (rel & 0xffff), op2 | (imm & 0xff) << 8);	\
+	REG_SET_SEEN(b1);					\
+	BUILD_BUG_ON(((unsigned long) imm) > 0xff);		\
+})
+
 #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask)	\
 ({							\
 	/* Branch instruction needs 6 bytes */		\
@@ -286,7 +309,7 @@ static void jit_fill_hole(void *area, unsigned int size)
  */
 static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
 {
-	u32 off = 72 + (rs - 6) * 8;
+	u32 off = STK_OFF_R6 + (rs - 6) * 8;
 
 	if (rs == re)
 		/* stg %rs,off(%r15) */
@@ -301,7 +324,7 @@ static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
  */
 static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re)
 {
-	u32 off = 72 + (rs - 6) * 8;
+	u32 off = STK_OFF_R6 + (rs - 6) * 8;
 
 	if (jit->seen & SEEN_STACK)
 		off += STK_OFF;
@@ -374,6 +397,16 @@ static void save_restore_regs(struct bpf_jit *jit, int op)
  */
 static void bpf_jit_prologue(struct bpf_jit *jit)
 {
+	if (jit->seen & SEEN_TAIL_CALL) {
+		/* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
+		_EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
+	} else {
+		/* j tail_call_start: NOP if no tail calls are used */
+		EMIT4_PCREL(0xa7f40000, 6);
+		_EMIT2(0);
+	}
+	/* Tail calls have to skip above initialization */
+	jit->tail_call_start = jit->prg;
 	/* Save registers */
 	save_restore_regs(jit, REGS_SAVE);
 	/* Setup literal pool */
@@ -951,6 +984,75 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
 		break;
 	}
+	case BPF_JMP | BPF_CALL | BPF_X:
+		/*
+		 * Implicit input:
+		 *  B1: pointer to ctx
+		 *  B2: pointer to bpf_array
+		 *  B3: index in bpf_array
+		 */
+		jit->seen |= SEEN_TAIL_CALL;
+
+		/*
+		 * if (index >= array->map.max_entries)
+		 *         goto out;
+		 */
+
+		/* llgf %w1,map.max_entries(%b2) */
+		EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2,
+			      offsetof(struct bpf_array, map.max_entries));
+		/* clgrj %b3,%w1,0xa,label0: if %b3 >= %w1 goto out */
+		EMIT6_PCREL_LABEL(0xec000000, 0x0065, BPF_REG_3,
+				  REG_W1, 0, 0xa);
+
+		/*
+		 * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT)
+		 *         goto out;
+		 */
+
+		if (jit->seen & SEEN_STACK)
+			off = STK_OFF_TCCNT + STK_OFF;
+		else
+			off = STK_OFF_TCCNT;
+		/* lhi %w0,1 */
+		EMIT4_IMM(0xa7080000, REG_W0, 1);
+		/* laal %w1,%w0,off(%r15) */
+		EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off);
+		/* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */
+		EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1,
+				      MAX_TAIL_CALL_CNT, 0, 0x2);
+
+		/*
+		 * prog = array->prog[index];
+		 * if (prog == NULL)
+		 *         goto out;
+		 */
+
+		/* sllg %r1,%b3,3: %r1 = index * 8 */
+		EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, BPF_REG_3, REG_0, 3);
+		/* lg %r1,prog(%b2,%r1) */
+		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, BPF_REG_2,
+			      REG_1, offsetof(struct bpf_array, prog));
+		/* clgij %r1,0,0x8,label0 */
+		EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007d, REG_1, 0, 0, 0x8);
+
+		/*
+		 * Restore registers before calling function
+		 */
+		save_restore_regs(jit, REGS_RESTORE);
+
+		/*
+		 * goto *(prog->bpf_func + tail_call_start);
+		 */
+
+		/* lg %r1,bpf_func(%r1) */
+		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0,
+			      offsetof(struct bpf_prog, bpf_func));
+		/* bc 0xf,tail_call_start(%r1) */
+		_EMIT4(0x47f01000 + jit->tail_call_start);
+		/* out: */
+		jit->labels[0] = jit->prg;
+		break;
 	case BPF_JMP | BPF_EXIT: /* return b0 */
 		last = (i == fp->len - 1) ? 1 : 0;
 		if (last && !(jit->seen & SEEN_RET0))
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ddeff4844a10..579a8fd74be0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -12,6 +12,7 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <asm/cacheflush.h>
+#include <linux/bpf.h>
 
 int bpf_jit_enable __read_mostly;
 
@@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 	return ptr + len;
 }
 
-#define EMIT(bytes, len)	do { prog = emit_code(prog, bytes, len); } while (0)
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
 
 #define EMIT1(b1)		EMIT(b1, 1)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
@@ -186,31 +188,31 @@ struct jit_context {
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
-		  int oldproglen, struct jit_context *ctx)
+#define STACKSIZE \
+	(MAX_BPF_STACK + \
+	 32 /* space for rbx, r13, r14, r15 */ + \
+	 8 /* space for skb_copy_bits() buffer */)
+
+#define PROLOGUE_SIZE 51
+
+/* emit x64 prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program
+ */
+static void emit_prologue(u8 **pprog)
 {
-	struct bpf_insn *insn = bpf_prog->insnsi;
-	int insn_cnt = bpf_prog->len;
-	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
-	bool seen_exit = false;
-	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
-	int i;
-	int proglen = 0;
-	u8 *prog = temp;
-	int stacksize = MAX_BPF_STACK +
-		32 /* space for rbx, r13, r14, r15 */ +
-		8 /* space for skb_copy_bits() buffer */;
+	u8 *prog = *pprog;
+	int cnt = 0;
 
 	EMIT1(0x55); /* push rbp */
 	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
 
-	/* sub rsp, stacksize */
-	EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+	/* sub rsp, STACKSIZE */
+	EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);
 
 	/* all classic BPF filters use R6(rbx) save it */
 
 	/* mov qword ptr [rbp-X],rbx */
-	EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+	EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);
 
 	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
 	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	 */
 
 	/* mov qword ptr [rbp-X],r13 */
-	EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+	EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
 	/* mov qword ptr [rbp-X],r14 */
-	EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+	EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
 	/* mov qword ptr [rbp-X],r15 */
-	EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+	EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
 
 	/* clear A and X registers */
 	EMIT2(0x31, 0xc0); /* xor eax, eax */
 	EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
 
+	/* clear tail_cnt: mov qword ptr [rbp-X], rax */
+	EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
+
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   prog = array->prog[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int label1, label2, label3;
+	int cnt = 0;
+
+	/* rdi - pointer to ctx
+	 * rsi - pointer to bpf_array
+	 * rdx - index in bpf_array
+	 */
+
+	/* if (index >= array->map.max_entries)
+	 *   goto out;
+	 */
+	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	      offsetof(struct bpf_array, map.max_entries));
+	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
+#define OFFSET1 44 /* number of bytes to jump */
+	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
+	label1 = cnt;
+
+	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *   goto out;
+	 */
+	EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
+	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
+#define OFFSET2 33
+	EMIT2(X86_JA, OFFSET2);                   /* ja out */
+	label2 = cnt;
+	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
+	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
+
+	/* prog = array->prog[index]; */
+	EMIT4(0x48, 0x8D, 0x44, 0xD6);            /* lea rax, [rsi + rdx * 8 + 0x50] */
+	EMIT1(offsetof(struct bpf_array, prog));
+	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */
+
+	/* if (prog == NULL)
+	 *   goto out;
+	 */
+	EMIT4(0x48, 0x83, 0xF8, 0x00);            /* cmp rax, 0 */
+#define OFFSET3 10
+	EMIT2(X86_JE, OFFSET3);                   /* je out */
+	label3 = cnt;
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	EMIT4(0x48, 0x8B, 0x40,                   /* mov rax, qword ptr [rax + 32] */
+	      offsetof(struct bpf_prog, bpf_func));
+	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
+
+	/* now we're ready to jump into next BPF program
+	 * rdi == ctx (1st arg)
+	 * rax == prog->bpf_func + prologue_size
+	 */
+	EMIT2(0xFF, 0xE0);                        /* jmp rax */
+
+	/* out: */
+	BUILD_BUG_ON(cnt - label1 != OFFSET1);
+	BUILD_BUG_ON(cnt - label2 != OFFSET2);
+	BUILD_BUG_ON(cnt - label3 != OFFSET3);
+	*pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog);
+
 	if (seen_ld_abs) {
 		/* r9d : skb->len - skb->data_len (headlen)
 		 * r10 : skb->data
@@ -739,6 +837,10 @@ xadd:			if (is_imm8(insn->off))
 			}
 			break;
 
+		case BPF_JMP | BPF_CALL | BPF_X:
+			emit_bpf_tail_call(&prog);
+			break;
+
 			/* cond jump */
 		case BPF_JMP | BPF_JEQ | BPF_X:
 		case BPF_JMP | BPF_JNE | BPF_X:
@@ -891,13 +993,13 @@ common_load:
 			/* update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp-X] */
-			EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+			EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
 			/* mov r13, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+			EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
 			/* mov r14, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+			EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
 			/* mov r15, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+			EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);
 			EMIT1(0xC9); /* leave */
 			EMIT1(0xC3); /* ret */
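
The JITed sequences above index a prog array that user space populates.
A minimal sketch of that half via the raw bpf(2) syscall, assuming only
BPF_MAP_TYPE_PROG_ARRAY and the 4-byte key/value sizes it requires;
prog_array_create() and prog_array_set() are illustrative helper names:

    #include <linux/bpf.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int prog_array_create(unsigned int max_entries)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.map_type    = BPF_MAP_TYPE_PROG_ARRAY;
            attr.key_size    = sizeof(unsigned int);   /* slot index */
            attr.value_size  = sizeof(unsigned int);   /* bpf prog fd */
            attr.max_entries = max_entries;

            return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
    }

    /* store prog_fd in a slot; bpf_tail_call() then jumps to it by index */
    static int prog_array_set(int map_fd, unsigned int slot, int prog_fd)
    {
            union bpf_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.map_fd = map_fd;
            attr.key    = (unsigned long) &slot;
            attr.value  = (unsigned long) &prog_fd;

            return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
    }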