7 files changed, 386 insertions, 24 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 85cfdffde182..146c742f1d49 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3121,6 +3121,38 @@ union bpf_attr {
  * 		0 on success, or a negative error in case of failure:
  *
  *		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
+ *	Description
+ *		Return the cgroup v2 id of the socket *sk*.
+ *
+ *		*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ *		returned from **bpf_sk_lookup_xxx**\ (),
+ *		**bpf_sk_fullsock**\ (), etc. The format of returned id is
+ *		same as in **bpf_skb_cgroup_id**\ ().
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *sk* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *sk*, then return value will be same as that
+ *		of **bpf_sk_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *sk*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_sk_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3250,7 +3282,9 @@ union bpf_attr {
 	FN(sk_assign),			\
 	FN(ktime_get_boot_ns),		\
 	FN(seq_printf),			\
-	FN(seq_write),
+	FN(seq_write),			\
+	FN(sk_cgroup_id),		\
+	FN(sk_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 1fe8c0c2d408..5815902bb617 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
 };
 
 #ifdef CONFIG_SOCK_CGROUP_DATA
+static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
+{
+	struct cgroup *cgrp;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	return cgroup_id(cgrp);
+}
+
 BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
 {
 	struct sock *sk = skb_to_full_sk(skb);
-	struct cgroup *cgrp;
 
 	if (!sk || !sk_fullsock(sk))
 		return 0;
 
-	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	return cgroup_id(cgrp);
+	return __bpf_sk_cgroup_id(sk);
 }
 
 static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
-BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
-	   ancestor_level)
+static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
+					      int ancestor_level)
 {
-	struct sock *sk = skb_to_full_sk(skb);
 	struct cgroup *ancestor;
 	struct cgroup *cgrp;
 
-	if (!sk || !sk_fullsock(sk))
-		return 0;
-
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
 	ancestor = cgroup_ancestor(cgrp, ancestor_level);
 	if (!ancestor)
@@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
 	return cgroup_id(ancestor);
 }
 
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+	   ancestor_level)
+{
+	struct sock *sk = skb_to_full_sk(skb);
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
 static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
 	.func           = bpf_skb_ancestor_cgroup_id,
 	.gpl_only       = false,
@@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 	.arg2_type      = ARG_ANYTHING,
 };
+
+BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
+{
+	return __bpf_sk_cgroup_id(sk);
+}
+
+static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
+	.func           = bpf_sk_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_SOCKET,
+};
+
+BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
+{
+	return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
+static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
+	.func           = bpf_sk_ancestor_cgroup_id,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_SOCKET,
+	.arg2_type      = ARG_ANYTHING,
+};
 #endif
 
 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
@@ -6157,8 +6195,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_SOCK_CGROUP_DATA
 	case BPF_FUNC_skb_cgroup_id:
 		return &bpf_skb_cgroup_id_proto;
+	case BPF_FUNC_skb_ancestor_cgroup_id:
+		return &bpf_skb_ancestor_cgroup_id_proto;
+	case BPF_FUNC_sk_cgroup_id:
+		return &bpf_sk_cgroup_id_proto;
+	case BPF_FUNC_sk_ancestor_cgroup_id:
+		return &bpf_sk_ancestor_cgroup_id_proto;
 #endif
 #ifdef CONFIG_INET
+	case BPF_FUNC_sk_lookup_tcp:
+		return &bpf_sk_lookup_tcp_proto;
+	case BPF_FUNC_sk_lookup_udp:
+		return &bpf_sk_lookup_udp_proto;
+	case BPF_FUNC_sk_release:
+		return &bpf_sk_release_proto;
+	case BPF_FUNC_skc_lookup_tcp:
+		return &bpf_skc_lookup_tcp_proto;
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
 	case BPF_FUNC_get_listener_sock:
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 85cfdffde182..146c742f1d49 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3121,6 +3121,38 @@ union bpf_attr {
  * 		0 on success, or a negative error in case of failure:
  *
  *		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+ *
+ * u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
+ *	Description
+ *		Return the cgroup v2 id of the socket *sk*.
+ *
+ *		*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ *		returned from **bpf_sk_lookup_xxx**\ (),
+ *		**bpf_sk_fullsock**\ (), etc. The format of returned id is
+ *		same as in **bpf_skb_cgroup_id**\ ().
+ *
+ *		This helper is available only if the kernel was compiled with
+ *		the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
+ *
+ * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
+ *	Description
+ *		Return id of cgroup v2 that is ancestor of cgroup associated
+ *		with the *sk* at the *ancestor_level*.  The root cgroup is at
+ *		*ancestor_level* zero and each step down the hierarchy
+ *		increments the level. If *ancestor_level* == level of cgroup
+ *		associated with *sk*, then return value will be same as that
+ *		of **bpf_sk_cgroup_id**\ ().
+ *
+ *		The helper is useful to implement policies based on cgroups
+ *		that are upper in hierarchy than immediate cgroup associated
+ *		with *sk*.
+ *
+ *		The format of returned id and helper limitations are same as in
+ *		**bpf_sk_cgroup_id**\ ().
+ *	Return
+ *		The id is returned or 0 in case the id could not be retrieved.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3250,7 +3282,9 @@ union bpf_attr {
 	FN(sk_assign),			\
 	FN(ktime_get_boot_ns),		\
 	FN(seq_printf),			\
-	FN(seq_write),
+	FN(seq_write),			\
+	FN(sk_cgroup_id),		\
+	FN(sk_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 0ff64b70b746..999a775484c1 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -4,10 +4,14 @@
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
+
+#include <sys/epoll.h>
+
 #include <linux/err.h>
 #include <linux/in.h>
 #include <linux/in6.h>
 
+#include "bpf_util.h"
 #include "network_helpers.h"
 
 #define clean_errno() (errno == 0 ? "None" : strerror(errno))
@@ -77,9 +81,7 @@ static const size_t timeo_optlen = sizeof(timeo_sec);
 
 int connect_to_fd(int family, int type, int server_fd)
 {
-	struct sockaddr_storage addr;
-	socklen_t len = sizeof(addr);
-	int fd;
+	int fd, save_errno;
 
 	fd = socket(family, type, 0);
 	if (fd < 0) {
@@ -87,24 +89,70 @@ int connect_to_fd(int family, int type, int server_fd)
 		return -1;
 	}
 
-	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec, timeo_optlen)) {
+	if (connect_fd_to_fd(fd, server_fd) < 0 && errno != EINPROGRESS) {
+		save_errno = errno;
+		close(fd);
+		errno = save_errno;
+		return -1;
+	}
+
+	return fd;
+}
+
+int connect_fd_to_fd(int client_fd, int server_fd)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int save_errno;
+
+	if (setsockopt(client_fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
+		       timeo_optlen)) {
 		log_err("Failed to set SO_RCVTIMEO");
-		goto out;
+		return -1;
 	}
 
 	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
 		log_err("Failed to get server addr");
-		goto out;
+		return -1;
 	}
 
-	if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
-		log_err("Fail to connect to server with family %d", family);
-		goto out;
+	if (connect(client_fd, (const struct sockaddr *)&addr, len) < 0) {
+		if (errno != EINPROGRESS) {
+			save_errno = errno;
+			log_err("Failed to connect to server");
+			errno = save_errno;
+		}
+		return -1;
 	}
 
-	return fd;
+	return 0;
+}
+
+int connect_wait(int fd)
+{
+	struct epoll_event ev = {}, events[2];
+	int timeout_ms = 1000;
+	int efd, nfd;
+
+	efd = epoll_create1(EPOLL_CLOEXEC);
+	if (efd < 0) {
+		log_err("Failed to open epoll fd");
+		return -1;
+	}
+
+	ev.events = EPOLLRDHUP | EPOLLOUT;
+	ev.data.fd = fd;
+
+	if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) < 0) {
+		log_err("Failed to register fd=%d on epoll fd=%d", fd, efd);
+		close(efd);
+		return -1;
+	}
+
+	nfd = epoll_wait(efd, events, ARRAY_SIZE(events), timeout_ms);
+	if (nfd < 0)
+		log_err("Failed to wait for I/O event on epoll fd=%d", efd);
 
-out:
-	close(fd);
-	return -1;
+	close(efd);
+	return nfd;
 }
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index a0be7db4f67d..86914e6e7b53 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -35,5 +35,7 @@ extern struct ipv6_packet pkt_v6;
 
 int start_server(int family, int type);
 int connect_to_fd(int family, int type, int server_fd);
+int connect_fd_to_fd(int client_fd, int server_fd);
+int connect_wait(int client_fd);
 
 #endif
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
new file mode 100644
index 000000000000..059047af7df3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <test_progs.h>
+
+#include "network_helpers.h"
+#include "cgroup_skb_sk_lookup_kern.skel.h"
+
+static void run_lookup_test(__u16 *g_serv_port, int out_sk)
+{
+	int serv_sk = -1, in_sk = -1, serv_in_sk = -1, err;
+	struct sockaddr_in6 addr = {};
+	socklen_t addr_len = sizeof(addr);
+	__u32 duration = 0;
+
+	serv_sk = start_server(AF_INET6, SOCK_STREAM);
+	if (CHECK(serv_sk < 0, "start_server", "failed to start server\n"))
+		return;
+
+	err = getsockname(serv_sk, (struct sockaddr *)&addr, &addr_len);
+	if (CHECK(err, "getsockname", "errno %d\n", errno))
+		goto cleanup;
+
+	*g_serv_port = addr.sin6_port;
+
+	/* Client outside of test cgroup should fail to connect by timeout. */
+	err = connect_fd_to_fd(out_sk, serv_sk);
+	if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd",
+		  "unexpected result err %d errno %d\n", err, errno))
+		goto cleanup;
+
+	err = connect_wait(out_sk);
+	if (CHECK(err, "connect_wait", "unexpected result %d\n", err))
+		goto cleanup;
+
+	/* Client inside test cgroup should connect just fine. */
+	in_sk = connect_to_fd(AF_INET6, SOCK_STREAM, serv_sk);
+	if (CHECK(in_sk < 0, "connect_to_fd", "errno %d\n", errno))
+		goto cleanup;
+
+	serv_in_sk = accept(serv_sk, NULL, NULL);
+	if (CHECK(serv_in_sk < 0, "accept", "errno %d\n", errno))
+		goto cleanup;
+
+cleanup:
+	close(serv_in_sk);
+	close(in_sk);
+	close(serv_sk);
+}
+
+static void run_cgroup_bpf_test(const char *cg_path, int out_sk)
+{
+	struct cgroup_skb_sk_lookup_kern *skel;
+	struct bpf_link *link;
+	__u32 duration = 0;
+	int cgfd = -1;
+
+	skel = cgroup_skb_sk_lookup_kern__open_and_load();
+	if (CHECK(!skel, "skel_open_load", "open_load failed\n"))
+		return;
+
+	cgfd = test__join_cgroup(cg_path);
+	if (CHECK(cgfd < 0, "cgroup_join", "cgroup setup failed\n"))
+		goto cleanup;
+
+	link = bpf_program__attach_cgroup(skel->progs.ingress_lookup, cgfd);
+	if (CHECK(IS_ERR(link), "cgroup_attach", "err: %ld\n", PTR_ERR(link)))
+		goto cleanup;
+
+	run_lookup_test(&skel->bss->g_serv_port, out_sk);
+
+	bpf_link__destroy(link);
+
+cleanup:
+	close(cgfd);
+	cgroup_skb_sk_lookup_kern__destroy(skel);
+}
+
+void test_cgroup_skb_sk_lookup(void)
+{
+	const char *cg_path = "/foo";
+	int out_sk;
+
+	/* Create a socket before joining testing cgroup so that its cgroup id
+	 * differs from that of testing cgroup. Moving selftests process to
+	 * testing cgroup won't change cgroup id of an already created socket.
+	 */
+	out_sk = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0);
+	if (CHECK_FAIL(out_sk < 0))
+		return;
+
+	run_cgroup_bpf_test(cg_path, out_sk);
+
+	close(out_sk);
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c
new file mode 100644
index 000000000000..3f757e30d7a0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+__u16 g_serv_port = 0;
+
+static inline void set_ip(__u32 *dst, const struct in6_addr *src)
+{
+	dst[0] = src->in6_u.u6_addr32[0];
+	dst[1] = src->in6_u.u6_addr32[1];
+	dst[2] = src->in6_u.u6_addr32[2];
+	dst[3] = src->in6_u.u6_addr32[3];
+}
+
+static inline void set_tuple(struct bpf_sock_tuple *tuple,
+			     const struct ipv6hdr *ip6h,
+			     const struct tcphdr *tcph)
+{
+	set_ip(tuple->ipv6.saddr, &ip6h->daddr);
+	set_ip(tuple->ipv6.daddr, &ip6h->saddr);
+	tuple->ipv6.sport = tcph->dest;
+	tuple->ipv6.dport = tcph->source;
+}
+
+static inline int is_allowed_peer_cg(struct __sk_buff *skb,
+				     const struct ipv6hdr *ip6h,
+				     const struct tcphdr *tcph)
+{
+	__u64 cgid, acgid, peer_cgid, peer_acgid;
+	struct bpf_sock_tuple tuple;
+	size_t tuple_len = sizeof(tuple.ipv6);
+	struct bpf_sock *peer_sk;
+
+	set_tuple(&tuple, ip6h, tcph);
+
+	peer_sk = bpf_sk_lookup_tcp(skb, &tuple, tuple_len,
+				    BPF_F_CURRENT_NETNS, 0);
+	if (!peer_sk)
+		return 0;
+
+	cgid = bpf_skb_cgroup_id(skb);
+	peer_cgid = bpf_sk_cgroup_id(peer_sk);
+
+	acgid = bpf_skb_ancestor_cgroup_id(skb, 2);
+	peer_acgid = bpf_sk_ancestor_cgroup_id(peer_sk, 2);
+
+	bpf_sk_release(peer_sk);
+
+	return cgid && cgid == peer_cgid && acgid && acgid == peer_acgid;
+}
+
+SEC("cgroup_skb/ingress")
+int ingress_lookup(struct __sk_buff *skb)
+{
+	__u32 serv_port_key = 0;
+	struct ipv6hdr ip6h;
+	struct tcphdr tcph;
+
+	if (skb->protocol != bpf_htons(ETH_P_IPV6))
+		return 1;
+
+	/* For SYN packets coming to listening socket skb->remote_port will be
+	 * zero, so IPv6/TCP headers are loaded to identify remote peer
+	 * instead.
+	 */
+	if (bpf_skb_load_bytes(skb, 0, &ip6h, sizeof(ip6h)))
+		return 1;
+
+	if (ip6h.nexthdr != IPPROTO_TCP)
+		return 1;
+
+	if (bpf_skb_load_bytes(skb, sizeof(ip6h), &tcph, sizeof(tcph)))
+		return 1;
+
+	if (!g_serv_port)
+		return 0;
+
+	if (tcph.dest != g_serv_port)
+		return 1;
+
+	return is_allowed_peer_cg(skb, &ip6h, &tcph);
+}