authorJakub Kicinski <kuba@kernel.org>2024-03-29 08:28:50 -0700
committerJakub Kicinski <kuba@kernel.org>2024-03-29 08:28:51 -0700
commitda493dbb1f2a156a1b6d8d8a447f2c3affe43678 (patch)
tree0c59686980a7ef30db7510b22fc5245b052b35ae
parent50e2907ef8bb52cf80ecde9eec5c4dac07177146 (diff)
parent2aa0cff26ed53bc8d4855292b501759435ffdd38 (diff)
Merge branch 'af_unix-rework-gc'
Kuniyuki Iwashima says:

====================
af_unix: Rework GC.

When we pass a file descriptor to an AF_UNIX socket via SCM_RIGHTS, the
underlying struct file of the inflight fd gets its refcount bumped. If the
fd is of an AF_UNIX socket, we need to track it in case it forms cyclic
references.

Let's say we send an fd of AF_UNIX socket A to B and vice versa and
close() both sockets.

When created, each socket's struct file initially has one reference. After
the fd exchange, both refcounts are bumped up to 2. Then, close() decreases
both to 1. From this point on, no one can touch the file/socket. However,
the struct file still holds one refcount, so the release() function of the
AF_UNIX socket is never called. That's why we need to track all inflight
AF_UNIX sockets and run garbage collection.

This series replaces the current GC implementation, which locks each
inflight socket's receive queue and requires trickiness in other places.
The new GC does not lock each socket's queue, to minimise its impact, and
tries to stay lightweight when there is no cyclic reference and no update
to the shape of the inflight fd graph.

The new implementation is based on Tarjan's Strongly Connected Components
algorithm: each inflight AF_UNIX socket is a vertex and each inflight file
descriptor is an edge in a directed graph.

For the details, please see each patch.

  patch  1 -  3 : Add structs to express the inflight socket graph
  patch       4 : Optimise inflight fd counting
  patch  5 -  6 : Group SCCs possibly forming a cycle
  patch  7 -  8 : Support embryo sockets
  patch  9 - 11 : Make GC lightweight
  patch 12 - 13 : Detect dead cyclic references
  patch      14 : Replace the GC algorithm
  patch      15 : selftest

After this series is applied, we can remove the two ugly tricks for races,
scm_fp_dup() in unix_attach_fds() and the spin_lock dance in
unix_peek_fds(), as was done in patches 14/15 of v1. Also, we will add
cond_resched_lock() in __unix_gc() and convert it to use a dedicated
kthread instead of the global system workqueue, as suggested by Paolo in
the v4 thread.

v4: https://lore.kernel.org/netdev/20240301022243.73908-1-kuniyu@amazon.com/
v3: https://lore.kernel.org/netdev/20240223214003.17369-1-kuniyu@amazon.com/
v2: https://lore.kernel.org/netdev/20240216210556.65913-1-kuniyu@amazon.com/
v1: https://lore.kernel.org/netdev/20240203030058.60750-1-kuniyu@amazon.com/
====================

Link: https://lore.kernel.org/r/20240325202425.60930-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
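The scenario above is easy to reproduce from userspace. The following
sketch is an editor's illustration, not part of the series (the scm_rights
selftest added at the end of this merge exercises the same pattern more
thoroughly): each socket of a DGRAM socketpair sends its own fd to its
peer via SCM_RIGHTS, then both fds are close()d, leaving a cycle that only
the GC can reclaim. Error handling is omitted for brevity.

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>

static void send_fd_over(int sender, int fd_to_send)
{
	char byte = 0;
	struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
	union {
		struct cmsghdr align;
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));

	sendmsg(sender, &msg, 0);
}

int main(void)
{
	int sk[2];

	socketpair(AF_UNIX, SOCK_DGRAM, 0, sk);

	send_fd_over(sk[0], sk[0]);	/* sk[1]'s queue now pins sk[0]'s file */
	send_fd_over(sk[1], sk[1]);	/* sk[0]'s queue now pins sk[1]'s file */

	close(sk[0]);	/* file refcounts drop from 2 to 1 ... */
	close(sk[1]);	/* ... and only the GC can free the pair now */

	return 0;
}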
-rw-r--r--  include/net/af_unix.h                            |  31
-rw-r--r--  include/net/scm.h                                |   9
-rw-r--r--  net/core/scm.c                                   |  11
-rw-r--r--  net/unix/af_unix.c                               |  27
-rw-r--r--  net/unix/garbage.c                               | 573
-rw-r--r--  tools/testing/selftests/net/.gitignore           |   1
-rw-r--r--  tools/testing/selftests/net/af_unix/Makefile     |   2
-rw-r--r--  tools/testing/selftests/net/af_unix/scm_rights.c | 286
8 files changed, 735 insertions, 205 deletions
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 627ea8e2d915..226a8da2cbe3 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -19,12 +19,30 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
extern spinlock_t unix_gc_lock;
extern unsigned int unix_tot_inflight;
-
-void unix_inflight(struct user_struct *user, struct file *fp);
-void unix_notinflight(struct user_struct *user, struct file *fp);
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+void unix_del_edges(struct scm_fp_list *fpl);
+void unix_update_edges(struct unix_sock *receiver);
+int unix_prepare_fpl(struct scm_fp_list *fpl);
+void unix_destroy_fpl(struct scm_fp_list *fpl);
void unix_gc(void);
void wait_for_unix_gc(struct scm_fp_list *fpl);
+struct unix_vertex {
+ struct list_head edges;
+ struct list_head entry;
+ struct list_head scc_entry;
+ unsigned long out_degree;
+ unsigned long index;
+ unsigned long scc_index;
+};
+
+struct unix_edge {
+ struct unix_sock *predecessor;
+ struct unix_sock *successor;
+ struct list_head vertex_entry;
+ struct list_head stack_entry;
+};
+
struct sock *unix_peer_get(struct sock *sk);
#define UNIX_HASH_MOD (256 - 1)
@@ -62,12 +80,9 @@ struct unix_sock {
struct path path;
struct mutex iolock, bindlock;
struct sock *peer;
- struct list_head link;
- unsigned long inflight;
+ struct sock *listener;
+ struct unix_vertex *vertex;
spinlock_t lock;
- unsigned long gc_flags;
-#define UNIX_GC_CANDIDATE 0
-#define UNIX_GC_MAYBE_CYCLE 1
struct socket_wq peer_wq;
wait_queue_entry_t peer_wake;
struct scm_stat scm_stat;
diff --git a/include/net/scm.h b/include/net/scm.h
index 92276a2c5543..bbc5527809d1 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -23,10 +23,19 @@ struct scm_creds {
kgid_t gid;
};
+#ifdef CONFIG_UNIX
+struct unix_edge;
+#endif
+
struct scm_fp_list {
short count;
short count_unix;
short max;
+#ifdef CONFIG_UNIX
+ bool inflight;
+ struct list_head vertices;
+ struct unix_edge *edges;
+#endif
struct user_struct *user;
struct file *fp[SCM_MAX_FD];
};
diff --git a/net/core/scm.c b/net/core/scm.c
index 9cd4b0a01cd6..5763f3320358 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -89,6 +89,11 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
+#if IS_ENABLED(CONFIG_UNIX)
+ fpl->inflight = false;
+ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+#endif
}
fpp = &fpl->fp[fpl->count];
@@ -376,8 +381,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
+
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
+#if IS_ENABLED(CONFIG_UNIX)
+ new_fpl->inflight = false;
+ new_fpl->edges = NULL;
+ INIT_LIST_HEAD(&new_fpl->vertices);
+#endif
}
return new_fpl;
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5b41e2321209..27ca50ab1cd1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -979,11 +979,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
- u->inflight = 0;
+ u->listener = NULL;
+ u->vertex = NULL;
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
- INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
@@ -1597,6 +1597,7 @@ restart:
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
+ newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
@@ -1692,8 +1693,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
bool kern)
{
struct sock *sk = sock->sk;
- struct sock *tsk;
struct sk_buff *skb;
+ struct sock *tsk;
int err;
err = -EOPNOTSUPP;
@@ -1718,6 +1719,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
}
tsk = skb->sk;
+ unix_update_edges(unix_sk(tsk));
skb_free_datagram(sk, skb);
wake_up_interruptible(&unix_sk(sk)->peer_wait);
@@ -1789,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p)
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
- int i;
-
if (too_many_unix_fds(current))
return -ETOOMANYREFS;
@@ -1802,21 +1802,18 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
if (!UNIXCB(skb).fp)
return -ENOMEM;
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_inflight(scm->fp->user, scm->fp->fp[i]);
+ if (unix_prepare_fpl(UNIXCB(skb).fp))
+ return -ENOMEM;
return 0;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
- int i;
-
scm->fp = UNIXCB(skb).fp;
UNIXCB(skb).fp = NULL;
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+ unix_destroy_fpl(scm->fp);
}
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
@@ -1937,8 +1934,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- if (unlikely(fp && fp->count))
+ if (unlikely(fp && fp->count)) {
atomic_add(fp->count, &u->scm_stat.nr_fds);
+ unix_add_edges(fp, u);
+ }
}
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1946,8 +1945,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
- if (unlikely(fp && fp->count))
+ if (unlikely(fp && fp->count)) {
atomic_sub(fp->count, &u->scm_stat.nr_fds);
+ unix_del_edges(fp);
+ }
}
/*
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index fa39b6265238..89ea71d9297b 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -101,261 +101,468 @@ struct unix_sock *unix_get_socket(struct file *filp)
return NULL;
}
+static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
+{
+ /* If an embryo socket has a fd,
+ * the listener indirectly holds the fd's refcnt.
+ */
+ if (edge->successor->listener)
+ return unix_sk(edge->successor->listener)->vertex;
+
+ return edge->successor->vertex;
+}
+
+static bool unix_graph_maybe_cyclic;
+static bool unix_graph_grouped;
+
+static void unix_update_graph(struct unix_vertex *vertex)
+{
+ /* If the receiver socket is not inflight, no cyclic
+ * reference could be formed.
+ */
+ if (!vertex)
+ return;
+
+ unix_graph_maybe_cyclic = true;
+ unix_graph_grouped = false;
+}
+
+static LIST_HEAD(unix_unvisited_vertices);
+
+enum unix_vertex_index {
+ UNIX_VERTEX_INDEX_MARK1,
+ UNIX_VERTEX_INDEX_MARK2,
+ UNIX_VERTEX_INDEX_START,
+};
+
+static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
+
+static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+{
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+
+ if (!vertex) {
+ vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
+ vertex->index = unix_vertex_unvisited_index;
+ vertex->out_degree = 0;
+ INIT_LIST_HEAD(&vertex->edges);
+ INIT_LIST_HEAD(&vertex->scc_entry);
+
+ list_move_tail(&vertex->entry, &unix_unvisited_vertices);
+ edge->predecessor->vertex = vertex;
+ }
+
+ vertex->out_degree++;
+ list_add_tail(&edge->vertex_entry, &vertex->edges);
+
+ unix_update_graph(unix_edge_successor(edge));
+}
+
+static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+{
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+
+ unix_update_graph(unix_edge_successor(edge));
+
+ list_del(&edge->vertex_entry);
+ vertex->out_degree--;
+
+ if (!vertex->out_degree) {
+ edge->predecessor->vertex = NULL;
+ list_move_tail(&vertex->entry, &fpl->vertices);
+ }
+}
+
+static void unix_free_vertices(struct scm_fp_list *fpl)
+{
+ struct unix_vertex *vertex, *next_vertex;
+
+ list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
+ list_del(&vertex->entry);
+ kfree(vertex);
+ }
+}
+
DEFINE_SPINLOCK(unix_gc_lock);
unsigned int unix_tot_inflight;
-static LIST_HEAD(gc_candidates);
-static LIST_HEAD(gc_inflight_list);
-/* Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
- */
-void unix_inflight(struct user_struct *user, struct file *filp)
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
- struct unix_sock *u = unix_get_socket(filp);
+ int i = 0, j = 0;
spin_lock(&unix_gc_lock);
- if (u) {
- if (!u->inflight) {
- WARN_ON_ONCE(!list_empty(&u->link));
- list_add_tail(&u->link, &gc_inflight_list);
- } else {
- WARN_ON_ONCE(list_empty(&u->link));
- }
- u->inflight++;
+ if (!fpl->count_unix)
+ goto out;
- /* Paired with READ_ONCE() in wait_for_unix_gc() */
- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
- }
+ do {
+ struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
+ struct unix_edge *edge;
+
+ if (!inflight)
+ continue;
+
+ edge = fpl->edges + i++;
+ edge->predecessor = inflight;
+ edge->successor = receiver;
- WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
+ unix_add_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+out:
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = true;
+
+ unix_free_vertices(fpl);
}
-void unix_notinflight(struct user_struct *user, struct file *filp)
+void unix_del_edges(struct scm_fp_list *fpl)
{
- struct unix_sock *u = unix_get_socket(filp);
+ int i = 0;
spin_lock(&unix_gc_lock);
- if (u) {
- WARN_ON_ONCE(!u->inflight);
- WARN_ON_ONCE(list_empty(&u->link));
+ if (!fpl->count_unix)
+ goto out;
- u->inflight--;
- if (!u->inflight)
- list_del_init(&u->link);
+ do {
+ struct unix_edge *edge = fpl->edges + i++;
- /* Paired with READ_ONCE() in wait_for_unix_gc() */
- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
- }
+ unix_del_edge(fpl, edge);
+ } while (i < fpl->count_unix);
- WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+out:
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = false;
}
-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
- struct sk_buff_head *hitlist)
+void unix_update_edges(struct unix_sock *receiver)
{
- struct sk_buff *skb;
- struct sk_buff *next;
-
- spin_lock(&x->sk_receive_queue.lock);
- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
- /* Do we have file descriptors ? */
- if (UNIXCB(skb).fp) {
- bool hit = false;
- /* Process the descriptors of this socket */
- int nfd = UNIXCB(skb).fp->count;
- struct file **fp = UNIXCB(skb).fp->fp;
-
- while (nfd--) {
- /* Get the socket the fd matches if it indeed does so */
- struct unix_sock *u = unix_get_socket(*fp++);
-
- /* Ignore non-candidates, they could have been added
- * to the queues after starting the garbage collection
- */
- if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
- hit = true;
-
- func(u);
- }
- }
- if (hit && hitlist != NULL) {
- __skb_unlink(skb, &x->sk_receive_queue);
- __skb_queue_tail(hitlist, skb);
- }
- }
- }
- spin_unlock(&x->sk_receive_queue.lock);
+ spin_lock(&unix_gc_lock);
+ unix_update_graph(unix_sk(receiver->listener)->vertex);
+ receiver->listener = NULL;
+ spin_unlock(&unix_gc_lock);
}
-static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
- struct sk_buff_head *hitlist)
+int unix_prepare_fpl(struct scm_fp_list *fpl)
{
- if (x->sk_state != TCP_LISTEN) {
- scan_inflight(x, func, hitlist);
- } else {
- struct sk_buff *skb;
- struct sk_buff *next;
- struct unix_sock *u;
- LIST_HEAD(embryos);
+ struct unix_vertex *vertex;
+ int i;
- /* For a listening socket collect the queued embryos
- * and perform a scan on them as well.
- */
- spin_lock(&x->sk_receive_queue.lock);
- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
- u = unix_sk(skb->sk);
+ if (!fpl->count_unix)
+ return 0;
- /* An embryo cannot be in-flight, so it's safe
- * to use the list link.
- */
- WARN_ON_ONCE(!list_empty(&u->link));
- list_add_tail(&u->link, &embryos);
- }
- spin_unlock(&x->sk_receive_queue.lock);
+ for (i = 0; i < fpl->count_unix; i++) {
+ vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
+ if (!vertex)
+ goto err;
- while (!list_empty(&embryos)) {
- u = list_entry(embryos.next, struct unix_sock, link);
- scan_inflight(&u->sk, func, hitlist);
- list_del_init(&u->link);
- }
+ list_add(&vertex->entry, &fpl->vertices);
}
-}
-static void dec_inflight(struct unix_sock *usk)
-{
- usk->inflight--;
+ fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
+ GFP_KERNEL_ACCOUNT);
+ if (!fpl->edges)
+ goto err;
+
+ return 0;
+
+err:
+ unix_free_vertices(fpl);
+ return -ENOMEM;
}
-static void inc_inflight(struct unix_sock *usk)
+void unix_destroy_fpl(struct scm_fp_list *fpl)
{
- usk->inflight++;
+ if (fpl->inflight)
+ unix_del_edges(fpl);
+
+ kvfree(fpl->edges);
+ unix_free_vertices(fpl);
}
-static void inc_inflight_move_tail(struct unix_sock *u)
+static bool unix_vertex_dead(struct unix_vertex *vertex)
{
- u->inflight++;
+ struct unix_edge *edge;
+ struct unix_sock *u;
+ long total_ref;
- /* If this still might be part of a cycle, move it to the end
- * of the list, so that it's checked even if it was already
- * passed over
- */
- if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
- list_move_tail(&u->link, &gc_candidates);
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+ struct unix_vertex *next_vertex = unix_edge_successor(edge);
+
+ /* The vertex's fd can be received by a non-inflight socket. */
+ if (!next_vertex)
+ return false;
+
+ /* The vertex's fd can be received by an inflight socket in
+ * another SCC.
+ */
+ if (next_vertex->scc_index != vertex->scc_index)
+ return false;
+ }
+
+ /* No receiver exists out of the same SCC. */
+
+ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
+ u = edge->predecessor;
+ total_ref = file_count(u->sk.sk_socket->file);
+
+ /* If not close()d, total_ref > out_degree. */
+ if (total_ref != vertex->out_degree)
+ return false;
+
+ return true;
}
-static bool gc_in_progress;
+enum unix_recv_queue_lock_class {
+ U_RECVQ_LOCK_NORMAL,
+ U_RECVQ_LOCK_EMBRYO,
+};
-static void __unix_gc(struct work_struct *work)
+static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
{
- struct sk_buff_head hitlist;
- struct unix_sock *u, *next;
- LIST_HEAD(not_cycle_list);
- struct list_head cursor;
+ struct unix_vertex *vertex;
- spin_lock(&unix_gc_lock);
+ list_for_each_entry_reverse(vertex, scc, scc_entry) {
+ struct sk_buff_head *queue;
+ struct unix_edge *edge;
+ struct unix_sock *u;
- /* First, select candidates for garbage collection. Only
- * in-flight sockets are considered, and from those only ones
- * which don't have any external reference.
- *
- * Holding unix_gc_lock will protect these candidates from
- * being detached, and hence from gaining an external
- * reference. Since there are no possible receivers, all
- * buffers currently on the candidates' queues stay there
- * during the garbage collection.
- *
- * We also know that no new candidate can be added onto the
- * receive queues. Other, non candidate sockets _can_ be
- * added to queue, so we must make sure only to touch
- * candidates.
- */
- list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
- long total_refs;
+ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
+ u = edge->predecessor;
+ queue = &u->sk.sk_receive_queue;
+
+ spin_lock(&queue->lock);
+
+ if (u->sk.sk_state == TCP_LISTEN) {
+ struct sk_buff *skb;
- total_refs = file_count(u->sk.sk_socket->file);
+ skb_queue_walk(queue, skb) {
+ struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
- WARN_ON_ONCE(!u->inflight);
- WARN_ON_ONCE(total_refs < u->inflight);
- if (total_refs == u->inflight) {
- list_move_tail(&u->link, &gc_candidates);
- __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
- __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+ /* listener -> embryo order, the inversion never happens. */
+ spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
+ skb_queue_splice_init(embryo_queue, hitlist);
+ spin_unlock(&embryo_queue->lock);
+ }
+ } else {
+ skb_queue_splice_init(queue, hitlist);
+
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+ if (u->oob_skb) {
+ kfree_skb(u->oob_skb);
+ u->oob_skb = NULL;
+ }
+#endif
}
+
+ spin_unlock(&queue->lock);
}
+}
- /* Now remove all internal in-flight reference to children of
- * the candidates.
- */
- list_for_each_entry(u, &gc_candidates, link)
- scan_children(&u->sk, dec_inflight, NULL);
+static bool unix_scc_cyclic(struct list_head *scc)
+{
+ struct unix_vertex *vertex;
+ struct unix_edge *edge;
- /* Restore the references for children of all candidates,
- * which have remaining references. Do this recursively, so
- * only those remain, which form cyclic references.
- *
- * Use a "cursor" link, to make the list traversal safe, even
- * though elements might be moved about.
+ /* SCC containing multiple vertices ? */
+ if (!list_is_singular(scc))
+ return true;
+
+ vertex = list_first_entry(scc, typeof(*vertex), scc_entry);
+
+ /* Self-reference or an embryo-listener circle? */
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+ if (unix_edge_successor(edge) == vertex)
+ return true;
+ }
+
+ return false;
+}
+
+static LIST_HEAD(unix_visited_vertices);
+static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
+ struct sk_buff_head *hitlist)
+{
+ LIST_HEAD(vertex_stack);
+ struct unix_edge *edge;
+ LIST_HEAD(edge_stack);
+
+next_vertex:
+ /* Push vertex to vertex_stack and mark it as on-stack
+ * (index >= UNIX_VERTEX_INDEX_START).
+ * The vertex will be popped when finalising SCC later.
*/
- list_add(&cursor, &gc_candidates);
- while (cursor.next != &gc_candidates) {
- u = list_entry(cursor.next, struct unix_sock, link);
+ list_add(&vertex->scc_entry, &vertex_stack);
+
+ vertex->index = *last_index;
+ vertex->scc_index = *last_index;
+ (*last_index)++;
+
+ /* Explore neighbour vertices (receivers of the current vertex's fd). */
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+ struct unix_vertex *next_vertex = unix_edge_successor(edge);
+
+ if (!next_vertex)
+ continue;
+
+ if (next_vertex->index == unix_vertex_unvisited_index) {
+ /* Iterative deepening depth first search
+ *
+ * 1. Push a forward edge to edge_stack and set
+ * the successor to vertex for the next iteration.
+ */
+ list_add(&edge->stack_entry, &edge_stack);
- /* Move cursor to after the current position. */
- list_move(&cursor, &u->link);
+ vertex = next_vertex;
+ goto next_vertex;
- if (u->inflight) {
- list_move_tail(&u->link, &not_cycle_list);
- __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
- scan_children(&u->sk, inc_inflight_move_tail, NULL);
+ /* 2. Pop the edge directed to the current vertex
+ * and restore the ancestor for backtracking.
+ */
+prev_vertex:
+ edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
+ list_del_init(&edge->stack_entry);
+
+ next_vertex = vertex;
+ vertex = edge->predecessor->vertex;
+
+ /* If the successor has a smaller scc_index, two vertices
+ * are in the same SCC, so propagate the smaller scc_index
+ * to skip SCC finalisation.
+ */
+ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+ } else if (next_vertex->index != unix_vertex_grouped_index) {
+ /* Loop detected by a back/cross edge.
+ *
+ * The successor is on vertex_stack, so two vertices are in
+ * the same SCC. If the successor has a smaller *scc_index*,
+ * propagate it to skip SCC finalisation.
+ */
+ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+ } else {
+ /* The successor was already grouped as another SCC */
}
}
- list_del(&cursor);
- /* Now gc_candidates contains only garbage. Restore original
- * inflight counters for these as well, and remove the skbuffs
- * which are creating the cycle(s).
- */
- skb_queue_head_init(&hitlist);
- list_for_each_entry(u, &gc_candidates, link) {
- scan_children(&u->sk, inc_inflight, &hitlist);
+ if (vertex->index == vertex->scc_index) {
+ struct list_head scc;
+ bool scc_dead = true;
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
- if (u->oob_skb) {
- kfree_skb(u->oob_skb);
- u->oob_skb = NULL;
+ /* SCC finalised.
+ *
+ * If the scc_index was not updated, all the vertices above on
+ * vertex_stack are in the same SCC. Group them using scc_entry.
+ */
+ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+
+ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+ /* Don't restart DFS from this vertex in unix_walk_scc(). */
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+ /* Mark vertex as off-stack. */
+ vertex->index = unix_vertex_grouped_index;
+
+ if (scc_dead)
+ scc_dead = unix_vertex_dead(vertex);
}
-#endif
+
+ if (scc_dead)
+ unix_collect_skb(&scc, hitlist);
+ else if (!unix_graph_maybe_cyclic)
+ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+
+ list_del(&scc);
}
- /* not_cycle_list contains those sockets which do not make up a
- * cycle. Restore these to the inflight list.
+ /* Need backtracking ? */
+ if (!list_empty(&edge_stack))
+ goto prev_vertex;
+}
+
+static void unix_walk_scc(struct sk_buff_head *hitlist)
+{
+ unsigned long last_index = UNIX_VERTEX_INDEX_START;
+
+ unix_graph_maybe_cyclic = false;
+
+ /* Visit every vertex exactly once.
+ * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
*/
- while (!list_empty(&not_cycle_list)) {
- u = list_entry(not_cycle_list.next, struct unix_sock, link);
- __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
- list_move_tail(&u->link, &gc_inflight_list);
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ __unix_walk_scc(vertex, &last_index, hitlist);
}
- spin_unlock(&unix_gc_lock);
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
- /* Here we are. Hitlist is filled. Die. */
- __skb_queue_purge(&hitlist);
+ unix_graph_grouped = true;
+}
+
+static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
+{
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+ struct list_head scc;
+ bool scc_dead = true;
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ list_add(&scc, &vertex->scc_entry);
+
+ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+ if (scc_dead)
+ scc_dead = unix_vertex_dead(vertex);
+ }
+
+ if (scc_dead)
+ unix_collect_skb(&scc, hitlist);
+
+ list_del(&scc);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+}
+
+static bool gc_in_progress;
+
+static void __unix_gc(struct work_struct *work)
+{
+ struct sk_buff_head hitlist;
spin_lock(&unix_gc_lock);
- /* All candidates should have been detached by now. */
- WARN_ON_ONCE(!list_empty(&gc_candidates));
+ if (!unix_graph_maybe_cyclic) {
+ spin_unlock(&unix_gc_lock);
+ goto skip_gc;
+ }
- /* Paired with READ_ONCE() in wait_for_unix_gc(). */
- WRITE_ONCE(gc_in_progress, false);
+ __skb_queue_head_init(&hitlist);
+
+ if (unix_graph_grouped)
+ unix_walk_scc_fast(&hitlist);
+ else
+ unix_walk_scc(&hitlist);
spin_unlock(&unix_gc_lock);
+
+ __skb_queue_purge(&hitlist);
+skip_gc:
+ WRITE_ONCE(gc_in_progress, false);
}
static DECLARE_WORK(unix_gc_work, __unix_gc);
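For reference, the __unix_walk_scc() hunk above is an iterative rendering
of Tarjan's strongly connected components algorithm: vertex->index and
vertex->scc_index play the roles of the textbook index and lowlink values,
and the explicit edge_stack replaces recursion. A compact recursive version
of the same algorithm on a hypothetical four-vertex graph (an editor's
sketch, not kernel code; all names here are made up) looks like this:

#include <stdbool.h>
#include <stdio.h>

#define NV 4

static int adj[NV][NV] = {		/* adj[v][w] != 0 means edge v -> w */
	[0] = { [1] = 1 },		/* 0 -> 1 */
	[1] = { [2] = 1 },		/* 1 -> 2 */
	[2] = { [0] = 1, [3] = 1 },	/* 2 -> 0 and 2 -> 3; {0,1,2} is an SCC */
};

static int index_of[NV], lowlink[NV], next_index = 1;	/* 0 == unvisited */
static bool on_stack[NV];
static int stack[NV], top;

static void strongconnect(int v)
{
	int w;

	index_of[v] = lowlink[v] = next_index++;
	stack[top++] = v;
	on_stack[v] = true;

	for (w = 0; w < NV; w++) {
		if (!adj[v][w])
			continue;
		if (!index_of[w]) {		/* tree edge: recurse */
			strongconnect(w);
			if (lowlink[w] < lowlink[v])
				lowlink[v] = lowlink[w];
		} else if (on_stack[w]) {	/* back/cross edge inside SCC */
			if (index_of[w] < lowlink[v])
				lowlink[v] = index_of[w];
		}
	}

	if (lowlink[v] == index_of[v]) {	/* v is the root of an SCC */
		printf("SCC:");
		do {
			w = stack[--top];
			on_stack[w] = false;
			printf(" %d", w);
		} while (w != v);
		printf("\n");
	}
}

int main(void)
{
	int v;

	for (v = 0; v < NV; v++)
		if (!index_of[v])
			strongconnect(v);

	return 0;
}

The kernel variant cannot recurse like this (kernel stacks are small and
the graph depth is user-controlled), which is presumably why the patch
keeps the DFS state on explicit vertex and edge stacks instead.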
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 2f9d378edec3..d996a0ab0765 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -31,6 +31,7 @@ reuseport_dualstack
rxtimestamp
sctp_hello
scm_pidfd
+scm_rights
sk_bind_sendto_listen
sk_connect_zero_addr
socket
diff --git a/tools/testing/selftests/net/af_unix/Makefile b/tools/testing/selftests/net/af_unix/Makefile
index 221c387a7d7f..3b83c797650d 100644
--- a/tools/testing/selftests/net/af_unix/Makefile
+++ b/tools/testing/selftests/net/af_unix/Makefile
@@ -1,4 +1,4 @@
CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd
+TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights
include ../../lib.mk
diff --git a/tools/testing/selftests/net/af_unix/scm_rights.c b/tools/testing/selftests/net/af_unix/scm_rights.c
new file mode 100644
index 000000000000..bab606c9f1eb
--- /dev/null
+++ b/tools/testing/selftests/net/af_unix/scm_rights.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates. */
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "../../kselftest_harness.h"
+
+FIXTURE(scm_rights)
+{
+ int fd[16];
+};
+
+FIXTURE_VARIANT(scm_rights)
+{
+ char name[16];
+ int type;
+ int flags;
+ bool test_listener;
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, dgram)
+{
+ .name = "UNIX ",
+ .type = SOCK_DGRAM,
+ .flags = 0,
+ .test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream)
+{
+ .name = "UNIX-STREAM ",
+ .type = SOCK_STREAM,
+ .flags = 0,
+ .test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_oob)
+{
+ .name = "UNIX-STREAM ",
+ .type = SOCK_STREAM,
+ .flags = MSG_OOB,
+ .test_listener = false,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener)
+{
+ .name = "UNIX-STREAM ",
+ .type = SOCK_STREAM,
+ .flags = 0,
+ .test_listener = true,
+};
+
+FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob)
+{
+ .name = "UNIX-STREAM ",
+ .type = SOCK_STREAM,
+ .flags = MSG_OOB,
+ .test_listener = true,
+};
+
+static int count_sockets(struct __test_metadata *_metadata,
+ const FIXTURE_VARIANT(scm_rights) *variant)
+{
+ int sockets = -1, len, ret;
+ char *line = NULL;
+ size_t unused;
+ FILE *f;
+
+ f = fopen("/proc/net/protocols", "r");
+ ASSERT_NE(NULL, f);
+
+ len = strlen(variant->name);
+
+ while (getline(&line, &unused, f) != -1) {
+ int unused2;
+
+ if (strncmp(line, variant->name, len))
+ continue;
+
+ ret = sscanf(line + len, "%d %d", &unused2, &sockets);
+ ASSERT_EQ(2, ret);
+
+ break;
+ }
+
+ free(line);
+
+ ret = fclose(f);
+ ASSERT_EQ(0, ret);
+
+ return sockets;
+}
+
+FIXTURE_SETUP(scm_rights)
+{
+ int ret;
+
+ ret = unshare(CLONE_NEWNET);
+ ASSERT_EQ(0, ret);
+
+ ret = count_sockets(_metadata, variant);
+ ASSERT_EQ(0, ret);
+}
+
+FIXTURE_TEARDOWN(scm_rights)
+{
+ int ret;
+
+ sleep(1);
+
+ ret = count_sockets(_metadata, variant);
+ ASSERT_EQ(0, ret);
+}
+
+static void create_listeners(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ int n)
+{
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ socklen_t addrlen;
+ int i, ret;
+
+ for (i = 0; i < n * 2; i += 2) {
+ self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_LE(0, self->fd[i]);
+
+ addrlen = sizeof(addr.sun_family);
+ ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen);
+ ASSERT_EQ(0, ret);
+
+ ret = listen(self->fd[i], -1);
+ ASSERT_EQ(0, ret);
+
+ addrlen = sizeof(addr);
+ ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen);
+ ASSERT_EQ(0, ret);
+
+ self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0);
+ ASSERT_LE(0, self->fd[i + 1]);
+
+ ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen);
+ ASSERT_EQ(0, ret);
+ }
+}
+
+static void create_socketpairs(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ const FIXTURE_VARIANT(scm_rights) *variant,
+ int n)
+{
+ int i, ret;
+
+ ASSERT_GE(sizeof(self->fd) / sizeof(int), n);
+
+ for (i = 0; i < n * 2; i += 2) {
+ ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i);
+ ASSERT_EQ(0, ret);
+ }
+}
+
+static void __create_sockets(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ const FIXTURE_VARIANT(scm_rights) *variant,
+ int n)
+{
+ if (variant->test_listener)
+ create_listeners(_metadata, self, n);
+ else
+ create_socketpairs(_metadata, self, variant, n);
+}
+
+static void __close_sockets(struct __test_metadata *_metadata,
+ FIXTURE_DATA(scm_rights) *self,
+ int n)
+{
+ int i, ret;
+
+ ASSERT_GE(sizeof(self->fd) / sizeof(int), n);
+
+ for (i = 0; i < n * 2; i++) {
+ ret = close(self->fd[i]);
+ ASSERT_EQ(0, ret);
+ }
+}
+
+void __send_fd(struct __test_metadata *_metadata,
+ const FIXTURE_DATA(scm_rights) *self,
+ const FIXTURE_VARIANT(scm_rights) *variant,
+ int inflight, int receiver)
+{
+#define MSG "nop"
+#define MSGLEN 3
+ struct {
+ struct cmsghdr cmsghdr;
+ int fd[2];
+ } cmsg = {
+ .cmsghdr = {
+ .cmsg_len = CMSG_LEN(sizeof(cmsg.fd)),
+ .cmsg_level = SOL_SOCKET,
+ .cmsg_type = SCM_RIGHTS,
+ },
+ .fd = {
+ self->fd[inflight * 2],
+ self->fd[inflight * 2],
+ },
+ };
+ struct iovec iov = {
+ .iov_base = MSG,
+ .iov_len = MSGLEN,
+ };
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ .msg_control = &cmsg,
+ .msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)),
+ };
+ int ret;
+
+ ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags);
+ ASSERT_EQ(MSGLEN, ret);
+}
+
+#define create_sockets(n) \
+ __create_sockets(_metadata, self, variant, n)
+#define close_sockets(n) \
+ __close_sockets(_metadata, self, n)
+#define send_fd(inflight, receiver) \
+ __send_fd(_metadata, self, variant, inflight, receiver)
+
+TEST_F(scm_rights, self_ref)
+{
+ create_sockets(2);
+
+ send_fd(0, 0);
+
+ send_fd(1, 1);
+
+ close_sockets(2);
+}
+
+TEST_F(scm_rights, triangle)
+{
+ create_sockets(6);
+
+ send_fd(0, 1);
+ send_fd(1, 2);
+ send_fd(2, 0);
+
+ send_fd(3, 4);
+ send_fd(4, 5);
+ send_fd(5, 3);
+
+ close_sockets(6);
+}
+
+TEST_F(scm_rights, cross_edge)
+{
+ create_sockets(8);
+
+ send_fd(0, 1);
+ send_fd(1, 2);
+ send_fd(2, 0);
+ send_fd(1, 3);
+ send_fd(3, 2);
+
+ send_fd(4, 5);
+ send_fd(5, 6);
+ send_fd(6, 4);
+ send_fd(5, 7);
+ send_fd(7, 6);
+
+ close_sockets(8);
+}
+
+TEST_HARNESS_MAIN
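Assuming a checked-out kernel tree with the kselftest prerequisites in
place, the new test should be buildable and runnable with the standard
harness invocation (exact setup may vary):

	$ make -C tools/testing/selftests TARGETS=net/af_unix run_tests

Note that count_sockets() in the fixture setup and teardown reads
/proc/net/protocols before and after each test, so a leaked (uncollected)
socket fails the test even though the cycle itself never crashes anything.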