Currently we always pick the first ssk on the list and then have
mptcp_sendmsg_frag wait until more space becomes available in case that
ssk has no write space available.
Instead check the first subflow on the list. If no more write space
is available, then we need to either return -EAGAIN to userspace (nonblock
case), or we need to wait until a subflow becomes available.
This is done by blocking the current thread via sk_stream_wait_memory()
and having the subflow's sk_write_space() callback unblock the parent
mptcp socket.
We can't acquire the mptcp socket lock from the subflow callbacks, but
we can use the mptcp->flags for signalling. The MPTCP_SEND_SPACE flag is
added for this purpose. If it gets set, then at least one subflow has
become available for writing.
v2: dumb-down the selection: just pick the first ssk on the list and make
the mptcp socket block if it has no write space.
Backup is only used if no non-backup subflow exists.
Signed-off-by: Florian Westphal <fw(a)strlen.de>
---
net/mptcp/protocol.c | 69 +++++++++++++++++++++++++++++++++++++++-----
net/mptcp/protocol.h | 1 +
net/mptcp/subflow.c | 7 ++++-
3 files changed, 68 insertions(+), 9 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index d15d64d16136..2b847e079619 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -344,6 +344,38 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
return ret;
}
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *backup = NULL; /* first writeable backup subflow seen so far */
+ /* Choose the subflow socket to send on; returns NULL if none can be used. */
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_socket(subflow)->sk;
+
+ if (!sk_stream_is_writeable(ssk)) {
+ struct socket *sock = ssk->sk_socket;
+
+ if (sock)
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ /* stop at the first subflow without write space, even if a backup was seen */
+ return NULL;
+ }
+
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+ /* keep looking: a non-backup subflow takes precedence */
+ continue;
+ }
+ /* writeable, non-backup subflow: use it */
+ return ssk;
+ }
+ /* no non-backup subflow was found; backup is NULL if no backup was seen either */
+ return backup;
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
int mss_now = 0, size_goal = 0, ret = 0;
@@ -366,23 +398,36 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t
len)
return ret;
}
- ssk = mptcp_subflow_get(msk);
- if (!ssk) {
- release_sock(sk);
- return -ENOTCONN;
- }
-
- if (!msg_data_left(msg)) {
+ if (unlikely(!msg_data_left(msg))) {
+ ssk = mptcp_subflow_get(msk);
pr_debug("empty send");
ret = sock_sendmsg(ssk->sk_socket, msg);
goto out;
}
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+ smp_mb__before_atomic();
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic();
+
+ ssk = mptcp_subflow_get_send(msk);
+ while (!ssk) {
+ ret = sk_stream_wait_memory(sk, &timeo);
+ if (ret)
+ goto out;
+
+ ssk = mptcp_subflow_get_send(msk);
+ if (list_empty(&msk->conn_list)) {
+ ret = -ENOTCONN;
+ goto out;
+ }
+ }
+
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
mptcp_clean_una(sk);
- timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
while (msg_data_left(msg)) {
ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal);
@@ -1077,6 +1122,13 @@ bool mptcp_sk_is_subflow(const struct sock *sk)
return subflow->mp_join == 1;
}
+static bool mptcp_memory_free(const struct sock *sk, int wake)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ /* 'wake' is not consulted; the result is just the MPTCP_SEND_SPACE flag bit */
+ return test_bit(MPTCP_SEND_SPACE, &msk->flags);
+}
+
static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
@@ -1097,6 +1149,7 @@ static struct proto mptcp_prot = {
.sockets_allocated = &mptcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
+ .stream_memory_free = mptcp_memory_free,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock),
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 791f2c19cfb8..4b7399efc8bb 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -76,6 +76,7 @@
/* MPTCP socket flags */
#define MPTCP_DATA_READY BIT(0)
+#define MPTCP_SEND_SPACE BIT(1) /* set from subflow_write_space(), cleared in mptcp_sendmsg() */
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 7bae7c35ea6b..150f0da8b0f0 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -616,8 +616,13 @@ static void subflow_write_space(struct sock *sk)
struct sock *parent = subflow->conn;
sk_stream_write_space(sk);
- if (parent)
+ if (parent) {
+ smp_mb__before_atomic();
+ set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
+ smp_mb__after_atomic();
+
sk_stream_write_space(parent);
+ }
}
int mptcp_subflow_connect(struct sock *sk, struct sockaddr *local,
--
2.23.0