[PATCH net-next] mptcp: let MPTCP create max size skbs
by Paolo Abeni
Currently the xmit path of the MPTCP protocol creates smaller-
than-max-size skbs, which is suboptimal for the performances.
There are a few things to improve:
- when coalescing to an existing skb, must clear the PUSH flag
- tcp_build_frag() expect the available space as an argument.
When coalescing is enable MPTCP already subtracted the
to-be-coalesced skb len. We must increment said argument
accordingly.
- when a skb is split by the TCP stack we froze the MPTCP map,
to avoid changing an already transmitted DSS due to skb
collapsing on xmit. We actually need to froze only the first
half of the map, and can keep collapsing on the 2nd half
Before:
./use_mptcp.sh netperf -H 127.0.0.1 -t TCP_STREAM
[...]
131072 16384 16384 30.00 24414.86
After:
./use_mptcp.sh netperf -H 127.0.0.1 -t TCP_STREAM
[...]
131072 16384 16384 30.05 28357.69
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
---
use_mptcp.sh forces exiting app to create MPTCP instead of TCP
ones via LD_PRELOAD of crafter socket() implementation.
https://github.com/pabeni/mptcp-tools/tree/master/use_mptcp
---
include/net/mptcp.h | 25 +++++++++++++++++++++++++
net/ipv4/tcp_output.c | 4 ++--
net/mptcp/protocol.c | 14 +++++++++-----
3 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 5694370be3d4..ead867e008d2 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -122,6 +122,31 @@ static inline void mptcp_skb_ext_copy(struct sk_buff *to,
skb_ext_copy(to, from);
}
+static inline void mptcp_skb_split(struct sk_buff *from,
+ struct sk_buff *to)
+{
+ struct mptcp_ext *from_ext, *to_ext;
+
+ from_ext = skb_ext_find(from, SKB_EXT_MPTCP);
+ if (!from_ext)
+ return;
+
+ /* if we can't allocate new ext for the 2nd half no action is needed:
+ * 2nd hald will be covered by the existing mapping and coalescing
+ * will be prevented
+ */
+ to_ext = skb_ext_add(to, SKB_EXT_MPTCP);
+ if (!to_ext)
+ return;
+
+ memcpy(to_ext, from_ext, sizeof(struct mptcp_ext));
+ from_ext->frozen = 1;
+ from_ext->data_len -= to->len;
+ to_ext->data_len = to->len;
+ to_ext->data_seq += from->len;
+ to_ext->subflow_seq += from->len;
+}
+
static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext,
const struct mptcp_ext *from_ext)
{
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 41880d3521ed..7e6441921848 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1570,7 +1570,6 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
if (!buff)
return -ENOMEM; /* We'll just try again later. */
skb_copy_decrypted(buff, skb);
- mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
@@ -1591,6 +1590,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
tcp_skb_fragment_eor(skb, buff);
skb_split(skb, buff, len);
+ mptcp_skb_split(skb, buff);
buff->ip_summed = CHECKSUM_PARTIAL;
@@ -2125,7 +2125,6 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
if (unlikely(!buff))
return -ENOMEM;
skb_copy_decrypted(buff, skb);
- mptcp_skb_ext_copy(buff, skb);
sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
@@ -2149,6 +2148,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
buff->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
+ mptcp_skb_split(skb, buff);
tcp_fragment_tstamp(skb, buff);
/* Fix up tso_factor for both original and new SKB. */
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 69441ea71411..2a2986141422 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1223,6 +1223,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
struct mptcp_ext *mpext = NULL;
struct sk_buff *skb, *tail;
bool can_collapse = false;
+ int size_bias = 0;
int avail_size;
size_t ret = 0;
@@ -1244,10 +1245,12 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
can_collapse = (info->size_goal - skb->len > 0) &&
mptcp_skb_can_collapse_to(data_seq, skb, mpext);
- if (!can_collapse)
+ if (!can_collapse) {
TCP_SKB_CB(skb)->eor = 1;
- else
+ } else {
+ size_bias = skb->len;
avail_size = info->size_goal - skb->len;
+ }
}
/* Zero window and all data acked? Probe. */
@@ -1267,8 +1270,8 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
return 0;
ret = info->limit - info->sent;
- tail = tcp_build_frag(ssk, avail_size, info->flags, dfrag->page,
- dfrag->offset + info->sent, &ret);
+ tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
+ dfrag->page, dfrag->offset + info->sent, &ret);
if (!tail) {
tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
return -ENOMEM;
@@ -1277,8 +1280,9 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
/* if the tail skb is still the cached one, collapsing really happened.
*/
if (skb == tail) {
- WARN_ON_ONCE(!can_collapse);
+ TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
mpext->data_len += ret;
+ WARN_ON_ONCE(!can_collapse);
WARN_ON_ONCE(zero_window_probe);
goto out;
}
--
2.26.2
1 year, 7 months
[PATCH net-next] Squash-to: "mptcp: use mptcp release_cb for delayed tasks"
by Paolo Abeni
fix uninitialized local variable usage.
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
---
Side notes:
- not sure why gcc did not complain here
- if we will use the usual tags propagation schema will not credit
the kernel test robot, on the flip side, I'm not sure if a:
Co-developed-by: kernel test robot <lkp(a)intel.com>
tag is what we want ;)
---
net/mptcp/protocol.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index c42f92764ec9..69441ea71411 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1488,7 +1488,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
struct mptcp_sendmsg_info info;
struct mptcp_data_frag *dfrag;
int len, copied = 0;
- u32 sndbuf;
info.flags = 0;
while ((dfrag = mptcp_send_head(sk))) {
@@ -1500,8 +1499,8 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
/* do auto tuning */
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
- sndbuf > READ_ONCE(sk->sk_sndbuf))
- WRITE_ONCE(sk->sk_sndbuf, sndbuf);
+ ssk->sk_sndbuf > READ_ONCE(sk->sk_sndbuf))
+ WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
__mptcp_update_wmem(sk);
--
2.26.2
1 year, 7 months
[PATCH mptcp] squashto: mptcp: protect the rx path with the msk socket spinlock
by Florian Westphal
Looks like the arguments are inverted. Intention seems to move
the msk rx queue to the sk one so it can be purged from the destroy
function.
This avoids:
unreferenced object 0xffff888105cd3800 (size 512):
comm "packetdrill", pid 1648, jiffies 4294677515 (age 433.463s)
hex dump (first 32 bytes):
3c 33 30 3e 4e 6f 76 20 32 30 20 30 30 3a 32 33 <30>Nov 20 00:23
3a 34 38 20 73 79 73 74 65 6d 64 5b 31 5d 3a 20 :48 systemd[1]:
backtrace:
[<00000000b794d231>] __kmalloc_reserve.isra.0+0x2d/0x90
[<000000000421e158>] __alloc_skb+0x90/0x260
[<00000000d46c1201>] alloc_skb_with_frags+0x5e/0x250
[<00000000357464a5>] sock_alloc_send_pskb+0x265/0x2a0
[<00000000a73deb72>] tun_get_user+0x4dc/0x1840
[<00000000ba538b49>] tun_chr_write_iter+0x51/0x80
[<000000004f72fd7e>] do_iter_readv_writev+0x1c6/0x2b0
[<00000000c606f908>] do_iter_write+0xb1/0x230
[<0000000088092c0e>] vfs_writev+0xcb/0x130
[<00000000f9881e25>] do_writev+0x8c/0x150
[<00000000dc31e12e>] do_syscall_64+0x2d/0x40
[<0000000019b09572>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
plus WARN()s from nonzero sk_forward_alloc.
---
net/mptcp/protocol.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 92236a2ccbea..e62d34034d9e 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2707,7 +2707,9 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
__mptcp_clear_xmit(sk);
- skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue);
+ /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
+ skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+
skb_rbtree_purge(&msk->out_of_order_queue);
mptcp_token_destroy(msk);
mptcp_pm_free_anno_list(msk);
--
2.26.2
1 year, 7 months
[PATCH net-next] Squash-to: "mptcp: protect the rx path with the msk socket spinlock"
by Paolo Abeni
drop left-over splicing from the join list, we don't need
it anymore there.
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
---
net/mptcp/protocol.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f55dc414c403..7c3508dae1c8 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1698,7 +1698,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
- __mptcp_flush_join_list(msk);
for (;;) {
int bytes_read, old_space;
--
2.26.2
1 year, 7 months
[PATCH net-next] Squash-to: "mptcp: open code mptcp variant for lock_sock"
by Paolo Abeni
deal with report from kernel test robot
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
---
include/net/sock.h | 1 +
net/mptcp/protocol.h | 2 --
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index 1d29aeae74fd..86d074979ca0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1585,6 +1585,7 @@ static inline void lock_sock(struct sock *sk)
lock_sock_nested(sk, 0);
}
+void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 93aa60231632..1b50249138d2 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -255,8 +255,6 @@ struct mptcp_sock {
} rcvq_space;
};
-void __lock_sock(struct sock *sk);
-
#define mptcp_lock_sock(___sk, cb) do { \
struct sock *__sk = (___sk); /* silence macro reuse warning */ \
might_sleep(); \
--
2.26.2
1 year, 7 months
[MPTCP][PATCH v3 mptcp-next 0/2] drop rm_addr_signal flag and more
by Geliang Tang
v3:
- use pr_warn
v2:
- use pr_err instead of WARN_ON_ONCE
Geliang Tang (2):
mptcp: drop rm_addr_signal flag
mptcp: rename add_addr_signal and mptcp_add_addr_status
net/mptcp/pm.c | 30 +++++++++++++++++++++---------
net/mptcp/pm_netlink.c | 4 ++--
net/mptcp/protocol.h | 16 ++++++++--------
3 files changed, 31 insertions(+), 19 deletions(-)
--
2.26.2
1 year, 7 months
[MPTCP][PATCH v2 mptcp-next 0/2] drop rm_addr_signal flag and more
by Geliang Tang
v2:
- use pr_err instead of WARN_ON_ONCE
Geliang Tang (2):
mptcp: drop rm_addr_signal flag
mptcp: rename add_addr_signal and mptcp_add_addr_status
net/mptcp/pm.c | 30 +++++++++++++++++++++---------
net/mptcp/pm_netlink.c | 4 ++--
net/mptcp/protocol.h | 16 ++++++++--------
3 files changed, 31 insertions(+), 19 deletions(-)
--
2.26.2
1 year, 7 months
[PATCH net-next] mptcp: update rtx timeout only if required.
by Paolo Abeni
We must start the retransmission timer only there are
pending data in the rtx queue.
Otherwise we can hit a WARN_ON in mptcp_reset_timer(),
as syzbot demonstrated.
Reported-and-tested-by: syzbot+42aa53dafb66a07e5a24(a)syzkaller.appspotmail.com
Fixes: d9ca1de8c0cd ("mptcp: move page frag allocation in mptcp_sendmsg()")
Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
---
net/mptcp/protocol.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8df013daea88..aeda4357de9a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1261,11 +1261,12 @@ static void mptcp_push_pending(struct sock *sk, unsigned int flags)
mptcp_push_release(sk, ssk, &info);
out:
- /* start the timer, if it's not pending */
- if (!mptcp_timer_pending(sk))
- mptcp_reset_timer(sk);
- if (copied)
+ if (copied) {
+ /* start the timer, if it's not pending */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
__mptcp_check_send_data_fin(sk);
+ }
}
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
--
2.26.2
1 year, 7 months
[MPTCP][PATCH mptcp-next] Squash to "mptcp: create the listening socket"
by Geliang Tang
This patch defined a new function __mptcp_subflow_create_socket for reuse,
This fuction is extracted from mptcp_subflow_create_socket, and it could
be reused by mptcp_subflow_create_socket.
Signed-off-by: Geliang Tang <geliangtang(a)gmail.com>
---
net/mptcp/subflow.c | 55 +++++++++++++++++++--------------------------
1 file changed, 23 insertions(+), 32 deletions(-)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index eace5eb49955..f9aa12cb2bfb 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1234,19 +1234,12 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
#endif /* CONFIG_SOCK_CGROUP_DATA */
}
-int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
+static int __mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
- struct mptcp_subflow_context *subflow;
struct net *net = sock_net(sk);
struct socket *sf;
int err;
- /* un-accepted server sockets can reach here - on bad configuration
- * bail early to avoid greater trouble later
- */
- if (unlikely(!sk->sk_socket))
- return -EINVAL;
-
err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
&sf);
if (err)
@@ -1273,6 +1266,27 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
return err;
}
+ *new_sock = sf;
+
+ return 0;
+}
+
+int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
+{
+ struct mptcp_subflow_context *subflow;
+ struct socket *sf;
+ int err;
+
+ /* un-accepted server sockets can reach here - on bad configuration
+ * bail early to avoid greater trouble later
+ */
+ if (unlikely(!sk->sk_socket))
+ return -EINVAL;
+
+ err = __mptcp_subflow_create_socket(sk, &sf);
+ if (err)
+ return err;
+
/* the newly created socket really belongs to the owning MPTCP master
* socket, even if for additional subflows the allocation is performed
* by a kernel workqueue. Adjust inode references, so that the
@@ -1296,36 +1310,13 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
int mptcp_create_listen_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
- struct net *net = sock_net(sk);
struct socket *sf;
int err;
- err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
- &sf);
+ err = __mptcp_subflow_create_socket(sk, &sf);
if (err)
return err;
- lock_sock(sf->sk);
-
- /* the newly created socket has to be in the same cgroup as its parent */
- mptcp_attach_cgroup(sk, sf->sk);
-
- /* kernel sockets do not by default acquire net ref, but TCP timer
- * needs it.
- */
- sf->sk->sk_net_refcnt = 1;
- get_net(net);
-#ifdef CONFIG_PROC_FS
- this_cpu_add(*net->core.sock_inuse, 1);
-#endif
- err = tcp_set_ulp(sf->sk, "mptcp");
- release_sock(sf->sk);
-
- if (err) {
- sock_release(sf);
- return err;
- }
-
SOCK_INODE(sf)->i_uid = GLOBAL_ROOT_UID;
SOCK_INODE(sf)->i_gid = GLOBAL_ROOT_GID;
--
2.26.2
1 year, 7 months
[mptcp:export 781/783] net/mptcp/protocol.c:1503:8: warning: variable 'sndbuf' is uninitialized when used here
by kernel test robot
tree: https://github.com/multipath-tcp/mptcp_net-next.git export
head: 502db67b77b326a9be8f3bb3515a09b83a8a1fd4
commit: 8c15885fe4f2567586ebc32da86d9559f12f840c [781/783] mptcp: use mptcp release_cb for delayed tasks
config: x86_64-randconfig-a005-20201119 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 3ded927cf80ac519f9f9c4664fef08787f7c537d)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# https://github.com/multipath-tcp/mptcp_net-next/commit/8c15885fe4f2567586...
git remote add mptcp https://github.com/multipath-tcp/mptcp_net-next.git
git fetch --no-tags mptcp export
git checkout 8c15885fe4f2567586ebc32da86d9559f12f840c
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp(a)intel.com>
All warnings (new ones prefixed by >>):
>> net/mptcp/protocol.c:1503:8: warning: variable 'sndbuf' is uninitialized when used here [-Wuninitialized]
sndbuf > READ_ONCE(sk->sk_sndbuf))
^~~~~~
net/mptcp/protocol.c:1491:12: note: initialize the variable 'sndbuf' to silence this warning
u32 sndbuf;
^
= 0
1 warning generated.
vim +/sndbuf +1503 net/mptcp/protocol.c
1484
1485 static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
1486 {
1487 struct mptcp_sock *msk = mptcp_sk(sk);
1488 struct mptcp_sendmsg_info info;
1489 struct mptcp_data_frag *dfrag;
1490 int len, copied = 0;
1491 u32 sndbuf;
1492
1493 info.flags = 0;
1494 while ((dfrag = mptcp_send_head(sk))) {
1495 info.sent = dfrag->already_sent;
1496 info.limit = dfrag->data_len;
1497 len = dfrag->data_len - dfrag->already_sent;
1498 while (len > 0) {
1499 int ret = 0;
1500
1501 /* do auto tuning */
1502 if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
> 1503 sndbuf > READ_ONCE(sk->sk_sndbuf))
1504 WRITE_ONCE(sk->sk_sndbuf, sndbuf);
1505
1506 if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) {
1507 __mptcp_update_wmem(sk);
1508 sk_mem_reclaim_partial(sk);
1509 }
1510 if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC))
1511 goto out;
1512
1513 ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
1514 if (ret <= 0)
1515 goto out;
1516
1517 info.sent += ret;
1518 dfrag->already_sent += ret;
1519 msk->snd_nxt += ret;
1520 msk->snd_burst -= ret;
1521 msk->tx_pending_data -= ret;
1522 copied += ret;
1523 len -= ret;
1524 }
1525 WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
1526 }
1527
1528 out:
1529 /* __mptcp_alloc_tx_skb could have released some wmem and we are
1530 * not going to flush it via release_sock()
1531 */
1532 __mptcp_update_wmem(sk);
1533 if (copied) {
1534 mptcp_set_timeout(sk, ssk);
1535 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
1536 info.size_goal);
1537 if (msk->snd_data_fin_enable &&
1538 msk->snd_nxt + 1 == msk->write_seq)
1539 mptcp_schedule_work(sk);
1540 }
1541 }
1542
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
1 year, 7 months