From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:3.16 commit in: /
Date: Sat, 27 Sep 2014 13:37:52
Message-Id: 1411825057.1b28da13cd7150f66fae58043d3de661105a513a.mpagano@gentoo
1 commit: 1b28da13cd7150f66fae58043d3de661105a513a
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Sat Sep 27 13:37:37 2014 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Sat Sep 27 13:37:37 2014 +0000
6 URL: http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=1b28da13
7
8 Move mptcp patch to experimental
9
10 ---
11 0000_README | 9 +-
12 5010_multipath-tcp-v3.16-872d7f6c6f4e.patch | 19230 ++++++++++++++++++++++++++
13 2 files changed, 19235 insertions(+), 4 deletions(-)
14
15 diff --git a/0000_README b/0000_README
16 index d92e6b7..3cc9441 100644
17 --- a/0000_README
18 +++ b/0000_README
19 @@ -58,10 +58,6 @@ Patch: 2400_kcopy-patch-for-infiniband-driver.patch
20 From: Alexey Shvetsov <alexxy@g.o>
21 Desc: Zero copy for infiniband psm userspace driver
22
23 -Patch: 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
24 -From: http://multipath-tcp.org/
25 -Desc: Patch for simultaneous use of several IP-addresses/interfaces in TCP for better resource utilization, better throughput and smoother reaction to failures.
26 -
27 Patch: 2700_ThinkPad-30-brightness-control-fix.patch
28 From: Seth Forshee <seth.forshee@×××××××××.com>
29 Desc: ACPI: Disable Windows 8 compatibility for some Lenovo ThinkPads
30 @@ -101,3 +97,8 @@ Desc: BFQ v7r5 patch 2 for 3.16: BFQ Scheduler
31 Patch: 5003_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r5-for-3.16.0.patch
32 From: http://algo.ing.unimo.it/people/paolo/disk_sched/
33 Desc: BFQ v7r5 patch 3 for 3.16: Early Queue Merge (EQM)
34 +
35 +Patch: 5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
36 +From: http://multipath-tcp.org/
37 +Desc: Patch for simultaneous use of several IP-addresses/interfaces in TCP for better resource utilization, better throughput and smoother reaction to failures.
38 +
39
40 diff --git a/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
41 new file mode 100644
42 index 0000000..3000da3
43 --- /dev/null
44 +++ b/5010_multipath-tcp-v3.16-872d7f6c6f4e.patch
45 @@ -0,0 +1,19230 @@
46 +diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
47 +index 768a0fb67dd6..5a46d91a8df9 100644
48 +--- a/drivers/infiniband/hw/cxgb4/cm.c
49 ++++ b/drivers/infiniband/hw/cxgb4/cm.c
50 +@@ -3432,7 +3432,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
51 + */
52 + memset(&tmp_opt, 0, sizeof(tmp_opt));
53 + tcp_clear_options(&tmp_opt);
54 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
55 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
56 +
57 + req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
58 + memset(req, 0, sizeof(*req));
59 +diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
60 +index 2faef339d8f2..d86c853ffaad 100644
61 +--- a/include/linux/ipv6.h
62 ++++ b/include/linux/ipv6.h
63 +@@ -256,16 +256,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
64 + return inet_sk(__sk)->pinet6;
65 + }
66 +
67 +-static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
68 +-{
69 +- struct request_sock *req = reqsk_alloc(ops);
70 +-
71 +- if (req)
72 +- inet_rsk(req)->pktopts = NULL;
73 +-
74 +- return req;
75 +-}
76 +-
77 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
78 + {
79 + return (struct raw6_sock *)sk;
80 +@@ -309,12 +299,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
81 + return NULL;
82 + }
83 +
84 +-static inline struct inet6_request_sock *
85 +- inet6_rsk(const struct request_sock *rsk)
86 +-{
87 +- return NULL;
88 +-}
89 +-
90 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
91 + {
92 + return NULL;
93 +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
94 +index ec89301ada41..99ea4b0e3693 100644
95 +--- a/include/linux/skbuff.h
96 ++++ b/include/linux/skbuff.h
97 +@@ -2784,8 +2784,10 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
98 + bool zero_okay,
99 + __sum16 check)
100 + {
101 +- if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
102 +- skb->csum_valid = 1;
103 ++ if (skb_csum_unnecessary(skb)) {
104 ++ return false;
105 ++ } else if (zero_okay && !check) {
106 ++ skb->ip_summed = CHECKSUM_UNNECESSARY;
107 + return false;
108 + }
109 +
110 +diff --git a/include/linux/tcp.h b/include/linux/tcp.h
111 +index a0513210798f..7bc2e078d6ca 100644
112 +--- a/include/linux/tcp.h
113 ++++ b/include/linux/tcp.h
114 +@@ -53,7 +53,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
115 + /* TCP Fast Open */
116 + #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
117 + #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
118 +-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
119 ++#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */
120 +
121 + /* TCP Fast Open Cookie as stored in memory */
122 + struct tcp_fastopen_cookie {
123 +@@ -72,6 +72,51 @@ struct tcp_sack_block {
124 + u32 end_seq;
125 + };
126 +
127 ++struct tcp_out_options {
128 ++ u16 options; /* bit field of OPTION_* */
129 ++ u8 ws; /* window scale, 0 to disable */
130 ++ u8 num_sack_blocks;/* number of SACK blocks to include */
131 ++ u8 hash_size; /* bytes in hash_location */
132 ++ u16 mss; /* 0 to disable */
133 ++ __u8 *hash_location; /* temporary pointer, overloaded */
134 ++ __u32 tsval, tsecr; /* need to include OPTION_TS */
135 ++ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
136 ++#ifdef CONFIG_MPTCP
137 ++ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
138 ++ u8 dss_csum:1,
139 ++ add_addr_v4:1,
140 ++ add_addr_v6:1; /* dss-checksum required? */
141 ++
142 ++ union {
143 ++ struct {
144 ++ __u64 sender_key; /* sender's key for mptcp */
145 ++ __u64 receiver_key; /* receiver's key for mptcp */
146 ++ } mp_capable;
147 ++
148 ++ struct {
149 ++ __u64 sender_truncated_mac;
150 ++ __u32 sender_nonce;
151 ++ /* random number of the sender */
152 ++ __u32 token; /* token for mptcp */
153 ++ u8 low_prio:1;
154 ++ } mp_join_syns;
155 ++ };
156 ++
157 ++ struct {
158 ++ struct in_addr addr;
159 ++ u8 addr_id;
160 ++ } add_addr4;
161 ++
162 ++ struct {
163 ++ struct in6_addr addr;
164 ++ u8 addr_id;
165 ++ } add_addr6;
166 ++
167 ++ u16 remove_addrs; /* list of address id */
168 ++ u8 addr_id; /* address id (mp_join or add_address) */
169 ++#endif /* CONFIG_MPTCP */
170 ++};
171 ++
172 + /*These are used to set the sack_ok field in struct tcp_options_received */
173 + #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
174 + #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
175 +@@ -95,6 +140,9 @@ struct tcp_options_received {
176 + u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
177 + };
178 +
179 ++struct mptcp_cb;
180 ++struct mptcp_tcp_sock;
181 ++
182 + static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
183 + {
184 + rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
185 +@@ -111,10 +159,7 @@ struct tcp_request_sock_ops;
186 +
187 + struct tcp_request_sock {
188 + struct inet_request_sock req;
189 +-#ifdef CONFIG_TCP_MD5SIG
190 +- /* Only used by TCP MD5 Signature so far. */
191 + const struct tcp_request_sock_ops *af_specific;
192 +-#endif
193 + struct sock *listener; /* needed for TFO */
194 + u32 rcv_isn;
195 + u32 snt_isn;
196 +@@ -130,6 +175,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
197 + return (struct tcp_request_sock *)req;
198 + }
199 +
200 ++struct tcp_md5sig_key;
201 ++
202 + struct tcp_sock {
203 + /* inet_connection_sock has to be the first member of tcp_sock */
204 + struct inet_connection_sock inet_conn;
205 +@@ -326,6 +373,37 @@ struct tcp_sock {
206 + * socket. Used to retransmit SYNACKs etc.
207 + */
208 + struct request_sock *fastopen_rsk;
209 ++
210 ++ /* MPTCP/TCP-specific callbacks */
211 ++ const struct tcp_sock_ops *ops;
212 ++
213 ++ struct mptcp_cb *mpcb;
214 ++ struct sock *meta_sk;
215 ++ /* We keep these flags even if CONFIG_MPTCP is not checked, because
216 ++ * it allows checking MPTCP capability just by checking the mpc flag,
217 ++ * rather than adding ifdefs everywhere.
218 ++ */
219 ++ u16 mpc:1, /* Other end is multipath capable */
220 ++ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
221 ++ send_mp_fclose:1,
222 ++ request_mptcp:1, /* Did we send out an MP_CAPABLE?
223 ++ * (this speeds up mptcp_doit() in tcp_recvmsg)
224 ++ */
225 ++ mptcp_enabled:1, /* Is MPTCP enabled from the application ? */
226 ++ pf:1, /* Potentially Failed state: when this flag is set, we
227 ++ * stop using the subflow
228 ++ */
229 ++ mp_killed:1, /* Killed with a tcp_done in mptcp? */
230 ++ was_meta_sk:1, /* This was a meta sk (in case of reuse) */
231 ++ is_master_sk,
232 ++ close_it:1, /* Must close socket in mptcp_data_ready? */
233 ++ closing:1;
234 ++ struct mptcp_tcp_sock *mptcp;
235 ++#ifdef CONFIG_MPTCP
236 ++ struct hlist_nulls_node tk_table;
237 ++ u32 mptcp_loc_token;
238 ++ u64 mptcp_loc_key;
239 ++#endif /* CONFIG_MPTCP */
240 + };
241 +
242 + enum tsq_flags {
243 +@@ -337,6 +415,8 @@ enum tsq_flags {
244 + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
245 + * tcp_v{4|6}_mtu_reduced()
246 + */
247 ++ MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
248 ++ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
249 + };
250 +
251 + static inline struct tcp_sock *tcp_sk(const struct sock *sk)
252 +@@ -355,6 +435,7 @@ struct tcp_timewait_sock {
253 + #ifdef CONFIG_TCP_MD5SIG
254 + struct tcp_md5sig_key *tw_md5_key;
255 + #endif
256 ++ struct mptcp_tw *mptcp_tw;
257 + };
258 +
259 + static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
260 +diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
261 +index 74af137304be..83f63033897a 100644
262 +--- a/include/net/inet6_connection_sock.h
263 ++++ b/include/net/inet6_connection_sock.h
264 +@@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
265 +
266 + struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
267 + const struct request_sock *req);
268 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
269 ++ const u32 rnd, const u32 synq_hsize);
270 +
271 + struct request_sock *inet6_csk_search_req(const struct sock *sk,
272 + struct request_sock ***prevp,
273 +diff --git a/include/net/inet_common.h b/include/net/inet_common.h
274 +index fe7994c48b75..780f229f46a8 100644
275 +--- a/include/net/inet_common.h
276 ++++ b/include/net/inet_common.h
277 +@@ -1,6 +1,8 @@
278 + #ifndef _INET_COMMON_H
279 + #define _INET_COMMON_H
280 +
281 ++#include <net/sock.h>
282 ++
283 + extern const struct proto_ops inet_stream_ops;
284 + extern const struct proto_ops inet_dgram_ops;
285 +
286 +@@ -13,6 +15,8 @@ struct sock;
287 + struct sockaddr;
288 + struct socket;
289 +
290 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
291 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
292 + int inet_release(struct socket *sock);
293 + int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
294 + int addr_len, int flags);
295 +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
296 +index 7a4313887568..f62159e39839 100644
297 +--- a/include/net/inet_connection_sock.h
298 ++++ b/include/net/inet_connection_sock.h
299 +@@ -30,6 +30,7 @@
300 +
301 + struct inet_bind_bucket;
302 + struct tcp_congestion_ops;
303 ++struct tcp_options_received;
304 +
305 + /*
306 + * Pointers to address related TCP functions
307 +@@ -243,6 +244,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
308 +
309 + struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
310 +
311 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
312 ++ const u32 synq_hsize);
313 ++
314 + struct request_sock *inet_csk_search_req(const struct sock *sk,
315 + struct request_sock ***prevp,
316 + const __be16 rport,
317 +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
318 +index b1edf17bec01..6a32d8d6b85e 100644
319 +--- a/include/net/inet_sock.h
320 ++++ b/include/net/inet_sock.h
321 +@@ -86,10 +86,14 @@ struct inet_request_sock {
322 + wscale_ok : 1,
323 + ecn_ok : 1,
324 + acked : 1,
325 +- no_srccheck: 1;
326 ++ no_srccheck: 1,
327 ++ mptcp_rqsk : 1,
328 ++ saw_mpc : 1;
329 + kmemcheck_bitfield_end(flags);
330 +- struct ip_options_rcu *opt;
331 +- struct sk_buff *pktopts;
332 ++ union {
333 ++ struct ip_options_rcu *opt;
334 ++ struct sk_buff *pktopts;
335 ++ };
336 + u32 ir_mark;
337 + };
338 +
339 +diff --git a/include/net/mptcp.h b/include/net/mptcp.h
340 +new file mode 100644
341 +index 000000000000..712780fc39e4
342 +--- /dev/null
343 ++++ b/include/net/mptcp.h
344 +@@ -0,0 +1,1439 @@
345 ++/*
346 ++ * MPTCP implementation
347 ++ *
348 ++ * Initial Design & Implementation:
349 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
350 ++ *
351 ++ * Current Maintainer & Author:
352 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
353 ++ *
354 ++ * Additional authors:
355 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
356 ++ * Gregory Detal <gregory.detal@×××××××××.be>
357 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
358 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
359 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
360 ++ * Andreas Ripke <ripke@××××××.eu>
361 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
362 ++ * Octavian Purdila <octavian.purdila@×××××.com>
363 ++ * John Ronan <jronan@××××.org>
364 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
365 ++ * Brandon Heller <brandonh@××××××××.edu>
366 ++ *
367 ++ *
368 ++ * This program is free software; you can redistribute it and/or
369 ++ * modify it under the terms of the GNU General Public License
370 ++ * as published by the Free Software Foundation; either version
371 ++ * 2 of the License, or (at your option) any later version.
372 ++ */
373 ++
374 ++#ifndef _MPTCP_H
375 ++#define _MPTCP_H
376 ++
377 ++#include <linux/inetdevice.h>
378 ++#include <linux/ipv6.h>
379 ++#include <linux/list.h>
380 ++#include <linux/net.h>
381 ++#include <linux/netpoll.h>
382 ++#include <linux/skbuff.h>
383 ++#include <linux/socket.h>
384 ++#include <linux/tcp.h>
385 ++#include <linux/kernel.h>
386 ++
387 ++#include <asm/byteorder.h>
388 ++#include <asm/unaligned.h>
389 ++#include <crypto/hash.h>
390 ++#include <net/tcp.h>
391 ++
392 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
393 ++ #define ntohll(x) be64_to_cpu(x)
394 ++ #define htonll(x) cpu_to_be64(x)
395 ++#elif defined(__BIG_ENDIAN_BITFIELD)
396 ++ #define ntohll(x) (x)
397 ++ #define htonll(x) (x)
398 ++#endif
399 ++
400 ++struct mptcp_loc4 {
401 ++ u8 loc4_id;
402 ++ u8 low_prio:1;
403 ++ struct in_addr addr;
404 ++};
405 ++
406 ++struct mptcp_rem4 {
407 ++ u8 rem4_id;
408 ++ __be16 port;
409 ++ struct in_addr addr;
410 ++};
411 ++
412 ++struct mptcp_loc6 {
413 ++ u8 loc6_id;
414 ++ u8 low_prio:1;
415 ++ struct in6_addr addr;
416 ++};
417 ++
418 ++struct mptcp_rem6 {
419 ++ u8 rem6_id;
420 ++ __be16 port;
421 ++ struct in6_addr addr;
422 ++};
423 ++
424 ++struct mptcp_request_sock {
425 ++ struct tcp_request_sock req;
426 ++ /* hlist-nulls entry to the hash-table. Depending on whether this is a
427 ++ * a new MPTCP connection or an additional subflow, the request-socket
428 ++ * is either in the mptcp_reqsk_tk_htb or mptcp_reqsk_htb.
429 ++ */
430 ++ struct hlist_nulls_node hash_entry;
431 ++
432 ++ union {
433 ++ struct {
434 ++ /* Only on initial subflows */
435 ++ u64 mptcp_loc_key;
436 ++ u64 mptcp_rem_key;
437 ++ u32 mptcp_loc_token;
438 ++ };
439 ++
440 ++ struct {
441 ++ /* Only on additional subflows */
442 ++ struct mptcp_cb *mptcp_mpcb;
443 ++ u32 mptcp_rem_nonce;
444 ++ u32 mptcp_loc_nonce;
445 ++ u64 mptcp_hash_tmac;
446 ++ };
447 ++ };
448 ++
449 ++ u8 loc_id;
450 ++ u8 rem_id; /* Address-id in the MP_JOIN */
451 ++ u8 dss_csum:1,
452 ++ is_sub:1, /* Is this a new subflow? */
453 ++ low_prio:1, /* Interface set to low-prio? */
454 ++ rcv_low_prio:1;
455 ++};
456 ++
457 ++struct mptcp_options_received {
458 ++ u16 saw_mpc:1,
459 ++ dss_csum:1,
460 ++ drop_me:1,
461 ++
462 ++ is_mp_join:1,
463 ++ join_ack:1,
464 ++
465 ++ saw_low_prio:2, /* 0x1 - low-prio set for this subflow
466 ++ * 0x2 - low-prio set for another subflow
467 ++ */
468 ++ low_prio:1,
469 ++
470 ++ saw_add_addr:2, /* Saw at least one add_addr option:
471 ++ * 0x1: IPv4 - 0x2: IPv6
472 ++ */
473 ++ more_add_addr:1, /* Saw one more add-addr. */
474 ++
475 ++ saw_rem_addr:1, /* Saw at least one rem_addr option */
476 ++ more_rem_addr:1, /* Saw one more rem-addr. */
477 ++
478 ++ mp_fail:1,
479 ++ mp_fclose:1;
480 ++ u8 rem_id; /* Address-id in the MP_JOIN */
481 ++ u8 prio_addr_id; /* Address-id in the MP_PRIO */
482 ++
483 ++ const unsigned char *add_addr_ptr; /* Pointer to add-address option */
484 ++ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
485 ++
486 ++ u32 data_ack;
487 ++ u32 data_seq;
488 ++ u16 data_len;
489 ++
490 ++ u32 mptcp_rem_token;/* Remote token */
491 ++
492 ++ /* Key inside the option (from mp_capable or fast_close) */
493 ++ u64 mptcp_key;
494 ++
495 ++ u32 mptcp_recv_nonce;
496 ++ u64 mptcp_recv_tmac;
497 ++ u8 mptcp_recv_mac[20];
498 ++};
499 ++
500 ++struct mptcp_tcp_sock {
501 ++ struct tcp_sock *next; /* Next subflow socket */
502 ++ struct hlist_node cb_list;
503 ++ struct mptcp_options_received rx_opt;
504 ++
505 ++ /* Those three fields record the current mapping */
506 ++ u64 map_data_seq;
507 ++ u32 map_subseq;
508 ++ u16 map_data_len;
509 ++ u16 slave_sk:1,
510 ++ fully_established:1,
511 ++ establish_increased:1,
512 ++ second_packet:1,
513 ++ attached:1,
514 ++ send_mp_fail:1,
515 ++ include_mpc:1,
516 ++ mapping_present:1,
517 ++ map_data_fin:1,
518 ++ low_prio:1, /* use this socket as backup */
519 ++ rcv_low_prio:1, /* Peer sent low-prio option to us */
520 ++ send_mp_prio:1, /* Trigger to send mp_prio on this socket */
521 ++ pre_established:1; /* State between sending 3rd ACK and
522 ++ * receiving the fourth ack of new subflows.
523 ++ */
524 ++
525 ++ /* isn: needed to translate abs to relative subflow seqnums */
526 ++ u32 snt_isn;
527 ++ u32 rcv_isn;
528 ++ u8 path_index;
529 ++ u8 loc_id;
530 ++ u8 rem_id;
531 ++
532 ++#define MPTCP_SCHED_SIZE 4
533 ++ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
534 ++
535 ++ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
536 ++ * skb in the ofo-queue.
537 ++ */
538 ++
539 ++ int init_rcv_wnd;
540 ++ u32 infinite_cutoff_seq;
541 ++ struct delayed_work work;
542 ++ u32 mptcp_loc_nonce;
543 ++ struct tcp_sock *tp; /* Where is my daddy? */
544 ++ u32 last_end_data_seq;
545 ++
546 ++ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
547 ++ struct timer_list mptcp_ack_timer;
548 ++
549 ++ /* HMAC of the third ack */
550 ++ char sender_mac[20];
551 ++};
552 ++
553 ++struct mptcp_tw {
554 ++ struct list_head list;
555 ++ u64 loc_key;
556 ++ u64 rcv_nxt;
557 ++ struct mptcp_cb __rcu *mpcb;
558 ++ u8 meta_tw:1,
559 ++ in_list:1;
560 ++};
561 ++
562 ++#define MPTCP_PM_NAME_MAX 16
563 ++struct mptcp_pm_ops {
564 ++ struct list_head list;
565 ++
566 ++ /* Signal the creation of a new MPTCP-session. */
567 ++ void (*new_session)(const struct sock *meta_sk);
568 ++ void (*release_sock)(struct sock *meta_sk);
569 ++ void (*fully_established)(struct sock *meta_sk);
570 ++ void (*new_remote_address)(struct sock *meta_sk);
571 ++ int (*get_local_id)(sa_family_t family, union inet_addr *addr,
572 ++ struct net *net, bool *low_prio);
573 ++ void (*addr_signal)(struct sock *sk, unsigned *size,
574 ++ struct tcp_out_options *opts, struct sk_buff *skb);
575 ++ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
576 ++ sa_family_t family, __be16 port, u8 id);
577 ++ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
578 ++ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
579 ++ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
580 ++
581 ++ char name[MPTCP_PM_NAME_MAX];
582 ++ struct module *owner;
583 ++};
584 ++
585 ++#define MPTCP_SCHED_NAME_MAX 16
586 ++struct mptcp_sched_ops {
587 ++ struct list_head list;
588 ++
589 ++ struct sock * (*get_subflow)(struct sock *meta_sk,
590 ++ struct sk_buff *skb,
591 ++ bool zero_wnd_test);
592 ++ struct sk_buff * (*next_segment)(struct sock *meta_sk,
593 ++ int *reinject,
594 ++ struct sock **subsk,
595 ++ unsigned int *limit);
596 ++ void (*init)(struct sock *sk);
597 ++
598 ++ char name[MPTCP_SCHED_NAME_MAX];
599 ++ struct module *owner;
600 ++};
601 ++
602 ++struct mptcp_cb {
603 ++ /* list of sockets in this multipath connection */
604 ++ struct tcp_sock *connection_list;
605 ++ /* list of sockets that need a call to release_cb */
606 ++ struct hlist_head callback_list;
607 ++
608 ++ /* High-order bits of 64-bit sequence numbers */
609 ++ u32 snd_high_order[2];
610 ++ u32 rcv_high_order[2];
611 ++
612 ++ u16 send_infinite_mapping:1,
613 ++ in_time_wait:1,
614 ++ list_rcvd:1, /* XXX TO REMOVE */
615 ++ addr_signal:1, /* Path-manager wants us to call addr_signal */
616 ++ dss_csum:1,
617 ++ server_side:1,
618 ++ infinite_mapping_rcv:1,
619 ++ infinite_mapping_snd:1,
620 ++ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
621 ++ passive_close:1,
622 ++ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
623 ++ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
624 ++
625 ++ /* socket count in this connection */
626 ++ u8 cnt_subflows;
627 ++ u8 cnt_established;
628 ++
629 ++ struct mptcp_sched_ops *sched_ops;
630 ++
631 ++ struct sk_buff_head reinject_queue;
632 ++ /* First cache-line boundary is here minus 8 bytes. But from the
633 ++ * reinject-queue only the next and prev pointers are regularly
634 ++ * accessed. Thus, the whole data-path is on a single cache-line.
635 ++ */
636 ++
637 ++ u64 csum_cutoff_seq;
638 ++
639 ++ /***** Start of fields, used for connection closure */
640 ++ spinlock_t tw_lock;
641 ++ unsigned char mptw_state;
642 ++ u8 dfin_path_index;
643 ++
644 ++ struct list_head tw_list;
645 ++
646 ++ /***** Start of fields, used for subflow establishment and closure */
647 ++ atomic_t mpcb_refcnt;
648 ++
649 ++ /* Mutex needed, because otherwise mptcp_close will complain that the
650 ++ * socket is owned by the user.
651 ++ * E.g., mptcp_sub_close_wq is taking the meta-lock.
652 ++ */
653 ++ struct mutex mpcb_mutex;
654 ++
655 ++ /***** Start of fields, used for subflow establishment */
656 ++ struct sock *meta_sk;
657 ++
658 ++ /* Master socket, also part of the connection_list, this
659 ++ * socket is the one that the application sees.
660 ++ */
661 ++ struct sock *master_sk;
662 ++
663 ++ __u64 mptcp_loc_key;
664 ++ __u64 mptcp_rem_key;
665 ++ __u32 mptcp_loc_token;
666 ++ __u32 mptcp_rem_token;
667 ++
668 ++#define MPTCP_PM_SIZE 608
669 ++ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
670 ++ struct mptcp_pm_ops *pm_ops;
671 ++
672 ++ u32 path_index_bits;
673 ++ /* Next pi to pick up in case a new path becomes available */
674 ++ u8 next_path_index;
675 ++
676 ++ /* Original snd/rcvbuf of the initial subflow.
677 ++ * Used for the new subflows on the server-side to allow correct
678 ++ * autotuning
679 ++ */
680 ++ int orig_sk_rcvbuf;
681 ++ int orig_sk_sndbuf;
682 ++ u32 orig_window_clamp;
683 ++
684 ++ /* Timer for retransmitting SYN/ACK+MP_JOIN */
685 ++ struct timer_list synack_timer;
686 ++};
687 ++
688 ++#define MPTCP_SUB_CAPABLE 0
689 ++#define MPTCP_SUB_LEN_CAPABLE_SYN 12
690 ++#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
691 ++#define MPTCP_SUB_LEN_CAPABLE_ACK 20
692 ++#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
693 ++
694 ++#define MPTCP_SUB_JOIN 1
695 ++#define MPTCP_SUB_LEN_JOIN_SYN 12
696 ++#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
697 ++#define MPTCP_SUB_LEN_JOIN_SYNACK 16
698 ++#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
699 ++#define MPTCP_SUB_LEN_JOIN_ACK 24
700 ++#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
701 ++
702 ++#define MPTCP_SUB_DSS 2
703 ++#define MPTCP_SUB_LEN_DSS 4
704 ++#define MPTCP_SUB_LEN_DSS_ALIGN 4
705 ++
706 ++/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
707 ++ * as they are part of the DSS-option.
708 ++ * To get the total length, just add the different options together.
709 ++ */
710 ++#define MPTCP_SUB_LEN_SEQ 10
711 ++#define MPTCP_SUB_LEN_SEQ_CSUM 12
712 ++#define MPTCP_SUB_LEN_SEQ_ALIGN 12
713 ++
714 ++#define MPTCP_SUB_LEN_SEQ_64 14
715 ++#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
716 ++#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
717 ++
718 ++#define MPTCP_SUB_LEN_ACK 4
719 ++#define MPTCP_SUB_LEN_ACK_ALIGN 4
720 ++
721 ++#define MPTCP_SUB_LEN_ACK_64 8
722 ++#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
723 ++
724 ++/* This is the "default" option-length we will send out most often.
725 ++ * MPTCP DSS-header
726 ++ * 32-bit data sequence number
727 ++ * 32-bit data ack
728 ++ *
729 ++ * It is necessary to calculate the effective MSS we will be using when
730 ++ * sending data.
731 ++ */
732 ++#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
733 ++ MPTCP_SUB_LEN_SEQ_ALIGN + \
734 ++ MPTCP_SUB_LEN_ACK_ALIGN)
735 ++
736 ++#define MPTCP_SUB_ADD_ADDR 3
737 ++#define MPTCP_SUB_LEN_ADD_ADDR4 8
738 ++#define MPTCP_SUB_LEN_ADD_ADDR6 20
739 ++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
740 ++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
741 ++
742 ++#define MPTCP_SUB_REMOVE_ADDR 4
743 ++#define MPTCP_SUB_LEN_REMOVE_ADDR 4
744 ++
745 ++#define MPTCP_SUB_PRIO 5
746 ++#define MPTCP_SUB_LEN_PRIO 3
747 ++#define MPTCP_SUB_LEN_PRIO_ADDR 4
748 ++#define MPTCP_SUB_LEN_PRIO_ALIGN 4
749 ++
750 ++#define MPTCP_SUB_FAIL 6
751 ++#define MPTCP_SUB_LEN_FAIL 12
752 ++#define MPTCP_SUB_LEN_FAIL_ALIGN 12
753 ++
754 ++#define MPTCP_SUB_FCLOSE 7
755 ++#define MPTCP_SUB_LEN_FCLOSE 12
756 ++#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
757 ++
758 ++
759 ++#define OPTION_MPTCP (1 << 5)
760 ++
761 ++#ifdef CONFIG_MPTCP
762 ++
763 ++/* Used for checking if the mptcp initialization has been successful */
764 ++extern bool mptcp_init_failed;
765 ++
766 ++/* MPTCP options */
767 ++#define OPTION_TYPE_SYN (1 << 0)
768 ++#define OPTION_TYPE_SYNACK (1 << 1)
769 ++#define OPTION_TYPE_ACK (1 << 2)
770 ++#define OPTION_MP_CAPABLE (1 << 3)
771 ++#define OPTION_DATA_ACK (1 << 4)
772 ++#define OPTION_ADD_ADDR (1 << 5)
773 ++#define OPTION_MP_JOIN (1 << 6)
774 ++#define OPTION_MP_FAIL (1 << 7)
775 ++#define OPTION_MP_FCLOSE (1 << 8)
776 ++#define OPTION_REMOVE_ADDR (1 << 9)
777 ++#define OPTION_MP_PRIO (1 << 10)
778 ++
779 ++/* MPTCP flags: both TX and RX */
780 ++#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */
781 ++#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */
782 ++#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */
783 ++/* MPTCP flags: RX only */
784 ++#define MPTCPHDR_ACK 0x08
785 ++#define MPTCPHDR_SEQ64_SET 0x10 /* Did we received a 64-bit seq number? */
786 ++#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
787 ++#define MPTCPHDR_DSS_CSUM 0x40
788 ++#define MPTCPHDR_JOIN 0x80
789 ++/* MPTCP flags: TX only */
790 ++#define MPTCPHDR_INF 0x08
791 ++
792 ++struct mptcp_option {
793 ++ __u8 kind;
794 ++ __u8 len;
795 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
796 ++ __u8 ver:4,
797 ++ sub:4;
798 ++#elif defined(__BIG_ENDIAN_BITFIELD)
799 ++ __u8 sub:4,
800 ++ ver:4;
801 ++#else
802 ++#error "Adjust your <asm/byteorder.h> defines"
803 ++#endif
804 ++};
805 ++
806 ++struct mp_capable {
807 ++ __u8 kind;
808 ++ __u8 len;
809 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
810 ++ __u8 ver:4,
811 ++ sub:4;
812 ++ __u8 h:1,
813 ++ rsv:5,
814 ++ b:1,
815 ++ a:1;
816 ++#elif defined(__BIG_ENDIAN_BITFIELD)
817 ++ __u8 sub:4,
818 ++ ver:4;
819 ++ __u8 a:1,
820 ++ b:1,
821 ++ rsv:5,
822 ++ h:1;
823 ++#else
824 ++#error "Adjust your <asm/byteorder.h> defines"
825 ++#endif
826 ++ __u64 sender_key;
827 ++ __u64 receiver_key;
828 ++} __attribute__((__packed__));
829 ++
830 ++struct mp_join {
831 ++ __u8 kind;
832 ++ __u8 len;
833 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
834 ++ __u8 b:1,
835 ++ rsv:3,
836 ++ sub:4;
837 ++#elif defined(__BIG_ENDIAN_BITFIELD)
838 ++ __u8 sub:4,
839 ++ rsv:3,
840 ++ b:1;
841 ++#else
842 ++#error "Adjust your <asm/byteorder.h> defines"
843 ++#endif
844 ++ __u8 addr_id;
845 ++ union {
846 ++ struct {
847 ++ u32 token;
848 ++ u32 nonce;
849 ++ } syn;
850 ++ struct {
851 ++ __u64 mac;
852 ++ u32 nonce;
853 ++ } synack;
854 ++ struct {
855 ++ __u8 mac[20];
856 ++ } ack;
857 ++ } u;
858 ++} __attribute__((__packed__));
859 ++
860 ++struct mp_dss {
861 ++ __u8 kind;
862 ++ __u8 len;
863 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
864 ++ __u16 rsv1:4,
865 ++ sub:4,
866 ++ A:1,
867 ++ a:1,
868 ++ M:1,
869 ++ m:1,
870 ++ F:1,
871 ++ rsv2:3;
872 ++#elif defined(__BIG_ENDIAN_BITFIELD)
873 ++ __u16 sub:4,
874 ++ rsv1:4,
875 ++ rsv2:3,
876 ++ F:1,
877 ++ m:1,
878 ++ M:1,
879 ++ a:1,
880 ++ A:1;
881 ++#else
882 ++#error "Adjust your <asm/byteorder.h> defines"
883 ++#endif
884 ++};
885 ++
886 ++struct mp_add_addr {
887 ++ __u8 kind;
888 ++ __u8 len;
889 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
890 ++ __u8 ipver:4,
891 ++ sub:4;
892 ++#elif defined(__BIG_ENDIAN_BITFIELD)
893 ++ __u8 sub:4,
894 ++ ipver:4;
895 ++#else
896 ++#error "Adjust your <asm/byteorder.h> defines"
897 ++#endif
898 ++ __u8 addr_id;
899 ++ union {
900 ++ struct {
901 ++ struct in_addr addr;
902 ++ __be16 port;
903 ++ } v4;
904 ++ struct {
905 ++ struct in6_addr addr;
906 ++ __be16 port;
907 ++ } v6;
908 ++ } u;
909 ++} __attribute__((__packed__));
910 ++
911 ++struct mp_remove_addr {
912 ++ __u8 kind;
913 ++ __u8 len;
914 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
915 ++ __u8 rsv:4,
916 ++ sub:4;
917 ++#elif defined(__BIG_ENDIAN_BITFIELD)
918 ++ __u8 sub:4,
919 ++ rsv:4;
920 ++#else
921 ++#error "Adjust your <asm/byteorder.h> defines"
922 ++#endif
923 ++ /* list of addr_id */
924 ++ __u8 addrs_id;
925 ++};
926 ++
927 ++struct mp_fail {
928 ++ __u8 kind;
929 ++ __u8 len;
930 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
931 ++ __u16 rsv1:4,
932 ++ sub:4,
933 ++ rsv2:8;
934 ++#elif defined(__BIG_ENDIAN_BITFIELD)
935 ++ __u16 sub:4,
936 ++ rsv1:4,
937 ++ rsv2:8;
938 ++#else
939 ++#error "Adjust your <asm/byteorder.h> defines"
940 ++#endif
941 ++ __be64 data_seq;
942 ++} __attribute__((__packed__));
943 ++
944 ++struct mp_fclose {
945 ++ __u8 kind;
946 ++ __u8 len;
947 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
948 ++ __u16 rsv1:4,
949 ++ sub:4,
950 ++ rsv2:8;
951 ++#elif defined(__BIG_ENDIAN_BITFIELD)
952 ++ __u16 sub:4,
953 ++ rsv1:4,
954 ++ rsv2:8;
955 ++#else
956 ++#error "Adjust your <asm/byteorder.h> defines"
957 ++#endif
958 ++ __u64 key;
959 ++} __attribute__((__packed__));
960 ++
961 ++struct mp_prio {
962 ++ __u8 kind;
963 ++ __u8 len;
964 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
965 ++ __u8 b:1,
966 ++ rsv:3,
967 ++ sub:4;
968 ++#elif defined(__BIG_ENDIAN_BITFIELD)
969 ++ __u8 sub:4,
970 ++ rsv:3,
971 ++ b:1;
972 ++#else
973 ++#error "Adjust your <asm/byteorder.h> defines"
974 ++#endif
975 ++ __u8 addr_id;
976 ++} __attribute__((__packed__));
977 ++
978 ++static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
979 ++{
980 ++ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
981 ++}
982 ++
983 ++#define MPTCP_APP 2
984 ++
985 ++extern int sysctl_mptcp_enabled;
986 ++extern int sysctl_mptcp_checksum;
987 ++extern int sysctl_mptcp_debug;
988 ++extern int sysctl_mptcp_syn_retries;
989 ++
990 ++extern struct workqueue_struct *mptcp_wq;
991 ++
992 ++#define mptcp_debug(fmt, args...) \
993 ++ do { \
994 ++ if (unlikely(sysctl_mptcp_debug)) \
995 ++ pr_err(__FILE__ ": " fmt, ##args); \
996 ++ } while (0)
997 ++
998 ++/* Iterates over all subflows */
999 ++#define mptcp_for_each_tp(mpcb, tp) \
1000 ++ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
1001 ++
1002 ++#define mptcp_for_each_sk(mpcb, sk) \
1003 ++ for ((sk) = (struct sock *)(mpcb)->connection_list; \
1004 ++ sk; \
1005 ++ sk = (struct sock *)tcp_sk(sk)->mptcp->next)
1006 ++
1007 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
1008 ++ for (__sk = (struct sock *)(__mpcb)->connection_list, \
1009 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
1010 ++ __sk; \
1011 ++ __sk = __temp, \
1012 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
1013 ++
1014 ++/* Iterates over all bit set to 1 in a bitset */
1015 ++#define mptcp_for_each_bit_set(b, i) \
1016 ++ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
1017 ++
1018 ++#define mptcp_for_each_bit_unset(b, i) \
1019 ++ mptcp_for_each_bit_set(~b, i)
1020 ++
1021 ++extern struct lock_class_key meta_key;
1022 ++extern struct lock_class_key meta_slock_key;
1023 ++extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
1024 ++
1025 ++/* This is needed to ensure that two subsequent key/nonce-generation result in
1026 ++ * different keys/nonces if the IPs and ports are the same.
1027 ++ */
1028 ++extern u32 mptcp_seed;
1029 ++
1030 ++#define MPTCP_HASH_SIZE 1024
1031 ++
1032 ++extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
1033 ++
1034 ++/* This second hashtable is needed to retrieve request socks
1035 ++ * created as a result of a join request. While the SYN contains
1036 ++ * the token, the final ack does not, so we need a separate hashtable
1037 ++ * to retrieve the mpcb.
1038 ++ */
1039 ++extern struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
1040 ++extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
1041 ++
1042 ++/* Lock, protecting the two hash-tables that hold the token. Namely,
1043 ++ * mptcp_reqsk_tk_htb and tk_hashtable
1044 ++ */
1045 ++extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
1046 ++
1047 ++/* Request-sockets can be hashed in the tk_htb for collision-detection or in
1048 ++ * the regular htb for join-connections. We need to define different NULLS
1049 ++ * values so that we can correctly detect a request-socket that has been
1050 ++ * recycled. See also c25eb3bfb9729.
1051 ++ */
1052 ++#define MPTCP_REQSK_NULLS_BASE (1U << 29)
1053 ++
1054 ++
1055 ++void mptcp_data_ready(struct sock *sk);
1056 ++void mptcp_write_space(struct sock *sk);
1057 ++
1058 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
1059 ++ struct sock *sk);
1060 ++void mptcp_ofo_queue(struct sock *meta_sk);
1061 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
1062 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
1063 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
1064 ++ gfp_t flags);
1065 ++void mptcp_del_sock(struct sock *sk);
1066 ++void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk);
1067 ++void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
1068 ++void mptcp_update_sndbuf(const struct tcp_sock *tp);
1069 ++void mptcp_send_fin(struct sock *meta_sk);
1070 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
1071 ++bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1072 ++ int push_one, gfp_t gfp);
1073 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
1074 ++ struct mptcp_options_received *mopt);
1075 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
1076 ++ struct mptcp_options_received *mopt,
1077 ++ const struct sk_buff *skb);
1078 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
1079 ++ unsigned *remaining);
1080 ++void mptcp_synack_options(struct request_sock *req,
1081 ++ struct tcp_out_options *opts,
1082 ++ unsigned *remaining);
1083 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
1084 ++ struct tcp_out_options *opts, unsigned *size);
1085 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1086 ++ const struct tcp_out_options *opts,
1087 ++ struct sk_buff *skb);
1088 ++void mptcp_close(struct sock *meta_sk, long timeout);
1089 ++int mptcp_doit(struct sock *sk);
1090 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
1091 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
1092 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
1093 ++ struct request_sock *req,
1094 ++ struct request_sock **prev);
1095 ++struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
1096 ++ struct request_sock *req,
1097 ++ struct request_sock **prev,
1098 ++ const struct mptcp_options_received *mopt);
1099 ++u32 __mptcp_select_window(struct sock *sk);
1100 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
1101 ++ __u32 *window_clamp, int wscale_ok,
1102 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
1103 ++ const struct sock *sk);
1104 ++unsigned int mptcp_current_mss(struct sock *meta_sk);
1105 ++int mptcp_select_size(const struct sock *meta_sk, bool sg);
1106 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
1107 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
1108 ++ u32 *hash_out);
1109 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
1110 ++void mptcp_fin(struct sock *meta_sk);
1111 ++void mptcp_retransmit_timer(struct sock *meta_sk);
1112 ++int mptcp_write_wakeup(struct sock *meta_sk);
1113 ++void mptcp_sub_close_wq(struct work_struct *work);
1114 ++void mptcp_sub_close(struct sock *sk, unsigned long delay);
1115 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
1116 ++void mptcp_fallback_meta_sk(struct sock *meta_sk);
1117 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
1118 ++void mptcp_ack_handler(unsigned long);
1119 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time);
1120 ++int mptcp_check_snd_buf(const struct tcp_sock *tp);
1121 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
1122 ++ const struct sk_buff *skb);
1123 ++void __init mptcp_init(void);
1124 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
1125 ++void mptcp_destroy_sock(struct sock *sk);
1126 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
1127 ++ const struct sk_buff *skb,
1128 ++ const struct mptcp_options_received *mopt);
1129 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
1130 ++ int large_allowed);
1131 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
1132 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
1133 ++void mptcp_time_wait(struct sock *sk, int state, int timeo);
1134 ++void mptcp_disconnect(struct sock *sk);
1135 ++bool mptcp_should_expand_sndbuf(const struct sock *sk);
1136 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
1137 ++void mptcp_tsq_flags(struct sock *sk);
1138 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk);
1139 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb);
1140 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
1141 ++void mptcp_hash_remove(struct tcp_sock *meta_tp);
1142 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token);
1143 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
1144 ++int mptcp_do_join_short(struct sk_buff *skb,
1145 ++ const struct mptcp_options_received *mopt,
1146 ++ struct net *net);
1147 ++void mptcp_reqsk_destructor(struct request_sock *req);
1148 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
1149 ++ const struct mptcp_options_received *mopt,
1150 ++ const struct sk_buff *skb);
1151 ++int mptcp_check_req(struct sk_buff *skb, struct net *net);
1152 ++void mptcp_connect_init(struct sock *sk);
1153 ++void mptcp_sub_force_close(struct sock *sk);
1154 ++int mptcp_sub_len_remove_addr_align(u16 bitfield);
1155 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1156 ++ const struct sk_buff *skb);
1157 ++void mptcp_init_buffer_space(struct sock *sk);
1158 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
1159 ++ struct sk_buff *skb);
1160 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb);
1161 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
1162 ++void mptcp_init_congestion_control(struct sock *sk);
1163 ++
1164 ++/* MPTCP-path-manager registration/initialization functions */
1165 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
1166 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
1167 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb);
1168 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
1169 ++void mptcp_fallback_default(struct mptcp_cb *mpcb);
1170 ++void mptcp_get_default_path_manager(char *name);
1171 ++int mptcp_set_default_path_manager(const char *name);
1172 ++extern struct mptcp_pm_ops mptcp_pm_default;
1173 ++
1174 ++/* MPTCP-scheduler registration/initialization functions */
1175 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
1176 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
1177 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb);
1178 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
1179 ++void mptcp_get_default_scheduler(char *name);
1180 ++int mptcp_set_default_scheduler(const char *name);
1181 ++extern struct mptcp_sched_ops mptcp_sched_default;
1182 ++
1183 ++static inline void mptcp_reset_synack_timer(struct sock *meta_sk,
1184 ++ unsigned long len)
1185 ++{
1186 ++ sk_reset_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer,
1187 ++ jiffies + len);
1188 ++}
1189 ++
1190 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk)
1191 ++{
1192 ++ sk_stop_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer);
1193 ++}
1194 ++
1195 ++static inline bool is_mptcp_enabled(const struct sock *sk)
1196 ++{
1197 ++ if (!sysctl_mptcp_enabled || mptcp_init_failed)
1198 ++ return false;
1199 ++
1200 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
1201 ++ return false;
1202 ++
1203 ++ return true;
1204 ++}
1205 ++
1206 ++static inline int mptcp_pi_to_flag(int pi)
1207 ++{
1208 ++ return 1 << (pi - 1);
1209 ++}
1210 ++
1211 ++static inline
1212 ++struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
1213 ++{
1214 ++ return (struct mptcp_request_sock *)req;
1215 ++}
1216 ++
1217 ++static inline
1218 ++struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
1219 ++{
1220 ++ return (struct request_sock *)req;
1221 ++}
1222 ++
1223 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1224 ++{
1225 ++ struct sock *sk_it;
1226 ++
1227 ++ if (tcp_sk(sk)->mpcb->dss_csum)
1228 ++ return false;
1229 ++
1230 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
1231 ++ if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
1232 ++ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
1233 ++ return false;
1234 ++ }
1235 ++
1236 ++ return true;
1237 ++}
1238 ++
1239 ++static inline void mptcp_push_pending_frames(struct sock *meta_sk)
1240 ++{
1241 ++ /* We check packets out and send-head here. TCP only checks the
1242 ++ * send-head. But, MPTCP also checks packets_out, as this is an
1243 ++ * indication that we might want to do opportunistic reinjection.
1244 ++ */
1245 ++ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
1246 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
1247 ++
1248 ++ /* We don't care about the MSS, because it will be set in
1249 ++ * mptcp_write_xmit.
1250 ++ */
1251 ++ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
1252 ++ }
1253 ++}
1254 ++
1255 ++static inline void mptcp_send_reset(struct sock *sk)
1256 ++{
1257 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
1258 ++ mptcp_sub_force_close(sk);
1259 ++}
1260 ++
1261 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1262 ++{
1263 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
1264 ++}
1265 ++
1266 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1267 ++{
1268 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
1269 ++}
1270 ++
1271 ++/* Is it a data-fin while in infinite mapping mode?
1272 ++ * In infinite mode, a subflow-fin is in fact a data-fin.
1273 ++ */
1274 ++static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
1275 ++ const struct tcp_sock *tp)
1276 ++{
1277 ++ return mptcp_is_data_fin(skb) ||
1278 ++ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
1279 ++}
1280 ++
1281 ++static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
1282 ++{
1283 ++ u64 data_seq_high = (u32)(data_seq >> 32);
1284 ++
1285 ++ if (mpcb->rcv_high_order[0] == data_seq_high)
1286 ++ return 0;
1287 ++ else if (mpcb->rcv_high_order[1] == data_seq_high)
1288 ++ return MPTCPHDR_SEQ64_INDEX;
1289 ++ else
1290 ++ return MPTCPHDR_SEQ64_OFO;
1291 ++}
1292 ++
1293 ++/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
1294 ++ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
1295 ++ */
1296 ++static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
1297 ++ u32 *data_seq,
1298 ++ struct mptcp_cb *mpcb)
1299 ++{
1300 ++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
1301 ++
1302 ++ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
1303 ++ u64 data_seq64 = get_unaligned_be64(ptr);
1304 ++
1305 ++ if (mpcb)
1306 ++ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
1307 ++
1308 ++ *data_seq = (u32)data_seq64;
1309 ++ ptr++;
1310 ++ } else {
1311 ++ *data_seq = get_unaligned_be32(ptr);
1312 ++ }
1313 ++
1314 ++ return ptr;
1315 ++}
1316 ++
1317 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1318 ++{
1319 ++ return tcp_sk(sk)->meta_sk;
1320 ++}
1321 ++
1322 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1323 ++{
1324 ++ return tcp_sk(tp->meta_sk);
1325 ++}
1326 ++
1327 ++static inline int is_meta_tp(const struct tcp_sock *tp)
1328 ++{
1329 ++ return tp->mpcb && mptcp_meta_tp(tp) == tp;
1330 ++}
1331 ++
1332 ++static inline int is_meta_sk(const struct sock *sk)
1333 ++{
1334 ++ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
1335 ++ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
1336 ++}
1337 ++
1338 ++static inline int is_master_tp(const struct tcp_sock *tp)
1339 ++{
1340 ++ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
1341 ++}
1342 ++
1343 ++static inline void mptcp_hash_request_remove(struct request_sock *req)
1344 ++{
1345 ++ int in_softirq = 0;
1346 ++
1347 ++ if (hlist_nulls_unhashed(&mptcp_rsk(req)->hash_entry))
1348 ++ return;
1349 ++
1350 ++ if (in_softirq()) {
1351 ++ spin_lock(&mptcp_reqsk_hlock);
1352 ++ in_softirq = 1;
1353 ++ } else {
1354 ++ spin_lock_bh(&mptcp_reqsk_hlock);
1355 ++ }
1356 ++
1357 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
1358 ++
1359 ++ if (in_softirq)
1360 ++ spin_unlock(&mptcp_reqsk_hlock);
1361 ++ else
1362 ++ spin_unlock_bh(&mptcp_reqsk_hlock);
1363 ++}
1364 ++
1365 ++static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
1366 ++{
1367 ++ mopt->saw_mpc = 0;
1368 ++ mopt->dss_csum = 0;
1369 ++ mopt->drop_me = 0;
1370 ++
1371 ++ mopt->is_mp_join = 0;
1372 ++ mopt->join_ack = 0;
1373 ++
1374 ++ mopt->saw_low_prio = 0;
1375 ++ mopt->low_prio = 0;
1376 ++
1377 ++ mopt->saw_add_addr = 0;
1378 ++ mopt->more_add_addr = 0;
1379 ++
1380 ++ mopt->saw_rem_addr = 0;
1381 ++ mopt->more_rem_addr = 0;
1382 ++
1383 ++ mopt->mp_fail = 0;
1384 ++ mopt->mp_fclose = 0;
1385 ++}
1386 ++
1387 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp)
1388 ++{
1389 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
1390 ++
1391 ++ mopt->saw_low_prio = 0;
1392 ++ mopt->saw_add_addr = 0;
1393 ++ mopt->more_add_addr = 0;
1394 ++ mopt->saw_rem_addr = 0;
1395 ++ mopt->more_rem_addr = 0;
1396 ++ mopt->join_ack = 0;
1397 ++ mopt->mp_fail = 0;
1398 ++ mopt->mp_fclose = 0;
1399 ++}
1400 ++
1401 ++static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
1402 ++ const struct mptcp_cb *mpcb)
1403 ++{
1404 ++ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
1405 ++ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
1406 ++}
1407 ++
1408 ++static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
1409 ++ u32 data_seq_32)
1410 ++{
1411 ++ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
1412 ++}
1413 ++
1414 ++static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
1415 ++{
1416 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1417 ++ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
1418 ++ meta_tp->rcv_nxt);
1419 ++}
1420 ++
1421 ++static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
1422 ++{
1423 ++ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
1424 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1425 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
1426 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
1427 ++ }
1428 ++}
1429 ++
1430 ++static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
1431 ++ u32 old_rcv_nxt)
1432 ++{
1433 ++ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
1434 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1435 ++ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
1436 ++ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
1437 ++ }
1438 ++}
1439 ++
1440 ++static inline int mptcp_sk_can_send(const struct sock *sk)
1441 ++{
1442 ++ return tcp_passive_fastopen(sk) ||
1443 ++ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1444 ++ !tcp_sk(sk)->mptcp->pre_established);
1445 ++}
1446 ++
1447 ++static inline int mptcp_sk_can_recv(const struct sock *sk)
1448 ++{
1449 ++ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
1450 ++}
1451 ++
1452 ++static inline int mptcp_sk_can_send_ack(const struct sock *sk)
1453 ++{
1454 ++ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
1455 ++ TCPF_CLOSE | TCPF_LISTEN)) &&
1456 ++ !tcp_sk(sk)->mptcp->pre_established;
1457 ++}
1458 ++
1459 ++/* Only support GSO if all subflows supports it */
1460 ++static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
1461 ++{
1462 ++ struct sock *sk;
1463 ++
1464 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1465 ++ return false;
1466 ++
1467 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1468 ++ if (!mptcp_sk_can_send(sk))
1469 ++ continue;
1470 ++ if (!sk_can_gso(sk))
1471 ++ return false;
1472 ++ }
1473 ++ return true;
1474 ++}
1475 ++
1476 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1477 ++{
1478 ++ struct sock *sk;
1479 ++
1480 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1481 ++ return false;
1482 ++
1483 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1484 ++ if (!mptcp_sk_can_send(sk))
1485 ++ continue;
1486 ++ if (!(sk->sk_route_caps & NETIF_F_SG))
1487 ++ return false;
1488 ++ }
1489 ++ return true;
1490 ++}
1491 ++
1492 ++static inline void mptcp_set_rto(struct sock *sk)
1493 ++{
1494 ++ struct tcp_sock *tp = tcp_sk(sk);
1495 ++ struct sock *sk_it;
1496 ++ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
1497 ++ __u32 max_rto = 0;
1498 ++
1499 ++ /* We are in recovery-phase on the MPTCP-level. Do not update the
1500 ++ * RTO, because this would kill exponential backoff.
1501 ++ */
1502 ++ if (micsk->icsk_retransmits)
1503 ++ return;
1504 ++
1505 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
1506 ++ if (mptcp_sk_can_send(sk_it) &&
1507 ++ inet_csk(sk_it)->icsk_rto > max_rto)
1508 ++ max_rto = inet_csk(sk_it)->icsk_rto;
1509 ++ }
1510 ++ if (max_rto) {
1511 ++ micsk->icsk_rto = max_rto << 1;
1512 ++
1513 ++ /* A successfull rto-measurement - reset backoff counter */
1514 ++ micsk->icsk_backoff = 0;
1515 ++ }
1516 ++}
1517 ++
1518 ++static inline int mptcp_sysctl_syn_retries(void)
1519 ++{
1520 ++ return sysctl_mptcp_syn_retries;
1521 ++}
1522 ++
1523 ++static inline void mptcp_sub_close_passive(struct sock *sk)
1524 ++{
1525 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
1526 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
1527 ++
1528 ++ /* Only close, if the app did a send-shutdown (passive close), and we
1529 ++ * received the data-ack of the data-fin.
1530 ++ */
1531 ++ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
1532 ++ mptcp_sub_close(sk, 0);
1533 ++}
1534 ++
1535 ++static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
1536 ++{
1537 ++ struct tcp_sock *tp = tcp_sk(sk);
1538 ++
1539 ++ /* If data has been acknowleged on the meta-level, fully_established
1540 ++ * will have been set before and thus we will not fall back to infinite
1541 ++ * mapping.
1542 ++ */
1543 ++ if (likely(tp->mptcp->fully_established))
1544 ++ return false;
1545 ++
1546 ++ if (!(flag & MPTCP_FLAG_DATA_ACKED))
1547 ++ return false;
1548 ++
1549 ++ /* Don't fallback twice ;) */
1550 ++ if (tp->mpcb->infinite_mapping_snd)
1551 ++ return false;
1552 ++
1553 ++ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
1554 ++ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
1555 ++ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
1556 ++ __builtin_return_address(0));
1557 ++ if (!is_master_tp(tp))
1558 ++ return true;
1559 ++
1560 ++ tp->mpcb->infinite_mapping_snd = 1;
1561 ++ tp->mpcb->infinite_mapping_rcv = 1;
1562 ++ tp->mptcp->fully_established = 1;
1563 ++
1564 ++ return false;
1565 ++}
1566 ++
1567 ++/* Find the first index whose bit in the bit-field == 0 */
1568 ++static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
1569 ++{
1570 ++ u8 base = mpcb->next_path_index;
1571 ++ int i;
1572 ++
1573 ++ /* Start at 1, because 0 is reserved for the meta-sk */
1574 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
1575 ++ if (i + base < 1)
1576 ++ continue;
1577 ++ if (i + base >= sizeof(mpcb->path_index_bits) * 8)
1578 ++ break;
1579 ++ i += base;
1580 ++ mpcb->path_index_bits |= (1 << i);
1581 ++ mpcb->next_path_index = i + 1;
1582 ++ return i;
1583 ++ }
1584 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
1585 ++ if (i >= sizeof(mpcb->path_index_bits) * 8)
1586 ++ break;
1587 ++ if (i < 1)
1588 ++ continue;
1589 ++ mpcb->path_index_bits |= (1 << i);
1590 ++ mpcb->next_path_index = i + 1;
1591 ++ return i;
1592 ++ }
1593 ++
1594 ++ return 0;
1595 ++}
1596 ++
1597 ++static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk)
1598 ++{
1599 ++ return sk->sk_family == AF_INET6 &&
1600 ++ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
1601 ++}
1602 ++
1603 ++/* TCP and MPTCP mpc flag-depending functions */
1604 ++u16 mptcp_select_window(struct sock *sk);
1605 ++void mptcp_init_buffer_space(struct sock *sk);
1606 ++void mptcp_tcp_set_rto(struct sock *sk);
1607 ++
1608 ++/* TCP and MPTCP flag-depending functions */
1609 ++bool mptcp_prune_ofo_queue(struct sock *sk);
1610 ++
1611 ++#else /* CONFIG_MPTCP */
1612 ++#define mptcp_debug(fmt, args...) \
1613 ++ do { \
1614 ++ } while (0)
1615 ++
1616 ++/* Without MPTCP, we just do one iteration
1617 ++ * over the only socket available. This assumes that
1618 ++ * the sk/tp arg is the socket in that case.
1619 ++ */
1620 ++#define mptcp_for_each_sk(mpcb, sk)
1621 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
1622 ++
1623 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1624 ++{
1625 ++ return false;
1626 ++}
1627 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1628 ++{
1629 ++ return false;
1630 ++}
1631 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1632 ++{
1633 ++ return NULL;
1634 ++}
1635 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1636 ++{
1637 ++ return NULL;
1638 ++}
1639 ++static inline int is_meta_sk(const struct sock *sk)
1640 ++{
1641 ++ return 0;
1642 ++}
1643 ++static inline int is_master_tp(const struct tcp_sock *tp)
1644 ++{
1645 ++ return 0;
1646 ++}
1647 ++static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
1648 ++static inline void mptcp_del_sock(const struct sock *sk) {}
1649 ++static inline void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk) {}
1650 ++static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
1651 ++static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {}
1652 ++static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
1653 ++ const struct sock *sk) {}
1654 ++static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
1655 ++static inline void mptcp_set_rto(const struct sock *sk) {}
1656 ++static inline void mptcp_send_fin(const struct sock *meta_sk) {}
1657 ++static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
1658 ++ const struct mptcp_options_received *mopt,
1659 ++ const struct sk_buff *skb) {}
1660 ++static inline void mptcp_syn_options(const struct sock *sk,
1661 ++ struct tcp_out_options *opts,
1662 ++ unsigned *remaining) {}
1663 ++static inline void mptcp_synack_options(struct request_sock *req,
1664 ++ struct tcp_out_options *opts,
1665 ++ unsigned *remaining) {}
1666 ++
1667 ++static inline void mptcp_established_options(struct sock *sk,
1668 ++ struct sk_buff *skb,
1669 ++ struct tcp_out_options *opts,
1670 ++ unsigned *size) {}
1671 ++static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1672 ++ const struct tcp_out_options *opts,
1673 ++ struct sk_buff *skb) {}
1674 ++static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
1675 ++static inline int mptcp_doit(struct sock *sk)
1676 ++{
1677 ++ return 0;
1678 ++}
1679 ++static inline int mptcp_check_req_fastopen(struct sock *child,
1680 ++ struct request_sock *req)
1681 ++{
1682 ++ return 1;
1683 ++}
1684 ++static inline int mptcp_check_req_master(const struct sock *sk,
1685 ++ const struct sock *child,
1686 ++ struct request_sock *req,
1687 ++ struct request_sock **prev)
1688 ++{
1689 ++ return 1;
1690 ++}
1691 ++static inline struct sock *mptcp_check_req_child(struct sock *sk,
1692 ++ struct sock *child,
1693 ++ struct request_sock *req,
1694 ++ struct request_sock **prev,
1695 ++ const struct mptcp_options_received *mopt)
1696 ++{
1697 ++ return NULL;
1698 ++}
1699 ++static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
1700 ++{
1701 ++ return 0;
1702 ++}
1703 ++static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
1704 ++{
1705 ++ return 0;
1706 ++}
1707 ++static inline void mptcp_sub_close_passive(struct sock *sk) {}
1708 ++static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
1709 ++{
1710 ++ return false;
1711 ++}
1712 ++static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
1713 ++static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
1714 ++{
1715 ++ return 0;
1716 ++}
1717 ++static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
1718 ++{
1719 ++ return 0;
1720 ++}
1721 ++static inline int mptcp_sysctl_syn_retries(void)
1722 ++{
1723 ++ return 0;
1724 ++}
1725 ++static inline void mptcp_send_reset(const struct sock *sk) {}
1726 ++static inline int mptcp_handle_options(struct sock *sk,
1727 ++ const struct tcphdr *th,
1728 ++ struct sk_buff *skb)
1729 ++{
1730 ++ return 0;
1731 ++}
1732 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
1733 ++static inline void __init mptcp_init(void) {}
1734 ++static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1735 ++{
1736 ++ return 0;
1737 ++}
1738 ++static inline bool mptcp_sk_can_gso(const struct sock *sk)
1739 ++{
1740 ++ return false;
1741 ++}
1742 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1743 ++{
1744 ++ return false;
1745 ++}
1746 ++static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk,
1747 ++ u32 mss_now, int large_allowed)
1748 ++{
1749 ++ return 0;
1750 ++}
1751 ++static inline void mptcp_destroy_sock(struct sock *sk) {}
1752 ++static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
1753 ++ struct sock **skptr,
1754 ++ struct sk_buff *skb,
1755 ++ const struct mptcp_options_received *mopt)
1756 ++{
1757 ++ return 0;
1758 ++}
1759 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1760 ++{
1761 ++ return false;
1762 ++}
1763 ++static inline int mptcp_init_tw_sock(struct sock *sk,
1764 ++ struct tcp_timewait_sock *tw)
1765 ++{
1766 ++ return 0;
1767 ++}
1768 ++static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
1769 ++static inline void mptcp_disconnect(struct sock *sk) {}
1770 ++static inline void mptcp_tsq_flags(struct sock *sk) {}
1771 ++static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
1772 ++static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
1773 ++static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
1774 ++static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
1775 ++ const struct tcp_options_received *rx_opt,
1776 ++ const struct mptcp_options_received *mopt,
1777 ++ const struct sk_buff *skb) {}
1778 ++static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1779 ++ const struct sk_buff *skb) {}
1780 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk) {}
1781 ++#endif /* CONFIG_MPTCP */
1782 ++
1783 ++#endif /* _MPTCP_H */
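The header above hands out subflow path indices from a bit-field: index 0 is reserved for the meta-socket, the first pass starts at next_path_index so indices are assigned round-robin, and a second pass wraps around to pick up freed slots. A minimal user-space sketch of the same allocation strategy follows; the field names mirror the header, but the stand-alone helpers and the struct are illustrative, not kernel code.

#include <stdint.h>
#include <stdio.h>

struct pi_state {
	uint32_t path_index_bits;	/* bit i set => index i in use */
	uint8_t  next_path_index;	/* round-robin starting point */
};

/* Return the first free index >= 1, preferring indices >= next_path_index,
 * then wrapping around once; 0 means "nothing free" (0 is the meta-sk). */
static uint8_t pi_alloc(struct pi_state *s)
{
	unsigned int i;

	for (i = s->next_path_index; i < 32; i++) {
		if (i >= 1 && !(s->path_index_bits & (1u << i))) {
			s->path_index_bits |= 1u << i;
			s->next_path_index = i + 1;
			return i;
		}
	}
	for (i = 1; i < 32; i++) {	/* wrap-around pass over freed slots */
		if (!(s->path_index_bits & (1u << i))) {
			s->path_index_bits |= 1u << i;
			s->next_path_index = i + 1;
			return i;
		}
	}
	return 0;
}

static void pi_free(struct pi_state *s, uint8_t idx)
{
	s->path_index_bits &= ~(1u << idx);
}

int main(void)
{
	struct pi_state s = { .path_index_bits = 1, .next_path_index = 1 };

	printf("%u %u\n", pi_alloc(&s), pi_alloc(&s));	/* prints "1 2" */
	pi_free(&s, 1);
	printf("%u\n", pi_alloc(&s));	/* prints "3": round-robin, index 1 is reused only after wrap */
	return 0;
}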
1784 +diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
1785 +new file mode 100644
1786 +index 000000000000..93ad97c77c5a
1787 +--- /dev/null
1788 ++++ b/include/net/mptcp_v4.h
1789 +@@ -0,0 +1,67 @@
1790 ++/*
1791 ++ * MPTCP implementation
1792 ++ *
1793 ++ * Initial Design & Implementation:
1794 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1795 ++ *
1796 ++ * Current Maintainer & Author:
1797 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1798 ++ *
1799 ++ * Additional authors:
1800 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1801 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1802 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1803 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1804 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1805 ++ * Andreas Ripke <ripke@××××××.eu>
1806 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1807 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1808 ++ * John Ronan <jronan@××××.org>
1809 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1810 ++ * Brandon Heller <brandonh@××××××××.edu>
1811 ++ *
1812 ++ *
1813 ++ * This program is free software; you can redistribute it and/or
1814 ++ * modify it under the terms of the GNU General Public License
1815 ++ * as published by the Free Software Foundation; either version
1816 ++ * 2 of the License, or (at your option) any later version.
1817 ++ */
1818 ++
1819 ++#ifndef MPTCP_V4_H_
1820 ++#define MPTCP_V4_H_
1821 ++
1822 ++
1823 ++#include <linux/in.h>
1824 ++#include <linux/skbuff.h>
1825 ++#include <net/mptcp.h>
1826 ++#include <net/request_sock.h>
1827 ++#include <net/sock.h>
1828 ++
1829 ++extern struct request_sock_ops mptcp_request_sock_ops;
1830 ++extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
1831 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
1832 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
1833 ++
1834 ++#ifdef CONFIG_MPTCP
1835 ++
1836 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1837 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
1838 ++ const __be32 laddr, const struct net *net);
1839 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
1840 ++ struct mptcp_rem4 *rem);
1841 ++int mptcp_pm_v4_init(void);
1842 ++void mptcp_pm_v4_undo(void);
1843 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1844 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1845 ++
1846 ++#else
1847 ++
1848 ++static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
1849 ++ const struct sk_buff *skb)
1850 ++{
1851 ++ return 0;
1852 ++}
1853 ++
1854 ++#endif /* CONFIG_MPTCP */
1855 ++
1856 ++#endif /* MPTCP_V4_H_ */
1857 +diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
1858 +new file mode 100644
1859 +index 000000000000..49a4f30ccd4d
1860 +--- /dev/null
1861 ++++ b/include/net/mptcp_v6.h
1862 +@@ -0,0 +1,69 @@
1863 ++/*
1864 ++ * MPTCP implementation
1865 ++ *
1866 ++ * Initial Design & Implementation:
1867 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1868 ++ *
1869 ++ * Current Maintainer & Author:
1870 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1871 ++ *
1872 ++ * Additional authors:
1873 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1874 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1875 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1876 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1877 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1878 ++ * Andreas Ripke <ripke@××××××.eu>
1879 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1880 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1881 ++ * John Ronan <jronan@××××.org>
1882 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1883 ++ * Brandon Heller <brandonh@××××××××.edu>
1884 ++ *
1885 ++ *
1886 ++ * This program is free software; you can redistribute it and/or
1887 ++ * modify it under the terms of the GNU General Public License
1888 ++ * as published by the Free Software Foundation; either version
1889 ++ * 2 of the License, or (at your option) any later version.
1890 ++ */
1891 ++
1892 ++#ifndef _MPTCP_V6_H
1893 ++#define _MPTCP_V6_H
1894 ++
1895 ++#include <linux/in6.h>
1896 ++#include <net/if_inet6.h>
1897 ++
1898 ++#include <net/mptcp.h>
1899 ++
1900 ++
1901 ++#ifdef CONFIG_MPTCP
1902 ++extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
1903 ++extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
1904 ++extern struct request_sock_ops mptcp6_request_sock_ops;
1905 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
1906 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
1907 ++
1908 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1909 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
1910 ++ const struct in6_addr *laddr, const struct net *net);
1911 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
1912 ++ struct mptcp_rem6 *rem);
1913 ++int mptcp_pm_v6_init(void);
1914 ++void mptcp_pm_v6_undo(void);
1915 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
1916 ++ __be16 sport, __be16 dport);
1917 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
1918 ++ __be16 sport, __be16 dport);
1919 ++
1920 ++#else /* CONFIG_MPTCP */
1921 ++
1922 ++#define mptcp_v6_mapped ipv6_mapped
1923 ++
1924 ++static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
1925 ++{
1926 ++ return 0;
1927 ++}
1928 ++
1929 ++#endif /* CONFIG_MPTCP */
1930 ++
1931 ++#endif /* _MPTCP_V6_H */
1932 +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
1933 +index 361d26077196..bae95a11c531 100644
1934 +--- a/include/net/net_namespace.h
1935 ++++ b/include/net/net_namespace.h
1936 +@@ -16,6 +16,7 @@
1937 + #include <net/netns/packet.h>
1938 + #include <net/netns/ipv4.h>
1939 + #include <net/netns/ipv6.h>
1940 ++#include <net/netns/mptcp.h>
1941 + #include <net/netns/ieee802154_6lowpan.h>
1942 + #include <net/netns/sctp.h>
1943 + #include <net/netns/dccp.h>
1944 +@@ -92,6 +93,9 @@ struct net {
1945 + #if IS_ENABLED(CONFIG_IPV6)
1946 + struct netns_ipv6 ipv6;
1947 + #endif
1948 ++#if IS_ENABLED(CONFIG_MPTCP)
1949 ++ struct netns_mptcp mptcp;
1950 ++#endif
1951 + #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
1952 + struct netns_ieee802154_lowpan ieee802154_lowpan;
1953 + #endif
1954 +diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
1955 +new file mode 100644
1956 +index 000000000000..bad418b04cc8
1957 +--- /dev/null
1958 ++++ b/include/net/netns/mptcp.h
1959 +@@ -0,0 +1,44 @@
1960 ++/*
1961 ++ * MPTCP implementation - MPTCP namespace
1962 ++ *
1963 ++ * Initial Design & Implementation:
1964 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1965 ++ *
1966 ++ * Current Maintainer:
1967 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1968 ++ *
1969 ++ * Additional authors:
1970 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1971 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1972 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1973 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1974 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1975 ++ * Andreas Ripke <ripke@××××××.eu>
1976 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1977 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1978 ++ * John Ronan <jronan@××××.org>
1979 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1980 ++ * Brandon Heller <brandonh@××××××××.edu>
1981 ++ *
1982 ++ *
1983 ++ * This program is free software; you can redistribute it and/or
1984 ++ * modify it under the terms of the GNU General Public License
1985 ++ * as published by the Free Software Foundation; either version
1986 ++ * 2 of the License, or (at your option) any later version.
1987 ++ */
1988 ++
1989 ++#ifndef __NETNS_MPTCP_H__
1990 ++#define __NETNS_MPTCP_H__
1991 ++
1992 ++#include <linux/compiler.h>
1993 ++
1994 ++enum {
1995 ++ MPTCP_PM_FULLMESH = 0,
1996 ++ MPTCP_PM_MAX
1997 ++};
1998 ++
1999 ++struct netns_mptcp {
2000 ++ void *path_managers[MPTCP_PM_MAX];
2001 ++};
2002 ++
2003 ++#endif /* __NETNS_MPTCP_H__ */
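struct netns_mptcp gives every network namespace its own array of path-manager slots indexed by the MPTCP_PM_* enum, so a path manager (here only the full-mesh one) can keep per-namespace state by registering a pointer in its slot. Below is a compact stand-alone model of that registry pattern; the struct and function names are invented for the example, only the slot-array idea comes from the header.

#include <stdio.h>
#include <stdlib.h>

enum { PM_FULLMESH = 0, PM_MAX };	/* mirrors MPTCP_PM_FULLMESH / MPTCP_PM_MAX */

struct netns {				/* one instance per network namespace */
	void *path_managers[PM_MAX];
};

struct fullmesh_priv {			/* per-namespace state of one path manager */
	int announced_addrs;
};

static int fullmesh_net_init(struct netns *net)
{
	struct fullmesh_priv *priv = calloc(1, sizeof(*priv));

	if (!priv)
		return -1;
	net->path_managers[PM_FULLMESH] = priv;	/* register in this namespace's slot */
	return 0;
}

static void fullmesh_net_exit(struct netns *net)
{
	free(net->path_managers[PM_FULLMESH]);
	net->path_managers[PM_FULLMESH] = NULL;
}

int main(void)
{
	struct netns init_net = { { NULL } };

	if (fullmesh_net_init(&init_net) == 0) {
		struct fullmesh_priv *priv = init_net.path_managers[PM_FULLMESH];

		priv->announced_addrs = 2;
		printf("fullmesh state registered, %d addrs\n", priv->announced_addrs);
		fullmesh_net_exit(&init_net);
	}
	return 0;
}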
2004 +diff --git a/include/net/request_sock.h b/include/net/request_sock.h
2005 +index 7f830ff67f08..e79e87a8e1a6 100644
2006 +--- a/include/net/request_sock.h
2007 ++++ b/include/net/request_sock.h
2008 +@@ -164,7 +164,7 @@ struct request_sock_queue {
2009 + };
2010 +
2011 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2012 +- unsigned int nr_table_entries);
2013 ++ unsigned int nr_table_entries, gfp_t flags);
2014 +
2015 + void __reqsk_queue_destroy(struct request_sock_queue *queue);
2016 + void reqsk_queue_destroy(struct request_sock_queue *queue);
2017 +diff --git a/include/net/sock.h b/include/net/sock.h
2018 +index 156350745700..0e23cae8861f 100644
2019 +--- a/include/net/sock.h
2020 ++++ b/include/net/sock.h
2021 +@@ -901,6 +901,16 @@ void sk_clear_memalloc(struct sock *sk);
2022 +
2023 + int sk_wait_data(struct sock *sk, long *timeo);
2024 +
2025 ++/* START - needed for MPTCP */
2026 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family);
2027 ++void sock_lock_init(struct sock *sk);
2028 ++
2029 ++extern struct lock_class_key af_callback_keys[AF_MAX];
2030 ++extern char *const af_family_clock_key_strings[AF_MAX+1];
2031 ++
2032 ++#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2033 ++/* END - needed for MPTCP */
2034 ++
2035 + struct request_sock_ops;
2036 + struct timewait_sock_ops;
2037 + struct inet_hashinfo;
2038 +diff --git a/include/net/tcp.h b/include/net/tcp.h
2039 +index 7286db80e8b8..ff92e74cd684 100644
2040 +--- a/include/net/tcp.h
2041 ++++ b/include/net/tcp.h
2042 +@@ -177,6 +177,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2043 + #define TCPOPT_SACK 5 /* SACK Block */
2044 + #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
2045 + #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
2046 ++#define TCPOPT_MPTCP 30
2047 + #define TCPOPT_EXP 254 /* Experimental */
2048 + /* Magic number to be after the option value for sharing TCP
2049 + * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
2050 +@@ -229,6 +230,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2051 + #define TFO_SERVER_WO_SOCKOPT1 0x400
2052 + #define TFO_SERVER_WO_SOCKOPT2 0x800
2053 +
2054 ++/* Flags from tcp_input.c for tcp_ack */
2055 ++#define FLAG_DATA 0x01 /* Incoming frame contained data. */
2056 ++#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
2057 ++#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
2058 ++#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
2059 ++#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
2060 ++#define FLAG_DATA_SACKED 0x20 /* New SACK. */
2061 ++#define FLAG_ECE 0x40 /* ECE in this ACK */
2062 ++#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
2063 ++#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
2064 ++#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
2065 ++#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
2066 ++#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
2067 ++#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
2068 ++#define MPTCP_FLAG_DATA_ACKED 0x8000
2069 ++
2070 ++#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
2071 ++#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
2072 ++#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
2073 ++#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
2074 ++
2075 + extern struct inet_timewait_death_row tcp_death_row;
2076 +
2077 + /* sysctl variables for tcp */
2078 +@@ -344,6 +366,107 @@ extern struct proto tcp_prot;
2079 + #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
2080 + #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
2081 +
2082 ++/**** START - Exports needed for MPTCP ****/
2083 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
2084 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
2085 ++
2086 ++struct mptcp_options_received;
2087 ++
2088 ++void tcp_enter_quickack_mode(struct sock *sk);
2089 ++int tcp_close_state(struct sock *sk);
2090 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
2091 ++ const struct sk_buff *skb);
2092 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent);
2093 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
2094 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
2095 ++ gfp_t gfp_mask);
2096 ++unsigned int tcp_mss_split_point(const struct sock *sk,
2097 ++ const struct sk_buff *skb,
2098 ++ unsigned int mss_now,
2099 ++ unsigned int max_segs,
2100 ++ int nonagle);
2101 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2102 ++ unsigned int cur_mss, int nonagle);
2103 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2104 ++ unsigned int cur_mss);
2105 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
2106 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
2107 ++ unsigned int mss_now);
2108 ++void __pskb_trim_head(struct sk_buff *skb, int len);
2109 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
2110 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
2111 ++void tcp_reset(struct sock *sk);
2112 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
2113 ++ const u32 ack_seq, const u32 nwin);
2114 ++bool tcp_urg_mode(const struct tcp_sock *tp);
2115 ++void tcp_ack_probe(struct sock *sk);
2116 ++void tcp_rearm_rto(struct sock *sk);
2117 ++int tcp_write_timeout(struct sock *sk);
2118 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
2119 ++ unsigned int timeout, bool syn_set);
2120 ++void tcp_write_err(struct sock *sk);
2121 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
2122 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
2123 ++ unsigned int mss_now);
2124 ++
2125 ++int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
2126 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2127 ++ struct request_sock *req);
2128 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
2129 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
2130 ++ struct flowi *fl,
2131 ++ struct request_sock *req,
2132 ++ u16 queue_mapping,
2133 ++ struct tcp_fastopen_cookie *foc);
2134 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
2135 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
2136 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
2137 ++void tcp_v4_reqsk_destructor(struct request_sock *req);
2138 ++
2139 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
2140 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2141 ++ struct request_sock *req);
2142 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
2143 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
2144 ++ struct flowi *fl, struct request_sock *req,
2145 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2146 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
2147 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
2148 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
2149 ++void tcp_v6_destroy_sock(struct sock *sk);
2150 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
2151 ++void tcp_v6_hash(struct sock *sk);
2152 ++struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb);
2153 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
2154 ++ struct request_sock *req,
2155 ++ struct dst_entry *dst);
2156 ++void tcp_v6_reqsk_destructor(struct request_sock *req);
2157 ++
2158 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2159 ++ int large_allowed);
2160 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
2161 ++
2162 ++void skb_clone_fraglist(struct sk_buff *skb);
2163 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
2164 ++
2165 ++void inet_twsk_free(struct inet_timewait_sock *tw);
2166 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
2167 ++/* These states need RST on ABORT according to RFC793 */
2168 ++static inline bool tcp_need_reset(int state)
2169 ++{
2170 ++ return (1 << state) &
2171 ++ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2172 ++ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2173 ++}
2174 ++
2175 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
2176 ++ int hlen);
2177 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
2178 ++ bool *fragstolen);
2179 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
2180 ++ struct sk_buff *from, bool *fragstolen);
2181 ++/**** END - Exports needed for MPTCP ****/
2182 ++
2183 + void tcp_tasklet_init(void);
2184 +
2185 + void tcp_v4_err(struct sk_buff *skb, u32);
2186 +@@ -440,6 +563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2187 + size_t len, int nonblock, int flags, int *addr_len);
2188 + void tcp_parse_options(const struct sk_buff *skb,
2189 + struct tcp_options_received *opt_rx,
2190 ++ struct mptcp_options_received *mopt_rx,
2191 + int estab, struct tcp_fastopen_cookie *foc);
2192 + const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
2193 +
2194 +@@ -493,14 +617,8 @@ static inline u32 tcp_cookie_time(void)
2195 +
2196 + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2197 + u16 *mssp);
2198 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
2199 +-#else
2200 +-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
2201 +- struct sk_buff *skb,
2202 +- __u16 *mss)
2203 +-{
2204 +- return 0;
2205 +-}
2206 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2207 ++ __u16 *mss);
2208 + #endif
2209 +
2210 + __u32 cookie_init_timestamp(struct request_sock *req);
2211 +@@ -516,13 +634,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
2212 + const struct tcphdr *th, u16 *mssp);
2213 + __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
2214 + __u16 *mss);
2215 +-#else
2216 +-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
2217 +- struct sk_buff *skb,
2218 +- __u16 *mss)
2219 +-{
2220 +- return 0;
2221 +-}
2222 + #endif
2223 + /* tcp_output.c */
2224 +
2225 +@@ -551,10 +662,17 @@ void tcp_send_delayed_ack(struct sock *sk);
2226 + void tcp_send_loss_probe(struct sock *sk);
2227 + bool tcp_schedule_loss_probe(struct sock *sk);
2228 +
2229 ++u16 tcp_select_window(struct sock *sk);
2230 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2231 ++ int push_one, gfp_t gfp);
2232 ++
2233 + /* tcp_input.c */
2234 + void tcp_resume_early_retransmit(struct sock *sk);
2235 + void tcp_rearm_rto(struct sock *sk);
2236 + void tcp_reset(struct sock *sk);
2237 ++void tcp_set_rto(struct sock *sk);
2238 ++bool tcp_should_expand_sndbuf(const struct sock *sk);
2239 ++bool tcp_prune_ofo_queue(struct sock *sk);
2240 +
2241 + /* tcp_timer.c */
2242 + void tcp_init_xmit_timers(struct sock *);
2243 +@@ -703,14 +821,27 @@ void tcp_send_window_probe(struct sock *sk);
2244 + */
2245 + struct tcp_skb_cb {
2246 + union {
2247 +- struct inet_skb_parm h4;
2248 ++ union {
2249 ++ struct inet_skb_parm h4;
2250 + #if IS_ENABLED(CONFIG_IPV6)
2251 +- struct inet6_skb_parm h6;
2252 ++ struct inet6_skb_parm h6;
2253 + #endif
2254 +- } header; /* For incoming frames */
2255 ++ } header; /* For incoming frames */
2256 ++#ifdef CONFIG_MPTCP
2257 ++ union { /* For MPTCP outgoing frames */
2258 ++ __u32 path_mask; /* paths that tried to send this skb */
2259 ++ __u32 dss[6]; /* DSS options */
2260 ++ };
2261 ++#endif
2262 ++ };
2263 + __u32 seq; /* Starting sequence number */
2264 + __u32 end_seq; /* SEQ + FIN + SYN + datalen */
2265 + __u32 when; /* used to compute rtt's */
2266 ++#ifdef CONFIG_MPTCP
2267 ++ __u8 mptcp_flags; /* flags for the MPTCP layer */
2268 ++ __u8 dss_off; /* Number of 4-byte words until
2269 ++ * seq-number */
2270 ++#endif
2271 + __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
2272 +
2273 + __u8 sacked; /* State flags for SACK/FACK. */
2274 +@@ -1075,7 +1206,8 @@ u32 tcp_default_init_rwnd(u32 mss);
2275 + /* Determine a window scaling and initial window to offer. */
2276 + void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
2277 + __u32 *window_clamp, int wscale_ok,
2278 +- __u8 *rcv_wscale, __u32 init_rcv_wnd);
2279 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2280 ++ const struct sock *sk);
2281 +
2282 + static inline int tcp_win_from_space(int space)
2283 + {
2284 +@@ -1084,15 +1216,34 @@ static inline int tcp_win_from_space(int space)
2285 + space - (space>>sysctl_tcp_adv_win_scale);
2286 + }
2287 +
2288 ++#ifdef CONFIG_MPTCP
2289 ++extern struct static_key mptcp_static_key;
2290 ++static inline bool mptcp(const struct tcp_sock *tp)
2291 ++{
2292 ++ return static_key_false(&mptcp_static_key) && tp->mpc;
2293 ++}
2294 ++#else
2295 ++static inline bool mptcp(const struct tcp_sock *tp)
2296 ++{
2297 ++ return 0;
2298 ++}
2299 ++#endif
2300 ++
2301 + /* Note: caller must be prepared to deal with negative returns */
2302 + static inline int tcp_space(const struct sock *sk)
2303 + {
2304 ++ if (mptcp(tcp_sk(sk)))
2305 ++ sk = tcp_sk(sk)->meta_sk;
2306 ++
2307 + return tcp_win_from_space(sk->sk_rcvbuf -
2308 + atomic_read(&sk->sk_rmem_alloc));
2309 + }
2310 +
2311 + static inline int tcp_full_space(const struct sock *sk)
2312 + {
2313 ++ if (mptcp(tcp_sk(sk)))
2314 ++ sk = tcp_sk(sk)->meta_sk;
2315 ++
2316 + return tcp_win_from_space(sk->sk_rcvbuf);
2317 + }
2318 +
2319 +@@ -1115,6 +1266,8 @@ static inline void tcp_openreq_init(struct request_sock *req,
2320 + ireq->wscale_ok = rx_opt->wscale_ok;
2321 + ireq->acked = 0;
2322 + ireq->ecn_ok = 0;
2323 ++ ireq->mptcp_rqsk = 0;
2324 ++ ireq->saw_mpc = 0;
2325 + ireq->ir_rmt_port = tcp_hdr(skb)->source;
2326 + ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
2327 + }
2328 +@@ -1585,6 +1738,11 @@ int tcp4_proc_init(void);
2329 + void tcp4_proc_exit(void);
2330 + #endif
2331 +
2332 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
2333 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
2334 ++ const struct tcp_request_sock_ops *af_ops,
2335 ++ struct sock *sk, struct sk_buff *skb);
2336 ++
2337 + /* TCP af-specific functions */
2338 + struct tcp_sock_af_ops {
2339 + #ifdef CONFIG_TCP_MD5SIG
2340 +@@ -1601,7 +1759,32 @@ struct tcp_sock_af_ops {
2341 + #endif
2342 + };
2343 +
2344 ++/* TCP/MPTCP-specific functions */
2345 ++struct tcp_sock_ops {
2346 ++ u32 (*__select_window)(struct sock *sk);
2347 ++ u16 (*select_window)(struct sock *sk);
2348 ++ void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
2349 ++ __u32 *window_clamp, int wscale_ok,
2350 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2351 ++ const struct sock *sk);
2352 ++ void (*init_buffer_space)(struct sock *sk);
2353 ++ void (*set_rto)(struct sock *sk);
2354 ++ bool (*should_expand_sndbuf)(const struct sock *sk);
2355 ++ void (*send_fin)(struct sock *sk);
2356 ++ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
2357 ++ int push_one, gfp_t gfp);
2358 ++ void (*send_active_reset)(struct sock *sk, gfp_t priority);
2359 ++ int (*write_wakeup)(struct sock *sk);
2360 ++ bool (*prune_ofo_queue)(struct sock *sk);
2361 ++ void (*retransmit_timer)(struct sock *sk);
2362 ++ void (*time_wait)(struct sock *sk, int state, int timeo);
2363 ++ void (*cleanup_rbuf)(struct sock *sk, int copied);
2364 ++ void (*init_congestion_control)(struct sock *sk);
2365 ++};
2366 ++extern const struct tcp_sock_ops tcp_specific;
2367 ++
2368 + struct tcp_request_sock_ops {
2369 ++ u16 mss_clamp;
2370 + #ifdef CONFIG_TCP_MD5SIG
2371 + struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk,
2372 + struct request_sock *req);
2373 +@@ -1611,8 +1794,39 @@ struct tcp_request_sock_ops {
2374 + const struct request_sock *req,
2375 + const struct sk_buff *skb);
2376 + #endif
2377 ++ int (*init_req)(struct request_sock *req, struct sock *sk,
2378 ++ struct sk_buff *skb);
2379 ++#ifdef CONFIG_SYN_COOKIES
2380 ++ __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
2381 ++ __u16 *mss);
2382 ++#endif
2383 ++ struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
2384 ++ const struct request_sock *req,
2385 ++ bool *strict);
2386 ++ __u32 (*init_seq)(const struct sk_buff *skb);
2387 ++ int (*send_synack)(struct sock *sk, struct dst_entry *dst,
2388 ++ struct flowi *fl, struct request_sock *req,
2389 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2390 ++ void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
2391 ++ const unsigned long timeout);
2392 + };
2393 +
2394 ++#ifdef CONFIG_SYN_COOKIES
2395 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2396 ++ struct sock *sk, struct sk_buff *skb,
2397 ++ __u16 *mss)
2398 ++{
2399 ++ return ops->cookie_init_seq(sk, skb, mss);
2400 ++}
2401 ++#else
2402 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2403 ++ struct sock *sk, struct sk_buff *skb,
2404 ++ __u16 *mss)
2405 ++{
2406 ++ return 0;
2407 ++}
2408 ++#endif
2409 ++
2410 + int tcpv4_offload_init(void);
2411 +
2412 + void tcp_v4_init(void);
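The largest change to tcp.h is the new struct tcp_sock_ops: call sites that used to invoke tcp_select_window(), tcp_send_fin(), tcp_write_xmit() and friends directly now go through tp->ops, so a plain TCP socket points at the tcp_specific table while an MPTCP meta-socket can install its own. A small stand-alone illustration of that function-pointer dispatch follows; the two ops tables and the connection struct are invented for the example, only the pattern follows the header.

#include <stdio.h>

struct conn;

struct conn_ops {			/* analogue of struct tcp_sock_ops */
	unsigned int (*select_window)(const struct conn *c);
	void (*send_fin)(struct conn *c);
};

struct conn {
	unsigned int rcv_buf;
	const struct conn_ops *ops;	/* analogue of tp->ops */
};

/* Plain variant: analogue of the tcp_specific table. */
static unsigned int plain_select_window(const struct conn *c) { return c->rcv_buf; }
static void plain_send_fin(struct conn *c) { printf("FIN on this connection\n"); }
static const struct conn_ops plain_ops = {
	.select_window = plain_select_window,
	.send_fin = plain_send_fin,
};

/* Multipath variant: advertises a shared window, closes at the meta level. */
static unsigned int mp_select_window(const struct conn *c) { return c->rcv_buf / 2; }
static void mp_send_fin(struct conn *c) { printf("DATA_FIN on the meta connection\n"); }
static const struct conn_ops mp_ops = {
	.select_window = mp_select_window,
	.send_fin = mp_send_fin,
};

static void shutdown_conn(struct conn *c)
{
	/* Call sites stay generic, just as the patched tcp_shutdown() in
	 * net/ipv4/tcp.c calls tcp_sk(sk)->ops->send_fin(sk). */
	printf("window %u\n", c->ops->select_window(c));
	c->ops->send_fin(c);
}

int main(void)
{
	struct conn a = { .rcv_buf = 65535, .ops = &plain_ops };
	struct conn b = { .rcv_buf = 65535, .ops = &mp_ops };

	shutdown_conn(&a);
	shutdown_conn(&b);
	return 0;
}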
2413 +diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
2414 +index 9cf2394f0bcf..c2634b6ed854 100644
2415 +--- a/include/uapi/linux/if.h
2416 ++++ b/include/uapi/linux/if.h
2417 +@@ -109,6 +109,9 @@ enum net_device_flags {
2418 + #define IFF_DORMANT IFF_DORMANT
2419 + #define IFF_ECHO IFF_ECHO
2420 +
2421 ++#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
2422 ++#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
2423 ++
2424 + #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
2425 + IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
2426 +
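IFF_NOMULTIPATH (0x80000) and IFF_MPBACKUP (0x100000) are new net_device flag bits that exclude an interface from MPTCP or mark it as a backup path, and the net/core/dev.c hunk further down whitelists them in __dev_change_flags(). They sit above the 16-bit range the SIOCSIFFLAGS ioctl can carry, so the hedged sketch below simply reads the full 32-bit flag word the kernel already exposes in /sys/class/net/<dev>/flags and tests the bits; the flag values are taken from this patch, and on an unpatched kernel they will never be set.

#include <stdio.h>

#define IFF_NOMULTIPATH	0x80000		/* values from the patched include/uapi/linux/if.h */
#define IFF_MPBACKUP	0x100000

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "eth0";
	char path[128];
	unsigned long flags;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/class/net/%s/flags", dev);
	f = fopen(path, "r");
	if (!f || fscanf(f, "%lx", &flags) != 1) {
		perror(path);
		return 1;
	}
	fclose(f);

	printf("%s: multipath %s%s\n", dev,
	       (flags & IFF_NOMULTIPATH) ? "disabled" : "enabled",
	       (flags & IFF_MPBACKUP) ? " (backup path)" : "");
	return 0;
}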
2427 +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
2428 +index 3b9718328d8b..487475681d84 100644
2429 +--- a/include/uapi/linux/tcp.h
2430 ++++ b/include/uapi/linux/tcp.h
2431 +@@ -112,6 +112,7 @@ enum {
2432 + #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
2433 + #define TCP_TIMESTAMP 24
2434 + #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
2435 ++#define MPTCP_ENABLED 26
2436 +
2437 + struct tcp_repair_opt {
2438 + __u32 opt_code;
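The new socket-option number MPTCP_ENABLED (26) sits next to the existing TCP_* options. Assuming the rest of this patch wires it into do_tcp_setsockopt() (the handler is not visible in this excerpt, so treat the semantics as an assumption), an application could opt a single socket in or out of MPTCP before connecting, independent of any global sysctl. A hedged usage sketch:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED 26	/* value from the patched include/uapi/linux/tcp.h */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	int on = 1;

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* Must happen before connect()/listen(); on an unpatched kernel this
	 * fails with ENOPROTOOPT and the socket stays plain TCP. */
	if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &on, sizeof(on)) < 0)
		perror("setsockopt(MPTCP_ENABLED)");

	/* ... connect() and use the socket as usual; if the peer agrees,
	 * MPTCP is negotiated transparently in the SYN exchange ... */
	close(fd);
	return 0;
}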
2439 +diff --git a/net/Kconfig b/net/Kconfig
2440 +index d92afe4204d9..96b58593ad5e 100644
2441 +--- a/net/Kconfig
2442 ++++ b/net/Kconfig
2443 +@@ -79,6 +79,7 @@ if INET
2444 + source "net/ipv4/Kconfig"
2445 + source "net/ipv6/Kconfig"
2446 + source "net/netlabel/Kconfig"
2447 ++source "net/mptcp/Kconfig"
2448 +
2449 + endif # if INET
2450 +
2451 +diff --git a/net/Makefile b/net/Makefile
2452 +index cbbbe6d657ca..244bac1435b1 100644
2453 +--- a/net/Makefile
2454 ++++ b/net/Makefile
2455 +@@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/
2456 + obj-$(CONFIG_XFRM) += xfrm/
2457 + obj-$(CONFIG_UNIX) += unix/
2458 + obj-$(CONFIG_NET) += ipv6/
2459 ++obj-$(CONFIG_MPTCP) += mptcp/
2460 + obj-$(CONFIG_PACKET) += packet/
2461 + obj-$(CONFIG_NET_KEY) += key/
2462 + obj-$(CONFIG_BRIDGE) += bridge/
2463 +diff --git a/net/core/dev.c b/net/core/dev.c
2464 +index 367a586d0c8a..215d2757fbf6 100644
2465 +--- a/net/core/dev.c
2466 ++++ b/net/core/dev.c
2467 +@@ -5420,7 +5420,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
2468 +
2469 + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2470 + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2471 +- IFF_AUTOMEDIA)) |
2472 ++ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
2473 + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2474 + IFF_ALLMULTI));
2475 +
2476 +diff --git a/net/core/request_sock.c b/net/core/request_sock.c
2477 +index 467f326126e0..909dfa13f499 100644
2478 +--- a/net/core/request_sock.c
2479 ++++ b/net/core/request_sock.c
2480 +@@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256;
2481 + EXPORT_SYMBOL(sysctl_max_syn_backlog);
2482 +
2483 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2484 +- unsigned int nr_table_entries)
2485 ++ unsigned int nr_table_entries,
2486 ++ gfp_t flags)
2487 + {
2488 + size_t lopt_size = sizeof(struct listen_sock);
2489 + struct listen_sock *lopt;
2490 +@@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
2491 + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
2492 + lopt_size += nr_table_entries * sizeof(struct request_sock *);
2493 + if (lopt_size > PAGE_SIZE)
2494 +- lopt = vzalloc(lopt_size);
2495 ++ lopt = __vmalloc(lopt_size,
2496 ++ flags | __GFP_HIGHMEM | __GFP_ZERO,
2497 ++ PAGE_KERNEL);
2498 + else
2499 +- lopt = kzalloc(lopt_size, GFP_KERNEL);
2500 ++ lopt = kzalloc(lopt_size, flags);
2501 + if (lopt == NULL)
2502 + return -ENOMEM;
2503 +
2504 +diff --git a/net/core/skbuff.c b/net/core/skbuff.c
2505 +index c1a33033cbe2..8abc5d60fbe3 100644
2506 +--- a/net/core/skbuff.c
2507 ++++ b/net/core/skbuff.c
2508 +@@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
2509 + skb_drop_list(&skb_shinfo(skb)->frag_list);
2510 + }
2511 +
2512 +-static void skb_clone_fraglist(struct sk_buff *skb)
2513 ++void skb_clone_fraglist(struct sk_buff *skb)
2514 + {
2515 + struct sk_buff *list;
2516 +
2517 +@@ -897,7 +897,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
2518 + skb->inner_mac_header += off;
2519 + }
2520 +
2521 +-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2522 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2523 + {
2524 + __copy_skb_header(new, old);
2525 +
2526 +diff --git a/net/core/sock.c b/net/core/sock.c
2527 +index 026e01f70274..359295523177 100644
2528 +--- a/net/core/sock.c
2529 ++++ b/net/core/sock.c
2530 +@@ -136,6 +136,11 @@
2531 +
2532 + #include <trace/events/sock.h>
2533 +
2534 ++#ifdef CONFIG_MPTCP
2535 ++#include <net/mptcp.h>
2536 ++#include <net/inet_common.h>
2537 ++#endif
2538 ++
2539 + #ifdef CONFIG_INET
2540 + #include <net/tcp.h>
2541 + #endif
2542 +@@ -280,7 +285,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
2543 + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
2544 + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
2545 + };
2546 +-static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2547 ++char *const af_family_clock_key_strings[AF_MAX+1] = {
2548 + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
2549 + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
2550 + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
2551 +@@ -301,7 +306,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2552 + * sk_callback_lock locking rules are per-address-family,
2553 + * so split the lock classes by using a per-AF key:
2554 + */
2555 +-static struct lock_class_key af_callback_keys[AF_MAX];
2556 ++struct lock_class_key af_callback_keys[AF_MAX];
2557 +
2558 + /* Take into consideration the size of the struct sk_buff overhead in the
2559 + * determination of these values, since that is non-constant across
2560 +@@ -422,8 +427,6 @@ static void sock_warn_obsolete_bsdism(const char *name)
2561 + }
2562 + }
2563 +
2564 +-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2565 +-
2566 + static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
2567 + {
2568 + if (sk->sk_flags & flags) {
2569 +@@ -1253,8 +1256,25 @@ lenout:
2570 + *
2571 + * (We also register the sk_lock with the lock validator.)
2572 + */
2573 +-static inline void sock_lock_init(struct sock *sk)
2574 +-{
2575 ++void sock_lock_init(struct sock *sk)
2576 ++{
2577 ++#ifdef CONFIG_MPTCP
2578 ++ /* Reclassify the lock-class for subflows */
2579 ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
2580 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
2581 ++ sock_lock_init_class_and_name(sk, "slock-AF_INET-MPTCP",
2582 ++ &meta_slock_key,
2583 ++ "sk_lock-AF_INET-MPTCP",
2584 ++ &meta_key);
2585 ++
2586 ++ /* We don't yet have the mptcp-point.
2587 ++ * Thus we still need inet_sock_destruct
2588 ++ */
2589 ++ sk->sk_destruct = inet_sock_destruct;
2590 ++ return;
2591 ++ }
2592 ++#endif
2593 ++
2594 + sock_lock_init_class_and_name(sk,
2595 + af_family_slock_key_strings[sk->sk_family],
2596 + af_family_slock_keys + sk->sk_family,
2597 +@@ -1301,7 +1321,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
2598 + }
2599 + EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
2600 +
2601 +-static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2602 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2603 + int family)
2604 + {
2605 + struct sock *sk;
2606 +diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
2607 +index 4db3c2a1679c..04cb17d4b0ce 100644
2608 +--- a/net/dccp/ipv6.c
2609 ++++ b/net/dccp/ipv6.c
2610 +@@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
2611 + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
2612 + goto drop;
2613 +
2614 +- req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
2615 ++ req = inet_reqsk_alloc(&dccp6_request_sock_ops);
2616 + if (req == NULL)
2617 + goto drop;
2618 +
2619 +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
2620 +index 05c57f0fcabe..630434db0085 100644
2621 +--- a/net/ipv4/Kconfig
2622 ++++ b/net/ipv4/Kconfig
2623 +@@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS
2624 + For further details see:
2625 + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
2626 +
2627 ++config TCP_CONG_COUPLED
2628 ++ tristate "MPTCP COUPLED CONGESTION CONTROL"
2629 ++ depends on MPTCP
2630 ++ default n
2631 ++ ---help---
2632 ++ MultiPath TCP Coupled Congestion Control
2633 ++ To enable it, just put 'coupled' in tcp_congestion_control
2634 ++
2635 ++config TCP_CONG_OLIA
2636 ++ tristate "MPTCP Opportunistic Linked Increase"
2637 ++ depends on MPTCP
2638 ++ default n
2639 ++ ---help---
2640 ++ MultiPath TCP Opportunistic Linked Increase Congestion Control
2641 ++ To enable it, just put 'olia' in tcp_congestion_control
2642 ++
2643 ++config TCP_CONG_WVEGAS
2644 ++ tristate "MPTCP WVEGAS CONGESTION CONTROL"
2645 ++ depends on MPTCP
2646 ++ default n
2647 ++ ---help---
2648 ++ wVegas congestion control for MPTCP
2649 ++ To enable it, just put 'wvegas' in tcp_congestion_control
2650 ++
2651 + choice
2652 + prompt "Default TCP congestion control"
2653 + default DEFAULT_CUBIC
2654 +@@ -584,6 +608,15 @@ choice
2655 + config DEFAULT_WESTWOOD
2656 + bool "Westwood" if TCP_CONG_WESTWOOD=y
2657 +
2658 ++ config DEFAULT_COUPLED
2659 ++ bool "Coupled" if TCP_CONG_COUPLED=y
2660 ++
2661 ++ config DEFAULT_OLIA
2662 ++ bool "Olia" if TCP_CONG_OLIA=y
2663 ++
2664 ++ config DEFAULT_WVEGAS
2665 ++ bool "Wvegas" if TCP_CONG_WVEGAS=y
2666 ++
2667 + config DEFAULT_RENO
2668 + bool "Reno"
2669 +
2670 +@@ -605,6 +638,8 @@ config DEFAULT_TCP_CONG
2671 + default "vegas" if DEFAULT_VEGAS
2672 + default "westwood" if DEFAULT_WESTWOOD
2673 + default "veno" if DEFAULT_VENO
2674 ++ default "coupled" if DEFAULT_COUPLED
2675 ++ default "wvegas" if DEFAULT_WVEGAS
2676 + default "reno" if DEFAULT_RENO
2677 + default "cubic"
2678 +
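The Kconfig additions build three MPTCP-aware congestion-control modules (coupled, olia, wvegas) and allow one of them to become the system default. At run time they are selected like any other TCP congestion control, either through net.ipv4.tcp_congestion_control or per socket with the standard TCP_CONGESTION option, as sketched below. This assumes the chosen module is built and loaded; the names "coupled", "olia" and "wvegas" only exist with these config options enabled.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	char name[16] = "olia";		/* or "coupled", "wvegas" */
	char cur[16];
	socklen_t len = sizeof(cur);

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name)) < 0)
		perror("setsockopt(TCP_CONGESTION)");	/* ENOENT if the module is absent */
	if (getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, cur, &len) == 0)
		printf("congestion control now: %.*s\n", (int)len, cur);
	close(fd);
	return 0;
}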
2679 +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
2680 +index d156b3c5f363..4afd6d8d9028 100644
2681 +--- a/net/ipv4/af_inet.c
2682 ++++ b/net/ipv4/af_inet.c
2683 +@@ -104,6 +104,7 @@
2684 + #include <net/ip_fib.h>
2685 + #include <net/inet_connection_sock.h>
2686 + #include <net/tcp.h>
2687 ++#include <net/mptcp.h>
2688 + #include <net/udp.h>
2689 + #include <net/udplite.h>
2690 + #include <net/ping.h>
2691 +@@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen);
2692 + * Create an inet socket.
2693 + */
2694 +
2695 +-static int inet_create(struct net *net, struct socket *sock, int protocol,
2696 +- int kern)
2697 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
2698 + {
2699 + struct sock *sk;
2700 + struct inet_protosw *answer;
2701 +@@ -676,6 +676,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
2702 + lock_sock(sk2);
2703 +
2704 + sock_rps_record_flow(sk2);
2705 ++
2706 ++ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
2707 ++ struct sock *sk_it = sk2;
2708 ++
2709 ++ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
2710 ++ sock_rps_record_flow(sk_it);
2711 ++
2712 ++ if (tcp_sk(sk2)->mpcb->master_sk) {
2713 ++ sk_it = tcp_sk(sk2)->mpcb->master_sk;
2714 ++
2715 ++ write_lock_bh(&sk_it->sk_callback_lock);
2716 ++ sk_it->sk_wq = newsock->wq;
2717 ++ sk_it->sk_socket = newsock;
2718 ++ write_unlock_bh(&sk_it->sk_callback_lock);
2719 ++ }
2720 ++ }
2721 ++
2722 + WARN_ON(!((1 << sk2->sk_state) &
2723 + (TCPF_ESTABLISHED | TCPF_SYN_RECV |
2724 + TCPF_CLOSE_WAIT | TCPF_CLOSE)));
2725 +@@ -1763,6 +1780,9 @@ static int __init inet_init(void)
2726 +
2727 + ip_init();
2728 +
2729 ++ /* We must initialize MPTCP before TCP. */
2730 ++ mptcp_init();
2731 ++
2732 + tcp_v4_init();
2733 +
2734 + /* Setup TCP slab cache for open requests. */
2735 +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
2736 +index 14d02ea905b6..7d734d8af19b 100644
2737 +--- a/net/ipv4/inet_connection_sock.c
2738 ++++ b/net/ipv4/inet_connection_sock.c
2739 +@@ -23,6 +23,7 @@
2740 + #include <net/route.h>
2741 + #include <net/tcp_states.h>
2742 + #include <net/xfrm.h>
2743 ++#include <net/mptcp.h>
2744 +
2745 + #ifdef INET_CSK_DEBUG
2746 + const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
2747 +@@ -465,8 +466,8 @@ no_route:
2748 + }
2749 + EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
2750 +
2751 +-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
2752 +- const u32 rnd, const u32 synq_hsize)
2753 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
2754 ++ const u32 synq_hsize)
2755 + {
2756 + return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
2757 + }
2758 +@@ -647,7 +648,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
2759 +
2760 + lopt->clock_hand = i;
2761 +
2762 +- if (lopt->qlen)
2763 ++ if (lopt->qlen && !is_meta_sk(parent))
2764 + inet_csk_reset_keepalive_timer(parent, interval);
2765 + }
2766 + EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
2767 +@@ -664,7 +665,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
2768 + const struct request_sock *req,
2769 + const gfp_t priority)
2770 + {
2771 +- struct sock *newsk = sk_clone_lock(sk, priority);
2772 ++ struct sock *newsk;
2773 ++
2774 ++ newsk = sk_clone_lock(sk, priority);
2775 +
2776 + if (newsk != NULL) {
2777 + struct inet_connection_sock *newicsk = inet_csk(newsk);
2778 +@@ -743,7 +746,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
2779 + {
2780 + struct inet_sock *inet = inet_sk(sk);
2781 + struct inet_connection_sock *icsk = inet_csk(sk);
2782 +- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
2783 ++ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
2784 ++ GFP_KERNEL);
2785 +
2786 + if (rc != 0)
2787 + return rc;
2788 +@@ -801,9 +805,14 @@ void inet_csk_listen_stop(struct sock *sk)
2789 +
2790 + while ((req = acc_req) != NULL) {
2791 + struct sock *child = req->sk;
2792 ++ bool mutex_taken = false;
2793 +
2794 + acc_req = req->dl_next;
2795 +
2796 ++ if (is_meta_sk(child)) {
2797 ++ mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
2798 ++ mutex_taken = true;
2799 ++ }
2800 + local_bh_disable();
2801 + bh_lock_sock(child);
2802 + WARN_ON(sock_owned_by_user(child));
2803 +@@ -832,6 +841,8 @@ void inet_csk_listen_stop(struct sock *sk)
2804 +
2805 + bh_unlock_sock(child);
2806 + local_bh_enable();
2807 ++ if (mutex_taken)
2808 ++ mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
2809 + sock_put(child);
2810 +
2811 + sk_acceptq_removed(sk);
2812 +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
2813 +index c86624b36a62..0ff3fe004d62 100644
2814 +--- a/net/ipv4/syncookies.c
2815 ++++ b/net/ipv4/syncookies.c
2816 +@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2817 + }
2818 + EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
2819 +
2820 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
2821 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2822 ++ __u16 *mssp)
2823 + {
2824 + const struct iphdr *iph = ip_hdr(skb);
2825 + const struct tcphdr *th = tcp_hdr(skb);
2826 +@@ -284,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2827 +
2828 + /* check for timestamp cookie support */
2829 + memset(&tcp_opt, 0, sizeof(tcp_opt));
2830 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
2831 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
2832 +
2833 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
2834 + goto out;
2835 +@@ -355,10 +356,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2836 + /* Try to redo what tcp_v4_send_synack did. */
2837 + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
2838 +
2839 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
2840 +- &req->rcv_wnd, &req->window_clamp,
2841 +- ireq->wscale_ok, &rcv_wscale,
2842 +- dst_metric(&rt->dst, RTAX_INITRWND));
2843 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
2844 ++ &req->rcv_wnd, &req->window_clamp,
2845 ++ ireq->wscale_ok, &rcv_wscale,
2846 ++ dst_metric(&rt->dst, RTAX_INITRWND), sk);
2847 +
2848 + ireq->rcv_wscale = rcv_wscale;
2849 +
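Throughout the patch, tcp_parse_options() gains a third argument: a struct mptcp_options_received pointer that callers such as cookie_v4_check() pass as NULL when they do not need MPTCP options, while MPTCP-aware paths hand in a real structure so option kind 30 (TCPOPT_MPTCP) is collected instead of skipped. The stand-alone loop below models that convention on a raw option block; the two structs are simplified stand-ins, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define TCPOPT_NOP	1
#define TCPOPT_MSS	2
#define TCPOPT_MPTCP	30	/* option kind added by this patch */

struct opts    { uint16_t mss; };
struct mp_opts { int saw_mpc; };

/* Walk TCP options; record MPTCP options only if the caller supplied mopt,
 * mirroring the tcp_parse_options(skb, &opt_rx, mopt_or_NULL, ...) convention. */
static void parse_options(const uint8_t *ptr, int len,
			  struct opts *opt, struct mp_opts *mopt)
{
	while (len > 0) {
		uint8_t kind = ptr[0], size;

		if (kind == TCPOPT_NOP) {
			ptr++; len--;
			continue;
		}
		if (len < 2 || (size = ptr[1]) < 2 || size > len)
			return;		/* malformed option block, stop parsing */
		if (kind == TCPOPT_MSS && size == 4)
			opt->mss = (ptr[2] << 8) | ptr[3];
		else if (kind == TCPOPT_MPTCP && mopt)
			mopt->saw_mpc = 1;	/* a real parser decodes the subtype here */
		ptr += size;
		len -= size;
	}
}

int main(void)
{
	/* MSS 1460, a NOP, then a minimal 4-byte option of kind 30. */
	const uint8_t block[] = { 2, 4, 0x05, 0xb4, 1, 30, 4, 0, 0 };
	struct opts o = { 0 };
	struct mp_opts m = { 0 };

	parse_options(block, sizeof(block), &o, NULL);	/* MPTCP option ignored */
	printf("mss=%u mpc=%d\n", o.mss, m.saw_mpc);
	parse_options(block, sizeof(block), &o, &m);	/* MPTCP option collected */
	printf("mss=%u mpc=%d\n", o.mss, m.saw_mpc);
	return 0;
}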
2850 +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
2851 +index 9d2118e5fbc7..2cb89f886d45 100644
2852 +--- a/net/ipv4/tcp.c
2853 ++++ b/net/ipv4/tcp.c
2854 +@@ -271,6 +271,7 @@
2855 +
2856 + #include <net/icmp.h>
2857 + #include <net/inet_common.h>
2858 ++#include <net/mptcp.h>
2859 + #include <net/tcp.h>
2860 + #include <net/xfrm.h>
2861 + #include <net/ip.h>
2862 +@@ -371,6 +372,24 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
2863 + return period;
2864 + }
2865 +
2866 ++const struct tcp_sock_ops tcp_specific = {
2867 ++ .__select_window = __tcp_select_window,
2868 ++ .select_window = tcp_select_window,
2869 ++ .select_initial_window = tcp_select_initial_window,
2870 ++ .init_buffer_space = tcp_init_buffer_space,
2871 ++ .set_rto = tcp_set_rto,
2872 ++ .should_expand_sndbuf = tcp_should_expand_sndbuf,
2873 ++ .init_congestion_control = tcp_init_congestion_control,
2874 ++ .send_fin = tcp_send_fin,
2875 ++ .write_xmit = tcp_write_xmit,
2876 ++ .send_active_reset = tcp_send_active_reset,
2877 ++ .write_wakeup = tcp_write_wakeup,
2878 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
2879 ++ .retransmit_timer = tcp_retransmit_timer,
2880 ++ .time_wait = tcp_time_wait,
2881 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
2882 ++};
2883 ++
2884 + /* Address-family independent initialization for a tcp_sock.
2885 + *
2886 + * NOTE: A lot of things set to zero explicitly by call to
2887 +@@ -419,6 +438,8 @@ void tcp_init_sock(struct sock *sk)
2888 + sk->sk_sndbuf = sysctl_tcp_wmem[1];
2889 + sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2890 +
2891 ++ tp->ops = &tcp_specific;
2892 ++
2893 + local_bh_disable();
2894 + sock_update_memcg(sk);
2895 + sk_sockets_allocated_inc(sk);
2896 +@@ -726,6 +747,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
2897 + int ret;
2898 +
2899 + sock_rps_record_flow(sk);
2900 ++
2901 ++#ifdef CONFIG_MPTCP
2902 ++ if (mptcp(tcp_sk(sk))) {
2903 ++ struct sock *sk_it;
2904 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
2905 ++ sock_rps_record_flow(sk_it);
2906 ++ }
2907 ++#endif
2908 + /*
2909 + * We can't seek on a socket input
2910 + */
2911 +@@ -821,8 +850,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
2912 + return NULL;
2913 + }
2914 +
2915 +-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2916 +- int large_allowed)
2917 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
2918 + {
2919 + struct tcp_sock *tp = tcp_sk(sk);
2920 + u32 xmit_size_goal, old_size_goal;
2921 +@@ -872,8 +900,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
2922 + {
2923 + int mss_now;
2924 +
2925 +- mss_now = tcp_current_mss(sk);
2926 +- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2927 ++ if (mptcp(tcp_sk(sk))) {
2928 ++ mss_now = mptcp_current_mss(sk);
2929 ++ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2930 ++ } else {
2931 ++ mss_now = tcp_current_mss(sk);
2932 ++ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2933 ++ }
2934 +
2935 + return mss_now;
2936 + }
2937 +@@ -892,11 +925,32 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
2938 + * is fully established.
2939 + */
2940 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2941 +- !tcp_passive_fastopen(sk)) {
2942 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2943 ++ tp->mpcb->master_sk : sk)) {
2944 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
2945 + goto out_err;
2946 + }
2947 +
2948 ++ if (mptcp(tp)) {
2949 ++ struct sock *sk_it = sk;
2950 ++
2951 ++ /* We must check this with socket-lock hold because we iterate
2952 ++ * over the subflows.
2953 ++ */
2954 ++ if (!mptcp_can_sendpage(sk)) {
2955 ++ ssize_t ret;
2956 ++
2957 ++ release_sock(sk);
2958 ++ ret = sock_no_sendpage(sk->sk_socket, page, offset,
2959 ++ size, flags);
2960 ++ lock_sock(sk);
2961 ++ return ret;
2962 ++ }
2963 ++
2964 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
2965 ++ sock_rps_record_flow(sk_it);
2966 ++ }
2967 ++
2968 + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2969 +
2970 + mss_now = tcp_send_mss(sk, &size_goal, flags);
2971 +@@ -1001,8 +1055,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
2972 + {
2973 + ssize_t res;
2974 +
2975 +- if (!(sk->sk_route_caps & NETIF_F_SG) ||
2976 +- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
2977 ++ /* If MPTCP is enabled, we check it later after establishment */
2978 ++ if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) ||
2979 ++ !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
2980 + return sock_no_sendpage(sk->sk_socket, page, offset, size,
2981 + flags);
2982 +
2983 +@@ -1018,6 +1073,9 @@ static inline int select_size(const struct sock *sk, bool sg)
2984 + const struct tcp_sock *tp = tcp_sk(sk);
2985 + int tmp = tp->mss_cache;
2986 +
2987 ++ if (mptcp(tp))
2988 ++ return mptcp_select_size(sk, sg);
2989 ++
2990 + if (sg) {
2991 + if (sk_can_gso(sk)) {
2992 + /* Small frames wont use a full page:
2993 +@@ -1100,11 +1158,18 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2994 + * is fully established.
2995 + */
2996 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2997 +- !tcp_passive_fastopen(sk)) {
2998 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2999 ++ tp->mpcb->master_sk : sk)) {
3000 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
3001 + goto do_error;
3002 + }
3003 +
3004 ++ if (mptcp(tp)) {
3005 ++ struct sock *sk_it = sk;
3006 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
3007 ++ sock_rps_record_flow(sk_it);
3008 ++ }
3009 ++
3010 + if (unlikely(tp->repair)) {
3011 + if (tp->repair_queue == TCP_RECV_QUEUE) {
3012 + copied = tcp_send_rcvq(sk, msg, size);
3013 +@@ -1132,7 +1197,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3014 + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
3015 + goto out_err;
3016 +
3017 +- sg = !!(sk->sk_route_caps & NETIF_F_SG);
3018 ++ if (mptcp(tp))
3019 ++ sg = mptcp_can_sg(sk);
3020 ++ else
3021 ++ sg = !!(sk->sk_route_caps & NETIF_F_SG);
3022 +
3023 + while (--iovlen >= 0) {
3024 + size_t seglen = iov->iov_len;
3025 +@@ -1183,8 +1251,15 @@ new_segment:
3026 +
3027 + /*
3028 + * Check whether we can use HW checksum.
3029 ++ *
3030 ++ * If dss-csum is enabled, we do not do hw-csum.
3031 ++ * In case of non-mptcp we check the
3032 ++ * device-capabilities.
3033 ++ * In case of mptcp, hw-csum's will be handled
3034 ++ * later in mptcp_write_xmit.
3035 + */
3036 +- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
3037 ++ if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) &&
3038 ++ (mptcp(tp) || sk->sk_route_caps & NETIF_F_ALL_CSUM))
3039 + skb->ip_summed = CHECKSUM_PARTIAL;
3040 +
3041 + skb_entail(sk, skb);
3042 +@@ -1422,7 +1497,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
3043 +
3044 + /* Optimize, __tcp_select_window() is not cheap. */
3045 + if (2*rcv_window_now <= tp->window_clamp) {
3046 +- __u32 new_window = __tcp_select_window(sk);
3047 ++ __u32 new_window = tp->ops->__select_window(sk);
3048 +
3049 + /* Send ACK now, if this read freed lots of space
3050 + * in our buffer. Certainly, new_window is new window.
3051 +@@ -1587,7 +1662,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
3052 + /* Clean up data we have read: This will do ACK frames. */
3053 + if (copied > 0) {
3054 + tcp_recv_skb(sk, seq, &offset);
3055 +- tcp_cleanup_rbuf(sk, copied);
3056 ++ tp->ops->cleanup_rbuf(sk, copied);
3057 + }
3058 + return copied;
3059 + }
3060 +@@ -1623,6 +1698,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3061 +
3062 + lock_sock(sk);
3063 +
3064 ++#ifdef CONFIG_MPTCP
3065 ++ if (mptcp(tp)) {
3066 ++ struct sock *sk_it;
3067 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
3068 ++ sock_rps_record_flow(sk_it);
3069 ++ }
3070 ++#endif
3071 ++
3072 + err = -ENOTCONN;
3073 + if (sk->sk_state == TCP_LISTEN)
3074 + goto out;
3075 +@@ -1761,7 +1844,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3076 + }
3077 + }
3078 +
3079 +- tcp_cleanup_rbuf(sk, copied);
3080 ++ tp->ops->cleanup_rbuf(sk, copied);
3081 +
3082 + if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
3083 + /* Install new reader */
3084 +@@ -1813,7 +1896,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3085 + if (tp->rcv_wnd == 0 &&
3086 + !skb_queue_empty(&sk->sk_async_wait_queue)) {
3087 + tcp_service_net_dma(sk, true);
3088 +- tcp_cleanup_rbuf(sk, copied);
3089 ++ tp->ops->cleanup_rbuf(sk, copied);
3090 + } else
3091 + dma_async_issue_pending(tp->ucopy.dma_chan);
3092 + }
3093 +@@ -1993,7 +2076,7 @@ skip_copy:
3094 + */
3095 +
3096 + /* Clean up data we have read: This will do ACK frames. */
3097 +- tcp_cleanup_rbuf(sk, copied);
3098 ++ tp->ops->cleanup_rbuf(sk, copied);
3099 +
3100 + release_sock(sk);
3101 + return copied;
3102 +@@ -2070,7 +2153,7 @@ static const unsigned char new_state[16] = {
3103 + /* TCP_CLOSING */ TCP_CLOSING,
3104 + };
3105 +
3106 +-static int tcp_close_state(struct sock *sk)
3107 ++int tcp_close_state(struct sock *sk)
3108 + {
3109 + int next = (int)new_state[sk->sk_state];
3110 + int ns = next & TCP_STATE_MASK;
3111 +@@ -2100,7 +2183,7 @@ void tcp_shutdown(struct sock *sk, int how)
3112 + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
3113 + /* Clear out any half completed packets. FIN if needed. */
3114 + if (tcp_close_state(sk))
3115 +- tcp_send_fin(sk);
3116 ++ tcp_sk(sk)->ops->send_fin(sk);
3117 + }
3118 + }
3119 + EXPORT_SYMBOL(tcp_shutdown);
3120 +@@ -2125,6 +2208,11 @@ void tcp_close(struct sock *sk, long timeout)
3121 + int data_was_unread = 0;
3122 + int state;
3123 +
3124 ++ if (is_meta_sk(sk)) {
3125 ++ mptcp_close(sk, timeout);
3126 ++ return;
3127 ++ }
3128 ++
3129 + lock_sock(sk);
3130 + sk->sk_shutdown = SHUTDOWN_MASK;
3131 +
3132 +@@ -2167,7 +2255,7 @@ void tcp_close(struct sock *sk, long timeout)
3133 + /* Unread data was tossed, zap the connection. */
3134 + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
3135 + tcp_set_state(sk, TCP_CLOSE);
3136 +- tcp_send_active_reset(sk, sk->sk_allocation);
3137 ++ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
3138 + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
3139 + /* Check zero linger _after_ checking for unread data. */
3140 + sk->sk_prot->disconnect(sk, 0);
3141 +@@ -2247,7 +2335,7 @@ adjudge_to_death:
3142 + struct tcp_sock *tp = tcp_sk(sk);
3143 + if (tp->linger2 < 0) {
3144 + tcp_set_state(sk, TCP_CLOSE);
3145 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3146 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
3147 + NET_INC_STATS_BH(sock_net(sk),
3148 + LINUX_MIB_TCPABORTONLINGER);
3149 + } else {
3150 +@@ -2257,7 +2345,8 @@ adjudge_to_death:
3151 + inet_csk_reset_keepalive_timer(sk,
3152 + tmo - TCP_TIMEWAIT_LEN);
3153 + } else {
3154 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
3155 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
3156 ++ tmo);
3157 + goto out;
3158 + }
3159 + }
3160 +@@ -2266,7 +2355,7 @@ adjudge_to_death:
3161 + sk_mem_reclaim(sk);
3162 + if (tcp_check_oom(sk, 0)) {
3163 + tcp_set_state(sk, TCP_CLOSE);
3164 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3165 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
3166 + NET_INC_STATS_BH(sock_net(sk),
3167 + LINUX_MIB_TCPABORTONMEMORY);
3168 + }
3169 +@@ -2291,15 +2380,6 @@ out:
3170 + }
3171 + EXPORT_SYMBOL(tcp_close);
3172 +
3173 +-/* These states need RST on ABORT according to RFC793 */
3174 +-
3175 +-static inline bool tcp_need_reset(int state)
3176 +-{
3177 +- return (1 << state) &
3178 +- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
3179 +- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
3180 +-}
3181 +-
3182 + int tcp_disconnect(struct sock *sk, int flags)
3183 + {
3184 + struct inet_sock *inet = inet_sk(sk);
3185 +@@ -2322,7 +2402,7 @@ int tcp_disconnect(struct sock *sk, int flags)
3186 + /* The last check adjusts for discrepancy of Linux wrt. RFC
3187 + * states
3188 + */
3189 +- tcp_send_active_reset(sk, gfp_any());
3190 ++ tp->ops->send_active_reset(sk, gfp_any());
3191 + sk->sk_err = ECONNRESET;
3192 + } else if (old_state == TCP_SYN_SENT)
3193 + sk->sk_err = ECONNRESET;
3194 +@@ -2340,6 +2420,13 @@ int tcp_disconnect(struct sock *sk, int flags)
3195 + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
3196 + inet_reset_saddr(sk);
3197 +
3198 ++ if (is_meta_sk(sk)) {
3199 ++ mptcp_disconnect(sk);
3200 ++ } else {
3201 ++ if (tp->inside_tk_table)
3202 ++ mptcp_hash_remove_bh(tp);
3203 ++ }
3204 ++
3205 + sk->sk_shutdown = 0;
3206 + sock_reset_flag(sk, SOCK_DONE);
3207 + tp->srtt_us = 0;
3208 +@@ -2632,6 +2719,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3209 + break;
3210 +
3211 + case TCP_DEFER_ACCEPT:
3212 ++ /* An established MPTCP-connection (mptcp(tp) only returns true
3213 ++ * if the socket is established) should not use DEFER on new
3214 ++ * subflows.
3215 ++ */
3216 ++ if (mptcp(tp))
3217 ++ break;
3218 + /* Translate value in seconds to number of retransmits */
3219 + icsk->icsk_accept_queue.rskq_defer_accept =
3220 + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3221 +@@ -2659,7 +2752,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3222 + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3223 + inet_csk_ack_scheduled(sk)) {
3224 + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3225 +- tcp_cleanup_rbuf(sk, 1);
3226 ++ tp->ops->cleanup_rbuf(sk, 1);
3227 + if (!(val & 1))
3228 + icsk->icsk_ack.pingpong = 1;
3229 + }
3230 +@@ -2699,6 +2792,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3231 + tp->notsent_lowat = val;
3232 + sk->sk_write_space(sk);
3233 + break;
3234 ++#ifdef CONFIG_MPTCP
3235 ++ case MPTCP_ENABLED:
3236 ++ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
3237 ++ if (val)
3238 ++ tp->mptcp_enabled = 1;
3239 ++ else
3240 ++ tp->mptcp_enabled = 0;
3241 ++ } else {
3242 ++ err = -EPERM;
3243 ++ }
3244 ++ break;
3245 ++#endif
3246 + default:
3247 + err = -ENOPROTOOPT;
3248 + break;
3249 +@@ -2931,6 +3036,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3250 + case TCP_NOTSENT_LOWAT:
3251 + val = tp->notsent_lowat;
3252 + break;
3253 ++#ifdef CONFIG_MPTCP
3254 ++ case MPTCP_ENABLED:
3255 ++ val = tp->mptcp_enabled;
3256 ++ break;
3257 ++#endif
3258 + default:
3259 + return -ENOPROTOOPT;
3260 + }
3261 +@@ -3120,8 +3230,11 @@ void tcp_done(struct sock *sk)
3262 + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3263 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3264 +
3265 ++ WARN_ON(sk->sk_state == TCP_CLOSE);
3266 + tcp_set_state(sk, TCP_CLOSE);
3267 ++
3268 + tcp_clear_xmit_timers(sk);
3269 ++
3270 + if (req != NULL)
3271 + reqsk_fastopen_remove(sk, req, false);
3272 +
3273 +diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
3274 +index 9771563ab564..5c230d96c4c1 100644
3275 +--- a/net/ipv4/tcp_fastopen.c
3276 ++++ b/net/ipv4/tcp_fastopen.c
3277 +@@ -7,6 +7,7 @@
3278 + #include <linux/rculist.h>
3279 + #include <net/inetpeer.h>
3280 + #include <net/tcp.h>
3281 ++#include <net/mptcp.h>
3282 +
3283 + int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
3284 +
3285 +@@ -133,7 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3286 + {
3287 + struct tcp_sock *tp;
3288 + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
3289 +- struct sock *child;
3290 ++ struct sock *child, *meta_sk;
3291 +
3292 + req->num_retrans = 0;
3293 + req->num_timeout = 0;
3294 +@@ -176,13 +177,6 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3295 + /* Add the child socket directly into the accept queue */
3296 + inet_csk_reqsk_queue_add(sk, req, child);
3297 +
3298 +- /* Now finish processing the fastopen child socket. */
3299 +- inet_csk(child)->icsk_af_ops->rebuild_header(child);
3300 +- tcp_init_congestion_control(child);
3301 +- tcp_mtup_init(child);
3302 +- tcp_init_metrics(child);
3303 +- tcp_init_buffer_space(child);
3304 +-
3305 + /* Queue the data carried in the SYN packet. We need to first
3306 + * bump skb's refcnt because the caller will attempt to free it.
3307 + *
3308 +@@ -199,8 +193,24 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3309 + tp->syn_data_acked = 1;
3310 + }
3311 + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3312 ++
3313 ++ meta_sk = child;
3314 ++ if (!mptcp_check_req_fastopen(meta_sk, req)) {
3315 ++ child = tcp_sk(meta_sk)->mpcb->master_sk;
3316 ++ tp = tcp_sk(child);
3317 ++ }
3318 ++
3319 ++ /* Now finish processing the fastopen child socket. */
3320 ++ inet_csk(child)->icsk_af_ops->rebuild_header(child);
3321 ++ tp->ops->init_congestion_control(child);
3322 ++ tcp_mtup_init(child);
3323 ++ tcp_init_metrics(child);
3324 ++ tp->ops->init_buffer_space(child);
3325 ++
3326 + sk->sk_data_ready(sk);
3327 +- bh_unlock_sock(child);
3328 ++ if (mptcp(tcp_sk(child)))
3329 ++ bh_unlock_sock(child);
3330 ++ bh_unlock_sock(meta_sk);
3331 + sock_put(child);
3332 + WARN_ON(req->sk == NULL);
3333 + return true;
3334 +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
3335 +index 40639c288dc2..3273bb69f387 100644
3336 +--- a/net/ipv4/tcp_input.c
3337 ++++ b/net/ipv4/tcp_input.c
3338 +@@ -74,6 +74,9 @@
3339 + #include <linux/ipsec.h>
3340 + #include <asm/unaligned.h>
3341 + #include <net/netdma.h>
3342 ++#include <net/mptcp.h>
3343 ++#include <net/mptcp_v4.h>
3344 ++#include <net/mptcp_v6.h>
3345 +
3346 + int sysctl_tcp_timestamps __read_mostly = 1;
3347 + int sysctl_tcp_window_scaling __read_mostly = 1;
3348 +@@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly;
3349 + int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
3350 + int sysctl_tcp_early_retrans __read_mostly = 3;
3351 +
3352 +-#define FLAG_DATA 0x01 /* Incoming frame contained data. */
3353 +-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
3354 +-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
3355 +-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
3356 +-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
3357 +-#define FLAG_DATA_SACKED 0x20 /* New SACK. */
3358 +-#define FLAG_ECE 0x40 /* ECE in this ACK */
3359 +-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
3360 +-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
3361 +-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
3362 +-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
3363 +-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
3364 +-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
3365 +-
3366 +-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
3367 +-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
3368 +-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
3369 +-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
3370 +-
3371 + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
3372 + #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
3373 +
3374 +@@ -181,7 +165,7 @@ static void tcp_incr_quickack(struct sock *sk)
3375 + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
3376 + }
3377 +
3378 +-static void tcp_enter_quickack_mode(struct sock *sk)
3379 ++void tcp_enter_quickack_mode(struct sock *sk)
3380 + {
3381 + struct inet_connection_sock *icsk = inet_csk(sk);
3382 + tcp_incr_quickack(sk);
3383 +@@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
3384 + per_mss = roundup_pow_of_two(per_mss) +
3385 + SKB_DATA_ALIGN(sizeof(struct sk_buff));
3386 +
3387 +- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3388 +- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3389 ++ if (mptcp(tp)) {
3390 ++ nr_segs = mptcp_check_snd_buf(tp);
3391 ++ } else {
3392 ++ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3393 ++ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3394 ++ }
3395 +
3396 + /* Fast Recovery (RFC 5681 3.2) :
3397 + * Cubic needs 1.7 factor, rounded to 2 to include
3398 +@@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
3399 + */
3400 + sndmem = 2 * nr_segs * per_mss;
3401 +
3402 +- if (sk->sk_sndbuf < sndmem)
3403 ++ /* MPTCP: after this sndmem is the new contribution of the
3404 ++ * current subflow to the aggregated sndbuf */
3405 ++ if (sk->sk_sndbuf < sndmem) {
3406 ++ int old_sndbuf = sk->sk_sndbuf;
3407 + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
3408 ++ /* MPTCP: ok, the subflow sndbuf has grown, reflect
3409 ++ * this in the aggregate buffer.*/
3410 ++ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
3411 ++ mptcp_update_sndbuf(tp);
3412 ++ }
3413 + }
3414 +
3415 + /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
3416 +@@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
3417 + static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3418 + {
3419 + struct tcp_sock *tp = tcp_sk(sk);
3420 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
3421 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
3422 +
3423 + /* Check #1 */
3424 +- if (tp->rcv_ssthresh < tp->window_clamp &&
3425 +- (int)tp->rcv_ssthresh < tcp_space(sk) &&
3426 ++ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
3427 ++ (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
3428 + !sk_under_memory_pressure(sk)) {
3429 + int incr;
3430 +
3431 +@@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3432 + * will fit to rcvbuf in future.
3433 + */
3434 + if (tcp_win_from_space(skb->truesize) <= skb->len)
3435 +- incr = 2 * tp->advmss;
3436 ++ incr = 2 * meta_tp->advmss;
3437 + else
3438 +- incr = __tcp_grow_window(sk, skb);
3439 ++ incr = __tcp_grow_window(meta_sk, skb);
3440 +
3441 + if (incr) {
3442 + incr = max_t(int, incr, 2 * skb->len);
3443 +- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
3444 +- tp->window_clamp);
3445 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
3446 ++ meta_tp->window_clamp);
3447 + inet_csk(sk)->icsk_ack.quick |= 1;
3448 + }
3449 + }
3450 +@@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
3451 + int copied;
3452 +
3453 + time = tcp_time_stamp - tp->rcvq_space.time;
3454 +- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3455 ++ if (mptcp(tp)) {
3456 ++ if (mptcp_check_rtt(tp, time))
3457 ++ return;
3458 ++ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3459 + return;
3460 +
3461 + /* Number of bytes copied to user in last RTT */
3462 +@@ -761,7 +762,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
3463 + /* Calculate rto without backoff. This is the second half of Van Jacobson's
3464 + * routine referred to above.
3465 + */
3466 +-static void tcp_set_rto(struct sock *sk)
3467 ++void tcp_set_rto(struct sock *sk)
3468 + {
3469 + const struct tcp_sock *tp = tcp_sk(sk);
3470 + /* Old crap is replaced with new one. 8)
3471 +@@ -1376,7 +1377,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
3472 + int len;
3473 + int in_sack;
3474 +
3475 +- if (!sk_can_gso(sk))
3476 ++ /* For MPTCP we cannot shift skb-data and remove one skb from the
3477 ++ * send-queue, because this will make us lose the DSS-option (which
3478 ++ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
3479 ++ */
3480 ++ if (!sk_can_gso(sk) || mptcp(tp))
3481 + goto fallback;
3482 +
3483 + /* Normally R but no L won't result in plain S */
3484 +@@ -2915,7 +2920,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
3485 + return false;
3486 +
3487 + tcp_rtt_estimator(sk, seq_rtt_us);
3488 +- tcp_set_rto(sk);
3489 ++ tp->ops->set_rto(sk);
3490 +
3491 + /* RFC6298: only reset backoff on valid RTT measurement. */
3492 + inet_csk(sk)->icsk_backoff = 0;
3493 +@@ -3000,7 +3005,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
3494 + }
3495 +
3496 + /* If we get here, the whole TSO packet has not been acked. */
3497 +-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3498 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3499 + {
3500 + struct tcp_sock *tp = tcp_sk(sk);
3501 + u32 packets_acked;
3502 +@@ -3095,6 +3100,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3503 + */
3504 + if (!(scb->tcp_flags & TCPHDR_SYN)) {
3505 + flag |= FLAG_DATA_ACKED;
3506 ++ if (mptcp(tp) && mptcp_is_data_seq(skb))
3507 ++ flag |= MPTCP_FLAG_DATA_ACKED;
3508 + } else {
3509 + flag |= FLAG_SYN_ACKED;
3510 + tp->retrans_stamp = 0;
3511 +@@ -3189,7 +3196,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3512 + return flag;
3513 + }
3514 +
3515 +-static void tcp_ack_probe(struct sock *sk)
3516 ++void tcp_ack_probe(struct sock *sk)
3517 + {
3518 + const struct tcp_sock *tp = tcp_sk(sk);
3519 + struct inet_connection_sock *icsk = inet_csk(sk);
3520 +@@ -3236,9 +3243,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3521 + /* Check that window update is acceptable.
3522 + * The function assumes that snd_una<=ack<=snd_next.
3523 + */
3524 +-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3525 +- const u32 ack, const u32 ack_seq,
3526 +- const u32 nwin)
3527 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
3528 ++ const u32 ack_seq, const u32 nwin)
3529 + {
3530 + return after(ack, tp->snd_una) ||
3531 + after(ack_seq, tp->snd_wl1) ||
3532 +@@ -3357,7 +3363,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3533 + }
3534 +
3535 + /* This routine deals with incoming acks, but not outgoing ones. */
3536 +-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3537 ++static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3538 + {
3539 + struct inet_connection_sock *icsk = inet_csk(sk);
3540 + struct tcp_sock *tp = tcp_sk(sk);
3541 +@@ -3449,6 +3455,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3542 + sack_rtt_us);
3543 + acked -= tp->packets_out;
3544 +
3545 ++ if (mptcp(tp)) {
3546 ++ if (mptcp_fallback_infinite(sk, flag)) {
3547 ++ pr_err("%s resetting flow\n", __func__);
3548 ++ mptcp_send_reset(sk);
3549 ++ goto invalid_ack;
3550 ++ }
3551 ++
3552 ++ mptcp_clean_rtx_infinite(skb, sk);
3553 ++ }
3554 ++
3555 + /* Advance cwnd if state allows */
3556 + if (tcp_may_raise_cwnd(sk, flag))
3557 + tcp_cong_avoid(sk, ack, acked);
3558 +@@ -3512,8 +3528,9 @@ old_ack:
3559 + * the fast version below fails.
3560 + */
3561 + void tcp_parse_options(const struct sk_buff *skb,
3562 +- struct tcp_options_received *opt_rx, int estab,
3563 +- struct tcp_fastopen_cookie *foc)
3564 ++ struct tcp_options_received *opt_rx,
3565 ++ struct mptcp_options_received *mopt,
3566 ++ int estab, struct tcp_fastopen_cookie *foc)
3567 + {
3568 + const unsigned char *ptr;
3569 + const struct tcphdr *th = tcp_hdr(skb);
3570 +@@ -3596,6 +3613,9 @@ void tcp_parse_options(const struct sk_buff *skb,
3571 + */
3572 + break;
3573 + #endif
3574 ++ case TCPOPT_MPTCP:
3575 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
3576 ++ break;
3577 + case TCPOPT_EXP:
3578 + /* Fast Open option shares code 254 using a
3579 + * 16 bits magic number. It's valid only in
3580 +@@ -3657,8 +3677,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3581 + if (tcp_parse_aligned_timestamp(tp, th))
3582 + return true;
3583 + }
3584 +-
3585 +- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3586 ++ tcp_parse_options(skb, &tp->rx_opt, mptcp(tp) ? &tp->mptcp->rx_opt : NULL,
3587 ++ 1, NULL);
3588 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3589 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3590 +
3591 +@@ -3831,6 +3851,8 @@ static void tcp_fin(struct sock *sk)
3592 + dst = __sk_dst_get(sk);
3593 + if (!dst || !dst_metric(dst, RTAX_QUICKACK))
3594 + inet_csk(sk)->icsk_ack.pingpong = 1;
3595 ++ if (mptcp(tp))
3596 ++ mptcp_sub_close_passive(sk);
3597 + break;
3598 +
3599 + case TCP_CLOSE_WAIT:
3600 +@@ -3852,9 +3874,16 @@ static void tcp_fin(struct sock *sk)
3601 + tcp_set_state(sk, TCP_CLOSING);
3602 + break;
3603 + case TCP_FIN_WAIT2:
3604 ++ if (mptcp(tp)) {
3605 ++ /* The socket will get closed by mptcp_data_ready.
3606 ++ * We first have to process all data-sequences.
3607 ++ */
3608 ++ tp->close_it = 1;
3609 ++ break;
3610 ++ }
3611 + /* Received a FIN -- send ACK and enter TIME_WAIT. */
3612 + tcp_send_ack(sk);
3613 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3614 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
3615 + break;
3616 + default:
3617 + /* Only TCP_LISTEN and TCP_CLOSE are left, in these
3618 +@@ -3876,6 +3905,10 @@ static void tcp_fin(struct sock *sk)
3619 + if (!sock_flag(sk, SOCK_DEAD)) {
3620 + sk->sk_state_change(sk);
3621 +
3622 ++ /* Don't wake up MPTCP-subflows */
3623 ++ if (mptcp(tp))
3624 ++ return;
3625 ++
3626 + /* Do not send POLL_HUP for half duplex close. */
3627 + if (sk->sk_shutdown == SHUTDOWN_MASK ||
3628 + sk->sk_state == TCP_CLOSE)
3629 +@@ -4073,7 +4106,11 @@ static void tcp_ofo_queue(struct sock *sk)
3630 + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
3631 + }
3632 +
3633 +- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3634 ++ /* In case of MPTCP, the segment may be empty if it's a
3635 ++ * non-data DATA_FIN. (see beginning of tcp_data_queue)
3636 ++ */
3637 ++ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
3638 ++ !(mptcp(tp) && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
3639 + SOCK_DEBUG(sk, "ofo packet was already received\n");
3640 + __skb_unlink(skb, &tp->out_of_order_queue);
3641 + __kfree_skb(skb);
3642 +@@ -4091,12 +4128,14 @@ static void tcp_ofo_queue(struct sock *sk)
3643 + }
3644 + }
3645 +
3646 +-static bool tcp_prune_ofo_queue(struct sock *sk);
3647 + static int tcp_prune_queue(struct sock *sk);
3648 +
3649 + static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3650 + unsigned int size)
3651 + {
3652 ++ if (mptcp(tcp_sk(sk)))
3653 ++ sk = mptcp_meta_sk(sk);
3654 ++
3655 + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3656 + !sk_rmem_schedule(sk, skb, size)) {
3657 +
3658 +@@ -4104,7 +4143,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3659 + return -1;
3660 +
3661 + if (!sk_rmem_schedule(sk, skb, size)) {
3662 +- if (!tcp_prune_ofo_queue(sk))
3663 ++ if (!tcp_sk(sk)->ops->prune_ofo_queue(sk))
3664 + return -1;
3665 +
3666 + if (!sk_rmem_schedule(sk, skb, size))
3667 +@@ -4127,15 +4166,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3668 + * Better try to coalesce them right now to avoid future collapses.
3669 + * Returns true if caller should free @from instead of queueing it
3670 + */
3671 +-static bool tcp_try_coalesce(struct sock *sk,
3672 +- struct sk_buff *to,
3673 +- struct sk_buff *from,
3674 +- bool *fragstolen)
3675 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
3676 ++ bool *fragstolen)
3677 + {
3678 + int delta;
3679 +
3680 + *fragstolen = false;
3681 +
3682 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
3683 ++ return false;
3684 ++
3685 + if (tcp_hdr(from)->fin)
3686 + return false;
3687 +
3688 +@@ -4225,7 +4265,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3689 +
3690 + /* Do skb overlap to previous one? */
3691 + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
3692 +- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3693 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3694 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
3695 ++ !(mptcp(tp) && end_seq == seq)) {
3696 + /* All the bits are present. Drop. */
3697 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
3698 + __kfree_skb(skb);
3699 +@@ -4263,6 +4305,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3700 + end_seq);
3701 + break;
3702 + }
3703 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3704 ++ if (mptcp(tp) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
3705 ++ continue;
3706 + __skb_unlink(skb1, &tp->out_of_order_queue);
3707 + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
3708 + TCP_SKB_CB(skb1)->end_seq);
3709 +@@ -4280,8 +4325,8 @@ end:
3710 + }
3711 + }
3712 +
3713 +-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3714 +- bool *fragstolen)
3715 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3716 ++ bool *fragstolen)
3717 + {
3718 + int eaten;
3719 + struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
3720 +@@ -4343,7 +4388,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3721 + int eaten = -1;
3722 + bool fragstolen = false;
3723 +
3724 +- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
3725 ++ /* If no data is present, but a data_fin is in the options, we still
3726 ++ * have to call mptcp_queue_skb later on. */
3727 ++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
3728 ++ !(mptcp(tp) && mptcp_is_data_fin(skb)))
3729 + goto drop;
3730 +
3731 + skb_dst_drop(skb);
3732 +@@ -4389,7 +4437,7 @@ queue_and_out:
3733 + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
3734 + }
3735 + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3736 +- if (skb->len)
3737 ++ if (skb->len || mptcp_is_data_fin(skb))
3738 + tcp_event_data_recv(sk, skb);
3739 + if (th->fin)
3740 + tcp_fin(sk);
3741 +@@ -4411,7 +4459,11 @@ queue_and_out:
3742 +
3743 + if (eaten > 0)
3744 + kfree_skb_partial(skb, fragstolen);
3745 +- if (!sock_flag(sk, SOCK_DEAD))
3746 ++ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
3747 ++ /* MPTCP: we always have to call data_ready, because
3748 ++ * we may be about to receive a data-fin, which still
3749 ++ * must get queued.
3750 ++ */
3751 + sk->sk_data_ready(sk);
3752 + return;
3753 + }
3754 +@@ -4463,6 +4515,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
3755 + next = skb_queue_next(list, skb);
3756 +
3757 + __skb_unlink(skb, list);
3758 ++ if (mptcp(tcp_sk(sk)))
3759 ++ mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
3760 + __kfree_skb(skb);
3761 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
3762 +
3763 +@@ -4630,7 +4684,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
3764 + * Purge the out-of-order queue.
3765 + * Return true if queue was pruned.
3766 + */
3767 +-static bool tcp_prune_ofo_queue(struct sock *sk)
3768 ++bool tcp_prune_ofo_queue(struct sock *sk)
3769 + {
3770 + struct tcp_sock *tp = tcp_sk(sk);
3771 + bool res = false;
3772 +@@ -4686,7 +4740,7 @@ static int tcp_prune_queue(struct sock *sk)
3773 + /* Collapsing did not help, destructive actions follow.
3774 + * This must not ever occur. */
3775 +
3776 +- tcp_prune_ofo_queue(sk);
3777 ++ tp->ops->prune_ofo_queue(sk);
3778 +
3779 + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3780 + return 0;
3781 +@@ -4702,7 +4756,29 @@ static int tcp_prune_queue(struct sock *sk)
3782 + return -1;
3783 + }
3784 +
3785 +-static bool tcp_should_expand_sndbuf(const struct sock *sk)
3786 ++/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
3787 ++ * As additional protections, we do not touch cwnd in retransmission phases,
3788 ++ * and if application hit its sndbuf limit recently.
3789 ++ */
3790 ++void tcp_cwnd_application_limited(struct sock *sk)
3791 ++{
3792 ++ struct tcp_sock *tp = tcp_sk(sk);
3793 ++
3794 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
3795 ++ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
3796 ++ /* Limited by application or receiver window. */
3797 ++ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
3798 ++ u32 win_used = max(tp->snd_cwnd_used, init_win);
3799 ++ if (win_used < tp->snd_cwnd) {
3800 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
3801 ++ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
3802 ++ }
3803 ++ tp->snd_cwnd_used = 0;
3804 ++ }
3805 ++ tp->snd_cwnd_stamp = tcp_time_stamp;
3806 ++}
3807 ++
3808 ++bool tcp_should_expand_sndbuf(const struct sock *sk)
3809 + {
3810 + const struct tcp_sock *tp = tcp_sk(sk);
3811 +
3812 +@@ -4737,7 +4813,7 @@ static void tcp_new_space(struct sock *sk)
3813 + {
3814 + struct tcp_sock *tp = tcp_sk(sk);
3815 +
3816 +- if (tcp_should_expand_sndbuf(sk)) {
3817 ++ if (tp->ops->should_expand_sndbuf(sk)) {
3818 + tcp_sndbuf_expand(sk);
3819 + tp->snd_cwnd_stamp = tcp_time_stamp;
3820 + }
3821 +@@ -4749,8 +4825,9 @@ static void tcp_check_space(struct sock *sk)
3822 + {
3823 + if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3824 + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
3825 +- if (sk->sk_socket &&
3826 +- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
3827 ++ if (mptcp(tcp_sk(sk)) ||
3828 ++ (sk->sk_socket &&
3829 ++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
3830 + tcp_new_space(sk);
3831 + }
3832 + }
3833 +@@ -4773,7 +4850,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3834 + /* ... and right edge of window advances far enough.
3835 + * (tcp_recvmsg() will send ACK otherwise). Or...
3836 + */
3837 +- __tcp_select_window(sk) >= tp->rcv_wnd) ||
3838 ++ tp->ops->__select_window(sk) >= tp->rcv_wnd) ||
3839 + /* We ACK each frame or... */
3840 + tcp_in_quickack_mode(sk) ||
3841 + /* We have out of order data. */
3842 +@@ -4875,6 +4952,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
3843 + {
3844 + struct tcp_sock *tp = tcp_sk(sk);
3845 +
3846 ++ /* MPTCP urgent data is not yet supported */
3847 ++ if (mptcp(tp))
3848 ++ return;
3849 ++
3850 + /* Check if we get a new urgent pointer - normally not. */
3851 + if (th->urg)
3852 + tcp_check_urg(sk, th);
3853 +@@ -4942,8 +5023,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
3854 + }
3855 +
3856 + #ifdef CONFIG_NET_DMA
3857 +-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
3858 +- int hlen)
3859 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
3860 + {
3861 + struct tcp_sock *tp = tcp_sk(sk);
3862 + int chunk = skb->len - hlen;
3863 +@@ -5052,9 +5132,15 @@ syn_challenge:
3864 + goto discard;
3865 + }
3866 +
3867 ++ /* If valid: post process the received MPTCP options. */
3868 ++ if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
3869 ++ goto discard;
3870 ++
3871 + return true;
3872 +
3873 + discard:
3874 ++ if (mptcp(tp))
3875 ++ mptcp_reset_mopt(tp);
3876 + __kfree_skb(skb);
3877 + return false;
3878 + }
3879 +@@ -5106,6 +5192,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3880 +
3881 + tp->rx_opt.saw_tstamp = 0;
3882 +
3883 ++ /* MPTCP: force slowpath. */
3884 ++ if (mptcp(tp))
3885 ++ goto slow_path;
3886 ++
3887 + /* pred_flags is 0xS?10 << 16 + snd_wnd
3888 + * if header_prediction is to be made
3889 + * 'S' will always be tp->tcp_header_len >> 2
3890 +@@ -5205,7 +5295,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3891 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
3892 + }
3893 + if (copied_early)
3894 +- tcp_cleanup_rbuf(sk, skb->len);
3895 ++ tp->ops->cleanup_rbuf(sk, skb->len);
3896 + }
3897 + if (!eaten) {
3898 + if (tcp_checksum_complete_user(sk, skb))
3899 +@@ -5313,14 +5403,14 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
3900 +
3901 + tcp_init_metrics(sk);
3902 +
3903 +- tcp_init_congestion_control(sk);
3904 ++ tp->ops->init_congestion_control(sk);
3905 +
3906 + /* Prevent spurious tcp_cwnd_restart() on first data
3907 + * packet.
3908 + */
3909 + tp->lsndtime = tcp_time_stamp;
3910 +
3911 +- tcp_init_buffer_space(sk);
3912 ++ tp->ops->init_buffer_space(sk);
3913 +
3914 + if (sock_flag(sk, SOCK_KEEPOPEN))
3915 + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
3916 +@@ -5350,7 +5440,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3917 + /* Get original SYNACK MSS value if user MSS sets mss_clamp */
3918 + tcp_clear_options(&opt);
3919 + opt.user_mss = opt.mss_clamp = 0;
3920 +- tcp_parse_options(synack, &opt, 0, NULL);
3921 ++ tcp_parse_options(synack, &opt, NULL, 0, NULL);
3922 + mss = opt.mss_clamp;
3923 + }
3924 +
3925 +@@ -5365,7 +5455,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3926 +
3927 + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
3928 +
3929 +- if (data) { /* Retransmit unacked data in SYN */
3930 ++ /* In the mptcp case, we do not rely on "retransmit", but instead on
3931 ++ * "transmit", because if fastopen data is not acked, the retransmission
3932 ++ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
3933 ++ */
3934 ++ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
3935 + tcp_for_write_queue_from(data, sk) {
3936 + if (data == tcp_send_head(sk) ||
3937 + __tcp_retransmit_skb(sk, data))
3938 +@@ -5388,8 +5482,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3939 + struct tcp_sock *tp = tcp_sk(sk);
3940 + struct tcp_fastopen_cookie foc = { .len = -1 };
3941 + int saved_clamp = tp->rx_opt.mss_clamp;
3942 ++ struct mptcp_options_received mopt;
3943 ++ mptcp_init_mp_opt(&mopt);
3944 +
3945 +- tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
3946 ++ tcp_parse_options(skb, &tp->rx_opt,
3947 ++ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
3948 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3949 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3950 +
3951 +@@ -5448,6 +5545,30 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3952 + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
3953 + tcp_ack(sk, skb, FLAG_SLOWPATH);
3954 +
3955 ++ if (tp->request_mptcp || mptcp(tp)) {
3956 ++ int ret;
3957 ++ ret = mptcp_rcv_synsent_state_process(sk, &sk,
3958 ++ skb, &mopt);
3959 ++
3960 ++ /* May have changed if we support MPTCP */
3961 ++ tp = tcp_sk(sk);
3962 ++ icsk = inet_csk(sk);
3963 ++
3964 ++ if (ret == 1)
3965 ++ goto reset_and_undo;
3966 ++ if (ret == 2)
3967 ++ goto discard;
3968 ++ }
3969 ++
3970 ++ if (mptcp(tp) && !is_master_tp(tp)) {
3971 ++ /* Timer for repeating the ACK until an answer
3972 ++ * arrives. Used only when establishing an additional
3973 ++ * subflow inside of an MPTCP connection.
3974 ++ */
3975 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
3976 ++ jiffies + icsk->icsk_rto);
3977 ++ }
3978 ++
3979 + /* Ok.. it's good. Set up sequence numbers and
3980 + * move to established.
3981 + */
3982 +@@ -5474,6 +5595,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3983 + tp->tcp_header_len = sizeof(struct tcphdr);
3984 + }
3985 +
3986 ++ if (mptcp(tp)) {
3987 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
3988 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
3989 ++ }
3990 ++
3991 + if (tcp_is_sack(tp) && sysctl_tcp_fack)
3992 + tcp_enable_fack(tp);
3993 +
3994 +@@ -5494,9 +5620,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3995 + tcp_rcv_fastopen_synack(sk, skb, &foc))
3996 + return -1;
3997 +
3998 +- if (sk->sk_write_pending ||
3999 ++ /* With MPTCP we cannot send data on the third ack due to the
4000 ++ * lack of option-space to combine with an MP_CAPABLE.
4001 ++ */
4002 ++ if (!mptcp(tp) && (sk->sk_write_pending ||
4003 + icsk->icsk_accept_queue.rskq_defer_accept ||
4004 +- icsk->icsk_ack.pingpong) {
4005 ++ icsk->icsk_ack.pingpong)) {
4006 + /* Save one ACK. Data will be ready after
4007 + * several ticks, if write_pending is set.
4008 + *
4009 +@@ -5536,6 +5665,7 @@ discard:
4010 + tcp_paws_reject(&tp->rx_opt, 0))
4011 + goto discard_and_undo;
4012 +
4013 ++ /* TODO - check this here for MPTCP */
4014 + if (th->syn) {
4015 + /* We see SYN without ACK. It is attempt of
4016 + * simultaneous connect with crossed SYNs.
4017 +@@ -5552,6 +5682,11 @@ discard:
4018 + tp->tcp_header_len = sizeof(struct tcphdr);
4019 + }
4020 +
4021 ++ if (mptcp(tp)) {
4022 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
4023 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4024 ++ }
4025 ++
4026 + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4027 + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4028 +
4029 +@@ -5610,6 +5745,7 @@ reset_and_undo:
4030 +
4031 + int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4032 + const struct tcphdr *th, unsigned int len)
4033 ++ __releases(&sk->sk_lock.slock)
4034 + {
4035 + struct tcp_sock *tp = tcp_sk(sk);
4036 + struct inet_connection_sock *icsk = inet_csk(sk);
4037 +@@ -5661,6 +5797,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4038 +
4039 + case TCP_SYN_SENT:
4040 + queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4041 ++ if (is_meta_sk(sk)) {
4042 ++ sk = tcp_sk(sk)->mpcb->master_sk;
4043 ++ tp = tcp_sk(sk);
4044 ++
4045 ++ /* Need to call it here, because it will announce new
4046 ++ * addresses, which can only be done after the third ack
4047 ++ * of the 3-way handshake.
4048 ++ */
4049 ++ mptcp_update_metasocket(sk, tp->meta_sk);
4050 ++ }
4051 + if (queued >= 0)
4052 + return queued;
4053 +
4054 +@@ -5668,6 +5814,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4055 + tcp_urg(sk, skb, th);
4056 + __kfree_skb(skb);
4057 + tcp_data_snd_check(sk);
4058 ++ if (mptcp(tp) && is_master_tp(tp))
4059 ++ bh_unlock_sock(sk);
4060 + return 0;
4061 + }
4062 +
4063 +@@ -5706,11 +5854,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4064 + synack_stamp = tp->lsndtime;
4065 + /* Make sure socket is routed, for correct metrics. */
4066 + icsk->icsk_af_ops->rebuild_header(sk);
4067 +- tcp_init_congestion_control(sk);
4068 ++ tp->ops->init_congestion_control(sk);
4069 +
4070 + tcp_mtup_init(sk);
4071 + tp->copied_seq = tp->rcv_nxt;
4072 +- tcp_init_buffer_space(sk);
4073 ++ tp->ops->init_buffer_space(sk);
4074 + }
4075 + smp_mb();
4076 + tcp_set_state(sk, TCP_ESTABLISHED);
4077 +@@ -5730,6 +5878,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4078 +
4079 + if (tp->rx_opt.tstamp_ok)
4080 + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4081 ++ if (mptcp(tp))
4082 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4083 +
4084 + if (req) {
4085 + /* Re-arm the timer because data may have been sent out.
4086 +@@ -5751,6 +5901,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4087 +
4088 + tcp_initialize_rcv_mss(sk);
4089 + tcp_fast_path_on(tp);
4090 ++ /* Send an ACK when establishing a new
4091 ++ * MPTCP subflow, i.e. using an MP_JOIN
4092 ++ * subtype.
4093 ++ */
4094 ++ if (mptcp(tp) && !is_master_tp(tp))
4095 ++ tcp_send_ack(sk);
4096 + break;
4097 +
4098 + case TCP_FIN_WAIT1: {
4099 +@@ -5802,7 +5958,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4100 + tmo = tcp_fin_time(sk);
4101 + if (tmo > TCP_TIMEWAIT_LEN) {
4102 + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
4103 +- } else if (th->fin || sock_owned_by_user(sk)) {
4104 ++ } else if (th->fin || mptcp_is_data_fin(skb) ||
4105 ++ sock_owned_by_user(sk)) {
4106 + /* Bad case. We could lose such FIN otherwise.
4107 + * It is not a big problem, but it looks confusing
4108 + * and not so rare event. We still can lose it now,
4109 +@@ -5811,7 +5968,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4110 + */
4111 + inet_csk_reset_keepalive_timer(sk, tmo);
4112 + } else {
4113 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
4114 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
4115 + goto discard;
4116 + }
4117 + break;
4118 +@@ -5819,7 +5976,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4119 +
4120 + case TCP_CLOSING:
4121 + if (tp->snd_una == tp->write_seq) {
4122 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4123 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
4124 + goto discard;
4125 + }
4126 + break;
4127 +@@ -5831,6 +5988,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4128 + goto discard;
4129 + }
4130 + break;
4131 ++ case TCP_CLOSE:
4132 ++ if (tp->mp_killed)
4133 ++ goto discard;
4134 + }
4135 +
4136 + /* step 6: check the URG bit */
4137 +@@ -5851,7 +6011,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4138 + */
4139 + if (sk->sk_shutdown & RCV_SHUTDOWN) {
4140 + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4141 +- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
4142 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
4143 ++ !mptcp(tp)) {
4144 ++ /* In case of mptcp, the reset is handled by
4145 ++ * mptcp_rcv_state_process
4146 ++ */
4147 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
4148 + tcp_reset(sk);
4149 + return 1;
4150 +@@ -5877,3 +6041,154 @@ discard:
4151 + return 0;
4152 + }
4153 + EXPORT_SYMBOL(tcp_rcv_state_process);
4154 ++
4155 ++static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
4156 ++{
4157 ++ struct inet_request_sock *ireq = inet_rsk(req);
4158 ++
4159 ++ if (family == AF_INET)
4160 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4161 ++ &ireq->ir_rmt_addr, port);
4162 ++#if IS_ENABLED(CONFIG_IPV6)
4163 ++ else if (family == AF_INET6)
4164 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
4165 ++ &ireq->ir_v6_rmt_addr, port);
4166 ++#endif
4167 ++}
4168 ++
4169 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
4170 ++ const struct tcp_request_sock_ops *af_ops,
4171 ++ struct sock *sk, struct sk_buff *skb)
4172 ++{
4173 ++ struct tcp_options_received tmp_opt;
4174 ++ struct request_sock *req;
4175 ++ struct tcp_sock *tp = tcp_sk(sk);
4176 ++ struct dst_entry *dst = NULL;
4177 ++ __u32 isn = TCP_SKB_CB(skb)->when;
4178 ++ bool want_cookie = false, fastopen;
4179 ++ struct flowi fl;
4180 ++ struct tcp_fastopen_cookie foc = { .len = -1 };
4181 ++ int err;
4182 ++
4183 ++
4184 ++ /* TW buckets are converted to open requests without
4185 ++ * limitations, they conserve resources and peer is
4186 ++ * evidently real one.
4187 ++ */
4188 ++ if ((sysctl_tcp_syncookies == 2 ||
4189 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4190 ++ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
4191 ++ if (!want_cookie)
4192 ++ goto drop;
4193 ++ }
4194 ++
4195 ++
4196 ++ /* Accept backlog is full. If we have already queued enough
4197 ++ * of warm entries in syn queue, drop request. It is better than
4198 ++ * clogging syn queue with openreqs with exponentially increasing
4199 ++ * timeout.
4200 ++ */
4201 ++ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4202 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4203 ++ goto drop;
4204 ++ }
4205 ++
4206 ++ req = inet_reqsk_alloc(rsk_ops);
4207 ++ if (!req)
4208 ++ goto drop;
4209 ++
4210 ++ tcp_rsk(req)->af_specific = af_ops;
4211 ++
4212 ++ tcp_clear_options(&tmp_opt);
4213 ++ tmp_opt.mss_clamp = af_ops->mss_clamp;
4214 ++ tmp_opt.user_mss = tp->rx_opt.user_mss;
4215 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, want_cookie ? NULL : &foc);
4216 ++
4217 ++ if (want_cookie && !tmp_opt.saw_tstamp)
4218 ++ tcp_clear_options(&tmp_opt);
4219 ++
4220 ++ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4221 ++ tcp_openreq_init(req, &tmp_opt, skb);
4222 ++
4223 ++ if (af_ops->init_req(req, sk, skb))
4224 ++ goto drop_and_free;
4225 ++
4226 ++ if (security_inet_conn_request(sk, skb, req))
4227 ++ goto drop_and_free;
4228 ++
4229 ++ if (!want_cookie || tmp_opt.tstamp_ok)
4230 ++ TCP_ECN_create_request(req, skb, sock_net(sk));
4231 ++
4232 ++ if (want_cookie) {
4233 ++ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
4234 ++ req->cookie_ts = tmp_opt.tstamp_ok;
4235 ++ } else if (!isn) {
4236 ++ /* VJ's idea. We save last timestamp seen
4237 ++ * from the destination in peer table, when entering
4238 ++ * state TIME-WAIT, and check against it before
4239 ++ * accepting new connection request.
4240 ++ *
4241 ++ * If "isn" is not zero, this request hit alive
4242 ++ * timewait bucket, so that all the necessary checks
4243 ++ * are made in the function processing timewait state.
4244 ++ */
4245 ++ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
4246 ++ bool strict;
4247 ++
4248 ++ dst = af_ops->route_req(sk, &fl, req, &strict);
4249 ++ if (dst && strict &&
4250 ++ !tcp_peer_is_proven(req, dst, true)) {
4251 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4252 ++ goto drop_and_release;
4253 ++ }
4254 ++ }
4255 ++ /* Kill the following clause, if you dislike this way. */
4256 ++ else if (!sysctl_tcp_syncookies &&
4257 ++ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4258 ++ (sysctl_max_syn_backlog >> 2)) &&
4259 ++ !tcp_peer_is_proven(req, dst, false)) {
4260 ++ /* Without syncookies last quarter of
4261 ++ * backlog is filled with destinations,
4262 ++ * proven to be alive.
4263 ++ * It means that we continue to communicate
4264 ++ * to destinations, already remembered
4265 ++ * to the moment of synflood.
4266 ++ */
4267 ++ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
4268 ++ rsk_ops->family);
4269 ++ goto drop_and_release;
4270 ++ }
4271 ++
4272 ++ isn = af_ops->init_seq(skb);
4273 ++ }
4274 ++ if (!dst) {
4275 ++ dst = af_ops->route_req(sk, &fl, req, NULL);
4276 ++ if (!dst)
4277 ++ goto drop_and_free;
4278 ++ }
4279 ++
4280 ++ tcp_rsk(req)->snt_isn = isn;
4281 ++ tcp_openreq_init_rwin(req, sk, dst);
4282 ++ fastopen = !want_cookie &&
4283 ++ tcp_try_fastopen(sk, skb, req, &foc, dst);
4284 ++ err = af_ops->send_synack(sk, dst, &fl, req,
4285 ++ skb_get_queue_mapping(skb), &foc);
4286 ++ if (!fastopen) {
4287 ++ if (err || want_cookie)
4288 ++ goto drop_and_free;
4289 ++
4290 ++ tcp_rsk(req)->listener = NULL;
4291 ++ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4292 ++ }
4293 ++
4294 ++ return 0;
4295 ++
4296 ++drop_and_release:
4297 ++ dst_release(dst);
4298 ++drop_and_free:
4299 ++ reqsk_free(req);
4300 ++drop:
4301 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4302 ++ return 0;
4303 ++}
4304 ++EXPORT_SYMBOL(tcp_conn_request);
4305 +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
4306 +index 77cccda1ad0c..c77017f600f1 100644
4307 +--- a/net/ipv4/tcp_ipv4.c
4308 ++++ b/net/ipv4/tcp_ipv4.c
4309 +@@ -67,6 +67,8 @@
4310 + #include <net/icmp.h>
4311 + #include <net/inet_hashtables.h>
4312 + #include <net/tcp.h>
4313 ++#include <net/mptcp.h>
4314 ++#include <net/mptcp_v4.h>
4315 + #include <net/transp_v6.h>
4316 + #include <net/ipv6.h>
4317 + #include <net/inet_common.h>
4318 +@@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
4319 + struct inet_hashinfo tcp_hashinfo;
4320 + EXPORT_SYMBOL(tcp_hashinfo);
4321 +
4322 +-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4323 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4324 + {
4325 + return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
4326 + ip_hdr(skb)->saddr,
4327 +@@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4328 + struct inet_sock *inet;
4329 + const int type = icmp_hdr(icmp_skb)->type;
4330 + const int code = icmp_hdr(icmp_skb)->code;
4331 +- struct sock *sk;
4332 ++ struct sock *sk, *meta_sk;
4333 + struct sk_buff *skb;
4334 + struct request_sock *fastopen;
4335 + __u32 seq, snd_una;
4336 +@@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4337 + return;
4338 + }
4339 +
4340 +- bh_lock_sock(sk);
4341 ++ tp = tcp_sk(sk);
4342 ++ if (mptcp(tp))
4343 ++ meta_sk = mptcp_meta_sk(sk);
4344 ++ else
4345 ++ meta_sk = sk;
4346 ++
4347 ++ bh_lock_sock(meta_sk);
4348 + /* If too many ICMPs get dropped on busy
4349 + * servers this needs to be solved differently.
4350 + * We do take care of PMTU discovery (RFC1191) special case :
4351 + * we can receive locally generated ICMP messages while socket is held.
4352 + */
4353 +- if (sock_owned_by_user(sk)) {
4354 ++ if (sock_owned_by_user(meta_sk)) {
4355 + if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
4356 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
4357 + }
4358 +@@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4359 + }
4360 +
4361 + icsk = inet_csk(sk);
4362 +- tp = tcp_sk(sk);
4363 + seq = ntohl(th->seq);
4364 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
4365 + fastopen = tp->fastopen_rsk;
4366 +@@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4367 + goto out;
4368 +
4369 + tp->mtu_info = info;
4370 +- if (!sock_owned_by_user(sk)) {
4371 ++ if (!sock_owned_by_user(meta_sk)) {
4372 + tcp_v4_mtu_reduced(sk);
4373 + } else {
4374 + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
4375 + sock_hold(sk);
4376 ++ if (mptcp(tp))
4377 ++ mptcp_tsq_flags(sk);
4378 + }
4379 + goto out;
4380 + }
4381 +@@ -429,7 +438,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4382 + !icsk->icsk_backoff || fastopen)
4383 + break;
4384 +
4385 +- if (sock_owned_by_user(sk))
4386 ++ if (sock_owned_by_user(meta_sk))
4387 + break;
4388 +
4389 + icsk->icsk_backoff--;
4390 +@@ -463,7 +472,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4391 + switch (sk->sk_state) {
4392 + struct request_sock *req, **prev;
4393 + case TCP_LISTEN:
4394 +- if (sock_owned_by_user(sk))
4395 ++ if (sock_owned_by_user(meta_sk))
4396 + goto out;
4397 +
4398 + req = inet_csk_search_req(sk, &prev, th->dest,
4399 +@@ -499,7 +508,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4400 + if (fastopen && fastopen->sk == NULL)
4401 + break;
4402 +
4403 +- if (!sock_owned_by_user(sk)) {
4404 ++ if (!sock_owned_by_user(meta_sk)) {
4405 + sk->sk_err = err;
4406 +
4407 + sk->sk_error_report(sk);
4408 +@@ -528,7 +537,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4409 + */
4410 +
4411 + inet = inet_sk(sk);
4412 +- if (!sock_owned_by_user(sk) && inet->recverr) {
4413 ++ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
4414 + sk->sk_err = err;
4415 + sk->sk_error_report(sk);
4416 + } else { /* Only an error on timeout */
4417 +@@ -536,7 +545,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4418 + }
4419 +
4420 + out:
4421 +- bh_unlock_sock(sk);
4422 ++ bh_unlock_sock(meta_sk);
4423 + sock_put(sk);
4424 + }
4425 +
4426 +@@ -578,7 +587,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
4427 + * Exception: precedence violation. We do not implement it in any case.
4428 + */
4429 +
4430 +-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4431 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4432 + {
4433 + const struct tcphdr *th = tcp_hdr(skb);
4434 + struct {
4435 +@@ -702,10 +711,10 @@ release_sk1:
4436 + outside socket context is ugly, certainly. What can I do?
4437 + */
4438 +
4439 +-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4440 ++static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
4441 + u32 win, u32 tsval, u32 tsecr, int oif,
4442 + struct tcp_md5sig_key *key,
4443 +- int reply_flags, u8 tos)
4444 ++ int reply_flags, u8 tos, int mptcp)
4445 + {
4446 + const struct tcphdr *th = tcp_hdr(skb);
4447 + struct {
4448 +@@ -714,6 +723,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4449 + #ifdef CONFIG_TCP_MD5SIG
4450 + + (TCPOLEN_MD5SIG_ALIGNED >> 2)
4451 + #endif
4452 ++#ifdef CONFIG_MPTCP
4453 ++ + ((MPTCP_SUB_LEN_DSS >> 2) +
4454 ++ (MPTCP_SUB_LEN_ACK >> 2))
4455 ++#endif
4456 + ];
4457 + } rep;
4458 + struct ip_reply_arg arg;
4459 +@@ -758,6 +771,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4460 + ip_hdr(skb)->daddr, &rep.th);
4461 + }
4462 + #endif
4463 ++#ifdef CONFIG_MPTCP
4464 ++ if (mptcp) {
4465 ++ int offset = (tsecr) ? 3 : 0;
4466 ++ /* Construction of 32-bit data_ack */
4467 ++ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
4468 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
4469 ++ (0x20 << 8) |
4470 ++ (0x01));
4471 ++ rep.opt[offset] = htonl(data_ack);
4472 ++
4473 ++ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
4474 ++ rep.th.doff = arg.iov[0].iov_len / 4;
4475 ++ }
4476 ++#endif /* CONFIG_MPTCP */
4477 ++
4478 + arg.flags = reply_flags;
4479 + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
4480 + ip_hdr(skb)->saddr, /* XXX */
4481 +@@ -776,36 +804,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
4482 + {
4483 + struct inet_timewait_sock *tw = inet_twsk(sk);
4484 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
4485 ++ u32 data_ack = 0;
4486 ++ int mptcp = 0;
4487 ++
4488 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
4489 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
4490 ++ mptcp = 1;
4491 ++ }
4492 +
4493 + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
4494 ++ data_ack,
4495 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
4496 + tcp_time_stamp + tcptw->tw_ts_offset,
4497 + tcptw->tw_ts_recent,
4498 + tw->tw_bound_dev_if,
4499 + tcp_twsk_md5_key(tcptw),
4500 + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
4501 +- tw->tw_tos
4502 ++ tw->tw_tos, mptcp
4503 + );
4504 +
4505 + inet_twsk_put(tw);
4506 + }
4507 +
4508 +-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4509 +- struct request_sock *req)
4510 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4511 ++ struct request_sock *req)
4512 + {
4513 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
4514 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
4515 + */
4516 + tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
4517 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
4518 +- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
4519 ++ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
4520 + tcp_time_stamp,
4521 + req->ts_recent,
4522 + 0,
4523 + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
4524 + AF_INET),
4525 + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
4526 +- ip_hdr(skb)->tos);
4527 ++ ip_hdr(skb)->tos, 0);
4528 + }
4529 +
4530 + /*
4531 +@@ -813,10 +849,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4532 + * This still operates on a request_sock only, not on a big
4533 + * socket.
4534 + */
4535 +-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4536 +- struct request_sock *req,
4537 +- u16 queue_mapping,
4538 +- struct tcp_fastopen_cookie *foc)
4539 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4540 ++ struct flowi *fl,
4541 ++ struct request_sock *req,
4542 ++ u16 queue_mapping,
4543 ++ struct tcp_fastopen_cookie *foc)
4544 + {
4545 + const struct inet_request_sock *ireq = inet_rsk(req);
4546 + struct flowi4 fl4;
4547 +@@ -844,21 +881,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4548 + return err;
4549 + }
4550 +
4551 +-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
4552 +-{
4553 +- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
4554 +-
4555 +- if (!res) {
4556 +- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
4557 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4558 +- }
4559 +- return res;
4560 +-}
4561 +-
4562 + /*
4563 + * IPv4 request_sock destructor.
4564 + */
4565 +-static void tcp_v4_reqsk_destructor(struct request_sock *req)
4566 ++void tcp_v4_reqsk_destructor(struct request_sock *req)
4567 + {
4568 + kfree(inet_rsk(req)->opt);
4569 + }
4570 +@@ -896,7 +922,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
4571 + /*
4572 + * Save and compile IPv4 options into the request_sock if needed.
4573 + */
4574 +-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4575 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4576 + {
4577 + const struct ip_options *opt = &(IPCB(skb)->opt);
4578 + struct ip_options_rcu *dopt = NULL;
4579 +@@ -1237,161 +1263,71 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
4580 +
4581 + #endif
4582 +
4583 ++static int tcp_v4_init_req(struct request_sock *req, struct sock *sk,
4584 ++ struct sk_buff *skb)
4585 ++{
4586 ++ struct inet_request_sock *ireq = inet_rsk(req);
4587 ++
4588 ++ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
4589 ++ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
4590 ++ ireq->no_srccheck = inet_sk(sk)->transparent;
4591 ++ ireq->opt = tcp_v4_save_options(skb);
4592 ++ ireq->ir_mark = inet_request_mark(sk, skb);
4593 ++
4594 ++ return 0;
4595 ++}
4596 ++
4597 ++static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
4598 ++ const struct request_sock *req,
4599 ++ bool *strict)
4600 ++{
4601 ++ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
4602 ++
4603 ++ if (strict) {
4604 ++ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
4605 ++ *strict = true;
4606 ++ else
4607 ++ *strict = false;
4608 ++ }
4609 ++
4610 ++ return dst;
4611 ++}
4612 ++
4613 + struct request_sock_ops tcp_request_sock_ops __read_mostly = {
4614 + .family = PF_INET,
4615 + .obj_size = sizeof(struct tcp_request_sock),
4616 +- .rtx_syn_ack = tcp_v4_rtx_synack,
4617 ++ .rtx_syn_ack = tcp_rtx_synack,
4618 + .send_ack = tcp_v4_reqsk_send_ack,
4619 + .destructor = tcp_v4_reqsk_destructor,
4620 + .send_reset = tcp_v4_send_reset,
4621 + .syn_ack_timeout = tcp_syn_ack_timeout,
4622 + };
4623 +
4624 ++const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4625 ++ .mss_clamp = TCP_MSS_DEFAULT,
4626 + #ifdef CONFIG_TCP_MD5SIG
4627 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4628 + .md5_lookup = tcp_v4_reqsk_md5_lookup,
4629 + .calc_md5_hash = tcp_v4_md5_hash_skb,
4630 +-};
4631 + #endif
4632 ++ .init_req = tcp_v4_init_req,
4633 ++#ifdef CONFIG_SYN_COOKIES
4634 ++ .cookie_init_seq = cookie_v4_init_sequence,
4635 ++#endif
4636 ++ .route_req = tcp_v4_route_req,
4637 ++ .init_seq = tcp_v4_init_sequence,
4638 ++ .send_synack = tcp_v4_send_synack,
4639 ++ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
4640 ++};
4641 +
4642 + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
4643 + {
4644 +- struct tcp_options_received tmp_opt;
4645 +- struct request_sock *req;
4646 +- struct inet_request_sock *ireq;
4647 +- struct tcp_sock *tp = tcp_sk(sk);
4648 +- struct dst_entry *dst = NULL;
4649 +- __be32 saddr = ip_hdr(skb)->saddr;
4650 +- __be32 daddr = ip_hdr(skb)->daddr;
4651 +- __u32 isn = TCP_SKB_CB(skb)->when;
4652 +- bool want_cookie = false, fastopen;
4653 +- struct flowi4 fl4;
4654 +- struct tcp_fastopen_cookie foc = { .len = -1 };
4655 +- int err;
4656 +-
4657 + /* Never answer to SYNs send to broadcast or multicast */
4658 + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
4659 + goto drop;
4660 +
4661 +- /* TW buckets are converted to open requests without
4662 +- * limitations, they conserve resources and peer is
4663 +- * evidently real one.
4664 +- */
4665 +- if ((sysctl_tcp_syncookies == 2 ||
4666 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4667 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
4668 +- if (!want_cookie)
4669 +- goto drop;
4670 +- }
4671 +-
4672 +- /* Accept backlog is full. If we have already queued enough
4673 +- * of warm entries in syn queue, drop request. It is better than
4674 +- * clogging syn queue with openreqs with exponentially increasing
4675 +- * timeout.
4676 +- */
4677 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4678 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4679 +- goto drop;
4680 +- }
4681 +-
4682 +- req = inet_reqsk_alloc(&tcp_request_sock_ops);
4683 +- if (!req)
4684 +- goto drop;
4685 +-
4686 +-#ifdef CONFIG_TCP_MD5SIG
4687 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
4688 +-#endif
4689 +-
4690 +- tcp_clear_options(&tmp_opt);
4691 +- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4692 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
4693 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
4694 +-
4695 +- if (want_cookie && !tmp_opt.saw_tstamp)
4696 +- tcp_clear_options(&tmp_opt);
4697 +-
4698 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4699 +- tcp_openreq_init(req, &tmp_opt, skb);
4700 ++ return tcp_conn_request(&tcp_request_sock_ops,
4701 ++ &tcp_request_sock_ipv4_ops, sk, skb);
4702 +
4703 +- ireq = inet_rsk(req);
4704 +- ireq->ir_loc_addr = daddr;
4705 +- ireq->ir_rmt_addr = saddr;
4706 +- ireq->no_srccheck = inet_sk(sk)->transparent;
4707 +- ireq->opt = tcp_v4_save_options(skb);
4708 +- ireq->ir_mark = inet_request_mark(sk, skb);
4709 +-
4710 +- if (security_inet_conn_request(sk, skb, req))
4711 +- goto drop_and_free;
4712 +-
4713 +- if (!want_cookie || tmp_opt.tstamp_ok)
4714 +- TCP_ECN_create_request(req, skb, sock_net(sk));
4715 +-
4716 +- if (want_cookie) {
4717 +- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
4718 +- req->cookie_ts = tmp_opt.tstamp_ok;
4719 +- } else if (!isn) {
4720 +- /* VJ's idea. We save last timestamp seen
4721 +- * from the destination in peer table, when entering
4722 +- * state TIME-WAIT, and check against it before
4723 +- * accepting new connection request.
4724 +- *
4725 +- * If "isn" is not zero, this request hit alive
4726 +- * timewait bucket, so that all the necessary checks
4727 +- * are made in the function processing timewait state.
4728 +- */
4729 +- if (tmp_opt.saw_tstamp &&
4730 +- tcp_death_row.sysctl_tw_recycle &&
4731 +- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
4732 +- fl4.daddr == saddr) {
4733 +- if (!tcp_peer_is_proven(req, dst, true)) {
4734 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4735 +- goto drop_and_release;
4736 +- }
4737 +- }
4738 +- /* Kill the following clause, if you dislike this way. */
4739 +- else if (!sysctl_tcp_syncookies &&
4740 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4741 +- (sysctl_max_syn_backlog >> 2)) &&
4742 +- !tcp_peer_is_proven(req, dst, false)) {
4743 +- /* Without syncookies last quarter of
4744 +- * backlog is filled with destinations,
4745 +- * proven to be alive.
4746 +- * It means that we continue to communicate
4747 +- * to destinations, already remembered
4748 +- * to the moment of synflood.
4749 +- */
4750 +- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4751 +- &saddr, ntohs(tcp_hdr(skb)->source));
4752 +- goto drop_and_release;
4753 +- }
4754 +-
4755 +- isn = tcp_v4_init_sequence(skb);
4756 +- }
4757 +- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
4758 +- goto drop_and_free;
4759 +-
4760 +- tcp_rsk(req)->snt_isn = isn;
4761 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4762 +- tcp_openreq_init_rwin(req, sk, dst);
4763 +- fastopen = !want_cookie &&
4764 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
4765 +- err = tcp_v4_send_synack(sk, dst, req,
4766 +- skb_get_queue_mapping(skb), &foc);
4767 +- if (!fastopen) {
4768 +- if (err || want_cookie)
4769 +- goto drop_and_free;
4770 +-
4771 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4772 +- tcp_rsk(req)->listener = NULL;
4773 +- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4774 +- }
4775 +-
4776 +- return 0;
4777 +-
4778 +-drop_and_release:
4779 +- dst_release(dst);
4780 +-drop_and_free:
4781 +- reqsk_free(req);
4782 + drop:
4783 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4784 + return 0;
4785 +@@ -1497,7 +1433,7 @@ put_and_exit:
4786 + }
4787 + EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
4788 +
4789 +-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4790 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4791 + {
4792 + struct tcphdr *th = tcp_hdr(skb);
4793 + const struct iphdr *iph = ip_hdr(skb);
4794 +@@ -1514,8 +1450,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4795 +
4796 + if (nsk) {
4797 + if (nsk->sk_state != TCP_TIME_WAIT) {
4798 ++		/* Don't lock the meta-sk again. It has been locked
4799 ++ * before mptcp_v4_do_rcv.
4800 ++ */
4801 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
4802 ++ bh_lock_sock(mptcp_meta_sk(nsk));
4803 + bh_lock_sock(nsk);
4804 ++
4805 + return nsk;
4806 ++
4807 + }
4808 + inet_twsk_put(inet_twsk(nsk));
4809 + return NULL;
4810 +@@ -1550,6 +1493,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
4811 + goto discard;
4812 + #endif
4813 +
4814 ++ if (is_meta_sk(sk))
4815 ++ return mptcp_v4_do_rcv(sk, skb);
4816 ++
4817 + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
4818 + struct dst_entry *dst = sk->sk_rx_dst;
4819 +
4820 +@@ -1681,7 +1627,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
4821 + } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
4822 + wake_up_interruptible_sync_poll(sk_sleep(sk),
4823 + POLLIN | POLLRDNORM | POLLRDBAND);
4824 +- if (!inet_csk_ack_scheduled(sk))
4825 ++ if (!inet_csk_ack_scheduled(sk) && !mptcp(tp))
4826 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
4827 + (3 * tcp_rto_min(sk)) / 4,
4828 + TCP_RTO_MAX);
4829 +@@ -1698,7 +1644,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
4830 + {
4831 + const struct iphdr *iph;
4832 + const struct tcphdr *th;
4833 +- struct sock *sk;
4834 ++ struct sock *sk, *meta_sk = NULL;
4835 + int ret;
4836 + struct net *net = dev_net(skb->dev);
4837 +
4838 +@@ -1732,18 +1678,42 @@ int tcp_v4_rcv(struct sk_buff *skb)
4839 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
4840 + skb->len - th->doff * 4);
4841 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
4842 ++#ifdef CONFIG_MPTCP
4843 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
4844 ++ TCP_SKB_CB(skb)->dss_off = 0;
4845 ++#endif
4846 + TCP_SKB_CB(skb)->when = 0;
4847 + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
4848 + TCP_SKB_CB(skb)->sacked = 0;
4849 +
4850 + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
4851 +- if (!sk)
4852 +- goto no_tcp_socket;
4853 +
4854 + process:
4855 +- if (sk->sk_state == TCP_TIME_WAIT)
4856 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
4857 + goto do_time_wait;
4858 +
4859 ++#ifdef CONFIG_MPTCP
4860 ++ if (!sk && th->syn && !th->ack) {
4861 ++ int ret = mptcp_lookup_join(skb, NULL);
4862 ++
4863 ++ if (ret < 0) {
4864 ++ tcp_v4_send_reset(NULL, skb);
4865 ++ goto discard_it;
4866 ++ } else if (ret > 0) {
4867 ++ return 0;
4868 ++ }
4869 ++ }
4870 ++
4871 ++ /* Is there a pending request sock for this segment ? */
4872 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
4873 ++ if (sk)
4874 ++ sock_put(sk);
4875 ++ return 0;
4876 ++ }
4877 ++#endif
4878 ++ if (!sk)
4879 ++ goto no_tcp_socket;
4880 ++
4881 + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
4882 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
4883 + goto discard_and_relse;
4884 +@@ -1759,11 +1729,21 @@ process:
4885 + sk_mark_napi_id(sk, skb);
4886 + skb->dev = NULL;
4887 +
4888 +- bh_lock_sock_nested(sk);
4889 ++ if (mptcp(tcp_sk(sk))) {
4890 ++ meta_sk = mptcp_meta_sk(sk);
4891 ++
4892 ++ bh_lock_sock_nested(meta_sk);
4893 ++ if (sock_owned_by_user(meta_sk))
4894 ++ skb->sk = sk;
4895 ++ } else {
4896 ++ meta_sk = sk;
4897 ++ bh_lock_sock_nested(sk);
4898 ++ }
4899 ++
4900 + ret = 0;
4901 +- if (!sock_owned_by_user(sk)) {
4902 ++ if (!sock_owned_by_user(meta_sk)) {
4903 + #ifdef CONFIG_NET_DMA
4904 +- struct tcp_sock *tp = tcp_sk(sk);
4905 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
4906 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
4907 + tp->ucopy.dma_chan = net_dma_find_channel();
4908 + if (tp->ucopy.dma_chan)
4909 +@@ -1771,16 +1751,16 @@ process:
4910 + else
4911 + #endif
4912 + {
4913 +- if (!tcp_prequeue(sk, skb))
4914 ++ if (!tcp_prequeue(meta_sk, skb))
4915 + ret = tcp_v4_do_rcv(sk, skb);
4916 + }
4917 +- } else if (unlikely(sk_add_backlog(sk, skb,
4918 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
4919 +- bh_unlock_sock(sk);
4920 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
4921 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
4922 ++ bh_unlock_sock(meta_sk);
4923 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
4924 + goto discard_and_relse;
4925 + }
4926 +- bh_unlock_sock(sk);
4927 ++ bh_unlock_sock(meta_sk);
4928 +
4929 + sock_put(sk);
4930 +
4931 +@@ -1835,6 +1815,18 @@ do_time_wait:
4932 + sk = sk2;
4933 + goto process;
4934 + }
4935 ++#ifdef CONFIG_MPTCP
4936 ++ if (th->syn && !th->ack) {
4937 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
4938 ++
4939 ++ if (ret < 0) {
4940 ++ tcp_v4_send_reset(NULL, skb);
4941 ++ goto discard_it;
4942 ++ } else if (ret > 0) {
4943 ++ return 0;
4944 ++ }
4945 ++ }
4946 ++#endif
4947 + /* Fall through to ACK */
4948 + }
4949 + case TCP_TW_ACK:
4950 +@@ -1900,7 +1892,12 @@ static int tcp_v4_init_sock(struct sock *sk)
4951 +
4952 + tcp_init_sock(sk);
4953 +
4954 +- icsk->icsk_af_ops = &ipv4_specific;
4955 ++#ifdef CONFIG_MPTCP
4956 ++ if (is_mptcp_enabled(sk))
4957 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
4958 ++ else
4959 ++#endif
4960 ++ icsk->icsk_af_ops = &ipv4_specific;
4961 +
4962 + #ifdef CONFIG_TCP_MD5SIG
4963 + tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
4964 +@@ -1917,6 +1914,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
4965 +
4966 + tcp_cleanup_congestion_control(sk);
4967 +
4968 ++ if (mptcp(tp))
4969 ++ mptcp_destroy_sock(sk);
4970 ++ if (tp->inside_tk_table)
4971 ++ mptcp_hash_remove(tp);
4972 ++
4973 + /* Cleanup up the write buffer. */
4974 + tcp_write_queue_purge(sk);
4975 +
4976 +@@ -2481,6 +2483,19 @@ void tcp4_proc_exit(void)
4977 + }
4978 + #endif /* CONFIG_PROC_FS */
4979 +
4980 ++#ifdef CONFIG_MPTCP
4981 ++static void tcp_v4_clear_sk(struct sock *sk, int size)
4982 ++{
4983 ++ struct tcp_sock *tp = tcp_sk(sk);
4984 ++
4985 ++	/* we do not want to clear the tk_table field because of RCU lookups */
4986 ++ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table));
4987 ++
4988 ++ size -= offsetof(struct tcp_sock, tk_table) + sizeof(tp->tk_table);
4989 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size);
4990 ++}
4991 ++#endif
4992 ++
4993 + struct proto tcp_prot = {
4994 + .name = "TCP",
4995 + .owner = THIS_MODULE,
4996 +@@ -2528,6 +2543,9 @@ struct proto tcp_prot = {
4997 + .destroy_cgroup = tcp_destroy_cgroup,
4998 + .proto_cgroup = tcp_proto_cgroup,
4999 + #endif
5000 ++#ifdef CONFIG_MPTCP
5001 ++ .clear_sk = tcp_v4_clear_sk,
5002 ++#endif
5003 + };
5004 + EXPORT_SYMBOL(tcp_prot);
5005 +
5006 +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
5007 +index e68e0d4af6c9..ae6946857dff 100644
5008 +--- a/net/ipv4/tcp_minisocks.c
5009 ++++ b/net/ipv4/tcp_minisocks.c
5010 +@@ -18,11 +18,13 @@
5011 + * Jorge Cwik, <jorge@×××××××××××××.net>
5012 + */
5013 +
5014 ++#include <linux/kconfig.h>
5015 + #include <linux/mm.h>
5016 + #include <linux/module.h>
5017 + #include <linux/slab.h>
5018 + #include <linux/sysctl.h>
5019 + #include <linux/workqueue.h>
5020 ++#include <net/mptcp.h>
5021 + #include <net/tcp.h>
5022 + #include <net/inet_common.h>
5023 + #include <net/xfrm.h>
5024 +@@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5025 + struct tcp_options_received tmp_opt;
5026 + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
5027 + bool paws_reject = false;
5028 ++ struct mptcp_options_received mopt;
5029 +
5030 + tmp_opt.saw_tstamp = 0;
5031 + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
5032 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5033 ++ mptcp_init_mp_opt(&mopt);
5034 ++
5035 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5036 +
5037 + if (tmp_opt.saw_tstamp) {
5038 + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
5039 +@@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5040 + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
5041 + paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
5042 + }
5043 ++
5044 ++ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
5045 ++ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
5046 ++ goto kill_with_rst;
5047 ++ }
5048 + }
5049 +
5050 + if (tw->tw_substate == TCP_FIN_WAIT2) {
5051 +@@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5052 + if (!th->ack ||
5053 + !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
5054 + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
5055 ++ /* If mptcp_is_data_fin() returns true, we are sure that
5056 ++ * mopt has been initialized - otherwise it would not
5057 ++ * be a DATA_FIN.
5058 ++ */
5059 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
5060 ++ mptcp_is_data_fin(skb) &&
5061 ++ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
5062 ++ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
5063 ++ return TCP_TW_ACK;
5064 ++
5065 + inet_twsk_put(tw);
5066 + return TCP_TW_SUCCESS;
5067 + }
5068 +@@ -290,6 +310,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5069 + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
5070 + tcptw->tw_ts_offset = tp->tsoffset;
5071 +
5072 ++ if (mptcp(tp)) {
5073 ++ if (mptcp_init_tw_sock(sk, tcptw)) {
5074 ++ inet_twsk_free(tw);
5075 ++ goto exit;
5076 ++ }
5077 ++ } else {
5078 ++ tcptw->mptcp_tw = NULL;
5079 ++ }
5080 ++
5081 + #if IS_ENABLED(CONFIG_IPV6)
5082 + if (tw->tw_family == PF_INET6) {
5083 + struct ipv6_pinfo *np = inet6_sk(sk);
5084 +@@ -347,15 +376,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5085 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
5086 + }
5087 +
5088 ++exit:
5089 + tcp_update_metrics(sk);
5090 + tcp_done(sk);
5091 + }
5092 +
5093 + void tcp_twsk_destructor(struct sock *sk)
5094 + {
5095 +-#ifdef CONFIG_TCP_MD5SIG
5096 + struct tcp_timewait_sock *twsk = tcp_twsk(sk);
5097 +
5098 ++ if (twsk->mptcp_tw)
5099 ++ mptcp_twsk_destructor(twsk);
5100 ++#ifdef CONFIG_TCP_MD5SIG
5101 + if (twsk->tw_md5_key)
5102 + kfree_rcu(twsk->tw_md5_key, rcu);
5103 + #endif
5104 +@@ -382,13 +414,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
5105 + req->window_clamp = tcp_full_space(sk);
5106 +
5107 + /* tcp_full_space because it is guaranteed to be the first packet */
5108 +- tcp_select_initial_window(tcp_full_space(sk),
5109 +- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
5110 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5111 ++ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
5112 ++ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
5113 + &req->rcv_wnd,
5114 + &req->window_clamp,
5115 + ireq->wscale_ok,
5116 + &rcv_wscale,
5117 +- dst_metric(dst, RTAX_INITRWND));
5118 ++ dst_metric(dst, RTAX_INITRWND), sk);
5119 + ireq->rcv_wscale = rcv_wscale;
5120 + }
5121 + EXPORT_SYMBOL(tcp_openreq_init_rwin);
5122 +@@ -499,6 +532,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
5123 + newtp->rx_opt.ts_recent_stamp = 0;
5124 + newtp->tcp_header_len = sizeof(struct tcphdr);
5125 + }
5126 ++ if (ireq->saw_mpc)
5127 ++ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
5128 + newtp->tsoffset = 0;
5129 + #ifdef CONFIG_TCP_MD5SIG
5130 + newtp->md5sig_info = NULL; /*XXX*/
5131 +@@ -535,16 +570,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5132 + bool fastopen)
5133 + {
5134 + struct tcp_options_received tmp_opt;
5135 ++ struct mptcp_options_received mopt;
5136 + struct sock *child;
5137 + const struct tcphdr *th = tcp_hdr(skb);
5138 + __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
5139 + bool paws_reject = false;
5140 +
5141 +- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
5142 ++ BUG_ON(!mptcp(tcp_sk(sk)) && fastopen == (sk->sk_state == TCP_LISTEN));
5143 +
5144 + tmp_opt.saw_tstamp = 0;
5145 ++
5146 ++ mptcp_init_mp_opt(&mopt);
5147 ++
5148 + if (th->doff > (sizeof(struct tcphdr)>>2)) {
5149 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5150 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5151 +
5152 + if (tmp_opt.saw_tstamp) {
5153 + tmp_opt.ts_recent = req->ts_recent;
5154 +@@ -583,7 +622,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5155 + *
5156 + * Reset timer after retransmitting SYNACK, similar to
5157 + * the idea of fast retransmit in recovery.
5158 ++ *
5159 ++ * Fall back to TCP if MP_CAPABLE is not set.
5160 + */
5161 ++
5162 ++ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
5163 ++ inet_rsk(req)->saw_mpc = false;
5164 ++
5165 ++
5166 + if (!inet_rtx_syn_ack(sk, req))
5167 + req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
5168 + TCP_RTO_MAX) + jiffies;
5169 +@@ -718,9 +764,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5170 + * socket is created, wait for troubles.
5171 + */
5172 + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
5173 ++
5174 + if (child == NULL)
5175 + goto listen_overflow;
5176 +
5177 ++ if (!is_meta_sk(sk)) {
5178 ++ int ret = mptcp_check_req_master(sk, child, req, prev);
5179 ++ if (ret < 0)
5180 ++ goto listen_overflow;
5181 ++
5182 ++ /* MPTCP-supported */
5183 ++ if (!ret)
5184 ++ return tcp_sk(child)->mpcb->master_sk;
5185 ++ } else {
5186 ++ return mptcp_check_req_child(sk, child, req, prev, &mopt);
5187 ++ }
5188 + inet_csk_reqsk_queue_unlink(sk, req, prev);
5189 + inet_csk_reqsk_queue_removed(sk, req);
5190 +
5191 +@@ -746,7 +804,17 @@ embryonic_reset:
5192 + tcp_reset(sk);
5193 + }
5194 + if (!fastopen) {
5195 +- inet_csk_reqsk_queue_drop(sk, req, prev);
5196 ++ if (is_meta_sk(sk)) {
5197 ++		if (is_meta_sk(sk)) {
5198 ++ * avoid ending up in inet_csk_reqsk_queue_removed ...
5199 ++ */
5200 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
5201 ++ if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
5202 ++ mptcp_delete_synack_timer(sk);
5203 ++ reqsk_free(req);
5204 ++ } else {
5205 ++ inet_csk_reqsk_queue_drop(sk, req, prev);
5206 ++ }
5207 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
5208 + }
5209 + return NULL;
5210 +@@ -770,8 +838,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5211 + {
5212 + int ret = 0;
5213 + int state = child->sk_state;
5214 ++ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
5215 +
5216 +- if (!sock_owned_by_user(child)) {
5217 ++ if (!sock_owned_by_user(meta_sk)) {
5218 + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
5219 + skb->len);
5220 + /* Wakeup parent, send SIGIO */
5221 +@@ -782,10 +851,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5222 + * in main socket hash table and lock on listening
5223 + * socket does not protect us more.
5224 + */
5225 +- __sk_add_backlog(child, skb);
5226 ++ if (mptcp(tcp_sk(child)))
5227 ++ skb->sk = child;
5228 ++ __sk_add_backlog(meta_sk, skb);
5229 + }
5230 +
5231 +- bh_unlock_sock(child);
5232 ++ if (mptcp(tcp_sk(child)))
5233 ++ bh_unlock_sock(child);
5234 ++ bh_unlock_sock(meta_sk);
5235 + sock_put(child);
5236 + return ret;
5237 + }
5238 +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
5239 +index 179b51e6bda3..efd31b6c5784 100644
5240 +--- a/net/ipv4/tcp_output.c
5241 ++++ b/net/ipv4/tcp_output.c
5242 +@@ -36,6 +36,12 @@
5243 +
5244 + #define pr_fmt(fmt) "TCP: " fmt
5245 +
5246 ++#include <net/mptcp.h>
5247 ++#include <net/mptcp_v4.h>
5248 ++#if IS_ENABLED(CONFIG_IPV6)
5249 ++#include <net/mptcp_v6.h>
5250 ++#endif
5251 ++#include <net/ipv6.h>
5252 + #include <net/tcp.h>
5253 +
5254 + #include <linux/compiler.h>
5255 +@@ -68,11 +74,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
5256 + unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
5257 + EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
5258 +
5259 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5260 +- int push_one, gfp_t gfp);
5261 +-
5262 + /* Account for new data that has been sent to the network. */
5263 +-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5264 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5265 + {
5266 + struct inet_connection_sock *icsk = inet_csk(sk);
5267 + struct tcp_sock *tp = tcp_sk(sk);
5268 +@@ -214,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss)
5269 + void tcp_select_initial_window(int __space, __u32 mss,
5270 + __u32 *rcv_wnd, __u32 *window_clamp,
5271 + int wscale_ok, __u8 *rcv_wscale,
5272 +- __u32 init_rcv_wnd)
5273 ++ __u32 init_rcv_wnd, const struct sock *sk)
5274 + {
5275 + unsigned int space = (__space < 0 ? 0 : __space);
5276 +
5277 +@@ -269,12 +272,16 @@ EXPORT_SYMBOL(tcp_select_initial_window);
5278 + * value can be stuffed directly into th->window for an outgoing
5279 + * frame.
5280 + */
5281 +-static u16 tcp_select_window(struct sock *sk)
5282 ++u16 tcp_select_window(struct sock *sk)
5283 + {
5284 + struct tcp_sock *tp = tcp_sk(sk);
5285 + u32 old_win = tp->rcv_wnd;
5286 +- u32 cur_win = tcp_receive_window(tp);
5287 +- u32 new_win = __tcp_select_window(sk);
5288 ++	/* The window must never shrink at the meta-level. At the subflow-level
5289 ++	 * we have to allow this. Otherwise we may announce a window too large
5290 ++ * for the current meta-level sk_rcvbuf.
5291 ++ */
5292 ++ u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
5293 ++ u32 new_win = tp->ops->__select_window(sk);
5294 +
5295 + /* Never shrink the offered window */
5296 + if (new_win < cur_win) {
5297 +@@ -290,6 +297,7 @@ static u16 tcp_select_window(struct sock *sk)
5298 + LINUX_MIB_TCPWANTZEROWINDOWADV);
5299 + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
5300 + }
5301 ++
5302 + tp->rcv_wnd = new_win;
5303 + tp->rcv_wup = tp->rcv_nxt;
5304 +
5305 +@@ -374,7 +382,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
5306 + /* Constructs common control bits of non-data skb. If SYN/FIN is present,
5307 + * auto increment end seqno.
5308 + */
5309 +-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5310 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5311 + {
5312 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5313 +
5314 +@@ -394,7 +402,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5315 + TCP_SKB_CB(skb)->end_seq = seq;
5316 + }
5317 +
5318 +-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5319 ++bool tcp_urg_mode(const struct tcp_sock *tp)
5320 + {
5321 + return tp->snd_una != tp->snd_up;
5322 + }
5323 +@@ -404,17 +412,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5324 + #define OPTION_MD5 (1 << 2)
5325 + #define OPTION_WSCALE (1 << 3)
5326 + #define OPTION_FAST_OPEN_COOKIE (1 << 8)
5327 +-
5328 +-struct tcp_out_options {
5329 +- u16 options; /* bit field of OPTION_* */
5330 +- u16 mss; /* 0 to disable */
5331 +- u8 ws; /* window scale, 0 to disable */
5332 +- u8 num_sack_blocks; /* number of SACK blocks to include */
5333 +- u8 hash_size; /* bytes in hash_location */
5334 +- __u8 *hash_location; /* temporary pointer, overloaded */
5335 +- __u32 tsval, tsecr; /* need to include OPTION_TS */
5336 +- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
5337 +-};
5338 ++/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
5339 +
5340 + /* Write previously computed TCP options to the packet.
5341 + *
5342 +@@ -430,7 +428,7 @@ struct tcp_out_options {
5343 + * (but it may well be that other scenarios fail similarly).
5344 + */
5345 + static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5346 +- struct tcp_out_options *opts)
5347 ++ struct tcp_out_options *opts, struct sk_buff *skb)
5348 + {
5349 + u16 options = opts->options; /* mungable copy */
5350 +
5351 +@@ -513,6 +511,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5352 + }
5353 + ptr += (foc->len + 3) >> 2;
5354 + }
5355 ++
5356 ++ if (unlikely(OPTION_MPTCP & opts->options))
5357 ++ mptcp_options_write(ptr, tp, opts, skb);
5358 + }
5359 +
5360 + /* Compute TCP options for SYN packets. This is not the final
5361 +@@ -564,6 +565,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
5362 + if (unlikely(!(OPTION_TS & opts->options)))
5363 + remaining -= TCPOLEN_SACKPERM_ALIGNED;
5364 + }
5365 ++ if (tp->request_mptcp || mptcp(tp))
5366 ++ mptcp_syn_options(sk, opts, &remaining);
5367 +
5368 + if (fastopen && fastopen->cookie.len >= 0) {
5369 + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
5370 +@@ -637,6 +640,9 @@ static unsigned int tcp_synack_options(struct sock *sk,
5371 + }
5372 + }
5373 +
5374 ++ if (ireq->saw_mpc)
5375 ++ mptcp_synack_options(req, opts, &remaining);
5376 ++
5377 + return MAX_TCP_OPTION_SPACE - remaining;
5378 + }
5379 +
5380 +@@ -670,16 +676,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
5381 + opts->tsecr = tp->rx_opt.ts_recent;
5382 + size += TCPOLEN_TSTAMP_ALIGNED;
5383 + }
5384 ++ if (mptcp(tp))
5385 ++ mptcp_established_options(sk, skb, opts, &size);
5386 +
5387 + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
5388 + if (unlikely(eff_sacks)) {
5389 +- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
5390 +- opts->num_sack_blocks =
5391 +- min_t(unsigned int, eff_sacks,
5392 +- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5393 +- TCPOLEN_SACK_PERBLOCK);
5394 +- size += TCPOLEN_SACK_BASE_ALIGNED +
5395 +- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5396 ++ const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
5397 ++ if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
5398 ++ opts->num_sack_blocks = 0;
5399 ++ else
5400 ++ opts->num_sack_blocks =
5401 ++ min_t(unsigned int, eff_sacks,
5402 ++ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5403 ++ TCPOLEN_SACK_PERBLOCK);
5404 ++ if (opts->num_sack_blocks)
5405 ++ size += TCPOLEN_SACK_BASE_ALIGNED +
5406 ++ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5407 + }
5408 +
5409 + return size;
5410 +@@ -711,8 +723,8 @@ static void tcp_tsq_handler(struct sock *sk)
5411 + if ((1 << sk->sk_state) &
5412 + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
5413 + TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
5414 +- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
5415 +- 0, GFP_ATOMIC);
5416 ++ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
5417 ++ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
5418 + }
5419 + /*
5420 + * One tasklet per cpu tries to send more skbs.
5421 +@@ -727,7 +739,7 @@ static void tcp_tasklet_func(unsigned long data)
5422 + unsigned long flags;
5423 + struct list_head *q, *n;
5424 + struct tcp_sock *tp;
5425 +- struct sock *sk;
5426 ++ struct sock *sk, *meta_sk;
5427 +
5428 + local_irq_save(flags);
5429 + list_splice_init(&tsq->head, &list);
5430 +@@ -738,15 +750,25 @@ static void tcp_tasklet_func(unsigned long data)
5431 + list_del(&tp->tsq_node);
5432 +
5433 + sk = (struct sock *)tp;
5434 +- bh_lock_sock(sk);
5435 ++ meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5436 ++ bh_lock_sock(meta_sk);
5437 +
5438 +- if (!sock_owned_by_user(sk)) {
5439 ++ if (!sock_owned_by_user(meta_sk)) {
5440 + tcp_tsq_handler(sk);
5441 ++ if (mptcp(tp))
5442 ++ tcp_tsq_handler(meta_sk);
5443 + } else {
5444 ++ if (mptcp(tp) && sk->sk_state == TCP_CLOSE)
5445 ++ goto exit;
5446 ++
5447 + /* defer the work to tcp_release_cb() */
5448 + set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
5449 ++
5450 ++ if (mptcp(tp))
5451 ++ mptcp_tsq_flags(sk);
5452 + }
5453 +- bh_unlock_sock(sk);
5454 ++exit:
5455 ++ bh_unlock_sock(meta_sk);
5456 +
5457 + clear_bit(TSQ_QUEUED, &tp->tsq_flags);
5458 + sk_free(sk);
5459 +@@ -756,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data)
5460 + #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
5461 + (1UL << TCP_WRITE_TIMER_DEFERRED) | \
5462 + (1UL << TCP_DELACK_TIMER_DEFERRED) | \
5463 +- (1UL << TCP_MTU_REDUCED_DEFERRED))
5464 ++ (1UL << TCP_MTU_REDUCED_DEFERRED) | \
5465 ++ (1UL << MPTCP_PATH_MANAGER) | \
5466 ++ (1UL << MPTCP_SUB_DEFERRED))
5467 ++
5468 + /**
5469 + * tcp_release_cb - tcp release_sock() callback
5470 + * @sk: socket
5471 +@@ -803,6 +828,13 @@ void tcp_release_cb(struct sock *sk)
5472 + sk->sk_prot->mtu_reduced(sk);
5473 + __sock_put(sk);
5474 + }
5475 ++ if (flags & (1UL << MPTCP_PATH_MANAGER)) {
5476 ++ if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
5477 ++ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
5478 ++ __sock_put(sk);
5479 ++ }
5480 ++ if (flags & (1UL << MPTCP_SUB_DEFERRED))
5481 ++ mptcp_tsq_sub_deferred(sk);
5482 + }
5483 + EXPORT_SYMBOL(tcp_release_cb);
5484 +
5485 +@@ -862,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb)
5486 + * We are working here with either a clone of the original
5487 + * SKB, or a fresh unique copy made by the retransmit engine.
5488 + */
5489 +-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5490 +- gfp_t gfp_mask)
5491 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5492 ++ gfp_t gfp_mask)
5493 + {
5494 + const struct inet_connection_sock *icsk = inet_csk(sk);
5495 + struct inet_sock *inet;
5496 +@@ -933,7 +965,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5497 + */
5498 + th->window = htons(min(tp->rcv_wnd, 65535U));
5499 + } else {
5500 +- th->window = htons(tcp_select_window(sk));
5501 ++ th->window = htons(tp->ops->select_window(sk));
5502 + }
5503 + th->check = 0;
5504 + th->urg_ptr = 0;
5505 +@@ -949,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5506 + }
5507 + }
5508 +
5509 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5510 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5511 + if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
5512 + TCP_ECN_send(sk, skb, tcp_header_size);
5513 +
5514 +@@ -988,7 +1020,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5515 + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
5516 + * otherwise socket can stall.
5517 + */
5518 +-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5519 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5520 + {
5521 + struct tcp_sock *tp = tcp_sk(sk);
5522 +
5523 +@@ -1001,15 +1033,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5524 + }
5525 +
5526 + /* Initialize TSO segments for a packet. */
5527 +-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5528 +- unsigned int mss_now)
5529 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5530 ++ unsigned int mss_now)
5531 + {
5532 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5533 +
5534 + /* Make sure we own this skb before messing gso_size/gso_segs */
5535 + WARN_ON_ONCE(skb_cloned(skb));
5536 +
5537 +- if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
5538 ++ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
5539 ++ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
5540 + /* Avoid the costly divide in the normal
5541 + * non-TSO case.
5542 + */
5543 +@@ -1041,7 +1074,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
5544 + /* Pcount in the middle of the write queue got changed, we need to do various
5545 + * tweaks to fix counters
5546 + */
5547 +-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5548 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5549 + {
5550 + struct tcp_sock *tp = tcp_sk(sk);
5551 +
5552 +@@ -1164,7 +1197,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
5553 + * eventually). The difference is that pulled data not copied, but
5554 + * immediately discarded.
5555 + */
5556 +-static void __pskb_trim_head(struct sk_buff *skb, int len)
5557 ++void __pskb_trim_head(struct sk_buff *skb, int len)
5558 + {
5559 + struct skb_shared_info *shinfo;
5560 + int i, k, eat;
5561 +@@ -1205,6 +1238,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
5562 + /* Remove acked data from a packet in the transmit queue. */
5563 + int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5564 + {
5565 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
5566 ++ return mptcp_trim_head(sk, skb, len);
5567 ++
5568 + if (skb_unclone(skb, GFP_ATOMIC))
5569 + return -ENOMEM;
5570 +
5571 +@@ -1222,6 +1258,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5572 + if (tcp_skb_pcount(skb) > 1)
5573 + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
5574 +
5575 ++#ifdef CONFIG_MPTCP
5576 ++ /* Some data got acked - we assume that the seq-number reached the dest.
5577 ++ * Anyway, our MPTCP-option has been trimmed above - we lost it here.
5578 ++ * Only remove the SEQ if the call does not come from a meta retransmit.
5579 ++ */
5580 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
5581 ++ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
5582 ++#endif
5583 ++
5584 + return 0;
5585 + }
5586 +
5587 +@@ -1379,6 +1424,7 @@ unsigned int tcp_current_mss(struct sock *sk)
5588 +
5589 + return mss_now;
5590 + }
5591 ++EXPORT_SYMBOL(tcp_current_mss);
5592 +
5593 + /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
5594 + * As additional protections, we do not touch cwnd in retransmission phases,
5595 +@@ -1446,8 +1492,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
5596 + * But we can avoid doing the divide again given we already have
5597 + * skb_pcount = skb->len / mss_now
5598 + */
5599 +-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5600 +- const struct sk_buff *skb)
5601 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5602 ++ const struct sk_buff *skb)
5603 + {
5604 + if (skb->len < tcp_skb_pcount(skb) * mss_now)
5605 + tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
5606 +@@ -1468,11 +1514,11 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
5607 + (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
5608 + }
5609 + /* Returns the portion of skb which can be sent right away */
5610 +-static unsigned int tcp_mss_split_point(const struct sock *sk,
5611 +- const struct sk_buff *skb,
5612 +- unsigned int mss_now,
5613 +- unsigned int max_segs,
5614 +- int nonagle)
5615 ++unsigned int tcp_mss_split_point(const struct sock *sk,
5616 ++ const struct sk_buff *skb,
5617 ++ unsigned int mss_now,
5618 ++ unsigned int max_segs,
5619 ++ int nonagle)
5620 + {
5621 + const struct tcp_sock *tp = tcp_sk(sk);
5622 + u32 partial, needed, window, max_len;
5623 +@@ -1502,13 +1548,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
5624 + /* Can at least one segment of SKB be sent right now, according to the
5625 + * congestion window rules? If so, return how many segments are allowed.
5626 + */
5627 +-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5628 +- const struct sk_buff *skb)
5629 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5630 ++ const struct sk_buff *skb)
5631 + {
5632 + u32 in_flight, cwnd;
5633 +
5634 + /* Don't be strict about the congestion window for the final FIN. */
5635 +- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5636 ++ if (skb &&
5637 ++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5638 + tcp_skb_pcount(skb) == 1)
5639 + return 1;
5640 +
5641 +@@ -1524,8 +1571,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5642 + * This must be invoked the first time we consider transmitting
5643 + * SKB onto the wire.
5644 + */
5645 +-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5646 +- unsigned int mss_now)
5647 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5648 ++ unsigned int mss_now)
5649 + {
5650 + int tso_segs = tcp_skb_pcount(skb);
5651 +
5652 +@@ -1540,8 +1587,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5653 + /* Return true if the Nagle test allows this packet to be
5654 + * sent now.
5655 + */
5656 +-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5657 +- unsigned int cur_mss, int nonagle)
5658 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5659 ++ unsigned int cur_mss, int nonagle)
5660 + {
5661 + /* Nagle rule does not apply to frames, which sit in the middle of the
5662 + * write_queue (they have no chances to get new data).
5663 +@@ -1553,7 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5664 + return true;
5665 +
5666 + /* Don't use the nagle rule for urgent data (or for the final FIN). */
5667 +- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
5668 ++ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
5669 ++ mptcp_is_data_fin(skb))
5670 + return true;
5671 +
5672 + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
5673 +@@ -1563,9 +1611,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5674 + }
5675 +
5676 + /* Does at least the first segment of SKB fit into the send window? */
5677 +-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
5678 +- const struct sk_buff *skb,
5679 +- unsigned int cur_mss)
5680 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5681 ++ unsigned int cur_mss)
5682 + {
5683 + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
5684 +
5685 +@@ -1676,7 +1723,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
5686 + u32 send_win, cong_win, limit, in_flight;
5687 + int win_divisor;
5688 +
5689 +- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
5690 ++ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
5691 + goto send_now;
5692 +
5693 + if (icsk->icsk_ca_state != TCP_CA_Open)
5694 +@@ -1888,7 +1935,7 @@ static int tcp_mtu_probe(struct sock *sk)
5695 + * Returns true, if no segments are in flight and we have queued segments,
5696 + * but cannot send anything now because of SWS or another problem.
5697 + */
5698 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5699 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5700 + int push_one, gfp_t gfp)
5701 + {
5702 + struct tcp_sock *tp = tcp_sk(sk);
5703 +@@ -1900,7 +1947,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5704 +
5705 + sent_pkts = 0;
5706 +
5707 +- if (!push_one) {
5708 ++	/* pmtu not yet supported with MPTCP. Should be possible by exiting
5709 ++	 * the loop inside tcp_mtu_probe early, making sure that only a
5710 ++	 * single DSS-mapping gets probed.
5711 ++ */
5712 ++ if (!push_one && !mptcp(tp)) {
5713 + /* Do MTU probing. */
5714 + result = tcp_mtu_probe(sk);
5715 + if (!result) {
5716 +@@ -2099,7 +2150,8 @@ void tcp_send_loss_probe(struct sock *sk)
5717 + int err = -1;
5718 +
5719 + if (tcp_send_head(sk) != NULL) {
5720 +- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
5721 ++ err = tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2,
5722 ++ GFP_ATOMIC);
5723 + goto rearm_timer;
5724 + }
5725 +
5726 +@@ -2159,8 +2211,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
5727 + if (unlikely(sk->sk_state == TCP_CLOSE))
5728 + return;
5729 +
5730 +- if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
5731 +- sk_gfp_atomic(sk, GFP_ATOMIC)))
5732 ++ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
5733 ++ sk_gfp_atomic(sk, GFP_ATOMIC)))
5734 + tcp_check_probe_timer(sk);
5735 + }
5736 +
5737 +@@ -2173,7 +2225,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
5738 +
5739 + BUG_ON(!skb || skb->len < mss_now);
5740 +
5741 +- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
5742 ++ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
5743 ++ sk->sk_allocation);
5744 + }
5745 +
5746 + /* This function returns the amount that we can raise the
5747 +@@ -2386,6 +2439,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
5748 + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
5749 + return;
5750 +
5751 ++ /* Currently not supported for MPTCP - but it should be possible */
5752 ++ if (mptcp(tp))
5753 ++ return;
5754 ++
5755 + tcp_for_write_queue_from_safe(skb, tmp, sk) {
5756 + if (!tcp_can_collapse(sk, skb))
5757 + break;
5758 +@@ -2843,7 +2900,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
5759 +
5760 + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
5761 + th->window = htons(min(req->rcv_wnd, 65535U));
5762 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5763 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5764 + th->doff = (tcp_header_size >> 2);
5765 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
5766 +
5767 +@@ -2897,13 +2954,13 @@ static void tcp_connect_init(struct sock *sk)
5768 + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
5769 + tp->window_clamp = tcp_full_space(sk);
5770 +
5771 +- tcp_select_initial_window(tcp_full_space(sk),
5772 +- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5773 +- &tp->rcv_wnd,
5774 +- &tp->window_clamp,
5775 +- sysctl_tcp_window_scaling,
5776 +- &rcv_wscale,
5777 +- dst_metric(dst, RTAX_INITRWND));
5778 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5779 ++ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5780 ++ &tp->rcv_wnd,
5781 ++ &tp->window_clamp,
5782 ++ sysctl_tcp_window_scaling,
5783 ++ &rcv_wscale,
5784 ++ dst_metric(dst, RTAX_INITRWND), sk);
5785 +
5786 + tp->rx_opt.rcv_wscale = rcv_wscale;
5787 + tp->rcv_ssthresh = tp->rcv_wnd;
5788 +@@ -2927,6 +2984,36 @@ static void tcp_connect_init(struct sock *sk)
5789 + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
5790 + inet_csk(sk)->icsk_retransmits = 0;
5791 + tcp_clear_retrans(tp);
5792 ++
5793 ++#ifdef CONFIG_MPTCP
5794 ++ if (sysctl_mptcp_enabled && mptcp_doit(sk)) {
5795 ++ if (is_master_tp(tp)) {
5796 ++ tp->request_mptcp = 1;
5797 ++ mptcp_connect_init(sk);
5798 ++ } else if (tp->mptcp) {
5799 ++ struct inet_sock *inet = inet_sk(sk);
5800 ++
5801 ++ tp->mptcp->snt_isn = tp->write_seq;
5802 ++ tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
5803 ++
5804 ++ /* Set nonce for new subflows */
5805 ++ if (sk->sk_family == AF_INET)
5806 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
5807 ++ inet->inet_saddr,
5808 ++ inet->inet_daddr,
5809 ++ inet->inet_sport,
5810 ++ inet->inet_dport);
5811 ++#if IS_ENABLED(CONFIG_IPV6)
5812 ++ else
5813 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
5814 ++ inet6_sk(sk)->saddr.s6_addr32,
5815 ++ sk->sk_v6_daddr.s6_addr32,
5816 ++ inet->inet_sport,
5817 ++ inet->inet_dport);
5818 ++#endif
5819 ++ }
5820 ++ }
5821 ++#endif
5822 + }
5823 +
5824 + static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
5825 +@@ -3176,6 +3263,7 @@ void tcp_send_ack(struct sock *sk)
5826 + TCP_SKB_CB(buff)->when = tcp_time_stamp;
5827 + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
5828 + }
5829 ++EXPORT_SYMBOL(tcp_send_ack);
5830 +
5831 + /* This routine sends a packet with an out of date sequence
5832 + * number. It assumes the other end will try to ack it.
5833 +@@ -3188,7 +3276,7 @@ void tcp_send_ack(struct sock *sk)
5834 + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
5835 + * out-of-date with SND.UNA-1 to probe window.
5836 + */
5837 +-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5838 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5839 + {
5840 + struct tcp_sock *tp = tcp_sk(sk);
5841 + struct sk_buff *skb;
5842 +@@ -3270,7 +3358,7 @@ void tcp_send_probe0(struct sock *sk)
5843 + struct tcp_sock *tp = tcp_sk(sk);
5844 + int err;
5845 +
5846 +- err = tcp_write_wakeup(sk);
5847 ++ err = tp->ops->write_wakeup(sk);
5848 +
5849 + if (tp->packets_out || !tcp_send_head(sk)) {
5850 + /* Cancel probe timer, if it is not required. */
5851 +@@ -3301,3 +3389,18 @@ void tcp_send_probe0(struct sock *sk)
5852 + TCP_RTO_MAX);
5853 + }
5854 + }
5855 ++
5856 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
5857 ++{
5858 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
5859 ++ struct flowi fl;
5860 ++ int res;
5861 ++
5862 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
5863 ++ if (!res) {
5864 ++ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
5865 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
5866 ++ }
5867 ++ return res;
5868 ++}
5869 ++EXPORT_SYMBOL(tcp_rtx_synack);
5870 +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
5871 +index 286227abed10..966b873cbf3e 100644
5872 +--- a/net/ipv4/tcp_timer.c
5873 ++++ b/net/ipv4/tcp_timer.c
5874 +@@ -20,6 +20,7 @@
5875 +
5876 + #include <linux/module.h>
5877 + #include <linux/gfp.h>
5878 ++#include <net/mptcp.h>
5879 + #include <net/tcp.h>
5880 +
5881 + int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
5882 +@@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
5883 + int sysctl_tcp_orphan_retries __read_mostly;
5884 + int sysctl_tcp_thin_linear_timeouts __read_mostly;
5885 +
5886 +-static void tcp_write_err(struct sock *sk)
5887 ++void tcp_write_err(struct sock *sk)
5888 + {
5889 + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
5890 + sk->sk_error_report(sk);
5891 +@@ -74,7 +75,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
5892 + (!tp->snd_wnd && !tp->packets_out))
5893 + do_reset = 1;
5894 + if (do_reset)
5895 +- tcp_send_active_reset(sk, GFP_ATOMIC);
5896 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
5897 + tcp_done(sk);
5898 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
5899 + return 1;
5900 +@@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
5901 + * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
5902 + * syn_set flag is set.
5903 + */
5904 +-static bool retransmits_timed_out(struct sock *sk,
5905 +- unsigned int boundary,
5906 +- unsigned int timeout,
5907 +- bool syn_set)
5908 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
5909 ++ unsigned int timeout, bool syn_set)
5910 + {
5911 + unsigned int linear_backoff_thresh, start_ts;
5912 + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
5913 +@@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk,
5914 + }
5915 +
5916 + /* A write timeout has occurred. Process the after effects. */
5917 +-static int tcp_write_timeout(struct sock *sk)
5918 ++int tcp_write_timeout(struct sock *sk)
5919 + {
5920 + struct inet_connection_sock *icsk = inet_csk(sk);
5921 + struct tcp_sock *tp = tcp_sk(sk);
5922 +@@ -171,6 +170,10 @@ static int tcp_write_timeout(struct sock *sk)
5923 + }
5924 + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
5925 + syn_set = true;
5926 ++ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
5927 ++ if (tcp_sk(sk)->request_mptcp &&
5928 ++ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
5929 ++ tcp_sk(sk)->request_mptcp = 0;
5930 + } else {
5931 + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
5932 + /* Black hole detection */
5933 +@@ -251,18 +254,22 @@ out:
5934 + static void tcp_delack_timer(unsigned long data)
5935 + {
5936 + struct sock *sk = (struct sock *)data;
5937 ++ struct tcp_sock *tp = tcp_sk(sk);
5938 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5939 +
5940 +- bh_lock_sock(sk);
5941 +- if (!sock_owned_by_user(sk)) {
5942 ++ bh_lock_sock(meta_sk);
5943 ++ if (!sock_owned_by_user(meta_sk)) {
5944 + tcp_delack_timer_handler(sk);
5945 + } else {
5946 + inet_csk(sk)->icsk_ack.blocked = 1;
5947 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
5948 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
5949 +		/* delegate our work to tcp_release_cb() */
5950 + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5951 + sock_hold(sk);
5952 ++ if (mptcp(tp))
5953 ++ mptcp_tsq_flags(sk);
5954 + }
5955 +- bh_unlock_sock(sk);
5956 ++ bh_unlock_sock(meta_sk);
5957 + sock_put(sk);
5958 + }
5959 +
5960 +@@ -479,6 +486,10 @@ out_reset_timer:
5961 + __sk_dst_reset(sk);
5962 +
5963 + out:;
5964 ++ if (mptcp(tp)) {
5965 ++ mptcp_reinject_data(sk, 1);
5966 ++ mptcp_set_rto(sk);
5967 ++ }
5968 + }
5969 +
5970 + void tcp_write_timer_handler(struct sock *sk)
5971 +@@ -505,7 +516,7 @@ void tcp_write_timer_handler(struct sock *sk)
5972 + break;
5973 + case ICSK_TIME_RETRANS:
5974 + icsk->icsk_pending = 0;
5975 +- tcp_retransmit_timer(sk);
5976 ++ tcp_sk(sk)->ops->retransmit_timer(sk);
5977 + break;
5978 + case ICSK_TIME_PROBE0:
5979 + icsk->icsk_pending = 0;
5980 +@@ -520,16 +531,19 @@ out:
5981 + static void tcp_write_timer(unsigned long data)
5982 + {
5983 + struct sock *sk = (struct sock *)data;
5984 ++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
5985 +
5986 +- bh_lock_sock(sk);
5987 +- if (!sock_owned_by_user(sk)) {
5988 ++ bh_lock_sock(meta_sk);
5989 ++ if (!sock_owned_by_user(meta_sk)) {
5990 + tcp_write_timer_handler(sk);
5991 + } else {
5992 +		/* delegate our work to tcp_release_cb() */
5993 + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5994 + sock_hold(sk);
5995 ++ if (mptcp(tcp_sk(sk)))
5996 ++ mptcp_tsq_flags(sk);
5997 + }
5998 +- bh_unlock_sock(sk);
5999 ++ bh_unlock_sock(meta_sk);
6000 + sock_put(sk);
6001 + }
6002 +
6003 +@@ -566,11 +580,12 @@ static void tcp_keepalive_timer (unsigned long data)
6004 + struct sock *sk = (struct sock *) data;
6005 + struct inet_connection_sock *icsk = inet_csk(sk);
6006 + struct tcp_sock *tp = tcp_sk(sk);
6007 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
6008 + u32 elapsed;
6009 +
6010 + /* Only process if socket is not in use. */
6011 +- bh_lock_sock(sk);
6012 +- if (sock_owned_by_user(sk)) {
6013 ++ bh_lock_sock(meta_sk);
6014 ++ if (sock_owned_by_user(meta_sk)) {
6015 + /* Try again later. */
6016 + inet_csk_reset_keepalive_timer (sk, HZ/20);
6017 + goto out;
6018 +@@ -581,16 +596,38 @@ static void tcp_keepalive_timer (unsigned long data)
6019 + goto out;
6020 + }
6021 +
6022 ++ if (tp->send_mp_fclose) {
6023 ++ /* MUST do this before tcp_write_timeout, because retrans_stamp
6024 ++ * may have been set to 0 in another part while we are
6025 ++ * retransmitting MP_FASTCLOSE. Then, we would crash, because
6026 ++ * retransmits_timed_out accesses the meta-write-queue.
6027 ++ *
6028 ++ * We make sure that the timestamp is != 0.
6029 ++ */
6030 ++ if (!tp->retrans_stamp)
6031 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
6032 ++
6033 ++ if (tcp_write_timeout(sk))
6034 ++ goto out;
6035 ++
6036 ++ tcp_send_ack(sk);
6037 ++ icsk->icsk_retransmits++;
6038 ++
6039 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
6040 ++ elapsed = icsk->icsk_rto;
6041 ++ goto resched;
6042 ++ }
6043 ++
6044 + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
6045 + if (tp->linger2 >= 0) {
6046 + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
6047 +
6048 + if (tmo > 0) {
6049 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
6050 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
6051 + goto out;
6052 + }
6053 + }
6054 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6055 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6056 + goto death;
6057 + }
6058 +
6059 +@@ -614,11 +651,11 @@ static void tcp_keepalive_timer (unsigned long data)
6060 + icsk->icsk_probes_out > 0) ||
6061 + (icsk->icsk_user_timeout == 0 &&
6062 + icsk->icsk_probes_out >= keepalive_probes(tp))) {
6063 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6064 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6065 + tcp_write_err(sk);
6066 + goto out;
6067 + }
6068 +- if (tcp_write_wakeup(sk) <= 0) {
6069 ++ if (tp->ops->write_wakeup(sk) <= 0) {
6070 + icsk->icsk_probes_out++;
6071 + elapsed = keepalive_intvl_when(tp);
6072 + } else {
6073 +@@ -642,7 +679,7 @@ death:
6074 + tcp_done(sk);
6075 +
6076 + out:
6077 +- bh_unlock_sock(sk);
6078 ++ bh_unlock_sock(meta_sk);
6079 + sock_put(sk);
6080 + }
6081 +
6082 +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
6083 +index 5667b3003af9..7139c2973fd2 100644
6084 +--- a/net/ipv6/addrconf.c
6085 ++++ b/net/ipv6/addrconf.c
6086 +@@ -760,6 +760,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
6087 +
6088 + kfree_rcu(ifp, rcu);
6089 + }
6090 ++EXPORT_SYMBOL(inet6_ifa_finish_destroy);
6091 +
6092 + static void
6093 + ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
6094 +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
6095 +index 7cb4392690dd..7057afbca4df 100644
6096 +--- a/net/ipv6/af_inet6.c
6097 ++++ b/net/ipv6/af_inet6.c
6098 +@@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
6099 + return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
6100 + }
6101 +
6102 +-static int inet6_create(struct net *net, struct socket *sock, int protocol,
6103 +- int kern)
6104 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
6105 + {
6106 + struct inet_sock *inet;
6107 + struct ipv6_pinfo *np;
6108 +diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
6109 +index a245e5ddffbd..99c892b8992d 100644
6110 +--- a/net/ipv6/inet6_connection_sock.c
6111 ++++ b/net/ipv6/inet6_connection_sock.c
6112 +@@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
6113 + /*
6114 + * request_sock (formerly open request) hash tables.
6115 + */
6116 +-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6117 +- const u32 rnd, const u32 synq_hsize)
6118 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6119 ++ const u32 rnd, const u32 synq_hsize)
6120 + {
6121 + u32 c;
6122 +
6123 +diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
6124 +index edb58aff4ae7..ea4d9fda0927 100644
6125 +--- a/net/ipv6/ipv6_sockglue.c
6126 ++++ b/net/ipv6/ipv6_sockglue.c
6127 +@@ -48,6 +48,8 @@
6128 + #include <net/addrconf.h>
6129 + #include <net/inet_common.h>
6130 + #include <net/tcp.h>
6131 ++#include <net/mptcp.h>
6132 ++#include <net/mptcp_v4.h>
6133 + #include <net/udp.h>
6134 + #include <net/udplite.h>
6135 + #include <net/xfrm.h>
6136 +@@ -196,7 +198,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
6137 + sock_prot_inuse_add(net, &tcp_prot, 1);
6138 + local_bh_enable();
6139 + sk->sk_prot = &tcp_prot;
6140 +- icsk->icsk_af_ops = &ipv4_specific;
6141 ++#ifdef CONFIG_MPTCP
6142 ++ if (is_mptcp_enabled(sk))
6143 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
6144 ++ else
6145 ++#endif
6146 ++ icsk->icsk_af_ops = &ipv4_specific;
6147 + sk->sk_socket->ops = &inet_stream_ops;
6148 + sk->sk_family = PF_INET;
6149 + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
6150 +diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
6151 +index a822b880689b..b2b38869d795 100644
6152 +--- a/net/ipv6/syncookies.c
6153 ++++ b/net/ipv6/syncookies.c
6154 +@@ -181,13 +181,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6155 +
6156 + /* check for timestamp cookie support */
6157 + memset(&tcp_opt, 0, sizeof(tcp_opt));
6158 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
6159 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
6160 +
6161 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
6162 + goto out;
6163 +
6164 + ret = NULL;
6165 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6166 ++ req = inet_reqsk_alloc(&tcp6_request_sock_ops);
6167 + if (!req)
6168 + goto out;
6169 +
6170 +@@ -255,10 +255,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6171 + }
6172 +
6173 + req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
6174 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
6175 +- &req->rcv_wnd, &req->window_clamp,
6176 +- ireq->wscale_ok, &rcv_wscale,
6177 +- dst_metric(dst, RTAX_INITRWND));
6178 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
6179 ++ &req->rcv_wnd, &req->window_clamp,
6180 ++ ireq->wscale_ok, &rcv_wscale,
6181 ++ dst_metric(dst, RTAX_INITRWND), sk);
6182 +
6183 + ireq->rcv_wscale = rcv_wscale;
6184 +
6185 +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
6186 +index 229239ad96b1..fda94d71666e 100644
6187 +--- a/net/ipv6/tcp_ipv6.c
6188 ++++ b/net/ipv6/tcp_ipv6.c
6189 +@@ -63,6 +63,8 @@
6190 + #include <net/inet_common.h>
6191 + #include <net/secure_seq.h>
6192 + #include <net/tcp_memcontrol.h>
6193 ++#include <net/mptcp.h>
6194 ++#include <net/mptcp_v6.h>
6195 + #include <net/busy_poll.h>
6196 +
6197 + #include <linux/proc_fs.h>
6198 +@@ -71,12 +73,6 @@
6199 + #include <linux/crypto.h>
6200 + #include <linux/scatterlist.h>
6201 +
6202 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
6203 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6204 +- struct request_sock *req);
6205 +-
6206 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
6207 +-
6208 + static const struct inet_connection_sock_af_ops ipv6_mapped;
6209 + static const struct inet_connection_sock_af_ops ipv6_specific;
6210 + #ifdef CONFIG_TCP_MD5SIG
6211 +@@ -90,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
6212 + }
6213 + #endif
6214 +
6215 +-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6216 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6217 + {
6218 + struct dst_entry *dst = skb_dst(skb);
6219 + const struct rt6_info *rt = (const struct rt6_info *)dst;
6220 +@@ -102,10 +98,11 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6221 + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
6222 + }
6223 +
6224 +-static void tcp_v6_hash(struct sock *sk)
6225 ++void tcp_v6_hash(struct sock *sk)
6226 + {
6227 + if (sk->sk_state != TCP_CLOSE) {
6228 +- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
6229 ++ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped ||
6230 ++ inet_csk(sk)->icsk_af_ops == &mptcp_v6_mapped) {
6231 + tcp_prot.hash(sk);
6232 + return;
6233 + }
6234 +@@ -115,7 +112,7 @@ static void tcp_v6_hash(struct sock *sk)
6235 + }
6236 + }
6237 +
6238 +-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6239 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6240 + {
6241 + return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
6242 + ipv6_hdr(skb)->saddr.s6_addr32,
6243 +@@ -123,7 +120,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6244 + tcp_hdr(skb)->source);
6245 + }
6246 +
6247 +-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6248 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6249 + int addr_len)
6250 + {
6251 + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
6252 +@@ -215,7 +212,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6253 + sin.sin_port = usin->sin6_port;
6254 + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
6255 +
6256 +- icsk->icsk_af_ops = &ipv6_mapped;
6257 ++#ifdef CONFIG_MPTCP
6258 ++ if (is_mptcp_enabled(sk))
6259 ++ icsk->icsk_af_ops = &mptcp_v6_mapped;
6260 ++ else
6261 ++#endif
6262 ++ icsk->icsk_af_ops = &ipv6_mapped;
6263 + sk->sk_backlog_rcv = tcp_v4_do_rcv;
6264 + #ifdef CONFIG_TCP_MD5SIG
6265 + tp->af_specific = &tcp_sock_ipv6_mapped_specific;
6266 +@@ -225,7 +227,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6267 +
6268 + if (err) {
6269 + icsk->icsk_ext_hdr_len = exthdrlen;
6270 +- icsk->icsk_af_ops = &ipv6_specific;
6271 ++#ifdef CONFIG_MPTCP
6272 ++ if (is_mptcp_enabled(sk))
6273 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6274 ++ else
6275 ++#endif
6276 ++ icsk->icsk_af_ops = &ipv6_specific;
6277 + sk->sk_backlog_rcv = tcp_v6_do_rcv;
6278 + #ifdef CONFIG_TCP_MD5SIG
6279 + tp->af_specific = &tcp_sock_ipv6_specific;
6280 +@@ -337,7 +344,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6281 + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
6282 + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
6283 + struct ipv6_pinfo *np;
6284 +- struct sock *sk;
6285 ++ struct sock *sk, *meta_sk;
6286 + int err;
6287 + struct tcp_sock *tp;
6288 + struct request_sock *fastopen;
6289 +@@ -358,8 +365,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6290 + return;
6291 + }
6292 +
6293 +- bh_lock_sock(sk);
6294 +- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
6295 ++ tp = tcp_sk(sk);
6296 ++ if (mptcp(tp))
6297 ++ meta_sk = mptcp_meta_sk(sk);
6298 ++ else
6299 ++ meta_sk = sk;
6300 ++
6301 ++ bh_lock_sock(meta_sk);
6302 ++ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
6303 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
6304 +
6305 + if (sk->sk_state == TCP_CLOSE)
6306 +@@ -370,7 +383,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6307 + goto out;
6308 + }
6309 +
6310 +- tp = tcp_sk(sk);
6311 + seq = ntohl(th->seq);
6312 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
6313 + fastopen = tp->fastopen_rsk;
6314 +@@ -403,11 +415,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6315 + goto out;
6316 +
6317 + tp->mtu_info = ntohl(info);
6318 +- if (!sock_owned_by_user(sk))
6319 ++ if (!sock_owned_by_user(meta_sk))
6320 + tcp_v6_mtu_reduced(sk);
6321 +- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6322 ++ else {
6323 ++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6324 + &tp->tsq_flags))
6325 +- sock_hold(sk);
6326 ++ sock_hold(sk);
6327 ++ if (mptcp(tp))
6328 ++ mptcp_tsq_flags(sk);
6329 ++ }
6330 + goto out;
6331 + }
6332 +
6333 +@@ -417,7 +433,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6334 + switch (sk->sk_state) {
6335 + struct request_sock *req, **prev;
6336 + case TCP_LISTEN:
6337 +- if (sock_owned_by_user(sk))
6338 ++ if (sock_owned_by_user(meta_sk))
6339 + goto out;
6340 +
6341 + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
6342 +@@ -447,7 +463,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6343 + if (fastopen && fastopen->sk == NULL)
6344 + break;
6345 +
6346 +- if (!sock_owned_by_user(sk)) {
6347 ++ if (!sock_owned_by_user(meta_sk)) {
6348 + sk->sk_err = err;
6349 + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
6350 +
6351 +@@ -457,26 +473,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6352 + goto out;
6353 + }
6354 +
6355 +- if (!sock_owned_by_user(sk) && np->recverr) {
6356 ++ if (!sock_owned_by_user(meta_sk) && np->recverr) {
6357 + sk->sk_err = err;
6358 + sk->sk_error_report(sk);
6359 + } else
6360 + sk->sk_err_soft = err;
6361 +
6362 + out:
6363 +- bh_unlock_sock(sk);
6364 ++ bh_unlock_sock(meta_sk);
6365 + sock_put(sk);
6366 + }
6367 +
6368 +
6369 +-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6370 +- struct flowi6 *fl6,
6371 +- struct request_sock *req,
6372 +- u16 queue_mapping,
6373 +- struct tcp_fastopen_cookie *foc)
6374 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6375 ++ struct flowi *fl,
6376 ++ struct request_sock *req,
6377 ++ u16 queue_mapping,
6378 ++ struct tcp_fastopen_cookie *foc)
6379 + {
6380 + struct inet_request_sock *ireq = inet_rsk(req);
6381 + struct ipv6_pinfo *np = inet6_sk(sk);
6382 ++ struct flowi6 *fl6 = &fl->u.ip6;
6383 + struct sk_buff *skb;
6384 + int err = -ENOMEM;
6385 +
6386 +@@ -497,18 +514,21 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6387 + skb_set_queue_mapping(skb, queue_mapping);
6388 + err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
6389 + err = net_xmit_eval(err);
6390 ++ if (!tcp_rsk(req)->snt_synack && !err)
6391 ++ tcp_rsk(req)->snt_synack = tcp_time_stamp;
6392 + }
6393 +
6394 + done:
6395 + return err;
6396 + }
6397 +
6398 +-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6399 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6400 + {
6401 +- struct flowi6 fl6;
6402 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
6403 ++ struct flowi fl;
6404 + int res;
6405 +
6406 +- res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL);
6407 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
6408 + if (!res) {
6409 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
6410 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
6411 +@@ -516,7 +536,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6412 + return res;
6413 + }
6414 +
6415 +-static void tcp_v6_reqsk_destructor(struct request_sock *req)
6416 ++void tcp_v6_reqsk_destructor(struct request_sock *req)
6417 + {
6418 + kfree_skb(inet_rsk(req)->pktopts);
6419 + }
6420 +@@ -718,27 +738,74 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
6421 + }
6422 + #endif
6423 +
6424 ++static int tcp_v6_init_req(struct request_sock *req, struct sock *sk,
6425 ++ struct sk_buff *skb)
6426 ++{
6427 ++ struct inet_request_sock *ireq = inet_rsk(req);
6428 ++ struct ipv6_pinfo *np = inet6_sk(sk);
6429 ++
6430 ++ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6431 ++ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6432 ++
6433 ++ ireq->ir_iif = sk->sk_bound_dev_if;
6434 ++ ireq->ir_mark = inet_request_mark(sk, skb);
6435 ++
6436 ++ /* So that link locals have meaning */
6437 ++ if (!sk->sk_bound_dev_if &&
6438 ++ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6439 ++ ireq->ir_iif = inet6_iif(skb);
6440 ++
6441 ++ if (!TCP_SKB_CB(skb)->when &&
6442 ++ (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo ||
6443 ++ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
6444 ++ np->rxopt.bits.rxohlim || np->repflow)) {
6445 ++ atomic_inc(&skb->users);
6446 ++ ireq->pktopts = skb;
6447 ++ }
6448 ++
6449 ++ return 0;
6450 ++}
6451 ++
6452 ++static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
6453 ++ const struct request_sock *req,
6454 ++ bool *strict)
6455 ++{
6456 ++ if (strict)
6457 ++ *strict = true;
6458 ++ return inet6_csk_route_req(sk, &fl->u.ip6, req);
6459 ++}
6460 ++
6461 + struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
6462 + .family = AF_INET6,
6463 + .obj_size = sizeof(struct tcp6_request_sock),
6464 +- .rtx_syn_ack = tcp_v6_rtx_synack,
6465 ++ .rtx_syn_ack = tcp_rtx_synack,
6466 + .send_ack = tcp_v6_reqsk_send_ack,
6467 + .destructor = tcp_v6_reqsk_destructor,
6468 + .send_reset = tcp_v6_send_reset,
6469 + .syn_ack_timeout = tcp_syn_ack_timeout,
6470 + };
6471 +
6472 ++const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6473 ++ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
6474 ++ sizeof(struct ipv6hdr),
6475 + #ifdef CONFIG_TCP_MD5SIG
6476 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6477 + .md5_lookup = tcp_v6_reqsk_md5_lookup,
6478 + .calc_md5_hash = tcp_v6_md5_hash_skb,
6479 +-};
6480 + #endif
6481 ++ .init_req = tcp_v6_init_req,
6482 ++#ifdef CONFIG_SYN_COOKIES
6483 ++ .cookie_init_seq = cookie_v6_init_sequence,
6484 ++#endif
6485 ++ .route_req = tcp_v6_route_req,
6486 ++ .init_seq = tcp_v6_init_sequence,
6487 ++ .send_synack = tcp_v6_send_synack,
6488 ++ .queue_hash_add = inet6_csk_reqsk_queue_hash_add,
6489 ++};
6490 +
6491 +-static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6492 +- u32 tsval, u32 tsecr, int oif,
6493 +- struct tcp_md5sig_key *key, int rst, u8 tclass,
6494 +- u32 label)
6495 ++static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
6496 ++ u32 data_ack, u32 win, u32 tsval, u32 tsecr,
6497 ++ int oif, struct tcp_md5sig_key *key, int rst,
6498 ++ u8 tclass, u32 label, int mptcp)
6499 + {
6500 + const struct tcphdr *th = tcp_hdr(skb);
6501 + struct tcphdr *t1;
6502 +@@ -756,7 +823,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6503 + if (key)
6504 + tot_len += TCPOLEN_MD5SIG_ALIGNED;
6505 + #endif
6506 +-
6507 ++#ifdef CONFIG_MPTCP
6508 ++ if (mptcp)
6509 ++ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
6510 ++#endif
6511 + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
6512 + GFP_ATOMIC);
6513 + if (buff == NULL)
6514 +@@ -794,6 +864,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6515 + tcp_v6_md5_hash_hdr((__u8 *)topt, key,
6516 + &ipv6_hdr(skb)->saddr,
6517 + &ipv6_hdr(skb)->daddr, t1);
6518 ++ topt += 4;
6519 ++ }
6520 ++#endif
6521 ++#ifdef CONFIG_MPTCP
6522 ++ if (mptcp) {
6523 ++ /* Construction of 32-bit data_ack */
6524 ++ *topt++ = htonl((TCPOPT_MPTCP << 24) |
6525 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
6526 ++ (0x20 << 8) |
6527 ++ (0x01));
6528 ++ *topt++ = htonl(data_ack);
6529 + }
6530 + #endif
6531 +
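The 32-bit word written just above is the MPTCP option header for a bare 32-bit DATA_ACK: the TCP option kind (TCPOPT_MPTCP) in the top byte, the option length in the second byte, the DSS subtype in the upper nibble of the third byte, and the 'A' flag (32-bit data ACK present) in the bottom byte, followed by the data_ack value itself. A minimal userspace sketch of the same packing, with the kernel macros replaced by values assumed from RFC 6824 (option kind 30, DSS header plus 32-bit data ACK totalling 8 bytes), not taken from the patched headers:

/* Illustration only: build and decompose the DSS header word that
 * tcp_v6_send_response() emits for a pure data ACK.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TCPOPT_MPTCP       30  /* assumed: IANA TCP option kind for MPTCP */
#define MPTCP_SUB_LEN_DSS   4  /* assumed: DSS base header length */
#define MPTCP_SUB_LEN_ACK   4  /* assumed: 32-bit data ACK length */

int main(void)
{
	uint32_t data_ack = 0x11223344;         /* hypothetical data-level ACK */
	uint32_t word = htonl((TCPOPT_MPTCP << 24) |
			      ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
			      (0x20 << 8) |     /* subtype 2 (DSS) in the upper nibble */
			      (0x01));          /* 'A' flag: 32-bit data ACK present */
	const uint8_t *b = (const uint8_t *)&word;

	printf("kind=%u len=%u subtype=%u flags=0x%02x data_ack=0x%08x\n",
	       b[0], b[1], b[2] >> 4, b[3], data_ack);
	return 0;
}

This also explains the "topt += 4" added in the MD5 branch above: the MD5 hash has to be skipped before the two MPTCP words are appended to the option area.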
6532 +@@ -834,7 +915,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6533 + kfree_skb(buff);
6534 + }
6535 +
6536 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6537 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6538 + {
6539 + const struct tcphdr *th = tcp_hdr(skb);
6540 + u32 seq = 0, ack_seq = 0;
6541 +@@ -891,7 +972,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6542 + (th->doff << 2);
6543 +
6544 + oif = sk ? sk->sk_bound_dev_if : 0;
6545 +- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
6546 ++ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0);
6547 +
6548 + #ifdef CONFIG_TCP_MD5SIG
6549 + release_sk1:
6550 +@@ -902,45 +983,52 @@ release_sk1:
6551 + #endif
6552 + }
6553 +
6554 +-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
6555 ++static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
6556 + u32 win, u32 tsval, u32 tsecr, int oif,
6557 + struct tcp_md5sig_key *key, u8 tclass,
6558 +- u32 label)
6559 ++ u32 label, int mptcp)
6560 + {
6561 +- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass,
6562 +- label);
6563 ++ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, oif,
6564 ++ key, 0, tclass, label, mptcp);
6565 + }
6566 +
6567 + static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
6568 + {
6569 + struct inet_timewait_sock *tw = inet_twsk(sk);
6570 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
6571 ++ u32 data_ack = 0;
6572 ++ int mptcp = 0;
6573 +
6574 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
6575 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
6576 ++ mptcp = 1;
6577 ++ }
6578 + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
6579 ++ data_ack,
6580 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
6581 + tcp_time_stamp + tcptw->tw_ts_offset,
6582 + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
6583 +- tw->tw_tclass, (tw->tw_flowlabel << 12));
6584 ++ tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp);
6585 +
6586 + inet_twsk_put(tw);
6587 + }
6588 +
6589 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6590 +- struct request_sock *req)
6591 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6592 ++ struct request_sock *req)
6593 + {
6594 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
6595 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
6596 + */
6597 + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
6598 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
6599 +- tcp_rsk(req)->rcv_nxt,
6600 ++ tcp_rsk(req)->rcv_nxt, 0,
6601 + req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
6602 + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
6603 +- 0, 0);
6604 ++ 0, 0, 0);
6605 + }
6606 +
6607 +
6608 +-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6609 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6610 + {
6611 + struct request_sock *req, **prev;
6612 + const struct tcphdr *th = tcp_hdr(skb);
6613 +@@ -959,7 +1047,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6614 +
6615 + if (nsk) {
6616 + if (nsk->sk_state != TCP_TIME_WAIT) {
6617 ++ /* Don't lock again the meta-sk. It has been locked
6618 ++ * before mptcp_v6_do_rcv.
6619 ++ */
6620 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
6621 ++ bh_lock_sock(mptcp_meta_sk(nsk));
6622 + bh_lock_sock(nsk);
6623 ++
6624 + return nsk;
6625 + }
6626 + inet_twsk_put(inet_twsk(nsk));
6627 +@@ -973,161 +1067,25 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6628 + return sk;
6629 + }
6630 +
6631 +-/* FIXME: this is substantially similar to the ipv4 code.
6632 +- * Can some kind of merge be done? -- erics
6633 +- */
6634 +-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6635 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6636 + {
6637 +- struct tcp_options_received tmp_opt;
6638 +- struct request_sock *req;
6639 +- struct inet_request_sock *ireq;
6640 +- struct ipv6_pinfo *np = inet6_sk(sk);
6641 +- struct tcp_sock *tp = tcp_sk(sk);
6642 +- __u32 isn = TCP_SKB_CB(skb)->when;
6643 +- struct dst_entry *dst = NULL;
6644 +- struct tcp_fastopen_cookie foc = { .len = -1 };
6645 +- bool want_cookie = false, fastopen;
6646 +- struct flowi6 fl6;
6647 +- int err;
6648 +-
6649 + if (skb->protocol == htons(ETH_P_IP))
6650 + return tcp_v4_conn_request(sk, skb);
6651 +
6652 + if (!ipv6_unicast_destination(skb))
6653 + goto drop;
6654 +
6655 +- if ((sysctl_tcp_syncookies == 2 ||
6656 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6657 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
6658 +- if (!want_cookie)
6659 +- goto drop;
6660 +- }
6661 +-
6662 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6663 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6664 +- goto drop;
6665 +- }
6666 +-
6667 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6668 +- if (req == NULL)
6669 +- goto drop;
6670 +-
6671 +-#ifdef CONFIG_TCP_MD5SIG
6672 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
6673 +-#endif
6674 +-
6675 +- tcp_clear_options(&tmp_opt);
6676 +- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
6677 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
6678 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
6679 +-
6680 +- if (want_cookie && !tmp_opt.saw_tstamp)
6681 +- tcp_clear_options(&tmp_opt);
6682 ++ return tcp_conn_request(&tcp6_request_sock_ops,
6683 ++ &tcp_request_sock_ipv6_ops, sk, skb);
6684 +
6685 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6686 +- tcp_openreq_init(req, &tmp_opt, skb);
6687 +-
6688 +- ireq = inet_rsk(req);
6689 +- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6690 +- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6691 +- if (!want_cookie || tmp_opt.tstamp_ok)
6692 +- TCP_ECN_create_request(req, skb, sock_net(sk));
6693 +-
6694 +- ireq->ir_iif = sk->sk_bound_dev_if;
6695 +- ireq->ir_mark = inet_request_mark(sk, skb);
6696 +-
6697 +- /* So that link locals have meaning */
6698 +- if (!sk->sk_bound_dev_if &&
6699 +- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6700 +- ireq->ir_iif = inet6_iif(skb);
6701 +-
6702 +- if (!isn) {
6703 +- if (ipv6_opt_accepted(sk, skb) ||
6704 +- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
6705 +- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim ||
6706 +- np->repflow) {
6707 +- atomic_inc(&skb->users);
6708 +- ireq->pktopts = skb;
6709 +- }
6710 +-
6711 +- if (want_cookie) {
6712 +- isn = cookie_v6_init_sequence(sk, skb, &req->mss);
6713 +- req->cookie_ts = tmp_opt.tstamp_ok;
6714 +- goto have_isn;
6715 +- }
6716 +-
6717 +- /* VJ's idea. We save last timestamp seen
6718 +- * from the destination in peer table, when entering
6719 +- * state TIME-WAIT, and check against it before
6720 +- * accepting new connection request.
6721 +- *
6722 +- * If "isn" is not zero, this request hit alive
6723 +- * timewait bucket, so that all the necessary checks
6724 +- * are made in the function processing timewait state.
6725 +- */
6726 +- if (tmp_opt.saw_tstamp &&
6727 +- tcp_death_row.sysctl_tw_recycle &&
6728 +- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
6729 +- if (!tcp_peer_is_proven(req, dst, true)) {
6730 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
6731 +- goto drop_and_release;
6732 +- }
6733 +- }
6734 +- /* Kill the following clause, if you dislike this way. */
6735 +- else if (!sysctl_tcp_syncookies &&
6736 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6737 +- (sysctl_max_syn_backlog >> 2)) &&
6738 +- !tcp_peer_is_proven(req, dst, false)) {
6739 +- /* Without syncookies last quarter of
6740 +- * backlog is filled with destinations,
6741 +- * proven to be alive.
6742 +- * It means that we continue to communicate
6743 +- * to destinations, already remembered
6744 +- * to the moment of synflood.
6745 +- */
6746 +- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
6747 +- &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source));
6748 +- goto drop_and_release;
6749 +- }
6750 +-
6751 +- isn = tcp_v6_init_sequence(skb);
6752 +- }
6753 +-have_isn:
6754 +-
6755 +- if (security_inet_conn_request(sk, skb, req))
6756 +- goto drop_and_release;
6757 +-
6758 +- if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL)
6759 +- goto drop_and_free;
6760 +-
6761 +- tcp_rsk(req)->snt_isn = isn;
6762 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
6763 +- tcp_openreq_init_rwin(req, sk, dst);
6764 +- fastopen = !want_cookie &&
6765 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
6766 +- err = tcp_v6_send_synack(sk, dst, &fl6, req,
6767 +- skb_get_queue_mapping(skb), &foc);
6768 +- if (!fastopen) {
6769 +- if (err || want_cookie)
6770 +- goto drop_and_free;
6771 +-
6772 +- tcp_rsk(req)->listener = NULL;
6773 +- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6774 +- }
6775 +- return 0;
6776 +-
6777 +-drop_and_release:
6778 +- dst_release(dst);
6779 +-drop_and_free:
6780 +- reqsk_free(req);
6781 + drop:
6782 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6783 + return 0; /* don't send reset */
6784 + }
6785 +
6786 +-static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6787 +- struct request_sock *req,
6788 +- struct dst_entry *dst)
6789 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6790 ++ struct request_sock *req,
6791 ++ struct dst_entry *dst)
6792 + {
6793 + struct inet_request_sock *ireq;
6794 + struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
6795 +@@ -1165,7 +1123,12 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6796 +
6797 + newsk->sk_v6_rcv_saddr = newnp->saddr;
6798 +
6799 +- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6800 ++#ifdef CONFIG_MPTCP
6801 ++ if (is_mptcp_enabled(newsk))
6802 ++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
6803 ++ else
6804 ++#endif
6805 ++ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6806 + newsk->sk_backlog_rcv = tcp_v4_do_rcv;
6807 + #ifdef CONFIG_TCP_MD5SIG
6808 + newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
6809 +@@ -1329,7 +1292,7 @@ out:
6810 + * This is because we cannot sleep with the original spinlock
6811 + * held.
6812 + */
6813 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6814 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6815 + {
6816 + struct ipv6_pinfo *np = inet6_sk(sk);
6817 + struct tcp_sock *tp;
6818 +@@ -1351,6 +1314,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6819 + goto discard;
6820 + #endif
6821 +
6822 ++ if (is_meta_sk(sk))
6823 ++ return mptcp_v6_do_rcv(sk, skb);
6824 ++
6825 + if (sk_filter(sk, skb))
6826 + goto discard;
6827 +
6828 +@@ -1472,7 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6829 + {
6830 + const struct tcphdr *th;
6831 + const struct ipv6hdr *hdr;
6832 +- struct sock *sk;
6833 ++ struct sock *sk, *meta_sk = NULL;
6834 + int ret;
6835 + struct net *net = dev_net(skb->dev);
6836 +
6837 +@@ -1503,18 +1469,43 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6838 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
6839 + skb->len - th->doff*4);
6840 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
6841 ++#ifdef CONFIG_MPTCP
6842 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
6843 ++ TCP_SKB_CB(skb)->dss_off = 0;
6844 ++#endif
6845 + TCP_SKB_CB(skb)->when = 0;
6846 + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
6847 + TCP_SKB_CB(skb)->sacked = 0;
6848 +
6849 + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
6850 +- if (!sk)
6851 +- goto no_tcp_socket;
6852 +
6853 + process:
6854 +- if (sk->sk_state == TCP_TIME_WAIT)
6855 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
6856 + goto do_time_wait;
6857 +
6858 ++#ifdef CONFIG_MPTCP
6859 ++ if (!sk && th->syn && !th->ack) {
6860 ++ int ret = mptcp_lookup_join(skb, NULL);
6861 ++
6862 ++ if (ret < 0) {
6863 ++ tcp_v6_send_reset(NULL, skb);
6864 ++ goto discard_it;
6865 ++ } else if (ret > 0) {
6866 ++ return 0;
6867 ++ }
6868 ++ }
6869 ++
6870 ++ /* Is there a pending request sock for this segment ? */
6871 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
6872 ++ if (sk)
6873 ++ sock_put(sk);
6874 ++ return 0;
6875 ++ }
6876 ++#endif
6877 ++
6878 ++ if (!sk)
6879 ++ goto no_tcp_socket;
6880 ++
6881 + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
6882 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
6883 + goto discard_and_relse;
6884 +@@ -1529,11 +1520,21 @@ process:
6885 + sk_mark_napi_id(sk, skb);
6886 + skb->dev = NULL;
6887 +
6888 +- bh_lock_sock_nested(sk);
6889 ++ if (mptcp(tcp_sk(sk))) {
6890 ++ meta_sk = mptcp_meta_sk(sk);
6891 ++
6892 ++ bh_lock_sock_nested(meta_sk);
6893 ++ if (sock_owned_by_user(meta_sk))
6894 ++ skb->sk = sk;
6895 ++ } else {
6896 ++ meta_sk = sk;
6897 ++ bh_lock_sock_nested(sk);
6898 ++ }
6899 ++
6900 + ret = 0;
6901 +- if (!sock_owned_by_user(sk)) {
6902 ++ if (!sock_owned_by_user(meta_sk)) {
6903 + #ifdef CONFIG_NET_DMA
6904 +- struct tcp_sock *tp = tcp_sk(sk);
6905 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
6906 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
6907 + tp->ucopy.dma_chan = net_dma_find_channel();
6908 + if (tp->ucopy.dma_chan)
6909 +@@ -1541,16 +1542,17 @@ process:
6910 + else
6911 + #endif
6912 + {
6913 +- if (!tcp_prequeue(sk, skb))
6914 ++ if (!tcp_prequeue(meta_sk, skb))
6915 + ret = tcp_v6_do_rcv(sk, skb);
6916 + }
6917 +- } else if (unlikely(sk_add_backlog(sk, skb,
6918 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
6919 +- bh_unlock_sock(sk);
6920 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
6921 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
6922 ++ bh_unlock_sock(meta_sk);
6923 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6924 + goto discard_and_relse;
6925 + }
6926 +- bh_unlock_sock(sk);
6927 ++
6928 ++ bh_unlock_sock(meta_sk);
6929 +
6930 + sock_put(sk);
6931 + return ret ? -1 : 0;
6932 +@@ -1607,6 +1609,18 @@ do_time_wait:
6933 + sk = sk2;
6934 + goto process;
6935 + }
6936 ++#ifdef CONFIG_MPTCP
6937 ++ if (th->syn && !th->ack) {
6938 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
6939 ++
6940 ++ if (ret < 0) {
6941 ++ tcp_v6_send_reset(NULL, skb);
6942 ++ goto discard_it;
6943 ++ } else if (ret > 0) {
6944 ++ return 0;
6945 ++ }
6946 ++ }
6947 ++#endif
6948 + /* Fall through to ACK */
6949 + }
6950 + case TCP_TW_ACK:
6951 +@@ -1657,7 +1671,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
6952 + }
6953 + }
6954 +
6955 +-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
6956 ++struct timewait_sock_ops tcp6_timewait_sock_ops = {
6957 + .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
6958 + .twsk_unique = tcp_twsk_unique,
6959 + .twsk_destructor = tcp_twsk_destructor,
6960 +@@ -1730,7 +1744,12 @@ static int tcp_v6_init_sock(struct sock *sk)
6961 +
6962 + tcp_init_sock(sk);
6963 +
6964 +- icsk->icsk_af_ops = &ipv6_specific;
6965 ++#ifdef CONFIG_MPTCP
6966 ++ if (is_mptcp_enabled(sk))
6967 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6968 ++ else
6969 ++#endif
6970 ++ icsk->icsk_af_ops = &ipv6_specific;
6971 +
6972 + #ifdef CONFIG_TCP_MD5SIG
6973 + tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
6974 +@@ -1739,7 +1758,7 @@ static int tcp_v6_init_sock(struct sock *sk)
6975 + return 0;
6976 + }
6977 +
6978 +-static void tcp_v6_destroy_sock(struct sock *sk)
6979 ++void tcp_v6_destroy_sock(struct sock *sk)
6980 + {
6981 + tcp_v4_destroy_sock(sk);
6982 + inet6_destroy_sock(sk);
6983 +@@ -1924,12 +1943,28 @@ void tcp6_proc_exit(struct net *net)
6984 + static void tcp_v6_clear_sk(struct sock *sk, int size)
6985 + {
6986 + struct inet_sock *inet = inet_sk(sk);
6987 ++#ifdef CONFIG_MPTCP
6988 ++ struct tcp_sock *tp = tcp_sk(sk);
6989 ++ /* size_tk_table goes from the end of tk_table to the end of sk */
6990 ++ int size_tk_table = size - offsetof(struct tcp_sock, tk_table) -
6991 ++ sizeof(tp->tk_table);
6992 ++#endif
6993 +
6994 + /* we do not want to clear pinet6 field, because of RCU lookups */
6995 + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
6996 +
6997 + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
6998 ++
6999 ++#ifdef CONFIG_MPTCP
7000 ++ /* We zero out only from pinet6 to tk_table */
7001 ++ size -= size_tk_table + sizeof(tp->tk_table);
7002 ++#endif
7003 + memset(&inet->pinet6 + 1, 0, size);
7004 ++
7005 ++#ifdef CONFIG_MPTCP
7006 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size_tk_table);
7007 ++#endif
7008 ++
7009 + }
7010 +
7011 + struct proto tcpv6_prot = {
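The tcp_v6_clear_sk() hunk above zeroes the socket in two passes so that both pinet6 and, under MPTCP, the tk_table member survive the clear: the first memset() covers everything from the field after pinet6 up to tk_table, the second everything after tk_table to the end of the structure. A self-contained sketch of the same offsetof() arithmetic on a made-up structure (struct demo and its fields are hypothetical stand-ins, not kernel types):

/* Two-pass clear that preserves two embedded members, mirroring the
 * size/size_tk_table arithmetic in tcp_v6_clear_sk().
 */
#include <stdio.h>
#include <string.h>
#include <stddef.h>

struct demo {
	int	head;		/* part before the preserved pointer */
	void	*pinet6;	/* must survive (RCU lookups) */
	char	middle[16];	/* cleared by the first memset */
	long	tk_table;	/* must survive under MPTCP */
	char	tail[16];	/* cleared by the second memset */
};

static void demo_clear(struct demo *d, size_t size)
{
	/* bytes from the end of tk_table to the end of the structure */
	size_t size_tail = size - offsetof(struct demo, tk_table)
				- sizeof(d->tk_table);
	/* first pass: from just after pinet6 up to tk_table */
	size_t first = size - offsetof(struct demo, pinet6) - sizeof(d->pinet6)
			    - size_tail - sizeof(d->tk_table);

	memset(&d->pinet6 + 1, 0, first);
	/* second pass: everything after tk_table */
	memset((char *)&d->tk_table + sizeof(d->tk_table), 0, size_tail);
}

int main(void)
{
	struct demo d = { 1, (void *)&d, "mid", 42, "tail" };

	demo_clear(&d, sizeof(d));
	printf("pinet6=%p tk_table=%ld middle[0]=%d tail[0]=%d\n",
	       d.pinet6, d.tk_table, d.middle[0], d.tail[0]);
	return 0;
}

After demo_clear() runs, middle[] and tail[] are zeroed while pinet6 and tk_table keep their values, which is the invariant the kernel hunk preserves for RCU lookups (pinet6) and, presumably, the MPTCP token-hash linkage (tk_table).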
7012 +diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
7013 +new file mode 100644
7014 +index 000000000000..cdfc03adabf8
7015 +--- /dev/null
7016 ++++ b/net/mptcp/Kconfig
7017 +@@ -0,0 +1,115 @@
7018 ++#
7019 ++# MPTCP configuration
7020 ++#
7021 ++config MPTCP
7022 ++ bool "MPTCP protocol"
7023 ++ depends on (IPV6=y || IPV6=n)
7024 ++ ---help---
7025 ++ This replaces the normal TCP stack with a Multipath TCP stack,
7026 ++ able to use several paths at once.
7027 ++
7028 ++menuconfig MPTCP_PM_ADVANCED
7029 ++ bool "MPTCP: advanced path-manager control"
7030 ++ depends on MPTCP=y
7031 ++ ---help---
7032 ++	 Support for selection of different path-managers. Choose 'Y' here;
7033 ++	 otherwise no new MPTCP subflows will be actively created.
7034 ++
7035 ++if MPTCP_PM_ADVANCED
7036 ++
7037 ++config MPTCP_FULLMESH
7038 ++ tristate "MPTCP Full-Mesh Path-Manager"
7039 ++ depends on MPTCP=y
7040 ++ ---help---
7041 ++ This path-management module will create a full-mesh among all IP-addresses.
7042 ++
7043 ++config MPTCP_NDIFFPORTS
7044 ++ tristate "MPTCP ndiff-ports"
7045 ++ depends on MPTCP=y
7046 ++ ---help---
7047 ++ This path-management module will create multiple subflows between the same
7048 ++ pair of IP-addresses, modifying the source-port. You can set the number
7049 ++ of subflows via the mptcp_ndiffports-sysctl.
7050 ++
7051 ++config MPTCP_BINDER
7052 ++ tristate "MPTCP Binder"
7053 ++ depends on (MPTCP=y)
7054 ++ ---help---
7055 ++	 This path-management module works like ndiffports, and adds a sysctl
7056 ++	 option to set the gateway (and/or the path) for each additional subflow
7057 ++	 via Loose Source Routing (IPv4 only).
7058 ++
7059 ++choice
7060 ++ prompt "Default MPTCP Path-Manager"
7061 ++ default DEFAULT
7062 ++ help
7063 ++ Select the Path-Manager of your choice
7064 ++
7065 ++ config DEFAULT_FULLMESH
7066 ++ bool "Full mesh" if MPTCP_FULLMESH=y
7067 ++
7068 ++ config DEFAULT_NDIFFPORTS
7069 ++ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
7070 ++
7071 ++ config DEFAULT_BINDER
7072 ++ bool "binder" if MPTCP_BINDER=y
7073 ++
7074 ++ config DEFAULT_DUMMY
7075 ++ bool "Default"
7076 ++
7077 ++endchoice
7078 ++
7079 ++endif
7080 ++
7081 ++config DEFAULT_MPTCP_PM
7082 ++ string
7083 ++ default "default" if DEFAULT_DUMMY
7084 ++ default "fullmesh" if DEFAULT_FULLMESH
7085 ++ default "ndiffports" if DEFAULT_NDIFFPORTS
7086 ++ default "binder" if DEFAULT_BINDER
7087 ++ default "default"
7088 ++
7089 ++menuconfig MPTCP_SCHED_ADVANCED
7090 ++ bool "MPTCP: advanced scheduler control"
7091 ++ depends on MPTCP=y
7092 ++ ---help---
7093 ++	 Support for selection of different schedulers. Choose 'Y' here
7094 ++	 if you want to use a scheduler other than the default one.
7095 ++
7096 ++if MPTCP_SCHED_ADVANCED
7097 ++
7098 ++config MPTCP_ROUNDROBIN
7099 ++ tristate "MPTCP Round-Robin"
7100 ++ depends on (MPTCP=y)
7101 ++ ---help---
7102 ++	 This is a very simple round-robin scheduler. It probably has poor
7103 ++	 performance, but it might be interesting for researchers.
7104 ++
7105 ++choice
7106 ++ prompt "Default MPTCP Scheduler"
7107 ++ default DEFAULT
7108 ++ help
7109 ++ Select the Scheduler of your choice
7110 ++
7111 ++ config DEFAULT_SCHEDULER
7112 ++ bool "Default"
7113 ++ ---help---
7114 ++ This is the default scheduler, sending first on the subflow
7115 ++ with the lowest RTT.
7116 ++
7117 ++ config DEFAULT_ROUNDROBIN
7118 ++ bool "Round-Robin" if MPTCP_ROUNDROBIN=y
7119 ++ ---help---
7120 ++	 This is the round-robin scheduler, sending in a round-robin
7121 ++	 fashion.
7122 ++
7123 ++endchoice
7124 ++endif
7125 ++
7126 ++config DEFAULT_MPTCP_SCHED
7127 ++ string
7128 ++ depends on (MPTCP=y)
7129 ++ default "default" if DEFAULT_SCHEDULER
7130 ++ default "roundrobin" if DEFAULT_ROUNDROBIN
7131 ++ default "default"
7132 ++
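The Kconfig choices above only select which registered module is used by default; CONFIG_DEFAULT_MPTCP_PM and CONFIG_DEFAULT_MPTCP_SCHED are plain strings that are presumably matched against each module's .name. A path manager registers an mptcp_pm_ops with the MPTCP core, following the pattern of the binder module added later in this patch. A minimal sketch, assuming the callback signatures visible in that module; the "noop" names are hypothetical:

/* Minimal no-op MPTCP path-manager module (sketch, not part of the patch). */
#include <linux/module.h>
#include <net/mptcp.h>

static void noop_new_session(const struct sock *meta_sk)
{
	/* A real path manager would queue work here to create extra subflows. */
}

static int noop_get_local_id(sa_family_t family, union inet_addr *addr,
			     struct net *net, bool *low_prio)
{
	return 0;	/* always advertise address-id 0 */
}

static struct mptcp_pm_ops noop_pm __read_mostly = {
	.new_session	= noop_new_session,
	.get_local_id	= noop_get_local_id,
	.name		= "noop",
	.owner		= THIS_MODULE,
};

static int __init noop_pm_register(void)
{
	return mptcp_register_path_manager(&noop_pm);
}

static void __exit noop_pm_unregister(void)
{
	mptcp_unregister_path_manager(&noop_pm);
}

module_init(noop_pm_register);
module_exit(noop_pm_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op MPTCP path-manager sketch");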
7133 +diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
7134 +new file mode 100644
7135 +index 000000000000..35561a7012e3
7136 +--- /dev/null
7137 ++++ b/net/mptcp/Makefile
7138 +@@ -0,0 +1,20 @@
7139 ++#
7140 ++## Makefile for MultiPath TCP support code.
7141 ++#
7142 ++#
7143 ++
7144 ++obj-$(CONFIG_MPTCP) += mptcp.o
7145 ++
7146 ++mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
7147 ++ mptcp_output.o mptcp_input.o mptcp_sched.o
7148 ++
7149 ++obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
7150 ++obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
7151 ++obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
7152 ++obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
7153 ++obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
7154 ++obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
7155 ++obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
7156 ++
7157 ++mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
7158 ++
7159 +diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c
7160 +new file mode 100644
7161 +index 000000000000..95d8da560715
7162 +--- /dev/null
7163 ++++ b/net/mptcp/mptcp_binder.c
7164 +@@ -0,0 +1,487 @@
7165 ++#include <linux/module.h>
7166 ++
7167 ++#include <net/mptcp.h>
7168 ++#include <net/mptcp_v4.h>
7169 ++
7170 ++#include <linux/route.h>
7171 ++#include <linux/inet.h>
7172 ++#include <linux/mroute.h>
7173 ++#include <linux/spinlock_types.h>
7174 ++#include <net/inet_ecn.h>
7175 ++#include <net/route.h>
7176 ++#include <net/xfrm.h>
7177 ++#include <net/compat.h>
7178 ++#include <linux/slab.h>
7179 ++
7180 ++#define MPTCP_GW_MAX_LISTS 10
7181 ++#define MPTCP_GW_LIST_MAX_LEN 6
7182 ++#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \
7183 ++ MPTCP_GW_MAX_LISTS)
7184 ++
7185 ++struct mptcp_gw_list {
7186 ++ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
7187 ++ u8 len[MPTCP_GW_MAX_LISTS];
7188 ++};
7189 ++
7190 ++struct binder_priv {
7191 ++ /* Worker struct for subflow establishment */
7192 ++ struct work_struct subflow_work;
7193 ++
7194 ++ struct mptcp_cb *mpcb;
7195 ++
7196 ++ /* Prevent multiple sub-sockets concurrently iterating over sockets */
7197 ++ spinlock_t *flow_lock;
7198 ++};
7199 ++
7200 ++static struct mptcp_gw_list *mptcp_gws;
7201 ++static rwlock_t mptcp_gws_lock;
7202 ++
7203 ++static int mptcp_binder_ndiffports __read_mostly = 1;
7204 ++
7205 ++static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
7206 ++
7207 ++static int mptcp_get_avail_list_ipv4(struct sock *sk)
7208 ++{
7209 ++ int i, j, list_taken, opt_ret, opt_len;
7210 ++ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
7211 ++
7212 ++ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
7213 ++ if (mptcp_gws->len[i] == 0)
7214 ++ goto error;
7215 ++
7216 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
7217 ++ list_taken = 0;
7218 ++
7219 ++ /* Loop through all sub-sockets in this connection */
7220 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) {
7221 ++ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
7222 ++
7223 ++ /* Reset length and options buffer, then retrieve
7224 ++ * from socket
7225 ++ */
7226 ++ opt_len = MAX_IPOPTLEN;
7227 ++ memset(opt, 0, MAX_IPOPTLEN);
7228 ++ opt_ret = ip_getsockopt(sk, IPPROTO_IP,
7229 ++ IP_OPTIONS, opt, &opt_len);
7230 ++ if (opt_ret < 0) {
7231 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
7232 ++ __func__, opt_ret);
7233 ++ goto error;
7234 ++ }
7235 ++
7236 ++ /* If socket has no options, it has no stake in this list */
7237 ++ if (opt_len <= 0)
7238 ++ continue;
7239 ++
7240 ++ /* Iterate options buffer */
7241 ++ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
7242 ++ if (*opt_ptr == IPOPT_LSRR) {
7243 ++ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
7244 ++ goto sock_lsrr;
7245 ++ }
7246 ++ }
7247 ++ continue;
7248 ++
7249 ++sock_lsrr:
7250 ++ /* Pointer to the 2nd to last address */
7251 ++ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
7252 ++
7253 ++ /* Addresses start 3 bytes after type offset */
7254 ++ opt_ptr += 3;
7255 ++ j = 0;
7256 ++
7257 ++ /* Different length lists cannot be the same */
7258 ++ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
7259 ++ continue;
7260 ++
7261 ++ /* Iterate if we are still inside options list
7262 ++ * and sysctl list
7263 ++ */
7264 ++ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
7265 ++ /* If there is a different address, this list must
7266 ++ * not be set on this socket
7267 ++ */
7268 ++ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
7269 ++ break;
7270 ++
7271 ++ /* Jump 4 bytes to next address */
7272 ++ opt_ptr += 4;
7273 ++ j++;
7274 ++ }
7275 ++
7276 ++ /* Reached the end without a differing address, lists
7277 ++ * are therefore identical.
7278 ++ */
7279 ++ if (j == mptcp_gws->len[i]) {
7280 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
7281 ++ list_taken = 1;
7282 ++ break;
7283 ++ }
7284 ++ }
7285 ++
7286 ++ /* Free list found if not taken by a socket */
7287 ++ if (!list_taken) {
7288 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
7289 ++ break;
7290 ++ }
7291 ++ }
7292 ++
7293 ++ if (i >= MPTCP_GW_MAX_LISTS)
7294 ++ goto error;
7295 ++
7296 ++ return i;
7297 ++error:
7298 ++ return -1;
7299 ++}
7300 ++
7301 ++/* The list of addresses is parsed each time a new connection is opened,
7302 ++ * to make sure it's up to date. In case of error, all the lists are
7303 ++ * marked as unavailable and the subflow's fingerprint is set to 0.
7304 ++ */
7305 ++static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
7306 ++{
7307 ++ int i, j, ret;
7308 ++ unsigned char opt[MAX_IPOPTLEN] = {0};
7309 ++ struct tcp_sock *tp = tcp_sk(sk);
7310 ++ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
7311 ++
7312 ++ /* Read lock: multiple sockets can read LSRR addresses at the same
7313 ++ * time, but writes are done in mutual exclusion.
7314 ++ * Spin lock: must search for free list for one socket at a time, or
7315 ++ * multiple sockets could take the same list.
7316 ++ */
7317 ++ read_lock(&mptcp_gws_lock);
7318 ++ spin_lock(fmp->flow_lock);
7319 ++
7320 ++ i = mptcp_get_avail_list_ipv4(sk);
7321 ++
7322 ++ /* Execution enters here only if a free path is found.
7323 ++ */
7324 ++ if (i >= 0) {
7325 ++ opt[0] = IPOPT_NOP;
7326 ++ opt[1] = IPOPT_LSRR;
7327 ++ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
7328 ++ (mptcp_gws->len[i] + 1) + 3;
7329 ++ opt[3] = IPOPT_MINOFF;
7330 ++ for (j = 0; j < mptcp_gws->len[i]; ++j)
7331 ++ memcpy(opt + 4 +
7332 ++ (j * sizeof(mptcp_gws->list[i][0].s_addr)),
7333 ++ &mptcp_gws->list[i][j].s_addr,
7334 ++ sizeof(mptcp_gws->list[i][0].s_addr));
7335 ++ /* Final destination must be part of IP_OPTIONS parameter. */
7336 ++ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
7337 ++ sizeof(addr.s_addr));
7338 ++
7339 ++ /* setsockopt must be inside the lock, otherwise another
7340 ++ * subflow could fail to see that we have taken a list.
7341 ++ */
7342 ++ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, opt,
7343 ++ 4 + sizeof(mptcp_gws->list[i][0].s_addr)
7344 ++ * (mptcp_gws->len[i] + 1));
7345 ++
7346 ++ if (ret < 0) {
7347 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
7348 ++ __func__, ret);
7349 ++ }
7350 ++ }
7351 ++
7352 ++ spin_unlock(fmp->flow_lock);
7353 ++ read_unlock(&mptcp_gws_lock);
7354 ++
7355 ++ return;
7356 ++}
7357 ++
7358 ++/* Parses gateways string for a list of paths to different
7359 ++ * gateways, and stores them for use with the Loose Source Routing (LSRR)
7360 ++ * socket option. Each list must have "," separated addresses, and the lists
7361 ++ * themselves must be separated by "-". Returns -1 if one or more of the
7362 ++ * addresses is not a valid IPv4 address.
7363 ++ */
7364 ++static int mptcp_parse_gateway_ipv4(char *gateways)
7365 ++{
7366 ++ int i, j, k, ret;
7367 ++ char *tmp_string = NULL;
7368 ++ struct in_addr tmp_addr;
7369 ++
7370 ++ tmp_string = kzalloc(16, GFP_KERNEL);
7371 ++ if (tmp_string == NULL)
7372 ++ return -ENOMEM;
7373 ++
7374 ++ write_lock(&mptcp_gws_lock);
7375 ++
7376 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7377 ++
7378 ++ /* A TMP string is used since inet_pton needs a null terminated string
7379 ++ * but we do not want to modify the sysctl for obvious reasons.
7380 ++ * i will iterate over the SYSCTL string, j will iterate over the
7381 ++ * temporary string where each IP is copied into, k will iterate over
7382 ++ * the IPs in each list.
7383 ++ */
7384 ++ for (i = j = k = 0;
7385 ++ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
7386 ++ ++i) {
7387 ++ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
7388 ++ /* If the temp IP is empty and the current list is
7389 ++ * empty, we are done.
7390 ++ */
7391 ++ if (j == 0 && mptcp_gws->len[k] == 0)
7392 ++ break;
7393 ++
7394 ++ /* Terminate the temp IP string, then if it is
7395 ++ * non-empty parse the IP and copy it.
7396 ++ */
7397 ++ tmp_string[j] = '\0';
7398 ++ if (j > 0) {
7399 ++ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
7400 ++
7401 ++ ret = in4_pton(tmp_string, strlen(tmp_string),
7402 ++ (u8 *)&tmp_addr.s_addr, '\0',
7403 ++ NULL);
7404 ++
7405 ++ if (ret) {
7406 ++ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
7407 ++ ret,
7408 ++ &tmp_addr.s_addr);
7409 ++ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
7410 ++ &tmp_addr.s_addr,
7411 ++ sizeof(tmp_addr.s_addr));
7412 ++ mptcp_gws->len[k]++;
7413 ++ j = 0;
7414 ++ tmp_string[j] = '\0';
7415 ++ /* Since we can't impose a limit to
7416 ++ * what the user can input, make sure
7417 ++ * there are not too many IPs in the
7418 ++ * SYSCTL string.
7419 ++ */
7420 ++ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
7421 ++ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
7422 ++ k,
7423 ++ MPTCP_GW_LIST_MAX_LEN);
7424 ++ goto error;
7425 ++ }
7426 ++ } else {
7427 ++ goto error;
7428 ++ }
7429 ++ }
7430 ++
7431 ++ if (gateways[i] == '-' || gateways[i] == '\0')
7432 ++ ++k;
7433 ++ } else {
7434 ++ tmp_string[j] = gateways[i];
7435 ++ ++j;
7436 ++ }
7437 ++ }
7438 ++
7439 ++ /* Number of flows is number of gateway lists plus master flow */
7440 ++ mptcp_binder_ndiffports = k+1;
7441 ++
7442 ++ write_unlock(&mptcp_gws_lock);
7443 ++ kfree(tmp_string);
7444 ++
7445 ++ return 0;
7446 ++
7447 ++error:
7448 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7449 ++ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
7450 ++ write_unlock(&mptcp_gws_lock);
7451 ++ kfree(tmp_string);
7452 ++ return -1;
7453 ++}
7454 ++
7455 ++/**
7456 ++ * Create all new subflows, by doing calls to mptcp_initX_subsockets
7457 ++ * Create all new subflows by calling mptcp_initX_subsockets.
7458 ++ *
7459 ++ * This function uses a goto next_subflow to release the lock between
7460 ++ * new subflows, giving other processes a chance to do some work on the
7461 ++ * socket and potentially finish the communication.
7462 ++static void create_subflow_worker(struct work_struct *work)
7463 ++{
7464 ++ const struct binder_priv *pm_priv = container_of(work,
7465 ++ struct binder_priv,
7466 ++ subflow_work);
7467 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
7468 ++ struct sock *meta_sk = mpcb->meta_sk;
7469 ++ int iter = 0;
7470 ++
7471 ++next_subflow:
7472 ++ if (iter) {
7473 ++ release_sock(meta_sk);
7474 ++ mutex_unlock(&mpcb->mpcb_mutex);
7475 ++
7476 ++ cond_resched();
7477 ++ }
7478 ++ mutex_lock(&mpcb->mpcb_mutex);
7479 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
7480 ++
7481 ++ iter++;
7482 ++
7483 ++ if (sock_flag(meta_sk, SOCK_DEAD))
7484 ++ goto exit;
7485 ++
7486 ++ if (mpcb->master_sk &&
7487 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
7488 ++ goto exit;
7489 ++
7490 ++ if (mptcp_binder_ndiffports > iter &&
7491 ++ mptcp_binder_ndiffports > mpcb->cnt_subflows) {
7492 ++ struct mptcp_loc4 loc;
7493 ++ struct mptcp_rem4 rem;
7494 ++
7495 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
7496 ++ loc.loc4_id = 0;
7497 ++ loc.low_prio = 0;
7498 ++
7499 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
7500 ++ rem.port = inet_sk(meta_sk)->inet_dport;
7501 ++ rem.rem4_id = 0; /* Default 0 */
7502 ++
7503 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
7504 ++
7505 ++ goto next_subflow;
7506 ++ }
7507 ++
7508 ++exit:
7509 ++ release_sock(meta_sk);
7510 ++ mutex_unlock(&mpcb->mpcb_mutex);
7511 ++ sock_put(meta_sk);
7512 ++}
7513 ++
7514 ++static void binder_new_session(const struct sock *meta_sk)
7515 ++{
7516 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7517 ++ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
7518 ++ static DEFINE_SPINLOCK(flow_lock);
7519 ++
7520 ++#if IS_ENABLED(CONFIG_IPV6)
7521 ++ if (meta_sk->sk_family == AF_INET6 &&
7522 ++ !mptcp_v6_is_v4_mapped(meta_sk)) {
7523 ++ mptcp_fallback_default(mpcb);
7524 ++ return;
7525 ++ }
7526 ++#endif
7527 ++
7528 ++ /* Initialize workqueue-struct */
7529 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
7530 ++ fmp->mpcb = mpcb;
7531 ++
7532 ++ fmp->flow_lock = &flow_lock;
7533 ++}
7534 ++
7535 ++static void binder_create_subflows(struct sock *meta_sk)
7536 ++{
7537 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7538 ++ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
7539 ++
7540 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
7541 ++ mpcb->send_infinite_mapping ||
7542 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
7543 ++ return;
7544 ++
7545 ++ if (!work_pending(&pm_priv->subflow_work)) {
7546 ++ sock_hold(meta_sk);
7547 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
7548 ++ }
7549 ++}
7550 ++
7551 ++static int binder_get_local_id(sa_family_t family, union inet_addr *addr,
7552 ++ struct net *net, bool *low_prio)
7553 ++{
7554 ++ return 0;
7555 ++}
7556 ++
7557 ++/* Callback function, executed when the sysctl net.mptcp.mptcp_binder_gateways
7558 ++ * is updated. Inspired by proc_tcp_congestion_control().
7559 ++ */
7560 ++static int proc_mptcp_gateways(ctl_table *ctl, int write,
7561 ++ void __user *buffer, size_t *lenp,
7562 ++ loff_t *ppos)
7563 ++{
7564 ++ int ret;
7565 ++ ctl_table tbl = {
7566 ++ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
7567 ++ };
7568 ++
7569 ++ if (write) {
7570 ++ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
7571 ++ if (tbl.data == NULL)
7572 ++ return -1;
7573 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
7574 ++ if (ret == 0) {
7575 ++ ret = mptcp_parse_gateway_ipv4(tbl.data);
7576 ++ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
7577 ++ }
7578 ++ kfree(tbl.data);
7579 ++ } else {
7580 ++ ret = proc_dostring(ctl, write, buffer, lenp, ppos);
7581 ++ }
7582 ++
7583 ++
7584 ++ return ret;
7585 ++}
7586 ++
7587 ++static struct mptcp_pm_ops binder __read_mostly = {
7588 ++ .new_session = binder_new_session,
7589 ++ .fully_established = binder_create_subflows,
7590 ++ .get_local_id = binder_get_local_id,
7591 ++ .init_subsocket_v4 = mptcp_v4_add_lsrr,
7592 ++ .name = "binder",
7593 ++ .owner = THIS_MODULE,
7594 ++};
7595 ++
7596 ++static struct ctl_table binder_table[] = {
7597 ++ {
7598 ++ .procname = "mptcp_binder_gateways",
7599 ++ .data = &sysctl_mptcp_binder_gateways,
7600 ++ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
7601 ++ .mode = 0644,
7602 ++ .proc_handler = &proc_mptcp_gateways
7603 ++ },
7604 ++ { }
7605 ++};
7606 ++
7607 ++struct ctl_table_header *mptcp_sysctl_binder;
7608 ++
7609 ++/* General initialization of MPTCP_PM */
7610 ++static int __init binder_register(void)
7611 ++{
7612 ++ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
7613 ++ if (!mptcp_gws)
7614 ++ return -ENOMEM;
7615 ++
7616 ++ rwlock_init(&mptcp_gws_lock);
7617 ++
7618 ++ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
7619 ++
7620 ++ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
7621 ++ binder_table);
7622 ++ if (!mptcp_sysctl_binder)
7623 ++ goto sysctl_fail;
7624 ++
7625 ++ if (mptcp_register_path_manager(&binder))
7626 ++ goto pm_failed;
7627 ++
7628 ++ return 0;
7629 ++
7630 ++pm_failed:
7631 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7632 ++sysctl_fail:
7633 ++ kfree(mptcp_gws);
7634 ++
7635 ++ return -1;
7636 ++}
7637 ++
7638 ++static void binder_unregister(void)
7639 ++{
7640 ++ mptcp_unregister_path_manager(&binder);
7641 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7642 ++ kfree(mptcp_gws);
7643 ++}
7644 ++
7645 ++module_init(binder_register);
7646 ++module_exit(binder_unregister);
7647 ++
7648 ++MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
7649 ++MODULE_LICENSE("GPL");
7650 ++MODULE_DESCRIPTION("BINDER MPTCP");
7651 ++MODULE_VERSION("0.1");
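Two details of the binder module above are worth spelling out: mptcp_parse_gateway_ipv4() expects the mptcp_binder_gateways sysctl string to contain comma-separated addresses inside each gateway list, with the lists themselves separated by '-', and mptcp_v4_add_lsrr() turns one list into an IP_OPTIONS buffer laid out as NOP, LSRR type, length (4 * (gateways + 1) + 3), minimum pointer offset, the gateway addresses, and finally the destination. A userspace sketch that builds such a buffer for one hypothetical list; the addresses, the example sysctl string and the locally defined IPOPT_* values are illustrative assumptions:

/* Build the IP_OPTIONS buffer for one LSRR gateway list, as in
 * mptcp_v4_add_lsrr().  Example sysctl string with two lists:
 *     "10.0.0.1,10.0.0.2-192.168.1.1"
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define IPOPT_NOP	1	/* assumed on-the-wire option values */
#define IPOPT_LSRR	131
#define IPOPT_MINOFF	4

int main(void)
{
	const char *gateways[] = { "10.0.0.1", "10.0.0.2" };	/* first list */
	const char *dest = "203.0.113.7";		/* final destination */
	unsigned char opt[40] = { 0 };
	struct in_addr a;
	int i, n = 2;

	opt[0] = IPOPT_NOP;
	opt[1] = IPOPT_LSRR;
	opt[2] = 4 * (n + 1) + 3;	/* gateways plus the final destination */
	opt[3] = IPOPT_MINOFF;

	for (i = 0; i < n; i++) {
		inet_pton(AF_INET, gateways[i], &a);
		memcpy(opt + 4 + 4 * i, &a.s_addr, 4);
	}
	inet_pton(AF_INET, dest, &a);	/* destination closes the source route */
	memcpy(opt + 4 + 4 * n, &a.s_addr, 4);

	for (i = 0; i < 4 + 4 * (n + 1); i++)
		printf("%02x%s", opt[i], (i % 4 == 3) ? "\n" : " ");
	return 0;
}

With the example string the parser would fill two gateway lists and set mptcp_binder_ndiffports to 3 (the two LSRR paths plus the master subflow), matching the k + 1 computation at the end of mptcp_parse_gateway_ipv4().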
7652 +diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
7653 +new file mode 100644
7654 +index 000000000000..5d761164eb85
7655 +--- /dev/null
7656 ++++ b/net/mptcp/mptcp_coupled.c
7657 +@@ -0,0 +1,270 @@
7658 ++/*
7659 ++ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
7660 ++ *
7661 ++ * Initial Design & Implementation:
7662 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7663 ++ *
7664 ++ * Current Maintainer & Author:
7665 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7666 ++ *
7667 ++ * Additional authors:
7668 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7669 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7670 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7671 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7672 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7673 ++ * Andreas Ripke <ripke@××××××.eu>
7674 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7675 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7676 ++ * John Ronan <jronan@××××.org>
7677 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7678 ++ * Brandon Heller <brandonh@××××××××.edu>
7679 ++ *
7680 ++ *
7681 ++ * This program is free software; you can redistribute it and/or
7682 ++ * modify it under the terms of the GNU General Public License
7683 ++ * as published by the Free Software Foundation; either version
7684 ++ * 2 of the License, or (at your option) any later version.
7685 ++ */
7686 ++#include <net/tcp.h>
7687 ++#include <net/mptcp.h>
7688 ++
7689 ++#include <linux/module.h>
7690 ++
7691 ++/* Scaling is done in the numerator with alpha_scale_num and in the denominator
7692 ++ * with alpha_scale_den.
7693 ++ *
7694 ++ * To downscale, we just need to use alpha_scale.
7695 ++ *
7696 ++ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
7697 ++ */
7698 ++static int alpha_scale_den = 10;
7699 ++static int alpha_scale_num = 32;
7700 ++static int alpha_scale = 12;
7701 ++
7702 ++struct mptcp_ccc {
7703 ++ u64 alpha;
7704 ++ bool forced_update;
7705 ++};
7706 ++
7707 ++static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
7708 ++{
7709 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
7710 ++}
7711 ++
7712 ++static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
7713 ++{
7714 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
7715 ++}
7716 ++
7717 ++static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
7718 ++{
7719 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
7720 ++}
7721 ++
7722 ++static inline u64 mptcp_ccc_scale(u32 val, int scale)
7723 ++{
7724 ++ return (u64) val << scale;
7725 ++}
7726 ++
7727 ++static inline bool mptcp_get_forced(const struct sock *meta_sk)
7728 ++{
7729 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
7730 ++}
7731 ++
7732 ++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
7733 ++{
7734 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
7735 ++}
7736 ++
7737 ++static void mptcp_ccc_recalc_alpha(const struct sock *sk)
7738 ++{
7739 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
7740 ++ const struct sock *sub_sk;
7741 ++ int best_cwnd = 0, best_rtt = 0, can_send = 0;
7742 ++ u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
7743 ++
7744 ++ if (!mpcb)
7745 ++ return;
7746 ++
7747 ++ /* Only one subflow left - fall back to normal reno-behavior
7748 ++ * (set alpha to 1)
7749 ++ */
7750 ++ if (mpcb->cnt_established <= 1)
7751 ++ goto exit;
7752 ++
7753 ++ /* Do regular alpha-calculation for multiple subflows */
7754 ++
7755 ++ /* Find the max numerator of the alpha-calculation */
7756 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7757 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7758 ++ u64 tmp;
7759 ++
7760 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7761 ++ continue;
7762 ++
7763 ++ can_send++;
7764 ++
7765 ++ /* We need to look for the path that provides the max value.
7766 ++ * Integer overflow is not possible here, because
7767 ++ * tmp is a u64.
7768 ++ */
7769 ++ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
7770 ++ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
7771 ++
7772 ++ if (tmp >= max_numerator) {
7773 ++ max_numerator = tmp;
7774 ++ best_cwnd = sub_tp->snd_cwnd;
7775 ++ best_rtt = sub_tp->srtt_us;
7776 ++ }
7777 ++ }
7778 ++
7779 ++ /* No subflow is able to send - we don't care anymore */
7780 ++ if (unlikely(!can_send))
7781 ++ goto exit;
7782 ++
7783 ++ /* Calculate the denominator */
7784 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7785 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7786 ++
7787 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7788 ++ continue;
7789 ++
7790 ++ sum_denominator += div_u64(
7791 ++ mptcp_ccc_scale(sub_tp->snd_cwnd,
7792 ++ alpha_scale_den) * best_rtt,
7793 ++ sub_tp->srtt_us);
7794 ++ }
7795 ++ sum_denominator *= sum_denominator;
7796 ++ if (unlikely(!sum_denominator)) {
7797 ++ pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
7798 ++ __func__, mpcb->cnt_established);
7799 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7800 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7801 ++ pr_err("%s: pi:%d, state:%d\n, rtt:%u, cwnd: %u",
7802 ++ __func__, sub_tp->mptcp->path_index,
7803 ++ sub_sk->sk_state, sub_tp->srtt_us,
7804 ++ sub_tp->snd_cwnd);
7805 ++ }
7806 ++ }
7807 ++
7808 ++ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
7809 ++
7810 ++ if (unlikely(!alpha))
7811 ++ alpha = 1;
7812 ++
7813 ++exit:
7814 ++ mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
7815 ++}
7816 ++
7817 ++static void mptcp_ccc_init(struct sock *sk)
7818 ++{
7819 ++ if (mptcp(tcp_sk(sk))) {
7820 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7821 ++ mptcp_set_alpha(mptcp_meta_sk(sk), 1);
7822 ++ }
7823 ++ /* If we are not doing MPTCP, behave like Reno: return */
7824 ++}
7825 ++
7826 ++static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
7827 ++{
7828 ++ if (event == CA_EVENT_LOSS)
7829 ++ mptcp_ccc_recalc_alpha(sk);
7830 ++}
7831 ++
7832 ++static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
7833 ++{
7834 ++ if (!mptcp(tcp_sk(sk)))
7835 ++ return;
7836 ++
7837 ++ mptcp_set_forced(mptcp_meta_sk(sk), 1);
7838 ++}
7839 ++
7840 ++static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
7841 ++{
7842 ++ struct tcp_sock *tp = tcp_sk(sk);
7843 ++ const struct mptcp_cb *mpcb = tp->mpcb;
7844 ++ int snd_cwnd;
7845 ++
7846 ++ if (!mptcp(tp)) {
7847 ++ tcp_reno_cong_avoid(sk, ack, acked);
7848 ++ return;
7849 ++ }
7850 ++
7851 ++ if (!tcp_is_cwnd_limited(sk))
7852 ++ return;
7853 ++
7854 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
7855 ++ /* In "safe" area, increase. */
7856 ++ tcp_slow_start(tp, acked);
7857 ++ mptcp_ccc_recalc_alpha(sk);
7858 ++ return;
7859 ++ }
7860 ++
7861 ++ if (mptcp_get_forced(mptcp_meta_sk(sk))) {
7862 ++ mptcp_ccc_recalc_alpha(sk);
7863 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7864 ++ }
7865 ++
7866 ++ if (mpcb->cnt_established > 1) {
7867 ++ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
7868 ++
7869 ++ /* This may happen if, at initialization time, the mpcb
7870 ++ * was not yet attached to the sock and initializing
7871 ++ * alpha therefore failed.
7872 ++ */
7873 ++ if (unlikely(!alpha))
7874 ++ alpha = 1;
7875 ++
7876 ++ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale),
7877 ++ alpha);
7878 ++
7879 ++ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
7880 ++ * Thus, we select here the max value.
7881 ++ */
7882 ++ if (snd_cwnd < tp->snd_cwnd)
7883 ++ snd_cwnd = tp->snd_cwnd;
7884 ++ } else {
7885 ++ snd_cwnd = tp->snd_cwnd;
7886 ++ }
7887 ++
7888 ++ if (tp->snd_cwnd_cnt >= snd_cwnd) {
7889 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
7890 ++ tp->snd_cwnd++;
7891 ++ mptcp_ccc_recalc_alpha(sk);
7892 ++ }
7893 ++
7894 ++ tp->snd_cwnd_cnt = 0;
7895 ++ } else {
7896 ++ tp->snd_cwnd_cnt++;
7897 ++ }
7898 ++}
7899 ++
7900 ++static struct tcp_congestion_ops mptcp_ccc = {
7901 ++ .init = mptcp_ccc_init,
7902 ++ .ssthresh = tcp_reno_ssthresh,
7903 ++ .cong_avoid = mptcp_ccc_cong_avoid,
7904 ++ .cwnd_event = mptcp_ccc_cwnd_event,
7905 ++ .set_state = mptcp_ccc_set_state,
7906 ++ .owner = THIS_MODULE,
7907 ++ .name = "lia",
7908 ++};
7909 ++
7910 ++static int __init mptcp_ccc_register(void)
7911 ++{
7912 ++ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
7913 ++ return tcp_register_congestion_control(&mptcp_ccc);
7914 ++}
7915 ++
7916 ++static void __exit mptcp_ccc_unregister(void)
7917 ++{
7918 ++ tcp_unregister_congestion_control(&mptcp_ccc);
7919 ++}
7920 ++
7921 ++module_init(mptcp_ccc_register);
7922 ++module_exit(mptcp_ccc_unregister);
7923 ++
7924 ++MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
7925 ++MODULE_LICENSE("GPL");
7926 ++MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
7927 ++MODULE_VERSION("0.1");
7928 +diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
7929 +new file mode 100644
7930 +index 000000000000..28dfa0479f5e
7931 +--- /dev/null
7932 ++++ b/net/mptcp/mptcp_ctrl.c
7933 +@@ -0,0 +1,2401 @@
7934 ++/*
7935 ++ * MPTCP implementation - MPTCP-control
7936 ++ *
7937 ++ * Initial Design & Implementation:
7938 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7939 ++ *
7940 ++ * Current Maintainer & Author:
7941 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7942 ++ *
7943 ++ * Additional authors:
7944 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7945 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7946 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7947 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7948 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7949 ++ * Andreas Ripke <ripke@××××××.eu>
7950 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7951 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7952 ++ * John Ronan <jronan@××××.org>
7953 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7954 ++ * Brandon Heller <brandonh@××××××××.edu>
7955 ++ *
7956 ++ *
7957 ++ * This program is free software; you can redistribute it and/or
7958 ++ * modify it under the terms of the GNU General Public License
7959 ++ * as published by the Free Software Foundation; either version
7960 ++ * 2 of the License, or (at your option) any later version.
7961 ++ */
7962 ++
7963 ++#include <net/inet_common.h>
7964 ++#include <net/inet6_hashtables.h>
7965 ++#include <net/ipv6.h>
7966 ++#include <net/ip6_checksum.h>
7967 ++#include <net/mptcp.h>
7968 ++#include <net/mptcp_v4.h>
7969 ++#if IS_ENABLED(CONFIG_IPV6)
7970 ++#include <net/ip6_route.h>
7971 ++#include <net/mptcp_v6.h>
7972 ++#endif
7973 ++#include <net/sock.h>
7974 ++#include <net/tcp.h>
7975 ++#include <net/tcp_states.h>
7976 ++#include <net/transp_v6.h>
7977 ++#include <net/xfrm.h>
7978 ++
7979 ++#include <linux/cryptohash.h>
7980 ++#include <linux/kconfig.h>
7981 ++#include <linux/module.h>
7982 ++#include <linux/netpoll.h>
7983 ++#include <linux/list.h>
7984 ++#include <linux/jhash.h>
7985 ++#include <linux/tcp.h>
7986 ++#include <linux/net.h>
7987 ++#include <linux/in.h>
7988 ++#include <linux/random.h>
7989 ++#include <linux/inetdevice.h>
7990 ++#include <linux/workqueue.h>
7991 ++#include <linux/atomic.h>
7992 ++#include <linux/sysctl.h>
7993 ++
7994 ++static struct kmem_cache *mptcp_sock_cache __read_mostly;
7995 ++static struct kmem_cache *mptcp_cb_cache __read_mostly;
7996 ++static struct kmem_cache *mptcp_tw_cache __read_mostly;
7997 ++
7998 ++int sysctl_mptcp_enabled __read_mostly = 1;
7999 ++int sysctl_mptcp_checksum __read_mostly = 1;
8000 ++int sysctl_mptcp_debug __read_mostly;
8001 ++EXPORT_SYMBOL(sysctl_mptcp_debug);
8002 ++int sysctl_mptcp_syn_retries __read_mostly = 3;
8003 ++
8004 ++bool mptcp_init_failed __read_mostly;
8005 ++
8006 ++struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
8007 ++EXPORT_SYMBOL(mptcp_static_key);
8008 ++
8009 ++static int proc_mptcp_path_manager(ctl_table *ctl, int write,
8010 ++ void __user *buffer, size_t *lenp,
8011 ++ loff_t *ppos)
8012 ++{
8013 ++ char val[MPTCP_PM_NAME_MAX];
8014 ++ ctl_table tbl = {
8015 ++ .data = val,
8016 ++ .maxlen = MPTCP_PM_NAME_MAX,
8017 ++ };
8018 ++ int ret;
8019 ++
8020 ++ mptcp_get_default_path_manager(val);
8021 ++
8022 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8023 ++ if (write && ret == 0)
8024 ++ ret = mptcp_set_default_path_manager(val);
8025 ++ return ret;
8026 ++}
8027 ++
8028 ++static int proc_mptcp_scheduler(ctl_table *ctl, int write,
8029 ++ void __user *buffer, size_t *lenp,
8030 ++ loff_t *ppos)
8031 ++{
8032 ++ char val[MPTCP_SCHED_NAME_MAX];
8033 ++ ctl_table tbl = {
8034 ++ .data = val,
8035 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8036 ++ };
8037 ++ int ret;
8038 ++
8039 ++ mptcp_get_default_scheduler(val);
8040 ++
8041 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8042 ++ if (write && ret == 0)
8043 ++ ret = mptcp_set_default_scheduler(val);
8044 ++ return ret;
8045 ++}
8046 ++
8047 ++static struct ctl_table mptcp_table[] = {
8048 ++ {
8049 ++ .procname = "mptcp_enabled",
8050 ++ .data = &sysctl_mptcp_enabled,
8051 ++ .maxlen = sizeof(int),
8052 ++ .mode = 0644,
8053 ++ .proc_handler = &proc_dointvec
8054 ++ },
8055 ++ {
8056 ++ .procname = "mptcp_checksum",
8057 ++ .data = &sysctl_mptcp_checksum,
8058 ++ .maxlen = sizeof(int),
8059 ++ .mode = 0644,
8060 ++ .proc_handler = &proc_dointvec
8061 ++ },
8062 ++ {
8063 ++ .procname = "mptcp_debug",
8064 ++ .data = &sysctl_mptcp_debug,
8065 ++ .maxlen = sizeof(int),
8066 ++ .mode = 0644,
8067 ++ .proc_handler = &proc_dointvec
8068 ++ },
8069 ++ {
8070 ++ .procname = "mptcp_syn_retries",
8071 ++ .data = &sysctl_mptcp_syn_retries,
8072 ++ .maxlen = sizeof(int),
8073 ++ .mode = 0644,
8074 ++ .proc_handler = &proc_dointvec
8075 ++ },
8076 ++ {
8077 ++ .procname = "mptcp_path_manager",
8078 ++ .mode = 0644,
8079 ++ .maxlen = MPTCP_PM_NAME_MAX,
8080 ++ .proc_handler = proc_mptcp_path_manager,
8081 ++ },
8082 ++ {
8083 ++ .procname = "mptcp_scheduler",
8084 ++ .mode = 0644,
8085 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8086 ++ .proc_handler = proc_mptcp_scheduler,
8087 ++ },
8088 ++ { }
8089 ++};
8090 ++
8091 ++static inline u32 mptcp_hash_tk(u32 token)
8092 ++{
8093 ++ return token % MPTCP_HASH_SIZE;
8094 ++}
8095 ++
8096 ++struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
8097 ++EXPORT_SYMBOL(tk_hashtable);
8098 ++
8099 ++/* This second hashtable is needed to retrieve request socks
8100 ++ * created as a result of a join request. While the SYN contains
8101 ++ * the token, the final ack does not, so we need a separate hashtable
8102 ++ * to retrieve the mpcb.
8103 ++ */
8104 ++struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
8105 ++spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
8106 ++
8107 ++/* The following hash table is used to avoid collision of token */
8108 ++static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
8109 ++spinlock_t mptcp_tk_hashlock; /* hashtable protection */
8110 ++
8111 ++static bool mptcp_reqsk_find_tk(const u32 token)
8112 ++{
8113 ++ const u32 hash = mptcp_hash_tk(token);
8114 ++ const struct mptcp_request_sock *mtreqsk;
8115 ++ const struct hlist_nulls_node *node;
8116 ++
8117 ++begin:
8118 ++ hlist_nulls_for_each_entry_rcu(mtreqsk, node,
8119 ++ &mptcp_reqsk_tk_htb[hash], hash_entry) {
8120 ++ if (token == mtreqsk->mptcp_loc_token)
8121 ++ return true;
8122 ++ }
8123 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
8124 ++ * and put into another hash-table list. So, after the lookup we may
8125 ++ * end up in a different list. So, we may need to restart.
8126 ++ *
8127 ++ * See also the comment in __inet_lookup_established.
8128 ++ */
8129 ++ if (get_nulls_value(node) != hash)
8130 ++ goto begin;
8131 ++ return false;
8132 ++}
8133 ++
8134 ++static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token)
8135 ++{
8136 ++ u32 hash = mptcp_hash_tk(token);
8137 ++
8138 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry,
8139 ++ &mptcp_reqsk_tk_htb[hash]);
8140 ++}
8141 ++
8142 ++static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk)
8143 ++{
8144 ++ rcu_read_lock();
8145 ++ spin_lock(&mptcp_tk_hashlock);
8146 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry);
8147 ++ spin_unlock(&mptcp_tk_hashlock);
8148 ++ rcu_read_unlock();
8149 ++}
8150 ++
8151 ++void mptcp_reqsk_destructor(struct request_sock *req)
8152 ++{
8153 ++ if (!mptcp_rsk(req)->is_sub) {
8154 ++ if (in_softirq()) {
8155 ++ mptcp_reqsk_remove_tk(req);
8156 ++ } else {
8157 ++ rcu_read_lock_bh();
8158 ++ spin_lock(&mptcp_tk_hashlock);
8159 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
8160 ++ spin_unlock(&mptcp_tk_hashlock);
8161 ++ rcu_read_unlock_bh();
8162 ++ }
8163 ++ } else {
8164 ++ mptcp_hash_request_remove(req);
8165 ++ }
8166 ++}
8167 ++
8168 ++static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token)
8169 ++{
8170 ++ u32 hash = mptcp_hash_tk(token);
8171 ++ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
8172 ++ meta_tp->inside_tk_table = 1;
8173 ++}
8174 ++
8175 ++static bool mptcp_find_token(u32 token)
8176 ++{
8177 ++ const u32 hash = mptcp_hash_tk(token);
8178 ++ const struct tcp_sock *meta_tp;
8179 ++ const struct hlist_nulls_node *node;
8180 ++
8181 ++begin:
8182 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
8183 ++ if (token == meta_tp->mptcp_loc_token)
8184 ++ return true;
8185 ++ }
8186 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8187 ++ * and put into another hash-table list. So, after the lookup we may
8188 ++ * end up in a different list. So, we may need to restart.
8189 ++ *
8190 ++ * See also the comment in __inet_lookup_established.
8191 ++ */
8192 ++ if (get_nulls_value(node) != hash)
8193 ++ goto begin;
8194 ++ return false;
8195 ++}
8196 ++
8197 ++static void mptcp_set_key_reqsk(struct request_sock *req,
8198 ++ const struct sk_buff *skb)
8199 ++{
8200 ++ const struct inet_request_sock *ireq = inet_rsk(req);
8201 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8202 ++
8203 ++ if (skb->protocol == htons(ETH_P_IP)) {
8204 ++ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
8205 ++ ip_hdr(skb)->daddr,
8206 ++ htons(ireq->ir_num),
8207 ++ ireq->ir_rmt_port);
8208 ++#if IS_ENABLED(CONFIG_IPV6)
8209 ++ } else {
8210 ++ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
8211 ++ ipv6_hdr(skb)->daddr.s6_addr32,
8212 ++ htons(ireq->ir_num),
8213 ++ ireq->ir_rmt_port);
8214 ++#endif
8215 ++ }
8216 ++
8217 ++ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
8218 ++}
8219 ++
8220 ++/* New MPTCP-connection request, prepare a new token for the meta-socket that
8221 ++ * will be created in mptcp_check_req_master(), and store the received token.
8222 ++ */
8223 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
8224 ++ const struct mptcp_options_received *mopt,
8225 ++ const struct sk_buff *skb)
8226 ++{
8227 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8228 ++
8229 ++ inet_rsk(req)->saw_mpc = 1;
8230 ++
8231 ++ rcu_read_lock();
8232 ++ spin_lock(&mptcp_tk_hashlock);
8233 ++ do {
8234 ++ mptcp_set_key_reqsk(req, skb);
8235 ++ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
8236 ++ mptcp_find_token(mtreq->mptcp_loc_token));
8237 ++
8238 ++ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
8239 ++ spin_unlock(&mptcp_tk_hashlock);
8240 ++ rcu_read_unlock();
8241 ++ mtreq->mptcp_rem_key = mopt->mptcp_key;
8242 ++}
8243 ++
8244 ++static void mptcp_set_key_sk(const struct sock *sk)
8245 ++{
8246 ++ struct tcp_sock *tp = tcp_sk(sk);
8247 ++ const struct inet_sock *isk = inet_sk(sk);
8248 ++
8249 ++ if (sk->sk_family == AF_INET)
8250 ++ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
8251 ++ isk->inet_daddr,
8252 ++ isk->inet_sport,
8253 ++ isk->inet_dport);
8254 ++#if IS_ENABLED(CONFIG_IPV6)
8255 ++ else
8256 ++ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
8257 ++ sk->sk_v6_daddr.s6_addr32,
8258 ++ isk->inet_sport,
8259 ++ isk->inet_dport);
8260 ++#endif
8261 ++
8262 ++ mptcp_key_sha1(tp->mptcp_loc_key,
8263 ++ &tp->mptcp_loc_token, NULL);
8264 ++}
8265 ++
8266 ++void mptcp_connect_init(struct sock *sk)
8267 ++{
8268 ++ struct tcp_sock *tp = tcp_sk(sk);
8269 ++
8270 ++ rcu_read_lock_bh();
8271 ++ spin_lock(&mptcp_tk_hashlock);
8272 ++ do {
8273 ++ mptcp_set_key_sk(sk);
8274 ++ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
8275 ++ mptcp_find_token(tp->mptcp_loc_token));
8276 ++
8277 ++ __mptcp_hash_insert(tp, tp->mptcp_loc_token);
8278 ++ spin_unlock(&mptcp_tk_hashlock);
8279 ++ rcu_read_unlock_bh();
8280 ++}
8281 ++
8282 ++/**
8283 ++ * This function increments the refcount of the mpcb struct.
8284 ++ * It is the responsibility of the caller to decrement when releasing
8285 ++ * the structure.
8286 ++ */
8287 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token)
8288 ++{
8289 ++ const u32 hash = mptcp_hash_tk(token);
8290 ++ const struct tcp_sock *meta_tp;
8291 ++ struct sock *meta_sk = NULL;
8292 ++ const struct hlist_nulls_node *node;
8293 ++
8294 ++ rcu_read_lock();
8295 ++begin:
8296 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
8297 ++ tk_table) {
8298 ++ meta_sk = (struct sock *)meta_tp;
8299 ++ if (token == meta_tp->mptcp_loc_token &&
8300 ++ net_eq(net, sock_net(meta_sk))) {
8301 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
8302 ++ goto out;
8303 ++ if (unlikely(token != meta_tp->mptcp_loc_token ||
8304 ++ !net_eq(net, sock_net(meta_sk)))) {
8305 ++ sock_gen_put(meta_sk);
8306 ++ goto begin;
8307 ++ }
8308 ++ goto found;
8309 ++ }
8310 ++ }
8311 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8312 ++ * and put into another hash-table list. So, after the lookup we may
8313 ++ * end up in a different list. So, we may need to restart.
8314 ++ *
8315 ++ * See also the comment in __inet_lookup_established.
8316 ++ */
8317 ++ if (get_nulls_value(node) != hash)
8318 ++ goto begin;
8319 ++out:
8320 ++ meta_sk = NULL;
8321 ++found:
8322 ++ rcu_read_unlock();
8323 ++ return meta_sk;
8324 ++}
8325 ++
8326 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
8327 ++{
8328 ++ /* remove from the token hashtable */
8329 ++ rcu_read_lock_bh();
8330 ++ spin_lock(&mptcp_tk_hashlock);
8331 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8332 ++ meta_tp->inside_tk_table = 0;
8333 ++ spin_unlock(&mptcp_tk_hashlock);
8334 ++ rcu_read_unlock_bh();
8335 ++}
8336 ++
8337 ++void mptcp_hash_remove(struct tcp_sock *meta_tp)
8338 ++{
8339 ++ rcu_read_lock();
8340 ++ spin_lock(&mptcp_tk_hashlock);
8341 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8342 ++ meta_tp->inside_tk_table = 0;
8343 ++ spin_unlock(&mptcp_tk_hashlock);
8344 ++ rcu_read_unlock();
8345 ++}
8346 ++
8347 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk)
8348 ++{
8349 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
8350 ++ struct sock *sk, *rttsk = NULL, *lastsk = NULL;
8351 ++ u32 min_time = 0, last_active = 0;
8352 ++
8353 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
8354 ++ struct tcp_sock *tp = tcp_sk(sk);
8355 ++ u32 elapsed;
8356 ++
8357 ++ if (!mptcp_sk_can_send_ack(sk) || tp->pf)
8358 ++ continue;
8359 ++
8360 ++ elapsed = keepalive_time_elapsed(tp);
8361 ++
8362 ++ /* We take the one with the lowest RTT within a reasonable
8363 ++ * (meta-RTO)-timeframe
8364 ++ */
8365 ++ if (elapsed < inet_csk(meta_sk)->icsk_rto) {
8366 ++ if (!min_time || tp->srtt_us < min_time) {
8367 ++ min_time = tp->srtt_us;
8368 ++ rttsk = sk;
8369 ++ }
8370 ++ continue;
8371 ++ }
8372 ++
8373 ++ /* Otherwise, we just take the most recent active */
8374 ++ if (!rttsk && (!last_active || elapsed < last_active)) {
8375 ++ last_active = elapsed;
8376 ++ lastsk = sk;
8377 ++ }
8378 ++ }
8379 ++
8380 ++ if (rttsk)
8381 ++ return rttsk;
8382 ++
8383 ++ return lastsk;
8384 ++}
8385 ++EXPORT_SYMBOL(mptcp_select_ack_sock);
8386 ++
8387 ++static void mptcp_sock_def_error_report(struct sock *sk)
8388 ++{
8389 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
8390 ++
8391 ++ if (!sock_flag(sk, SOCK_DEAD))
8392 ++ mptcp_sub_close(sk, 0);
8393 ++
8394 ++ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
8395 ++ mpcb->send_infinite_mapping) {
8396 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8397 ++
8398 ++ meta_sk->sk_err = sk->sk_err;
8399 ++ meta_sk->sk_err_soft = sk->sk_err_soft;
8400 ++
8401 ++ if (!sock_flag(meta_sk, SOCK_DEAD))
8402 ++ meta_sk->sk_error_report(meta_sk);
8403 ++
8404 ++ tcp_done(meta_sk);
8405 ++ }
8406 ++
8407 ++ sk->sk_err = 0;
8408 ++ return;
8409 ++}
8410 ++
8411 ++static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
8412 ++{
8413 ++ if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
8414 ++ mptcp_cleanup_path_manager(mpcb);
8415 ++ mptcp_cleanup_scheduler(mpcb);
8416 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8417 ++ }
8418 ++}
8419 ++
8420 ++static void mptcp_sock_destruct(struct sock *sk)
8421 ++{
8422 ++ struct tcp_sock *tp = tcp_sk(sk);
8423 ++
8424 ++ inet_sock_destruct(sk);
8425 ++
8426 ++ if (!is_meta_sk(sk) && !tp->was_meta_sk) {
8427 ++ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list));
8428 ++
8429 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
8430 ++ tp->mptcp = NULL;
8431 ++
8432 ++ /* Taken when mpcb pointer was set */
8433 ++ sock_put(mptcp_meta_sk(sk));
8434 ++ mptcp_mpcb_put(tp->mpcb);
8435 ++ } else {
8436 ++ struct mptcp_cb *mpcb = tp->mpcb;
8437 ++ struct mptcp_tw *mptw;
8438 ++
8439 ++ /* The mpcb is disappearing - we can make the final
8440 ++ * update to the rcv_nxt of the time-wait-sock and remove
8441 ++ * its reference to the mpcb.
8442 ++ */
8443 ++ spin_lock_bh(&mpcb->tw_lock);
8444 ++ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
8445 ++ list_del_rcu(&mptw->list);
8446 ++ mptw->in_list = 0;
8447 ++ mptcp_mpcb_put(mpcb);
8448 ++ rcu_assign_pointer(mptw->mpcb, NULL);
8449 ++ }
8450 ++ spin_unlock_bh(&mpcb->tw_lock);
8451 ++
8452 ++ mptcp_mpcb_put(mpcb);
8453 ++
8454 ++ mptcp_debug("%s destroying meta-sk\n", __func__);
8455 ++ }
8456 ++
8457 ++ WARN_ON(!static_key_false(&mptcp_static_key));
8458 ++ /* Must be the last call, because is_meta_sk() above still needs the
8459 ++ * static key
8460 ++ */
8461 ++ static_key_slow_dec(&mptcp_static_key);
8462 ++}
8463 ++
8464 ++void mptcp_destroy_sock(struct sock *sk)
8465 ++{
8466 ++ if (is_meta_sk(sk)) {
8467 ++ struct sock *sk_it, *tmpsk;
8468 ++
8469 ++ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
8470 ++ mptcp_purge_ofo_queue(tcp_sk(sk));
8471 ++
8472 ++ /* We have to close all remaining subflows. Normally, they
8473 ++ * should all be about to get closed. But, if the kernel is
8474 ++ * forcing a closure (e.g., tcp_write_err), the subflows might
8475 ++ * not have been closed properly (as we are waiting for the
8476 ++ * DATA_ACK of the DATA_FIN).
8477 ++ */
8478 ++ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
8479 ++			/* tcp_close has already been called - waiting for graceful
8480 ++			 * closure, or we are retransmitting fast-close on
8481 ++			 * the subflow. The reset (or timeout) will kill the
8482 ++			 * subflow.
8483 ++ */
8484 ++ if (tcp_sk(sk_it)->closing ||
8485 ++ tcp_sk(sk_it)->send_mp_fclose)
8486 ++ continue;
8487 ++
8488 ++			/* Let the delayed work run first, to prevent the time-wait state */
8489 ++ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
8490 ++ continue;
8491 ++
8492 ++ mptcp_sub_close(sk_it, 0);
8493 ++ }
8494 ++
8495 ++ mptcp_delete_synack_timer(sk);
8496 ++ } else {
8497 ++ mptcp_del_sock(sk);
8498 ++ }
8499 ++}
8500 ++
8501 ++static void mptcp_set_state(struct sock *sk)
8502 ++{
8503 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8504 ++
8505 ++ /* Meta is not yet established - wake up the application */
8506 ++ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
8507 ++ sk->sk_state == TCP_ESTABLISHED) {
8508 ++ tcp_set_state(meta_sk, TCP_ESTABLISHED);
8509 ++
8510 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
8511 ++ meta_sk->sk_state_change(meta_sk);
8512 ++ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
8513 ++ }
8514 ++ }
8515 ++
8516 ++ if (sk->sk_state == TCP_ESTABLISHED) {
8517 ++ tcp_sk(sk)->mptcp->establish_increased = 1;
8518 ++ tcp_sk(sk)->mpcb->cnt_established++;
8519 ++ }
8520 ++}
8521 ++
8522 ++void mptcp_init_congestion_control(struct sock *sk)
8523 ++{
8524 ++ struct inet_connection_sock *icsk = inet_csk(sk);
8525 ++ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk));
8526 ++ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops;
8527 ++
8528 ++	/* The application didn't set the congestion control to use -
8529 ++	 * fall back to the default one.
8530 ++ */
8531 ++ if (ca == &tcp_init_congestion_ops)
8532 ++ goto use_default;
8533 ++
8534 ++ /* Use the same congestion control as set by the user. If the
8535 ++	 * module is not available, fall back to the default one.
8536 ++ */
8537 ++ if (!try_module_get(ca->owner)) {
8538 ++ pr_warn("%s: fallback to the system default CC\n", __func__);
8539 ++ goto use_default;
8540 ++ }
8541 ++
8542 ++ icsk->icsk_ca_ops = ca;
8543 ++ if (icsk->icsk_ca_ops->init)
8544 ++ icsk->icsk_ca_ops->init(sk);
8545 ++
8546 ++ return;
8547 ++
8548 ++use_default:
8549 ++ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
8550 ++ tcp_init_congestion_control(sk);
8551 ++}
8552 ++
8553 ++u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
8554 ++u32 mptcp_seed = 0;
8555 ++
8556 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
8557 ++{
8558 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8559 ++ u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
8560 ++ u8 input[64];
8561 ++ int i;
8562 ++
8563 ++ memset(workspace, 0, sizeof(workspace));
8564 ++
8565 ++ /* Initialize input with appropriate padding */
8566 ++ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
8567 ++ * is explicitly set too
8568 ++ */
8569 ++ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
8570 ++ input[8] = 0x80; /* Padding: First bit after message = 1 */
8571 ++ input[63] = 0x40; /* Padding: Length of the message = 64 bits */
8572 ++
8573 ++ sha_init(mptcp_hashed_key);
8574 ++ sha_transform(mptcp_hashed_key, input, workspace);
8575 ++
8576 ++ for (i = 0; i < 5; i++)
8577 ++ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
8578 ++
8579 ++ if (token)
8580 ++ *token = mptcp_hashed_key[0];
8581 ++ if (idsn)
8582 ++ *idsn = *((u64 *)&mptcp_hashed_key[3]);
8583 ++}
8584 ++
8585 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
8586 ++ u32 *hash_out)
8587 ++{
8588 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8589 ++ u8 input[128]; /* 2 512-bit blocks */
8590 ++ int i;
8591 ++
8592 ++ memset(workspace, 0, sizeof(workspace));
8593 ++
8594 ++ /* Generate key xored with ipad */
8595 ++ memset(input, 0x36, 64);
8596 ++ for (i = 0; i < 8; i++)
8597 ++ input[i] ^= key_1[i];
8598 ++ for (i = 0; i < 8; i++)
8599 ++ input[i + 8] ^= key_2[i];
8600 ++
8601 ++ memcpy(&input[64], rand_1, 4);
8602 ++ memcpy(&input[68], rand_2, 4);
8603 ++ input[72] = 0x80; /* Padding: First bit after message = 1 */
8604 ++ memset(&input[73], 0, 53);
8605 ++
8606 ++ /* Padding: Length of the message = 512 + 64 bits */
8607 ++ input[126] = 0x02;
8608 ++ input[127] = 0x40;
8609 ++
8610 ++ sha_init(hash_out);
8611 ++ sha_transform(hash_out, input, workspace);
8612 ++ memset(workspace, 0, sizeof(workspace));
8613 ++
8614 ++ sha_transform(hash_out, &input[64], workspace);
8615 ++ memset(workspace, 0, sizeof(workspace));
8616 ++
8617 ++ for (i = 0; i < 5; i++)
8618 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8619 ++
8620 ++ /* Prepare second part of hmac */
8621 ++ memset(input, 0x5C, 64);
8622 ++ for (i = 0; i < 8; i++)
8623 ++ input[i] ^= key_1[i];
8624 ++ for (i = 0; i < 8; i++)
8625 ++ input[i + 8] ^= key_2[i];
8626 ++
8627 ++ memcpy(&input[64], hash_out, 20);
8628 ++ input[84] = 0x80;
8629 ++ memset(&input[85], 0, 41);
8630 ++
8631 ++ /* Padding: Length of the message = 512 + 160 bits */
8632 ++ input[126] = 0x02;
8633 ++ input[127] = 0xA0;
8634 ++
8635 ++ sha_init(hash_out);
8636 ++ sha_transform(hash_out, input, workspace);
8637 ++ memset(workspace, 0, sizeof(workspace));
8638 ++
8639 ++ sha_transform(hash_out, &input[64], workspace);
8640 ++
8641 ++ for (i = 0; i < 5; i++)
8642 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8643 ++}
8644 ++
8645 ++static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
8646 ++{
8647 ++ /* Socket-options handled by sk_clone_lock while creating the meta-sk.
8648 ++ * ======
8649 ++ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
8650 ++ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
8651 ++ * TCP_NODELAY, TCP_CORK
8652 ++ *
8653 ++ * Socket-options handled in this function here
8654 ++ * ======
8655 ++ * TCP_DEFER_ACCEPT
8656 ++ * SO_KEEPALIVE
8657 ++ *
8658 ++ * Socket-options on the todo-list
8659 ++ * ======
8660 ++ * SO_BINDTODEVICE - should probably prevent creation of new subsocks
8661 ++ * across other devices. - what about the api-draft?
8662 ++ * SO_DEBUG
8663 ++ * SO_REUSEADDR - probably we don't care about this
8664 ++ * SO_DONTROUTE, SO_BROADCAST
8665 ++ * SO_OOBINLINE
8666 ++ * SO_LINGER
8667 ++ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
8668 ++ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
8669 ++ * SO_RXQ_OVFL
8670 ++ * TCP_COOKIE_TRANSACTIONS
8671 ++ * TCP_MAXSEG
8672 ++ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this
8673 ++ * in mptcp_retransmit_timer. AND we need to check what is
8674 ++ * about the subsockets.
8675 ++ * TCP_LINGER2
8676 ++ * TCP_WINDOW_CLAMP
8677 ++ * TCP_USER_TIMEOUT
8678 ++ * TCP_MD5SIG
8679 ++ *
8680 ++ * Socket-options of no concern for the meta-socket (but for the subsocket)
8681 ++ * ======
8682 ++ * SO_PRIORITY
8683 ++ * SO_MARK
8684 ++ * TCP_CONGESTION
8685 ++ * TCP_SYNCNT
8686 ++ * TCP_QUICKACK
8687 ++ */
8688 ++
8689 ++ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */
8690 ++ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
8691 ++
8692 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8693 ++ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
8694 ++ inet_csk_reset_keepalive_timer(meta_sk,
8695 ++ keepalive_time_when(tcp_sk(meta_sk)));
8696 ++ sock_reset_flag(master_sk, SOCK_KEEPOPEN);
8697 ++ inet_csk_delete_keepalive_timer(master_sk);
8698 ++ }
8699 ++
8700 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8701 ++ inet_sk(master_sk)->recverr = 0;
8702 ++}
8703 ++
8704 ++static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk)
8705 ++{
8706 ++ /* IP_TOS also goes to the subflow. */
8707 ++ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
8708 ++ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
8709 ++ sub_sk->sk_priority = meta_sk->sk_priority;
8710 ++ sk_dst_reset(sub_sk);
8711 ++ }
8712 ++
8713 ++ /* Inherit SO_REUSEADDR */
8714 ++ sub_sk->sk_reuse = meta_sk->sk_reuse;
8715 ++
8716 ++ /* Inherit snd/rcv-buffer locks */
8717 ++ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
8718 ++
8719 ++ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */
8720 ++ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
8721 ++
8722 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8723 ++ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) {
8724 ++ sock_reset_flag(sub_sk, SOCK_KEEPOPEN);
8725 ++ inet_csk_delete_keepalive_timer(sub_sk);
8726 ++ }
8727 ++
8728 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8729 ++ inet_sk(sub_sk)->recverr = 0;
8730 ++}
8731 ++
8732 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
8733 ++{
8734 ++	/* skb->sk may be NULL if we receive a packet immediately after the
8735 ++ * SYN/ACK + MP_CAPABLE.
8736 ++ */
8737 ++ struct sock *sk = skb->sk ? skb->sk : meta_sk;
8738 ++ int ret = 0;
8739 ++
8740 ++ skb->sk = NULL;
8741 ++
8742 ++ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
8743 ++ kfree_skb(skb);
8744 ++ return 0;
8745 ++ }
8746 ++
8747 ++ if (sk->sk_family == AF_INET)
8748 ++ ret = tcp_v4_do_rcv(sk, skb);
8749 ++#if IS_ENABLED(CONFIG_IPV6)
8750 ++ else
8751 ++ ret = tcp_v6_do_rcv(sk, skb);
8752 ++#endif
8753 ++
8754 ++ sock_put(sk);
8755 ++ return ret;
8756 ++}
8757 ++
8758 ++struct lock_class_key meta_key;
8759 ++struct lock_class_key meta_slock_key;
8760 ++
8761 ++static void mptcp_synack_timer_handler(unsigned long data)
8762 ++{
8763 ++ struct sock *meta_sk = (struct sock *) data;
8764 ++ struct listen_sock *lopt = inet_csk(meta_sk)->icsk_accept_queue.listen_opt;
8765 ++
8766 ++ /* Only process if socket is not in use. */
8767 ++ bh_lock_sock(meta_sk);
8768 ++
8769 ++ if (sock_owned_by_user(meta_sk)) {
8770 ++ /* Try again later. */
8771 ++ mptcp_reset_synack_timer(meta_sk, HZ/20);
8772 ++ goto out;
8773 ++ }
8774 ++
8775 ++	/* May happen if the queue got destroyed in mptcp_close */
8776 ++ if (!lopt)
8777 ++ goto out;
8778 ++
8779 ++ inet_csk_reqsk_queue_prune(meta_sk, TCP_SYNQ_INTERVAL,
8780 ++ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
8781 ++
8782 ++ if (lopt->qlen)
8783 ++ mptcp_reset_synack_timer(meta_sk, TCP_SYNQ_INTERVAL);
8784 ++
8785 ++out:
8786 ++ bh_unlock_sock(meta_sk);
8787 ++ sock_put(meta_sk);
8788 ++}
8789 ++
8790 ++static const struct tcp_sock_ops mptcp_meta_specific = {
8791 ++ .__select_window = __mptcp_select_window,
8792 ++ .select_window = mptcp_select_window,
8793 ++ .select_initial_window = mptcp_select_initial_window,
8794 ++ .init_buffer_space = mptcp_init_buffer_space,
8795 ++ .set_rto = mptcp_tcp_set_rto,
8796 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8797 ++ .init_congestion_control = mptcp_init_congestion_control,
8798 ++ .send_fin = mptcp_send_fin,
8799 ++ .write_xmit = mptcp_write_xmit,
8800 ++ .send_active_reset = mptcp_send_active_reset,
8801 ++ .write_wakeup = mptcp_write_wakeup,
8802 ++ .prune_ofo_queue = mptcp_prune_ofo_queue,
8803 ++ .retransmit_timer = mptcp_retransmit_timer,
8804 ++ .time_wait = mptcp_time_wait,
8805 ++ .cleanup_rbuf = mptcp_cleanup_rbuf,
8806 ++};
8807 ++
8808 ++static const struct tcp_sock_ops mptcp_sub_specific = {
8809 ++ .__select_window = __mptcp_select_window,
8810 ++ .select_window = mptcp_select_window,
8811 ++ .select_initial_window = mptcp_select_initial_window,
8812 ++ .init_buffer_space = mptcp_init_buffer_space,
8813 ++ .set_rto = mptcp_tcp_set_rto,
8814 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8815 ++ .init_congestion_control = mptcp_init_congestion_control,
8816 ++ .send_fin = tcp_send_fin,
8817 ++ .write_xmit = tcp_write_xmit,
8818 ++ .send_active_reset = tcp_send_active_reset,
8819 ++ .write_wakeup = tcp_write_wakeup,
8820 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
8821 ++ .retransmit_timer = tcp_retransmit_timer,
8822 ++ .time_wait = tcp_time_wait,
8823 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
8824 ++};
8825 ++
8826 ++static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
8827 ++{
8828 ++ struct mptcp_cb *mpcb;
8829 ++ struct sock *master_sk;
8830 ++ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
8831 ++ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
8832 ++ u64 idsn;
8833 ++
8834 ++ dst_release(meta_sk->sk_rx_dst);
8835 ++ meta_sk->sk_rx_dst = NULL;
8836 ++	/* This flag tells sock_lock_init to
8837 ++	 * reclassify the lock-class of the master socket.
8838 ++ */
8839 ++ meta_tp->is_master_sk = 1;
8840 ++ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO);
8841 ++ meta_tp->is_master_sk = 0;
8842 ++ if (!master_sk)
8843 ++ return -ENOBUFS;
8844 ++
8845 ++ master_tp = tcp_sk(master_sk);
8846 ++ master_icsk = inet_csk(master_sk);
8847 ++
8848 ++ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
8849 ++ if (!mpcb) {
8850 ++		/* sk_free (and __sk_free) requires wmem_alloc to be 1.
8851 ++ * All the rest is set to 0 thanks to __GFP_ZERO above.
8852 ++ */
8853 ++ atomic_set(&master_sk->sk_wmem_alloc, 1);
8854 ++ sk_free(master_sk);
8855 ++ return -ENOBUFS;
8856 ++ }
8857 ++
8858 ++#if IS_ENABLED(CONFIG_IPV6)
8859 ++ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) {
8860 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8861 ++
8862 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8863 ++
8864 ++ newnp = inet6_sk(master_sk);
8865 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8866 ++
8867 ++ newnp->ipv6_mc_list = NULL;
8868 ++ newnp->ipv6_ac_list = NULL;
8869 ++ newnp->ipv6_fl_list = NULL;
8870 ++ newnp->opt = NULL;
8871 ++ newnp->pktoptions = NULL;
8872 ++ (void)xchg(&newnp->rxpmtu, NULL);
8873 ++ } else if (meta_sk->sk_family == AF_INET6) {
8874 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8875 ++
8876 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8877 ++
8878 ++ newnp = inet6_sk(master_sk);
8879 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8880 ++
8881 ++ newnp->hop_limit = -1;
8882 ++ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
8883 ++ newnp->mc_loop = 1;
8884 ++ newnp->pmtudisc = IPV6_PMTUDISC_WANT;
8885 ++ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
8886 ++ }
8887 ++#endif
8888 ++
8889 ++ meta_tp->mptcp = NULL;
8890 ++
8891 ++ /* Store the keys and generate the peer's token */
8892 ++ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
8893 ++ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
8894 ++
8895 ++ /* Generate Initial data-sequence-numbers */
8896 ++ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
8897 ++ idsn = ntohll(idsn) + 1;
8898 ++ mpcb->snd_high_order[0] = idsn >> 32;
8899 ++ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
8900 ++
8901 ++ meta_tp->write_seq = (u32)idsn;
8902 ++ meta_tp->snd_sml = meta_tp->write_seq;
8903 ++ meta_tp->snd_una = meta_tp->write_seq;
8904 ++ meta_tp->snd_nxt = meta_tp->write_seq;
8905 ++ meta_tp->pushed_seq = meta_tp->write_seq;
8906 ++ meta_tp->snd_up = meta_tp->write_seq;
8907 ++
8908 ++ mpcb->mptcp_rem_key = remote_key;
8909 ++ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
8910 ++ idsn = ntohll(idsn) + 1;
8911 ++ mpcb->rcv_high_order[0] = idsn >> 32;
8912 ++ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
8913 ++ meta_tp->copied_seq = (u32) idsn;
8914 ++ meta_tp->rcv_nxt = (u32) idsn;
8915 ++ meta_tp->rcv_wup = (u32) idsn;
8916 ++
8917 ++ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
8918 ++ meta_tp->snd_wnd = window;
8919 ++ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
8920 ++
8921 ++ meta_tp->packets_out = 0;
8922 ++ meta_icsk->icsk_probes_out = 0;
8923 ++
8924 ++ /* Set mptcp-pointers */
8925 ++ master_tp->mpcb = mpcb;
8926 ++ master_tp->meta_sk = meta_sk;
8927 ++ meta_tp->mpcb = mpcb;
8928 ++ meta_tp->meta_sk = meta_sk;
8929 ++ mpcb->meta_sk = meta_sk;
8930 ++ mpcb->master_sk = master_sk;
8931 ++
8932 ++ meta_tp->was_meta_sk = 0;
8933 ++
8934 ++ /* Initialize the queues */
8935 ++ skb_queue_head_init(&mpcb->reinject_queue);
8936 ++ skb_queue_head_init(&master_tp->out_of_order_queue);
8937 ++ tcp_prequeue_init(master_tp);
8938 ++ INIT_LIST_HEAD(&master_tp->tsq_node);
8939 ++
8940 ++ master_tp->tsq_flags = 0;
8941 ++
8942 ++ mutex_init(&mpcb->mpcb_mutex);
8943 ++
8944 ++	/* Init the accept_queue structure. We support a queue of 32 pending
8945 ++	 * connections; it does not need to be huge, since we only store
8946 ++	 * pending subflow creations here.
8947 ++ */
8948 ++ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
8949 ++ inet_put_port(master_sk);
8950 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8951 ++ sk_free(master_sk);
8952 ++ return -ENOMEM;
8953 ++ }
8954 ++
8955 ++ /* Redefine function-pointers as the meta-sk is now fully ready */
8956 ++ static_key_slow_inc(&mptcp_static_key);
8957 ++ meta_tp->mpc = 1;
8958 ++ meta_tp->ops = &mptcp_meta_specific;
8959 ++
8960 ++ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
8961 ++ meta_sk->sk_destruct = mptcp_sock_destruct;
8962 ++
8963 ++ /* Meta-level retransmit timer */
8964 ++	meta_icsk->icsk_rto *= 2; /* Double the initial RTO */
8965 ++
8966 ++ tcp_init_xmit_timers(master_sk);
8967 ++ /* Has been set for sending out the SYN */
8968 ++ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
8969 ++
8970 ++ if (!meta_tp->inside_tk_table) {
8971 ++		/* Add the meta_tp to the token hashtable - coming from the server side */
8972 ++ rcu_read_lock();
8973 ++ spin_lock(&mptcp_tk_hashlock);
8974 ++
8975 ++ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
8976 ++
8977 ++ spin_unlock(&mptcp_tk_hashlock);
8978 ++ rcu_read_unlock();
8979 ++ }
8980 ++ master_tp->inside_tk_table = 0;
8981 ++
8982 ++ /* Init time-wait stuff */
8983 ++ INIT_LIST_HEAD(&mpcb->tw_list);
8984 ++ spin_lock_init(&mpcb->tw_lock);
8985 ++
8986 ++ INIT_HLIST_HEAD(&mpcb->callback_list);
8987 ++
8988 ++ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
8989 ++
8990 ++ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
8991 ++ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
8992 ++ mpcb->orig_window_clamp = meta_tp->window_clamp;
8993 ++
8994 ++ /* The meta is directly linked - set refcnt to 1 */
8995 ++ atomic_set(&mpcb->mpcb_refcnt, 1);
8996 ++
8997 ++ mptcp_init_path_manager(mpcb);
8998 ++ mptcp_init_scheduler(mpcb);
8999 ++
9000 ++ setup_timer(&mpcb->synack_timer, mptcp_synack_timer_handler,
9001 ++ (unsigned long)meta_sk);
9002 ++
9003 ++ mptcp_debug("%s: created mpcb with token %#x\n",
9004 ++ __func__, mpcb->mptcp_loc_token);
9005 ++
9006 ++ return 0;
9007 ++}
9008 ++
9009 ++void mptcp_fallback_meta_sk(struct sock *meta_sk)
9010 ++{
9011 ++ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
9012 ++ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
9013 ++}
9014 ++
9015 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
9016 ++ gfp_t flags)
9017 ++{
9018 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9019 ++ struct tcp_sock *tp = tcp_sk(sk);
9020 ++
9021 ++ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
9022 ++ if (!tp->mptcp)
9023 ++ return -ENOMEM;
9024 ++
9025 ++ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
9026 ++ /* No more space for more subflows? */
9027 ++ if (!tp->mptcp->path_index) {
9028 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
9029 ++ return -EPERM;
9030 ++ }
9031 ++
9032 ++ INIT_HLIST_NODE(&tp->mptcp->cb_list);
9033 ++
9034 ++ tp->mptcp->tp = tp;
9035 ++ tp->mpcb = mpcb;
9036 ++ tp->meta_sk = meta_sk;
9037 ++
9038 ++ static_key_slow_inc(&mptcp_static_key);
9039 ++ tp->mpc = 1;
9040 ++ tp->ops = &mptcp_sub_specific;
9041 ++
9042 ++ tp->mptcp->loc_id = loc_id;
9043 ++ tp->mptcp->rem_id = rem_id;
9044 ++ if (mpcb->sched_ops->init)
9045 ++ mpcb->sched_ops->init(sk);
9046 ++
9047 ++ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
9048 ++ * included in mptcp_del_sock(), because the mpcb must remain alive
9049 ++ * until the last subsocket is completely destroyed.
9050 ++ */
9051 ++ sock_hold(meta_sk);
9052 ++ atomic_inc(&mpcb->mpcb_refcnt);
9053 ++
9054 ++ tp->mptcp->next = mpcb->connection_list;
9055 ++ mpcb->connection_list = tp;
9056 ++ tp->mptcp->attached = 1;
9057 ++
9058 ++ mpcb->cnt_subflows++;
9059 ++ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
9060 ++ &meta_sk->sk_rmem_alloc);
9061 ++
9062 ++ mptcp_sub_inherit_sockopts(meta_sk, sk);
9063 ++ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
9064 ++
9065 ++ /* As we successfully allocated the mptcp_tcp_sock, we have to
9066 ++ * change the function-pointers here (for sk_destruct to work correctly)
9067 ++ */
9068 ++ sk->sk_error_report = mptcp_sock_def_error_report;
9069 ++ sk->sk_data_ready = mptcp_data_ready;
9070 ++ sk->sk_write_space = mptcp_write_space;
9071 ++ sk->sk_state_change = mptcp_set_state;
9072 ++ sk->sk_destruct = mptcp_sock_destruct;
9073 ++
9074 ++ if (sk->sk_family == AF_INET)
9075 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
9076 ++ __func__ , mpcb->mptcp_loc_token,
9077 ++ tp->mptcp->path_index,
9078 ++ &((struct inet_sock *)tp)->inet_saddr,
9079 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9080 ++ &((struct inet_sock *)tp)->inet_daddr,
9081 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9082 ++ mpcb->cnt_subflows);
9083 ++#if IS_ENABLED(CONFIG_IPV6)
9084 ++ else
9085 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
9086 ++ __func__ , mpcb->mptcp_loc_token,
9087 ++ tp->mptcp->path_index, &inet6_sk(sk)->saddr,
9088 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9089 ++ &sk->sk_v6_daddr,
9090 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9091 ++ mpcb->cnt_subflows);
9092 ++#endif
9093 ++
9094 ++ return 0;
9095 ++}
9096 ++
9097 ++void mptcp_del_sock(struct sock *sk)
9098 ++{
9099 ++ struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
9100 ++ struct mptcp_cb *mpcb;
9101 ++
9102 ++ if (!tp->mptcp || !tp->mptcp->attached)
9103 ++ return;
9104 ++
9105 ++ mpcb = tp->mpcb;
9106 ++ tp_prev = mpcb->connection_list;
9107 ++
9108 ++ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
9109 ++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
9110 ++ sk->sk_state, is_meta_sk(sk));
9111 ++
9112 ++ if (tp_prev == tp) {
9113 ++ mpcb->connection_list = tp->mptcp->next;
9114 ++ } else {
9115 ++ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
9116 ++ if (tp_prev->mptcp->next == tp) {
9117 ++ tp_prev->mptcp->next = tp->mptcp->next;
9118 ++ break;
9119 ++ }
9120 ++ }
9121 ++ }
9122 ++ mpcb->cnt_subflows--;
9123 ++ if (tp->mptcp->establish_increased)
9124 ++ mpcb->cnt_established--;
9125 ++
9126 ++ tp->mptcp->next = NULL;
9127 ++ tp->mptcp->attached = 0;
9128 ++ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
9129 ++
9130 ++ if (!skb_queue_empty(&sk->sk_write_queue))
9131 ++ mptcp_reinject_data(sk, 0);
9132 ++
9133 ++ if (is_master_tp(tp))
9134 ++ mpcb->master_sk = NULL;
9135 ++ else if (tp->mptcp->pre_established)
9136 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
9137 ++
9138 ++ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
9139 ++}
9140 ++
9141 ++/* Updates the metasocket ULID/port data, based on the given sock.
9142 ++ * The argument sock must be the sock accessible to the application.
9143 ++ * In this function, we update the meta socket info, based on the changes
9144 ++ * in the application socket (bind, address allocation, ...)
9145 ++ */
9146 ++void mptcp_update_metasocket(struct sock *sk, const struct sock *meta_sk)
9147 ++{
9148 ++ if (tcp_sk(sk)->mpcb->pm_ops->new_session)
9149 ++ tcp_sk(sk)->mpcb->pm_ops->new_session(meta_sk);
9150 ++
9151 ++ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
9152 ++}
9153 ++
9154 ++/* Clean up the receive buffer for full frames taken by the user,
9155 ++ * then send an ACK if necessary. COPIED is the number of bytes
9156 ++ * tcp_recvmsg has given to the user so far, it speeds up the
9157 ++ * calculation of whether or not we must ACK for the sake of
9158 ++ * a window update.
9159 ++ */
9160 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
9161 ++{
9162 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9163 ++ struct sock *sk;
9164 ++ __u32 rcv_window_now = 0;
9165 ++
9166 ++ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
9167 ++ rcv_window_now = tcp_receive_window(meta_tp);
9168 ++
9169 ++ if (2 * rcv_window_now > meta_tp->window_clamp)
9170 ++ rcv_window_now = 0;
9171 ++ }
9172 ++
9173 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
9174 ++ struct tcp_sock *tp = tcp_sk(sk);
9175 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
9176 ++
9177 ++ if (!mptcp_sk_can_send_ack(sk))
9178 ++ continue;
9179 ++
9180 ++ if (!inet_csk_ack_scheduled(sk))
9181 ++ goto second_part;
9182 ++ /* Delayed ACKs frequently hit locked sockets during bulk
9183 ++ * receive.
9184 ++ */
9185 ++ if (icsk->icsk_ack.blocked ||
9186 ++ /* Once-per-two-segments ACK was not sent by tcp_input.c */
9187 ++ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
9188 ++ /* If this read emptied read buffer, we send ACK, if
9189 ++ * connection is not bidirectional, user drained
9190 ++ * receive buffer and there was a small segment
9191 ++ * in queue.
9192 ++ */
9193 ++ (copied > 0 &&
9194 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
9195 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
9196 ++ !icsk->icsk_ack.pingpong)) &&
9197 ++ !atomic_read(&meta_sk->sk_rmem_alloc))) {
9198 ++ tcp_send_ack(sk);
9199 ++ continue;
9200 ++ }
9201 ++
9202 ++second_part:
9203 ++ /* This here is the second part of tcp_cleanup_rbuf */
9204 ++ if (rcv_window_now) {
9205 ++ __u32 new_window = tp->ops->__select_window(sk);
9206 ++
9207 ++ /* Send ACK now, if this read freed lots of space
9208 ++ * in our buffer. Certainly, new_window is new window.
9209 ++ * We can advertise it now, if it is not less than
9210 ++ * current one.
9211 ++ * "Lots" means "at least twice" here.
9212 ++ */
9213 ++ if (new_window && new_window >= 2 * rcv_window_now)
9214 ++ tcp_send_ack(sk);
9215 ++ }
9216 ++ }
9217 ++}
9218 ++
9219 ++static int mptcp_sub_send_fin(struct sock *sk)
9220 ++{
9221 ++ struct tcp_sock *tp = tcp_sk(sk);
9222 ++ struct sk_buff *skb = tcp_write_queue_tail(sk);
9223 ++ int mss_now;
9224 ++
9225 ++ /* Optimization, tack on the FIN if we have a queue of
9226 ++ * unsent frames. But be careful about outgoing SACKS
9227 ++ * and IP options.
9228 ++ */
9229 ++ mss_now = tcp_current_mss(sk);
9230 ++
9231 ++ if (tcp_send_head(sk) != NULL) {
9232 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
9233 ++ TCP_SKB_CB(skb)->end_seq++;
9234 ++ tp->write_seq++;
9235 ++ } else {
9236 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
9237 ++ if (!skb)
9238 ++ return 1;
9239 ++
9240 ++ /* Reserve space for headers and prepare control bits. */
9241 ++ skb_reserve(skb, MAX_TCP_HEADER);
9242 ++ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
9243 ++ tcp_init_nondata_skb(skb, tp->write_seq,
9244 ++ TCPHDR_ACK | TCPHDR_FIN);
9245 ++ tcp_queue_skb(sk, skb);
9246 ++ }
9247 ++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
9248 ++
9249 ++ return 0;
9250 ++}
9251 ++
9252 ++void mptcp_sub_close_wq(struct work_struct *work)
9253 ++{
9254 ++ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
9255 ++ struct sock *sk = (struct sock *)tp;
9256 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
9257 ++
9258 ++ mutex_lock(&tp->mpcb->mpcb_mutex);
9259 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
9260 ++
9261 ++ if (sock_flag(sk, SOCK_DEAD))
9262 ++ goto exit;
9263 ++
9264 ++ /* We come from tcp_disconnect. We are sure that meta_sk is set */
9265 ++ if (!mptcp(tp)) {
9266 ++ tp->closing = 1;
9267 ++ sock_rps_reset_flow(sk);
9268 ++ tcp_close(sk, 0);
9269 ++ goto exit;
9270 ++ }
9271 ++
9272 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
9273 ++ tp->closing = 1;
9274 ++ sock_rps_reset_flow(sk);
9275 ++ tcp_close(sk, 0);
9276 ++ } else if (tcp_close_state(sk)) {
9277 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9278 ++ tcp_send_fin(sk);
9279 ++ }
9280 ++
9281 ++exit:
9282 ++ release_sock(meta_sk);
9283 ++ mutex_unlock(&tp->mpcb->mpcb_mutex);
9284 ++ sock_put(sk);
9285 ++}
9286 ++
9287 ++void mptcp_sub_close(struct sock *sk, unsigned long delay)
9288 ++{
9289 ++ struct tcp_sock *tp = tcp_sk(sk);
9290 ++ struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
9291 ++
9292 ++ /* We are already closing - e.g., call from sock_def_error_report upon
9293 ++ * tcp_disconnect in tcp_close.
9294 ++ */
9295 ++ if (tp->closing)
9296 ++ return;
9297 ++
9298 ++	/* Work already scheduled? */
9299 ++ if (work_pending(&work->work)) {
9300 ++		/* Work present - who will be first? */
9301 ++ if (jiffies + delay > work->timer.expires)
9302 ++ return;
9303 ++
9304 ++ /* Try canceling - if it fails, work will be executed soon */
9305 ++ if (!cancel_delayed_work(work))
9306 ++ return;
9307 ++ sock_put(sk);
9308 ++ }
9309 ++
9310 ++ if (!delay) {
9311 ++ unsigned char old_state = sk->sk_state;
9312 ++
9313 ++ /* If we are in user-context we can directly do the closing
9314 ++ * procedure. No need to schedule a work-queue.
9315 ++ */
9316 ++ if (!in_softirq()) {
9317 ++ if (sock_flag(sk, SOCK_DEAD))
9318 ++ return;
9319 ++
9320 ++ if (!mptcp(tp)) {
9321 ++ tp->closing = 1;
9322 ++ sock_rps_reset_flow(sk);
9323 ++ tcp_close(sk, 0);
9324 ++ return;
9325 ++ }
9326 ++
9327 ++ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
9328 ++ sk->sk_state == TCP_CLOSE) {
9329 ++ tp->closing = 1;
9330 ++ sock_rps_reset_flow(sk);
9331 ++ tcp_close(sk, 0);
9332 ++ } else if (tcp_close_state(sk)) {
9333 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9334 ++ tcp_send_fin(sk);
9335 ++ }
9336 ++
9337 ++ return;
9338 ++ }
9339 ++
9340 ++		/* We directly send the FIN, because it may take a long time
9341 ++		 * until the work-queue gets scheduled...
9342 ++ *
9343 ++ * If mptcp_sub_send_fin returns 1, it failed and thus we reset
9344 ++ * the old state so that tcp_close will finally send the fin
9345 ++ * in user-context.
9346 ++ */
9347 ++ if (!sk->sk_err && old_state != TCP_CLOSE &&
9348 ++ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
9349 ++ if (old_state == TCP_ESTABLISHED)
9350 ++ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
9351 ++ sk->sk_state = old_state;
9352 ++ }
9353 ++ }
9354 ++
9355 ++ sock_hold(sk);
9356 ++ queue_delayed_work(mptcp_wq, work, delay);
9357 ++}
9358 ++
9359 ++void mptcp_sub_force_close(struct sock *sk)
9360 ++{
9361 ++	/* The tcp_done below may have freed the socket, if it is already dead.
9362 ++ * Thus, we are not allowed to access it afterwards. That's why
9363 ++ * we have to store the dead-state in this local variable.
9364 ++ */
9365 ++ int sock_is_dead = sock_flag(sk, SOCK_DEAD);
9366 ++
9367 ++ tcp_sk(sk)->mp_killed = 1;
9368 ++
9369 ++ if (sk->sk_state != TCP_CLOSE)
9370 ++ tcp_done(sk);
9371 ++
9372 ++ if (!sock_is_dead)
9373 ++ mptcp_sub_close(sk, 0);
9374 ++}
9375 ++EXPORT_SYMBOL(mptcp_sub_force_close);
9376 ++
9377 ++/* Update the mpcb send buffer, based on the contributions
9378 ++ * of each subflow
9379 ++ */
9380 ++void mptcp_update_sndbuf(const struct tcp_sock *tp)
9381 ++{
9382 ++ struct sock *meta_sk = tp->meta_sk, *sk;
9383 ++ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
9384 ++
9385 ++ mptcp_for_each_sk(tp->mpcb, sk) {
9386 ++ if (!mptcp_sk_can_send(sk))
9387 ++ continue;
9388 ++
9389 ++ new_sndbuf += sk->sk_sndbuf;
9390 ++
9391 ++ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
9392 ++ new_sndbuf = sysctl_tcp_wmem[2];
9393 ++ break;
9394 ++ }
9395 ++ }
9396 ++ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
9397 ++
9398 ++ /* The subflow's call to sk_write_space in tcp_new_space ends up in
9399 ++ * mptcp_write_space.
9400 ++ * It has nothing to do with waking up the application.
9401 ++ * So, we do it here.
9402 ++ */
9403 ++ if (old_sndbuf != meta_sk->sk_sndbuf)
9404 ++ meta_sk->sk_write_space(meta_sk);
9405 ++}
9406 ++
9407 ++void mptcp_close(struct sock *meta_sk, long timeout)
9408 ++{
9409 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9410 ++ struct sock *sk_it, *tmpsk;
9411 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
9412 ++ struct sk_buff *skb;
9413 ++ int data_was_unread = 0;
9414 ++ int state;
9415 ++
9416 ++ mptcp_debug("%s: Close of meta_sk with tok %#x\n",
9417 ++ __func__, mpcb->mptcp_loc_token);
9418 ++
9419 ++ mutex_lock(&mpcb->mpcb_mutex);
9420 ++ lock_sock(meta_sk);
9421 ++
9422 ++ if (meta_tp->inside_tk_table) {
9423 ++ /* Detach the mpcb from the token hashtable */
9424 ++ mptcp_hash_remove_bh(meta_tp);
9425 ++ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
9426 ++ }
9427 ++
9428 ++ meta_sk->sk_shutdown = SHUTDOWN_MASK;
9429 ++ /* We need to flush the recv. buffs. We do this only on the
9430 ++ * descriptor close, not protocol-sourced closes, because the
9431 ++ * reader process may not have drained the data yet!
9432 ++ */
9433 ++ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
9434 ++ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
9435 ++ tcp_hdr(skb)->fin;
9436 ++ data_was_unread += len;
9437 ++ __kfree_skb(skb);
9438 ++ }
9439 ++
9440 ++ sk_mem_reclaim(meta_sk);
9441 ++
9442 ++ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
9443 ++ if (meta_sk->sk_state == TCP_CLOSE) {
9444 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9445 ++ if (tcp_sk(sk_it)->send_mp_fclose)
9446 ++ continue;
9447 ++ mptcp_sub_close(sk_it, 0);
9448 ++ }
9449 ++ goto adjudge_to_death;
9450 ++ }
9451 ++
9452 ++ if (data_was_unread) {
9453 ++ /* Unread data was tossed, zap the connection. */
9454 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
9455 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9456 ++ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk,
9457 ++ meta_sk->sk_allocation);
9458 ++ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
9459 ++ /* Check zero linger _after_ checking for unread data. */
9460 ++ meta_sk->sk_prot->disconnect(meta_sk, 0);
9461 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
9462 ++ } else if (tcp_close_state(meta_sk)) {
9463 ++ mptcp_send_fin(meta_sk);
9464 ++ } else if (meta_tp->snd_una == meta_tp->write_seq) {
9465 ++ /* The DATA_FIN has been sent and acknowledged
9466 ++ * (e.g., by sk_shutdown). Close all the other subflows
9467 ++ */
9468 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9469 ++ unsigned long delay = 0;
9470 ++ /* If we are the passive closer, don't trigger
9471 ++ * subflow-fin until the subflow has been finned
9472 ++			 * by the peer - thus we add a delay.
9473 ++ */
9474 ++ if (mpcb->passive_close &&
9475 ++ sk_it->sk_state == TCP_ESTABLISHED)
9476 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
9477 ++
9478 ++ mptcp_sub_close(sk_it, delay);
9479 ++ }
9480 ++ }
9481 ++
9482 ++ sk_stream_wait_close(meta_sk, timeout);
9483 ++
9484 ++adjudge_to_death:
9485 ++ state = meta_sk->sk_state;
9486 ++ sock_hold(meta_sk);
9487 ++ sock_orphan(meta_sk);
9488 ++
9489 ++ /* socket will be freed after mptcp_close - we have to prevent
9490 ++ * access from the subflows.
9491 ++ */
9492 ++ mptcp_for_each_sk(mpcb, sk_it) {
9493 ++ /* Similar to sock_orphan, but we don't set it DEAD, because
9494 ++ * the callbacks are still set and must be called.
9495 ++ */
9496 ++ write_lock_bh(&sk_it->sk_callback_lock);
9497 ++ sk_set_socket(sk_it, NULL);
9498 ++ sk_it->sk_wq = NULL;
9499 ++ write_unlock_bh(&sk_it->sk_callback_lock);
9500 ++ }
9501 ++
9502 ++ /* It is the last release_sock in its life. It will remove backlog. */
9503 ++ release_sock(meta_sk);
9504 ++
9505 ++ /* Now socket is owned by kernel and we acquire BH lock
9506 ++ * to finish close. No need to check for user refs.
9507 ++ */
9508 ++ local_bh_disable();
9509 ++ bh_lock_sock(meta_sk);
9510 ++ WARN_ON(sock_owned_by_user(meta_sk));
9511 ++
9512 ++ percpu_counter_inc(meta_sk->sk_prot->orphan_count);
9513 ++
9514 ++ /* Have we already been destroyed by a softirq or backlog? */
9515 ++ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
9516 ++ goto out;
9517 ++
9518 ++ /* This is a (useful) BSD violating of the RFC. There is a
9519 ++ * problem with TCP as specified in that the other end could
9520 ++ * keep a socket open forever with no application left this end.
9521 ++ * We use a 3 minute timeout (about the same as BSD) then kill
9522 ++ * our end. If they send after that then tough - BUT: long enough
9523 ++ * that we won't make the old 4*rto = almost no time - whoops
9524 ++ * reset mistake.
9525 ++ *
9526 ++ * Nope, it was not mistake. It is really desired behaviour
9527 ++ * f.e. on http servers, when such sockets are useless, but
9528 ++ * consume significant resources. Let's do it with special
9529 ++ * linger2 option. --ANK
9530 ++ */
9531 ++
9532 ++ if (meta_sk->sk_state == TCP_FIN_WAIT2) {
9533 ++ if (meta_tp->linger2 < 0) {
9534 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9535 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9536 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9537 ++ LINUX_MIB_TCPABORTONLINGER);
9538 ++ } else {
9539 ++ const int tmo = tcp_fin_time(meta_sk);
9540 ++
9541 ++ if (tmo > TCP_TIMEWAIT_LEN) {
9542 ++ inet_csk_reset_keepalive_timer(meta_sk,
9543 ++ tmo - TCP_TIMEWAIT_LEN);
9544 ++ } else {
9545 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2,
9546 ++ tmo);
9547 ++ goto out;
9548 ++ }
9549 ++ }
9550 ++ }
9551 ++ if (meta_sk->sk_state != TCP_CLOSE) {
9552 ++ sk_mem_reclaim(meta_sk);
9553 ++ if (tcp_too_many_orphans(meta_sk, 0)) {
9554 ++ if (net_ratelimit())
9555 ++ pr_info("MPTCP: too many orphaned sockets\n");
9556 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9557 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9558 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9559 ++ LINUX_MIB_TCPABORTONMEMORY);
9560 ++ }
9561 ++ }
9562 ++
9563 ++
9564 ++ if (meta_sk->sk_state == TCP_CLOSE)
9565 ++ inet_csk_destroy_sock(meta_sk);
9566 ++ /* Otherwise, socket is reprieved until protocol close. */
9567 ++
9568 ++out:
9569 ++ bh_unlock_sock(meta_sk);
9570 ++ local_bh_enable();
9571 ++ mutex_unlock(&mpcb->mpcb_mutex);
9572 ++ sock_put(meta_sk); /* Taken by sock_hold */
9573 ++}
9574 ++
9575 ++void mptcp_disconnect(struct sock *sk)
9576 ++{
9577 ++ struct sock *subsk, *tmpsk;
9578 ++ struct tcp_sock *tp = tcp_sk(sk);
9579 ++
9580 ++ mptcp_delete_synack_timer(sk);
9581 ++
9582 ++ __skb_queue_purge(&tp->mpcb->reinject_queue);
9583 ++
9584 ++ if (tp->inside_tk_table) {
9585 ++ mptcp_hash_remove_bh(tp);
9586 ++ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
9587 ++ }
9588 ++
9589 ++ local_bh_disable();
9590 ++ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
9591 ++ /* The socket will get removed from the subsocket-list
9592 ++ * and made non-mptcp by setting mpc to 0.
9593 ++ *
9594 ++ * This is necessary, because tcp_disconnect assumes
9595 ++ * that the connection is completely dead afterwards.
9596 ++ * Thus we need to do a mptcp_del_sock. Due to this call
9597 ++ * we have to make it non-mptcp.
9598 ++ *
9599 ++ * We have to lock the socket, because we set mpc to 0.
9600 ++ * An incoming packet would take the subsocket's lock
9601 ++ * and go on into the receive-path.
9602 ++ * This would be a race.
9603 ++ */
9604 ++
9605 ++ bh_lock_sock(subsk);
9606 ++ mptcp_del_sock(subsk);
9607 ++ tcp_sk(subsk)->mpc = 0;
9608 ++ tcp_sk(subsk)->ops = &tcp_specific;
9609 ++ mptcp_sub_force_close(subsk);
9610 ++ bh_unlock_sock(subsk);
9611 ++ }
9612 ++ local_bh_enable();
9613 ++
9614 ++ tp->was_meta_sk = 1;
9615 ++ tp->mpc = 0;
9616 ++ tp->ops = &tcp_specific;
9617 ++}
9618 ++
9619 ++
9620 ++/* Returns 1 if we should enable MPTCP for that socket. */
9621 ++int mptcp_doit(struct sock *sk)
9622 ++{
9623 ++ /* Do not allow MPTCP enabling if the MPTCP initialization failed */
9624 ++ if (mptcp_init_failed)
9625 ++ return 0;
9626 ++
9627 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
9628 ++ return 0;
9629 ++
9630 ++ /* Socket may already be established (e.g., called from tcp_recvmsg) */
9631 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->request_mptcp)
9632 ++ return 1;
9633 ++
9634 ++ /* Don't do mptcp over loopback */
9635 ++ if (sk->sk_family == AF_INET &&
9636 ++ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
9637 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
9638 ++ return 0;
9639 ++#if IS_ENABLED(CONFIG_IPV6)
9640 ++ if (sk->sk_family == AF_INET6 &&
9641 ++ (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
9642 ++ ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
9643 ++ return 0;
9644 ++#endif
9645 ++ if (mptcp_v6_is_v4_mapped(sk) &&
9646 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr))
9647 ++ return 0;
9648 ++
9649 ++#ifdef CONFIG_TCP_MD5SIG
9650 ++ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
9651 ++ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
9652 ++ return 0;
9653 ++#endif
9654 ++
9655 ++ return 1;
9656 ++}
9657 ++
9658 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
9659 ++{
9660 ++ struct tcp_sock *master_tp;
9661 ++ struct sock *master_sk;
9662 ++
9663 ++ if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
9664 ++ goto err_alloc_mpcb;
9665 ++
9666 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9667 ++ master_tp = tcp_sk(master_sk);
9668 ++
9669 ++ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
9670 ++ goto err_add_sock;
9671 ++
9672 ++ if (__inet_inherit_port(meta_sk, master_sk) < 0)
9673 ++ goto err_add_sock;
9674 ++
9675 ++ meta_sk->sk_prot->unhash(meta_sk);
9676 ++
9677 ++ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
9678 ++ __inet_hash_nolisten(master_sk, NULL);
9679 ++#if IS_ENABLED(CONFIG_IPV6)
9680 ++ else
9681 ++ __inet6_hash(master_sk, NULL);
9682 ++#endif
9683 ++
9684 ++ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
9685 ++
9686 ++ return 0;
9687 ++
9688 ++err_add_sock:
9689 ++ mptcp_fallback_meta_sk(meta_sk);
9690 ++
9691 ++ inet_csk_prepare_forced_close(master_sk);
9692 ++ tcp_done(master_sk);
9693 ++ inet_csk_prepare_forced_close(meta_sk);
9694 ++ tcp_done(meta_sk);
9695 ++
9696 ++err_alloc_mpcb:
9697 ++ return -ENOBUFS;
9698 ++}
9699 ++
9700 ++static int __mptcp_check_req_master(struct sock *child,
9701 ++ struct request_sock *req)
9702 ++{
9703 ++ struct tcp_sock *child_tp = tcp_sk(child);
9704 ++ struct sock *meta_sk = child;
9705 ++ struct mptcp_cb *mpcb;
9706 ++ struct mptcp_request_sock *mtreq;
9707 ++
9708 ++ /* Never contained an MP_CAPABLE */
9709 ++ if (!inet_rsk(req)->mptcp_rqsk)
9710 ++ return 1;
9711 ++
9712 ++ if (!inet_rsk(req)->saw_mpc) {
9713 ++ /* Fall back to regular TCP, because we saw one SYN without
9714 ++ * MP_CAPABLE. In tcp_check_req we continue the regular path.
9715 ++ * But, the socket has been added to the reqsk_tk_htb, so we
9716 ++ * must still remove it.
9717 ++ */
9718 ++ mptcp_reqsk_remove_tk(req);
9719 ++ return 1;
9720 ++ }
9721 ++
9722 ++ /* Just set these values to pass them to mptcp_alloc_mpcb */
9723 ++ mtreq = mptcp_rsk(req);
9724 ++ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
9725 ++ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
9726 ++
9727 ++ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
9728 ++ child_tp->snd_wnd))
9729 ++ return -ENOBUFS;
9730 ++
9731 ++ child = tcp_sk(child)->mpcb->master_sk;
9732 ++ child_tp = tcp_sk(child);
9733 ++ mpcb = child_tp->mpcb;
9734 ++
9735 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9736 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9737 ++
9738 ++ mpcb->dss_csum = mtreq->dss_csum;
9739 ++ mpcb->server_side = 1;
9740 ++
9741 ++ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
9742 ++ mptcp_update_metasocket(child, meta_sk);
9743 ++
9744 ++ /* Needs to be done here additionally, because when accepting a
9745 ++ * new connection we pass by __reqsk_free and not reqsk_free.
9746 ++ */
9747 ++ mptcp_reqsk_remove_tk(req);
9748 ++
9749 ++ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
9750 ++ sock_put(meta_sk);
9751 ++
9752 ++ return 0;
9753 ++}
9754 ++
9755 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req)
9756 ++{
9757 ++ struct sock *meta_sk = child, *master_sk;
9758 ++ struct sk_buff *skb;
9759 ++ u32 new_mapping;
9760 ++ int ret;
9761 ++
9762 ++ ret = __mptcp_check_req_master(child, req);
9763 ++ if (ret)
9764 ++ return ret;
9765 ++
9766 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9767 ++
9768 ++ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have
9769 ++ * pre-MPTCP data in the receive queue.
9770 ++ */
9771 ++ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt -
9772 ++ tcp_rsk(req)->rcv_isn - 1;
9773 ++
9774 ++ /* Map subflow sequence number to data sequence numbers. We need to map
9775 ++ * these data to [IDSN - len - 1, IDSN[.
9776 ++ */
9777 ++ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1;
9778 ++
9779 ++ /* There should be only one skb: the SYN + data. */
9780 ++ skb_queue_walk(&meta_sk->sk_receive_queue, skb) {
9781 ++ TCP_SKB_CB(skb)->seq += new_mapping;
9782 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
9783 ++ }
9784 ++
9785 ++ /* With fastopen we change the semantics of the relative subflow
9786 ++ * sequence numbers to deal with middleboxes that could add/remove
9787 ++ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1
9788 ++ * instead of the regular TCP ISN.
9789 ++ */
9790 ++ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1;
9791 ++
9792 ++ /* We need to update copied_seq of the master_sk to account for the
9793 ++ * already moved data to the meta receive queue.
9794 ++ */
9795 ++ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt;
9796 ++
9797 ++ /* Handled by the master_sk */
9798 ++ tcp_sk(meta_sk)->fastopen_rsk = NULL;
9799 ++
9800 ++ return 0;
9801 ++}
9802 ++
9803 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
9804 ++ struct request_sock *req,
9805 ++ struct request_sock **prev)
9806 ++{
9807 ++ struct sock *meta_sk = child;
9808 ++ int ret;
9809 ++
9810 ++ ret = __mptcp_check_req_master(child, req);
9811 ++ if (ret)
9812 ++ return ret;
9813 ++
9814 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
9815 ++ inet_csk_reqsk_queue_removed(sk, req);
9816 ++ inet_csk_reqsk_queue_add(sk, req, meta_sk);
9817 ++
9818 ++ return 0;
9819 ++}
9820 ++
9821 ++struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
9822 ++ struct request_sock *req,
9823 ++ struct request_sock **prev,
9824 ++ const struct mptcp_options_received *mopt)
9825 ++{
9826 ++ struct tcp_sock *child_tp = tcp_sk(child);
9827 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
9828 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9829 ++ u8 hash_mac_check[20];
9830 ++
9831 ++ child_tp->inside_tk_table = 0;
9832 ++
9833 ++ if (!mopt->join_ack)
9834 ++ goto teardown;
9835 ++
9836 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
9837 ++ (u8 *)&mpcb->mptcp_loc_key,
9838 ++ (u8 *)&mtreq->mptcp_rem_nonce,
9839 ++ (u8 *)&mtreq->mptcp_loc_nonce,
9840 ++ (u32 *)hash_mac_check);
9841 ++
9842 ++ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20))
9843 ++ goto teardown;
9844 ++
9845 ++ /* Point it to the same struct socket and wq as the meta_sk */
9846 ++ sk_set_socket(child, meta_sk->sk_socket);
9847 ++ child->sk_wq = meta_sk->sk_wq;
9848 ++
9849 ++ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
9850 ++ /* Has been inherited, but now child_tp->mptcp is NULL */
9851 ++ child_tp->mpc = 0;
9852 ++ child_tp->ops = &tcp_specific;
9853 ++
9854 ++ /* TODO when we support acking the third ack for new subflows,
9855 ++ * we should silently discard this third ack, by returning NULL.
9856 ++ *
9857 ++ * Maybe, at the retransmission we will have enough memory to
9858 ++ * fully add the socket to the meta-sk.
9859 ++ */
9860 ++ goto teardown;
9861 ++ }
9862 ++
9863 ++ /* The child is a clone of the meta socket; we must now reset
9864 ++ * some of the fields.
9865 ++ */
9866 ++ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio;
9867 ++
9868 ++ /* We should allow proper increase of the snd/rcv-buffers. Thus, we
9869 ++ * use the original values instead of the bloated up ones from the
9870 ++ * clone.
9871 ++ */
9872 ++ child->sk_sndbuf = mpcb->orig_sk_sndbuf;
9873 ++ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
9874 ++
9875 ++ child_tp->mptcp->slave_sk = 1;
9876 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9877 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9878 ++ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
9879 ++
9880 ++ child_tp->tsq_flags = 0;
9881 ++
9882 ++ /* Subflows do not use the accept queue, as they
9883 ++ * are attached immediately to the mpcb.
9884 ++ */
9885 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9886 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9887 ++ reqsk_free(req);
9888 ++ return child;
9889 ++
9890 ++teardown:
9891 ++ /* Drop this request - sock creation failed. */
9892 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9893 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9894 ++ reqsk_free(req);
9895 ++ inet_csk_prepare_forced_close(child);
9896 ++ tcp_done(child);
9897 ++ return meta_sk;
9898 ++}
9899 ++
9900 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw)
9901 ++{
9902 ++ struct mptcp_tw *mptw;
9903 ++ struct tcp_sock *tp = tcp_sk(sk);
9904 ++ struct mptcp_cb *mpcb = tp->mpcb;
9905 ++
9906 ++ /* A subsocket in tw can only receive data. So, if we are in
9907 ++ * infinite-receive, then we should not reply with a data-ack or act
9908 ++ * upon general MPTCP-signaling. We prevent this by simply not creating
9909 ++ * the mptcp_tw_sock.
9910 ++ */
9911 ++ if (mpcb->infinite_mapping_rcv) {
9912 ++ tw->mptcp_tw = NULL;
9913 ++ return 0;
9914 ++ }
9915 ++
9916 ++ /* Alloc MPTCP-tw-sock */
9917 ++ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
9918 ++ if (!mptw)
9919 ++ return -ENOBUFS;
9920 ++
9921 ++ atomic_inc(&mpcb->mpcb_refcnt);
9922 ++
9923 ++ tw->mptcp_tw = mptw;
9924 ++ mptw->loc_key = mpcb->mptcp_loc_key;
9925 ++ mptw->meta_tw = mpcb->in_time_wait;
9926 ++ if (mptw->meta_tw) {
9927 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
9928 ++ if (mpcb->mptw_state != TCP_TIME_WAIT)
9929 ++ mptw->rcv_nxt++;
9930 ++ }
9931 ++ rcu_assign_pointer(mptw->mpcb, mpcb);
9932 ++
9933 ++ spin_lock(&mpcb->tw_lock);
9934 ++ list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
9935 ++ mptw->in_list = 1;
9936 ++ spin_unlock(&mpcb->tw_lock);
9937 ++
9938 ++ return 0;
9939 ++}
9940 ++
9941 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
9942 ++{
9943 ++ struct mptcp_cb *mpcb;
9944 ++
9945 ++ rcu_read_lock();
9946 ++ mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
9947 ++
9948 ++ /* If we are still holding a ref to the mpcb, we have to remove ourselves
9949 ++ * from the list and drop the ref properly.
9950 ++ */
9951 ++ if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) {
9952 ++ spin_lock(&mpcb->tw_lock);
9953 ++ if (tw->mptcp_tw->in_list) {
9954 ++ list_del_rcu(&tw->mptcp_tw->list);
9955 ++ tw->mptcp_tw->in_list = 0;
9956 ++ }
9957 ++ spin_unlock(&mpcb->tw_lock);
9958 ++
9959 ++ /* Twice, because we increased it above */
9960 ++ mptcp_mpcb_put(mpcb);
9961 ++ mptcp_mpcb_put(mpcb);
9962 ++ }
9963 ++
9964 ++ rcu_read_unlock();
9965 ++
9966 ++ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
9967 ++}
9968 ++
9969 ++/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
9970 ++ * data-fin.
9971 ++ */
9972 ++void mptcp_time_wait(struct sock *sk, int state, int timeo)
9973 ++{
9974 ++ struct tcp_sock *tp = tcp_sk(sk);
9975 ++ struct mptcp_tw *mptw;
9976 ++
9977 ++ /* Used for sockets that go into tw after the meta
9978 ++ * (see mptcp_init_tw_sock())
9979 ++ */
9980 ++ tp->mpcb->in_time_wait = 1;
9981 ++ tp->mpcb->mptw_state = state;
9982 ++
9983 ++ /* Update the time-wait-sock's information */
9984 ++ rcu_read_lock_bh();
9985 ++ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
9986 ++ mptw->meta_tw = 1;
9987 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
9988 ++
9989 ++ /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
9990 ++ * pretend as if the DATA_FIN has already reached us, so that
9991 ++ * the checks in tcp_timewait_state_process will pass when the
9992 ++ * DATA_FIN comes in.
9993 ++ */
9994 ++ if (state != TCP_TIME_WAIT)
9995 ++ mptw->rcv_nxt++;
9996 ++ }
9997 ++ rcu_read_unlock_bh();
9998 ++
9999 ++ tcp_done(sk);
10000 ++}
10001 ++
10002 ++void mptcp_tsq_flags(struct sock *sk)
10003 ++{
10004 ++ struct tcp_sock *tp = tcp_sk(sk);
10005 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
10006 ++
10007 ++ /* It will be handled as a regular deferred-call */
10008 ++ if (is_meta_sk(sk))
10009 ++ return;
10010 ++
10011 ++ if (hlist_unhashed(&tp->mptcp->cb_list)) {
10012 ++ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
10013 ++ /* We need to hold it here, as the sock_hold is not assured
10014 ++ * by the release_sock as it is done in regular TCP.
10015 ++ *
10016 ++ * The subsocket may get inet_csk_destroy'd while it is inside
10017 ++ * the callback_list.
10018 ++ */
10019 ++ sock_hold(sk);
10020 ++ }
10021 ++
10022 ++ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags))
10023 ++ sock_hold(meta_sk);
10024 ++}
10025 ++
10026 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk)
10027 ++{
10028 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
10029 ++ struct mptcp_tcp_sock *mptcp;
10030 ++ struct hlist_node *tmp;
10031 ++
10032 ++ BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
10033 ++
10034 ++ __sock_put(meta_sk);
10035 ++ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
10036 ++ struct tcp_sock *tp = mptcp->tp;
10037 ++ struct sock *sk = (struct sock *)tp;
10038 ++
10039 ++ hlist_del_init(&mptcp->cb_list);
10040 ++ sk->sk_prot->release_cb(sk);
10041 ++ /* Final sock_put (cfr. mptcp_tsq_flags) */
10042 ++ sock_put(sk);
10043 ++ }
10044 ++}
10045 ++
10046 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
10047 ++ struct sk_buff *skb)
10048 ++{
10049 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
10050 ++ struct mptcp_options_received mopt;
10051 ++ u8 mptcp_hash_mac[20];
10052 ++
10053 ++ mptcp_init_mp_opt(&mopt);
10054 ++ tcp_parse_mptcp_options(skb, &mopt);
10055 ++
10056 ++ mtreq = mptcp_rsk(req);
10057 ++ mtreq->mptcp_mpcb = mpcb;
10058 ++ mtreq->is_sub = 1;
10059 ++ inet_rsk(req)->mptcp_rqsk = 1;
10060 ++
10061 ++ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
10062 ++
10063 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
10064 ++ (u8 *)&mpcb->mptcp_rem_key,
10065 ++ (u8 *)&mtreq->mptcp_loc_nonce,
10066 ++ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
10067 ++ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
10068 ++
10069 ++ mtreq->rem_id = mopt.rem_id;
10070 ++ mtreq->rcv_low_prio = mopt.low_prio;
10071 ++ inet_rsk(req)->saw_mpc = 1;
10072 ++}
10073 ++
10074 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb)
10075 ++{
10076 ++ struct mptcp_options_received mopt;
10077 ++ struct mptcp_request_sock *mreq = mptcp_rsk(req);
10078 ++
10079 ++ mptcp_init_mp_opt(&mopt);
10080 ++ tcp_parse_mptcp_options(skb, &mopt);
10081 ++
10082 ++ mreq->is_sub = 0;
10083 ++ inet_rsk(req)->mptcp_rqsk = 1;
10084 ++ mreq->dss_csum = mopt.dss_csum;
10085 ++ mreq->hash_entry.pprev = NULL;
10086 ++
10087 ++ mptcp_reqsk_new_mptcp(req, &mopt, skb);
10088 ++}
10089 ++
10090 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb)
10091 ++{
10092 ++ struct mptcp_options_received mopt;
10093 ++ const struct tcp_sock *tp = tcp_sk(sk);
10094 ++ __u32 isn = TCP_SKB_CB(skb)->when;
10095 ++ bool want_cookie = false;
10096 ++
10097 ++ if ((sysctl_tcp_syncookies == 2 ||
10098 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
10099 ++ want_cookie = tcp_syn_flood_action(sk, skb,
10100 ++ mptcp_request_sock_ops.slab_name);
10101 ++ if (!want_cookie)
10102 ++ goto drop;
10103 ++ }
10104 ++
10105 ++ mptcp_init_mp_opt(&mopt);
10106 ++ tcp_parse_mptcp_options(skb, &mopt);
10107 ++
10108 ++ if (mopt.is_mp_join)
10109 ++ return mptcp_do_join_short(skb, &mopt, sock_net(sk));
10110 ++ if (mopt.drop_me)
10111 ++ goto drop;
10112 ++
10113 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
10114 ++ mopt.saw_mpc = 0;
10115 ++
10116 ++ if (skb->protocol == htons(ETH_P_IP)) {
10117 ++ if (mopt.saw_mpc && !want_cookie) {
10118 ++ if (skb_rtable(skb)->rt_flags &
10119 ++ (RTCF_BROADCAST | RTCF_MULTICAST))
10120 ++ goto drop;
10121 ++
10122 ++ return tcp_conn_request(&mptcp_request_sock_ops,
10123 ++ &mptcp_request_sock_ipv4_ops,
10124 ++ sk, skb);
10125 ++ }
10126 ++
10127 ++ return tcp_v4_conn_request(sk, skb);
10128 ++#if IS_ENABLED(CONFIG_IPV6)
10129 ++ } else {
10130 ++ if (mopt.saw_mpc && !want_cookie) {
10131 ++ if (!ipv6_unicast_destination(skb))
10132 ++ goto drop;
10133 ++
10134 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
10135 ++ &mptcp_request_sock_ipv6_ops,
10136 ++ sk, skb);
10137 ++ }
10138 ++
10139 ++ return tcp_v6_conn_request(sk, skb);
10140 ++#endif
10141 ++ }
10142 ++drop:
10143 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
10144 ++ return 0;
10145 ++}
10146 ++
10147 ++struct workqueue_struct *mptcp_wq;
10148 ++EXPORT_SYMBOL(mptcp_wq);
10149 ++
10150 ++/* Output /proc/net/mptcp */
10151 ++static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
10152 ++{
10153 ++ struct tcp_sock *meta_tp;
10154 ++ const struct net *net = seq->private;
10155 ++ int i, n = 0;
10156 ++
10157 ++ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode");
10158 ++ seq_putc(seq, '\n');
10159 ++
10160 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10161 ++ struct hlist_nulls_node *node;
10162 ++ rcu_read_lock_bh();
10163 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node,
10164 ++ &tk_hashtable[i], tk_table) {
10165 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
10166 ++ struct sock *meta_sk = (struct sock *)meta_tp;
10167 ++ struct inet_sock *isk = inet_sk(meta_sk);
10168 ++
10169 ++ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk)))
10170 ++ continue;
10171 ++
10172 ++ if (capable(CAP_NET_ADMIN)) {
10173 ++ seq_printf(seq, "%4d: %04X %04X ", n++,
10174 ++ mpcb->mptcp_loc_token,
10175 ++ mpcb->mptcp_rem_token);
10176 ++ } else {
10177 ++ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1);
10178 ++ }
10179 ++ if (meta_sk->sk_family == AF_INET ||
10180 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
10181 ++ seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
10182 ++ isk->inet_rcv_saddr,
10183 ++ ntohs(isk->inet_sport),
10184 ++ isk->inet_daddr,
10185 ++ ntohs(isk->inet_dport));
10186 ++#if IS_ENABLED(CONFIG_IPV6)
10187 ++ } else if (meta_sk->sk_family == AF_INET6) {
10188 ++ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr;
10189 ++ struct in6_addr *dst = &meta_sk->sk_v6_daddr;
10190 ++ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
10191 ++ src->s6_addr32[0], src->s6_addr32[1],
10192 ++ src->s6_addr32[2], src->s6_addr32[3],
10193 ++ ntohs(isk->inet_sport),
10194 ++ dst->s6_addr32[0], dst->s6_addr32[1],
10195 ++ dst->s6_addr32[2], dst->s6_addr32[3],
10196 ++ ntohs(isk->inet_dport));
10197 ++#endif
10198 ++ }
10199 ++ seq_printf(seq, " %02X %02X %08X:%08X %lu",
10200 ++ meta_sk->sk_state, mpcb->cnt_subflows,
10201 ++ meta_tp->write_seq - meta_tp->snd_una,
10202 ++ max_t(int, meta_tp->rcv_nxt -
10203 ++ meta_tp->copied_seq, 0),
10204 ++ sock_i_ino(meta_sk));
10205 ++ seq_putc(seq, '\n');
10206 ++ }
10207 ++
10208 ++ rcu_read_unlock_bh();
10209 ++ }
10210 ++
10211 ++ return 0;
10212 ++}
10213 ++
10214 ++static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
10215 ++{
10216 ++ return single_open_net(inode, file, mptcp_pm_seq_show);
10217 ++}
10218 ++
10219 ++static const struct file_operations mptcp_pm_seq_fops = {
10220 ++ .owner = THIS_MODULE,
10221 ++ .open = mptcp_pm_seq_open,
10222 ++ .read = seq_read,
10223 ++ .llseek = seq_lseek,
10224 ++ .release = single_release_net,
10225 ++};
10226 ++
10227 ++static int mptcp_pm_init_net(struct net *net)
10228 ++{
10229 ++ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
10230 ++ return -ENOMEM;
10231 ++
10232 ++ return 0;
10233 ++}
10234 ++
10235 ++static void mptcp_pm_exit_net(struct net *net)
10236 ++{
10237 ++ remove_proc_entry("mptcp", net->proc_net);
10238 ++}
10239 ++
10240 ++static struct pernet_operations mptcp_pm_proc_ops = {
10241 ++ .init = mptcp_pm_init_net,
10242 ++ .exit = mptcp_pm_exit_net,
10243 ++};
10244 ++
10245 ++/* General initialization of mptcp */
10246 ++void __init mptcp_init(void)
10247 ++{
10248 ++ int i;
10249 ++ struct ctl_table_header *mptcp_sysctl;
10250 ++
10251 ++ mptcp_sock_cache = kmem_cache_create("mptcp_sock",
10252 ++ sizeof(struct mptcp_tcp_sock),
10253 ++ 0, SLAB_HWCACHE_ALIGN,
10254 ++ NULL);
10255 ++ if (!mptcp_sock_cache)
10256 ++ goto mptcp_sock_cache_failed;
10257 ++
10258 ++ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
10259 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10260 ++ NULL);
10261 ++ if (!mptcp_cb_cache)
10262 ++ goto mptcp_cb_cache_failed;
10263 ++
10264 ++ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
10265 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10266 ++ NULL);
10267 ++ if (!mptcp_tw_cache)
10268 ++ goto mptcp_tw_cache_failed;
10269 ++
10270 ++ get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
10271 ++
10272 ++ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
10273 ++ if (!mptcp_wq)
10274 ++ goto alloc_workqueue_failed;
10275 ++
10276 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10277 ++ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
10278 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_htb[i],
10279 ++ i + MPTCP_REQSK_NULLS_BASE);
10280 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
10281 ++ }
10282 ++
10283 ++ spin_lock_init(&mptcp_reqsk_hlock);
10284 ++ spin_lock_init(&mptcp_tk_hashlock);
10285 ++
10286 ++ if (register_pernet_subsys(&mptcp_pm_proc_ops))
10287 ++ goto pernet_failed;
10288 ++
10289 ++#if IS_ENABLED(CONFIG_IPV6)
10290 ++ if (mptcp_pm_v6_init())
10291 ++ goto mptcp_pm_v6_failed;
10292 ++#endif
10293 ++ if (mptcp_pm_v4_init())
10294 ++ goto mptcp_pm_v4_failed;
10295 ++
10296 ++ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
10297 ++ if (!mptcp_sysctl)
10298 ++ goto register_sysctl_failed;
10299 ++
10300 ++ if (mptcp_register_path_manager(&mptcp_pm_default))
10301 ++ goto register_pm_failed;
10302 ++
10303 ++ if (mptcp_register_scheduler(&mptcp_sched_default))
10304 ++ goto register_sched_failed;
10305 ++
10306 ++ pr_info("MPTCP: Stable release v0.89.0-rc");
10307 ++
10308 ++ mptcp_init_failed = false;
10309 ++
10310 ++ return;
10311 ++
10312 ++register_sched_failed:
10313 ++ mptcp_unregister_path_manager(&mptcp_pm_default);
10314 ++register_pm_failed:
10315 ++ unregister_net_sysctl_table(mptcp_sysctl);
10316 ++register_sysctl_failed:
10317 ++ mptcp_pm_v4_undo();
10318 ++mptcp_pm_v4_failed:
10319 ++#if IS_ENABLED(CONFIG_IPV6)
10320 ++ mptcp_pm_v6_undo();
10321 ++mptcp_pm_v6_failed:
10322 ++#endif
10323 ++ unregister_pernet_subsys(&mptcp_pm_proc_ops);
10324 ++pernet_failed:
10325 ++ destroy_workqueue(mptcp_wq);
10326 ++alloc_workqueue_failed:
10327 ++ kmem_cache_destroy(mptcp_tw_cache);
10328 ++mptcp_tw_cache_failed:
10329 ++ kmem_cache_destroy(mptcp_cb_cache);
10330 ++mptcp_cb_cache_failed:
10331 ++ kmem_cache_destroy(mptcp_sock_cache);
10332 ++mptcp_sock_cache_failed:
10333 ++ mptcp_init_failed = true;
10334 ++}
10335 +diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
10336 +new file mode 100644
10337 +index 000000000000..3a54413ce25b
10338 +--- /dev/null
10339 ++++ b/net/mptcp/mptcp_fullmesh.c
10340 +@@ -0,0 +1,1722 @@
10341 ++#include <linux/module.h>
10342 ++
10343 ++#include <net/mptcp.h>
10344 ++#include <net/mptcp_v4.h>
10345 ++
10346 ++#if IS_ENABLED(CONFIG_IPV6)
10347 ++#include <net/mptcp_v6.h>
10348 ++#include <net/addrconf.h>
10349 ++#endif
10350 ++
10351 ++enum {
10352 ++ MPTCP_EVENT_ADD = 1,
10353 ++ MPTCP_EVENT_DEL,
10354 ++ MPTCP_EVENT_MOD,
10355 ++};
10356 ++
10357 ++#define MPTCP_SUBFLOW_RETRY_DELAY 1000
10358 ++
10359 ++/* Max number of local or remote addresses we can store.
10360 ++ * When changing, see the bitfield below in fullmesh_rem4/6.
10361 ++ */
10362 ++#define MPTCP_MAX_ADDR 8
10363 ++
10364 ++struct fullmesh_rem4 {
10365 ++ u8 rem4_id;
10366 ++ u8 bitfield;
10367 ++ u8 retry_bitfield;
10368 ++ __be16 port;
10369 ++ struct in_addr addr;
10370 ++};
10371 ++
10372 ++struct fullmesh_rem6 {
10373 ++ u8 rem6_id;
10374 ++ u8 bitfield;
10375 ++ u8 retry_bitfield;
10376 ++ __be16 port;
10377 ++ struct in6_addr addr;
10378 ++};
10379 ++
10380 ++struct mptcp_loc_addr {
10381 ++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
10382 ++ u8 loc4_bits;
10383 ++ u8 next_v4_index;
10384 ++
10385 ++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
10386 ++ u8 loc6_bits;
10387 ++ u8 next_v6_index;
10388 ++};
10389 ++
10390 ++struct mptcp_addr_event {
10391 ++ struct list_head list;
10392 ++ unsigned short family;
10393 ++ u8 code:7,
10394 ++ low_prio:1;
10395 ++ union inet_addr addr;
10396 ++};
10397 ++
10398 ++struct fullmesh_priv {
10399 ++ /* Worker struct for subflow establishment */
10400 ++ struct work_struct subflow_work;
10401 ++ /* Delayed worker, when the routing-tables are not yet ready. */
10402 ++ struct delayed_work subflow_retry_work;
10403 ++
10404 ++ /* Remote addresses */
10405 ++ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR];
10406 ++ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR];
10407 ++
10408 ++ struct mptcp_cb *mpcb;
10409 ++
10410 ++ u16 remove_addrs; /* Addresses to remove */
10411 ++ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
10412 ++ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
10413 ++
10414 ++ u8 add_addr; /* Are we sending an add_addr? */
10415 ++
10416 ++ u8 rem4_bits;
10417 ++ u8 rem6_bits;
10418 ++};
10419 ++
10420 ++struct mptcp_fm_ns {
10421 ++ struct mptcp_loc_addr __rcu *local;
10422 ++ spinlock_t local_lock; /* Protecting the above pointer */
10423 ++ struct list_head events;
10424 ++ struct delayed_work address_worker;
10425 ++
10426 ++ struct net *net;
10427 ++};
10428 ++
10429 ++static struct mptcp_pm_ops full_mesh __read_mostly;
10430 ++
10431 ++static void full_mesh_create_subflows(struct sock *meta_sk);
10432 ++
10433 ++static struct mptcp_fm_ns *fm_get_ns(const struct net *net)
10434 ++{
10435 ++ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
10436 ++}
10437 ++
10438 ++static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb)
10439 ++{
10440 ++ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
10441 ++}
10442 ++
10443 ++/* Find the first free index in the bitfield */
10444 ++static int __mptcp_find_free_index(u8 bitfield, u8 base)
10445 ++{
10446 ++ int i;
10447 ++
10448 ++ /* There are no free bits anyway... */
10449 ++ if (bitfield == 0xff)
10450 ++ goto exit;
10451 ++
10452 ++ i = ffs(~(bitfield >> base)) - 1;
10453 ++ if (i < 0)
10454 ++ goto exit;
10455 ++
10456 ++ /* No free bits when starting at base, try from 0 on */
10457 ++ if (i + base >= sizeof(bitfield) * 8)
10458 ++ return __mptcp_find_free_index(bitfield, 0);
10459 ++
10460 ++ return i + base;
10461 ++exit:
10462 ++ return -1;
10463 ++}
10464 ++
10465 ++static int mptcp_find_free_index(u8 bitfield)
10466 ++{
10467 ++ return __mptcp_find_free_index(bitfield, 0);
10468 ++}
10469 ++
10470 ++static void mptcp_addv4_raddr(struct mptcp_cb *mpcb,
10471 ++ const struct in_addr *addr,
10472 ++ __be16 port, u8 id)
10473 ++{
10474 ++ int i;
10475 ++ struct fullmesh_rem4 *rem4;
10476 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10477 ++
10478 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10479 ++ rem4 = &fmp->remaddr4[i];
10480 ++
10481 ++ /* Address is already in the list --- continue */
10482 ++ if (rem4->rem4_id == id &&
10483 ++ rem4->addr.s_addr == addr->s_addr && rem4->port == port)
10484 ++ return;
10485 ++
10486 ++ /* This may be the case when the peer is behind a NAT. It is
10487 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10488 ++ * However, the src_addr of the IP packet has been changed. We
10489 ++ * update the addr in the list, because this is the address as
10490 ++ * OUR BOX sees it.
10491 ++ */
10492 ++ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
10493 ++ /* update the address */
10494 ++ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
10495 ++ __func__, &rem4->addr.s_addr,
10496 ++ &addr->s_addr, id);
10497 ++ rem4->addr.s_addr = addr->s_addr;
10498 ++ rem4->port = port;
10499 ++ mpcb->list_rcvd = 1;
10500 ++ return;
10501 ++ }
10502 ++ }
10503 ++
10504 ++ i = mptcp_find_free_index(fmp->rem4_bits);
10505 ++ /* Do we already have the maximum number of local/remote addresses? */
10506 ++ if (i < 0) {
10507 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
10508 ++ __func__, MPTCP_MAX_ADDR, &addr->s_addr);
10509 ++ return;
10510 ++ }
10511 ++
10512 ++ rem4 = &fmp->remaddr4[i];
10513 ++
10514 ++ /* Address is not known yet, store it */
10515 ++ rem4->addr.s_addr = addr->s_addr;
10516 ++ rem4->port = port;
10517 ++ rem4->bitfield = 0;
10518 ++ rem4->retry_bitfield = 0;
10519 ++ rem4->rem4_id = id;
10520 ++ mpcb->list_rcvd = 1;
10521 ++ fmp->rem4_bits |= (1 << i);
10522 ++
10523 ++ return;
10524 ++}
10525 ++
10526 ++static void mptcp_addv6_raddr(struct mptcp_cb *mpcb,
10527 ++ const struct in6_addr *addr,
10528 ++ __be16 port, u8 id)
10529 ++{
10530 ++ int i;
10531 ++ struct fullmesh_rem6 *rem6;
10532 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10533 ++
10534 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10535 ++ rem6 = &fmp->remaddr6[i];
10536 ++
10537 ++ /* Address is already in the list --- continue */
10538 ++ if (rem6->rem6_id == id &&
10539 ++ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
10540 ++ return;
10541 ++
10542 ++ /* This may be the case when the peer is behind a NAT. It is
10543 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10544 ++ * However, the src_addr of the IP packet has been changed. We
10545 ++ * update the addr in the list, because this is the address as
10546 ++ * OUR BOX sees it.
10547 ++ */
10548 ++ if (rem6->rem6_id == id) {
10549 ++ /* update the address */
10550 ++ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
10551 ++ __func__, &rem6->addr, addr, id);
10552 ++ rem6->addr = *addr;
10553 ++ rem6->port = port;
10554 ++ mpcb->list_rcvd = 1;
10555 ++ return;
10556 ++ }
10557 ++ }
10558 ++
10559 ++ i = mptcp_find_free_index(fmp->rem6_bits);
10560 ++ /* Do we already have the maximum number of local/remote addresses? */
10561 ++ if (i < 0) {
10562 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
10563 ++ __func__, MPTCP_MAX_ADDR, addr);
10564 ++ return;
10565 ++ }
10566 ++
10567 ++ rem6 = &fmp->remaddr6[i];
10568 ++
10569 ++ /* Address is not known yet, store it */
10570 ++ rem6->addr = *addr;
10571 ++ rem6->port = port;
10572 ++ rem6->bitfield = 0;
10573 ++ rem6->retry_bitfield = 0;
10574 ++ rem6->rem6_id = id;
10575 ++ mpcb->list_rcvd = 1;
10576 ++ fmp->rem6_bits |= (1 << i);
10577 ++
10578 ++ return;
10579 ++}
10580 ++
10581 ++static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
10582 ++{
10583 ++ int i;
10584 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10585 ++
10586 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10587 ++ if (fmp->remaddr4[i].rem4_id == id) {
10588 ++ /* remove address from bitfield */
10589 ++ fmp->rem4_bits &= ~(1 << i);
10590 ++
10591 ++ break;
10592 ++ }
10593 ++ }
10594 ++}
10595 ++
10596 ++static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id)
10597 ++{
10598 ++ int i;
10599 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10600 ++
10601 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10602 ++ if (fmp->remaddr6[i].rem6_id == id) {
10603 ++ /* remove address from bitfield */
10604 ++ fmp->rem6_bits &= ~(1 << i);
10605 ++
10606 ++ break;
10607 ++ }
10608 ++ }
10609 ++}
10610 ++
10611 ++/* Sets the bitfield of the remote-address field */
10612 ++static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb,
10613 ++ const struct in_addr *addr, u8 index)
10614 ++{
10615 ++ int i;
10616 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10617 ++
10618 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10619 ++ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) {
10620 ++ fmp->remaddr4[i].bitfield |= (1 << index);
10621 ++ return;
10622 ++ }
10623 ++ }
10624 ++}
10625 ++
10626 ++/* Sets the bitfield of the remote-address field */
10627 ++static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
10628 ++ const struct in6_addr *addr, u8 index)
10629 ++{
10630 ++ int i;
10631 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10632 ++
10633 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10634 ++ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) {
10635 ++ fmp->remaddr6[i].bitfield |= (1 << index);
10636 ++ return;
10637 ++ }
10638 ++ }
10639 ++}
10640 ++
10641 ++static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb,
10642 ++ const union inet_addr *addr,
10643 ++ sa_family_t family, u8 id)
10644 ++{
10645 ++ if (family == AF_INET)
10646 ++ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id);
10647 ++ else
10648 ++ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id);
10649 ++}
10650 ++
10651 ++static void retry_subflow_worker(struct work_struct *work)
10652 ++{
10653 ++ struct delayed_work *delayed_work = container_of(work,
10654 ++ struct delayed_work,
10655 ++ work);
10656 ++ struct fullmesh_priv *fmp = container_of(delayed_work,
10657 ++ struct fullmesh_priv,
10658 ++ subflow_retry_work);
10659 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10660 ++ struct sock *meta_sk = mpcb->meta_sk;
10661 ++ struct mptcp_loc_addr *mptcp_local;
10662 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10663 ++ int iter = 0, i;
10664 ++
10665 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10666 ++ * such a big deal if the address-list is not 100% up-to-date.
10667 ++ */
10668 ++ rcu_read_lock_bh();
10669 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10670 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10671 ++ rcu_read_unlock_bh();
10672 ++
10673 ++ if (!mptcp_local)
10674 ++ return;
10675 ++
10676 ++next_subflow:
10677 ++ if (iter) {
10678 ++ release_sock(meta_sk);
10679 ++ mutex_unlock(&mpcb->mpcb_mutex);
10680 ++
10681 ++ cond_resched();
10682 ++ }
10683 ++ mutex_lock(&mpcb->mpcb_mutex);
10684 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10685 ++
10686 ++ iter++;
10687 ++
10688 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10689 ++ goto exit;
10690 ++
10691 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10692 ++ struct fullmesh_rem4 *rem = &fmp->remaddr4[i];
10693 ++ /* Do we need to retry establishing a subflow? */
10694 ++ if (rem->retry_bitfield) {
10695 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10696 ++ struct mptcp_rem4 rem4;
10697 ++
10698 ++ rem->bitfield |= (1 << i);
10699 ++ rem->retry_bitfield &= ~(1 << i);
10700 ++
10701 ++ rem4.addr = rem->addr;
10702 ++ rem4.port = rem->port;
10703 ++ rem4.rem4_id = rem->rem4_id;
10704 ++
10705 ++ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4);
10706 ++ goto next_subflow;
10707 ++ }
10708 ++ }
10709 ++
10710 ++#if IS_ENABLED(CONFIG_IPV6)
10711 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10712 ++ struct fullmesh_rem6 *rem = &fmp->remaddr6[i];
10713 ++
10714 ++ /* Do we need to retry establishing a subflow? */
10715 ++ if (rem->retry_bitfield) {
10716 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10717 ++ struct mptcp_rem6 rem6;
10718 ++
10719 ++ rem->bitfield |= (1 << i);
10720 ++ rem->retry_bitfield &= ~(1 << i);
10721 ++
10722 ++ rem6.addr = rem->addr;
10723 ++ rem6.port = rem->port;
10724 ++ rem6.rem6_id = rem->rem6_id;
10725 ++
10726 ++ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6);
10727 ++ goto next_subflow;
10728 ++ }
10729 ++ }
10730 ++#endif
10731 ++
10732 ++exit:
10733 ++ kfree(mptcp_local);
10734 ++ release_sock(meta_sk);
10735 ++ mutex_unlock(&mpcb->mpcb_mutex);
10736 ++ sock_put(meta_sk);
10737 ++}
10738 ++
10739 ++/**
10740 ++ * Create all new subflows by calling mptcp_initX_subsockets.
10741 ++ *
10742 ++ * This function uses a goto to next_subflow to release the lock between
10743 ++ * new subflows, giving other processes a chance to do some work on the
10744 ++ * socket and potentially finish the communication.
10745 ++ **/
10746 ++static void create_subflow_worker(struct work_struct *work)
10747 ++{
10748 ++ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv,
10749 ++ subflow_work);
10750 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10751 ++ struct sock *meta_sk = mpcb->meta_sk;
10752 ++ struct mptcp_loc_addr *mptcp_local;
10753 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10754 ++ int iter = 0, retry = 0;
10755 ++ int i;
10756 ++
10757 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10758 ++ * such a big deal if the address-list is not 100% up-to-date.
10759 ++ */
10760 ++ rcu_read_lock_bh();
10761 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10762 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10763 ++ rcu_read_unlock_bh();
10764 ++
10765 ++ if (!mptcp_local)
10766 ++ return;
10767 ++
10768 ++next_subflow:
10769 ++ if (iter) {
10770 ++ release_sock(meta_sk);
10771 ++ mutex_unlock(&mpcb->mpcb_mutex);
10772 ++
10773 ++ cond_resched();
10774 ++ }
10775 ++ mutex_lock(&mpcb->mpcb_mutex);
10776 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10777 ++
10778 ++ iter++;
10779 ++
10780 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10781 ++ goto exit;
10782 ++
10783 ++ if (mpcb->master_sk &&
10784 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
10785 ++ goto exit;
10786 ++
10787 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10788 ++ struct fullmesh_rem4 *rem;
10789 ++ u8 remaining_bits;
10790 ++
10791 ++ rem = &fmp->remaddr4[i];
10792 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
10793 ++
10794 ++ /* Are there still combinations to handle? */
10795 ++ if (remaining_bits) {
10796 ++ int i = mptcp_find_free_index(~remaining_bits);
10797 ++ struct mptcp_rem4 rem4;
10798 ++
10799 ++ rem->bitfield |= (1 << i);
10800 ++
10801 ++ rem4.addr = rem->addr;
10802 ++ rem4.port = rem->port;
10803 ++ rem4.rem4_id = rem->rem4_id;
10804 ++
10805 ++ /* If a route is not yet available then retry once */
10806 ++ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
10807 ++ &rem4) == -ENETUNREACH)
10808 ++ retry = rem->retry_bitfield |= (1 << i);
10809 ++ goto next_subflow;
10810 ++ }
10811 ++ }
10812 ++
10813 ++#if IS_ENABLED(CONFIG_IPV6)
10814 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10815 ++ struct fullmesh_rem6 *rem;
10816 ++ u8 remaining_bits;
10817 ++
10818 ++ rem = &fmp->remaddr6[i];
10819 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
10820 ++
10821 ++ /* Are there still combinations to handle? */
10822 ++ if (remaining_bits) {
10823 ++ int i = mptcp_find_free_index(~remaining_bits);
10824 ++ struct mptcp_rem6 rem6;
10825 ++
10826 ++ rem->bitfield |= (1 << i);
10827 ++
10828 ++ rem6.addr = rem->addr;
10829 ++ rem6.port = rem->port;
10830 ++ rem6.rem6_id = rem->rem6_id;
10831 ++
10832 ++ /* If a route is not yet available then retry once */
10833 ++ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
10834 ++ &rem6) == -ENETUNREACH)
10835 ++ retry = rem->retry_bitfield |= (1 << i);
10836 ++ goto next_subflow;
10837 ++ }
10838 ++ }
10839 ++#endif
10840 ++
10841 ++ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) {
10842 ++ sock_hold(meta_sk);
10843 ++ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work,
10844 ++ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
10845 ++ }
10846 ++
10847 ++exit:
10848 ++ kfree(mptcp_local);
10849 ++ release_sock(meta_sk);
10850 ++ mutex_unlock(&mpcb->mpcb_mutex);
10851 ++ sock_put(meta_sk);
10852 ++}
10853 ++
10854 ++static void announce_remove_addr(u8 addr_id, struct sock *meta_sk)
10855 ++{
10856 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10857 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10858 ++ struct sock *sk = mptcp_select_ack_sock(meta_sk);
10859 ++
10860 ++ fmp->remove_addrs |= (1 << addr_id);
10861 ++ mpcb->addr_signal = 1;
10862 ++
10863 ++ if (sk)
10864 ++ tcp_send_ack(sk);
10865 ++}
10866 ++
10867 ++static void update_addr_bitfields(struct sock *meta_sk,
10868 ++ const struct mptcp_loc_addr *mptcp_local)
10869 ++{
10870 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10871 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10872 ++ int i;
10873 ++
10874 ++ /* The bits in announced_addrs_* always match with loc*_bits. So, a
10875 ++ * simple & operation unsets the correct bits, because these go from
10876 ++ * announced to non-announced.
10877 ++ */
10878 ++ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits;
10879 ++
10880 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10881 ++ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
10882 ++ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
10883 ++ }
10884 ++
10885 ++ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits;
10886 ++
10887 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10888 ++ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
10889 ++ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
10890 ++ }
10891 ++}
10892 ++
10893 ++static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local,
10894 ++ sa_family_t family, const union inet_addr *addr)
10895 ++{
10896 ++ int i;
10897 ++ u8 loc_bits;
10898 ++ bool found = false;
10899 ++
10900 ++ if (family == AF_INET)
10901 ++ loc_bits = mptcp_local->loc4_bits;
10902 ++ else
10903 ++ loc_bits = mptcp_local->loc6_bits;
10904 ++
10905 ++ mptcp_for_each_bit_set(loc_bits, i) {
10906 ++ if (family == AF_INET &&
10907 ++ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
10908 ++ found = true;
10909 ++ break;
10910 ++ }
10911 ++ if (family == AF_INET6 &&
10912 ++ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
10913 ++ &addr->in6)) {
10914 ++ found = true;
10915 ++ break;
10916 ++ }
10917 ++ }
10918 ++
10919 ++ if (!found)
10920 ++ return -1;
10921 ++
10922 ++ return i;
10923 ++}
10924 ++
10925 ++static void mptcp_address_worker(struct work_struct *work)
10926 ++{
10927 ++ const struct delayed_work *delayed_work = container_of(work,
10928 ++ struct delayed_work,
10929 ++ work);
10930 ++ struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
10931 ++ struct mptcp_fm_ns,
10932 ++ address_worker);
10933 ++ struct net *net = fm_ns->net;
10934 ++ struct mptcp_addr_event *event = NULL;
10935 ++ struct mptcp_loc_addr *mptcp_local, *old;
10936 ++ int i, id = -1; /* id is used in the socket-code on a delete-event */
10937 ++ bool success; /* Used to indicate if we succeeded handling the event */
10938 ++
10939 ++next_event:
10940 ++ success = false;
10941 ++ kfree(event);
10942 ++
10943 ++ /* First, let's dequeue an event from our event-list */
10944 ++ rcu_read_lock_bh();
10945 ++ spin_lock(&fm_ns->local_lock);
10946 ++
10947 ++ event = list_first_entry_or_null(&fm_ns->events,
10948 ++ struct mptcp_addr_event, list);
10949 ++ if (!event) {
10950 ++ spin_unlock(&fm_ns->local_lock);
10951 ++ rcu_read_unlock_bh();
10952 ++ return;
10953 ++ }
10954 ++
10955 ++ list_del(&event->list);
10956 ++
10957 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10958 ++
10959 ++ if (event->code == MPTCP_EVENT_DEL) {
10960 ++ id = mptcp_find_address(mptcp_local, event->family, &event->addr);
10961 ++
10962 ++ /* Not in the list - so we don't care */
10963 ++ if (id < 0) {
10964 ++ mptcp_debug("%s could not find id\n", __func__);
10965 ++ goto duno;
10966 ++ }
10967 ++
10968 ++ old = mptcp_local;
10969 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
10970 ++ GFP_ATOMIC);
10971 ++ if (!mptcp_local)
10972 ++ goto duno;
10973 ++
10974 ++ if (event->family == AF_INET)
10975 ++ mptcp_local->loc4_bits &= ~(1 << id);
10976 ++ else
10977 ++ mptcp_local->loc6_bits &= ~(1 << id);
10978 ++
10979 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
10980 ++ kfree(old);
10981 ++ } else {
10982 ++ int i = mptcp_find_address(mptcp_local, event->family, &event->addr);
10983 ++ int j = i;
10984 ++
10985 ++ if (j < 0) {
10986 ++ /* Not in the list, so we have to find an empty slot */
10987 ++ if (event->family == AF_INET)
10988 ++ i = __mptcp_find_free_index(mptcp_local->loc4_bits,
10989 ++ mptcp_local->next_v4_index);
10990 ++ if (event->family == AF_INET6)
10991 ++ i = __mptcp_find_free_index(mptcp_local->loc6_bits,
10992 ++ mptcp_local->next_v6_index);
10993 ++
10994 ++ if (i < 0) {
10995 ++ mptcp_debug("%s no more space\n", __func__);
10996 ++ goto duno;
10997 ++ }
10998 ++
10999 ++ /* It might have been a MOD-event. */
11000 ++ event->code = MPTCP_EVENT_ADD;
11001 ++ } else {
11002 ++ /* Let's check if anything changes */
11003 ++ if (event->family == AF_INET &&
11004 ++ event->low_prio == mptcp_local->locaddr4[i].low_prio)
11005 ++ goto duno;
11006 ++
11007 ++ if (event->family == AF_INET6 &&
11008 ++ event->low_prio == mptcp_local->locaddr6[i].low_prio)
11009 ++ goto duno;
11010 ++ }
11011 ++
11012 ++ old = mptcp_local;
11013 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
11014 ++ GFP_ATOMIC);
11015 ++ if (!mptcp_local)
11016 ++ goto duno;
11017 ++
11018 ++ if (event->family == AF_INET) {
11019 ++ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
11020 ++ mptcp_local->locaddr4[i].loc4_id = i + 1;
11021 ++ mptcp_local->locaddr4[i].low_prio = event->low_prio;
11022 ++ } else {
11023 ++ mptcp_local->locaddr6[i].addr = event->addr.in6;
11024 ++ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
11025 ++ mptcp_local->locaddr6[i].low_prio = event->low_prio;
11026 ++ }
11027 ++
11028 ++ if (j < 0) {
11029 ++ if (event->family == AF_INET) {
11030 ++ mptcp_local->loc4_bits |= (1 << i);
11031 ++ mptcp_local->next_v4_index = i + 1;
11032 ++ } else {
11033 ++ mptcp_local->loc6_bits |= (1 << i);
11034 ++ mptcp_local->next_v6_index = i + 1;
11035 ++ }
11036 ++ }
11037 ++
11038 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11039 ++ kfree(old);
11040 ++ }
11041 ++ success = true;
11042 ++
11043 ++duno:
11044 ++ spin_unlock(&fm_ns->local_lock);
11045 ++ rcu_read_unlock_bh();
11046 ++
11047 ++ if (!success)
11048 ++ goto next_event;
11049 ++
11050 ++ /* Now we iterate over the MPTCP-sockets and apply the event. */
11051 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
11052 ++ const struct hlist_nulls_node *node;
11053 ++ struct tcp_sock *meta_tp;
11054 ++
11055 ++ rcu_read_lock_bh();
11056 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
11057 ++ tk_table) {
11058 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
11059 ++ struct sock *meta_sk = (struct sock *)meta_tp, *sk;
11060 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11061 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11062 ++
11063 ++ if (sock_net(meta_sk) != net)
11064 ++ continue;
11065 ++
11066 ++ if (meta_v4) {
11067 ++ /* skip IPv6 events if meta is IPv4 */
11068 ++ if (event->family == AF_INET6)
11069 ++ continue;
11070 ++ }
11071 ++ /* skip IPv4 events if IPV6_V6ONLY is set */
11072 ++ else if (event->family == AF_INET &&
11073 ++ inet6_sk(meta_sk)->ipv6only)
11074 ++ continue;
11075 ++
11076 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
11077 ++ continue;
11078 ++
11079 ++ bh_lock_sock(meta_sk);
11080 ++
11081 ++ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) ||
11082 ++ mpcb->infinite_mapping_snd ||
11083 ++ mpcb->infinite_mapping_rcv ||
11084 ++ mpcb->send_infinite_mapping)
11085 ++ goto next;
11086 ++
11087 ++ /* It may be that the pm has changed in-between */
11088 ++ if (mpcb->pm_ops != &full_mesh)
11089 ++ goto next;
11090 ++
11091 ++ if (sock_owned_by_user(meta_sk)) {
11092 ++ if (!test_and_set_bit(MPTCP_PATH_MANAGER,
11093 ++ &meta_tp->tsq_flags))
11094 ++ sock_hold(meta_sk);
11095 ++
11096 ++ goto next;
11097 ++ }
11098 ++
11099 ++ if (event->code == MPTCP_EVENT_ADD) {
11100 ++ fmp->add_addr++;
11101 ++ mpcb->addr_signal = 1;
11102 ++
11103 ++ sk = mptcp_select_ack_sock(meta_sk);
11104 ++ if (sk)
11105 ++ tcp_send_ack(sk);
11106 ++
11107 ++ full_mesh_create_subflows(meta_sk);
11108 ++ }
11109 ++
11110 ++ if (event->code == MPTCP_EVENT_DEL) {
11111 ++ struct sock *sk, *tmpsk;
11112 ++ struct mptcp_loc_addr *mptcp_local;
11113 ++ bool found = false;
11114 ++
11115 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11116 ++
11117 ++ /* In any case, we need to update our bitfields */
11118 ++ if (id >= 0)
11119 ++ update_addr_bitfields(meta_sk, mptcp_local);
11120 ++
11121 ++ /* Look for the socket and remove it */
11122 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11123 ++ if ((event->family == AF_INET6 &&
11124 ++ (sk->sk_family == AF_INET ||
11125 ++ mptcp_v6_is_v4_mapped(sk))) ||
11126 ++ (event->family == AF_INET &&
11127 ++ (sk->sk_family == AF_INET6 &&
11128 ++ !mptcp_v6_is_v4_mapped(sk))))
11129 ++ continue;
11130 ++
11131 ++ if (event->family == AF_INET &&
11132 ++ (sk->sk_family == AF_INET ||
11133 ++ mptcp_v6_is_v4_mapped(sk)) &&
11134 ++ inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
11135 ++ continue;
11136 ++
11137 ++ if (event->family == AF_INET6 &&
11138 ++ sk->sk_family == AF_INET6 &&
11139 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
11140 ++ continue;
11141 ++
11142 ++ /* Reinject, so that pf = 1 and so we
11143 ++ * won't select this one as the
11144 ++ * ack-sock.
11145 ++ */
11146 ++ mptcp_reinject_data(sk, 0);
11147 ++
11148 ++ /* We announce the removal of this id */
11149 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk);
11150 ++
11151 ++ mptcp_sub_force_close(sk);
11152 ++ found = true;
11153 ++ }
11154 ++
11155 ++ if (found)
11156 ++ goto next;
11157 ++
11158 ++ /* The id may have been given by the event,
11159 ++ * matching on a local address. And it may not
11160 ++ * have matched on one of the above sockets,
11161 ++ * because the client never created a subflow.
11162 ++ * So, we have to finally remove it here.
11163 ++ */
11164 ++ if (id > 0)
11165 ++ announce_remove_addr(id, meta_sk);
11166 ++ }
11167 ++
11168 ++ if (event->code == MPTCP_EVENT_MOD) {
11169 ++ struct sock *sk;
11170 ++
11171 ++ mptcp_for_each_sk(mpcb, sk) {
11172 ++ struct tcp_sock *tp = tcp_sk(sk);
11173 ++ if (event->family == AF_INET &&
11174 ++ (sk->sk_family == AF_INET ||
11175 ++ mptcp_v6_is_v4_mapped(sk)) &&
11176 ++ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
11177 ++ if (event->low_prio != tp->mptcp->low_prio) {
11178 ++ tp->mptcp->send_mp_prio = 1;
11179 ++ tp->mptcp->low_prio = event->low_prio;
11180 ++
11181 ++ tcp_send_ack(sk);
11182 ++ }
11183 ++ }
11184 ++
11185 ++ if (event->family == AF_INET6 &&
11186 ++ sk->sk_family == AF_INET6 &&
11187 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
11188 ++ if (event->low_prio != tp->mptcp->low_prio) {
11189 ++ tp->mptcp->send_mp_prio = 1;
11190 ++ tp->mptcp->low_prio = event->low_prio;
11191 ++
11192 ++ tcp_send_ack(sk);
11193 ++ }
11194 ++ }
11195 ++ }
11196 ++ }
11197 ++next:
11198 ++ bh_unlock_sock(meta_sk);
11199 ++ sock_put(meta_sk);
11200 ++ }
11201 ++ rcu_read_unlock_bh();
11202 ++ }
11203 ++ goto next_event;
11204 ++}
11205 ++
11206 ++static struct mptcp_addr_event *lookup_similar_event(const struct net *net,
11207 ++ const struct mptcp_addr_event *event)
11208 ++{
11209 ++ struct mptcp_addr_event *eventq;
11210 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11211 ++
11212 ++ list_for_each_entry(eventq, &fm_ns->events, list) {
11213 ++ if (eventq->family != event->family)
11214 ++ continue;
11215 ++ if (event->family == AF_INET) {
11216 ++ if (eventq->addr.in.s_addr == event->addr.in.s_addr)
11217 ++ return eventq;
11218 ++ } else {
11219 ++ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
11220 ++ return eventq;
11221 ++ }
11222 ++ }
11223 ++ return NULL;
11224 ++}
11225 ++
11226 ++/* We already hold the net-namespace MPTCP-lock */
11227 ++static void add_pm_event(struct net *net, const struct mptcp_addr_event *event)
11228 ++{
11229 ++ struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
11230 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11231 ++
11232 ++ if (eventq) {
11233 ++ switch (event->code) {
11234 ++ case MPTCP_EVENT_DEL:
11235 ++ mptcp_debug("%s del old_code %u\n", __func__, eventq->code);
11236 ++ list_del(&eventq->list);
11237 ++ kfree(eventq);
11238 ++ break;
11239 ++ case MPTCP_EVENT_ADD:
11240 ++ mptcp_debug("%s add old_code %u\n", __func__, eventq->code);
11241 ++ eventq->low_prio = event->low_prio;
11242 ++ eventq->code = MPTCP_EVENT_ADD;
11243 ++ return;
11244 ++ case MPTCP_EVENT_MOD:
11245 ++ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code);
11246 ++ eventq->low_prio = event->low_prio;
11247 ++ eventq->code = MPTCP_EVENT_MOD;
11248 ++ return;
11249 ++ }
11250 ++ }
11251 ++
11252 ++ /* OK, we have to add the new address to the wait queue */
11253 ++ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
11254 ++ if (!eventq)
11255 ++ return;
11256 ++
11257 ++ list_add_tail(&eventq->list, &fm_ns->events);
11258 ++
11259 ++ /* Schedule the address-worker if it is not already pending */
11260 ++ if (!delayed_work_pending(&fm_ns->address_worker))
11261 ++ queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
11262 ++ msecs_to_jiffies(500));
11263 ++}
11264 ++
11265 ++static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event,
11266 ++ struct net *net)
11267 ++{
11268 ++ const struct net_device *netdev = ifa->ifa_dev->dev;
11269 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11270 ++ struct mptcp_addr_event mpevent;
11271 ++
11272 ++ if (ifa->ifa_scope > RT_SCOPE_LINK ||
11273 ++ ipv4_is_loopback(ifa->ifa_local))
11274 ++ return;
11275 ++
11276 ++ spin_lock_bh(&fm_ns->local_lock);
11277 ++
11278 ++ mpevent.family = AF_INET;
11279 ++ mpevent.addr.in.s_addr = ifa->ifa_local;
11280 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11281 ++
11282 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11283 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11284 ++ mpevent.code = MPTCP_EVENT_DEL;
11285 ++ else if (event == NETDEV_UP)
11286 ++ mpevent.code = MPTCP_EVENT_ADD;
11287 ++ else if (event == NETDEV_CHANGE)
11288 ++ mpevent.code = MPTCP_EVENT_MOD;
11289 ++
11290 ++ mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__,
11291 ++ &ifa->ifa_local, mpevent.code, mpevent.low_prio);
11292 ++ add_pm_event(net, &mpevent);
11293 ++
11294 ++ spin_unlock_bh(&fm_ns->local_lock);
11295 ++ return;
11296 ++}
11297 ++
11298 ++/* React on IPv4-addr add/rem-events */
11299 ++static int mptcp_pm_inetaddr_event(struct notifier_block *this,
11300 ++ unsigned long event, void *ptr)
11301 ++{
11302 ++ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
11303 ++ struct net *net = dev_net(ifa->ifa_dev->dev);
11304 ++
11305 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11306 ++ event == NETDEV_CHANGE))
11307 ++ return NOTIFY_DONE;
11308 ++
11309 ++ addr4_event_handler(ifa, event, net);
11310 ++
11311 ++ return NOTIFY_DONE;
11312 ++}
11313 ++
11314 ++static struct notifier_block mptcp_pm_inetaddr_notifier = {
11315 ++ .notifier_call = mptcp_pm_inetaddr_event,
11316 ++};
11317 ++
11318 ++#if IS_ENABLED(CONFIG_IPV6)
11319 ++
11320 ++/* IPV6-related address/interface watchers */
11321 ++struct mptcp_dad_data {
11322 ++ struct timer_list timer;
11323 ++ struct inet6_ifaddr *ifa;
11324 ++};
11325 ++
11326 ++static void dad_callback(unsigned long arg);
11327 ++static int inet6_addr_event(struct notifier_block *this,
11328 ++ unsigned long event, void *ptr);
11329 ++
11330 ++static int ipv6_is_in_dad_state(const struct inet6_ifaddr *ifa)
11331 ++{
11332 ++ return (ifa->flags & IFA_F_TENTATIVE) &&
11333 ++ ifa->state == INET6_IFADDR_STATE_DAD;
11334 ++}
11335 ++
11336 ++static void dad_init_timer(struct mptcp_dad_data *data,
11337 ++ struct inet6_ifaddr *ifa)
11338 ++{
11339 ++ data->ifa = ifa;
11340 ++ data->timer.data = (unsigned long)data;
11341 ++ data->timer.function = dad_callback;
11342 ++ if (ifa->idev->cnf.rtr_solicit_delay)
11343 ++ data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
11344 ++ else
11345 ++ data->timer.expires = jiffies + (HZ/10);
11346 ++}
11347 ++
11348 ++static void dad_callback(unsigned long arg)
11349 ++{
11350 ++ struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
11351 ++
11352 ++ if (ipv6_is_in_dad_state(data->ifa)) {
11353 ++ dad_init_timer(data, data->ifa);
11354 ++ add_timer(&data->timer);
11355 ++ } else {
11356 ++ inet6_addr_event(NULL, NETDEV_UP, data->ifa);
11357 ++ in6_ifa_put(data->ifa);
11358 ++ kfree(data);
11359 ++ }
11360 ++}
11361 ++
11362 ++static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
11363 ++{
11364 ++ struct mptcp_dad_data *data;
11365 ++
11366 ++ data = kmalloc(sizeof(*data), GFP_ATOMIC);
11367 ++
11368 ++ if (!data)
11369 ++ return;
11370 ++
11371 ++ init_timer(&data->timer);
11372 ++ dad_init_timer(data, ifa);
11373 ++ add_timer(&data->timer);
11374 ++ in6_ifa_hold(ifa);
11375 ++}
11376 ++
11377 ++static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event,
11378 ++ struct net *net)
11379 ++{
11380 ++ const struct net_device *netdev = ifa->idev->dev;
11381 ++ int addr_type = ipv6_addr_type(&ifa->addr);
11382 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11383 ++ struct mptcp_addr_event mpevent;
11384 ++
11385 ++ if (ifa->scope > RT_SCOPE_LINK ||
11386 ++ addr_type == IPV6_ADDR_ANY ||
11387 ++ (addr_type & IPV6_ADDR_LOOPBACK) ||
11388 ++ (addr_type & IPV6_ADDR_LINKLOCAL))
11389 ++ return;
11390 ++
11391 ++ spin_lock_bh(&fm_ns->local_lock);
11392 ++
11393 ++ mpevent.family = AF_INET6;
11394 ++ mpevent.addr.in6 = ifa->addr;
11395 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11396 ++
11397 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11398 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11399 ++ mpevent.code = MPTCP_EVENT_DEL;
11400 ++ else if (event == NETDEV_UP)
11401 ++ mpevent.code = MPTCP_EVENT_ADD;
11402 ++ else if (event == NETDEV_CHANGE)
11403 ++ mpevent.code = MPTCP_EVENT_MOD;
11404 ++
11405 ++ mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__,
11406 ++ &ifa->addr, mpevent.code, mpevent.low_prio);
11407 ++ add_pm_event(net, &mpevent);
11408 ++
11409 ++ spin_unlock_bh(&fm_ns->local_lock);
11410 ++ return;
11411 ++}
11412 ++
11413 ++/* React on IPv6-addr add/rem-events */
11414 ++static int inet6_addr_event(struct notifier_block *this, unsigned long event,
11415 ++ void *ptr)
11416 ++{
11417 ++ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
11418 ++ struct net *net = dev_net(ifa6->idev->dev);
11419 ++
11420 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11421 ++ event == NETDEV_CHANGE))
11422 ++ return NOTIFY_DONE;
11423 ++
11424 ++ if (ipv6_is_in_dad_state(ifa6))
11425 ++ dad_setup_timer(ifa6);
11426 ++ else
11427 ++ addr6_event_handler(ifa6, event, net);
11428 ++
11429 ++ return NOTIFY_DONE;
11430 ++}
11431 ++
11432 ++static struct notifier_block inet6_addr_notifier = {
11433 ++ .notifier_call = inet6_addr_event,
11434 ++};
11435 ++
11436 ++#endif
11437 ++
11438 ++/* React on ifup/down-events */
11439 ++static int netdev_event(struct notifier_block *this, unsigned long event,
11440 ++ void *ptr)
11441 ++{
11442 ++ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
11443 ++ struct in_device *in_dev;
11444 ++#if IS_ENABLED(CONFIG_IPV6)
11445 ++ struct inet6_dev *in6_dev;
11446 ++#endif
11447 ++
11448 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11449 ++ event == NETDEV_CHANGE))
11450 ++ return NOTIFY_DONE;
11451 ++
11452 ++ rcu_read_lock();
11453 ++ in_dev = __in_dev_get_rtnl(dev);
11454 ++
11455 ++ if (in_dev) {
11456 ++ for_ifa(in_dev) {
11457 ++ mptcp_pm_inetaddr_event(NULL, event, ifa);
11458 ++ } endfor_ifa(in_dev);
11459 ++ }
11460 ++
11461 ++#if IS_ENABLED(CONFIG_IPV6)
11462 ++ in6_dev = __in6_dev_get(dev);
11463 ++
11464 ++ if (in6_dev) {
11465 ++ struct inet6_ifaddr *ifa6;
11466 ++ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
11467 ++ inet6_addr_event(NULL, event, ifa6);
11468 ++ }
11469 ++#endif
11470 ++
11471 ++ rcu_read_unlock();
11472 ++ return NOTIFY_DONE;
11473 ++}
11474 ++
11475 ++static struct notifier_block mptcp_pm_netdev_notifier = {
11476 ++ .notifier_call = netdev_event,
11477 ++};
11478 ++
11479 ++static void full_mesh_add_raddr(struct mptcp_cb *mpcb,
11480 ++ const union inet_addr *addr,
11481 ++ sa_family_t family, __be16 port, u8 id)
11482 ++{
11483 ++ if (family == AF_INET)
11484 ++ mptcp_addv4_raddr(mpcb, &addr->in, port, id);
11485 ++ else
11486 ++ mptcp_addv6_raddr(mpcb, &addr->in6, port, id);
11487 ++}
11488 ++
11489 ++static void full_mesh_new_session(const struct sock *meta_sk)
11490 ++{
11491 ++ struct mptcp_loc_addr *mptcp_local;
11492 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11493 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11494 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11495 ++ int i, index;
11496 ++ union inet_addr saddr, daddr;
11497 ++ sa_family_t family;
11498 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11499 ++
11500 ++ /* Init local variables necessary for the rest */
11501 ++ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) {
11502 ++ saddr.ip = inet_sk(meta_sk)->inet_saddr;
11503 ++ daddr.ip = inet_sk(meta_sk)->inet_daddr;
11504 ++ family = AF_INET;
11505 ++#if IS_ENABLED(CONFIG_IPV6)
11506 ++ } else {
11507 ++ saddr.in6 = inet6_sk(meta_sk)->saddr;
11508 ++ daddr.in6 = meta_sk->sk_v6_daddr;
11509 ++ family = AF_INET6;
11510 ++#endif
11511 ++ }
11512 ++
11513 ++ rcu_read_lock();
11514 ++ mptcp_local = rcu_dereference(fm_ns->local);
11515 ++
11516 ++ index = mptcp_find_address(mptcp_local, family, &saddr);
11517 ++ if (index < 0)
11518 ++ goto fallback;
11519 ++
11520 ++ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0);
11521 ++ mptcp_set_init_addr_bit(mpcb, &daddr, family, index);
11522 ++
11523 ++ /* Initialize workqueue-struct */
11524 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
11525 ++ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
11526 ++ fmp->mpcb = mpcb;
11527 ++
11528 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11529 ++ goto skip_ipv4;
11530 ++
11531 ++ /* Look for the address among the local addresses */
11532 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11533 ++ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
11534 ++
11535 ++ /* We do not need to announce the initial subflow's address again */
11536 ++ if (family == AF_INET && saddr.ip == ifa_address)
11537 ++ continue;
11538 ++
11539 ++ fmp->add_addr++;
11540 ++ mpcb->addr_signal = 1;
11541 ++ }
11542 ++
11543 ++skip_ipv4:
11544 ++#if IS_ENABLED(CONFIG_IPV6)
11545 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11546 ++ if (meta_v4)
11547 ++ goto skip_ipv6;
11548 ++
11549 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11550 ++ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
11551 ++
11552 ++ /* We do not need to announce the initial subflow's address again */
11553 ++ if (family == AF_INET6 && ipv6_addr_equal(&saddr.in6, ifa6))
11554 ++ continue;
11555 ++
11556 ++ fmp->add_addr++;
11557 ++ mpcb->addr_signal = 1;
11558 ++ }
11559 ++
11560 ++skip_ipv6:
11561 ++#endif
11562 ++
11563 ++ rcu_read_unlock();
11564 ++
11565 ++ if (family == AF_INET)
11566 ++ fmp->announced_addrs_v4 |= (1 << index);
11567 ++ else
11568 ++ fmp->announced_addrs_v6 |= (1 << index);
11569 ++
11570 ++ for (i = fmp->add_addr; i && fmp->add_addr; i--)
11571 ++ tcp_send_ack(mpcb->master_sk);
11572 ++
11573 ++ return;
11574 ++
11575 ++fallback:
11576 ++ rcu_read_unlock();
11577 ++ mptcp_fallback_default(mpcb);
11578 ++ return;
11579 ++}
11580 ++
11581 ++static void full_mesh_create_subflows(struct sock *meta_sk)
11582 ++{
11583 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11584 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11585 ++
11586 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
11587 ++ mpcb->send_infinite_mapping ||
11588 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
11589 ++ return;
11590 ++
11591 ++ if (mpcb->master_sk &&
11592 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
11593 ++ return;
11594 ++
11595 ++ if (!work_pending(&fmp->subflow_work)) {
11596 ++ sock_hold(meta_sk);
11597 ++ queue_work(mptcp_wq, &fmp->subflow_work);
11598 ++ }
11599 ++}
11600 ++
11601 ++/* Called upon release_sock, if the socket was owned by the user during
11602 ++ * a path-management event.
11603 ++ */
11604 ++static void full_mesh_release_sock(struct sock *meta_sk)
11605 ++{
11606 ++ struct mptcp_loc_addr *mptcp_local;
11607 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11608 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11609 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11610 ++ struct sock *sk, *tmpsk;
11611 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11612 ++ int i;
11613 ++
11614 ++ rcu_read_lock();
11615 ++ mptcp_local = rcu_dereference(fm_ns->local);
11616 ++
11617 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11618 ++ goto skip_ipv4;
11619 ++
11620 ++ /* First, detect modifications or additions */
11621 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11622 ++ struct in_addr ifa = mptcp_local->locaddr4[i].addr;
11623 ++ bool found = false;
11624 ++
11625 ++ mptcp_for_each_sk(mpcb, sk) {
11626 ++ struct tcp_sock *tp = tcp_sk(sk);
11627 ++
11628 ++ if (sk->sk_family == AF_INET6 &&
11629 ++ !mptcp_v6_is_v4_mapped(sk))
11630 ++ continue;
11631 ++
11632 ++ if (inet_sk(sk)->inet_saddr != ifa.s_addr)
11633 ++ continue;
11634 ++
11635 ++ found = true;
11636 ++
11637 ++ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
11638 ++ tp->mptcp->send_mp_prio = 1;
11639 ++ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
11640 ++
11641 ++ tcp_send_ack(sk);
11642 ++ }
11643 ++ }
11644 ++
11645 ++ if (!found) {
11646 ++ fmp->add_addr++;
11647 ++ mpcb->addr_signal = 1;
11648 ++
11649 ++ sk = mptcp_select_ack_sock(meta_sk);
11650 ++ if (sk)
11651 ++ tcp_send_ack(sk);
11652 ++ full_mesh_create_subflows(meta_sk);
11653 ++ }
11654 ++ }
11655 ++
11656 ++skip_ipv4:
11657 ++#if IS_ENABLED(CONFIG_IPV6)
11658 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11659 ++ if (meta_v4)
11660 ++ goto removal;
11661 ++
11662 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11663 ++ struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
11664 ++ bool found = false;
11665 ++
11666 ++ mptcp_for_each_sk(mpcb, sk) {
11667 ++ struct tcp_sock *tp = tcp_sk(sk);
11668 ++
11669 ++ if (sk->sk_family == AF_INET ||
11670 ++ mptcp_v6_is_v4_mapped(sk))
11671 ++ continue;
11672 ++
11673 ++ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
11674 ++ continue;
11675 ++
11676 ++ found = true;
11677 ++
11678 ++ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
11679 ++ tp->mptcp->send_mp_prio = 1;
11680 ++ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
11681 ++
11682 ++ tcp_send_ack(sk);
11683 ++ }
11684 ++ }
11685 ++
11686 ++ if (!found) {
11687 ++ fmp->add_addr++;
11688 ++ mpcb->addr_signal = 1;
11689 ++
11690 ++ sk = mptcp_select_ack_sock(meta_sk);
11691 ++ if (sk)
11692 ++ tcp_send_ack(sk);
11693 ++ full_mesh_create_subflows(meta_sk);
11694 ++ }
11695 ++ }
11696 ++
11697 ++removal:
11698 ++#endif
11699 ++
11700 ++ /* Now, detect address-removals */
11701 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11702 ++ bool shall_remove = true;
11703 ++
11704 ++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
11705 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11706 ++ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
11707 ++ shall_remove = false;
11708 ++ break;
11709 ++ }
11710 ++ }
11711 ++ } else {
11712 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11713 ++ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
11714 ++ shall_remove = false;
11715 ++ break;
11716 ++ }
11717 ++ }
11718 ++ }
11719 ++
11720 ++ if (shall_remove) {
11721 ++ /* Reinject, so that pf = 1 and so we
11722 ++ * won't select this one as the
11723 ++ * ack-sock.
11724 ++ */
11725 ++ mptcp_reinject_data(sk, 0);
11726 ++
11727 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id,
11728 ++ meta_sk);
11729 ++
11730 ++ mptcp_sub_force_close(sk);
11731 ++ }
11732 ++ }
11733 ++
11734 ++ /* Just call it optimistically. It actually cannot do any harm */
11735 ++ update_addr_bitfields(meta_sk, mptcp_local);
11736 ++
11737 ++ rcu_read_unlock();
11738 ++}
11739 ++
11740 ++static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
11741 ++ struct net *net, bool *low_prio)
11742 ++{
11743 ++ struct mptcp_loc_addr *mptcp_local;
11744 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11745 ++ int index, id = -1;
11746 ++
11747 ++ /* Handle the backup-flows */
11748 ++ rcu_read_lock();
11749 ++ mptcp_local = rcu_dereference(fm_ns->local);
11750 ++
11751 ++ index = mptcp_find_address(mptcp_local, family, addr);
11752 ++
11753 ++ if (index != -1) {
11754 ++ if (family == AF_INET) {
11755 ++ id = mptcp_local->locaddr4[index].loc4_id;
11756 ++ *low_prio = mptcp_local->locaddr4[index].low_prio;
11757 ++ } else {
11758 ++ id = mptcp_local->locaddr6[index].loc6_id;
11759 ++ *low_prio = mptcp_local->locaddr6[index].low_prio;
11760 ++ }
11761 ++ }
11762 ++
11763 ++
11764 ++ rcu_read_unlock();
11765 ++
11766 ++ return id;
11767 ++}
11768 ++
11769 ++static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
11770 ++ struct tcp_out_options *opts,
11771 ++ struct sk_buff *skb)
11772 ++{
11773 ++ const struct tcp_sock *tp = tcp_sk(sk);
11774 ++ struct mptcp_cb *mpcb = tp->mpcb;
11775 ++ struct sock *meta_sk = mpcb->meta_sk;
11776 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11777 ++ struct mptcp_loc_addr *mptcp_local;
11778 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
11779 ++ int remove_addr_len;
11780 ++ u8 unannouncedv4 = 0, unannouncedv6 = 0;
11781 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11782 ++
11783 ++ mpcb->addr_signal = 0;
11784 ++
11785 ++ if (likely(!fmp->add_addr))
11786 ++ goto remove_addr;
11787 ++
11788 ++ rcu_read_lock();
11789 ++ mptcp_local = rcu_dereference(fm_ns->local);
11790 ++
11791 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11792 ++ goto skip_ipv4;
11793 ++
11794 ++ /* IPv4 */
11795 ++ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
11796 ++ if (unannouncedv4 &&
11797 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
11798 ++ int ind = mptcp_find_free_index(~unannouncedv4);
11799 ++
11800 ++ opts->options |= OPTION_MPTCP;
11801 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11802 ++ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
11803 ++ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
11804 ++ opts->add_addr_v4 = 1;
11805 ++
11806 ++ if (skb) {
11807 ++ fmp->announced_addrs_v4 |= (1 << ind);
11808 ++ fmp->add_addr--;
11809 ++ }
11810 ++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
11811 ++ }
11812 ++
11813 ++ if (meta_v4)
11814 ++ goto skip_ipv6;
11815 ++
11816 ++skip_ipv4:
11817 ++ /* IPv6 */
11818 ++ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
11819 ++ if (unannouncedv6 &&
11820 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
11821 ++ int ind = mptcp_find_free_index(~unannouncedv6);
11822 ++
11823 ++ opts->options |= OPTION_MPTCP;
11824 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11825 ++ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
11826 ++ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
11827 ++ opts->add_addr_v6 = 1;
11828 ++
11829 ++ if (skb) {
11830 ++ fmp->announced_addrs_v6 |= (1 << ind);
11831 ++ fmp->add_addr--;
11832 ++ }
11833 ++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
11834 ++ }
11835 ++
11836 ++skip_ipv6:
11837 ++ rcu_read_unlock();
11838 ++
11839 ++ if (!unannouncedv4 && !unannouncedv6 && skb)
11840 ++ fmp->add_addr--;
11841 ++
11842 ++remove_addr:
11843 ++ if (likely(!fmp->remove_addrs))
11844 ++ goto exit;
11845 ++
11846 ++ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
11847 ++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
11848 ++ goto exit;
11849 ++
11850 ++ opts->options |= OPTION_MPTCP;
11851 ++ opts->mptcp_options |= OPTION_REMOVE_ADDR;
11852 ++ opts->remove_addrs = fmp->remove_addrs;
11853 ++ *size += remove_addr_len;
11854 ++ if (skb)
11855 ++ fmp->remove_addrs = 0;
11856 ++
11857 ++exit:
11858 ++ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs);
11859 ++}
11860 ++
11861 ++static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id)
11862 ++{
11863 ++ mptcp_v4_rem_raddress(mpcb, rem_id);
11864 ++ mptcp_v6_rem_raddress(mpcb, rem_id);
11865 ++}
11866 ++
11867 ++/* Output /proc/net/mptcp_fullmesh */
11868 ++static int mptcp_fm_seq_show(struct seq_file *seq, void *v)
11869 ++{
11870 ++ const struct net *net = seq->private;
11871 ++ struct mptcp_loc_addr *mptcp_local;
11872 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11873 ++ int i;
11874 ++
11875 ++ seq_printf(seq, "Index, Address-ID, Backup, IP-address\n");
11876 ++
11877 ++ rcu_read_lock_bh();
11878 ++ mptcp_local = rcu_dereference(fm_ns->local);
11879 ++
11880 ++ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index);
11881 ++
11882 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11883 ++ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i];
11884 ++
11885 ++ seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id,
11886 ++ loc4->low_prio, &loc4->addr);
11887 ++ }
11888 ++
11889 ++ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index);
11890 ++
11891 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11892 ++ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i];
11893 ++
11894 ++ seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id,
11895 ++ loc6->low_prio, &loc6->addr);
11896 ++ }
11897 ++ rcu_read_unlock_bh();
11898 ++
11899 ++ return 0;
11900 ++}
11901 ++
11902 ++static int mptcp_fm_seq_open(struct inode *inode, struct file *file)
11903 ++{
11904 ++ return single_open_net(inode, file, mptcp_fm_seq_show);
11905 ++}
11906 ++
11907 ++static const struct file_operations mptcp_fm_seq_fops = {
11908 ++ .owner = THIS_MODULE,
11909 ++ .open = mptcp_fm_seq_open,
11910 ++ .read = seq_read,
11911 ++ .llseek = seq_lseek,
11912 ++ .release = single_release_net,
11913 ++};
11914 ++
11915 ++static int mptcp_fm_init_net(struct net *net)
11916 ++{
11917 ++ struct mptcp_loc_addr *mptcp_local;
11918 ++ struct mptcp_fm_ns *fm_ns;
11919 ++ int err = 0;
11920 ++
11921 ++ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
11922 ++ if (!fm_ns)
11923 ++ return -ENOBUFS;
11924 ++
11925 ++ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
11926 ++ if (!mptcp_local) {
11927 ++ err = -ENOBUFS;
11928 ++ goto err_mptcp_local;
11929 ++ }
11930 ++
11931 ++ if (!proc_create("mptcp_fullmesh", S_IRUGO, net->proc_net,
11932 ++ &mptcp_fm_seq_fops)) {
11933 ++ err = -ENOMEM;
11934 ++ goto err_seq_fops;
11935 ++ }
11936 ++
11937 ++ mptcp_local->next_v4_index = 1;
11938 ++
11939 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11940 ++ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
11941 ++ INIT_LIST_HEAD(&fm_ns->events);
11942 ++ spin_lock_init(&fm_ns->local_lock);
11943 ++ fm_ns->net = net;
11944 ++ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
11945 ++
11946 ++ return 0;
11947 ++err_seq_fops:
11948 ++ kfree(mptcp_local);
11949 ++err_mptcp_local:
11950 ++ kfree(fm_ns);
11951 ++ return err;
11952 ++}
11953 ++
11954 ++static void mptcp_fm_exit_net(struct net *net)
11955 ++{
11956 ++ struct mptcp_addr_event *eventq, *tmp;
11957 ++ struct mptcp_fm_ns *fm_ns;
11958 ++ struct mptcp_loc_addr *mptcp_local;
11959 ++
11960 ++ fm_ns = fm_get_ns(net);
11961 ++ cancel_delayed_work_sync(&fm_ns->address_worker);
11962 ++
11963 ++ rcu_read_lock_bh();
11964 ++
11965 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11966 ++ kfree(mptcp_local);
11967 ++
11968 ++ spin_lock(&fm_ns->local_lock);
11969 ++ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
11970 ++ list_del(&eventq->list);
11971 ++ kfree(eventq);
11972 ++ }
11973 ++ spin_unlock(&fm_ns->local_lock);
11974 ++
11975 ++ rcu_read_unlock_bh();
11976 ++
11977 ++ remove_proc_entry("mptcp_fullmesh", net->proc_net);
11978 ++
11979 ++ kfree(fm_ns);
11980 ++}
11981 ++
11982 ++static struct pernet_operations full_mesh_net_ops = {
11983 ++ .init = mptcp_fm_init_net,
11984 ++ .exit = mptcp_fm_exit_net,
11985 ++};
11986 ++
11987 ++static struct mptcp_pm_ops full_mesh __read_mostly = {
11988 ++ .new_session = full_mesh_new_session,
11989 ++ .release_sock = full_mesh_release_sock,
11990 ++ .fully_established = full_mesh_create_subflows,
11991 ++ .new_remote_address = full_mesh_create_subflows,
11992 ++ .get_local_id = full_mesh_get_local_id,
11993 ++ .addr_signal = full_mesh_addr_signal,
11994 ++ .add_raddr = full_mesh_add_raddr,
11995 ++ .rem_raddr = full_mesh_rem_raddr,
11996 ++ .name = "fullmesh",
11997 ++ .owner = THIS_MODULE,
11998 ++};
11999 ++
12000 ++/* General initialization of MPTCP_PM */
12001 ++static int __init full_mesh_register(void)
12002 ++{
12003 ++ int ret;
12004 ++
12005 ++ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
12006 ++
12007 ++ ret = register_pernet_subsys(&full_mesh_net_ops);
12008 ++ if (ret)
12009 ++ goto out;
12010 ++
12011 ++ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12012 ++ if (ret)
12013 ++ goto err_reg_inetaddr;
12014 ++ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
12015 ++ if (ret)
12016 ++ goto err_reg_netdev;
12017 ++
12018 ++#if IS_ENABLED(CONFIG_IPV6)
12019 ++ ret = register_inet6addr_notifier(&inet6_addr_notifier);
12020 ++ if (ret)
12021 ++ goto err_reg_inet6addr;
12022 ++#endif
12023 ++
12024 ++ ret = mptcp_register_path_manager(&full_mesh);
12025 ++ if (ret)
12026 ++ goto err_reg_pm;
12027 ++
12028 ++out:
12029 ++ return ret;
12030 ++
12031 ++
12032 ++err_reg_pm:
12033 ++#if IS_ENABLED(CONFIG_IPV6)
12034 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12035 ++err_reg_inet6addr:
12036 ++#endif
12037 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12038 ++err_reg_netdev:
12039 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12040 ++err_reg_inetaddr:
12041 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12042 ++ goto out;
12043 ++}
12044 ++
12045 ++static void full_mesh_unregister(void)
12046 ++{
12047 ++#if IS_ENABLED(CONFIG_IPV6)
12048 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12049 ++#endif
12050 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12051 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12052 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12053 ++ mptcp_unregister_path_manager(&full_mesh);
12054 ++}
12055 ++
12056 ++module_init(full_mesh_register);
12057 ++module_exit(full_mesh_unregister);
12058 ++
12059 ++MODULE_AUTHOR("Christoph Paasch");
12060 ++MODULE_LICENSE("GPL");
12061 ++MODULE_DESCRIPTION("Full-Mesh MPTCP");
12062 ++MODULE_VERSION("0.88");
12063 +diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
12064 +new file mode 100644
12065 +index 000000000000..43704ccb639e
12066 +--- /dev/null
12067 ++++ b/net/mptcp/mptcp_input.c
12068 +@@ -0,0 +1,2405 @@
12069 ++/*
12070 ++ * MPTCP implementation - Sending side
12071 ++ *
12072 ++ * Initial Design & Implementation:
12073 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
12074 ++ *
12075 ++ * Current Maintainer & Author:
12076 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
12077 ++ *
12078 ++ * Additional authors:
12079 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
12080 ++ * Gregory Detal <gregory.detal@×××××××××.be>
12081 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
12082 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
12083 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
12084 ++ * Andreas Ripke <ripke@××××××.eu>
12085 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
12086 ++ * Octavian Purdila <octavian.purdila@×××××.com>
12087 ++ * John Ronan <jronan@××××.org>
12088 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
12089 ++ * Brandon Heller <brandonh@××××××××.edu>
12090 ++ *
12091 ++ *
12092 ++ * This program is free software; you can redistribute it and/or
12093 ++ * modify it under the terms of the GNU General Public License
12094 ++ * as published by the Free Software Foundation; either version
12095 ++ * 2 of the License, or (at your option) any later version.
12096 ++ */
12097 ++
12098 ++#include <asm/unaligned.h>
12099 ++
12100 ++#include <net/mptcp.h>
12101 ++#include <net/mptcp_v4.h>
12102 ++#include <net/mptcp_v6.h>
12103 ++
12104 ++#include <linux/kconfig.h>
12105 ++
12106 ++/* is seq1 < seq2 ? */
12107 ++static inline bool before64(const u64 seq1, const u64 seq2)
12108 ++{
12109 ++ return (s64)(seq1 - seq2) < 0;
12110 ++}
12111 ++
12112 ++/* is seq1 > seq2 ? */
12113 ++#define after64(seq1, seq2) before64(seq2, seq1)
12114 ++
12115 ++static inline void mptcp_become_fully_estab(struct sock *sk)
12116 ++{
12117 ++ tcp_sk(sk)->mptcp->fully_established = 1;
12118 ++
12119 ++ if (is_master_tp(tcp_sk(sk)) &&
12120 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established)
12121 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
12122 ++}
12123 ++
12124 ++/* Similar to tcp_tso_acked without any memory accounting */
12125 ++static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk,
12126 ++ struct sk_buff *skb)
12127 ++{
12128 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12129 ++ u32 packets_acked, len;
12130 ++
12131 ++ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una));
12132 ++
12133 ++ packets_acked = tcp_skb_pcount(skb);
12134 ++
12135 ++ if (skb_unclone(skb, GFP_ATOMIC))
12136 ++ return 0;
12137 ++
12138 ++ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq;
12139 ++ __pskb_trim_head(skb, len);
12140 ++
12141 ++ TCP_SKB_CB(skb)->seq += len;
12142 ++ skb->ip_summed = CHECKSUM_PARTIAL;
12143 ++ skb->truesize -= len;
12144 ++
12145 ++ /* Any change of skb->len requires recalculation of tso factor. */
12146 ++ if (tcp_skb_pcount(skb) > 1)
12147 ++ tcp_set_skb_tso_segs(meta_sk, skb, tcp_skb_mss(skb));
12148 ++ packets_acked -= tcp_skb_pcount(skb);
12149 ++
12150 ++ if (packets_acked) {
12151 ++ BUG_ON(tcp_skb_pcount(skb) == 0);
12152 ++ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
12153 ++ }
12154 ++
12155 ++ return packets_acked;
12156 ++}
12157 ++
12158 ++/**
12159 ++ * Cleans the meta-socket retransmission queue and the reinject-queue.
12160 ++ * @meta_sk must be the meta-socket.
12161 ++ */
12162 ++static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
12163 ++{
12164 ++ struct sk_buff *skb, *tmp;
12165 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12166 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
12167 ++ bool acked = false;
12168 ++ u32 acked_pcount;
12169 ++
12170 ++ while ((skb = tcp_write_queue_head(meta_sk)) &&
12171 ++ skb != tcp_send_head(meta_sk)) {
12172 ++ bool fully_acked = true;
12173 ++
12174 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12175 ++ if (tcp_skb_pcount(skb) == 1 ||
12176 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12177 ++ break;
12178 ++
12179 ++ acked_pcount = tcp_tso_acked(meta_sk, skb);
12180 ++ if (!acked_pcount)
12181 ++ break;
12182 ++
12183 ++ fully_acked = false;
12184 ++ } else {
12185 ++ acked_pcount = tcp_skb_pcount(skb);
12186 ++ }
12187 ++
12188 ++ acked = true;
12189 ++ meta_tp->packets_out -= acked_pcount;
12190 ++ meta_tp->retrans_stamp = 0;
12191 ++
12192 ++ if (!fully_acked)
12193 ++ break;
12194 ++
12195 ++ tcp_unlink_write_queue(skb, meta_sk);
12196 ++
12197 ++ if (mptcp_is_data_fin(skb)) {
12198 ++ struct sock *sk_it;
12199 ++
12200 ++ /* DATA_FIN has been acknowledged - now we can close
12201 ++ * the subflows
12202 ++ */
12203 ++ mptcp_for_each_sk(mpcb, sk_it) {
12204 ++ unsigned long delay = 0;
12205 ++
12206 ++ /* If we are the passive closer, don't trigger
12207 ++ * subflow-fin until the subflow has been finned
12208 ++ * by the peer - thus we add a delay.
12209 ++ */
12210 ++ if (mpcb->passive_close &&
12211 ++ sk_it->sk_state == TCP_ESTABLISHED)
12212 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
12213 ++
12214 ++ mptcp_sub_close(sk_it, delay);
12215 ++ }
12216 ++ }
12217 ++ sk_wmem_free_skb(meta_sk, skb);
12218 ++ }
12219 ++ /* Remove acknowledged data from the reinject queue */
12220 ++ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
12221 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12222 ++ if (tcp_skb_pcount(skb) == 1 ||
12223 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12224 ++ break;
12225 ++
12226 ++ mptcp_tso_acked_reinject(meta_sk, skb);
12227 ++ break;
12228 ++ }
12229 ++
12230 ++ __skb_unlink(skb, &mpcb->reinject_queue);
12231 ++ __kfree_skb(skb);
12232 ++ }
12233 ++
12234 ++ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
12235 ++ meta_tp->snd_up = meta_tp->snd_una;
12236 ++
12237 ++ if (acked) {
12238 ++ tcp_rearm_rto(meta_sk);
12239 ++ /* Normally this is done in tcp_try_undo_loss - but MPTCP
12240 ++ * does not call this function.
12241 ++ */
12242 ++ inet_csk(meta_sk)->icsk_retransmits = 0;
12243 ++ }
12244 ++}
12245 ++
12246 ++/* Inspired by tcp_rcv_state_process */
12247 ++static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
12248 ++ const struct sk_buff *skb, u32 data_seq,
12249 ++ u16 data_len)
12250 ++{
12251 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
12252 ++ const struct tcphdr *th = tcp_hdr(skb);
12253 ++
12254 ++ /* State-machine handling if FIN has been enqueued and it has
12255 ++ * been acked (snd_una == write_seq) - it's important that this
12256 ++ * here is after sk_wmem_free_skb because otherwise
12257 ++ * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
12258 ++ */
12259 ++ switch (meta_sk->sk_state) {
12260 ++ case TCP_FIN_WAIT1: {
12261 ++ struct dst_entry *dst;
12262 ++ int tmo;
12263 ++
12264 ++ if (meta_tp->snd_una != meta_tp->write_seq)
12265 ++ break;
12266 ++
12267 ++ tcp_set_state(meta_sk, TCP_FIN_WAIT2);
12268 ++ meta_sk->sk_shutdown |= SEND_SHUTDOWN;
12269 ++
12270 ++ dst = __sk_dst_get(sk);
12271 ++ if (dst)
12272 ++ dst_confirm(dst);
12273 ++
12274 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
12275 ++ /* Wake up lingering close() */
12276 ++ meta_sk->sk_state_change(meta_sk);
12277 ++ break;
12278 ++ }
12279 ++
12280 ++ if (meta_tp->linger2 < 0 ||
12281 ++ (data_len &&
12282 ++ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
12283 ++ meta_tp->rcv_nxt))) {
12284 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12285 ++ tcp_done(meta_sk);
12286 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12287 ++ return 1;
12288 ++ }
12289 ++
12290 ++ tmo = tcp_fin_time(meta_sk);
12291 ++ if (tmo > TCP_TIMEWAIT_LEN) {
12292 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
12293 ++ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) {
12294 ++ /* Bad case. We could lose such FIN otherwise.
12295 ++ * It is not a big problem, but it looks confusing
12296 ++ * and not so rare event. We still can lose it now,
12297 ++ * if it spins in bh_lock_sock(), but it is really
12298 ++ * marginal case.
12299 ++ */
12300 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo);
12301 ++ } else {
12302 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
12303 ++ }
12304 ++ break;
12305 ++ }
12306 ++ case TCP_CLOSING:
12307 ++ case TCP_LAST_ACK:
12308 ++ if (meta_tp->snd_una == meta_tp->write_seq) {
12309 ++ tcp_done(meta_sk);
12310 ++ return 1;
12311 ++ }
12312 ++ break;
12313 ++ }
12314 ++
12315 ++ /* step 7: process the segment text */
12316 ++ switch (meta_sk->sk_state) {
12317 ++ case TCP_FIN_WAIT1:
12318 ++ case TCP_FIN_WAIT2:
12319 ++ /* RFC 793 says to queue data in these states,
12320 ++ * RFC 1122 says we MUST send a reset.
12321 ++ * BSD 4.4 also does reset.
12322 ++ */
12323 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
12324 ++ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
12325 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
12326 ++ !mptcp_is_data_fin2(skb, tp)) {
12327 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12328 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12329 ++ tcp_reset(meta_sk);
12330 ++ return 1;
12331 ++ }
12332 ++ }
12333 ++ break;
12334 ++ }
12335 ++
12336 ++ return 0;
12337 ++}
12338 ++
12339 ++/**
12340 ++ * @return:
12341 ++ * i) 1: Everything's fine.
12342 ++ * ii) -1: A reset has been sent on the subflow - csum-failure
12343 ++ * iii) 0: csum-failure but no reset sent, because it's the last subflow.
12344 ++ * Last packet should not be destroyed by the caller because it has
12345 ++ * been done here.
12346 ++ */
12347 ++static int mptcp_verif_dss_csum(struct sock *sk)
12348 ++{
12349 ++ struct tcp_sock *tp = tcp_sk(sk);
12350 ++ struct sk_buff *tmp, *tmp1, *last = NULL;
12351 ++ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
12352 ++ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
12353 ++ int iter = 0;
12354 ++
12355 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
12356 ++ unsigned int csum_len;
12357 ++
12358 ++ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
12359 ++ /* Mapping ends in the middle of the packet -
12360 ++ * csum only these bytes
12361 ++ */
12362 ++ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
12363 ++ else
12364 ++ csum_len = tmp->len;
12365 ++
12366 ++ offset = 0;
12367 ++ if (overflowed) {
12368 ++ char first_word[4];
12369 ++ first_word[0] = 0;
12370 ++ first_word[1] = 0;
12371 ++ first_word[2] = 0;
12372 ++ first_word[3] = *(tmp->data);
12373 ++ csum_tcp = csum_partial(first_word, 4, csum_tcp);
12374 ++ offset = 1;
12375 ++ csum_len--;
12376 ++ overflowed = 0;
12377 ++ }
12378 ++
12379 ++ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
12380 ++
12381 ++ /* Was the length odd? Then we have to merge the next byte
12382 ++ * correctly (see above)
12383 ++ */
12384 ++ if (csum_len != (csum_len & (~1)))
12385 ++ overflowed = 1;
12386 ++
12387 ++ if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
12388 ++ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
12389 ++
12390 ++ /* If a 64-bit dss is present, we increase the offset
12391 ++ * by 4 bytes, as the high-order 32 bits will be added
12392 ++ * in the final csum_partial-call.
12393 ++ */
12394 ++ u32 offset = skb_transport_offset(tmp) +
12395 ++ TCP_SKB_CB(tmp)->dss_off;
12396 ++ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
12397 ++ offset += 4;
12398 ++
12399 ++ csum_tcp = skb_checksum(tmp, offset,
12400 ++ MPTCP_SUB_LEN_SEQ_CSUM,
12401 ++ csum_tcp);
12402 ++
12403 ++ csum_tcp = csum_partial(&data_seq,
12404 ++ sizeof(data_seq), csum_tcp);
12405 ++
12406 ++ dss_csum_added = 1; /* Just do it once */
12407 ++ }
12408 ++ last = tmp;
12409 ++ iter++;
12410 ++
12411 ++ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
12412 ++ !before(TCP_SKB_CB(tmp1)->seq,
12413 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12414 ++ break;
12415 ++ }
12416 ++
12417 ++ /* Now, checksum must be 0 */
12418 ++ if (unlikely(csum_fold(csum_tcp))) {
12419 ++ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
12420 ++ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq,
12421 ++ dss_csum_added, overflowed, iter);
12422 ++
12423 ++ tp->mptcp->send_mp_fail = 1;
12424 ++
12425 ++ /* map_data_seq is the data-seq number of the
12426 ++ * mapping we are currently checking
12427 ++ */
12428 ++ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
12429 ++
12430 ++ if (tp->mpcb->cnt_subflows > 1) {
12431 ++ mptcp_send_reset(sk);
12432 ++ ans = -1;
12433 ++ } else {
12434 ++ tp->mpcb->send_infinite_mapping = 1;
12435 ++
12436 ++ /* Need to purge the rcv-queue as it's no longer valid */
12437 ++ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
12438 ++ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
12439 ++ kfree_skb(tmp);
12440 ++ }
12441 ++
12442 ++ ans = 0;
12443 ++ }
12444 ++ }
12445 ++
12446 ++ return ans;
12447 ++}
12448 ++
12449 ++static inline void mptcp_prepare_skb(struct sk_buff *skb,
12450 ++ const struct sock *sk)
12451 ++{
12452 ++ const struct tcp_sock *tp = tcp_sk(sk);
12453 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12454 ++ u32 inc = 0;
12455 ++
12456 ++ /* If skb is the end of this mapping (end is always at mapping-boundary
12457 ++ * thanks to the splitting/trimming), then we need to increase
12458 ++ * data-end-seq by 1 if this is a data-fin.
12459 ++ *
12460 ++ * We need to do -1 because end_seq includes the subflow-FIN.
12461 ++ */
12462 ++ if (tp->mptcp->map_data_fin &&
12463 ++ (tcb->end_seq - (tcp_hdr(skb)->fin ? 1 : 0)) ==
12464 ++ (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12465 ++ inc = 1;
12466 ++
12467 ++ /* We manually set the fin-flag if it is a data-fin. For easy
12468 ++ * processing in tcp_recvmsg.
12469 ++ */
12470 ++ tcp_hdr(skb)->fin = 1;
12471 ++ } else {
12472 ++ /* We may have a subflow-fin with data but without data-fin */
12473 ++ tcp_hdr(skb)->fin = 0;
12474 ++ }
12475 ++
12476 ++ /* Adapt data-seq's to the packet itself. We kinda transform the
12477 ++ * dss-mapping to a per-packet granularity. This is necessary to
12478 ++ * correctly handle overlapping mappings coming from different
12479 ++ * subflows. Otherwise it would be a complete mess.
12480 ++ */
12481 ++ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
12482 ++ tcb->end_seq = tcb->seq + skb->len + inc;
12483 ++}
12484 ++
12485 ++/**
12486 ++ * @return: 1 if the segment has been eaten and can be suppressed,
12487 ++ * otherwise 0.
12488 ++ */
12489 ++static inline int mptcp_direct_copy(const struct sk_buff *skb,
12490 ++ struct sock *meta_sk)
12491 ++{
12492 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12493 ++ int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
12494 ++ int eaten = 0;
12495 ++
12496 ++ __set_current_state(TASK_RUNNING);
12497 ++
12498 ++ local_bh_enable();
12499 ++ if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
12500 ++ meta_tp->ucopy.len -= chunk;
12501 ++ meta_tp->copied_seq += chunk;
12502 ++ eaten = (chunk == skb->len);
12503 ++ tcp_rcv_space_adjust(meta_sk);
12504 ++ }
12505 ++ local_bh_disable();
12506 ++ return eaten;
12507 ++}
12508 ++
12509 ++static inline void mptcp_reset_mapping(struct tcp_sock *tp)
12510 ++{
12511 ++ tp->mptcp->map_data_len = 0;
12512 ++ tp->mptcp->map_data_seq = 0;
12513 ++ tp->mptcp->map_subseq = 0;
12514 ++ tp->mptcp->map_data_fin = 0;
12515 ++ tp->mptcp->mapping_present = 0;
12516 ++}
12517 ++
12518 ++/* The DSS-mapping received on the sk only covers the second half of the skb
12519 ++ * (cut at seq). We trim the head from the skb.
12520 ++ * Data will be freed upon kfree().
12521 ++ *
12522 ++ * Inspired by tcp_trim_head().
12523 ++ */
12524 ++static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
12525 ++{
12526 ++ int len = seq - TCP_SKB_CB(skb)->seq;
12527 ++ u32 new_seq = TCP_SKB_CB(skb)->seq + len;
12528 ++
12529 ++ if (len < skb_headlen(skb))
12530 ++ __skb_pull(skb, len);
12531 ++ else
12532 ++ __pskb_trim_head(skb, len - skb_headlen(skb));
12533 ++
12534 ++ TCP_SKB_CB(skb)->seq = new_seq;
12535 ++
12536 ++ skb->truesize -= len;
12537 ++ atomic_sub(len, &sk->sk_rmem_alloc);
12538 ++ sk_mem_uncharge(sk, len);
12539 ++}
12540 ++
12541 ++/* The DSS-mapping received on the sk only covers the first half of the skb
12542 ++ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
12543 ++ * as further packets may resolve the mapping of the second half of data.
12544 ++ *
12545 ++ * Inspired by tcp_fragment().
12546 ++ */
12547 ++static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
12548 ++{
12549 ++ struct sk_buff *buff;
12550 ++ int nsize;
12551 ++ int nlen, len;
12552 ++
12553 ++ len = seq - TCP_SKB_CB(skb)->seq;
12554 ++ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
12555 ++ if (nsize < 0)
12556 ++ nsize = 0;
12557 ++
12558 ++ /* Get a new skb... force flag on. */
12559 ++ buff = alloc_skb(nsize, GFP_ATOMIC);
12560 ++ if (buff == NULL)
12561 ++ return -ENOMEM;
12562 ++
12563 ++ skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
12564 ++ skb_reset_transport_header(buff);
12565 ++
12566 ++ tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
12567 ++ tcp_hdr(skb)->fin = 0;
12568 ++
12569 ++ * We absolutely need to call skb_set_owner_r before refreshing the
12570 ++ * truesize of buff, otherwise the moved data will be accounted twice.
12571 ++ */
12572 ++ skb_set_owner_r(buff, sk);
12573 ++ nlen = skb->len - len - nsize;
12574 ++ buff->truesize += nlen;
12575 ++ skb->truesize -= nlen;
12576 ++
12577 ++ /* Correct the sequence numbers. */
12578 ++ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
12579 ++ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
12580 ++ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
12581 ++
12582 ++ skb_split(skb, buff, len);
12583 ++
12584 ++ __skb_queue_after(&sk->sk_receive_queue, skb, buff);
12585 ++
12586 ++ return 0;
12587 ++}
12588 ++
12589 ++/* @return: 0 everything is fine. Just continue processing
12590 ++ * 1 subflow is broken, stop everything
12591 ++ * -1 this packet was broken - continue with the next one.
12592 ++ */
12593 ++static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
12594 ++{
12595 ++ struct tcp_sock *tp = tcp_sk(sk);
12596 ++
12597 ++ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
12598 ++ if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
12599 ++ !tp->mpcb->infinite_mapping_rcv) {
12600 ++ /* Remove a pure subflow-fin from the queue and increase
12601 ++ * copied_seq.
12602 ++ */
12603 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12604 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12605 ++ __kfree_skb(skb);
12606 ++ return -1;
12607 ++ }
12608 ++
12609 ++ /* If we are not yet fully established and do not know the mapping for
12610 ++ * this segment, this path has to fall back to infinite-mapping or be torn down.
12611 ++ */
12612 ++ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
12613 ++ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
12614 ++ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
12615 ++ __func__, tp->mpcb->mptcp_loc_token,
12616 ++ tp->mptcp->path_index, __builtin_return_address(0),
12617 ++ TCP_SKB_CB(skb)->seq);
12618 ++
12619 ++ if (!is_master_tp(tp)) {
12620 ++ mptcp_send_reset(sk);
12621 ++ return 1;
12622 ++ }
12623 ++
12624 ++ tp->mpcb->infinite_mapping_snd = 1;
12625 ++ tp->mpcb->infinite_mapping_rcv = 1;
12626 ++ /* We do a seamless fallback and should not send an infinite mapping. */
12627 ++ tp->mpcb->send_infinite_mapping = 0;
12628 ++ tp->mptcp->fully_established = 1;
12629 ++ }
12630 ++
12631 ++ /* Receiver-side becomes fully established when a whole rcv-window has
12632 ++ * been received without the need to fall back due to the previous
12633 ++ * condition.
12634 ++ */
12635 ++ if (!tp->mptcp->fully_established) {
12636 ++ tp->mptcp->init_rcv_wnd -= skb->len;
12637 ++ if (tp->mptcp->init_rcv_wnd < 0)
12638 ++ mptcp_become_fully_estab(sk);
12639 ++ }
12640 ++
12641 ++ return 0;
12642 ++}
12643 ++
12644 ++/* @return: 0 everything is fine. Just continue processing
12645 ++ * 1 subflow is broken, stop everything
12646 ++ * -1 this packet was broken - continue with the next one.
12647 ++ */
12648 ++static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
12649 ++{
12650 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12651 ++ struct mptcp_cb *mpcb = tp->mpcb;
12652 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12653 ++ u32 *ptr;
12654 ++ u32 data_seq, sub_seq, data_len, tcp_end_seq;
12655 ++
12656 ++ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
12657 ++ * in-order at the data-level. Thus data-seq-numbers can be inferred
12658 ++ * from what is expected at the data-level.
12659 ++ */
12660 ++ if (mpcb->infinite_mapping_rcv) {
12661 ++ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
12662 ++ tp->mptcp->map_subseq = tcb->seq;
12663 ++ tp->mptcp->map_data_len = skb->len;
12664 ++ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
12665 ++ tp->mptcp->mapping_present = 1;
12666 ++ return 0;
12667 ++ }
12668 ++
12669 ++ /* No mapping here? Exit - it is either already set or still on its way */
12670 ++ if (!mptcp_is_data_seq(skb)) {
12671 ++ /* Too many packets without a mapping - this subflow is broken */
12672 ++ if (!tp->mptcp->mapping_present &&
12673 ++ tp->rcv_nxt - tp->copied_seq > 65536) {
12674 ++ mptcp_send_reset(sk);
12675 ++ return 1;
12676 ++ }
12677 ++
12678 ++ return 0;
12679 ++ }
12680 ++
12681 ++ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
12682 ++ ptr++;
12683 ++ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
12684 ++ ptr++;
12685 ++ data_len = get_unaligned_be16(ptr);
12686 ++
12687 ++ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
12688 ++ * The draft sets it to 0, but we really would like to have the
12689 ++ * real value, to simplify the handling later in this
12690 ++ * function.
12691 ++ */
12692 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
12693 ++ sub_seq = TCP_SKB_CB(skb)->seq;
12694 ++
12695 ++ /* If there is already a mapping - we check if it maps with the current
12696 ++ * one. If not - we reset.
12697 ++ */
12698 ++ if (tp->mptcp->mapping_present &&
12699 ++ (data_seq != (u32)tp->mptcp->map_data_seq ||
12700 ++ sub_seq != tp->mptcp->map_subseq ||
12701 ++ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
12702 ++ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
12703 ++ /* Mapping in packet is different from what we want */
12704 ++ pr_err("%s Mappings do not match!\n", __func__);
12705 ++ pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
12706 ++ __func__, data_seq, (u32)tp->mptcp->map_data_seq,
12707 ++ sub_seq, tp->mptcp->map_subseq, data_len,
12708 ++ tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
12709 ++ tp->mptcp->map_data_fin);
12710 ++ mptcp_send_reset(sk);
12711 ++ return 1;
12712 ++ }
12713 ++
12714 ++ /* If the previous check was good, the current mapping is valid and we exit. */
12715 ++ if (tp->mptcp->mapping_present)
12716 ++ return 0;
12717 ++
12718 ++ /* Mapping not yet set on this subflow - we set it here! */
12719 ++
12720 ++ if (!data_len) {
12721 ++ mpcb->infinite_mapping_rcv = 1;
12722 ++ tp->mptcp->fully_established = 1;
12723 ++ * We need to repeat mp_fail's until the sender has fallen
12724 ++ * back to infinite-mapping - here we stop repeating it.
12725 ++ */
12726 ++ tp->mptcp->send_mp_fail = 0;
12727 ++
12728 ++ /* We have to fixup data_len - it must be the same as skb->len */
12729 ++ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
12730 ++ sub_seq = tcb->seq;
12731 ++
12732 ++ /* TODO kill all other subflows than this one */
12733 ++ /* data_seq and so on are set correctly */
12734 ++
12735 ++ /* At this point, the meta-ofo-queue has to be emptied,
12736 ++ * as the following data is guaranteed to be in-order at
12737 ++ * the data and subflow-level
12738 ++ */
12739 ++ mptcp_purge_ofo_queue(meta_tp);
12740 ++ }
12741 ++
12742 ++ /* We are sending mp-fail's and thus are in fallback mode.
12743 ++ * Ignore packets which do not announce the fallback and still
12744 ++ * want to provide a mapping.
12745 ++ */
12746 ++ if (tp->mptcp->send_mp_fail) {
12747 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12748 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12749 ++ __kfree_skb(skb);
12750 ++ return -1;
12751 ++ }
12752 ++
12753 ++ /* FIN increased the mapping-length by 1 */
12754 ++ if (mptcp_is_data_fin(skb))
12755 ++ data_len--;
12756 ++
12757 ++ /* Subflow-sequences of the packet must be
12758 ++ * (at least partially) part of the DSS-mapping's
12759 ++ * subflow-sequence-space.
12760 ++ *
12761 ++ * Basically the mapping is not valid, if either of the
12762 ++ * following conditions is true:
12763 ++ *
12764 ++ * 1. It's not a data_fin and
12765 ++ * MPTCP-sub_seq >= TCP-end_seq
12766 ++ *
12767 ++ * 2. It's a data_fin and TCP-end_seq > TCP-seq and
12768 ++ * MPTCP-sub_seq >= TCP-end_seq
12769 ++ *
12770 ++ * The previous two can be merged into:
12771 ++ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
12772 ++ * Because if it's not a data-fin, TCP-end_seq > TCP-seq
12773 ++ *
12774 ++ * 3. It's a data_fin and skb->len == 0 and
12775 ++ * MPTCP-sub_seq > TCP-end_seq
12776 ++ *
12777 ++ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
12778 ++ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
12779 ++ *
12780 ++ * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
12781 ++ */
12782 ++
12783 ++ /* subflow-fin is not part of the mapping - ignore it here ! */
12784 ++ tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
12785 ++ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
12786 ++ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
12787 ++ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
12788 ++ before(sub_seq, tp->copied_seq)) {
12789 ++ /* Subflow-sequences of the packet differ from what is in the
12790 ++ * packet's dss-mapping. The peer is misbehaving - reset
12791 ++ */
12792 ++ pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
12793 ++ "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u"
12794 ++ "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
12795 ++ skb->len, data_len, tp->copied_seq);
12796 ++ mptcp_send_reset(sk);
12797 ++ return 1;
12798 ++ }
12799 ++
12800 ++ /* Did the DSS have 64-bit seqnums? */
12801 ++ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
12802 ++ /* Wrapped around? */
12803 ++ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
12804 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
12805 ++ } else {
12806 ++ /* Else, access the default high-order bits */
12807 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
12808 ++ }
12809 ++ } else {
12810 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
12811 ++
12812 ++ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
12813 ++ /* We make sure that the data_seq is invalid.
12814 ++ * It will be dropped later.
12815 ++ */
12816 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12817 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12818 ++ }
12819 ++ }
12820 ++
12821 ++ tp->mptcp->map_data_len = data_len;
12822 ++ tp->mptcp->map_subseq = sub_seq;
12823 ++ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
12824 ++ tp->mptcp->mapping_present = 1;
12825 ++
12826 ++ return 0;
12827 ++}
12828 ++
12829 ++/* Similar to tcp_sequence(...) */
12830 ++static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
12831 ++ u64 data_seq, u64 end_data_seq)
12832 ++{
12833 ++ const struct mptcp_cb *mpcb = meta_tp->mpcb;
12834 ++ u64 rcv_wup64;
12835 ++
12836 ++ /* Wrap-around? */
12837 ++ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
12838 ++ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
12839 ++ meta_tp->rcv_wup;
12840 ++ } else {
12841 ++ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
12842 ++ meta_tp->rcv_wup);
12843 ++ }
12844 ++
12845 ++ return !before64(end_data_seq, rcv_wup64) &&
12846 ++ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
12847 ++}
12848 ++
12849 ++/* @return: 0 everything is fine. Just continue processing
12850 ++ * -1 this packet was broken - continue with the next one.
12851 ++ */
12852 ++static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
12853 ++{
12854 ++ struct tcp_sock *tp = tcp_sk(sk);
12855 ++ struct sk_buff *tmp, *tmp1;
12856 ++ u32 tcp_end_seq;
12857 ++
12858 ++ if (!tp->mptcp->mapping_present)
12859 ++ return 0;
12860 ++
12861 ++ /* either, the new skb gave us the mapping and the first segment
12862 ++ * in the sub-rcv-queue has to be trimmed ...
12863 ++ */
12864 ++ tmp = skb_peek(&sk->sk_receive_queue);
12865 ++ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
12866 ++ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
12867 ++ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
12868 ++
12869 ++ /* ... or the new skb (tail) has to be split at the end. */
12870 ++ tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
12871 ++ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12872 ++ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
12873 ++ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
12874 ++ /* TODO : maybe handle this here better.
12875 ++ * We now just force meta-retransmission.
12876 ++ */
12877 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12878 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12879 ++ __kfree_skb(skb);
12880 ++ return -1;
12881 ++ }
12882 ++ }
12883 ++
12884 ++ /* Now, remove old sk_buff's from the receive-queue.
12885 ++ * This may happen if the mapping has been lost for these segments and
12886 ++ * the next mapping has already been received.
12887 ++ */
12888 ++ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
12889 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12890 ++ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
12891 ++ break;
12892 ++
12893 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12894 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12895 ++
12896 ++ /* Impossible that we could free skb here, because its
12897 ++ * mapping is known to be valid from previous checks
12898 ++ */
12899 ++ __kfree_skb(tmp1);
12900 ++ }
12901 ++ }
12902 ++
12903 ++ return 0;
12904 ++}
12905 ++
12906 ++/* @return: 0 everything is fine. Just continue processing
12907 ++ * 1 subflow is broken, stop everything
12908 ++ * -1 this mapping has been put in the meta-receive-queue
12909 ++ * -2 this mapping has been eaten by the application
12910 ++ */
12911 ++static int mptcp_queue_skb(struct sock *sk)
12912 ++{
12913 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12914 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
12915 ++ struct mptcp_cb *mpcb = tp->mpcb;
12916 ++ struct sk_buff *tmp, *tmp1;
12917 ++ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
12918 ++ bool data_queued = false;
12919 ++
12920 ++ /* Have we not yet received the full mapping? */
12921 ++ if (!tp->mptcp->mapping_present ||
12922 ++ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12923 ++ return 0;
12924 ++
12925 ++ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
12926 ++ * OR
12927 ++ * This mapping is out of window
12928 ++ */
12929 ++ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
12930 ++ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
12931 ++ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
12932 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12933 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12934 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12935 ++ __kfree_skb(tmp1);
12936 ++
12937 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12938 ++ !before(TCP_SKB_CB(tmp)->seq,
12939 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12940 ++ break;
12941 ++ }
12942 ++
12943 ++ mptcp_reset_mapping(tp);
12944 ++
12945 ++ return -1;
12946 ++ }
12947 ++
12948 ++ /* Record it, because we want to send our data_fin on the same path */
12949 ++ if (tp->mptcp->map_data_fin) {
12950 ++ mpcb->dfin_path_index = tp->mptcp->path_index;
12951 ++ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
12952 ++ }
12953 ++
12954 ++ /* Verify the checksum */
12955 ++ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
12956 ++ int ret = mptcp_verif_dss_csum(sk);
12957 ++
12958 ++ if (ret <= 0) {
12959 ++ mptcp_reset_mapping(tp);
12960 ++ return 1;
12961 ++ }
12962 ++ }
12963 ++
12964 ++ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
12965 ++ /* Segments have to go to the meta-ofo-queue */
12966 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12967 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12968 ++ mptcp_prepare_skb(tmp1, sk);
12969 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12970 ++ /* MUST be done here, because fragstolen may be true later.
12971 ++ * Then, kfree_skb_partial will not account the memory.
12972 ++ */
12973 ++ skb_orphan(tmp1);
12974 ++
12975 ++ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
12976 ++ mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
12977 ++ else
12978 ++ __kfree_skb(tmp1);
12979 ++
12980 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12981 ++ !before(TCP_SKB_CB(tmp)->seq,
12982 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12983 ++ break;
12984 ++ }
12985 ++ tcp_enter_quickack_mode(sk);
12986 ++ } else {
12987 ++ /* Ready for the meta-rcv-queue */
12988 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12989 ++ int eaten = 0;
12990 ++ bool copied_early = false;
12991 ++ bool fragstolen = false;
12992 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
12993 ++
12994 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12995 ++ mptcp_prepare_skb(tmp1, sk);
12996 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12997 ++ /* MUST be done here, because fragstolen may be true.
12998 ++ * Then, kfree_skb_partial will not account the memory.
12999 ++ */
13000 ++ skb_orphan(tmp1);
13001 ++
13002 ++ /* This segment has already been received */
13003 ++ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
13004 ++ __kfree_skb(tmp1);
13005 ++ goto next;
13006 ++ }
13007 ++
13008 ++#ifdef CONFIG_NET_DMA
13009 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13010 ++ meta_tp->ucopy.task == current &&
13011 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13012 ++ tmp1->len <= meta_tp->ucopy.len &&
13013 ++ sock_owned_by_user(meta_sk) &&
13014 ++ tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
13015 ++ copied_early = true;
13016 ++ eaten = 1;
13017 ++ }
13018 ++#endif
13019 ++
13020 ++ /* Is direct copy possible ? */
13021 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13022 ++ meta_tp->ucopy.task == current &&
13023 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13024 ++ meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
13025 ++ !copied_early)
13026 ++ eaten = mptcp_direct_copy(tmp1, meta_sk);
13027 ++
13028 ++ if (mpcb->in_time_wait) /* In time-wait, do not receive data */
13029 ++ eaten = 1;
13030 ++
13031 ++ if (!eaten)
13032 ++ eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
13033 ++
13034 ++ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
13035 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
13036 ++
13037 ++#ifdef CONFIG_NET_DMA
13038 ++ if (copied_early)
13039 ++ meta_tp->cleanup_rbuf(meta_sk, tmp1->len);
13040 ++#endif
13041 ++
13042 ++ if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
13043 ++ mptcp_fin(meta_sk);
13044 ++
13045 ++ /* Check if this fills a gap in the ofo queue */
13046 ++ if (!skb_queue_empty(&meta_tp->out_of_order_queue))
13047 ++ mptcp_ofo_queue(meta_sk);
13048 ++
13049 ++#ifdef CONFIG_NET_DMA
13050 ++ if (copied_early)
13051 ++ __skb_queue_tail(&meta_sk->sk_async_wait_queue,
13052 ++ tmp1);
13053 ++ else
13054 ++#endif
13055 ++ if (eaten)
13056 ++ kfree_skb_partial(tmp1, fragstolen);
13057 ++
13058 ++ data_queued = true;
13059 ++next:
13060 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
13061 ++ !before(TCP_SKB_CB(tmp)->seq,
13062 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
13063 ++ break;
13064 ++ }
13065 ++ }
13066 ++
13067 ++ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
13068 ++ mptcp_reset_mapping(tp);
13069 ++
13070 ++ return data_queued ? -1 : -2;
13071 ++}
13072 ++
13073 ++void mptcp_data_ready(struct sock *sk)
13074 ++{
13075 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13076 ++ struct sk_buff *skb, *tmp;
13077 ++ int queued = 0;
13078 ++
13079 ++ /* restart before the check, because mptcp_fin might have changed the
13080 ++ * state.
13081 ++ */
13082 ++restart:
13083 ++ /* If the meta cannot receive data, there is no point in pushing data.
13084 ++ * If we are in time-wait, we may still be waiting for the final FIN.
13085 ++ * So, we should proceed with the processing.
13086 ++ */
13087 ++ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
13088 ++ skb_queue_purge(&sk->sk_receive_queue);
13089 ++ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
13090 ++ goto exit;
13091 ++ }
13092 ++
13093 ++ /* Iterate over all segments, detect their mapping (if we don't have
13094 ++ * one yet), validate them and push everything one level higher.
13095 ++ */
13096 ++ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
13097 ++ int ret;
13098 ++ /* Pre-validation - e.g., early fallback */
13099 ++ ret = mptcp_prevalidate_skb(sk, skb);
13100 ++ if (ret < 0)
13101 ++ goto restart;
13102 ++ else if (ret > 0)
13103 ++ break;
13104 ++
13105 ++ /* Set the current mapping */
13106 ++ ret = mptcp_detect_mapping(sk, skb);
13107 ++ if (ret < 0)
13108 ++ goto restart;
13109 ++ else if (ret > 0)
13110 ++ break;
13111 ++
13112 ++ /* Validation */
13113 ++ if (mptcp_validate_mapping(sk, skb) < 0)
13114 ++ goto restart;
13115 ++
13116 ++ /* Push a level higher */
13117 ++ ret = mptcp_queue_skb(sk);
13118 ++ if (ret < 0) {
13119 ++ if (ret == -1)
13120 ++ queued = ret;
13121 ++ goto restart;
13122 ++ } else if (ret == 0) {
13123 ++ continue;
13124 ++ } else { /* ret == 1 */
13125 ++ break;
13126 ++ }
13127 ++ }
13128 ++
13129 ++exit:
13130 ++ if (tcp_sk(sk)->close_it) {
13131 ++ tcp_send_ack(sk);
13132 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
13133 ++ }
13134 ++
13135 ++ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
13136 ++ meta_sk->sk_data_ready(meta_sk);
13137 ++}
13138 ++
13139 ++
13140 ++int mptcp_check_req(struct sk_buff *skb, struct net *net)
13141 ++{
13142 ++ const struct tcphdr *th = tcp_hdr(skb);
13143 ++ struct sock *meta_sk = NULL;
13144 ++
13145 ++ /* MPTCP structures not initialized */
13146 ++ if (mptcp_init_failed)
13147 ++ return 0;
13148 ++
13149 ++ if (skb->protocol == htons(ETH_P_IP))
13150 ++ meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
13151 ++ ip_hdr(skb)->daddr, net);
13152 ++#if IS_ENABLED(CONFIG_IPV6)
13153 ++ else /* IPv6 */
13154 ++ meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
13155 ++ &ipv6_hdr(skb)->daddr, net);
13156 ++#endif /* CONFIG_IPV6 */
13157 ++
13158 ++ if (!meta_sk)
13159 ++ return 0;
13160 ++
13161 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13162 ++
13163 ++ bh_lock_sock_nested(meta_sk);
13164 ++ if (sock_owned_by_user(meta_sk)) {
13165 ++ skb->sk = meta_sk;
13166 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13167 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13168 ++ bh_unlock_sock(meta_sk);
13169 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13170 ++ sock_put(meta_sk); /* Taken by mptcp_search_req */
13171 ++ kfree_skb(skb);
13172 ++ return 1;
13173 ++ }
13174 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13175 ++ tcp_v4_do_rcv(meta_sk, skb);
13176 ++#if IS_ENABLED(CONFIG_IPV6)
13177 ++ } else { /* IPv6 */
13178 ++ tcp_v6_do_rcv(meta_sk, skb);
13179 ++#endif /* CONFIG_IPV6 */
13180 ++ }
13181 ++ bh_unlock_sock(meta_sk);
13182 ++ sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
13183 ++ return 1;
13184 ++}
13185 ++
13186 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb)
13187 ++{
13188 ++ const struct tcphdr *th = tcp_hdr(skb);
13189 ++ unsigned char *ptr;
13190 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13191 ++
13192 ++ /* Jump through the options to check whether JOIN is there */
13193 ++ ptr = (unsigned char *)(th + 1);
13194 ++ while (length > 0) {
13195 ++ int opcode = *ptr++;
13196 ++ int opsize;
13197 ++
13198 ++ switch (opcode) {
13199 ++ case TCPOPT_EOL:
13200 ++ return NULL;
13201 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13202 ++ length--;
13203 ++ continue;
13204 ++ default:
13205 ++ opsize = *ptr++;
13206 ++ if (opsize < 2) /* "silly options" */
13207 ++ return NULL;
13208 ++ if (opsize > length)
13209 ++ return NULL; /* don't parse partial options */
13210 ++ if (opcode == TCPOPT_MPTCP &&
13211 ++ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
13212 ++ return (struct mp_join *)(ptr - 2);
13213 ++ }
13214 ++ ptr += opsize - 2;
13215 ++ length -= opsize;
13216 ++ }
13217 ++ }
13218 ++ return NULL;
13219 ++}
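
As a companion to mptcp_find_join() above, here is a self-contained user-space rendition of the same TCP option-walking pattern (EOL/NOP handling, length sanity checks, no partial options). The buffer contents and the kind-30 payload bytes below are made up for the demonstration; only the walking logic mirrors the function above:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static const uint8_t *find_tcp_option(const uint8_t *opts, int length,
				       uint8_t kind)
{
	const uint8_t *ptr = opts;

	while (length > 0) {
		uint8_t opcode = *ptr++;
		uint8_t opsize;

		if (opcode == 0)		/* TCPOPT_EOL */
			return NULL;
		if (opcode == 1) {		/* TCPOPT_NOP */
			length--;
			continue;
		}
		opsize = *ptr++;
		if (opsize < 2)			/* "silly options" */
			return NULL;
		if (opsize > length)
			return NULL;		/* don't parse partial options */
		if (opcode == kind)
			return ptr - 2;		/* points at the kind byte */
		ptr += opsize - 2;
		length -= opsize;
	}
	return NULL;
}

int main(void)
{
	/* NOP, NOP, MSS (kind 2, len 4, 1460), then a kind-30 option with a
	 * hypothetical 2-byte payload.
	 */
	const uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4, 30, 4, 0x10, 0x00 };
	const uint8_t *opt = find_tcp_option(opts, sizeof(opts), 30);

	printf("kind-30 option at offset %td\n", opt ? opt - opts : -1);
	return 0;
}
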
13220 ++
13221 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
13222 ++{
13223 ++ const struct mptcp_cb *mpcb;
13224 ++ struct sock *meta_sk;
13225 ++ u32 token;
13226 ++ bool meta_v4;
13227 ++ struct mp_join *join_opt = mptcp_find_join(skb);
13228 ++ if (!join_opt)
13229 ++ return 0;
13230 ++
13231 ++ /* MPTCP structures were not initialized, so return error */
13232 ++ if (mptcp_init_failed)
13233 ++ return -1;
13234 ++
13235 ++ token = join_opt->u.syn.token;
13236 ++ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
13237 ++ if (!meta_sk) {
13238 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13239 ++ return -1;
13240 ++ }
13241 ++
13242 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13243 ++ if (meta_v4) {
13244 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13245 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13246 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13247 ++ return -1;
13248 ++ }
13249 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13250 ++ inet6_sk(meta_sk)->ipv6only) {
13251 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13252 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13253 ++ return -1;
13254 ++ }
13255 ++
13256 ++ mpcb = tcp_sk(meta_sk)->mpcb;
13257 ++ if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
13258 ++ /* We are in fallback-mode on the reception-side -
13259 ++ * no new subflows!
13260 ++ */
13261 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13262 ++ return -1;
13263 ++ }
13264 ++
13265 ++ /* Coming from time-wait-sock processing in tcp_v4_rcv.
13266 ++ * We have to deschedule it before continuing, because otherwise
13267 ++ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
13268 ++ */
13269 ++ if (tw) {
13270 ++ inet_twsk_deschedule(tw, &tcp_death_row);
13271 ++ inet_twsk_put(tw);
13272 ++ }
13273 ++
13274 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13275 ++ /* OK, this is a new syn/join, let's create a new open request and
13276 ++ * send syn+ack
13277 ++ */
13278 ++ bh_lock_sock_nested(meta_sk);
13279 ++ if (sock_owned_by_user(meta_sk)) {
13280 ++ skb->sk = meta_sk;
13281 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13282 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13283 ++ bh_unlock_sock(meta_sk);
13284 ++ NET_INC_STATS_BH(sock_net(meta_sk),
13285 ++ LINUX_MIB_TCPBACKLOGDROP);
13286 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13287 ++ kfree_skb(skb);
13288 ++ return 1;
13289 ++ }
13290 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13291 ++ tcp_v4_do_rcv(meta_sk, skb);
13292 ++#if IS_ENABLED(CONFIG_IPV6)
13293 ++ } else {
13294 ++ tcp_v6_do_rcv(meta_sk, skb);
13295 ++#endif /* CONFIG_IPV6 */
13296 ++ }
13297 ++ bh_unlock_sock(meta_sk);
13298 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13299 ++ return 1;
13300 ++}
13301 ++
13302 ++int mptcp_do_join_short(struct sk_buff *skb,
13303 ++ const struct mptcp_options_received *mopt,
13304 ++ struct net *net)
13305 ++{
13306 ++ struct sock *meta_sk;
13307 ++ u32 token;
13308 ++ bool meta_v4;
13309 ++
13310 ++ token = mopt->mptcp_rem_token;
13311 ++ meta_sk = mptcp_hash_find(net, token);
13312 ++ if (!meta_sk) {
13313 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13314 ++ return -1;
13315 ++ }
13316 ++
13317 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13318 ++ if (meta_v4) {
13319 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13320 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13321 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13322 ++ return -1;
13323 ++ }
13324 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13325 ++ inet6_sk(meta_sk)->ipv6only) {
13326 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13327 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13328 ++ return -1;
13329 ++ }
13330 ++
13331 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13332 ++
13333 ++ /* OK, this is a new syn/join, let's create a new open request and
13334 ++ * send syn+ack
13335 ++ */
13336 ++ bh_lock_sock(meta_sk);
13337 ++
13338 ++ /* This check is also done in mptcp_vX_do_rcv. But there we cannot
13339 ++ * call tcp_vX_send_reset, because we already hold two socket locks
13340 ++ * (the listener and the meta from above).
13341 ++ *
13342 ++ * And the send-reset will try to take yet another one (ip_send_reply).
13343 ++ * Thus, we propagate the reset up to tcp_rcv_state_process.
13344 ++ */
13345 ++ if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
13346 ++ tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
13347 ++ meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
13348 ++ bh_unlock_sock(meta_sk);
13349 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13350 ++ return -1;
13351 ++ }
13352 ++
13353 ++ if (sock_owned_by_user(meta_sk)) {
13354 ++ skb->sk = meta_sk;
13355 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13356 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
13357 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13358 ++ else
13359 ++ /* Must make sure that upper layers won't free the
13360 ++ * skb if it is added to the backlog-queue.
13361 ++ */
13362 ++ skb_get(skb);
13363 ++ } else {
13364 ++ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
13365 ++ * the skb will finally be freed by tcp_v4_do_rcv (where we are
13366 ++ * coming from)
13367 ++ */
13368 ++ skb_get(skb);
13369 ++ if (skb->protocol == htons(ETH_P_IP)) {
13370 ++ tcp_v4_do_rcv(meta_sk, skb);
13371 ++#if IS_ENABLED(CONFIG_IPV6)
13372 ++ } else { /* IPv6 */
13373 ++ tcp_v6_do_rcv(meta_sk, skb);
13374 ++#endif /* CONFIG_IPV6 */
13375 ++ }
13376 ++ }
13377 ++
13378 ++ bh_unlock_sock(meta_sk);
13379 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13380 ++ return 0;
13381 ++}
13382 ++
13383 ++/**
13384 ++ * Equivalent of tcp_fin() for MPTCP
13385 ++ * Can be called only once the FIN is validly part of the
13386 ++ * data seqnum space, i.e. not earlier, while there are still holes.
13387 ++ */
13388 ++void mptcp_fin(struct sock *meta_sk)
13389 ++{
13390 ++ struct sock *sk = NULL, *sk_it;
13391 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13392 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
13393 ++
13394 ++ mptcp_for_each_sk(mpcb, sk_it) {
13395 ++ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
13396 ++ sk = sk_it;
13397 ++ break;
13398 ++ }
13399 ++ }
13400 ++
13401 ++ if (!sk || sk->sk_state == TCP_CLOSE)
13402 ++ sk = mptcp_select_ack_sock(meta_sk);
13403 ++
13404 ++ inet_csk_schedule_ack(sk);
13405 ++
13406 ++ meta_sk->sk_shutdown |= RCV_SHUTDOWN;
13407 ++ sock_set_flag(meta_sk, SOCK_DONE);
13408 ++
13409 ++ switch (meta_sk->sk_state) {
13410 ++ case TCP_SYN_RECV:
13411 ++ case TCP_ESTABLISHED:
13412 ++ /* Move to CLOSE_WAIT */
13413 ++ tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
13414 ++ inet_csk(sk)->icsk_ack.pingpong = 1;
13415 ++ break;
13416 ++
13417 ++ case TCP_CLOSE_WAIT:
13418 ++ case TCP_CLOSING:
13419 ++ /* Received a retransmission of the FIN, do
13420 ++ * nothing.
13421 ++ */
13422 ++ break;
13423 ++ case TCP_LAST_ACK:
13424 ++ /* RFC793: Remain in the LAST-ACK state. */
13425 ++ break;
13426 ++
13427 ++ case TCP_FIN_WAIT1:
13428 ++ /* This case occurs when a simultaneous close
13429 ++ * happens, we must ack the received FIN and
13430 ++ * enter the CLOSING state.
13431 ++ */
13432 ++ tcp_send_ack(sk);
13433 ++ tcp_set_state(meta_sk, TCP_CLOSING);
13434 ++ break;
13435 ++ case TCP_FIN_WAIT2:
13436 ++ /* Received a FIN -- send ACK and enter TIME_WAIT. */
13437 ++ tcp_send_ack(sk);
13438 ++ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
13439 ++ break;
13440 ++ default:
13441 ++ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
13442 ++ * cases we should never reach this piece of code.
13443 ++ */
13444 ++ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
13445 ++ meta_sk->sk_state);
13446 ++ break;
13447 ++ }
13448 ++
13449 ++ /* It _is_ possible that we have something out-of-order _after_ the FIN.
13450 ++ * We should probably reset in this case. For now, drop them.
13451 ++ */
13452 ++ mptcp_purge_ofo_queue(meta_tp);
13453 ++ sk_mem_reclaim(meta_sk);
13454 ++
13455 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
13456 ++ meta_sk->sk_state_change(meta_sk);
13457 ++
13458 ++ /* Do not send POLL_HUP for half duplex close. */
13459 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
13460 ++ meta_sk->sk_state == TCP_CLOSE)
13461 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
13462 ++ else
13463 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
13464 ++ }
13465 ++
13466 ++ return;
13467 ++}
13468 ++
13469 ++static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
13470 ++{
13471 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13472 ++ struct sk_buff *skb;
13473 ++
13474 ++ if (!meta_tp->packets_out)
13475 ++ return;
13476 ++
13477 ++ tcp_for_write_queue(skb, meta_sk) {
13478 ++ if (skb == tcp_send_head(meta_sk))
13479 ++ break;
13480 ++
13481 ++ if (mptcp_retransmit_skb(meta_sk, skb))
13482 ++ return;
13483 ++
13484 ++ if (skb == tcp_write_queue_head(meta_sk))
13485 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
13486 ++ inet_csk(meta_sk)->icsk_rto,
13487 ++ TCP_RTO_MAX);
13488 ++ }
13489 ++}
13490 ++
13491 ++/* Handle the DATA_ACK */
13492 ++static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
13493 ++{
13494 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13495 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
13496 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13497 ++ u32 prior_snd_una = meta_tp->snd_una;
13498 ++ int prior_packets;
13499 ++ u32 nwin, data_ack, data_seq;
13500 ++ u16 data_len = 0;
13501 ++
13502 ++ /* A valid packet came in - subflow is operational again */
13503 ++ tp->pf = 0;
13504 ++
13505 ++ /* Even if there is no data-ack, we stop retransmitting,
13506 ++ * except if this is a SYN/ACK; then it is just a retransmission.
13507 ++ */
13508 ++ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
13509 ++ tp->mptcp->pre_established = 0;
13510 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
13511 ++ }
13512 ++
13513 ++ /* If we are in infinite mapping mode, rx_opt.data_ack has been
13514 ++ * set by mptcp_clean_rtx_infinite.
13515 ++ */
13516 ++ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
13517 ++ goto exit;
13518 ++
13519 ++ data_ack = tp->mptcp->rx_opt.data_ack;
13520 ++
13521 ++ if (unlikely(!tp->mptcp->fully_established) &&
13522 ++ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
13523 ++ /* As soon as a subflow-data-ack (not acking syn, thus snt_isn + 1)
13524 ++ * includes a data-ack, we are fully established
13525 ++ */
13526 ++ mptcp_become_fully_estab(sk);
13527 ++
13528 ++ /* Get the data_seq */
13529 ++ if (mptcp_is_data_seq(skb)) {
13530 ++ data_seq = tp->mptcp->rx_opt.data_seq;
13531 ++ data_len = tp->mptcp->rx_opt.data_len;
13532 ++ } else {
13533 ++ data_seq = meta_tp->snd_wl1;
13534 ++ }
13535 ++
13536 ++ /* If the ack is older than previous acks
13537 ++ * then we can probably ignore it.
13538 ++ */
13539 ++ if (before(data_ack, prior_snd_una))
13540 ++ goto exit;
13541 ++
13542 ++ /* If the ack includes data we haven't sent yet, discard
13543 ++ * this segment (RFC793 Section 3.9).
13544 ++ */
13545 ++ if (after(data_ack, meta_tp->snd_nxt))
13546 ++ goto exit;
13547 ++
13548 ++ /*** Now, update the window - inspired by tcp_ack_update_window ***/
13549 ++ nwin = ntohs(tcp_hdr(skb)->window);
13550 ++
13551 ++ if (likely(!tcp_hdr(skb)->syn))
13552 ++ nwin <<= tp->rx_opt.snd_wscale;
13553 ++
13554 ++ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
13555 ++ tcp_update_wl(meta_tp, data_seq);
13556 ++
13557 ++ /* Draft v09, Section 3.3.5:
13558 ++ * [...] It should only update its local receive window values
13559 ++ * when the largest sequence number allowed (i.e. DATA_ACK +
13560 ++ * receive window) increases. [...]
13561 ++ */
13562 ++ if (meta_tp->snd_wnd != nwin &&
13563 ++ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
13564 ++ meta_tp->snd_wnd = nwin;
13565 ++
13566 ++ if (nwin > meta_tp->max_window)
13567 ++ meta_tp->max_window = nwin;
13568 ++ }
13569 ++ }
13570 ++ /*** Done, update the window ***/
13571 ++
13572 ++ /* We passed data and got it acked, remove any soft error
13573 ++ * log. Something worked...
13574 ++ */
13575 ++ sk->sk_err_soft = 0;
13576 ++ inet_csk(meta_sk)->icsk_probes_out = 0;
13577 ++ meta_tp->rcv_tstamp = tcp_time_stamp;
13578 ++ prior_packets = meta_tp->packets_out;
13579 ++ if (!prior_packets)
13580 ++ goto no_queue;
13581 ++
13582 ++ meta_tp->snd_una = data_ack;
13583 ++
13584 ++ mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
13585 ++
13586 ++ /* We are in loss-state, and something got acked, retransmit the whole
13587 ++ * queue now!
13588 ++ */
13589 ++ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
13590 ++ after(data_ack, prior_snd_una)) {
13591 ++ mptcp_xmit_retransmit_queue(meta_sk);
13592 ++ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
13593 ++ }
13594 ++
13595 ++ /* Simplified version of tcp_new_space, because the snd-buffer
13596 ++ * is handled by all the subflows.
13597 ++ */
13598 ++ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
13599 ++ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
13600 ++ if (meta_sk->sk_socket &&
13601 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
13602 ++ meta_sk->sk_write_space(meta_sk);
13603 ++ }
13604 ++
13605 ++ if (meta_sk->sk_state != TCP_ESTABLISHED &&
13606 ++ mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
13607 ++ return;
13608 ++
13609 ++exit:
13610 ++ mptcp_push_pending_frames(meta_sk);
13611 ++
13612 ++ return;
13613 ++
13614 ++no_queue:
13615 ++ if (tcp_send_head(meta_sk))
13616 ++ tcp_ack_probe(meta_sk);
13617 ++
13618 ++ mptcp_push_pending_frames(meta_sk);
13619 ++
13620 ++ return;
13621 ++}
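
The window handling above enforces the rule quoted from the MPTCP draft. As a rough stand-alone illustration (not kernel code; before() and the window-edge helper are re-modelled locally, and only the edge check is shown), the advertisement is taken over only when DATA_ACK plus the advertised window pushes the right edge (snd_una + snd_wnd) forward:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool before(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) < 0;
}

static uint32_t wnd_end(uint32_t snd_una, uint32_t snd_wnd)
{
	return snd_una + snd_wnd;	/* right edge of the current window */
}

static bool may_raise_window(uint32_t snd_una, uint32_t snd_wnd,
			     uint32_t data_ack, uint32_t nwin)
{
	/* mirrors: !before(data_ack + nwin, tcp_wnd_end(meta_tp)) */
	return !before(data_ack + nwin, wnd_end(snd_una, snd_wnd));
}

int main(void)
{
	uint32_t snd_una = 1000, snd_wnd = 4000;

	/* New edge 1500 + 4000 = 5500 >= 5000: take the new window (1). */
	printf("%d\n", may_raise_window(snd_una, snd_wnd, 1500, 4000));
	/* New edge 1500 + 3000 = 4500 < 5000: keep the old one (0). */
	printf("%d\n", may_raise_window(snd_una, snd_wnd, 1500, 3000));
	return 0;
}
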
13622 ++
13623 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
13624 ++{
13625 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
13626 ++
13627 ++ if (!tp->mpcb->infinite_mapping_snd)
13628 ++ return;
13629 ++
13630 ++ /* The difference between both write_seq's represents the offset between
13631 ++ * data-sequence and subflow-sequence. As we are infinite, this must
13632 ++ * match.
13633 ++ *
13634 ++ * Thus, from this difference we can infer the meta snd_una.
13635 ++ */
13636 ++ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
13637 ++ tp->snd_una;
13638 ++
13639 ++ mptcp_data_ack(sk, skb);
13640 ++}
13641 ++
13642 ++/**** static functions used by mptcp_parse_options */
13643 ++
13644 ++static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
13645 ++{
13646 ++ struct sock *sk_it, *tmpsk;
13647 ++
13648 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
13649 ++ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
13650 ++ mptcp_reinject_data(sk_it, 0);
13651 ++ sk_it->sk_err = ECONNRESET;
13652 ++ if (tcp_need_reset(sk_it->sk_state))
13653 ++ tcp_sk(sk_it)->ops->send_active_reset(sk_it,
13654 ++ GFP_ATOMIC);
13655 ++ mptcp_sub_force_close(sk_it);
13656 ++ }
13657 ++ }
13658 ++}
13659 ++
13660 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
13661 ++ struct mptcp_options_received *mopt,
13662 ++ const struct sk_buff *skb)
13663 ++{
13664 ++ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
13665 ++
13666 ++ /* If the socket is mp-capable we would have a mopt. */
13667 ++ if (!mopt)
13668 ++ return;
13669 ++
13670 ++ switch (mp_opt->sub) {
13671 ++ case MPTCP_SUB_CAPABLE:
13672 ++ {
13673 ++ const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
13674 ++
13675 ++ if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
13676 ++ opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
13677 ++ mptcp_debug("%s: mp_capable: bad option size %d\n",
13678 ++ __func__, opsize);
13679 ++ break;
13680 ++ }
13681 ++
13682 ++ if (!sysctl_mptcp_enabled)
13683 ++ break;
13684 ++
13685 ++ /* We only support MPTCP version 0 */
13686 ++ if (mpcapable->ver != 0)
13687 ++ break;
13688 ++
13689 ++ /* MPTCP-RFC 6824:
13690 ++ * "If receiving a message with the 'B' flag set to 1, and this
13691 ++ * is not understood, then this SYN MUST be silently ignored;
13692 ++ */
13693 ++ if (mpcapable->b) {
13694 ++ mopt->drop_me = 1;
13695 ++ break;
13696 ++ }
13697 ++
13698 ++ /* MPTCP-RFC 6824:
13699 ++ * "An implementation that only supports this method MUST set
13700 ++ * bit "H" to 1, and bits "C" through "G" to 0."
13701 ++ */
13702 ++ if (!mpcapable->h)
13703 ++ break;
13704 ++
13705 ++ mopt->saw_mpc = 1;
13706 ++ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
13707 ++
13708 ++ if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
13709 ++ mopt->mptcp_key = mpcapable->sender_key;
13710 ++
13711 ++ break;
13712 ++ }
13713 ++ case MPTCP_SUB_JOIN:
13714 ++ {
13715 ++ const struct mp_join *mpjoin = (struct mp_join *)ptr;
13716 ++
13717 ++ if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
13718 ++ opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
13719 ++ opsize != MPTCP_SUB_LEN_JOIN_ACK) {
13720 ++ mptcp_debug("%s: mp_join: bad option size %d\n",
13721 ++ __func__, opsize);
13722 ++ break;
13723 ++ }
13724 ++
13725 ++ /* saw_mpc must be set, because in tcp_check_req we assume that
13726 ++ * it is set to support falling back to reg. TCP if a rexmitted
13727 ++ * SYN has no MP_CAPABLE or MP_JOIN
13728 ++ */
13729 ++ switch (opsize) {
13730 ++ case MPTCP_SUB_LEN_JOIN_SYN:
13731 ++ mopt->is_mp_join = 1;
13732 ++ mopt->saw_mpc = 1;
13733 ++ mopt->low_prio = mpjoin->b;
13734 ++ mopt->rem_id = mpjoin->addr_id;
13735 ++ mopt->mptcp_rem_token = mpjoin->u.syn.token;
13736 ++ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
13737 ++ break;
13738 ++ case MPTCP_SUB_LEN_JOIN_SYNACK:
13739 ++ mopt->saw_mpc = 1;
13740 ++ mopt->low_prio = mpjoin->b;
13741 ++ mopt->rem_id = mpjoin->addr_id;
13742 ++ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
13743 ++ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
13744 ++ break;
13745 ++ case MPTCP_SUB_LEN_JOIN_ACK:
13746 ++ mopt->saw_mpc = 1;
13747 ++ mopt->join_ack = 1;
13748 ++ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
13749 ++ break;
13750 ++ }
13751 ++ break;
13752 ++ }
13753 ++ case MPTCP_SUB_DSS:
13754 ++ {
13755 ++ const struct mp_dss *mdss = (struct mp_dss *)ptr;
13756 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13757 ++
13758 ++ /* We check opsize for the csum and non-csum case. We do this,
13759 ++ * because the draft says that the csum SHOULD be ignored if
13760 ++ * it has not been negotiated in the MP_CAPABLE but still is
13761 ++ * present in the data.
13762 ++ *
13763 ++ * It will get ignored later in mptcp_queue_skb.
13764 ++ */
13765 ++ if (opsize != mptcp_sub_len_dss(mdss, 0) &&
13766 ++ opsize != mptcp_sub_len_dss(mdss, 1)) {
13767 ++ mptcp_debug("%s: mp_dss: bad option size %d\n",
13768 ++ __func__, opsize);
13769 ++ break;
13770 ++ }
13771 ++
13772 ++ ptr += 4;
13773 ++
13774 ++ if (mdss->A) {
13775 ++ tcb->mptcp_flags |= MPTCPHDR_ACK;
13776 ++
13777 ++ if (mdss->a) {
13778 ++ mopt->data_ack = (u32) get_unaligned_be64(ptr);
13779 ++ ptr += MPTCP_SUB_LEN_ACK_64;
13780 ++ } else {
13781 ++ mopt->data_ack = get_unaligned_be32(ptr);
13782 ++ ptr += MPTCP_SUB_LEN_ACK;
13783 ++ }
13784 ++ }
13785 ++
13786 ++ tcb->dss_off = (ptr - skb_transport_header(skb));
13787 ++
13788 ++ if (mdss->M) {
13789 ++ if (mdss->m) {
13790 ++ u64 data_seq64 = get_unaligned_be64(ptr);
13791 ++
13792 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
13793 ++ mopt->data_seq = (u32) data_seq64;
13794 ++
13795 ++ ptr += 12; /* 64-bit dseq + subseq */
13796 ++ } else {
13797 ++ mopt->data_seq = get_unaligned_be32(ptr);
13798 ++ ptr += 8; /* 32-bit dseq + subseq */
13799 ++ }
13800 ++ mopt->data_len = get_unaligned_be16(ptr);
13801 ++
13802 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
13803 ++
13804 ++ /* Is a check-sum present? */
13805 ++ if (opsize == mptcp_sub_len_dss(mdss, 1))
13806 ++ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
13807 ++
13808 ++ /* DATA_FIN only possible with DSS-mapping */
13809 ++ if (mdss->F)
13810 ++ tcb->mptcp_flags |= MPTCPHDR_FIN;
13811 ++ }
13812 ++
13813 ++ break;
13814 ++ }
13815 ++ case MPTCP_SUB_ADD_ADDR:
13816 ++ {
13817 ++#if IS_ENABLED(CONFIG_IPV6)
13818 ++ const struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13819 ++
13820 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13821 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
13822 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
13823 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
13824 ++#else
13825 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13826 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
13827 ++#endif /* CONFIG_IPV6 */
13828 ++ mptcp_debug("%s: mp_add_addr: bad option size %d\n",
13829 ++ __func__, opsize);
13830 ++ break;
13831 ++ }
13832 ++
13833 ++ /* We have to manually parse the options if we got two of them. */
13834 ++ if (mopt->saw_add_addr) {
13835 ++ mopt->more_add_addr = 1;
13836 ++ break;
13837 ++ }
13838 ++ mopt->saw_add_addr = 1;
13839 ++ mopt->add_addr_ptr = ptr;
13840 ++ break;
13841 ++ }
13842 ++ case MPTCP_SUB_REMOVE_ADDR:
13843 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
13844 ++ mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
13845 ++ __func__, opsize);
13846 ++ break;
13847 ++ }
13848 ++
13849 ++ if (mopt->saw_rem_addr) {
13850 ++ mopt->more_rem_addr = 1;
13851 ++ break;
13852 ++ }
13853 ++ mopt->saw_rem_addr = 1;
13854 ++ mopt->rem_addr_ptr = ptr;
13855 ++ break;
13856 ++ case MPTCP_SUB_PRIO:
13857 ++ {
13858 ++ const struct mp_prio *mpprio = (struct mp_prio *)ptr;
13859 ++
13860 ++ if (opsize != MPTCP_SUB_LEN_PRIO &&
13861 ++ opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
13862 ++ mptcp_debug("%s: mp_prio: bad option size %d\n",
13863 ++ __func__, opsize);
13864 ++ break;
13865 ++ }
13866 ++
13867 ++ mopt->saw_low_prio = 1;
13868 ++ mopt->low_prio = mpprio->b;
13869 ++
13870 ++ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
13871 ++ mopt->saw_low_prio = 2;
13872 ++ mopt->prio_addr_id = mpprio->addr_id;
13873 ++ }
13874 ++ break;
13875 ++ }
13876 ++ case MPTCP_SUB_FAIL:
13877 ++ if (opsize != MPTCP_SUB_LEN_FAIL) {
13878 ++ mptcp_debug("%s: mp_fail: bad option size %d\n",
13879 ++ __func__, opsize);
13880 ++ break;
13881 ++ }
13882 ++ mopt->mp_fail = 1;
13883 ++ break;
13884 ++ case MPTCP_SUB_FCLOSE:
13885 ++ if (opsize != MPTCP_SUB_LEN_FCLOSE) {
13886 ++ mptcp_debug("%s: mp_fclose: bad option size %d\n",
13887 ++ __func__, opsize);
13888 ++ break;
13889 ++ }
13890 ++
13891 ++ mopt->mp_fclose = 1;
13892 ++ mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
13893 ++
13894 ++ break;
13895 ++ default:
13896 ++ mptcp_debug("%s: Received unkown subtype: %d\n",
13897 ++ __func__, mp_opt->sub);
13898 ++ break;
13899 ++ }
13900 ++}
13901 ++
13902 ++/** Parse only MPTCP options */
13903 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
13904 ++ struct mptcp_options_received *mopt)
13905 ++{
13906 ++ const struct tcphdr *th = tcp_hdr(skb);
13907 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13908 ++ const unsigned char *ptr = (const unsigned char *)(th + 1);
13909 ++
13910 ++ while (length > 0) {
13911 ++ int opcode = *ptr++;
13912 ++ int opsize;
13913 ++
13914 ++ switch (opcode) {
13915 ++ case TCPOPT_EOL:
13916 ++ return;
13917 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13918 ++ length--;
13919 ++ continue;
13920 ++ default:
13921 ++ opsize = *ptr++;
13922 ++ if (opsize < 2) /* "silly options" */
13923 ++ return;
13924 ++ if (opsize > length)
13925 ++ return; /* don't parse partial options */
13926 ++ if (opcode == TCPOPT_MPTCP)
13927 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
13928 ++ }
13929 ++ ptr += opsize - 2;
13930 ++ length -= opsize;
13931 ++ }
13932 ++}
13933 ++
13934 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time)
13935 ++{
13936 ++ struct mptcp_cb *mpcb = tp->mpcb;
13937 ++ struct sock *sk;
13938 ++ u32 rtt_max = 0;
13939 ++
13940 ++ /* In MPTCP, we take the max delay across all flows,
13941 ++ * in order to take into account meta-reordering buffers.
13942 ++ */
13943 ++ mptcp_for_each_sk(mpcb, sk) {
13944 ++ if (!mptcp_sk_can_recv(sk))
13945 ++ continue;
13946 ++
13947 ++ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
13948 ++ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
13949 ++ }
13950 ++ if (time < (rtt_max >> 3) || !rtt_max)
13951 ++ return 1;
13952 ++
13953 ++ return 0;
13954 ++}
13955 ++
13956 ++static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
13957 ++{
13958 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13959 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13960 ++ __be16 port = 0;
13961 ++ union inet_addr addr;
13962 ++ sa_family_t family;
13963 ++
13964 ++ if (mpadd->ipver == 4) {
13965 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
13966 ++ port = mpadd->u.v4.port;
13967 ++ family = AF_INET;
13968 ++ addr.in = mpadd->u.v4.addr;
13969 ++#if IS_ENABLED(CONFIG_IPV6)
13970 ++ } else if (mpadd->ipver == 6) {
13971 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
13972 ++ port = mpadd->u.v6.port;
13973 ++ family = AF_INET6;
13974 ++ addr.in6 = mpadd->u.v6.addr;
13975 ++#endif /* CONFIG_IPV6 */
13976 ++ } else {
13977 ++ return;
13978 ++ }
13979 ++
13980 ++ if (mpcb->pm_ops->add_raddr)
13981 ++ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
13982 ++}
13983 ++
13984 ++static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
13985 ++{
13986 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
13987 ++ int i;
13988 ++ u8 rem_id;
13989 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13990 ++
13991 ++ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
13992 ++ rem_id = (&mprem->addrs_id)[i];
13993 ++
13994 ++ if (mpcb->pm_ops->rem_raddr)
13995 ++ mpcb->pm_ops->rem_raddr(mpcb, rem_id);
13996 ++ mptcp_send_reset_rem_id(mpcb, rem_id);
13997 ++ }
13998 ++}
13999 ++
14000 ++static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
14001 ++{
14002 ++ struct tcphdr *th = tcp_hdr(skb);
14003 ++ unsigned char *ptr;
14004 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
14005 ++
14006 ++ /* Jump through the options to check whether ADD_ADDR is there */
14007 ++ ptr = (unsigned char *)(th + 1);
14008 ++ while (length > 0) {
14009 ++ int opcode = *ptr++;
14010 ++ int opsize;
14011 ++
14012 ++ switch (opcode) {
14013 ++ case TCPOPT_EOL:
14014 ++ return;
14015 ++ case TCPOPT_NOP:
14016 ++ length--;
14017 ++ continue;
14018 ++ default:
14019 ++ opsize = *ptr++;
14020 ++ if (opsize < 2)
14021 ++ return;
14022 ++ if (opsize > length)
14023 ++ return; /* don't parse partial options */
14024 ++ if (opcode == TCPOPT_MPTCP &&
14025 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
14026 ++#if IS_ENABLED(CONFIG_IPV6)
14027 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
14028 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14029 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
14030 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
14031 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
14032 ++#else
14033 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14034 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
14035 ++#endif /* CONFIG_IPV6 */
14036 ++ goto cont;
14037 ++
14038 ++ mptcp_handle_add_addr(ptr, sk);
14039 ++ }
14040 ++ if (opcode == TCPOPT_MPTCP &&
14041 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
14042 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
14043 ++ goto cont;
14044 ++
14045 ++ mptcp_handle_rem_addr(ptr, sk);
14046 ++ }
14047 ++cont:
14048 ++ ptr += opsize - 2;
14049 ++ length -= opsize;
14050 ++ }
14051 ++ }
14052 ++ return;
14053 ++}
14054 ++
14055 ++static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
14056 ++{
14057 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
14058 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14059 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14060 ++
14061 ++ if (unlikely(mptcp->rx_opt.mp_fail)) {
14062 ++ mptcp->rx_opt.mp_fail = 0;
14063 ++
14064 ++ if (!th->rst && !mpcb->infinite_mapping_snd) {
14065 ++ struct sock *sk_it;
14066 ++
14067 ++ mpcb->send_infinite_mapping = 1;
14068 ++ /* We resend everything that has not been acknowledged */
14069 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14070 ++
14071 ++ /* We artificially restart the whole send-queue. Thus,
14072 ++ * it is as if no packets are in flight
14073 ++ */
14074 ++ tcp_sk(meta_sk)->packets_out = 0;
14075 ++
14076 ++ /* If the snd_nxt already wrapped around, we have to
14077 ++ * undo the wrapping, as we are restarting from snd_una
14078 ++ * on.
14079 ++ */
14080 ++ if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
14081 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
14082 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
14083 ++ }
14084 ++ tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
14085 ++
14086 ++ /* Trigger a sending on the meta. */
14087 ++ mptcp_push_pending_frames(meta_sk);
14088 ++
14089 ++ mptcp_for_each_sk(mpcb, sk_it) {
14090 ++ if (sk != sk_it)
14091 ++ mptcp_sub_force_close(sk_it);
14092 ++ }
14093 ++ }
14094 ++
14095 ++ return 0;
14096 ++ }
14097 ++
14098 ++ if (unlikely(mptcp->rx_opt.mp_fclose)) {
14099 ++ struct sock *sk_it, *tmpsk;
14100 ++
14101 ++ mptcp->rx_opt.mp_fclose = 0;
14102 ++ if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
14103 ++ return 0;
14104 ++
14105 ++ if (tcp_need_reset(sk->sk_state))
14106 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
14107 ++
14108 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
14109 ++ mptcp_sub_force_close(sk_it);
14110 ++
14111 ++ tcp_reset(meta_sk);
14112 ++
14113 ++ return 1;
14114 ++ }
14115 ++
14116 ++ return 0;
14117 ++}
14118 ++
14119 ++static inline void mptcp_path_array_check(struct sock *meta_sk)
14120 ++{
14121 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14122 ++
14123 ++ if (unlikely(mpcb->list_rcvd)) {
14124 ++ mpcb->list_rcvd = 0;
14125 ++ if (mpcb->pm_ops->new_remote_address)
14126 ++ mpcb->pm_ops->new_remote_address(meta_sk);
14127 ++ }
14128 ++}
14129 ++
14130 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
14131 ++ const struct sk_buff *skb)
14132 ++{
14133 ++ struct tcp_sock *tp = tcp_sk(sk);
14134 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
14135 ++
14136 ++ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
14137 ++ return 0;
14138 ++
14139 ++ if (mptcp_mp_fail_rcvd(sk, th))
14140 ++ return 1;
14141 ++
14142 ++ /* RFC 6824, Section 3.3:
14143 ++ * If a checksum is not present when its use has been negotiated, the
14144 ++ * receiver MUST close the subflow with a RST as it is considered broken.
14145 ++ */
14146 ++ if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
14147 ++ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
14148 ++ if (tcp_need_reset(sk->sk_state))
14149 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
14150 ++
14151 ++ mptcp_sub_force_close(sk);
14152 ++ return 1;
14153 ++ }
14154 ++
14155 ++ /* We have to acknowledge retransmissions of the third
14156 ++ * ack.
14157 ++ */
14158 ++ if (mopt->join_ack) {
14159 ++ tcp_send_delayed_ack(sk);
14160 ++ mopt->join_ack = 0;
14161 ++ }
14162 ++
14163 ++ if (mopt->saw_add_addr || mopt->saw_rem_addr) {
14164 ++ if (mopt->more_add_addr || mopt->more_rem_addr) {
14165 ++ mptcp_parse_addropt(skb, sk);
14166 ++ } else {
14167 ++ if (mopt->saw_add_addr)
14168 ++ mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
14169 ++ if (mopt->saw_rem_addr)
14170 ++ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
14171 ++ }
14172 ++
14173 ++ mopt->more_add_addr = 0;
14174 ++ mopt->saw_add_addr = 0;
14175 ++ mopt->more_rem_addr = 0;
14176 ++ mopt->saw_rem_addr = 0;
14177 ++ }
14178 ++ if (mopt->saw_low_prio) {
14179 ++ if (mopt->saw_low_prio == 1) {
14180 ++ tp->mptcp->rcv_low_prio = mopt->low_prio;
14181 ++ } else {
14182 ++ struct sock *sk_it;
14183 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
14184 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
14185 ++ if (mptcp->rem_id == mopt->prio_addr_id)
14186 ++ mptcp->rcv_low_prio = mopt->low_prio;
14187 ++ }
14188 ++ }
14189 ++ mopt->saw_low_prio = 0;
14190 ++ }
14191 ++
14192 ++ mptcp_data_ack(sk, skb);
14193 ++
14194 ++ mptcp_path_array_check(mptcp_meta_sk(sk));
14195 ++ /* Socket may have been mp_killed by a REMOVE_ADDR */
14196 ++ if (tp->mp_killed)
14197 ++ return 1;
14198 ++
14199 ++ return 0;
14200 ++}
14201 ++
14202 ++/* In case of fastopen, some data can already be in the write queue.
14203 ++ * We need to update the sequence number of the segments as they
14204 ++ * were initially TCP sequence numbers.
14205 ++ */
14206 ++static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
14207 ++{
14208 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14209 ++ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
14210 ++ struct sk_buff *skb;
14211 ++ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
14212 ++
14213 ++ /* There should only be one skb in write queue: the data not
14214 ++ * acknowledged in the SYN+ACK. In this case, we need to map
14215 ++ * this data to data sequence numbers.
14216 ++ */
14217 ++ skb_queue_walk(&meta_sk->sk_write_queue, skb) {
14218 ++ /* If the server only acknowledges partially the data sent in
14219 ++ * the SYN, we need to trim the acknowledged part because
14220 ++ * we don't want to retransmit this already received data.
14221 ++ * When we reach this point, tcp_ack() has already cleaned up
14222 ++ * fully acked segments. However, tcp trims partially acked
14223 ++ * segments only when retransmitting. Since MPTCP comes into
14224 ++ * play only now, we will fake an initial transmit, and
14225 ++ * retransmit_skb() will not be called. The following fragment
14226 ++ * comes from __tcp_retransmit_skb().
14227 ++ */
14228 ++ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
14229 ++ BUG_ON(before(TCP_SKB_CB(skb)->end_seq,
14230 ++ master_tp->snd_una));
14231 ++ /* tcp_trim_head can only return ENOMEM if the skb is
14232 ++ * cloned, which is not the case here (see
14233 ++ * tcp_send_syn_data).
14234 ++ */
14235 ++ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
14236 ++ TCP_SKB_CB(skb)->seq));
14237 ++ }
14238 ++
14239 ++ TCP_SKB_CB(skb)->seq += new_mapping;
14240 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
14241 ++ }
14242 ++
14243 ++ /* We can advance write_seq by the number of bytes unacknowledged
14244 ++ * and that were mapped in the previous loop.
14245 ++ */
14246 ++ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
14247 ++
14248 ++ /* The packets from the master_sk will be appended to it later.
14249 ++ * Until that time, its write queue is empty, and
14250 ++ * write_seq must align with snd_una.
14251 ++ */
14252 ++ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
14253 ++ master_tp->packets_out = 0;
14254 ++
14255 ++ /* Although these data have already been sent over the subsk,
14256 ++ * they have never been sent over the meta_sk, so we rewind
14257 ++ * the send_head so that tcp considers it as an initial send
14258 ++ * (instead of a retransmit).
14259 ++ */
14260 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14261 ++}
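
To make the remapping above concrete, here is a small stand-alone sketch (not from the patch; it models only the sequence bookkeeping, not the actual payload trimming done by tcp_trim_head, and all names and numbers are made up) of how a TFO segment numbered in subflow space is cut back to its unacknowledged part and shifted into data-sequence space:

#include <stdint.h>
#include <stdio.h>

struct seg {
	uint32_t seq;
	uint32_t end_seq;
};

static void remap_tfo_segment(struct seg *s, uint32_t master_snd_una,
			      uint32_t meta_write_seq)
{
	uint32_t new_mapping = meta_write_seq - master_snd_una;

	/* Drop the part the server already acknowledged in the SYN/ACK
	 * (the kernel additionally trims the skb payload itself).
	 */
	if ((int32_t)(s->seq - master_snd_una) < 0)
		s->seq = master_snd_una;

	/* Renumber into the meta (data-sequence) space. */
	s->seq += new_mapping;
	s->end_seq += new_mapping;
}

int main(void)
{
	/* 100 bytes sent on the SYN; the server acked the first 40 of them. */
	struct seg s = { .seq = 1001, .end_seq = 1101 };
	uint32_t master_snd_una = 1041;	/* subflow-level ack point */
	uint32_t meta_write_seq = 1;	/* meta sequence space starts here */

	remap_tfo_segment(&s, master_snd_una, meta_write_seq);
	printf("data-seq range: %u-%u\n",
	       (unsigned)s.seq, (unsigned)s.end_seq);	/* 1-61 */
	return 0;
}
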
14262 ++
14263 ++/* The skptr is needed, because if we become MPTCP-capable, we have to switch
14264 ++ * from meta-socket to master-socket.
14265 ++ *
14266 ++ * @return: 1 - we want to reset this connection
14267 ++ * 2 - we want to discard the received syn/ack
14268 ++ * 0 - everything is fine - continue
14269 ++ */
14270 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
14271 ++ const struct sk_buff *skb,
14272 ++ const struct mptcp_options_received *mopt)
14273 ++{
14274 ++ struct tcp_sock *tp = tcp_sk(sk);
14275 ++
14276 ++ if (mptcp(tp)) {
14277 ++ u8 hash_mac_check[20];
14278 ++ struct mptcp_cb *mpcb = tp->mpcb;
14279 ++
14280 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
14281 ++ (u8 *)&mpcb->mptcp_loc_key,
14282 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14283 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14284 ++ (u32 *)hash_mac_check);
14285 ++ if (memcmp(hash_mac_check,
14286 ++ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
14287 ++ mptcp_sub_force_close(sk);
14288 ++ return 1;
14289 ++ }
14290 ++
14291 ++ /* Set this flag in order to postpone data sending
14292 ++ * until the 4th ack arrives.
14293 ++ */
14294 ++ tp->mptcp->pre_established = 1;
14295 ++ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
14296 ++
14297 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
14298 ++ (u8 *)&mpcb->mptcp_rem_key,
14299 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14300 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14301 ++ (u32 *)&tp->mptcp->sender_mac[0]);
14302 ++
14303 ++ } else if (mopt->saw_mpc) {
14304 ++ struct sock *meta_sk = sk;
14305 ++
14306 ++ if (mptcp_create_master_sk(sk, mopt->mptcp_key,
14307 ++ ntohs(tcp_hdr(skb)->window)))
14308 ++ return 2;
14309 ++
14310 ++ sk = tcp_sk(sk)->mpcb->master_sk;
14311 ++ *skptr = sk;
14312 ++ tp = tcp_sk(sk);
14313 ++
14314 ++ /* If fastopen was used data might be in the send queue. We
14315 ++ * need to update their sequence number to MPTCP-level seqno.
14316 ++ * Note that it can happen in rare cases that fastopen_req is
14317 ++ * NULL and syn_data is 0 but fastopen indeed occurred and
14318 ++ * data has been queued in the write queue (but not sent).
14319 ++ * Example of such rare cases: connect is non-blocking and
14320 ++ * TFO is configured to work without cookies.
14321 ++ */
14322 ++ if (!skb_queue_empty(&meta_sk->sk_write_queue))
14323 ++ mptcp_rcv_synsent_fastopen(meta_sk);
14324 ++
14325 ++ /* -1, because the SYN consumed 1 byte. In case of TFO, we
14326 ++ * start the subflow-sequence number as if the data of the SYN
14327 ++ * is not part of any mapping.
14328 ++ */
14329 ++ tp->mptcp->snt_isn = tp->snd_una - 1;
14330 ++ tp->mpcb->dss_csum = mopt->dss_csum;
14331 ++ tp->mptcp->include_mpc = 1;
14332 ++
14333 ++ /* Ensure that fastopen is handled at the meta-level. */
14334 ++ tp->fastopen_req = NULL;
14335 ++
14336 ++ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
14337 ++ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
14338 ++
14339 ++ /* hold in sk_clone_lock due to initialization to 2 */
14340 ++ sock_put(sk);
14341 ++ } else {
14342 ++ tp->request_mptcp = 0;
14343 ++
14344 ++ if (tp->inside_tk_table)
14345 ++ mptcp_hash_remove(tp);
14346 ++ }
14347 ++
14348 ++ if (mptcp(tp))
14349 ++ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
14350 ++
14351 ++ return 0;
14352 ++}
14353 ++
14354 ++bool mptcp_should_expand_sndbuf(const struct sock *sk)
14355 ++{
14356 ++ const struct sock *sk_it;
14357 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
14358 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14359 ++ int cnt_backups = 0;
14360 ++ int backup_available = 0;
14361 ++
14362 ++ /* We circumvent this check in tcp_check_space, because we want to
14363 ++ * always call sk_write_space. So, we reproduce the check here.
14364 ++ */
14365 ++ if (!meta_sk->sk_socket ||
14366 ++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
14367 ++ return false;
14368 ++
14369 ++ /* If the user specified a specific send buffer setting, do
14370 ++ * not modify it.
14371 ++ */
14372 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14373 ++ return false;
14374 ++
14375 ++ /* If we are under global TCP memory pressure, do not expand. */
14376 ++ if (sk_under_memory_pressure(meta_sk))
14377 ++ return false;
14378 ++
14379 ++ /* If we are under soft global TCP memory pressure, do not expand. */
14380 ++ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
14381 ++ return false;
14382 ++
14383 ++
14384 ++ /* For MPTCP we look for a subsocket that could send data.
14385 ++ * If we found one, then we update the send-buffer.
14386 ++ */
14387 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
14388 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
14389 ++
14390 ++ if (!mptcp_sk_can_send(sk_it))
14391 ++ continue;
14392 ++
14393 ++ /* Backup-flows have to be counted - if there is no other
14394 ++ * subflow we take the backup-flow into account.
14395 ++ */
14396 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio)
14397 ++ cnt_backups++;
14398 ++
14399 ++ if (tp_it->packets_out < tp_it->snd_cwnd) {
14400 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
14401 ++ backup_available = 1;
14402 ++ continue;
14403 ++ }
14404 ++ return true;
14405 ++ }
14406 ++ }
14407 ++
14408 ++ /* Backup-flow is available for sending - update send-buffer */
14409 ++ if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
14410 ++ return true;
14411 ++ return false;
14412 ++}
14413 ++
14414 ++void mptcp_init_buffer_space(struct sock *sk)
14415 ++{
14416 ++ struct tcp_sock *tp = tcp_sk(sk);
14417 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14418 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14419 ++ int space;
14420 ++
14421 ++ tcp_init_buffer_space(sk);
14422 ++
14423 ++ if (is_master_tp(tp)) {
14424 ++ meta_tp->rcvq_space.space = meta_tp->rcv_wnd;
14425 ++ meta_tp->rcvq_space.time = tcp_time_stamp;
14426 ++ meta_tp->rcvq_space.seq = meta_tp->copied_seq;
14427 ++
14428 ++ /* If there is only one subflow, we just use regular TCP
14429 ++ * autotuning. User-locks are handled already by
14430 ++ * tcp_init_buffer_space
14431 ++ */
14432 ++ meta_tp->window_clamp = tp->window_clamp;
14433 ++ meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
14434 ++ meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
14435 ++ meta_sk->sk_sndbuf = sk->sk_sndbuf;
14436 ++
14437 ++ return;
14438 ++ }
14439 ++
14440 ++ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
14441 ++ goto snd_buf;
14442 ++
14443 ++ /* Adding a new subflow to the rcv-buffer space. We make a simple
14444 ++ * addition, to give some space to allow traffic on the new subflow.
14445 ++ * Autotuning will increase it further later on.
14446 ++ */
14447 ++ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
14448 ++ if (space > meta_sk->sk_rcvbuf) {
14449 ++ meta_tp->window_clamp += tp->window_clamp;
14450 ++ meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
14451 ++ meta_sk->sk_rcvbuf = space;
14452 ++ }
14453 ++
14454 ++snd_buf:
14455 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14456 ++ return;
14457 ++
14458 ++ /* Adding a new subflow to the send-buffer space. We make a simple
14459 ++ * addition, to give some space to allow traffic on the new subflow.
14460 ++ * Autotuning will increase it further later on.
14461 ++ */
14462 ++ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]);
14463 ++ if (space > meta_sk->sk_sndbuf) {
14464 ++ meta_sk->sk_sndbuf = space;
14465 ++ meta_sk->sk_write_space(meta_sk);
14466 ++ }
14467 ++}
14468 ++
14469 ++void mptcp_tcp_set_rto(struct sock *sk)
14470 ++{
14471 ++ tcp_set_rto(sk);
14472 ++ mptcp_set_rto(sk);
14473 ++}
14474 +diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
14475 +new file mode 100644
14476 +index 000000000000..1183d1305d35
14477 +--- /dev/null
14478 ++++ b/net/mptcp/mptcp_ipv4.c
14479 +@@ -0,0 +1,483 @@
14480 ++/*
14481 ++ * MPTCP implementation - IPv4-specific functions
14482 ++ *
14483 ++ * Initial Design & Implementation:
14484 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14485 ++ *
14486 ++ * Current Maintainer:
14487 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
14488 ++ *
14489 ++ * Additional authors:
14490 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14491 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14492 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14493 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14494 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14495 ++ * Andreas Ripke <ripke@××××××.eu>
14496 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14497 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14498 ++ * John Ronan <jronan@××××.org>
14499 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14500 ++ * Brandon Heller <brandonh@××××××××.edu>
14501 ++ *
14502 ++ *
14503 ++ * This program is free software; you can redistribute it and/or
14504 ++ * modify it under the terms of the GNU General Public License
14505 ++ * as published by the Free Software Foundation; either version
14506 ++ * 2 of the License, or (at your option) any later version.
14507 ++ */
14508 ++
14509 ++#include <linux/export.h>
14510 ++#include <linux/ip.h>
14511 ++#include <linux/list.h>
14512 ++#include <linux/skbuff.h>
14513 ++#include <linux/spinlock.h>
14514 ++#include <linux/tcp.h>
14515 ++
14516 ++#include <net/inet_common.h>
14517 ++#include <net/inet_connection_sock.h>
14518 ++#include <net/mptcp.h>
14519 ++#include <net/mptcp_v4.h>
14520 ++#include <net/request_sock.h>
14521 ++#include <net/tcp.h>
14522 ++
14523 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14524 ++{
14525 ++ u32 hash[MD5_DIGEST_WORDS];
14526 ++
14527 ++ hash[0] = (__force u32)saddr;
14528 ++ hash[1] = (__force u32)daddr;
14529 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14530 ++ hash[3] = mptcp_seed++;
14531 ++
14532 ++ md5_transform(hash, mptcp_secret);
14533 ++
14534 ++ return hash[0];
14535 ++}
14536 ++
14537 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14538 ++{
14539 ++ u32 hash[MD5_DIGEST_WORDS];
14540 ++
14541 ++ hash[0] = (__force u32)saddr;
14542 ++ hash[1] = (__force u32)daddr;
14543 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14544 ++ hash[3] = mptcp_seed++;
14545 ++
14546 ++ md5_transform(hash, mptcp_secret);
14547 ++
14548 ++ return *((u64 *)hash);
14549 ++}
14550 ++
14551 ++
14552 ++static void mptcp_v4_reqsk_destructor(struct request_sock *req)
14553 ++{
14554 ++ mptcp_reqsk_destructor(req);
14555 ++
14556 ++ tcp_v4_reqsk_destructor(req);
14557 ++}
14558 ++
14559 ++static int mptcp_v4_init_req(struct request_sock *req, struct sock *sk,
14560 ++ struct sk_buff *skb)
14561 ++{
14562 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14563 ++ mptcp_reqsk_init(req, skb);
14564 ++
14565 ++ return 0;
14566 ++}
14567 ++
14568 ++static int mptcp_v4_join_init_req(struct request_sock *req, struct sock *sk,
14569 ++ struct sk_buff *skb)
14570 ++{
14571 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
14572 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14573 ++ union inet_addr addr;
14574 ++ int loc_id;
14575 ++ bool low_prio = false;
14576 ++
14577 ++ /* We need to do this as early as possible, because if we fail later
14578 ++ * (e.g., in get_local_id), reqsk_free tries to remove the
14579 ++ * request-socket from the htb in mptcp_hash_request_remove, as pprev
14580 ++ * may be different from NULL.
14581 ++ */
14582 ++ mtreq->hash_entry.pprev = NULL;
14583 ++
14584 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14585 ++
14586 ++ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
14587 ++ ip_hdr(skb)->daddr,
14588 ++ tcp_hdr(skb)->source,
14589 ++ tcp_hdr(skb)->dest);
14590 ++ addr.ip = inet_rsk(req)->ir_loc_addr;
14591 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio);
14592 ++ if (loc_id == -1)
14593 ++ return -1;
14594 ++ mtreq->loc_id = loc_id;
14595 ++ mtreq->low_prio = low_prio;
14596 ++
14597 ++ mptcp_join_reqsk_init(mpcb, req, skb);
14598 ++
14599 ++ return 0;
14600 ++}
14601 ++
14602 ++/* Similar to tcp_request_sock_ops */
14603 ++struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
14604 ++ .family = PF_INET,
14605 ++ .obj_size = sizeof(struct mptcp_request_sock),
14606 ++ .rtx_syn_ack = tcp_rtx_synack,
14607 ++ .send_ack = tcp_v4_reqsk_send_ack,
14608 ++ .destructor = mptcp_v4_reqsk_destructor,
14609 ++ .send_reset = tcp_v4_send_reset,
14610 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
14611 ++};
14612 ++
14613 ++static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
14614 ++ struct request_sock *req,
14615 ++ const unsigned long timeout)
14616 ++{
14617 ++ const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14618 ++ inet_rsk(req)->ir_rmt_port,
14619 ++ 0, MPTCP_HASH_SIZE);
14620 ++ /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
14621 ++ * want to reset the keepalive-timer (responsible for retransmitting
14622 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
14623 ++ * overload the keepalive timer. Also, it's not a big deal, because the
14624 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
14625 ++ * if the third ACK gets lost, the client will handle the retransmission
14626 ++	 * anyway. If our SYN/ACK gets lost, the client will retransmit the
14627 ++ * SYN.
14628 ++ */
14629 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
14630 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
14631 ++ const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14632 ++ inet_rsk(req)->ir_rmt_port,
14633 ++ lopt->hash_rnd, lopt->nr_table_entries);
14634 ++
14635 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
14636 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
14637 ++ mptcp_reset_synack_timer(meta_sk, timeout);
14638 ++
14639 ++ rcu_read_lock();
14640 ++ spin_lock(&mptcp_reqsk_hlock);
14641 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
14642 ++ spin_unlock(&mptcp_reqsk_hlock);
14643 ++ rcu_read_unlock();
14644 ++}
14645 ++
14646 ++/* Similar to tcp_v4_conn_request */
14647 ++static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
14648 ++{
14649 ++ return tcp_conn_request(&mptcp_request_sock_ops,
14650 ++ &mptcp_join_request_sock_ipv4_ops,
14651 ++ meta_sk, skb);
14652 ++}
14653 ++
14654 ++/* We only process join requests here (either the SYN or the final ACK). */
14655 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
14656 ++{
14657 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14658 ++ struct sock *child, *rsk = NULL;
14659 ++ int ret;
14660 ++
14661 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
14662 ++ struct tcphdr *th = tcp_hdr(skb);
14663 ++ const struct iphdr *iph = ip_hdr(skb);
14664 ++ struct sock *sk;
14665 ++
14666 ++ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
14667 ++ iph->saddr, th->source, iph->daddr,
14668 ++ th->dest, inet_iif(skb));
14669 ++
14670 ++ if (!sk) {
14671 ++ kfree_skb(skb);
14672 ++ return 0;
14673 ++ }
14674 ++ if (is_meta_sk(sk)) {
14675 ++			WARN("%s Did not find a sub-sk - but found the meta!\n", __func__);
14676 ++ kfree_skb(skb);
14677 ++ sock_put(sk);
14678 ++ return 0;
14679 ++ }
14680 ++
14681 ++ if (sk->sk_state == TCP_TIME_WAIT) {
14682 ++ inet_twsk_put(inet_twsk(sk));
14683 ++ kfree_skb(skb);
14684 ++ return 0;
14685 ++ }
14686 ++
14687 ++ ret = tcp_v4_do_rcv(sk, skb);
14688 ++ sock_put(sk);
14689 ++
14690 ++ return ret;
14691 ++ }
14692 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
14693 ++
14694 ++ /* Has been removed from the tk-table. Thus, no new subflows.
14695 ++ *
14696 ++ * Check for close-state is necessary, because we may have been closed
14697 ++ * without passing by mptcp_close().
14698 ++ *
14699 ++ * When falling back, no new subflows are allowed either.
14700 ++ */
14701 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
14702 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
14703 ++ goto reset_and_discard;
14704 ++
14705 ++ child = tcp_v4_hnd_req(meta_sk, skb);
14706 ++
14707 ++ if (!child)
14708 ++ goto discard;
14709 ++
14710 ++ if (child != meta_sk) {
14711 ++ sock_rps_save_rxhash(child, skb);
14712 ++ /* We don't call tcp_child_process here, because we hold
14713 ++ * already the meta-sk-lock and are sure that it is not owned
14714 ++ * by the user.
14715 ++ */
14716 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
14717 ++ bh_unlock_sock(child);
14718 ++ sock_put(child);
14719 ++ if (ret) {
14720 ++ rsk = child;
14721 ++ goto reset_and_discard;
14722 ++ }
14723 ++ } else {
14724 ++ if (tcp_hdr(skb)->syn) {
14725 ++ mptcp_v4_join_request(meta_sk, skb);
14726 ++ goto discard;
14727 ++ }
14728 ++ goto reset_and_discard;
14729 ++ }
14730 ++ return 0;
14731 ++
14732 ++reset_and_discard:
14733 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
14734 ++ const struct tcphdr *th = tcp_hdr(skb);
14735 ++ const struct iphdr *iph = ip_hdr(skb);
14736 ++ struct request_sock **prev, *req;
14737 ++ /* If we end up here, it means we should not have matched on the
14738 ++ * request-socket. But, because the request-sock queue is only
14739 ++ * destroyed in mptcp_close, the socket may actually already be
14740 ++ * in close-state (e.g., through shutdown()) while still having
14741 ++ * pending request sockets.
14742 ++ */
14743 ++ req = inet_csk_search_req(meta_sk, &prev, th->source,
14744 ++ iph->saddr, iph->daddr);
14745 ++ if (req) {
14746 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
14747 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
14748 ++ req);
14749 ++ reqsk_free(req);
14750 ++ }
14751 ++ }
14752 ++
14753 ++ tcp_v4_send_reset(rsk, skb);
14754 ++discard:
14755 ++ kfree_skb(skb);
14756 ++ return 0;
14757 ++}
14758 ++
14759 ++/* After this, the ref count of the meta_sk associated with the request_sock
14760 ++ * is incremented. Thus it is the responsibility of the caller
14761 ++ * to call sock_put() when the reference is not needed anymore.
14762 ++ */
14763 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
14764 ++ const __be32 laddr, const struct net *net)
14765 ++{
14766 ++ const struct mptcp_request_sock *mtreq;
14767 ++ struct sock *meta_sk = NULL;
14768 ++ const struct hlist_nulls_node *node;
14769 ++ const u32 hash = inet_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
14770 ++
14771 ++ rcu_read_lock();
14772 ++begin:
14773 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
14774 ++ hash_entry) {
14775 ++ struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
14776 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
14777 ++
14778 ++ if (ireq->ir_rmt_port == rport &&
14779 ++ ireq->ir_rmt_addr == raddr &&
14780 ++ ireq->ir_loc_addr == laddr &&
14781 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
14782 ++ net_eq(net, sock_net(meta_sk)))
14783 ++ goto found;
14784 ++ meta_sk = NULL;
14785 ++ }
14786 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
14787 ++ * and put into another hash-table list. So, after the lookup we may
14788 ++ * end up in a different list. So, we may need to restart.
14789 ++ *
14790 ++ * See also the comment in __inet_lookup_established.
14791 ++ */
14792 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
14793 ++ goto begin;
14794 ++
14795 ++found:
14796 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
14797 ++ meta_sk = NULL;
14798 ++ rcu_read_unlock();
14799 ++
14800 ++ return meta_sk;
14801 ++}
14802 ++
14803 ++/* Create a new IPv4 subflow.
14804 ++ *
14805 ++ * We are in user-context and the meta-sock lock is held.
14806 ++ */
14807 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
14808 ++ struct mptcp_rem4 *rem)
14809 ++{
14810 ++ struct tcp_sock *tp;
14811 ++ struct sock *sk;
14812 ++ struct sockaddr_in loc_in, rem_in;
14813 ++ struct socket sock;
14814 ++ int ret;
14815 ++
14816 ++ /** First, create and prepare the new socket */
14817 ++
14818 ++ sock.type = meta_sk->sk_socket->type;
14819 ++ sock.state = SS_UNCONNECTED;
14820 ++ sock.wq = meta_sk->sk_socket->wq;
14821 ++ sock.file = meta_sk->sk_socket->file;
14822 ++ sock.ops = NULL;
14823 ++
14824 ++ ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
14825 ++ if (unlikely(ret < 0)) {
14826 ++ mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
14827 ++ return ret;
14828 ++ }
14829 ++
14830 ++ sk = sock.sk;
14831 ++ tp = tcp_sk(sk);
14832 ++
14833 ++ /* All subsockets need the MPTCP-lock-class */
14834 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
14835 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
14836 ++
14837 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
14838 ++ goto error;
14839 ++
14840 ++ tp->mptcp->slave_sk = 1;
14841 ++ tp->mptcp->low_prio = loc->low_prio;
14842 ++
14843 ++ /* Initializing the timer for an MPTCP subflow */
14844 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
14845 ++
14846 ++ /** Then, connect the socket to the peer */
14847 ++ loc_in.sin_family = AF_INET;
14848 ++ rem_in.sin_family = AF_INET;
14849 ++ loc_in.sin_port = 0;
14850 ++ if (rem->port)
14851 ++ rem_in.sin_port = rem->port;
14852 ++ else
14853 ++ rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
14854 ++ loc_in.sin_addr = loc->addr;
14855 ++ rem_in.sin_addr = rem->addr;
14856 ++
14857 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in));
14858 ++ if (ret < 0) {
14859 ++ mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
14860 ++ __func__, ret);
14861 ++ goto error;
14862 ++ }
14863 ++
14864 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
14865 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
14866 ++ tp->mptcp->path_index, &loc_in.sin_addr,
14867 ++ ntohs(loc_in.sin_port), &rem_in.sin_addr,
14868 ++ ntohs(rem_in.sin_port));
14869 ++
14870 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
14871 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
14872 ++
14873 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
14874 ++ sizeof(struct sockaddr_in), O_NONBLOCK);
14875 ++ if (ret < 0 && ret != -EINPROGRESS) {
14876 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
14877 ++ __func__, ret);
14878 ++ goto error;
14879 ++ }
14880 ++
14881 ++ sk_set_socket(sk, meta_sk->sk_socket);
14882 ++ sk->sk_wq = meta_sk->sk_wq;
14883 ++
14884 ++ return 0;
14885 ++
14886 ++error:
14887 ++ /* May happen if mptcp_add_sock fails first */
14888 ++ if (!mptcp(tp)) {
14889 ++ tcp_close(sk, 0);
14890 ++ } else {
14891 ++ local_bh_disable();
14892 ++ mptcp_sub_force_close(sk);
14893 ++ local_bh_enable();
14894 ++ }
14895 ++ return ret;
14896 ++}
14897 ++EXPORT_SYMBOL(mptcp_init4_subsockets);
14898 ++
14899 ++const struct inet_connection_sock_af_ops mptcp_v4_specific = {
14900 ++ .queue_xmit = ip_queue_xmit,
14901 ++ .send_check = tcp_v4_send_check,
14902 ++ .rebuild_header = inet_sk_rebuild_header,
14903 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
14904 ++ .conn_request = mptcp_conn_request,
14905 ++ .syn_recv_sock = tcp_v4_syn_recv_sock,
14906 ++ .net_header_len = sizeof(struct iphdr),
14907 ++ .setsockopt = ip_setsockopt,
14908 ++ .getsockopt = ip_getsockopt,
14909 ++ .addr2sockaddr = inet_csk_addr2sockaddr,
14910 ++ .sockaddr_len = sizeof(struct sockaddr_in),
14911 ++ .bind_conflict = inet_csk_bind_conflict,
14912 ++#ifdef CONFIG_COMPAT
14913 ++ .compat_setsockopt = compat_ip_setsockopt,
14914 ++ .compat_getsockopt = compat_ip_getsockopt,
14915 ++#endif
14916 ++};
14917 ++
14918 ++struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
14919 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
14920 ++
14921 ++/* General initialization of IPv4 for MPTCP */
14922 ++int mptcp_pm_v4_init(void)
14923 ++{
14924 ++ int ret = 0;
14925 ++ struct request_sock_ops *ops = &mptcp_request_sock_ops;
14926 ++
14927 ++ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14928 ++ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
14929 ++
14930 ++ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14931 ++ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
14932 ++ mptcp_join_request_sock_ipv4_ops.queue_hash_add = mptcp_v4_reqsk_queue_hash_add;
14933 ++
14934 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
14935 ++ if (ops->slab_name == NULL) {
14936 ++ ret = -ENOMEM;
14937 ++ goto out;
14938 ++ }
14939 ++
14940 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
14941 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
14942 ++ NULL);
14943 ++
14944 ++ if (ops->slab == NULL) {
14945 ++ ret = -ENOMEM;
14946 ++ goto err_reqsk_create;
14947 ++ }
14948 ++
14949 ++out:
14950 ++ return ret;
14951 ++
14952 ++err_reqsk_create:
14953 ++ kfree(ops->slab_name);
14954 ++ ops->slab_name = NULL;
14955 ++ goto out;
14956 ++}
14957 ++
14958 ++void mptcp_pm_v4_undo(void)
14959 ++{
14960 ++ kmem_cache_destroy(mptcp_request_sock_ops.slab);
14961 ++ kfree(mptcp_request_sock_ops.slab_name);
14962 ++}
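
The file above ends with mptcp_init4_subsockets(), the entry point a path manager uses to open an additional IPv4 subflow on an established MPTCP connection. As a reading aid, a minimal caller looks roughly like the sketch below; it is distilled from the ndiffports worker further down in this patch, and the function name and address-id values are illustrative, not taken from the patch:

    /* Sketch: open one extra IPv4 subflow from a path-manager context.
     * Assumes the meta-sk lock is held in user context, as the comment
     * above mptcp_init4_subsockets() requires.
     */
    static void example_open_extra_v4_subflow(struct sock *meta_sk)
    {
            struct mptcp_loc4 loc;
            struct mptcp_rem4 rem;

            loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr; /* local address to bind */
            loc.loc4_id = 0;                                /* illustrative address id */
            loc.low_prio = 0;                               /* regular-priority subflow */

            rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr; /* peer address */
            rem.port = inet_sk(meta_sk)->inet_dport;        /* reuse the peer's port */
            rem.rem4_id = 0;                                /* id 0: peer's initial address */

            mptcp_init4_subsockets(meta_sk, &loc, &rem);    /* sends the SYN carrying MP_JOIN */
    }

The bind()/connect() pair inside mptcp_init4_subsockets() then performs the join handshake on the new socket while the meta-socket keeps carrying the data stream.
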
14963 +diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
14964 +new file mode 100644
14965 +index 000000000000..1036973aa855
14966 +--- /dev/null
14967 ++++ b/net/mptcp/mptcp_ipv6.c
14968 +@@ -0,0 +1,518 @@
14969 ++/*
14970 ++ * MPTCP implementation - IPv6-specific functions
14971 ++ *
14972 ++ * Initial Design & Implementation:
14973 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14974 ++ *
14975 ++ * Current Maintainer:
14976 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14977 ++ *
14978 ++ * Additional authors:
14979 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14980 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14981 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14982 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14983 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14984 ++ * Andreas Ripke <ripke@××××××.eu>
14985 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14986 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14987 ++ * John Ronan <jronan@××××.org>
14988 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14989 ++ * Brandon Heller <brandonh@××××××××.edu>
14990 ++ *
14991 ++ *
14992 ++ * This program is free software; you can redistribute it and/or
14993 ++ * modify it under the terms of the GNU General Public License
14994 ++ * as published by the Free Software Foundation; either version
14995 ++ * 2 of the License, or (at your option) any later version.
14996 ++ */
14997 ++
14998 ++#include <linux/export.h>
14999 ++#include <linux/in6.h>
15000 ++#include <linux/kernel.h>
15001 ++
15002 ++#include <net/addrconf.h>
15003 ++#include <net/flow.h>
15004 ++#include <net/inet6_connection_sock.h>
15005 ++#include <net/inet6_hashtables.h>
15006 ++#include <net/inet_common.h>
15007 ++#include <net/ipv6.h>
15008 ++#include <net/ip6_checksum.h>
15009 ++#include <net/ip6_route.h>
15010 ++#include <net/mptcp.h>
15011 ++#include <net/mptcp_v6.h>
15012 ++#include <net/tcp.h>
15013 ++#include <net/transp_v6.h>
15014 ++
15015 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
15016 ++ __be16 sport, __be16 dport)
15017 ++{
15018 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15019 ++ u32 hash[MD5_DIGEST_WORDS];
15020 ++ u32 i;
15021 ++
15022 ++ memcpy(hash, saddr, 16);
15023 ++ for (i = 0; i < 4; i++)
15024 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15025 ++ secret[4] = mptcp_secret[4] +
15026 ++ (((__force u16)sport << 16) + (__force u16)dport);
15027 ++ secret[5] = mptcp_seed++;
15028 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15029 ++ secret[i] = mptcp_secret[i];
15030 ++
15031 ++ md5_transform(hash, secret);
15032 ++
15033 ++ return hash[0];
15034 ++}
15035 ++
15036 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
15037 ++ __be16 sport, __be16 dport)
15038 ++{
15039 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15040 ++ u32 hash[MD5_DIGEST_WORDS];
15041 ++ u32 i;
15042 ++
15043 ++ memcpy(hash, saddr, 16);
15044 ++ for (i = 0; i < 4; i++)
15045 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15046 ++ secret[4] = mptcp_secret[4] +
15047 ++ (((__force u16)sport << 16) + (__force u16)dport);
15048 ++ secret[5] = mptcp_seed++;
15049 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15050 ++ secret[i] = mptcp_secret[i];
15051 ++
15052 ++ md5_transform(hash, secret);
15053 ++
15054 ++ return *((u64 *)hash);
15055 ++}
15056 ++
15057 ++static void mptcp_v6_reqsk_destructor(struct request_sock *req)
15058 ++{
15059 ++ mptcp_reqsk_destructor(req);
15060 ++
15061 ++ tcp_v6_reqsk_destructor(req);
15062 ++}
15063 ++
15064 ++static int mptcp_v6_init_req(struct request_sock *req, struct sock *sk,
15065 ++ struct sk_buff *skb)
15066 ++{
15067 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15068 ++ mptcp_reqsk_init(req, skb);
15069 ++
15070 ++ return 0;
15071 ++}
15072 ++
15073 ++static int mptcp_v6_join_init_req(struct request_sock *req, struct sock *sk,
15074 ++ struct sk_buff *skb)
15075 ++{
15076 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
15077 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
15078 ++ union inet_addr addr;
15079 ++ int loc_id;
15080 ++ bool low_prio = false;
15081 ++
15082 ++	/* We need to do this as early as possible, because if we fail later
15083 ++	 * (e.g., in get_local_id), reqsk_free tries to remove the
15084 ++	 * request-socket from the htb in mptcp_hash_request_remove whenever
15085 ++	 * pprev is different from NULL.
15086 ++	 */
15087 ++ mtreq->hash_entry.pprev = NULL;
15088 ++
15089 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15090 ++
15091 ++ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
15092 ++ ipv6_hdr(skb)->daddr.s6_addr32,
15093 ++ tcp_hdr(skb)->source,
15094 ++ tcp_hdr(skb)->dest);
15095 ++ addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
15096 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio);
15097 ++ if (loc_id == -1)
15098 ++ return -1;
15099 ++ mtreq->loc_id = loc_id;
15100 ++ mtreq->low_prio = low_prio;
15101 ++
15102 ++ mptcp_join_reqsk_init(mpcb, req, skb);
15103 ++
15104 ++ return 0;
15105 ++}
15106 ++
15107 ++/* Similar to tcp6_request_sock_ops */
15108 ++struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
15109 ++ .family = AF_INET6,
15110 ++ .obj_size = sizeof(struct mptcp_request_sock),
15111 ++ .rtx_syn_ack = tcp_v6_rtx_synack,
15112 ++ .send_ack = tcp_v6_reqsk_send_ack,
15113 ++ .destructor = mptcp_v6_reqsk_destructor,
15114 ++ .send_reset = tcp_v6_send_reset,
15115 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
15116 ++};
15117 ++
15118 ++static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
15119 ++ struct request_sock *req,
15120 ++ const unsigned long timeout)
15121 ++{
15122 ++ const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15123 ++ inet_rsk(req)->ir_rmt_port,
15124 ++ 0, MPTCP_HASH_SIZE);
15125 ++ /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
15126 ++ * want to reset the keepalive-timer (responsible for retransmitting
15127 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
15128 ++ * overload the keepalive timer. Also, it's not a big deal, because the
15129 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
15130 ++ * if the third ACK gets lost, the client will handle the retransmission
15131 ++	 * anyway. If our SYN/ACK gets lost, the client will retransmit the
15132 ++ * SYN.
15133 ++ */
15134 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
15135 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
15136 ++ const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15137 ++ inet_rsk(req)->ir_rmt_port,
15138 ++ lopt->hash_rnd, lopt->nr_table_entries);
15139 ++
15140 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
15141 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
15142 ++ mptcp_reset_synack_timer(meta_sk, timeout);
15143 ++
15144 ++ rcu_read_lock();
15145 ++ spin_lock(&mptcp_reqsk_hlock);
15146 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
15147 ++ spin_unlock(&mptcp_reqsk_hlock);
15148 ++ rcu_read_unlock();
15149 ++}
15150 ++
15151 ++static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
15152 ++{
15153 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
15154 ++ &mptcp_join_request_sock_ipv6_ops,
15155 ++ meta_sk, skb);
15156 ++}
15157 ++
15158 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
15159 ++{
15160 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15161 ++ struct sock *child, *rsk = NULL;
15162 ++ int ret;
15163 ++
15164 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
15165 ++ struct tcphdr *th = tcp_hdr(skb);
15166 ++ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
15167 ++ struct sock *sk;
15168 ++
15169 ++ sk = __inet6_lookup_established(sock_net(meta_sk),
15170 ++ &tcp_hashinfo,
15171 ++ &ip6h->saddr, th->source,
15172 ++ &ip6h->daddr, ntohs(th->dest),
15173 ++ inet6_iif(skb));
15174 ++
15175 ++ if (!sk) {
15176 ++ kfree_skb(skb);
15177 ++ return 0;
15178 ++ }
15179 ++ if (is_meta_sk(sk)) {
15180 ++ WARN("%s Did not find a sub-sk!\n", __func__);
15181 ++ kfree_skb(skb);
15182 ++ sock_put(sk);
15183 ++ return 0;
15184 ++ }
15185 ++
15186 ++ if (sk->sk_state == TCP_TIME_WAIT) {
15187 ++ inet_twsk_put(inet_twsk(sk));
15188 ++ kfree_skb(skb);
15189 ++ return 0;
15190 ++ }
15191 ++
15192 ++ ret = tcp_v6_do_rcv(sk, skb);
15193 ++ sock_put(sk);
15194 ++
15195 ++ return ret;
15196 ++ }
15197 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
15198 ++
15199 ++ /* Has been removed from the tk-table. Thus, no new subflows.
15200 ++ *
15201 ++ * Check for close-state is necessary, because we may have been closed
15202 ++ * without passing by mptcp_close().
15203 ++ *
15204 ++ * When falling back, no new subflows are allowed either.
15205 ++ */
15206 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
15207 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
15208 ++ goto reset_and_discard;
15209 ++
15210 ++ child = tcp_v6_hnd_req(meta_sk, skb);
15211 ++
15212 ++ if (!child)
15213 ++ goto discard;
15214 ++
15215 ++ if (child != meta_sk) {
15216 ++ sock_rps_save_rxhash(child, skb);
15217 ++ /* We don't call tcp_child_process here, because we hold
15218 ++ * already the meta-sk-lock and are sure that it is not owned
15219 ++ * by the user.
15220 ++ */
15221 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
15222 ++ bh_unlock_sock(child);
15223 ++ sock_put(child);
15224 ++ if (ret) {
15225 ++ rsk = child;
15226 ++ goto reset_and_discard;
15227 ++ }
15228 ++ } else {
15229 ++ if (tcp_hdr(skb)->syn) {
15230 ++ mptcp_v6_join_request(meta_sk, skb);
15231 ++ goto discard;
15232 ++ }
15233 ++ goto reset_and_discard;
15234 ++ }
15235 ++ return 0;
15236 ++
15237 ++reset_and_discard:
15238 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
15239 ++ const struct tcphdr *th = tcp_hdr(skb);
15240 ++ struct request_sock **prev, *req;
15241 ++ /* If we end up here, it means we should not have matched on the
15242 ++ * request-socket. But, because the request-sock queue is only
15243 ++ * destroyed in mptcp_close, the socket may actually already be
15244 ++ * in close-state (e.g., through shutdown()) while still having
15245 ++ * pending request sockets.
15246 ++ */
15247 ++ req = inet6_csk_search_req(meta_sk, &prev, th->source,
15248 ++ &ipv6_hdr(skb)->saddr,
15249 ++ &ipv6_hdr(skb)->daddr, inet6_iif(skb));
15250 ++ if (req) {
15251 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
15252 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
15253 ++ req);
15254 ++ reqsk_free(req);
15255 ++ }
15256 ++ }
15257 ++
15258 ++ tcp_v6_send_reset(rsk, skb);
15259 ++discard:
15260 ++ kfree_skb(skb);
15261 ++ return 0;
15262 ++}
15263 ++
15264 ++/* After this, the ref count of the meta_sk associated with the request_sock
15265 ++ * is incremented. Thus it is the responsibility of the caller
15266 ++ * to call sock_put() when the reference is not needed anymore.
15267 ++ */
15268 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
15269 ++ const struct in6_addr *laddr, const struct net *net)
15270 ++{
15271 ++ const struct mptcp_request_sock *mtreq;
15272 ++ struct sock *meta_sk = NULL;
15273 ++ const struct hlist_nulls_node *node;
15274 ++ const u32 hash = inet6_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
15275 ++
15276 ++ rcu_read_lock();
15277 ++begin:
15278 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
15279 ++ hash_entry) {
15280 ++ struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
15281 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
15282 ++
15283 ++ if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
15284 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
15285 ++ ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
15286 ++ ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
15287 ++ net_eq(net, sock_net(meta_sk)))
15288 ++ goto found;
15289 ++ meta_sk = NULL;
15290 ++ }
15291 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
15292 ++ * and put into another hash-table list. So, after the lookup we may
15293 ++ * end up in a different list. So, we may need to restart.
15294 ++ *
15295 ++ * See also the comment in __inet_lookup_established.
15296 ++ */
15297 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
15298 ++ goto begin;
15299 ++
15300 ++found:
15301 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
15302 ++ meta_sk = NULL;
15303 ++ rcu_read_unlock();
15304 ++
15305 ++ return meta_sk;
15306 ++}
15307 ++
15308 ++/* Create a new IPv6 subflow.
15309 ++ *
15310 ++ * We are in user-context and the meta-sock lock is held.
15311 ++ */
15312 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
15313 ++ struct mptcp_rem6 *rem)
15314 ++{
15315 ++ struct tcp_sock *tp;
15316 ++ struct sock *sk;
15317 ++ struct sockaddr_in6 loc_in, rem_in;
15318 ++ struct socket sock;
15319 ++ int ret;
15320 ++
15321 ++ /** First, create and prepare the new socket */
15322 ++
15323 ++ sock.type = meta_sk->sk_socket->type;
15324 ++ sock.state = SS_UNCONNECTED;
15325 ++ sock.wq = meta_sk->sk_socket->wq;
15326 ++ sock.file = meta_sk->sk_socket->file;
15327 ++ sock.ops = NULL;
15328 ++
15329 ++ ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
15330 ++ if (unlikely(ret < 0)) {
15331 ++ mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
15332 ++ return ret;
15333 ++ }
15334 ++
15335 ++ sk = sock.sk;
15336 ++ tp = tcp_sk(sk);
15337 ++
15338 ++ /* All subsockets need the MPTCP-lock-class */
15339 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
15340 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
15341 ++
15342 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
15343 ++ goto error;
15344 ++
15345 ++ tp->mptcp->slave_sk = 1;
15346 ++ tp->mptcp->low_prio = loc->low_prio;
15347 ++
15348 ++ /* Initializing the timer for an MPTCP subflow */
15349 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
15350 ++
15351 ++ /** Then, connect the socket to the peer */
15352 ++ loc_in.sin6_family = AF_INET6;
15353 ++ rem_in.sin6_family = AF_INET6;
15354 ++ loc_in.sin6_port = 0;
15355 ++ if (rem->port)
15356 ++ rem_in.sin6_port = rem->port;
15357 ++ else
15358 ++ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
15359 ++ loc_in.sin6_addr = loc->addr;
15360 ++ rem_in.sin6_addr = rem->addr;
15361 ++
15362 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in6));
15363 ++ if (ret < 0) {
15364 ++		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
15365 ++ __func__, ret);
15366 ++ goto error;
15367 ++ }
15368 ++
15369 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
15370 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
15371 ++ tp->mptcp->path_index, &loc_in.sin6_addr,
15372 ++ ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
15373 ++ ntohs(rem_in.sin6_port));
15374 ++
15375 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
15376 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
15377 ++
15378 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
15379 ++ sizeof(struct sockaddr_in6), O_NONBLOCK);
15380 ++ if (ret < 0 && ret != -EINPROGRESS) {
15381 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
15382 ++ __func__, ret);
15383 ++ goto error;
15384 ++ }
15385 ++
15386 ++ sk_set_socket(sk, meta_sk->sk_socket);
15387 ++ sk->sk_wq = meta_sk->sk_wq;
15388 ++
15389 ++ return 0;
15390 ++
15391 ++error:
15392 ++ /* May happen if mptcp_add_sock fails first */
15393 ++ if (!mptcp(tp)) {
15394 ++ tcp_close(sk, 0);
15395 ++ } else {
15396 ++ local_bh_disable();
15397 ++ mptcp_sub_force_close(sk);
15398 ++ local_bh_enable();
15399 ++ }
15400 ++ return ret;
15401 ++}
15402 ++EXPORT_SYMBOL(mptcp_init6_subsockets);
15403 ++
15404 ++const struct inet_connection_sock_af_ops mptcp_v6_specific = {
15405 ++ .queue_xmit = inet6_csk_xmit,
15406 ++ .send_check = tcp_v6_send_check,
15407 ++ .rebuild_header = inet6_sk_rebuild_header,
15408 ++ .sk_rx_dst_set = inet6_sk_rx_dst_set,
15409 ++ .conn_request = mptcp_conn_request,
15410 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15411 ++ .net_header_len = sizeof(struct ipv6hdr),
15412 ++ .net_frag_header_len = sizeof(struct frag_hdr),
15413 ++ .setsockopt = ipv6_setsockopt,
15414 ++ .getsockopt = ipv6_getsockopt,
15415 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15416 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15417 ++ .bind_conflict = inet6_csk_bind_conflict,
15418 ++#ifdef CONFIG_COMPAT
15419 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15420 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15421 ++#endif
15422 ++};
15423 ++
15424 ++const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
15425 ++ .queue_xmit = ip_queue_xmit,
15426 ++ .send_check = tcp_v4_send_check,
15427 ++ .rebuild_header = inet_sk_rebuild_header,
15428 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
15429 ++ .conn_request = mptcp_conn_request,
15430 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15431 ++ .net_header_len = sizeof(struct iphdr),
15432 ++ .setsockopt = ipv6_setsockopt,
15433 ++ .getsockopt = ipv6_getsockopt,
15434 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15435 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15436 ++ .bind_conflict = inet6_csk_bind_conflict,
15437 ++#ifdef CONFIG_COMPAT
15438 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15439 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15440 ++#endif
15441 ++};
15442 ++
15443 ++struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
15444 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
15445 ++
15446 ++int mptcp_pm_v6_init(void)
15447 ++{
15448 ++ int ret = 0;
15449 ++ struct request_sock_ops *ops = &mptcp6_request_sock_ops;
15450 ++
15451 ++ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15452 ++ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
15453 ++
15454 ++ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15455 ++ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
15456 ++ mptcp_join_request_sock_ipv6_ops.queue_hash_add = mptcp_v6_reqsk_queue_hash_add;
15457 ++
15458 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
15459 ++ if (ops->slab_name == NULL) {
15460 ++ ret = -ENOMEM;
15461 ++ goto out;
15462 ++ }
15463 ++
15464 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
15465 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
15466 ++ NULL);
15467 ++
15468 ++ if (ops->slab == NULL) {
15469 ++ ret = -ENOMEM;
15470 ++ goto err_reqsk_create;
15471 ++ }
15472 ++
15473 ++out:
15474 ++ return ret;
15475 ++
15476 ++err_reqsk_create:
15477 ++ kfree(ops->slab_name);
15478 ++ ops->slab_name = NULL;
15479 ++ goto out;
15480 ++}
15481 ++
15482 ++void mptcp_pm_v6_undo(void)
15483 ++{
15484 ++ kmem_cache_destroy(mptcp6_request_sock_ops.slab);
15485 ++ kfree(mptcp6_request_sock_ops.slab_name);
15486 ++}
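
Two inet_connection_sock_af_ops tables close this file: mptcp_v6_specific drives real IPv6 subflows, while mptcp_v6_mapped keeps the IPv4 transmit path (ip_queue_xmit, tcp_v4_send_check) for AF_INET6 sockets that actually speak IPv4 through v4-mapped addresses. The helper that distinguishes the two cases, mptcp_v6_is_v4_mapped(), is used by the ndiffports module below but is not part of this hunk; conceptually it only needs to inspect the peer address, along the lines of this illustrative sketch (not the patch's actual implementation):

    /* Sketch only: detecting a v4-mapped IPv6 socket. The real
     * mptcp_v6_is_v4_mapped() is defined elsewhere in this patch set.
     */
    static inline bool example_is_v4_mapped(const struct sock *sk)
    {
            return sk->sk_family == AF_INET6 &&
                   ipv6_addr_v4mapped(&sk->sk_v6_daddr);
    }
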
15487 +diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
15488 +new file mode 100644
15489 +index 000000000000..6f5087983175
15490 +--- /dev/null
15491 ++++ b/net/mptcp/mptcp_ndiffports.c
15492 +@@ -0,0 +1,161 @@
15493 ++#include <linux/module.h>
15494 ++
15495 ++#include <net/mptcp.h>
15496 ++#include <net/mptcp_v4.h>
15497 ++
15498 ++#if IS_ENABLED(CONFIG_IPV6)
15499 ++#include <net/mptcp_v6.h>
15500 ++#endif
15501 ++
15502 ++struct ndiffports_priv {
15503 ++ /* Worker struct for subflow establishment */
15504 ++ struct work_struct subflow_work;
15505 ++
15506 ++ struct mptcp_cb *mpcb;
15507 ++};
15508 ++
15509 ++static int num_subflows __read_mostly = 2;
15510 ++module_param(num_subflows, int, 0644);
15511 ++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
15512 ++
15513 ++/**
15514 ++ * Create all new subflows by calling mptcp_initX_subsockets().
15515 ++ *
15516 ++ * This function uses a goto next_subflow to allow releasing the lock between
15517 ++ * new subflows, giving other processes a chance to do some work on the
15518 ++ * socket and potentially finish the communication.
15519 ++ **/
15520 ++static void create_subflow_worker(struct work_struct *work)
15521 ++{
15522 ++ const struct ndiffports_priv *pm_priv = container_of(work,
15523 ++ struct ndiffports_priv,
15524 ++ subflow_work);
15525 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
15526 ++ struct sock *meta_sk = mpcb->meta_sk;
15527 ++ int iter = 0;
15528 ++
15529 ++next_subflow:
15530 ++ if (iter) {
15531 ++ release_sock(meta_sk);
15532 ++ mutex_unlock(&mpcb->mpcb_mutex);
15533 ++
15534 ++ cond_resched();
15535 ++ }
15536 ++ mutex_lock(&mpcb->mpcb_mutex);
15537 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
15538 ++
15539 ++ iter++;
15540 ++
15541 ++ if (sock_flag(meta_sk, SOCK_DEAD))
15542 ++ goto exit;
15543 ++
15544 ++ if (mpcb->master_sk &&
15545 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
15546 ++ goto exit;
15547 ++
15548 ++ if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) {
15549 ++ if (meta_sk->sk_family == AF_INET ||
15550 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
15551 ++ struct mptcp_loc4 loc;
15552 ++ struct mptcp_rem4 rem;
15553 ++
15554 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
15555 ++ loc.loc4_id = 0;
15556 ++ loc.low_prio = 0;
15557 ++
15558 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
15559 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15560 ++ rem.rem4_id = 0; /* Default 0 */
15561 ++
15562 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
15563 ++ } else {
15564 ++#if IS_ENABLED(CONFIG_IPV6)
15565 ++ struct mptcp_loc6 loc;
15566 ++ struct mptcp_rem6 rem;
15567 ++
15568 ++ loc.addr = inet6_sk(meta_sk)->saddr;
15569 ++ loc.loc6_id = 0;
15570 ++ loc.low_prio = 0;
15571 ++
15572 ++ rem.addr = meta_sk->sk_v6_daddr;
15573 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15574 ++ rem.rem6_id = 0; /* Default 0 */
15575 ++
15576 ++ mptcp_init6_subsockets(meta_sk, &loc, &rem);
15577 ++#endif
15578 ++ }
15579 ++ goto next_subflow;
15580 ++ }
15581 ++
15582 ++exit:
15583 ++ release_sock(meta_sk);
15584 ++ mutex_unlock(&mpcb->mpcb_mutex);
15585 ++ sock_put(meta_sk);
15586 ++}
15587 ++
15588 ++static void ndiffports_new_session(const struct sock *meta_sk)
15589 ++{
15590 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15591 ++ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15592 ++
15593 ++ /* Initialize workqueue-struct */
15594 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
15595 ++ fmp->mpcb = mpcb;
15596 ++}
15597 ++
15598 ++static void ndiffports_create_subflows(struct sock *meta_sk)
15599 ++{
15600 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15601 ++ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15602 ++
15603 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
15604 ++ mpcb->send_infinite_mapping ||
15605 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
15606 ++ return;
15607 ++
15608 ++ if (!work_pending(&pm_priv->subflow_work)) {
15609 ++ sock_hold(meta_sk);
15610 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
15611 ++ }
15612 ++}
15613 ++
15614 ++static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr,
15615 ++ struct net *net, bool *low_prio)
15616 ++{
15617 ++ return 0;
15618 ++}
15619 ++
15620 ++static struct mptcp_pm_ops ndiffports __read_mostly = {
15621 ++ .new_session = ndiffports_new_session,
15622 ++ .fully_established = ndiffports_create_subflows,
15623 ++ .get_local_id = ndiffports_get_local_id,
15624 ++ .name = "ndiffports",
15625 ++ .owner = THIS_MODULE,
15626 ++};
15627 ++
15628 ++/* General initialization of MPTCP_PM */
15629 ++static int __init ndiffports_register(void)
15630 ++{
15631 ++ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
15632 ++
15633 ++ if (mptcp_register_path_manager(&ndiffports))
15634 ++ goto exit;
15635 ++
15636 ++ return 0;
15637 ++
15638 ++exit:
15639 ++ return -1;
15640 ++}
15641 ++
15642 ++static void ndiffports_unregister(void)
15643 ++{
15644 ++ mptcp_unregister_path_manager(&ndiffports);
15645 ++}
15646 ++
15647 ++module_init(ndiffports_register);
15648 ++module_exit(ndiffports_unregister);
15649 ++
15650 ++MODULE_AUTHOR("Christoph Paasch");
15651 ++MODULE_LICENSE("GPL");
15652 ++MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
15653 ++MODULE_VERSION("0.88");
15654 +diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c
15655 +new file mode 100644
15656 +index 000000000000..ec4e98622637
15657 +--- /dev/null
15658 ++++ b/net/mptcp/mptcp_ofo_queue.c
15659 +@@ -0,0 +1,295 @@
15660 ++/*
15661 ++ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
15662 ++ *
15663 ++ * Initial Design & Implementation:
15664 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
15665 ++ *
15666 ++ * Current Maintainer & Author:
15667 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15668 ++ *
15669 ++ * Additional authors:
15670 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
15671 ++ * Gregory Detal <gregory.detal@×××××××××.be>
15672 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
15673 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
15674 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
15675 ++ * Andreas Ripke <ripke@××××××.eu>
15676 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
15677 ++ * Octavian Purdila <octavian.purdila@×××××.com>
15678 ++ * John Ronan <jronan@××××.org>
15679 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
15680 ++ * Brandon Heller <brandonh@××××××××.edu>
15681 ++ *
15682 ++ * This program is free software; you can redistribute it and/or
15683 ++ * modify it under the terms of the GNU General Public License
15684 ++ * as published by the Free Software Foundation; either version
15685 ++ * 2 of the License, or (at your option) any later version.
15686 ++ */
15687 ++
15688 ++#include <linux/skbuff.h>
15689 ++#include <linux/slab.h>
15690 ++#include <net/tcp.h>
15691 ++#include <net/mptcp.h>
15692 ++
15693 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
15694 ++ const struct sk_buff *skb)
15695 ++{
15696 ++ struct tcp_sock *tp;
15697 ++
15698 ++ mptcp_for_each_tp(mpcb, tp) {
15699 ++ if (tp->mptcp->shortcut_ofoqueue == skb) {
15700 ++ tp->mptcp->shortcut_ofoqueue = NULL;
15701 ++ return;
15702 ++ }
15703 ++ }
15704 ++}
15705 ++
15706 ++/* Does 'skb' fit after 'here' in the queue 'head'?
15707 ++ * If yes, we queue it and return 1.
15708 ++ */
15709 ++static int mptcp_ofo_queue_after(struct sk_buff_head *head,
15710 ++ struct sk_buff *skb, struct sk_buff *here,
15711 ++ const struct tcp_sock *tp)
15712 ++{
15713 ++ struct sock *meta_sk = tp->meta_sk;
15714 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15715 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15716 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15717 ++
15718 ++ /* We want to queue skb after here, thus seq >= end_seq */
15719 ++ if (before(seq, TCP_SKB_CB(here)->end_seq))
15720 ++ return 0;
15721 ++
15722 ++ if (seq == TCP_SKB_CB(here)->end_seq) {
15723 ++ bool fragstolen = false;
15724 ++
15725 ++ if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
15726 ++ __skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
15727 ++ return 1;
15728 ++ } else {
15729 ++ kfree_skb_partial(skb, fragstolen);
15730 ++ return -1;
15731 ++ }
15732 ++ }
15733 ++
15734 ++ /* If here is the last one, we can always queue it */
15735 ++ if (skb_queue_is_last(head, here)) {
15736 ++ __skb_queue_after(head, here, skb);
15737 ++ return 1;
15738 ++ } else {
15739 ++ struct sk_buff *skb1 = skb_queue_next(head, here);
15740 ++		/* It's not the last one, but does it fit between 'here' and
15741 ++		 * the one after 'here'? That is, is end_seq <= after_here->seq?
15742 ++		 */
15743 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
15744 ++ __skb_queue_after(head, here, skb);
15745 ++ return 1;
15746 ++ }
15747 ++ }
15748 ++
15749 ++ return 0;
15750 ++}
15751 ++
15752 ++static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
15753 ++ struct sk_buff_head *head, struct tcp_sock *tp)
15754 ++{
15755 ++ struct sock *meta_sk = tp->meta_sk;
15756 ++ struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
15757 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
15758 ++ struct sk_buff *skb1, *best_shortcut = NULL;
15759 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15760 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15761 ++ u32 distance = 0xffffffff;
15762 ++
15763 ++ /* First, check the tp's shortcut */
15764 ++ if (!shortcut) {
15765 ++ if (skb_queue_empty(head)) {
15766 ++ __skb_queue_head(head, skb);
15767 ++ goto end;
15768 ++ }
15769 ++ } else {
15770 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15771 ++		/* Is the tp's shortcut a hit? If yes, we insert. */
15772 ++
15773 ++ if (ret) {
15774 ++ skb = (ret > 0) ? skb : NULL;
15775 ++ goto end;
15776 ++ }
15777 ++ }
15778 ++
15779 ++ /* Check the shortcuts of the other subsockets. */
15780 ++ mptcp_for_each_tp(mpcb, tp_it) {
15781 ++ shortcut = tp_it->mptcp->shortcut_ofoqueue;
15782 ++ /* Can we queue it here? If yes, do so! */
15783 ++ if (shortcut) {
15784 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15785 ++
15786 ++ if (ret) {
15787 ++ skb = (ret > 0) ? skb : NULL;
15788 ++ goto end;
15789 ++ }
15790 ++ }
15791 ++
15792 ++ /* Could not queue it, check if we are close.
15793 ++ * We are looking for a shortcut, close enough to seq to
15794 ++ * set skb1 prematurely and thus improve the subsequent lookup,
15795 ++ * which tries to find a skb1 so that skb1->seq <= seq.
15796 ++ *
15797 ++		 * So, here we only take shortcuts whose shortcut->seq > seq,
15798 ++ * and minimize the distance between shortcut->seq and seq and
15799 ++ * set best_shortcut to this one with the minimal distance.
15800 ++ *
15801 ++ * That way, the subsequent while-loop is shortest.
15802 ++ */
15803 ++ if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
15804 ++ /* Are we closer than the current best shortcut? */
15805 ++ if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
15806 ++ distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
15807 ++ best_shortcut = shortcut;
15808 ++ }
15809 ++ }
15810 ++ }
15811 ++
15812 ++ if (best_shortcut)
15813 ++ skb1 = best_shortcut;
15814 ++ else
15815 ++ skb1 = skb_peek_tail(head);
15816 ++
15817 ++ if (seq == TCP_SKB_CB(skb1)->end_seq) {
15818 ++ bool fragstolen = false;
15819 ++
15820 ++ if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
15821 ++ __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
15822 ++ } else {
15823 ++ kfree_skb_partial(skb, fragstolen);
15824 ++ skb = NULL;
15825 ++ }
15826 ++
15827 ++ goto end;
15828 ++ }
15829 ++
15830 ++ /* Find the insertion point, starting from best_shortcut if available.
15831 ++ *
15832 ++ * Inspired from tcp_data_queue_ofo.
15833 ++ */
15834 ++ while (1) {
15835 ++ /* skb1->seq <= seq */
15836 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
15837 ++ break;
15838 ++ if (skb_queue_is_first(head, skb1)) {
15839 ++ skb1 = NULL;
15840 ++ break;
15841 ++ }
15842 ++ skb1 = skb_queue_prev(head, skb1);
15843 ++ }
15844 ++
15845 ++	/* Does skb overlap the previous one? */
15846 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
15847 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
15848 ++ /* All the bits are present. */
15849 ++ __kfree_skb(skb);
15850 ++ skb = NULL;
15851 ++ goto end;
15852 ++ }
15853 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
15854 ++ if (skb_queue_is_first(head, skb1))
15855 ++ skb1 = NULL;
15856 ++ else
15857 ++ skb1 = skb_queue_prev(head, skb1);
15858 ++ }
15859 ++ }
15860 ++ if (!skb1)
15861 ++ __skb_queue_head(head, skb);
15862 ++ else
15863 ++ __skb_queue_after(head, skb1, skb);
15864 ++
15865 ++	/* And clean segments covered entirely by the new one. */
15866 ++ while (!skb_queue_is_last(head, skb)) {
15867 ++ skb1 = skb_queue_next(head, skb);
15868 ++
15869 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
15870 ++ break;
15871 ++
15872 ++ __skb_unlink(skb1, head);
15873 ++ mptcp_remove_shortcuts(mpcb, skb1);
15874 ++ __kfree_skb(skb1);
15875 ++ }
15876 ++
15877 ++end:
15878 ++ if (skb) {
15879 ++ skb_set_owner_r(skb, meta_sk);
15880 ++ tp->mptcp->shortcut_ofoqueue = skb;
15881 ++ }
15882 ++
15883 ++ return;
15884 ++}
15885 ++
15886 ++/**
15887 ++ * @sk: the subflow that received this skb.
15888 ++ */
15889 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
15890 ++ struct sock *sk)
15891 ++{
15892 ++ struct tcp_sock *tp = tcp_sk(sk);
15893 ++
15894 ++ try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
15895 ++ &tcp_sk(meta_sk)->out_of_order_queue, tp);
15896 ++}
15897 ++
15898 ++bool mptcp_prune_ofo_queue(struct sock *sk)
15899 ++{
15900 ++ struct tcp_sock *tp = tcp_sk(sk);
15901 ++ bool res = false;
15902 ++
15903 ++ if (!skb_queue_empty(&tp->out_of_order_queue)) {
15904 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
15905 ++ mptcp_purge_ofo_queue(tp);
15906 ++
15907 ++ /* No sack at the mptcp-level */
15908 ++ sk_mem_reclaim(sk);
15909 ++ res = true;
15910 ++ }
15911 ++
15912 ++ return res;
15913 ++}
15914 ++
15915 ++void mptcp_ofo_queue(struct sock *meta_sk)
15916 ++{
15917 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15918 ++ struct sk_buff *skb;
15919 ++
15920 ++ while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
15921 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
15922 ++ if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
15923 ++ break;
15924 ++
15925 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
15926 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15927 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15928 ++ __kfree_skb(skb);
15929 ++ continue;
15930 ++ }
15931 ++
15932 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15933 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15934 ++
15935 ++ __skb_queue_tail(&meta_sk->sk_receive_queue, skb);
15936 ++ meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
15937 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
15938 ++
15939 ++ if (tcp_hdr(skb)->fin)
15940 ++ mptcp_fin(meta_sk);
15941 ++ }
15942 ++}
15943 ++
15944 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
15945 ++{
15946 ++ struct sk_buff_head *head = &meta_tp->out_of_order_queue;
15947 ++ struct sk_buff *skb, *tmp;
15948 ++
15949 ++ skb_queue_walk_safe(head, skb, tmp) {
15950 ++ __skb_unlink(skb, head);
15951 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15952 ++ kfree_skb(skb);
15953 ++ }
15954 ++}
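
A note on the sequence-number arithmetic used throughout this file: mptcp_ofo_queue_after(), try_shortcut() and mptcp_ofo_queue() rely on the kernel's before()/after() helpers for 32-bit (data-)sequence numbers, which stay correct across wraparound because they reduce to a signed subtraction:

    /* The comparison used above, as defined in include/net/tcp.h: */
    static inline bool before(__u32 seq1, __u32 seq2)
    {
            return (__s32)(seq1 - seq2) < 0;
    }
    #define after(seq2, seq1)       before(seq1, seq2)

    /* Example: before(0xfffffff0, 0x10) is true, because
     * (__s32)(0xfffffff0 - 0x10) == -32, so a segment queued just before
     * the sequence space wraps still sorts ahead of one queued just after.
     */
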
15955 +diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
15956 +new file mode 100644
15957 +index 000000000000..53f5c43bb488
15958 +--- /dev/null
15959 ++++ b/net/mptcp/mptcp_olia.c
15960 +@@ -0,0 +1,311 @@
15961 ++/*
15962 ++ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
15963 ++ *
15964 ++ * Algorithm design:
15965 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15966 ++ * Nicolas Gast <nicolas.gast@××××.ch>
15967 ++ * Jean-Yves Le Boudec <jean-yves.leboudec@××××.ch>
15968 ++ *
15969 ++ * Implementation:
15970 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15971 ++ *
15972 ++ * Ported to the official MPTCP-kernel:
15973 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15974 ++ *
15975 ++ * This program is free software; you can redistribute it and/or
15976 ++ * modify it under the terms of the GNU General Public License
15977 ++ * as published by the Free Software Foundation; either version
15978 ++ * 2 of the License, or (at your option) any later version.
15979 ++ */
15980 ++
15981 ++
15982 ++#include <net/tcp.h>
15983 ++#include <net/mptcp.h>
15984 ++
15985 ++#include <linux/module.h>
15986 ++
15987 ++static int scale = 10;
15988 ++
15989 ++struct mptcp_olia {
15990 ++ u32 mptcp_loss1;
15991 ++ u32 mptcp_loss2;
15992 ++ u32 mptcp_loss3;
15993 ++ int epsilon_num;
15994 ++ u32 epsilon_den;
15995 ++ int mptcp_snd_cwnd_cnt;
15996 ++};
15997 ++
15998 ++static inline int mptcp_olia_sk_can_send(const struct sock *sk)
15999 ++{
16000 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
16001 ++}
16002 ++
16003 ++static inline u64 mptcp_olia_scale(u64 val, int scale)
16004 ++{
16005 ++ return (u64) val << scale;
16006 ++}
16007 ++
16008 ++/* Take care of the artificial inflation of cwnd (see RFC 5681)
16009 ++ * during the fast-retransmit phase.
16010 ++ */
16011 ++static u32 mptcp_get_crt_cwnd(struct sock *sk)
16012 ++{
16013 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
16014 ++
16015 ++ if (icsk->icsk_ca_state == TCP_CA_Recovery)
16016 ++ return tcp_sk(sk)->snd_ssthresh;
16017 ++ else
16018 ++ return tcp_sk(sk)->snd_cwnd;
16019 ++}
16020 ++
16021 ++/* return the denominator of the first term of the increase formula */
16022 ++static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt)
16023 ++{
16024 ++ struct sock *sk;
16025 ++ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
16026 ++
16027 ++ mptcp_for_each_sk(mpcb, sk) {
16028 ++ struct tcp_sock *tp = tcp_sk(sk);
16029 ++ u64 scaled_num;
16030 ++ u32 tmp_cwnd;
16031 ++
16032 ++ if (!mptcp_olia_sk_can_send(sk))
16033 ++ continue;
16034 ++
16035 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16036 ++ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
16037 ++ rate += div_u64(scaled_num , tp->srtt_us);
16038 ++ }
16039 ++ rate *= rate;
16040 ++ return rate;
16041 ++}
16042 ++
16043 ++/* find the maximum cwnd, used to find set M */
16044 ++static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
16045 ++{
16046 ++ struct sock *sk;
16047 ++ u32 best_cwnd = 0;
16048 ++
16049 ++ mptcp_for_each_sk(mpcb, sk) {
16050 ++ u32 tmp_cwnd;
16051 ++
16052 ++ if (!mptcp_olia_sk_can_send(sk))
16053 ++ continue;
16054 ++
16055 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16056 ++ if (tmp_cwnd > best_cwnd)
16057 ++ best_cwnd = tmp_cwnd;
16058 ++ }
16059 ++ return best_cwnd;
16060 ++}
16061 ++
16062 ++static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
16063 ++{
16064 ++ struct mptcp_olia *ca;
16065 ++ struct tcp_sock *tp;
16066 ++ struct sock *sk;
16067 ++ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
16068 ++ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
16069 ++ u8 M = 0, B_not_M = 0;
16070 ++
16071 ++ /* TODO - integrate this in the following loop - we just want to iterate once */
16072 ++
16073 ++ max_cwnd = mptcp_get_max_cwnd(mpcb);
16074 ++
16075 ++ /* find the best path */
16076 ++ mptcp_for_each_sk(mpcb, sk) {
16077 ++ tp = tcp_sk(sk);
16078 ++ ca = inet_csk_ca(sk);
16079 ++
16080 ++ if (!mptcp_olia_sk_can_send(sk))
16081 ++ continue;
16082 ++
16083 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16084 ++ /* TODO - check here and rename variables */
16085 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16086 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16087 ++
16088 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16089 ++ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
16090 ++ best_rtt = tmp_rtt;
16091 ++ best_int = tmp_int;
16092 ++ best_cwnd = tmp_cwnd;
16093 ++ }
16094 ++ }
16095 ++
16096 ++ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
16097 ++ /* find the size of M and B_not_M */
16098 ++ mptcp_for_each_sk(mpcb, sk) {
16099 ++ tp = tcp_sk(sk);
16100 ++ ca = inet_csk_ca(sk);
16101 ++
16102 ++ if (!mptcp_olia_sk_can_send(sk))
16103 ++ continue;
16104 ++
16105 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16106 ++ if (tmp_cwnd == max_cwnd) {
16107 ++ M++;
16108 ++ } else {
16109 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16110 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16111 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16112 ++
16113 ++ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
16114 ++ B_not_M++;
16115 ++ }
16116 ++ }
16117 ++
16118 ++ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
16119 ++ mptcp_for_each_sk(mpcb, sk) {
16120 ++ tp = tcp_sk(sk);
16121 ++ ca = inet_csk_ca(sk);
16122 ++
16123 ++ if (!mptcp_olia_sk_can_send(sk))
16124 ++ continue;
16125 ++
16126 ++ if (B_not_M == 0) {
16127 ++ ca->epsilon_num = 0;
16128 ++ ca->epsilon_den = 1;
16129 ++ } else {
16130 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16131 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16132 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16133 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16134 ++
16135 ++ if (tmp_cwnd < max_cwnd &&
16136 ++ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
16137 ++ ca->epsilon_num = 1;
16138 ++ ca->epsilon_den = mpcb->cnt_established * B_not_M;
16139 ++ } else if (tmp_cwnd == max_cwnd) {
16140 ++ ca->epsilon_num = -1;
16141 ++ ca->epsilon_den = mpcb->cnt_established * M;
16142 ++ } else {
16143 ++ ca->epsilon_num = 0;
16144 ++ ca->epsilon_den = 1;
16145 ++ }
16146 ++ }
16147 ++ }
16148 ++}
16149 ++
16150 ++/* setting the initial values */
16151 ++static void mptcp_olia_init(struct sock *sk)
16152 ++{
16153 ++ const struct tcp_sock *tp = tcp_sk(sk);
16154 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16155 ++
16156 ++ if (mptcp(tp)) {
16157 ++ ca->mptcp_loss1 = tp->snd_una;
16158 ++ ca->mptcp_loss2 = tp->snd_una;
16159 ++ ca->mptcp_loss3 = tp->snd_una;
16160 ++ ca->mptcp_snd_cwnd_cnt = 0;
16161 ++ ca->epsilon_num = 0;
16162 ++ ca->epsilon_den = 1;
16163 ++ }
16164 ++}
16165 ++
16166 ++/* updating inter-loss distance and ssthresh */
16167 ++static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
16168 ++{
16169 ++ if (!mptcp(tcp_sk(sk)))
16170 ++ return;
16171 ++
16172 ++ if (new_state == TCP_CA_Loss ||
16173 ++ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
16174 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16175 ++
16176 ++ if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
16177 ++ !inet_csk(sk)->icsk_retransmits) {
16178 ++ ca->mptcp_loss1 = ca->mptcp_loss2;
16179 ++ ca->mptcp_loss2 = ca->mptcp_loss3;
16180 ++ }
16181 ++ }
16182 ++}
16183 ++
16184 ++/* main algorithm */
16185 ++static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
16186 ++{
16187 ++ struct tcp_sock *tp = tcp_sk(sk);
16188 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16189 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16190 ++
16191 ++ u64 inc_num, inc_den, rate, cwnd_scaled;
16192 ++
16193 ++ if (!mptcp(tp)) {
16194 ++ tcp_reno_cong_avoid(sk, ack, acked);
16195 ++ return;
16196 ++ }
16197 ++
16198 ++ ca->mptcp_loss3 = tp->snd_una;
16199 ++
16200 ++ if (!tcp_is_cwnd_limited(sk))
16201 ++ return;
16202 ++
16203 ++ /* slow start if it is in the safe area */
16204 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
16205 ++ tcp_slow_start(tp, acked);
16206 ++ return;
16207 ++ }
16208 ++
16209 ++ mptcp_get_epsilon(mpcb);
16210 ++ rate = mptcp_get_rate(mpcb, tp->srtt_us);
16211 ++ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
16212 ++ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
16213 ++
16214 ++	/* calculate the increase term; scaling is used to reduce the rounding effect */
16215 ++ if (ca->epsilon_num == -1) {
16216 ++ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
16217 ++ inc_num = rate - ca->epsilon_den *
16218 ++ cwnd_scaled * cwnd_scaled;
16219 ++ ca->mptcp_snd_cwnd_cnt -= div64_u64(
16220 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16221 ++ } else {
16222 ++ inc_num = ca->epsilon_den *
16223 ++ cwnd_scaled * cwnd_scaled - rate;
16224 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16225 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16226 ++ }
16227 ++ } else {
16228 ++ inc_num = ca->epsilon_num * rate +
16229 ++ ca->epsilon_den * cwnd_scaled * cwnd_scaled;
16230 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16231 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16232 ++ }
16233 ++
16234 ++
16235 ++ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
16236 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
16237 ++ tp->snd_cwnd++;
16238 ++ ca->mptcp_snd_cwnd_cnt = 0;
16239 ++ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
16240 ++ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
16241 ++ ca->mptcp_snd_cwnd_cnt = 0;
16242 ++ }
16243 ++}
16244 ++
16245 ++static struct tcp_congestion_ops mptcp_olia = {
16246 ++ .init = mptcp_olia_init,
16247 ++ .ssthresh = tcp_reno_ssthresh,
16248 ++ .cong_avoid = mptcp_olia_cong_avoid,
16249 ++ .set_state = mptcp_olia_set_state,
16250 ++ .owner = THIS_MODULE,
16251 ++ .name = "olia",
16252 ++};
16253 ++
16254 ++static int __init mptcp_olia_register(void)
16255 ++{
16256 ++ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
16257 ++ return tcp_register_congestion_control(&mptcp_olia);
16258 ++}
16259 ++
16260 ++static void __exit mptcp_olia_unregister(void)
16261 ++{
16262 ++ tcp_unregister_congestion_control(&mptcp_olia);
16263 ++}
16264 ++
16265 ++module_init(mptcp_olia_register);
16266 ++module_exit(mptcp_olia_unregister);
16267 ++
16268 ++MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
16269 ++MODULE_LICENSE("GPL");
16270 ++MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
16271 ++MODULE_VERSION("0.1");
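
Outside the fixed-point arithmetic, the congestion-avoidance step above amounts to the OLIA per-ACK increase: on subflow r the window grows by roughly (w_r/rtt_r^2) / (sum_p w_p/rtt_p)^2 + epsilon_r/w_r, where epsilon_r mirrors the epsilon_num/epsilon_den pair chosen in mptcp_get_epsilon(). A minimal user-space sketch in floating point (struct name, the simplified rate term and the sample numbers are illustrative assumptions, not values taken from this patch):

#include <stdio.h>

struct olia_subflow {
	double cwnd;	/* congestion window in segments */
	double rtt;	/* smoothed RTT in seconds */
	double epsilon;	/* +1/(n*|B\M|), -1/(n*|M|) or 0, cf. mptcp_get_epsilon() */
};

/* Per-ACK window increase on subflow r, in segments (assumed formula) */
static double olia_increase(const struct olia_subflow *sub, int n, int r)
{
	double sum = 0.0;
	int p;

	for (p = 0; p < n; p++)
		sum += sub[p].cwnd / sub[p].rtt;

	return (sub[r].cwnd / (sub[r].rtt * sub[r].rtt)) / (sum * sum) +
	       sub[r].epsilon / sub[r].cwnd;
}

int main(void)
{
	struct olia_subflow sub[2] = {
		{ .cwnd = 10.0, .rtt = 0.050, .epsilon = 0.0 },
		{ .cwnd = 4.0,  .rtt = 0.020, .epsilon = 0.5 },	/* best path, small cwnd */
	};
	int r;

	for (r = 0; r < 2; r++)
		printf("subflow %d: +%.4f segments per ACK\n",
		       r, olia_increase(sub, 2, r));
	return 0;
}

In the kernel code this increment is accumulated in mptcp_snd_cwnd_cnt in fixed point, and snd_cwnd only moves by one full segment once the accumulator crosses +/-((1 << scale) - 1).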
16272 +diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
16273 +new file mode 100644
16274 +index 000000000000..400ea254c078
16275 +--- /dev/null
16276 ++++ b/net/mptcp/mptcp_output.c
16277 +@@ -0,0 +1,1743 @@
16278 ++/*
16279 ++ * MPTCP implementation - Sending side
16280 ++ *
16281 ++ * Initial Design & Implementation:
16282 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
16283 ++ *
16284 ++ * Current Maintainer & Author:
16285 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
16286 ++ *
16287 ++ * Additional authors:
16288 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
16289 ++ * Gregory Detal <gregory.detal@×××××××××.be>
16290 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
16291 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
16292 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
16293 ++ * Andreas Ripke <ripke@××××××.eu>
16294 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
16295 ++ * Octavian Purdila <octavian.purdila@×××××.com>
16296 ++ * John Ronan <jronan@××××.org>
16297 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
16298 ++ * Brandon Heller <brandonh@××××××××.edu>
16299 ++ *
16300 ++ *
16301 ++ * This program is free software; you can redistribute it and/or
16302 ++ * modify it under the terms of the GNU General Public License
16303 ++ * as published by the Free Software Foundation; either version
16304 ++ * 2 of the License, or (at your option) any later version.
16305 ++ */
16306 ++
16307 ++#include <linux/kconfig.h>
16308 ++#include <linux/skbuff.h>
16309 ++#include <linux/tcp.h>
16310 ++
16311 ++#include <net/mptcp.h>
16312 ++#include <net/mptcp_v4.h>
16313 ++#include <net/mptcp_v6.h>
16314 ++#include <net/sock.h>
16315 ++
16316 ++static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN +
16317 ++ MPTCP_SUB_LEN_ACK_ALIGN +
16318 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
16319 ++
16320 ++static inline int mptcp_sub_len_remove_addr(u16 bitfield)
16321 ++{
16322 ++ unsigned int c;
16323 ++ for (c = 0; bitfield; c++)
16324 ++ bitfield &= bitfield - 1;
16325 ++ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
16326 ++}
16327 ++
16328 ++int mptcp_sub_len_remove_addr_align(u16 bitfield)
16329 ++{
16330 ++ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
16331 ++}
16332 ++EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
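
To make the length computation above concrete: each set bit in the bitfield is one address id carried in the REMOVE_ADDR option, counted with the classic clear-lowest-set-bit loop, and the result is padded to 32-bit TCP option alignment. A standalone sketch (the base length of 4 - kind, length, subtype plus the first id, per the RFC 6824 option layout - is an assumption here, not a value quoted from this patch):

#include <stdio.h>

#define SUB_LEN_REMOVE_ADDR	4		/* assumed: kind, len, subtype, first id */
#define ALIGN4(x)		(((x) + 3u) & ~3u)

static unsigned int remove_addr_len(unsigned short bitfield)
{
	unsigned int c;

	/* clear the lowest set bit until none remain (popcount) */
	for (c = 0; bitfield; c++)
		bitfield &= bitfield - 1;

	return SUB_LEN_REMOVE_ADDR + c - 1;
}

int main(void)
{
	unsigned short ids = 0x05;	/* address ids 0 and 2 to be removed */
	unsigned int len = remove_addr_len(ids);

	printf("option length %u, aligned to %u bytes\n", len, ALIGN4(len));
	return 0;
}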
16333 ++
16334 ++/* get the data-seq and end-data-seq and store them again in the
16335 ++ * tcp_skb_cb
16336 ++ */
16337 ++static int mptcp_reconstruct_mapping(struct sk_buff *skb)
16338 ++{
16339 ++ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss;
16340 ++ u32 *p32;
16341 ++ u16 *p16;
16342 ++
16343 ++ if (!mpdss->M)
16344 ++ return 1;
16345 ++
16346 ++ /* Move the pointer to the data-seq */
16347 ++ p32 = (u32 *)mpdss;
16348 ++ p32++;
16349 ++ if (mpdss->A) {
16350 ++ p32++;
16351 ++ if (mpdss->a)
16352 ++ p32++;
16353 ++ }
16354 ++
16355 ++ TCP_SKB_CB(skb)->seq = ntohl(*p32);
16356 ++
16357 ++ /* Get the data_len to calculate the end_data_seq */
16358 ++ p32++;
16359 ++ p32++;
16360 ++ p16 = (u16 *)p32;
16361 ++ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
16362 ++
16363 ++ return 0;
16364 ++}
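
The pointer walk above follows the DSS option layout of RFC 6824, Section 3.3: one header word, an optional data ACK (one or two words depending on the A/a flags), the data sequence number, the subflow sequence number, and finally the 16-bit data-level length. A type-level sketch of the common case handled here (4-octet data ACK and 4-octet data sequence number; the struct and field names are illustrative, not taken from the patch):

#include <stdint.h>

struct dss_words_sketch {
	uint32_t header;	/* kind, length, subtype and the F|m|M|a|A flags */
	uint32_t data_ack;	/* present when A is set; one more word when a is set */
	uint32_t data_seq;	/* copied into TCP_SKB_CB(skb)->seq */
	uint32_t subflow_seq;	/* stepped over by the parser */
	uint16_t data_len;	/* added to data_seq to recover end_seq */
	uint16_t checksum;	/* only present when DSS checksums are in use */
};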
16365 ++
16366 ++static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb)
16367 ++{
16368 ++ struct sk_buff *skb_it;
16369 ++
16370 ++ skb_it = tcp_write_queue_head(meta_sk);
16371 ++
16372 ++ tcp_for_write_queue_from(skb_it, meta_sk) {
16373 ++ if (skb_it == tcp_send_head(meta_sk))
16374 ++ break;
16375 ++
16376 ++ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
16377 ++ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
16378 ++ break;
16379 ++ }
16380 ++ }
16381 ++}
16382 ++
16383 ++/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
16384 ++ * coming from the meta-retransmit-timer
16385 ++ */
16386 ++static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
16387 ++ struct sock *sk, int clone_it)
16388 ++{
16389 ++ struct sk_buff *skb, *skb1;
16390 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16391 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16392 ++ u32 seq, end_seq;
16393 ++
16394 ++ if (clone_it) {
16395 ++ /* pskb_copy is necessary here, because the TCP/IP-headers
16396 ++ * will be changed when it's going to be reinjected on another
16397 ++ * subflow.
16398 ++ */
16399 ++ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC);
16400 ++ } else {
16401 ++ __skb_unlink(orig_skb, &sk->sk_write_queue);
16402 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
16403 ++ sk->sk_wmem_queued -= orig_skb->truesize;
16404 ++ sk_mem_uncharge(sk, orig_skb->truesize);
16405 ++ skb = orig_skb;
16406 ++ }
16407 ++ if (unlikely(!skb))
16408 ++ return;
16409 ++
16410 ++ if (sk && mptcp_reconstruct_mapping(skb)) {
16411 ++ __kfree_skb(skb);
16412 ++ return;
16413 ++ }
16414 ++
16415 ++ skb->sk = meta_sk;
16416 ++
16417 ++	/* If it has already reached the destination, we don't have to reinject it */
16418 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16419 ++ __kfree_skb(skb);
16420 ++ return;
16421 ++ }
16422 ++
16423 ++ /* Only reinject segments that are fully covered by the mapping */
16424 ++ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
16425 ++ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
16426 ++ u32 seq = TCP_SKB_CB(skb)->seq;
16427 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
16428 ++
16429 ++ __kfree_skb(skb);
16430 ++
16431 ++ /* Ok, now we have to look for the full mapping in the meta
16432 ++ * send-queue :S
16433 ++ */
16434 ++ tcp_for_write_queue(skb, meta_sk) {
16435 ++ /* Not yet at the mapping? */
16436 ++ if (before(TCP_SKB_CB(skb)->seq, seq))
16437 ++ continue;
16438 ++ /* We have passed by the mapping */
16439 ++ if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
16440 ++ return;
16441 ++
16442 ++ __mptcp_reinject_data(skb, meta_sk, NULL, 1);
16443 ++ }
16444 ++ return;
16445 ++ }
16446 ++
16447 ++ /* Segment goes back to the MPTCP-layer. So, we need to zero the
16448 ++ * path_mask/dss.
16449 ++ */
16450 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16451 ++
16452 ++ /* We need to find out the path-mask from the meta-write-queue
16453 ++ * to properly select a subflow.
16454 ++ */
16455 ++ mptcp_find_and_set_pathmask(meta_sk, skb);
16456 ++
16457 ++ /* If it's empty, just add */
16458 ++ if (skb_queue_empty(&mpcb->reinject_queue)) {
16459 ++ skb_queue_head(&mpcb->reinject_queue, skb);
16460 ++ return;
16461 ++ }
16462 ++
16463 ++	/* Find the place to insert skb - or we may even 'drop' it, as the
16464 ++	 * data is already covered by other skbs in the reinject-queue.
16465 ++ *
16466 ++ * This is inspired by code from tcp_data_queue.
16467 ++ */
16468 ++
16469 ++ skb1 = skb_peek_tail(&mpcb->reinject_queue);
16470 ++ seq = TCP_SKB_CB(skb)->seq;
16471 ++ while (1) {
16472 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
16473 ++ break;
16474 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
16475 ++ skb1 = NULL;
16476 ++ break;
16477 ++ }
16478 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16479 ++ }
16480 ++
16481 ++	/* Does skb overlap the previous one? */
16482 ++ end_seq = TCP_SKB_CB(skb)->end_seq;
16483 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
16484 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
16485 ++ /* All the bits are present. Don't reinject */
16486 ++ __kfree_skb(skb);
16487 ++ return;
16488 ++ }
16489 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
16490 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
16491 ++ skb1 = NULL;
16492 ++ else
16493 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16494 ++ }
16495 ++ }
16496 ++ if (!skb1)
16497 ++ __skb_queue_head(&mpcb->reinject_queue, skb);
16498 ++ else
16499 ++ __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
16500 ++
16501 ++ /* And clean segments covered by new one as whole. */
16502 ++ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
16503 ++ skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
16504 ++
16505 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
16506 ++ break;
16507 ++
16508 ++ __skb_unlink(skb1, &mpcb->reinject_queue);
16509 ++ __kfree_skb(skb1);
16510 ++ }
16511 ++ return;
16512 ++}
16513 ++
16514 ++/* Inserts data into the reinject queue */
16515 ++void mptcp_reinject_data(struct sock *sk, int clone_it)
16516 ++{
16517 ++ struct sk_buff *skb_it, *tmp;
16518 ++ struct tcp_sock *tp = tcp_sk(sk);
16519 ++ struct sock *meta_sk = tp->meta_sk;
16520 ++
16521 ++ /* It has already been closed - there is really no point in reinjecting */
16522 ++ if (meta_sk->sk_state == TCP_CLOSE)
16523 ++ return;
16524 ++
16525 ++ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
16526 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
16527 ++		/* Subflow SYNs and FINs are not reinjected.
16528 ++		 *
16529 ++		 * Neither are empty subflow-FINs carrying a data-fin;
16530 ++		 * those are reinjected below (without the subflow-FIN flag).
16531 ++		 */
16532 ++ if (tcb->tcp_flags & TCPHDR_SYN ||
16533 ++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
16534 ++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
16535 ++ continue;
16536 ++
16537 ++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
16538 ++ }
16539 ++
16540 ++ skb_it = tcp_write_queue_tail(meta_sk);
16541 ++ /* If sk has sent the empty data-fin, we have to reinject it too. */
16542 ++ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
16543 ++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
16544 ++ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
16545 ++ }
16546 ++
16547 ++ mptcp_push_pending_frames(meta_sk);
16548 ++
16549 ++ tp->pf = 1;
16550 ++}
16551 ++EXPORT_SYMBOL(mptcp_reinject_data);
16552 ++
16553 ++static void mptcp_combine_dfin(const struct sk_buff *skb, const struct sock *meta_sk,
16554 ++ struct sock *subsk)
16555 ++{
16556 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16557 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16558 ++ struct sock *sk_it;
16559 ++ int all_empty = 1, all_acked;
16560 ++
16561 ++ /* In infinite mapping we always try to combine */
16562 ++ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
16563 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16564 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16565 ++ return;
16566 ++ }
16567 ++
16568 ++	/* Don't combine if the peer didn't combine - otherwise we end up in
16569 ++	 * TIME_WAIT, even if our app is smart enough to avoid it.
16570 ++ */
16571 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
16572 ++ if (!mpcb->dfin_combined)
16573 ++ return;
16574 ++ }
16575 ++
16576 ++ /* If no other subflow has data to send, we can combine */
16577 ++ mptcp_for_each_sk(mpcb, sk_it) {
16578 ++ if (!mptcp_sk_can_send(sk_it))
16579 ++ continue;
16580 ++
16581 ++ if (!tcp_write_queue_empty(sk_it))
16582 ++ all_empty = 0;
16583 ++ }
16584 ++
16585 ++ /* If all data has been DATA_ACKed, we can combine.
16586 ++ * -1, because the data_fin consumed one byte
16587 ++ */
16588 ++ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
16589 ++
16590 ++ if ((all_empty || all_acked) && tcp_close_state(subsk)) {
16591 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16592 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16593 ++ }
16594 ++}
16595 ++
16596 ++static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
16597 ++ __be32 *ptr)
16598 ++{
16599 ++ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16600 ++ __be32 *start = ptr;
16601 ++ __u16 data_len;
16602 ++
16603 ++ *ptr++ = htonl(tcb->seq); /* data_seq */
16604 ++
16605 ++ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
16606 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
16607 ++ *ptr++ = 0; /* subseq */
16608 ++ else
16609 ++ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
16610 ++
16611 ++ if (tcb->mptcp_flags & MPTCPHDR_INF)
16612 ++ data_len = 0;
16613 ++ else
16614 ++ data_len = tcb->end_seq - tcb->seq;
16615 ++
16616 ++ if (tp->mpcb->dss_csum && data_len) {
16617 ++ __be16 *p16 = (__be16 *)ptr;
16618 ++ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
16619 ++ __wsum csum;
16620 ++
16621 ++ *ptr = htonl(((data_len) << 16) |
16622 ++ (TCPOPT_EOL << 8) |
16623 ++ (TCPOPT_EOL));
16624 ++ csum = csum_partial(ptr - 2, 12, skb->csum);
16625 ++ p16++;
16626 ++ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
16627 ++ } else {
16628 ++ *ptr++ = htonl(((data_len) << 16) |
16629 ++ (TCPOPT_NOP << 8) |
16630 ++ (TCPOPT_NOP));
16631 ++ }
16632 ++
16633 ++ return ptr - start;
16634 ++}
16635 ++
16636 ++static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
16637 ++ __be32 *ptr)
16638 ++{
16639 ++ struct mp_dss *mdss = (struct mp_dss *)ptr;
16640 ++ __be32 *start = ptr;
16641 ++
16642 ++ mdss->kind = TCPOPT_MPTCP;
16643 ++ mdss->sub = MPTCP_SUB_DSS;
16644 ++ mdss->rsv1 = 0;
16645 ++ mdss->rsv2 = 0;
16646 ++ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0;
16647 ++ mdss->m = 0;
16648 ++ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0;
16649 ++ mdss->a = 0;
16650 ++ mdss->A = 1;
16651 ++ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
16652 ++ ptr++;
16653 ++
16654 ++ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16655 ++
16656 ++ return ptr - start;
16657 ++}
16658 ++
16659 ++/* RFC6824 states that once a particular subflow mapping has been sent
16660 ++ * out it must never be changed. However, packets may be split while
16661 ++ * they are in the retransmission queue (due to SACK or ACKs) and that
16662 ++ * arguably means that we would change the mapping (e.g. it splits it,
16663 ++ * or sends out a subset of the initial mapping).
16664 ++ *
16665 ++ * Furthermore, the skb checksum is not always preserved across splits
16666 ++ * (e.g. mptcp_fragment) which would mean that we need to recompute
16667 ++ * the DSS checksum in this case.
16668 ++ *
16669 ++ * To avoid this we save the initial DSS mapping which allows us to
16670 ++ * send the same DSS mapping even for fragmented retransmits.
16671 ++ */
16672 ++static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb)
16673 ++{
16674 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16675 ++ __be32 *ptr = (__be32 *)tcb->dss;
16676 ++
16677 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
16678 ++
16679 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
16680 ++ ptr += mptcp_write_dss_mapping(tp, skb, ptr);
16681 ++}
16682 ++
16683 ++/* Write the saved DSS mapping to the header */
16684 ++static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
16685 ++ __be32 *ptr)
16686 ++{
16687 ++ __be32 *start = ptr;
16688 ++
16689 ++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
16690 ++
16691 ++ /* update the data_ack */
16692 ++ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16693 ++
16694 ++ /* dss is in a union with inet_skb_parm and
16695 ++ * the IP layer expects zeroed IPCB fields.
16696 ++ */
16697 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16698 ++
16699 ++ return mptcp_dss_len/sizeof(*ptr);
16700 ++}
16701 ++
16702 ++static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
16703 ++{
16704 ++ struct tcp_sock *tp = tcp_sk(sk);
16705 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
16706 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16707 ++ struct tcp_skb_cb *tcb;
16708 ++ struct sk_buff *subskb = NULL;
16709 ++
16710 ++ if (!reinject)
16711 ++ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
16712 ++ MPTCPHDR_SEQ64_INDEX : 0);
16713 ++
16714 ++ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC);
16715 ++ if (!subskb)
16716 ++ return false;
16717 ++
16718 ++	/* At the subflow-level we need to call tcp_init_tso_segs again. We
16719 ++	 * force this by setting gso_segs to 0. It has been set to 1 prior to
16720 ++ * the call to mptcp_skb_entail.
16721 ++ */
16722 ++ skb_shinfo(subskb)->gso_segs = 0;
16723 ++
16724 ++ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
16725 ++
16726 ++ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
16727 ++ skb->ip_summed == CHECKSUM_PARTIAL) {
16728 ++ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
16729 ++ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
16730 ++ }
16731 ++
16732 ++ tcb = TCP_SKB_CB(subskb);
16733 ++
16734 ++ if (tp->mpcb->send_infinite_mapping &&
16735 ++ !tp->mpcb->infinite_mapping_snd &&
16736 ++ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
16737 ++ tp->mptcp->fully_established = 1;
16738 ++ tp->mpcb->infinite_mapping_snd = 1;
16739 ++ tp->mptcp->infinite_cutoff_seq = tp->write_seq;
16740 ++ tcb->mptcp_flags |= MPTCPHDR_INF;
16741 ++ }
16742 ++
16743 ++ if (mptcp_is_data_fin(subskb))
16744 ++ mptcp_combine_dfin(subskb, meta_sk, sk);
16745 ++
16746 ++ mptcp_save_dss_data_seq(tp, subskb);
16747 ++
16748 ++ tcb->seq = tp->write_seq;
16749 ++ tcb->sacked = 0; /* reset the sacked field: from the point of view
16750 ++ * of this subflow, we are sending a brand new
16751 ++ * segment
16752 ++ */
16753 ++ /* Take into account seg len */
16754 ++ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
16755 ++ tcb->end_seq = tp->write_seq;
16756 ++
16757 ++	/* If it's a non-payload DATA_FIN (also no subflow-fin), the
16758 ++	 * segment is not part of the subflow but exists only at the meta-level.
16759 ++ */
16760 ++ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
16761 ++ tcp_add_write_queue_tail(sk, subskb);
16762 ++ sk->sk_wmem_queued += subskb->truesize;
16763 ++ sk_mem_charge(sk, subskb->truesize);
16764 ++ } else {
16765 ++ int err;
16766 ++
16767 ++ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as
16768 ++ * skb->len = 0 will force tso_segs to 1.
16769 ++ */
16770 ++ tcp_init_tso_segs(sk, subskb, 1);
16771 ++		/* Empty data-fins are sent immediately on the subflow */
16772 ++ TCP_SKB_CB(subskb)->when = tcp_time_stamp;
16773 ++ err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC);
16774 ++
16775 ++ /* It has not been queued, we can free it now. */
16776 ++ kfree_skb(subskb);
16777 ++
16778 ++ if (err)
16779 ++ return false;
16780 ++ }
16781 ++
16782 ++ if (!tp->mptcp->fully_established) {
16783 ++ tp->mptcp->second_packet = 1;
16784 ++ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
16785 ++ }
16786 ++
16787 ++ return true;
16788 ++}
16789 ++
16790 ++/* Fragment an skb and update the mptcp meta-data. Due to reinject, we
16791 ++ * might need to undo some operations done by tcp_fragment.
16792 ++ */
16793 ++static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len,
16794 ++ gfp_t gfp, int reinject)
16795 ++{
16796 ++ int ret, diff, old_factor;
16797 ++ struct sk_buff *buff;
16798 ++ u8 flags;
16799 ++
16800 ++ if (skb_headlen(skb) < len)
16801 ++ diff = skb->len - len;
16802 ++ else
16803 ++ diff = skb->data_len;
16804 ++ old_factor = tcp_skb_pcount(skb);
16805 ++
16806 ++ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb.
16807 ++ * At the MPTCP-level we do not care about the absolute value. All we
16808 ++ * care about is that it is set to 1 for accurate packets_out
16809 ++ * accounting.
16810 ++ */
16811 ++ ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp);
16812 ++ if (ret)
16813 ++ return ret;
16814 ++
16815 ++ buff = skb->next;
16816 ++
16817 ++ flags = TCP_SKB_CB(skb)->mptcp_flags;
16818 ++ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
16819 ++ TCP_SKB_CB(buff)->mptcp_flags = flags;
16820 ++ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask;
16821 ++
16822 ++ /* If reinject == 1, the buff will be added to the reinject
16823 ++ * queue, which is currently not part of memory accounting. So
16824 ++ * undo the changes done by tcp_fragment and update the
16825 ++ * reinject queue. Also, undo changes to the packet counters.
16826 ++ */
16827 ++ if (reinject == 1) {
16828 ++ int undo = buff->truesize - diff;
16829 ++ meta_sk->sk_wmem_queued -= undo;
16830 ++ sk_mem_uncharge(meta_sk, undo);
16831 ++
16832 ++ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++;
16833 ++ meta_sk->sk_write_queue.qlen--;
16834 ++
16835 ++ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
16836 ++ undo = old_factor - tcp_skb_pcount(skb) -
16837 ++ tcp_skb_pcount(buff);
16838 ++ if (undo)
16839 ++ tcp_adjust_pcount(meta_sk, skb, -undo);
16840 ++ }
16841 ++ }
16842 ++
16843 ++ return 0;
16844 ++}
16845 ++
16846 ++/* Inspired by tcp_write_wakeup */
16847 ++int mptcp_write_wakeup(struct sock *meta_sk)
16848 ++{
16849 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16850 ++ struct sk_buff *skb;
16851 ++ struct sock *sk_it;
16852 ++ int ans = 0;
16853 ++
16854 ++ if (meta_sk->sk_state == TCP_CLOSE)
16855 ++ return -1;
16856 ++
16857 ++ skb = tcp_send_head(meta_sk);
16858 ++ if (skb &&
16859 ++ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
16860 ++ unsigned int mss;
16861 ++ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
16862 ++ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true);
16863 ++ struct tcp_sock *subtp;
16864 ++ if (!subsk)
16865 ++ goto window_probe;
16866 ++ subtp = tcp_sk(subsk);
16867 ++ mss = tcp_current_mss(subsk);
16868 ++
16869 ++ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq,
16870 ++ tcp_wnd_end(subtp) - subtp->write_seq);
16871 ++
16872 ++ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
16873 ++ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
16874 ++
16875 ++		/* We are probing the opening of a window,
16876 ++		 * but the window size is != 0 - this must have been
16877 ++		 * a result of SWS avoidance (sender).
16878 ++ */
16879 ++ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
16880 ++ skb->len > mss) {
16881 ++ seg_size = min(seg_size, mss);
16882 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16883 ++ if (mptcp_fragment(meta_sk, skb, seg_size,
16884 ++ GFP_ATOMIC, 0))
16885 ++ return -1;
16886 ++ } else if (!tcp_skb_pcount(skb)) {
16887 ++ /* see mptcp_write_xmit on why we use UINT_MAX */
16888 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16889 ++ }
16890 ++
16891 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16892 ++ if (!mptcp_skb_entail(subsk, skb, 0))
16893 ++ return -1;
16894 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
16895 ++
16896 ++ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
16897 ++ TCP_SKB_CB(skb)->seq);
16898 ++ tcp_event_new_data_sent(meta_sk, skb);
16899 ++
16900 ++ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH);
16901 ++
16902 ++ return 0;
16903 ++ } else {
16904 ++window_probe:
16905 ++ if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
16906 ++ meta_tp->snd_una + 0xFFFF)) {
16907 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16908 ++ if (mptcp_sk_can_send_ack(sk_it))
16909 ++ tcp_xmit_probe_skb(sk_it, 1);
16910 ++ }
16911 ++ }
16912 ++
16913 ++ /* At least one of the tcp_xmit_probe_skb's has to succeed */
16914 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16915 ++ int ret;
16916 ++
16917 ++ if (!mptcp_sk_can_send_ack(sk_it))
16918 ++ continue;
16919 ++
16920 ++ ret = tcp_xmit_probe_skb(sk_it, 0);
16921 ++ if (unlikely(ret > 0))
16922 ++ ans = ret;
16923 ++ }
16924 ++ return ans;
16925 ++ }
16926 ++}
16927 ++
16928 ++bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
16929 ++ int push_one, gfp_t gfp)
16930 ++{
16931 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
16932 ++ struct sock *subsk = NULL;
16933 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16934 ++ struct sk_buff *skb;
16935 ++ unsigned int sent_pkts;
16936 ++ int reinject = 0;
16937 ++ unsigned int sublimit;
16938 ++
16939 ++ sent_pkts = 0;
16940 ++
16941 ++ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk,
16942 ++ &sublimit))) {
16943 ++ unsigned int limit;
16944 ++
16945 ++ subtp = tcp_sk(subsk);
16946 ++ mss_now = tcp_current_mss(subsk);
16947 ++
16948 ++ if (reinject == 1) {
16949 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16950 ++ /* Segment already reached the peer, take the next one */
16951 ++ __skb_unlink(skb, &mpcb->reinject_queue);
16952 ++ __kfree_skb(skb);
16953 ++ continue;
16954 ++ }
16955 ++ }
16956 ++
16957 ++ /* If the segment was cloned (e.g. a meta retransmission),
16958 ++ * the header must be expanded/copied so that there is no
16959 ++ * corruption of TSO information.
16960 ++ */
16961 ++ if (skb_unclone(skb, GFP_ATOMIC))
16962 ++ break;
16963 ++
16964 ++ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now)))
16965 ++ break;
16966 ++
16967 ++ /* Force tso_segs to 1 by using UINT_MAX.
16968 ++ * We actually don't care about the exact number of segments
16969 ++		 * emitted on the subflow. We just need to set tso_segs, because
16970 ++ * we still need an accurate packets_out count in
16971 ++ * tcp_event_new_data_sent.
16972 ++ */
16973 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16974 ++
16975 ++		/* Check for nagle, regardless of tso_segs. If the segment is
16976 ++ * actually larger than mss_now (TSO segment), then
16977 ++ * tcp_nagle_check will have partial == false and always trigger
16978 ++ * the transmission.
16979 ++ * tcp_write_xmit has a TSO-level nagle check which is not
16980 ++ * subject to the MPTCP-level. It is based on the properties of
16981 ++ * the subflow, not the MPTCP-level.
16982 ++ */
16983 ++ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
16984 ++ (tcp_skb_is_last(meta_sk, skb) ?
16985 ++ nonagle : TCP_NAGLE_PUSH))))
16986 ++ break;
16987 ++
16988 ++ limit = mss_now;
16989 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
16990 ++ * tcp_write_xmit. Otherwise split-point would return 0.
16991 ++ */
16992 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
16993 ++ /* We limit the size of the skb so that it fits into the
16994 ++ * window. Call tcp_mss_split_point to avoid duplicating
16995 ++ * code.
16996 ++ * We really only care about fitting the skb into the
16997 ++ * window. That's why we use UINT_MAX. If the skb does
16998 ++ * not fit into the cwnd_quota or the NIC's max-segs
16999 ++ * limitation, it will be split by the subflow's
17000 ++ * tcp_write_xmit which does the appropriate call to
17001 ++ * tcp_mss_split_point.
17002 ++ */
17003 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
17004 ++ UINT_MAX / mss_now,
17005 ++ nonagle);
17006 ++
17007 ++ if (sublimit)
17008 ++ limit = min(limit, sublimit);
17009 ++
17010 ++ if (skb->len > limit &&
17011 ++ unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject)))
17012 ++ break;
17013 ++
17014 ++ if (!mptcp_skb_entail(subsk, skb, reinject))
17015 ++ break;
17016 ++ /* Nagle is handled at the MPTCP-layer, so
17017 ++ * always push on the subflow
17018 ++ */
17019 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17020 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17021 ++
17022 ++ if (!reinject) {
17023 ++ mptcp_check_sndseq_wrap(meta_tp,
17024 ++ TCP_SKB_CB(skb)->end_seq -
17025 ++ TCP_SKB_CB(skb)->seq);
17026 ++ tcp_event_new_data_sent(meta_sk, skb);
17027 ++ }
17028 ++
17029 ++ tcp_minshall_update(meta_tp, mss_now, skb);
17030 ++ sent_pkts += tcp_skb_pcount(skb);
17031 ++
17032 ++ if (reinject > 0) {
17033 ++ __skb_unlink(skb, &mpcb->reinject_queue);
17034 ++ kfree_skb(skb);
17035 ++ }
17036 ++
17037 ++ if (push_one)
17038 ++ break;
17039 ++ }
17040 ++
17041 ++ return !meta_tp->packets_out && tcp_send_head(meta_sk);
17042 ++}
17043 ++
17044 ++void mptcp_write_space(struct sock *sk)
17045 ++{
17046 ++ mptcp_push_pending_frames(mptcp_meta_sk(sk));
17047 ++}
17048 ++
17049 ++u32 __mptcp_select_window(struct sock *sk)
17050 ++{
17051 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17052 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
17053 ++ int mss, free_space, full_space, window;
17054 ++
17055 ++ /* MSS for the peer's data. Previous versions used mss_clamp
17056 ++ * here. I don't know if the value based on our guesses
17057 ++ * of peer's MSS is better for the performance. It's more correct
17058 ++ * but may be worse for the performance because of rcv_mss
17059 ++ * fluctuations. --SAW 1998/11/1
17060 ++ */
17061 ++ mss = icsk->icsk_ack.rcv_mss;
17062 ++ free_space = tcp_space(sk);
17063 ++ full_space = min_t(int, meta_tp->window_clamp,
17064 ++ tcp_full_space(sk));
17065 ++
17066 ++ if (mss > full_space)
17067 ++ mss = full_space;
17068 ++
17069 ++ if (free_space < (full_space >> 1)) {
17070 ++ icsk->icsk_ack.quick = 0;
17071 ++
17072 ++ if (tcp_memory_pressure)
17073 ++ /* TODO this has to be adapted when we support different
17074 ++ * MSS's among the subflows.
17075 ++ */
17076 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
17077 ++ 4U * meta_tp->advmss);
17078 ++
17079 ++ if (free_space < mss)
17080 ++ return 0;
17081 ++ }
17082 ++
17083 ++ if (free_space > meta_tp->rcv_ssthresh)
17084 ++ free_space = meta_tp->rcv_ssthresh;
17085 ++
17086 ++ /* Don't do rounding if we are using window scaling, since the
17087 ++ * scaled window will not line up with the MSS boundary anyway.
17088 ++ */
17089 ++ window = meta_tp->rcv_wnd;
17090 ++ if (tp->rx_opt.rcv_wscale) {
17091 ++ window = free_space;
17092 ++
17093 ++ /* Advertise enough space so that it won't get scaled away.
17094 ++		 * Important case: prevent zero window announcement if
17095 ++ * 1<<rcv_wscale > mss.
17096 ++ */
17097 ++ if (((window >> tp->rx_opt.rcv_wscale) << tp->
17098 ++ rx_opt.rcv_wscale) != window)
17099 ++ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
17100 ++ << tp->rx_opt.rcv_wscale);
17101 ++ } else {
17102 ++ /* Get the largest window that is a nice multiple of mss.
17103 ++ * Window clamp already applied above.
17104 ++ * If our current window offering is within 1 mss of the
17105 ++ * free space we just keep it. This prevents the divide
17106 ++ * and multiply from happening most of the time.
17107 ++ * We also don't do any window rounding when the free space
17108 ++ * is too small.
17109 ++ */
17110 ++ if (window <= free_space - mss || window > free_space)
17111 ++ window = (free_space / mss) * mss;
17112 ++ else if (mss == full_space &&
17113 ++ free_space > window + (full_space >> 1))
17114 ++ window = free_space;
17115 ++ }
17116 ++
17117 ++ return window;
17118 ++}
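
The rounding in the scaled-window branch above is easy to check by hand: with rcv_wscale = 7 anything below 128 would be advertised as zero, so a non-aligned free space is rounded up to the next multiple of 1 << rcv_wscale. A small sketch of just that rounding step (the function name and sample values are made up for illustration):

#include <stdio.h>

static unsigned int round_up_to_wscale(unsigned int window, unsigned int wscale)
{
	/* mirror the check above: round up when the low bits
	 * would otherwise be scaled away
	 */
	if (((window >> wscale) << wscale) != window)
		window = ((window >> wscale) + 1) << wscale;
	return window;
}

int main(void)
{
	printf("%u\n", round_up_to_wscale(100, 7));	/* 128 instead of 0 */
	printf("%u\n", round_up_to_wscale(256, 7));	/* already aligned: 256 */
	return 0;
}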
17119 ++
17120 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
17121 ++ unsigned *remaining)
17122 ++{
17123 ++ const struct tcp_sock *tp = tcp_sk(sk);
17124 ++
17125 ++ opts->options |= OPTION_MPTCP;
17126 ++ if (is_master_tp(tp)) {
17127 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
17128 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17129 ++ opts->mp_capable.sender_key = tp->mptcp_loc_key;
17130 ++ opts->dss_csum = !!sysctl_mptcp_checksum;
17131 ++ } else {
17132 ++ const struct mptcp_cb *mpcb = tp->mpcb;
17133 ++
17134 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
17135 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
17136 ++ opts->mp_join_syns.token = mpcb->mptcp_rem_token;
17137 ++ opts->mp_join_syns.low_prio = tp->mptcp->low_prio;
17138 ++ opts->addr_id = tp->mptcp->loc_id;
17139 ++ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
17140 ++ }
17141 ++}
17142 ++
17143 ++void mptcp_synack_options(struct request_sock *req,
17144 ++ struct tcp_out_options *opts, unsigned *remaining)
17145 ++{
17146 ++ struct mptcp_request_sock *mtreq;
17147 ++ mtreq = mptcp_rsk(req);
17148 ++
17149 ++ opts->options |= OPTION_MPTCP;
17150 ++ /* MPCB not yet set - thus it's a new MPTCP-session */
17151 ++ if (!mtreq->is_sub) {
17152 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
17153 ++ opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
17154 ++ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
17155 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17156 ++ } else {
17157 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
17158 ++ opts->mp_join_syns.sender_truncated_mac =
17159 ++ mtreq->mptcp_hash_tmac;
17160 ++ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
17161 ++ opts->mp_join_syns.low_prio = mtreq->low_prio;
17162 ++ opts->addr_id = mtreq->loc_id;
17163 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
17164 ++ }
17165 ++}
17166 ++
17167 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
17168 ++ struct tcp_out_options *opts, unsigned *size)
17169 ++{
17170 ++ struct tcp_sock *tp = tcp_sk(sk);
17171 ++ struct mptcp_cb *mpcb = tp->mpcb;
17172 ++ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
17173 ++
17174 ++ /* We are coming from tcp_current_mss with the meta_sk as an argument.
17175 ++ * It does not make sense to check for the options, because when the
17176 ++ * segment gets sent, another subflow will be chosen.
17177 ++ */
17178 ++ if (!skb && is_meta_sk(sk))
17179 ++ return;
17180 ++
17181 ++ /* In fallback mp_fail-mode, we have to repeat it until the fallback
17182 ++ * has been done by the sender
17183 ++ */
17184 ++ if (unlikely(tp->mptcp->send_mp_fail)) {
17185 ++ opts->options |= OPTION_MPTCP;
17186 ++ opts->mptcp_options |= OPTION_MP_FAIL;
17187 ++ *size += MPTCP_SUB_LEN_FAIL;
17188 ++ return;
17189 ++ }
17190 ++
17191 ++ if (unlikely(tp->send_mp_fclose)) {
17192 ++ opts->options |= OPTION_MPTCP;
17193 ++ opts->mptcp_options |= OPTION_MP_FCLOSE;
17194 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17195 ++ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
17196 ++ return;
17197 ++ }
17198 ++
17199 ++ /* 1. If we are the sender of the infinite-mapping, we need the
17200 ++ * MPTCPHDR_INF-flag, because a retransmission of the
17201 ++	 *    infinite-announcement still needs the mptcp-option.
17202 ++ *
17203 ++ * We need infinite_cutoff_seq, because retransmissions from before
17204 ++ * the infinite-cutoff-moment still need the MPTCP-signalling to stay
17205 ++ * consistent.
17206 ++ *
17207 ++ * 2. If we are the receiver of the infinite-mapping, we always skip
17208 ++ * mptcp-options, because acknowledgments from before the
17209 ++ * infinite-mapping point have already been sent out.
17210 ++ *
17211 ++ * I know, the whole infinite-mapping stuff is ugly...
17212 ++ *
17213 ++ * TODO: Handle wrapped data-sequence numbers
17214 ++ * (even if it's very unlikely)
17215 ++ */
17216 ++ if (unlikely(mpcb->infinite_mapping_snd) &&
17217 ++ ((mpcb->send_infinite_mapping && tcb &&
17218 ++ mptcp_is_data_seq(skb) &&
17219 ++ !(tcb->mptcp_flags & MPTCPHDR_INF) &&
17220 ++ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
17221 ++ !mpcb->send_infinite_mapping))
17222 ++ return;
17223 ++
17224 ++ if (unlikely(tp->mptcp->include_mpc)) {
17225 ++ opts->options |= OPTION_MPTCP;
17226 ++ opts->mptcp_options |= OPTION_MP_CAPABLE |
17227 ++ OPTION_TYPE_ACK;
17228 ++ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
17229 ++ opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
17230 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17231 ++ opts->dss_csum = mpcb->dss_csum;
17232 ++
17233 ++ if (skb)
17234 ++ tp->mptcp->include_mpc = 0;
17235 ++ }
17236 ++ if (unlikely(tp->mptcp->pre_established)) {
17237 ++ opts->options |= OPTION_MPTCP;
17238 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
17239 ++ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
17240 ++ }
17241 ++
17242 ++ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
17243 ++ opts->options |= OPTION_MPTCP;
17244 ++ opts->mptcp_options |= OPTION_DATA_ACK;
17245 ++ /* If !skb, we come from tcp_current_mss and thus we always
17246 ++ * assume that the DSS-option will be set for the data-packet.
17247 ++ */
17248 ++ if (skb && !mptcp_is_data_seq(skb)) {
17249 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN;
17250 ++ } else {
17251 ++			/* It doesn't matter whether the csum is included or not:
17252 ++			 * it will be either 10 or 12, and thus aligned = 12.
17253 ++ */
17254 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN +
17255 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
17256 ++ }
17257 ++
17258 ++ *size += MPTCP_SUB_LEN_DSS_ALIGN;
17259 ++ }
17260 ++
17261 ++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal)
17262 ++ mpcb->pm_ops->addr_signal(sk, size, opts, skb);
17263 ++
17264 ++ if (unlikely(tp->mptcp->send_mp_prio) &&
17265 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
17266 ++ opts->options |= OPTION_MPTCP;
17267 ++ opts->mptcp_options |= OPTION_MP_PRIO;
17268 ++ if (skb)
17269 ++ tp->mptcp->send_mp_prio = 0;
17270 ++ *size += MPTCP_SUB_LEN_PRIO_ALIGN;
17271 ++ }
17272 ++
17273 ++ return;
17274 ++}
17275 ++
17276 ++u16 mptcp_select_window(struct sock *sk)
17277 ++{
17278 ++ u16 new_win = tcp_select_window(sk);
17279 ++ struct tcp_sock *tp = tcp_sk(sk);
17280 ++ struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
17281 ++
17282 ++ meta_tp->rcv_wnd = tp->rcv_wnd;
17283 ++ meta_tp->rcv_wup = meta_tp->rcv_nxt;
17284 ++
17285 ++ return new_win;
17286 ++}
17287 ++
17288 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
17289 ++ const struct tcp_out_options *opts,
17290 ++ struct sk_buff *skb)
17291 ++{
17292 ++ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
17293 ++ struct mp_capable *mpc = (struct mp_capable *)ptr;
17294 ++
17295 ++ mpc->kind = TCPOPT_MPTCP;
17296 ++
17297 ++ if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
17298 ++ (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
17299 ++ mpc->sender_key = opts->mp_capable.sender_key;
17300 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
17301 ++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
17302 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17303 ++ mpc->sender_key = opts->mp_capable.sender_key;
17304 ++ mpc->receiver_key = opts->mp_capable.receiver_key;
17305 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
17306 ++ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
17307 ++ }
17308 ++
17309 ++ mpc->sub = MPTCP_SUB_CAPABLE;
17310 ++ mpc->ver = 0;
17311 ++ mpc->a = opts->dss_csum;
17312 ++ mpc->b = 0;
17313 ++ mpc->rsv = 0;
17314 ++ mpc->h = 1;
17315 ++ }
17316 ++
17317 ++ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
17318 ++ struct mp_join *mpj = (struct mp_join *)ptr;
17319 ++
17320 ++ mpj->kind = TCPOPT_MPTCP;
17321 ++ mpj->sub = MPTCP_SUB_JOIN;
17322 ++ mpj->rsv = 0;
17323 ++
17324 ++ if (OPTION_TYPE_SYN & opts->mptcp_options) {
17325 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
17326 ++ mpj->u.syn.token = opts->mp_join_syns.token;
17327 ++ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
17328 ++ mpj->b = opts->mp_join_syns.low_prio;
17329 ++ mpj->addr_id = opts->addr_id;
17330 ++ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
17331 ++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
17332 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
17333 ++ mpj->u.synack.mac =
17334 ++ opts->mp_join_syns.sender_truncated_mac;
17335 ++ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
17336 ++ mpj->b = opts->mp_join_syns.low_prio;
17337 ++ mpj->addr_id = opts->addr_id;
17338 ++ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
17339 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17340 ++ mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
17341 ++ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */
17342 ++ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
17343 ++ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
17344 ++ }
17345 ++ }
17346 ++ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
17347 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
17348 ++
17349 ++ mpadd->kind = TCPOPT_MPTCP;
17350 ++ if (opts->add_addr_v4) {
17351 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
17352 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17353 ++ mpadd->ipver = 4;
17354 ++ mpadd->addr_id = opts->add_addr4.addr_id;
17355 ++ mpadd->u.v4.addr = opts->add_addr4.addr;
17356 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
17357 ++ } else if (opts->add_addr_v6) {
17358 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
17359 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17360 ++ mpadd->ipver = 6;
17361 ++ mpadd->addr_id = opts->add_addr6.addr_id;
17362 ++ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
17363 ++ sizeof(mpadd->u.v6.addr));
17364 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
17365 ++ }
17366 ++ }
17367 ++ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
17368 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
17369 ++ u8 *addrs_id;
17370 ++ int id, len, len_align;
17371 ++
17372 ++ len = mptcp_sub_len_remove_addr(opts->remove_addrs);
17373 ++ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
17374 ++
17375 ++ mprem->kind = TCPOPT_MPTCP;
17376 ++ mprem->len = len;
17377 ++ mprem->sub = MPTCP_SUB_REMOVE_ADDR;
17378 ++ mprem->rsv = 0;
17379 ++ addrs_id = &mprem->addrs_id;
17380 ++
17381 ++ mptcp_for_each_bit_set(opts->remove_addrs, id)
17382 ++ *(addrs_id++) = id;
17383 ++
17384 ++ /* Fill the rest with NOP's */
17385 ++ if (len_align > len) {
17386 ++ int i;
17387 ++ for (i = 0; i < len_align - len; i++)
17388 ++ *(addrs_id++) = TCPOPT_NOP;
17389 ++ }
17390 ++
17391 ++ ptr += len_align >> 2;
17392 ++ }
17393 ++ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
17394 ++ struct mp_fail *mpfail = (struct mp_fail *)ptr;
17395 ++
17396 ++ mpfail->kind = TCPOPT_MPTCP;
17397 ++ mpfail->len = MPTCP_SUB_LEN_FAIL;
17398 ++ mpfail->sub = MPTCP_SUB_FAIL;
17399 ++ mpfail->rsv1 = 0;
17400 ++ mpfail->rsv2 = 0;
17401 ++ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq);
17402 ++
17403 ++ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
17404 ++ }
17405 ++ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
17406 ++ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
17407 ++
17408 ++ mpfclose->kind = TCPOPT_MPTCP;
17409 ++ mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
17410 ++ mpfclose->sub = MPTCP_SUB_FCLOSE;
17411 ++ mpfclose->rsv1 = 0;
17412 ++ mpfclose->rsv2 = 0;
17413 ++ mpfclose->key = opts->mp_capable.receiver_key;
17414 ++
17415 ++ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
17416 ++ }
17417 ++
17418 ++ if (OPTION_DATA_ACK & opts->mptcp_options) {
17419 ++ if (!mptcp_is_data_seq(skb))
17420 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
17421 ++ else
17422 ++ ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
17423 ++ }
17424 ++ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
17425 ++ struct mp_prio *mpprio = (struct mp_prio *)ptr;
17426 ++
17427 ++ mpprio->kind = TCPOPT_MPTCP;
17428 ++ mpprio->len = MPTCP_SUB_LEN_PRIO;
17429 ++ mpprio->sub = MPTCP_SUB_PRIO;
17430 ++ mpprio->rsv = 0;
17431 ++ mpprio->b = tp->mptcp->low_prio;
17432 ++ mpprio->addr_id = TCPOPT_NOP;
17433 ++
17434 ++ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
17435 ++ }
17436 ++}
17437 ++
17438 ++/* Sends the datafin */
17439 ++void mptcp_send_fin(struct sock *meta_sk)
17440 ++{
17441 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17442 ++ struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
17443 ++ int mss_now;
17444 ++
17445 ++ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
17446 ++ meta_tp->mpcb->passive_close = 1;
17447 ++
17448 ++ /* Optimization, tack on the FIN if we have a queue of
17449 ++ * unsent frames. But be careful about outgoing SACKS
17450 ++ * and IP options.
17451 ++ */
17452 ++ mss_now = mptcp_current_mss(meta_sk);
17453 ++
17454 ++ if (tcp_send_head(meta_sk) != NULL) {
17455 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17456 ++ TCP_SKB_CB(skb)->end_seq++;
17457 ++ meta_tp->write_seq++;
17458 ++ } else {
17459 ++ /* Socket is locked, keep trying until memory is available. */
17460 ++ for (;;) {
17461 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER,
17462 ++ meta_sk->sk_allocation);
17463 ++ if (skb)
17464 ++ break;
17465 ++ yield();
17466 ++ }
17467 ++ /* Reserve space for headers and prepare control bits. */
17468 ++ skb_reserve(skb, MAX_TCP_HEADER);
17469 ++
17470 ++ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
17471 ++ TCP_SKB_CB(skb)->end_seq++;
17472 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17473 ++ tcp_queue_skb(meta_sk, skb);
17474 ++ }
17475 ++ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
17476 ++}
17477 ++
17478 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
17479 ++{
17480 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17481 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17482 ++ struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
17483 ++
17484 ++ if (!mpcb->cnt_subflows)
17485 ++ return;
17486 ++
17487 ++ WARN_ON(meta_tp->send_mp_fclose);
17488 ++
17489 ++ /* First - select a socket */
17490 ++ sk = mptcp_select_ack_sock(meta_sk);
17491 ++
17492 ++ /* May happen if no subflow is in an appropriate state */
17493 ++ if (!sk)
17494 ++ return;
17495 ++
17496 ++ /* We are in infinite mode - just send a reset */
17497 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
17498 ++ sk->sk_err = ECONNRESET;
17499 ++ if (tcp_need_reset(sk->sk_state))
17500 ++ tcp_send_active_reset(sk, priority);
17501 ++ mptcp_sub_force_close(sk);
17502 ++ return;
17503 ++ }
17504 ++
17505 ++
17506 ++ tcp_sk(sk)->send_mp_fclose = 1;
17507 ++	/* Reset all other subflows */
17508 ++
17509 ++ /* tcp_done must be handled with bh disabled */
17510 ++ if (!in_serving_softirq())
17511 ++ local_bh_disable();
17512 ++
17513 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
17514 ++ if (tcp_sk(sk_it)->send_mp_fclose)
17515 ++ continue;
17516 ++
17517 ++ sk_it->sk_err = ECONNRESET;
17518 ++ if (tcp_need_reset(sk_it->sk_state))
17519 ++ tcp_send_active_reset(sk_it, GFP_ATOMIC);
17520 ++ mptcp_sub_force_close(sk_it);
17521 ++ }
17522 ++
17523 ++ if (!in_serving_softirq())
17524 ++ local_bh_enable();
17525 ++
17526 ++ tcp_send_ack(sk);
17527 ++ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
17528 ++
17529 ++ meta_tp->send_mp_fclose = 1;
17530 ++}
17531 ++
17532 ++static void mptcp_ack_retransmit_timer(struct sock *sk)
17533 ++{
17534 ++ struct sk_buff *skb;
17535 ++ struct tcp_sock *tp = tcp_sk(sk);
17536 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17537 ++
17538 ++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
17539 ++ goto out; /* Routing failure or similar */
17540 ++
17541 ++ if (!tp->retrans_stamp)
17542 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
17543 ++
17544 ++ if (tcp_write_timeout(sk)) {
17545 ++ tp->mptcp->pre_established = 0;
17546 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
17547 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
17548 ++ goto out;
17549 ++ }
17550 ++
17551 ++ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
17552 ++ if (skb == NULL) {
17553 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17554 ++ jiffies + icsk->icsk_rto);
17555 ++ return;
17556 ++ }
17557 ++
17558 ++ /* Reserve space for headers and prepare control bits */
17559 ++ skb_reserve(skb, MAX_TCP_HEADER);
17560 ++ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
17561 ++
17562 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17563 ++ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
17564 ++ /* Retransmission failed because of local congestion,
17565 ++ * do not backoff.
17566 ++ */
17567 ++ if (!icsk->icsk_retransmits)
17568 ++ icsk->icsk_retransmits = 1;
17569 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17570 ++ jiffies + icsk->icsk_rto);
17571 ++ return;
17572 ++ }
17573 ++
17574 ++
17575 ++ icsk->icsk_retransmits++;
17576 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
17577 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17578 ++ jiffies + icsk->icsk_rto);
17579 ++ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
17580 ++ __sk_dst_reset(sk);
17581 ++
17582 ++out:;
17583 ++}
17584 ++
17585 ++void mptcp_ack_handler(unsigned long data)
17586 ++{
17587 ++ struct sock *sk = (struct sock *)data;
17588 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
17589 ++
17590 ++ bh_lock_sock(meta_sk);
17591 ++ if (sock_owned_by_user(meta_sk)) {
17592 ++ /* Try again later */
17593 ++ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
17594 ++ jiffies + (HZ / 20));
17595 ++ goto out_unlock;
17596 ++ }
17597 ++
17598 ++ if (sk->sk_state == TCP_CLOSE)
17599 ++ goto out_unlock;
17600 ++ if (!tcp_sk(sk)->mptcp->pre_established)
17601 ++ goto out_unlock;
17602 ++
17603 ++ mptcp_ack_retransmit_timer(sk);
17604 ++
17605 ++ sk_mem_reclaim(sk);
17606 ++
17607 ++out_unlock:
17608 ++ bh_unlock_sock(meta_sk);
17609 ++ sock_put(sk);
17610 ++}
17611 ++
17612 ++/* Similar to tcp_retransmit_skb
17613 ++ *
17614 ++ * The diff is that we handle the retransmission-stats (retrans_stamp) at the
17615 ++ * meta-level.
17616 ++ */
17617 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
17618 ++{
17619 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17620 ++ struct sock *subsk;
17621 ++ unsigned int limit, mss_now;
17622 ++ int err = -1;
17623 ++
17624 ++	/* Do not send more than we queued. 1/4 is reserved for possible
17625 ++ * copying overhead: fragmentation, tunneling, mangling etc.
17626 ++ *
17627 ++ * This is a meta-retransmission thus we check on the meta-socket.
17628 ++ */
17629 ++ if (atomic_read(&meta_sk->sk_wmem_alloc) >
17630 ++ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
17631 ++ return -EAGAIN;
17632 ++ }
17633 ++
17634 ++ /* We need to make sure that the retransmitted segment can be sent on a
17635 ++ * subflow right now. If it is too big, it needs to be fragmented.
17636 ++ */
17637 ++ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
17638 ++ if (!subsk) {
17639 ++ /* We want to increase icsk_retransmits, thus return 0, so that
17640 ++ * mptcp_retransmit_timer enters the desired branch.
17641 ++ */
17642 ++ err = 0;
17643 ++ goto failed;
17644 ++ }
17645 ++ mss_now = tcp_current_mss(subsk);
17646 ++
17647 ++ /* If the segment was cloned (e.g. a meta retransmission), the header
17648 ++ * must be expanded/copied so that there is no corruption of TSO
17649 ++ * information.
17650 ++ */
17651 ++ if (skb_unclone(skb, GFP_ATOMIC)) {
17652 ++ err = -ENOMEM;
17653 ++ goto failed;
17654 ++ }
17655 ++
17656 ++ /* Must have been set by mptcp_write_xmit before */
17657 ++ BUG_ON(!tcp_skb_pcount(skb));
17658 ++
17659 ++ limit = mss_now;
17660 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
17661 ++ * tcp_write_xmit. Otherwise split-point would return 0.
17662 ++ */
17663 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
17664 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
17665 ++ UINT_MAX / mss_now,
17666 ++ TCP_NAGLE_OFF);
17667 ++
17668 ++ if (skb->len > limit &&
17669 ++ unlikely(mptcp_fragment(meta_sk, skb, limit,
17670 ++ GFP_ATOMIC, 0)))
17671 ++ goto failed;
17672 ++
17673 ++ if (!mptcp_skb_entail(subsk, skb, -1))
17674 ++ goto failed;
17675 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17676 ++
17677 ++ /* Update global TCP statistics. */
17678 ++ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
17679 ++
17680 ++ /* Diff to tcp_retransmit_skb */
17681 ++
17682 ++ /* Save stamp of the first retransmit. */
17683 ++ if (!meta_tp->retrans_stamp)
17684 ++ meta_tp->retrans_stamp = TCP_SKB_CB(skb)->when;
17685 ++
17686 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17687 ++
17688 ++ return 0;
17689 ++
17690 ++failed:
17691 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL);
17692 ++ return err;
17693 ++}
17694 ++
17695 ++/* Similar to tcp_retransmit_timer
17696 ++ *
17697 ++ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
17698 ++ * and that we don't have an srtt estimation at the meta-level.
17699 ++ */
17700 ++void mptcp_retransmit_timer(struct sock *meta_sk)
17701 ++{
17702 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17703 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17704 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
17705 ++ int err;
17706 ++
17707 ++ /* In fallback, retransmission is handled at the subflow-level */
17708 ++ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
17709 ++ mpcb->send_infinite_mapping)
17710 ++ return;
17711 ++
17712 ++ WARN_ON(tcp_write_queue_empty(meta_sk));
17713 ++
17714 ++ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
17715 ++ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
17716 ++ /* Receiver dastardly shrinks window. Our retransmits
17717 ++ * become zero probes, but we should not timeout this
17718 ++ * connection. If the socket is an orphan, time it out,
17719 ++ * we cannot allow such beasts to hang infinitely.
17720 ++ */
17721 ++ struct inet_sock *meta_inet = inet_sk(meta_sk);
17722 ++ if (meta_sk->sk_family == AF_INET) {
17723 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17724 ++ &meta_inet->inet_daddr,
17725 ++ ntohs(meta_inet->inet_dport),
17726 ++ meta_inet->inet_num, meta_tp->snd_una,
17727 ++ meta_tp->snd_nxt);
17728 ++ }
17729 ++#if IS_ENABLED(CONFIG_IPV6)
17730 ++ else if (meta_sk->sk_family == AF_INET6) {
17731 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17732 ++ &meta_sk->sk_v6_daddr,
17733 ++ ntohs(meta_inet->inet_dport),
17734 ++ meta_inet->inet_num, meta_tp->snd_una,
17735 ++ meta_tp->snd_nxt);
17736 ++ }
17737 ++#endif
17738 ++ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
17739 ++ tcp_write_err(meta_sk);
17740 ++ return;
17741 ++ }
17742 ++
17743 ++ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17744 ++ goto out_reset_timer;
17745 ++ }
17746 ++
17747 ++ if (tcp_write_timeout(meta_sk))
17748 ++ return;
17749 ++
17750 ++ if (meta_icsk->icsk_retransmits == 0)
17751 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
17752 ++
17753 ++ meta_icsk->icsk_ca_state = TCP_CA_Loss;
17754 ++
17755 ++ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17756 ++ if (err > 0) {
17757 ++ /* Retransmission failed because of local congestion,
17758 ++ * do not backoff.
17759 ++ */
17760 ++ if (!meta_icsk->icsk_retransmits)
17761 ++ meta_icsk->icsk_retransmits = 1;
17762 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
17763 ++ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
17764 ++ TCP_RTO_MAX);
17765 ++ return;
17766 ++ }
17767 ++
17768 ++ /* Increase the timeout each time we retransmit. Note that
17769 ++ * we do not increase the rtt estimate. rto is initialized
17770 ++ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
17771 ++ * that doubling rto each time is the least we can get away with.
17772 ++ * In KA9Q, Karn uses this for the first few times, and then
17773 ++ * goes to quadratic. netBSD doubles, but only goes up to *64,
17774 ++ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
17775 ++ * defined in the protocol as the maximum possible RTT. I guess
17776 ++ * we'll have to use something other than TCP to talk to the
17777 ++ * University of Mars.
17778 ++ *
17779 ++ * PAWS allows us longer timeouts and large windows, so once
17780 ++ * implemented ftp to mars will work nicely. We will have to fix
17781 ++ * the 120 second clamps though!
17782 ++ */
17783 ++ meta_icsk->icsk_backoff++;
17784 ++ meta_icsk->icsk_retransmits++;
17785 ++
17786 ++out_reset_timer:
17787 ++ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
17788 ++ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
17789 ++ * might be increased if the stream oscillates between thin and thick,
17790 ++ * thus the old value might already be too high compared to the value
17791 ++ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
17792 ++ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
17793 ++	 * exponential backoff behaviour, to avoid continuing to hammer
17794 ++ * linear-timeout retransmissions into a black hole
17795 ++ */
17796 ++ if (meta_sk->sk_state == TCP_ESTABLISHED &&
17797 ++ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
17798 ++ tcp_stream_is_thin(meta_tp) &&
17799 ++ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
17800 ++ meta_icsk->icsk_backoff = 0;
17801 ++ /* We cannot do the same as in tcp_write_timer because the
17802 ++ * srtt is not set here.
17803 ++ */
17804 ++ mptcp_set_rto(meta_sk);
17805 ++ } else {
17806 ++ /* Use normal (exponential) backoff */
17807 ++ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
17808 ++ }
17809 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
17810 ++
17811 ++ return;
17812 ++}
17813 ++
17814 ++/* Adjust the values to the MPTCP level for the initial window of new subflows */
17815 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
17816 ++ __u32 *window_clamp, int wscale_ok,
17817 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
17818 ++ const struct sock *sk)
17819 ++{
17820 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
17821 ++
17822 ++ *window_clamp = mpcb->orig_window_clamp;
17823 ++ __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
17824 ++
17825 ++ tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
17826 ++ wscale_ok, rcv_wscale, init_rcv_wnd, sk);
17827 ++}
17828 ++
17829 ++static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss,
17830 ++ unsigned int (*mss_cb)(struct sock *sk))
17831 ++{
17832 ++ struct sock *sk;
17833 ++ u64 rate = 0;
17834 ++
17835 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17836 ++ struct tcp_sock *tp = tcp_sk(sk);
17837 ++ int this_mss;
17838 ++ u64 this_rate;
17839 ++
17840 ++ if (!mptcp_sk_can_send(sk))
17841 ++ continue;
17842 ++
17843 ++		/* Do not consider subflows without an RTT estimate yet,
17844 ++		 * otherwise this_rate >>> rate.
17845 ++ */
17846 ++ if (unlikely(!tp->srtt_us))
17847 ++ continue;
17848 ++
17849 ++ this_mss = mss_cb(sk);
17850 ++
17851 ++	 * be split into two (or more) when pushed on this subflow. If
17852 ++ * be splitted in two (or more) when pushed on this subflow. If
17853 ++ * you consider that mss = 1428 and this_mss = 1420 then two
17854 ++ * segments will be generated: a 1420-byte and 8-byte segment.
17855 ++ * The latter will introduce a large overhead as for a single
17856 ++ * data segment 2 slots will be used in the congestion window.
17857 ++	 * This roughly halves the potential throughput of this
17858 ++	 * subflow: 1428 bytes will be sent where 2840 could have been
17859 ++	 * sent if mss == 1420, a reduction by a factor of 2840 / 1428.
17860 ++ *
17861 ++	 * The following algorithm takes this overhead into account
17862 ++ * when computing the potential throughput that MPTCP can
17863 ++ * achieve when generating mss-byte segments.
17864 ++ *
17865 ++	 * The formula is the following:
17866 ++ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
17867 ++ * Where ratio is computed as follows:
17868 ++ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
17869 ++ *
17870 ++ * ratio gives the reduction factor of the theoretical
17871 ++ * throughput a subflow can achieve if MPTCP uses a specific
17872 ++ * MSS value.
17873 ++ */
17874 ++ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
17875 ++ max(tp->snd_cwnd, tp->packets_out),
17876 ++ (u64)tp->srtt_us *
17877 ++ DIV_ROUND_UP(mss, this_mss) * this_mss);
17878 ++ rate += this_rate;
17879 ++ }
17880 ++
17881 ++ return rate;
17882 ++}
17883 ++
17884 ++static unsigned int __mptcp_current_mss(const struct sock *meta_sk,
17885 ++ unsigned int (*mss_cb)(struct sock *sk))
17886 ++{
17887 ++ unsigned int mss = 0;
17888 ++ u64 rate = 0;
17889 ++ struct sock *sk;
17890 ++
17891 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17892 ++ int this_mss;
17893 ++ u64 this_rate;
17894 ++
17895 ++ if (!mptcp_sk_can_send(sk))
17896 ++ continue;
17897 ++
17898 ++ this_mss = mss_cb(sk);
17899 ++
17900 ++ /* Same mss values will produce the same throughput. */
17901 ++ if (this_mss == mss)
17902 ++ continue;
17903 ++
17904 ++ /* See whether using this mss value can theoretically improve
17905 ++ * the performances.
17906 ++ */
17907 ++ this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb);
17908 ++ if (this_rate >= rate) {
17909 ++ mss = this_mss;
17910 ++ rate = this_rate;
17911 ++ }
17912 ++ }
17913 ++
17914 ++ return mss;
17915 ++}
17916 ++
17917 ++unsigned int mptcp_current_mss(struct sock *meta_sk)
17918 ++{
17919 ++ unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss);
17920 ++
17921 ++ /* If no subflow is available, we take a default-mss from the
17922 ++ * meta-socket.
17923 ++ */
17924 ++ return !mss ? tcp_current_mss(meta_sk) : mss;
17925 ++}
17926 ++
17927 ++static unsigned int mptcp_select_size_mss(struct sock *sk)
17928 ++{
17929 ++ return tcp_sk(sk)->mss_cache;
17930 ++}
17931 ++
17932 ++int mptcp_select_size(const struct sock *meta_sk, bool sg)
17933 ++{
17934 ++ unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss);
17935 ++
17936 ++ if (sg) {
17937 ++ if (mptcp_sk_can_gso(meta_sk)) {
17938 ++ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
17939 ++ } else {
17940 ++ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
17941 ++
17942 ++ if (mss >= pgbreak &&
17943 ++ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
17944 ++ mss = pgbreak;
17945 ++ }
17946 ++ }
17947 ++
17948 ++ return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
17949 ++}
17950 ++
17951 ++int mptcp_check_snd_buf(const struct tcp_sock *tp)
17952 ++{
17953 ++ const struct sock *sk;
17954 ++ u32 rtt_max = tp->srtt_us;
17955 ++ u64 bw_est;
17956 ++
17957 ++ if (!tp->srtt_us)
17958 ++ return tp->reordering + 1;
17959 ++
17960 ++ mptcp_for_each_sk(tp->mpcb, sk) {
17961 ++ if (!mptcp_sk_can_send(sk))
17962 ++ continue;
17963 ++
17964 ++ if (rtt_max < tcp_sk(sk)->srtt_us)
17965 ++ rtt_max = tcp_sk(sk)->srtt_us;
17966 ++ }
17967 ++
17968 ++ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
17969 ++ (u64)tp->srtt_us);
17970 ++
17971 ++ return max_t(unsigned int, (u32)(bw_est >> 16),
17972 ++ tp->reordering + 1);
17973 ++}
17974 ++
17975 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
17976 ++ int large_allowed)
17977 ++{
17978 ++ struct sock *sk;
17979 ++ u32 xmit_size_goal = 0;
17980 ++
17981 ++ if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
17982 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17983 ++ int this_size_goal;
17984 ++
17985 ++ if (!mptcp_sk_can_send(sk))
17986 ++ continue;
17987 ++
17988 ++ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
17989 ++ if (this_size_goal > xmit_size_goal)
17990 ++ xmit_size_goal = this_size_goal;
17991 ++ }
17992 ++ }
17993 ++
17994 ++ return max(xmit_size_goal, mss_now);
17995 ++}
17996 ++
17997 ++/* Similar to tcp_trim_head - but we correctly copy the DSS-option */
17998 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
17999 ++{
18000 ++ if (skb_cloned(skb)) {
18001 ++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
18002 ++ return -ENOMEM;
18003 ++ }
18004 ++
18005 ++ __pskb_trim_head(skb, len);
18006 ++
18007 ++ TCP_SKB_CB(skb)->seq += len;
18008 ++ skb->ip_summed = CHECKSUM_PARTIAL;
18009 ++
18010 ++ skb->truesize -= len;
18011 ++ sk->sk_wmem_queued -= len;
18012 ++ sk_mem_uncharge(sk, len);
18013 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
18014 ++
18015 ++ /* Any change of skb->len requires recalculation of tso factor. */
18016 ++ if (tcp_skb_pcount(skb) > 1)
18017 ++ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
18018 ++
18019 ++ return 0;
18020 ++}
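
To make the throughput ratio used by mptcp_calc_rate() above concrete, here is a short worked example with the same numbers as its comment (illustrative only):

    \text{ratio} = \frac{mss}{\lceil mss / mss_{sub} \rceil \cdot mss_{sub}}, \qquad
    \text{rate} = \sum_{\forall sub} \text{ratio} \cdot \frac{mss \cdot cwnd_{sub}}{rtt_{sub}}

For mss = 1428 and mss_sub = 1420, \lceil 1428 / 1420 \rceil = 2, so

    \text{ratio} = \frac{1428}{2 \cdot 1420} = \frac{1428}{2840} \approx 0.50

i.e. generating 1428-byte meta-segments roughly halves what this subflow could carry, while mss == mss_sub gives ratio = 1 and no penalty. In the code, the (USEC_PER_SEC << 3) factor cancels the 3-bit scaling of srtt_us, so this_rate stays in integer arithmetic.
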
18021 +diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
18022 +new file mode 100644
18023 +index 000000000000..9542f950729f
18024 +--- /dev/null
18025 ++++ b/net/mptcp/mptcp_pm.c
18026 +@@ -0,0 +1,169 @@
18027 ++/*
18028 ++ * MPTCP implementation - MPTCP-subflow-management
18029 ++ *
18030 ++ * Initial Design & Implementation:
18031 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
18032 ++ *
18033 ++ * Current Maintainer & Author:
18034 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
18035 ++ *
18036 ++ * Additional authors:
18037 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
18038 ++ * Gregory Detal <gregory.detal@×××××××××.be>
18039 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
18040 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
18041 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
18042 ++ * Andreas Ripke <ripke@××××××.eu>
18043 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
18044 ++ * Octavian Purdila <octavian.purdila@×××××.com>
18045 ++ * John Ronan <jronan@××××.org>
18046 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
18047 ++ * Brandon Heller <brandonh@××××××××.edu>
18048 ++ *
18049 ++ *
18050 ++ * This program is free software; you can redistribute it and/or
18051 ++ * modify it under the terms of the GNU General Public License
18052 ++ * as published by the Free Software Foundation; either version
18053 ++ * 2 of the License, or (at your option) any later version.
18054 ++ */
18055 ++
18056 ++
18057 ++#include <linux/module.h>
18058 ++#include <net/mptcp.h>
18059 ++
18060 ++static DEFINE_SPINLOCK(mptcp_pm_list_lock);
18061 ++static LIST_HEAD(mptcp_pm_list);
18062 ++
18063 ++static int mptcp_default_id(sa_family_t family, union inet_addr *addr,
18064 ++ struct net *net, bool *low_prio)
18065 ++{
18066 ++ return 0;
18067 ++}
18068 ++
18069 ++struct mptcp_pm_ops mptcp_pm_default = {
18070 ++ .get_local_id = mptcp_default_id, /* We do not care */
18071 ++ .name = "default",
18072 ++ .owner = THIS_MODULE,
18073 ++};
18074 ++
18075 ++static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
18076 ++{
18077 ++ struct mptcp_pm_ops *e;
18078 ++
18079 ++ list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
18080 ++ if (strcmp(e->name, name) == 0)
18081 ++ return e;
18082 ++ }
18083 ++
18084 ++ return NULL;
18085 ++}
18086 ++
18087 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
18088 ++{
18089 ++ int ret = 0;
18090 ++
18091 ++ if (!pm->get_local_id)
18092 ++ return -EINVAL;
18093 ++
18094 ++ spin_lock(&mptcp_pm_list_lock);
18095 ++ if (mptcp_pm_find(pm->name)) {
18096 ++ pr_notice("%s already registered\n", pm->name);
18097 ++ ret = -EEXIST;
18098 ++ } else {
18099 ++ list_add_tail_rcu(&pm->list, &mptcp_pm_list);
18100 ++ pr_info("%s registered\n", pm->name);
18101 ++ }
18102 ++ spin_unlock(&mptcp_pm_list_lock);
18103 ++
18104 ++ return ret;
18105 ++}
18106 ++EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
18107 ++
18108 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
18109 ++{
18110 ++ spin_lock(&mptcp_pm_list_lock);
18111 ++ list_del_rcu(&pm->list);
18112 ++ spin_unlock(&mptcp_pm_list_lock);
18113 ++}
18114 ++EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
18115 ++
18116 ++void mptcp_get_default_path_manager(char *name)
18117 ++{
18118 ++ struct mptcp_pm_ops *pm;
18119 ++
18120 ++ BUG_ON(list_empty(&mptcp_pm_list));
18121 ++
18122 ++ rcu_read_lock();
18123 ++ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
18124 ++ strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
18125 ++ rcu_read_unlock();
18126 ++}
18127 ++
18128 ++int mptcp_set_default_path_manager(const char *name)
18129 ++{
18130 ++ struct mptcp_pm_ops *pm;
18131 ++ int ret = -ENOENT;
18132 ++
18133 ++ spin_lock(&mptcp_pm_list_lock);
18134 ++ pm = mptcp_pm_find(name);
18135 ++#ifdef CONFIG_MODULES
18136 ++ if (!pm && capable(CAP_NET_ADMIN)) {
18137 ++ spin_unlock(&mptcp_pm_list_lock);
18138 ++
18139 ++ request_module("mptcp_%s", name);
18140 ++ spin_lock(&mptcp_pm_list_lock);
18141 ++ pm = mptcp_pm_find(name);
18142 ++ }
18143 ++#endif
18144 ++
18145 ++ if (pm) {
18146 ++ list_move(&pm->list, &mptcp_pm_list);
18147 ++ ret = 0;
18148 ++ } else {
18149 ++ pr_info("%s is not available\n", name);
18150 ++ }
18151 ++ spin_unlock(&mptcp_pm_list_lock);
18152 ++
18153 ++ return ret;
18154 ++}
18155 ++
18156 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb)
18157 ++{
18158 ++ struct mptcp_pm_ops *pm;
18159 ++
18160 ++ rcu_read_lock();
18161 ++ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
18162 ++ if (try_module_get(pm->owner)) {
18163 ++ mpcb->pm_ops = pm;
18164 ++ break;
18165 ++ }
18166 ++ }
18167 ++ rcu_read_unlock();
18168 ++}
18169 ++
18170 ++/* Manage refcounts on socket close. */
18171 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
18172 ++{
18173 ++ module_put(mpcb->pm_ops->owner);
18174 ++}
18175 ++
18176 ++/* Fallback to the default path-manager. */
18177 ++void mptcp_fallback_default(struct mptcp_cb *mpcb)
18178 ++{
18179 ++ struct mptcp_pm_ops *pm;
18180 ++
18181 ++ mptcp_cleanup_path_manager(mpcb);
18182 ++ pm = mptcp_pm_find("default");
18183 ++
18184 ++ /* Cannot fail - it's the default module */
18185 ++ try_module_get(pm->owner);
18186 ++ mpcb->pm_ops = pm;
18187 ++}
18188 ++EXPORT_SYMBOL_GPL(mptcp_fallback_default);
18189 ++
18190 ++/* Set default value from kernel configuration at bootup */
18191 ++static int __init mptcp_path_manager_default(void)
18192 ++{
18193 ++ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
18194 ++}
18195 ++late_initcall(mptcp_path_manager_default);
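
The registration interface added in mptcp_pm.c above is enough to write an out-of-tree path manager. The sketch below is a hypothetical module (the "noop" name, the trivial policy and all identifiers prefixed noop_ are invented for illustration; the ops fields and the register/unregister calls are the ones defined above):

/* Minimal sketch of a custom MPTCP path manager: it never announces extra
 * addresses and always reports local address-id 0, mirroring the built-in
 * "default" manager above.  Illustration only.
 */
#include <linux/module.h>
#include <net/mptcp.h>

static int noop_get_local_id(sa_family_t family, union inet_addr *addr,
			     struct net *net, bool *low_prio)
{
	return 0;	/* same trivial policy as mptcp_default_id() */
}

static struct mptcp_pm_ops noop_pm = {
	.get_local_id	= noop_get_local_id,
	.name		= "noop",
	.owner		= THIS_MODULE,
};

static int __init noop_pm_init(void)
{
	return mptcp_register_path_manager(&noop_pm);
}

static void __exit noop_pm_exit(void)
{
	mptcp_unregister_path_manager(&noop_pm);
}

module_init(noop_pm_init);
module_exit(noop_pm_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("No-op MPTCP path manager (example)");

Built as mptcp_noop.ko, such a module could then be autoloaded through the request_module("mptcp_%s", name) fallback in mptcp_set_default_path_manager() above when "noop" is requested.
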
18196 +diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c
18197 +new file mode 100644
18198 +index 000000000000..93278f684069
18199 +--- /dev/null
18200 ++++ b/net/mptcp/mptcp_rr.c
18201 +@@ -0,0 +1,301 @@
18202 ++/* MPTCP round-robin scheduler. Highly inspired by tcp_cong.c */
18203 ++
18204 ++#include <linux/module.h>
18205 ++#include <net/mptcp.h>
18206 ++
18207 ++static unsigned char num_segments __read_mostly = 1;
18208 ++module_param(num_segments, byte, 0644);
18209 ++MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
18210 ++
18211 ++static bool cwnd_limited __read_mostly = 1;
18212 ++module_param(cwnd_limited, bool, 0644);
18213 ++MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
18214 ++
18215 ++struct rrsched_priv {
18216 ++ unsigned char quota;
18217 ++};
18218 ++
18219 ++static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
18220 ++{
18221 ++ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
18222 ++}
18223 ++
18224 ++/* Is the sub-socket sk available to send the skb? */
18225 ++static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
18226 ++ bool zero_wnd_test, bool cwnd_test)
18227 ++{
18228 ++ const struct tcp_sock *tp = tcp_sk(sk);
18229 ++ unsigned int space, in_flight;
18230 ++
18231 ++ /* Set of states for which we are allowed to send data */
18232 ++ if (!mptcp_sk_can_send(sk))
18233 ++ return false;
18234 ++
18235 ++ /* We do not send data on this subflow unless it is
18236 ++ * fully established, i.e. the 4th ack has been received.
18237 ++ */
18238 ++ if (tp->mptcp->pre_established)
18239 ++ return false;
18240 ++
18241 ++ if (tp->pf)
18242 ++ return false;
18243 ++
18244 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18245 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18246 ++ * the loss-state until something above high_seq has been acked.
18247 ++ * (see tcp_try_undo_recovery)
18248 ++ *
18249 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18250 ++ * as we have an RTO, we won't push data on the subflow.
18251 ++ * Thus, snd_una can never go beyond high_seq.
18252 ++ */
18253 ++ if (!tcp_is_reno(tp))
18254 ++ return false;
18255 ++ else if (tp->snd_una != tp->high_seq)
18256 ++ return false;
18257 ++ }
18258 ++
18259 ++ if (!tp->mptcp->fully_established) {
18260 ++ /* Make sure that we send in-order data */
18261 ++ if (skb && tp->mptcp->second_packet &&
18262 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18263 ++ return false;
18264 ++ }
18265 ++
18266 ++ if (!cwnd_test)
18267 ++ goto zero_wnd_test;
18268 ++
18269 ++ in_flight = tcp_packets_in_flight(tp);
18270 ++ /* Not even a single spot in the cwnd */
18271 ++ if (in_flight >= tp->snd_cwnd)
18272 ++ return false;
18273 ++
18274 ++ /* Now, check if what is queued in the subflow's send-queue
18275 ++ * already fills the cwnd.
18276 ++ */
18277 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18278 ++
18279 ++ if (tp->write_seq - tp->snd_nxt > space)
18280 ++ return false;
18281 ++
18282 ++zero_wnd_test:
18283 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18284 ++ return false;
18285 ++
18286 ++ return true;
18287 ++}
18288 ++
18289 ++/* Are we not allowed to reinject this skb on tp? */
18290 ++static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18291 ++{
18292 ++ /* If the skb has already been enqueued in this sk, try to find
18293 ++ * another one.
18294 ++ */
18295 ++ return skb &&
18296 ++ /* Has the skb already been enqueued into this subsocket? */
18297 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18298 ++}
18299 ++
18300 ++/* We just look for any subflow that is available */
18301 ++static struct sock *rr_get_available_subflow(struct sock *meta_sk,
18302 ++ struct sk_buff *skb,
18303 ++ bool zero_wnd_test)
18304 ++{
18305 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18306 ++ struct sock *sk, *bestsk = NULL, *backupsk = NULL;
18307 ++
18308 ++ /* Answer data_fin on same subflow!!! */
18309 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18310 ++ skb && mptcp_is_data_fin(skb)) {
18311 ++ mptcp_for_each_sk(mpcb, sk) {
18312 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18313 ++ mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18314 ++ return sk;
18315 ++ }
18316 ++ }
18317 ++
18318 ++ /* First, find the best subflow */
18319 ++ mptcp_for_each_sk(mpcb, sk) {
18320 ++ struct tcp_sock *tp = tcp_sk(sk);
18321 ++
18322 ++ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18323 ++ continue;
18324 ++
18325 ++ if (mptcp_rr_dont_reinject_skb(tp, skb)) {
18326 ++ backupsk = sk;
18327 ++ continue;
18328 ++ }
18329 ++
18330 ++ bestsk = sk;
18331 ++ }
18332 ++
18333 ++ if (bestsk) {
18334 ++ sk = bestsk;
18335 ++ } else if (backupsk) {
18336 ++ /* It has been sent on all subflows once - let's give it a
18337 ++ * chance again by restarting its pathmask.
18338 ++ */
18339 ++ if (skb)
18340 ++ TCP_SKB_CB(skb)->path_mask = 0;
18341 ++ sk = backupsk;
18342 ++ }
18343 ++
18344 ++ return sk;
18345 ++}
18346 ++
18347 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18348 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18349 ++ * chooses the normal write queue).
18350 ++ * Sets *@reinject to 1 if the returned segment comes from the
18351 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18352 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18353 ++ * receive-buffer.
18354 ++ */
18355 ++static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
18356 ++{
18357 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18358 ++ struct sk_buff *skb = NULL;
18359 ++
18360 ++ *reinject = 0;
18361 ++
18362 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18363 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18364 ++ return tcp_send_head(meta_sk);
18365 ++
18366 ++ skb = skb_peek(&mpcb->reinject_queue);
18367 ++
18368 ++ if (skb)
18369 ++ *reinject = 1;
18370 ++ else
18371 ++ skb = tcp_send_head(meta_sk);
18372 ++ return skb;
18373 ++}
18374 ++
18375 ++static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
18376 ++ int *reinject,
18377 ++ struct sock **subsk,
18378 ++ unsigned int *limit)
18379 ++{
18380 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18381 ++ struct sock *sk_it, *choose_sk = NULL;
18382 ++ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
18383 ++ unsigned char split = num_segments;
18384 ++ unsigned char iter = 0, full_subs = 0;
18385 ++
18386 ++ /* As we set it, we have to reset it as well. */
18387 ++ *limit = 0;
18388 ++
18389 ++ if (!skb)
18390 ++ return NULL;
18391 ++
18392 ++ if (*reinject) {
18393 ++ *subsk = rr_get_available_subflow(meta_sk, skb, false);
18394 ++ if (!*subsk)
18395 ++ return NULL;
18396 ++
18397 ++ return skb;
18398 ++ }
18399 ++
18400 ++retry:
18401 ++
18402 ++	/* First, we look for a subflow that is currently being used */
18403 ++ mptcp_for_each_sk(mpcb, sk_it) {
18404 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18405 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18406 ++
18407 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18408 ++ continue;
18409 ++
18410 ++ iter++;
18411 ++
18412 ++ /* Is this subflow currently being used? */
18413 ++ if (rsp->quota > 0 && rsp->quota < num_segments) {
18414 ++ split = num_segments - rsp->quota;
18415 ++ choose_sk = sk_it;
18416 ++ goto found;
18417 ++ }
18418 ++
18419 ++ /* Or, it's totally unused */
18420 ++ if (!rsp->quota) {
18421 ++ split = num_segments;
18422 ++ choose_sk = sk_it;
18423 ++ }
18424 ++
18425 ++ /* Or, it must then be fully used */
18426 ++ if (rsp->quota == num_segments)
18427 ++ full_subs++;
18428 ++ }
18429 ++
18430 ++ /* All considered subflows have a full quota, and we considered at
18431 ++ * least one.
18432 ++ */
18433 ++ if (iter && iter == full_subs) {
18434 ++		/* So, we restart this round by setting quota to 0 and try again
18435 ++ * to find a subflow.
18436 ++ */
18437 ++ mptcp_for_each_sk(mpcb, sk_it) {
18438 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18439 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18440 ++
18441 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18442 ++ continue;
18443 ++
18444 ++ rsp->quota = 0;
18445 ++ }
18446 ++
18447 ++ goto retry;
18448 ++ }
18449 ++
18450 ++found:
18451 ++ if (choose_sk) {
18452 ++ unsigned int mss_now;
18453 ++ struct tcp_sock *choose_tp = tcp_sk(choose_sk);
18454 ++ struct rrsched_priv *rsp = rrsched_get_priv(choose_tp);
18455 ++
18456 ++ if (!mptcp_rr_is_available(choose_sk, skb, false, true))
18457 ++ return NULL;
18458 ++
18459 ++ *subsk = choose_sk;
18460 ++ mss_now = tcp_current_mss(*subsk);
18461 ++ *limit = split * mss_now;
18462 ++
18463 ++ if (skb->len > mss_now)
18464 ++ rsp->quota += DIV_ROUND_UP(skb->len, mss_now);
18465 ++ else
18466 ++ rsp->quota++;
18467 ++
18468 ++ return skb;
18469 ++ }
18470 ++
18471 ++ return NULL;
18472 ++}
18473 ++
18474 ++static struct mptcp_sched_ops mptcp_sched_rr = {
18475 ++ .get_subflow = rr_get_available_subflow,
18476 ++ .next_segment = mptcp_rr_next_segment,
18477 ++ .name = "roundrobin",
18478 ++ .owner = THIS_MODULE,
18479 ++};
18480 ++
18481 ++static int __init rr_register(void)
18482 ++{
18483 ++ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
18484 ++
18485 ++ if (mptcp_register_scheduler(&mptcp_sched_rr))
18486 ++ return -1;
18487 ++
18488 ++ return 0;
18489 ++}
18490 ++
18491 ++static void rr_unregister(void)
18492 ++{
18493 ++ mptcp_unregister_scheduler(&mptcp_sched_rr);
18494 ++}
18495 ++
18496 ++module_init(rr_register);
18497 ++module_exit(rr_unregister);
18498 ++
18499 ++MODULE_AUTHOR("Christoph Paasch");
18500 ++MODULE_LICENSE("GPL");
18501 ++MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
18502 ++MODULE_VERSION("0.89");
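
The burst accounting in mptcp_rr_next_segment() above is easier to follow in isolation. The following stand-alone user-space sketch replays only the quota logic; subflow availability checks, the reinject queue and the per-skb DIV_ROUND_UP accounting are deliberately left out, and every name in it is invented for the example:

#include <stdio.h>

#define NUM_SUBFLOWS 3
static const unsigned int num_segments = 2;	/* burst length, as the module parameter above */
static unsigned int quota[NUM_SUBFLOWS];

/* Pick the subflow for the next segment, mirroring the quota bookkeeping of
 * mptcp_rr_next_segment(): prefer a subflow with a partially used burst,
 * otherwise an unused one, and reset all quotas once every burst is full.
 */
static int rr_pick(void)
{
	int i, iter, full, choose;

retry:
	iter = 0;
	full = 0;
	choose = -1;
	for (i = 0; i < NUM_SUBFLOWS; i++) {
		iter++;				/* every subflow counts as available here */
		if (quota[i] > 0 && quota[i] < num_segments)
			return i;		/* keep filling the current burst */
		if (quota[i] == 0)
			choose = i;		/* an unused subflow is a candidate */
		if (quota[i] == num_segments)
			full++;
	}
	if (iter && iter == full) {		/* every subflow used up its burst */
		for (i = 0; i < NUM_SUBFLOWS; i++)
			quota[i] = 0;
		goto retry;
	}
	return choose;
}

int main(void)
{
	int seg;

	for (seg = 0; seg < 12; seg++) {
		int sk = rr_pick();

		quota[sk]++;			/* one MSS-sized segment per pick */
		printf("segment %2d -> subflow %d\n", seg, sk);
	}
	return 0;
}

With three always-available subflows and num_segments = 2 this emits two consecutive segments per subflow and resets all quotas once every burst is exhausted, which is the round-robin pattern the module implements per connection.
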
18503 +diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c
18504 +new file mode 100644
18505 +index 000000000000..6c7ff4eceac1
18506 +--- /dev/null
18507 ++++ b/net/mptcp/mptcp_sched.c
18508 +@@ -0,0 +1,493 @@
18509 ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
18510 ++
18511 ++#include <linux/module.h>
18512 ++#include <net/mptcp.h>
18513 ++
18514 ++static DEFINE_SPINLOCK(mptcp_sched_list_lock);
18515 ++static LIST_HEAD(mptcp_sched_list);
18516 ++
18517 ++struct defsched_priv {
18518 ++ u32 last_rbuf_opti;
18519 ++};
18520 ++
18521 ++static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
18522 ++{
18523 ++ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
18524 ++}
18525 ++
18526 ++/* Is the sub-socket sk available to send the skb? */
18527 ++static bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
18528 ++ bool zero_wnd_test)
18529 ++{
18530 ++ const struct tcp_sock *tp = tcp_sk(sk);
18531 ++ unsigned int mss_now, space, in_flight;
18532 ++
18533 ++ /* Set of states for which we are allowed to send data */
18534 ++ if (!mptcp_sk_can_send(sk))
18535 ++ return false;
18536 ++
18537 ++ /* We do not send data on this subflow unless it is
18538 ++ * fully established, i.e. the 4th ack has been received.
18539 ++ */
18540 ++ if (tp->mptcp->pre_established)
18541 ++ return false;
18542 ++
18543 ++ if (tp->pf)
18544 ++ return false;
18545 ++
18546 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18547 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18548 ++ * the loss-state until something above high_seq has been acked.
18549 ++ * (see tcp_try_undo_recovery)
18550 ++ *
18551 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18552 ++ * as we have an RTO, we won't push data on the subflow.
18553 ++ * Thus, snd_una can never go beyond high_seq.
18554 ++ */
18555 ++ if (!tcp_is_reno(tp))
18556 ++ return false;
18557 ++ else if (tp->snd_una != tp->high_seq)
18558 ++ return false;
18559 ++ }
18560 ++
18561 ++ if (!tp->mptcp->fully_established) {
18562 ++ /* Make sure that we send in-order data */
18563 ++ if (skb && tp->mptcp->second_packet &&
18564 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18565 ++ return false;
18566 ++ }
18567 ++
18568 ++ /* If TSQ is already throttling us, do not send on this subflow. When
18569 ++ * TSQ gets cleared the subflow becomes eligible again.
18570 ++ */
18571 ++ if (test_bit(TSQ_THROTTLED, &tp->tsq_flags))
18572 ++ return false;
18573 ++
18574 ++ in_flight = tcp_packets_in_flight(tp);
18575 ++ /* Not even a single spot in the cwnd */
18576 ++ if (in_flight >= tp->snd_cwnd)
18577 ++ return false;
18578 ++
18579 ++ /* Now, check if what is queued in the subflow's send-queue
18580 ++ * already fills the cwnd.
18581 ++ */
18582 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18583 ++
18584 ++ if (tp->write_seq - tp->snd_nxt > space)
18585 ++ return false;
18586 ++
18587 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18588 ++ return false;
18589 ++
18590 ++ mss_now = tcp_current_mss(sk);
18591 ++
18592 ++ /* Don't send on this subflow if we bypass the allowed send-window at
18593 ++	 * the per-subflow level. Similar to tcp_snd_wnd_test, but with a manually
18594 ++	 * calculated end_seq (because at this point end_seq is still at
18595 ++ * the meta-level).
18596 ++ */
18597 ++ if (skb && !zero_wnd_test &&
18598 ++ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
18599 ++ return false;
18600 ++
18601 ++ return true;
18602 ++}
18603 ++
18604 ++/* Are we not allowed to reinject this skb on tp? */
18605 ++static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18606 ++{
18607 ++ /* If the skb has already been enqueued in this sk, try to find
18608 ++ * another one.
18609 ++ */
18610 ++ return skb &&
18611 ++ /* Has the skb already been enqueued into this subsocket? */
18612 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18613 ++}
18614 ++
18615 ++/* This is the scheduler. This function decides on which flow to send
18616 ++ * a given MSS. The flow is selected based on the shortest RTT. If all
18617 ++ * subflows are found to be busy, or if all paths have full congestion
18618 ++ * windows, we simply return NULL.
18619 ++ *
18620 ++ * Additionally, this function is aware of the backup-subflows.
18621 ++ */
18622 ++static struct sock *get_available_subflow(struct sock *meta_sk,
18623 ++ struct sk_buff *skb,
18624 ++ bool zero_wnd_test)
18625 ++{
18626 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18627 ++ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
18628 ++ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
18629 ++ int cnt_backups = 0;
18630 ++
18631 ++ /* if there is only one subflow, bypass the scheduling function */
18632 ++ if (mpcb->cnt_subflows == 1) {
18633 ++ bestsk = (struct sock *)mpcb->connection_list;
18634 ++ if (!mptcp_is_available(bestsk, skb, zero_wnd_test))
18635 ++ bestsk = NULL;
18636 ++ return bestsk;
18637 ++ }
18638 ++
18639 ++ /* Answer data_fin on same subflow!!! */
18640 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18641 ++ skb && mptcp_is_data_fin(skb)) {
18642 ++ mptcp_for_each_sk(mpcb, sk) {
18643 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18644 ++ mptcp_is_available(sk, skb, zero_wnd_test))
18645 ++ return sk;
18646 ++ }
18647 ++ }
18648 ++
18649 ++ /* First, find the best subflow */
18650 ++ mptcp_for_each_sk(mpcb, sk) {
18651 ++ struct tcp_sock *tp = tcp_sk(sk);
18652 ++
18653 ++ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
18654 ++ cnt_backups++;
18655 ++
18656 ++ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18657 ++ tp->srtt_us < lowprio_min_time_to_peer) {
18658 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18659 ++ continue;
18660 ++
18661 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18662 ++ backupsk = sk;
18663 ++ continue;
18664 ++ }
18665 ++
18666 ++ lowprio_min_time_to_peer = tp->srtt_us;
18667 ++ lowpriosk = sk;
18668 ++ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18669 ++ tp->srtt_us < min_time_to_peer) {
18670 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18671 ++ continue;
18672 ++
18673 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18674 ++ backupsk = sk;
18675 ++ continue;
18676 ++ }
18677 ++
18678 ++ min_time_to_peer = tp->srtt_us;
18679 ++ bestsk = sk;
18680 ++ }
18681 ++ }
18682 ++
18683 ++ if (mpcb->cnt_established == cnt_backups && lowpriosk) {
18684 ++ sk = lowpriosk;
18685 ++ } else if (bestsk) {
18686 ++ sk = bestsk;
18687 ++ } else if (backupsk) {
18688 ++ /* It has been sent on all subflows once - let's give it a
18689 ++ * chance again by restarting its pathmask.
18690 ++ */
18691 ++ if (skb)
18692 ++ TCP_SKB_CB(skb)->path_mask = 0;
18693 ++ sk = backupsk;
18694 ++ }
18695 ++
18696 ++ return sk;
18697 ++}
18698 ++
18699 ++static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
18700 ++{
18701 ++ struct sock *meta_sk;
18702 ++ const struct tcp_sock *tp = tcp_sk(sk);
18703 ++ struct tcp_sock *tp_it;
18704 ++ struct sk_buff *skb_head;
18705 ++ struct defsched_priv *dsp = defsched_get_priv(tp);
18706 ++
18707 ++ if (tp->mpcb->cnt_subflows == 1)
18708 ++ return NULL;
18709 ++
18710 ++ meta_sk = mptcp_meta_sk(sk);
18711 ++ skb_head = tcp_write_queue_head(meta_sk);
18712 ++
18713 ++ if (!skb_head || skb_head == tcp_send_head(meta_sk))
18714 ++ return NULL;
18715 ++
18716 ++	/* If penalization is optional (coming from mptcp_next_segment()) and
18717 ++	 * we are not send-buffer-limited, we do not penalize. The retransmission
18718 ++ * is just an optimization to fix the idle-time due to the delay before
18719 ++ * we wake up the application.
18720 ++ */
18721 ++ if (!penal && sk_stream_memory_free(meta_sk))
18722 ++ goto retrans;
18723 ++
18724 ++ /* Only penalize again after an RTT has elapsed */
18725 ++ if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
18726 ++ goto retrans;
18727 ++
18728 ++	/* Halve the cwnd of the slow flow */
18729 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18730 ++ if (tp_it != tp &&
18731 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18732 ++ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
18733 ++ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
18734 ++ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
18735 ++ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
18736 ++
18737 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18738 ++ }
18739 ++ break;
18740 ++ }
18741 ++ }
18742 ++
18743 ++retrans:
18744 ++
18745 ++ /* Segment not yet injected into this path? Take it!!! */
18746 ++ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
18747 ++ bool do_retrans = false;
18748 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18749 ++ if (tp_it != tp &&
18750 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18751 ++ if (tp_it->snd_cwnd <= 4) {
18752 ++ do_retrans = true;
18753 ++ break;
18754 ++ }
18755 ++
18756 ++ if (4 * tp->srtt_us >= tp_it->srtt_us) {
18757 ++ do_retrans = false;
18758 ++ break;
18759 ++ } else {
18760 ++ do_retrans = true;
18761 ++ }
18762 ++ }
18763 ++ }
18764 ++
18765 ++ if (do_retrans && mptcp_is_available(sk, skb_head, false))
18766 ++ return skb_head;
18767 ++ }
18768 ++ return NULL;
18769 ++}
18770 ++
18771 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18772 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18773 ++ * chooses the normal write queue).
18774 ++ * Sets *@reinject to 1 if the returned segment comes from the
18775 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18776 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18777 ++ * receive-buffer.
18778 ++ */
18779 ++static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
18780 ++{
18781 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18782 ++ struct sk_buff *skb = NULL;
18783 ++
18784 ++ *reinject = 0;
18785 ++
18786 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18787 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18788 ++ return tcp_send_head(meta_sk);
18789 ++
18790 ++ skb = skb_peek(&mpcb->reinject_queue);
18791 ++
18792 ++ if (skb) {
18793 ++ *reinject = 1;
18794 ++ } else {
18795 ++ skb = tcp_send_head(meta_sk);
18796 ++
18797 ++ if (!skb && meta_sk->sk_socket &&
18798 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
18799 ++ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
18800 ++ struct sock *subsk = get_available_subflow(meta_sk, NULL,
18801 ++ false);
18802 ++ if (!subsk)
18803 ++ return NULL;
18804 ++
18805 ++ skb = mptcp_rcv_buf_optimization(subsk, 0);
18806 ++ if (skb)
18807 ++ *reinject = -1;
18808 ++ }
18809 ++ }
18810 ++ return skb;
18811 ++}
18812 ++
18813 ++static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
18814 ++ int *reinject,
18815 ++ struct sock **subsk,
18816 ++ unsigned int *limit)
18817 ++{
18818 ++ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
18819 ++ unsigned int mss_now;
18820 ++ struct tcp_sock *subtp;
18821 ++ u16 gso_max_segs;
18822 ++ u32 max_len, max_segs, window, needed;
18823 ++
18824 ++ /* As we set it, we have to reset it as well. */
18825 ++ *limit = 0;
18826 ++
18827 ++ if (!skb)
18828 ++ return NULL;
18829 ++
18830 ++ *subsk = get_available_subflow(meta_sk, skb, false);
18831 ++ if (!*subsk)
18832 ++ return NULL;
18833 ++
18834 ++ subtp = tcp_sk(*subsk);
18835 ++ mss_now = tcp_current_mss(*subsk);
18836 ++
18837 ++ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
18838 ++ skb = mptcp_rcv_buf_optimization(*subsk, 1);
18839 ++ if (skb)
18840 ++ *reinject = -1;
18841 ++ else
18842 ++ return NULL;
18843 ++ }
18844 ++
18845 ++	/* No splitting required, as we will only send a single segment */
18846 ++ if (skb->len <= mss_now)
18847 ++ return skb;
18848 ++
18849 ++ /* The following is similar to tcp_mss_split_point, but
18850 ++ * we do not care about nagle, because we will anyways
18851 ++	 * we do not care about Nagle, because we will use
18852 ++	 * TCP_NAGLE_PUSH anyway, which overrides this.
18853 ++ * So, we first limit according to the cwnd/gso-size and then according
18854 ++ * to the subflow's window.
18855 ++ */
18856 ++
18857 ++ gso_max_segs = (*subsk)->sk_gso_max_segs;
18858 ++ if (!gso_max_segs) /* No gso supported on the subflow's NIC */
18859 ++ gso_max_segs = 1;
18860 ++ max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
18861 ++ if (!max_segs)
18862 ++ return NULL;
18863 ++
18864 ++ max_len = mss_now * max_segs;
18865 ++ window = tcp_wnd_end(subtp) - subtp->write_seq;
18866 ++
18867 ++ needed = min(skb->len, window);
18868 ++ if (max_len <= skb->len)
18869 ++		/* Take max_len, which is derived from the cwnd/gso-size */
18870 ++ *limit = max_len;
18871 ++ else
18872 ++ /* Or, take the window */
18873 ++ *limit = needed;
18874 ++
18875 ++ return skb;
18876 ++}
18877 ++
18878 ++static void defsched_init(struct sock *sk)
18879 ++{
18880 ++ struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));
18881 ++
18882 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18883 ++}
18884 ++
18885 ++struct mptcp_sched_ops mptcp_sched_default = {
18886 ++ .get_subflow = get_available_subflow,
18887 ++ .next_segment = mptcp_next_segment,
18888 ++ .init = defsched_init,
18889 ++ .name = "default",
18890 ++ .owner = THIS_MODULE,
18891 ++};
18892 ++
18893 ++static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
18894 ++{
18895 ++ struct mptcp_sched_ops *e;
18896 ++
18897 ++ list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
18898 ++ if (strcmp(e->name, name) == 0)
18899 ++ return e;
18900 ++ }
18901 ++
18902 ++ return NULL;
18903 ++}
18904 ++
18905 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
18906 ++{
18907 ++ int ret = 0;
18908 ++
18909 ++ if (!sched->get_subflow || !sched->next_segment)
18910 ++ return -EINVAL;
18911 ++
18912 ++ spin_lock(&mptcp_sched_list_lock);
18913 ++ if (mptcp_sched_find(sched->name)) {
18914 ++ pr_notice("%s already registered\n", sched->name);
18915 ++ ret = -EEXIST;
18916 ++ } else {
18917 ++ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
18918 ++ pr_info("%s registered\n", sched->name);
18919 ++ }
18920 ++ spin_unlock(&mptcp_sched_list_lock);
18921 ++
18922 ++ return ret;
18923 ++}
18924 ++EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
18925 ++
18926 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
18927 ++{
18928 ++ spin_lock(&mptcp_sched_list_lock);
18929 ++ list_del_rcu(&sched->list);
18930 ++ spin_unlock(&mptcp_sched_list_lock);
18931 ++}
18932 ++EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
18933 ++
18934 ++void mptcp_get_default_scheduler(char *name)
18935 ++{
18936 ++ struct mptcp_sched_ops *sched;
18937 ++
18938 ++ BUG_ON(list_empty(&mptcp_sched_list));
18939 ++
18940 ++ rcu_read_lock();
18941 ++ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
18942 ++ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
18943 ++ rcu_read_unlock();
18944 ++}
18945 ++
18946 ++int mptcp_set_default_scheduler(const char *name)
18947 ++{
18948 ++ struct mptcp_sched_ops *sched;
18949 ++ int ret = -ENOENT;
18950 ++
18951 ++ spin_lock(&mptcp_sched_list_lock);
18952 ++ sched = mptcp_sched_find(name);
18953 ++#ifdef CONFIG_MODULES
18954 ++ if (!sched && capable(CAP_NET_ADMIN)) {
18955 ++ spin_unlock(&mptcp_sched_list_lock);
18956 ++
18957 ++ request_module("mptcp_%s", name);
18958 ++ spin_lock(&mptcp_sched_list_lock);
18959 ++ sched = mptcp_sched_find(name);
18960 ++ }
18961 ++#endif
18962 ++
18963 ++ if (sched) {
18964 ++ list_move(&sched->list, &mptcp_sched_list);
18965 ++ ret = 0;
18966 ++ } else {
18967 ++ pr_info("%s is not available\n", name);
18968 ++ }
18969 ++ spin_unlock(&mptcp_sched_list_lock);
18970 ++
18971 ++ return ret;
18972 ++}
18973 ++
18974 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb)
18975 ++{
18976 ++ struct mptcp_sched_ops *sched;
18977 ++
18978 ++ rcu_read_lock();
18979 ++ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
18980 ++ if (try_module_get(sched->owner)) {
18981 ++ mpcb->sched_ops = sched;
18982 ++ break;
18983 ++ }
18984 ++ }
18985 ++ rcu_read_unlock();
18986 ++}
18987 ++
18988 ++/* Manage refcounts on socket close. */
18989 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
18990 ++{
18991 ++ module_put(mpcb->sched_ops->owner);
18992 ++}
18993 ++
18994 ++/* Set default value from kernel configuration at bootup */
18995 ++static int __init mptcp_scheduler_default(void)
18996 ++{
18997 ++ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
18998 ++
18999 ++ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
19000 ++}
19001 ++late_initcall(mptcp_scheduler_default);
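
For symmetry with the path-manager example earlier, a hypothetical minimal scheduler built on the mptcp_sched_ops interface registered above is sketched below. It picks the first sendable subflow and ignores the reinject queue, the window tests and the TSQ throttling that the default scheduler handles, so it illustrates the plumbing rather than a usable policy; every identifier prefixed first_ is invented:

/* "First sendable subflow" scheduler sketch, illustration only. */
#include <linux/module.h>
#include <net/mptcp.h>

static struct sock *first_get_subflow(struct sock *meta_sk,
				      struct sk_buff *skb,
				      bool zero_wnd_test)
{
	struct sock *sk;

	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
		if (mptcp_sk_can_send(sk))
			return sk;
	}
	return NULL;
}

static struct sk_buff *first_next_segment(struct sock *meta_sk, int *reinject,
					  struct sock **subsk,
					  unsigned int *limit)
{
	struct sk_buff *skb = tcp_send_head(meta_sk);

	*reinject = 0;
	*limit = 0;		/* reset, as the schedulers above do */

	if (!skb)
		return NULL;

	*subsk = first_get_subflow(meta_sk, skb, false);
	return *subsk ? skb : NULL;
}

static struct mptcp_sched_ops first_sched = {
	.get_subflow	= first_get_subflow,
	.next_segment	= first_next_segment,
	.name		= "firstflow",
	.owner		= THIS_MODULE,
};

static int __init first_sched_register(void)
{
	return mptcp_register_scheduler(&first_sched);
}

static void __exit first_sched_unregister(void)
{
	mptcp_unregister_scheduler(&first_sched);
}

module_init(first_sched_register);
module_exit(first_sched_unregister);
MODULE_LICENSE("GPL");

A scheduler that keeps per-subflow state in tp->mptcp->mptcp_sched[] would additionally add a BUILD_BUG_ON(sizeof(...) > MPTCP_SCHED_SIZE) check, as the round-robin and default schedulers above do.
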
19002 +diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
19003 +new file mode 100644
19004 +index 000000000000..29ca1d868d17
19005 +--- /dev/null
19006 ++++ b/net/mptcp/mptcp_wvegas.c
19007 +@@ -0,0 +1,268 @@
19008 ++/*
19009 ++ * MPTCP implementation - WEIGHTED VEGAS
19010 ++ *
19011 ++ * Algorithm design:
19012 ++ * Yu Cao <cyAnalyst@×××.com>
19013 ++ * Mingwei Xu <xmw@××××××××××××××××××××××.cn>
19014 ++ * Xiaoming Fu <fu@××××××××××××××××××.de>
19015 ++ *
19016 ++ * Implementation:
19017 ++ * Yu Cao <cyAnalyst@×××.com>
19018 ++ * Enhuan Dong <deh13@××××××××××××××××××.cn>
19019 ++ *
19020 ++ * Ported to the official MPTCP-kernel:
19021 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
19022 ++ *
19023 ++ * This program is free software; you can redistribute it and/or
19024 ++ * modify it under the terms of the GNU General Public License
19025 ++ * as published by the Free Software Foundation; either version
19026 ++ * 2 of the License, or (at your option) any later version.
19027 ++ */
19028 ++
19029 ++#include <linux/skbuff.h>
19030 ++#include <net/tcp.h>
19031 ++#include <net/mptcp.h>
19032 ++#include <linux/module.h>
19033 ++#include <linux/tcp.h>
19034 ++
19035 ++static int initial_alpha = 2;
19036 ++static int total_alpha = 10;
19037 ++static int gamma = 1;
19038 ++
19039 ++module_param(initial_alpha, int, 0644);
19040 ++MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
19041 ++module_param(total_alpha, int, 0644);
19042 ++MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
19043 ++module_param(gamma, int, 0644);
19044 ++MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
19045 ++
19046 ++#define MPTCP_WVEGAS_SCALE 16
19047 ++
19048 ++/* wVegas variables */
19049 ++struct wvegas {
19050 ++ u32 beg_snd_nxt; /* right edge during last RTT */
19051 ++	u8  doing_wvegas_now; /* if true, do wvegas for this RTT */
19052 ++
19053 ++ u16 cnt_rtt; /* # of RTTs measured within last RTT */
19054 ++ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
19055 ++ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
19056 ++
19057 ++ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
19058 ++ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
19059 ++ int alpha; /* alpha for each subflows */
19060 ++
19061 ++	u32 queue_delay; /* queue delay */
19062 ++};
19063 ++
19064 ++
19065 ++static inline u64 mptcp_wvegas_scale(u32 val, int scale)
19066 ++{
19067 ++ return (u64) val << scale;
19068 ++}
19069 ++
19070 ++static void wvegas_enable(const struct sock *sk)
19071 ++{
19072 ++ const struct tcp_sock *tp = tcp_sk(sk);
19073 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19074 ++
19075 ++ wvegas->doing_wvegas_now = 1;
19076 ++
19077 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19078 ++
19079 ++ wvegas->cnt_rtt = 0;
19080 ++ wvegas->sampled_rtt = 0;
19081 ++
19082 ++ wvegas->instant_rate = 0;
19083 ++ wvegas->alpha = initial_alpha;
19084 ++ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
19085 ++
19086 ++ wvegas->queue_delay = 0;
19087 ++}
19088 ++
19089 ++static inline void wvegas_disable(const struct sock *sk)
19090 ++{
19091 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19092 ++
19093 ++ wvegas->doing_wvegas_now = 0;
19094 ++}
19095 ++
19096 ++static void mptcp_wvegas_init(struct sock *sk)
19097 ++{
19098 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19099 ++
19100 ++ wvegas->base_rtt = 0x7fffffff;
19101 ++ wvegas_enable(sk);
19102 ++}
19103 ++
19104 ++static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
19105 ++{
19106 ++ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
19107 ++}
19108 ++
19109 ++static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
19110 ++{
19111 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19112 ++ u32 vrtt;
19113 ++
19114 ++ if (rtt_us < 0)
19115 ++ return;
19116 ++
19117 ++ vrtt = rtt_us + 1;
19118 ++
19119 ++ if (vrtt < wvegas->base_rtt)
19120 ++ wvegas->base_rtt = vrtt;
19121 ++
19122 ++ wvegas->sampled_rtt += vrtt;
19123 ++ wvegas->cnt_rtt++;
19124 ++}
19125 ++
19126 ++static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
19127 ++{
19128 ++ if (ca_state == TCP_CA_Open)
19129 ++ wvegas_enable(sk);
19130 ++ else
19131 ++ wvegas_disable(sk);
19132 ++}
19133 ++
19134 ++static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
19135 ++{
19136 ++ if (event == CA_EVENT_CWND_RESTART) {
19137 ++ mptcp_wvegas_init(sk);
19138 ++ } else if (event == CA_EVENT_LOSS) {
19139 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19140 ++ wvegas->instant_rate = 0;
19141 ++ }
19142 ++}
19143 ++
19144 ++static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
19145 ++{
19146 ++ return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
19147 ++}
19148 ++
19149 ++static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
19150 ++{
19151 ++ u64 total_rate = 0;
19152 ++ struct sock *sub_sk;
19153 ++ const struct wvegas *wvegas = inet_csk_ca(sk);
19154 ++
19155 ++ if (!mpcb)
19156 ++ return wvegas->weight;
19157 ++
19158 ++
19159 ++ mptcp_for_each_sk(mpcb, sub_sk) {
19160 ++ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
19161 ++
19162 ++		/* sampled_rtt is initialized to 0 */
19163 ++ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
19164 ++ total_rate += sub_wvegas->instant_rate;
19165 ++ }
19166 ++
19167 ++ if (total_rate && wvegas->instant_rate)
19168 ++ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
19169 ++ else
19170 ++ return wvegas->weight;
19171 ++}
19172 ++
19173 ++static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
19174 ++{
19175 ++ struct tcp_sock *tp = tcp_sk(sk);
19176 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19177 ++
19178 ++ if (!wvegas->doing_wvegas_now) {
19179 ++ tcp_reno_cong_avoid(sk, ack, acked);
19180 ++ return;
19181 ++ }
19182 ++
19183 ++ if (after(ack, wvegas->beg_snd_nxt)) {
19184 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19185 ++
19186 ++ if (wvegas->cnt_rtt <= 2) {
19187 ++ tcp_reno_cong_avoid(sk, ack, acked);
19188 ++ } else {
19189 ++ u32 rtt, diff, q_delay;
19190 ++ u64 target_cwnd;
19191 ++
19192 ++ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
19193 ++ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
19194 ++
19195 ++ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
19196 ++
19197 ++ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
19198 ++ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
19199 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19200 ++
19201 ++ } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
19202 ++ tcp_slow_start(tp, acked);
19203 ++ } else {
19204 ++ if (diff >= wvegas->alpha) {
19205 ++ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
19206 ++ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
19207 ++ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
19208 ++ }
19209 ++ if (diff > wvegas->alpha) {
19210 ++ tp->snd_cwnd--;
19211 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19212 ++ } else if (diff < wvegas->alpha) {
19213 ++ tp->snd_cwnd++;
19214 ++ }
19215 ++
19216 ++				/* Try to drain the link queue if needed */
19217 ++ q_delay = rtt - wvegas->base_rtt;
19218 ++ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
19219 ++ wvegas->queue_delay = q_delay;
19220 ++
19221 ++ if (q_delay >= 2 * wvegas->queue_delay) {
19222 ++ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
19223 ++ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
19224 ++ wvegas->queue_delay = 0;
19225 ++ }
19226 ++ }
19227 ++
19228 ++ if (tp->snd_cwnd < 2)
19229 ++ tp->snd_cwnd = 2;
19230 ++ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
19231 ++ tp->snd_cwnd = tp->snd_cwnd_clamp;
19232 ++
19233 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
19234 ++ }
19235 ++
19236 ++ wvegas->cnt_rtt = 0;
19237 ++ wvegas->sampled_rtt = 0;
19238 ++ }
19239 ++ /* Use normal slow start */
19240 ++ else if (tp->snd_cwnd <= tp->snd_ssthresh)
19241 ++ tcp_slow_start(tp, acked);
19242 ++}
19243 ++
19244 ++
19245 ++static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
19246 ++ .init = mptcp_wvegas_init,
19247 ++ .ssthresh = tcp_reno_ssthresh,
19248 ++ .cong_avoid = mptcp_wvegas_cong_avoid,
19249 ++ .pkts_acked = mptcp_wvegas_pkts_acked,
19250 ++ .set_state = mptcp_wvegas_state,
19251 ++ .cwnd_event = mptcp_wvegas_cwnd_event,
19252 ++
19253 ++ .owner = THIS_MODULE,
19254 ++ .name = "wvegas",
19255 ++};
19256 ++
19257 ++static int __init mptcp_wvegas_register(void)
19258 ++{
19259 ++ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
19260 ++ tcp_register_congestion_control(&mptcp_wvegas);
19261 ++ return 0;
19262 ++}
19263 ++
19264 ++static void __exit mptcp_wvegas_unregister(void)
19265 ++{
19266 ++ tcp_unregister_congestion_control(&mptcp_wvegas);
19267 ++}
19268 ++
19269 ++module_init(mptcp_wvegas_register);
19270 ++module_exit(mptcp_wvegas_unregister);
19271 ++
19272 ++MODULE_AUTHOR("Yu Cao, Enhuan Dong");
19273 ++MODULE_LICENSE("GPL");
19274 ++MODULE_DESCRIPTION("MPTCP wVegas");
19275 ++MODULE_VERSION("0.1");
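
To close, a worked example of the wVegas control law implemented in mptcp_wvegas_cong_avoid() above; the numbers are invented, the formulas are the ones in the code:

    \mathit{diff} = cwnd \cdot \frac{rtt - base\_rtt}{rtt}, \qquad
    \alpha = \max\left(2,\ weight \cdot total\_alpha\right)

Suppose cwnd = 10, base_rtt = 40 ms, the averaged rtt of the last round is 50 ms, and the subflow carries half of the connection's rate (weight = 1/2, hence alpha = 5 with the default total_alpha = 10). Then

    \mathit{diff} = 10 \cdot \frac{50 - 40}{50} = 2 < \alpha

so, in congestion avoidance, the window grows by one packet. If queueing pushes rtt to 100 ms, diff = 10 * 60/100 = 6 > alpha and the window shrinks by one while snd_ssthresh is pulled down through mptcp_wvegas_ssthresh(); independently, the queue_delay branch backs the window off once the measured queueing delay reaches twice the smallest value seen so far.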