Gentoo Archives: gentoo-commits

From: Mike Pagano <mpagano@g.o>
To: gentoo-commits@l.g.o
Subject: [gentoo-commits] proj/linux-patches:3.16 commit in: /
Date: Fri, 26 Sep 2014 19:40:22
Message-Id: 1411760417.d9d386b72f6c05e68b48912cc93da59331852155.mpagano@gentoo
1 commit: d9d386b72f6c05e68b48912cc93da59331852155
2 Author: Mike Pagano <mpagano <AT> gentoo <DOT> org>
3 AuthorDate: Fri Sep 26 19:40:17 2014 +0000
4 Commit: Mike Pagano <mpagano <AT> gentoo <DOT> org>
5 CommitDate: Fri Sep 26 19:40:17 2014 +0000
6 URL: http://sources.gentoo.org/gitweb/?p=proj/linux-patches.git;a=commit;h=d9d386b7
7
8 Add multipath-tcp patch. Fix distro config.
9
10 ---
11 0000_README | 4 +
12 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch | 19230 ++++++++++++++++++++++++++
13 4567_distro-Gentoo-Kconfig.patch | 19 +-
14 3 files changed, 19243 insertions(+), 10 deletions(-)
15
16 diff --git a/0000_README b/0000_README
17 index 706e53e..d92e6b7 100644
18 --- a/0000_README
19 +++ b/0000_README
20 @@ -58,6 +58,10 @@ Patch: 2400_kcopy-patch-for-infiniband-driver.patch
21 From: Alexey Shvetsov <alexxy@g.o>
22 Desc: Zero copy for infiniband psm userspace driver
23
24 +Patch: 2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
25 +From: http://multipath-tcp.org/
26 +Desc: Patch for simultaneous use of several IP addresses/interfaces in TCP for better resource utilization, better throughput, and smoother reaction to failures.
27 +
28 Patch: 2700_ThinkPad-30-brightness-control-fix.patch
29 From: Seth Forshee <seth.forshee@×××××××××.com>
30 Desc: ACPI: Disable Windows 8 compatibility for some Lenovo ThinkPads
31
32 diff --git a/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch b/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
33 new file mode 100644
34 index 0000000..3000da3
35 --- /dev/null
36 +++ b/2500_multipath-tcp-v3.16-872d7f6c6f4e.patch
37 @@ -0,0 +1,19230 @@
38 +diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
39 +index 768a0fb67dd6..5a46d91a8df9 100644
40 +--- a/drivers/infiniband/hw/cxgb4/cm.c
41 ++++ b/drivers/infiniband/hw/cxgb4/cm.c
42 +@@ -3432,7 +3432,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos)
43 + */
44 + memset(&tmp_opt, 0, sizeof(tmp_opt));
45 + tcp_clear_options(&tmp_opt);
46 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
47 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL);
48 +
49 + req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
50 + memset(req, 0, sizeof(*req));
51 +diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
52 +index 2faef339d8f2..d86c853ffaad 100644
53 +--- a/include/linux/ipv6.h
54 ++++ b/include/linux/ipv6.h
55 +@@ -256,16 +256,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
56 + return inet_sk(__sk)->pinet6;
57 + }
58 +
59 +-static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops)
60 +-{
61 +- struct request_sock *req = reqsk_alloc(ops);
62 +-
63 +- if (req)
64 +- inet_rsk(req)->pktopts = NULL;
65 +-
66 +- return req;
67 +-}
68 +-
69 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
70 + {
71 + return (struct raw6_sock *)sk;
72 +@@ -309,12 +299,6 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk)
73 + return NULL;
74 + }
75 +
76 +-static inline struct inet6_request_sock *
77 +- inet6_rsk(const struct request_sock *rsk)
78 +-{
79 +- return NULL;
80 +-}
81 +-
82 + static inline struct raw6_sock *raw6_sk(const struct sock *sk)
83 + {
84 + return NULL;
85 +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
86 +index ec89301ada41..99ea4b0e3693 100644
87 +--- a/include/linux/skbuff.h
88 ++++ b/include/linux/skbuff.h
89 +@@ -2784,8 +2784,10 @@ static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
90 + bool zero_okay,
91 + __sum16 check)
92 + {
93 +- if (skb_csum_unnecessary(skb) || (zero_okay && !check)) {
94 +- skb->csum_valid = 1;
95 ++ if (skb_csum_unnecessary(skb)) {
96 ++ return false;
97 ++ } else if (zero_okay && !check) {
98 ++ skb->ip_summed = CHECKSUM_UNNECESSARY;
99 + return false;
100 + }
101 +
102 +diff --git a/include/linux/tcp.h b/include/linux/tcp.h
103 +index a0513210798f..7bc2e078d6ca 100644
104 +--- a/include/linux/tcp.h
105 ++++ b/include/linux/tcp.h
106 +@@ -53,7 +53,7 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb)
107 + /* TCP Fast Open */
108 + #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */
109 + #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */
110 +-#define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl. */
111 ++#define TCP_FASTOPEN_COOKIE_SIZE 4 /* the size employed by this impl. */
112 +
113 + /* TCP Fast Open Cookie as stored in memory */
114 + struct tcp_fastopen_cookie {
115 +@@ -72,6 +72,51 @@ struct tcp_sack_block {
116 + u32 end_seq;
117 + };
118 +
119 ++struct tcp_out_options {
120 ++ u16 options; /* bit field of OPTION_* */
121 ++ u8 ws; /* window scale, 0 to disable */
122 ++ u8 num_sack_blocks;/* number of SACK blocks to include */
123 ++ u8 hash_size; /* bytes in hash_location */
124 ++ u16 mss; /* 0 to disable */
125 ++ __u8 *hash_location; /* temporary pointer, overloaded */
126 ++ __u32 tsval, tsecr; /* need to include OPTION_TS */
127 ++ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
128 ++#ifdef CONFIG_MPTCP
129 ++ u16 mptcp_options; /* bit field of MPTCP related OPTION_* */
130 ++ u8 dss_csum:1,
131 ++ add_addr_v4:1,
132 ++ add_addr_v6:1; /* dss-checksum required? */
133 ++
134 ++ union {
135 ++ struct {
136 ++ __u64 sender_key; /* sender's key for mptcp */
137 ++ __u64 receiver_key; /* receiver's key for mptcp */
138 ++ } mp_capable;
139 ++
140 ++ struct {
141 ++ __u64 sender_truncated_mac;
142 ++ __u32 sender_nonce;
143 ++ /* random number of the sender */
144 ++ __u32 token; /* token for mptcp */
145 ++ u8 low_prio:1;
146 ++ } mp_join_syns;
147 ++ };
148 ++
149 ++ struct {
150 ++ struct in_addr addr;
151 ++ u8 addr_id;
152 ++ } add_addr4;
153 ++
154 ++ struct {
155 ++ struct in6_addr addr;
156 ++ u8 addr_id;
157 ++ } add_addr6;
158 ++
159 ++ u16 remove_addrs; /* list of address id */
160 ++ u8 addr_id; /* address id (mp_join or add_address) */
161 ++#endif /* CONFIG_MPTCP */
162 ++};
163 ++
164 + /*These are used to set the sack_ok field in struct tcp_options_received */
165 + #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */
166 + #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
167 +@@ -95,6 +140,9 @@ struct tcp_options_received {
168 + u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
169 + };
170 +
171 ++struct mptcp_cb;
172 ++struct mptcp_tcp_sock;
173 ++
174 + static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
175 + {
176 + rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
177 +@@ -111,10 +159,7 @@ struct tcp_request_sock_ops;
178 +
179 + struct tcp_request_sock {
180 + struct inet_request_sock req;
181 +-#ifdef CONFIG_TCP_MD5SIG
182 +- /* Only used by TCP MD5 Signature so far. */
183 + const struct tcp_request_sock_ops *af_specific;
184 +-#endif
185 + struct sock *listener; /* needed for TFO */
186 + u32 rcv_isn;
187 + u32 snt_isn;
188 +@@ -130,6 +175,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req)
189 + return (struct tcp_request_sock *)req;
190 + }
191 +
192 ++struct tcp_md5sig_key;
193 ++
194 + struct tcp_sock {
195 + /* inet_connection_sock has to be the first member of tcp_sock */
196 + struct inet_connection_sock inet_conn;
197 +@@ -326,6 +373,37 @@ struct tcp_sock {
198 + * socket. Used to retransmit SYNACKs etc.
199 + */
200 + struct request_sock *fastopen_rsk;
201 ++
202 ++ /* MPTCP/TCP-specific callbacks */
203 ++ const struct tcp_sock_ops *ops;
204 ++
205 ++ struct mptcp_cb *mpcb;
206 ++ struct sock *meta_sk;
207 ++ /* We keep these flags even if CONFIG_MPTCP is not checked, because
208 ++ * it allows checking MPTCP capability just by checking the mpc flag,
209 ++ * rather than adding ifdefs everywhere.
210 ++ */
211 ++ u16 mpc:1, /* Other end is multipath capable */
212 ++ inside_tk_table:1, /* Is the tcp_sock inside the token-table? */
213 ++ send_mp_fclose:1,
214 ++ request_mptcp:1, /* Did we send out an MP_CAPABLE?
215 ++ * (this speeds up mptcp_doit() in tcp_recvmsg)
216 ++ */
217 ++ mptcp_enabled:1, /* Is MPTCP enabled from the application? */
218 ++ pf:1, /* Potentially Failed state: when this flag is set, we
219 ++ * stop using the subflow
220 ++ */
221 ++ mp_killed:1, /* Killed with a tcp_done in mptcp? */
222 ++ was_meta_sk:1, /* This was a meta sk (in case of reuse) */
223 ++ is_master_sk:1,
224 ++ close_it:1, /* Must close socket in mptcp_data_ready? */
225 ++ closing:1;
226 ++ struct mptcp_tcp_sock *mptcp;
227 ++#ifdef CONFIG_MPTCP
228 ++ struct hlist_nulls_node tk_table;
229 ++ u32 mptcp_loc_token;
230 ++ u64 mptcp_loc_key;
231 ++#endif /* CONFIG_MPTCP */
232 + };
233 +
234 + enum tsq_flags {
235 +@@ -337,6 +415,8 @@ enum tsq_flags {
236 + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call
237 + * tcp_v{4|6}_mtu_reduced()
238 + */
239 ++ MPTCP_PATH_MANAGER, /* MPTCP deferred creation of new subflows */
240 ++ MPTCP_SUB_DEFERRED, /* A subflow got deferred - process them */
241 + };
242 +
243 + static inline struct tcp_sock *tcp_sk(const struct sock *sk)
244 +@@ -355,6 +435,7 @@ struct tcp_timewait_sock {
245 + #ifdef CONFIG_TCP_MD5SIG
246 + struct tcp_md5sig_key *tw_md5_key;
247 + #endif
248 ++ struct mptcp_tw *mptcp_tw;
249 + };
250 +
251 + static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk)
252 +diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h
253 +index 74af137304be..83f63033897a 100644
254 +--- a/include/net/inet6_connection_sock.h
255 ++++ b/include/net/inet6_connection_sock.h
256 +@@ -27,6 +27,8 @@ int inet6_csk_bind_conflict(const struct sock *sk,
257 +
258 + struct dst_entry *inet6_csk_route_req(struct sock *sk, struct flowi6 *fl6,
259 + const struct request_sock *req);
260 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
261 ++ const u32 rnd, const u32 synq_hsize);
262 +
263 + struct request_sock *inet6_csk_search_req(const struct sock *sk,
264 + struct request_sock ***prevp,
265 +diff --git a/include/net/inet_common.h b/include/net/inet_common.h
266 +index fe7994c48b75..780f229f46a8 100644
267 +--- a/include/net/inet_common.h
268 ++++ b/include/net/inet_common.h
269 +@@ -1,6 +1,8 @@
270 + #ifndef _INET_COMMON_H
271 + #define _INET_COMMON_H
272 +
273 ++#include <net/sock.h>
274 ++
275 + extern const struct proto_ops inet_stream_ops;
276 + extern const struct proto_ops inet_dgram_ops;
277 +
278 +@@ -13,6 +15,8 @@ struct sock;
279 + struct sockaddr;
280 + struct socket;
281 +
282 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern);
283 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern);
284 + int inet_release(struct socket *sock);
285 + int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
286 + int addr_len, int flags);
287 +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
288 +index 7a4313887568..f62159e39839 100644
289 +--- a/include/net/inet_connection_sock.h
290 ++++ b/include/net/inet_connection_sock.h
291 +@@ -30,6 +30,7 @@
292 +
293 + struct inet_bind_bucket;
294 + struct tcp_congestion_ops;
295 ++struct tcp_options_received;
296 +
297 + /*
298 + * Pointers to address related TCP functions
299 +@@ -243,6 +244,9 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
300 +
301 + struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
302 +
303 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
304 ++ const u32 synq_hsize);
305 ++
306 + struct request_sock *inet_csk_search_req(const struct sock *sk,
307 + struct request_sock ***prevp,
308 + const __be16 rport,
309 +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
310 +index b1edf17bec01..6a32d8d6b85e 100644
311 +--- a/include/net/inet_sock.h
312 ++++ b/include/net/inet_sock.h
313 +@@ -86,10 +86,14 @@ struct inet_request_sock {
314 + wscale_ok : 1,
315 + ecn_ok : 1,
316 + acked : 1,
317 +- no_srccheck: 1;
318 ++ no_srccheck: 1,
319 ++ mptcp_rqsk : 1,
320 ++ saw_mpc : 1;
321 + kmemcheck_bitfield_end(flags);
322 +- struct ip_options_rcu *opt;
323 +- struct sk_buff *pktopts;
324 ++ union {
325 ++ struct ip_options_rcu *opt;
326 ++ struct sk_buff *pktopts;
327 ++ };
328 + u32 ir_mark;
329 + };
330 +
331 +diff --git a/include/net/mptcp.h b/include/net/mptcp.h
332 +new file mode 100644
333 +index 000000000000..712780fc39e4
334 +--- /dev/null
335 ++++ b/include/net/mptcp.h
336 +@@ -0,0 +1,1439 @@
337 ++/*
338 ++ * MPTCP implementation
339 ++ *
340 ++ * Initial Design & Implementation:
341 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
342 ++ *
343 ++ * Current Maintainer & Author:
344 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
345 ++ *
346 ++ * Additional authors:
347 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
348 ++ * Gregory Detal <gregory.detal@×××××××××.be>
349 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
350 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
351 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
352 ++ * Andreas Ripke <ripke@××××××.eu>
353 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
354 ++ * Octavian Purdila <octavian.purdila@×××××.com>
355 ++ * John Ronan <jronan@××××.org>
356 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
357 ++ * Brandon Heller <brandonh@××××××××.edu>
358 ++ *
359 ++ *
360 ++ * This program is free software; you can redistribute it and/or
361 ++ * modify it under the terms of the GNU General Public License
362 ++ * as published by the Free Software Foundation; either version
363 ++ * 2 of the License, or (at your option) any later version.
364 ++ */
365 ++
366 ++#ifndef _MPTCP_H
367 ++#define _MPTCP_H
368 ++
369 ++#include <linux/inetdevice.h>
370 ++#include <linux/ipv6.h>
371 ++#include <linux/list.h>
372 ++#include <linux/net.h>
373 ++#include <linux/netpoll.h>
374 ++#include <linux/skbuff.h>
375 ++#include <linux/socket.h>
376 ++#include <linux/tcp.h>
377 ++#include <linux/kernel.h>
378 ++
379 ++#include <asm/byteorder.h>
380 ++#include <asm/unaligned.h>
381 ++#include <crypto/hash.h>
382 ++#include <net/tcp.h>
383 ++
384 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
385 ++ #define ntohll(x) be64_to_cpu(x)
386 ++ #define htonll(x) cpu_to_be64(x)
387 ++#elif defined(__BIG_ENDIAN_BITFIELD)
388 ++ #define ntohll(x) (x)
389 ++ #define htonll(x) (x)
390 ++#endif
391 ++
392 ++struct mptcp_loc4 {
393 ++ u8 loc4_id;
394 ++ u8 low_prio:1;
395 ++ struct in_addr addr;
396 ++};
397 ++
398 ++struct mptcp_rem4 {
399 ++ u8 rem4_id;
400 ++ __be16 port;
401 ++ struct in_addr addr;
402 ++};
403 ++
404 ++struct mptcp_loc6 {
405 ++ u8 loc6_id;
406 ++ u8 low_prio:1;
407 ++ struct in6_addr addr;
408 ++};
409 ++
410 ++struct mptcp_rem6 {
411 ++ u8 rem6_id;
412 ++ __be16 port;
413 ++ struct in6_addr addr;
414 ++};
415 ++
416 ++struct mptcp_request_sock {
417 ++ struct tcp_request_sock req;
418 ++ /* hlist-nulls entry to the hash-table. Depending on whether this is
419 ++ * a new MPTCP connection or an additional subflow, the request-socket
420 ++ * is either in the mptcp_reqsk_tk_htb or mptcp_reqsk_htb.
421 ++ */
422 ++ struct hlist_nulls_node hash_entry;
423 ++
424 ++ union {
425 ++ struct {
426 ++ /* Only on initial subflows */
427 ++ u64 mptcp_loc_key;
428 ++ u64 mptcp_rem_key;
429 ++ u32 mptcp_loc_token;
430 ++ };
431 ++
432 ++ struct {
433 ++ /* Only on additional subflows */
434 ++ struct mptcp_cb *mptcp_mpcb;
435 ++ u32 mptcp_rem_nonce;
436 ++ u32 mptcp_loc_nonce;
437 ++ u64 mptcp_hash_tmac;
438 ++ };
439 ++ };
440 ++
441 ++ u8 loc_id;
442 ++ u8 rem_id; /* Address-id in the MP_JOIN */
443 ++ u8 dss_csum:1,
444 ++ is_sub:1, /* Is this a new subflow? */
445 ++ low_prio:1, /* Interface set to low-prio? */
446 ++ rcv_low_prio:1;
447 ++};
448 ++
449 ++struct mptcp_options_received {
450 ++ u16 saw_mpc:1,
451 ++ dss_csum:1,
452 ++ drop_me:1,
453 ++
454 ++ is_mp_join:1,
455 ++ join_ack:1,
456 ++
457 ++ saw_low_prio:2, /* 0x1 - low-prio set for this subflow
458 ++ * 0x2 - low-prio set for another subflow
459 ++ */
460 ++ low_prio:1,
461 ++
462 ++ saw_add_addr:2, /* Saw at least one add_addr option:
463 ++ * 0x1: IPv4 - 0x2: IPv6
464 ++ */
465 ++ more_add_addr:1, /* Saw one more add-addr. */
466 ++
467 ++ saw_rem_addr:1, /* Saw at least one rem_addr option */
468 ++ more_rem_addr:1, /* Saw one more rem-addr. */
469 ++
470 ++ mp_fail:1,
471 ++ mp_fclose:1;
472 ++ u8 rem_id; /* Address-id in the MP_JOIN */
473 ++ u8 prio_addr_id; /* Address-id in the MP_PRIO */
474 ++
475 ++ const unsigned char *add_addr_ptr; /* Pointer to add-address option */
476 ++ const unsigned char *rem_addr_ptr; /* Pointer to rem-address option */
477 ++
478 ++ u32 data_ack;
479 ++ u32 data_seq;
480 ++ u16 data_len;
481 ++
482 ++ u32 mptcp_rem_token;/* Remote token */
483 ++
484 ++ /* Key inside the option (from mp_capable or fast_close) */
485 ++ u64 mptcp_key;
486 ++
487 ++ u32 mptcp_recv_nonce;
488 ++ u64 mptcp_recv_tmac;
489 ++ u8 mptcp_recv_mac[20];
490 ++};
491 ++
492 ++struct mptcp_tcp_sock {
493 ++ struct tcp_sock *next; /* Next subflow socket */
494 ++ struct hlist_node cb_list;
495 ++ struct mptcp_options_received rx_opt;
496 ++
497 ++ /* Those three fields record the current mapping */
498 ++ u64 map_data_seq;
499 ++ u32 map_subseq;
500 ++ u16 map_data_len;
501 ++ u16 slave_sk:1,
502 ++ fully_established:1,
503 ++ establish_increased:1,
504 ++ second_packet:1,
505 ++ attached:1,
506 ++ send_mp_fail:1,
507 ++ include_mpc:1,
508 ++ mapping_present:1,
509 ++ map_data_fin:1,
510 ++ low_prio:1, /* use this socket as backup */
511 ++ rcv_low_prio:1, /* Peer sent low-prio option to us */
512 ++ send_mp_prio:1, /* Trigger to send mp_prio on this socket */
513 ++ pre_established:1; /* State between sending 3rd ACK and
514 ++ * receiving the fourth ack of new subflows.
515 ++ */
516 ++
517 ++ /* isn: needed to translate abs to relative subflow seqnums */
518 ++ u32 snt_isn;
519 ++ u32 rcv_isn;
520 ++ u8 path_index;
521 ++ u8 loc_id;
522 ++ u8 rem_id;
523 ++
524 ++#define MPTCP_SCHED_SIZE 4
525 ++ u8 mptcp_sched[MPTCP_SCHED_SIZE] __aligned(8);
526 ++
527 ++ struct sk_buff *shortcut_ofoqueue; /* Shortcut to the current modified
528 ++ * skb in the ofo-queue.
529 ++ */
530 ++
531 ++ int init_rcv_wnd;
532 ++ u32 infinite_cutoff_seq;
533 ++ struct delayed_work work;
534 ++ u32 mptcp_loc_nonce;
535 ++ struct tcp_sock *tp; /* Where is my daddy? */
536 ++ u32 last_end_data_seq;
537 ++
538 ++ /* MP_JOIN subflow: timer for retransmitting the 3rd ack */
539 ++ struct timer_list mptcp_ack_timer;
540 ++
541 ++ /* HMAC of the third ack */
542 ++ char sender_mac[20];
543 ++};
544 ++
545 ++struct mptcp_tw {
546 ++ struct list_head list;
547 ++ u64 loc_key;
548 ++ u64 rcv_nxt;
549 ++ struct mptcp_cb __rcu *mpcb;
550 ++ u8 meta_tw:1,
551 ++ in_list:1;
552 ++};
553 ++
554 ++#define MPTCP_PM_NAME_MAX 16
555 ++struct mptcp_pm_ops {
556 ++ struct list_head list;
557 ++
558 ++ /* Signal the creation of a new MPTCP-session. */
559 ++ void (*new_session)(const struct sock *meta_sk);
560 ++ void (*release_sock)(struct sock *meta_sk);
561 ++ void (*fully_established)(struct sock *meta_sk);
562 ++ void (*new_remote_address)(struct sock *meta_sk);
563 ++ int (*get_local_id)(sa_family_t family, union inet_addr *addr,
564 ++ struct net *net, bool *low_prio);
565 ++ void (*addr_signal)(struct sock *sk, unsigned *size,
566 ++ struct tcp_out_options *opts, struct sk_buff *skb);
567 ++ void (*add_raddr)(struct mptcp_cb *mpcb, const union inet_addr *addr,
568 ++ sa_family_t family, __be16 port, u8 id);
569 ++ void (*rem_raddr)(struct mptcp_cb *mpcb, u8 rem_id);
570 ++ void (*init_subsocket_v4)(struct sock *sk, struct in_addr addr);
571 ++ void (*init_subsocket_v6)(struct sock *sk, struct in6_addr addr);
572 ++
573 ++ char name[MPTCP_PM_NAME_MAX];
574 ++ struct module *owner;
575 ++};
576 ++
577 ++#define MPTCP_SCHED_NAME_MAX 16
578 ++struct mptcp_sched_ops {
579 ++ struct list_head list;
580 ++
581 ++ struct sock * (*get_subflow)(struct sock *meta_sk,
582 ++ struct sk_buff *skb,
583 ++ bool zero_wnd_test);
584 ++ struct sk_buff * (*next_segment)(struct sock *meta_sk,
585 ++ int *reinject,
586 ++ struct sock **subsk,
587 ++ unsigned int *limit);
588 ++ void (*init)(struct sock *sk);
589 ++
590 ++ char name[MPTCP_SCHED_NAME_MAX];
591 ++ struct module *owner;
592 ++};
593 ++
594 ++struct mptcp_cb {
595 ++ /* list of sockets in this multipath connection */
596 ++ struct tcp_sock *connection_list;
597 ++ /* list of sockets that need a call to release_cb */
598 ++ struct hlist_head callback_list;
599 ++
600 ++ /* High-order bits of 64-bit sequence numbers */
601 ++ u32 snd_high_order[2];
602 ++ u32 rcv_high_order[2];
603 ++
604 ++ u16 send_infinite_mapping:1,
605 ++ in_time_wait:1,
606 ++ list_rcvd:1, /* XXX TO REMOVE */
607 ++ addr_signal:1, /* Path-manager wants us to call addr_signal */
608 ++ dss_csum:1,
609 ++ server_side:1,
610 ++ infinite_mapping_rcv:1,
611 ++ infinite_mapping_snd:1,
612 ++ dfin_combined:1, /* Was the DFIN combined with subflow-fin? */
613 ++ passive_close:1,
614 ++ snd_hiseq_index:1, /* Index in snd_high_order of snd_nxt */
615 ++ rcv_hiseq_index:1; /* Index in rcv_high_order of rcv_nxt */
616 ++
617 ++ /* socket count in this connection */
618 ++ u8 cnt_subflows;
619 ++ u8 cnt_established;
620 ++
621 ++ struct mptcp_sched_ops *sched_ops;
622 ++
623 ++ struct sk_buff_head reinject_queue;
624 ++ /* First cache-line boundary is here minus 8 bytes. But from the
625 ++ * reinject-queue only the next and prev pointers are regularly
626 ++ * accessed. Thus, the whole data-path is on a single cache-line.
627 ++ */
628 ++
629 ++ u64 csum_cutoff_seq;
630 ++
631 ++ /***** Start of fields, used for connection closure */
632 ++ spinlock_t tw_lock;
633 ++ unsigned char mptw_state;
634 ++ u8 dfin_path_index;
635 ++
636 ++ struct list_head tw_list;
637 ++
638 ++ /***** Start of fields, used for subflow establishment and closure */
639 ++ atomic_t mpcb_refcnt;
640 ++
641 ++ /* Mutex needed, because otherwise mptcp_close will complain that the
642 ++ * socket is owned by the user.
643 ++ * E.g., mptcp_sub_close_wq is taking the meta-lock.
644 ++ */
645 ++ struct mutex mpcb_mutex;
646 ++
647 ++ /***** Start of fields, used for subflow establishment */
648 ++ struct sock *meta_sk;
649 ++
650 ++ /* Master socket, also part of the connection_list, this
651 ++ * socket is the one that the application sees.
652 ++ */
653 ++ struct sock *master_sk;
654 ++
655 ++ __u64 mptcp_loc_key;
656 ++ __u64 mptcp_rem_key;
657 ++ __u32 mptcp_loc_token;
658 ++ __u32 mptcp_rem_token;
659 ++
660 ++#define MPTCP_PM_SIZE 608
661 ++ u8 mptcp_pm[MPTCP_PM_SIZE] __aligned(8);
662 ++ struct mptcp_pm_ops *pm_ops;
663 ++
664 ++ u32 path_index_bits;
665 ++ /* Next pi to pick up in case a new path becomes available */
666 ++ u8 next_path_index;
667 ++
668 ++ /* Original snd/rcvbuf of the initial subflow.
669 ++ * Used for the new subflows on the server-side to allow correct
670 ++ * autotuning
671 ++ */
672 ++ int orig_sk_rcvbuf;
673 ++ int orig_sk_sndbuf;
674 ++ u32 orig_window_clamp;
675 ++
676 ++ /* Timer for retransmitting SYN/ACK+MP_JOIN */
677 ++ struct timer_list synack_timer;
678 ++};
679 ++
680 ++#define MPTCP_SUB_CAPABLE 0
681 ++#define MPTCP_SUB_LEN_CAPABLE_SYN 12
682 ++#define MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN 12
683 ++#define MPTCP_SUB_LEN_CAPABLE_ACK 20
684 ++#define MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN 20
685 ++
686 ++#define MPTCP_SUB_JOIN 1
687 ++#define MPTCP_SUB_LEN_JOIN_SYN 12
688 ++#define MPTCP_SUB_LEN_JOIN_SYN_ALIGN 12
689 ++#define MPTCP_SUB_LEN_JOIN_SYNACK 16
690 ++#define MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN 16
691 ++#define MPTCP_SUB_LEN_JOIN_ACK 24
692 ++#define MPTCP_SUB_LEN_JOIN_ACK_ALIGN 24
693 ++
694 ++#define MPTCP_SUB_DSS 2
695 ++#define MPTCP_SUB_LEN_DSS 4
696 ++#define MPTCP_SUB_LEN_DSS_ALIGN 4
697 ++
698 ++/* Lengths for seq and ack are the ones without the generic MPTCP-option header,
699 ++ * as they are part of the DSS-option.
700 ++ * To get the total length, just add the different options together.
701 ++ */
702 ++#define MPTCP_SUB_LEN_SEQ 10
703 ++#define MPTCP_SUB_LEN_SEQ_CSUM 12
704 ++#define MPTCP_SUB_LEN_SEQ_ALIGN 12
705 ++
706 ++#define MPTCP_SUB_LEN_SEQ_64 14
707 ++#define MPTCP_SUB_LEN_SEQ_CSUM_64 16
708 ++#define MPTCP_SUB_LEN_SEQ_64_ALIGN 16
709 ++
710 ++#define MPTCP_SUB_LEN_ACK 4
711 ++#define MPTCP_SUB_LEN_ACK_ALIGN 4
712 ++
713 ++#define MPTCP_SUB_LEN_ACK_64 8
714 ++#define MPTCP_SUB_LEN_ACK_64_ALIGN 8
715 ++
716 ++/* This is the "default" option-length we will send out most often.
717 ++ * MPTCP DSS-header
718 ++ * 32-bit data sequence number
719 ++ * 32-bit data ack
720 ++ *
721 ++ * It is necessary to calculate the effective MSS we will be using when
722 ++ * sending data.
723 ++ */
724 ++#define MPTCP_SUB_LEN_DSM_ALIGN (MPTCP_SUB_LEN_DSS_ALIGN + \
725 ++ MPTCP_SUB_LEN_SEQ_ALIGN + \
726 ++ MPTCP_SUB_LEN_ACK_ALIGN)
727 ++
728 ++#define MPTCP_SUB_ADD_ADDR 3
729 ++#define MPTCP_SUB_LEN_ADD_ADDR4 8
730 ++#define MPTCP_SUB_LEN_ADD_ADDR6 20
731 ++#define MPTCP_SUB_LEN_ADD_ADDR4_ALIGN 8
732 ++#define MPTCP_SUB_LEN_ADD_ADDR6_ALIGN 20
733 ++
734 ++#define MPTCP_SUB_REMOVE_ADDR 4
735 ++#define MPTCP_SUB_LEN_REMOVE_ADDR 4
736 ++
737 ++#define MPTCP_SUB_PRIO 5
738 ++#define MPTCP_SUB_LEN_PRIO 3
739 ++#define MPTCP_SUB_LEN_PRIO_ADDR 4
740 ++#define MPTCP_SUB_LEN_PRIO_ALIGN 4
741 ++
742 ++#define MPTCP_SUB_FAIL 6
743 ++#define MPTCP_SUB_LEN_FAIL 12
744 ++#define MPTCP_SUB_LEN_FAIL_ALIGN 12
745 ++
746 ++#define MPTCP_SUB_FCLOSE 7
747 ++#define MPTCP_SUB_LEN_FCLOSE 12
748 ++#define MPTCP_SUB_LEN_FCLOSE_ALIGN 12
749 ++
750 ++
751 ++#define OPTION_MPTCP (1 << 5)
752 ++
753 ++#ifdef CONFIG_MPTCP
754 ++
755 ++/* Used for checking if the mptcp initialization has been successful */
756 ++extern bool mptcp_init_failed;
757 ++
758 ++/* MPTCP options */
759 ++#define OPTION_TYPE_SYN (1 << 0)
760 ++#define OPTION_TYPE_SYNACK (1 << 1)
761 ++#define OPTION_TYPE_ACK (1 << 2)
762 ++#define OPTION_MP_CAPABLE (1 << 3)
763 ++#define OPTION_DATA_ACK (1 << 4)
764 ++#define OPTION_ADD_ADDR (1 << 5)
765 ++#define OPTION_MP_JOIN (1 << 6)
766 ++#define OPTION_MP_FAIL (1 << 7)
767 ++#define OPTION_MP_FCLOSE (1 << 8)
768 ++#define OPTION_REMOVE_ADDR (1 << 9)
769 ++#define OPTION_MP_PRIO (1 << 10)
770 ++
771 ++/* MPTCP flags: both TX and RX */
772 ++#define MPTCPHDR_SEQ 0x01 /* DSS.M option is present */
773 ++#define MPTCPHDR_FIN 0x02 /* DSS.F option is present */
774 ++#define MPTCPHDR_SEQ64_INDEX 0x04 /* index of seq in mpcb->snd_high_order */
775 ++/* MPTCP flags: RX only */
776 ++#define MPTCPHDR_ACK 0x08
777 ++#define MPTCPHDR_SEQ64_SET 0x10 /* Did we receive a 64-bit seq number? */
778 ++#define MPTCPHDR_SEQ64_OFO 0x20 /* Is it not in our circular array? */
779 ++#define MPTCPHDR_DSS_CSUM 0x40
780 ++#define MPTCPHDR_JOIN 0x80
781 ++/* MPTCP flags: TX only */
782 ++#define MPTCPHDR_INF 0x08
783 ++
784 ++struct mptcp_option {
785 ++ __u8 kind;
786 ++ __u8 len;
787 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
788 ++ __u8 ver:4,
789 ++ sub:4;
790 ++#elif defined(__BIG_ENDIAN_BITFIELD)
791 ++ __u8 sub:4,
792 ++ ver:4;
793 ++#else
794 ++#error "Adjust your <asm/byteorder.h> defines"
795 ++#endif
796 ++};
797 ++
798 ++struct mp_capable {
799 ++ __u8 kind;
800 ++ __u8 len;
801 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
802 ++ __u8 ver:4,
803 ++ sub:4;
804 ++ __u8 h:1,
805 ++ rsv:5,
806 ++ b:1,
807 ++ a:1;
808 ++#elif defined(__BIG_ENDIAN_BITFIELD)
809 ++ __u8 sub:4,
810 ++ ver:4;
811 ++ __u8 a:1,
812 ++ b:1,
813 ++ rsv:5,
814 ++ h:1;
815 ++#else
816 ++#error "Adjust your <asm/byteorder.h> defines"
817 ++#endif
818 ++ __u64 sender_key;
819 ++ __u64 receiver_key;
820 ++} __attribute__((__packed__));
821 ++
822 ++struct mp_join {
823 ++ __u8 kind;
824 ++ __u8 len;
825 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
826 ++ __u8 b:1,
827 ++ rsv:3,
828 ++ sub:4;
829 ++#elif defined(__BIG_ENDIAN_BITFIELD)
830 ++ __u8 sub:4,
831 ++ rsv:3,
832 ++ b:1;
833 ++#else
834 ++#error "Adjust your <asm/byteorder.h> defines"
835 ++#endif
836 ++ __u8 addr_id;
837 ++ union {
838 ++ struct {
839 ++ u32 token;
840 ++ u32 nonce;
841 ++ } syn;
842 ++ struct {
843 ++ __u64 mac;
844 ++ u32 nonce;
845 ++ } synack;
846 ++ struct {
847 ++ __u8 mac[20];
848 ++ } ack;
849 ++ } u;
850 ++} __attribute__((__packed__));
851 ++
852 ++struct mp_dss {
853 ++ __u8 kind;
854 ++ __u8 len;
855 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
856 ++ __u16 rsv1:4,
857 ++ sub:4,
858 ++ A:1,
859 ++ a:1,
860 ++ M:1,
861 ++ m:1,
862 ++ F:1,
863 ++ rsv2:3;
864 ++#elif defined(__BIG_ENDIAN_BITFIELD)
865 ++ __u16 sub:4,
866 ++ rsv1:4,
867 ++ rsv2:3,
868 ++ F:1,
869 ++ m:1,
870 ++ M:1,
871 ++ a:1,
872 ++ A:1;
873 ++#else
874 ++#error "Adjust your <asm/byteorder.h> defines"
875 ++#endif
876 ++};
877 ++
878 ++struct mp_add_addr {
879 ++ __u8 kind;
880 ++ __u8 len;
881 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
882 ++ __u8 ipver:4,
883 ++ sub:4;
884 ++#elif defined(__BIG_ENDIAN_BITFIELD)
885 ++ __u8 sub:4,
886 ++ ipver:4;
887 ++#else
888 ++#error "Adjust your <asm/byteorder.h> defines"
889 ++#endif
890 ++ __u8 addr_id;
891 ++ union {
892 ++ struct {
893 ++ struct in_addr addr;
894 ++ __be16 port;
895 ++ } v4;
896 ++ struct {
897 ++ struct in6_addr addr;
898 ++ __be16 port;
899 ++ } v6;
900 ++ } u;
901 ++} __attribute__((__packed__));
902 ++
903 ++struct mp_remove_addr {
904 ++ __u8 kind;
905 ++ __u8 len;
906 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
907 ++ __u8 rsv:4,
908 ++ sub:4;
909 ++#elif defined(__BIG_ENDIAN_BITFIELD)
910 ++ __u8 sub:4,
911 ++ rsv:4;
912 ++#else
913 ++#error "Adjust your <asm/byteorder.h> defines"
914 ++#endif
915 ++ /* list of addr_id */
916 ++ __u8 addrs_id;
917 ++};
918 ++
919 ++struct mp_fail {
920 ++ __u8 kind;
921 ++ __u8 len;
922 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
923 ++ __u16 rsv1:4,
924 ++ sub:4,
925 ++ rsv2:8;
926 ++#elif defined(__BIG_ENDIAN_BITFIELD)
927 ++ __u16 sub:4,
928 ++ rsv1:4,
929 ++ rsv2:8;
930 ++#else
931 ++#error "Adjust your <asm/byteorder.h> defines"
932 ++#endif
933 ++ __be64 data_seq;
934 ++} __attribute__((__packed__));
935 ++
936 ++struct mp_fclose {
937 ++ __u8 kind;
938 ++ __u8 len;
939 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
940 ++ __u16 rsv1:4,
941 ++ sub:4,
942 ++ rsv2:8;
943 ++#elif defined(__BIG_ENDIAN_BITFIELD)
944 ++ __u16 sub:4,
945 ++ rsv1:4,
946 ++ rsv2:8;
947 ++#else
948 ++#error "Adjust your <asm/byteorder.h> defines"
949 ++#endif
950 ++ __u64 key;
951 ++} __attribute__((__packed__));
952 ++
953 ++struct mp_prio {
954 ++ __u8 kind;
955 ++ __u8 len;
956 ++#if defined(__LITTLE_ENDIAN_BITFIELD)
957 ++ __u8 b:1,
958 ++ rsv:3,
959 ++ sub:4;
960 ++#elif defined(__BIG_ENDIAN_BITFIELD)
961 ++ __u8 sub:4,
962 ++ rsv:3,
963 ++ b:1;
964 ++#else
965 ++#error "Adjust your <asm/byteorder.h> defines"
966 ++#endif
967 ++ __u8 addr_id;
968 ++} __attribute__((__packed__));
969 ++
970 ++static inline int mptcp_sub_len_dss(const struct mp_dss *m, const int csum)
971 ++{
972 ++ return 4 + m->A * (4 + m->a * 4) + m->M * (10 + m->m * 4 + csum * 2);
973 ++}
974 ++
975 ++#define MPTCP_APP 2
976 ++
977 ++extern int sysctl_mptcp_enabled;
978 ++extern int sysctl_mptcp_checksum;
979 ++extern int sysctl_mptcp_debug;
980 ++extern int sysctl_mptcp_syn_retries;
981 ++
982 ++extern struct workqueue_struct *mptcp_wq;
983 ++
984 ++#define mptcp_debug(fmt, args...) \
985 ++ do { \
986 ++ if (unlikely(sysctl_mptcp_debug)) \
987 ++ pr_err(__FILE__ ": " fmt, ##args); \
988 ++ } while (0)
989 ++
990 ++/* Iterates over all subflows */
991 ++#define mptcp_for_each_tp(mpcb, tp) \
992 ++ for ((tp) = (mpcb)->connection_list; (tp); (tp) = (tp)->mptcp->next)
993 ++
994 ++#define mptcp_for_each_sk(mpcb, sk) \
995 ++ for ((sk) = (struct sock *)(mpcb)->connection_list; \
996 ++ sk; \
997 ++ sk = (struct sock *)tcp_sk(sk)->mptcp->next)
998 ++
999 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp) \
1000 ++ for (__sk = (struct sock *)(__mpcb)->connection_list, \
1001 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL; \
1002 ++ __sk; \
1003 ++ __sk = __temp, \
1004 ++ __temp = __sk ? (struct sock *)tcp_sk(__sk)->mptcp->next : NULL)
1005 ++
1006 ++/* Iterates over all bits set to 1 in a bitset */
1007 ++#define mptcp_for_each_bit_set(b, i) \
1008 ++ for (i = ffs(b) - 1; i >= 0; i = ffs(b >> (i + 1) << (i + 1)) - 1)
1009 ++
1010 ++#define mptcp_for_each_bit_unset(b, i) \
1011 ++ mptcp_for_each_bit_set(~b, i)
1012 ++
1013 ++extern struct lock_class_key meta_key;
1014 ++extern struct lock_class_key meta_slock_key;
1015 ++extern u32 mptcp_secret[MD5_MESSAGE_BYTES / 4];
1016 ++
1017 ++/* This is needed to ensure that two subsequent key/nonce generations result in
1018 ++ * different keys/nonces if the IPs and ports are the same.
1019 ++ */
1020 ++extern u32 mptcp_seed;
1021 ++
1022 ++#define MPTCP_HASH_SIZE 1024
1023 ++
1024 ++extern struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
1025 ++
1026 ++/* This second hashtable is needed to retrieve request socks
1027 ++ * created as a result of a join request. While the SYN contains
1028 ++ * the token, the final ack does not, so we need a separate hashtable
1029 ++ * to retrieve the mpcb.
1030 ++ */
1031 ++extern struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
1032 ++extern spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
1033 ++
1034 ++/* Lock, protecting the two hash-tables that hold the token. Namely,
1035 ++ * mptcp_reqsk_tk_htb and tk_hashtable
1036 ++ */
1037 ++extern spinlock_t mptcp_tk_hashlock; /* hashtable protection */
1038 ++
1039 ++/* Request-sockets can be hashed in the tk_htb for collision-detection or in
1040 ++ * the regular htb for join-connections. We need to define different NULLS
1041 ++ * values so that we can correctly detect a request-socket that has been
1042 ++ * recycled. See also c25eb3bfb9729.
1043 ++ */
1044 ++#define MPTCP_REQSK_NULLS_BASE (1U << 29)
1045 ++
1046 ++
1047 ++void mptcp_data_ready(struct sock *sk);
1048 ++void mptcp_write_space(struct sock *sk);
1049 ++
1050 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
1051 ++ struct sock *sk);
1052 ++void mptcp_ofo_queue(struct sock *meta_sk);
1053 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp);
1054 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied);
1055 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
1056 ++ gfp_t flags);
1057 ++void mptcp_del_sock(struct sock *sk);
1058 ++void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk);
1059 ++void mptcp_reinject_data(struct sock *orig_sk, int clone_it);
1060 ++void mptcp_update_sndbuf(const struct tcp_sock *tp);
1061 ++void mptcp_send_fin(struct sock *meta_sk);
1062 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority);
1063 ++bool mptcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1064 ++ int push_one, gfp_t gfp);
1065 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
1066 ++ struct mptcp_options_received *mopt);
1067 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
1068 ++ struct mptcp_options_received *mopt,
1069 ++ const struct sk_buff *skb);
1070 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
1071 ++ unsigned *remaining);
1072 ++void mptcp_synack_options(struct request_sock *req,
1073 ++ struct tcp_out_options *opts,
1074 ++ unsigned *remaining);
1075 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
1076 ++ struct tcp_out_options *opts, unsigned *size);
1077 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1078 ++ const struct tcp_out_options *opts,
1079 ++ struct sk_buff *skb);
1080 ++void mptcp_close(struct sock *meta_sk, long timeout);
1081 ++int mptcp_doit(struct sock *sk);
1082 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window);
1083 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req);
1084 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
1085 ++ struct request_sock *req,
1086 ++ struct request_sock **prev);
1087 ++struct sock *mptcp_check_req_child(struct sock *sk, struct sock *child,
1088 ++ struct request_sock *req,
1089 ++ struct request_sock **prev,
1090 ++ const struct mptcp_options_received *mopt);
1091 ++u32 __mptcp_select_window(struct sock *sk);
1092 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
1093 ++ __u32 *window_clamp, int wscale_ok,
1094 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
1095 ++ const struct sock *sk);
1096 ++unsigned int mptcp_current_mss(struct sock *meta_sk);
1097 ++int mptcp_select_size(const struct sock *meta_sk, bool sg);
1098 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn);
1099 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
1100 ++ u32 *hash_out);
1101 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk);
1102 ++void mptcp_fin(struct sock *meta_sk);
1103 ++void mptcp_retransmit_timer(struct sock *meta_sk);
1104 ++int mptcp_write_wakeup(struct sock *meta_sk);
1105 ++void mptcp_sub_close_wq(struct work_struct *work);
1106 ++void mptcp_sub_close(struct sock *sk, unsigned long delay);
1107 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk);
1108 ++void mptcp_fallback_meta_sk(struct sock *meta_sk);
1109 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb);
1110 ++void mptcp_ack_handler(unsigned long);
1111 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time);
1112 ++int mptcp_check_snd_buf(const struct tcp_sock *tp);
1113 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
1114 ++ const struct sk_buff *skb);
1115 ++void __init mptcp_init(void);
1116 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len);
1117 ++void mptcp_destroy_sock(struct sock *sk);
1118 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
1119 ++ const struct sk_buff *skb,
1120 ++ const struct mptcp_options_received *mopt);
1121 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
1122 ++ int large_allowed);
1123 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw);
1124 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw);
1125 ++void mptcp_time_wait(struct sock *sk, int state, int timeo);
1126 ++void mptcp_disconnect(struct sock *sk);
1127 ++bool mptcp_should_expand_sndbuf(const struct sock *sk);
1128 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb);
1129 ++void mptcp_tsq_flags(struct sock *sk);
1130 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk);
1131 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb);
1132 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp);
1133 ++void mptcp_hash_remove(struct tcp_sock *meta_tp);
1134 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token);
1135 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw);
1136 ++int mptcp_do_join_short(struct sk_buff *skb,
1137 ++ const struct mptcp_options_received *mopt,
1138 ++ struct net *net);
1139 ++void mptcp_reqsk_destructor(struct request_sock *req);
1140 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
1141 ++ const struct mptcp_options_received *mopt,
1142 ++ const struct sk_buff *skb);
1143 ++int mptcp_check_req(struct sk_buff *skb, struct net *net);
1144 ++void mptcp_connect_init(struct sock *sk);
1145 ++void mptcp_sub_force_close(struct sock *sk);
1146 ++int mptcp_sub_len_remove_addr_align(u16 bitfield);
1147 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1148 ++ const struct sk_buff *skb);
1149 ++void mptcp_init_buffer_space(struct sock *sk);
1150 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
1151 ++ struct sk_buff *skb);
1152 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb);
1153 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb);
1154 ++void mptcp_init_congestion_control(struct sock *sk);
1155 ++
1156 ++/* MPTCP-path-manager registration/initialization functions */
1157 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm);
1158 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm);
1159 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb);
1160 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb);
1161 ++void mptcp_fallback_default(struct mptcp_cb *mpcb);
1162 ++void mptcp_get_default_path_manager(char *name);
1163 ++int mptcp_set_default_path_manager(const char *name);
1164 ++extern struct mptcp_pm_ops mptcp_pm_default;
1165 ++
1166 ++/* MPTCP-scheduler registration/initialization functions */
1167 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
1168 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
1169 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb);
1170 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb);
1171 ++void mptcp_get_default_scheduler(char *name);
1172 ++int mptcp_set_default_scheduler(const char *name);
1173 ++extern struct mptcp_sched_ops mptcp_sched_default;
1174 ++
1175 ++static inline void mptcp_reset_synack_timer(struct sock *meta_sk,
1176 ++ unsigned long len)
1177 ++{
1178 ++ sk_reset_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer,
1179 ++ jiffies + len);
1180 ++}
1181 ++
1182 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk)
1183 ++{
1184 ++ sk_stop_timer(meta_sk, &tcp_sk(meta_sk)->mpcb->synack_timer);
1185 ++}
1186 ++
1187 ++static inline bool is_mptcp_enabled(const struct sock *sk)
1188 ++{
1189 ++ if (!sysctl_mptcp_enabled || mptcp_init_failed)
1190 ++ return false;
1191 ++
1192 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
1193 ++ return false;
1194 ++
1195 ++ return true;
1196 ++}
1197 ++
1198 ++static inline int mptcp_pi_to_flag(int pi)
1199 ++{
1200 ++ return 1 << (pi - 1);
1201 ++}
1202 ++
1203 ++static inline
1204 ++struct mptcp_request_sock *mptcp_rsk(const struct request_sock *req)
1205 ++{
1206 ++ return (struct mptcp_request_sock *)req;
1207 ++}
1208 ++
1209 ++static inline
1210 ++struct request_sock *rev_mptcp_rsk(const struct mptcp_request_sock *req)
1211 ++{
1212 ++ return (struct request_sock *)req;
1213 ++}
1214 ++
1215 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1216 ++{
1217 ++ struct sock *sk_it;
1218 ++
1219 ++ if (tcp_sk(sk)->mpcb->dss_csum)
1220 ++ return false;
1221 ++
1222 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it) {
1223 ++ if (!(sk_it->sk_route_caps & NETIF_F_SG) ||
1224 ++ !(sk_it->sk_route_caps & NETIF_F_ALL_CSUM))
1225 ++ return false;
1226 ++ }
1227 ++
1228 ++ return true;
1229 ++}
1230 ++
1231 ++static inline void mptcp_push_pending_frames(struct sock *meta_sk)
1232 ++{
1233 ++ /* We check packets out and send-head here. TCP only checks the
1234 ++ * send-head. But, MPTCP also checks packets_out, as this is an
1235 ++ * indication that we might want to do opportunistic reinjection.
1236 ++ */
1237 ++ if (tcp_sk(meta_sk)->packets_out || tcp_send_head(meta_sk)) {
1238 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
1239 ++
1240 ++ /* We don't care about the MSS, because it will be set in
1241 ++ * mptcp_write_xmit.
1242 ++ */
1243 ++ __tcp_push_pending_frames(meta_sk, 0, tp->nonagle);
1244 ++ }
1245 ++}
1246 ++
1247 ++static inline void mptcp_send_reset(struct sock *sk)
1248 ++{
1249 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
1250 ++ mptcp_sub_force_close(sk);
1251 ++}
1252 ++
1253 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1254 ++{
1255 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ;
1256 ++}
1257 ++
1258 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1259 ++{
1260 ++ return TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_FIN;
1261 ++}
1262 ++
1263 ++/* Is it a data-fin while in infinite mapping mode?
1264 ++ * In infinite mode, a subflow-fin is in fact a data-fin.
1265 ++ */
1266 ++static inline bool mptcp_is_data_fin2(const struct sk_buff *skb,
1267 ++ const struct tcp_sock *tp)
1268 ++{
1269 ++ return mptcp_is_data_fin(skb) ||
1270 ++ (tp->mpcb->infinite_mapping_rcv && tcp_hdr(skb)->fin);
1271 ++}
1272 ++
1273 ++static inline u8 mptcp_get_64_bit(u64 data_seq, struct mptcp_cb *mpcb)
1274 ++{
1275 ++ u64 data_seq_high = (u32)(data_seq >> 32);
1276 ++
1277 ++ if (mpcb->rcv_high_order[0] == data_seq_high)
1278 ++ return 0;
1279 ++ else if (mpcb->rcv_high_order[1] == data_seq_high)
1280 ++ return MPTCPHDR_SEQ64_INDEX;
1281 ++ else
1282 ++ return MPTCPHDR_SEQ64_OFO;
1283 ++}
1284 ++
1285 ++/* Sets the data_seq and returns pointer to the in-skb field of the data_seq.
1286 ++ * If the packet has a 64-bit dseq, the pointer points to the last 32 bits.
1287 ++ */
1288 ++static inline __u32 *mptcp_skb_set_data_seq(const struct sk_buff *skb,
1289 ++ u32 *data_seq,
1290 ++ struct mptcp_cb *mpcb)
1291 ++{
1292 ++ __u32 *ptr = (__u32 *)(skb_transport_header(skb) + TCP_SKB_CB(skb)->dss_off);
1293 ++
1294 ++ if (TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_SEQ64_SET) {
1295 ++ u64 data_seq64 = get_unaligned_be64(ptr);
1296 ++
1297 ++ if (mpcb)
1298 ++ TCP_SKB_CB(skb)->mptcp_flags |= mptcp_get_64_bit(data_seq64, mpcb);
1299 ++
1300 ++ *data_seq = (u32)data_seq64;
1301 ++ ptr++;
1302 ++ } else {
1303 ++ *data_seq = get_unaligned_be32(ptr);
1304 ++ }
1305 ++
1306 ++ return ptr;
1307 ++}
1308 ++
1309 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1310 ++{
1311 ++ return tcp_sk(sk)->meta_sk;
1312 ++}
1313 ++
1314 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1315 ++{
1316 ++ return tcp_sk(tp->meta_sk);
1317 ++}
1318 ++
1319 ++static inline int is_meta_tp(const struct tcp_sock *tp)
1320 ++{
1321 ++ return tp->mpcb && mptcp_meta_tp(tp) == tp;
1322 ++}
1323 ++
1324 ++static inline int is_meta_sk(const struct sock *sk)
1325 ++{
1326 ++ return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP &&
1327 ++ mptcp(tcp_sk(sk)) && mptcp_meta_sk(sk) == sk;
1328 ++}
1329 ++
1330 ++static inline int is_master_tp(const struct tcp_sock *tp)
1331 ++{
1332 ++ return !mptcp(tp) || (!tp->mptcp->slave_sk && !is_meta_tp(tp));
1333 ++}
1334 ++
1335 ++static inline void mptcp_hash_request_remove(struct request_sock *req)
1336 ++{
1337 ++ int in_softirq = 0;
1338 ++
1339 ++ if (hlist_nulls_unhashed(&mptcp_rsk(req)->hash_entry))
1340 ++ return;
1341 ++
1342 ++ if (in_softirq()) {
1343 ++ spin_lock(&mptcp_reqsk_hlock);
1344 ++ in_softirq = 1;
1345 ++ } else {
1346 ++ spin_lock_bh(&mptcp_reqsk_hlock);
1347 ++ }
1348 ++
1349 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
1350 ++
1351 ++ if (in_softirq)
1352 ++ spin_unlock(&mptcp_reqsk_hlock);
1353 ++ else
1354 ++ spin_unlock_bh(&mptcp_reqsk_hlock);
1355 ++}
1356 ++
1357 ++static inline void mptcp_init_mp_opt(struct mptcp_options_received *mopt)
1358 ++{
1359 ++ mopt->saw_mpc = 0;
1360 ++ mopt->dss_csum = 0;
1361 ++ mopt->drop_me = 0;
1362 ++
1363 ++ mopt->is_mp_join = 0;
1364 ++ mopt->join_ack = 0;
1365 ++
1366 ++ mopt->saw_low_prio = 0;
1367 ++ mopt->low_prio = 0;
1368 ++
1369 ++ mopt->saw_add_addr = 0;
1370 ++ mopt->more_add_addr = 0;
1371 ++
1372 ++ mopt->saw_rem_addr = 0;
1373 ++ mopt->more_rem_addr = 0;
1374 ++
1375 ++ mopt->mp_fail = 0;
1376 ++ mopt->mp_fclose = 0;
1377 ++}
1378 ++
1379 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp)
1380 ++{
1381 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
1382 ++
1383 ++ mopt->saw_low_prio = 0;
1384 ++ mopt->saw_add_addr = 0;
1385 ++ mopt->more_add_addr = 0;
1386 ++ mopt->saw_rem_addr = 0;
1387 ++ mopt->more_rem_addr = 0;
1388 ++ mopt->join_ack = 0;
1389 ++ mopt->mp_fail = 0;
1390 ++ mopt->mp_fclose = 0;
1391 ++}
1392 ++
1393 ++static inline __be32 mptcp_get_highorder_sndbits(const struct sk_buff *skb,
1394 ++ const struct mptcp_cb *mpcb)
1395 ++{
1396 ++ return htonl(mpcb->snd_high_order[(TCP_SKB_CB(skb)->mptcp_flags &
1397 ++ MPTCPHDR_SEQ64_INDEX) ? 1 : 0]);
1398 ++}
1399 ++
1400 ++static inline u64 mptcp_get_data_seq_64(const struct mptcp_cb *mpcb, int index,
1401 ++ u32 data_seq_32)
1402 ++{
1403 ++ return ((u64)mpcb->rcv_high_order[index] << 32) | data_seq_32;
1404 ++}
1405 ++
1406 ++static inline u64 mptcp_get_rcv_nxt_64(const struct tcp_sock *meta_tp)
1407 ++{
1408 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1409 ++ return mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
1410 ++ meta_tp->rcv_nxt);
1411 ++}
1412 ++
1413 ++static inline void mptcp_check_sndseq_wrap(struct tcp_sock *meta_tp, int inc)
1414 ++{
1415 ++ if (unlikely(meta_tp->snd_nxt > meta_tp->snd_nxt + inc)) {
1416 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1417 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
1418 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] += 2;
1419 ++ }
1420 ++}
1421 ++
1422 ++static inline void mptcp_check_rcvseq_wrap(struct tcp_sock *meta_tp,
1423 ++ u32 old_rcv_nxt)
1424 ++{
1425 ++ if (unlikely(old_rcv_nxt > meta_tp->rcv_nxt)) {
1426 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
1427 ++ mpcb->rcv_high_order[mpcb->rcv_hiseq_index] += 2;
1428 ++ mpcb->rcv_hiseq_index = mpcb->rcv_hiseq_index ? 0 : 1;
1429 ++ }
1430 ++}
1431 ++
1432 ++static inline int mptcp_sk_can_send(const struct sock *sk)
1433 ++{
1434 ++ return tcp_passive_fastopen(sk) ||
1435 ++ ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
1436 ++ !tcp_sk(sk)->mptcp->pre_established);
1437 ++}
1438 ++
1439 ++static inline int mptcp_sk_can_recv(const struct sock *sk)
1440 ++{
1441 ++ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2);
1442 ++}
1443 ++
1444 ++static inline int mptcp_sk_can_send_ack(const struct sock *sk)
1445 ++{
1446 ++ return !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV |
1447 ++ TCPF_CLOSE | TCPF_LISTEN)) &&
1448 ++ !tcp_sk(sk)->mptcp->pre_established;
1449 ++}
1450 ++
1451 ++/* Only support GSO if all subflows support it */
1452 ++static inline bool mptcp_sk_can_gso(const struct sock *meta_sk)
1453 ++{
1454 ++ struct sock *sk;
1455 ++
1456 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1457 ++ return false;
1458 ++
1459 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1460 ++ if (!mptcp_sk_can_send(sk))
1461 ++ continue;
1462 ++ if (!sk_can_gso(sk))
1463 ++ return false;
1464 ++ }
1465 ++ return true;
1466 ++}
1467 ++
1468 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1469 ++{
1470 ++ struct sock *sk;
1471 ++
1472 ++ if (tcp_sk(meta_sk)->mpcb->dss_csum)
1473 ++ return false;
1474 ++
1475 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
1476 ++ if (!mptcp_sk_can_send(sk))
1477 ++ continue;
1478 ++ if (!(sk->sk_route_caps & NETIF_F_SG))
1479 ++ return false;
1480 ++ }
1481 ++ return true;
1482 ++}
1483 ++
1484 ++static inline void mptcp_set_rto(struct sock *sk)
1485 ++{
1486 ++ struct tcp_sock *tp = tcp_sk(sk);
1487 ++ struct sock *sk_it;
1488 ++ struct inet_connection_sock *micsk = inet_csk(mptcp_meta_sk(sk));
1489 ++ __u32 max_rto = 0;
1490 ++
1491 ++ /* We are in recovery-phase on the MPTCP-level. Do not update the
1492 ++ * RTO, because this would kill exponential backoff.
1493 ++ */
1494 ++ if (micsk->icsk_retransmits)
1495 ++ return;
1496 ++
1497 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
1498 ++ if (mptcp_sk_can_send(sk_it) &&
1499 ++ inet_csk(sk_it)->icsk_rto > max_rto)
1500 ++ max_rto = inet_csk(sk_it)->icsk_rto;
1501 ++ }
1502 ++ if (max_rto) {
1503 ++ micsk->icsk_rto = max_rto << 1;
1504 ++
1505 ++ /* A successful rto-measurement - reset backoff counter */
1506 ++ micsk->icsk_backoff = 0;
1507 ++ }
1508 ++}
1509 ++
1510 ++static inline int mptcp_sysctl_syn_retries(void)
1511 ++{
1512 ++ return sysctl_mptcp_syn_retries;
1513 ++}
1514 ++
1515 ++static inline void mptcp_sub_close_passive(struct sock *sk)
1516 ++{
1517 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
1518 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(meta_sk);
1519 ++
1520 ++ /* Only close if the app did a send-shutdown (passive close), and we
1521 ++ * received the data-ack of the data-fin.
1522 ++ */
1523 ++ if (tp->mpcb->passive_close && meta_tp->snd_una == meta_tp->write_seq)
1524 ++ mptcp_sub_close(sk, 0);
1525 ++}
1526 ++
1527 ++static inline bool mptcp_fallback_infinite(struct sock *sk, int flag)
1528 ++{
1529 ++ struct tcp_sock *tp = tcp_sk(sk);
1530 ++
1531 ++ /* If data has been acknowledged on the meta-level, fully_established
1532 ++ * will have been set before and thus we will not fall back to infinite
1533 ++ * mapping.
1534 ++ */
1535 ++ if (likely(tp->mptcp->fully_established))
1536 ++ return false;
1537 ++
1538 ++ if (!(flag & MPTCP_FLAG_DATA_ACKED))
1539 ++ return false;
1540 ++
1541 ++ /* Don't fallback twice ;) */
1542 ++ if (tp->mpcb->infinite_mapping_snd)
1543 ++ return false;
1544 ++
1545 ++ pr_err("%s %#x will fallback - pi %d, src %pI4 dst %pI4 from %pS\n",
1546 ++ __func__, tp->mpcb->mptcp_loc_token, tp->mptcp->path_index,
1547 ++ &inet_sk(sk)->inet_saddr, &inet_sk(sk)->inet_daddr,
1548 ++ __builtin_return_address(0));
1549 ++ if (!is_master_tp(tp))
1550 ++ return true;
1551 ++
1552 ++ tp->mpcb->infinite_mapping_snd = 1;
1553 ++ tp->mpcb->infinite_mapping_rcv = 1;
1554 ++ tp->mptcp->fully_established = 1;
1555 ++
1556 ++ return false;
1557 ++}
1558 ++
1559 ++/* Find the first index whose bit in the bit-field == 0 */
1560 ++static inline u8 mptcp_set_new_pathindex(struct mptcp_cb *mpcb)
1561 ++{
1562 ++ u8 base = mpcb->next_path_index;
1563 ++ int i;
1564 ++
1565 ++ /* Start at 1, because 0 is reserved for the meta-sk */
1566 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits >> base, i) {
1567 ++ if (i + base < 1)
1568 ++ continue;
1569 ++ if (i + base >= sizeof(mpcb->path_index_bits) * 8)
1570 ++ break;
1571 ++ i += base;
1572 ++ mpcb->path_index_bits |= (1 << i);
1573 ++ mpcb->next_path_index = i + 1;
1574 ++ return i;
1575 ++ }
1576 ++ mptcp_for_each_bit_unset(mpcb->path_index_bits, i) {
1577 ++ if (i >= sizeof(mpcb->path_index_bits) * 8)
1578 ++ break;
1579 ++ if (i < 1)
1580 ++ continue;
1581 ++ mpcb->path_index_bits |= (1 << i);
1582 ++ mpcb->next_path_index = i + 1;
1583 ++ return i;
1584 ++ }
1585 ++
1586 ++ return 0;
1587 ++}
1588 ++
1589 ++static inline bool mptcp_v6_is_v4_mapped(const struct sock *sk)
1590 ++{
1591 ++ return sk->sk_family == AF_INET6 &&
1592 ++ ipv6_addr_type(&inet6_sk(sk)->saddr) == IPV6_ADDR_MAPPED;
1593 ++}
1594 ++
1595 ++/* TCP and MPTCP mpc flag-depending functions */
1596 ++u16 mptcp_select_window(struct sock *sk);
1597 ++void mptcp_init_buffer_space(struct sock *sk);
1598 ++void mptcp_tcp_set_rto(struct sock *sk);
1599 ++
1600 ++/* TCP and MPTCP flag-depending functions */
1601 ++bool mptcp_prune_ofo_queue(struct sock *sk);
1602 ++
1603 ++#else /* CONFIG_MPTCP */
1604 ++#define mptcp_debug(fmt, args...) \
1605 ++ do { \
1606 ++ } while (0)
1607 ++
1608 ++/* Without MPTCP, we just do one iteration
1609 ++ * over the only socket available. This assumes that
1610 ++ * the sk/tp arg is the socket in that case.
1611 ++ */
1612 ++#define mptcp_for_each_sk(mpcb, sk)
1613 ++#define mptcp_for_each_sk_safe(__mpcb, __sk, __temp)
1614 ++
1615 ++static inline bool mptcp_is_data_fin(const struct sk_buff *skb)
1616 ++{
1617 ++ return false;
1618 ++}
1619 ++static inline bool mptcp_is_data_seq(const struct sk_buff *skb)
1620 ++{
1621 ++ return false;
1622 ++}
1623 ++static inline struct sock *mptcp_meta_sk(const struct sock *sk)
1624 ++{
1625 ++ return NULL;
1626 ++}
1627 ++static inline struct tcp_sock *mptcp_meta_tp(const struct tcp_sock *tp)
1628 ++{
1629 ++ return NULL;
1630 ++}
1631 ++static inline int is_meta_sk(const struct sock *sk)
1632 ++{
1633 ++ return 0;
1634 ++}
1635 ++static inline int is_master_tp(const struct tcp_sock *tp)
1636 ++{
1637 ++ return 0;
1638 ++}
1639 ++static inline void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp) {}
1640 ++static inline void mptcp_del_sock(const struct sock *sk) {}
1641 ++static inline void mptcp_update_metasocket(struct sock *sock, const struct sock *meta_sk) {}
1642 ++static inline void mptcp_reinject_data(struct sock *orig_sk, int clone_it) {}
1643 ++static inline void mptcp_update_sndbuf(const struct tcp_sock *tp) {}
1644 ++static inline void mptcp_clean_rtx_infinite(const struct sk_buff *skb,
1645 ++ const struct sock *sk) {}
1646 ++static inline void mptcp_sub_close(struct sock *sk, unsigned long delay) {}
1647 ++static inline void mptcp_set_rto(const struct sock *sk) {}
1648 ++static inline void mptcp_send_fin(const struct sock *meta_sk) {}
1649 ++static inline void mptcp_parse_options(const uint8_t *ptr, const int opsize,
1650 ++ const struct mptcp_options_received *mopt,
1651 ++ const struct sk_buff *skb) {}
1652 ++static inline void mptcp_syn_options(const struct sock *sk,
1653 ++ struct tcp_out_options *opts,
1654 ++ unsigned *remaining) {}
1655 ++static inline void mptcp_synack_options(struct request_sock *req,
1656 ++ struct tcp_out_options *opts,
1657 ++ unsigned *remaining) {}
1658 ++
1659 ++static inline void mptcp_established_options(struct sock *sk,
1660 ++ struct sk_buff *skb,
1661 ++ struct tcp_out_options *opts,
1662 ++ unsigned *size) {}
1663 ++static inline void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
1664 ++ const struct tcp_out_options *opts,
1665 ++ struct sk_buff *skb) {}
1666 ++static inline void mptcp_close(struct sock *meta_sk, long timeout) {}
1667 ++static inline int mptcp_doit(struct sock *sk)
1668 ++{
1669 ++ return 0;
1670 ++}
1671 ++static inline int mptcp_check_req_fastopen(struct sock *child,
1672 ++ struct request_sock *req)
1673 ++{
1674 ++ return 1;
1675 ++}
1676 ++static inline int mptcp_check_req_master(const struct sock *sk,
1677 ++ const struct sock *child,
1678 ++ struct request_sock *req,
1679 ++ struct request_sock **prev)
1680 ++{
1681 ++ return 1;
1682 ++}
1683 ++static inline struct sock *mptcp_check_req_child(struct sock *sk,
1684 ++ struct sock *child,
1685 ++ struct request_sock *req,
1686 ++ struct request_sock **prev,
1687 ++ const struct mptcp_options_received *mopt)
1688 ++{
1689 ++ return NULL;
1690 ++}
1691 ++static inline unsigned int mptcp_current_mss(struct sock *meta_sk)
1692 ++{
1693 ++ return 0;
1694 ++}
1695 ++static inline int mptcp_select_size(const struct sock *meta_sk, bool sg)
1696 ++{
1697 ++ return 0;
1698 ++}
1699 ++static inline void mptcp_sub_close_passive(struct sock *sk) {}
1700 ++static inline bool mptcp_fallback_infinite(const struct sock *sk, int flag)
1701 ++{
1702 ++ return false;
1703 ++}
1704 ++static inline void mptcp_init_mp_opt(const struct mptcp_options_received *mopt) {}
1705 ++static inline int mptcp_check_rtt(const struct tcp_sock *tp, int time)
1706 ++{
1707 ++ return 0;
1708 ++}
1709 ++static inline int mptcp_check_snd_buf(const struct tcp_sock *tp)
1710 ++{
1711 ++ return 0;
1712 ++}
1713 ++static inline int mptcp_sysctl_syn_retries(void)
1714 ++{
1715 ++ return 0;
1716 ++}
1717 ++static inline void mptcp_send_reset(const struct sock *sk) {}
1718 ++static inline int mptcp_handle_options(struct sock *sk,
1719 ++ const struct tcphdr *th,
1720 ++ struct sk_buff *skb)
1721 ++{
1722 ++ return 0;
1723 ++}
1724 ++static inline void mptcp_reset_mopt(struct tcp_sock *tp) {}
1725 ++static inline void __init mptcp_init(void) {}
1726 ++static inline int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
1727 ++{
1728 ++ return 0;
1729 ++}
1730 ++static inline bool mptcp_sk_can_gso(const struct sock *sk)
1731 ++{
1732 ++ return false;
1733 ++}
1734 ++static inline bool mptcp_can_sg(const struct sock *meta_sk)
1735 ++{
1736 ++ return false;
1737 ++}
1738 ++static inline unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk,
1739 ++ u32 mss_now, int large_allowed)
1740 ++{
1741 ++ return 0;
1742 ++}
1743 ++static inline void mptcp_destroy_sock(struct sock *sk) {}
1744 ++static inline int mptcp_rcv_synsent_state_process(struct sock *sk,
1745 ++ struct sock **skptr,
1746 ++ struct sk_buff *skb,
1747 ++ const struct mptcp_options_received *mopt)
1748 ++{
1749 ++ return 0;
1750 ++}
1751 ++static inline bool mptcp_can_sendpage(struct sock *sk)
1752 ++{
1753 ++ return false;
1754 ++}
1755 ++static inline int mptcp_init_tw_sock(struct sock *sk,
1756 ++ struct tcp_timewait_sock *tw)
1757 ++{
1758 ++ return 0;
1759 ++}
1760 ++static inline void mptcp_twsk_destructor(struct tcp_timewait_sock *tw) {}
1761 ++static inline void mptcp_disconnect(struct sock *sk) {}
1762 ++static inline void mptcp_tsq_flags(struct sock *sk) {}
1763 ++static inline void mptcp_tsq_sub_deferred(struct sock *meta_sk) {}
1764 ++static inline void mptcp_hash_remove_bh(struct tcp_sock *meta_tp) {}
1765 ++static inline void mptcp_hash_remove(struct tcp_sock *meta_tp) {}
1766 ++static inline void mptcp_reqsk_new_mptcp(struct request_sock *req,
1767 ++ const struct tcp_options_received *rx_opt,
1768 ++ const struct mptcp_options_received *mopt,
1769 ++ const struct sk_buff *skb) {}
1770 ++static inline void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
1771 ++ const struct sk_buff *skb) {}
1772 ++static inline void mptcp_delete_synack_timer(struct sock *meta_sk) {}
1773 ++#endif /* CONFIG_MPTCP */
1774 ++
1775 ++#endif /* _MPTCP_H */
1776 +diff --git a/include/net/mptcp_v4.h b/include/net/mptcp_v4.h
1777 +new file mode 100644
1778 +index 000000000000..93ad97c77c5a
1779 +--- /dev/null
1780 ++++ b/include/net/mptcp_v4.h
1781 +@@ -0,0 +1,67 @@
1782 ++/*
1783 ++ * MPTCP implementation
1784 ++ *
1785 ++ * Initial Design & Implementation:
1786 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1787 ++ *
1788 ++ * Current Maintainer & Author:
1789 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1790 ++ *
1791 ++ * Additional authors:
1792 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1793 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1794 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1795 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1796 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1797 ++ * Andreas Ripke <ripke@××××××.eu>
1798 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1799 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1800 ++ * John Ronan <jronan@××××.org>
1801 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1802 ++ * Brandon Heller <brandonh@××××××××.edu>
1803 ++ *
1804 ++ *
1805 ++ * This program is free software; you can redistribute it and/or
1806 ++ * modify it under the terms of the GNU General Public License
1807 ++ * as published by the Free Software Foundation; either version
1808 ++ * 2 of the License, or (at your option) any later version.
1809 ++ */
1810 ++
1811 ++#ifndef MPTCP_V4_H_
1812 ++#define MPTCP_V4_H_
1813 ++
1814 ++
1815 ++#include <linux/in.h>
1816 ++#include <linux/skbuff.h>
1817 ++#include <net/mptcp.h>
1818 ++#include <net/request_sock.h>
1819 ++#include <net/sock.h>
1820 ++
1821 ++extern struct request_sock_ops mptcp_request_sock_ops;
1822 ++extern const struct inet_connection_sock_af_ops mptcp_v4_specific;
1823 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
1824 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
1825 ++
1826 ++#ifdef CONFIG_MPTCP
1827 ++
1828 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1829 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
1830 ++ const __be32 laddr, const struct net *net);
1831 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
1832 ++ struct mptcp_rem4 *rem);
1833 ++int mptcp_pm_v4_init(void);
1834 ++void mptcp_pm_v4_undo(void);
1835 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1836 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport);
1837 ++
1838 ++#else
1839 ++
1840 ++static inline int mptcp_v4_do_rcv(const struct sock *meta_sk,
1841 ++ const struct sk_buff *skb)
1842 ++{
1843 ++ return 0;
1844 ++}
1845 ++
1846 ++#endif /* CONFIG_MPTCP */
1847 ++
1848 ++#endif /* MPTCP_V4_H_ */
1849 +diff --git a/include/net/mptcp_v6.h b/include/net/mptcp_v6.h
1850 +new file mode 100644
1851 +index 000000000000..49a4f30ccd4d
1852 +--- /dev/null
1853 ++++ b/include/net/mptcp_v6.h
1854 +@@ -0,0 +1,69 @@
1855 ++/*
1856 ++ * MPTCP implementation
1857 ++ *
1858 ++ * Initial Design & Implementation:
1859 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1860 ++ *
1861 ++ * Current Maintainer & Author:
1862 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1863 ++ *
1864 ++ * Additional authors:
1865 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1866 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1867 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1868 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1869 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1870 ++ * Andreas Ripke <ripke@××××××.eu>
1871 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1872 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1873 ++ * John Ronan <jronan@××××.org>
1874 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1875 ++ * Brandon Heller <brandonh@××××××××.edu>
1876 ++ *
1877 ++ *
1878 ++ * This program is free software; you can redistribute it and/or
1879 ++ * modify it under the terms of the GNU General Public License
1880 ++ * as published by the Free Software Foundation; either version
1881 ++ * 2 of the License, or (at your option) any later version.
1882 ++ */
1883 ++
1884 ++#ifndef _MPTCP_V6_H
1885 ++#define _MPTCP_V6_H
1886 ++
1887 ++#include <linux/in6.h>
1888 ++#include <net/if_inet6.h>
1889 ++
1890 ++#include <net/mptcp.h>
1891 ++
1892 ++
1893 ++#ifdef CONFIG_MPTCP
1894 ++extern const struct inet_connection_sock_af_ops mptcp_v6_mapped;
1895 ++extern const struct inet_connection_sock_af_ops mptcp_v6_specific;
1896 ++extern struct request_sock_ops mptcp6_request_sock_ops;
1897 ++extern struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
1898 ++extern struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
1899 ++
1900 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb);
1901 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
1902 ++ const struct in6_addr *laddr, const struct net *net);
1903 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
1904 ++ struct mptcp_rem6 *rem);
1905 ++int mptcp_pm_v6_init(void);
1906 ++void mptcp_pm_v6_undo(void);
1907 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
1908 ++ __be16 sport, __be16 dport);
1909 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
1910 ++ __be16 sport, __be16 dport);
1911 ++
1912 ++#else /* CONFIG_MPTCP */
1913 ++
1914 ++#define mptcp_v6_mapped ipv6_mapped
1915 ++
1916 ++static inline int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
1917 ++{
1918 ++ return 0;
1919 ++}
1920 ++
1921 ++#endif /* CONFIG_MPTCP */
1922 ++
1923 ++#endif /* _MPTCP_V6_H */
1924 +diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
1925 +index 361d26077196..bae95a11c531 100644
1926 +--- a/include/net/net_namespace.h
1927 ++++ b/include/net/net_namespace.h
1928 +@@ -16,6 +16,7 @@
1929 + #include <net/netns/packet.h>
1930 + #include <net/netns/ipv4.h>
1931 + #include <net/netns/ipv6.h>
1932 ++#include <net/netns/mptcp.h>
1933 + #include <net/netns/ieee802154_6lowpan.h>
1934 + #include <net/netns/sctp.h>
1935 + #include <net/netns/dccp.h>
1936 +@@ -92,6 +93,9 @@ struct net {
1937 + #if IS_ENABLED(CONFIG_IPV6)
1938 + struct netns_ipv6 ipv6;
1939 + #endif
1940 ++#if IS_ENABLED(CONFIG_MPTCP)
1941 ++ struct netns_mptcp mptcp;
1942 ++#endif
1943 + #if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
1944 + struct netns_ieee802154_lowpan ieee802154_lowpan;
1945 + #endif
1946 +diff --git a/include/net/netns/mptcp.h b/include/net/netns/mptcp.h
1947 +new file mode 100644
1948 +index 000000000000..bad418b04cc8
1949 +--- /dev/null
1950 ++++ b/include/net/netns/mptcp.h
1951 +@@ -0,0 +1,44 @@
1952 ++/*
1953 ++ * MPTCP implementation - MPTCP namespace
1954 ++ *
1955 ++ * Initial Design & Implementation:
1956 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
1957 ++ *
1958 ++ * Current Maintainer:
1959 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
1960 ++ *
1961 ++ * Additional authors:
1962 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
1963 ++ * Gregory Detal <gregory.detal@×××××××××.be>
1964 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
1965 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
1966 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
1967 ++ * Andreas Ripke <ripke@××××××.eu>
1968 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
1969 ++ * Octavian Purdila <octavian.purdila@×××××.com>
1970 ++ * John Ronan <jronan@××××.org>
1971 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
1972 ++ * Brandon Heller <brandonh@××××××××.edu>
1973 ++ *
1974 ++ *
1975 ++ * This program is free software; you can redistribute it and/or
1976 ++ * modify it under the terms of the GNU General Public License
1977 ++ * as published by the Free Software Foundation; either version
1978 ++ * 2 of the License, or (at your option) any later version.
1979 ++ */
1980 ++
1981 ++#ifndef __NETNS_MPTCP_H__
1982 ++#define __NETNS_MPTCP_H__
1983 ++
1984 ++#include <linux/compiler.h>
1985 ++
1986 ++enum {
1987 ++ MPTCP_PM_FULLMESH = 0,
1988 ++ MPTCP_PM_MAX
1989 ++};
1990 ++
1991 ++struct netns_mptcp {
1992 ++ void *path_managers[MPTCP_PM_MAX];
1993 ++};
1994 ++
1995 ++#endif /* __NETNS_MPTCP_H__ */
1996 +diff --git a/include/net/request_sock.h b/include/net/request_sock.h
1997 +index 7f830ff67f08..e79e87a8e1a6 100644
1998 +--- a/include/net/request_sock.h
1999 ++++ b/include/net/request_sock.h
2000 +@@ -164,7 +164,7 @@ struct request_sock_queue {
2001 + };
2002 +
2003 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2004 +- unsigned int nr_table_entries);
2005 ++ unsigned int nr_table_entries, gfp_t flags);
2006 +
2007 + void __reqsk_queue_destroy(struct request_sock_queue *queue);
2008 + void reqsk_queue_destroy(struct request_sock_queue *queue);
2009 +diff --git a/include/net/sock.h b/include/net/sock.h
2010 +index 156350745700..0e23cae8861f 100644
2011 +--- a/include/net/sock.h
2012 ++++ b/include/net/sock.h
2013 +@@ -901,6 +901,16 @@ void sk_clear_memalloc(struct sock *sk);
2014 +
2015 + int sk_wait_data(struct sock *sk, long *timeo);
2016 +
2017 ++/* START - needed for MPTCP */
2018 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, int family);
2019 ++void sock_lock_init(struct sock *sk);
2020 ++
2021 ++extern struct lock_class_key af_callback_keys[AF_MAX];
2022 ++extern char *const af_family_clock_key_strings[AF_MAX+1];
2023 ++
2024 ++#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2025 ++/* END - needed for MPTCP */
2026 ++
2027 + struct request_sock_ops;
2028 + struct timewait_sock_ops;
2029 + struct inet_hashinfo;
2030 +diff --git a/include/net/tcp.h b/include/net/tcp.h
2031 +index 7286db80e8b8..ff92e74cd684 100644
2032 +--- a/include/net/tcp.h
2033 ++++ b/include/net/tcp.h
2034 +@@ -177,6 +177,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2035 + #define TCPOPT_SACK 5 /* SACK Block */
2036 + #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
2037 + #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
2038 ++#define TCPOPT_MPTCP 30
2039 + #define TCPOPT_EXP 254 /* Experimental */
2040 + /* Magic number to be after the option value for sharing TCP
2041 + * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
2042 +@@ -229,6 +230,27 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
2043 + #define TFO_SERVER_WO_SOCKOPT1 0x400
2044 + #define TFO_SERVER_WO_SOCKOPT2 0x800
2045 +
2046 ++/* Flags from tcp_input.c for tcp_ack */
2047 ++#define FLAG_DATA 0x01 /* Incoming frame contained data. */
2048 ++#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
2049 ++#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
2050 ++#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
2051 ++#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
2052 ++#define FLAG_DATA_SACKED 0x20 /* New SACK. */
2053 ++#define FLAG_ECE 0x40 /* ECE in this ACK */
2054 ++#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
2055 ++#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
2056 ++#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
2057 ++#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
2058 ++#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
2059 ++#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
2060 ++#define MPTCP_FLAG_DATA_ACKED 0x8000
2061 ++
2062 ++#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
2063 ++#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
2064 ++#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
2065 ++#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
2066 ++
2067 + extern struct inet_timewait_death_row tcp_death_row;
2068 +
2069 + /* sysctl variables for tcp */
2070 +@@ -344,6 +366,107 @@ extern struct proto tcp_prot;
2071 + #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val)
2072 + #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
2073 +
2074 ++/**** START - Exports needed for MPTCP ****/
2075 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops;
2076 ++extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops;
2077 ++
2078 ++struct mptcp_options_received;
2079 ++
2080 ++void tcp_enter_quickack_mode(struct sock *sk);
2081 ++int tcp_close_state(struct sock *sk);
2082 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
2083 ++ const struct sk_buff *skb);
2084 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent);
2085 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
2086 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
2087 ++ gfp_t gfp_mask);
2088 ++unsigned int tcp_mss_split_point(const struct sock *sk,
2089 ++ const struct sk_buff *skb,
2090 ++ unsigned int mss_now,
2091 ++ unsigned int max_segs,
2092 ++ int nonagle);
2093 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2094 ++ unsigned int cur_mss, int nonagle);
2095 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
2096 ++ unsigned int cur_mss);
2097 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp, const struct sk_buff *skb);
2098 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
2099 ++ unsigned int mss_now);
2100 ++void __pskb_trim_head(struct sk_buff *skb, int len);
2101 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb);
2102 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags);
2103 ++void tcp_reset(struct sock *sk);
2104 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
2105 ++ const u32 ack_seq, const u32 nwin);
2106 ++bool tcp_urg_mode(const struct tcp_sock *tp);
2107 ++void tcp_ack_probe(struct sock *sk);
2108 ++void tcp_rearm_rto(struct sock *sk);
2109 ++int tcp_write_timeout(struct sock *sk);
2110 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
2111 ++ unsigned int timeout, bool syn_set);
2112 ++void tcp_write_err(struct sock *sk);
2113 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr);
2114 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
2115 ++ unsigned int mss_now);
2116 ++
2117 ++int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req);
2118 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2119 ++ struct request_sock *req);
2120 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb);
2121 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
2122 ++ struct flowi *fl,
2123 ++ struct request_sock *req,
2124 ++ u16 queue_mapping,
2125 ++ struct tcp_fastopen_cookie *foc);
2126 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb);
2127 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb);
2128 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb);
2129 ++void tcp_v4_reqsk_destructor(struct request_sock *req);
2130 ++
2131 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req);
2132 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
2133 ++ struct request_sock *req);
2134 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb);
2135 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
2136 ++ struct flowi *fl, struct request_sock *req,
2137 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2138 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
2139 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
2140 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
2141 ++void tcp_v6_destroy_sock(struct sock *sk);
2142 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
2143 ++void tcp_v6_hash(struct sock *sk);
2144 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb);
2144 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb);
2145 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
2146 ++ struct request_sock *req,
2147 ++ struct dst_entry *dst);
2148 ++void tcp_v6_reqsk_destructor(struct request_sock *req);
2149 ++
2150 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2151 ++ int large_allowed);
2152 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb);
2153 ++
2154 ++void skb_clone_fraglist(struct sk_buff *skb);
2155 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old);
2156 ++
2157 ++void inet_twsk_free(struct inet_timewait_sock *tw);
2158 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb);
2159 ++/* These states need RST on ABORT according to RFC793 */
2160 ++static inline bool tcp_need_reset(int state)
2161 ++{
2162 ++ return (1 << state) &
2163 ++ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2164 ++ TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2165 ++}
2166 ++
2167 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
2168 ++ int hlen);
2169 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
2170 ++ bool *fragstolen);
2171 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to,
2172 ++ struct sk_buff *from, bool *fragstolen);
2173 ++/**** END - Exports needed for MPTCP ****/
2174 ++
2175 + void tcp_tasklet_init(void);
2176 +
2177 + void tcp_v4_err(struct sk_buff *skb, u32);
2178 +@@ -440,6 +563,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2179 + size_t len, int nonblock, int flags, int *addr_len);
2180 + void tcp_parse_options(const struct sk_buff *skb,
2181 + struct tcp_options_received *opt_rx,
2182 ++ struct mptcp_options_received *mopt_rx,
2183 + int estab, struct tcp_fastopen_cookie *foc);
2184 + const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
2185 +
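Aside (not part of the patch): the extra mptcp_options_received argument threaded through tcp_parse_options() above is where MPTCP option data ends up. The corresponding tcp_input.c hunk is not shown in this excerpt; purely as an illustration of the idea, and with a helper name and pointer convention that are assumptions, a parser loop could dispatch kind-30 options like this:

#include <net/mptcp.h>
#include <net/tcp.h>

/* Illustrative only: walk a TCP option block and hand TCPOPT_MPTCP
 * (kind 30) options to the mptcp_parse_options() helper declared
 * (and stubbed for !CONFIG_MPTCP) in net/mptcp.h. The real parser's
 * pointer convention may differ. */
static void mptcp_scan_options_sketch(const unsigned char *opt, int optlen,
                                      struct mptcp_options_received *mopt,
                                      const struct sk_buff *skb)
{
        while (optlen > 1) {
                int kind = opt[0];
                int size = opt[1];

                if (kind == TCPOPT_EOL)
                        break;
                if (kind == TCPOPT_NOP) {
                        opt++;
                        optlen--;
                        continue;
                }
                if (size < 2 || size > optlen)
                        break;
                if (kind == TCPOPT_MPTCP)
                        mptcp_parse_options(opt, size, mopt, skb);
                opt += size;
                optlen -= size;
        }
}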
2186 +@@ -493,14 +617,8 @@ static inline u32 tcp_cookie_time(void)
2187 +
2188 + u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2189 + u16 *mssp);
2190 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mss);
2191 +-#else
2192 +-static inline __u32 cookie_v4_init_sequence(struct sock *sk,
2193 +- struct sk_buff *skb,
2194 +- __u16 *mss)
2195 +-{
2196 +- return 0;
2197 +-}
2198 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2199 ++ __u16 *mss);
2200 + #endif
2201 +
2202 + __u32 cookie_init_timestamp(struct request_sock *req);
2203 +@@ -516,13 +634,6 @@ u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
2204 + const struct tcphdr *th, u16 *mssp);
2205 + __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
2206 + __u16 *mss);
2207 +-#else
2208 +-static inline __u32 cookie_v6_init_sequence(struct sock *sk,
2209 +- struct sk_buff *skb,
2210 +- __u16 *mss)
2211 +-{
2212 +- return 0;
2213 +-}
2214 + #endif
2215 + /* tcp_output.c */
2216 +
2217 +@@ -551,10 +662,17 @@ void tcp_send_delayed_ack(struct sock *sk);
2218 + void tcp_send_loss_probe(struct sock *sk);
2219 + bool tcp_schedule_loss_probe(struct sock *sk);
2220 +
2221 ++u16 tcp_select_window(struct sock *sk);
2222 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
2223 ++ int push_one, gfp_t gfp);
2224 ++
2225 + /* tcp_input.c */
2226 + void tcp_resume_early_retransmit(struct sock *sk);
2227 + void tcp_rearm_rto(struct sock *sk);
2228 + void tcp_reset(struct sock *sk);
2229 ++void tcp_set_rto(struct sock *sk);
2230 ++bool tcp_should_expand_sndbuf(const struct sock *sk);
2231 ++bool tcp_prune_ofo_queue(struct sock *sk);
2232 +
2233 + /* tcp_timer.c */
2234 + void tcp_init_xmit_timers(struct sock *);
2235 +@@ -703,14 +821,27 @@ void tcp_send_window_probe(struct sock *sk);
2236 + */
2237 + struct tcp_skb_cb {
2238 + union {
2239 +- struct inet_skb_parm h4;
2240 ++ union {
2241 ++ struct inet_skb_parm h4;
2242 + #if IS_ENABLED(CONFIG_IPV6)
2243 +- struct inet6_skb_parm h6;
2244 ++ struct inet6_skb_parm h6;
2245 + #endif
2246 +- } header; /* For incoming frames */
2247 ++ } header; /* For incoming frames */
2248 ++#ifdef CONFIG_MPTCP
2249 ++ union { /* For MPTCP outgoing frames */
2250 ++ __u32 path_mask; /* paths that tried to send this skb */
2251 ++ __u32 dss[6]; /* DSS options */
2252 ++ };
2253 ++#endif
2254 ++ };
2255 + __u32 seq; /* Starting sequence number */
2256 + __u32 end_seq; /* SEQ + FIN + SYN + datalen */
2257 + __u32 when; /* used to compute rtt's */
2258 ++#ifdef CONFIG_MPTCP
2259 ++ __u8 mptcp_flags; /* flags for the MPTCP layer */
2260 ++ __u8 dss_off; /* Number of 4-byte words until
2261 ++ * seq-number */
2262 ++#endif
2263 + __u8 tcp_flags; /* TCP header flags. (tcp[13]) */
2264 +
2265 + __u8 sacked; /* State flags for SACK/FACK. */
2266 +@@ -1075,7 +1206,8 @@ u32 tcp_default_init_rwnd(u32 mss);
2267 + /* Determine a window scaling and initial window to offer. */
2268 + void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
2269 + __u32 *window_clamp, int wscale_ok,
2270 +- __u8 *rcv_wscale, __u32 init_rcv_wnd);
2271 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2272 ++ const struct sock *sk);
2273 +
2274 + static inline int tcp_win_from_space(int space)
2275 + {
2276 +@@ -1084,15 +1216,34 @@ static inline int tcp_win_from_space(int space)
2277 + space - (space>>sysctl_tcp_adv_win_scale);
2278 + }
2279 +
2280 ++#ifdef CONFIG_MPTCP
2281 ++extern struct static_key mptcp_static_key;
2282 ++static inline bool mptcp(const struct tcp_sock *tp)
2283 ++{
2284 ++ return static_key_false(&mptcp_static_key) && tp->mpc;
2285 ++}
2286 ++#else
2287 ++static inline bool mptcp(const struct tcp_sock *tp)
2288 ++{
2289 ++ return 0;
2290 ++}
2291 ++#endif
2292 ++
2293 + /* Note: caller must be prepared to deal with negative returns */
2294 + static inline int tcp_space(const struct sock *sk)
2295 + {
2296 ++ if (mptcp(tcp_sk(sk)))
2297 ++ sk = tcp_sk(sk)->meta_sk;
2298 ++
2299 + return tcp_win_from_space(sk->sk_rcvbuf -
2300 + atomic_read(&sk->sk_rmem_alloc));
2301 + }
2302 +
2303 + static inline int tcp_full_space(const struct sock *sk)
2304 + {
2305 ++ if (mptcp(tcp_sk(sk)))
2306 ++ sk = tcp_sk(sk)->meta_sk;
2307 ++
2308 + return tcp_win_from_space(sk->sk_rcvbuf);
2309 + }
2310 +
2311 +@@ -1115,6 +1266,8 @@ static inline void tcp_openreq_init(struct request_sock *req,
2312 + ireq->wscale_ok = rx_opt->wscale_ok;
2313 + ireq->acked = 0;
2314 + ireq->ecn_ok = 0;
2315 ++ ireq->mptcp_rqsk = 0;
2316 ++ ireq->saw_mpc = 0;
2317 + ireq->ir_rmt_port = tcp_hdr(skb)->source;
2318 + ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
2319 + }
2320 +@@ -1585,6 +1738,11 @@ int tcp4_proc_init(void);
2321 + void tcp4_proc_exit(void);
2322 + #endif
2323 +
2324 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req);
2325 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
2326 ++ const struct tcp_request_sock_ops *af_ops,
2327 ++ struct sock *sk, struct sk_buff *skb);
2328 ++
2329 + /* TCP af-specific functions */
2330 + struct tcp_sock_af_ops {
2331 + #ifdef CONFIG_TCP_MD5SIG
2332 +@@ -1601,7 +1759,32 @@ struct tcp_sock_af_ops {
2333 + #endif
2334 + };
2335 +
2336 ++/* TCP/MPTCP-specific functions */
2337 ++struct tcp_sock_ops {
2338 ++ u32 (*__select_window)(struct sock *sk);
2339 ++ u16 (*select_window)(struct sock *sk);
2340 ++ void (*select_initial_window)(int __space, __u32 mss, __u32 *rcv_wnd,
2341 ++ __u32 *window_clamp, int wscale_ok,
2342 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
2343 ++ const struct sock *sk);
2344 ++ void (*init_buffer_space)(struct sock *sk);
2345 ++ void (*set_rto)(struct sock *sk);
2346 ++ bool (*should_expand_sndbuf)(const struct sock *sk);
2347 ++ void (*send_fin)(struct sock *sk);
2348 ++ bool (*write_xmit)(struct sock *sk, unsigned int mss_now, int nonagle,
2349 ++ int push_one, gfp_t gfp);
2350 ++ void (*send_active_reset)(struct sock *sk, gfp_t priority);
2351 ++ int (*write_wakeup)(struct sock *sk);
2352 ++ bool (*prune_ofo_queue)(struct sock *sk);
2353 ++ void (*retransmit_timer)(struct sock *sk);
2354 ++ void (*time_wait)(struct sock *sk, int state, int timeo);
2355 ++ void (*cleanup_rbuf)(struct sock *sk, int copied);
2356 ++ void (*init_congestion_control)(struct sock *sk);
2357 ++};
2358 ++extern const struct tcp_sock_ops tcp_specific;
2359 ++
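Aside (not part of the patch): tcp_sock_ops above is the patch's central indirection point. Plain TCP sockets get the tcp_specific table (defined in the net/ipv4/tcp.c hunk further down), while an MPTCP meta-socket can install its own table, so shared code calls through tp->ops instead of naming the TCP function directly. A short sketch of that pattern, with an illustrative helper name:

#include <net/tcp.h>

/* Sketch of the dispatch the later tcp.c hunks switch to: callers stop
 * hard-coding tcp_cleanup_rbuf()/tcp_send_fin() and go through the
 * per-socket ops table (tcp_cleanup_rbuf/tcp_send_fin for plain TCP,
 * MPTCP-aware variants on a meta-socket). */
static void close_write_side_sketch(struct sock *sk, int copied)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tp->ops->cleanup_rbuf(sk, copied);
        if (tcp_close_state(sk))
                tp->ops->send_fin(sk);
}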
2360 + struct tcp_request_sock_ops {
2361 ++ u16 mss_clamp;
2362 + #ifdef CONFIG_TCP_MD5SIG
2363 + struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk,
2364 + struct request_sock *req);
2365 +@@ -1611,8 +1794,39 @@ struct tcp_request_sock_ops {
2366 + const struct request_sock *req,
2367 + const struct sk_buff *skb);
2368 + #endif
2369 ++ int (*init_req)(struct request_sock *req, struct sock *sk,
2370 ++ struct sk_buff *skb);
2371 ++#ifdef CONFIG_SYN_COOKIES
2372 ++ __u32 (*cookie_init_seq)(struct sock *sk, const struct sk_buff *skb,
2373 ++ __u16 *mss);
2374 ++#endif
2375 ++ struct dst_entry *(*route_req)(struct sock *sk, struct flowi *fl,
2376 ++ const struct request_sock *req,
2377 ++ bool *strict);
2378 ++ __u32 (*init_seq)(const struct sk_buff *skb);
2379 ++ int (*send_synack)(struct sock *sk, struct dst_entry *dst,
2380 ++ struct flowi *fl, struct request_sock *req,
2381 ++ u16 queue_mapping, struct tcp_fastopen_cookie *foc);
2382 ++ void (*queue_hash_add)(struct sock *sk, struct request_sock *req,
2383 ++ const unsigned long timeout);
2384 + };
2385 +
2386 ++#ifdef CONFIG_SYN_COOKIES
2387 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2388 ++ struct sock *sk, struct sk_buff *skb,
2389 ++ __u16 *mss)
2390 ++{
2391 ++ return ops->cookie_init_seq(sk, skb, mss);
2392 ++}
2393 ++#else
2394 ++static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops,
2395 ++ struct sock *sk, struct sk_buff *skb,
2396 ++ __u16 *mss)
2397 ++{
2398 ++ return 0;
2399 ++}
2400 ++#endif
2401 ++
2402 + int tcpv4_offload_init(void);
2403 +
2404 + void tcp_v4_init(void);
2405 +diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
2406 +index 9cf2394f0bcf..c2634b6ed854 100644
2407 +--- a/include/uapi/linux/if.h
2408 ++++ b/include/uapi/linux/if.h
2409 +@@ -109,6 +109,9 @@ enum net_device_flags {
2410 + #define IFF_DORMANT IFF_DORMANT
2411 + #define IFF_ECHO IFF_ECHO
2412 +
2413 ++#define IFF_NOMULTIPATH 0x80000 /* Disable for MPTCP */
2414 ++#define IFF_MPBACKUP 0x100000 /* Use as backup path for MPTCP */
2415 ++
2416 + #define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_ECHO|\
2417 + IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
2418 +
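Aside (not part of the patch): the two new interface flags are per-device knobs, IFF_NOMULTIPATH to keep a device out of MPTCP entirely and IFF_MPBACKUP to use it only as a backup path. A hedged sketch of how a path manager might consult them; the helper below is illustrative, not from the patch:

#include <linux/netdevice.h>

/* Illustrative helper: decide whether a device may carry MPTCP
 * subflows, and whether it should be treated as backup-only. */
static bool mptcp_dev_usable(const struct net_device *dev, bool *backup)
{
        if (!(dev->flags & IFF_UP) || (dev->flags & IFF_NOMULTIPATH))
                return false;           /* administratively excluded */

        *backup = !!(dev->flags & IFF_MPBACKUP);
        return true;
}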
2419 +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
2420 +index 3b9718328d8b..487475681d84 100644
2421 +--- a/include/uapi/linux/tcp.h
2422 ++++ b/include/uapi/linux/tcp.h
2423 +@@ -112,6 +112,7 @@ enum {
2424 + #define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
2425 + #define TCP_TIMESTAMP 24
2426 + #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
2427 ++#define MPTCP_ENABLED 26
2428 +
2429 + struct tcp_repair_opt {
2430 + __u32 opt_code;
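Aside (not part of the patch): MPTCP_ENABLED is the new per-socket switch, and the do_tcp_setsockopt() hunk near the end of this excerpt only accepts it while the socket is still in TCP_CLOSE or TCP_LISTEN. A minimal user-space sketch of flipping it, defining the value 26 locally because stock libc headers do not know the name:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef MPTCP_ENABLED
#define MPTCP_ENABLED 26        /* from the patched include/uapi/linux/tcp.h */
#endif

int main(void)
{
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

        if (fd < 0)
                return 1;
        /* Must happen before connect()/listen(); once the connection is
         * established the kernel rejects the option with EPERM. */
        if (setsockopt(fd, IPPROTO_TCP, MPTCP_ENABLED, &one, sizeof(one)))
                perror("setsockopt(MPTCP_ENABLED)");
        return 0;
}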
2431 +diff --git a/net/Kconfig b/net/Kconfig
2432 +index d92afe4204d9..96b58593ad5e 100644
2433 +--- a/net/Kconfig
2434 ++++ b/net/Kconfig
2435 +@@ -79,6 +79,7 @@ if INET
2436 + source "net/ipv4/Kconfig"
2437 + source "net/ipv6/Kconfig"
2438 + source "net/netlabel/Kconfig"
2439 ++source "net/mptcp/Kconfig"
2440 +
2441 + endif # if INET
2442 +
2443 +diff --git a/net/Makefile b/net/Makefile
2444 +index cbbbe6d657ca..244bac1435b1 100644
2445 +--- a/net/Makefile
2446 ++++ b/net/Makefile
2447 +@@ -20,6 +20,7 @@ obj-$(CONFIG_INET) += ipv4/
2448 + obj-$(CONFIG_XFRM) += xfrm/
2449 + obj-$(CONFIG_UNIX) += unix/
2450 + obj-$(CONFIG_NET) += ipv6/
2451 ++obj-$(CONFIG_MPTCP) += mptcp/
2452 + obj-$(CONFIG_PACKET) += packet/
2453 + obj-$(CONFIG_NET_KEY) += key/
2454 + obj-$(CONFIG_BRIDGE) += bridge/
2455 +diff --git a/net/core/dev.c b/net/core/dev.c
2456 +index 367a586d0c8a..215d2757fbf6 100644
2457 +--- a/net/core/dev.c
2458 ++++ b/net/core/dev.c
2459 +@@ -5420,7 +5420,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
2460 +
2461 + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2462 + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2463 +- IFF_AUTOMEDIA)) |
2464 ++ IFF_AUTOMEDIA | IFF_NOMULTIPATH | IFF_MPBACKUP)) |
2465 + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2466 + IFF_ALLMULTI));
2467 +
2468 +diff --git a/net/core/request_sock.c b/net/core/request_sock.c
2469 +index 467f326126e0..909dfa13f499 100644
2470 +--- a/net/core/request_sock.c
2471 ++++ b/net/core/request_sock.c
2472 +@@ -38,7 +38,8 @@ int sysctl_max_syn_backlog = 256;
2473 + EXPORT_SYMBOL(sysctl_max_syn_backlog);
2474 +
2475 + int reqsk_queue_alloc(struct request_sock_queue *queue,
2476 +- unsigned int nr_table_entries)
2477 ++ unsigned int nr_table_entries,
2478 ++ gfp_t flags)
2479 + {
2480 + size_t lopt_size = sizeof(struct listen_sock);
2481 + struct listen_sock *lopt;
2482 +@@ -48,9 +49,11 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
2483 + nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
2484 + lopt_size += nr_table_entries * sizeof(struct request_sock *);
2485 + if (lopt_size > PAGE_SIZE)
2486 +- lopt = vzalloc(lopt_size);
2487 ++ lopt = __vmalloc(lopt_size,
2488 ++ flags | __GFP_HIGHMEM | __GFP_ZERO,
2489 ++ PAGE_KERNEL);
2490 + else
2491 +- lopt = kzalloc(lopt_size, GFP_KERNEL);
2492 ++ lopt = kzalloc(lopt_size, flags);
2493 + if (lopt == NULL)
2494 + return -ENOMEM;
2495 +
2496 +diff --git a/net/core/skbuff.c b/net/core/skbuff.c
2497 +index c1a33033cbe2..8abc5d60fbe3 100644
2498 +--- a/net/core/skbuff.c
2499 ++++ b/net/core/skbuff.c
2500 +@@ -472,7 +472,7 @@ static inline void skb_drop_fraglist(struct sk_buff *skb)
2501 + skb_drop_list(&skb_shinfo(skb)->frag_list);
2502 + }
2503 +
2504 +-static void skb_clone_fraglist(struct sk_buff *skb)
2505 ++void skb_clone_fraglist(struct sk_buff *skb)
2506 + {
2507 + struct sk_buff *list;
2508 +
2509 +@@ -897,7 +897,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off)
2510 + skb->inner_mac_header += off;
2511 + }
2512 +
2513 +-static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2514 ++void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
2515 + {
2516 + __copy_skb_header(new, old);
2517 +
2518 +diff --git a/net/core/sock.c b/net/core/sock.c
2519 +index 026e01f70274..359295523177 100644
2520 +--- a/net/core/sock.c
2521 ++++ b/net/core/sock.c
2522 +@@ -136,6 +136,11 @@
2523 +
2524 + #include <trace/events/sock.h>
2525 +
2526 ++#ifdef CONFIG_MPTCP
2527 ++#include <net/mptcp.h>
2528 ++#include <net/inet_common.h>
2529 ++#endif
2530 ++
2531 + #ifdef CONFIG_INET
2532 + #include <net/tcp.h>
2533 + #endif
2534 +@@ -280,7 +285,7 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
2535 + "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
2536 + "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
2537 + };
2538 +-static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2539 ++char *const af_family_clock_key_strings[AF_MAX+1] = {
2540 + "clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
2541 + "clock-AF_AX25" , "clock-AF_IPX" , "clock-AF_APPLETALK",
2542 + "clock-AF_NETROM", "clock-AF_BRIDGE" , "clock-AF_ATMPVC" ,
2543 +@@ -301,7 +306,7 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
2544 + * sk_callback_lock locking rules are per-address-family,
2545 + * so split the lock classes by using a per-AF key:
2546 + */
2547 +-static struct lock_class_key af_callback_keys[AF_MAX];
2548 ++struct lock_class_key af_callback_keys[AF_MAX];
2549 +
2550 + /* Take into consideration the size of the struct sk_buff overhead in the
2551 + * determination of these values, since that is non-constant across
2552 +@@ -422,8 +427,6 @@ static void sock_warn_obsolete_bsdism(const char *name)
2553 + }
2554 + }
2555 +
2556 +-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
2557 +-
2558 + static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
2559 + {
2560 + if (sk->sk_flags & flags) {
2561 +@@ -1253,8 +1256,25 @@ lenout:
2562 + *
2563 + * (We also register the sk_lock with the lock validator.)
2564 + */
2565 +-static inline void sock_lock_init(struct sock *sk)
2566 +-{
2567 ++void sock_lock_init(struct sock *sk)
2568 ++{
2569 ++#ifdef CONFIG_MPTCP
2570 ++ /* Reclassify the lock-class for subflows */
2571 ++ if (sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP)
2572 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->is_master_sk) {
2573 ++ sock_lock_init_class_and_name(sk, "slock-AF_INET-MPTCP",
2574 ++ &meta_slock_key,
2575 ++ "sk_lock-AF_INET-MPTCP",
2576 ++ &meta_key);
2577 ++
2578 ++ /* We don't yet have the MPTCP control block attached,
2579 ++ * so we still need inet_sock_destruct as the destructor.
2580 ++ */
2581 ++ sk->sk_destruct = inet_sock_destruct;
2582 ++ return;
2583 ++ }
2584 ++#endif
2585 ++
2586 + sock_lock_init_class_and_name(sk,
2587 + af_family_slock_key_strings[sk->sk_family],
2588 + af_family_slock_keys + sk->sk_family,
2589 +@@ -1301,7 +1321,7 @@ void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
2590 + }
2591 + EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
2592 +
2593 +-static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2594 ++struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2595 + int family)
2596 + {
2597 + struct sock *sk;
2598 +diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
2599 +index 4db3c2a1679c..04cb17d4b0ce 100644
2600 +--- a/net/dccp/ipv6.c
2601 ++++ b/net/dccp/ipv6.c
2602 +@@ -386,7 +386,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
2603 + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
2604 + goto drop;
2605 +
2606 +- req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
2607 ++ req = inet_reqsk_alloc(&dccp6_request_sock_ops);
2608 + if (req == NULL)
2609 + goto drop;
2610 +
2611 +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
2612 +index 05c57f0fcabe..630434db0085 100644
2613 +--- a/net/ipv4/Kconfig
2614 ++++ b/net/ipv4/Kconfig
2615 +@@ -556,6 +556,30 @@ config TCP_CONG_ILLINOIS
2616 + For further details see:
2617 + http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
2618 +
2619 ++config TCP_CONG_COUPLED
2620 ++ tristate "MPTCP COUPLED CONGESTION CONTROL"
2621 ++ depends on MPTCP
2622 ++ default n
2623 ++ ---help---
2624 ++ MultiPath TCP Coupled Congestion Control
2625 ++ To enable it, just put 'coupled' in tcp_congestion_control
2626 ++
2627 ++config TCP_CONG_OLIA
2628 ++ tristate "MPTCP Opportunistic Linked Increase"
2629 ++ depends on MPTCP
2630 ++ default n
2631 ++ ---help---
2632 ++ MultiPath TCP Opportunistic Linked Increase Congestion Control
2633 ++ To enable it, just put 'olia' in tcp_congestion_control
2634 ++
2635 ++config TCP_CONG_WVEGAS
2636 ++ tristate "MPTCP WVEGAS CONGESTION CONTROL"
2637 ++ depends on MPTCP
2638 ++ default n
2639 ++ ---help---
2640 ++ wVegas congestion control for MPTCP
2641 ++ To enable it, just put 'wvegas' in tcp_congestion_control
2642 ++
2643 + choice
2644 + prompt "Default TCP congestion control"
2645 + default DEFAULT_CUBIC
2646 +@@ -584,6 +608,15 @@ choice
2647 + config DEFAULT_WESTWOOD
2648 + bool "Westwood" if TCP_CONG_WESTWOOD=y
2649 +
2650 ++ config DEFAULT_COUPLED
2651 ++ bool "Coupled" if TCP_CONG_COUPLED=y
2652 ++
2653 ++ config DEFAULT_OLIA
2654 ++ bool "Olia" if TCP_CONG_OLIA=y
2655 ++
2656 ++ config DEFAULT_WVEGAS
2657 ++ bool "Wvegas" if TCP_CONG_WVEGAS=y
2658 ++
2659 + config DEFAULT_RENO
2660 + bool "Reno"
2661 +
2662 +@@ -605,6 +638,8 @@ config DEFAULT_TCP_CONG
2663 + default "vegas" if DEFAULT_VEGAS
2664 + default "westwood" if DEFAULT_WESTWOOD
2665 + default "veno" if DEFAULT_VENO
2666 ++ default "coupled" if DEFAULT_COUPLED
2667 ++ default "wvegas" if DEFAULT_WVEGAS
2668 + default "reno" if DEFAULT_RENO
2669 + default "cubic"
2670 +
2671 +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
2672 +index d156b3c5f363..4afd6d8d9028 100644
2673 +--- a/net/ipv4/af_inet.c
2674 ++++ b/net/ipv4/af_inet.c
2675 +@@ -104,6 +104,7 @@
2676 + #include <net/ip_fib.h>
2677 + #include <net/inet_connection_sock.h>
2678 + #include <net/tcp.h>
2679 ++#include <net/mptcp.h>
2680 + #include <net/udp.h>
2681 + #include <net/udplite.h>
2682 + #include <net/ping.h>
2683 +@@ -246,8 +247,7 @@ EXPORT_SYMBOL(inet_listen);
2684 + * Create an inet socket.
2685 + */
2686 +
2687 +-static int inet_create(struct net *net, struct socket *sock, int protocol,
2688 +- int kern)
2689 ++int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
2690 + {
2691 + struct sock *sk;
2692 + struct inet_protosw *answer;
2693 +@@ -676,6 +676,23 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags)
2694 + lock_sock(sk2);
2695 +
2696 + sock_rps_record_flow(sk2);
2697 ++
2698 ++ if (sk2->sk_protocol == IPPROTO_TCP && mptcp(tcp_sk(sk2))) {
2699 ++ struct sock *sk_it = sk2;
2700 ++
2701 ++ mptcp_for_each_sk(tcp_sk(sk2)->mpcb, sk_it)
2702 ++ sock_rps_record_flow(sk_it);
2703 ++
2704 ++ if (tcp_sk(sk2)->mpcb->master_sk) {
2705 ++ sk_it = tcp_sk(sk2)->mpcb->master_sk;
2706 ++
2707 ++ write_lock_bh(&sk_it->sk_callback_lock);
2708 ++ sk_it->sk_wq = newsock->wq;
2709 ++ sk_it->sk_socket = newsock;
2710 ++ write_unlock_bh(&sk_it->sk_callback_lock);
2711 ++ }
2712 ++ }
2713 ++
2714 + WARN_ON(!((1 << sk2->sk_state) &
2715 + (TCPF_ESTABLISHED | TCPF_SYN_RECV |
2716 + TCPF_CLOSE_WAIT | TCPF_CLOSE)));
2717 +@@ -1763,6 +1780,9 @@ static int __init inet_init(void)
2718 +
2719 + ip_init();
2720 +
2721 ++ /* We must initialize MPTCP before TCP. */
2722 ++ mptcp_init();
2723 ++
2724 + tcp_v4_init();
2725 +
2726 + /* Setup TCP slab cache for open requests. */
2727 +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
2728 +index 14d02ea905b6..7d734d8af19b 100644
2729 +--- a/net/ipv4/inet_connection_sock.c
2730 ++++ b/net/ipv4/inet_connection_sock.c
2731 +@@ -23,6 +23,7 @@
2732 + #include <net/route.h>
2733 + #include <net/tcp_states.h>
2734 + #include <net/xfrm.h>
2735 ++#include <net/mptcp.h>
2736 +
2737 + #ifdef INET_CSK_DEBUG
2738 + const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
2739 +@@ -465,8 +466,8 @@ no_route:
2740 + }
2741 + EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
2742 +
2743 +-static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
2744 +- const u32 rnd, const u32 synq_hsize)
2745 ++u32 inet_synq_hash(const __be32 raddr, const __be16 rport, const u32 rnd,
2746 ++ const u32 synq_hsize)
2747 + {
2748 + return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
2749 + }
2750 +@@ -647,7 +648,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
2751 +
2752 + lopt->clock_hand = i;
2753 +
2754 +- if (lopt->qlen)
2755 ++ if (lopt->qlen && !is_meta_sk(parent))
2756 + inet_csk_reset_keepalive_timer(parent, interval);
2757 + }
2758 + EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
2759 +@@ -664,7 +665,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
2760 + const struct request_sock *req,
2761 + const gfp_t priority)
2762 + {
2763 +- struct sock *newsk = sk_clone_lock(sk, priority);
2764 ++ struct sock *newsk;
2765 ++
2766 ++ newsk = sk_clone_lock(sk, priority);
2767 +
2768 + if (newsk != NULL) {
2769 + struct inet_connection_sock *newicsk = inet_csk(newsk);
2770 +@@ -743,7 +746,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
2771 + {
2772 + struct inet_sock *inet = inet_sk(sk);
2773 + struct inet_connection_sock *icsk = inet_csk(sk);
2774 +- int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
2775 ++ int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries,
2776 ++ GFP_KERNEL);
2777 +
2778 + if (rc != 0)
2779 + return rc;
2780 +@@ -801,9 +805,14 @@ void inet_csk_listen_stop(struct sock *sk)
2781 +
2782 + while ((req = acc_req) != NULL) {
2783 + struct sock *child = req->sk;
2784 ++ bool mutex_taken = false;
2785 +
2786 + acc_req = req->dl_next;
2787 +
2788 ++ if (is_meta_sk(child)) {
2789 ++ mutex_lock(&tcp_sk(child)->mpcb->mpcb_mutex);
2790 ++ mutex_taken = true;
2791 ++ }
2792 + local_bh_disable();
2793 + bh_lock_sock(child);
2794 + WARN_ON(sock_owned_by_user(child));
2795 +@@ -832,6 +841,8 @@ void inet_csk_listen_stop(struct sock *sk)
2796 +
2797 + bh_unlock_sock(child);
2798 + local_bh_enable();
2799 ++ if (mutex_taken)
2800 ++ mutex_unlock(&tcp_sk(child)->mpcb->mpcb_mutex);
2801 + sock_put(child);
2802 +
2803 + sk_acceptq_removed(sk);
2804 +diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
2805 +index c86624b36a62..0ff3fe004d62 100644
2806 +--- a/net/ipv4/syncookies.c
2807 ++++ b/net/ipv4/syncookies.c
2808 +@@ -170,7 +170,8 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
2809 + }
2810 + EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence);
2811 +
2812 +-__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
2813 ++__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
2814 ++ __u16 *mssp)
2815 + {
2816 + const struct iphdr *iph = ip_hdr(skb);
2817 + const struct tcphdr *th = tcp_hdr(skb);
2818 +@@ -284,7 +285,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2819 +
2820 + /* check for timestamp cookie support */
2821 + memset(&tcp_opt, 0, sizeof(tcp_opt));
2822 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
2823 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
2824 +
2825 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
2826 + goto out;
2827 +@@ -355,10 +356,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
2828 + /* Try to redo what tcp_v4_send_synack did. */
2829 + req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
2830 +
2831 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
2832 +- &req->rcv_wnd, &req->window_clamp,
2833 +- ireq->wscale_ok, &rcv_wscale,
2834 +- dst_metric(&rt->dst, RTAX_INITRWND));
2835 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
2836 ++ &req->rcv_wnd, &req->window_clamp,
2837 ++ ireq->wscale_ok, &rcv_wscale,
2838 ++ dst_metric(&rt->dst, RTAX_INITRWND), sk);
2839 +
2840 + ireq->rcv_wscale = rcv_wscale;
2841 +
2842 +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
2843 +index 9d2118e5fbc7..2cb89f886d45 100644
2844 +--- a/net/ipv4/tcp.c
2845 ++++ b/net/ipv4/tcp.c
2846 +@@ -271,6 +271,7 @@
2847 +
2848 + #include <net/icmp.h>
2849 + #include <net/inet_common.h>
2850 ++#include <net/mptcp.h>
2851 + #include <net/tcp.h>
2852 + #include <net/xfrm.h>
2853 + #include <net/ip.h>
2854 +@@ -371,6 +372,24 @@ static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
2855 + return period;
2856 + }
2857 +
2858 ++const struct tcp_sock_ops tcp_specific = {
2859 ++ .__select_window = __tcp_select_window,
2860 ++ .select_window = tcp_select_window,
2861 ++ .select_initial_window = tcp_select_initial_window,
2862 ++ .init_buffer_space = tcp_init_buffer_space,
2863 ++ .set_rto = tcp_set_rto,
2864 ++ .should_expand_sndbuf = tcp_should_expand_sndbuf,
2865 ++ .init_congestion_control = tcp_init_congestion_control,
2866 ++ .send_fin = tcp_send_fin,
2867 ++ .write_xmit = tcp_write_xmit,
2868 ++ .send_active_reset = tcp_send_active_reset,
2869 ++ .write_wakeup = tcp_write_wakeup,
2870 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
2871 ++ .retransmit_timer = tcp_retransmit_timer,
2872 ++ .time_wait = tcp_time_wait,
2873 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
2874 ++};
2875 ++
2876 + /* Address-family independent initialization for a tcp_sock.
2877 + *
2878 + * NOTE: A lot of things set to zero explicitly by call to
2879 +@@ -419,6 +438,8 @@ void tcp_init_sock(struct sock *sk)
2880 + sk->sk_sndbuf = sysctl_tcp_wmem[1];
2881 + sk->sk_rcvbuf = sysctl_tcp_rmem[1];
2882 +
2883 ++ tp->ops = &tcp_specific;
2884 ++
2885 + local_bh_disable();
2886 + sock_update_memcg(sk);
2887 + sk_sockets_allocated_inc(sk);
2888 +@@ -726,6 +747,14 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
2889 + int ret;
2890 +
2891 + sock_rps_record_flow(sk);
2892 ++
2893 ++#ifdef CONFIG_MPTCP
2894 ++ if (mptcp(tcp_sk(sk))) {
2895 ++ struct sock *sk_it;
2896 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk_it)
2897 ++ sock_rps_record_flow(sk_it);
2898 ++ }
2899 ++#endif
2900 + /*
2901 + * We can't seek on a socket input
2902 + */
2903 +@@ -821,8 +850,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
2904 + return NULL;
2905 + }
2906 +
2907 +-static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
2908 +- int large_allowed)
2909 ++unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed)
2910 + {
2911 + struct tcp_sock *tp = tcp_sk(sk);
2912 + u32 xmit_size_goal, old_size_goal;
2913 +@@ -872,8 +900,13 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
2914 + {
2915 + int mss_now;
2916 +
2917 +- mss_now = tcp_current_mss(sk);
2918 +- *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2919 ++ if (mptcp(tcp_sk(sk))) {
2920 ++ mss_now = mptcp_current_mss(sk);
2921 ++ *size_goal = mptcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2922 ++ } else {
2923 ++ mss_now = tcp_current_mss(sk);
2924 ++ *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
2925 ++ }
2926 +
2927 + return mss_now;
2928 + }
2929 +@@ -892,11 +925,32 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
2930 + * is fully established.
2931 + */
2932 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2933 +- !tcp_passive_fastopen(sk)) {
2934 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2935 ++ tp->mpcb->master_sk : sk)) {
2936 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
2937 + goto out_err;
2938 + }
2939 +
2940 ++ if (mptcp(tp)) {
2941 ++ struct sock *sk_it = sk;
2942 ++
2943 ++ /* We must check this with the socket lock held because we iterate
2944 ++ * over the subflows.
2945 ++ */
2946 ++ if (!mptcp_can_sendpage(sk)) {
2947 ++ ssize_t ret;
2948 ++
2949 ++ release_sock(sk);
2950 ++ ret = sock_no_sendpage(sk->sk_socket, page, offset,
2951 ++ size, flags);
2952 ++ lock_sock(sk);
2953 ++ return ret;
2954 ++ }
2955 ++
2956 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
2957 ++ sock_rps_record_flow(sk_it);
2958 ++ }
2959 ++
2960 + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2961 +
2962 + mss_now = tcp_send_mss(sk, &size_goal, flags);
2963 +@@ -1001,8 +1055,9 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
2964 + {
2965 + ssize_t res;
2966 +
2967 +- if (!(sk->sk_route_caps & NETIF_F_SG) ||
2968 +- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
2969 ++ /* If MPTCP is enabled, we check it later after establishment */
2970 ++ if (!mptcp(tcp_sk(sk)) && (!(sk->sk_route_caps & NETIF_F_SG) ||
2971 ++ !(sk->sk_route_caps & NETIF_F_ALL_CSUM)))
2972 + return sock_no_sendpage(sk->sk_socket, page, offset, size,
2973 + flags);
2974 +
2975 +@@ -1018,6 +1073,9 @@ static inline int select_size(const struct sock *sk, bool sg)
2976 + const struct tcp_sock *tp = tcp_sk(sk);
2977 + int tmp = tp->mss_cache;
2978 +
2979 ++ if (mptcp(tp))
2980 ++ return mptcp_select_size(sk, sg);
2981 ++
2982 + if (sg) {
2983 + if (sk_can_gso(sk)) {
2984 + /* Small frames wont use a full page:
2985 +@@ -1100,11 +1158,18 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
2986 + * is fully established.
2987 + */
2988 + if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
2989 +- !tcp_passive_fastopen(sk)) {
2990 ++ !tcp_passive_fastopen(mptcp(tp) && tp->mpcb->master_sk ?
2991 ++ tp->mpcb->master_sk : sk)) {
2992 + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
2993 + goto do_error;
2994 + }
2995 +
2996 ++ if (mptcp(tp)) {
2997 ++ struct sock *sk_it = sk;
2998 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
2999 ++ sock_rps_record_flow(sk_it);
3000 ++ }
3001 ++
3002 + if (unlikely(tp->repair)) {
3003 + if (tp->repair_queue == TCP_RECV_QUEUE) {
3004 + copied = tcp_send_rcvq(sk, msg, size);
3005 +@@ -1132,7 +1197,10 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3006 + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
3007 + goto out_err;
3008 +
3009 +- sg = !!(sk->sk_route_caps & NETIF_F_SG);
3010 ++ if (mptcp(tp))
3011 ++ sg = mptcp_can_sg(sk);
3012 ++ else
3013 ++ sg = !!(sk->sk_route_caps & NETIF_F_SG);
3014 +
3015 + while (--iovlen >= 0) {
3016 + size_t seglen = iov->iov_len;
3017 +@@ -1183,8 +1251,15 @@ new_segment:
3018 +
3019 + /*
3020 + * Check whether we can use HW checksum.
3021 ++ *
3022 ++ * If dss-csum is enabled, we do not do hw-csum.
3023 ++ * In case of non-mptcp we check the
3024 ++ * device-capabilities.
3025 ++ * In case of mptcp, hw-csum's will be handled
3026 ++ * later in mptcp_write_xmit.
3027 + */
3028 +- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
3029 ++ if (((mptcp(tp) && !tp->mpcb->dss_csum) || !mptcp(tp)) &&
3030 ++ (mptcp(tp) || sk->sk_route_caps & NETIF_F_ALL_CSUM))
3031 + skb->ip_summed = CHECKSUM_PARTIAL;
3032 +
3033 + skb_entail(sk, skb);
3034 +@@ -1422,7 +1497,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
3035 +
3036 + /* Optimize, __tcp_select_window() is not cheap. */
3037 + if (2*rcv_window_now <= tp->window_clamp) {
3038 +- __u32 new_window = __tcp_select_window(sk);
3039 ++ __u32 new_window = tp->ops->__select_window(sk);
3040 +
3041 + /* Send ACK now, if this read freed lots of space
3042 + * in our buffer. Certainly, new_window is new window.
3043 +@@ -1587,7 +1662,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
3044 + /* Clean up data we have read: This will do ACK frames. */
3045 + if (copied > 0) {
3046 + tcp_recv_skb(sk, seq, &offset);
3047 +- tcp_cleanup_rbuf(sk, copied);
3048 ++ tp->ops->cleanup_rbuf(sk, copied);
3049 + }
3050 + return copied;
3051 + }
3052 +@@ -1623,6 +1698,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3053 +
3054 + lock_sock(sk);
3055 +
3056 ++#ifdef CONFIG_MPTCP
3057 ++ if (mptcp(tp)) {
3058 ++ struct sock *sk_it;
3059 ++ mptcp_for_each_sk(tp->mpcb, sk_it)
3060 ++ sock_rps_record_flow(sk_it);
3061 ++ }
3062 ++#endif
3063 ++
3064 + err = -ENOTCONN;
3065 + if (sk->sk_state == TCP_LISTEN)
3066 + goto out;
3067 +@@ -1761,7 +1844,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3068 + }
3069 + }
3070 +
3071 +- tcp_cleanup_rbuf(sk, copied);
3072 ++ tp->ops->cleanup_rbuf(sk, copied);
3073 +
3074 + if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
3075 + /* Install new reader */
3076 +@@ -1813,7 +1896,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
3077 + if (tp->rcv_wnd == 0 &&
3078 + !skb_queue_empty(&sk->sk_async_wait_queue)) {
3079 + tcp_service_net_dma(sk, true);
3080 +- tcp_cleanup_rbuf(sk, copied);
3081 ++ tp->ops->cleanup_rbuf(sk, copied);
3082 + } else
3083 + dma_async_issue_pending(tp->ucopy.dma_chan);
3084 + }
3085 +@@ -1993,7 +2076,7 @@ skip_copy:
3086 + */
3087 +
3088 + /* Clean up data we have read: This will do ACK frames. */
3089 +- tcp_cleanup_rbuf(sk, copied);
3090 ++ tp->ops->cleanup_rbuf(sk, copied);
3091 +
3092 + release_sock(sk);
3093 + return copied;
3094 +@@ -2070,7 +2153,7 @@ static const unsigned char new_state[16] = {
3095 + /* TCP_CLOSING */ TCP_CLOSING,
3096 + };
3097 +
3098 +-static int tcp_close_state(struct sock *sk)
3099 ++int tcp_close_state(struct sock *sk)
3100 + {
3101 + int next = (int)new_state[sk->sk_state];
3102 + int ns = next & TCP_STATE_MASK;
3103 +@@ -2100,7 +2183,7 @@ void tcp_shutdown(struct sock *sk, int how)
3104 + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
3105 + /* Clear out any half completed packets. FIN if needed. */
3106 + if (tcp_close_state(sk))
3107 +- tcp_send_fin(sk);
3108 ++ tcp_sk(sk)->ops->send_fin(sk);
3109 + }
3110 + }
3111 + EXPORT_SYMBOL(tcp_shutdown);
3112 +@@ -2125,6 +2208,11 @@ void tcp_close(struct sock *sk, long timeout)
3113 + int data_was_unread = 0;
3114 + int state;
3115 +
3116 ++ if (is_meta_sk(sk)) {
3117 ++ mptcp_close(sk, timeout);
3118 ++ return;
3119 ++ }
3120 ++
3121 + lock_sock(sk);
3122 + sk->sk_shutdown = SHUTDOWN_MASK;
3123 +
3124 +@@ -2167,7 +2255,7 @@ void tcp_close(struct sock *sk, long timeout)
3125 + /* Unread data was tossed, zap the connection. */
3126 + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
3127 + tcp_set_state(sk, TCP_CLOSE);
3128 +- tcp_send_active_reset(sk, sk->sk_allocation);
3129 ++ tcp_sk(sk)->ops->send_active_reset(sk, sk->sk_allocation);
3130 + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
3131 + /* Check zero linger _after_ checking for unread data. */
3132 + sk->sk_prot->disconnect(sk, 0);
3133 +@@ -2247,7 +2335,7 @@ adjudge_to_death:
3134 + struct tcp_sock *tp = tcp_sk(sk);
3135 + if (tp->linger2 < 0) {
3136 + tcp_set_state(sk, TCP_CLOSE);
3137 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3138 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
3139 + NET_INC_STATS_BH(sock_net(sk),
3140 + LINUX_MIB_TCPABORTONLINGER);
3141 + } else {
3142 +@@ -2257,7 +2345,8 @@ adjudge_to_death:
3143 + inet_csk_reset_keepalive_timer(sk,
3144 + tmo - TCP_TIMEWAIT_LEN);
3145 + } else {
3146 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
3147 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_FIN_WAIT2,
3148 ++ tmo);
3149 + goto out;
3150 + }
3151 + }
3152 +@@ -2266,7 +2355,7 @@ adjudge_to_death:
3153 + sk_mem_reclaim(sk);
3154 + if (tcp_check_oom(sk, 0)) {
3155 + tcp_set_state(sk, TCP_CLOSE);
3156 +- tcp_send_active_reset(sk, GFP_ATOMIC);
3157 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
3158 + NET_INC_STATS_BH(sock_net(sk),
3159 + LINUX_MIB_TCPABORTONMEMORY);
3160 + }
3161 +@@ -2291,15 +2380,6 @@ out:
3162 + }
3163 + EXPORT_SYMBOL(tcp_close);
3164 +
3165 +-/* These states need RST on ABORT according to RFC793 */
3166 +-
3167 +-static inline bool tcp_need_reset(int state)
3168 +-{
3169 +- return (1 << state) &
3170 +- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
3171 +- TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
3172 +-}
3173 +-
3174 + int tcp_disconnect(struct sock *sk, int flags)
3175 + {
3176 + struct inet_sock *inet = inet_sk(sk);
3177 +@@ -2322,7 +2402,7 @@ int tcp_disconnect(struct sock *sk, int flags)
3178 + /* The last check adjusts for discrepancy of Linux wrt. RFC
3179 + * states
3180 + */
3181 +- tcp_send_active_reset(sk, gfp_any());
3182 ++ tp->ops->send_active_reset(sk, gfp_any());
3183 + sk->sk_err = ECONNRESET;
3184 + } else if (old_state == TCP_SYN_SENT)
3185 + sk->sk_err = ECONNRESET;
3186 +@@ -2340,6 +2420,13 @@ int tcp_disconnect(struct sock *sk, int flags)
3187 + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
3188 + inet_reset_saddr(sk);
3189 +
3190 ++ if (is_meta_sk(sk)) {
3191 ++ mptcp_disconnect(sk);
3192 ++ } else {
3193 ++ if (tp->inside_tk_table)
3194 ++ mptcp_hash_remove_bh(tp);
3195 ++ }
3196 ++
3197 + sk->sk_shutdown = 0;
3198 + sock_reset_flag(sk, SOCK_DONE);
3199 + tp->srtt_us = 0;
3200 +@@ -2632,6 +2719,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3201 + break;
3202 +
3203 + case TCP_DEFER_ACCEPT:
3204 ++ /* An established MPTCP-connection (mptcp(tp) only returns true
3205 ++ * if the socket is established) should not use DEFER on new
3206 ++ * subflows.
3207 ++ */
3208 ++ if (mptcp(tp))
3209 ++ break;
3210 + /* Translate value in seconds to number of retransmits */
3211 + icsk->icsk_accept_queue.rskq_defer_accept =
3212 + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3213 +@@ -2659,7 +2752,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3214 + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
3215 + inet_csk_ack_scheduled(sk)) {
3216 + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
3217 +- tcp_cleanup_rbuf(sk, 1);
3218 ++ tp->ops->cleanup_rbuf(sk, 1);
3219 + if (!(val & 1))
3220 + icsk->icsk_ack.pingpong = 1;
3221 + }
3222 +@@ -2699,6 +2792,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
3223 + tp->notsent_lowat = val;
3224 + sk->sk_write_space(sk);
3225 + break;
3226 ++#ifdef CONFIG_MPTCP
3227 ++ case MPTCP_ENABLED:
3228 ++ if (sk->sk_state == TCP_CLOSE || sk->sk_state == TCP_LISTEN) {
3229 ++ if (val)
3230 ++ tp->mptcp_enabled = 1;
3231 ++ else
3232 ++ tp->mptcp_enabled = 0;
3233 ++ } else {
3234 ++ err = -EPERM;
3235 ++ }
3236 ++ break;
3237 ++#endif
3238 + default:
3239 + err = -ENOPROTOOPT;
3240 + break;
3241 +@@ -2931,6 +3036,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
3242 + case TCP_NOTSENT_LOWAT:
3243 + val = tp->notsent_lowat;
3244 + break;
3245 ++#ifdef CONFIG_MPTCP
3246 ++ case MPTCP_ENABLED:
3247 ++ val = tp->mptcp_enabled;
3248 ++ break;
3249 ++#endif
3250 + default:
3251 + return -ENOPROTOOPT;
3252 + }
3253 +@@ -3120,8 +3230,11 @@ void tcp_done(struct sock *sk)
3254 + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
3255 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
3256 +
3257 ++ WARN_ON(sk->sk_state == TCP_CLOSE);
3258 + tcp_set_state(sk, TCP_CLOSE);
3259 ++
3260 + tcp_clear_xmit_timers(sk);
3261 ++
3262 + if (req != NULL)
3263 + reqsk_fastopen_remove(sk, req, false);
3264 +
3265 +diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
3266 +index 9771563ab564..5c230d96c4c1 100644
3267 +--- a/net/ipv4/tcp_fastopen.c
3268 ++++ b/net/ipv4/tcp_fastopen.c
3269 +@@ -7,6 +7,7 @@
3270 + #include <linux/rculist.h>
3271 + #include <net/inetpeer.h>
3272 + #include <net/tcp.h>
3273 ++#include <net/mptcp.h>
3274 +
3275 + int sysctl_tcp_fastopen __read_mostly = TFO_CLIENT_ENABLE;
3276 +
3277 +@@ -133,7 +134,7 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3278 + {
3279 + struct tcp_sock *tp;
3280 + struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
3281 +- struct sock *child;
3282 ++ struct sock *child, *meta_sk;
3283 +
3284 + req->num_retrans = 0;
3285 + req->num_timeout = 0;
3286 +@@ -176,13 +177,6 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3287 + /* Add the child socket directly into the accept queue */
3288 + inet_csk_reqsk_queue_add(sk, req, child);
3289 +
3290 +- /* Now finish processing the fastopen child socket. */
3291 +- inet_csk(child)->icsk_af_ops->rebuild_header(child);
3292 +- tcp_init_congestion_control(child);
3293 +- tcp_mtup_init(child);
3294 +- tcp_init_metrics(child);
3295 +- tcp_init_buffer_space(child);
3296 +-
3297 + /* Queue the data carried in the SYN packet. We need to first
3298 + * bump skb's refcnt because the caller will attempt to free it.
3299 + *
3300 +@@ -199,8 +193,24 @@ static bool tcp_fastopen_create_child(struct sock *sk,
3301 + tp->syn_data_acked = 1;
3302 + }
3303 + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3304 ++
3305 ++ meta_sk = child;
3306 ++ if (!mptcp_check_req_fastopen(meta_sk, req)) {
3307 ++ child = tcp_sk(meta_sk)->mpcb->master_sk;
3308 ++ tp = tcp_sk(child);
3309 ++ }
3310 ++
3311 ++ /* Now finish processing the fastopen child socket. */
3312 ++ inet_csk(child)->icsk_af_ops->rebuild_header(child);
3313 ++ tp->ops->init_congestion_control(child);
3314 ++ tcp_mtup_init(child);
3315 ++ tcp_init_metrics(child);
3316 ++ tp->ops->init_buffer_space(child);
3317 ++
3318 + sk->sk_data_ready(sk);
3319 +- bh_unlock_sock(child);
3320 ++ if (mptcp(tcp_sk(child)))
3321 ++ bh_unlock_sock(child);
3322 ++ bh_unlock_sock(meta_sk);
3323 + sock_put(child);
3324 + WARN_ON(req->sk == NULL);
3325 + return true;
3326 +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
3327 +index 40639c288dc2..3273bb69f387 100644
3328 +--- a/net/ipv4/tcp_input.c
3329 ++++ b/net/ipv4/tcp_input.c
3330 +@@ -74,6 +74,9 @@
3331 + #include <linux/ipsec.h>
3332 + #include <asm/unaligned.h>
3333 + #include <net/netdma.h>
3334 ++#include <net/mptcp.h>
3335 ++#include <net/mptcp_v4.h>
3336 ++#include <net/mptcp_v6.h>
3337 +
3338 + int sysctl_tcp_timestamps __read_mostly = 1;
3339 + int sysctl_tcp_window_scaling __read_mostly = 1;
3340 +@@ -99,25 +102,6 @@ int sysctl_tcp_thin_dupack __read_mostly;
3341 + int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
3342 + int sysctl_tcp_early_retrans __read_mostly = 3;
3343 +
3344 +-#define FLAG_DATA 0x01 /* Incoming frame contained data. */
3345 +-#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
3346 +-#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
3347 +-#define FLAG_RETRANS_DATA_ACKED 0x08 /* "" "" some of which was retransmitted. */
3348 +-#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
3349 +-#define FLAG_DATA_SACKED 0x20 /* New SACK. */
3350 +-#define FLAG_ECE 0x40 /* ECE in this ACK */
3351 +-#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
3352 +-#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
3353 +-#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
3354 +-#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
3355 +-#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
3356 +-#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
3357 +-
3358 +-#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
3359 +-#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
3360 +-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
3361 +-#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
3362 +-
3363 + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
3364 + #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
3365 +
3366 +@@ -181,7 +165,7 @@ static void tcp_incr_quickack(struct sock *sk)
3367 + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
3368 + }
3369 +
3370 +-static void tcp_enter_quickack_mode(struct sock *sk)
3371 ++void tcp_enter_quickack_mode(struct sock *sk)
3372 + {
3373 + struct inet_connection_sock *icsk = inet_csk(sk);
3374 + tcp_incr_quickack(sk);
3375 +@@ -283,8 +267,12 @@ static void tcp_sndbuf_expand(struct sock *sk)
3376 + per_mss = roundup_pow_of_two(per_mss) +
3377 + SKB_DATA_ALIGN(sizeof(struct sk_buff));
3378 +
3379 +- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3380 +- nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3381 ++ if (mptcp(tp)) {
3382 ++ nr_segs = mptcp_check_snd_buf(tp);
3383 ++ } else {
3384 ++ nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
3385 ++ nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
3386 ++ }
3387 +
3388 + /* Fast Recovery (RFC 5681 3.2) :
3389 + * Cubic needs 1.7 factor, rounded to 2 to include
3390 +@@ -292,8 +280,16 @@ static void tcp_sndbuf_expand(struct sock *sk)
3391 + */
3392 + sndmem = 2 * nr_segs * per_mss;
3393 +
3394 +- if (sk->sk_sndbuf < sndmem)
3395 ++ /* MPTCP: after this sndmem is the new contribution of the
3396 ++ * current subflow to the aggregated sndbuf */
3397 ++ if (sk->sk_sndbuf < sndmem) {
3398 ++ int old_sndbuf = sk->sk_sndbuf;
3399 + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
3400 ++ /* MPTCP: ok, the subflow sndbuf has grown, reflect
3401 ++ * this in the aggregate buffer. */
3402 ++ if (mptcp(tp) && old_sndbuf != sk->sk_sndbuf)
3403 ++ mptcp_update_sndbuf(tp);
3404 ++ }
3405 + }
3406 +
3407 + /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
3408 +@@ -342,10 +338,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
3409 + static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3410 + {
3411 + struct tcp_sock *tp = tcp_sk(sk);
3412 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
3413 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
3414 +
3415 + /* Check #1 */
3416 +- if (tp->rcv_ssthresh < tp->window_clamp &&
3417 +- (int)tp->rcv_ssthresh < tcp_space(sk) &&
3418 ++ if (meta_tp->rcv_ssthresh < meta_tp->window_clamp &&
3419 ++ (int)meta_tp->rcv_ssthresh < tcp_space(sk) &&
3420 + !sk_under_memory_pressure(sk)) {
3421 + int incr;
3422 +
3423 +@@ -353,14 +351,14 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
3424 + * will fit to rcvbuf in future.
3425 + */
3426 + if (tcp_win_from_space(skb->truesize) <= skb->len)
3427 +- incr = 2 * tp->advmss;
3428 ++ incr = 2 * meta_tp->advmss;
3429 + else
3430 +- incr = __tcp_grow_window(sk, skb);
3431 ++ incr = __tcp_grow_window(meta_sk, skb);
3432 +
3433 + if (incr) {
3434 + incr = max_t(int, incr, 2 * skb->len);
3435 +- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
3436 +- tp->window_clamp);
3437 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh + incr,
3438 ++ meta_tp->window_clamp);
3439 + inet_csk(sk)->icsk_ack.quick |= 1;
3440 + }
3441 + }
3442 +@@ -543,7 +541,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
3443 + int copied;
3444 +
3445 + time = tcp_time_stamp - tp->rcvq_space.time;
3446 +- if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3447 ++ if (mptcp(tp)) {
3448 ++ if (mptcp_check_rtt(tp, time))
3449 ++ return;
3450 ++ } else if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
3451 + return;
3452 +
3453 + /* Number of bytes copied to user in last RTT */
3454 +@@ -761,7 +762,7 @@ static void tcp_update_pacing_rate(struct sock *sk)
3455 + /* Calculate rto without backoff. This is the second half of Van Jacobson's
3456 + * routine referred to above.
3457 + */
3458 +-static void tcp_set_rto(struct sock *sk)
3459 ++void tcp_set_rto(struct sock *sk)
3460 + {
3461 + const struct tcp_sock *tp = tcp_sk(sk);
3462 + /* Old crap is replaced with new one. 8)
3463 +@@ -1376,7 +1377,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
3464 + int len;
3465 + int in_sack;
3466 +
3467 +- if (!sk_can_gso(sk))
3468 ++ /* For MPTCP we cannot shift skb-data and remove one skb from the
3469 ++ * send-queue, because this will make us lose the DSS-option (which
3470 ++ * is stored in TCP_SKB_CB(skb)->dss) of the skb we are removing.
3471 ++ */
3472 ++ if (!sk_can_gso(sk) || mptcp(tp))
3473 + goto fallback;
3474 +
3475 + /* Normally R but no L won't result in plain S */
3476 +@@ -2915,7 +2920,7 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
3477 + return false;
3478 +
3479 + tcp_rtt_estimator(sk, seq_rtt_us);
3480 +- tcp_set_rto(sk);
3481 ++ tp->ops->set_rto(sk);
3482 +
3483 + /* RFC6298: only reset backoff on valid RTT measurement. */
3484 + inet_csk(sk)->icsk_backoff = 0;
3485 +@@ -3000,7 +3005,7 @@ void tcp_resume_early_retransmit(struct sock *sk)
3486 + }
3487 +
3488 + /* If we get here, the whole TSO packet has not been acked. */
3489 +-static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3490 ++u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
3491 + {
3492 + struct tcp_sock *tp = tcp_sk(sk);
3493 + u32 packets_acked;
3494 +@@ -3095,6 +3100,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3495 + */
3496 + if (!(scb->tcp_flags & TCPHDR_SYN)) {
3497 + flag |= FLAG_DATA_ACKED;
3498 ++ if (mptcp(tp) && mptcp_is_data_seq(skb))
3499 ++ flag |= MPTCP_FLAG_DATA_ACKED;
3500 + } else {
3501 + flag |= FLAG_SYN_ACKED;
3502 + tp->retrans_stamp = 0;
3503 +@@ -3189,7 +3196,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3504 + return flag;
3505 + }
3506 +
3507 +-static void tcp_ack_probe(struct sock *sk)
3508 ++void tcp_ack_probe(struct sock *sk)
3509 + {
3510 + const struct tcp_sock *tp = tcp_sk(sk);
3511 + struct inet_connection_sock *icsk = inet_csk(sk);
3512 +@@ -3236,9 +3243,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
3513 + /* Check that window update is acceptable.
3514 + * The function assumes that snd_una<=ack<=snd_next.
3515 + */
3516 +-static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3517 +- const u32 ack, const u32 ack_seq,
3518 +- const u32 nwin)
3519 ++bool tcp_may_update_window(const struct tcp_sock *tp, const u32 ack,
3520 ++ const u32 ack_seq, const u32 nwin)
3521 + {
3522 + return after(ack, tp->snd_una) ||
3523 + after(ack_seq, tp->snd_wl1) ||
3524 +@@ -3357,7 +3363,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
3525 + }
3526 +
3527 + /* This routine deals with incoming acks, but not outgoing ones. */
3528 +-static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3529 ++static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3530 + {
3531 + struct inet_connection_sock *icsk = inet_csk(sk);
3532 + struct tcp_sock *tp = tcp_sk(sk);
3533 +@@ -3449,6 +3455,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3534 + sack_rtt_us);
3535 + acked -= tp->packets_out;
3536 +
3537 ++ if (mptcp(tp)) {
3538 ++ if (mptcp_fallback_infinite(sk, flag)) {
3539 ++ pr_err("%s resetting flow\n", __func__);
3540 ++ mptcp_send_reset(sk);
3541 ++ goto invalid_ack;
3542 ++ }
3543 ++
3544 ++ mptcp_clean_rtx_infinite(skb, sk);
3545 ++ }
3546 ++
3547 + /* Advance cwnd if state allows */
3548 + if (tcp_may_raise_cwnd(sk, flag))
3549 + tcp_cong_avoid(sk, ack, acked);
3550 +@@ -3512,8 +3528,9 @@ old_ack:
3551 + * the fast version below fails.
3552 + */
3553 + void tcp_parse_options(const struct sk_buff *skb,
3554 +- struct tcp_options_received *opt_rx, int estab,
3555 +- struct tcp_fastopen_cookie *foc)
3556 ++ struct tcp_options_received *opt_rx,
3557 ++ struct mptcp_options_received *mopt,
3558 ++ int estab, struct tcp_fastopen_cookie *foc)
3559 + {
3560 + const unsigned char *ptr;
3561 + const struct tcphdr *th = tcp_hdr(skb);
3562 +@@ -3596,6 +3613,9 @@ void tcp_parse_options(const struct sk_buff *skb,
3563 + */
3564 + break;
3565 + #endif
3566 ++ case TCPOPT_MPTCP:
3567 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
3568 ++ break;
3569 + case TCPOPT_EXP:
3570 + /* Fast Open option shares code 254 using a
3571 + * 16 bits magic number. It's valid only in
3572 +@@ -3657,8 +3677,8 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
3573 + if (tcp_parse_aligned_timestamp(tp, th))
3574 + return true;
3575 + }
3576 +-
3577 +- tcp_parse_options(skb, &tp->rx_opt, 1, NULL);
3578 ++ tcp_parse_options(skb, &tp->rx_opt, mptcp(tp) ? &tp->mptcp->rx_opt : NULL,
3579 ++ 1, NULL);
3580 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3581 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3582 +
3583 +@@ -3831,6 +3851,8 @@ static void tcp_fin(struct sock *sk)
3584 + dst = __sk_dst_get(sk);
3585 + if (!dst || !dst_metric(dst, RTAX_QUICKACK))
3586 + inet_csk(sk)->icsk_ack.pingpong = 1;
3587 ++ if (mptcp(tp))
3588 ++ mptcp_sub_close_passive(sk);
3589 + break;
3590 +
3591 + case TCP_CLOSE_WAIT:
3592 +@@ -3852,9 +3874,16 @@ static void tcp_fin(struct sock *sk)
3593 + tcp_set_state(sk, TCP_CLOSING);
3594 + break;
3595 + case TCP_FIN_WAIT2:
3596 ++ if (mptcp(tp)) {
3597 ++ /* The socket will get closed by mptcp_data_ready.
3598 ++ * We first have to process all data-sequences.
3599 ++ */
3600 ++ tp->close_it = 1;
3601 ++ break;
3602 ++ }
3603 + /* Received a FIN -- send ACK and enter TIME_WAIT. */
3604 + tcp_send_ack(sk);
3605 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
3606 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
3607 + break;
3608 + default:
3609 + /* Only TCP_LISTEN and TCP_CLOSE are left, in these
3610 +@@ -3876,6 +3905,10 @@ static void tcp_fin(struct sock *sk)
3611 + if (!sock_flag(sk, SOCK_DEAD)) {
3612 + sk->sk_state_change(sk);
3613 +
3614 ++ /* Don't wake up MPTCP-subflows */
3615 ++ if (mptcp(tp))
3616 ++ return;
3617 ++
3618 + /* Do not send POLL_HUP for half duplex close. */
3619 + if (sk->sk_shutdown == SHUTDOWN_MASK ||
3620 + sk->sk_state == TCP_CLOSE)
3621 +@@ -4073,7 +4106,11 @@ static void tcp_ofo_queue(struct sock *sk)
3622 + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
3623 + }
3624 +
3625 +- if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
3626 ++ /* In case of MPTCP, the segment may be empty if it's a
3627 ++ * non-data DATA_FIN. (see beginning of tcp_data_queue)
3628 ++ */
3629 ++ if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt) &&
3630 ++ !(mptcp(tp) && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)) {
3631 + SOCK_DEBUG(sk, "ofo packet was already received\n");
3632 + __skb_unlink(skb, &tp->out_of_order_queue);
3633 + __kfree_skb(skb);
3634 +@@ -4091,12 +4128,14 @@ static void tcp_ofo_queue(struct sock *sk)
3635 + }
3636 + }
3637 +
3638 +-static bool tcp_prune_ofo_queue(struct sock *sk);
3639 + static int tcp_prune_queue(struct sock *sk);
3640 +
3641 + static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3642 + unsigned int size)
3643 + {
3644 ++ if (mptcp(tcp_sk(sk)))
3645 ++ sk = mptcp_meta_sk(sk);
3646 ++
3647 + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
3648 + !sk_rmem_schedule(sk, skb, size)) {
3649 +
3650 +@@ -4104,7 +4143,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3651 + return -1;
3652 +
3653 + if (!sk_rmem_schedule(sk, skb, size)) {
3654 +- if (!tcp_prune_ofo_queue(sk))
3655 ++ if (!tcp_sk(sk)->ops->prune_ofo_queue(sk))
3656 + return -1;
3657 +
3658 + if (!sk_rmem_schedule(sk, skb, size))
3659 +@@ -4127,15 +4166,16 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
3660 + * Better try to coalesce them right now to avoid future collapses.
3661 + * Returns true if caller should free @from instead of queueing it
3662 + */
3663 +-static bool tcp_try_coalesce(struct sock *sk,
3664 +- struct sk_buff *to,
3665 +- struct sk_buff *from,
3666 +- bool *fragstolen)
3667 ++bool tcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from,
3668 ++ bool *fragstolen)
3669 + {
3670 + int delta;
3671 +
3672 + *fragstolen = false;
3673 +
3674 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
3675 ++ return false;
3676 ++
3677 + if (tcp_hdr(from)->fin)
3678 + return false;
3679 +
3680 +@@ -4225,7 +4265,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3681 +
3682 + /* Do skb overlap to previous one? */
3683 + if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
3684 +- if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
3685 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3686 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq) &&
3687 ++ !(mptcp(tp) && end_seq == seq)) {
3688 + /* All the bits are present. Drop. */
3689 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
3690 + __kfree_skb(skb);
3691 +@@ -4263,6 +4305,9 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
3692 + end_seq);
3693 + break;
3694 + }
3695 ++ /* MPTCP allows non-data data-fin to be in the ofo-queue */
3696 ++ if (mptcp(tp) && TCP_SKB_CB(skb1)->seq == TCP_SKB_CB(skb1)->end_seq)
3697 ++ continue;
3698 + __skb_unlink(skb1, &tp->out_of_order_queue);
3699 + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
3700 + TCP_SKB_CB(skb1)->end_seq);
3701 +@@ -4280,8 +4325,8 @@ end:
3702 + }
3703 + }
3704 +
3705 +-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3706 +- bool *fragstolen)
3707 ++int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
3708 ++ bool *fragstolen)
3709 + {
3710 + int eaten;
3711 + struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
3712 +@@ -4343,7 +4388,10 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
3713 + int eaten = -1;
3714 + bool fragstolen = false;
3715 +
3716 +- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
3717 ++ /* If no data is present, but a data_fin is in the options, we still
3718 ++ * have to call mptcp_queue_skb later on. */
3719 ++ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq &&
3720 ++ !(mptcp(tp) && mptcp_is_data_fin(skb)))
3721 + goto drop;
3722 +
3723 + skb_dst_drop(skb);
3724 +@@ -4389,7 +4437,7 @@ queue_and_out:
3725 + eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
3726 + }
3727 + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
3728 +- if (skb->len)
3729 ++ if (skb->len || mptcp_is_data_fin(skb))
3730 + tcp_event_data_recv(sk, skb);
3731 + if (th->fin)
3732 + tcp_fin(sk);
3733 +@@ -4411,7 +4459,11 @@ queue_and_out:
3734 +
3735 + if (eaten > 0)
3736 + kfree_skb_partial(skb, fragstolen);
3737 +- if (!sock_flag(sk, SOCK_DEAD))
3738 ++ if (!sock_flag(sk, SOCK_DEAD) || mptcp(tp))
3739 ++ /* MPTCP: we always have to call data_ready, because
3740 ++ * we may be about to receive a data-fin, which still
3741 ++ * must get queued.
3742 ++ */
3743 + sk->sk_data_ready(sk);
3744 + return;
3745 + }
3746 +@@ -4463,6 +4515,8 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
3747 + next = skb_queue_next(list, skb);
3748 +
3749 + __skb_unlink(skb, list);
3750 ++ if (mptcp(tcp_sk(sk)))
3751 ++ mptcp_remove_shortcuts(tcp_sk(sk)->mpcb, skb);
3752 + __kfree_skb(skb);
3753 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
3754 +
3755 +@@ -4630,7 +4684,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
3756 + * Purge the out-of-order queue.
3757 + * Return true if queue was pruned.
3758 + */
3759 +-static bool tcp_prune_ofo_queue(struct sock *sk)
3760 ++bool tcp_prune_ofo_queue(struct sock *sk)
3761 + {
3762 + struct tcp_sock *tp = tcp_sk(sk);
3763 + bool res = false;
3764 +@@ -4686,7 +4740,7 @@ static int tcp_prune_queue(struct sock *sk)
3765 + /* Collapsing did not help, destructive actions follow.
3766 + * This must not ever occur. */
3767 +
3768 +- tcp_prune_ofo_queue(sk);
3769 ++ tp->ops->prune_ofo_queue(sk);
3770 +
3771 + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
3772 + return 0;
3773 +@@ -4702,7 +4756,29 @@ static int tcp_prune_queue(struct sock *sk)
3774 + return -1;
3775 + }
3776 +
3777 +-static bool tcp_should_expand_sndbuf(const struct sock *sk)
3778 ++/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
3779 ++ * As additional protections, we do not touch cwnd in retransmission phases,
3780 ++ * and if application hit its sndbuf limit recently.
3781 ++ */
3782 ++void tcp_cwnd_application_limited(struct sock *sk)
3783 ++{
3784 ++ struct tcp_sock *tp = tcp_sk(sk);
3785 ++
3786 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
3787 ++ sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
3788 ++ /* Limited by application or receiver window. */
3789 ++ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
3790 ++ u32 win_used = max(tp->snd_cwnd_used, init_win);
3791 ++ if (win_used < tp->snd_cwnd) {
3792 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
3793 ++ tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
3794 ++ }
3795 ++ tp->snd_cwnd_used = 0;
3796 ++ }
3797 ++ tp->snd_cwnd_stamp = tcp_time_stamp;
3798 ++}
3799 ++
3800 ++bool tcp_should_expand_sndbuf(const struct sock *sk)
3801 + {
3802 + const struct tcp_sock *tp = tcp_sk(sk);
3803 +
3804 +@@ -4737,7 +4813,7 @@ static void tcp_new_space(struct sock *sk)
3805 + {
3806 + struct tcp_sock *tp = tcp_sk(sk);
3807 +
3808 +- if (tcp_should_expand_sndbuf(sk)) {
3809 ++ if (tp->ops->should_expand_sndbuf(sk)) {
3810 + tcp_sndbuf_expand(sk);
3811 + tp->snd_cwnd_stamp = tcp_time_stamp;
3812 + }
3813 +@@ -4749,8 +4825,9 @@ static void tcp_check_space(struct sock *sk)
3814 + {
3815 + if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
3816 + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
3817 +- if (sk->sk_socket &&
3818 +- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
3819 ++ if (mptcp(tcp_sk(sk)) ||
3820 ++ (sk->sk_socket &&
3821 ++ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)))
3822 + tcp_new_space(sk);
3823 + }
3824 + }
3825 +@@ -4773,7 +4850,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
3826 + /* ... and right edge of window advances far enough.
3827 + * (tcp_recvmsg() will send ACK otherwise). Or...
3828 + */
3829 +- __tcp_select_window(sk) >= tp->rcv_wnd) ||
3830 ++ tp->ops->__select_window(sk) >= tp->rcv_wnd) ||
3831 + /* We ACK each frame or... */
3832 + tcp_in_quickack_mode(sk) ||
3833 + /* We have out of order data. */
3834 +@@ -4875,6 +4952,10 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
3835 + {
3836 + struct tcp_sock *tp = tcp_sk(sk);
3837 +
3838 ++ /* MPTCP urgent data is not yet supported */
3839 ++ if (mptcp(tp))
3840 ++ return;
3841 ++
3842 + /* Check if we get a new urgent pointer - normally not. */
3843 + if (th->urg)
3844 + tcp_check_urg(sk, th);
3845 +@@ -4942,8 +5023,7 @@ static inline bool tcp_checksum_complete_user(struct sock *sk,
3846 + }
3847 +
3848 + #ifdef CONFIG_NET_DMA
3849 +-static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
3850 +- int hlen)
3851 ++bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
3852 + {
3853 + struct tcp_sock *tp = tcp_sk(sk);
3854 + int chunk = skb->len - hlen;
3855 +@@ -5052,9 +5132,15 @@ syn_challenge:
3856 + goto discard;
3857 + }
3858 +
3859 ++ /* If valid: post process the received MPTCP options. */
3860 ++ if (mptcp(tp) && mptcp_handle_options(sk, th, skb))
3861 ++ goto discard;
3862 ++
3863 + return true;
3864 +
3865 + discard:
3866 ++ if (mptcp(tp))
3867 ++ mptcp_reset_mopt(tp);
3868 + __kfree_skb(skb);
3869 + return false;
3870 + }
3871 +@@ -5106,6 +5192,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3872 +
3873 + tp->rx_opt.saw_tstamp = 0;
3874 +
3875 ++ /* MPTCP: force slowpath. */
3876 ++ if (mptcp(tp))
3877 ++ goto slow_path;
3878 ++
3879 + /* pred_flags is 0xS?10 << 16 + snd_wnd
3880 + * if header_prediction is to be made
3881 + * 'S' will always be tp->tcp_header_len >> 2
3882 +@@ -5205,7 +5295,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
3883 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
3884 + }
3885 + if (copied_early)
3886 +- tcp_cleanup_rbuf(sk, skb->len);
3887 ++ tp->ops->cleanup_rbuf(sk, skb->len);
3888 + }
3889 + if (!eaten) {
3890 + if (tcp_checksum_complete_user(sk, skb))
3891 +@@ -5313,14 +5403,14 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
3892 +
3893 + tcp_init_metrics(sk);
3894 +
3895 +- tcp_init_congestion_control(sk);
3896 ++ tp->ops->init_congestion_control(sk);
3897 +
3898 + /* Prevent spurious tcp_cwnd_restart() on first data
3899 + * packet.
3900 + */
3901 + tp->lsndtime = tcp_time_stamp;
3902 +
3903 +- tcp_init_buffer_space(sk);
3904 ++ tp->ops->init_buffer_space(sk);
3905 +
3906 + if (sock_flag(sk, SOCK_KEEPOPEN))
3907 + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
3908 +@@ -5350,7 +5440,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3909 + /* Get original SYNACK MSS value if user MSS sets mss_clamp */
3910 + tcp_clear_options(&opt);
3911 + opt.user_mss = opt.mss_clamp = 0;
3912 +- tcp_parse_options(synack, &opt, 0, NULL);
3913 ++ tcp_parse_options(synack, &opt, NULL, 0, NULL);
3914 + mss = opt.mss_clamp;
3915 + }
3916 +
3917 +@@ -5365,7 +5455,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
3918 +
3919 + tcp_fastopen_cache_set(sk, mss, cookie, syn_drop);
3920 +
3921 +- if (data) { /* Retransmit unacked data in SYN */
3922 ++ /* In mptcp case, we do not rely on "retransmit", but instead on
3923 ++ * "transmit", because if fastopen data is not acked, the retransmission
3924 ++ * becomes the first MPTCP data (see mptcp_rcv_synsent_fastopen).
3925 ++ */
3926 ++ if (data && !mptcp(tp)) { /* Retransmit unacked data in SYN */
3927 + tcp_for_write_queue_from(data, sk) {
3928 + if (data == tcp_send_head(sk) ||
3929 + __tcp_retransmit_skb(sk, data))
3930 +@@ -5388,8 +5482,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3931 + struct tcp_sock *tp = tcp_sk(sk);
3932 + struct tcp_fastopen_cookie foc = { .len = -1 };
3933 + int saved_clamp = tp->rx_opt.mss_clamp;
3934 ++ struct mptcp_options_received mopt;
3935 ++ mptcp_init_mp_opt(&mopt);
3936 +
3937 +- tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
3938 ++ tcp_parse_options(skb, &tp->rx_opt,
3939 ++ mptcp(tp) ? &tp->mptcp->rx_opt : &mopt, 0, &foc);
3940 + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
3941 + tp->rx_opt.rcv_tsecr -= tp->tsoffset;
3942 +
3943 +@@ -5448,6 +5545,30 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3944 + tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
3945 + tcp_ack(sk, skb, FLAG_SLOWPATH);
3946 +
3947 ++ if (tp->request_mptcp || mptcp(tp)) {
3948 ++ int ret;
3949 ++ ret = mptcp_rcv_synsent_state_process(sk, &sk,
3950 ++ skb, &mopt);
3951 ++
3952 ++ /* May have changed if we support MPTCP */
3953 ++ tp = tcp_sk(sk);
3954 ++ icsk = inet_csk(sk);
3955 ++
3956 ++ if (ret == 1)
3957 ++ goto reset_and_undo;
3958 ++ if (ret == 2)
3959 ++ goto discard;
3960 ++ }
3961 ++
3962 ++ if (mptcp(tp) && !is_master_tp(tp)) {
3963 ++ /* Timer for repeating the ACK until an answer
3964 ++ * arrives. Used only when establishing an additional
3965 ++ * subflow inside of an MPTCP connection.
3966 ++ */
3967 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
3968 ++ jiffies + icsk->icsk_rto);
3969 ++ }
3970 ++
3971 + /* Ok.. it's good. Set up sequence numbers and
3972 + * move to established.
3973 + */
3974 +@@ -5474,6 +5595,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3975 + tp->tcp_header_len = sizeof(struct tcphdr);
3976 + }
3977 +
3978 ++ if (mptcp(tp)) {
3979 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
3980 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
3981 ++ }
3982 ++
3983 + if (tcp_is_sack(tp) && sysctl_tcp_fack)
3984 + tcp_enable_fack(tp);
3985 +
3986 +@@ -5494,9 +5620,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
3987 + tcp_rcv_fastopen_synack(sk, skb, &foc))
3988 + return -1;
3989 +
3990 +- if (sk->sk_write_pending ||
3991 ++ /* With MPTCP we cannot send data on the third ack due to the
3992 ++ * lack of option-space to combine with an MP_CAPABLE.
3993 ++ */
3994 ++ if (!mptcp(tp) && (sk->sk_write_pending ||
3995 + icsk->icsk_accept_queue.rskq_defer_accept ||
3996 +- icsk->icsk_ack.pingpong) {
3997 ++ icsk->icsk_ack.pingpong)) {
3998 + /* Save one ACK. Data will be ready after
3999 + * several ticks, if write_pending is set.
4000 + *
4001 +@@ -5536,6 +5665,7 @@ discard:
4002 + tcp_paws_reject(&tp->rx_opt, 0))
4003 + goto discard_and_undo;
4004 +
4005 ++ /* TODO - check this here for MPTCP */
4006 + if (th->syn) {
4007 + /* We see SYN without ACK. It is attempt of
4008 + * simultaneous connect with crossed SYNs.
4009 +@@ -5552,6 +5682,11 @@ discard:
4010 + tp->tcp_header_len = sizeof(struct tcphdr);
4011 + }
4012 +
4013 ++ if (mptcp(tp)) {
4014 ++ tp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
4015 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4016 ++ }
4017 ++
4018 + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
4019 + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
4020 +
4021 +@@ -5610,6 +5745,7 @@ reset_and_undo:
4022 +
4023 + int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4024 + const struct tcphdr *th, unsigned int len)
4025 ++ __releases(&sk->sk_lock.slock)
4026 + {
4027 + struct tcp_sock *tp = tcp_sk(sk);
4028 + struct inet_connection_sock *icsk = inet_csk(sk);
4029 +@@ -5661,6 +5797,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4030 +
4031 + case TCP_SYN_SENT:
4032 + queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
4033 ++ if (is_meta_sk(sk)) {
4034 ++ sk = tcp_sk(sk)->mpcb->master_sk;
4035 ++ tp = tcp_sk(sk);
4036 ++
4037 ++ /* Need to call it here, because it will announce new
4038 ++ * addresses, which can only be done after the third ack
4039 ++ * of the 3-way handshake.
4040 ++ */
4041 ++ mptcp_update_metasocket(sk, tp->meta_sk);
4042 ++ }
4043 + if (queued >= 0)
4044 + return queued;
4045 +
4046 +@@ -5668,6 +5814,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4047 + tcp_urg(sk, skb, th);
4048 + __kfree_skb(skb);
4049 + tcp_data_snd_check(sk);
4050 ++ if (mptcp(tp) && is_master_tp(tp))
4051 ++ bh_unlock_sock(sk);
4052 + return 0;
4053 + }
4054 +
4055 +@@ -5706,11 +5854,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4056 + synack_stamp = tp->lsndtime;
4057 + /* Make sure socket is routed, for correct metrics. */
4058 + icsk->icsk_af_ops->rebuild_header(sk);
4059 +- tcp_init_congestion_control(sk);
4060 ++ tp->ops->init_congestion_control(sk);
4061 +
4062 + tcp_mtup_init(sk);
4063 + tp->copied_seq = tp->rcv_nxt;
4064 +- tcp_init_buffer_space(sk);
4065 ++ tp->ops->init_buffer_space(sk);
4066 + }
4067 + smp_mb();
4068 + tcp_set_state(sk, TCP_ESTABLISHED);
4069 +@@ -5730,6 +5878,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4070 +
4071 + if (tp->rx_opt.tstamp_ok)
4072 + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
4073 ++ if (mptcp(tp))
4074 ++ tp->advmss -= MPTCP_SUB_LEN_DSM_ALIGN;
4075 +
4076 + if (req) {
4077 + /* Re-arm the timer because data may have been sent out.
4078 +@@ -5751,6 +5901,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4079 +
4080 + tcp_initialize_rcv_mss(sk);
4081 + tcp_fast_path_on(tp);
4082 ++ /* Send an ACK when establishing a new
4083 ++ * MPTCP subflow, i.e. using an MP_JOIN
4084 ++ * subtype.
4085 ++ */
4086 ++ if (mptcp(tp) && !is_master_tp(tp))
4087 ++ tcp_send_ack(sk);
4088 + break;
4089 +
4090 + case TCP_FIN_WAIT1: {
4091 +@@ -5802,7 +5958,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4092 + tmo = tcp_fin_time(sk);
4093 + if (tmo > TCP_TIMEWAIT_LEN) {
4094 + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
4095 +- } else if (th->fin || sock_owned_by_user(sk)) {
4096 ++ } else if (th->fin || mptcp_is_data_fin(skb) ||
4097 ++ sock_owned_by_user(sk)) {
4098 + /* Bad case. We could lose such FIN otherwise.
4099 + * It is not a big problem, but it looks confusing
4100 + * and not so rare event. We still can lose it now,
4101 +@@ -5811,7 +5968,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4102 + */
4103 + inet_csk_reset_keepalive_timer(sk, tmo);
4104 + } else {
4105 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
4106 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
4107 + goto discard;
4108 + }
4109 + break;
4110 +@@ -5819,7 +5976,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4111 +
4112 + case TCP_CLOSING:
4113 + if (tp->snd_una == tp->write_seq) {
4114 +- tcp_time_wait(sk, TCP_TIME_WAIT, 0);
4115 ++ tp->ops->time_wait(sk, TCP_TIME_WAIT, 0);
4116 + goto discard;
4117 + }
4118 + break;
4119 +@@ -5831,6 +5988,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4120 + goto discard;
4121 + }
4122 + break;
4123 ++ case TCP_CLOSE:
4124 ++ if (tp->mp_killed)
4125 ++ goto discard;
4126 + }
4127 +
4128 + /* step 6: check the URG bit */
4129 +@@ -5851,7 +6011,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
4130 + */
4131 + if (sk->sk_shutdown & RCV_SHUTDOWN) {
4132 + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4133 +- after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
4134 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
4135 ++ !mptcp(tp)) {
4136 ++ /* In case of mptcp, the reset is handled by
4137 ++ * mptcp_rcv_state_process
4138 ++ */
4139 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
4140 + tcp_reset(sk);
4141 + return 1;
4142 +@@ -5877,3 +6041,154 @@ discard:
4143 + return 0;
4144 + }
4145 + EXPORT_SYMBOL(tcp_rcv_state_process);
4146 ++
4147 ++static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
4148 ++{
4149 ++ struct inet_request_sock *ireq = inet_rsk(req);
4150 ++
4151 ++ if (family == AF_INET)
4152 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4153 ++ &ireq->ir_rmt_addr, port);
4154 ++#if IS_ENABLED(CONFIG_IPV6)
4155 ++ else if (family == AF_INET6)
4156 ++ LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"),
4157 ++ &ireq->ir_v6_rmt_addr, port);
4158 ++#endif
4159 ++}
4160 ++
4161 ++int tcp_conn_request(struct request_sock_ops *rsk_ops,
4162 ++ const struct tcp_request_sock_ops *af_ops,
4163 ++ struct sock *sk, struct sk_buff *skb)
4164 ++{
4165 ++ struct tcp_options_received tmp_opt;
4166 ++ struct request_sock *req;
4167 ++ struct tcp_sock *tp = tcp_sk(sk);
4168 ++ struct dst_entry *dst = NULL;
4169 ++ __u32 isn = TCP_SKB_CB(skb)->when;
4170 ++ bool want_cookie = false, fastopen;
4171 ++ struct flowi fl;
4172 ++ struct tcp_fastopen_cookie foc = { .len = -1 };
4173 ++ int err;
4174 ++
4175 ++
4176 ++ /* TW buckets are converted to open requests without
4177 ++ * limitations, they conserve resources and peer is
4178 ++ * evidently real one.
4179 ++ */
4180 ++ if ((sysctl_tcp_syncookies == 2 ||
4181 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4182 ++ want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
4183 ++ if (!want_cookie)
4184 ++ goto drop;
4185 ++ }
4186 ++
4187 ++
4188 ++ /* Accept backlog is full. If we have already queued enough
4189 ++ * of warm entries in syn queue, drop request. It is better than
4190 ++ * clogging syn queue with openreqs with exponentially increasing
4191 ++ * timeout.
4192 ++ */
4193 ++ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4194 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4195 ++ goto drop;
4196 ++ }
4197 ++
4198 ++ req = inet_reqsk_alloc(rsk_ops);
4199 ++ if (!req)
4200 ++ goto drop;
4201 ++
4202 ++ tcp_rsk(req)->af_specific = af_ops;
4203 ++
4204 ++ tcp_clear_options(&tmp_opt);
4205 ++ tmp_opt.mss_clamp = af_ops->mss_clamp;
4206 ++ tmp_opt.user_mss = tp->rx_opt.user_mss;
4207 ++ tcp_parse_options(skb, &tmp_opt, NULL, 0, want_cookie ? NULL : &foc);
4208 ++
4209 ++ if (want_cookie && !tmp_opt.saw_tstamp)
4210 ++ tcp_clear_options(&tmp_opt);
4211 ++
4212 ++ tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4213 ++ tcp_openreq_init(req, &tmp_opt, skb);
4214 ++
4215 ++ if (af_ops->init_req(req, sk, skb))
4216 ++ goto drop_and_free;
4217 ++
4218 ++ if (security_inet_conn_request(sk, skb, req))
4219 ++ goto drop_and_free;
4220 ++
4221 ++ if (!want_cookie || tmp_opt.tstamp_ok)
4222 ++ TCP_ECN_create_request(req, skb, sock_net(sk));
4223 ++
4224 ++ if (want_cookie) {
4225 ++ isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
4226 ++ req->cookie_ts = tmp_opt.tstamp_ok;
4227 ++ } else if (!isn) {
4228 ++ /* VJ's idea. We save last timestamp seen
4229 ++ * from the destination in peer table, when entering
4230 ++ * state TIME-WAIT, and check against it before
4231 ++ * accepting new connection request.
4232 ++ *
4233 ++ * If "isn" is not zero, this request hit alive
4234 ++ * timewait bucket, so that all the necessary checks
4235 ++ * are made in the function processing timewait state.
4236 ++ */
4237 ++ if (tmp_opt.saw_tstamp && tcp_death_row.sysctl_tw_recycle) {
4238 ++ bool strict;
4239 ++
4240 ++ dst = af_ops->route_req(sk, &fl, req, &strict);
4241 ++ if (dst && strict &&
4242 ++ !tcp_peer_is_proven(req, dst, true)) {
4243 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4244 ++ goto drop_and_release;
4245 ++ }
4246 ++ }
4247 ++ /* Kill the following clause, if you dislike this way. */
4248 ++ else if (!sysctl_tcp_syncookies &&
4249 ++ (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4250 ++ (sysctl_max_syn_backlog >> 2)) &&
4251 ++ !tcp_peer_is_proven(req, dst, false)) {
4252 ++ /* Without syncookies last quarter of
4253 ++ * backlog is filled with destinations,
4254 ++ * proven to be alive.
4255 ++ * It means that we continue to communicate
4256 ++ * to destinations, already remembered
4257 ++ * to the moment of synflood.
4258 ++ */
4259 ++ pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
4260 ++ rsk_ops->family);
4261 ++ goto drop_and_release;
4262 ++ }
4263 ++
4264 ++ isn = af_ops->init_seq(skb);
4265 ++ }
4266 ++ if (!dst) {
4267 ++ dst = af_ops->route_req(sk, &fl, req, NULL);
4268 ++ if (!dst)
4269 ++ goto drop_and_free;
4270 ++ }
4271 ++
4272 ++ tcp_rsk(req)->snt_isn = isn;
4273 ++ tcp_openreq_init_rwin(req, sk, dst);
4274 ++ fastopen = !want_cookie &&
4275 ++ tcp_try_fastopen(sk, skb, req, &foc, dst);
4276 ++ err = af_ops->send_synack(sk, dst, &fl, req,
4277 ++ skb_get_queue_mapping(skb), &foc);
4278 ++ if (!fastopen) {
4279 ++ if (err || want_cookie)
4280 ++ goto drop_and_free;
4281 ++
4282 ++ tcp_rsk(req)->listener = NULL;
4283 ++ af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4284 ++ }
4285 ++
4286 ++ return 0;
4287 ++
4288 ++drop_and_release:
4289 ++ dst_release(dst);
4290 ++drop_and_free:
4291 ++ reqsk_free(req);
4292 ++drop:
4293 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4294 ++ return 0;
4295 ++}
4296 ++EXPORT_SYMBOL(tcp_conn_request);
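
tcp_parse_options() now hands any kind-30 option to mptcp_parse_options() together with the new struct mptcp_options_received, while callers without MPTCP state (request handling, SYN-ACK MSS probing) simply pass NULL for that argument. For orientation, a standalone sketch of how the subtype is pulled out of such an option; the subtype numbers follow RFC 6824, and the real parsing is done by mptcp_parse_options() in the MPTCP core, which this file only calls:

#include <stdint.h>

#define TCPOPT_MPTCP	30	/* option kind reserved for MPTCP (RFC 6824) */

/* RFC 6824 subtypes; only the first few are listed here. */
enum { MP_CAPABLE = 0x0, MP_JOIN = 0x1, MP_DSS = 0x2 };

/* opt points at the option's kind byte, i.e. the same "ptr - 2" that the
 * patched tcp_parse_options() passes to mptcp_parse_options(), and opsize is
 * the full option length. */
int mptcp_subtype(const uint8_t *opt, unsigned int opsize)
{
	if (opt[0] != TCPOPT_MPTCP || opsize < 3)
		return -1;
	return opt[2] >> 4;	/* subtype lives in the upper nibble of byte 2 */
}

mptcp_parse_options() then fills the struct mptcp_options_received that tcp_fast_parse_options() and tcp_rcv_synsent_state_process() above hand in.
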
4297 +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
4298 +index 77cccda1ad0c..c77017f600f1 100644
4299 +--- a/net/ipv4/tcp_ipv4.c
4300 ++++ b/net/ipv4/tcp_ipv4.c
4301 +@@ -67,6 +67,8 @@
4302 + #include <net/icmp.h>
4303 + #include <net/inet_hashtables.h>
4304 + #include <net/tcp.h>
4305 ++#include <net/mptcp.h>
4306 ++#include <net/mptcp_v4.h>
4307 + #include <net/transp_v6.h>
4308 + #include <net/ipv6.h>
4309 + #include <net/inet_common.h>
4310 +@@ -99,7 +101,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
4311 + struct inet_hashinfo tcp_hashinfo;
4312 + EXPORT_SYMBOL(tcp_hashinfo);
4313 +
4314 +-static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4315 ++__u32 tcp_v4_init_sequence(const struct sk_buff *skb)
4316 + {
4317 + return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
4318 + ip_hdr(skb)->saddr,
4319 +@@ -334,7 +336,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4320 + struct inet_sock *inet;
4321 + const int type = icmp_hdr(icmp_skb)->type;
4322 + const int code = icmp_hdr(icmp_skb)->code;
4323 +- struct sock *sk;
4324 ++ struct sock *sk, *meta_sk;
4325 + struct sk_buff *skb;
4326 + struct request_sock *fastopen;
4327 + __u32 seq, snd_una;
4328 +@@ -358,13 +360,19 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4329 + return;
4330 + }
4331 +
4332 +- bh_lock_sock(sk);
4333 ++ tp = tcp_sk(sk);
4334 ++ if (mptcp(tp))
4335 ++ meta_sk = mptcp_meta_sk(sk);
4336 ++ else
4337 ++ meta_sk = sk;
4338 ++
4339 ++ bh_lock_sock(meta_sk);
4340 + /* If too many ICMPs get dropped on busy
4341 + * servers this needs to be solved differently.
4342 + * We do take care of PMTU discovery (RFC1191) special case :
4343 + * we can receive locally generated ICMP messages while socket is held.
4344 + */
4345 +- if (sock_owned_by_user(sk)) {
4346 ++ if (sock_owned_by_user(meta_sk)) {
4347 + if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
4348 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
4349 + }
4350 +@@ -377,7 +385,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4351 + }
4352 +
4353 + icsk = inet_csk(sk);
4354 +- tp = tcp_sk(sk);
4355 + seq = ntohl(th->seq);
4356 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
4357 + fastopen = tp->fastopen_rsk;
4358 +@@ -411,11 +418,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4359 + goto out;
4360 +
4361 + tp->mtu_info = info;
4362 +- if (!sock_owned_by_user(sk)) {
4363 ++ if (!sock_owned_by_user(meta_sk)) {
4364 + tcp_v4_mtu_reduced(sk);
4365 + } else {
4366 + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
4367 + sock_hold(sk);
4368 ++ if (mptcp(tp))
4369 ++ mptcp_tsq_flags(sk);
4370 + }
4371 + goto out;
4372 + }
4373 +@@ -429,7 +438,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4374 + !icsk->icsk_backoff || fastopen)
4375 + break;
4376 +
4377 +- if (sock_owned_by_user(sk))
4378 ++ if (sock_owned_by_user(meta_sk))
4379 + break;
4380 +
4381 + icsk->icsk_backoff--;
4382 +@@ -463,7 +472,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4383 + switch (sk->sk_state) {
4384 + struct request_sock *req, **prev;
4385 + case TCP_LISTEN:
4386 +- if (sock_owned_by_user(sk))
4387 ++ if (sock_owned_by_user(meta_sk))
4388 + goto out;
4389 +
4390 + req = inet_csk_search_req(sk, &prev, th->dest,
4391 +@@ -499,7 +508,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4392 + if (fastopen && fastopen->sk == NULL)
4393 + break;
4394 +
4395 +- if (!sock_owned_by_user(sk)) {
4396 ++ if (!sock_owned_by_user(meta_sk)) {
4397 + sk->sk_err = err;
4398 +
4399 + sk->sk_error_report(sk);
4400 +@@ -528,7 +537,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4401 + */
4402 +
4403 + inet = inet_sk(sk);
4404 +- if (!sock_owned_by_user(sk) && inet->recverr) {
4405 ++ if (!sock_owned_by_user(meta_sk) && inet->recverr) {
4406 + sk->sk_err = err;
4407 + sk->sk_error_report(sk);
4408 + } else { /* Only an error on timeout */
4409 +@@ -536,7 +545,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
4410 + }
4411 +
4412 + out:
4413 +- bh_unlock_sock(sk);
4414 ++ bh_unlock_sock(meta_sk);
4415 + sock_put(sk);
4416 + }
4417 +
4418 +@@ -578,7 +587,7 @@ EXPORT_SYMBOL(tcp_v4_send_check);
4419 + * Exception: precedence violation. We do not implement it in any case.
4420 + */
4421 +
4422 +-static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4423 ++void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
4424 + {
4425 + const struct tcphdr *th = tcp_hdr(skb);
4426 + struct {
4427 +@@ -702,10 +711,10 @@ release_sk1:
4428 + outside socket context is ugly, certainly. What can I do?
4429 + */
4430 +
4431 +-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4432 ++static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
4433 + u32 win, u32 tsval, u32 tsecr, int oif,
4434 + struct tcp_md5sig_key *key,
4435 +- int reply_flags, u8 tos)
4436 ++ int reply_flags, u8 tos, int mptcp)
4437 + {
4438 + const struct tcphdr *th = tcp_hdr(skb);
4439 + struct {
4440 +@@ -714,6 +723,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4441 + #ifdef CONFIG_TCP_MD5SIG
4442 + + (TCPOLEN_MD5SIG_ALIGNED >> 2)
4443 + #endif
4444 ++#ifdef CONFIG_MPTCP
4445 ++ + ((MPTCP_SUB_LEN_DSS >> 2) +
4446 ++ (MPTCP_SUB_LEN_ACK >> 2))
4447 ++#endif
4448 + ];
4449 + } rep;
4450 + struct ip_reply_arg arg;
4451 +@@ -758,6 +771,21 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
4452 + ip_hdr(skb)->daddr, &rep.th);
4453 + }
4454 + #endif
4455 ++#ifdef CONFIG_MPTCP
4456 ++ if (mptcp) {
4457 ++ int offset = (tsecr) ? 3 : 0;
4458 ++ /* Construction of 32-bit data_ack */
4459 ++ rep.opt[offset++] = htonl((TCPOPT_MPTCP << 24) |
4460 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
4461 ++ (0x20 << 8) |
4462 ++ (0x01));
4463 ++ rep.opt[offset] = htonl(data_ack);
4464 ++
4465 ++ arg.iov[0].iov_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
4466 ++ rep.th.doff = arg.iov[0].iov_len / 4;
4467 ++ }
4468 ++#endif /* CONFIG_MPTCP */
4469 ++
4470 + arg.flags = reply_flags;
4471 + arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
4472 + ip_hdr(skb)->saddr, /* XXX */
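
The hunk above teaches tcp_v4_send_ack() to append a minimal DSS option carrying only a 32-bit Data ACK; tcp_v4_timewait_ack() below sets mptcp = 1 only when an MPTCP time-wait state exists, while the reqsk path keeps passing 0. Assuming the usual values from the MPTCP headers, TCPOPT_MPTCP == 30 and MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK == 8 (neither is shown in this hunk), the first 32-bit word works out as in this small check program:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t word = (30u   << 24)	/* kind: MPTCP option */
		      | (8u    << 16)	/* length: DSS header + 32-bit Data ACK */
		      | (0x20u <<  8)	/* subtype DSS (0x2) in the upper nibble */
		      |  0x01u;		/* flag 'A': Data ACK present, 32 bits */

	assert(word == 0x1e082001u);	/* on the wire after htonl(): 1e 08 20 01 */
	return 0;
}

The Data ACK itself, tcptw->mptcp_tw->rcv_nxt for TIME-WAIT sockets, follows in the next 32-bit word, and the option length is added to the reply's iov so that rep.th.doff covers it.
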
4473 +@@ -776,36 +804,44 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
4474 + {
4475 + struct inet_timewait_sock *tw = inet_twsk(sk);
4476 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
4477 ++ u32 data_ack = 0;
4478 ++ int mptcp = 0;
4479 ++
4480 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
4481 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
4482 ++ mptcp = 1;
4483 ++ }
4484 +
4485 + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
4486 ++ data_ack,
4487 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
4488 + tcp_time_stamp + tcptw->tw_ts_offset,
4489 + tcptw->tw_ts_recent,
4490 + tw->tw_bound_dev_if,
4491 + tcp_twsk_md5_key(tcptw),
4492 + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
4493 +- tw->tw_tos
4494 ++ tw->tw_tos, mptcp
4495 + );
4496 +
4497 + inet_twsk_put(tw);
4498 + }
4499 +
4500 +-static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4501 +- struct request_sock *req)
4502 ++void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4503 ++ struct request_sock *req)
4504 + {
4505 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
4506 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
4507 + */
4508 + tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
4509 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
4510 +- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
4511 ++ tcp_rsk(req)->rcv_nxt, 0, req->rcv_wnd,
4512 + tcp_time_stamp,
4513 + req->ts_recent,
4514 + 0,
4515 + tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
4516 + AF_INET),
4517 + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
4518 +- ip_hdr(skb)->tos);
4519 ++ ip_hdr(skb)->tos, 0);
4520 + }
4521 +
4522 + /*
4523 +@@ -813,10 +849,11 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
4524 + * This still operates on a request_sock only, not on a big
4525 + * socket.
4526 + */
4527 +-static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4528 +- struct request_sock *req,
4529 +- u16 queue_mapping,
4530 +- struct tcp_fastopen_cookie *foc)
4531 ++int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4532 ++ struct flowi *fl,
4533 ++ struct request_sock *req,
4534 ++ u16 queue_mapping,
4535 ++ struct tcp_fastopen_cookie *foc)
4536 + {
4537 + const struct inet_request_sock *ireq = inet_rsk(req);
4538 + struct flowi4 fl4;
4539 +@@ -844,21 +881,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
4540 + return err;
4541 + }
4542 +
4543 +-static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req)
4544 +-{
4545 +- int res = tcp_v4_send_synack(sk, NULL, req, 0, NULL);
4546 +-
4547 +- if (!res) {
4548 +- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
4549 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4550 +- }
4551 +- return res;
4552 +-}
4553 +-
4554 + /*
4555 + * IPv4 request_sock destructor.
4556 + */
4557 +-static void tcp_v4_reqsk_destructor(struct request_sock *req)
4558 ++void tcp_v4_reqsk_destructor(struct request_sock *req)
4559 + {
4560 + kfree(inet_rsk(req)->opt);
4561 + }
4562 +@@ -896,7 +922,7 @@ EXPORT_SYMBOL(tcp_syn_flood_action);
4563 + /*
4564 + * Save and compile IPv4 options into the request_sock if needed.
4565 + */
4566 +-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4567 ++struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
4568 + {
4569 + const struct ip_options *opt = &(IPCB(skb)->opt);
4570 + struct ip_options_rcu *dopt = NULL;
4571 +@@ -1237,161 +1263,71 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
4572 +
4573 + #endif
4574 +
4575 ++static int tcp_v4_init_req(struct request_sock *req, struct sock *sk,
4576 ++ struct sk_buff *skb)
4577 ++{
4578 ++ struct inet_request_sock *ireq = inet_rsk(req);
4579 ++
4580 ++ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
4581 ++ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
4582 ++ ireq->no_srccheck = inet_sk(sk)->transparent;
4583 ++ ireq->opt = tcp_v4_save_options(skb);
4584 ++ ireq->ir_mark = inet_request_mark(sk, skb);
4585 ++
4586 ++ return 0;
4587 ++}
4588 ++
4589 ++static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl,
4590 ++ const struct request_sock *req,
4591 ++ bool *strict)
4592 ++{
4593 ++ struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req);
4594 ++
4595 ++ if (strict) {
4596 ++ if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr)
4597 ++ *strict = true;
4598 ++ else
4599 ++ *strict = false;
4600 ++ }
4601 ++
4602 ++ return dst;
4603 ++}
4604 ++
4605 + struct request_sock_ops tcp_request_sock_ops __read_mostly = {
4606 + .family = PF_INET,
4607 + .obj_size = sizeof(struct tcp_request_sock),
4608 +- .rtx_syn_ack = tcp_v4_rtx_synack,
4609 ++ .rtx_syn_ack = tcp_rtx_synack,
4610 + .send_ack = tcp_v4_reqsk_send_ack,
4611 + .destructor = tcp_v4_reqsk_destructor,
4612 + .send_reset = tcp_v4_send_reset,
4613 + .syn_ack_timeout = tcp_syn_ack_timeout,
4614 + };
4615 +
4616 ++const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4617 ++ .mss_clamp = TCP_MSS_DEFAULT,
4618 + #ifdef CONFIG_TCP_MD5SIG
4619 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
4620 + .md5_lookup = tcp_v4_reqsk_md5_lookup,
4621 + .calc_md5_hash = tcp_v4_md5_hash_skb,
4622 +-};
4623 + #endif
4624 ++ .init_req = tcp_v4_init_req,
4625 ++#ifdef CONFIG_SYN_COOKIES
4626 ++ .cookie_init_seq = cookie_v4_init_sequence,
4627 ++#endif
4628 ++ .route_req = tcp_v4_route_req,
4629 ++ .init_seq = tcp_v4_init_sequence,
4630 ++ .send_synack = tcp_v4_send_synack,
4631 ++ .queue_hash_add = inet_csk_reqsk_queue_hash_add,
4632 ++};
4633 +
4634 + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
4635 + {
4636 +- struct tcp_options_received tmp_opt;
4637 +- struct request_sock *req;
4638 +- struct inet_request_sock *ireq;
4639 +- struct tcp_sock *tp = tcp_sk(sk);
4640 +- struct dst_entry *dst = NULL;
4641 +- __be32 saddr = ip_hdr(skb)->saddr;
4642 +- __be32 daddr = ip_hdr(skb)->daddr;
4643 +- __u32 isn = TCP_SKB_CB(skb)->when;
4644 +- bool want_cookie = false, fastopen;
4645 +- struct flowi4 fl4;
4646 +- struct tcp_fastopen_cookie foc = { .len = -1 };
4647 +- int err;
4648 +-
4649 + /* Never answer to SYNs send to broadcast or multicast */
4650 + if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
4651 + goto drop;
4652 +
4653 +- /* TW buckets are converted to open requests without
4654 +- * limitations, they conserve resources and peer is
4655 +- * evidently real one.
4656 +- */
4657 +- if ((sysctl_tcp_syncookies == 2 ||
4658 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
4659 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
4660 +- if (!want_cookie)
4661 +- goto drop;
4662 +- }
4663 +-
4664 +- /* Accept backlog is full. If we have already queued enough
4665 +- * of warm entries in syn queue, drop request. It is better than
4666 +- * clogging syn queue with openreqs with exponentially increasing
4667 +- * timeout.
4668 +- */
4669 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
4670 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
4671 +- goto drop;
4672 +- }
4673 +-
4674 +- req = inet_reqsk_alloc(&tcp_request_sock_ops);
4675 +- if (!req)
4676 +- goto drop;
4677 +-
4678 +-#ifdef CONFIG_TCP_MD5SIG
4679 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
4680 +-#endif
4681 +-
4682 +- tcp_clear_options(&tmp_opt);
4683 +- tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
4684 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
4685 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
4686 +-
4687 +- if (want_cookie && !tmp_opt.saw_tstamp)
4688 +- tcp_clear_options(&tmp_opt);
4689 +-
4690 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
4691 +- tcp_openreq_init(req, &tmp_opt, skb);
4692 ++ return tcp_conn_request(&tcp_request_sock_ops,
4693 ++ &tcp_request_sock_ipv4_ops, sk, skb);
4694 +
4695 +- ireq = inet_rsk(req);
4696 +- ireq->ir_loc_addr = daddr;
4697 +- ireq->ir_rmt_addr = saddr;
4698 +- ireq->no_srccheck = inet_sk(sk)->transparent;
4699 +- ireq->opt = tcp_v4_save_options(skb);
4700 +- ireq->ir_mark = inet_request_mark(sk, skb);
4701 +-
4702 +- if (security_inet_conn_request(sk, skb, req))
4703 +- goto drop_and_free;
4704 +-
4705 +- if (!want_cookie || tmp_opt.tstamp_ok)
4706 +- TCP_ECN_create_request(req, skb, sock_net(sk));
4707 +-
4708 +- if (want_cookie) {
4709 +- isn = cookie_v4_init_sequence(sk, skb, &req->mss);
4710 +- req->cookie_ts = tmp_opt.tstamp_ok;
4711 +- } else if (!isn) {
4712 +- /* VJ's idea. We save last timestamp seen
4713 +- * from the destination in peer table, when entering
4714 +- * state TIME-WAIT, and check against it before
4715 +- * accepting new connection request.
4716 +- *
4717 +- * If "isn" is not zero, this request hit alive
4718 +- * timewait bucket, so that all the necessary checks
4719 +- * are made in the function processing timewait state.
4720 +- */
4721 +- if (tmp_opt.saw_tstamp &&
4722 +- tcp_death_row.sysctl_tw_recycle &&
4723 +- (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
4724 +- fl4.daddr == saddr) {
4725 +- if (!tcp_peer_is_proven(req, dst, true)) {
4726 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
4727 +- goto drop_and_release;
4728 +- }
4729 +- }
4730 +- /* Kill the following clause, if you dislike this way. */
4731 +- else if (!sysctl_tcp_syncookies &&
4732 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
4733 +- (sysctl_max_syn_backlog >> 2)) &&
4734 +- !tcp_peer_is_proven(req, dst, false)) {
4735 +- /* Without syncookies last quarter of
4736 +- * backlog is filled with destinations,
4737 +- * proven to be alive.
4738 +- * It means that we continue to communicate
4739 +- * to destinations, already remembered
4740 +- * to the moment of synflood.
4741 +- */
4742 +- LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
4743 +- &saddr, ntohs(tcp_hdr(skb)->source));
4744 +- goto drop_and_release;
4745 +- }
4746 +-
4747 +- isn = tcp_v4_init_sequence(skb);
4748 +- }
4749 +- if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
4750 +- goto drop_and_free;
4751 +-
4752 +- tcp_rsk(req)->snt_isn = isn;
4753 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4754 +- tcp_openreq_init_rwin(req, sk, dst);
4755 +- fastopen = !want_cookie &&
4756 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
4757 +- err = tcp_v4_send_synack(sk, dst, req,
4758 +- skb_get_queue_mapping(skb), &foc);
4759 +- if (!fastopen) {
4760 +- if (err || want_cookie)
4761 +- goto drop_and_free;
4762 +-
4763 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
4764 +- tcp_rsk(req)->listener = NULL;
4765 +- inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
4766 +- }
4767 +-
4768 +- return 0;
4769 +-
4770 +-drop_and_release:
4771 +- dst_release(dst);
4772 +-drop_and_free:
4773 +- reqsk_free(req);
4774 + drop:
4775 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
4776 + return 0;
4777 +@@ -1497,7 +1433,7 @@ put_and_exit:
4778 + }
4779 + EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
4780 +
4781 +-static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4782 ++struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4783 + {
4784 + struct tcphdr *th = tcp_hdr(skb);
4785 + const struct iphdr *iph = ip_hdr(skb);
4786 +@@ -1514,8 +1450,15 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
4787 +
4788 + if (nsk) {
4789 + if (nsk->sk_state != TCP_TIME_WAIT) {
4790 ++ /* Don't lock the meta-sk again; it has been locked
4791 ++ * before mptcp_v4_do_rcv.
4792 ++ */
4793 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
4794 ++ bh_lock_sock(mptcp_meta_sk(nsk));
4795 + bh_lock_sock(nsk);
4796 ++
4797 + return nsk;
4798 ++
4799 + }
4800 + inet_twsk_put(inet_twsk(nsk));
4801 + return NULL;
4802 +@@ -1550,6 +1493,9 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
4803 + goto discard;
4804 + #endif
4805 +
4806 ++ if (is_meta_sk(sk))
4807 ++ return mptcp_v4_do_rcv(sk, skb);
4808 ++
4809 + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
4810 + struct dst_entry *dst = sk->sk_rx_dst;
4811 +
4812 +@@ -1681,7 +1627,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
4813 + } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
4814 + wake_up_interruptible_sync_poll(sk_sleep(sk),
4815 + POLLIN | POLLRDNORM | POLLRDBAND);
4816 +- if (!inet_csk_ack_scheduled(sk))
4817 ++ if (!inet_csk_ack_scheduled(sk) && !mptcp(tp))
4818 + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
4819 + (3 * tcp_rto_min(sk)) / 4,
4820 + TCP_RTO_MAX);
4821 +@@ -1698,7 +1644,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
4822 + {
4823 + const struct iphdr *iph;
4824 + const struct tcphdr *th;
4825 +- struct sock *sk;
4826 ++ struct sock *sk, *meta_sk = NULL;
4827 + int ret;
4828 + struct net *net = dev_net(skb->dev);
4829 +
4830 +@@ -1732,18 +1678,42 @@ int tcp_v4_rcv(struct sk_buff *skb)
4831 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
4832 + skb->len - th->doff * 4);
4833 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
4834 ++#ifdef CONFIG_MPTCP
4835 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
4836 ++ TCP_SKB_CB(skb)->dss_off = 0;
4837 ++#endif
4838 + TCP_SKB_CB(skb)->when = 0;
4839 + TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
4840 + TCP_SKB_CB(skb)->sacked = 0;
4841 +
4842 + sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
4843 +- if (!sk)
4844 +- goto no_tcp_socket;
4845 +
4846 + process:
4847 +- if (sk->sk_state == TCP_TIME_WAIT)
4848 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
4849 + goto do_time_wait;
4850 +
4851 ++#ifdef CONFIG_MPTCP
4852 ++ if (!sk && th->syn && !th->ack) {
4853 ++ int ret = mptcp_lookup_join(skb, NULL);
4854 ++
4855 ++ if (ret < 0) {
4856 ++ tcp_v4_send_reset(NULL, skb);
4857 ++ goto discard_it;
4858 ++ } else if (ret > 0) {
4859 ++ return 0;
4860 ++ }
4861 ++ }
4862 ++
4863 ++ /* Is there a pending request sock for this segment ? */
4864 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
4865 ++ if (sk)
4866 ++ sock_put(sk);
4867 ++ return 0;
4868 ++ }
4869 ++#endif
4870 ++ if (!sk)
4871 ++ goto no_tcp_socket;
4872 ++
4873 + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
4874 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
4875 + goto discard_and_relse;
4876 +@@ -1759,11 +1729,21 @@ process:
4877 + sk_mark_napi_id(sk, skb);
4878 + skb->dev = NULL;
4879 +
4880 +- bh_lock_sock_nested(sk);
4881 ++ if (mptcp(tcp_sk(sk))) {
4882 ++ meta_sk = mptcp_meta_sk(sk);
4883 ++
4884 ++ bh_lock_sock_nested(meta_sk);
4885 ++ if (sock_owned_by_user(meta_sk))
4886 ++ skb->sk = sk;
4887 ++ } else {
4888 ++ meta_sk = sk;
4889 ++ bh_lock_sock_nested(sk);
4890 ++ }
4891 ++
4892 + ret = 0;
4893 +- if (!sock_owned_by_user(sk)) {
4894 ++ if (!sock_owned_by_user(meta_sk)) {
4895 + #ifdef CONFIG_NET_DMA
4896 +- struct tcp_sock *tp = tcp_sk(sk);
4897 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
4898 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
4899 + tp->ucopy.dma_chan = net_dma_find_channel();
4900 + if (tp->ucopy.dma_chan)
4901 +@@ -1771,16 +1751,16 @@ process:
4902 + else
4903 + #endif
4904 + {
4905 +- if (!tcp_prequeue(sk, skb))
4906 ++ if (!tcp_prequeue(meta_sk, skb))
4907 + ret = tcp_v4_do_rcv(sk, skb);
4908 + }
4909 +- } else if (unlikely(sk_add_backlog(sk, skb,
4910 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
4911 +- bh_unlock_sock(sk);
4912 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
4913 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
4914 ++ bh_unlock_sock(meta_sk);
4915 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
4916 + goto discard_and_relse;
4917 + }
4918 +- bh_unlock_sock(sk);
4919 ++ bh_unlock_sock(meta_sk);
4920 +
4921 + sock_put(sk);
4922 +
4923 +@@ -1835,6 +1815,18 @@ do_time_wait:
4924 + sk = sk2;
4925 + goto process;
4926 + }
4927 ++#ifdef CONFIG_MPTCP
4928 ++ if (th->syn && !th->ack) {
4929 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
4930 ++
4931 ++ if (ret < 0) {
4932 ++ tcp_v4_send_reset(NULL, skb);
4933 ++ goto discard_it;
4934 ++ } else if (ret > 0) {
4935 ++ return 0;
4936 ++ }
4937 ++ }
4938 ++#endif
4939 + /* Fall through to ACK */
4940 + }
4941 + case TCP_TW_ACK:
4942 +@@ -1900,7 +1892,12 @@ static int tcp_v4_init_sock(struct sock *sk)
4943 +
4944 + tcp_init_sock(sk);
4945 +
4946 +- icsk->icsk_af_ops = &ipv4_specific;
4947 ++#ifdef CONFIG_MPTCP
4948 ++ if (is_mptcp_enabled(sk))
4949 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
4950 ++ else
4951 ++#endif
4952 ++ icsk->icsk_af_ops = &ipv4_specific;
4953 +
4954 + #ifdef CONFIG_TCP_MD5SIG
4955 + tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
4956 +@@ -1917,6 +1914,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
4957 +
4958 + tcp_cleanup_congestion_control(sk);
4959 +
4960 ++ if (mptcp(tp))
4961 ++ mptcp_destroy_sock(sk);
4962 ++ if (tp->inside_tk_table)
4963 ++ mptcp_hash_remove(tp);
4964 ++
4965 + /* Cleanup up the write buffer. */
4966 + tcp_write_queue_purge(sk);
4967 +
4968 +@@ -2481,6 +2483,19 @@ void tcp4_proc_exit(void)
4969 + }
4970 + #endif /* CONFIG_PROC_FS */
4971 +
4972 ++#ifdef CONFIG_MPTCP
4973 ++static void tcp_v4_clear_sk(struct sock *sk, int size)
4974 ++{
4975 ++ struct tcp_sock *tp = tcp_sk(sk);
4976 ++
4977 ++ /* we do not want to clear tk_table field, because of RCU lookups */
4978 ++ sk_prot_clear_nulls(sk, offsetof(struct tcp_sock, tk_table));
4979 ++
4980 ++ size -= offsetof(struct tcp_sock, tk_table) + sizeof(tp->tk_table);
4981 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size);
4982 ++}
4983 ++#endif
4984 ++
4985 + struct proto tcp_prot = {
4986 + .name = "TCP",
4987 + .owner = THIS_MODULE,
4988 +@@ -2528,6 +2543,9 @@ struct proto tcp_prot = {
4989 + .destroy_cgroup = tcp_destroy_cgroup,
4990 + .proto_cgroup = tcp_proto_cgroup,
4991 + #endif
4992 ++#ifdef CONFIG_MPTCP
4993 ++ .clear_sk = tcp_v4_clear_sk,
4994 ++#endif
4995 + };
4996 + EXPORT_SYMBOL(tcp_prot);
4997 +
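The tcp_ipv4.c hunks above fold the open-coded body of tcp_v4_conn_request() into a generic tcp_conn_request() driven by the tcp_request_sock_ipv4_ops table (init_req, route_req, init_seq, send_synack, queue_hash_add), so that IPv4, IPv6 and MPTCP can share one request path. The userspace sketch below illustrates that ops-table dispatch pattern in isolation; the struct and function names in it (fake_request, request_ops, ipv4_ops, ...) are invented for the example and are not part of the patch.

    #include <stdio.h>
    #include <stdint.h>

    /* Toy stand-in for a request sock. */
    struct fake_request {
            uint32_t local_addr;
            uint32_t remote_addr;
            uint32_t isn;           /* initial sequence number */
    };

    /* Family-specific steps, gathered in one table of callbacks. */
    struct request_ops {
            void (*init_req)(struct fake_request *req);
            uint32_t (*init_seq)(const struct fake_request *req);
            int (*send_synack)(const struct fake_request *req);
    };

    /* Generic path: every family-specific step goes through the ops table. */
    static int generic_conn_request(const struct request_ops *ops)
    {
            struct fake_request req = { 0, 0, 0 };

            ops->init_req(&req);
            req.isn = ops->init_seq(&req);
            return ops->send_synack(&req);
    }

    /* One concrete "family" implementation. */
    static void v4_init_req(struct fake_request *req)
    {
            req->local_addr = 0x7f000001;   /* 127.0.0.1 */
            req->remote_addr = 0x7f000002;  /* 127.0.0.2 */
    }

    static uint32_t v4_init_seq(const struct fake_request *req)
    {
            return req->local_addr ^ req->remote_addr;  /* toy ISN, not secure */
    }

    static int v4_send_synack(const struct fake_request *req)
    {
            printf("SYN/ACK with isn=%u\n", (unsigned int)req->isn);
            return 0;
    }

    static const struct request_ops ipv4_ops = {
            .init_req    = v4_init_req,
            .init_seq    = v4_init_seq,
            .send_synack = v4_send_synack,
    };

    int main(void)
    {
            /* A second table (e.g. for IPv6 or MPTCP) would reuse the same path. */
            return generic_conn_request(&ipv4_ops);
    }
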
4998 +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
4999 +index e68e0d4af6c9..ae6946857dff 100644
5000 +--- a/net/ipv4/tcp_minisocks.c
5001 ++++ b/net/ipv4/tcp_minisocks.c
5002 +@@ -18,11 +18,13 @@
5003 + * Jorge Cwik, <jorge@×××××××××××××.net>
5004 + */
5005 +
5006 ++#include <linux/kconfig.h>
5007 + #include <linux/mm.h>
5008 + #include <linux/module.h>
5009 + #include <linux/slab.h>
5010 + #include <linux/sysctl.h>
5011 + #include <linux/workqueue.h>
5012 ++#include <net/mptcp.h>
5013 + #include <net/tcp.h>
5014 + #include <net/inet_common.h>
5015 + #include <net/xfrm.h>
5016 +@@ -95,10 +97,13 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5017 + struct tcp_options_received tmp_opt;
5018 + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
5019 + bool paws_reject = false;
5020 ++ struct mptcp_options_received mopt;
5021 +
5022 + tmp_opt.saw_tstamp = 0;
5023 + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
5024 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5025 ++ mptcp_init_mp_opt(&mopt);
5026 ++
5027 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5028 +
5029 + if (tmp_opt.saw_tstamp) {
5030 + tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
5031 +@@ -106,6 +111,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5032 + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
5033 + paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
5034 + }
5035 ++
5036 ++ if (unlikely(mopt.mp_fclose) && tcptw->mptcp_tw) {
5037 ++ if (mopt.mptcp_key == tcptw->mptcp_tw->loc_key)
5038 ++ goto kill_with_rst;
5039 ++ }
5040 + }
5041 +
5042 + if (tw->tw_substate == TCP_FIN_WAIT2) {
5043 +@@ -128,6 +138,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
5044 + if (!th->ack ||
5045 + !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
5046 + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
5047 ++ /* If mptcp_is_data_fin() returns true, we are sure that
5048 ++ * mopt has been initialized - otherwise it would not
5049 ++ * be a DATA_FIN.
5050 ++ */
5051 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw &&
5052 ++ mptcp_is_data_fin(skb) &&
5053 ++ TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
5054 ++ mopt.data_seq + 1 == (u32)tcptw->mptcp_tw->rcv_nxt)
5055 ++ return TCP_TW_ACK;
5056 ++
5057 + inet_twsk_put(tw);
5058 + return TCP_TW_SUCCESS;
5059 + }
5060 +@@ -290,6 +310,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5061 + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
5062 + tcptw->tw_ts_offset = tp->tsoffset;
5063 +
5064 ++ if (mptcp(tp)) {
5065 ++ if (mptcp_init_tw_sock(sk, tcptw)) {
5066 ++ inet_twsk_free(tw);
5067 ++ goto exit;
5068 ++ }
5069 ++ } else {
5070 ++ tcptw->mptcp_tw = NULL;
5071 ++ }
5072 ++
5073 + #if IS_ENABLED(CONFIG_IPV6)
5074 + if (tw->tw_family == PF_INET6) {
5075 + struct ipv6_pinfo *np = inet6_sk(sk);
5076 +@@ -347,15 +376,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
5077 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
5078 + }
5079 +
5080 ++exit:
5081 + tcp_update_metrics(sk);
5082 + tcp_done(sk);
5083 + }
5084 +
5085 + void tcp_twsk_destructor(struct sock *sk)
5086 + {
5087 +-#ifdef CONFIG_TCP_MD5SIG
5088 + struct tcp_timewait_sock *twsk = tcp_twsk(sk);
5089 +
5090 ++ if (twsk->mptcp_tw)
5091 ++ mptcp_twsk_destructor(twsk);
5092 ++#ifdef CONFIG_TCP_MD5SIG
5093 + if (twsk->tw_md5_key)
5094 + kfree_rcu(twsk->tw_md5_key, rcu);
5095 + #endif
5096 +@@ -382,13 +414,14 @@ void tcp_openreq_init_rwin(struct request_sock *req,
5097 + req->window_clamp = tcp_full_space(sk);
5098 +
5099 + /* tcp_full_space because it is guaranteed to be the first packet */
5100 +- tcp_select_initial_window(tcp_full_space(sk),
5101 +- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
5102 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5103 ++ mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) -
5104 ++ (ireq->saw_mpc ? MPTCP_SUB_LEN_DSM_ALIGN : 0),
5105 + &req->rcv_wnd,
5106 + &req->window_clamp,
5107 + ireq->wscale_ok,
5108 + &rcv_wscale,
5109 +- dst_metric(dst, RTAX_INITRWND));
5110 ++ dst_metric(dst, RTAX_INITRWND), sk);
5111 + ireq->rcv_wscale = rcv_wscale;
5112 + }
5113 + EXPORT_SYMBOL(tcp_openreq_init_rwin);
5114 +@@ -499,6 +532,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
5115 + newtp->rx_opt.ts_recent_stamp = 0;
5116 + newtp->tcp_header_len = sizeof(struct tcphdr);
5117 + }
5118 ++ if (ireq->saw_mpc)
5119 ++ newtp->tcp_header_len += MPTCP_SUB_LEN_DSM_ALIGN;
5120 + newtp->tsoffset = 0;
5121 + #ifdef CONFIG_TCP_MD5SIG
5122 + newtp->md5sig_info = NULL; /*XXX*/
5123 +@@ -535,16 +570,20 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5124 + bool fastopen)
5125 + {
5126 + struct tcp_options_received tmp_opt;
5127 ++ struct mptcp_options_received mopt;
5128 + struct sock *child;
5129 + const struct tcphdr *th = tcp_hdr(skb);
5130 + __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
5131 + bool paws_reject = false;
5132 +
5133 +- BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
5134 ++ BUG_ON(!mptcp(tcp_sk(sk)) && fastopen == (sk->sk_state == TCP_LISTEN));
5135 +
5136 + tmp_opt.saw_tstamp = 0;
5137 ++
5138 ++ mptcp_init_mp_opt(&mopt);
5139 ++
5140 + if (th->doff > (sizeof(struct tcphdr)>>2)) {
5141 +- tcp_parse_options(skb, &tmp_opt, 0, NULL);
5142 ++ tcp_parse_options(skb, &tmp_opt, &mopt, 0, NULL);
5143 +
5144 + if (tmp_opt.saw_tstamp) {
5145 + tmp_opt.ts_recent = req->ts_recent;
5146 +@@ -583,7 +622,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5147 + *
5148 + * Reset timer after retransmitting SYNACK, similar to
5149 + * the idea of fast retransmit in recovery.
5150 ++ *
5151 ++ * Fall back to TCP if MP_CAPABLE is not set.
5152 + */
5153 ++
5154 ++ if (inet_rsk(req)->saw_mpc && !mopt.saw_mpc)
5155 ++ inet_rsk(req)->saw_mpc = false;
5156 ++
5157 ++
5158 + if (!inet_rtx_syn_ack(sk, req))
5159 + req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout,
5160 + TCP_RTO_MAX) + jiffies;
5161 +@@ -718,9 +764,21 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
5162 + * socket is created, wait for troubles.
5163 + */
5164 + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
5165 ++
5166 + if (child == NULL)
5167 + goto listen_overflow;
5168 +
5169 ++ if (!is_meta_sk(sk)) {
5170 ++ int ret = mptcp_check_req_master(sk, child, req, prev);
5171 ++ if (ret < 0)
5172 ++ goto listen_overflow;
5173 ++
5174 ++ /* MPTCP-supported */
5175 ++ if (!ret)
5176 ++ return tcp_sk(child)->mpcb->master_sk;
5177 ++ } else {
5178 ++ return mptcp_check_req_child(sk, child, req, prev, &mopt);
5179 ++ }
5180 + inet_csk_reqsk_queue_unlink(sk, req, prev);
5181 + inet_csk_reqsk_queue_removed(sk, req);
5182 +
5183 +@@ -746,7 +804,17 @@ embryonic_reset:
5184 + tcp_reset(sk);
5185 + }
5186 + if (!fastopen) {
5187 +- inet_csk_reqsk_queue_drop(sk, req, prev);
5188 ++ if (is_meta_sk(sk)) {
5189 ++ /* We want to avoid stopping the keepalive-timer and so
5190 ++ * avoid ending up in inet_csk_reqsk_queue_removed ...
5191 ++ */
5192 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
5193 ++ if (reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req) == 0)
5194 ++ mptcp_delete_synack_timer(sk);
5195 ++ reqsk_free(req);
5196 ++ } else {
5197 ++ inet_csk_reqsk_queue_drop(sk, req, prev);
5198 ++ }
5199 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
5200 + }
5201 + return NULL;
5202 +@@ -770,8 +838,9 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5203 + {
5204 + int ret = 0;
5205 + int state = child->sk_state;
5206 ++ struct sock *meta_sk = mptcp(tcp_sk(child)) ? mptcp_meta_sk(child) : child;
5207 +
5208 +- if (!sock_owned_by_user(child)) {
5209 ++ if (!sock_owned_by_user(meta_sk)) {
5210 + ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb),
5211 + skb->len);
5212 + /* Wakeup parent, send SIGIO */
5213 +@@ -782,10 +851,14 @@ int tcp_child_process(struct sock *parent, struct sock *child,
5214 + * in main socket hash table and lock on listening
5215 + * socket does not protect us more.
5216 + */
5217 +- __sk_add_backlog(child, skb);
5218 ++ if (mptcp(tcp_sk(child)))
5219 ++ skb->sk = child;
5220 ++ __sk_add_backlog(meta_sk, skb);
5221 + }
5222 +
5223 +- bh_unlock_sock(child);
5224 ++ if (mptcp(tcp_sk(child)))
5225 ++ bh_unlock_sock(child);
5226 ++ bh_unlock_sock(meta_sk);
5227 + sock_put(child);
5228 + return ret;
5229 + }
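The hunks in tcp_ipv4.c and tcp_minisocks.c above keep applying the same rule: when a subflow belongs to an MPTCP connection, lock-ownership checks and backlog queuing are done on the meta socket returned by mptcp_meta_sk(), otherwise on the subflow socket itself. The standalone sketch below mimics only that decision; its types and helpers (toy_sock, lock_target, ...) are made up for illustration and merely approximate the kernel logic.

    #include <stdio.h>
    #include <stdbool.h>

    struct toy_sock {
            bool is_mptcp_subflow;
            struct toy_sock *meta;  /* NULL unless part of an MPTCP connection */
            int backlog;
            bool owned_by_user;
    };

    /* Pick the socket whose lock and backlog govern processing. */
    static struct toy_sock *lock_target(struct toy_sock *sk)
    {
            return (sk->is_mptcp_subflow && sk->meta) ? sk->meta : sk;
    }

    static void receive_segment(struct toy_sock *sk)
    {
            struct toy_sock *owner = lock_target(sk);

            if (!owner->owned_by_user)
                    printf("process segment directly (%s)\n",
                           owner == sk ? "plain TCP" : "MPTCP meta");
            else
                    owner->backlog++;       /* defer, as __sk_add_backlog() would */
    }

    int main(void)
    {
            struct toy_sock meta  = { .owned_by_user = true };
            struct toy_sock sub   = { .is_mptcp_subflow = true, .meta = &meta };
            struct toy_sock plain = { 0 };

            receive_segment(&plain);        /* handled immediately */
            receive_segment(&sub);          /* queued on the meta backlog */
            printf("meta backlog = %d\n", meta.backlog);
            return 0;
    }
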
5230 +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
5231 +index 179b51e6bda3..efd31b6c5784 100644
5232 +--- a/net/ipv4/tcp_output.c
5233 ++++ b/net/ipv4/tcp_output.c
5234 +@@ -36,6 +36,12 @@
5235 +
5236 + #define pr_fmt(fmt) "TCP: " fmt
5237 +
5238 ++#include <net/mptcp.h>
5239 ++#include <net/mptcp_v4.h>
5240 ++#if IS_ENABLED(CONFIG_IPV6)
5241 ++#include <net/mptcp_v6.h>
5242 ++#endif
5243 ++#include <net/ipv6.h>
5244 + #include <net/tcp.h>
5245 +
5246 + #include <linux/compiler.h>
5247 +@@ -68,11 +74,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
5248 + unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
5249 + EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
5250 +
5251 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5252 +- int push_one, gfp_t gfp);
5253 +-
5254 + /* Account for new data that has been sent to the network. */
5255 +-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5256 ++void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
5257 + {
5258 + struct inet_connection_sock *icsk = inet_csk(sk);
5259 + struct tcp_sock *tp = tcp_sk(sk);
5260 +@@ -214,7 +217,7 @@ u32 tcp_default_init_rwnd(u32 mss)
5261 + void tcp_select_initial_window(int __space, __u32 mss,
5262 + __u32 *rcv_wnd, __u32 *window_clamp,
5263 + int wscale_ok, __u8 *rcv_wscale,
5264 +- __u32 init_rcv_wnd)
5265 ++ __u32 init_rcv_wnd, const struct sock *sk)
5266 + {
5267 + unsigned int space = (__space < 0 ? 0 : __space);
5268 +
5269 +@@ -269,12 +272,16 @@ EXPORT_SYMBOL(tcp_select_initial_window);
5270 + * value can be stuffed directly into th->window for an outgoing
5271 + * frame.
5272 + */
5273 +-static u16 tcp_select_window(struct sock *sk)
5274 ++u16 tcp_select_window(struct sock *sk)
5275 + {
5276 + struct tcp_sock *tp = tcp_sk(sk);
5277 + u32 old_win = tp->rcv_wnd;
5278 +- u32 cur_win = tcp_receive_window(tp);
5279 +- u32 new_win = __tcp_select_window(sk);
5280 ++ /* The window must never shrink at the meta-level. At the subflow level
5281 ++ * we have to allow it to shrink; otherwise we may announce a window too
5282 ++ * large for the current meta-level sk_rcvbuf.
5283 ++ */
5284 ++ u32 cur_win = tcp_receive_window(mptcp(tp) ? tcp_sk(mptcp_meta_sk(sk)) : tp);
5285 ++ u32 new_win = tp->ops->__select_window(sk);
5286 +
5287 + /* Never shrink the offered window */
5288 + if (new_win < cur_win) {
5289 +@@ -290,6 +297,7 @@ static u16 tcp_select_window(struct sock *sk)
5290 + LINUX_MIB_TCPWANTZEROWINDOWADV);
5291 + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
5292 + }
5293 ++
5294 + tp->rcv_wnd = new_win;
5295 + tp->rcv_wup = tp->rcv_nxt;
5296 +
5297 +@@ -374,7 +382,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
5298 + /* Constructs common control bits of non-data skb. If SYN/FIN is present,
5299 + * auto increment end seqno.
5300 + */
5301 +-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5302 ++void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5303 + {
5304 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5305 +
5306 +@@ -394,7 +402,7 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
5307 + TCP_SKB_CB(skb)->end_seq = seq;
5308 + }
5309 +
5310 +-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5311 ++bool tcp_urg_mode(const struct tcp_sock *tp)
5312 + {
5313 + return tp->snd_una != tp->snd_up;
5314 + }
5315 +@@ -404,17 +412,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
5316 + #define OPTION_MD5 (1 << 2)
5317 + #define OPTION_WSCALE (1 << 3)
5318 + #define OPTION_FAST_OPEN_COOKIE (1 << 8)
5319 +-
5320 +-struct tcp_out_options {
5321 +- u16 options; /* bit field of OPTION_* */
5322 +- u16 mss; /* 0 to disable */
5323 +- u8 ws; /* window scale, 0 to disable */
5324 +- u8 num_sack_blocks; /* number of SACK blocks to include */
5325 +- u8 hash_size; /* bytes in hash_location */
5326 +- __u8 *hash_location; /* temporary pointer, overloaded */
5327 +- __u32 tsval, tsecr; /* need to include OPTION_TS */
5328 +- struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
5329 +-};
5330 ++/* Before adding here - take a look at OPTION_MPTCP in include/net/mptcp.h */
5331 +
5332 + /* Write previously computed TCP options to the packet.
5333 + *
5334 +@@ -430,7 +428,7 @@ struct tcp_out_options {
5335 + * (but it may well be that other scenarios fail similarly).
5336 + */
5337 + static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5338 +- struct tcp_out_options *opts)
5339 ++ struct tcp_out_options *opts, struct sk_buff *skb)
5340 + {
5341 + u16 options = opts->options; /* mungable copy */
5342 +
5343 +@@ -513,6 +511,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
5344 + }
5345 + ptr += (foc->len + 3) >> 2;
5346 + }
5347 ++
5348 ++ if (unlikely(OPTION_MPTCP & opts->options))
5349 ++ mptcp_options_write(ptr, tp, opts, skb);
5350 + }
5351 +
5352 + /* Compute TCP options for SYN packets. This is not the final
5353 +@@ -564,6 +565,8 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
5354 + if (unlikely(!(OPTION_TS & opts->options)))
5355 + remaining -= TCPOLEN_SACKPERM_ALIGNED;
5356 + }
5357 ++ if (tp->request_mptcp || mptcp(tp))
5358 ++ mptcp_syn_options(sk, opts, &remaining);
5359 +
5360 + if (fastopen && fastopen->cookie.len >= 0) {
5361 + u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
5362 +@@ -637,6 +640,9 @@ static unsigned int tcp_synack_options(struct sock *sk,
5363 + }
5364 + }
5365 +
5366 ++ if (ireq->saw_mpc)
5367 ++ mptcp_synack_options(req, opts, &remaining);
5368 ++
5369 + return MAX_TCP_OPTION_SPACE - remaining;
5370 + }
5371 +
5372 +@@ -670,16 +676,22 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
5373 + opts->tsecr = tp->rx_opt.ts_recent;
5374 + size += TCPOLEN_TSTAMP_ALIGNED;
5375 + }
5376 ++ if (mptcp(tp))
5377 ++ mptcp_established_options(sk, skb, opts, &size);
5378 +
5379 + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
5380 + if (unlikely(eff_sacks)) {
5381 +- const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
5382 +- opts->num_sack_blocks =
5383 +- min_t(unsigned int, eff_sacks,
5384 +- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5385 +- TCPOLEN_SACK_PERBLOCK);
5386 +- size += TCPOLEN_SACK_BASE_ALIGNED +
5387 +- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5388 ++ const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
5389 ++ if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
5390 ++ opts->num_sack_blocks = 0;
5391 ++ else
5392 ++ opts->num_sack_blocks =
5393 ++ min_t(unsigned int, eff_sacks,
5394 ++ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
5395 ++ TCPOLEN_SACK_PERBLOCK);
5396 ++ if (opts->num_sack_blocks)
5397 ++ size += TCPOLEN_SACK_BASE_ALIGNED +
5398 ++ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
5399 + }
5400 +
5401 + return size;
5402 +@@ -711,8 +723,8 @@ static void tcp_tsq_handler(struct sock *sk)
5403 + if ((1 << sk->sk_state) &
5404 + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
5405 + TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
5406 +- tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle,
5407 +- 0, GFP_ATOMIC);
5408 ++ tcp_sk(sk)->ops->write_xmit(sk, tcp_current_mss(sk),
5409 ++ tcp_sk(sk)->nonagle, 0, GFP_ATOMIC);
5410 + }
5411 + /*
5412 + * One tasklet per cpu tries to send more skbs.
5413 +@@ -727,7 +739,7 @@ static void tcp_tasklet_func(unsigned long data)
5414 + unsigned long flags;
5415 + struct list_head *q, *n;
5416 + struct tcp_sock *tp;
5417 +- struct sock *sk;
5418 ++ struct sock *sk, *meta_sk;
5419 +
5420 + local_irq_save(flags);
5421 + list_splice_init(&tsq->head, &list);
5422 +@@ -738,15 +750,25 @@ static void tcp_tasklet_func(unsigned long data)
5423 + list_del(&tp->tsq_node);
5424 +
5425 + sk = (struct sock *)tp;
5426 +- bh_lock_sock(sk);
5427 ++ meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5428 ++ bh_lock_sock(meta_sk);
5429 +
5430 +- if (!sock_owned_by_user(sk)) {
5431 ++ if (!sock_owned_by_user(meta_sk)) {
5432 + tcp_tsq_handler(sk);
5433 ++ if (mptcp(tp))
5434 ++ tcp_tsq_handler(meta_sk);
5435 + } else {
5436 ++ if (mptcp(tp) && sk->sk_state == TCP_CLOSE)
5437 ++ goto exit;
5438 ++
5439 + /* defer the work to tcp_release_cb() */
5440 + set_bit(TCP_TSQ_DEFERRED, &tp->tsq_flags);
5441 ++
5442 ++ if (mptcp(tp))
5443 ++ mptcp_tsq_flags(sk);
5444 + }
5445 +- bh_unlock_sock(sk);
5446 ++exit:
5447 ++ bh_unlock_sock(meta_sk);
5448 +
5449 + clear_bit(TSQ_QUEUED, &tp->tsq_flags);
5450 + sk_free(sk);
5451 +@@ -756,7 +778,10 @@ static void tcp_tasklet_func(unsigned long data)
5452 + #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \
5453 + (1UL << TCP_WRITE_TIMER_DEFERRED) | \
5454 + (1UL << TCP_DELACK_TIMER_DEFERRED) | \
5455 +- (1UL << TCP_MTU_REDUCED_DEFERRED))
5456 ++ (1UL << TCP_MTU_REDUCED_DEFERRED) | \
5457 ++ (1UL << MPTCP_PATH_MANAGER) | \
5458 ++ (1UL << MPTCP_SUB_DEFERRED))
5459 ++
5460 + /**
5461 + * tcp_release_cb - tcp release_sock() callback
5462 + * @sk: socket
5463 +@@ -803,6 +828,13 @@ void tcp_release_cb(struct sock *sk)
5464 + sk->sk_prot->mtu_reduced(sk);
5465 + __sock_put(sk);
5466 + }
5467 ++ if (flags & (1UL << MPTCP_PATH_MANAGER)) {
5468 ++ if (tcp_sk(sk)->mpcb->pm_ops->release_sock)
5469 ++ tcp_sk(sk)->mpcb->pm_ops->release_sock(sk);
5470 ++ __sock_put(sk);
5471 ++ }
5472 ++ if (flags & (1UL << MPTCP_SUB_DEFERRED))
5473 ++ mptcp_tsq_sub_deferred(sk);
5474 + }
5475 + EXPORT_SYMBOL(tcp_release_cb);
5476 +
5477 +@@ -862,8 +894,8 @@ void tcp_wfree(struct sk_buff *skb)
5478 + * We are working here with either a clone of the original
5479 + * SKB, or a fresh unique copy made by the retransmit engine.
5480 + */
5481 +-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5482 +- gfp_t gfp_mask)
5483 ++int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5484 ++ gfp_t gfp_mask)
5485 + {
5486 + const struct inet_connection_sock *icsk = inet_csk(sk);
5487 + struct inet_sock *inet;
5488 +@@ -933,7 +965,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5489 + */
5490 + th->window = htons(min(tp->rcv_wnd, 65535U));
5491 + } else {
5492 +- th->window = htons(tcp_select_window(sk));
5493 ++ th->window = htons(tp->ops->select_window(sk));
5494 + }
5495 + th->check = 0;
5496 + th->urg_ptr = 0;
5497 +@@ -949,7 +981,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5498 + }
5499 + }
5500 +
5501 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5502 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5503 + if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
5504 + TCP_ECN_send(sk, skb, tcp_header_size);
5505 +
5506 +@@ -988,7 +1020,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
5507 + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
5508 + * otherwise socket can stall.
5509 + */
5510 +-static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5511 ++void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5512 + {
5513 + struct tcp_sock *tp = tcp_sk(sk);
5514 +
5515 +@@ -1001,15 +1033,16 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
5516 + }
5517 +
5518 + /* Initialize TSO segments for a packet. */
5519 +-static void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5520 +- unsigned int mss_now)
5521 ++void tcp_set_skb_tso_segs(const struct sock *sk, struct sk_buff *skb,
5522 ++ unsigned int mss_now)
5523 + {
5524 + struct skb_shared_info *shinfo = skb_shinfo(skb);
5525 +
5526 + /* Make sure we own this skb before messing gso_size/gso_segs */
5527 + WARN_ON_ONCE(skb_cloned(skb));
5528 +
5529 +- if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
5530 ++ if (skb->len <= mss_now || (is_meta_sk(sk) && !mptcp_sk_can_gso(sk)) ||
5531 ++ (!is_meta_sk(sk) && !sk_can_gso(sk)) || skb->ip_summed == CHECKSUM_NONE) {
5532 + /* Avoid the costly divide in the normal
5533 + * non-TSO case.
5534 + */
5535 +@@ -1041,7 +1074,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
5536 + /* Pcount in the middle of the write queue got changed, we need to do various
5537 + * tweaks to fix counters
5538 + */
5539 +-static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5540 ++void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
5541 + {
5542 + struct tcp_sock *tp = tcp_sk(sk);
5543 +
5544 +@@ -1164,7 +1197,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
5545 + * eventually). The difference is that pulled data not copied, but
5546 + * immediately discarded.
5547 + */
5548 +-static void __pskb_trim_head(struct sk_buff *skb, int len)
5549 ++void __pskb_trim_head(struct sk_buff *skb, int len)
5550 + {
5551 + struct skb_shared_info *shinfo;
5552 + int i, k, eat;
5553 +@@ -1205,6 +1238,9 @@ static void __pskb_trim_head(struct sk_buff *skb, int len)
5554 + /* Remove acked data from a packet in the transmit queue. */
5555 + int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5556 + {
5557 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk) && mptcp_is_data_seq(skb))
5558 ++ return mptcp_trim_head(sk, skb, len);
5559 ++
5560 + if (skb_unclone(skb, GFP_ATOMIC))
5561 + return -ENOMEM;
5562 +
5563 +@@ -1222,6 +1258,15 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
5564 + if (tcp_skb_pcount(skb) > 1)
5565 + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
5566 +
5567 ++#ifdef CONFIG_MPTCP
5568 ++ /* Some data got acked - we assume the sequence number reached the dest.
5569 ++ * In any case, our MPTCP option has been trimmed above - we lost it here.
5570 ++ * Only remove the SEQ flag if the call does not come from a meta retransmit.
5571 ++ */
5572 ++ if (mptcp(tcp_sk(sk)) && !is_meta_sk(sk))
5573 ++ TCP_SKB_CB(skb)->mptcp_flags &= ~MPTCPHDR_SEQ;
5574 ++#endif
5575 ++
5576 + return 0;
5577 + }
5578 +
5579 +@@ -1379,6 +1424,7 @@ unsigned int tcp_current_mss(struct sock *sk)
5580 +
5581 + return mss_now;
5582 + }
5583 ++EXPORT_SYMBOL(tcp_current_mss);
5584 +
5585 + /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
5586 + * As additional protections, we do not touch cwnd in retransmission phases,
5587 +@@ -1446,8 +1492,8 @@ static bool tcp_minshall_check(const struct tcp_sock *tp)
5588 + * But we can avoid doing the divide again given we already have
5589 + * skb_pcount = skb->len / mss_now
5590 + */
5591 +-static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5592 +- const struct sk_buff *skb)
5593 ++void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
5594 ++ const struct sk_buff *skb)
5595 + {
5596 + if (skb->len < tcp_skb_pcount(skb) * mss_now)
5597 + tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
5598 +@@ -1468,11 +1514,11 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
5599 + (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
5600 + }
5601 + /* Returns the portion of skb which can be sent right away */
5602 +-static unsigned int tcp_mss_split_point(const struct sock *sk,
5603 +- const struct sk_buff *skb,
5604 +- unsigned int mss_now,
5605 +- unsigned int max_segs,
5606 +- int nonagle)
5607 ++unsigned int tcp_mss_split_point(const struct sock *sk,
5608 ++ const struct sk_buff *skb,
5609 ++ unsigned int mss_now,
5610 ++ unsigned int max_segs,
5611 ++ int nonagle)
5612 + {
5613 + const struct tcp_sock *tp = tcp_sk(sk);
5614 + u32 partial, needed, window, max_len;
5615 +@@ -1502,13 +1548,14 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
5616 + /* Can at least one segment of SKB be sent right now, according to the
5617 + * congestion window rules? If so, return how many segments are allowed.
5618 + */
5619 +-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5620 +- const struct sk_buff *skb)
5621 ++unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5622 ++ const struct sk_buff *skb)
5623 + {
5624 + u32 in_flight, cwnd;
5625 +
5626 + /* Don't be strict about the congestion window for the final FIN. */
5627 +- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5628 ++ if (skb &&
5629 ++ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
5630 + tcp_skb_pcount(skb) == 1)
5631 + return 1;
5632 +
5633 +@@ -1524,8 +1571,8 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
5634 + * This must be invoked the first time we consider transmitting
5635 + * SKB onto the wire.
5636 + */
5637 +-static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5638 +- unsigned int mss_now)
5639 ++int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5640 ++ unsigned int mss_now)
5641 + {
5642 + int tso_segs = tcp_skb_pcount(skb);
5643 +
5644 +@@ -1540,8 +1587,8 @@ static int tcp_init_tso_segs(const struct sock *sk, struct sk_buff *skb,
5645 + /* Return true if the Nagle test allows this packet to be
5646 + * sent now.
5647 + */
5648 +-static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5649 +- unsigned int cur_mss, int nonagle)
5650 ++bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5651 ++ unsigned int cur_mss, int nonagle)
5652 + {
5653 + /* Nagle rule does not apply to frames, which sit in the middle of the
5654 + * write_queue (they have no chances to get new data).
5655 +@@ -1553,7 +1600,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5656 + return true;
5657 +
5658 + /* Don't use the nagle rule for urgent data (or for the final FIN). */
5659 +- if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
5660 ++ if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
5661 ++ mptcp_is_data_fin(skb))
5662 + return true;
5663 +
5664 + if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
5665 +@@ -1563,9 +1611,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf
5666 + }
5667 +
5668 + /* Does at least the first segment of SKB fit into the send window? */
5669 +-static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
5670 +- const struct sk_buff *skb,
5671 +- unsigned int cur_mss)
5672 ++bool tcp_snd_wnd_test(const struct tcp_sock *tp, const struct sk_buff *skb,
5673 ++ unsigned int cur_mss)
5674 + {
5675 + u32 end_seq = TCP_SKB_CB(skb)->end_seq;
5676 +
5677 +@@ -1676,7 +1723,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
5678 + u32 send_win, cong_win, limit, in_flight;
5679 + int win_divisor;
5680 +
5681 +- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
5682 ++ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) || mptcp_is_data_fin(skb))
5683 + goto send_now;
5684 +
5685 + if (icsk->icsk_ca_state != TCP_CA_Open)
5686 +@@ -1888,7 +1935,7 @@ static int tcp_mtu_probe(struct sock *sk)
5687 + * Returns true, if no segments are in flight and we have queued segments,
5688 + * but cannot send anything now because of SWS or another problem.
5689 + */
5690 +-static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5691 ++bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5692 + int push_one, gfp_t gfp)
5693 + {
5694 + struct tcp_sock *tp = tcp_sk(sk);
5695 +@@ -1900,7 +1947,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
5696 +
5697 + sent_pkts = 0;
5698 +
5699 +- if (!push_one) {
5700 ++ /* pmtu probing is not yet supported with MPTCP. It should be possible by
5701 ++ * exiting the loop inside tcp_mtu_probe early, making sure that only a
5702 ++ * single DSS-mapping gets probed.
5703 ++ */
5704 ++ if (!push_one && !mptcp(tp)) {
5705 + /* Do MTU probing. */
5706 + result = tcp_mtu_probe(sk);
5707 + if (!result) {
5708 +@@ -2099,7 +2150,8 @@ void tcp_send_loss_probe(struct sock *sk)
5709 + int err = -1;
5710 +
5711 + if (tcp_send_head(sk) != NULL) {
5712 +- err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
5713 ++ err = tp->ops->write_xmit(sk, mss, TCP_NAGLE_OFF, 2,
5714 ++ GFP_ATOMIC);
5715 + goto rearm_timer;
5716 + }
5717 +
5718 +@@ -2159,8 +2211,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
5719 + if (unlikely(sk->sk_state == TCP_CLOSE))
5720 + return;
5721 +
5722 +- if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
5723 +- sk_gfp_atomic(sk, GFP_ATOMIC)))
5724 ++ if (tcp_sk(sk)->ops->write_xmit(sk, cur_mss, nonagle, 0,
5725 ++ sk_gfp_atomic(sk, GFP_ATOMIC)))
5726 + tcp_check_probe_timer(sk);
5727 + }
5728 +
5729 +@@ -2173,7 +2225,8 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
5730 +
5731 + BUG_ON(!skb || skb->len < mss_now);
5732 +
5733 +- tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
5734 ++ tcp_sk(sk)->ops->write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1,
5735 ++ sk->sk_allocation);
5736 + }
5737 +
5738 + /* This function returns the amount that we can raise the
5739 +@@ -2386,6 +2439,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
5740 + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
5741 + return;
5742 +
5743 ++ /* Currently not supported for MPTCP - but it should be possible */
5744 ++ if (mptcp(tp))
5745 ++ return;
5746 ++
5747 + tcp_for_write_queue_from_safe(skb, tmp, sk) {
5748 + if (!tcp_can_collapse(sk, skb))
5749 + break;
5750 +@@ -2843,7 +2900,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
5751 +
5752 + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
5753 + th->window = htons(min(req->rcv_wnd, 65535U));
5754 +- tcp_options_write((__be32 *)(th + 1), tp, &opts);
5755 ++ tcp_options_write((__be32 *)(th + 1), tp, &opts, skb);
5756 + th->doff = (tcp_header_size >> 2);
5757 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
5758 +
5759 +@@ -2897,13 +2954,13 @@ static void tcp_connect_init(struct sock *sk)
5760 + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
5761 + tp->window_clamp = tcp_full_space(sk);
5762 +
5763 +- tcp_select_initial_window(tcp_full_space(sk),
5764 +- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5765 +- &tp->rcv_wnd,
5766 +- &tp->window_clamp,
5767 +- sysctl_tcp_window_scaling,
5768 +- &rcv_wscale,
5769 +- dst_metric(dst, RTAX_INITRWND));
5770 ++ tp->ops->select_initial_window(tcp_full_space(sk),
5771 ++ tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
5772 ++ &tp->rcv_wnd,
5773 ++ &tp->window_clamp,
5774 ++ sysctl_tcp_window_scaling,
5775 ++ &rcv_wscale,
5776 ++ dst_metric(dst, RTAX_INITRWND), sk);
5777 +
5778 + tp->rx_opt.rcv_wscale = rcv_wscale;
5779 + tp->rcv_ssthresh = tp->rcv_wnd;
5780 +@@ -2927,6 +2984,36 @@ static void tcp_connect_init(struct sock *sk)
5781 + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
5782 + inet_csk(sk)->icsk_retransmits = 0;
5783 + tcp_clear_retrans(tp);
5784 ++
5785 ++#ifdef CONFIG_MPTCP
5786 ++ if (sysctl_mptcp_enabled && mptcp_doit(sk)) {
5787 ++ if (is_master_tp(tp)) {
5788 ++ tp->request_mptcp = 1;
5789 ++ mptcp_connect_init(sk);
5790 ++ } else if (tp->mptcp) {
5791 ++ struct inet_sock *inet = inet_sk(sk);
5792 ++
5793 ++ tp->mptcp->snt_isn = tp->write_seq;
5794 ++ tp->mptcp->init_rcv_wnd = tp->rcv_wnd;
5795 ++
5796 ++ /* Set nonce for new subflows */
5797 ++ if (sk->sk_family == AF_INET)
5798 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v4_get_nonce(
5799 ++ inet->inet_saddr,
5800 ++ inet->inet_daddr,
5801 ++ inet->inet_sport,
5802 ++ inet->inet_dport);
5803 ++#if IS_ENABLED(CONFIG_IPV6)
5804 ++ else
5805 ++ tp->mptcp->mptcp_loc_nonce = mptcp_v6_get_nonce(
5806 ++ inet6_sk(sk)->saddr.s6_addr32,
5807 ++ sk->sk_v6_daddr.s6_addr32,
5808 ++ inet->inet_sport,
5809 ++ inet->inet_dport);
5810 ++#endif
5811 ++ }
5812 ++ }
5813 ++#endif
5814 + }
5815 +
5816 + static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
5817 +@@ -3176,6 +3263,7 @@ void tcp_send_ack(struct sock *sk)
5818 + TCP_SKB_CB(buff)->when = tcp_time_stamp;
5819 + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
5820 + }
5821 ++EXPORT_SYMBOL(tcp_send_ack);
5822 +
5823 + /* This routine sends a packet with an out of date sequence
5824 + * number. It assumes the other end will try to ack it.
5825 +@@ -3188,7 +3276,7 @@ void tcp_send_ack(struct sock *sk)
5826 + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
5827 + * out-of-date with SND.UNA-1 to probe window.
5828 + */
5829 +-static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5830 ++int tcp_xmit_probe_skb(struct sock *sk, int urgent)
5831 + {
5832 + struct tcp_sock *tp = tcp_sk(sk);
5833 + struct sk_buff *skb;
5834 +@@ -3270,7 +3358,7 @@ void tcp_send_probe0(struct sock *sk)
5835 + struct tcp_sock *tp = tcp_sk(sk);
5836 + int err;
5837 +
5838 +- err = tcp_write_wakeup(sk);
5839 ++ err = tp->ops->write_wakeup(sk);
5840 +
5841 + if (tp->packets_out || !tcp_send_head(sk)) {
5842 + /* Cancel probe timer, if it is not required. */
5843 +@@ -3301,3 +3389,18 @@ void tcp_send_probe0(struct sock *sk)
5844 + TCP_RTO_MAX);
5845 + }
5846 + }
5847 ++
5848 ++int tcp_rtx_synack(struct sock *sk, struct request_sock *req)
5849 ++{
5850 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
5851 ++ struct flowi fl;
5852 ++ int res;
5853 ++
5854 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
5855 ++ if (!res) {
5856 ++ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
5857 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
5858 ++ }
5859 ++ return res;
5860 ++}
5861 ++EXPORT_SYMBOL(tcp_rtx_synack);
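One detail worth noting in the tcp_output.c hunks is the reworked SACK sizing in tcp_established_options(): since MPTCP options may consume most of the 40-byte TCP option space, the remaining room can drop below TCPOLEN_SACK_BASE_ALIGNED, so the number of SACK blocks has to be clamped to zero instead of letting the unsigned subtraction wrap around. The small userspace sketch below reproduces that computation with the usual option-length constants; it is an illustration of the arithmetic, not kernel code.

    #include <stdio.h>

    #define MAX_TCP_OPTION_SPACE            40
    #define TCPOLEN_SACK_BASE_ALIGNED       4
    #define TCPOLEN_SACK_PERBLOCK           8

    /* How many SACK blocks still fit, given the option bytes already used. */
    static unsigned int sack_blocks(unsigned int size_used, unsigned int eff_sacks)
    {
            unsigned int remaining = MAX_TCP_OPTION_SPACE - size_used;
            unsigned int blocks;

            if (remaining < TCPOLEN_SACK_BASE_ALIGNED)
                    return 0;       /* not even room for the SACK header */

            blocks = (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK;
            return blocks < eff_sacks ? blocks : eff_sacks;
    }

    int main(void)
    {
            /* 12 bytes of timestamps used: room for 3 of the 4 wanted blocks. */
            printf("%u\n", sack_blocks(12, 4));
            /* 38 bytes used (e.g. a large MPTCP DSS option): clamp to 0. */
            printf("%u\n", sack_blocks(38, 4));
            return 0;
    }
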
5862 +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
5863 +index 286227abed10..966b873cbf3e 100644
5864 +--- a/net/ipv4/tcp_timer.c
5865 ++++ b/net/ipv4/tcp_timer.c
5866 +@@ -20,6 +20,7 @@
5867 +
5868 + #include <linux/module.h>
5869 + #include <linux/gfp.h>
5870 ++#include <net/mptcp.h>
5871 + #include <net/tcp.h>
5872 +
5873 + int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
5874 +@@ -32,7 +33,7 @@ int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
5875 + int sysctl_tcp_orphan_retries __read_mostly;
5876 + int sysctl_tcp_thin_linear_timeouts __read_mostly;
5877 +
5878 +-static void tcp_write_err(struct sock *sk)
5879 ++void tcp_write_err(struct sock *sk)
5880 + {
5881 + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
5882 + sk->sk_error_report(sk);
5883 +@@ -74,7 +75,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset)
5884 + (!tp->snd_wnd && !tp->packets_out))
5885 + do_reset = 1;
5886 + if (do_reset)
5887 +- tcp_send_active_reset(sk, GFP_ATOMIC);
5888 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
5889 + tcp_done(sk);
5890 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
5891 + return 1;
5892 +@@ -124,10 +125,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
5893 + * retransmissions with an initial RTO of TCP_RTO_MIN or TCP_TIMEOUT_INIT if
5894 + * syn_set flag is set.
5895 + */
5896 +-static bool retransmits_timed_out(struct sock *sk,
5897 +- unsigned int boundary,
5898 +- unsigned int timeout,
5899 +- bool syn_set)
5900 ++bool retransmits_timed_out(struct sock *sk, unsigned int boundary,
5901 ++ unsigned int timeout, bool syn_set)
5902 + {
5903 + unsigned int linear_backoff_thresh, start_ts;
5904 + unsigned int rto_base = syn_set ? TCP_TIMEOUT_INIT : TCP_RTO_MIN;
5905 +@@ -153,7 +152,7 @@ static bool retransmits_timed_out(struct sock *sk,
5906 + }
5907 +
5908 + /* A write timeout has occurred. Process the after effects. */
5909 +-static int tcp_write_timeout(struct sock *sk)
5910 ++int tcp_write_timeout(struct sock *sk)
5911 + {
5912 + struct inet_connection_sock *icsk = inet_csk(sk);
5913 + struct tcp_sock *tp = tcp_sk(sk);
5914 +@@ -171,6 +170,10 @@ static int tcp_write_timeout(struct sock *sk)
5915 + }
5916 + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
5917 + syn_set = true;
5918 ++ /* Stop retransmitting MP_CAPABLE options in SYN if timed out. */
5919 ++ if (tcp_sk(sk)->request_mptcp &&
5920 ++ icsk->icsk_retransmits >= mptcp_sysctl_syn_retries())
5921 ++ tcp_sk(sk)->request_mptcp = 0;
5922 + } else {
5923 + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
5924 + /* Black hole detection */
5925 +@@ -251,18 +254,22 @@ out:
5926 + static void tcp_delack_timer(unsigned long data)
5927 + {
5928 + struct sock *sk = (struct sock *)data;
5929 ++ struct tcp_sock *tp = tcp_sk(sk);
5930 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
5931 +
5932 +- bh_lock_sock(sk);
5933 +- if (!sock_owned_by_user(sk)) {
5934 ++ bh_lock_sock(meta_sk);
5935 ++ if (!sock_owned_by_user(meta_sk)) {
5936 + tcp_delack_timer_handler(sk);
5937 + } else {
5938 + inet_csk(sk)->icsk_ack.blocked = 1;
5939 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
5940 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_DELAYEDACKLOCKED);
5941 + /* deleguate our work to tcp_release_cb() */
5942 + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5943 + sock_hold(sk);
5944 ++ if (mptcp(tp))
5945 ++ mptcp_tsq_flags(sk);
5946 + }
5947 +- bh_unlock_sock(sk);
5948 ++ bh_unlock_sock(meta_sk);
5949 + sock_put(sk);
5950 + }
5951 +
5952 +@@ -479,6 +486,10 @@ out_reset_timer:
5953 + __sk_dst_reset(sk);
5954 +
5955 + out:;
5956 ++ if (mptcp(tp)) {
5957 ++ mptcp_reinject_data(sk, 1);
5958 ++ mptcp_set_rto(sk);
5959 ++ }
5960 + }
5961 +
5962 + void tcp_write_timer_handler(struct sock *sk)
5963 +@@ -505,7 +516,7 @@ void tcp_write_timer_handler(struct sock *sk)
5964 + break;
5965 + case ICSK_TIME_RETRANS:
5966 + icsk->icsk_pending = 0;
5967 +- tcp_retransmit_timer(sk);
5968 ++ tcp_sk(sk)->ops->retransmit_timer(sk);
5969 + break;
5970 + case ICSK_TIME_PROBE0:
5971 + icsk->icsk_pending = 0;
5972 +@@ -520,16 +531,19 @@ out:
5973 + static void tcp_write_timer(unsigned long data)
5974 + {
5975 + struct sock *sk = (struct sock *)data;
5976 ++ struct sock *meta_sk = mptcp(tcp_sk(sk)) ? mptcp_meta_sk(sk) : sk;
5977 +
5978 +- bh_lock_sock(sk);
5979 +- if (!sock_owned_by_user(sk)) {
5980 ++ bh_lock_sock(meta_sk);
5981 ++ if (!sock_owned_by_user(meta_sk)) {
5982 + tcp_write_timer_handler(sk);
5983 + } else {
5984 + /* deleguate our work to tcp_release_cb() */
5985 + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
5986 + sock_hold(sk);
5987 ++ if (mptcp(tcp_sk(sk)))
5988 ++ mptcp_tsq_flags(sk);
5989 + }
5990 +- bh_unlock_sock(sk);
5991 ++ bh_unlock_sock(meta_sk);
5992 + sock_put(sk);
5993 + }
5994 +
5995 +@@ -566,11 +580,12 @@ static void tcp_keepalive_timer (unsigned long data)
5996 + struct sock *sk = (struct sock *) data;
5997 + struct inet_connection_sock *icsk = inet_csk(sk);
5998 + struct tcp_sock *tp = tcp_sk(sk);
5999 ++ struct sock *meta_sk = mptcp(tp) ? mptcp_meta_sk(sk) : sk;
6000 + u32 elapsed;
6001 +
6002 + /* Only process if socket is not in use. */
6003 +- bh_lock_sock(sk);
6004 +- if (sock_owned_by_user(sk)) {
6005 ++ bh_lock_sock(meta_sk);
6006 ++ if (sock_owned_by_user(meta_sk)) {
6007 + /* Try again later. */
6008 + inet_csk_reset_keepalive_timer (sk, HZ/20);
6009 + goto out;
6010 +@@ -581,16 +596,38 @@ static void tcp_keepalive_timer (unsigned long data)
6011 + goto out;
6012 + }
6013 +
6014 ++ if (tp->send_mp_fclose) {
6015 ++ /* MUST do this before tcp_write_timeout, because retrans_stamp
6016 ++ * may have been set to 0 elsewhere while we are retransmitting
6017 ++ * MP_FASTCLOSE. We would then crash, because retransmits_timed_out
6018 ++ * accesses the meta write-queue.
6019 ++ *
6020 ++ * We make sure that the timestamp is != 0.
6021 ++ */
6022 ++ if (!tp->retrans_stamp)
6023 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
6024 ++
6025 ++ if (tcp_write_timeout(sk))
6026 ++ goto out;
6027 ++
6028 ++ tcp_send_ack(sk);
6029 ++ icsk->icsk_retransmits++;
6030 ++
6031 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
6032 ++ elapsed = icsk->icsk_rto;
6033 ++ goto resched;
6034 ++ }
6035 ++
6036 + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
6037 + if (tp->linger2 >= 0) {
6038 + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
6039 +
6040 + if (tmo > 0) {
6041 +- tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
6042 ++ tp->ops->time_wait(sk, TCP_FIN_WAIT2, tmo);
6043 + goto out;
6044 + }
6045 + }
6046 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6047 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6048 + goto death;
6049 + }
6050 +
6051 +@@ -614,11 +651,11 @@ static void tcp_keepalive_timer (unsigned long data)
6052 + icsk->icsk_probes_out > 0) ||
6053 + (icsk->icsk_user_timeout == 0 &&
6054 + icsk->icsk_probes_out >= keepalive_probes(tp))) {
6055 +- tcp_send_active_reset(sk, GFP_ATOMIC);
6056 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
6057 + tcp_write_err(sk);
6058 + goto out;
6059 + }
6060 +- if (tcp_write_wakeup(sk) <= 0) {
6061 ++ if (tp->ops->write_wakeup(sk) <= 0) {
6062 + icsk->icsk_probes_out++;
6063 + elapsed = keepalive_intvl_when(tp);
6064 + } else {
6065 +@@ -642,7 +679,7 @@ death:
6066 + tcp_done(sk);
6067 +
6068 + out:
6069 +- bh_unlock_sock(sk);
6070 ++ bh_unlock_sock(meta_sk);
6071 + sock_put(sk);
6072 + }
6073 +
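The keepalive-timer hunk above retransmits MP_FASTCLOSE with the usual exponential backoff: icsk_rto is doubled on every attempt and capped at TCP_RTO_MAX, mirroring the ordinary retransmit timer. The sketch below shows that backoff in isolation; the tick values are arbitrary and chosen only for the example.

    #include <stdio.h>

    #define TCP_RTO_MAX     (120 * 100)     /* 120 s at an assumed HZ of 100 */

    /* Double the RTO, but never beyond the cap. */
    static unsigned int next_rto(unsigned int rto)
    {
            unsigned int doubled = rto << 1;

            return doubled < TCP_RTO_MAX ? doubled : TCP_RTO_MAX;
    }

    int main(void)
    {
            unsigned int rto = 100;         /* 1 s initial RTO in these units */
            int attempt;

            for (attempt = 1; attempt <= 10; attempt++) {
                    rto = next_rto(rto);
                    printf("attempt %d: rto = %u ticks\n", attempt, rto);
            }
            return 0;
    }
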
6074 +diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
6075 +index 5667b3003af9..7139c2973fd2 100644
6076 +--- a/net/ipv6/addrconf.c
6077 ++++ b/net/ipv6/addrconf.c
6078 +@@ -760,6 +760,7 @@ void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
6079 +
6080 + kfree_rcu(ifp, rcu);
6081 + }
6082 ++EXPORT_SYMBOL(inet6_ifa_finish_destroy);
6083 +
6084 + static void
6085 + ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
6086 +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
6087 +index 7cb4392690dd..7057afbca4df 100644
6088 +--- a/net/ipv6/af_inet6.c
6089 ++++ b/net/ipv6/af_inet6.c
6090 +@@ -97,8 +97,7 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
6091 + return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
6092 + }
6093 +
6094 +-static int inet6_create(struct net *net, struct socket *sock, int protocol,
6095 +- int kern)
6096 ++int inet6_create(struct net *net, struct socket *sock, int protocol, int kern)
6097 + {
6098 + struct inet_sock *inet;
6099 + struct ipv6_pinfo *np;
6100 +diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
6101 +index a245e5ddffbd..99c892b8992d 100644
6102 +--- a/net/ipv6/inet6_connection_sock.c
6103 ++++ b/net/ipv6/inet6_connection_sock.c
6104 +@@ -96,8 +96,8 @@ struct dst_entry *inet6_csk_route_req(struct sock *sk,
6105 + /*
6106 + * request_sock (formerly open request) hash tables.
6107 + */
6108 +-static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6109 +- const u32 rnd, const u32 synq_hsize)
6110 ++u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
6111 ++ const u32 rnd, const u32 synq_hsize)
6112 + {
6113 + u32 c;
6114 +
6115 +diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
6116 +index edb58aff4ae7..ea4d9fda0927 100644
6117 +--- a/net/ipv6/ipv6_sockglue.c
6118 ++++ b/net/ipv6/ipv6_sockglue.c
6119 +@@ -48,6 +48,8 @@
6120 + #include <net/addrconf.h>
6121 + #include <net/inet_common.h>
6122 + #include <net/tcp.h>
6123 ++#include <net/mptcp.h>
6124 ++#include <net/mptcp_v4.h>
6125 + #include <net/udp.h>
6126 + #include <net/udplite.h>
6127 + #include <net/xfrm.h>
6128 +@@ -196,7 +198,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
6129 + sock_prot_inuse_add(net, &tcp_prot, 1);
6130 + local_bh_enable();
6131 + sk->sk_prot = &tcp_prot;
6132 +- icsk->icsk_af_ops = &ipv4_specific;
6133 ++#ifdef CONFIG_MPTCP
6134 ++ if (is_mptcp_enabled(sk))
6135 ++ icsk->icsk_af_ops = &mptcp_v4_specific;
6136 ++ else
6137 ++#endif
6138 ++ icsk->icsk_af_ops = &ipv4_specific;
6139 + sk->sk_socket->ops = &inet_stream_ops;
6140 + sk->sk_family = PF_INET;
6141 + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
6142 +diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
6143 +index a822b880689b..b2b38869d795 100644
6144 +--- a/net/ipv6/syncookies.c
6145 ++++ b/net/ipv6/syncookies.c
6146 +@@ -181,13 +181,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6147 +
6148 + /* check for timestamp cookie support */
6149 + memset(&tcp_opt, 0, sizeof(tcp_opt));
6150 +- tcp_parse_options(skb, &tcp_opt, 0, NULL);
6151 ++ tcp_parse_options(skb, &tcp_opt, NULL, 0, NULL);
6152 +
6153 + if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok))
6154 + goto out;
6155 +
6156 + ret = NULL;
6157 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6158 ++ req = inet_reqsk_alloc(&tcp6_request_sock_ops);
6159 + if (!req)
6160 + goto out;
6161 +
6162 +@@ -255,10 +255,10 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
6163 + }
6164 +
6165 + req->window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
6166 +- tcp_select_initial_window(tcp_full_space(sk), req->mss,
6167 +- &req->rcv_wnd, &req->window_clamp,
6168 +- ireq->wscale_ok, &rcv_wscale,
6169 +- dst_metric(dst, RTAX_INITRWND));
6170 ++ tp->ops->select_initial_window(tcp_full_space(sk), req->mss,
6171 ++ &req->rcv_wnd, &req->window_clamp,
6172 ++ ireq->wscale_ok, &rcv_wscale,
6173 ++ dst_metric(dst, RTAX_INITRWND), sk);
6174 +
6175 + ireq->rcv_wscale = rcv_wscale;
6176 +
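The syncookies and tcp_output.c changes route window selection through tp->ops->select_initial_window() and pass the socket along, so that an MPTCP-specific implementation can take meta-level state into account. The simplified sketch below shows the core job such a helper has: pick a window-scale shift so the receive buffer fits into the 16-bit TCP window field. It is a sketch of the idea under assumed constants, not the kernel's exact policy.

    #include <stdio.h>

    #define U16_MAX         65535u
    #define TCP_MAX_WSCALE  14

    static void select_initial_window(unsigned int space, int wscale_ok,
                                      unsigned int *rcv_wnd, unsigned int *wscale)
    {
            *wscale = 0;

            if (wscale_ok) {
                    /* Raise the scale until the space fits in 16 bits. */
                    while ((space >> *wscale) > U16_MAX && *wscale < TCP_MAX_WSCALE)
                            (*wscale)++;
            } else if (space > U16_MAX) {
                    space = U16_MAX;        /* no scaling: clamp to the field width */
            }

            *rcv_wnd = space;
    }

    int main(void)
    {
            unsigned int wnd, ws;

            select_initial_window(6 * 1024 * 1024, 1, &wnd, &ws);   /* 6 MB buffer */
            printf("rcv_wnd=%u wscale=%u advertised=%u\n", wnd, ws, wnd >> ws);
            return 0;
    }
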
6177 +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
6178 +index 229239ad96b1..fda94d71666e 100644
6179 +--- a/net/ipv6/tcp_ipv6.c
6180 ++++ b/net/ipv6/tcp_ipv6.c
6181 +@@ -63,6 +63,8 @@
6182 + #include <net/inet_common.h>
6183 + #include <net/secure_seq.h>
6184 + #include <net/tcp_memcontrol.h>
6185 ++#include <net/mptcp.h>
6186 ++#include <net/mptcp_v6.h>
6187 + #include <net/busy_poll.h>
6188 +
6189 + #include <linux/proc_fs.h>
6190 +@@ -71,12 +73,6 @@
6191 + #include <linux/crypto.h>
6192 + #include <linux/scatterlist.h>
6193 +
6194 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb);
6195 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6196 +- struct request_sock *req);
6197 +-
6198 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
6199 +-
6200 + static const struct inet_connection_sock_af_ops ipv6_mapped;
6201 + static const struct inet_connection_sock_af_ops ipv6_specific;
6202 + #ifdef CONFIG_TCP_MD5SIG
6203 +@@ -90,7 +86,7 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
6204 + }
6205 + #endif
6206 +
6207 +-static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6208 ++void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6209 + {
6210 + struct dst_entry *dst = skb_dst(skb);
6211 + const struct rt6_info *rt = (const struct rt6_info *)dst;
6212 +@@ -102,10 +98,11 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
6213 + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
6214 + }
6215 +
6216 +-static void tcp_v6_hash(struct sock *sk)
6217 ++void tcp_v6_hash(struct sock *sk)
6218 + {
6219 + if (sk->sk_state != TCP_CLOSE) {
6220 +- if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) {
6221 ++ if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped ||
6222 ++ inet_csk(sk)->icsk_af_ops == &mptcp_v6_mapped) {
6223 + tcp_prot.hash(sk);
6224 + return;
6225 + }
6226 +@@ -115,7 +112,7 @@ static void tcp_v6_hash(struct sock *sk)
6227 + }
6228 + }
6229 +
6230 +-static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6231 ++__u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6232 + {
6233 + return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
6234 + ipv6_hdr(skb)->saddr.s6_addr32,
6235 +@@ -123,7 +120,7 @@ static __u32 tcp_v6_init_sequence(const struct sk_buff *skb)
6236 + tcp_hdr(skb)->source);
6237 + }
6238 +
6239 +-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6240 ++int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6241 + int addr_len)
6242 + {
6243 + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
6244 +@@ -215,7 +212,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6245 + sin.sin_port = usin->sin6_port;
6246 + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
6247 +
6248 +- icsk->icsk_af_ops = &ipv6_mapped;
6249 ++#ifdef CONFIG_MPTCP
6250 ++ if (is_mptcp_enabled(sk))
6251 ++ icsk->icsk_af_ops = &mptcp_v6_mapped;
6252 ++ else
6253 ++#endif
6254 ++ icsk->icsk_af_ops = &ipv6_mapped;
6255 + sk->sk_backlog_rcv = tcp_v4_do_rcv;
6256 + #ifdef CONFIG_TCP_MD5SIG
6257 + tp->af_specific = &tcp_sock_ipv6_mapped_specific;
6258 +@@ -225,7 +227,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
6259 +
6260 + if (err) {
6261 + icsk->icsk_ext_hdr_len = exthdrlen;
6262 +- icsk->icsk_af_ops = &ipv6_specific;
6263 ++#ifdef CONFIG_MPTCP
6264 ++ if (is_mptcp_enabled(sk))
6265 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6266 ++ else
6267 ++#endif
6268 ++ icsk->icsk_af_ops = &ipv6_specific;
6269 + sk->sk_backlog_rcv = tcp_v6_do_rcv;
6270 + #ifdef CONFIG_TCP_MD5SIG
6271 + tp->af_specific = &tcp_sock_ipv6_specific;
6272 +@@ -337,7 +344,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6273 + const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
6274 + const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
6275 + struct ipv6_pinfo *np;
6276 +- struct sock *sk;
6277 ++ struct sock *sk, *meta_sk;
6278 + int err;
6279 + struct tcp_sock *tp;
6280 + struct request_sock *fastopen;
6281 +@@ -358,8 +365,14 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6282 + return;
6283 + }
6284 +
6285 +- bh_lock_sock(sk);
6286 +- if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
6287 ++ tp = tcp_sk(sk);
6288 ++ if (mptcp(tp))
6289 ++ meta_sk = mptcp_meta_sk(sk);
6290 ++ else
6291 ++ meta_sk = sk;
6292 ++
6293 ++ bh_lock_sock(meta_sk);
6294 ++ if (sock_owned_by_user(meta_sk) && type != ICMPV6_PKT_TOOBIG)
6295 + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
6296 +
6297 + if (sk->sk_state == TCP_CLOSE)
6298 +@@ -370,7 +383,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6299 + goto out;
6300 + }
6301 +
6302 +- tp = tcp_sk(sk);
6303 + seq = ntohl(th->seq);
6304 + /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
6305 + fastopen = tp->fastopen_rsk;
6306 +@@ -403,11 +415,15 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6307 + goto out;
6308 +
6309 + tp->mtu_info = ntohl(info);
6310 +- if (!sock_owned_by_user(sk))
6311 ++ if (!sock_owned_by_user(meta_sk))
6312 + tcp_v6_mtu_reduced(sk);
6313 +- else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6314 ++ else {
6315 ++ if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
6316 + &tp->tsq_flags))
6317 +- sock_hold(sk);
6318 ++ sock_hold(sk);
6319 ++ if (mptcp(tp))
6320 ++ mptcp_tsq_flags(sk);
6321 ++ }
6322 + goto out;
6323 + }
6324 +
6325 +@@ -417,7 +433,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6326 + switch (sk->sk_state) {
6327 + struct request_sock *req, **prev;
6328 + case TCP_LISTEN:
6329 +- if (sock_owned_by_user(sk))
6330 ++ if (sock_owned_by_user(meta_sk))
6331 + goto out;
6332 +
6333 + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr,
6334 +@@ -447,7 +463,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6335 + if (fastopen && fastopen->sk == NULL)
6336 + break;
6337 +
6338 +- if (!sock_owned_by_user(sk)) {
6339 ++ if (!sock_owned_by_user(meta_sk)) {
6340 + sk->sk_err = err;
6341 + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
6342 +
6343 +@@ -457,26 +473,27 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
6344 + goto out;
6345 + }
6346 +
6347 +- if (!sock_owned_by_user(sk) && np->recverr) {
6348 ++ if (!sock_owned_by_user(meta_sk) && np->recverr) {
6349 + sk->sk_err = err;
6350 + sk->sk_error_report(sk);
6351 + } else
6352 + sk->sk_err_soft = err;
6353 +
6354 + out:
6355 +- bh_unlock_sock(sk);
6356 ++ bh_unlock_sock(meta_sk);
6357 + sock_put(sk);
6358 + }
6359 +
6360 +
6361 +-static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6362 +- struct flowi6 *fl6,
6363 +- struct request_sock *req,
6364 +- u16 queue_mapping,
6365 +- struct tcp_fastopen_cookie *foc)
6366 ++int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6367 ++ struct flowi *fl,
6368 ++ struct request_sock *req,
6369 ++ u16 queue_mapping,
6370 ++ struct tcp_fastopen_cookie *foc)
6371 + {
6372 + struct inet_request_sock *ireq = inet_rsk(req);
6373 + struct ipv6_pinfo *np = inet6_sk(sk);
6374 ++ struct flowi6 *fl6 = &fl->u.ip6;
6375 + struct sk_buff *skb;
6376 + int err = -ENOMEM;
6377 +
6378 +@@ -497,18 +514,21 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst,
6379 + skb_set_queue_mapping(skb, queue_mapping);
6380 + err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
6381 + err = net_xmit_eval(err);
6382 ++ if (!tcp_rsk(req)->snt_synack && !err)
6383 ++ tcp_rsk(req)->snt_synack = tcp_time_stamp;
6384 + }
6385 +
6386 + done:
6387 + return err;
6388 + }
6389 +
6390 +-static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6391 ++int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6392 + {
6393 +- struct flowi6 fl6;
6394 ++ const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
6395 ++ struct flowi fl;
6396 + int res;
6397 +
6398 +- res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0, NULL);
6399 ++ res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
6400 + if (!res) {
6401 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
6402 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
6403 +@@ -516,7 +536,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req)
6404 + return res;
6405 + }
6406 +
6407 +-static void tcp_v6_reqsk_destructor(struct request_sock *req)
6408 ++void tcp_v6_reqsk_destructor(struct request_sock *req)
6409 + {
6410 + kfree_skb(inet_rsk(req)->pktopts);
6411 + }
6412 +@@ -718,27 +738,74 @@ static int tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
6413 + }
6414 + #endif
6415 +
6416 ++static int tcp_v6_init_req(struct request_sock *req, struct sock *sk,
6417 ++ struct sk_buff *skb)
6418 ++{
6419 ++ struct inet_request_sock *ireq = inet_rsk(req);
6420 ++ struct ipv6_pinfo *np = inet6_sk(sk);
6421 ++
6422 ++ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6423 ++ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6424 ++
6425 ++ ireq->ir_iif = sk->sk_bound_dev_if;
6426 ++ ireq->ir_mark = inet_request_mark(sk, skb);
6427 ++
6428 ++ /* So that link locals have meaning */
6429 ++ if (!sk->sk_bound_dev_if &&
6430 ++ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6431 ++ ireq->ir_iif = inet6_iif(skb);
6432 ++
6433 ++ if (!TCP_SKB_CB(skb)->when &&
6434 ++ (ipv6_opt_accepted(sk, skb) || np->rxopt.bits.rxinfo ||
6435 ++ np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
6436 ++ np->rxopt.bits.rxohlim || np->repflow)) {
6437 ++ atomic_inc(&skb->users);
6438 ++ ireq->pktopts = skb;
6439 ++ }
6440 ++
6441 ++ return 0;
6442 ++}
6443 ++
6444 ++static struct dst_entry *tcp_v6_route_req(struct sock *sk, struct flowi *fl,
6445 ++ const struct request_sock *req,
6446 ++ bool *strict)
6447 ++{
6448 ++ if (strict)
6449 ++ *strict = true;
6450 ++ return inet6_csk_route_req(sk, &fl->u.ip6, req);
6451 ++}
6452 ++
6453 + struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
6454 + .family = AF_INET6,
6455 + .obj_size = sizeof(struct tcp6_request_sock),
6456 +- .rtx_syn_ack = tcp_v6_rtx_synack,
6457 ++ .rtx_syn_ack = tcp_rtx_synack,
6458 + .send_ack = tcp_v6_reqsk_send_ack,
6459 + .destructor = tcp_v6_reqsk_destructor,
6460 + .send_reset = tcp_v6_send_reset,
6461 + .syn_ack_timeout = tcp_syn_ack_timeout,
6462 + };
6463 +
6464 ++const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6465 ++ .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
6466 ++ sizeof(struct ipv6hdr),
6467 + #ifdef CONFIG_TCP_MD5SIG
6468 +-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
6469 + .md5_lookup = tcp_v6_reqsk_md5_lookup,
6470 + .calc_md5_hash = tcp_v6_md5_hash_skb,
6471 +-};
6472 + #endif
6473 ++ .init_req = tcp_v6_init_req,
6474 ++#ifdef CONFIG_SYN_COOKIES
6475 ++ .cookie_init_seq = cookie_v6_init_sequence,
6476 ++#endif
6477 ++ .route_req = tcp_v6_route_req,
6478 ++ .init_seq = tcp_v6_init_sequence,
6479 ++ .send_synack = tcp_v6_send_synack,
6480 ++ .queue_hash_add = inet6_csk_reqsk_queue_hash_add,
6481 ++};
6482 +
6483 +-static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6484 +- u32 tsval, u32 tsecr, int oif,
6485 +- struct tcp_md5sig_key *key, int rst, u8 tclass,
6486 +- u32 label)
6487 ++static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack,
6488 ++ u32 data_ack, u32 win, u32 tsval, u32 tsecr,
6489 ++ int oif, struct tcp_md5sig_key *key, int rst,
6490 ++ u8 tclass, u32 label, int mptcp)
6491 + {
6492 + const struct tcphdr *th = tcp_hdr(skb);
6493 + struct tcphdr *t1;
6494 +@@ -756,7 +823,10 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6495 + if (key)
6496 + tot_len += TCPOLEN_MD5SIG_ALIGNED;
6497 + #endif
6498 +-
6499 ++#ifdef CONFIG_MPTCP
6500 ++ if (mptcp)
6501 ++ tot_len += MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK;
6502 ++#endif
6503 + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
6504 + GFP_ATOMIC);
6505 + if (buff == NULL)
6506 +@@ -794,6 +864,17 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6507 + tcp_v6_md5_hash_hdr((__u8 *)topt, key,
6508 + &ipv6_hdr(skb)->saddr,
6509 + &ipv6_hdr(skb)->daddr, t1);
6510 ++ topt += 4;
6511 ++ }
6512 ++#endif
6513 ++#ifdef CONFIG_MPTCP
6514 ++ if (mptcp) {
6515 ++ /* Construction of 32-bit data_ack */
6516 ++ *topt++ = htonl((TCPOPT_MPTCP << 24) |
6517 ++ ((MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
6518 ++ (0x20 << 8) |
6519 ++ (0x01));
6520 ++ *topt++ = htonl(data_ack);
6521 + }
6522 + #endif
6523 +
6524 +@@ -834,7 +915,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
6525 + kfree_skb(buff);
6526 + }
6527 +
6528 +-static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6529 ++void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6530 + {
6531 + const struct tcphdr *th = tcp_hdr(skb);
6532 + u32 seq = 0, ack_seq = 0;
6533 +@@ -891,7 +972,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
6534 + (th->doff << 2);
6535 +
6536 + oif = sk ? sk->sk_bound_dev_if : 0;
6537 +- tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
6538 ++ tcp_v6_send_response(skb, seq, ack_seq, 0, 0, 0, 0, oif, key, 1, 0, 0, 0);
6539 +
6540 + #ifdef CONFIG_TCP_MD5SIG
6541 + release_sk1:
6542 +@@ -902,45 +983,52 @@ release_sk1:
6543 + #endif
6544 + }
6545 +
6546 +-static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
6547 ++static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 data_ack,
6548 + u32 win, u32 tsval, u32 tsecr, int oif,
6549 + struct tcp_md5sig_key *key, u8 tclass,
6550 +- u32 label)
6551 ++ u32 label, int mptcp)
6552 + {
6553 +- tcp_v6_send_response(skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass,
6554 +- label);
6555 ++ tcp_v6_send_response(skb, seq, ack, data_ack, win, tsval, tsecr, oif,
6556 ++ key, 0, tclass, label, mptcp);
6557 + }
6558 +
6559 + static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
6560 + {
6561 + struct inet_timewait_sock *tw = inet_twsk(sk);
6562 + struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
6563 ++ u32 data_ack = 0;
6564 ++ int mptcp = 0;
6565 +
6566 ++ if (tcptw->mptcp_tw && tcptw->mptcp_tw->meta_tw) {
6567 ++ data_ack = (u32)tcptw->mptcp_tw->rcv_nxt;
6568 ++ mptcp = 1;
6569 ++ }
6570 + tcp_v6_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
6571 ++ data_ack,
6572 + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
6573 + tcp_time_stamp + tcptw->tw_ts_offset,
6574 + tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
6575 +- tw->tw_tclass, (tw->tw_flowlabel << 12));
6576 ++ tw->tw_tclass, (tw->tw_flowlabel << 12), mptcp);
6577 +
6578 + inet_twsk_put(tw);
6579 + }
6580 +
6581 +-static void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6582 +- struct request_sock *req)
6583 ++void tcp_v6_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
6584 ++ struct request_sock *req)
6585 + {
6586 + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
6587 + * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
6588 + */
6589 + tcp_v6_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
6590 + tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
6591 +- tcp_rsk(req)->rcv_nxt,
6592 ++ tcp_rsk(req)->rcv_nxt, 0,
6593 + req->rcv_wnd, tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
6594 + tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
6595 +- 0, 0);
6596 ++ 0, 0, 0);
6597 + }
6598 +
6599 +
6600 +-static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6601 ++struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6602 + {
6603 + struct request_sock *req, **prev;
6604 + const struct tcphdr *th = tcp_hdr(skb);
6605 +@@ -959,7 +1047,13 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6606 +
6607 + if (nsk) {
6608 + if (nsk->sk_state != TCP_TIME_WAIT) {
6609 ++ /* Don't lock again the meta-sk. It has been locked
6610 ++ * before mptcp_v6_do_rcv.
6611 ++ */
6612 ++ if (mptcp(tcp_sk(nsk)) && !is_meta_sk(sk))
6613 ++ bh_lock_sock(mptcp_meta_sk(nsk));
6614 + bh_lock_sock(nsk);
6615 ++
6616 + return nsk;
6617 + }
6618 + inet_twsk_put(inet_twsk(nsk));
6619 +@@ -973,161 +1067,25 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
6620 + return sk;
6621 + }
6622 +
6623 +-/* FIXME: this is substantially similar to the ipv4 code.
6624 +- * Can some kind of merge be done? -- erics
6625 +- */
6626 +-static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6627 ++int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
6628 + {
6629 +- struct tcp_options_received tmp_opt;
6630 +- struct request_sock *req;
6631 +- struct inet_request_sock *ireq;
6632 +- struct ipv6_pinfo *np = inet6_sk(sk);
6633 +- struct tcp_sock *tp = tcp_sk(sk);
6634 +- __u32 isn = TCP_SKB_CB(skb)->when;
6635 +- struct dst_entry *dst = NULL;
6636 +- struct tcp_fastopen_cookie foc = { .len = -1 };
6637 +- bool want_cookie = false, fastopen;
6638 +- struct flowi6 fl6;
6639 +- int err;
6640 +-
6641 + if (skb->protocol == htons(ETH_P_IP))
6642 + return tcp_v4_conn_request(sk, skb);
6643 +
6644 + if (!ipv6_unicast_destination(skb))
6645 + goto drop;
6646 +
6647 +- if ((sysctl_tcp_syncookies == 2 ||
6648 +- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
6649 +- want_cookie = tcp_syn_flood_action(sk, skb, "TCPv6");
6650 +- if (!want_cookie)
6651 +- goto drop;
6652 +- }
6653 +-
6654 +- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
6655 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
6656 +- goto drop;
6657 +- }
6658 +-
6659 +- req = inet6_reqsk_alloc(&tcp6_request_sock_ops);
6660 +- if (req == NULL)
6661 +- goto drop;
6662 +-
6663 +-#ifdef CONFIG_TCP_MD5SIG
6664 +- tcp_rsk(req)->af_specific = &tcp_request_sock_ipv6_ops;
6665 +-#endif
6666 +-
6667 +- tcp_clear_options(&tmp_opt);
6668 +- tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
6669 +- tmp_opt.user_mss = tp->rx_opt.user_mss;
6670 +- tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);
6671 +-
6672 +- if (want_cookie && !tmp_opt.saw_tstamp)
6673 +- tcp_clear_options(&tmp_opt);
6674 ++ return tcp_conn_request(&tcp6_request_sock_ops,
6675 ++ &tcp_request_sock_ipv6_ops, sk, skb);
6676 +
6677 +- tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
6678 +- tcp_openreq_init(req, &tmp_opt, skb);
6679 +-
6680 +- ireq = inet_rsk(req);
6681 +- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
6682 +- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
6683 +- if (!want_cookie || tmp_opt.tstamp_ok)
6684 +- TCP_ECN_create_request(req, skb, sock_net(sk));
6685 +-
6686 +- ireq->ir_iif = sk->sk_bound_dev_if;
6687 +- ireq->ir_mark = inet_request_mark(sk, skb);
6688 +-
6689 +- /* So that link locals have meaning */
6690 +- if (!sk->sk_bound_dev_if &&
6691 +- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
6692 +- ireq->ir_iif = inet6_iif(skb);
6693 +-
6694 +- if (!isn) {
6695 +- if (ipv6_opt_accepted(sk, skb) ||
6696 +- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
6697 +- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim ||
6698 +- np->repflow) {
6699 +- atomic_inc(&skb->users);
6700 +- ireq->pktopts = skb;
6701 +- }
6702 +-
6703 +- if (want_cookie) {
6704 +- isn = cookie_v6_init_sequence(sk, skb, &req->mss);
6705 +- req->cookie_ts = tmp_opt.tstamp_ok;
6706 +- goto have_isn;
6707 +- }
6708 +-
6709 +- /* VJ's idea. We save last timestamp seen
6710 +- * from the destination in peer table, when entering
6711 +- * state TIME-WAIT, and check against it before
6712 +- * accepting new connection request.
6713 +- *
6714 +- * If "isn" is not zero, this request hit alive
6715 +- * timewait bucket, so that all the necessary checks
6716 +- * are made in the function processing timewait state.
6717 +- */
6718 +- if (tmp_opt.saw_tstamp &&
6719 +- tcp_death_row.sysctl_tw_recycle &&
6720 +- (dst = inet6_csk_route_req(sk, &fl6, req)) != NULL) {
6721 +- if (!tcp_peer_is_proven(req, dst, true)) {
6722 +- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
6723 +- goto drop_and_release;
6724 +- }
6725 +- }
6726 +- /* Kill the following clause, if you dislike this way. */
6727 +- else if (!sysctl_tcp_syncookies &&
6728 +- (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
6729 +- (sysctl_max_syn_backlog >> 2)) &&
6730 +- !tcp_peer_is_proven(req, dst, false)) {
6731 +- /* Without syncookies last quarter of
6732 +- * backlog is filled with destinations,
6733 +- * proven to be alive.
6734 +- * It means that we continue to communicate
6735 +- * to destinations, already remembered
6736 +- * to the moment of synflood.
6737 +- */
6738 +- LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI6/%u\n",
6739 +- &ireq->ir_v6_rmt_addr, ntohs(tcp_hdr(skb)->source));
6740 +- goto drop_and_release;
6741 +- }
6742 +-
6743 +- isn = tcp_v6_init_sequence(skb);
6744 +- }
6745 +-have_isn:
6746 +-
6747 +- if (security_inet_conn_request(sk, skb, req))
6748 +- goto drop_and_release;
6749 +-
6750 +- if (!dst && (dst = inet6_csk_route_req(sk, &fl6, req)) == NULL)
6751 +- goto drop_and_free;
6752 +-
6753 +- tcp_rsk(req)->snt_isn = isn;
6754 +- tcp_rsk(req)->snt_synack = tcp_time_stamp;
6755 +- tcp_openreq_init_rwin(req, sk, dst);
6756 +- fastopen = !want_cookie &&
6757 +- tcp_try_fastopen(sk, skb, req, &foc, dst);
6758 +- err = tcp_v6_send_synack(sk, dst, &fl6, req,
6759 +- skb_get_queue_mapping(skb), &foc);
6760 +- if (!fastopen) {
6761 +- if (err || want_cookie)
6762 +- goto drop_and_free;
6763 +-
6764 +- tcp_rsk(req)->listener = NULL;
6765 +- inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
6766 +- }
6767 +- return 0;
6768 +-
6769 +-drop_and_release:
6770 +- dst_release(dst);
6771 +-drop_and_free:
6772 +- reqsk_free(req);
6773 + drop:
6774 + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
6775 + return 0; /* don't send reset */
6776 + }
6777 +
6778 +-static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6779 +- struct request_sock *req,
6780 +- struct dst_entry *dst)
6781 ++struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6782 ++ struct request_sock *req,
6783 ++ struct dst_entry *dst)
6784 + {
6785 + struct inet_request_sock *ireq;
6786 + struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
6787 +@@ -1165,7 +1123,12 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
6788 +
6789 + newsk->sk_v6_rcv_saddr = newnp->saddr;
6790 +
6791 +- inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6792 ++#ifdef CONFIG_MPTCP
6793 ++ if (is_mptcp_enabled(newsk))
6794 ++ inet_csk(newsk)->icsk_af_ops = &mptcp_v6_mapped;
6795 ++ else
6796 ++#endif
6797 ++ inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
6798 + newsk->sk_backlog_rcv = tcp_v4_do_rcv;
6799 + #ifdef CONFIG_TCP_MD5SIG
6800 + newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
6801 +@@ -1329,7 +1292,7 @@ out:
6802 + * This is because we cannot sleep with the original spinlock
6803 + * held.
6804 + */
6805 +-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6806 ++int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6807 + {
6808 + struct ipv6_pinfo *np = inet6_sk(sk);
6809 + struct tcp_sock *tp;
6810 +@@ -1351,6 +1314,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
6811 + goto discard;
6812 + #endif
6813 +
6814 ++ if (is_meta_sk(sk))
6815 ++ return mptcp_v6_do_rcv(sk, skb);
6816 ++
6817 + if (sk_filter(sk, skb))
6818 + goto discard;
6819 +
6820 +@@ -1472,7 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6821 + {
6822 + const struct tcphdr *th;
6823 + const struct ipv6hdr *hdr;
6824 +- struct sock *sk;
6825 ++ struct sock *sk, *meta_sk = NULL;
6826 + int ret;
6827 + struct net *net = dev_net(skb->dev);
6828 +
6829 +@@ -1503,18 +1469,43 @@ static int tcp_v6_rcv(struct sk_buff *skb)
6830 + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
6831 + skb->len - th->doff*4);
6832 + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
6833 ++#ifdef CONFIG_MPTCP
6834 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
6835 ++ TCP_SKB_CB(skb)->dss_off = 0;
6836 ++#endif
6837 + TCP_SKB_CB(skb)->when = 0;
6838 + TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
6839 + TCP_SKB_CB(skb)->sacked = 0;
6840 +
6841 + sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
6842 +- if (!sk)
6843 +- goto no_tcp_socket;
6844 +
6845 + process:
6846 +- if (sk->sk_state == TCP_TIME_WAIT)
6847 ++ if (sk && sk->sk_state == TCP_TIME_WAIT)
6848 + goto do_time_wait;
6849 +
6850 ++#ifdef CONFIG_MPTCP
6851 ++ if (!sk && th->syn && !th->ack) {
6852 ++ int ret = mptcp_lookup_join(skb, NULL);
6853 ++
6854 ++ if (ret < 0) {
6855 ++ tcp_v6_send_reset(NULL, skb);
6856 ++ goto discard_it;
6857 ++ } else if (ret > 0) {
6858 ++ return 0;
6859 ++ }
6860 ++ }
6861 ++
6862 ++ /* Is there a pending request sock for this segment ? */
6863 ++ if ((!sk || sk->sk_state == TCP_LISTEN) && mptcp_check_req(skb, net)) {
6864 ++ if (sk)
6865 ++ sock_put(sk);
6866 ++ return 0;
6867 ++ }
6868 ++#endif
6869 ++
6870 ++ if (!sk)
6871 ++ goto no_tcp_socket;
6872 ++
6873 + if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
6874 + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
6875 + goto discard_and_relse;
6876 +@@ -1529,11 +1520,21 @@ process:
6877 + sk_mark_napi_id(sk, skb);
6878 + skb->dev = NULL;
6879 +
6880 +- bh_lock_sock_nested(sk);
6881 ++ if (mptcp(tcp_sk(sk))) {
6882 ++ meta_sk = mptcp_meta_sk(sk);
6883 ++
6884 ++ bh_lock_sock_nested(meta_sk);
6885 ++ if (sock_owned_by_user(meta_sk))
6886 ++ skb->sk = sk;
6887 ++ } else {
6888 ++ meta_sk = sk;
6889 ++ bh_lock_sock_nested(sk);
6890 ++ }
6891 ++
6892 + ret = 0;
6893 +- if (!sock_owned_by_user(sk)) {
6894 ++ if (!sock_owned_by_user(meta_sk)) {
6895 + #ifdef CONFIG_NET_DMA
6896 +- struct tcp_sock *tp = tcp_sk(sk);
6897 ++ struct tcp_sock *tp = tcp_sk(meta_sk);
6898 + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
6899 + tp->ucopy.dma_chan = net_dma_find_channel();
6900 + if (tp->ucopy.dma_chan)
6901 +@@ -1541,16 +1542,17 @@ process:
6902 + else
6903 + #endif
6904 + {
6905 +- if (!tcp_prequeue(sk, skb))
6906 ++ if (!tcp_prequeue(meta_sk, skb))
6907 + ret = tcp_v6_do_rcv(sk, skb);
6908 + }
6909 +- } else if (unlikely(sk_add_backlog(sk, skb,
6910 +- sk->sk_rcvbuf + sk->sk_sndbuf))) {
6911 +- bh_unlock_sock(sk);
6912 ++ } else if (unlikely(sk_add_backlog(meta_sk, skb,
6913 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
6914 ++ bh_unlock_sock(meta_sk);
6915 + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
6916 + goto discard_and_relse;
6917 + }
6918 +- bh_unlock_sock(sk);
6919 ++
6920 ++ bh_unlock_sock(meta_sk);
6921 +
6922 + sock_put(sk);
6923 + return ret ? -1 : 0;
6924 +@@ -1607,6 +1609,18 @@ do_time_wait:
6925 + sk = sk2;
6926 + goto process;
6927 + }
6928 ++#ifdef CONFIG_MPTCP
6929 ++ if (th->syn && !th->ack) {
6930 ++ int ret = mptcp_lookup_join(skb, inet_twsk(sk));
6931 ++
6932 ++ if (ret < 0) {
6933 ++ tcp_v6_send_reset(NULL, skb);
6934 ++ goto discard_it;
6935 ++ } else if (ret > 0) {
6936 ++ return 0;
6937 ++ }
6938 ++ }
6939 ++#endif
6940 + /* Fall through to ACK */
6941 + }
6942 + case TCP_TW_ACK:
6943 +@@ -1657,7 +1671,7 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
6944 + }
6945 + }
6946 +
6947 +-static struct timewait_sock_ops tcp6_timewait_sock_ops = {
6948 ++struct timewait_sock_ops tcp6_timewait_sock_ops = {
6949 + .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
6950 + .twsk_unique = tcp_twsk_unique,
6951 + .twsk_destructor = tcp_twsk_destructor,
6952 +@@ -1730,7 +1744,12 @@ static int tcp_v6_init_sock(struct sock *sk)
6953 +
6954 + tcp_init_sock(sk);
6955 +
6956 +- icsk->icsk_af_ops = &ipv6_specific;
6957 ++#ifdef CONFIG_MPTCP
6958 ++ if (is_mptcp_enabled(sk))
6959 ++ icsk->icsk_af_ops = &mptcp_v6_specific;
6960 ++ else
6961 ++#endif
6962 ++ icsk->icsk_af_ops = &ipv6_specific;
6963 +
6964 + #ifdef CONFIG_TCP_MD5SIG
6965 + tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
6966 +@@ -1739,7 +1758,7 @@ static int tcp_v6_init_sock(struct sock *sk)
6967 + return 0;
6968 + }
6969 +
6970 +-static void tcp_v6_destroy_sock(struct sock *sk)
6971 ++void tcp_v6_destroy_sock(struct sock *sk)
6972 + {
6973 + tcp_v4_destroy_sock(sk);
6974 + inet6_destroy_sock(sk);
6975 +@@ -1924,12 +1943,28 @@ void tcp6_proc_exit(struct net *net)
6976 + static void tcp_v6_clear_sk(struct sock *sk, int size)
6977 + {
6978 + struct inet_sock *inet = inet_sk(sk);
6979 ++#ifdef CONFIG_MPTCP
6980 ++ struct tcp_sock *tp = tcp_sk(sk);
6981 ++ /* size_tk_table goes from the end of tk_table to the end of sk */
6982 ++ int size_tk_table = size - offsetof(struct tcp_sock, tk_table) -
6983 ++ sizeof(tp->tk_table);
6984 ++#endif
6985 +
6986 + /* we do not want to clear pinet6 field, because of RCU lookups */
6987 + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
6988 +
6989 + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
6990 ++
6991 ++#ifdef CONFIG_MPTCP
6992 ++ /* We zero out only from pinet6 to tk_table */
6993 ++ size -= size_tk_table + sizeof(tp->tk_table);
6994 ++#endif
6995 + memset(&inet->pinet6 + 1, 0, size);
6996 ++
6997 ++#ifdef CONFIG_MPTCP
6998 ++ memset((char *)&tp->tk_table + sizeof(tp->tk_table), 0, size_tk_table);
6999 ++#endif
7000 ++
7001 + }
7002 +
7003 + struct proto tcpv6_prot = {
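
tcp_v6_send_response() above now reserves MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK extra option bytes and, when answering for an MPTCP time-wait socket, appends a DSS sub-option carrying a 32-bit data-level ACK. The stand-alone sketch below reproduces that bit packing in user space; the numeric values of TCPOPT_MPTCP and the two length constants are assumptions (MPTCP option kind 30, 4-byte DSS base plus 4-byte data ACK), since their definitions sit in headers outside this hunk:

#include <stdio.h>
#include <stdint.h>

/* Assumed values; the patch defines these elsewhere, not in this hunk. */
#define TCPOPT_MPTCP       30
#define MPTCP_SUB_LEN_DSS   4
#define MPTCP_SUB_LEN_ACK   4

int main(void)
{
    uint32_t data_ack = 0x11223344u;   /* example data-level ACK value */

    /* kind | length | subtype (DSS = 2) << 4 | flags (0x01: data ACK present),
     * the same expression used in tcp_v6_send_response() above */
    uint32_t hdr = ((uint32_t)TCPOPT_MPTCP << 24) |
                   ((uint32_t)(MPTCP_SUB_LEN_DSS + MPTCP_SUB_LEN_ACK) << 16) |
                   (0x20u << 8) |
                   0x01u;

    /* prints "1e082001 11223344": kind 0x1e (30), length 8, then subtype/flags */
    printf("%08x %08x\n", (unsigned)hdr, (unsigned)data_ack);
    return 0;
}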
7004 +diff --git a/net/mptcp/Kconfig b/net/mptcp/Kconfig
7005 +new file mode 100644
7006 +index 000000000000..cdfc03adabf8
7007 +--- /dev/null
7008 ++++ b/net/mptcp/Kconfig
7009 +@@ -0,0 +1,115 @@
7010 ++#
7011 ++# MPTCP configuration
7012 ++#
7013 ++config MPTCP
7014 ++ bool "MPTCP protocol"
7015 ++ depends on (IPV6=y || IPV6=n)
7016 ++ ---help---
7017 ++ This replaces the normal TCP stack with a Multipath TCP stack,
7018 ++ able to use several paths at once.
7019 ++
7020 ++menuconfig MPTCP_PM_ADVANCED
7021 ++ bool "MPTCP: advanced path-manager control"
7022 ++ depends on MPTCP=y
7023 ++ ---help---
7024 ++ Support for selection of different path-managers. You should choose 'Y' here,
7025 ++ because otherwise you will not actively create new MPTCP-subflows.
7026 ++
7027 ++if MPTCP_PM_ADVANCED
7028 ++
7029 ++config MPTCP_FULLMESH
7030 ++ tristate "MPTCP Full-Mesh Path-Manager"
7031 ++ depends on MPTCP=y
7032 ++ ---help---
7033 ++ This path-management module will create a full-mesh among all IP-addresses.
7034 ++
7035 ++config MPTCP_NDIFFPORTS
7036 ++ tristate "MPTCP ndiff-ports"
7037 ++ depends on MPTCP=y
7038 ++ ---help---
7039 ++ This path-management module will create multiple subflows between the same
7040 ++ pair of IP-addresses, modifying the source-port. You can set the number
7041 ++ of subflows via the mptcp_ndiffports-sysctl.
7042 ++
7043 ++config MPTCP_BINDER
7044 ++ tristate "MPTCP Binder"
7045 ++ depends on (MPTCP=y)
7046 ++ ---help---
7047 ++ This path-management module works like ndiffports, and adds the sysctl
7048 ++ option to set the gateway (and/or path to) per each additional subflow
7049 ++ via Loose Source Routing (IPv4 only).
7050 ++
7051 ++choice
7052 ++ prompt "Default MPTCP Path-Manager"
7053 ++ default DEFAULT
7054 ++ help
7055 ++ Select the Path-Manager of your choice
7056 ++
7057 ++ config DEFAULT_FULLMESH
7058 ++ bool "Full mesh" if MPTCP_FULLMESH=y
7059 ++
7060 ++ config DEFAULT_NDIFFPORTS
7061 ++ bool "ndiff-ports" if MPTCP_NDIFFPORTS=y
7062 ++
7063 ++ config DEFAULT_BINDER
7064 ++ bool "binder" if MPTCP_BINDER=y
7065 ++
7066 ++ config DEFAULT_DUMMY
7067 ++ bool "Default"
7068 ++
7069 ++endchoice
7070 ++
7071 ++endif
7072 ++
7073 ++config DEFAULT_MPTCP_PM
7074 ++ string
7075 ++ default "default" if DEFAULT_DUMMY
7076 ++ default "fullmesh" if DEFAULT_FULLMESH
7077 ++ default "ndiffports" if DEFAULT_NDIFFPORTS
7078 ++ default "binder" if DEFAULT_BINDER
7079 ++ default "default"
7080 ++
7081 ++menuconfig MPTCP_SCHED_ADVANCED
7082 ++ bool "MPTCP: advanced scheduler control"
7083 ++ depends on MPTCP=y
7084 ++ ---help---
7085 ++ Support for selection of different schedulers. You should choose 'Y' here,
7086 ++ if you want to choose a different scheduler than the default one.
7087 ++
7088 ++if MPTCP_SCHED_ADVANCED
7089 ++
7090 ++config MPTCP_ROUNDROBIN
7091 ++ tristate "MPTCP Round-Robin"
7092 ++ depends on (MPTCP=y)
7093 ++ ---help---
7094 ++ This is a very simple round-robin scheduler. Probably has bad performance
7095 ++ but might be interesting for researchers.
7096 ++
7097 ++choice
7098 ++ prompt "Default MPTCP Scheduler"
7099 ++ default DEFAULT
7100 ++ help
7101 ++ Select the Scheduler of your choice
7102 ++
7103 ++ config DEFAULT_SCHEDULER
7104 ++ bool "Default"
7105 ++ ---help---
7106 ++ This is the default scheduler, sending first on the subflow
7107 ++ with the lowest RTT.
7108 ++
7109 ++ config DEFAULT_ROUNDROBIN
7110 ++ bool "Round-Robin" if MPTCP_ROUNDROBIN=y
7111 ++ ---help---
7112 ++	  This is the round-robin scheduler, sending in a round-robin
7113 ++	  fashion.
7114 ++
7115 ++endchoice
7116 ++endif
7117 ++
7118 ++config DEFAULT_MPTCP_SCHED
7119 ++ string
7120 ++ depends on (MPTCP=y)
7121 ++ default "default" if DEFAULT_SCHEDULER
7122 ++ default "roundrobin" if DEFAULT_ROUNDROBIN
7123 ++ default "default"
7124 ++
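
The Kconfig above boils the chosen path-manager and scheduler down to plain strings (CONFIG_DEFAULT_MPTCP_PM, CONFIG_DEFAULT_MPTCP_SCHED), each with a final fallback to "default". The sketch below only illustrates the lookup-by-name-with-fallback pattern such a string usually feeds at runtime; the names in it are invented, and the patch's actual registration and selection code lives in the mptcp core files listed in the Makefile below, which are not all reproduced here:

#include <stdio.h>
#include <string.h>

/* Illustrative registry; the real registration entry point in this patch is
 * mptcp_register_path_manager(), used by mptcp_binder.c further down. */
struct pm_entry {
    const char *name;
};

static const struct pm_entry registry[] = {
    { "default" },
    { "fullmesh" },
    { "binder" },
};

static const struct pm_entry *find_pm(const char *name)
{
    size_t i;

    for (i = 0; i < sizeof(registry) / sizeof(registry[0]); i++)
        if (strcmp(registry[i].name, name) == 0)
            return &registry[i];
    return NULL;
}

int main(void)
{
    const char *configured = "ndiffports";   /* e.g. CONFIG_DEFAULT_MPTCP_PM */
    const struct pm_entry *pm = find_pm(configured);

    if (!pm)                      /* module not built or not yet loaded */
        pm = find_pm("default");  /* mirrors the Kconfig's final fallback */

    printf("using path manager: %s\n", pm->name);
    return 0;
}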
7125 +diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
7126 +new file mode 100644
7127 +index 000000000000..35561a7012e3
7128 +--- /dev/null
7129 ++++ b/net/mptcp/Makefile
7130 +@@ -0,0 +1,20 @@
7131 ++#
7132 ++## Makefile for MultiPath TCP support code.
7133 ++#
7134 ++#
7135 ++
7136 ++obj-$(CONFIG_MPTCP) += mptcp.o
7137 ++
7138 ++mptcp-y := mptcp_ctrl.o mptcp_ipv4.o mptcp_ofo_queue.o mptcp_pm.o \
7139 ++ mptcp_output.o mptcp_input.o mptcp_sched.o
7140 ++
7141 ++obj-$(CONFIG_TCP_CONG_COUPLED) += mptcp_coupled.o
7142 ++obj-$(CONFIG_TCP_CONG_OLIA) += mptcp_olia.o
7143 ++obj-$(CONFIG_TCP_CONG_WVEGAS) += mptcp_wvegas.o
7144 ++obj-$(CONFIG_MPTCP_FULLMESH) += mptcp_fullmesh.o
7145 ++obj-$(CONFIG_MPTCP_NDIFFPORTS) += mptcp_ndiffports.o
7146 ++obj-$(CONFIG_MPTCP_BINDER) += mptcp_binder.o
7147 ++obj-$(CONFIG_MPTCP_ROUNDROBIN) += mptcp_rr.o
7148 ++
7149 ++mptcp-$(subst m,y,$(CONFIG_IPV6)) += mptcp_ipv6.o
7150 ++
7151 +diff --git a/net/mptcp/mptcp_binder.c b/net/mptcp/mptcp_binder.c
7152 +new file mode 100644
7153 +index 000000000000..95d8da560715
7154 +--- /dev/null
7155 ++++ b/net/mptcp/mptcp_binder.c
7156 +@@ -0,0 +1,487 @@
7157 ++#include <linux/module.h>
7158 ++
7159 ++#include <net/mptcp.h>
7160 ++#include <net/mptcp_v4.h>
7161 ++
7162 ++#include <linux/route.h>
7163 ++#include <linux/inet.h>
7164 ++#include <linux/mroute.h>
7165 ++#include <linux/spinlock_types.h>
7166 ++#include <net/inet_ecn.h>
7167 ++#include <net/route.h>
7168 ++#include <net/xfrm.h>
7169 ++#include <net/compat.h>
7170 ++#include <linux/slab.h>
7171 ++
7172 ++#define MPTCP_GW_MAX_LISTS 10
7173 ++#define MPTCP_GW_LIST_MAX_LEN 6
7174 ++#define MPTCP_GW_SYSCTL_MAX_LEN (15 * MPTCP_GW_LIST_MAX_LEN * \
7175 ++ MPTCP_GW_MAX_LISTS)
7176 ++
7177 ++struct mptcp_gw_list {
7178 ++ struct in_addr list[MPTCP_GW_MAX_LISTS][MPTCP_GW_LIST_MAX_LEN];
7179 ++ u8 len[MPTCP_GW_MAX_LISTS];
7180 ++};
7181 ++
7182 ++struct binder_priv {
7183 ++ /* Worker struct for subflow establishment */
7184 ++ struct work_struct subflow_work;
7185 ++
7186 ++ struct mptcp_cb *mpcb;
7187 ++
7188 ++ /* Prevent multiple sub-sockets concurrently iterating over sockets */
7189 ++ spinlock_t *flow_lock;
7190 ++};
7191 ++
7192 ++static struct mptcp_gw_list *mptcp_gws;
7193 ++static rwlock_t mptcp_gws_lock;
7194 ++
7195 ++static int mptcp_binder_ndiffports __read_mostly = 1;
7196 ++
7197 ++static char sysctl_mptcp_binder_gateways[MPTCP_GW_SYSCTL_MAX_LEN] __read_mostly;
7198 ++
7199 ++static int mptcp_get_avail_list_ipv4(struct sock *sk)
7200 ++{
7201 ++ int i, j, list_taken, opt_ret, opt_len;
7202 ++ unsigned char *opt_ptr, *opt_end_ptr, opt[MAX_IPOPTLEN];
7203 ++
7204 ++ for (i = 0; i < MPTCP_GW_MAX_LISTS; ++i) {
7205 ++ if (mptcp_gws->len[i] == 0)
7206 ++ goto error;
7207 ++
7208 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List %i\n", i);
7209 ++ list_taken = 0;
7210 ++
7211 ++ /* Loop through all sub-sockets in this connection */
7212 ++ mptcp_for_each_sk(tcp_sk(sk)->mpcb, sk) {
7213 ++ mptcp_debug("mptcp_get_avail_list_ipv4: Next sock\n");
7214 ++
7215 ++ /* Reset length and options buffer, then retrieve
7216 ++ * from socket
7217 ++ */
7218 ++ opt_len = MAX_IPOPTLEN;
7219 ++ memset(opt, 0, MAX_IPOPTLEN);
7220 ++ opt_ret = ip_getsockopt(sk, IPPROTO_IP,
7221 ++ IP_OPTIONS, opt, &opt_len);
7222 ++ if (opt_ret < 0) {
7223 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsocket getsockopt() IP_OPTIONS failed, error %d\n",
7224 ++ __func__, opt_ret);
7225 ++ goto error;
7226 ++ }
7227 ++
7228 ++ /* If socket has no options, it has no stake in this list */
7229 ++ if (opt_len <= 0)
7230 ++ continue;
7231 ++
7232 ++ /* Iterate options buffer */
7233 ++ for (opt_ptr = &opt[0]; opt_ptr < &opt[opt_len]; opt_ptr++) {
7234 ++ if (*opt_ptr == IPOPT_LSRR) {
7235 ++ mptcp_debug("mptcp_get_avail_list_ipv4: LSRR options found\n");
7236 ++ goto sock_lsrr;
7237 ++ }
7238 ++ }
7239 ++ continue;
7240 ++
7241 ++sock_lsrr:
7242 ++ /* Pointer to the 2nd to last address */
7243 ++ opt_end_ptr = opt_ptr+(*(opt_ptr+1))-4;
7244 ++
7245 ++ /* Addresses start 3 bytes after type offset */
7246 ++ opt_ptr += 3;
7247 ++ j = 0;
7248 ++
7249 ++ /* Different length lists cannot be the same */
7250 ++ if ((opt_end_ptr-opt_ptr)/4 != mptcp_gws->len[i])
7251 ++ continue;
7252 ++
7253 ++ /* Iterate if we are still inside options list
7254 ++ * and sysctl list
7255 ++ */
7256 ++ while (opt_ptr < opt_end_ptr && j < mptcp_gws->len[i]) {
7257 ++ /* If there is a different address, this list must
7258 ++ * not be set on this socket
7259 ++ */
7260 ++ if (memcmp(&mptcp_gws->list[i][j], opt_ptr, 4))
7261 ++ break;
7262 ++
7263 ++ /* Jump 4 bytes to next address */
7264 ++ opt_ptr += 4;
7265 ++ j++;
7266 ++ }
7267 ++
7268 ++ /* Reached the end without a differing address, lists
7269 ++ * are therefore identical.
7270 ++ */
7271 ++ if (j == mptcp_gws->len[i]) {
7272 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List already used\n");
7273 ++ list_taken = 1;
7274 ++ break;
7275 ++ }
7276 ++ }
7277 ++
7278 ++ /* Free list found if not taken by a socket */
7279 ++ if (!list_taken) {
7280 ++ mptcp_debug("mptcp_get_avail_list_ipv4: List free\n");
7281 ++ break;
7282 ++ }
7283 ++ }
7284 ++
7285 ++ if (i >= MPTCP_GW_MAX_LISTS)
7286 ++ goto error;
7287 ++
7288 ++ return i;
7289 ++error:
7290 ++ return -1;
7291 ++}
7292 ++
7293 ++/* The list of addresses is parsed each time a new connection is opened,
7294 ++ * to make sure it's up to date. In case of error, all the lists are
7295 ++ * marked as unavailable and the subflow's fingerprint is set to 0.
7296 ++ */
7297 ++static void mptcp_v4_add_lsrr(struct sock *sk, struct in_addr addr)
7298 ++{
7299 ++ int i, j, ret;
7300 ++ unsigned char opt[MAX_IPOPTLEN] = {0};
7301 ++ struct tcp_sock *tp = tcp_sk(sk);
7302 ++ struct binder_priv *fmp = (struct binder_priv *)&tp->mpcb->mptcp_pm[0];
7303 ++
7304 ++ /* Read lock: multiple sockets can read LSRR addresses at the same
7305 ++ * time, but writes are done in mutual exclusion.
7306 ++ * Spin lock: must search for free list for one socket at a time, or
7307 ++ * multiple sockets could take the same list.
7308 ++ */
7309 ++ read_lock(&mptcp_gws_lock);
7310 ++ spin_lock(fmp->flow_lock);
7311 ++
7312 ++ i = mptcp_get_avail_list_ipv4(sk);
7313 ++
7314 ++ /* Execution enters here only if a free path is found.
7315 ++ */
7316 ++ if (i >= 0) {
7317 ++ opt[0] = IPOPT_NOP;
7318 ++ opt[1] = IPOPT_LSRR;
7319 ++ opt[2] = sizeof(mptcp_gws->list[i][0].s_addr) *
7320 ++ (mptcp_gws->len[i] + 1) + 3;
7321 ++ opt[3] = IPOPT_MINOFF;
7322 ++ for (j = 0; j < mptcp_gws->len[i]; ++j)
7323 ++ memcpy(opt + 4 +
7324 ++ (j * sizeof(mptcp_gws->list[i][0].s_addr)),
7325 ++ &mptcp_gws->list[i][j].s_addr,
7326 ++ sizeof(mptcp_gws->list[i][0].s_addr));
7327 ++ /* Final destination must be part of IP_OPTIONS parameter. */
7328 ++ memcpy(opt + 4 + (j * sizeof(addr.s_addr)), &addr.s_addr,
7329 ++ sizeof(addr.s_addr));
7330 ++
7331 ++ /* setsockopt must be inside the lock, otherwise another
7332 ++ * subflow could fail to see that we have taken a list.
7333 ++ */
7334 ++ ret = ip_setsockopt(sk, IPPROTO_IP, IP_OPTIONS, opt,
7335 ++ 4 + sizeof(mptcp_gws->list[i][0].s_addr)
7336 ++ * (mptcp_gws->len[i] + 1));
7337 ++
7338 ++ if (ret < 0) {
7339 ++ mptcp_debug(KERN_ERR "%s: MPTCP subsock setsockopt() IP_OPTIONS failed, error %d\n",
7340 ++ __func__, ret);
7341 ++ }
7342 ++ }
7343 ++
7344 ++ spin_unlock(fmp->flow_lock);
7345 ++ read_unlock(&mptcp_gws_lock);
7346 ++
7347 ++ return;
7348 ++}
7349 ++
7350 ++/* Parses gateways string for a list of paths to different
7351 ++ * gateways, and stores them for use with the Loose Source Routing (LSRR)
7352 ++ * socket option. Each list must have "," separated addresses, and the lists
7353 ++ * themselves must be separated by "-". Returns -1 in case one or more of the
7354 ++ * addresses is not a valid ipv4/6 address.
7355 ++ */
7356 ++static int mptcp_parse_gateway_ipv4(char *gateways)
7357 ++{
7358 ++ int i, j, k, ret;
7359 ++ char *tmp_string = NULL;
7360 ++ struct in_addr tmp_addr;
7361 ++
7362 ++ tmp_string = kzalloc(16, GFP_KERNEL);
7363 ++ if (tmp_string == NULL)
7364 ++ return -ENOMEM;
7365 ++
7366 ++ write_lock(&mptcp_gws_lock);
7367 ++
7368 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7369 ++
7370 ++ /* A TMP string is used since inet_pton needs a null terminated string
7371 ++ * but we do not want to modify the sysctl for obvious reasons.
7372 ++ * i will iterate over the SYSCTL string, j will iterate over the
7373 ++ * temporary string where each IP is copied into, k will iterate over
7374 ++ * the IPs in each list.
7375 ++ */
7376 ++ for (i = j = k = 0;
7377 ++ i < MPTCP_GW_SYSCTL_MAX_LEN && k < MPTCP_GW_MAX_LISTS;
7378 ++ ++i) {
7379 ++ if (gateways[i] == '-' || gateways[i] == ',' || gateways[i] == '\0') {
7380 ++ /* If the temp IP is empty and the current list is
7381 ++ * empty, we are done.
7382 ++ */
7383 ++ if (j == 0 && mptcp_gws->len[k] == 0)
7384 ++ break;
7385 ++
7386 ++ /* Terminate the temp IP string, then if it is
7387 ++ * non-empty parse the IP and copy it.
7388 ++ */
7389 ++ tmp_string[j] = '\0';
7390 ++ if (j > 0) {
7391 ++ mptcp_debug("mptcp_parse_gateway_list tmp: %s i: %d\n", tmp_string, i);
7392 ++
7393 ++ ret = in4_pton(tmp_string, strlen(tmp_string),
7394 ++ (u8 *)&tmp_addr.s_addr, '\0',
7395 ++ NULL);
7396 ++
7397 ++ if (ret) {
7398 ++ mptcp_debug("mptcp_parse_gateway_list ret: %d s_addr: %pI4\n",
7399 ++ ret,
7400 ++ &tmp_addr.s_addr);
7401 ++ memcpy(&mptcp_gws->list[k][mptcp_gws->len[k]].s_addr,
7402 ++ &tmp_addr.s_addr,
7403 ++ sizeof(tmp_addr.s_addr));
7404 ++ mptcp_gws->len[k]++;
7405 ++ j = 0;
7406 ++ tmp_string[j] = '\0';
7407 ++ /* Since we can't impose a limit to
7408 ++ * what the user can input, make sure
7409 ++ * there are not too many IPs in the
7410 ++ * SYSCTL string.
7411 ++ */
7412 ++ if (mptcp_gws->len[k] > MPTCP_GW_LIST_MAX_LEN) {
7413 ++ mptcp_debug("mptcp_parse_gateway_list too many members in list %i: max %i\n",
7414 ++ k,
7415 ++ MPTCP_GW_LIST_MAX_LEN);
7416 ++ goto error;
7417 ++ }
7418 ++ } else {
7419 ++ goto error;
7420 ++ }
7421 ++ }
7422 ++
7423 ++ if (gateways[i] == '-' || gateways[i] == '\0')
7424 ++ ++k;
7425 ++ } else {
7426 ++ tmp_string[j] = gateways[i];
7427 ++ ++j;
7428 ++ }
7429 ++ }
7430 ++
7431 ++ /* Number of flows is number of gateway lists plus master flow */
7432 ++ mptcp_binder_ndiffports = k+1;
7433 ++
7434 ++ write_unlock(&mptcp_gws_lock);
7435 ++ kfree(tmp_string);
7436 ++
7437 ++ return 0;
7438 ++
7439 ++error:
7440 ++ memset(mptcp_gws, 0, sizeof(struct mptcp_gw_list));
7441 ++ memset(gateways, 0, sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN);
7442 ++ write_unlock(&mptcp_gws_lock);
7443 ++ kfree(tmp_string);
7444 ++ return -1;
7445 ++}
7446 ++
7447 ++/**
7448 ++ * Create all new subflows by calling mptcp_initX_subsockets
7449 ++ *
7450 ++ * This function uses a goto next_subflow, to allow releasing the lock between
7451 ++ * new subflows and giving other processes a chance to do some work on the
7452 ++ * socket and potentially finishing the communication.
7453 ++ **/
7454 ++static void create_subflow_worker(struct work_struct *work)
7455 ++{
7456 ++ const struct binder_priv *pm_priv = container_of(work,
7457 ++ struct binder_priv,
7458 ++ subflow_work);
7459 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
7460 ++ struct sock *meta_sk = mpcb->meta_sk;
7461 ++ int iter = 0;
7462 ++
7463 ++next_subflow:
7464 ++ if (iter) {
7465 ++ release_sock(meta_sk);
7466 ++ mutex_unlock(&mpcb->mpcb_mutex);
7467 ++
7468 ++ cond_resched();
7469 ++ }
7470 ++ mutex_lock(&mpcb->mpcb_mutex);
7471 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
7472 ++
7473 ++ iter++;
7474 ++
7475 ++ if (sock_flag(meta_sk, SOCK_DEAD))
7476 ++ goto exit;
7477 ++
7478 ++ if (mpcb->master_sk &&
7479 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
7480 ++ goto exit;
7481 ++
7482 ++ if (mptcp_binder_ndiffports > iter &&
7483 ++ mptcp_binder_ndiffports > mpcb->cnt_subflows) {
7484 ++ struct mptcp_loc4 loc;
7485 ++ struct mptcp_rem4 rem;
7486 ++
7487 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
7488 ++ loc.loc4_id = 0;
7489 ++ loc.low_prio = 0;
7490 ++
7491 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
7492 ++ rem.port = inet_sk(meta_sk)->inet_dport;
7493 ++ rem.rem4_id = 0; /* Default 0 */
7494 ++
7495 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
7496 ++
7497 ++ goto next_subflow;
7498 ++ }
7499 ++
7500 ++exit:
7501 ++ release_sock(meta_sk);
7502 ++ mutex_unlock(&mpcb->mpcb_mutex);
7503 ++ sock_put(meta_sk);
7504 ++}
7505 ++
7506 ++static void binder_new_session(const struct sock *meta_sk)
7507 ++{
7508 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7509 ++ struct binder_priv *fmp = (struct binder_priv *)&mpcb->mptcp_pm[0];
7510 ++ static DEFINE_SPINLOCK(flow_lock);
7511 ++
7512 ++#if IS_ENABLED(CONFIG_IPV6)
7513 ++ if (meta_sk->sk_family == AF_INET6 &&
7514 ++ !mptcp_v6_is_v4_mapped(meta_sk)) {
7515 ++ mptcp_fallback_default(mpcb);
7516 ++ return;
7517 ++ }
7518 ++#endif
7519 ++
7520 ++ /* Initialize workqueue-struct */
7521 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
7522 ++ fmp->mpcb = mpcb;
7523 ++
7524 ++ fmp->flow_lock = &flow_lock;
7525 ++}
7526 ++
7527 ++static void binder_create_subflows(struct sock *meta_sk)
7528 ++{
7529 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
7530 ++ struct binder_priv *pm_priv = (struct binder_priv *)&mpcb->mptcp_pm[0];
7531 ++
7532 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
7533 ++ mpcb->send_infinite_mapping ||
7534 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
7535 ++ return;
7536 ++
7537 ++ if (!work_pending(&pm_priv->subflow_work)) {
7538 ++ sock_hold(meta_sk);
7539 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
7540 ++ }
7541 ++}
7542 ++
7543 ++static int binder_get_local_id(sa_family_t family, union inet_addr *addr,
7544 ++ struct net *net, bool *low_prio)
7545 ++{
7546 ++ return 0;
7547 ++}
7548 ++
7549 ++/* Callback functions, executed when sysctl net.mptcp.mptcp_binder_gateways is updated.
7550 ++ * Inspired from proc_tcp_congestion_control().
7551 ++ */
7552 ++static int proc_mptcp_gateways(ctl_table *ctl, int write,
7553 ++ void __user *buffer, size_t *lenp,
7554 ++ loff_t *ppos)
7555 ++{
7556 ++ int ret;
7557 ++ ctl_table tbl = {
7558 ++ .maxlen = MPTCP_GW_SYSCTL_MAX_LEN,
7559 ++ };
7560 ++
7561 ++ if (write) {
7562 ++ tbl.data = kzalloc(MPTCP_GW_SYSCTL_MAX_LEN, GFP_KERNEL);
7563 ++ if (tbl.data == NULL)
7564 ++ return -1;
7565 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
7566 ++ if (ret == 0) {
7567 ++ ret = mptcp_parse_gateway_ipv4(tbl.data);
7568 ++ memcpy(ctl->data, tbl.data, MPTCP_GW_SYSCTL_MAX_LEN);
7569 ++ }
7570 ++ kfree(tbl.data);
7571 ++ } else {
7572 ++ ret = proc_dostring(ctl, write, buffer, lenp, ppos);
7573 ++ }
7574 ++
7575 ++
7576 ++ return ret;
7577 ++}
7578 ++
7579 ++static struct mptcp_pm_ops binder __read_mostly = {
7580 ++ .new_session = binder_new_session,
7581 ++ .fully_established = binder_create_subflows,
7582 ++ .get_local_id = binder_get_local_id,
7583 ++ .init_subsocket_v4 = mptcp_v4_add_lsrr,
7584 ++ .name = "binder",
7585 ++ .owner = THIS_MODULE,
7586 ++};
7587 ++
7588 ++static struct ctl_table binder_table[] = {
7589 ++ {
7590 ++ .procname = "mptcp_binder_gateways",
7591 ++ .data = &sysctl_mptcp_binder_gateways,
7592 ++ .maxlen = sizeof(char) * MPTCP_GW_SYSCTL_MAX_LEN,
7593 ++ .mode = 0644,
7594 ++ .proc_handler = &proc_mptcp_gateways
7595 ++ },
7596 ++ { }
7597 ++};
7598 ++
7599 ++struct ctl_table_header *mptcp_sysctl_binder;
7600 ++
7601 ++/* General initialization of MPTCP_PM */
7602 ++static int __init binder_register(void)
7603 ++{
7604 ++ mptcp_gws = kzalloc(sizeof(*mptcp_gws), GFP_KERNEL);
7605 ++ if (!mptcp_gws)
7606 ++ return -ENOMEM;
7607 ++
7608 ++ rwlock_init(&mptcp_gws_lock);
7609 ++
7610 ++ BUILD_BUG_ON(sizeof(struct binder_priv) > MPTCP_PM_SIZE);
7611 ++
7612 ++ mptcp_sysctl_binder = register_net_sysctl(&init_net, "net/mptcp",
7613 ++ binder_table);
7614 ++ if (!mptcp_sysctl_binder)
7615 ++ goto sysctl_fail;
7616 ++
7617 ++ if (mptcp_register_path_manager(&binder))
7618 ++ goto pm_failed;
7619 ++
7620 ++ return 0;
7621 ++
7622 ++pm_failed:
7623 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7624 ++sysctl_fail:
7625 ++ kfree(mptcp_gws);
7626 ++
7627 ++ return -1;
7628 ++}
7629 ++
7630 ++static void binder_unregister(void)
7631 ++{
7632 ++ mptcp_unregister_path_manager(&binder);
7633 ++ unregister_net_sysctl_table(mptcp_sysctl_binder);
7634 ++ kfree(mptcp_gws);
7635 ++}
7636 ++
7637 ++module_init(binder_register);
7638 ++module_exit(binder_unregister);
7639 ++
7640 ++MODULE_AUTHOR("Luca Boccassi, Duncan Eastoe, Christoph Paasch (ndiffports)");
7641 ++MODULE_LICENSE("GPL");
7642 ++MODULE_DESCRIPTION("BINDER MPTCP");
7643 ++MODULE_VERSION("0.1");
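
mptcp_v4_add_lsrr() above builds a Loose Source Routing IP_OPTIONS buffer in-kernel: a NOP pad for alignment, the LSRR type/length/pointer triple, the configured gateway hops, and the final destination as the last hop. The stand-alone user-space sketch below builds the same layout and hands it to setsockopt(IP_OPTIONS); the gateway and destination addresses are placeholders from the TEST-NET ranges, not values taken from the patch:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPOPT_NOP
#define IPOPT_NOP 1
#endif
#ifndef IPOPT_LSRR
#define IPOPT_LSRR 131
#endif
#ifndef IPOPT_MINOFF
#define IPOPT_MINOFF 4
#endif
#ifndef MAX_IPOPTLEN
#define MAX_IPOPTLEN 40
#endif

int main(void)
{
    const char *gateways[] = { "192.0.2.1", "198.51.100.1" }; /* placeholder hops */
    const char *final_dst  = "203.0.113.7";                   /* placeholder peer */
    unsigned char opt[MAX_IPOPTLEN] = { 0 };
    struct in_addr a;
    int i, n = 2, fd;

    opt[0] = IPOPT_NOP;                                       /* pad to 4-byte multiple */
    opt[1] = IPOPT_LSRR;
    opt[2] = (unsigned char)(sizeof(a.s_addr) * (n + 1) + 3); /* type+len+ptr+addrs */
    opt[3] = IPOPT_MINOFF;

    for (i = 0; i < n; i++) {                                 /* gateway hops */
        inet_pton(AF_INET, gateways[i], &a);
        memcpy(opt + 4 + i * sizeof(a.s_addr), &a.s_addr, sizeof(a.s_addr));
    }
    inet_pton(AF_INET, final_dst, &a);                        /* destination is the last hop */
    memcpy(opt + 4 + n * sizeof(a.s_addr), &a.s_addr, sizeof(a.s_addr));

    fd = socket(AF_INET, SOCK_STREAM, 0);
    if (setsockopt(fd, IPPROTO_IP, IP_OPTIONS, opt,
                   4 + sizeof(a.s_addr) * (n + 1)) < 0)
        perror("setsockopt(IP_OPTIONS)");
    close(fd);
    return 0;
}

In the kernel path above the same buffer is applied with ip_setsockopt() while holding the gateway-list spinlock, so two subflows cannot claim the same gateway list concurrently.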
7644 +diff --git a/net/mptcp/mptcp_coupled.c b/net/mptcp/mptcp_coupled.c
7645 +new file mode 100644
7646 +index 000000000000..5d761164eb85
7647 +--- /dev/null
7648 ++++ b/net/mptcp/mptcp_coupled.c
7649 +@@ -0,0 +1,270 @@
7650 ++/*
7651 ++ * MPTCP implementation - Linked Increase congestion control Algorithm (LIA)
7652 ++ *
7653 ++ * Initial Design & Implementation:
7654 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7655 ++ *
7656 ++ * Current Maintainer & Author:
7657 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7658 ++ *
7659 ++ * Additional authors:
7660 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7661 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7662 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7663 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7664 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7665 ++ * Andreas Ripke <ripke@××××××.eu>
7666 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7667 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7668 ++ * John Ronan <jronan@××××.org>
7669 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7670 ++ * Brandon Heller <brandonh@××××××××.edu>
7671 ++ *
7672 ++ *
7673 ++ * This program is free software; you can redistribute it and/or
7674 ++ * modify it under the terms of the GNU General Public License
7675 ++ * as published by the Free Software Foundation; either version
7676 ++ * 2 of the License, or (at your option) any later version.
7677 ++ */
7678 ++#include <net/tcp.h>
7679 ++#include <net/mptcp.h>
7680 ++
7681 ++#include <linux/module.h>
7682 ++
7683 ++/* Scaling is done in the numerator with alpha_scale_num and in the denominator
7684 ++ * with alpha_scale_den.
7685 ++ *
7686 ++ * To downscale, we just need to use alpha_scale.
7687 ++ *
7688 ++ * We have: alpha_scale = alpha_scale_num / (alpha_scale_den ^ 2)
7689 ++ */
7690 ++static int alpha_scale_den = 10;
7691 ++static int alpha_scale_num = 32;
7692 ++static int alpha_scale = 12;
7693 ++
7694 ++struct mptcp_ccc {
7695 ++ u64 alpha;
7696 ++ bool forced_update;
7697 ++};
7698 ++
7699 ++static inline int mptcp_ccc_sk_can_send(const struct sock *sk)
7700 ++{
7701 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
7702 ++}
7703 ++
7704 ++static inline u64 mptcp_get_alpha(const struct sock *meta_sk)
7705 ++{
7706 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha;
7707 ++}
7708 ++
7709 ++static inline void mptcp_set_alpha(const struct sock *meta_sk, u64 alpha)
7710 ++{
7711 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->alpha = alpha;
7712 ++}
7713 ++
7714 ++static inline u64 mptcp_ccc_scale(u32 val, int scale)
7715 ++{
7716 ++ return (u64) val << scale;
7717 ++}
7718 ++
7719 ++static inline bool mptcp_get_forced(const struct sock *meta_sk)
7720 ++{
7721 ++ return ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update;
7722 ++}
7723 ++
7724 ++static inline void mptcp_set_forced(const struct sock *meta_sk, bool force)
7725 ++{
7726 ++ ((struct mptcp_ccc *)inet_csk_ca(meta_sk))->forced_update = force;
7727 ++}
7728 ++
7729 ++static void mptcp_ccc_recalc_alpha(const struct sock *sk)
7730 ++{
7731 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
7732 ++ const struct sock *sub_sk;
7733 ++ int best_cwnd = 0, best_rtt = 0, can_send = 0;
7734 ++ u64 max_numerator = 0, sum_denominator = 0, alpha = 1;
7735 ++
7736 ++ if (!mpcb)
7737 ++ return;
7738 ++
7739 ++ /* Only one subflow left - fall back to normal reno-behavior
7740 ++ * (set alpha to 1)
7741 ++ */
7742 ++ if (mpcb->cnt_established <= 1)
7743 ++ goto exit;
7744 ++
7745 ++ /* Do regular alpha-calculation for multiple subflows */
7746 ++
7747 ++ /* Find the max numerator of the alpha-calculation */
7748 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7749 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7750 ++ u64 tmp;
7751 ++
7752 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7753 ++ continue;
7754 ++
7755 ++ can_send++;
7756 ++
7757 ++ /* We need to look for the path, that provides the max-value.
7758 ++ * Integer-overflow is not possible here, because
7759 ++ * tmp will be in u64.
7760 ++ */
7761 ++ tmp = div64_u64(mptcp_ccc_scale(sub_tp->snd_cwnd,
7762 ++ alpha_scale_num), (u64)sub_tp->srtt_us * sub_tp->srtt_us);
7763 ++
7764 ++ if (tmp >= max_numerator) {
7765 ++ max_numerator = tmp;
7766 ++ best_cwnd = sub_tp->snd_cwnd;
7767 ++ best_rtt = sub_tp->srtt_us;
7768 ++ }
7769 ++ }
7770 ++
7771 ++ /* No subflow is able to send - we don't care anymore */
7772 ++ if (unlikely(!can_send))
7773 ++ goto exit;
7774 ++
7775 ++ /* Calculate the denominator */
7776 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7777 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7778 ++
7779 ++ if (!mptcp_ccc_sk_can_send(sub_sk))
7780 ++ continue;
7781 ++
7782 ++ sum_denominator += div_u64(
7783 ++ mptcp_ccc_scale(sub_tp->snd_cwnd,
7784 ++ alpha_scale_den) * best_rtt,
7785 ++ sub_tp->srtt_us);
7786 ++ }
7787 ++ sum_denominator *= sum_denominator;
7788 ++ if (unlikely(!sum_denominator)) {
7789 ++ pr_err("%s: sum_denominator == 0, cnt_established:%d\n",
7790 ++ __func__, mpcb->cnt_established);
7791 ++ mptcp_for_each_sk(mpcb, sub_sk) {
7792 ++ struct tcp_sock *sub_tp = tcp_sk(sub_sk);
7793 ++ pr_err("%s: pi:%d, state:%d, rtt:%u, cwnd: %u\n",
7794 ++ __func__, sub_tp->mptcp->path_index,
7795 ++ sub_sk->sk_state, sub_tp->srtt_us,
7796 ++ sub_tp->snd_cwnd);
7797 ++ }
7798 ++ }
7799 ++
7800 ++ alpha = div64_u64(mptcp_ccc_scale(best_cwnd, alpha_scale_num), sum_denominator);
7801 ++
7802 ++ if (unlikely(!alpha))
7803 ++ alpha = 1;
7804 ++
7805 ++exit:
7806 ++ mptcp_set_alpha(mptcp_meta_sk(sk), alpha);
7807 ++}
7808 ++
7809 ++static void mptcp_ccc_init(struct sock *sk)
7810 ++{
7811 ++ if (mptcp(tcp_sk(sk))) {
7812 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7813 ++ mptcp_set_alpha(mptcp_meta_sk(sk), 1);
7814 ++ }
7815 ++ /* If we do not mptcp, behave like reno: return */
7816 ++}
7817 ++
7818 ++static void mptcp_ccc_cwnd_event(struct sock *sk, enum tcp_ca_event event)
7819 ++{
7820 ++ if (event == CA_EVENT_LOSS)
7821 ++ mptcp_ccc_recalc_alpha(sk);
7822 ++}
7823 ++
7824 ++static void mptcp_ccc_set_state(struct sock *sk, u8 ca_state)
7825 ++{
7826 ++ if (!mptcp(tcp_sk(sk)))
7827 ++ return;
7828 ++
7829 ++ mptcp_set_forced(mptcp_meta_sk(sk), 1);
7830 ++}
7831 ++
7832 ++static void mptcp_ccc_cong_avoid(struct sock *sk, u32 ack, u32 acked)
7833 ++{
7834 ++ struct tcp_sock *tp = tcp_sk(sk);
7835 ++ const struct mptcp_cb *mpcb = tp->mpcb;
7836 ++ int snd_cwnd;
7837 ++
7838 ++ if (!mptcp(tp)) {
7839 ++ tcp_reno_cong_avoid(sk, ack, acked);
7840 ++ return;
7841 ++ }
7842 ++
7843 ++ if (!tcp_is_cwnd_limited(sk))
7844 ++ return;
7845 ++
7846 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
7847 ++ /* In "safe" area, increase. */
7848 ++ tcp_slow_start(tp, acked);
7849 ++ mptcp_ccc_recalc_alpha(sk);
7850 ++ return;
7851 ++ }
7852 ++
7853 ++ if (mptcp_get_forced(mptcp_meta_sk(sk))) {
7854 ++ mptcp_ccc_recalc_alpha(sk);
7855 ++ mptcp_set_forced(mptcp_meta_sk(sk), 0);
7856 ++ }
7857 ++
7858 ++ if (mpcb->cnt_established > 1) {
7859 ++ u64 alpha = mptcp_get_alpha(mptcp_meta_sk(sk));
7860 ++
7861 ++ /* This may happen, if at the initialization, the mpcb
7862 ++ * was not yet attached to the sock, and thus
7863 ++ * initializing alpha failed.
7864 ++ */
7865 ++ if (unlikely(!alpha))
7866 ++ alpha = 1;
7867 ++
7868 ++ snd_cwnd = (int) div_u64 ((u64) mptcp_ccc_scale(1, alpha_scale),
7869 ++ alpha);
7870 ++
7871 ++ /* snd_cwnd_cnt >= max (scale * tot_cwnd / alpha, cwnd)
7872 ++ * Thus, we select here the max value.
7873 ++ */
7874 ++ if (snd_cwnd < tp->snd_cwnd)
7875 ++ snd_cwnd = tp->snd_cwnd;
7876 ++ } else {
7877 ++ snd_cwnd = tp->snd_cwnd;
7878 ++ }
7879 ++
7880 ++ if (tp->snd_cwnd_cnt >= snd_cwnd) {
7881 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
7882 ++ tp->snd_cwnd++;
7883 ++ mptcp_ccc_recalc_alpha(sk);
7884 ++ }
7885 ++
7886 ++ tp->snd_cwnd_cnt = 0;
7887 ++ } else {
7888 ++ tp->snd_cwnd_cnt++;
7889 ++ }
7890 ++}
7891 ++
7892 ++static struct tcp_congestion_ops mptcp_ccc = {
7893 ++ .init = mptcp_ccc_init,
7894 ++ .ssthresh = tcp_reno_ssthresh,
7895 ++ .cong_avoid = mptcp_ccc_cong_avoid,
7896 ++ .cwnd_event = mptcp_ccc_cwnd_event,
7897 ++ .set_state = mptcp_ccc_set_state,
7898 ++ .owner = THIS_MODULE,
7899 ++ .name = "lia",
7900 ++};
7901 ++
7902 ++static int __init mptcp_ccc_register(void)
7903 ++{
7904 ++ BUILD_BUG_ON(sizeof(struct mptcp_ccc) > ICSK_CA_PRIV_SIZE);
7905 ++ return tcp_register_congestion_control(&mptcp_ccc);
7906 ++}
7907 ++
7908 ++static void __exit mptcp_ccc_unregister(void)
7909 ++{
7910 ++ tcp_unregister_congestion_control(&mptcp_ccc);
7911 ++}
7912 ++
7913 ++module_init(mptcp_ccc_register);
7914 ++module_exit(mptcp_ccc_unregister);
7915 ++
7916 ++MODULE_AUTHOR("Christoph Paasch, Sébastien Barré");
7917 ++MODULE_LICENSE("GPL");
7918 ++MODULE_DESCRIPTION("MPTCP LINKED INCREASE CONGESTION CONTROL ALGORITHM");
7919 ++MODULE_VERSION("0.1");
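
For orientation, the coupled congestion control above follows RFC 6356 ("Linked
Increases"): alpha = cwnd_total * max_i(cwnd_i / rtt_i^2) / (sum_i cwnd_i / rtt_i)^2.
The code keeps a fixed-point rescaling of this (alpha_scale_num/den, srtt_us as the
RTT estimate), folding the cwnd_total factor in at increase time in
mptcp_ccc_cong_avoid(). Below is a minimal user-space sketch of the formula in
floating point; it is not part of the patch, and the subflow numbers are made up.

/* User-space sketch (not part of the patch) of the RFC 6356 alpha that
 * mptcp_ccc_recalc_alpha() computes above with fixed-point arithmetic.
 * Subflow values are hypothetical.
 */
#include <stdio.h>

struct subflow { double cwnd; double rtt; };

static double lia_alpha(const struct subflow *sf, int n)
{
	double cwnd_total = 0.0, best = 0.0, denom = 0.0;
	int i;

	for (i = 0; i < n; i++) {
		double num = sf[i].cwnd / (sf[i].rtt * sf[i].rtt);

		cwnd_total += sf[i].cwnd;
		if (num > best)
			best = num;		/* best cwnd_i / rtt_i^2 */
		denom += sf[i].cwnd / sf[i].rtt;
	}
	/* alpha = cwnd_total * max_i(cwnd_i/rtt_i^2) / (sum_i cwnd_i/rtt_i)^2 */
	return cwnd_total * best / (denom * denom);
}

int main(void)
{
	struct subflow sf[2] = { { 10.0, 0.040 }, { 20.0, 0.120 } };

	printf("alpha = %f\n", lia_alpha(sf, 2));
	return 0;
}
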
7920 +diff --git a/net/mptcp/mptcp_ctrl.c b/net/mptcp/mptcp_ctrl.c
7921 +new file mode 100644
7922 +index 000000000000..28dfa0479f5e
7923 +--- /dev/null
7924 ++++ b/net/mptcp/mptcp_ctrl.c
7925 +@@ -0,0 +1,2401 @@
7926 ++/*
7927 ++ * MPTCP implementation - MPTCP-control
7928 ++ *
7929 ++ * Initial Design & Implementation:
7930 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
7931 ++ *
7932 ++ * Current Maintainer & Author:
7933 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
7934 ++ *
7935 ++ * Additional authors:
7936 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
7937 ++ * Gregory Detal <gregory.detal@×××××××××.be>
7938 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
7939 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
7940 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
7941 ++ * Andreas Ripke <ripke@××××××.eu>
7942 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
7943 ++ * Octavian Purdila <octavian.purdila@×××××.com>
7944 ++ * John Ronan <jronan@××××.org>
7945 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
7946 ++ * Brandon Heller <brandonh@××××××××.edu>
7947 ++ *
7948 ++ *
7949 ++ * This program is free software; you can redistribute it and/or
7950 ++ * modify it under the terms of the GNU General Public License
7951 ++ * as published by the Free Software Foundation; either version
7952 ++ * 2 of the License, or (at your option) any later version.
7953 ++ */
7954 ++
7955 ++#include <net/inet_common.h>
7956 ++#include <net/inet6_hashtables.h>
7957 ++#include <net/ipv6.h>
7958 ++#include <net/ip6_checksum.h>
7959 ++#include <net/mptcp.h>
7960 ++#include <net/mptcp_v4.h>
7961 ++#if IS_ENABLED(CONFIG_IPV6)
7962 ++#include <net/ip6_route.h>
7963 ++#include <net/mptcp_v6.h>
7964 ++#endif
7965 ++#include <net/sock.h>
7966 ++#include <net/tcp.h>
7967 ++#include <net/tcp_states.h>
7968 ++#include <net/transp_v6.h>
7969 ++#include <net/xfrm.h>
7970 ++
7971 ++#include <linux/cryptohash.h>
7972 ++#include <linux/kconfig.h>
7973 ++#include <linux/module.h>
7974 ++#include <linux/netpoll.h>
7975 ++#include <linux/list.h>
7976 ++#include <linux/jhash.h>
7977 ++#include <linux/tcp.h>
7978 ++#include <linux/net.h>
7979 ++#include <linux/in.h>
7980 ++#include <linux/random.h>
7981 ++#include <linux/inetdevice.h>
7982 ++#include <linux/workqueue.h>
7983 ++#include <linux/atomic.h>
7984 ++#include <linux/sysctl.h>
7985 ++
7986 ++static struct kmem_cache *mptcp_sock_cache __read_mostly;
7987 ++static struct kmem_cache *mptcp_cb_cache __read_mostly;
7988 ++static struct kmem_cache *mptcp_tw_cache __read_mostly;
7989 ++
7990 ++int sysctl_mptcp_enabled __read_mostly = 1;
7991 ++int sysctl_mptcp_checksum __read_mostly = 1;
7992 ++int sysctl_mptcp_debug __read_mostly;
7993 ++EXPORT_SYMBOL(sysctl_mptcp_debug);
7994 ++int sysctl_mptcp_syn_retries __read_mostly = 3;
7995 ++
7996 ++bool mptcp_init_failed __read_mostly;
7997 ++
7998 ++struct static_key mptcp_static_key = STATIC_KEY_INIT_FALSE;
7999 ++EXPORT_SYMBOL(mptcp_static_key);
8000 ++
8001 ++static int proc_mptcp_path_manager(ctl_table *ctl, int write,
8002 ++ void __user *buffer, size_t *lenp,
8003 ++ loff_t *ppos)
8004 ++{
8005 ++ char val[MPTCP_PM_NAME_MAX];
8006 ++ ctl_table tbl = {
8007 ++ .data = val,
8008 ++ .maxlen = MPTCP_PM_NAME_MAX,
8009 ++ };
8010 ++ int ret;
8011 ++
8012 ++ mptcp_get_default_path_manager(val);
8013 ++
8014 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8015 ++ if (write && ret == 0)
8016 ++ ret = mptcp_set_default_path_manager(val);
8017 ++ return ret;
8018 ++}
8019 ++
8020 ++static int proc_mptcp_scheduler(ctl_table *ctl, int write,
8021 ++ void __user *buffer, size_t *lenp,
8022 ++ loff_t *ppos)
8023 ++{
8024 ++ char val[MPTCP_SCHED_NAME_MAX];
8025 ++ ctl_table tbl = {
8026 ++ .data = val,
8027 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8028 ++ };
8029 ++ int ret;
8030 ++
8031 ++ mptcp_get_default_scheduler(val);
8032 ++
8033 ++ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
8034 ++ if (write && ret == 0)
8035 ++ ret = mptcp_set_default_scheduler(val);
8036 ++ return ret;
8037 ++}
8038 ++
8039 ++static struct ctl_table mptcp_table[] = {
8040 ++ {
8041 ++ .procname = "mptcp_enabled",
8042 ++ .data = &sysctl_mptcp_enabled,
8043 ++ .maxlen = sizeof(int),
8044 ++ .mode = 0644,
8045 ++ .proc_handler = &proc_dointvec
8046 ++ },
8047 ++ {
8048 ++ .procname = "mptcp_checksum",
8049 ++ .data = &sysctl_mptcp_checksum,
8050 ++ .maxlen = sizeof(int),
8051 ++ .mode = 0644,
8052 ++ .proc_handler = &proc_dointvec
8053 ++ },
8054 ++ {
8055 ++ .procname = "mptcp_debug",
8056 ++ .data = &sysctl_mptcp_debug,
8057 ++ .maxlen = sizeof(int),
8058 ++ .mode = 0644,
8059 ++ .proc_handler = &proc_dointvec
8060 ++ },
8061 ++ {
8062 ++ .procname = "mptcp_syn_retries",
8063 ++ .data = &sysctl_mptcp_syn_retries,
8064 ++ .maxlen = sizeof(int),
8065 ++ .mode = 0644,
8066 ++ .proc_handler = &proc_dointvec
8067 ++ },
8068 ++ {
8069 ++ .procname = "mptcp_path_manager",
8070 ++ .mode = 0644,
8071 ++ .maxlen = MPTCP_PM_NAME_MAX,
8072 ++ .proc_handler = proc_mptcp_path_manager,
8073 ++ },
8074 ++ {
8075 ++ .procname = "mptcp_scheduler",
8076 ++ .mode = 0644,
8077 ++ .maxlen = MPTCP_SCHED_NAME_MAX,
8078 ++ .proc_handler = proc_mptcp_scheduler,
8079 ++ },
8080 ++ { }
8081 ++};
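
These knobs are exposed through sysctl. Assuming the table is registered under
net.mptcp, as in the multipath-tcp.org kernels, they appear as
/proc/sys/net/mptcp/mptcp_enabled and friends. A small illustration-only reader
follows; the path is an assumption based on that registration.

/* Illustration only: read the mptcp_enabled sysctl from procfs.  The
 * /proc/sys/net/mptcp/ path assumes the table above is registered under
 * "net.mptcp".
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/mptcp/mptcp_enabled", "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror("mptcp_enabled");
		if (f)
			fclose(f);
		return 1;
	}
	printf("mptcp_enabled = %d\n", val);
	fclose(f);
	return 0;
}
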
8082 ++
8083 ++static inline u32 mptcp_hash_tk(u32 token)
8084 ++{
8085 ++ return token % MPTCP_HASH_SIZE;
8086 ++}
8087 ++
8088 ++struct hlist_nulls_head tk_hashtable[MPTCP_HASH_SIZE];
8089 ++EXPORT_SYMBOL(tk_hashtable);
8090 ++
8091 ++/* This second hashtable is needed to retrieve request socks
8092 ++ * created as a result of a join request. While the SYN contains
8093 ++ * the token, the final ack does not, so we need a separate hashtable
8094 ++ * to retrieve the mpcb.
8095 ++ */
8096 ++struct hlist_nulls_head mptcp_reqsk_htb[MPTCP_HASH_SIZE];
8097 ++spinlock_t mptcp_reqsk_hlock; /* hashtable protection */
8098 ++
8099 ++/* The following hash table is used to avoid collision of token */
8100 ++static struct hlist_nulls_head mptcp_reqsk_tk_htb[MPTCP_HASH_SIZE];
8101 ++spinlock_t mptcp_tk_hashlock; /* hashtable protection */
8102 ++
8103 ++static bool mptcp_reqsk_find_tk(const u32 token)
8104 ++{
8105 ++ const u32 hash = mptcp_hash_tk(token);
8106 ++ const struct mptcp_request_sock *mtreqsk;
8107 ++ const struct hlist_nulls_node *node;
8108 ++
8109 ++begin:
8110 ++ hlist_nulls_for_each_entry_rcu(mtreqsk, node,
8111 ++ &mptcp_reqsk_tk_htb[hash], hash_entry) {
8112 ++ if (token == mtreqsk->mptcp_loc_token)
8113 ++ return true;
8114 ++ }
8115 ++ /* A request-socket is destroyed by RCU. So, it might have been recycled
8116 ++ * and put into another hash-table list. So, after the lookup we may
8117 ++ * end up in a different list. So, we may need to restart.
8118 ++ *
8119 ++ * See also the comment in __inet_lookup_established.
8120 ++ */
8121 ++ if (get_nulls_value(node) != hash)
8122 ++ goto begin;
8123 ++ return false;
8124 ++}
8125 ++
8126 ++static void mptcp_reqsk_insert_tk(struct request_sock *reqsk, const u32 token)
8127 ++{
8128 ++ u32 hash = mptcp_hash_tk(token);
8129 ++
8130 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(reqsk)->hash_entry,
8131 ++ &mptcp_reqsk_tk_htb[hash]);
8132 ++}
8133 ++
8134 ++static void mptcp_reqsk_remove_tk(const struct request_sock *reqsk)
8135 ++{
8136 ++ rcu_read_lock();
8137 ++ spin_lock(&mptcp_tk_hashlock);
8138 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(reqsk)->hash_entry);
8139 ++ spin_unlock(&mptcp_tk_hashlock);
8140 ++ rcu_read_unlock();
8141 ++}
8142 ++
8143 ++void mptcp_reqsk_destructor(struct request_sock *req)
8144 ++{
8145 ++ if (!mptcp_rsk(req)->is_sub) {
8146 ++ if (in_softirq()) {
8147 ++ mptcp_reqsk_remove_tk(req);
8148 ++ } else {
8149 ++ rcu_read_lock_bh();
8150 ++ spin_lock(&mptcp_tk_hashlock);
8151 ++ hlist_nulls_del_init_rcu(&mptcp_rsk(req)->hash_entry);
8152 ++ spin_unlock(&mptcp_tk_hashlock);
8153 ++ rcu_read_unlock_bh();
8154 ++ }
8155 ++ } else {
8156 ++ mptcp_hash_request_remove(req);
8157 ++ }
8158 ++}
8159 ++
8160 ++static void __mptcp_hash_insert(struct tcp_sock *meta_tp, const u32 token)
8161 ++{
8162 ++ u32 hash = mptcp_hash_tk(token);
8163 ++ hlist_nulls_add_head_rcu(&meta_tp->tk_table, &tk_hashtable[hash]);
8164 ++ meta_tp->inside_tk_table = 1;
8165 ++}
8166 ++
8167 ++static bool mptcp_find_token(u32 token)
8168 ++{
8169 ++ const u32 hash = mptcp_hash_tk(token);
8170 ++ const struct tcp_sock *meta_tp;
8171 ++ const struct hlist_nulls_node *node;
8172 ++
8173 ++begin:
8174 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash], tk_table) {
8175 ++ if (token == meta_tp->mptcp_loc_token)
8176 ++ return true;
8177 ++ }
8178 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8179 ++ * and put into another hash-table list. So, after the lookup we may
8180 ++ * end up in a different list. So, we may need to restart.
8181 ++ *
8182 ++ * See also the comment in __inet_lookup_established.
8183 ++ */
8184 ++ if (get_nulls_value(node) != hash)
8185 ++ goto begin;
8186 ++ return false;
8187 ++}
8188 ++
8189 ++static void mptcp_set_key_reqsk(struct request_sock *req,
8190 ++ const struct sk_buff *skb)
8191 ++{
8192 ++ const struct inet_request_sock *ireq = inet_rsk(req);
8193 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8194 ++
8195 ++ if (skb->protocol == htons(ETH_P_IP)) {
8196 ++ mtreq->mptcp_loc_key = mptcp_v4_get_key(ip_hdr(skb)->saddr,
8197 ++ ip_hdr(skb)->daddr,
8198 ++ htons(ireq->ir_num),
8199 ++ ireq->ir_rmt_port);
8200 ++#if IS_ENABLED(CONFIG_IPV6)
8201 ++ } else {
8202 ++ mtreq->mptcp_loc_key = mptcp_v6_get_key(ipv6_hdr(skb)->saddr.s6_addr32,
8203 ++ ipv6_hdr(skb)->daddr.s6_addr32,
8204 ++ htons(ireq->ir_num),
8205 ++ ireq->ir_rmt_port);
8206 ++#endif
8207 ++ }
8208 ++
8209 ++ mptcp_key_sha1(mtreq->mptcp_loc_key, &mtreq->mptcp_loc_token, NULL);
8210 ++}
8211 ++
8212 ++/* New MPTCP-connection request, prepare a new token for the meta-socket that
8213 ++ * will be created in mptcp_check_req_master(), and store the received token.
8214 ++ */
8215 ++void mptcp_reqsk_new_mptcp(struct request_sock *req,
8216 ++ const struct mptcp_options_received *mopt,
8217 ++ const struct sk_buff *skb)
8218 ++{
8219 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
8220 ++
8221 ++ inet_rsk(req)->saw_mpc = 1;
8222 ++
8223 ++ rcu_read_lock();
8224 ++ spin_lock(&mptcp_tk_hashlock);
8225 ++ do {
8226 ++ mptcp_set_key_reqsk(req, skb);
8227 ++ } while (mptcp_reqsk_find_tk(mtreq->mptcp_loc_token) ||
8228 ++ mptcp_find_token(mtreq->mptcp_loc_token));
8229 ++
8230 ++ mptcp_reqsk_insert_tk(req, mtreq->mptcp_loc_token);
8231 ++ spin_unlock(&mptcp_tk_hashlock);
8232 ++ rcu_read_unlock();
8233 ++ mtreq->mptcp_rem_key = mopt->mptcp_key;
8234 ++}
8235 ++
8236 ++static void mptcp_set_key_sk(const struct sock *sk)
8237 ++{
8238 ++ struct tcp_sock *tp = tcp_sk(sk);
8239 ++ const struct inet_sock *isk = inet_sk(sk);
8240 ++
8241 ++ if (sk->sk_family == AF_INET)
8242 ++ tp->mptcp_loc_key = mptcp_v4_get_key(isk->inet_saddr,
8243 ++ isk->inet_daddr,
8244 ++ isk->inet_sport,
8245 ++ isk->inet_dport);
8246 ++#if IS_ENABLED(CONFIG_IPV6)
8247 ++ else
8248 ++ tp->mptcp_loc_key = mptcp_v6_get_key(inet6_sk(sk)->saddr.s6_addr32,
8249 ++ sk->sk_v6_daddr.s6_addr32,
8250 ++ isk->inet_sport,
8251 ++ isk->inet_dport);
8252 ++#endif
8253 ++
8254 ++ mptcp_key_sha1(tp->mptcp_loc_key,
8255 ++ &tp->mptcp_loc_token, NULL);
8256 ++}
8257 ++
8258 ++void mptcp_connect_init(struct sock *sk)
8259 ++{
8260 ++ struct tcp_sock *tp = tcp_sk(sk);
8261 ++
8262 ++ rcu_read_lock_bh();
8263 ++ spin_lock(&mptcp_tk_hashlock);
8264 ++ do {
8265 ++ mptcp_set_key_sk(sk);
8266 ++ } while (mptcp_reqsk_find_tk(tp->mptcp_loc_token) ||
8267 ++ mptcp_find_token(tp->mptcp_loc_token));
8268 ++
8269 ++ __mptcp_hash_insert(tp, tp->mptcp_loc_token);
8270 ++ spin_unlock(&mptcp_tk_hashlock);
8271 ++ rcu_read_unlock_bh();
8272 ++}
8273 ++
8274 ++/**
8275 ++ * This function increments the refcount of the mpcb struct.
8276 ++ * It is the responsibility of the caller to decrement when releasing
8277 ++ * the structure.
8278 ++ */
8279 ++struct sock *mptcp_hash_find(const struct net *net, const u32 token)
8280 ++{
8281 ++ const u32 hash = mptcp_hash_tk(token);
8282 ++ const struct tcp_sock *meta_tp;
8283 ++ struct sock *meta_sk = NULL;
8284 ++ const struct hlist_nulls_node *node;
8285 ++
8286 ++ rcu_read_lock();
8287 ++begin:
8288 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[hash],
8289 ++ tk_table) {
8290 ++ meta_sk = (struct sock *)meta_tp;
8291 ++ if (token == meta_tp->mptcp_loc_token &&
8292 ++ net_eq(net, sock_net(meta_sk))) {
8293 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
8294 ++ goto out;
8295 ++ if (unlikely(token != meta_tp->mptcp_loc_token ||
8296 ++ !net_eq(net, sock_net(meta_sk)))) {
8297 ++ sock_gen_put(meta_sk);
8298 ++ goto begin;
8299 ++ }
8300 ++ goto found;
8301 ++ }
8302 ++ }
8303 ++ /* A TCP-socket is destroyed by RCU. So, it might have been recycled
8304 ++ * and put into another hash-table list. So, after the lookup we may
8305 ++ * end up in a different list. So, we may need to restart.
8306 ++ *
8307 ++ * See also the comment in __inet_lookup_established.
8308 ++ */
8309 ++ if (get_nulls_value(node) != hash)
8310 ++ goto begin;
8311 ++out:
8312 ++ meta_sk = NULL;
8313 ++found:
8314 ++ rcu_read_unlock();
8315 ++ return meta_sk;
8316 ++}
8317 ++
8318 ++void mptcp_hash_remove_bh(struct tcp_sock *meta_tp)
8319 ++{
8320 ++ /* remove from the token hashtable */
8321 ++ rcu_read_lock_bh();
8322 ++ spin_lock(&mptcp_tk_hashlock);
8323 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8324 ++ meta_tp->inside_tk_table = 0;
8325 ++ spin_unlock(&mptcp_tk_hashlock);
8326 ++ rcu_read_unlock_bh();
8327 ++}
8328 ++
8329 ++void mptcp_hash_remove(struct tcp_sock *meta_tp)
8330 ++{
8331 ++ rcu_read_lock();
8332 ++ spin_lock(&mptcp_tk_hashlock);
8333 ++ hlist_nulls_del_init_rcu(&meta_tp->tk_table);
8334 ++ meta_tp->inside_tk_table = 0;
8335 ++ spin_unlock(&mptcp_tk_hashlock);
8336 ++ rcu_read_unlock();
8337 ++}
8338 ++
8339 ++struct sock *mptcp_select_ack_sock(const struct sock *meta_sk)
8340 ++{
8341 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
8342 ++ struct sock *sk, *rttsk = NULL, *lastsk = NULL;
8343 ++ u32 min_time = 0, last_active = 0;
8344 ++
8345 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
8346 ++ struct tcp_sock *tp = tcp_sk(sk);
8347 ++ u32 elapsed;
8348 ++
8349 ++ if (!mptcp_sk_can_send_ack(sk) || tp->pf)
8350 ++ continue;
8351 ++
8352 ++ elapsed = keepalive_time_elapsed(tp);
8353 ++
8354 ++ /* We take the one with the lowest RTT within a reasonable
8355 ++ * (meta-RTO)-timeframe
8356 ++ */
8357 ++ if (elapsed < inet_csk(meta_sk)->icsk_rto) {
8358 ++ if (!min_time || tp->srtt_us < min_time) {
8359 ++ min_time = tp->srtt_us;
8360 ++ rttsk = sk;
8361 ++ }
8362 ++ continue;
8363 ++ }
8364 ++
8365 ++ /* Otherwise, we just take the most recent active */
8366 ++ if (!rttsk && (!last_active || elapsed < last_active)) {
8367 ++ last_active = elapsed;
8368 ++ lastsk = sk;
8369 ++ }
8370 ++ }
8371 ++
8372 ++ if (rttsk)
8373 ++ return rttsk;
8374 ++
8375 ++ return lastsk;
8376 ++}
8377 ++EXPORT_SYMBOL(mptcp_select_ack_sock);
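
The selection rule above prefers the lowest-srtt subflow among those that were
active within the meta retransmission timeout, and otherwise falls back to the most
recently active subflow. A user-space sketch of the same rule over hypothetical
subflow samples:

/* Sketch of the ACK-subflow selection above: lowest srtt within the
 * meta-RTO window, else the most recently active subflow.  All numbers
 * are hypothetical.
 */
#include <stdio.h>

struct sub {
	const char *name;
	unsigned int srtt_us;	/* smoothed RTT */
	unsigned int elapsed;	/* time since last activity */
};

static const struct sub *select_ack_sub(const struct sub *s, int n,
					unsigned int meta_rto)
{
	const struct sub *rtt_best = NULL, *last_active = NULL;

	for (int i = 0; i < n; i++) {
		if (s[i].elapsed < meta_rto) {
			if (!rtt_best || s[i].srtt_us < rtt_best->srtt_us)
				rtt_best = &s[i];
			continue;
		}
		if (!rtt_best &&
		    (!last_active || s[i].elapsed < last_active->elapsed))
			last_active = &s[i];
	}
	return rtt_best ? rtt_best : last_active;
}

int main(void)
{
	struct sub subs[2] = {
		{ "wifi", 20000, 50 },
		{ "lte",  60000, 10 },
	};

	printf("ack on: %s\n", select_ack_sub(subs, 2, 200)->name);
	return 0;
}
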
8378 ++
8379 ++static void mptcp_sock_def_error_report(struct sock *sk)
8380 ++{
8381 ++ const struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
8382 ++
8383 ++ if (!sock_flag(sk, SOCK_DEAD))
8384 ++ mptcp_sub_close(sk, 0);
8385 ++
8386 ++ if (mpcb->infinite_mapping_rcv || mpcb->infinite_mapping_snd ||
8387 ++ mpcb->send_infinite_mapping) {
8388 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8389 ++
8390 ++ meta_sk->sk_err = sk->sk_err;
8391 ++ meta_sk->sk_err_soft = sk->sk_err_soft;
8392 ++
8393 ++ if (!sock_flag(meta_sk, SOCK_DEAD))
8394 ++ meta_sk->sk_error_report(meta_sk);
8395 ++
8396 ++ tcp_done(meta_sk);
8397 ++ }
8398 ++
8399 ++ sk->sk_err = 0;
8400 ++ return;
8401 ++}
8402 ++
8403 ++static void mptcp_mpcb_put(struct mptcp_cb *mpcb)
8404 ++{
8405 ++ if (atomic_dec_and_test(&mpcb->mpcb_refcnt)) {
8406 ++ mptcp_cleanup_path_manager(mpcb);
8407 ++ mptcp_cleanup_scheduler(mpcb);
8408 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8409 ++ }
8410 ++}
8411 ++
8412 ++static void mptcp_sock_destruct(struct sock *sk)
8413 ++{
8414 ++ struct tcp_sock *tp = tcp_sk(sk);
8415 ++
8416 ++ inet_sock_destruct(sk);
8417 ++
8418 ++ if (!is_meta_sk(sk) && !tp->was_meta_sk) {
8419 ++ BUG_ON(!hlist_unhashed(&tp->mptcp->cb_list));
8420 ++
8421 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
8422 ++ tp->mptcp = NULL;
8423 ++
8424 ++ /* Taken when mpcb pointer was set */
8425 ++ sock_put(mptcp_meta_sk(sk));
8426 ++ mptcp_mpcb_put(tp->mpcb);
8427 ++ } else {
8428 ++ struct mptcp_cb *mpcb = tp->mpcb;
8429 ++ struct mptcp_tw *mptw;
8430 ++
8431 ++ /* The mpcb is disappearing - we can make the final
8432 ++ * update to the rcv_nxt of the time-wait-sock and remove
8433 ++ * its reference to the mpcb.
8434 ++ */
8435 ++ spin_lock_bh(&mpcb->tw_lock);
8436 ++ list_for_each_entry_rcu(mptw, &mpcb->tw_list, list) {
8437 ++ list_del_rcu(&mptw->list);
8438 ++ mptw->in_list = 0;
8439 ++ mptcp_mpcb_put(mpcb);
8440 ++ rcu_assign_pointer(mptw->mpcb, NULL);
8441 ++ }
8442 ++ spin_unlock_bh(&mpcb->tw_lock);
8443 ++
8444 ++ mptcp_mpcb_put(mpcb);
8445 ++
8446 ++ mptcp_debug("%s destroying meta-sk\n", __func__);
8447 ++ }
8448 ++
8449 ++ WARN_ON(!static_key_false(&mptcp_static_key));
8450 ++ /* Must be the last call, because is_meta_sk() above still needs the
8451 ++ * static key
8452 ++ */
8453 ++ static_key_slow_dec(&mptcp_static_key);
8454 ++}
8455 ++
8456 ++void mptcp_destroy_sock(struct sock *sk)
8457 ++{
8458 ++ if (is_meta_sk(sk)) {
8459 ++ struct sock *sk_it, *tmpsk;
8460 ++
8461 ++ __skb_queue_purge(&tcp_sk(sk)->mpcb->reinject_queue);
8462 ++ mptcp_purge_ofo_queue(tcp_sk(sk));
8463 ++
8464 ++ /* We have to close all remaining subflows. Normally, they
8465 ++ * should all be about to get closed. But, if the kernel is
8466 ++ * forcing a closure (e.g., tcp_write_err), the subflows might
8467 ++ * not have been closed properly (as we are waiting for the
8468 ++ * DATA_ACK of the DATA_FIN).
8469 ++ */
8470 ++ mptcp_for_each_sk_safe(tcp_sk(sk)->mpcb, sk_it, tmpsk) {
8471 ++			/* tcp_close has already been called - waiting for graceful
8472 ++ * closure, or if we are retransmitting fast-close on
8473 ++ * the subflow. The reset (or timeout) will kill the
8474 ++			 * subflow.
8475 ++ */
8476 ++ if (tcp_sk(sk_it)->closing ||
8477 ++ tcp_sk(sk_it)->send_mp_fclose)
8478 ++ continue;
8479 ++
8480 ++ /* Allow the delayed work first to prevent time-wait state */
8481 ++ if (delayed_work_pending(&tcp_sk(sk_it)->mptcp->work))
8482 ++ continue;
8483 ++
8484 ++ mptcp_sub_close(sk_it, 0);
8485 ++ }
8486 ++
8487 ++ mptcp_delete_synack_timer(sk);
8488 ++ } else {
8489 ++ mptcp_del_sock(sk);
8490 ++ }
8491 ++}
8492 ++
8493 ++static void mptcp_set_state(struct sock *sk)
8494 ++{
8495 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
8496 ++
8497 ++ /* Meta is not yet established - wake up the application */
8498 ++ if ((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) &&
8499 ++ sk->sk_state == TCP_ESTABLISHED) {
8500 ++ tcp_set_state(meta_sk, TCP_ESTABLISHED);
8501 ++
8502 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
8503 ++ meta_sk->sk_state_change(meta_sk);
8504 ++ sk_wake_async(meta_sk, SOCK_WAKE_IO, POLL_OUT);
8505 ++ }
8506 ++ }
8507 ++
8508 ++ if (sk->sk_state == TCP_ESTABLISHED) {
8509 ++ tcp_sk(sk)->mptcp->establish_increased = 1;
8510 ++ tcp_sk(sk)->mpcb->cnt_established++;
8511 ++ }
8512 ++}
8513 ++
8514 ++void mptcp_init_congestion_control(struct sock *sk)
8515 ++{
8516 ++ struct inet_connection_sock *icsk = inet_csk(sk);
8517 ++ struct inet_connection_sock *meta_icsk = inet_csk(mptcp_meta_sk(sk));
8518 ++ const struct tcp_congestion_ops *ca = meta_icsk->icsk_ca_ops;
8519 ++
8520 ++	/* The application didn't set the congestion control to use;
8521 ++	 * fall back to the default one.
8522 ++ */
8523 ++ if (ca == &tcp_init_congestion_ops)
8524 ++ goto use_default;
8525 ++
8526 ++ /* Use the same congestion control as set by the user. If the
8527 ++	 * module is not available, fall back to the default one.
8528 ++ */
8529 ++ if (!try_module_get(ca->owner)) {
8530 ++ pr_warn("%s: fallback to the system default CC\n", __func__);
8531 ++ goto use_default;
8532 ++ }
8533 ++
8534 ++ icsk->icsk_ca_ops = ca;
8535 ++ if (icsk->icsk_ca_ops->init)
8536 ++ icsk->icsk_ca_ops->init(sk);
8537 ++
8538 ++ return;
8539 ++
8540 ++use_default:
8541 ++ icsk->icsk_ca_ops = &tcp_init_congestion_ops;
8542 ++ tcp_init_congestion_control(sk);
8543 ++}
8544 ++
8545 ++u32 mptcp_secret[MD5_MESSAGE_BYTES / 4] ____cacheline_aligned;
8546 ++u32 mptcp_seed = 0;
8547 ++
8548 ++void mptcp_key_sha1(u64 key, u32 *token, u64 *idsn)
8549 ++{
8550 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8551 ++ u32 mptcp_hashed_key[SHA_DIGEST_WORDS];
8552 ++ u8 input[64];
8553 ++ int i;
8554 ++
8555 ++ memset(workspace, 0, sizeof(workspace));
8556 ++
8557 ++ /* Initialize input with appropriate padding */
8558 ++ memset(&input[9], 0, sizeof(input) - 10); /* -10, because the last byte
8559 ++ * is explicitly set too
8560 ++ */
8561 ++ memcpy(input, &key, sizeof(key)); /* Copy key to the msg beginning */
8562 ++ input[8] = 0x80; /* Padding: First bit after message = 1 */
8563 ++ input[63] = 0x40; /* Padding: Length of the message = 64 bits */
8564 ++
8565 ++ sha_init(mptcp_hashed_key);
8566 ++ sha_transform(mptcp_hashed_key, input, workspace);
8567 ++
8568 ++ for (i = 0; i < 5; i++)
8569 ++ mptcp_hashed_key[i] = cpu_to_be32(mptcp_hashed_key[i]);
8570 ++
8571 ++ if (token)
8572 ++ *token = mptcp_hashed_key[0];
8573 ++ if (idsn)
8574 ++ *idsn = *((u64 *)&mptcp_hashed_key[3]);
8575 ++}
8576 ++
8577 ++void mptcp_hmac_sha1(u8 *key_1, u8 *key_2, u8 *rand_1, u8 *rand_2,
8578 ++ u32 *hash_out)
8579 ++{
8580 ++ u32 workspace[SHA_WORKSPACE_WORDS];
8581 ++ u8 input[128]; /* 2 512-bit blocks */
8582 ++ int i;
8583 ++
8584 ++ memset(workspace, 0, sizeof(workspace));
8585 ++
8586 ++ /* Generate key xored with ipad */
8587 ++ memset(input, 0x36, 64);
8588 ++ for (i = 0; i < 8; i++)
8589 ++ input[i] ^= key_1[i];
8590 ++ for (i = 0; i < 8; i++)
8591 ++ input[i + 8] ^= key_2[i];
8592 ++
8593 ++ memcpy(&input[64], rand_1, 4);
8594 ++ memcpy(&input[68], rand_2, 4);
8595 ++ input[72] = 0x80; /* Padding: First bit after message = 1 */
8596 ++ memset(&input[73], 0, 53);
8597 ++
8598 ++ /* Padding: Length of the message = 512 + 64 bits */
8599 ++ input[126] = 0x02;
8600 ++ input[127] = 0x40;
8601 ++
8602 ++ sha_init(hash_out);
8603 ++ sha_transform(hash_out, input, workspace);
8604 ++ memset(workspace, 0, sizeof(workspace));
8605 ++
8606 ++ sha_transform(hash_out, &input[64], workspace);
8607 ++ memset(workspace, 0, sizeof(workspace));
8608 ++
8609 ++ for (i = 0; i < 5; i++)
8610 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8611 ++
8612 ++ /* Prepare second part of hmac */
8613 ++ memset(input, 0x5C, 64);
8614 ++ for (i = 0; i < 8; i++)
8615 ++ input[i] ^= key_1[i];
8616 ++ for (i = 0; i < 8; i++)
8617 ++ input[i + 8] ^= key_2[i];
8618 ++
8619 ++ memcpy(&input[64], hash_out, 20);
8620 ++ input[84] = 0x80;
8621 ++ memset(&input[85], 0, 41);
8622 ++
8623 ++ /* Padding: Length of the message = 512 + 160 bits */
8624 ++ input[126] = 0x02;
8625 ++ input[127] = 0xA0;
8626 ++
8627 ++ sha_init(hash_out);
8628 ++ sha_transform(hash_out, input, workspace);
8629 ++ memset(workspace, 0, sizeof(workspace));
8630 ++
8631 ++ sha_transform(hash_out, &input[64], workspace);
8632 ++
8633 ++ for (i = 0; i < 5; i++)
8634 ++ hash_out[i] = cpu_to_be32(hash_out[i]);
8635 ++}
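
mptcp_key_sha1() above hashes the 64-bit key by hand-building one pre-padded SHA-1
block (the 8 key bytes, the 0x80 terminator, zero fill, and the 64-bit message
length in the last byte), then uses digest word 0 as the 32-bit token and words 3-4
as the 64-bit IDSN. A sketch of just that block layout, illustration only, with a
made-up key:

/* Sketch of the one-block SHA-1 padding built by mptcp_key_sha1(): 8 key
 * bytes, the 0x80 padding terminator, zeroes, and the message length
 * (64 bits -> 0x40) in the final byte.  The key is made up.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t key = 0x0123456789abcdefULL;	/* hypothetical local key */
	uint8_t block[64];

	memset(block, 0, sizeof(block));
	memcpy(block, &key, sizeof(key));	/* message: the raw key */
	block[8]  = 0x80;			/* first padding bit */
	block[63] = 0x40;			/* length field: 64 bits */

	for (int i = 0; i < 64; i++)
		printf("%02x%c", block[i], (i % 16 == 15) ? '\n' : ' ');
	return 0;
}
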
8636 ++
8637 ++static void mptcp_mpcb_inherit_sockopts(struct sock *meta_sk, struct sock *master_sk)
8638 ++{
8639 ++ /* Socket-options handled by sk_clone_lock while creating the meta-sk.
8640 ++ * ======
8641 ++ * SO_SNDBUF, SO_SNDBUFFORCE, SO_RCVBUF, SO_RCVBUFFORCE, SO_RCVLOWAT,
8642 ++ * SO_RCVTIMEO, SO_SNDTIMEO, SO_ATTACH_FILTER, SO_DETACH_FILTER,
8643 ++ * TCP_NODELAY, TCP_CORK
8644 ++ *
8645 ++ * Socket-options handled in this function here
8646 ++ * ======
8647 ++ * TCP_DEFER_ACCEPT
8648 ++ * SO_KEEPALIVE
8649 ++ *
8650 ++ * Socket-options on the todo-list
8651 ++ * ======
8652 ++ * SO_BINDTODEVICE - should probably prevent creation of new subsocks
8653 ++ * across other devices. - what about the api-draft?
8654 ++ * SO_DEBUG
8655 ++ * SO_REUSEADDR - probably we don't care about this
8656 ++ * SO_DONTROUTE, SO_BROADCAST
8657 ++ * SO_OOBINLINE
8658 ++ * SO_LINGER
8659 ++ * SO_TIMESTAMP* - I don't think this is of concern for a SOCK_STREAM
8660 ++ * SO_PASSSEC - I don't think this is of concern for a SOCK_STREAM
8661 ++ * SO_RXQ_OVFL
8662 ++ * TCP_COOKIE_TRANSACTIONS
8663 ++ * TCP_MAXSEG
8664 ++ * TCP_THIN_* - Handled by sk_clone_lock, but we need to support this
8665 ++ * in mptcp_retransmit_timer. AND we need to check what is
8666 ++ * about the subsockets.
8667 ++ * TCP_LINGER2
8668 ++ * TCP_WINDOW_CLAMP
8669 ++ * TCP_USER_TIMEOUT
8670 ++ * TCP_MD5SIG
8671 ++ *
8672 ++ * Socket-options of no concern for the meta-socket (but for the subsocket)
8673 ++ * ======
8674 ++ * SO_PRIORITY
8675 ++ * SO_MARK
8676 ++ * TCP_CONGESTION
8677 ++ * TCP_SYNCNT
8678 ++ * TCP_QUICKACK
8679 ++ */
8680 ++
8681 ++ /* DEFER_ACCEPT should not be set on the meta, as we want to accept new subflows directly */
8682 ++ inet_csk(meta_sk)->icsk_accept_queue.rskq_defer_accept = 0;
8683 ++
8684 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8685 ++ if (sock_flag(meta_sk, SOCK_KEEPOPEN)) {
8686 ++ inet_csk_reset_keepalive_timer(meta_sk,
8687 ++ keepalive_time_when(tcp_sk(meta_sk)));
8688 ++ sock_reset_flag(master_sk, SOCK_KEEPOPEN);
8689 ++ inet_csk_delete_keepalive_timer(master_sk);
8690 ++ }
8691 ++
8692 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8693 ++ inet_sk(master_sk)->recverr = 0;
8694 ++}
8695 ++
8696 ++static void mptcp_sub_inherit_sockopts(const struct sock *meta_sk, struct sock *sub_sk)
8697 ++{
8698 ++ /* IP_TOS also goes to the subflow. */
8699 ++ if (inet_sk(sub_sk)->tos != inet_sk(meta_sk)->tos) {
8700 ++ inet_sk(sub_sk)->tos = inet_sk(meta_sk)->tos;
8701 ++ sub_sk->sk_priority = meta_sk->sk_priority;
8702 ++ sk_dst_reset(sub_sk);
8703 ++ }
8704 ++
8705 ++ /* Inherit SO_REUSEADDR */
8706 ++ sub_sk->sk_reuse = meta_sk->sk_reuse;
8707 ++
8708 ++ /* Inherit snd/rcv-buffer locks */
8709 ++ sub_sk->sk_userlocks = meta_sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
8710 ++
8711 ++ /* Nagle/Cork is forced off on the subflows. It is handled at the meta-layer */
8712 ++ tcp_sk(sub_sk)->nonagle = TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
8713 ++
8714 ++ /* Keepalives are handled entirely at the MPTCP-layer */
8715 ++ if (sock_flag(sub_sk, SOCK_KEEPOPEN)) {
8716 ++ sock_reset_flag(sub_sk, SOCK_KEEPOPEN);
8717 ++ inet_csk_delete_keepalive_timer(sub_sk);
8718 ++ }
8719 ++
8720 ++ /* Do not propagate subflow-errors up to the MPTCP-layer */
8721 ++ inet_sk(sub_sk)->recverr = 0;
8722 ++}
8723 ++
8724 ++int mptcp_backlog_rcv(struct sock *meta_sk, struct sk_buff *skb)
8725 ++{
8726 ++	/* skb->sk may be NULL if we receive a packet immediately after the
8727 ++ * SYN/ACK + MP_CAPABLE.
8728 ++ */
8729 ++ struct sock *sk = skb->sk ? skb->sk : meta_sk;
8730 ++ int ret = 0;
8731 ++
8732 ++ skb->sk = NULL;
8733 ++
8734 ++ if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt))) {
8735 ++ kfree_skb(skb);
8736 ++ return 0;
8737 ++ }
8738 ++
8739 ++ if (sk->sk_family == AF_INET)
8740 ++ ret = tcp_v4_do_rcv(sk, skb);
8741 ++#if IS_ENABLED(CONFIG_IPV6)
8742 ++ else
8743 ++ ret = tcp_v6_do_rcv(sk, skb);
8744 ++#endif
8745 ++
8746 ++ sock_put(sk);
8747 ++ return ret;
8748 ++}
8749 ++
8750 ++struct lock_class_key meta_key;
8751 ++struct lock_class_key meta_slock_key;
8752 ++
8753 ++static void mptcp_synack_timer_handler(unsigned long data)
8754 ++{
8755 ++ struct sock *meta_sk = (struct sock *) data;
8756 ++ struct listen_sock *lopt = inet_csk(meta_sk)->icsk_accept_queue.listen_opt;
8757 ++
8758 ++ /* Only process if socket is not in use. */
8759 ++ bh_lock_sock(meta_sk);
8760 ++
8761 ++ if (sock_owned_by_user(meta_sk)) {
8762 ++ /* Try again later. */
8763 ++ mptcp_reset_synack_timer(meta_sk, HZ/20);
8764 ++ goto out;
8765 ++ }
8766 ++
8767 ++ /* May happen if the queue got destructed in mptcp_close */
8768 ++ if (!lopt)
8769 ++ goto out;
8770 ++
8771 ++ inet_csk_reqsk_queue_prune(meta_sk, TCP_SYNQ_INTERVAL,
8772 ++ TCP_TIMEOUT_INIT, TCP_RTO_MAX);
8773 ++
8774 ++ if (lopt->qlen)
8775 ++ mptcp_reset_synack_timer(meta_sk, TCP_SYNQ_INTERVAL);
8776 ++
8777 ++out:
8778 ++ bh_unlock_sock(meta_sk);
8779 ++ sock_put(meta_sk);
8780 ++}
8781 ++
8782 ++static const struct tcp_sock_ops mptcp_meta_specific = {
8783 ++ .__select_window = __mptcp_select_window,
8784 ++ .select_window = mptcp_select_window,
8785 ++ .select_initial_window = mptcp_select_initial_window,
8786 ++ .init_buffer_space = mptcp_init_buffer_space,
8787 ++ .set_rto = mptcp_tcp_set_rto,
8788 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8789 ++ .init_congestion_control = mptcp_init_congestion_control,
8790 ++ .send_fin = mptcp_send_fin,
8791 ++ .write_xmit = mptcp_write_xmit,
8792 ++ .send_active_reset = mptcp_send_active_reset,
8793 ++ .write_wakeup = mptcp_write_wakeup,
8794 ++ .prune_ofo_queue = mptcp_prune_ofo_queue,
8795 ++ .retransmit_timer = mptcp_retransmit_timer,
8796 ++ .time_wait = mptcp_time_wait,
8797 ++ .cleanup_rbuf = mptcp_cleanup_rbuf,
8798 ++};
8799 ++
8800 ++static const struct tcp_sock_ops mptcp_sub_specific = {
8801 ++ .__select_window = __mptcp_select_window,
8802 ++ .select_window = mptcp_select_window,
8803 ++ .select_initial_window = mptcp_select_initial_window,
8804 ++ .init_buffer_space = mptcp_init_buffer_space,
8805 ++ .set_rto = mptcp_tcp_set_rto,
8806 ++ .should_expand_sndbuf = mptcp_should_expand_sndbuf,
8807 ++ .init_congestion_control = mptcp_init_congestion_control,
8808 ++ .send_fin = tcp_send_fin,
8809 ++ .write_xmit = tcp_write_xmit,
8810 ++ .send_active_reset = tcp_send_active_reset,
8811 ++ .write_wakeup = tcp_write_wakeup,
8812 ++ .prune_ofo_queue = tcp_prune_ofo_queue,
8813 ++ .retransmit_timer = tcp_retransmit_timer,
8814 ++ .time_wait = tcp_time_wait,
8815 ++ .cleanup_rbuf = tcp_cleanup_rbuf,
8816 ++};
8817 ++
8818 ++static int mptcp_alloc_mpcb(struct sock *meta_sk, __u64 remote_key, u32 window)
8819 ++{
8820 ++ struct mptcp_cb *mpcb;
8821 ++ struct sock *master_sk;
8822 ++ struct inet_connection_sock *master_icsk, *meta_icsk = inet_csk(meta_sk);
8823 ++ struct tcp_sock *master_tp, *meta_tp = tcp_sk(meta_sk);
8824 ++ u64 idsn;
8825 ++
8826 ++ dst_release(meta_sk->sk_rx_dst);
8827 ++ meta_sk->sk_rx_dst = NULL;
8828 ++	/* This flag is set to tell sock_lock_init to
8829 ++ * reclassify the lock-class of the master socket.
8830 ++ */
8831 ++ meta_tp->is_master_sk = 1;
8832 ++ master_sk = sk_clone_lock(meta_sk, GFP_ATOMIC | __GFP_ZERO);
8833 ++ meta_tp->is_master_sk = 0;
8834 ++ if (!master_sk)
8835 ++ return -ENOBUFS;
8836 ++
8837 ++ master_tp = tcp_sk(master_sk);
8838 ++ master_icsk = inet_csk(master_sk);
8839 ++
8840 ++ mpcb = kmem_cache_zalloc(mptcp_cb_cache, GFP_ATOMIC);
8841 ++ if (!mpcb) {
8842 ++		/* sk_free (and __sk_free) requires wmem_alloc to be 1.
8843 ++ * All the rest is set to 0 thanks to __GFP_ZERO above.
8844 ++ */
8845 ++ atomic_set(&master_sk->sk_wmem_alloc, 1);
8846 ++ sk_free(master_sk);
8847 ++ return -ENOBUFS;
8848 ++ }
8849 ++
8850 ++#if IS_ENABLED(CONFIG_IPV6)
8851 ++ if (meta_icsk->icsk_af_ops == &mptcp_v6_mapped) {
8852 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8853 ++
8854 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8855 ++
8856 ++ newnp = inet6_sk(master_sk);
8857 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8858 ++
8859 ++ newnp->ipv6_mc_list = NULL;
8860 ++ newnp->ipv6_ac_list = NULL;
8861 ++ newnp->ipv6_fl_list = NULL;
8862 ++ newnp->opt = NULL;
8863 ++ newnp->pktoptions = NULL;
8864 ++ (void)xchg(&newnp->rxpmtu, NULL);
8865 ++ } else if (meta_sk->sk_family == AF_INET6) {
8866 ++ struct ipv6_pinfo *newnp, *np = inet6_sk(meta_sk);
8867 ++
8868 ++ inet_sk(master_sk)->pinet6 = &((struct tcp6_sock *)master_sk)->inet6;
8869 ++
8870 ++ newnp = inet6_sk(master_sk);
8871 ++ memcpy(newnp, np, sizeof(struct ipv6_pinfo));
8872 ++
8873 ++ newnp->hop_limit = -1;
8874 ++ newnp->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
8875 ++ newnp->mc_loop = 1;
8876 ++ newnp->pmtudisc = IPV6_PMTUDISC_WANT;
8877 ++ newnp->ipv6only = sock_net(master_sk)->ipv6.sysctl.bindv6only;
8878 ++ }
8879 ++#endif
8880 ++
8881 ++ meta_tp->mptcp = NULL;
8882 ++
8883 ++ /* Store the keys and generate the peer's token */
8884 ++ mpcb->mptcp_loc_key = meta_tp->mptcp_loc_key;
8885 ++ mpcb->mptcp_loc_token = meta_tp->mptcp_loc_token;
8886 ++
8887 ++ /* Generate Initial data-sequence-numbers */
8888 ++ mptcp_key_sha1(mpcb->mptcp_loc_key, NULL, &idsn);
8889 ++ idsn = ntohll(idsn) + 1;
8890 ++ mpcb->snd_high_order[0] = idsn >> 32;
8891 ++ mpcb->snd_high_order[1] = mpcb->snd_high_order[0] - 1;
8892 ++
8893 ++ meta_tp->write_seq = (u32)idsn;
8894 ++ meta_tp->snd_sml = meta_tp->write_seq;
8895 ++ meta_tp->snd_una = meta_tp->write_seq;
8896 ++ meta_tp->snd_nxt = meta_tp->write_seq;
8897 ++ meta_tp->pushed_seq = meta_tp->write_seq;
8898 ++ meta_tp->snd_up = meta_tp->write_seq;
8899 ++
8900 ++ mpcb->mptcp_rem_key = remote_key;
8901 ++ mptcp_key_sha1(mpcb->mptcp_rem_key, &mpcb->mptcp_rem_token, &idsn);
8902 ++ idsn = ntohll(idsn) + 1;
8903 ++ mpcb->rcv_high_order[0] = idsn >> 32;
8904 ++ mpcb->rcv_high_order[1] = mpcb->rcv_high_order[0] + 1;
8905 ++ meta_tp->copied_seq = (u32) idsn;
8906 ++ meta_tp->rcv_nxt = (u32) idsn;
8907 ++ meta_tp->rcv_wup = (u32) idsn;
8908 ++
8909 ++ meta_tp->snd_wl1 = meta_tp->rcv_nxt - 1;
8910 ++ meta_tp->snd_wnd = window;
8911 ++ meta_tp->retrans_stamp = 0; /* Set in tcp_connect() */
8912 ++
8913 ++ meta_tp->packets_out = 0;
8914 ++ meta_icsk->icsk_probes_out = 0;
8915 ++
8916 ++ /* Set mptcp-pointers */
8917 ++ master_tp->mpcb = mpcb;
8918 ++ master_tp->meta_sk = meta_sk;
8919 ++ meta_tp->mpcb = mpcb;
8920 ++ meta_tp->meta_sk = meta_sk;
8921 ++ mpcb->meta_sk = meta_sk;
8922 ++ mpcb->master_sk = master_sk;
8923 ++
8924 ++ meta_tp->was_meta_sk = 0;
8925 ++
8926 ++ /* Initialize the queues */
8927 ++ skb_queue_head_init(&mpcb->reinject_queue);
8928 ++ skb_queue_head_init(&master_tp->out_of_order_queue);
8929 ++ tcp_prequeue_init(master_tp);
8930 ++ INIT_LIST_HEAD(&master_tp->tsq_node);
8931 ++
8932 ++ master_tp->tsq_flags = 0;
8933 ++
8934 ++ mutex_init(&mpcb->mpcb_mutex);
8935 ++
8936 ++	/* Init the accept_queue structure. We support a queue of 32 pending
8937 ++	 * connections; it does not need to be huge, since we only store
8938 ++	 * pending subflow creations here.
8939 ++ */
8940 ++ if (reqsk_queue_alloc(&meta_icsk->icsk_accept_queue, 32, GFP_ATOMIC)) {
8941 ++ inet_put_port(master_sk);
8942 ++ kmem_cache_free(mptcp_cb_cache, mpcb);
8943 ++ sk_free(master_sk);
8944 ++ return -ENOMEM;
8945 ++ }
8946 ++
8947 ++ /* Redefine function-pointers as the meta-sk is now fully ready */
8948 ++ static_key_slow_inc(&mptcp_static_key);
8949 ++ meta_tp->mpc = 1;
8950 ++ meta_tp->ops = &mptcp_meta_specific;
8951 ++
8952 ++ meta_sk->sk_backlog_rcv = mptcp_backlog_rcv;
8953 ++ meta_sk->sk_destruct = mptcp_sock_destruct;
8954 ++
8955 ++ /* Meta-level retransmit timer */
8956 ++ meta_icsk->icsk_rto *= 2; /* Double of initial - rto */
8957 ++
8958 ++ tcp_init_xmit_timers(master_sk);
8959 ++ /* Has been set for sending out the SYN */
8960 ++ inet_csk_clear_xmit_timer(meta_sk, ICSK_TIME_RETRANS);
8961 ++
8962 ++ if (!meta_tp->inside_tk_table) {
8963 ++ /* Adding the meta_tp in the token hashtable - coming from server-side */
8964 ++ rcu_read_lock();
8965 ++ spin_lock(&mptcp_tk_hashlock);
8966 ++
8967 ++ __mptcp_hash_insert(meta_tp, mpcb->mptcp_loc_token);
8968 ++
8969 ++ spin_unlock(&mptcp_tk_hashlock);
8970 ++ rcu_read_unlock();
8971 ++ }
8972 ++ master_tp->inside_tk_table = 0;
8973 ++
8974 ++ /* Init time-wait stuff */
8975 ++ INIT_LIST_HEAD(&mpcb->tw_list);
8976 ++ spin_lock_init(&mpcb->tw_lock);
8977 ++
8978 ++ INIT_HLIST_HEAD(&mpcb->callback_list);
8979 ++
8980 ++ mptcp_mpcb_inherit_sockopts(meta_sk, master_sk);
8981 ++
8982 ++ mpcb->orig_sk_rcvbuf = meta_sk->sk_rcvbuf;
8983 ++ mpcb->orig_sk_sndbuf = meta_sk->sk_sndbuf;
8984 ++ mpcb->orig_window_clamp = meta_tp->window_clamp;
8985 ++
8986 ++ /* The meta is directly linked - set refcnt to 1 */
8987 ++ atomic_set(&mpcb->mpcb_refcnt, 1);
8988 ++
8989 ++ mptcp_init_path_manager(mpcb);
8990 ++ mptcp_init_scheduler(mpcb);
8991 ++
8992 ++ setup_timer(&mpcb->synack_timer, mptcp_synack_timer_handler,
8993 ++ (unsigned long)meta_sk);
8994 ++
8995 ++ mptcp_debug("%s: created mpcb with token %#x\n",
8996 ++ __func__, mpcb->mptcp_loc_token);
8997 ++
8998 ++ return 0;
8999 ++}
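
The IDSN handling above (SHA-1 of the key, incremented, then split) seeds the meta
sequence numbers: the low 32 bits become write_seq/rcv_nxt and the high 32 bits are
stored in snd_high_order[]/rcv_high_order[], so that full 64-bit data sequence
numbers can be rebuilt from the 32-bit values used on the wire. A small sketch of
the split; the IDSN value is made up:

/* Sketch of the IDSN split in mptcp_alloc_mpcb() above.  The 64-bit IDSN
 * is SHA-1-derived in the patch; here it is hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t idsn = 0x1122334455667788ULL + 1;
	uint32_t high = (uint32_t)(idsn >> 32);	/* snd_high_order[0] */
	uint32_t seq  = (uint32_t)idsn;		/* initial write_seq */

	printf("high order = %#x, initial seq = %#x\n", high, seq);
	printf("full DSN rebuilt = %#llx\n",
	       ((unsigned long long)high << 32) | seq);
	return 0;
}
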
9000 ++
9001 ++void mptcp_fallback_meta_sk(struct sock *meta_sk)
9002 ++{
9003 ++ kfree(inet_csk(meta_sk)->icsk_accept_queue.listen_opt);
9004 ++ kmem_cache_free(mptcp_cb_cache, tcp_sk(meta_sk)->mpcb);
9005 ++}
9006 ++
9007 ++int mptcp_add_sock(struct sock *meta_sk, struct sock *sk, u8 loc_id, u8 rem_id,
9008 ++ gfp_t flags)
9009 ++{
9010 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9011 ++ struct tcp_sock *tp = tcp_sk(sk);
9012 ++
9013 ++ tp->mptcp = kmem_cache_zalloc(mptcp_sock_cache, flags);
9014 ++ if (!tp->mptcp)
9015 ++ return -ENOMEM;
9016 ++
9017 ++ tp->mptcp->path_index = mptcp_set_new_pathindex(mpcb);
9018 ++ /* No more space for more subflows? */
9019 ++ if (!tp->mptcp->path_index) {
9020 ++ kmem_cache_free(mptcp_sock_cache, tp->mptcp);
9021 ++ return -EPERM;
9022 ++ }
9023 ++
9024 ++ INIT_HLIST_NODE(&tp->mptcp->cb_list);
9025 ++
9026 ++ tp->mptcp->tp = tp;
9027 ++ tp->mpcb = mpcb;
9028 ++ tp->meta_sk = meta_sk;
9029 ++
9030 ++ static_key_slow_inc(&mptcp_static_key);
9031 ++ tp->mpc = 1;
9032 ++ tp->ops = &mptcp_sub_specific;
9033 ++
9034 ++ tp->mptcp->loc_id = loc_id;
9035 ++ tp->mptcp->rem_id = rem_id;
9036 ++ if (mpcb->sched_ops->init)
9037 ++ mpcb->sched_ops->init(sk);
9038 ++
9039 ++ /* The corresponding sock_put is in mptcp_sock_destruct(). It cannot be
9040 ++ * included in mptcp_del_sock(), because the mpcb must remain alive
9041 ++ * until the last subsocket is completely destroyed.
9042 ++ */
9043 ++ sock_hold(meta_sk);
9044 ++ atomic_inc(&mpcb->mpcb_refcnt);
9045 ++
9046 ++ tp->mptcp->next = mpcb->connection_list;
9047 ++ mpcb->connection_list = tp;
9048 ++ tp->mptcp->attached = 1;
9049 ++
9050 ++ mpcb->cnt_subflows++;
9051 ++ atomic_add(atomic_read(&((struct sock *)tp)->sk_rmem_alloc),
9052 ++ &meta_sk->sk_rmem_alloc);
9053 ++
9054 ++ mptcp_sub_inherit_sockopts(meta_sk, sk);
9055 ++ INIT_DELAYED_WORK(&tp->mptcp->work, mptcp_sub_close_wq);
9056 ++
9057 ++ /* As we successfully allocated the mptcp_tcp_sock, we have to
9058 ++ * change the function-pointers here (for sk_destruct to work correctly)
9059 ++ */
9060 ++ sk->sk_error_report = mptcp_sock_def_error_report;
9061 ++ sk->sk_data_ready = mptcp_data_ready;
9062 ++ sk->sk_write_space = mptcp_write_space;
9063 ++ sk->sk_state_change = mptcp_set_state;
9064 ++ sk->sk_destruct = mptcp_sock_destruct;
9065 ++
9066 ++ if (sk->sk_family == AF_INET)
9067 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI4:%d dst_addr:%pI4:%d, cnt_subflows now %d\n",
9068 ++ __func__ , mpcb->mptcp_loc_token,
9069 ++ tp->mptcp->path_index,
9070 ++ &((struct inet_sock *)tp)->inet_saddr,
9071 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9072 ++ &((struct inet_sock *)tp)->inet_daddr,
9073 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9074 ++ mpcb->cnt_subflows);
9075 ++#if IS_ENABLED(CONFIG_IPV6)
9076 ++ else
9077 ++ mptcp_debug("%s: token %#x pi %d, src_addr:%pI6:%d dst_addr:%pI6:%d, cnt_subflows now %d\n",
9078 ++ __func__ , mpcb->mptcp_loc_token,
9079 ++ tp->mptcp->path_index, &inet6_sk(sk)->saddr,
9080 ++ ntohs(((struct inet_sock *)tp)->inet_sport),
9081 ++ &sk->sk_v6_daddr,
9082 ++ ntohs(((struct inet_sock *)tp)->inet_dport),
9083 ++ mpcb->cnt_subflows);
9084 ++#endif
9085 ++
9086 ++ return 0;
9087 ++}
9088 ++
9089 ++void mptcp_del_sock(struct sock *sk)
9090 ++{
9091 ++ struct tcp_sock *tp = tcp_sk(sk), *tp_prev;
9092 ++ struct mptcp_cb *mpcb;
9093 ++
9094 ++ if (!tp->mptcp || !tp->mptcp->attached)
9095 ++ return;
9096 ++
9097 ++ mpcb = tp->mpcb;
9098 ++ tp_prev = mpcb->connection_list;
9099 ++
9100 ++ mptcp_debug("%s: Removing subsock tok %#x pi:%d state %d is_meta? %d\n",
9101 ++ __func__, mpcb->mptcp_loc_token, tp->mptcp->path_index,
9102 ++ sk->sk_state, is_meta_sk(sk));
9103 ++
9104 ++ if (tp_prev == tp) {
9105 ++ mpcb->connection_list = tp->mptcp->next;
9106 ++ } else {
9107 ++ for (; tp_prev && tp_prev->mptcp->next; tp_prev = tp_prev->mptcp->next) {
9108 ++ if (tp_prev->mptcp->next == tp) {
9109 ++ tp_prev->mptcp->next = tp->mptcp->next;
9110 ++ break;
9111 ++ }
9112 ++ }
9113 ++ }
9114 ++ mpcb->cnt_subflows--;
9115 ++ if (tp->mptcp->establish_increased)
9116 ++ mpcb->cnt_established--;
9117 ++
9118 ++ tp->mptcp->next = NULL;
9119 ++ tp->mptcp->attached = 0;
9120 ++ mpcb->path_index_bits &= ~(1 << tp->mptcp->path_index);
9121 ++
9122 ++ if (!skb_queue_empty(&sk->sk_write_queue))
9123 ++ mptcp_reinject_data(sk, 0);
9124 ++
9125 ++ if (is_master_tp(tp))
9126 ++ mpcb->master_sk = NULL;
9127 ++ else if (tp->mptcp->pre_established)
9128 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
9129 ++
9130 ++ rcu_assign_pointer(inet_sk(sk)->inet_opt, NULL);
9131 ++}
9132 ++
9133 ++/* Updates the metasocket ULID/port data, based on the given sock.
9134 ++ * The argument sock must be the sock accessible to the application.
9135 ++ * In this function, we update the meta socket info, based on the changes
9136 ++ * in the application socket (bind, address allocation, ...)
9137 ++ */
9138 ++void mptcp_update_metasocket(struct sock *sk, const struct sock *meta_sk)
9139 ++{
9140 ++ if (tcp_sk(sk)->mpcb->pm_ops->new_session)
9141 ++ tcp_sk(sk)->mpcb->pm_ops->new_session(meta_sk);
9142 ++
9143 ++ tcp_sk(sk)->mptcp->send_mp_prio = tcp_sk(sk)->mptcp->low_prio;
9144 ++}
9145 ++
9146 ++/* Clean up the receive buffer for full frames taken by the user,
9147 ++ * then send an ACK if necessary. COPIED is the number of bytes
9148 ++ * tcp_recvmsg has given to the user so far, it speeds up the
9149 ++ * calculation of whether or not we must ACK for the sake of
9150 ++ * a window update.
9151 ++ */
9152 ++void mptcp_cleanup_rbuf(struct sock *meta_sk, int copied)
9153 ++{
9154 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9155 ++ struct sock *sk;
9156 ++ __u32 rcv_window_now = 0;
9157 ++
9158 ++ if (copied > 0 && !(meta_sk->sk_shutdown & RCV_SHUTDOWN)) {
9159 ++ rcv_window_now = tcp_receive_window(meta_tp);
9160 ++
9161 ++ if (2 * rcv_window_now > meta_tp->window_clamp)
9162 ++ rcv_window_now = 0;
9163 ++ }
9164 ++
9165 ++ mptcp_for_each_sk(meta_tp->mpcb, sk) {
9166 ++ struct tcp_sock *tp = tcp_sk(sk);
9167 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
9168 ++
9169 ++ if (!mptcp_sk_can_send_ack(sk))
9170 ++ continue;
9171 ++
9172 ++ if (!inet_csk_ack_scheduled(sk))
9173 ++ goto second_part;
9174 ++ /* Delayed ACKs frequently hit locked sockets during bulk
9175 ++ * receive.
9176 ++ */
9177 ++ if (icsk->icsk_ack.blocked ||
9178 ++ /* Once-per-two-segments ACK was not sent by tcp_input.c */
9179 ++ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
9180 ++ /* If this read emptied read buffer, we send ACK, if
9181 ++ * connection is not bidirectional, user drained
9182 ++ * receive buffer and there was a small segment
9183 ++ * in queue.
9184 ++ */
9185 ++ (copied > 0 &&
9186 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
9187 ++ ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
9188 ++ !icsk->icsk_ack.pingpong)) &&
9189 ++ !atomic_read(&meta_sk->sk_rmem_alloc))) {
9190 ++ tcp_send_ack(sk);
9191 ++ continue;
9192 ++ }
9193 ++
9194 ++second_part:
9195 ++ /* This here is the second part of tcp_cleanup_rbuf */
9196 ++ if (rcv_window_now) {
9197 ++ __u32 new_window = tp->ops->__select_window(sk);
9198 ++
9199 ++ /* Send ACK now, if this read freed lots of space
9200 ++ * in our buffer. Certainly, new_window is new window.
9201 ++ * We can advertise it now, if it is not less than
9202 ++ * current one.
9203 ++ * "Lots" means "at least twice" here.
9204 ++ */
9205 ++ if (new_window && new_window >= 2 * rcv_window_now)
9206 ++ tcp_send_ack(sk);
9207 ++ }
9208 ++ }
9209 ++}
9210 ++
9211 ++static int mptcp_sub_send_fin(struct sock *sk)
9212 ++{
9213 ++ struct tcp_sock *tp = tcp_sk(sk);
9214 ++ struct sk_buff *skb = tcp_write_queue_tail(sk);
9215 ++ int mss_now;
9216 ++
9217 ++ /* Optimization, tack on the FIN if we have a queue of
9218 ++ * unsent frames. But be careful about outgoing SACKS
9219 ++ * and IP options.
9220 ++ */
9221 ++ mss_now = tcp_current_mss(sk);
9222 ++
9223 ++ if (tcp_send_head(sk) != NULL) {
9224 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
9225 ++ TCP_SKB_CB(skb)->end_seq++;
9226 ++ tp->write_seq++;
9227 ++ } else {
9228 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_ATOMIC);
9229 ++ if (!skb)
9230 ++ return 1;
9231 ++
9232 ++ /* Reserve space for headers and prepare control bits. */
9233 ++ skb_reserve(skb, MAX_TCP_HEADER);
9234 ++ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
9235 ++ tcp_init_nondata_skb(skb, tp->write_seq,
9236 ++ TCPHDR_ACK | TCPHDR_FIN);
9237 ++ tcp_queue_skb(sk, skb);
9238 ++ }
9239 ++ __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
9240 ++
9241 ++ return 0;
9242 ++}
9243 ++
9244 ++void mptcp_sub_close_wq(struct work_struct *work)
9245 ++{
9246 ++ struct tcp_sock *tp = container_of(work, struct mptcp_tcp_sock, work.work)->tp;
9247 ++ struct sock *sk = (struct sock *)tp;
9248 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
9249 ++
9250 ++ mutex_lock(&tp->mpcb->mpcb_mutex);
9251 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
9252 ++
9253 ++ if (sock_flag(sk, SOCK_DEAD))
9254 ++ goto exit;
9255 ++
9256 ++ /* We come from tcp_disconnect. We are sure that meta_sk is set */
9257 ++ if (!mptcp(tp)) {
9258 ++ tp->closing = 1;
9259 ++ sock_rps_reset_flow(sk);
9260 ++ tcp_close(sk, 0);
9261 ++ goto exit;
9262 ++ }
9263 ++
9264 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) {
9265 ++ tp->closing = 1;
9266 ++ sock_rps_reset_flow(sk);
9267 ++ tcp_close(sk, 0);
9268 ++ } else if (tcp_close_state(sk)) {
9269 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9270 ++ tcp_send_fin(sk);
9271 ++ }
9272 ++
9273 ++exit:
9274 ++ release_sock(meta_sk);
9275 ++ mutex_unlock(&tp->mpcb->mpcb_mutex);
9276 ++ sock_put(sk);
9277 ++}
9278 ++
9279 ++void mptcp_sub_close(struct sock *sk, unsigned long delay)
9280 ++{
9281 ++ struct tcp_sock *tp = tcp_sk(sk);
9282 ++ struct delayed_work *work = &tcp_sk(sk)->mptcp->work;
9283 ++
9284 ++ /* We are already closing - e.g., call from sock_def_error_report upon
9285 ++ * tcp_disconnect in tcp_close.
9286 ++ */
9287 ++ if (tp->closing)
9288 ++ return;
9289 ++
9290 ++ /* Work already scheduled ? */
9291 ++ if (work_pending(&work->work)) {
9292 ++ /* Work present - who will be first ? */
9293 ++ if (jiffies + delay > work->timer.expires)
9294 ++ return;
9295 ++
9296 ++ /* Try canceling - if it fails, work will be executed soon */
9297 ++ if (!cancel_delayed_work(work))
9298 ++ return;
9299 ++ sock_put(sk);
9300 ++ }
9301 ++
9302 ++ if (!delay) {
9303 ++ unsigned char old_state = sk->sk_state;
9304 ++
9305 ++ /* If we are in user-context we can directly do the closing
9306 ++ * procedure. No need to schedule a work-queue.
9307 ++ */
9308 ++ if (!in_softirq()) {
9309 ++ if (sock_flag(sk, SOCK_DEAD))
9310 ++ return;
9311 ++
9312 ++ if (!mptcp(tp)) {
9313 ++ tp->closing = 1;
9314 ++ sock_rps_reset_flow(sk);
9315 ++ tcp_close(sk, 0);
9316 ++ return;
9317 ++ }
9318 ++
9319 ++ if (mptcp_meta_sk(sk)->sk_shutdown == SHUTDOWN_MASK ||
9320 ++ sk->sk_state == TCP_CLOSE) {
9321 ++ tp->closing = 1;
9322 ++ sock_rps_reset_flow(sk);
9323 ++ tcp_close(sk, 0);
9324 ++ } else if (tcp_close_state(sk)) {
9325 ++ sk->sk_shutdown |= SEND_SHUTDOWN;
9326 ++ tcp_send_fin(sk);
9327 ++ }
9328 ++
9329 ++ return;
9330 ++ }
9331 ++
9332 ++		/* We send the FIN directly, because it may take a long time
9333 ++		 * until the work-queue gets scheduled...
9334 ++ *
9335 ++ * If mptcp_sub_send_fin returns 1, it failed and thus we reset
9336 ++ * the old state so that tcp_close will finally send the fin
9337 ++ * in user-context.
9338 ++ */
9339 ++ if (!sk->sk_err && old_state != TCP_CLOSE &&
9340 ++ tcp_close_state(sk) && mptcp_sub_send_fin(sk)) {
9341 ++ if (old_state == TCP_ESTABLISHED)
9342 ++ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
9343 ++ sk->sk_state = old_state;
9344 ++ }
9345 ++ }
9346 ++
9347 ++ sock_hold(sk);
9348 ++ queue_delayed_work(mptcp_wq, work, delay);
9349 ++}
9350 ++
9351 ++void mptcp_sub_force_close(struct sock *sk)
9352 ++{
9353 ++	/* The below tcp_done may have freed the socket, if it is already dead.
9354 ++ * Thus, we are not allowed to access it afterwards. That's why
9355 ++ * we have to store the dead-state in this local variable.
9356 ++ */
9357 ++ int sock_is_dead = sock_flag(sk, SOCK_DEAD);
9358 ++
9359 ++ tcp_sk(sk)->mp_killed = 1;
9360 ++
9361 ++ if (sk->sk_state != TCP_CLOSE)
9362 ++ tcp_done(sk);
9363 ++
9364 ++ if (!sock_is_dead)
9365 ++ mptcp_sub_close(sk, 0);
9366 ++}
9367 ++EXPORT_SYMBOL(mptcp_sub_force_close);
9368 ++
9369 ++/* Update the mpcb send window, based on the contributions
9370 ++ * of each subflow
9371 ++ */
9372 ++void mptcp_update_sndbuf(const struct tcp_sock *tp)
9373 ++{
9374 ++ struct sock *meta_sk = tp->meta_sk, *sk;
9375 ++ int new_sndbuf = 0, old_sndbuf = meta_sk->sk_sndbuf;
9376 ++
9377 ++ mptcp_for_each_sk(tp->mpcb, sk) {
9378 ++ if (!mptcp_sk_can_send(sk))
9379 ++ continue;
9380 ++
9381 ++ new_sndbuf += sk->sk_sndbuf;
9382 ++
9383 ++ if (new_sndbuf > sysctl_tcp_wmem[2] || new_sndbuf < 0) {
9384 ++ new_sndbuf = sysctl_tcp_wmem[2];
9385 ++ break;
9386 ++ }
9387 ++ }
9388 ++ meta_sk->sk_sndbuf = max(min(new_sndbuf, sysctl_tcp_wmem[2]), meta_sk->sk_sndbuf);
9389 ++
9390 ++ /* The subflow's call to sk_write_space in tcp_new_space ends up in
9391 ++ * mptcp_write_space.
9392 ++ * It has nothing to do with waking up the application.
9393 ++ * So, we do it here.
9394 ++ */
9395 ++ if (old_sndbuf != meta_sk->sk_sndbuf)
9396 ++ meta_sk->sk_write_space(meta_sk);
9397 ++}
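
The rule above sizes the meta send buffer as the sum of the sendable subflows'
sk_sndbuf, clamped to sysctl_tcp_wmem[2] and never shrunk below the current meta
value. A user-space sketch with hypothetical numbers:

/* Sketch of the meta send-buffer rule in mptcp_update_sndbuf(): sum the
 * (sendable) subflows' sndbufs, clamp to the tcp_wmem[2] maximum, never
 * shrink the meta value.  All numbers are hypothetical.
 */
#include <stdio.h>

static int meta_sndbuf(const int *sub, int n, int wmem_max, int cur)
{
	long sum = 0;

	for (int i = 0; i < n; i++) {
		sum += sub[i];
		if (sum > wmem_max) {		/* clamp, also vs. overflow */
			sum = wmem_max;
			break;
		}
	}
	return (int)sum > cur ? (int)sum : cur;	/* max(min(sum, max), cur) */
}

int main(void)
{
	int subs[2] = { 87380, 131072 };

	printf("meta sndbuf = %d\n", meta_sndbuf(subs, 2, 4194304, 16384));
	return 0;
}
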
9398 ++
9399 ++void mptcp_close(struct sock *meta_sk, long timeout)
9400 ++{
9401 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
9402 ++ struct sock *sk_it, *tmpsk;
9403 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
9404 ++ struct sk_buff *skb;
9405 ++ int data_was_unread = 0;
9406 ++ int state;
9407 ++
9408 ++ mptcp_debug("%s: Close of meta_sk with tok %#x\n",
9409 ++ __func__, mpcb->mptcp_loc_token);
9410 ++
9411 ++ mutex_lock(&mpcb->mpcb_mutex);
9412 ++ lock_sock(meta_sk);
9413 ++
9414 ++ if (meta_tp->inside_tk_table) {
9415 ++ /* Detach the mpcb from the token hashtable */
9416 ++ mptcp_hash_remove_bh(meta_tp);
9417 ++ reqsk_queue_destroy(&inet_csk(meta_sk)->icsk_accept_queue);
9418 ++ }
9419 ++
9420 ++ meta_sk->sk_shutdown = SHUTDOWN_MASK;
9421 ++ /* We need to flush the recv. buffs. We do this only on the
9422 ++ * descriptor close, not protocol-sourced closes, because the
9423 ++ * reader process may not have drained the data yet!
9424 ++ */
9425 ++ while ((skb = __skb_dequeue(&meta_sk->sk_receive_queue)) != NULL) {
9426 ++ u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
9427 ++ tcp_hdr(skb)->fin;
9428 ++ data_was_unread += len;
9429 ++ __kfree_skb(skb);
9430 ++ }
9431 ++
9432 ++ sk_mem_reclaim(meta_sk);
9433 ++
9434 ++ /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
9435 ++ if (meta_sk->sk_state == TCP_CLOSE) {
9436 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9437 ++ if (tcp_sk(sk_it)->send_mp_fclose)
9438 ++ continue;
9439 ++ mptcp_sub_close(sk_it, 0);
9440 ++ }
9441 ++ goto adjudge_to_death;
9442 ++ }
9443 ++
9444 ++ if (data_was_unread) {
9445 ++ /* Unread data was tossed, zap the connection. */
9446 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONCLOSE);
9447 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9448 ++ tcp_sk(meta_sk)->ops->send_active_reset(meta_sk,
9449 ++ meta_sk->sk_allocation);
9450 ++ } else if (sock_flag(meta_sk, SOCK_LINGER) && !meta_sk->sk_lingertime) {
9451 ++ /* Check zero linger _after_ checking for unread data. */
9452 ++ meta_sk->sk_prot->disconnect(meta_sk, 0);
9453 ++ NET_INC_STATS_USER(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
9454 ++ } else if (tcp_close_state(meta_sk)) {
9455 ++ mptcp_send_fin(meta_sk);
9456 ++ } else if (meta_tp->snd_una == meta_tp->write_seq) {
9457 ++ /* The DATA_FIN has been sent and acknowledged
9458 ++ * (e.g., by sk_shutdown). Close all the other subflows
9459 ++ */
9460 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
9461 ++ unsigned long delay = 0;
9462 ++ /* If we are the passive closer, don't trigger
9463 ++ * subflow-fin until the subflow has been finned
9464 ++ * by the peer. - thus we add a delay
9465 ++ */
9466 ++ if (mpcb->passive_close &&
9467 ++ sk_it->sk_state == TCP_ESTABLISHED)
9468 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
9469 ++
9470 ++ mptcp_sub_close(sk_it, delay);
9471 ++ }
9472 ++ }
9473 ++
9474 ++ sk_stream_wait_close(meta_sk, timeout);
9475 ++
9476 ++adjudge_to_death:
9477 ++ state = meta_sk->sk_state;
9478 ++ sock_hold(meta_sk);
9479 ++ sock_orphan(meta_sk);
9480 ++
9481 ++ /* socket will be freed after mptcp_close - we have to prevent
9482 ++ * access from the subflows.
9483 ++ */
9484 ++ mptcp_for_each_sk(mpcb, sk_it) {
9485 ++ /* Similar to sock_orphan, but we don't set it DEAD, because
9486 ++ * the callbacks are still set and must be called.
9487 ++ */
9488 ++ write_lock_bh(&sk_it->sk_callback_lock);
9489 ++ sk_set_socket(sk_it, NULL);
9490 ++ sk_it->sk_wq = NULL;
9491 ++ write_unlock_bh(&sk_it->sk_callback_lock);
9492 ++ }
9493 ++
9494 ++ /* It is the last release_sock in its life. It will remove backlog. */
9495 ++ release_sock(meta_sk);
9496 ++
9497 ++ /* Now socket is owned by kernel and we acquire BH lock
9498 ++ * to finish close. No need to check for user refs.
9499 ++ */
9500 ++ local_bh_disable();
9501 ++ bh_lock_sock(meta_sk);
9502 ++ WARN_ON(sock_owned_by_user(meta_sk));
9503 ++
9504 ++ percpu_counter_inc(meta_sk->sk_prot->orphan_count);
9505 ++
9506 ++ /* Have we already been destroyed by a softirq or backlog? */
9507 ++ if (state != TCP_CLOSE && meta_sk->sk_state == TCP_CLOSE)
9508 ++ goto out;
9509 ++
9510 ++	/* This is a (useful) BSD violation of the RFC. There is a
9511 ++ * problem with TCP as specified in that the other end could
9512 ++ * keep a socket open forever with no application left this end.
9513 ++ * We use a 3 minute timeout (about the same as BSD) then kill
9514 ++ * our end. If they send after that then tough - BUT: long enough
9515 ++ * that we won't make the old 4*rto = almost no time - whoops
9516 ++ * reset mistake.
9517 ++ *
9518 ++ * Nope, it was not mistake. It is really desired behaviour
9519 ++ * f.e. on http servers, when such sockets are useless, but
9520 ++ * consume significant resources. Let's do it with special
9521 ++ * linger2 option. --ANK
9522 ++ */
9523 ++
9524 ++ if (meta_sk->sk_state == TCP_FIN_WAIT2) {
9525 ++ if (meta_tp->linger2 < 0) {
9526 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9527 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9528 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9529 ++ LINUX_MIB_TCPABORTONLINGER);
9530 ++ } else {
9531 ++ const int tmo = tcp_fin_time(meta_sk);
9532 ++
9533 ++ if (tmo > TCP_TIMEWAIT_LEN) {
9534 ++ inet_csk_reset_keepalive_timer(meta_sk,
9535 ++ tmo - TCP_TIMEWAIT_LEN);
9536 ++ } else {
9537 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2,
9538 ++ tmo);
9539 ++ goto out;
9540 ++ }
9541 ++ }
9542 ++ }
9543 ++ if (meta_sk->sk_state != TCP_CLOSE) {
9544 ++ sk_mem_reclaim(meta_sk);
9545 ++ if (tcp_too_many_orphans(meta_sk, 0)) {
9546 ++ if (net_ratelimit())
9547 ++				pr_info("MPTCP: too many orphaned sockets\n");
9548 ++ tcp_set_state(meta_sk, TCP_CLOSE);
9549 ++ meta_tp->ops->send_active_reset(meta_sk, GFP_ATOMIC);
9550 ++ NET_INC_STATS_BH(sock_net(meta_sk),
9551 ++ LINUX_MIB_TCPABORTONMEMORY);
9552 ++ }
9553 ++ }
9554 ++
9555 ++
9556 ++ if (meta_sk->sk_state == TCP_CLOSE)
9557 ++ inet_csk_destroy_sock(meta_sk);
9558 ++ /* Otherwise, socket is reprieved until protocol close. */
9559 ++
9560 ++out:
9561 ++ bh_unlock_sock(meta_sk);
9562 ++ local_bh_enable();
9563 ++ mutex_unlock(&mpcb->mpcb_mutex);
9564 ++ sock_put(meta_sk); /* Taken by sock_hold */
9565 ++}
9566 ++
9567 ++void mptcp_disconnect(struct sock *sk)
9568 ++{
9569 ++ struct sock *subsk, *tmpsk;
9570 ++ struct tcp_sock *tp = tcp_sk(sk);
9571 ++
9572 ++ mptcp_delete_synack_timer(sk);
9573 ++
9574 ++ __skb_queue_purge(&tp->mpcb->reinject_queue);
9575 ++
9576 ++ if (tp->inside_tk_table) {
9577 ++ mptcp_hash_remove_bh(tp);
9578 ++ reqsk_queue_destroy(&inet_csk(tp->meta_sk)->icsk_accept_queue);
9579 ++ }
9580 ++
9581 ++ local_bh_disable();
9582 ++ mptcp_for_each_sk_safe(tp->mpcb, subsk, tmpsk) {
9583 ++ /* The socket will get removed from the subsocket-list
9584 ++ * and made non-mptcp by setting mpc to 0.
9585 ++ *
9586 ++ * This is necessary, because tcp_disconnect assumes
9587 ++ * that the connection is completely dead afterwards.
9588 ++ * Thus we need to do a mptcp_del_sock. Due to this call
9589 ++ * we have to make it non-mptcp.
9590 ++ *
9591 ++ * We have to lock the socket, because we set mpc to 0.
9592 ++ * An incoming packet would take the subsocket's lock
9593 ++ * and go on into the receive-path.
9594 ++ * This would be a race.
9595 ++ */
9596 ++
9597 ++ bh_lock_sock(subsk);
9598 ++ mptcp_del_sock(subsk);
9599 ++ tcp_sk(subsk)->mpc = 0;
9600 ++ tcp_sk(subsk)->ops = &tcp_specific;
9601 ++ mptcp_sub_force_close(subsk);
9602 ++ bh_unlock_sock(subsk);
9603 ++ }
9604 ++ local_bh_enable();
9605 ++
9606 ++ tp->was_meta_sk = 1;
9607 ++ tp->mpc = 0;
9608 ++ tp->ops = &tcp_specific;
9609 ++}
9610 ++
9611 ++
9612 ++/* Returns 1 if we should enable MPTCP for that socket. */
9613 ++int mptcp_doit(struct sock *sk)
9614 ++{
9615 ++ /* Do not allow MPTCP enabling if the MPTCP initialization failed */
9616 ++ if (mptcp_init_failed)
9617 ++ return 0;
9618 ++
9619 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tcp_sk(sk)->mptcp_enabled)
9620 ++ return 0;
9621 ++
9622 ++ /* Socket may already be established (e.g., called from tcp_recvmsg) */
9623 ++ if (mptcp(tcp_sk(sk)) || tcp_sk(sk)->request_mptcp)
9624 ++ return 1;
9625 ++
9626 ++ /* Don't do mptcp over loopback */
9627 ++ if (sk->sk_family == AF_INET &&
9628 ++ (ipv4_is_loopback(inet_sk(sk)->inet_daddr) ||
9629 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr)))
9630 ++ return 0;
9631 ++#if IS_ENABLED(CONFIG_IPV6)
9632 ++ if (sk->sk_family == AF_INET6 &&
9633 ++ (ipv6_addr_loopback(&sk->sk_v6_daddr) ||
9634 ++ ipv6_addr_loopback(&inet6_sk(sk)->saddr)))
9635 ++ return 0;
9636 ++#endif
9637 ++ if (mptcp_v6_is_v4_mapped(sk) &&
9638 ++ ipv4_is_loopback(inet_sk(sk)->inet_saddr))
9639 ++ return 0;
9640 ++
9641 ++#ifdef CONFIG_TCP_MD5SIG
9642 ++ /* If TCP_MD5SIG is enabled, do not do MPTCP - there is no Option-Space */
9643 ++ if (tcp_sk(sk)->af_specific->md5_lookup(sk, sk))
9644 ++ return 0;
9645 ++#endif
9646 ++
9647 ++ return 1;
9648 ++}
9649 ++
9650 ++int mptcp_create_master_sk(struct sock *meta_sk, __u64 remote_key, u32 window)
9651 ++{
9652 ++ struct tcp_sock *master_tp;
9653 ++ struct sock *master_sk;
9654 ++
9655 ++ if (mptcp_alloc_mpcb(meta_sk, remote_key, window))
9656 ++ goto err_alloc_mpcb;
9657 ++
9658 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9659 ++ master_tp = tcp_sk(master_sk);
9660 ++
9661 ++ if (mptcp_add_sock(meta_sk, master_sk, 0, 0, GFP_ATOMIC))
9662 ++ goto err_add_sock;
9663 ++
9664 ++ if (__inet_inherit_port(meta_sk, master_sk) < 0)
9665 ++ goto err_add_sock;
9666 ++
9667 ++ meta_sk->sk_prot->unhash(meta_sk);
9668 ++
9669 ++ if (master_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(master_sk))
9670 ++ __inet_hash_nolisten(master_sk, NULL);
9671 ++#if IS_ENABLED(CONFIG_IPV6)
9672 ++ else
9673 ++ __inet6_hash(master_sk, NULL);
9674 ++#endif
9675 ++
9676 ++ master_tp->mptcp->init_rcv_wnd = master_tp->rcv_wnd;
9677 ++
9678 ++ return 0;
9679 ++
9680 ++err_add_sock:
9681 ++ mptcp_fallback_meta_sk(meta_sk);
9682 ++
9683 ++ inet_csk_prepare_forced_close(master_sk);
9684 ++ tcp_done(master_sk);
9685 ++ inet_csk_prepare_forced_close(meta_sk);
9686 ++ tcp_done(meta_sk);
9687 ++
9688 ++err_alloc_mpcb:
9689 ++ return -ENOBUFS;
9690 ++}
9691 ++
9692 ++static int __mptcp_check_req_master(struct sock *child,
9693 ++ struct request_sock *req)
9694 ++{
9695 ++ struct tcp_sock *child_tp = tcp_sk(child);
9696 ++ struct sock *meta_sk = child;
9697 ++ struct mptcp_cb *mpcb;
9698 ++ struct mptcp_request_sock *mtreq;
9699 ++
9700 ++ /* Never contained an MP_CAPABLE */
9701 ++ if (!inet_rsk(req)->mptcp_rqsk)
9702 ++ return 1;
9703 ++
9704 ++ if (!inet_rsk(req)->saw_mpc) {
9705 ++ /* Fall back to regular TCP, because we saw a SYN without
9706 ++ * MP_CAPABLE. In tcp_check_req we continue the regular path.
9707 ++ * But, the socket has been added to the reqsk_tk_htb, so we
9708 ++ * must still remove it.
9709 ++ */
9710 ++ mptcp_reqsk_remove_tk(req);
9711 ++ return 1;
9712 ++ }
9713 ++
9714 ++ /* Just set these values to pass them to mptcp_alloc_mpcb */
9715 ++ mtreq = mptcp_rsk(req);
9716 ++ child_tp->mptcp_loc_key = mtreq->mptcp_loc_key;
9717 ++ child_tp->mptcp_loc_token = mtreq->mptcp_loc_token;
9718 ++
9719 ++ if (mptcp_create_master_sk(meta_sk, mtreq->mptcp_rem_key,
9720 ++ child_tp->snd_wnd))
9721 ++ return -ENOBUFS;
9722 ++
9723 ++ child = tcp_sk(child)->mpcb->master_sk;
9724 ++ child_tp = tcp_sk(child);
9725 ++ mpcb = child_tp->mpcb;
9726 ++
9727 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9728 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9729 ++
9730 ++ mpcb->dss_csum = mtreq->dss_csum;
9731 ++ mpcb->server_side = 1;
9732 ++
9733 ++ /* Will be moved to ESTABLISHED by tcp_rcv_state_process() */
9734 ++ mptcp_update_metasocket(child, meta_sk);
9735 ++
9736 ++ /* Needs to be done here additionally, because when accepting a
9737 ++ * new connection we pass by __reqsk_free and not reqsk_free.
9738 ++ */
9739 ++ mptcp_reqsk_remove_tk(req);
9740 ++
9741 ++ /* Hold when creating the meta-sk in tcp_vX_syn_recv_sock. */
9742 ++ sock_put(meta_sk);
9743 ++
9744 ++ return 0;
9745 ++}
9746 ++
9747 ++int mptcp_check_req_fastopen(struct sock *child, struct request_sock *req)
9748 ++{
9749 ++ struct sock *meta_sk = child, *master_sk;
9750 ++ struct sk_buff *skb;
9751 ++ u32 new_mapping;
9752 ++ int ret;
9753 ++
9754 ++ ret = __mptcp_check_req_master(child, req);
9755 ++ if (ret)
9756 ++ return ret;
9757 ++
9758 ++ master_sk = tcp_sk(meta_sk)->mpcb->master_sk;
9759 ++
9760 ++ /* We need to rewind copied_seq as it is set to IDSN + 1 and as we have
9761 ++ * pre-MPTCP data in the receive queue.
9762 ++ */
9763 ++ tcp_sk(meta_sk)->copied_seq -= tcp_sk(master_sk)->rcv_nxt -
9764 ++ tcp_rsk(req)->rcv_isn - 1;
9765 ++
9766 ++ /* Map subflow sequence numbers to data sequence numbers. We need to map
9767 ++ * these data to [IDSN - len - 1, IDSN).
9768 ++ */
9769 ++ new_mapping = tcp_sk(meta_sk)->copied_seq - tcp_rsk(req)->rcv_isn - 1;
9770 ++
9771 ++ /* There should be only one skb: the SYN + data. */
9772 ++ skb_queue_walk(&meta_sk->sk_receive_queue, skb) {
9773 ++ TCP_SKB_CB(skb)->seq += new_mapping;
9774 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
9775 ++ }
9776 ++
9777 ++ /* With fastopen we change the semantics of the relative subflow
9778 ++ * sequence numbers to deal with middleboxes that could add/remove
9779 ++ * multiple bytes in the SYN. We chose to start counting at rcv_nxt - 1
9780 ++ * instead of the regular TCP ISN.
9781 ++ */
9782 ++ tcp_sk(master_sk)->mptcp->rcv_isn = tcp_sk(master_sk)->rcv_nxt - 1;
9783 ++
9784 ++ /* We need to update copied_seq of the master_sk to account for the
9785 ++ * already moved data to the meta receive queue.
9786 ++ */
9787 ++ tcp_sk(master_sk)->copied_seq = tcp_sk(master_sk)->rcv_nxt;
9788 ++
9789 ++ /* Handled by the master_sk */
9790 ++ tcp_sk(meta_sk)->fastopen_rsk = NULL;
9791 ++
9792 ++ return 0;
9793 ++}
9794 ++
9795 ++int mptcp_check_req_master(struct sock *sk, struct sock *child,
9796 ++ struct request_sock *req,
9797 ++ struct request_sock **prev)
9798 ++{
9799 ++ struct sock *meta_sk = child;
9800 ++ int ret;
9801 ++
9802 ++ ret = __mptcp_check_req_master(child, req);
9803 ++ if (ret)
9804 ++ return ret;
9805 ++
9806 ++ inet_csk_reqsk_queue_unlink(sk, req, prev);
9807 ++ inet_csk_reqsk_queue_removed(sk, req);
9808 ++ inet_csk_reqsk_queue_add(sk, req, meta_sk);
9809 ++
9810 ++ return 0;
9811 ++}
9812 ++
9813 ++struct sock *mptcp_check_req_child(struct sock *meta_sk, struct sock *child,
9814 ++ struct request_sock *req,
9815 ++ struct request_sock **prev,
9816 ++ const struct mptcp_options_received *mopt)
9817 ++{
9818 ++ struct tcp_sock *child_tp = tcp_sk(child);
9819 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
9820 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
9821 ++ u8 hash_mac_check[20];
9822 ++
9823 ++ child_tp->inside_tk_table = 0;
9824 ++
9825 ++ if (!mopt->join_ack)
9826 ++ goto teardown;
9827 ++
9828 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
9829 ++ (u8 *)&mpcb->mptcp_loc_key,
9830 ++ (u8 *)&mtreq->mptcp_rem_nonce,
9831 ++ (u8 *)&mtreq->mptcp_loc_nonce,
9832 ++ (u32 *)hash_mac_check);
9833 ++
9834 ++ if (memcmp(hash_mac_check, (char *)&mopt->mptcp_recv_mac, 20))
9835 ++ goto teardown;
9836 ++
9837 ++ /* Point it to the same struct socket and wq as the meta_sk */
9838 ++ sk_set_socket(child, meta_sk->sk_socket);
9839 ++ child->sk_wq = meta_sk->sk_wq;
9840 ++
9841 ++ if (mptcp_add_sock(meta_sk, child, mtreq->loc_id, mtreq->rem_id, GFP_ATOMIC)) {
9842 ++ /* Has been inherited, but now child_tp->mptcp is NULL */
9843 ++ child_tp->mpc = 0;
9844 ++ child_tp->ops = &tcp_specific;
9845 ++
9846 ++ /* TODO when we support acking the third ack for new subflows,
9847 ++ * we should silently discard this third ack, by returning NULL.
9848 ++ *
9849 ++ * Maybe, at the retransmission we will have enough memory to
9850 ++ * fully add the socket to the meta-sk.
9851 ++ */
9852 ++ goto teardown;
9853 ++ }
9854 ++
9855 ++ /* The child is a clone of the meta socket; we must now reset
9856 ++ * some of the fields.
9857 ++ */
9858 ++ child_tp->mptcp->rcv_low_prio = mtreq->rcv_low_prio;
9859 ++
9860 ++ /* We should allow proper increase of the snd/rcv-buffers. Thus, we
9861 ++ * use the original values instead of the bloated up ones from the
9862 ++ * clone.
9863 ++ */
9864 ++ child->sk_sndbuf = mpcb->orig_sk_sndbuf;
9865 ++ child->sk_rcvbuf = mpcb->orig_sk_rcvbuf;
9866 ++
9867 ++ child_tp->mptcp->slave_sk = 1;
9868 ++ child_tp->mptcp->snt_isn = tcp_rsk(req)->snt_isn;
9869 ++ child_tp->mptcp->rcv_isn = tcp_rsk(req)->rcv_isn;
9870 ++ child_tp->mptcp->init_rcv_wnd = req->rcv_wnd;
9871 ++
9872 ++ child_tp->tsq_flags = 0;
9873 ++
9874 ++ /* Subflows do not use the accept queue, as they
9875 ++ * are attached immediately to the mpcb.
9876 ++ */
9877 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9878 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9879 ++ reqsk_free(req);
9880 ++ return child;
9881 ++
9882 ++teardown:
9883 ++ /* Drop this request - sock creation failed. */
9884 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
9885 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue, req);
9886 ++ reqsk_free(req);
9887 ++ inet_csk_prepare_forced_close(child);
9888 ++ tcp_done(child);
9889 ++ return meta_sk;
9890 ++}
9891 ++
9892 ++int mptcp_init_tw_sock(struct sock *sk, struct tcp_timewait_sock *tw)
9893 ++{
9894 ++ struct mptcp_tw *mptw;
9895 ++ struct tcp_sock *tp = tcp_sk(sk);
9896 ++ struct mptcp_cb *mpcb = tp->mpcb;
9897 ++
9898 ++ /* A subsocket in tw can only receive data. So, if we are in
9899 ++ * infinite-receive, then we should not reply with a data-ack or act
9900 ++ * upon general MPTCP-signaling. We prevent this by simply not creating
9901 ++ * the mptcp_tw_sock.
9902 ++ */
9903 ++ if (mpcb->infinite_mapping_rcv) {
9904 ++ tw->mptcp_tw = NULL;
9905 ++ return 0;
9906 ++ }
9907 ++
9908 ++ /* Alloc MPTCP-tw-sock */
9909 ++ mptw = kmem_cache_alloc(mptcp_tw_cache, GFP_ATOMIC);
9910 ++ if (!mptw)
9911 ++ return -ENOBUFS;
9912 ++
9913 ++ atomic_inc(&mpcb->mpcb_refcnt);
9914 ++
9915 ++ tw->mptcp_tw = mptw;
9916 ++ mptw->loc_key = mpcb->mptcp_loc_key;
9917 ++ mptw->meta_tw = mpcb->in_time_wait;
9918 ++ if (mptw->meta_tw) {
9919 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(mptcp_meta_tp(tp));
9920 ++ if (mpcb->mptw_state != TCP_TIME_WAIT)
9921 ++ mptw->rcv_nxt++;
9922 ++ }
9923 ++ rcu_assign_pointer(mptw->mpcb, mpcb);
9924 ++
9925 ++ spin_lock(&mpcb->tw_lock);
9926 ++ list_add_rcu(&mptw->list, &tp->mpcb->tw_list);
9927 ++ mptw->in_list = 1;
9928 ++ spin_unlock(&mpcb->tw_lock);
9929 ++
9930 ++ return 0;
9931 ++}
9932 ++
9933 ++void mptcp_twsk_destructor(struct tcp_timewait_sock *tw)
9934 ++{
9935 ++ struct mptcp_cb *mpcb;
9936 ++
9937 ++ rcu_read_lock();
9938 ++ mpcb = rcu_dereference(tw->mptcp_tw->mpcb);
9939 ++
9940 ++ /* If we are still holding a ref to the mpcb, we have to remove ourselves
9941 ++ * from the list and drop the ref properly.
9942 ++ */
9943 ++ if (mpcb && atomic_inc_not_zero(&mpcb->mpcb_refcnt)) {
9944 ++ spin_lock(&mpcb->tw_lock);
9945 ++ if (tw->mptcp_tw->in_list) {
9946 ++ list_del_rcu(&tw->mptcp_tw->list);
9947 ++ tw->mptcp_tw->in_list = 0;
9948 ++ }
9949 ++ spin_unlock(&mpcb->tw_lock);
9950 ++
9951 ++ /* Twice, because we increased it above */
9952 ++ mptcp_mpcb_put(mpcb);
9953 ++ mptcp_mpcb_put(mpcb);
9954 ++ }
9955 ++
9956 ++ rcu_read_unlock();
9957 ++
9958 ++ kmem_cache_free(mptcp_tw_cache, tw->mptcp_tw);
9959 ++}
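
mptcp_twsk_destructor() relies on atomic_inc_not_zero() as a conditional "try-get": the time-wait side only walks tw_list if it can still take a reference on the mpcb, and then drops that reference twice (the one it just took plus its own). A minimal user-space sketch of the inc-not-zero idiom with C11 atomics - hypothetical obj/refcnt names, not the MPTCP structures - looks like this:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>

    /* Hypothetical refcounted object; only the try-get idiom is illustrated. */
    struct obj {
            atomic_int refcnt;
    };

    /* Take a reference only if the count has not already dropped to zero. */
    static bool obj_get_not_zero(struct obj *o)
    {
            int old = atomic_load(&o->refcnt);

            while (old != 0) {
                    /* On failure, 'old' is reloaded with the current value. */
                    if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
                            return true;
            }
            return false;   /* object is already dying - do not touch it */
    }

    static void obj_put(struct obj *o)
    {
            if (atomic_fetch_sub(&o->refcnt, 1) == 1)
                    free(o);        /* last reference gone */
    }
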
9960 ++
9961 ++/* Updates the rcv_nxt of the time-wait-socks and allows them to ack a
9962 ++ * data-fin.
9963 ++ */
9964 ++void mptcp_time_wait(struct sock *sk, int state, int timeo)
9965 ++{
9966 ++ struct tcp_sock *tp = tcp_sk(sk);
9967 ++ struct mptcp_tw *mptw;
9968 ++
9969 ++ /* Used for sockets that go into tw after the meta
9970 ++ * (see mptcp_init_tw_sock())
9971 ++ */
9972 ++ tp->mpcb->in_time_wait = 1;
9973 ++ tp->mpcb->mptw_state = state;
9974 ++
9975 ++ /* Update the time-wait-sock's information */
9976 ++ rcu_read_lock_bh();
9977 ++ list_for_each_entry_rcu(mptw, &tp->mpcb->tw_list, list) {
9978 ++ mptw->meta_tw = 1;
9979 ++ mptw->rcv_nxt = mptcp_get_rcv_nxt_64(tp);
9980 ++
9981 ++ /* We want to ack a DATA_FIN, but are still in FIN_WAIT_2 -
9982 ++ * pretend as if the DATA_FIN has already reached us; that way
9983 ++ * the checks in tcp_timewait_state_process will succeed when the
9984 ++ * DATA_FIN comes in.
9985 ++ */
9986 ++ if (state != TCP_TIME_WAIT)
9987 ++ mptw->rcv_nxt++;
9988 ++ }
9989 ++ rcu_read_unlock_bh();
9990 ++
9991 ++ tcp_done(sk);
9992 ++}
9993 ++
9994 ++void mptcp_tsq_flags(struct sock *sk)
9995 ++{
9996 ++ struct tcp_sock *tp = tcp_sk(sk);
9997 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
9998 ++
9999 ++ /* It will be handled as a regular deferred-call */
10000 ++ if (is_meta_sk(sk))
10001 ++ return;
10002 ++
10003 ++ if (hlist_unhashed(&tp->mptcp->cb_list)) {
10004 ++ hlist_add_head(&tp->mptcp->cb_list, &tp->mpcb->callback_list);
10005 ++ /* We need to hold it here, as the sock_hold is not ensured
10006 ++ * by release_sock as it is in regular TCP.
10007 ++ *
10008 ++ * The subsocket may get inet_csk_destroy'd while it is inside
10009 ++ * the callback_list.
10010 ++ */
10011 ++ sock_hold(sk);
10012 ++ }
10013 ++
10014 ++ if (!test_and_set_bit(MPTCP_SUB_DEFERRED, &tcp_sk(meta_sk)->tsq_flags))
10015 ++ sock_hold(meta_sk);
10016 ++}
10017 ++
10018 ++void mptcp_tsq_sub_deferred(struct sock *meta_sk)
10019 ++{
10020 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
10021 ++ struct mptcp_tcp_sock *mptcp;
10022 ++ struct hlist_node *tmp;
10023 ++
10024 ++ BUG_ON(!is_meta_sk(meta_sk) && !meta_tp->was_meta_sk);
10025 ++
10026 ++ __sock_put(meta_sk);
10027 ++ hlist_for_each_entry_safe(mptcp, tmp, &meta_tp->mpcb->callback_list, cb_list) {
10028 ++ struct tcp_sock *tp = mptcp->tp;
10029 ++ struct sock *sk = (struct sock *)tp;
10030 ++
10031 ++ hlist_del_init(&mptcp->cb_list);
10032 ++ sk->sk_prot->release_cb(sk);
10033 ++ /* Final sock_put (cf. mptcp_tsq_flags()) */
10034 ++ sock_put(sk);
10035 ++ }
10036 ++}
10037 ++
10038 ++void mptcp_join_reqsk_init(struct mptcp_cb *mpcb, const struct request_sock *req,
10039 ++ struct sk_buff *skb)
10040 ++{
10041 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
10042 ++ struct mptcp_options_received mopt;
10043 ++ u8 mptcp_hash_mac[20];
10044 ++
10045 ++ mptcp_init_mp_opt(&mopt);
10046 ++ tcp_parse_mptcp_options(skb, &mopt);
10047 ++
10048 ++ mtreq = mptcp_rsk(req);
10049 ++ mtreq->mptcp_mpcb = mpcb;
10050 ++ mtreq->is_sub = 1;
10051 ++ inet_rsk(req)->mptcp_rqsk = 1;
10052 ++
10053 ++ mtreq->mptcp_rem_nonce = mopt.mptcp_recv_nonce;
10054 ++
10055 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
10056 ++ (u8 *)&mpcb->mptcp_rem_key,
10057 ++ (u8 *)&mtreq->mptcp_loc_nonce,
10058 ++ (u8 *)&mtreq->mptcp_rem_nonce, (u32 *)mptcp_hash_mac);
10059 ++ mtreq->mptcp_hash_tmac = *(u64 *)mptcp_hash_mac;
10060 ++
10061 ++ mtreq->rem_id = mopt.rem_id;
10062 ++ mtreq->rcv_low_prio = mopt.low_prio;
10063 ++ inet_rsk(req)->saw_mpc = 1;
10064 ++}
10065 ++
10066 ++void mptcp_reqsk_init(struct request_sock *req, const struct sk_buff *skb)
10067 ++{
10068 ++ struct mptcp_options_received mopt;
10069 ++ struct mptcp_request_sock *mreq = mptcp_rsk(req);
10070 ++
10071 ++ mptcp_init_mp_opt(&mopt);
10072 ++ tcp_parse_mptcp_options(skb, &mopt);
10073 ++
10074 ++ mreq->is_sub = 0;
10075 ++ inet_rsk(req)->mptcp_rqsk = 1;
10076 ++ mreq->dss_csum = mopt.dss_csum;
10077 ++ mreq->hash_entry.pprev = NULL;
10078 ++
10079 ++ mptcp_reqsk_new_mptcp(req, &mopt, skb);
10080 ++}
10081 ++
10082 ++int mptcp_conn_request(struct sock *sk, struct sk_buff *skb)
10083 ++{
10084 ++ struct mptcp_options_received mopt;
10085 ++ const struct tcp_sock *tp = tcp_sk(sk);
10086 ++ __u32 isn = TCP_SKB_CB(skb)->when;
10087 ++ bool want_cookie = false;
10088 ++
10089 ++ if ((sysctl_tcp_syncookies == 2 ||
10090 ++ inet_csk_reqsk_queue_is_full(sk)) && !isn) {
10091 ++ want_cookie = tcp_syn_flood_action(sk, skb,
10092 ++ mptcp_request_sock_ops.slab_name);
10093 ++ if (!want_cookie)
10094 ++ goto drop;
10095 ++ }
10096 ++
10097 ++ mptcp_init_mp_opt(&mopt);
10098 ++ tcp_parse_mptcp_options(skb, &mopt);
10099 ++
10100 ++ if (mopt.is_mp_join)
10101 ++ return mptcp_do_join_short(skb, &mopt, sock_net(sk));
10102 ++ if (mopt.drop_me)
10103 ++ goto drop;
10104 ++
10105 ++ if (sysctl_mptcp_enabled == MPTCP_APP && !tp->mptcp_enabled)
10106 ++ mopt.saw_mpc = 0;
10107 ++
10108 ++ if (skb->protocol == htons(ETH_P_IP)) {
10109 ++ if (mopt.saw_mpc && !want_cookie) {
10110 ++ if (skb_rtable(skb)->rt_flags &
10111 ++ (RTCF_BROADCAST | RTCF_MULTICAST))
10112 ++ goto drop;
10113 ++
10114 ++ return tcp_conn_request(&mptcp_request_sock_ops,
10115 ++ &mptcp_request_sock_ipv4_ops,
10116 ++ sk, skb);
10117 ++ }
10118 ++
10119 ++ return tcp_v4_conn_request(sk, skb);
10120 ++#if IS_ENABLED(CONFIG_IPV6)
10121 ++ } else {
10122 ++ if (mopt.saw_mpc && !want_cookie) {
10123 ++ if (!ipv6_unicast_destination(skb))
10124 ++ goto drop;
10125 ++
10126 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
10127 ++ &mptcp_request_sock_ipv6_ops,
10128 ++ sk, skb);
10129 ++ }
10130 ++
10131 ++ return tcp_v6_conn_request(sk, skb);
10132 ++#endif
10133 ++ }
10134 ++drop:
10135 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
10136 ++ return 0;
10137 ++}
10138 ++
10139 ++struct workqueue_struct *mptcp_wq;
10140 ++EXPORT_SYMBOL(mptcp_wq);
10141 ++
10142 ++/* Output /proc/net/mptcp */
10143 ++static int mptcp_pm_seq_show(struct seq_file *seq, void *v)
10144 ++{
10145 ++ struct tcp_sock *meta_tp;
10146 ++ const struct net *net = seq->private;
10147 ++ int i, n = 0;
10148 ++
10149 ++ seq_printf(seq, " sl loc_tok rem_tok v6 local_address remote_address st ns tx_queue rx_queue inode");
10150 ++ seq_putc(seq, '\n');
10151 ++
10152 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10153 ++ struct hlist_nulls_node *node;
10154 ++ rcu_read_lock_bh();
10155 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node,
10156 ++ &tk_hashtable[i], tk_table) {
10157 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
10158 ++ struct sock *meta_sk = (struct sock *)meta_tp;
10159 ++ struct inet_sock *isk = inet_sk(meta_sk);
10160 ++
10161 ++ if (!mptcp(meta_tp) || !net_eq(net, sock_net(meta_sk)))
10162 ++ continue;
10163 ++
10164 ++ if (capable(CAP_NET_ADMIN)) {
10165 ++ seq_printf(seq, "%4d: %04X %04X ", n++,
10166 ++ mpcb->mptcp_loc_token,
10167 ++ mpcb->mptcp_rem_token);
10168 ++ } else {
10169 ++ seq_printf(seq, "%4d: %04X %04X ", n++, -1, -1);
10170 ++ }
10171 ++ if (meta_sk->sk_family == AF_INET ||
10172 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
10173 ++ seq_printf(seq, " 0 %08X:%04X %08X:%04X ",
10174 ++ isk->inet_rcv_saddr,
10175 ++ ntohs(isk->inet_sport),
10176 ++ isk->inet_daddr,
10177 ++ ntohs(isk->inet_dport));
10178 ++#if IS_ENABLED(CONFIG_IPV6)
10179 ++ } else if (meta_sk->sk_family == AF_INET6) {
10180 ++ struct in6_addr *src = &meta_sk->sk_v6_rcv_saddr;
10181 ++ struct in6_addr *dst = &meta_sk->sk_v6_daddr;
10182 ++ seq_printf(seq, " 1 %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X",
10183 ++ src->s6_addr32[0], src->s6_addr32[1],
10184 ++ src->s6_addr32[2], src->s6_addr32[3],
10185 ++ ntohs(isk->inet_sport),
10186 ++ dst->s6_addr32[0], dst->s6_addr32[1],
10187 ++ dst->s6_addr32[2], dst->s6_addr32[3],
10188 ++ ntohs(isk->inet_dport));
10189 ++#endif
10190 ++ }
10191 ++ seq_printf(seq, " %02X %02X %08X:%08X %lu",
10192 ++ meta_sk->sk_state, mpcb->cnt_subflows,
10193 ++ meta_tp->write_seq - meta_tp->snd_una,
10194 ++ max_t(int, meta_tp->rcv_nxt -
10195 ++ meta_tp->copied_seq, 0),
10196 ++ sock_i_ino(meta_sk));
10197 ++ seq_putc(seq, '\n');
10198 ++ }
10199 ++
10200 ++ rcu_read_unlock_bh();
10201 ++ }
10202 ++
10203 ++ return 0;
10204 ++}
10205 ++
10206 ++static int mptcp_pm_seq_open(struct inode *inode, struct file *file)
10207 ++{
10208 ++ return single_open_net(inode, file, mptcp_pm_seq_show);
10209 ++}
10210 ++
10211 ++static const struct file_operations mptcp_pm_seq_fops = {
10212 ++ .owner = THIS_MODULE,
10213 ++ .open = mptcp_pm_seq_open,
10214 ++ .read = seq_read,
10215 ++ .llseek = seq_lseek,
10216 ++ .release = single_release_net,
10217 ++};
10218 ++
10219 ++static int mptcp_pm_init_net(struct net *net)
10220 ++{
10221 ++ if (!proc_create("mptcp", S_IRUGO, net->proc_net, &mptcp_pm_seq_fops))
10222 ++ return -ENOMEM;
10223 ++
10224 ++ return 0;
10225 ++}
10226 ++
10227 ++static void mptcp_pm_exit_net(struct net *net)
10228 ++{
10229 ++ remove_proc_entry("mptcp", net->proc_net);
10230 ++}
10231 ++
10232 ++static struct pernet_operations mptcp_pm_proc_ops = {
10233 ++ .init = mptcp_pm_init_net,
10234 ++ .exit = mptcp_pm_exit_net,
10235 ++};
10236 ++
10237 ++/* General initialization of mptcp */
10238 ++void __init mptcp_init(void)
10239 ++{
10240 ++ int i;
10241 ++ struct ctl_table_header *mptcp_sysctl;
10242 ++
10243 ++ mptcp_sock_cache = kmem_cache_create("mptcp_sock",
10244 ++ sizeof(struct mptcp_tcp_sock),
10245 ++ 0, SLAB_HWCACHE_ALIGN,
10246 ++ NULL);
10247 ++ if (!mptcp_sock_cache)
10248 ++ goto mptcp_sock_cache_failed;
10249 ++
10250 ++ mptcp_cb_cache = kmem_cache_create("mptcp_cb", sizeof(struct mptcp_cb),
10251 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10252 ++ NULL);
10253 ++ if (!mptcp_cb_cache)
10254 ++ goto mptcp_cb_cache_failed;
10255 ++
10256 ++ mptcp_tw_cache = kmem_cache_create("mptcp_tw", sizeof(struct mptcp_tw),
10257 ++ 0, SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
10258 ++ NULL);
10259 ++ if (!mptcp_tw_cache)
10260 ++ goto mptcp_tw_cache_failed;
10261 ++
10262 ++ get_random_bytes(mptcp_secret, sizeof(mptcp_secret));
10263 ++
10264 ++ mptcp_wq = alloc_workqueue("mptcp_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
10265 ++ if (!mptcp_wq)
10266 ++ goto alloc_workqueue_failed;
10267 ++
10268 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
10269 ++ INIT_HLIST_NULLS_HEAD(&tk_hashtable[i], i);
10270 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_htb[i],
10271 ++ i + MPTCP_REQSK_NULLS_BASE);
10272 ++ INIT_HLIST_NULLS_HEAD(&mptcp_reqsk_tk_htb[i], i);
10273 ++ }
10274 ++
10275 ++ spin_lock_init(&mptcp_reqsk_hlock);
10276 ++ spin_lock_init(&mptcp_tk_hashlock);
10277 ++
10278 ++ if (register_pernet_subsys(&mptcp_pm_proc_ops))
10279 ++ goto pernet_failed;
10280 ++
10281 ++#if IS_ENABLED(CONFIG_IPV6)
10282 ++ if (mptcp_pm_v6_init())
10283 ++ goto mptcp_pm_v6_failed;
10284 ++#endif
10285 ++ if (mptcp_pm_v4_init())
10286 ++ goto mptcp_pm_v4_failed;
10287 ++
10288 ++ mptcp_sysctl = register_net_sysctl(&init_net, "net/mptcp", mptcp_table);
10289 ++ if (!mptcp_sysctl)
10290 ++ goto register_sysctl_failed;
10291 ++
10292 ++ if (mptcp_register_path_manager(&mptcp_pm_default))
10293 ++ goto register_pm_failed;
10294 ++
10295 ++ if (mptcp_register_scheduler(&mptcp_sched_default))
10296 ++ goto register_sched_failed;
10297 ++
10298 ++ pr_info("MPTCP: Stable release v0.89.0-rc\n");
10299 ++
10300 ++ mptcp_init_failed = false;
10301 ++
10302 ++ return;
10303 ++
10304 ++register_sched_failed:
10305 ++ mptcp_unregister_path_manager(&mptcp_pm_default);
10306 ++register_pm_failed:
10307 ++ unregister_net_sysctl_table(mptcp_sysctl);
10308 ++register_sysctl_failed:
10309 ++ mptcp_pm_v4_undo();
10310 ++mptcp_pm_v4_failed:
10311 ++#if IS_ENABLED(CONFIG_IPV6)
10312 ++ mptcp_pm_v6_undo();
10313 ++mptcp_pm_v6_failed:
10314 ++#endif
10315 ++ unregister_pernet_subsys(&mptcp_pm_proc_ops);
10316 ++pernet_failed:
10317 ++ destroy_workqueue(mptcp_wq);
10318 ++alloc_workqueue_failed:
10319 ++ kmem_cache_destroy(mptcp_tw_cache);
10320 ++mptcp_tw_cache_failed:
10321 ++ kmem_cache_destroy(mptcp_cb_cache);
10322 ++mptcp_cb_cache_failed:
10323 ++ kmem_cache_destroy(mptcp_sock_cache);
10324 ++mptcp_sock_cache_failed:
10325 ++ mptcp_init_failed = true;
10326 ++}
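
mptcp_init() uses the usual kernel staged-initialization pattern: every cache allocation and registration has a matching error label, and a failure at step N jumps to the label that tears down steps N-1 back to 1 in reverse order, so nothing set up earlier is leaked. A self-contained sketch of the same unwind ladder (three hypothetical resources instead of the MPTCP caches and registrations):

    #include <stdio.h>
    #include <stdlib.h>

    static int setup_everything(void)
    {
            char *a, *b, *c;

            a = malloc(16);                 /* step 1 */
            if (!a)
                    goto err_a;

            b = malloc(16);                 /* step 2 */
            if (!b)
                    goto err_b;

            c = malloc(16);                 /* step 3 */
            if (!c)
                    goto err_c;

            printf("all steps succeeded\n");
            free(c);
            free(b);
            free(a);
            return 0;

            /* Unwind in the reverse order of construction. */
    err_c:
            free(b);
    err_b:
            free(a);
    err_a:
            return -1;
    }

    int main(void)
    {
            return setup_everything() ? 1 : 0;
    }
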
10327 +diff --git a/net/mptcp/mptcp_fullmesh.c b/net/mptcp/mptcp_fullmesh.c
10328 +new file mode 100644
10329 +index 000000000000..3a54413ce25b
10330 +--- /dev/null
10331 ++++ b/net/mptcp/mptcp_fullmesh.c
10332 +@@ -0,0 +1,1722 @@
10333 ++#include <linux/module.h>
10334 ++
10335 ++#include <net/mptcp.h>
10336 ++#include <net/mptcp_v4.h>
10337 ++
10338 ++#if IS_ENABLED(CONFIG_IPV6)
10339 ++#include <net/mptcp_v6.h>
10340 ++#include <net/addrconf.h>
10341 ++#endif
10342 ++
10343 ++enum {
10344 ++ MPTCP_EVENT_ADD = 1,
10345 ++ MPTCP_EVENT_DEL,
10346 ++ MPTCP_EVENT_MOD,
10347 ++};
10348 ++
10349 ++#define MPTCP_SUBFLOW_RETRY_DELAY 1000
10350 ++
10351 ++/* Max number of local or remote addresses we can store.
10352 ++ * When changing, see the bitfield below in fullmesh_rem4/6.
10353 ++ */
10354 ++#define MPTCP_MAX_ADDR 8
10355 ++
10356 ++struct fullmesh_rem4 {
10357 ++ u8 rem4_id;
10358 ++ u8 bitfield;
10359 ++ u8 retry_bitfield;
10360 ++ __be16 port;
10361 ++ struct in_addr addr;
10362 ++};
10363 ++
10364 ++struct fullmesh_rem6 {
10365 ++ u8 rem6_id;
10366 ++ u8 bitfield;
10367 ++ u8 retry_bitfield;
10368 ++ __be16 port;
10369 ++ struct in6_addr addr;
10370 ++};
10371 ++
10372 ++struct mptcp_loc_addr {
10373 ++ struct mptcp_loc4 locaddr4[MPTCP_MAX_ADDR];
10374 ++ u8 loc4_bits;
10375 ++ u8 next_v4_index;
10376 ++
10377 ++ struct mptcp_loc6 locaddr6[MPTCP_MAX_ADDR];
10378 ++ u8 loc6_bits;
10379 ++ u8 next_v6_index;
10380 ++};
10381 ++
10382 ++struct mptcp_addr_event {
10383 ++ struct list_head list;
10384 ++ unsigned short family;
10385 ++ u8 code:7,
10386 ++ low_prio:1;
10387 ++ union inet_addr addr;
10388 ++};
10389 ++
10390 ++struct fullmesh_priv {
10391 ++ /* Worker struct for subflow establishment */
10392 ++ struct work_struct subflow_work;
10393 ++ /* Delayed worker, when the routing-tables are not yet ready. */
10394 ++ struct delayed_work subflow_retry_work;
10395 ++
10396 ++ /* Remote addresses */
10397 ++ struct fullmesh_rem4 remaddr4[MPTCP_MAX_ADDR];
10398 ++ struct fullmesh_rem6 remaddr6[MPTCP_MAX_ADDR];
10399 ++
10400 ++ struct mptcp_cb *mpcb;
10401 ++
10402 ++ u16 remove_addrs; /* Addresses to remove */
10403 ++ u8 announced_addrs_v4; /* IPv4 Addresses we did announce */
10404 ++ u8 announced_addrs_v6; /* IPv6 Addresses we did announce */
10405 ++
10406 ++ u8 add_addr; /* Are we sending an add_addr? */
10407 ++
10408 ++ u8 rem4_bits;
10409 ++ u8 rem6_bits;
10410 ++};
10411 ++
10412 ++struct mptcp_fm_ns {
10413 ++ struct mptcp_loc_addr __rcu *local;
10414 ++ spinlock_t local_lock; /* Protecting the above pointer */
10415 ++ struct list_head events;
10416 ++ struct delayed_work address_worker;
10417 ++
10418 ++ struct net *net;
10419 ++};
10420 ++
10421 ++static struct mptcp_pm_ops full_mesh __read_mostly;
10422 ++
10423 ++static void full_mesh_create_subflows(struct sock *meta_sk);
10424 ++
10425 ++static struct mptcp_fm_ns *fm_get_ns(const struct net *net)
10426 ++{
10427 ++ return (struct mptcp_fm_ns *)net->mptcp.path_managers[MPTCP_PM_FULLMESH];
10428 ++}
10429 ++
10430 ++static struct fullmesh_priv *fullmesh_get_priv(const struct mptcp_cb *mpcb)
10431 ++{
10432 ++ return (struct fullmesh_priv *)&mpcb->mptcp_pm[0];
10433 ++}
10434 ++
10435 ++/* Find the first free index in the bitfield */
10436 ++static int __mptcp_find_free_index(u8 bitfield, u8 base)
10437 ++{
10438 ++ int i;
10439 ++
10440 ++ /* There are no free bits anyway... */
10441 ++ if (bitfield == 0xff)
10442 ++ goto exit;
10443 ++
10444 ++ i = ffs(~(bitfield >> base)) - 1;
10445 ++ if (i < 0)
10446 ++ goto exit;
10447 ++
10448 ++ /* No free bits when starting at base, try again from 0 */
10449 ++ if (i + base >= sizeof(bitfield) * 8)
10450 ++ return __mptcp_find_free_index(bitfield, 0);
10451 ++
10452 ++ return i + base;
10453 ++exit:
10454 ++ return -1;
10455 ++}
10456 ++
10457 ++static int mptcp_find_free_index(u8 bitfield)
10458 ++{
10459 ++ return __mptcp_find_free_index(bitfield, 0);
10460 ++}
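
__mptcp_find_free_index() looks for the first unset bit at or after 'base' and wraps around to bit 0 when only the upper part of the byte is occupied; a completely full 0xff bitfield is rejected up front. The same search can be exercised on its own - this standalone sketch mirrors the logic for an 8-bit slot map (function and variable names are illustrative):

    #include <stdio.h>
    #include <strings.h>    /* ffs() */

    static int find_free_index(unsigned char bitfield, unsigned int base)
    {
            int i;

            if (bitfield == 0xff)           /* no free slot at all */
                    return -1;

            i = ffs(~(bitfield >> base) & 0xff) - 1;

            /* Nothing free at or above 'base': wrap and retry from 0. */
            if (i < 0 || i + base >= 8)
                    return find_free_index(bitfield, 0);

            return (int)(i + base);
    }

    int main(void)
    {
            /* Slots 0, 1 and 3 taken: first free is 2 from base 0, 4 from base 3. */
            printf("%d %d\n", find_free_index(0x0b, 0), find_free_index(0x0b, 3));
            return 0;
    }
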
10461 ++
10462 ++static void mptcp_addv4_raddr(struct mptcp_cb *mpcb,
10463 ++ const struct in_addr *addr,
10464 ++ __be16 port, u8 id)
10465 ++{
10466 ++ int i;
10467 ++ struct fullmesh_rem4 *rem4;
10468 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10469 ++
10470 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10471 ++ rem4 = &fmp->remaddr4[i];
10472 ++
10473 ++ /* Address is already in the list --- nothing to do */
10474 ++ if (rem4->rem4_id == id &&
10475 ++ rem4->addr.s_addr == addr->s_addr && rem4->port == port)
10476 ++ return;
10477 ++
10478 ++ /* This may be the case when the peer is behind a NAT. It is
10479 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10480 ++ * However, the src_addr of the IP packet has been changed. We
10481 ++ * update the addr in the list, because this is the address as
10482 ++ * OUR BOX sees it.
10483 ++ */
10484 ++ if (rem4->rem4_id == id && rem4->addr.s_addr != addr->s_addr) {
10485 ++ /* update the address */
10486 ++ mptcp_debug("%s: updating old addr:%pI4 to addr %pI4 with id:%d\n",
10487 ++ __func__, &rem4->addr.s_addr,
10488 ++ &addr->s_addr, id);
10489 ++ rem4->addr.s_addr = addr->s_addr;
10490 ++ rem4->port = port;
10491 ++ mpcb->list_rcvd = 1;
10492 ++ return;
10493 ++ }
10494 ++ }
10495 ++
10496 ++ i = mptcp_find_free_index(fmp->rem4_bits);
10497 ++ /* Do we already have the maximum number of local/remote addresses? */
10498 ++ if (i < 0) {
10499 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI4\n",
10500 ++ __func__, MPTCP_MAX_ADDR, &addr->s_addr);
10501 ++ return;
10502 ++ }
10503 ++
10504 ++ rem4 = &fmp->remaddr4[i];
10505 ++
10506 ++ /* Address is not known yet, store it */
10507 ++ rem4->addr.s_addr = addr->s_addr;
10508 ++ rem4->port = port;
10509 ++ rem4->bitfield = 0;
10510 ++ rem4->retry_bitfield = 0;
10511 ++ rem4->rem4_id = id;
10512 ++ mpcb->list_rcvd = 1;
10513 ++ fmp->rem4_bits |= (1 << i);
10514 ++
10515 ++ return;
10516 ++}
10517 ++
10518 ++static void mptcp_addv6_raddr(struct mptcp_cb *mpcb,
10519 ++ const struct in6_addr *addr,
10520 ++ __be16 port, u8 id)
10521 ++{
10522 ++ int i;
10523 ++ struct fullmesh_rem6 *rem6;
10524 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10525 ++
10526 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10527 ++ rem6 = &fmp->remaddr6[i];
10528 ++
10529 ++ /* Address is already in the list --- nothing to do */
10530 ++ if (rem6->rem6_id == id &&
10531 ++ ipv6_addr_equal(&rem6->addr, addr) && rem6->port == port)
10532 ++ return;
10533 ++
10534 ++ /* This may be the case when the peer is behind a NAT. It is
10535 ++ * trying to JOIN, thus sending the JOIN with a certain ID.
10536 ++ * However, the src_addr of the IP packet has been changed. We
10537 ++ * update the addr in the list, because this is the address as
10538 ++ * OUR BOX sees it.
10539 ++ */
10540 ++ if (rem6->rem6_id == id) {
10541 ++ /* update the address */
10542 ++ mptcp_debug("%s: updating old addr: %pI6 to addr %pI6 with id:%d\n",
10543 ++ __func__, &rem6->addr, addr, id);
10544 ++ rem6->addr = *addr;
10545 ++ rem6->port = port;
10546 ++ mpcb->list_rcvd = 1;
10547 ++ return;
10548 ++ }
10549 ++ }
10550 ++
10551 ++ i = mptcp_find_free_index(fmp->rem6_bits);
10552 ++ /* Do we already have the maximum number of local/remote addresses? */
10553 ++ if (i < 0) {
10554 ++ mptcp_debug("%s: At max num of remote addresses: %d --- not adding address: %pI6\n",
10555 ++ __func__, MPTCP_MAX_ADDR, addr);
10556 ++ return;
10557 ++ }
10558 ++
10559 ++ rem6 = &fmp->remaddr6[i];
10560 ++
10561 ++ /* Address is not known yet, store it */
10562 ++ rem6->addr = *addr;
10563 ++ rem6->port = port;
10564 ++ rem6->bitfield = 0;
10565 ++ rem6->retry_bitfield = 0;
10566 ++ rem6->rem6_id = id;
10567 ++ mpcb->list_rcvd = 1;
10568 ++ fmp->rem6_bits |= (1 << i);
10569 ++
10570 ++ return;
10571 ++}
10572 ++
10573 ++static void mptcp_v4_rem_raddress(struct mptcp_cb *mpcb, u8 id)
10574 ++{
10575 ++ int i;
10576 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10577 ++
10578 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10579 ++ if (fmp->remaddr4[i].rem4_id == id) {
10580 ++ /* remove address from bitfield */
10581 ++ fmp->rem4_bits &= ~(1 << i);
10582 ++
10583 ++ break;
10584 ++ }
10585 ++ }
10586 ++}
10587 ++
10588 ++static void mptcp_v6_rem_raddress(const struct mptcp_cb *mpcb, u8 id)
10589 ++{
10590 ++ int i;
10591 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10592 ++
10593 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10594 ++ if (fmp->remaddr6[i].rem6_id == id) {
10595 ++ /* remove address from bitfield */
10596 ++ fmp->rem6_bits &= ~(1 << i);
10597 ++
10598 ++ break;
10599 ++ }
10600 ++ }
10601 ++}
10602 ++
10603 ++/* Sets the bitfield of the remote-address field */
10604 ++static void mptcp_v4_set_init_addr_bit(const struct mptcp_cb *mpcb,
10605 ++ const struct in_addr *addr, u8 index)
10606 ++{
10607 ++ int i;
10608 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10609 ++
10610 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10611 ++ if (fmp->remaddr4[i].addr.s_addr == addr->s_addr) {
10612 ++ fmp->remaddr4[i].bitfield |= (1 << index);
10613 ++ return;
10614 ++ }
10615 ++ }
10616 ++}
10617 ++
10618 ++/* Sets the bitfield of the remote-address field */
10619 ++static void mptcp_v6_set_init_addr_bit(struct mptcp_cb *mpcb,
10620 ++ const struct in6_addr *addr, u8 index)
10621 ++{
10622 ++ int i;
10623 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10624 ++
10625 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10626 ++ if (ipv6_addr_equal(&fmp->remaddr6[i].addr, addr)) {
10627 ++ fmp->remaddr6[i].bitfield |= (1 << index);
10628 ++ return;
10629 ++ }
10630 ++ }
10631 ++}
10632 ++
10633 ++static void mptcp_set_init_addr_bit(struct mptcp_cb *mpcb,
10634 ++ const union inet_addr *addr,
10635 ++ sa_family_t family, u8 id)
10636 ++{
10637 ++ if (family == AF_INET)
10638 ++ mptcp_v4_set_init_addr_bit(mpcb, &addr->in, id);
10639 ++ else
10640 ++ mptcp_v6_set_init_addr_bit(mpcb, &addr->in6, id);
10641 ++}
10642 ++
10643 ++static void retry_subflow_worker(struct work_struct *work)
10644 ++{
10645 ++ struct delayed_work *delayed_work = container_of(work,
10646 ++ struct delayed_work,
10647 ++ work);
10648 ++ struct fullmesh_priv *fmp = container_of(delayed_work,
10649 ++ struct fullmesh_priv,
10650 ++ subflow_retry_work);
10651 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10652 ++ struct sock *meta_sk = mpcb->meta_sk;
10653 ++ struct mptcp_loc_addr *mptcp_local;
10654 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10655 ++ int iter = 0, i;
10656 ++
10657 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10658 ++ * such a big deal if the address-list is not 100% up-to-date.
10659 ++ */
10660 ++ rcu_read_lock_bh();
10661 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10662 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10663 ++ rcu_read_unlock_bh();
10664 ++
10665 ++ if (!mptcp_local)
10666 ++ return;
10667 ++
10668 ++next_subflow:
10669 ++ if (iter) {
10670 ++ release_sock(meta_sk);
10671 ++ mutex_unlock(&mpcb->mpcb_mutex);
10672 ++
10673 ++ cond_resched();
10674 ++ }
10675 ++ mutex_lock(&mpcb->mpcb_mutex);
10676 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10677 ++
10678 ++ iter++;
10679 ++
10680 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10681 ++ goto exit;
10682 ++
10683 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10684 ++ struct fullmesh_rem4 *rem = &fmp->remaddr4[i];
10685 ++ /* Do we need to retry establishing a subflow? */
10686 ++ if (rem->retry_bitfield) {
10687 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10688 ++ struct mptcp_rem4 rem4;
10689 ++
10690 ++ rem->bitfield |= (1 << i);
10691 ++ rem->retry_bitfield &= ~(1 << i);
10692 ++
10693 ++ rem4.addr = rem->addr;
10694 ++ rem4.port = rem->port;
10695 ++ rem4.rem4_id = rem->rem4_id;
10696 ++
10697 ++ mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i], &rem4);
10698 ++ goto next_subflow;
10699 ++ }
10700 ++ }
10701 ++
10702 ++#if IS_ENABLED(CONFIG_IPV6)
10703 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10704 ++ struct fullmesh_rem6 *rem = &fmp->remaddr6[i];
10705 ++
10706 ++ /* Do we need to retry establishing a subflow? */
10707 ++ if (rem->retry_bitfield) {
10708 ++ int i = mptcp_find_free_index(~rem->retry_bitfield);
10709 ++ struct mptcp_rem6 rem6;
10710 ++
10711 ++ rem->bitfield |= (1 << i);
10712 ++ rem->retry_bitfield &= ~(1 << i);
10713 ++
10714 ++ rem6.addr = rem->addr;
10715 ++ rem6.port = rem->port;
10716 ++ rem6.rem6_id = rem->rem6_id;
10717 ++
10718 ++ mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i], &rem6);
10719 ++ goto next_subflow;
10720 ++ }
10721 ++ }
10722 ++#endif
10723 ++
10724 ++exit:
10725 ++ kfree(mptcp_local);
10726 ++ release_sock(meta_sk);
10727 ++ mutex_unlock(&mpcb->mpcb_mutex);
10728 ++ sock_put(meta_sk);
10729 ++}
10730 ++
10731 ++/**
10732 ++ * Create all new subflows by calling mptcp_initX_subsockets
10733 ++ *
10734 ++ * This function uses a goto to next_subflow to allow releasing the lock between
10735 ++ * new subflows and giving other processes a chance to do some work on the
10736 ++ * socket and potentially finish the communication.
10737 ++ **/
10738 ++static void create_subflow_worker(struct work_struct *work)
10739 ++{
10740 ++ struct fullmesh_priv *fmp = container_of(work, struct fullmesh_priv,
10741 ++ subflow_work);
10742 ++ struct mptcp_cb *mpcb = fmp->mpcb;
10743 ++ struct sock *meta_sk = mpcb->meta_sk;
10744 ++ struct mptcp_loc_addr *mptcp_local;
10745 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
10746 ++ int iter = 0, retry = 0;
10747 ++ int i;
10748 ++
10749 ++ /* We need a local (stable) copy of the address-list. Really, it is not
10750 ++ * such a big deal if the address-list is not 100% up-to-date.
10751 ++ */
10752 ++ rcu_read_lock_bh();
10753 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10754 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local), GFP_ATOMIC);
10755 ++ rcu_read_unlock_bh();
10756 ++
10757 ++ if (!mptcp_local)
10758 ++ return;
10759 ++
10760 ++next_subflow:
10761 ++ if (iter) {
10762 ++ release_sock(meta_sk);
10763 ++ mutex_unlock(&mpcb->mpcb_mutex);
10764 ++
10765 ++ cond_resched();
10766 ++ }
10767 ++ mutex_lock(&mpcb->mpcb_mutex);
10768 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
10769 ++
10770 ++ iter++;
10771 ++
10772 ++ if (sock_flag(meta_sk, SOCK_DEAD))
10773 ++ goto exit;
10774 ++
10775 ++ if (mpcb->master_sk &&
10776 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
10777 ++ goto exit;
10778 ++
10779 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10780 ++ struct fullmesh_rem4 *rem;
10781 ++ u8 remaining_bits;
10782 ++
10783 ++ rem = &fmp->remaddr4[i];
10784 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc4_bits;
10785 ++
10786 ++ /* Are there still combinations to handle? */
10787 ++ if (remaining_bits) {
10788 ++ int i = mptcp_find_free_index(~remaining_bits);
10789 ++ struct mptcp_rem4 rem4;
10790 ++
10791 ++ rem->bitfield |= (1 << i);
10792 ++
10793 ++ rem4.addr = rem->addr;
10794 ++ rem4.port = rem->port;
10795 ++ rem4.rem4_id = rem->rem4_id;
10796 ++
10797 ++ /* If a route is not yet available then retry once */
10798 ++ if (mptcp_init4_subsockets(meta_sk, &mptcp_local->locaddr4[i],
10799 ++ &rem4) == -ENETUNREACH)
10800 ++ retry = rem->retry_bitfield |= (1 << i);
10801 ++ goto next_subflow;
10802 ++ }
10803 ++ }
10804 ++
10805 ++#if IS_ENABLED(CONFIG_IPV6)
10806 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10807 ++ struct fullmesh_rem6 *rem;
10808 ++ u8 remaining_bits;
10809 ++
10810 ++ rem = &fmp->remaddr6[i];
10811 ++ remaining_bits = ~(rem->bitfield) & mptcp_local->loc6_bits;
10812 ++
10813 ++ /* Are there still combinations to handle? */
10814 ++ if (remaining_bits) {
10815 ++ int i = mptcp_find_free_index(~remaining_bits);
10816 ++ struct mptcp_rem6 rem6;
10817 ++
10818 ++ rem->bitfield |= (1 << i);
10819 ++
10820 ++ rem6.addr = rem->addr;
10821 ++ rem6.port = rem->port;
10822 ++ rem6.rem6_id = rem->rem6_id;
10823 ++
10824 ++ /* If a route is not yet available then retry once */
10825 ++ if (mptcp_init6_subsockets(meta_sk, &mptcp_local->locaddr6[i],
10826 ++ &rem6) == -ENETUNREACH)
10827 ++ retry = rem->retry_bitfield |= (1 << i);
10828 ++ goto next_subflow;
10829 ++ }
10830 ++ }
10831 ++#endif
10832 ++
10833 ++ if (retry && !delayed_work_pending(&fmp->subflow_retry_work)) {
10834 ++ sock_hold(meta_sk);
10835 ++ queue_delayed_work(mptcp_wq, &fmp->subflow_retry_work,
10836 ++ msecs_to_jiffies(MPTCP_SUBFLOW_RETRY_DELAY));
10837 ++ }
10838 ++
10839 ++exit:
10840 ++ kfree(mptcp_local);
10841 ++ release_sock(meta_sk);
10842 ++ mutex_unlock(&mpcb->mpcb_mutex);
10843 ++ sock_put(meta_sk);
10844 ++}
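
create_subflow_worker() intentionally drops both the mpcb mutex and the meta-socket lock between subflow attempts (the 'goto next_subflow' / cond_resched() dance), so that packet processing and user-space calls on the socket can interleave with subflow creation. A user-space sketch of that "one item per lock hold, then unlock and yield" shape, using a pthread mutex and hypothetical work items:

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

    static void create_all(int n_items)
    {
            int done = 0;

            for (;;) {
                    pthread_mutex_lock(&state_lock);

                    if (done >= n_items) {          /* nothing left to set up */
                            pthread_mutex_unlock(&state_lock);
                            return;
                    }

                    printf("setting up item %d under the lock\n", done);
                    done++;

                    /* Drop the lock and yield between items so other threads
                     * can run against the shared state, much like the
                     * release_sock()/cond_resched() calls in the worker above.
                     */
                    pthread_mutex_unlock(&state_lock);
                    sched_yield();
            }
    }

    int main(void)
    {
            create_all(3);
            return 0;
    }
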
10845 ++
10846 ++static void announce_remove_addr(u8 addr_id, struct sock *meta_sk)
10847 ++{
10848 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10849 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10850 ++ struct sock *sk = mptcp_select_ack_sock(meta_sk);
10851 ++
10852 ++ fmp->remove_addrs |= (1 << addr_id);
10853 ++ mpcb->addr_signal = 1;
10854 ++
10855 ++ if (sk)
10856 ++ tcp_send_ack(sk);
10857 ++}
10858 ++
10859 ++static void update_addr_bitfields(struct sock *meta_sk,
10860 ++ const struct mptcp_loc_addr *mptcp_local)
10861 ++{
10862 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
10863 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
10864 ++ int i;
10865 ++
10866 ++ /* The bits in announced_addrs_* always match with loc*_bits. So, a
10867 ++ * simple & operation unsets the correct bits, because these go from
10868 ++ * announced to non-announced
10869 ++ */
10870 ++ fmp->announced_addrs_v4 &= mptcp_local->loc4_bits;
10871 ++
10872 ++ mptcp_for_each_bit_set(fmp->rem4_bits, i) {
10873 ++ fmp->remaddr4[i].bitfield &= mptcp_local->loc4_bits;
10874 ++ fmp->remaddr4[i].retry_bitfield &= mptcp_local->loc4_bits;
10875 ++ }
10876 ++
10877 ++ fmp->announced_addrs_v6 &= mptcp_local->loc6_bits;
10878 ++
10879 ++ mptcp_for_each_bit_set(fmp->rem6_bits, i) {
10880 ++ fmp->remaddr6[i].bitfield &= mptcp_local->loc6_bits;
10881 ++ fmp->remaddr6[i].retry_bitfield &= mptcp_local->loc6_bits;
10882 ++ }
10883 ++}
10884 ++
10885 ++static int mptcp_find_address(const struct mptcp_loc_addr *mptcp_local,
10886 ++ sa_family_t family, const union inet_addr *addr)
10887 ++{
10888 ++ int i;
10889 ++ u8 loc_bits;
10890 ++ bool found = false;
10891 ++
10892 ++ if (family == AF_INET)
10893 ++ loc_bits = mptcp_local->loc4_bits;
10894 ++ else
10895 ++ loc_bits = mptcp_local->loc6_bits;
10896 ++
10897 ++ mptcp_for_each_bit_set(loc_bits, i) {
10898 ++ if (family == AF_INET &&
10899 ++ mptcp_local->locaddr4[i].addr.s_addr == addr->in.s_addr) {
10900 ++ found = true;
10901 ++ break;
10902 ++ }
10903 ++ if (family == AF_INET6 &&
10904 ++ ipv6_addr_equal(&mptcp_local->locaddr6[i].addr,
10905 ++ &addr->in6)) {
10906 ++ found = true;
10907 ++ break;
10908 ++ }
10909 ++ }
10910 ++
10911 ++ if (!found)
10912 ++ return -1;
10913 ++
10914 ++ return i;
10915 ++}
10916 ++
10917 ++static void mptcp_address_worker(struct work_struct *work)
10918 ++{
10919 ++ const struct delayed_work *delayed_work = container_of(work,
10920 ++ struct delayed_work,
10921 ++ work);
10922 ++ struct mptcp_fm_ns *fm_ns = container_of(delayed_work,
10923 ++ struct mptcp_fm_ns,
10924 ++ address_worker);
10925 ++ struct net *net = fm_ns->net;
10926 ++ struct mptcp_addr_event *event = NULL;
10927 ++ struct mptcp_loc_addr *mptcp_local, *old;
10928 ++ int i, id = -1; /* id is used in the socket-code on a delete-event */
10929 ++ bool success; /* Used to indicate if we succeeded handling the event */
10930 ++
10931 ++next_event:
10932 ++ success = false;
10933 ++ kfree(event);
10934 ++
10935 ++ /* First, let's dequeue an event from our event-list */
10936 ++ rcu_read_lock_bh();
10937 ++ spin_lock(&fm_ns->local_lock);
10938 ++
10939 ++ event = list_first_entry_or_null(&fm_ns->events,
10940 ++ struct mptcp_addr_event, list);
10941 ++ if (!event) {
10942 ++ spin_unlock(&fm_ns->local_lock);
10943 ++ rcu_read_unlock_bh();
10944 ++ return;
10945 ++ }
10946 ++
10947 ++ list_del(&event->list);
10948 ++
10949 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
10950 ++
10951 ++ if (event->code == MPTCP_EVENT_DEL) {
10952 ++ id = mptcp_find_address(mptcp_local, event->family, &event->addr);
10953 ++
10954 ++ /* Not in the list - so we don't care */
10955 ++ if (id < 0) {
10956 ++ mptcp_debug("%s could not find id\n", __func__);
10957 ++ goto duno;
10958 ++ }
10959 ++
10960 ++ old = mptcp_local;
10961 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
10962 ++ GFP_ATOMIC);
10963 ++ if (!mptcp_local)
10964 ++ goto duno;
10965 ++
10966 ++ if (event->family == AF_INET)
10967 ++ mptcp_local->loc4_bits &= ~(1 << id);
10968 ++ else
10969 ++ mptcp_local->loc6_bits &= ~(1 << id);
10970 ++
10971 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
10972 ++ kfree(old);
10973 ++ } else {
10974 ++ int i = mptcp_find_address(mptcp_local, event->family, &event->addr);
10975 ++ int j = i;
10976 ++
10977 ++ if (j < 0) {
10978 ++ /* Not in the list, so we have to find an empty slot */
10979 ++ if (event->family == AF_INET)
10980 ++ i = __mptcp_find_free_index(mptcp_local->loc4_bits,
10981 ++ mptcp_local->next_v4_index);
10982 ++ if (event->family == AF_INET6)
10983 ++ i = __mptcp_find_free_index(mptcp_local->loc6_bits,
10984 ++ mptcp_local->next_v6_index);
10985 ++
10986 ++ if (i < 0) {
10987 ++ mptcp_debug("%s no more space\n", __func__);
10988 ++ goto duno;
10989 ++ }
10990 ++
10991 ++ /* It might have been a MOD-event. */
10992 ++ event->code = MPTCP_EVENT_ADD;
10993 ++ } else {
10994 ++ /* Let's check if anything changes */
10995 ++ if (event->family == AF_INET &&
10996 ++ event->low_prio == mptcp_local->locaddr4[i].low_prio)
10997 ++ goto duno;
10998 ++
10999 ++ if (event->family == AF_INET6 &&
11000 ++ event->low_prio == mptcp_local->locaddr6[i].low_prio)
11001 ++ goto duno;
11002 ++ }
11003 ++
11004 ++ old = mptcp_local;
11005 ++ mptcp_local = kmemdup(mptcp_local, sizeof(*mptcp_local),
11006 ++ GFP_ATOMIC);
11007 ++ if (!mptcp_local)
11008 ++ goto duno;
11009 ++
11010 ++ if (event->family == AF_INET) {
11011 ++ mptcp_local->locaddr4[i].addr.s_addr = event->addr.in.s_addr;
11012 ++ mptcp_local->locaddr4[i].loc4_id = i + 1;
11013 ++ mptcp_local->locaddr4[i].low_prio = event->low_prio;
11014 ++ } else {
11015 ++ mptcp_local->locaddr6[i].addr = event->addr.in6;
11016 ++ mptcp_local->locaddr6[i].loc6_id = i + MPTCP_MAX_ADDR;
11017 ++ mptcp_local->locaddr6[i].low_prio = event->low_prio;
11018 ++ }
11019 ++
11020 ++ if (j < 0) {
11021 ++ if (event->family == AF_INET) {
11022 ++ mptcp_local->loc4_bits |= (1 << i);
11023 ++ mptcp_local->next_v4_index = i + 1;
11024 ++ } else {
11025 ++ mptcp_local->loc6_bits |= (1 << i);
11026 ++ mptcp_local->next_v6_index = i + 1;
11027 ++ }
11028 ++ }
11029 ++
11030 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11031 ++ kfree(old);
11032 ++ }
11033 ++ success = true;
11034 ++
11035 ++duno:
11036 ++ spin_unlock(&fm_ns->local_lock);
11037 ++ rcu_read_unlock_bh();
11038 ++
11039 ++ if (!success)
11040 ++ goto next_event;
11041 ++
11042 ++ /* Now we iterate over the MPTCP-sockets and apply the event. */
11043 ++ for (i = 0; i < MPTCP_HASH_SIZE; i++) {
11044 ++ const struct hlist_nulls_node *node;
11045 ++ struct tcp_sock *meta_tp;
11046 ++
11047 ++ rcu_read_lock_bh();
11048 ++ hlist_nulls_for_each_entry_rcu(meta_tp, node, &tk_hashtable[i],
11049 ++ tk_table) {
11050 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
11051 ++ struct sock *meta_sk = (struct sock *)meta_tp, *sk;
11052 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11053 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11054 ++
11055 ++ if (sock_net(meta_sk) != net)
11056 ++ continue;
11057 ++
11058 ++ if (meta_v4) {
11059 ++ /* skip IPv6 events if meta is IPv4 */
11060 ++ if (event->family == AF_INET6)
11061 ++ continue;
11062 ++ }
11063 ++ /* skip IPv4 events if IPV6_V6ONLY is set */
11064 ++ else if (event->family == AF_INET &&
11065 ++ inet6_sk(meta_sk)->ipv6only)
11066 ++ continue;
11067 ++
11068 ++ if (unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
11069 ++ continue;
11070 ++
11071 ++ bh_lock_sock(meta_sk);
11072 ++
11073 ++ if (!mptcp(meta_tp) || !is_meta_sk(meta_sk) ||
11074 ++ mpcb->infinite_mapping_snd ||
11075 ++ mpcb->infinite_mapping_rcv ||
11076 ++ mpcb->send_infinite_mapping)
11077 ++ goto next;
11078 ++
11079 ++ /* The pm may have changed in the meantime */
11080 ++ if (mpcb->pm_ops != &full_mesh)
11081 ++ goto next;
11082 ++
11083 ++ if (sock_owned_by_user(meta_sk)) {
11084 ++ if (!test_and_set_bit(MPTCP_PATH_MANAGER,
11085 ++ &meta_tp->tsq_flags))
11086 ++ sock_hold(meta_sk);
11087 ++
11088 ++ goto next;
11089 ++ }
11090 ++
11091 ++ if (event->code == MPTCP_EVENT_ADD) {
11092 ++ fmp->add_addr++;
11093 ++ mpcb->addr_signal = 1;
11094 ++
11095 ++ sk = mptcp_select_ack_sock(meta_sk);
11096 ++ if (sk)
11097 ++ tcp_send_ack(sk);
11098 ++
11099 ++ full_mesh_create_subflows(meta_sk);
11100 ++ }
11101 ++
11102 ++ if (event->code == MPTCP_EVENT_DEL) {
11103 ++ struct sock *sk, *tmpsk;
11104 ++ struct mptcp_loc_addr *mptcp_local;
11105 ++ bool found = false;
11106 ++
11107 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11108 ++
11109 ++ /* In any case, we need to update our bitfields */
11110 ++ if (id >= 0)
11111 ++ update_addr_bitfields(meta_sk, mptcp_local);
11112 ++
11113 ++ /* Look for the socket and remove it */
11114 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11115 ++ if ((event->family == AF_INET6 &&
11116 ++ (sk->sk_family == AF_INET ||
11117 ++ mptcp_v6_is_v4_mapped(sk))) ||
11118 ++ (event->family == AF_INET &&
11119 ++ (sk->sk_family == AF_INET6 &&
11120 ++ !mptcp_v6_is_v4_mapped(sk))))
11121 ++ continue;
11122 ++
11123 ++ if (event->family == AF_INET &&
11124 ++ (sk->sk_family == AF_INET ||
11125 ++ mptcp_v6_is_v4_mapped(sk)) &&
11126 ++ inet_sk(sk)->inet_saddr != event->addr.in.s_addr)
11127 ++ continue;
11128 ++
11129 ++ if (event->family == AF_INET6 &&
11130 ++ sk->sk_family == AF_INET6 &&
11131 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6))
11132 ++ continue;
11133 ++
11134 ++ /* Reinject, so that pf = 1 and so we
11135 ++ * won't select this one as the
11136 ++ * ack-sock.
11137 ++ */
11138 ++ mptcp_reinject_data(sk, 0);
11139 ++
11140 ++ /* We announce the removal of this id */
11141 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id, meta_sk);
11142 ++
11143 ++ mptcp_sub_force_close(sk);
11144 ++ found = true;
11145 ++ }
11146 ++
11147 ++ if (found)
11148 ++ goto next;
11149 ++
11150 ++ /* The id may have been given by the event,
11151 ++ * matching on a local address. And it may not
11152 ++ * have matched on one of the above sockets,
11153 ++ * because the client never created a subflow.
11154 ++ * So, we have to finally remove it here.
11155 ++ */
11156 ++ if (id > 0)
11157 ++ announce_remove_addr(id, meta_sk);
11158 ++ }
11159 ++
11160 ++ if (event->code == MPTCP_EVENT_MOD) {
11161 ++ struct sock *sk;
11162 ++
11163 ++ mptcp_for_each_sk(mpcb, sk) {
11164 ++ struct tcp_sock *tp = tcp_sk(sk);
11165 ++ if (event->family == AF_INET &&
11166 ++ (sk->sk_family == AF_INET ||
11167 ++ mptcp_v6_is_v4_mapped(sk)) &&
11168 ++ inet_sk(sk)->inet_saddr == event->addr.in.s_addr) {
11169 ++ if (event->low_prio != tp->mptcp->low_prio) {
11170 ++ tp->mptcp->send_mp_prio = 1;
11171 ++ tp->mptcp->low_prio = event->low_prio;
11172 ++
11173 ++ tcp_send_ack(sk);
11174 ++ }
11175 ++ }
11176 ++
11177 ++ if (event->family == AF_INET6 &&
11178 ++ sk->sk_family == AF_INET6 &&
11179 ++ !ipv6_addr_equal(&inet6_sk(sk)->saddr, &event->addr.in6)) {
11180 ++ if (event->low_prio != tp->mptcp->low_prio) {
11181 ++ tp->mptcp->send_mp_prio = 1;
11182 ++ tp->mptcp->low_prio = event->low_prio;
11183 ++
11184 ++ tcp_send_ack(sk);
11185 ++ }
11186 ++ }
11187 ++ }
11188 ++ }
11189 ++next:
11190 ++ bh_unlock_sock(meta_sk);
11191 ++ sock_put(meta_sk);
11192 ++ }
11193 ++ rcu_read_unlock_bh();
11194 ++ }
11195 ++ goto next_event;
11196 ++}
11197 ++
11198 ++static struct mptcp_addr_event *lookup_similar_event(const struct net *net,
11199 ++ const struct mptcp_addr_event *event)
11200 ++{
11201 ++ struct mptcp_addr_event *eventq;
11202 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11203 ++
11204 ++ list_for_each_entry(eventq, &fm_ns->events, list) {
11205 ++ if (eventq->family != event->family)
11206 ++ continue;
11207 ++ if (event->family == AF_INET) {
11208 ++ if (eventq->addr.in.s_addr == event->addr.in.s_addr)
11209 ++ return eventq;
11210 ++ } else {
11211 ++ if (ipv6_addr_equal(&eventq->addr.in6, &event->addr.in6))
11212 ++ return eventq;
11213 ++ }
11214 ++ }
11215 ++ return NULL;
11216 ++}
11217 ++
11218 ++/* We already hold the net-namespace MPTCP-lock */
11219 ++static void add_pm_event(struct net *net, const struct mptcp_addr_event *event)
11220 ++{
11221 ++ struct mptcp_addr_event *eventq = lookup_similar_event(net, event);
11222 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11223 ++
11224 ++ if (eventq) {
11225 ++ switch (event->code) {
11226 ++ case MPTCP_EVENT_DEL:
11227 ++ mptcp_debug("%s del old_code %u\n", __func__, eventq->code);
11228 ++ list_del(&eventq->list);
11229 ++ kfree(eventq);
11230 ++ break;
11231 ++ case MPTCP_EVENT_ADD:
11232 ++ mptcp_debug("%s add old_code %u\n", __func__, eventq->code);
11233 ++ eventq->low_prio = event->low_prio;
11234 ++ eventq->code = MPTCP_EVENT_ADD;
11235 ++ return;
11236 ++ case MPTCP_EVENT_MOD:
11237 ++ mptcp_debug("%s mod old_code %u\n", __func__, eventq->code);
11238 ++ eventq->low_prio = event->low_prio;
11239 ++ eventq->code = MPTCP_EVENT_MOD;
11240 ++ return;
11241 ++ }
11242 ++ }
11243 ++
11244 ++ /* OK, we have to add the new address to the wait queue */
11245 ++ eventq = kmemdup(event, sizeof(struct mptcp_addr_event), GFP_ATOMIC);
11246 ++ if (!eventq)
11247 ++ return;
11248 ++
11249 ++ list_add_tail(&eventq->list, &fm_ns->events);
11250 ++
11251 ++ /* Queue the address worker if it is not already pending */
11252 ++ if (!delayed_work_pending(&fm_ns->address_worker))
11253 ++ queue_delayed_work(mptcp_wq, &fm_ns->address_worker,
11254 ++ msecs_to_jiffies(500));
11255 ++}
11256 ++
11257 ++static void addr4_event_handler(const struct in_ifaddr *ifa, unsigned long event,
11258 ++ struct net *net)
11259 ++{
11260 ++ const struct net_device *netdev = ifa->ifa_dev->dev;
11261 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11262 ++ struct mptcp_addr_event mpevent;
11263 ++
11264 ++ if (ifa->ifa_scope > RT_SCOPE_LINK ||
11265 ++ ipv4_is_loopback(ifa->ifa_local))
11266 ++ return;
11267 ++
11268 ++ spin_lock_bh(&fm_ns->local_lock);
11269 ++
11270 ++ mpevent.family = AF_INET;
11271 ++ mpevent.addr.in.s_addr = ifa->ifa_local;
11272 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11273 ++
11274 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11275 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11276 ++ mpevent.code = MPTCP_EVENT_DEL;
11277 ++ else if (event == NETDEV_UP)
11278 ++ mpevent.code = MPTCP_EVENT_ADD;
11279 ++ else if (event == NETDEV_CHANGE)
11280 ++ mpevent.code = MPTCP_EVENT_MOD;
11281 ++
11282 ++ mptcp_debug("%s created event for %pI4, code %u prio %u\n", __func__,
11283 ++ &ifa->ifa_local, mpevent.code, mpevent.low_prio);
11284 ++ add_pm_event(net, &mpevent);
11285 ++
11286 ++ spin_unlock_bh(&fm_ns->local_lock);
11287 ++ return;
11288 ++}
11289 ++
11290 ++ /* React to IPv4-address add/remove events */
11291 ++static int mptcp_pm_inetaddr_event(struct notifier_block *this,
11292 ++ unsigned long event, void *ptr)
11293 ++{
11294 ++ const struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
11295 ++ struct net *net = dev_net(ifa->ifa_dev->dev);
11296 ++
11297 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11298 ++ event == NETDEV_CHANGE))
11299 ++ return NOTIFY_DONE;
11300 ++
11301 ++ addr4_event_handler(ifa, event, net);
11302 ++
11303 ++ return NOTIFY_DONE;
11304 ++}
11305 ++
11306 ++static struct notifier_block mptcp_pm_inetaddr_notifier = {
11307 ++ .notifier_call = mptcp_pm_inetaddr_event,
11308 ++};
11309 ++
11310 ++#if IS_ENABLED(CONFIG_IPV6)
11311 ++
11312 ++/* IPV6-related address/interface watchers */
11313 ++struct mptcp_dad_data {
11314 ++ struct timer_list timer;
11315 ++ struct inet6_ifaddr *ifa;
11316 ++};
11317 ++
11318 ++static void dad_callback(unsigned long arg);
11319 ++static int inet6_addr_event(struct notifier_block *this,
11320 ++ unsigned long event, void *ptr);
11321 ++
11322 ++static int ipv6_is_in_dad_state(const struct inet6_ifaddr *ifa)
11323 ++{
11324 ++ return (ifa->flags & IFA_F_TENTATIVE) &&
11325 ++ ifa->state == INET6_IFADDR_STATE_DAD;
11326 ++}
11327 ++
11328 ++static void dad_init_timer(struct mptcp_dad_data *data,
11329 ++ struct inet6_ifaddr *ifa)
11330 ++{
11331 ++ data->ifa = ifa;
11332 ++ data->timer.data = (unsigned long)data;
11333 ++ data->timer.function = dad_callback;
11334 ++ if (ifa->idev->cnf.rtr_solicit_delay)
11335 ++ data->timer.expires = jiffies + ifa->idev->cnf.rtr_solicit_delay;
11336 ++ else
11337 ++ data->timer.expires = jiffies + (HZ/10);
11338 ++}
11339 ++
11340 ++static void dad_callback(unsigned long arg)
11341 ++{
11342 ++ struct mptcp_dad_data *data = (struct mptcp_dad_data *)arg;
11343 ++
11344 ++ if (ipv6_is_in_dad_state(data->ifa)) {
11345 ++ dad_init_timer(data, data->ifa);
11346 ++ add_timer(&data->timer);
11347 ++ } else {
11348 ++ inet6_addr_event(NULL, NETDEV_UP, data->ifa);
11349 ++ in6_ifa_put(data->ifa);
11350 ++ kfree(data);
11351 ++ }
11352 ++}
11353 ++
11354 ++static inline void dad_setup_timer(struct inet6_ifaddr *ifa)
11355 ++{
11356 ++ struct mptcp_dad_data *data;
11357 ++
11358 ++ data = kmalloc(sizeof(*data), GFP_ATOMIC);
11359 ++
11360 ++ if (!data)
11361 ++ return;
11362 ++
11363 ++ init_timer(&data->timer);
11364 ++ dad_init_timer(data, ifa);
11365 ++ add_timer(&data->timer);
11366 ++ in6_ifa_hold(ifa);
11367 ++}
11368 ++
11369 ++static void addr6_event_handler(const struct inet6_ifaddr *ifa, unsigned long event,
11370 ++ struct net *net)
11371 ++{
11372 ++ const struct net_device *netdev = ifa->idev->dev;
11373 ++ int addr_type = ipv6_addr_type(&ifa->addr);
11374 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11375 ++ struct mptcp_addr_event mpevent;
11376 ++
11377 ++ if (ifa->scope > RT_SCOPE_LINK ||
11378 ++ addr_type == IPV6_ADDR_ANY ||
11379 ++ (addr_type & IPV6_ADDR_LOOPBACK) ||
11380 ++ (addr_type & IPV6_ADDR_LINKLOCAL))
11381 ++ return;
11382 ++
11383 ++ spin_lock_bh(&fm_ns->local_lock);
11384 ++
11385 ++ mpevent.family = AF_INET6;
11386 ++ mpevent.addr.in6 = ifa->addr;
11387 ++ mpevent.low_prio = (netdev->flags & IFF_MPBACKUP) ? 1 : 0;
11388 ++
11389 ++ if (event == NETDEV_DOWN || !netif_running(netdev) ||
11390 ++ (netdev->flags & IFF_NOMULTIPATH) || !(netdev->flags & IFF_UP))
11391 ++ mpevent.code = MPTCP_EVENT_DEL;
11392 ++ else if (event == NETDEV_UP)
11393 ++ mpevent.code = MPTCP_EVENT_ADD;
11394 ++ else if (event == NETDEV_CHANGE)
11395 ++ mpevent.code = MPTCP_EVENT_MOD;
11396 ++
11397 ++ mptcp_debug("%s created event for %pI6, code %u prio %u\n", __func__,
11398 ++ &ifa->addr, mpevent.code, mpevent.low_prio);
11399 ++ add_pm_event(net, &mpevent);
11400 ++
11401 ++ spin_unlock_bh(&fm_ns->local_lock);
11402 ++ return;
11403 ++}
11404 ++
11405 ++ /* React to IPv6-address add/remove events */
11406 ++static int inet6_addr_event(struct notifier_block *this, unsigned long event,
11407 ++ void *ptr)
11408 ++{
11409 ++ struct inet6_ifaddr *ifa6 = (struct inet6_ifaddr *)ptr;
11410 ++ struct net *net = dev_net(ifa6->idev->dev);
11411 ++
11412 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11413 ++ event == NETDEV_CHANGE))
11414 ++ return NOTIFY_DONE;
11415 ++
11416 ++ if (ipv6_is_in_dad_state(ifa6))
11417 ++ dad_setup_timer(ifa6);
11418 ++ else
11419 ++ addr6_event_handler(ifa6, event, net);
11420 ++
11421 ++ return NOTIFY_DONE;
11422 ++}
11423 ++
11424 ++static struct notifier_block inet6_addr_notifier = {
11425 ++ .notifier_call = inet6_addr_event,
11426 ++};
11427 ++
11428 ++#endif
11429 ++
11430 ++ /* React to ifup/down events */
11431 ++static int netdev_event(struct notifier_block *this, unsigned long event,
11432 ++ void *ptr)
11433 ++{
11434 ++ const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
11435 ++ struct in_device *in_dev;
11436 ++#if IS_ENABLED(CONFIG_IPV6)
11437 ++ struct inet6_dev *in6_dev;
11438 ++#endif
11439 ++
11440 ++ if (!(event == NETDEV_UP || event == NETDEV_DOWN ||
11441 ++ event == NETDEV_CHANGE))
11442 ++ return NOTIFY_DONE;
11443 ++
11444 ++ rcu_read_lock();
11445 ++ in_dev = __in_dev_get_rtnl(dev);
11446 ++
11447 ++ if (in_dev) {
11448 ++ for_ifa(in_dev) {
11449 ++ mptcp_pm_inetaddr_event(NULL, event, ifa);
11450 ++ } endfor_ifa(in_dev);
11451 ++ }
11452 ++
11453 ++#if IS_ENABLED(CONFIG_IPV6)
11454 ++ in6_dev = __in6_dev_get(dev);
11455 ++
11456 ++ if (in6_dev) {
11457 ++ struct inet6_ifaddr *ifa6;
11458 ++ list_for_each_entry(ifa6, &in6_dev->addr_list, if_list)
11459 ++ inet6_addr_event(NULL, event, ifa6);
11460 ++ }
11461 ++#endif
11462 ++
11463 ++ rcu_read_unlock();
11464 ++ return NOTIFY_DONE;
11465 ++}
11466 ++
11467 ++static struct notifier_block mptcp_pm_netdev_notifier = {
11468 ++ .notifier_call = netdev_event,
11469 ++};
11470 ++
11471 ++static void full_mesh_add_raddr(struct mptcp_cb *mpcb,
11472 ++ const union inet_addr *addr,
11473 ++ sa_family_t family, __be16 port, u8 id)
11474 ++{
11475 ++ if (family == AF_INET)
11476 ++ mptcp_addv4_raddr(mpcb, &addr->in, port, id);
11477 ++ else
11478 ++ mptcp_addv6_raddr(mpcb, &addr->in6, port, id);
11479 ++}
11480 ++
11481 ++static void full_mesh_new_session(const struct sock *meta_sk)
11482 ++{
11483 ++ struct mptcp_loc_addr *mptcp_local;
11484 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11485 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11486 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11487 ++ int i, index;
11488 ++ union inet_addr saddr, daddr;
11489 ++ sa_family_t family;
11490 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11491 ++
11492 ++ /* Init local variables necessary for the rest */
11493 ++ if (meta_sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(meta_sk)) {
11494 ++ saddr.ip = inet_sk(meta_sk)->inet_saddr;
11495 ++ daddr.ip = inet_sk(meta_sk)->inet_daddr;
11496 ++ family = AF_INET;
11497 ++#if IS_ENABLED(CONFIG_IPV6)
11498 ++ } else {
11499 ++ saddr.in6 = inet6_sk(meta_sk)->saddr;
11500 ++ daddr.in6 = meta_sk->sk_v6_daddr;
11501 ++ family = AF_INET6;
11502 ++#endif
11503 ++ }
11504 ++
11505 ++ rcu_read_lock();
11506 ++ mptcp_local = rcu_dereference(fm_ns->local);
11507 ++
11508 ++ index = mptcp_find_address(mptcp_local, family, &saddr);
11509 ++ if (index < 0)
11510 ++ goto fallback;
11511 ++
11512 ++ full_mesh_add_raddr(mpcb, &daddr, family, 0, 0);
11513 ++ mptcp_set_init_addr_bit(mpcb, &daddr, family, index);
11514 ++
11515 ++ /* Initialize workqueue-struct */
11516 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
11517 ++ INIT_DELAYED_WORK(&fmp->subflow_retry_work, retry_subflow_worker);
11518 ++ fmp->mpcb = mpcb;
11519 ++
11520 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11521 ++ goto skip_ipv4;
11522 ++
11523 ++ /* Look for the address among the local addresses */
11524 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11525 ++ __be32 ifa_address = mptcp_local->locaddr4[i].addr.s_addr;
11526 ++
11527 ++ /* We do not need to announce the initial subflow's address again */
11528 ++ if (family == AF_INET && saddr.ip == ifa_address)
11529 ++ continue;
11530 ++
11531 ++ fmp->add_addr++;
11532 ++ mpcb->addr_signal = 1;
11533 ++ }
11534 ++
11535 ++skip_ipv4:
11536 ++#if IS_ENABLED(CONFIG_IPV6)
11537 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11538 ++ if (meta_v4)
11539 ++ goto skip_ipv6;
11540 ++
11541 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11542 ++ const struct in6_addr *ifa6 = &mptcp_local->locaddr6[i].addr;
11543 ++
11544 ++ /* We do not need to announce the initial subflow's address again */
11545 ++ if (family == AF_INET6 && ipv6_addr_equal(&saddr.in6, ifa6))
11546 ++ continue;
11547 ++
11548 ++ fmp->add_addr++;
11549 ++ mpcb->addr_signal = 1;
11550 ++ }
11551 ++
11552 ++skip_ipv6:
11553 ++#endif
11554 ++
11555 ++ rcu_read_unlock();
11556 ++
11557 ++ if (family == AF_INET)
11558 ++ fmp->announced_addrs_v4 |= (1 << index);
11559 ++ else
11560 ++ fmp->announced_addrs_v6 |= (1 << index);
11561 ++
11562 ++ for (i = fmp->add_addr; i && fmp->add_addr; i--)
11563 ++ tcp_send_ack(mpcb->master_sk);
11564 ++
11565 ++ return;
11566 ++
11567 ++fallback:
11568 ++ rcu_read_unlock();
11569 ++ mptcp_fallback_default(mpcb);
11570 ++ return;
11571 ++}
11572 ++
11573 ++static void full_mesh_create_subflows(struct sock *meta_sk)
11574 ++{
11575 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11576 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11577 ++
11578 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
11579 ++ mpcb->send_infinite_mapping ||
11580 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
11581 ++ return;
11582 ++
11583 ++ if (mpcb->master_sk &&
11584 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
11585 ++ return;
11586 ++
11587 ++ if (!work_pending(&fmp->subflow_work)) {
11588 ++ sock_hold(meta_sk);
11589 ++ queue_work(mptcp_wq, &fmp->subflow_work);
11590 ++ }
11591 ++}
11592 ++
11593 ++/* Called upon release_sock, if the socket was owned by the user during
11594 ++ * a path-management event.
11595 ++ */
11596 ++static void full_mesh_release_sock(struct sock *meta_sk)
11597 ++{
11598 ++ struct mptcp_loc_addr *mptcp_local;
11599 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
11600 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11601 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(meta_sk));
11602 ++ struct sock *sk, *tmpsk;
11603 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11604 ++ int i;
11605 ++
11606 ++ rcu_read_lock();
11607 ++ mptcp_local = rcu_dereference(fm_ns->local);
11608 ++
11609 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11610 ++ goto skip_ipv4;
11611 ++
11612 ++ /* First, detect modifications or additions */
11613 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11614 ++ struct in_addr ifa = mptcp_local->locaddr4[i].addr;
11615 ++ bool found = false;
11616 ++
11617 ++ mptcp_for_each_sk(mpcb, sk) {
11618 ++ struct tcp_sock *tp = tcp_sk(sk);
11619 ++
11620 ++ if (sk->sk_family == AF_INET6 &&
11621 ++ !mptcp_v6_is_v4_mapped(sk))
11622 ++ continue;
11623 ++
11624 ++ if (inet_sk(sk)->inet_saddr != ifa.s_addr)
11625 ++ continue;
11626 ++
11627 ++ found = true;
11628 ++
11629 ++ if (mptcp_local->locaddr4[i].low_prio != tp->mptcp->low_prio) {
11630 ++ tp->mptcp->send_mp_prio = 1;
11631 ++ tp->mptcp->low_prio = mptcp_local->locaddr4[i].low_prio;
11632 ++
11633 ++ tcp_send_ack(sk);
11634 ++ }
11635 ++ }
11636 ++
11637 ++ if (!found) {
11638 ++ fmp->add_addr++;
11639 ++ mpcb->addr_signal = 1;
11640 ++
11641 ++ sk = mptcp_select_ack_sock(meta_sk);
11642 ++ if (sk)
11643 ++ tcp_send_ack(sk);
11644 ++ full_mesh_create_subflows(meta_sk);
11645 ++ }
11646 ++ }
11647 ++
11648 ++skip_ipv4:
11649 ++#if IS_ENABLED(CONFIG_IPV6)
11650 ++ /* skip IPv6 addresses if meta-socket is IPv4 */
11651 ++ if (meta_v4)
11652 ++ goto removal;
11653 ++
11654 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11655 ++ struct in6_addr ifa = mptcp_local->locaddr6[i].addr;
11656 ++ bool found = false;
11657 ++
11658 ++ mptcp_for_each_sk(mpcb, sk) {
11659 ++ struct tcp_sock *tp = tcp_sk(sk);
11660 ++
11661 ++ if (sk->sk_family == AF_INET ||
11662 ++ mptcp_v6_is_v4_mapped(sk))
11663 ++ continue;
11664 ++
11665 ++ if (!ipv6_addr_equal(&inet6_sk(sk)->saddr, &ifa))
11666 ++ continue;
11667 ++
11668 ++ found = true;
11669 ++
11670 ++ if (mptcp_local->locaddr6[i].low_prio != tp->mptcp->low_prio) {
11671 ++ tp->mptcp->send_mp_prio = 1;
11672 ++ tp->mptcp->low_prio = mptcp_local->locaddr6[i].low_prio;
11673 ++
11674 ++ tcp_send_ack(sk);
11675 ++ }
11676 ++ }
11677 ++
11678 ++ if (!found) {
11679 ++ fmp->add_addr++;
11680 ++ mpcb->addr_signal = 1;
11681 ++
11682 ++ sk = mptcp_select_ack_sock(meta_sk);
11683 ++ if (sk)
11684 ++ tcp_send_ack(sk);
11685 ++ full_mesh_create_subflows(meta_sk);
11686 ++ }
11687 ++ }
11688 ++
11689 ++removal:
11690 ++#endif
11691 ++
11692 ++ /* Now, detect address-removals */
11693 ++ mptcp_for_each_sk_safe(mpcb, sk, tmpsk) {
11694 ++ bool shall_remove = true;
11695 ++
11696 ++ if (sk->sk_family == AF_INET || mptcp_v6_is_v4_mapped(sk)) {
11697 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11698 ++ if (inet_sk(sk)->inet_saddr == mptcp_local->locaddr4[i].addr.s_addr) {
11699 ++ shall_remove = false;
11700 ++ break;
11701 ++ }
11702 ++ }
11703 ++ } else {
11704 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11705 ++ if (ipv6_addr_equal(&inet6_sk(sk)->saddr, &mptcp_local->locaddr6[i].addr)) {
11706 ++ shall_remove = false;
11707 ++ break;
11708 ++ }
11709 ++ }
11710 ++ }
11711 ++
11712 ++ if (shall_remove) {
11713 ++ /* Reinject, so that pf = 1 and so we
11714 ++ * won't select this one as the
11715 ++ * ack-sock.
11716 ++ */
11717 ++ mptcp_reinject_data(sk, 0);
11718 ++
11719 ++ announce_remove_addr(tcp_sk(sk)->mptcp->loc_id,
11720 ++ meta_sk);
11721 ++
11722 ++ mptcp_sub_force_close(sk);
11723 ++ }
11724 ++ }
11725 ++
11726 ++ /* Just call it optimistically. It actually cannot do any harm */
11727 ++ update_addr_bitfields(meta_sk, mptcp_local);
11728 ++
11729 ++ rcu_read_unlock();
11730 ++}
11731 ++
11732 ++static int full_mesh_get_local_id(sa_family_t family, union inet_addr *addr,
11733 ++ struct net *net, bool *low_prio)
11734 ++{
11735 ++ struct mptcp_loc_addr *mptcp_local;
11736 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11737 ++ int index, id = -1;
11738 ++
11739 ++ /* Handle the backup-flows */
11740 ++ rcu_read_lock();
11741 ++ mptcp_local = rcu_dereference(fm_ns->local);
11742 ++
11743 ++ index = mptcp_find_address(mptcp_local, family, addr);
11744 ++
11745 ++ if (index != -1) {
11746 ++ if (family == AF_INET) {
11747 ++ id = mptcp_local->locaddr4[index].loc4_id;
11748 ++ *low_prio = mptcp_local->locaddr4[index].low_prio;
11749 ++ } else {
11750 ++ id = mptcp_local->locaddr6[index].loc6_id;
11751 ++ *low_prio = mptcp_local->locaddr6[index].low_prio;
11752 ++ }
11753 ++ }
11754 ++
11755 ++
11756 ++ rcu_read_unlock();
11757 ++
11758 ++ return id;
11759 ++}
11760 ++
11761 ++static void full_mesh_addr_signal(struct sock *sk, unsigned *size,
11762 ++ struct tcp_out_options *opts,
11763 ++ struct sk_buff *skb)
11764 ++{
11765 ++ const struct tcp_sock *tp = tcp_sk(sk);
11766 ++ struct mptcp_cb *mpcb = tp->mpcb;
11767 ++ struct sock *meta_sk = mpcb->meta_sk;
11768 ++ struct fullmesh_priv *fmp = fullmesh_get_priv(mpcb);
11769 ++ struct mptcp_loc_addr *mptcp_local;
11770 ++ struct mptcp_fm_ns *fm_ns = fm_get_ns(sock_net(sk));
11771 ++ int remove_addr_len;
11772 ++ u8 unannouncedv4 = 0, unannouncedv6 = 0;
11773 ++ bool meta_v4 = meta_sk->sk_family == AF_INET;
11774 ++
11775 ++ mpcb->addr_signal = 0;
11776 ++
11777 ++ if (likely(!fmp->add_addr))
11778 ++ goto remove_addr;
11779 ++
11780 ++ rcu_read_lock();
11781 ++ mptcp_local = rcu_dereference(fm_ns->local);
11782 ++
11783 ++ if (!meta_v4 && inet6_sk(meta_sk)->ipv6only)
11784 ++ goto skip_ipv4;
11785 ++
11786 ++ /* IPv4 */
11787 ++ unannouncedv4 = (~fmp->announced_addrs_v4) & mptcp_local->loc4_bits;
11788 ++ if (unannouncedv4 &&
11789 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR4_ALIGN) {
11790 ++ int ind = mptcp_find_free_index(~unannouncedv4);
11791 ++
11792 ++ opts->options |= OPTION_MPTCP;
11793 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11794 ++ opts->add_addr4.addr_id = mptcp_local->locaddr4[ind].loc4_id;
11795 ++ opts->add_addr4.addr = mptcp_local->locaddr4[ind].addr;
11796 ++ opts->add_addr_v4 = 1;
11797 ++
11798 ++ if (skb) {
11799 ++ fmp->announced_addrs_v4 |= (1 << ind);
11800 ++ fmp->add_addr--;
11801 ++ }
11802 ++ *size += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN;
11803 ++ }
11804 ++
11805 ++ if (meta_v4)
11806 ++ goto skip_ipv6;
11807 ++
11808 ++skip_ipv4:
11809 ++ /* IPv6 */
11810 ++ unannouncedv6 = (~fmp->announced_addrs_v6) & mptcp_local->loc6_bits;
11811 ++ if (unannouncedv6 &&
11812 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_ADD_ADDR6_ALIGN) {
11813 ++ int ind = mptcp_find_free_index(~unannouncedv6);
11814 ++
11815 ++ opts->options |= OPTION_MPTCP;
11816 ++ opts->mptcp_options |= OPTION_ADD_ADDR;
11817 ++ opts->add_addr6.addr_id = mptcp_local->locaddr6[ind].loc6_id;
11818 ++ opts->add_addr6.addr = mptcp_local->locaddr6[ind].addr;
11819 ++ opts->add_addr_v6 = 1;
11820 ++
11821 ++ if (skb) {
11822 ++ fmp->announced_addrs_v6 |= (1 << ind);
11823 ++ fmp->add_addr--;
11824 ++ }
11825 ++ *size += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN;
11826 ++ }
11827 ++
11828 ++skip_ipv6:
11829 ++ rcu_read_unlock();
11830 ++
11831 ++ if (!unannouncedv4 && !unannouncedv6 && skb)
11832 ++ fmp->add_addr--;
11833 ++
11834 ++remove_addr:
11835 ++ if (likely(!fmp->remove_addrs))
11836 ++ goto exit;
11837 ++
11838 ++ remove_addr_len = mptcp_sub_len_remove_addr_align(fmp->remove_addrs);
11839 ++ if (MAX_TCP_OPTION_SPACE - *size < remove_addr_len)
11840 ++ goto exit;
11841 ++
11842 ++ opts->options |= OPTION_MPTCP;
11843 ++ opts->mptcp_options |= OPTION_REMOVE_ADDR;
11844 ++ opts->remove_addrs = fmp->remove_addrs;
11845 ++ *size += remove_addr_len;
11846 ++ if (skb)
11847 ++ fmp->remove_addrs = 0;
11848 ++
11849 ++exit:
11850 ++ mpcb->addr_signal = !!(fmp->add_addr || fmp->remove_addrs);
11851 ++}
11852 ++
11853 ++static void full_mesh_rem_raddr(struct mptcp_cb *mpcb, u8 rem_id)
11854 ++{
11855 ++ mptcp_v4_rem_raddress(mpcb, rem_id);
11856 ++ mptcp_v6_rem_raddress(mpcb, rem_id);
11857 ++}
11858 ++
11859 ++/* Output /proc/net/mptcp_fullmesh */
11860 ++static int mptcp_fm_seq_show(struct seq_file *seq, void *v)
11861 ++{
11862 ++ const struct net *net = seq->private;
11863 ++ struct mptcp_loc_addr *mptcp_local;
11864 ++ const struct mptcp_fm_ns *fm_ns = fm_get_ns(net);
11865 ++ int i;
11866 ++
11867 ++ seq_printf(seq, "Index, Address-ID, Backup, IP-address\n");
11868 ++
11869 ++ rcu_read_lock_bh();
11870 ++ mptcp_local = rcu_dereference(fm_ns->local);
11871 ++
11872 ++ seq_printf(seq, "IPv4, next v4-index: %u\n", mptcp_local->next_v4_index);
11873 ++
11874 ++ mptcp_for_each_bit_set(mptcp_local->loc4_bits, i) {
11875 ++ struct mptcp_loc4 *loc4 = &mptcp_local->locaddr4[i];
11876 ++
11877 ++ seq_printf(seq, "%u, %u, %u, %pI4\n", i, loc4->loc4_id,
11878 ++ loc4->low_prio, &loc4->addr);
11879 ++ }
11880 ++
11881 ++ seq_printf(seq, "IPv6, next v6-index: %u\n", mptcp_local->next_v6_index);
11882 ++
11883 ++ mptcp_for_each_bit_set(mptcp_local->loc6_bits, i) {
11884 ++ struct mptcp_loc6 *loc6 = &mptcp_local->locaddr6[i];
11885 ++
11886 ++ seq_printf(seq, "%u, %u, %u, %pI6\n", i, loc6->loc6_id,
11887 ++ loc6->low_prio, &loc6->addr);
11888 ++ }
11889 ++ rcu_read_unlock_bh();
11890 ++
11891 ++ return 0;
11892 ++}
11893 ++
11894 ++static int mptcp_fm_seq_open(struct inode *inode, struct file *file)
11895 ++{
11896 ++ return single_open_net(inode, file, mptcp_fm_seq_show);
11897 ++}
11898 ++
11899 ++static const struct file_operations mptcp_fm_seq_fops = {
11900 ++ .owner = THIS_MODULE,
11901 ++ .open = mptcp_fm_seq_open,
11902 ++ .read = seq_read,
11903 ++ .llseek = seq_lseek,
11904 ++ .release = single_release_net,
11905 ++};
11906 ++
11907 ++static int mptcp_fm_init_net(struct net *net)
11908 ++{
11909 ++ struct mptcp_loc_addr *mptcp_local;
11910 ++ struct mptcp_fm_ns *fm_ns;
11911 ++ int err = 0;
11912 ++
11913 ++ fm_ns = kzalloc(sizeof(*fm_ns), GFP_KERNEL);
11914 ++ if (!fm_ns)
11915 ++ return -ENOBUFS;
11916 ++
11917 ++ mptcp_local = kzalloc(sizeof(*mptcp_local), GFP_KERNEL);
11918 ++ if (!mptcp_local) {
11919 ++ err = -ENOBUFS;
11920 ++ goto err_mptcp_local;
11921 ++ }
11922 ++
11923 ++ if (!proc_create("mptcp_fullmesh", S_IRUGO, net->proc_net,
11924 ++ &mptcp_fm_seq_fops)) {
11925 ++ err = -ENOMEM;
11926 ++ goto err_seq_fops;
11927 ++ }
11928 ++
11929 ++ mptcp_local->next_v4_index = 1;
11930 ++
11931 ++ rcu_assign_pointer(fm_ns->local, mptcp_local);
11932 ++ INIT_DELAYED_WORK(&fm_ns->address_worker, mptcp_address_worker);
11933 ++ INIT_LIST_HEAD(&fm_ns->events);
11934 ++ spin_lock_init(&fm_ns->local_lock);
11935 ++ fm_ns->net = net;
11936 ++ net->mptcp.path_managers[MPTCP_PM_FULLMESH] = fm_ns;
11937 ++
11938 ++ return 0;
11939 ++err_seq_fops:
11940 ++ kfree(mptcp_local);
11941 ++err_mptcp_local:
11942 ++ kfree(fm_ns);
11943 ++ return err;
11944 ++}
11945 ++
11946 ++static void mptcp_fm_exit_net(struct net *net)
11947 ++{
11948 ++ struct mptcp_addr_event *eventq, *tmp;
11949 ++ struct mptcp_fm_ns *fm_ns;
11950 ++ struct mptcp_loc_addr *mptcp_local;
11951 ++
11952 ++ fm_ns = fm_get_ns(net);
11953 ++ cancel_delayed_work_sync(&fm_ns->address_worker);
11954 ++
11955 ++ rcu_read_lock_bh();
11956 ++
11957 ++ mptcp_local = rcu_dereference_bh(fm_ns->local);
11958 ++ kfree(mptcp_local);
11959 ++
11960 ++ spin_lock(&fm_ns->local_lock);
11961 ++ list_for_each_entry_safe(eventq, tmp, &fm_ns->events, list) {
11962 ++ list_del(&eventq->list);
11963 ++ kfree(eventq);
11964 ++ }
11965 ++ spin_unlock(&fm_ns->local_lock);
11966 ++
11967 ++ rcu_read_unlock_bh();
11968 ++
11969 ++ remove_proc_entry("mptcp_fullmesh", net->proc_net);
11970 ++
11971 ++ kfree(fm_ns);
11972 ++}
11973 ++
11974 ++static struct pernet_operations full_mesh_net_ops = {
11975 ++ .init = mptcp_fm_init_net,
11976 ++ .exit = mptcp_fm_exit_net,
11977 ++};
11978 ++
11979 ++static struct mptcp_pm_ops full_mesh __read_mostly = {
11980 ++ .new_session = full_mesh_new_session,
11981 ++ .release_sock = full_mesh_release_sock,
11982 ++ .fully_established = full_mesh_create_subflows,
11983 ++ .new_remote_address = full_mesh_create_subflows,
11984 ++ .get_local_id = full_mesh_get_local_id,
11985 ++ .addr_signal = full_mesh_addr_signal,
11986 ++ .add_raddr = full_mesh_add_raddr,
11987 ++ .rem_raddr = full_mesh_rem_raddr,
11988 ++ .name = "fullmesh",
11989 ++ .owner = THIS_MODULE,
11990 ++};
11991 ++
11992 ++/* General initialization of MPTCP_PM */
11993 ++static int __init full_mesh_register(void)
11994 ++{
11995 ++ int ret;
11996 ++
11997 ++ BUILD_BUG_ON(sizeof(struct fullmesh_priv) > MPTCP_PM_SIZE);
11998 ++
11999 ++ ret = register_pernet_subsys(&full_mesh_net_ops);
12000 ++ if (ret)
12001 ++ goto out;
12002 ++
12003 ++ ret = register_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12004 ++ if (ret)
12005 ++ goto err_reg_inetaddr;
12006 ++ ret = register_netdevice_notifier(&mptcp_pm_netdev_notifier);
12007 ++ if (ret)
12008 ++ goto err_reg_netdev;
12009 ++
12010 ++#if IS_ENABLED(CONFIG_IPV6)
12011 ++ ret = register_inet6addr_notifier(&inet6_addr_notifier);
12012 ++ if (ret)
12013 ++ goto err_reg_inet6addr;
12014 ++#endif
12015 ++
12016 ++ ret = mptcp_register_path_manager(&full_mesh);
12017 ++ if (ret)
12018 ++ goto err_reg_pm;
12019 ++
12020 ++out:
12021 ++ return ret;
12022 ++
12023 ++
12024 ++err_reg_pm:
12025 ++#if IS_ENABLED(CONFIG_IPV6)
12026 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12027 ++err_reg_inet6addr:
12028 ++#endif
12029 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12030 ++err_reg_netdev:
12031 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12032 ++err_reg_inetaddr:
12033 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12034 ++ goto out;
12035 ++}
12036 ++
12037 ++static void full_mesh_unregister(void)
12038 ++{
12039 ++#if IS_ENABLED(CONFIG_IPV6)
12040 ++ unregister_inet6addr_notifier(&inet6_addr_notifier);
12041 ++#endif
12042 ++ unregister_netdevice_notifier(&mptcp_pm_netdev_notifier);
12043 ++ unregister_inetaddr_notifier(&mptcp_pm_inetaddr_notifier);
12044 ++ unregister_pernet_subsys(&full_mesh_net_ops);
12045 ++ mptcp_unregister_path_manager(&full_mesh);
12046 ++}
12047 ++
12048 ++module_init(full_mesh_register);
12049 ++module_exit(full_mesh_unregister);
12050 ++
12051 ++MODULE_AUTHOR("Christoph Paasch");
12052 ++MODULE_LICENSE("GPL");
12053 ++MODULE_DESCRIPTION("Full-Mesh MPTCP");
12054 ++MODULE_VERSION("0.88");
12055 +diff --git a/net/mptcp/mptcp_input.c b/net/mptcp/mptcp_input.c
12056 +new file mode 100644
12057 +index 000000000000..43704ccb639e
12058 +--- /dev/null
12059 ++++ b/net/mptcp/mptcp_input.c
12060 +@@ -0,0 +1,2405 @@
12061 ++/*
12062 ++ * MPTCP implementation - Sending side
12063 ++ *
12064 ++ * Initial Design & Implementation:
12065 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
12066 ++ *
12067 ++ * Current Maintainer & Author:
12068 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
12069 ++ *
12070 ++ * Additional authors:
12071 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
12072 ++ * Gregory Detal <gregory.detal@×××××××××.be>
12073 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
12074 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
12075 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
12076 ++ * Andreas Ripke <ripke@××××××.eu>
12077 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
12078 ++ * Octavian Purdila <octavian.purdila@×××××.com>
12079 ++ * John Ronan <jronan@××××.org>
12080 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
12081 ++ * Brandon Heller <brandonh@××××××××.edu>
12082 ++ *
12083 ++ *
12084 ++ * This program is free software; you can redistribute it and/or
12085 ++ * modify it under the terms of the GNU General Public License
12086 ++ * as published by the Free Software Foundation; either version
12087 ++ * 2 of the License, or (at your option) any later version.
12088 ++ */
12089 ++
12090 ++#include <asm/unaligned.h>
12091 ++
12092 ++#include <net/mptcp.h>
12093 ++#include <net/mptcp_v4.h>
12094 ++#include <net/mptcp_v6.h>
12095 ++
12096 ++#include <linux/kconfig.h>
12097 ++
12098 ++/* is seq1 < seq2 ? */
12099 ++static inline bool before64(const u64 seq1, const u64 seq2)
12100 ++{
12101 ++ return (s64)(seq1 - seq2) < 0;
12102 ++}
12103 ++
12104 ++/* is seq1 > seq2 ? */
12105 ++#define after64(seq1, seq2) before64(seq2, seq1)
12106 ++
12107 ++static inline void mptcp_become_fully_estab(struct sock *sk)
12108 ++{
12109 ++ tcp_sk(sk)->mptcp->fully_established = 1;
12110 ++
12111 ++ if (is_master_tp(tcp_sk(sk)) &&
12112 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established)
12113 ++ tcp_sk(sk)->mpcb->pm_ops->fully_established(mptcp_meta_sk(sk));
12114 ++}
12115 ++
12116 ++/* Similar to tcp_tso_acked without any memory accounting */
12117 ++static inline int mptcp_tso_acked_reinject(const struct sock *meta_sk,
12118 ++ struct sk_buff *skb)
12119 ++{
12120 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12121 ++ u32 packets_acked, len;
12122 ++
12123 ++ BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una));
12124 ++
12125 ++ packets_acked = tcp_skb_pcount(skb);
12126 ++
12127 ++ if (skb_unclone(skb, GFP_ATOMIC))
12128 ++ return 0;
12129 ++
12130 ++ len = meta_tp->snd_una - TCP_SKB_CB(skb)->seq;
12131 ++ __pskb_trim_head(skb, len);
12132 ++
12133 ++ TCP_SKB_CB(skb)->seq += len;
12134 ++ skb->ip_summed = CHECKSUM_PARTIAL;
12135 ++ skb->truesize -= len;
12136 ++
12137 ++ /* Any change of skb->len requires recalculation of tso factor. */
12138 ++ if (tcp_skb_pcount(skb) > 1)
12139 ++ tcp_set_skb_tso_segs(meta_sk, skb, tcp_skb_mss(skb));
12140 ++ packets_acked -= tcp_skb_pcount(skb);
12141 ++
12142 ++ if (packets_acked) {
12143 ++ BUG_ON(tcp_skb_pcount(skb) == 0);
12144 ++ BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
12145 ++ }
12146 ++
12147 ++ return packets_acked;
12148 ++}
12149 ++
12150 ++/**
12151 ++ * Cleans the meta-socket retransmission queue and the reinject-queue.
12152 ++ * @meta_sk must be the meta-socket.
12153 ++ */
12154 ++static void mptcp_clean_rtx_queue(struct sock *meta_sk, u32 prior_snd_una)
12155 ++{
12156 ++ struct sk_buff *skb, *tmp;
12157 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12158 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
12159 ++ bool acked = false;
12160 ++ u32 acked_pcount;
12161 ++
12162 ++ while ((skb = tcp_write_queue_head(meta_sk)) &&
12163 ++ skb != tcp_send_head(meta_sk)) {
12164 ++ bool fully_acked = true;
12165 ++
12166 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12167 ++ if (tcp_skb_pcount(skb) == 1 ||
12168 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12169 ++ break;
12170 ++
12171 ++ acked_pcount = tcp_tso_acked(meta_sk, skb);
12172 ++ if (!acked_pcount)
12173 ++ break;
12174 ++
12175 ++ fully_acked = false;
12176 ++ } else {
12177 ++ acked_pcount = tcp_skb_pcount(skb);
12178 ++ }
12179 ++
12180 ++ acked = true;
12181 ++ meta_tp->packets_out -= acked_pcount;
12182 ++ meta_tp->retrans_stamp = 0;
12183 ++
12184 ++ if (!fully_acked)
12185 ++ break;
12186 ++
12187 ++ tcp_unlink_write_queue(skb, meta_sk);
12188 ++
12189 ++ if (mptcp_is_data_fin(skb)) {
12190 ++ struct sock *sk_it;
12191 ++
12192 ++ /* DATA_FIN has been acknowledged - now we can close
12193 ++ * the subflows
12194 ++ */
12195 ++ mptcp_for_each_sk(mpcb, sk_it) {
12196 ++ unsigned long delay = 0;
12197 ++
12198 ++ /* If we are the passive closer, don't trigger
12199 ++ * subflow-fin until the subflow has been finned
12200 ++ * by the peer - thus we add a delay.
12201 ++ */
12202 ++ if (mpcb->passive_close &&
12203 ++ sk_it->sk_state == TCP_ESTABLISHED)
12204 ++ delay = inet_csk(sk_it)->icsk_rto << 3;
12205 ++
12206 ++ mptcp_sub_close(sk_it, delay);
12207 ++ }
12208 ++ }
12209 ++ sk_wmem_free_skb(meta_sk, skb);
12210 ++ }
12211 ++ /* Remove acknowledged data from the reinject queue */
12212 ++ skb_queue_walk_safe(&mpcb->reinject_queue, skb, tmp) {
12213 ++ if (before(meta_tp->snd_una, TCP_SKB_CB(skb)->end_seq)) {
12214 ++ if (tcp_skb_pcount(skb) == 1 ||
12215 ++ !after(meta_tp->snd_una, TCP_SKB_CB(skb)->seq))
12216 ++ break;
12217 ++
12218 ++ mptcp_tso_acked_reinject(meta_sk, skb);
12219 ++ break;
12220 ++ }
12221 ++
12222 ++ __skb_unlink(skb, &mpcb->reinject_queue);
12223 ++ __kfree_skb(skb);
12224 ++ }
12225 ++
12226 ++ if (likely(between(meta_tp->snd_up, prior_snd_una, meta_tp->snd_una)))
12227 ++ meta_tp->snd_up = meta_tp->snd_una;
12228 ++
12229 ++ if (acked) {
12230 ++ tcp_rearm_rto(meta_sk);
12231 ++ /* Normally this is done in tcp_try_undo_loss - but MPTCP
12232 ++ * does not call this function.
12233 ++ */
12234 ++ inet_csk(meta_sk)->icsk_retransmits = 0;
12235 ++ }
12236 ++}
12237 ++
12238 ++/* Inspired by tcp_rcv_state_process */
12239 ++static int mptcp_rcv_state_process(struct sock *meta_sk, struct sock *sk,
12240 ++ const struct sk_buff *skb, u32 data_seq,
12241 ++ u16 data_len)
12242 ++{
12243 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
12244 ++ const struct tcphdr *th = tcp_hdr(skb);
12245 ++
12246 ++ /* State-machine handling if FIN has been enqueued and it has
12247 ++ * been acked (snd_una == write_seq) - it's important that this
12248 ++ * runs after sk_wmem_free_skb because otherwise
12249 ++ * sk_forward_alloc is wrong upon inet_csk_destroy_sock()
12250 ++ */
12251 ++ switch (meta_sk->sk_state) {
12252 ++ case TCP_FIN_WAIT1: {
12253 ++ struct dst_entry *dst;
12254 ++ int tmo;
12255 ++
12256 ++ if (meta_tp->snd_una != meta_tp->write_seq)
12257 ++ break;
12258 ++
12259 ++ tcp_set_state(meta_sk, TCP_FIN_WAIT2);
12260 ++ meta_sk->sk_shutdown |= SEND_SHUTDOWN;
12261 ++
12262 ++ dst = __sk_dst_get(sk);
12263 ++ if (dst)
12264 ++ dst_confirm(dst);
12265 ++
12266 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
12267 ++ /* Wake up lingering close() */
12268 ++ meta_sk->sk_state_change(meta_sk);
12269 ++ break;
12270 ++ }
12271 ++
12272 ++ if (meta_tp->linger2 < 0 ||
12273 ++ (data_len &&
12274 ++ after(data_seq + data_len - (mptcp_is_data_fin2(skb, tp) ? 1 : 0),
12275 ++ meta_tp->rcv_nxt))) {
12276 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12277 ++ tcp_done(meta_sk);
12278 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12279 ++ return 1;
12280 ++ }
12281 ++
12282 ++ tmo = tcp_fin_time(meta_sk);
12283 ++ if (tmo > TCP_TIMEWAIT_LEN) {
12284 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo - TCP_TIMEWAIT_LEN);
12285 ++ } else if (mptcp_is_data_fin2(skb, tp) || sock_owned_by_user(meta_sk)) {
12286 ++ /* Bad case. We could lose such FIN otherwise.
12287 ++ * It is not a big problem, but it looks confusing
12288 ++ * and not so rare event. We still can lose it now,
12289 ++ * if it spins in bh_lock_sock(), but it is really
12290 ++ * marginal case.
12291 ++ */
12292 ++ inet_csk_reset_keepalive_timer(meta_sk, tmo);
12293 ++ } else {
12294 ++ meta_tp->ops->time_wait(meta_sk, TCP_FIN_WAIT2, tmo);
12295 ++ }
12296 ++ break;
12297 ++ }
12298 ++ case TCP_CLOSING:
12299 ++ case TCP_LAST_ACK:
12300 ++ if (meta_tp->snd_una == meta_tp->write_seq) {
12301 ++ tcp_done(meta_sk);
12302 ++ return 1;
12303 ++ }
12304 ++ break;
12305 ++ }
12306 ++
12307 ++ /* step 7: process the segment text */
12308 ++ switch (meta_sk->sk_state) {
12309 ++ case TCP_FIN_WAIT1:
12310 ++ case TCP_FIN_WAIT2:
12311 ++ /* RFC 793 says to queue data in these states,
12312 ++ * RFC 1122 says we MUST send a reset.
12313 ++ * BSD 4.4 also does reset.
12314 ++ */
12315 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
12316 ++ if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
12317 ++ after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt) &&
12318 ++ !mptcp_is_data_fin2(skb, tp)) {
12319 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPABORTONDATA);
12320 ++ mptcp_send_active_reset(meta_sk, GFP_ATOMIC);
12321 ++ tcp_reset(meta_sk);
12322 ++ return 1;
12323 ++ }
12324 ++ }
12325 ++ break;
12326 ++ }
12327 ++
12328 ++ return 0;
12329 ++}
12330 ++
12331 ++/**
12332 ++ * @return:
12333 ++ * i) 1: Everything's fine.
12334 ++ * ii) -1: A reset has been sent on the subflow - csum-failure
12335 ++ * iii) 0: csum-failure but no reset sent, because it's the last subflow.
12336 ++ * Last packet should not be destroyed by the caller because it has
12337 ++ * been done here.
12338 ++ */
12339 ++static int mptcp_verif_dss_csum(struct sock *sk)
12340 ++{
12341 ++ struct tcp_sock *tp = tcp_sk(sk);
12342 ++ struct sk_buff *tmp, *tmp1, *last = NULL;
12343 ++ __wsum csum_tcp = 0; /* cumulative checksum of pld + mptcp-header */
12344 ++ int ans = 1, overflowed = 0, offset = 0, dss_csum_added = 0;
12345 ++ int iter = 0;
12346 ++
12347 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp, tmp1) {
12348 ++ unsigned int csum_len;
12349 ++
12350 ++ if (before(tp->mptcp->map_subseq + tp->mptcp->map_data_len, TCP_SKB_CB(tmp)->end_seq))
12351 ++ /* Mapping ends in the middle of the packet -
12352 ++ * csum only these bytes
12353 ++ */
12354 ++ csum_len = tp->mptcp->map_subseq + tp->mptcp->map_data_len - TCP_SKB_CB(tmp)->seq;
12355 ++ else
12356 ++ csum_len = tmp->len;
12357 ++
12358 ++ offset = 0;
12359 ++ if (overflowed) {
12360 ++ char first_word[4];
12361 ++ first_word[0] = 0;
12362 ++ first_word[1] = 0;
12363 ++ first_word[2] = 0;
12364 ++ first_word[3] = *(tmp->data);
12365 ++ csum_tcp = csum_partial(first_word, 4, csum_tcp);
12366 ++ offset = 1;
12367 ++ csum_len--;
12368 ++ overflowed = 0;
12369 ++ }
12370 ++
12371 ++ csum_tcp = skb_checksum(tmp, offset, csum_len, csum_tcp);
12372 ++
12373 ++ /* Was it on an odd-length? Then we have to merge the next byte
12374 ++ * correctly (see above)
12375 ++ */
12376 ++ if (csum_len != (csum_len & (~1)))
12377 ++ overflowed = 1;
12378 ++
12379 ++ if (mptcp_is_data_seq(tmp) && !dss_csum_added) {
12380 ++ __be32 data_seq = htonl((u32)(tp->mptcp->map_data_seq >> 32));
12381 ++
12382 ++ /* If a 64-bit dss is present, we increase the offset
12383 ++ * by 4 bytes, as the high-order 32 bits will be added
12384 ++ * in the final csum_partial-call.
12385 ++ */
12386 ++ u32 offset = skb_transport_offset(tmp) +
12387 ++ TCP_SKB_CB(tmp)->dss_off;
12388 ++ if (TCP_SKB_CB(tmp)->mptcp_flags & MPTCPHDR_SEQ64_SET)
12389 ++ offset += 4;
12390 ++
12391 ++ csum_tcp = skb_checksum(tmp, offset,
12392 ++ MPTCP_SUB_LEN_SEQ_CSUM,
12393 ++ csum_tcp);
12394 ++
12395 ++ csum_tcp = csum_partial(&data_seq,
12396 ++ sizeof(data_seq), csum_tcp);
12397 ++
12398 ++ dss_csum_added = 1; /* Just do it once */
12399 ++ }
12400 ++ last = tmp;
12401 ++ iter++;
12402 ++
12403 ++ if (!skb_queue_is_last(&sk->sk_receive_queue, tmp) &&
12404 ++ !before(TCP_SKB_CB(tmp1)->seq,
12405 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12406 ++ break;
12407 ++ }
12408 ++
12409 ++ /* Now, checksum must be 0 */
12410 ++ if (unlikely(csum_fold(csum_tcp))) {
12411 ++ pr_err("%s csum is wrong: %#x data_seq %u dss_csum_added %d overflowed %d iterations %d\n",
12412 ++ __func__, csum_fold(csum_tcp), TCP_SKB_CB(last)->seq,
12413 ++ dss_csum_added, overflowed, iter);
12414 ++
12415 ++ tp->mptcp->send_mp_fail = 1;
12416 ++
12417 ++ /* map_data_seq is the data-seq number of the
12418 ++ * mapping we are currently checking
12419 ++ */
12420 ++ tp->mpcb->csum_cutoff_seq = tp->mptcp->map_data_seq;
12421 ++
12422 ++ if (tp->mpcb->cnt_subflows > 1) {
12423 ++ mptcp_send_reset(sk);
12424 ++ ans = -1;
12425 ++ } else {
12426 ++ tp->mpcb->send_infinite_mapping = 1;
12427 ++
12428 ++ /* Need to purge the rcv-queue as it's no longer valid */
12429 ++ while ((tmp = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
12430 ++ tp->copied_seq = TCP_SKB_CB(tmp)->end_seq;
12431 ++ kfree_skb(tmp);
12432 ++ }
12433 ++
12434 ++ ans = 0;
12435 ++ }
12436 ++ }
12437 ++
12438 ++ return ans;
12439 ++}
12440 ++
12441 ++static inline void mptcp_prepare_skb(struct sk_buff *skb,
12442 ++ const struct sock *sk)
12443 ++{
12444 ++ const struct tcp_sock *tp = tcp_sk(sk);
12445 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12446 ++ u32 inc = 0;
12447 ++
12448 ++ /* If skb is the end of this mapping (end is always at mapping-boundary
12449 ++ * thanks to the splitting/trimming), then we need to increase
12450 ++ * data-end-seq by 1 if this here is a data-fin.
12451 ++ *
12452 ++ * We need to do -1 because end_seq includes the subflow-FIN.
12453 ++ */
12454 ++ if (tp->mptcp->map_data_fin &&
12455 ++ (tcb->end_seq - (tcp_hdr(skb)->fin ? 1 : 0)) ==
12456 ++ (tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12457 ++ inc = 1;
12458 ++
12459 ++ /* We manually set the fin-flag if it is a data-fin. For easy
12460 ++ * processing in tcp_recvmsg.
12461 ++ */
12462 ++ tcp_hdr(skb)->fin = 1;
12463 ++ } else {
12464 ++ /* We may have a subflow-fin with data but without data-fin */
12465 ++ tcp_hdr(skb)->fin = 0;
12466 ++ }
12467 ++
12468 ++ /* Adapt data-seq's to the packet itself. We kinda transform the
12469 ++ * dss-mapping to a per-packet granularity. This is necessary to
12470 ++ * correctly handle overlapping mappings coming from different
12471 ++ * subflows. Otherwise it would be a complete mess.
12472 ++ */
12473 ++ tcb->seq = ((u32)tp->mptcp->map_data_seq) + tcb->seq - tp->mptcp->map_subseq;
12474 ++ tcb->end_seq = tcb->seq + skb->len + inc;
12475 ++}
12476 ++
12477 ++/**
12478 ++ * @return: 1 if the segment has been eaten and can be suppressed,
12479 ++ * otherwise 0.
12480 ++ */
12481 ++static inline int mptcp_direct_copy(const struct sk_buff *skb,
12482 ++ struct sock *meta_sk)
12483 ++{
12484 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
12485 ++ int chunk = min_t(unsigned int, skb->len, meta_tp->ucopy.len);
12486 ++ int eaten = 0;
12487 ++
12488 ++ __set_current_state(TASK_RUNNING);
12489 ++
12490 ++ local_bh_enable();
12491 ++ if (!skb_copy_datagram_iovec(skb, 0, meta_tp->ucopy.iov, chunk)) {
12492 ++ meta_tp->ucopy.len -= chunk;
12493 ++ meta_tp->copied_seq += chunk;
12494 ++ eaten = (chunk == skb->len);
12495 ++ tcp_rcv_space_adjust(meta_sk);
12496 ++ }
12497 ++ local_bh_disable();
12498 ++ return eaten;
12499 ++}
12500 ++
12501 ++static inline void mptcp_reset_mapping(struct tcp_sock *tp)
12502 ++{
12503 ++ tp->mptcp->map_data_len = 0;
12504 ++ tp->mptcp->map_data_seq = 0;
12505 ++ tp->mptcp->map_subseq = 0;
12506 ++ tp->mptcp->map_data_fin = 0;
12507 ++ tp->mptcp->mapping_present = 0;
12508 ++}
12509 ++
12510 ++/* The DSS-mapping received on the sk only covers the second half of the skb
12511 ++ * (cut at seq). We trim the head from the skb.
12512 ++ * Data will be freed upon kfree().
12513 ++ *
12514 ++ * Inspired by tcp_trim_head().
12515 ++ */
12516 ++static void mptcp_skb_trim_head(struct sk_buff *skb, struct sock *sk, u32 seq)
12517 ++{
12518 ++ int len = seq - TCP_SKB_CB(skb)->seq;
12519 ++ u32 new_seq = TCP_SKB_CB(skb)->seq + len;
12520 ++
12521 ++ if (len < skb_headlen(skb))
12522 ++ __skb_pull(skb, len);
12523 ++ else
12524 ++ __pskb_trim_head(skb, len - skb_headlen(skb));
12525 ++
12526 ++ TCP_SKB_CB(skb)->seq = new_seq;
12527 ++
12528 ++ skb->truesize -= len;
12529 ++ atomic_sub(len, &sk->sk_rmem_alloc);
12530 ++ sk_mem_uncharge(sk, len);
12531 ++}
12532 ++
12533 ++/* The DSS-mapping received on the sk only covers the first half of the skb
12534 ++ * (cut at seq). We create a second skb (@return), and queue it in the rcv-queue
12535 ++ * as further packets may resolve the mapping of the second half of data.
12536 ++ *
12537 ++ * Inspired by tcp_fragment().
12538 ++ */
12539 ++static int mptcp_skb_split_tail(struct sk_buff *skb, struct sock *sk, u32 seq)
12540 ++{
12541 ++ struct sk_buff *buff;
12542 ++ int nsize;
12543 ++ int nlen, len;
12544 ++
12545 ++ len = seq - TCP_SKB_CB(skb)->seq;
12546 ++ nsize = skb_headlen(skb) - len + tcp_sk(sk)->tcp_header_len;
12547 ++ if (nsize < 0)
12548 ++ nsize = 0;
12549 ++
12550 ++ /* Get a new skb... force flag on. */
12551 ++ buff = alloc_skb(nsize, GFP_ATOMIC);
12552 ++ if (buff == NULL)
12553 ++ return -ENOMEM;
12554 ++
12555 ++ skb_reserve(buff, tcp_sk(sk)->tcp_header_len);
12556 ++ skb_reset_transport_header(buff);
12557 ++
12558 ++ tcp_hdr(buff)->fin = tcp_hdr(skb)->fin;
12559 ++ tcp_hdr(skb)->fin = 0;
12560 ++
12561 ++ /* We absolutely need to call skb_set_owner_r before refreshing the
12562 ++ * truesize of buff, otherwise the moved data would be accounted twice.
12563 ++ */
12564 ++ skb_set_owner_r(buff, sk);
12565 ++ nlen = skb->len - len - nsize;
12566 ++ buff->truesize += nlen;
12567 ++ skb->truesize -= nlen;
12568 ++
12569 ++ /* Correct the sequence numbers. */
12570 ++ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
12571 ++ TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
12572 ++ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
12573 ++
12574 ++ skb_split(skb, buff, len);
12575 ++
12576 ++ __skb_queue_after(&sk->sk_receive_queue, skb, buff);
12577 ++
12578 ++ return 0;
12579 ++}
12580 ++
12581 ++/* @return: 0 everything is fine. Just continue processing
12582 ++ * 1 subflow is broken stop everything
12583 ++ * -1 this packet was broken - continue with the next one.
12584 ++ */
12585 ++static int mptcp_prevalidate_skb(struct sock *sk, struct sk_buff *skb)
12586 ++{
12587 ++ struct tcp_sock *tp = tcp_sk(sk);
12588 ++
12589 ++ /* If we are in infinite mode, the subflow-fin is in fact a data-fin. */
12590 ++ if (!skb->len && tcp_hdr(skb)->fin && !mptcp_is_data_fin(skb) &&
12591 ++ !tp->mpcb->infinite_mapping_rcv) {
12592 ++ /* Remove a pure subflow-fin from the queue and increase
12593 ++ * copied_seq.
12594 ++ */
12595 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12596 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12597 ++ __kfree_skb(skb);
12598 ++ return -1;
12599 ++ }
12600 ++
12601 ++ /* If we are not yet fully established and do not know the mapping for
12602 ++ * this segment, this path has to fall back to the infinite mapping or be torn down.
12603 ++ */
12604 ++ if (!tp->mptcp->fully_established && !mptcp_is_data_seq(skb) &&
12605 ++ !tp->mptcp->mapping_present && !tp->mpcb->infinite_mapping_rcv) {
12606 ++ pr_err("%s %#x will fallback - pi %d from %pS, seq %u\n",
12607 ++ __func__, tp->mpcb->mptcp_loc_token,
12608 ++ tp->mptcp->path_index, __builtin_return_address(0),
12609 ++ TCP_SKB_CB(skb)->seq);
12610 ++
12611 ++ if (!is_master_tp(tp)) {
12612 ++ mptcp_send_reset(sk);
12613 ++ return 1;
12614 ++ }
12615 ++
12616 ++ tp->mpcb->infinite_mapping_snd = 1;
12617 ++ tp->mpcb->infinite_mapping_rcv = 1;
12618 ++ /* We do a seamless fallback and should not send an infinite mapping. */
12619 ++ tp->mpcb->send_infinite_mapping = 0;
12620 ++ tp->mptcp->fully_established = 1;
12621 ++ }
12622 ++
12623 ++ /* Receiver-side becomes fully established when a whole rcv-window has
12624 ++ * been received without the need to fallback due to the previous
12625 ++ * condition.
12626 ++ */
12627 ++ if (!tp->mptcp->fully_established) {
12628 ++ tp->mptcp->init_rcv_wnd -= skb->len;
12629 ++ if (tp->mptcp->init_rcv_wnd < 0)
12630 ++ mptcp_become_fully_estab(sk);
12631 ++ }
12632 ++
12633 ++ return 0;
12634 ++}
12635 ++
12636 ++/* @return: 0 everything is fine. Just continue processing
12637 ++ * 1 subflow is broken stop everything
12638 ++ * -1 this packet was broken - continue with the next one.
12639 ++ */
12640 ++static int mptcp_detect_mapping(struct sock *sk, struct sk_buff *skb)
12641 ++{
12642 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12643 ++ struct mptcp_cb *mpcb = tp->mpcb;
12644 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
12645 ++ u32 *ptr;
12646 ++ u32 data_seq, sub_seq, data_len, tcp_end_seq;
12647 ++
12648 ++ /* If we are in infinite-mapping-mode, the subflow is guaranteed to be
12649 ++ * in-order at the data-level. Thus data-seq-numbers can be inferred
12650 ++ * from what is expected at the data-level.
12651 ++ */
12652 ++ if (mpcb->infinite_mapping_rcv) {
12653 ++ tp->mptcp->map_data_seq = mptcp_get_rcv_nxt_64(meta_tp);
12654 ++ tp->mptcp->map_subseq = tcb->seq;
12655 ++ tp->mptcp->map_data_len = skb->len;
12656 ++ tp->mptcp->map_data_fin = tcp_hdr(skb)->fin;
12657 ++ tp->mptcp->mapping_present = 1;
12658 ++ return 0;
12659 ++ }
12660 ++
12661 ++ /* No mapping here? Exit - it is either already set or still on its way */
12662 ++ if (!mptcp_is_data_seq(skb)) {
12663 ++ /* Too many packets without a mapping - this subflow is broken */
12664 ++ if (!tp->mptcp->mapping_present &&
12665 ++ tp->rcv_nxt - tp->copied_seq > 65536) {
12666 ++ mptcp_send_reset(sk);
12667 ++ return 1;
12668 ++ }
12669 ++
12670 ++ return 0;
12671 ++ }
12672 ++
12673 ++ ptr = mptcp_skb_set_data_seq(skb, &data_seq, mpcb);
12674 ++ ptr++;
12675 ++ sub_seq = get_unaligned_be32(ptr) + tp->mptcp->rcv_isn;
12676 ++ ptr++;
12677 ++ data_len = get_unaligned_be16(ptr);
12678 ++
12679 ++ /* If it's an empty skb with DATA_FIN, sub_seq must get fixed.
12680 ++ * The draft sets it to 0, but we really would like to have the
12681 ++ * real value, to make handling easier later in this
12682 ++ * function.
12683 ++ */
12684 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
12685 ++ sub_seq = TCP_SKB_CB(skb)->seq;
12686 ++
12687 ++ /* If there is already a mapping - we check if it maps with the current
12688 ++ * one. If not - we reset.
12689 ++ */
12690 ++ if (tp->mptcp->mapping_present &&
12691 ++ (data_seq != (u32)tp->mptcp->map_data_seq ||
12692 ++ sub_seq != tp->mptcp->map_subseq ||
12693 ++ data_len != tp->mptcp->map_data_len + tp->mptcp->map_data_fin ||
12694 ++ mptcp_is_data_fin(skb) != tp->mptcp->map_data_fin)) {
12695 ++ /* Mapping in packet is different from what we want */
12696 ++ pr_err("%s Mappings do not match!\n", __func__);
12697 ++ pr_err("%s dseq %u mdseq %u, sseq %u msseq %u dlen %u mdlen %u dfin %d mdfin %d\n",
12698 ++ __func__, data_seq, (u32)tp->mptcp->map_data_seq,
12699 ++ sub_seq, tp->mptcp->map_subseq, data_len,
12700 ++ tp->mptcp->map_data_len, mptcp_is_data_fin(skb),
12701 ++ tp->mptcp->map_data_fin);
12702 ++ mptcp_send_reset(sk);
12703 ++ return 1;
12704 ++ }
12705 ++
12706 ++ /* If the previous check was good, the current mapping is valid and we exit. */
12707 ++ if (tp->mptcp->mapping_present)
12708 ++ return 0;
12709 ++
12710 ++ /* Mapping not yet set on this subflow - we set it here! */
12711 ++
12712 ++ if (!data_len) {
12713 ++ mpcb->infinite_mapping_rcv = 1;
12714 ++ tp->mptcp->fully_established = 1;
12715 ++ /* We need to repeat mp_fail's until the sender fell
12716 ++ * back to infinite-mapping - here we stop repeating it.
12717 ++ */
12718 ++ tp->mptcp->send_mp_fail = 0;
12719 ++
12720 ++ /* We have to fixup data_len - it must be the same as skb->len */
12721 ++ data_len = skb->len + (mptcp_is_data_fin(skb) ? 1 : 0);
12722 ++ sub_seq = tcb->seq;
12723 ++
12724 ++ /* TODO kill all other subflows than this one */
12725 ++ /* data_seq and so on are set correctly */
12726 ++
12727 ++ /* At this point, the meta-ofo-queue has to be emptied,
12728 ++ * as the following data is guaranteed to be in-order at
12729 ++ * the data and subflow-level
12730 ++ */
12731 ++ mptcp_purge_ofo_queue(meta_tp);
12732 ++ }
12733 ++
12734 ++ /* We are sending mp-fail's and thus are in fallback mode.
12735 ++ * Ignore packets which do not announce the fallback and still
12736 ++ * want to provide a mapping.
12737 ++ */
12738 ++ if (tp->mptcp->send_mp_fail) {
12739 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12740 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12741 ++ __kfree_skb(skb);
12742 ++ return -1;
12743 ++ }
12744 ++
12745 ++ /* FIN increased the mapping-length by 1 */
12746 ++ if (mptcp_is_data_fin(skb))
12747 ++ data_len--;
12748 ++
12749 ++ /* Subflow-sequences of the packet must
12750 ++ * (at least partially) be part of the DSS-mapping's
12751 ++ * subflow-sequence-space.
12752 ++ *
12753 ++ * Basically the mapping is not valid, if either of the
12754 ++ * following conditions is true:
12755 ++ *
12756 ++ * 1. It's not a data_fin and
12757 ++ * MPTCP-sub_seq >= TCP-end_seq
12758 ++ *
12759 ++ * 2. It's a data_fin and TCP-end_seq > TCP-seq and
12760 ++ * MPTCP-sub_seq >= TCP-end_seq
12761 ++ *
12762 ++ * The previous two can be merged into:
12763 ++ * TCP-end_seq > TCP-seq and MPTCP-sub_seq >= TCP-end_seq
12764 ++ * Because if it's not a data-fin, TCP-end_seq > TCP-seq
12765 ++ *
12766 ++ * 3. It's a data_fin and skb->len == 0 and
12767 ++ * MPTCP-sub_seq > TCP-end_seq
12768 ++ *
12769 ++ * 4. It's not a data_fin and TCP-end_seq > TCP-seq and
12770 ++ * MPTCP-sub_seq + MPTCP-data_len <= TCP-seq
12771 ++ *
12772 ++ * 5. MPTCP-sub_seq is prior to what we already copied (copied_seq)
12773 ++ */
12774 ++
12775 ++ /* subflow-fin is not part of the mapping - ignore it here ! */
12776 ++ tcp_end_seq = tcb->end_seq - tcp_hdr(skb)->fin;
12777 ++ if ((!before(sub_seq, tcb->end_seq) && after(tcp_end_seq, tcb->seq)) ||
12778 ++ (mptcp_is_data_fin(skb) && skb->len == 0 && after(sub_seq, tcb->end_seq)) ||
12779 ++ (!after(sub_seq + data_len, tcb->seq) && after(tcp_end_seq, tcb->seq)) ||
12780 ++ before(sub_seq, tp->copied_seq)) {
12781 ++ /* The packet's subflow-sequences differ from what is in the
12782 ++ * packet's dss-mapping. The peer is misbehaving - reset
12783 ++ */
12784 ++ pr_err("%s Packet's mapping does not map to the DSS sub_seq %u "
12785 ++ "end_seq %u, tcp_end_seq %u seq %u dfin %u len %u data_len %u"
12786 ++ "copied_seq %u\n", __func__, sub_seq, tcb->end_seq, tcp_end_seq, tcb->seq, mptcp_is_data_fin(skb),
12787 ++ skb->len, data_len, tp->copied_seq);
12788 ++ mptcp_send_reset(sk);
12789 ++ return 1;
12790 ++ }
12791 ++
12792 ++ /* Did the DSS have 64-bit seqnums? */
12793 ++ if (!(tcb->mptcp_flags & MPTCPHDR_SEQ64_SET)) {
12794 ++ /* Wrapped around? */
12795 ++ if (unlikely(after(data_seq, meta_tp->rcv_nxt) && data_seq < meta_tp->rcv_nxt)) {
12796 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, !mpcb->rcv_hiseq_index, data_seq);
12797 ++ } else {
12798 ++ /* Else, access the default high-order bits */
12799 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index, data_seq);
12800 ++ }
12801 ++ } else {
12802 ++ tp->mptcp->map_data_seq = mptcp_get_data_seq_64(mpcb, (tcb->mptcp_flags & MPTCPHDR_SEQ64_INDEX) ? 1 : 0, data_seq);
12803 ++
12804 ++ if (unlikely(tcb->mptcp_flags & MPTCPHDR_SEQ64_OFO)) {
12805 ++ /* We make sure that the data_seq is invalid.
12806 ++ * It will be dropped later.
12807 ++ */
12808 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12809 ++ tp->mptcp->map_data_seq += 0xFFFFFFFF;
12810 ++ }
12811 ++ }
12812 ++
12813 ++ tp->mptcp->map_data_len = data_len;
12814 ++ tp->mptcp->map_subseq = sub_seq;
12815 ++ tp->mptcp->map_data_fin = mptcp_is_data_fin(skb) ? 1 : 0;
12816 ++ tp->mptcp->mapping_present = 1;
12817 ++
12818 ++ return 0;
12819 ++}
12820 ++
12821 ++/* Similar to tcp_sequence(...) */
12822 ++static inline bool mptcp_sequence(const struct tcp_sock *meta_tp,
12823 ++ u64 data_seq, u64 end_data_seq)
12824 ++{
12825 ++ const struct mptcp_cb *mpcb = meta_tp->mpcb;
12826 ++ u64 rcv_wup64;
12827 ++
12828 ++ /* Wrap-around? */
12829 ++ if (meta_tp->rcv_wup > meta_tp->rcv_nxt) {
12830 ++ rcv_wup64 = ((u64)(mpcb->rcv_high_order[mpcb->rcv_hiseq_index] - 1) << 32) |
12831 ++ meta_tp->rcv_wup;
12832 ++ } else {
12833 ++ rcv_wup64 = mptcp_get_data_seq_64(mpcb, mpcb->rcv_hiseq_index,
12834 ++ meta_tp->rcv_wup);
12835 ++ }
12836 ++
12837 ++ return !before64(end_data_seq, rcv_wup64) &&
12838 ++ !after64(data_seq, mptcp_get_rcv_nxt_64(meta_tp) + tcp_receive_window(meta_tp));
12839 ++}
12840 ++
12841 ++/* @return: 0 everything is fine. Just continue processing
12842 ++ * -1 this packet was broken - continue with the next one.
12843 ++ */
12844 ++static int mptcp_validate_mapping(struct sock *sk, struct sk_buff *skb)
12845 ++{
12846 ++ struct tcp_sock *tp = tcp_sk(sk);
12847 ++ struct sk_buff *tmp, *tmp1;
12848 ++ u32 tcp_end_seq;
12849 ++
12850 ++ if (!tp->mptcp->mapping_present)
12851 ++ return 0;
12852 ++
12853 ++ /* either, the new skb gave us the mapping and the first segment
12854 ++ * in the sub-rcv-queue has to be trimmed ...
12855 ++ */
12856 ++ tmp = skb_peek(&sk->sk_receive_queue);
12857 ++ if (before(TCP_SKB_CB(tmp)->seq, tp->mptcp->map_subseq) &&
12858 ++ after(TCP_SKB_CB(tmp)->end_seq, tp->mptcp->map_subseq))
12859 ++ mptcp_skb_trim_head(tmp, sk, tp->mptcp->map_subseq);
12860 ++
12861 ++ /* ... or the new skb (tail) has to be split at the end. */
12862 ++ tcp_end_seq = TCP_SKB_CB(skb)->end_seq - (tcp_hdr(skb)->fin ? 1 : 0);
12863 ++ if (after(tcp_end_seq, tp->mptcp->map_subseq + tp->mptcp->map_data_len)) {
12864 ++ u32 seq = tp->mptcp->map_subseq + tp->mptcp->map_data_len;
12865 ++ if (mptcp_skb_split_tail(skb, sk, seq)) { /* Allocation failed */
12866 ++ /* TODO : maybe handle this here better.
12867 ++ * We now just force meta-retransmission.
12868 ++ */
12869 ++ tp->copied_seq = TCP_SKB_CB(skb)->end_seq;
12870 ++ __skb_unlink(skb, &sk->sk_receive_queue);
12871 ++ __kfree_skb(skb);
12872 ++ return -1;
12873 ++ }
12874 ++ }
12875 ++
12876 ++ /* Now, remove old sk_buff's from the receive-queue.
12877 ++ * This may happen if the mapping has been lost for these segments and
12878 ++ * the next mapping has already been received.
12879 ++ */
12880 ++ if (before(TCP_SKB_CB(skb_peek(&sk->sk_receive_queue))->seq, tp->mptcp->map_subseq)) {
12881 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12882 ++ if (!before(TCP_SKB_CB(tmp1)->seq, tp->mptcp->map_subseq))
12883 ++ break;
12884 ++
12885 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12886 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12887 ++
12888 ++ /* Impossible that we could free the skb here, because its
12889 ++ * mapping is known to be valid from previous checks
12890 ++ */
12891 ++ __kfree_skb(tmp1);
12892 ++ }
12893 ++ }
12894 ++
12895 ++ return 0;
12896 ++}
12897 ++
12898 ++/* @return: 0 everything is fine. Just continue processing
12899 ++ * 1 subflow is broken stop everything
12900 ++ * -1 this mapping has been put in the meta-receive-queue
12901 ++ * -2 this mapping has been eaten by the application
12902 ++ */
12903 ++static int mptcp_queue_skb(struct sock *sk)
12904 ++{
12905 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
12906 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
12907 ++ struct mptcp_cb *mpcb = tp->mpcb;
12908 ++ struct sk_buff *tmp, *tmp1;
12909 ++ u64 rcv_nxt64 = mptcp_get_rcv_nxt_64(meta_tp);
12910 ++ bool data_queued = false;
12911 ++
12912 ++ /* Have we not yet received the full mapping? */
12913 ++ if (!tp->mptcp->mapping_present ||
12914 ++ before(tp->rcv_nxt, tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12915 ++ return 0;
12916 ++
12917 ++ /* Is this an overlapping mapping? rcv_nxt >= end_data_seq
12918 ++ * OR
12919 ++ * This mapping is out of window
12920 ++ */
12921 ++ if (!before64(rcv_nxt64, tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin) ||
12922 ++ !mptcp_sequence(meta_tp, tp->mptcp->map_data_seq,
12923 ++ tp->mptcp->map_data_seq + tp->mptcp->map_data_len + tp->mptcp->map_data_fin)) {
12924 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12925 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12926 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12927 ++ __kfree_skb(tmp1);
12928 ++
12929 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12930 ++ !before(TCP_SKB_CB(tmp)->seq,
12931 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12932 ++ break;
12933 ++ }
12934 ++
12935 ++ mptcp_reset_mapping(tp);
12936 ++
12937 ++ return -1;
12938 ++ }
12939 ++
12940 ++ /* Record it, because we want to send our data_fin on the same path */
12941 ++ if (tp->mptcp->map_data_fin) {
12942 ++ mpcb->dfin_path_index = tp->mptcp->path_index;
12943 ++ mpcb->dfin_combined = !!(sk->sk_shutdown & RCV_SHUTDOWN);
12944 ++ }
12945 ++
12946 ++ /* Verify the checksum */
12947 ++ if (mpcb->dss_csum && !mpcb->infinite_mapping_rcv) {
12948 ++ int ret = mptcp_verif_dss_csum(sk);
12949 ++
12950 ++ if (ret <= 0) {
12951 ++ mptcp_reset_mapping(tp);
12952 ++ return 1;
12953 ++ }
12954 ++ }
12955 ++
12956 ++ if (before64(rcv_nxt64, tp->mptcp->map_data_seq)) {
12957 ++		/* Segments have to go to the meta-ofo-queue */
12958 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12959 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12960 ++ mptcp_prepare_skb(tmp1, sk);
12961 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12962 ++ /* MUST be done here, because fragstolen may be true later.
12963 ++ * Then, kfree_skb_partial will not account the memory.
12964 ++ */
12965 ++ skb_orphan(tmp1);
12966 ++
12967 ++ if (!mpcb->in_time_wait) /* In time-wait, do not receive data */
12968 ++ mptcp_add_meta_ofo_queue(meta_sk, tmp1, sk);
12969 ++ else
12970 ++ __kfree_skb(tmp1);
12971 ++
12972 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
12973 ++ !before(TCP_SKB_CB(tmp)->seq,
12974 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
12975 ++ break;
12976 ++ }
12977 ++ tcp_enter_quickack_mode(sk);
12978 ++ } else {
12979 ++ /* Ready for the meta-rcv-queue */
12980 ++ skb_queue_walk_safe(&sk->sk_receive_queue, tmp1, tmp) {
12981 ++ int eaten = 0;
12982 ++ const bool copied_early = false;
12983 ++ bool fragstolen = false;
12984 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
12985 ++
12986 ++ tp->copied_seq = TCP_SKB_CB(tmp1)->end_seq;
12987 ++ mptcp_prepare_skb(tmp1, sk);
12988 ++ __skb_unlink(tmp1, &sk->sk_receive_queue);
12989 ++ /* MUST be done here, because fragstolen may be true.
12990 ++ * Then, kfree_skb_partial will not account the memory.
12991 ++ */
12992 ++ skb_orphan(tmp1);
12993 ++
12994 ++ /* This segment has already been received */
12995 ++ if (!after(TCP_SKB_CB(tmp1)->end_seq, meta_tp->rcv_nxt)) {
12996 ++ __kfree_skb(tmp1);
12997 ++ goto next;
12998 ++ }
12999 ++
13000 ++#ifdef CONFIG_NET_DMA
13001 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13002 ++ meta_tp->ucopy.task == current &&
13003 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13004 ++ tmp1->len <= meta_tp->ucopy.len &&
13005 ++ sock_owned_by_user(meta_sk) &&
13006 ++ tcp_dma_try_early_copy(meta_sk, tmp1, 0)) {
13007 ++ copied_early = true;
13008 ++ eaten = 1;
13009 ++ }
13010 ++#endif
13011 ++
13012 ++ /* Is direct copy possible ? */
13013 ++ if (TCP_SKB_CB(tmp1)->seq == meta_tp->rcv_nxt &&
13014 ++ meta_tp->ucopy.task == current &&
13015 ++ meta_tp->copied_seq == meta_tp->rcv_nxt &&
13016 ++ meta_tp->ucopy.len && sock_owned_by_user(meta_sk) &&
13017 ++ !copied_early)
13018 ++ eaten = mptcp_direct_copy(tmp1, meta_sk);
13019 ++
13020 ++ if (mpcb->in_time_wait) /* In time-wait, do not receive data */
13021 ++ eaten = 1;
13022 ++
13023 ++ if (!eaten)
13024 ++ eaten = tcp_queue_rcv(meta_sk, tmp1, 0, &fragstolen);
13025 ++
13026 ++ meta_tp->rcv_nxt = TCP_SKB_CB(tmp1)->end_seq;
13027 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
13028 ++
13029 ++#ifdef CONFIG_NET_DMA
13030 ++ if (copied_early)
13031 ++ meta_tp->cleanup_rbuf(meta_sk, tmp1->len);
13032 ++#endif
13033 ++
13034 ++ if (tcp_hdr(tmp1)->fin && !mpcb->in_time_wait)
13035 ++ mptcp_fin(meta_sk);
13036 ++
13037 ++ /* Check if this fills a gap in the ofo queue */
13038 ++ if (!skb_queue_empty(&meta_tp->out_of_order_queue))
13039 ++ mptcp_ofo_queue(meta_sk);
13040 ++
13041 ++#ifdef CONFIG_NET_DMA
13042 ++ if (copied_early)
13043 ++ __skb_queue_tail(&meta_sk->sk_async_wait_queue,
13044 ++ tmp1);
13045 ++ else
13046 ++#endif
13047 ++ if (eaten)
13048 ++ kfree_skb_partial(tmp1, fragstolen);
13049 ++
13050 ++ data_queued = true;
13051 ++next:
13052 ++ if (!skb_queue_empty(&sk->sk_receive_queue) &&
13053 ++ !before(TCP_SKB_CB(tmp)->seq,
13054 ++ tp->mptcp->map_subseq + tp->mptcp->map_data_len))
13055 ++ break;
13056 ++ }
13057 ++ }
13058 ++
13059 ++ inet_csk(meta_sk)->icsk_ack.lrcvtime = tcp_time_stamp;
13060 ++ mptcp_reset_mapping(tp);
13061 ++
13062 ++ return data_queued ? -1 : -2;
13063 ++}
13064 ++
13065 ++void mptcp_data_ready(struct sock *sk)
13066 ++{
13067 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13068 ++ struct sk_buff *skb, *tmp;
13069 ++ int queued = 0;
13070 ++
13071 ++ /* restart before the check, because mptcp_fin might have changed the
13072 ++ * state.
13073 ++ */
13074 ++restart:
13075 ++ /* If the meta cannot receive data, there is no point in pushing data.
13076 ++ * If we are in time-wait, we may still be waiting for the final FIN.
13077 ++ * So, we should proceed with the processing.
13078 ++ */
13079 ++ if (!mptcp_sk_can_recv(meta_sk) && !tcp_sk(sk)->mpcb->in_time_wait) {
13080 ++ skb_queue_purge(&sk->sk_receive_queue);
13081 ++ tcp_sk(sk)->copied_seq = tcp_sk(sk)->rcv_nxt;
13082 ++ goto exit;
13083 ++ }
13084 ++
13085 ++ /* Iterate over all segments, detect their mapping (if we don't have
13086 ++ * one yet), validate them and push everything one level higher.
13087 ++ */
13088 ++ skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
13089 ++ int ret;
13090 ++ /* Pre-validation - e.g., early fallback */
13091 ++ ret = mptcp_prevalidate_skb(sk, skb);
13092 ++ if (ret < 0)
13093 ++ goto restart;
13094 ++ else if (ret > 0)
13095 ++ break;
13096 ++
13097 ++ /* Set the current mapping */
13098 ++ ret = mptcp_detect_mapping(sk, skb);
13099 ++ if (ret < 0)
13100 ++ goto restart;
13101 ++ else if (ret > 0)
13102 ++ break;
13103 ++
13104 ++ /* Validation */
13105 ++ if (mptcp_validate_mapping(sk, skb) < 0)
13106 ++ goto restart;
13107 ++
13108 ++ /* Push a level higher */
13109 ++ ret = mptcp_queue_skb(sk);
13110 ++ if (ret < 0) {
13111 ++ if (ret == -1)
13112 ++ queued = ret;
13113 ++ goto restart;
13114 ++ } else if (ret == 0) {
13115 ++ continue;
13116 ++ } else { /* ret == 1 */
13117 ++ break;
13118 ++ }
13119 ++ }
13120 ++
13121 ++exit:
13122 ++ if (tcp_sk(sk)->close_it) {
13123 ++ tcp_send_ack(sk);
13124 ++ tcp_sk(sk)->ops->time_wait(sk, TCP_TIME_WAIT, 0);
13125 ++ }
13126 ++
13127 ++ if (queued == -1 && !sock_flag(meta_sk, SOCK_DEAD))
13128 ++ meta_sk->sk_data_ready(meta_sk);
13129 ++}
13130 ++
13131 ++
13132 ++int mptcp_check_req(struct sk_buff *skb, struct net *net)
13133 ++{
13134 ++ const struct tcphdr *th = tcp_hdr(skb);
13135 ++ struct sock *meta_sk = NULL;
13136 ++
13137 ++ /* MPTCP structures not initialized */
13138 ++ if (mptcp_init_failed)
13139 ++ return 0;
13140 ++
13141 ++ if (skb->protocol == htons(ETH_P_IP))
13142 ++ meta_sk = mptcp_v4_search_req(th->source, ip_hdr(skb)->saddr,
13143 ++ ip_hdr(skb)->daddr, net);
13144 ++#if IS_ENABLED(CONFIG_IPV6)
13145 ++ else /* IPv6 */
13146 ++ meta_sk = mptcp_v6_search_req(th->source, &ipv6_hdr(skb)->saddr,
13147 ++ &ipv6_hdr(skb)->daddr, net);
13148 ++#endif /* CONFIG_IPV6 */
13149 ++
13150 ++ if (!meta_sk)
13151 ++ return 0;
13152 ++
13153 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13154 ++
13155 ++ bh_lock_sock_nested(meta_sk);
13156 ++ if (sock_owned_by_user(meta_sk)) {
13157 ++ skb->sk = meta_sk;
13158 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13159 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13160 ++ bh_unlock_sock(meta_sk);
13161 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13162 ++ sock_put(meta_sk); /* Taken by mptcp_search_req */
13163 ++ kfree_skb(skb);
13164 ++ return 1;
13165 ++ }
13166 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13167 ++ tcp_v4_do_rcv(meta_sk, skb);
13168 ++#if IS_ENABLED(CONFIG_IPV6)
13169 ++ } else { /* IPv6 */
13170 ++ tcp_v6_do_rcv(meta_sk, skb);
13171 ++#endif /* CONFIG_IPV6 */
13172 ++ }
13173 ++ bh_unlock_sock(meta_sk);
13174 ++ sock_put(meta_sk); /* Taken by mptcp_vX_search_req */
13175 ++ return 1;
13176 ++}
13177 ++
13178 ++struct mp_join *mptcp_find_join(const struct sk_buff *skb)
13179 ++{
13180 ++ const struct tcphdr *th = tcp_hdr(skb);
13181 ++ unsigned char *ptr;
13182 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13183 ++
13184 ++ /* Jump through the options to check whether JOIN is there */
13185 ++ ptr = (unsigned char *)(th + 1);
13186 ++ while (length > 0) {
13187 ++ int opcode = *ptr++;
13188 ++ int opsize;
13189 ++
13190 ++ switch (opcode) {
13191 ++ case TCPOPT_EOL:
13192 ++ return NULL;
13193 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13194 ++ length--;
13195 ++ continue;
13196 ++ default:
13197 ++ opsize = *ptr++;
13198 ++ if (opsize < 2) /* "silly options" */
13199 ++ return NULL;
13200 ++ if (opsize > length)
13201 ++ return NULL; /* don't parse partial options */
13202 ++ if (opcode == TCPOPT_MPTCP &&
13203 ++ ((struct mptcp_option *)(ptr - 2))->sub == MPTCP_SUB_JOIN) {
13204 ++ return (struct mp_join *)(ptr - 2);
13205 ++ }
13206 ++ ptr += opsize - 2;
13207 ++ length -= opsize;
13208 ++ }
13209 ++ }
13210 ++ return NULL;
13211 ++}
13212 ++
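++/* A simplified standalone sketch of the option walk done by mptcp_find_join()
++ * above: TCP options are kind/length encoded, with EOL and NOP as the only
++ * single-byte kinds, and MPTCP lives under option kind 30 with MP_JOIN as
++ * subtype 1 (RFC 6824). The helper below uses assumed names and is an
++ * illustration, not part of the patch.
++ */
++#if 0	/* illustration only - compile separately as plain C */
++#include <stddef.h>
++#include <stdint.h>
++
++static const uint8_t *find_tcp_option(const uint8_t *opt, int len, uint8_t kind)
++{
++	while (len > 0) {
++		uint8_t opcode = opt[0];
++
++		if (opcode == 0)		/* TCPOPT_EOL */
++			return NULL;
++		if (opcode == 1) {		/* TCPOPT_NOP */
++			opt++;
++			len--;
++			continue;
++		}
++		if (len < 2 || opt[1] < 2 || opt[1] > len)
++			return NULL;		/* malformed option list */
++		if (opcode == kind)
++			return opt;		/* points at kind, length, payload */
++		len -= opt[1];
++		opt += opt[1];
++	}
++	return NULL;
++}
++#endif
++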
13213 ++int mptcp_lookup_join(struct sk_buff *skb, struct inet_timewait_sock *tw)
13214 ++{
13215 ++ const struct mptcp_cb *mpcb;
13216 ++ struct sock *meta_sk;
13217 ++ u32 token;
13218 ++ bool meta_v4;
13219 ++ struct mp_join *join_opt = mptcp_find_join(skb);
13220 ++ if (!join_opt)
13221 ++ return 0;
13222 ++
13223 ++ /* MPTCP structures were not initialized, so return error */
13224 ++ if (mptcp_init_failed)
13225 ++ return -1;
13226 ++
13227 ++ token = join_opt->u.syn.token;
13228 ++ meta_sk = mptcp_hash_find(dev_net(skb_dst(skb)->dev), token);
13229 ++ if (!meta_sk) {
13230 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13231 ++ return -1;
13232 ++ }
13233 ++
13234 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13235 ++ if (meta_v4) {
13236 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13237 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13238 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13239 ++ return -1;
13240 ++ }
13241 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13242 ++ inet6_sk(meta_sk)->ipv6only) {
13243 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13244 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13245 ++ return -1;
13246 ++ }
13247 ++
13248 ++ mpcb = tcp_sk(meta_sk)->mpcb;
13249 ++ if (mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping) {
13250 ++ /* We are in fallback-mode on the reception-side -
13251 ++ * no new subflows!
13252 ++ */
13253 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13254 ++ return -1;
13255 ++ }
13256 ++
13257 ++ /* Coming from time-wait-sock processing in tcp_v4_rcv.
13258 ++ * We have to deschedule it before continuing, because otherwise
13259 ++ * mptcp_v4_do_rcv will hit again on it inside tcp_v4_hnd_req.
13260 ++ */
13261 ++ if (tw) {
13262 ++ inet_twsk_deschedule(tw, &tcp_death_row);
13263 ++ inet_twsk_put(tw);
13264 ++ }
13265 ++
13266 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13267 ++ /* OK, this is a new syn/join, let's create a new open request and
13268 ++ * send syn+ack
13269 ++ */
13270 ++ bh_lock_sock_nested(meta_sk);
13271 ++ if (sock_owned_by_user(meta_sk)) {
13272 ++ skb->sk = meta_sk;
13273 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13274 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf))) {
13275 ++ bh_unlock_sock(meta_sk);
13276 ++ NET_INC_STATS_BH(sock_net(meta_sk),
13277 ++ LINUX_MIB_TCPBACKLOGDROP);
13278 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13279 ++ kfree_skb(skb);
13280 ++ return 1;
13281 ++ }
13282 ++ } else if (skb->protocol == htons(ETH_P_IP)) {
13283 ++ tcp_v4_do_rcv(meta_sk, skb);
13284 ++#if IS_ENABLED(CONFIG_IPV6)
13285 ++ } else {
13286 ++ tcp_v6_do_rcv(meta_sk, skb);
13287 ++#endif /* CONFIG_IPV6 */
13288 ++ }
13289 ++ bh_unlock_sock(meta_sk);
13290 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13291 ++ return 1;
13292 ++}
13293 ++
13294 ++int mptcp_do_join_short(struct sk_buff *skb,
13295 ++ const struct mptcp_options_received *mopt,
13296 ++ struct net *net)
13297 ++{
13298 ++ struct sock *meta_sk;
13299 ++ u32 token;
13300 ++ bool meta_v4;
13301 ++
13302 ++ token = mopt->mptcp_rem_token;
13303 ++ meta_sk = mptcp_hash_find(net, token);
13304 ++ if (!meta_sk) {
13305 ++ mptcp_debug("%s:mpcb not found:%x\n", __func__, token);
13306 ++ return -1;
13307 ++ }
13308 ++
13309 ++ meta_v4 = meta_sk->sk_family == AF_INET;
13310 ++ if (meta_v4) {
13311 ++ if (skb->protocol == htons(ETH_P_IPV6)) {
13312 ++ mptcp_debug("SYN+MP_JOIN with IPV6 address on pure IPV4 meta\n");
13313 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13314 ++ return -1;
13315 ++ }
13316 ++ } else if (skb->protocol == htons(ETH_P_IP) &&
13317 ++ inet6_sk(meta_sk)->ipv6only) {
13318 ++ mptcp_debug("SYN+MP_JOIN with IPV4 address on IPV6_V6ONLY meta\n");
13319 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13320 ++ return -1;
13321 ++ }
13322 ++
13323 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_JOIN;
13324 ++
13325 ++ /* OK, this is a new syn/join, let's create a new open request and
13326 ++ * send syn+ack
13327 ++ */
13328 ++ bh_lock_sock(meta_sk);
13329 ++
13330 ++ /* This check is also done in mptcp_vX_do_rcv. But, there we cannot
13331 ++	 * call tcp_vX_send_reset, because we already hold two socket-locks.
13332 ++ * (the listener and the meta from above)
13333 ++ *
13334 ++ * And the send-reset will try to take yet another one (ip_send_reply).
13335 ++ * Thus, we propagate the reset up to tcp_rcv_state_process.
13336 ++ */
13337 ++ if (tcp_sk(meta_sk)->mpcb->infinite_mapping_rcv ||
13338 ++ tcp_sk(meta_sk)->mpcb->send_infinite_mapping ||
13339 ++ meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table) {
13340 ++ bh_unlock_sock(meta_sk);
13341 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13342 ++ return -1;
13343 ++ }
13344 ++
13345 ++ if (sock_owned_by_user(meta_sk)) {
13346 ++ skb->sk = meta_sk;
13347 ++ if (unlikely(sk_add_backlog(meta_sk, skb,
13348 ++ meta_sk->sk_rcvbuf + meta_sk->sk_sndbuf)))
13349 ++ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
13350 ++ else
13351 ++ /* Must make sure that upper layers won't free the
13352 ++ * skb if it is added to the backlog-queue.
13353 ++ */
13354 ++ skb_get(skb);
13355 ++ } else {
13356 ++ /* mptcp_v4_do_rcv tries to free the skb - we prevent this, as
13357 ++ * the skb will finally be freed by tcp_v4_do_rcv (where we are
13358 ++ * coming from)
13359 ++ */
13360 ++ skb_get(skb);
13361 ++ if (skb->protocol == htons(ETH_P_IP)) {
13362 ++ tcp_v4_do_rcv(meta_sk, skb);
13363 ++#if IS_ENABLED(CONFIG_IPV6)
13364 ++ } else { /* IPv6 */
13365 ++ tcp_v6_do_rcv(meta_sk, skb);
13366 ++#endif /* CONFIG_IPV6 */
13367 ++ }
13368 ++ }
13369 ++
13370 ++ bh_unlock_sock(meta_sk);
13371 ++ sock_put(meta_sk); /* Taken by mptcp_hash_find */
13372 ++ return 0;
13373 ++}
13374 ++
13375 ++/**
13376 ++ * Equivalent of tcp_fin() for MPTCP
13377 ++ * Can be called only when the FIN is validly part
13378 ++ * of the data seqnum space, not earlier while holes remain.
13379 ++ */
13380 ++void mptcp_fin(struct sock *meta_sk)
13381 ++{
13382 ++ struct sock *sk = NULL, *sk_it;
13383 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13384 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
13385 ++
13386 ++ mptcp_for_each_sk(mpcb, sk_it) {
13387 ++ if (tcp_sk(sk_it)->mptcp->path_index == mpcb->dfin_path_index) {
13388 ++ sk = sk_it;
13389 ++ break;
13390 ++ }
13391 ++ }
13392 ++
13393 ++ if (!sk || sk->sk_state == TCP_CLOSE)
13394 ++ sk = mptcp_select_ack_sock(meta_sk);
13395 ++
13396 ++ inet_csk_schedule_ack(sk);
13397 ++
13398 ++ meta_sk->sk_shutdown |= RCV_SHUTDOWN;
13399 ++ sock_set_flag(meta_sk, SOCK_DONE);
13400 ++
13401 ++ switch (meta_sk->sk_state) {
13402 ++ case TCP_SYN_RECV:
13403 ++ case TCP_ESTABLISHED:
13404 ++ /* Move to CLOSE_WAIT */
13405 ++ tcp_set_state(meta_sk, TCP_CLOSE_WAIT);
13406 ++ inet_csk(sk)->icsk_ack.pingpong = 1;
13407 ++ break;
13408 ++
13409 ++ case TCP_CLOSE_WAIT:
13410 ++ case TCP_CLOSING:
13411 ++ /* Received a retransmission of the FIN, do
13412 ++ * nothing.
13413 ++ */
13414 ++ break;
13415 ++ case TCP_LAST_ACK:
13416 ++ /* RFC793: Remain in the LAST-ACK state. */
13417 ++ break;
13418 ++
13419 ++ case TCP_FIN_WAIT1:
13420 ++ /* This case occurs when a simultaneous close
13421 ++ * happens, we must ack the received FIN and
13422 ++ * enter the CLOSING state.
13423 ++ */
13424 ++ tcp_send_ack(sk);
13425 ++ tcp_set_state(meta_sk, TCP_CLOSING);
13426 ++ break;
13427 ++ case TCP_FIN_WAIT2:
13428 ++ /* Received a FIN -- send ACK and enter TIME_WAIT. */
13429 ++ tcp_send_ack(sk);
13430 ++ meta_tp->ops->time_wait(meta_sk, TCP_TIME_WAIT, 0);
13431 ++ break;
13432 ++ default:
13433 ++ /* Only TCP_LISTEN and TCP_CLOSE are left, in these
13434 ++ * cases we should never reach this piece of code.
13435 ++ */
13436 ++ pr_err("%s: Impossible, meta_sk->sk_state=%d\n", __func__,
13437 ++ meta_sk->sk_state);
13438 ++ break;
13439 ++ }
13440 ++
13441 ++	/* It _is_ possible that we have something out-of-order _after_ the FIN.
13442 ++ * Probably, we should reset in this case. For now drop them.
13443 ++ */
13444 ++ mptcp_purge_ofo_queue(meta_tp);
13445 ++ sk_mem_reclaim(meta_sk);
13446 ++
13447 ++ if (!sock_flag(meta_sk, SOCK_DEAD)) {
13448 ++ meta_sk->sk_state_change(meta_sk);
13449 ++
13450 ++ /* Do not send POLL_HUP for half duplex close. */
13451 ++ if (meta_sk->sk_shutdown == SHUTDOWN_MASK ||
13452 ++ meta_sk->sk_state == TCP_CLOSE)
13453 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_HUP);
13454 ++ else
13455 ++ sk_wake_async(meta_sk, SOCK_WAKE_WAITD, POLL_IN);
13456 ++ }
13457 ++
13458 ++ return;
13459 ++}
13460 ++
13461 ++static void mptcp_xmit_retransmit_queue(struct sock *meta_sk)
13462 ++{
13463 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
13464 ++ struct sk_buff *skb;
13465 ++
13466 ++ if (!meta_tp->packets_out)
13467 ++ return;
13468 ++
13469 ++ tcp_for_write_queue(skb, meta_sk) {
13470 ++ if (skb == tcp_send_head(meta_sk))
13471 ++ break;
13472 ++
13473 ++ if (mptcp_retransmit_skb(meta_sk, skb))
13474 ++ return;
13475 ++
13476 ++ if (skb == tcp_write_queue_head(meta_sk))
13477 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
13478 ++ inet_csk(meta_sk)->icsk_rto,
13479 ++ TCP_RTO_MAX);
13480 ++ }
13481 ++}
13482 ++
13483 ++/* Handle the DATA_ACK */
13484 ++static void mptcp_data_ack(struct sock *sk, const struct sk_buff *skb)
13485 ++{
13486 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
13487 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *tp = tcp_sk(sk);
13488 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13489 ++ u32 prior_snd_una = meta_tp->snd_una;
13490 ++ int prior_packets;
13491 ++ u32 nwin, data_ack, data_seq;
13492 ++ u16 data_len = 0;
13493 ++
13494 ++ /* A valid packet came in - subflow is operational again */
13495 ++ tp->pf = 0;
13496 ++
13497 ++ /* Even if there is no data-ack, we stop retransmitting.
13498 ++ * Except if this is a SYN/ACK. Then it is just a retransmission
13499 ++ */
13500 ++ if (tp->mptcp->pre_established && !tcp_hdr(skb)->syn) {
13501 ++ tp->mptcp->pre_established = 0;
13502 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
13503 ++ }
13504 ++
13505 ++ /* If we are in infinite mapping mode, rx_opt.data_ack has been
13506 ++ * set by mptcp_clean_rtx_infinite.
13507 ++ */
13508 ++ if (!(tcb->mptcp_flags & MPTCPHDR_ACK) && !tp->mpcb->infinite_mapping_snd)
13509 ++ goto exit;
13510 ++
13511 ++ data_ack = tp->mptcp->rx_opt.data_ack;
13512 ++
13513 ++ if (unlikely(!tp->mptcp->fully_established) &&
13514 ++ tp->mptcp->snt_isn + 1 != TCP_SKB_CB(skb)->ack_seq)
13515 ++		/* As soon as an ack on the subflow (not one merely acking the SYN, i.e. snt_isn + 1)
13516 ++ * includes a data-ack, we are fully established
13517 ++ */
13518 ++ mptcp_become_fully_estab(sk);
13519 ++
13520 ++ /* Get the data_seq */
13521 ++ if (mptcp_is_data_seq(skb)) {
13522 ++ data_seq = tp->mptcp->rx_opt.data_seq;
13523 ++ data_len = tp->mptcp->rx_opt.data_len;
13524 ++ } else {
13525 ++ data_seq = meta_tp->snd_wl1;
13526 ++ }
13527 ++
13528 ++ /* If the ack is older than previous acks
13529 ++ * then we can probably ignore it.
13530 ++ */
13531 ++ if (before(data_ack, prior_snd_una))
13532 ++ goto exit;
13533 ++
13534 ++ /* If the ack includes data we haven't sent yet, discard
13535 ++ * this segment (RFC793 Section 3.9).
13536 ++ */
13537 ++ if (after(data_ack, meta_tp->snd_nxt))
13538 ++ goto exit;
13539 ++
13540 ++ /*** Now, update the window - inspired by tcp_ack_update_window ***/
13541 ++ nwin = ntohs(tcp_hdr(skb)->window);
13542 ++
13543 ++ if (likely(!tcp_hdr(skb)->syn))
13544 ++ nwin <<= tp->rx_opt.snd_wscale;
13545 ++
13546 ++ if (tcp_may_update_window(meta_tp, data_ack, data_seq, nwin)) {
13547 ++ tcp_update_wl(meta_tp, data_seq);
13548 ++
13549 ++ /* Draft v09, Section 3.3.5:
13550 ++ * [...] It should only update its local receive window values
13551 ++ * when the largest sequence number allowed (i.e. DATA_ACK +
13552 ++ * receive window) increases. [...]
13553 ++ */
13554 ++ if (meta_tp->snd_wnd != nwin &&
13555 ++ !before(data_ack + nwin, tcp_wnd_end(meta_tp))) {
13556 ++ meta_tp->snd_wnd = nwin;
13557 ++
13558 ++ if (nwin > meta_tp->max_window)
13559 ++ meta_tp->max_window = nwin;
13560 ++ }
13561 ++ }
13562 ++ /*** Done, update the window ***/
13563 ++
13564 ++ /* We passed data and got it acked, remove any soft error
13565 ++ * log. Something worked...
13566 ++ */
13567 ++ sk->sk_err_soft = 0;
13568 ++ inet_csk(meta_sk)->icsk_probes_out = 0;
13569 ++ meta_tp->rcv_tstamp = tcp_time_stamp;
13570 ++ prior_packets = meta_tp->packets_out;
13571 ++ if (!prior_packets)
13572 ++ goto no_queue;
13573 ++
13574 ++ meta_tp->snd_una = data_ack;
13575 ++
13576 ++ mptcp_clean_rtx_queue(meta_sk, prior_snd_una);
13577 ++
13578 ++ /* We are in loss-state, and something got acked, retransmit the whole
13579 ++ * queue now!
13580 ++ */
13581 ++ if (inet_csk(meta_sk)->icsk_ca_state == TCP_CA_Loss &&
13582 ++ after(data_ack, prior_snd_una)) {
13583 ++ mptcp_xmit_retransmit_queue(meta_sk);
13584 ++ inet_csk(meta_sk)->icsk_ca_state = TCP_CA_Open;
13585 ++ }
13586 ++
13587 ++ /* Simplified version of tcp_new_space, because the snd-buffer
13588 ++ * is handled by all the subflows.
13589 ++ */
13590 ++ if (sock_flag(meta_sk, SOCK_QUEUE_SHRUNK)) {
13591 ++ sock_reset_flag(meta_sk, SOCK_QUEUE_SHRUNK);
13592 ++ if (meta_sk->sk_socket &&
13593 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
13594 ++ meta_sk->sk_write_space(meta_sk);
13595 ++ }
13596 ++
13597 ++ if (meta_sk->sk_state != TCP_ESTABLISHED &&
13598 ++ mptcp_rcv_state_process(meta_sk, sk, skb, data_seq, data_len))
13599 ++ return;
13600 ++
13601 ++exit:
13602 ++ mptcp_push_pending_frames(meta_sk);
13603 ++
13604 ++ return;
13605 ++
13606 ++no_queue:
13607 ++ if (tcp_send_head(meta_sk))
13608 ++ tcp_ack_probe(meta_sk);
13609 ++
13610 ++ mptcp_push_pending_frames(meta_sk);
13611 ++
13612 ++ return;
13613 ++}
13614 ++
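++/* A worked example (made-up numbers) of the window rule applied in
++ * mptcp_data_ack() above: the shared receive window is only adopted when the
++ * resulting right edge, DATA_ACK + window, does not fall behind the current
++ * right edge snd_una + snd_wnd. Assuming snd_una = 1000 and snd_wnd = 2000
++ * (right edge 3000):
++ *   DATA_ACK 1500, advertised window 1600 -> right edge 3100 >= 3000: adopt
++ *   DATA_ACK 1200, advertised window 1500 -> right edge 2700 <  3000: keep
++ */
++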
13615 ++void mptcp_clean_rtx_infinite(const struct sk_buff *skb, struct sock *sk)
13616 ++{
13617 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = tcp_sk(mptcp_meta_sk(sk));
13618 ++
13619 ++ if (!tp->mpcb->infinite_mapping_snd)
13620 ++ return;
13621 ++
13622 ++	/* The difference between the two write_seq's represents the offset
13623 ++	 * between the data-sequence and the subflow-sequence space. In
13624 ++	 * infinite-mapping mode the two advance in lock-step, so this offset is fixed.
13625 ++ *
13626 ++ * Thus, from this difference we can infer the meta snd_una.
13627 ++ */
13628 ++ tp->mptcp->rx_opt.data_ack = meta_tp->snd_nxt - tp->snd_nxt +
13629 ++ tp->snd_una;
13630 ++
13631 ++ mptcp_data_ack(sk, skb);
13632 ++}
13633 ++
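++/* A worked example (made-up numbers) for the inference in
++ * mptcp_clean_rtx_infinite() above. In infinite-mapping mode the data-sequence
++ * and subflow-sequence spaces advance in lock-step, so the subflow ack can be
++ * translated into a meta-level DATA_ACK:
++ *
++ *   meta_tp->snd_nxt = 10000, tp->snd_nxt = 4000, tp->snd_una = 3500
++ *   offset   = 10000 - 4000 = 6000
++ *   data_ack =  3500 + 6000 = 9500   (the implied meta-level snd_una)
++ */
++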
13634 ++/**** static functions used by mptcp_parse_options */
13635 ++
13636 ++static void mptcp_send_reset_rem_id(const struct mptcp_cb *mpcb, u8 rem_id)
13637 ++{
13638 ++ struct sock *sk_it, *tmpsk;
13639 ++
13640 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
13641 ++ if (tcp_sk(sk_it)->mptcp->rem_id == rem_id) {
13642 ++ mptcp_reinject_data(sk_it, 0);
13643 ++ sk_it->sk_err = ECONNRESET;
13644 ++ if (tcp_need_reset(sk_it->sk_state))
13645 ++ tcp_sk(sk_it)->ops->send_active_reset(sk_it,
13646 ++ GFP_ATOMIC);
13647 ++ mptcp_sub_force_close(sk_it);
13648 ++ }
13649 ++ }
13650 ++}
13651 ++
13652 ++void mptcp_parse_options(const uint8_t *ptr, int opsize,
13653 ++ struct mptcp_options_received *mopt,
13654 ++ const struct sk_buff *skb)
13655 ++{
13656 ++ const struct mptcp_option *mp_opt = (struct mptcp_option *)ptr;
13657 ++
13658 ++ /* If the socket is mp-capable we would have a mopt. */
13659 ++ if (!mopt)
13660 ++ return;
13661 ++
13662 ++ switch (mp_opt->sub) {
13663 ++ case MPTCP_SUB_CAPABLE:
13664 ++ {
13665 ++ const struct mp_capable *mpcapable = (struct mp_capable *)ptr;
13666 ++
13667 ++ if (opsize != MPTCP_SUB_LEN_CAPABLE_SYN &&
13668 ++ opsize != MPTCP_SUB_LEN_CAPABLE_ACK) {
13669 ++ mptcp_debug("%s: mp_capable: bad option size %d\n",
13670 ++ __func__, opsize);
13671 ++ break;
13672 ++ }
13673 ++
13674 ++ if (!sysctl_mptcp_enabled)
13675 ++ break;
13676 ++
13677 ++ /* We only support MPTCP version 0 */
13678 ++ if (mpcapable->ver != 0)
13679 ++ break;
13680 ++
13681 ++ /* MPTCP-RFC 6824:
13682 ++ * "If receiving a message with the 'B' flag set to 1, and this
13683 ++ * is not understood, then this SYN MUST be silently ignored;
13684 ++ */
13685 ++ if (mpcapable->b) {
13686 ++ mopt->drop_me = 1;
13687 ++ break;
13688 ++ }
13689 ++
13690 ++ /* MPTCP-RFC 6824:
13691 ++ * "An implementation that only supports this method MUST set
13692 ++ * bit "H" to 1, and bits "C" through "G" to 0."
13693 ++ */
13694 ++ if (!mpcapable->h)
13695 ++ break;
13696 ++
13697 ++ mopt->saw_mpc = 1;
13698 ++ mopt->dss_csum = sysctl_mptcp_checksum || mpcapable->a;
13699 ++
13700 ++ if (opsize >= MPTCP_SUB_LEN_CAPABLE_SYN)
13701 ++ mopt->mptcp_key = mpcapable->sender_key;
13702 ++
13703 ++ break;
13704 ++ }
13705 ++ case MPTCP_SUB_JOIN:
13706 ++ {
13707 ++ const struct mp_join *mpjoin = (struct mp_join *)ptr;
13708 ++
13709 ++ if (opsize != MPTCP_SUB_LEN_JOIN_SYN &&
13710 ++ opsize != MPTCP_SUB_LEN_JOIN_SYNACK &&
13711 ++ opsize != MPTCP_SUB_LEN_JOIN_ACK) {
13712 ++ mptcp_debug("%s: mp_join: bad option size %d\n",
13713 ++ __func__, opsize);
13714 ++ break;
13715 ++ }
13716 ++
13717 ++ /* saw_mpc must be set, because in tcp_check_req we assume that
13718 ++ * it is set to support falling back to reg. TCP if a rexmitted
13719 ++ * SYN has no MP_CAPABLE or MP_JOIN
13720 ++ */
13721 ++ switch (opsize) {
13722 ++ case MPTCP_SUB_LEN_JOIN_SYN:
13723 ++ mopt->is_mp_join = 1;
13724 ++ mopt->saw_mpc = 1;
13725 ++ mopt->low_prio = mpjoin->b;
13726 ++ mopt->rem_id = mpjoin->addr_id;
13727 ++ mopt->mptcp_rem_token = mpjoin->u.syn.token;
13728 ++ mopt->mptcp_recv_nonce = mpjoin->u.syn.nonce;
13729 ++ break;
13730 ++ case MPTCP_SUB_LEN_JOIN_SYNACK:
13731 ++ mopt->saw_mpc = 1;
13732 ++ mopt->low_prio = mpjoin->b;
13733 ++ mopt->rem_id = mpjoin->addr_id;
13734 ++ mopt->mptcp_recv_tmac = mpjoin->u.synack.mac;
13735 ++ mopt->mptcp_recv_nonce = mpjoin->u.synack.nonce;
13736 ++ break;
13737 ++ case MPTCP_SUB_LEN_JOIN_ACK:
13738 ++ mopt->saw_mpc = 1;
13739 ++ mopt->join_ack = 1;
13740 ++ memcpy(mopt->mptcp_recv_mac, mpjoin->u.ack.mac, 20);
13741 ++ break;
13742 ++ }
13743 ++ break;
13744 ++ }
13745 ++ case MPTCP_SUB_DSS:
13746 ++ {
13747 ++ const struct mp_dss *mdss = (struct mp_dss *)ptr;
13748 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
13749 ++
13750 ++		/* We check opsize for both the csum and the non-csum case,
13751 ++ * because the draft says that the csum SHOULD be ignored if
13752 ++ * it has not been negotiated in the MP_CAPABLE but still is
13753 ++ * present in the data.
13754 ++ *
13755 ++ * It will get ignored later in mptcp_queue_skb.
13756 ++ */
13757 ++ if (opsize != mptcp_sub_len_dss(mdss, 0) &&
13758 ++ opsize != mptcp_sub_len_dss(mdss, 1)) {
13759 ++ mptcp_debug("%s: mp_dss: bad option size %d\n",
13760 ++ __func__, opsize);
13761 ++ break;
13762 ++ }
13763 ++
13764 ++ ptr += 4;
13765 ++
13766 ++ if (mdss->A) {
13767 ++ tcb->mptcp_flags |= MPTCPHDR_ACK;
13768 ++
13769 ++ if (mdss->a) {
13770 ++ mopt->data_ack = (u32) get_unaligned_be64(ptr);
13771 ++ ptr += MPTCP_SUB_LEN_ACK_64;
13772 ++ } else {
13773 ++ mopt->data_ack = get_unaligned_be32(ptr);
13774 ++ ptr += MPTCP_SUB_LEN_ACK;
13775 ++ }
13776 ++ }
13777 ++
13778 ++ tcb->dss_off = (ptr - skb_transport_header(skb));
13779 ++
13780 ++ if (mdss->M) {
13781 ++ if (mdss->m) {
13782 ++ u64 data_seq64 = get_unaligned_be64(ptr);
13783 ++
13784 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ64_SET;
13785 ++ mopt->data_seq = (u32) data_seq64;
13786 ++
13787 ++ ptr += 12; /* 64-bit dseq + subseq */
13788 ++ } else {
13789 ++ mopt->data_seq = get_unaligned_be32(ptr);
13790 ++ ptr += 8; /* 32-bit dseq + subseq */
13791 ++ }
13792 ++ mopt->data_len = get_unaligned_be16(ptr);
13793 ++
13794 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
13795 ++
13796 ++ /* Is a check-sum present? */
13797 ++ if (opsize == mptcp_sub_len_dss(mdss, 1))
13798 ++ tcb->mptcp_flags |= MPTCPHDR_DSS_CSUM;
13799 ++
13800 ++ /* DATA_FIN only possible with DSS-mapping */
13801 ++ if (mdss->F)
13802 ++ tcb->mptcp_flags |= MPTCPHDR_FIN;
13803 ++ }
13804 ++
13805 ++ break;
13806 ++ }
13807 ++ case MPTCP_SUB_ADD_ADDR:
13808 ++ {
13809 ++#if IS_ENABLED(CONFIG_IPV6)
13810 ++ const struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13811 ++
13812 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13813 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
13814 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
13815 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2)) {
13816 ++#else
13817 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
13818 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) {
13819 ++#endif /* CONFIG_IPV6 */
13820 ++ mptcp_debug("%s: mp_add_addr: bad option size %d\n",
13821 ++ __func__, opsize);
13822 ++ break;
13823 ++ }
13824 ++
13825 ++ /* We have to manually parse the options if we got two of them. */
13826 ++ if (mopt->saw_add_addr) {
13827 ++ mopt->more_add_addr = 1;
13828 ++ break;
13829 ++ }
13830 ++ mopt->saw_add_addr = 1;
13831 ++ mopt->add_addr_ptr = ptr;
13832 ++ break;
13833 ++ }
13834 ++ case MPTCP_SUB_REMOVE_ADDR:
13835 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0) {
13836 ++ mptcp_debug("%s: mp_remove_addr: bad option size %d\n",
13837 ++ __func__, opsize);
13838 ++ break;
13839 ++ }
13840 ++
13841 ++ if (mopt->saw_rem_addr) {
13842 ++ mopt->more_rem_addr = 1;
13843 ++ break;
13844 ++ }
13845 ++ mopt->saw_rem_addr = 1;
13846 ++ mopt->rem_addr_ptr = ptr;
13847 ++ break;
13848 ++ case MPTCP_SUB_PRIO:
13849 ++ {
13850 ++ const struct mp_prio *mpprio = (struct mp_prio *)ptr;
13851 ++
13852 ++ if (opsize != MPTCP_SUB_LEN_PRIO &&
13853 ++ opsize != MPTCP_SUB_LEN_PRIO_ADDR) {
13854 ++ mptcp_debug("%s: mp_prio: bad option size %d\n",
13855 ++ __func__, opsize);
13856 ++ break;
13857 ++ }
13858 ++
13859 ++ mopt->saw_low_prio = 1;
13860 ++ mopt->low_prio = mpprio->b;
13861 ++
13862 ++ if (opsize == MPTCP_SUB_LEN_PRIO_ADDR) {
13863 ++ mopt->saw_low_prio = 2;
13864 ++ mopt->prio_addr_id = mpprio->addr_id;
13865 ++ }
13866 ++ break;
13867 ++ }
13868 ++ case MPTCP_SUB_FAIL:
13869 ++ if (opsize != MPTCP_SUB_LEN_FAIL) {
13870 ++ mptcp_debug("%s: mp_fail: bad option size %d\n",
13871 ++ __func__, opsize);
13872 ++ break;
13873 ++ }
13874 ++ mopt->mp_fail = 1;
13875 ++ break;
13876 ++ case MPTCP_SUB_FCLOSE:
13877 ++ if (opsize != MPTCP_SUB_LEN_FCLOSE) {
13878 ++ mptcp_debug("%s: mp_fclose: bad option size %d\n",
13879 ++ __func__, opsize);
13880 ++ break;
13881 ++ }
13882 ++
13883 ++ mopt->mp_fclose = 1;
13884 ++ mopt->mptcp_key = ((struct mp_fclose *)ptr)->key;
13885 ++
13886 ++ break;
13887 ++ default:
13888 ++		mptcp_debug("%s: Received unknown subtype: %d\n",
13889 ++ __func__, mp_opt->sub);
13890 ++ break;
13891 ++ }
13892 ++}
13893 ++
13894 ++/** Parse only MPTCP options */
13895 ++void tcp_parse_mptcp_options(const struct sk_buff *skb,
13896 ++ struct mptcp_options_received *mopt)
13897 ++{
13898 ++ const struct tcphdr *th = tcp_hdr(skb);
13899 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13900 ++ const unsigned char *ptr = (const unsigned char *)(th + 1);
13901 ++
13902 ++ while (length > 0) {
13903 ++ int opcode = *ptr++;
13904 ++ int opsize;
13905 ++
13906 ++ switch (opcode) {
13907 ++ case TCPOPT_EOL:
13908 ++ return;
13909 ++ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
13910 ++ length--;
13911 ++ continue;
13912 ++ default:
13913 ++ opsize = *ptr++;
13914 ++ if (opsize < 2) /* "silly options" */
13915 ++ return;
13916 ++ if (opsize > length)
13917 ++ return; /* don't parse partial options */
13918 ++ if (opcode == TCPOPT_MPTCP)
13919 ++ mptcp_parse_options(ptr - 2, opsize, mopt, skb);
13920 ++ }
13921 ++ ptr += opsize - 2;
13922 ++ length -= opsize;
13923 ++ }
13924 ++}
13925 ++
13926 ++int mptcp_check_rtt(const struct tcp_sock *tp, int time)
13927 ++{
13928 ++ struct mptcp_cb *mpcb = tp->mpcb;
13929 ++ struct sock *sk;
13930 ++ u32 rtt_max = 0;
13931 ++
13932 ++ /* In MPTCP, we take the max delay across all flows,
13933 ++ * in order to take into account meta-reordering buffers.
13934 ++ */
13935 ++ mptcp_for_each_sk(mpcb, sk) {
13936 ++ if (!mptcp_sk_can_recv(sk))
13937 ++ continue;
13938 ++
13939 ++ if (rtt_max < tcp_sk(sk)->rcv_rtt_est.rtt)
13940 ++ rtt_max = tcp_sk(sk)->rcv_rtt_est.rtt;
13941 ++ }
13942 ++ if (time < (rtt_max >> 3) || !rtt_max)
13943 ++ return 1;
13944 ++
13945 ++ return 0;
13946 ++}
13947 ++
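++/* Note on the comparison in mptcp_check_rtt() above: as in mainline
++ * tcp_rcv_space_adjust(), rcv_rtt_est.rtt is kept scaled by 8, so
++ * "rtt_max >> 3" is the un-scaled estimate. Taking the maximum across
++ * subflows makes meta-level receive-buffer autotuning wait at least one RTT
++ * of the slowest subflow - roughly how long data may sit in the meta
++ * reordering queue. Made-up example: stored estimates 80 and 240 on two
++ * subflows give rtt_max = 240, and any measurement interval shorter than
++ * 240 >> 3 = 30 time units makes the function return 1 ("too early").
++ */
++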
13948 ++static void mptcp_handle_add_addr(const unsigned char *ptr, struct sock *sk)
13949 ++{
13950 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
13951 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13952 ++ __be16 port = 0;
13953 ++ union inet_addr addr;
13954 ++ sa_family_t family;
13955 ++
13956 ++ if (mpadd->ipver == 4) {
13957 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR4 + 2)
13958 ++ port = mpadd->u.v4.port;
13959 ++ family = AF_INET;
13960 ++ addr.in = mpadd->u.v4.addr;
13961 ++#if IS_ENABLED(CONFIG_IPV6)
13962 ++ } else if (mpadd->ipver == 6) {
13963 ++ if (mpadd->len == MPTCP_SUB_LEN_ADD_ADDR6 + 2)
13964 ++ port = mpadd->u.v6.port;
13965 ++ family = AF_INET6;
13966 ++ addr.in6 = mpadd->u.v6.addr;
13967 ++#endif /* CONFIG_IPV6 */
13968 ++ } else {
13969 ++ return;
13970 ++ }
13971 ++
13972 ++ if (mpcb->pm_ops->add_raddr)
13973 ++ mpcb->pm_ops->add_raddr(mpcb, &addr, family, port, mpadd->addr_id);
13974 ++}
13975 ++
13976 ++static void mptcp_handle_rem_addr(const unsigned char *ptr, struct sock *sk)
13977 ++{
13978 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
13979 ++ int i;
13980 ++ u8 rem_id;
13981 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
13982 ++
13983 ++ for (i = 0; i <= mprem->len - MPTCP_SUB_LEN_REMOVE_ADDR; i++) {
13984 ++ rem_id = (&mprem->addrs_id)[i];
13985 ++
13986 ++ if (mpcb->pm_ops->rem_raddr)
13987 ++ mpcb->pm_ops->rem_raddr(mpcb, rem_id);
13988 ++ mptcp_send_reset_rem_id(mpcb, rem_id);
13989 ++ }
13990 ++}
13991 ++
13992 ++static void mptcp_parse_addropt(const struct sk_buff *skb, struct sock *sk)
13993 ++{
13994 ++ struct tcphdr *th = tcp_hdr(skb);
13995 ++ unsigned char *ptr;
13996 ++ int length = (th->doff * 4) - sizeof(struct tcphdr);
13997 ++
13998 ++ /* Jump through the options to check whether ADD_ADDR is there */
13999 ++ ptr = (unsigned char *)(th + 1);
14000 ++ while (length > 0) {
14001 ++ int opcode = *ptr++;
14002 ++ int opsize;
14003 ++
14004 ++ switch (opcode) {
14005 ++ case TCPOPT_EOL:
14006 ++ return;
14007 ++ case TCPOPT_NOP:
14008 ++ length--;
14009 ++ continue;
14010 ++ default:
14011 ++ opsize = *ptr++;
14012 ++ if (opsize < 2)
14013 ++ return;
14014 ++ if (opsize > length)
14015 ++ return; /* don't parse partial options */
14016 ++ if (opcode == TCPOPT_MPTCP &&
14017 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_ADD_ADDR) {
14018 ++#if IS_ENABLED(CONFIG_IPV6)
14019 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
14020 ++ if ((mpadd->ipver == 4 && opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14021 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2) ||
14022 ++ (mpadd->ipver == 6 && opsize != MPTCP_SUB_LEN_ADD_ADDR6 &&
14023 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR6 + 2))
14024 ++#else
14025 ++ if (opsize != MPTCP_SUB_LEN_ADD_ADDR4 &&
14026 ++ opsize != MPTCP_SUB_LEN_ADD_ADDR4 + 2)
14027 ++#endif /* CONFIG_IPV6 */
14028 ++ goto cont;
14029 ++
14030 ++ mptcp_handle_add_addr(ptr, sk);
14031 ++ }
14032 ++ if (opcode == TCPOPT_MPTCP &&
14033 ++ ((struct mptcp_option *)ptr)->sub == MPTCP_SUB_REMOVE_ADDR) {
14034 ++ if ((opsize - MPTCP_SUB_LEN_REMOVE_ADDR) < 0)
14035 ++ goto cont;
14036 ++
14037 ++ mptcp_handle_rem_addr(ptr, sk);
14038 ++ }
14039 ++cont:
14040 ++ ptr += opsize - 2;
14041 ++ length -= opsize;
14042 ++ }
14043 ++ }
14044 ++ return;
14045 ++}
14046 ++
14047 ++static inline int mptcp_mp_fail_rcvd(struct sock *sk, const struct tcphdr *th)
14048 ++{
14049 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk)->mptcp;
14050 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14051 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14052 ++
14053 ++ if (unlikely(mptcp->rx_opt.mp_fail)) {
14054 ++ mptcp->rx_opt.mp_fail = 0;
14055 ++
14056 ++ if (!th->rst && !mpcb->infinite_mapping_snd) {
14057 ++ struct sock *sk_it;
14058 ++
14059 ++ mpcb->send_infinite_mapping = 1;
14060 ++ /* We resend everything that has not been acknowledged */
14061 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14062 ++
14063 ++ /* We artificially restart the whole send-queue. Thus,
14064 ++ * it is as if no packets are in flight
14065 ++ */
14066 ++ tcp_sk(meta_sk)->packets_out = 0;
14067 ++
14068 ++ /* If the snd_nxt already wrapped around, we have to
14069 ++ * undo the wrapping, as we are restarting from snd_una
14070 ++ * on.
14071 ++ */
14072 ++ if (tcp_sk(meta_sk)->snd_nxt < tcp_sk(meta_sk)->snd_una) {
14073 ++ mpcb->snd_high_order[mpcb->snd_hiseq_index] -= 2;
14074 ++ mpcb->snd_hiseq_index = mpcb->snd_hiseq_index ? 0 : 1;
14075 ++ }
14076 ++ tcp_sk(meta_sk)->snd_nxt = tcp_sk(meta_sk)->snd_una;
14077 ++
14078 ++ /* Trigger a sending on the meta. */
14079 ++ mptcp_push_pending_frames(meta_sk);
14080 ++
14081 ++ mptcp_for_each_sk(mpcb, sk_it) {
14082 ++ if (sk != sk_it)
14083 ++ mptcp_sub_force_close(sk_it);
14084 ++ }
14085 ++ }
14086 ++
14087 ++ return 0;
14088 ++ }
14089 ++
14090 ++ if (unlikely(mptcp->rx_opt.mp_fclose)) {
14091 ++ struct sock *sk_it, *tmpsk;
14092 ++
14093 ++ mptcp->rx_opt.mp_fclose = 0;
14094 ++ if (mptcp->rx_opt.mptcp_key != mpcb->mptcp_loc_key)
14095 ++ return 0;
14096 ++
14097 ++ if (tcp_need_reset(sk->sk_state))
14098 ++ tcp_sk(sk)->ops->send_active_reset(sk, GFP_ATOMIC);
14099 ++
14100 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk)
14101 ++ mptcp_sub_force_close(sk_it);
14102 ++
14103 ++ tcp_reset(meta_sk);
14104 ++
14105 ++ return 1;
14106 ++ }
14107 ++
14108 ++ return 0;
14109 ++}
14110 ++
14111 ++static inline void mptcp_path_array_check(struct sock *meta_sk)
14112 ++{
14113 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14114 ++
14115 ++ if (unlikely(mpcb->list_rcvd)) {
14116 ++ mpcb->list_rcvd = 0;
14117 ++ if (mpcb->pm_ops->new_remote_address)
14118 ++ mpcb->pm_ops->new_remote_address(meta_sk);
14119 ++ }
14120 ++}
14121 ++
14122 ++int mptcp_handle_options(struct sock *sk, const struct tcphdr *th,
14123 ++ const struct sk_buff *skb)
14124 ++{
14125 ++ struct tcp_sock *tp = tcp_sk(sk);
14126 ++ struct mptcp_options_received *mopt = &tp->mptcp->rx_opt;
14127 ++
14128 ++ if (tp->mpcb->infinite_mapping_rcv || tp->mpcb->infinite_mapping_snd)
14129 ++ return 0;
14130 ++
14131 ++ if (mptcp_mp_fail_rcvd(sk, th))
14132 ++ return 1;
14133 ++
14134 ++ /* RFC 6824, Section 3.3:
14135 ++ * If a checksum is not present when its use has been negotiated, the
14136 ++ * receiver MUST close the subflow with a RST as it is considered broken.
14137 ++ */
14138 ++ if (mptcp_is_data_seq(skb) && tp->mpcb->dss_csum &&
14139 ++ !(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_DSS_CSUM)) {
14140 ++ if (tcp_need_reset(sk->sk_state))
14141 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
14142 ++
14143 ++ mptcp_sub_force_close(sk);
14144 ++ return 1;
14145 ++ }
14146 ++
14147 ++ /* We have to acknowledge retransmissions of the third
14148 ++ * ack.
14149 ++ */
14150 ++ if (mopt->join_ack) {
14151 ++ tcp_send_delayed_ack(sk);
14152 ++ mopt->join_ack = 0;
14153 ++ }
14154 ++
14155 ++ if (mopt->saw_add_addr || mopt->saw_rem_addr) {
14156 ++ if (mopt->more_add_addr || mopt->more_rem_addr) {
14157 ++ mptcp_parse_addropt(skb, sk);
14158 ++ } else {
14159 ++ if (mopt->saw_add_addr)
14160 ++ mptcp_handle_add_addr(mopt->add_addr_ptr, sk);
14161 ++ if (mopt->saw_rem_addr)
14162 ++ mptcp_handle_rem_addr(mopt->rem_addr_ptr, sk);
14163 ++ }
14164 ++
14165 ++ mopt->more_add_addr = 0;
14166 ++ mopt->saw_add_addr = 0;
14167 ++ mopt->more_rem_addr = 0;
14168 ++ mopt->saw_rem_addr = 0;
14169 ++ }
14170 ++ if (mopt->saw_low_prio) {
14171 ++ if (mopt->saw_low_prio == 1) {
14172 ++ tp->mptcp->rcv_low_prio = mopt->low_prio;
14173 ++ } else {
14174 ++ struct sock *sk_it;
14175 ++ mptcp_for_each_sk(tp->mpcb, sk_it) {
14176 ++ struct mptcp_tcp_sock *mptcp = tcp_sk(sk_it)->mptcp;
14177 ++ if (mptcp->rem_id == mopt->prio_addr_id)
14178 ++ mptcp->rcv_low_prio = mopt->low_prio;
14179 ++ }
14180 ++ }
14181 ++ mopt->saw_low_prio = 0;
14182 ++ }
14183 ++
14184 ++ mptcp_data_ack(sk, skb);
14185 ++
14186 ++ mptcp_path_array_check(mptcp_meta_sk(sk));
14187 ++ /* Socket may have been mp_killed by a REMOVE_ADDR */
14188 ++ if (tp->mp_killed)
14189 ++ return 1;
14190 ++
14191 ++ return 0;
14192 ++}
14193 ++
14194 ++/* In case of fastopen, some data can already be in the write queue.
14195 ++ * We need to update the sequence numbers of these segments, as they
14196 ++ * initially carried subflow-level TCP sequence numbers.
14197 ++ */
14198 ++static void mptcp_rcv_synsent_fastopen(struct sock *meta_sk)
14199 ++{
14200 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14201 ++ struct tcp_sock *master_tp = tcp_sk(meta_tp->mpcb->master_sk);
14202 ++ struct sk_buff *skb;
14203 ++ u32 new_mapping = meta_tp->write_seq - master_tp->snd_una;
14204 ++
14205 ++ /* There should only be one skb in write queue: the data not
14206 ++ * acknowledged in the SYN+ACK. In this case, we need to map
14207 ++ * this data to data sequence numbers.
14208 ++ */
14209 ++ skb_queue_walk(&meta_sk->sk_write_queue, skb) {
14210 ++ /* If the server only acknowledges partially the data sent in
14211 ++ * the SYN, we need to trim the acknowledged part because
14212 ++ * we don't want to retransmit this already received data.
14213 ++ * When we reach this point, tcp_ack() has already cleaned up
14214 ++ * fully acked segments. However, tcp trims partially acked
14215 ++ * segments only when retransmitting. Since MPTCP comes into
14216 ++ * play only now, we will fake an initial transmit, and
14217 ++ * retransmit_skb() will not be called. The following fragment
14218 ++ * comes from __tcp_retransmit_skb().
14219 ++ */
14220 ++ if (before(TCP_SKB_CB(skb)->seq, master_tp->snd_una)) {
14221 ++ BUG_ON(before(TCP_SKB_CB(skb)->end_seq,
14222 ++ master_tp->snd_una));
14223 ++			/* tcp_trim_head can only return ENOMEM if the skb is
14224 ++			 * cloned, which is not the case here (see
14225 ++ * tcp_send_syn_data).
14226 ++ */
14227 ++ BUG_ON(tcp_trim_head(meta_sk, skb, master_tp->snd_una -
14228 ++ TCP_SKB_CB(skb)->seq));
14229 ++ }
14230 ++
14231 ++ TCP_SKB_CB(skb)->seq += new_mapping;
14232 ++ TCP_SKB_CB(skb)->end_seq += new_mapping;
14233 ++ }
14234 ++
14235 ++	/* We can advance write_seq by the number of unacknowledged bytes
14236 ++	 * that were mapped in the previous loop.
14237 ++ */
14238 ++ meta_tp->write_seq += master_tp->write_seq - master_tp->snd_una;
14239 ++
14240 ++	/* The packets from the master_sk will be queued to it later.
14241 ++	 * Until that time, its write queue is empty, and
14242 ++	 * write_seq must align with snd_una.
14243 ++ */
14244 ++ master_tp->snd_nxt = master_tp->write_seq = master_tp->snd_una;
14245 ++ master_tp->packets_out = 0;
14246 ++
14247 ++	/* Although this data has already been sent over the subsk,
14248 ++	 * it has never been sent over the meta_sk, so we rewind
14249 ++	 * the send_head so that tcp considers it an initial send
14250 ++ * (instead of retransmit).
14251 ++ */
14252 ++ meta_sk->sk_send_head = tcp_write_queue_head(meta_sk);
14253 ++}
14254 ++
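++/* A made-up example for mptcp_rcv_synsent_fastopen() above: suppose the
++ * client sent 100 bytes of TFO data in the SYN and the server acknowledged
++ * 40 of them in the SYN+ACK. The partially-acked skb in the meta write queue
++ * is first trimmed by 40 bytes; its seq/end_seq are then shifted by
++ * new_mapping = meta_tp->write_seq - master_tp->snd_una into the
++ * data-sequence space, and meta write_seq advances by the 60 bytes that are
++ * still unacknowledged (master_tp->write_seq - master_tp->snd_una).
++ */
++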
14255 ++/* The skptr is needed, because if we become MPTCP-capable, we have to switch
14256 ++ * from meta-socket to master-socket.
14257 ++ *
14258 ++ * @return: 1 - we want to reset this connection
14259 ++ * 2 - we want to discard the received syn/ack
14260 ++ * 0 - everything is fine - continue
14261 ++ */
14262 ++int mptcp_rcv_synsent_state_process(struct sock *sk, struct sock **skptr,
14263 ++ const struct sk_buff *skb,
14264 ++ const struct mptcp_options_received *mopt)
14265 ++{
14266 ++ struct tcp_sock *tp = tcp_sk(sk);
14267 ++
14268 ++ if (mptcp(tp)) {
14269 ++ u8 hash_mac_check[20];
14270 ++ struct mptcp_cb *mpcb = tp->mpcb;
14271 ++
14272 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_rem_key,
14273 ++ (u8 *)&mpcb->mptcp_loc_key,
14274 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14275 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14276 ++ (u32 *)hash_mac_check);
14277 ++ if (memcmp(hash_mac_check,
14278 ++ (char *)&tp->mptcp->rx_opt.mptcp_recv_tmac, 8)) {
14279 ++ mptcp_sub_force_close(sk);
14280 ++ return 1;
14281 ++ }
14282 ++
14283 ++ /* Set this flag in order to postpone data sending
14284 ++ * until the 4th ack arrives.
14285 ++ */
14286 ++ tp->mptcp->pre_established = 1;
14287 ++ tp->mptcp->rcv_low_prio = tp->mptcp->rx_opt.low_prio;
14288 ++
14289 ++ mptcp_hmac_sha1((u8 *)&mpcb->mptcp_loc_key,
14290 ++ (u8 *)&mpcb->mptcp_rem_key,
14291 ++ (u8 *)&tp->mptcp->mptcp_loc_nonce,
14292 ++ (u8 *)&tp->mptcp->rx_opt.mptcp_recv_nonce,
14293 ++ (u32 *)&tp->mptcp->sender_mac[0]);
14294 ++
14295 ++ } else if (mopt->saw_mpc) {
14296 ++ struct sock *meta_sk = sk;
14297 ++
14298 ++ if (mptcp_create_master_sk(sk, mopt->mptcp_key,
14299 ++ ntohs(tcp_hdr(skb)->window)))
14300 ++ return 2;
14301 ++
14302 ++ sk = tcp_sk(sk)->mpcb->master_sk;
14303 ++ *skptr = sk;
14304 ++ tp = tcp_sk(sk);
14305 ++
14306 ++		/* If fastopen was used, data might be in the send queue. We
14307 ++ * need to update their sequence number to MPTCP-level seqno.
14308 ++ * Note that it can happen in rare cases that fastopen_req is
14309 ++ * NULL and syn_data is 0 but fastopen indeed occurred and
14310 ++ * data has been queued in the write queue (but not sent).
14311 ++ * Example of such rare cases: connect is non-blocking and
14312 ++ * TFO is configured to work without cookies.
14313 ++ */
14314 ++ if (!skb_queue_empty(&meta_sk->sk_write_queue))
14315 ++ mptcp_rcv_synsent_fastopen(meta_sk);
14316 ++
14317 ++ /* -1, because the SYN consumed 1 byte. In case of TFO, we
14318 ++ * start the subflow-sequence number as if the data of the SYN
14319 ++ * is not part of any mapping.
14320 ++ */
14321 ++ tp->mptcp->snt_isn = tp->snd_una - 1;
14322 ++ tp->mpcb->dss_csum = mopt->dss_csum;
14323 ++ tp->mptcp->include_mpc = 1;
14324 ++
14325 ++ /* Ensure that fastopen is handled at the meta-level. */
14326 ++ tp->fastopen_req = NULL;
14327 ++
14328 ++ sk_set_socket(sk, mptcp_meta_sk(sk)->sk_socket);
14329 ++ sk->sk_wq = mptcp_meta_sk(sk)->sk_wq;
14330 ++
14331 ++ /* hold in sk_clone_lock due to initialization to 2 */
14332 ++ sock_put(sk);
14333 ++ } else {
14334 ++ tp->request_mptcp = 0;
14335 ++
14336 ++ if (tp->inside_tk_table)
14337 ++ mptcp_hash_remove(tp);
14338 ++ }
14339 ++
14340 ++ if (mptcp(tp))
14341 ++ tp->mptcp->rcv_isn = TCP_SKB_CB(skb)->seq;
14342 ++
14343 ++ return 0;
14344 ++}
14345 ++
14346 ++bool mptcp_should_expand_sndbuf(const struct sock *sk)
14347 ++{
14348 ++ const struct sock *sk_it;
14349 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
14350 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14351 ++ int cnt_backups = 0;
14352 ++ int backup_available = 0;
14353 ++
14354 ++ /* We circumvent this check in tcp_check_space, because we want to
14355 ++ * always call sk_write_space. So, we reproduce the check here.
14356 ++ */
14357 ++ if (!meta_sk->sk_socket ||
14358 ++ !test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags))
14359 ++ return false;
14360 ++
14361 ++ /* If the user specified a specific send buffer setting, do
14362 ++ * not modify it.
14363 ++ */
14364 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14365 ++ return false;
14366 ++
14367 ++ /* If we are under global TCP memory pressure, do not expand. */
14368 ++ if (sk_under_memory_pressure(meta_sk))
14369 ++ return false;
14370 ++
14371 ++ /* If we are under soft global TCP memory pressure, do not expand. */
14372 ++ if (sk_memory_allocated(meta_sk) >= sk_prot_mem_limits(meta_sk, 0))
14373 ++ return false;
14374 ++
14375 ++
14376 ++ /* For MPTCP we look for a subsocket that could send data.
14377 ++ * If we found one, then we update the send-buffer.
14378 ++ */
14379 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
14380 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
14381 ++
14382 ++ if (!mptcp_sk_can_send(sk_it))
14383 ++ continue;
14384 ++
14385 ++ /* Backup-flows have to be counted - if there is no other
14386 ++ * subflow we take the backup-flow into account.
14387 ++ */
14388 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio)
14389 ++ cnt_backups++;
14390 ++
14391 ++ if (tp_it->packets_out < tp_it->snd_cwnd) {
14392 ++ if (tp_it->mptcp->rcv_low_prio || tp_it->mptcp->low_prio) {
14393 ++ backup_available = 1;
14394 ++ continue;
14395 ++ }
14396 ++ return true;
14397 ++ }
14398 ++ }
14399 ++
14400 ++ /* Backup-flow is available for sending - update send-buffer */
14401 ++ if (meta_tp->mpcb->cnt_established == cnt_backups && backup_available)
14402 ++ return true;
14403 ++ return false;
14404 ++}
14405 ++
14406 ++void mptcp_init_buffer_space(struct sock *sk)
14407 ++{
14408 ++ struct tcp_sock *tp = tcp_sk(sk);
14409 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
14410 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
14411 ++ int space;
14412 ++
14413 ++ tcp_init_buffer_space(sk);
14414 ++
14415 ++ if (is_master_tp(tp)) {
14416 ++ meta_tp->rcvq_space.space = meta_tp->rcv_wnd;
14417 ++ meta_tp->rcvq_space.time = tcp_time_stamp;
14418 ++ meta_tp->rcvq_space.seq = meta_tp->copied_seq;
14419 ++
14420 ++ /* If there is only one subflow, we just use regular TCP
14421 ++ * autotuning. User-locks are handled already by
14422 ++ * tcp_init_buffer_space
14423 ++ */
14424 ++ meta_tp->window_clamp = tp->window_clamp;
14425 ++ meta_tp->rcv_ssthresh = tp->rcv_ssthresh;
14426 ++ meta_sk->sk_rcvbuf = sk->sk_rcvbuf;
14427 ++ meta_sk->sk_sndbuf = sk->sk_sndbuf;
14428 ++
14429 ++ return;
14430 ++ }
14431 ++
14432 ++ if (meta_sk->sk_userlocks & SOCK_RCVBUF_LOCK)
14433 ++ goto snd_buf;
14434 ++
14435 ++ /* Adding a new subflow to the rcv-buffer space. We make a simple
14436 ++ * addition, to give some space to allow traffic on the new subflow.
14437 ++ * Autotuning will increase it further later on.
14438 ++ */
14439 ++ space = min(meta_sk->sk_rcvbuf + sk->sk_rcvbuf, sysctl_tcp_rmem[2]);
14440 ++ if (space > meta_sk->sk_rcvbuf) {
14441 ++ meta_tp->window_clamp += tp->window_clamp;
14442 ++ meta_tp->rcv_ssthresh += tp->rcv_ssthresh;
14443 ++ meta_sk->sk_rcvbuf = space;
14444 ++ }
14445 ++
14446 ++snd_buf:
14447 ++ if (meta_sk->sk_userlocks & SOCK_SNDBUF_LOCK)
14448 ++ return;
14449 ++
14450 ++ /* Adding a new subflow to the send-buffer space. We make a simple
14451 ++ * addition, to give some space to allow traffic on the new subflow.
14452 ++ * Autotuning will increase it further later on.
14453 ++ */
14454 ++ space = min(meta_sk->sk_sndbuf + sk->sk_sndbuf, sysctl_tcp_wmem[2]);
14455 ++ if (space > meta_sk->sk_sndbuf) {
14456 ++ meta_sk->sk_sndbuf = space;
14457 ++ meta_sk->sk_write_space(meta_sk);
14458 ++ }
14459 ++}
14460 ++
14461 ++void mptcp_tcp_set_rto(struct sock *sk)
14462 ++{
14463 ++ tcp_set_rto(sk);
14464 ++ mptcp_set_rto(sk);
14465 ++}
14466 +diff --git a/net/mptcp/mptcp_ipv4.c b/net/mptcp/mptcp_ipv4.c
14467 +new file mode 100644
14468 +index 000000000000..1183d1305d35
14469 +--- /dev/null
14470 ++++ b/net/mptcp/mptcp_ipv4.c
14471 +@@ -0,0 +1,483 @@
14472 ++/*
14473 ++ * MPTCP implementation - IPv4-specific functions
14474 ++ *
14475 ++ * Initial Design & Implementation:
14476 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14477 ++ *
14478 ++ * Current Maintainer:
14479 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
14480 ++ *
14481 ++ * Additional authors:
14482 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14483 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14484 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14485 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14486 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14487 ++ * Andreas Ripke <ripke@××××××.eu>
14488 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14489 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14490 ++ * John Ronan <jronan@××××.org>
14491 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14492 ++ * Brandon Heller <brandonh@××××××××.edu>
14493 ++ *
14494 ++ *
14495 ++ * This program is free software; you can redistribute it and/or
14496 ++ * modify it under the terms of the GNU General Public License
14497 ++ * as published by the Free Software Foundation; either version
14498 ++ * 2 of the License, or (at your option) any later version.
14499 ++ */
14500 ++
14501 ++#include <linux/export.h>
14502 ++#include <linux/ip.h>
14503 ++#include <linux/list.h>
14504 ++#include <linux/skbuff.h>
14505 ++#include <linux/spinlock.h>
14506 ++#include <linux/tcp.h>
14507 ++
14508 ++#include <net/inet_common.h>
14509 ++#include <net/inet_connection_sock.h>
14510 ++#include <net/mptcp.h>
14511 ++#include <net/mptcp_v4.h>
14512 ++#include <net/request_sock.h>
14513 ++#include <net/tcp.h>
14514 ++
14515 ++u32 mptcp_v4_get_nonce(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14516 ++{
14517 ++ u32 hash[MD5_DIGEST_WORDS];
14518 ++
14519 ++ hash[0] = (__force u32)saddr;
14520 ++ hash[1] = (__force u32)daddr;
14521 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14522 ++ hash[3] = mptcp_seed++;
14523 ++
14524 ++ md5_transform(hash, mptcp_secret);
14525 ++
14526 ++ return hash[0];
14527 ++}
14528 ++
14529 ++u64 mptcp_v4_get_key(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport)
14530 ++{
14531 ++ u32 hash[MD5_DIGEST_WORDS];
14532 ++
14533 ++ hash[0] = (__force u32)saddr;
14534 ++ hash[1] = (__force u32)daddr;
14535 ++ hash[2] = ((__force u16)sport << 16) + (__force u16)dport;
14536 ++ hash[3] = mptcp_seed++;
14537 ++
14538 ++ md5_transform(hash, mptcp_secret);
14539 ++
14540 ++ return *((u64 *)hash);
14541 ++}
14542 ++
14543 ++
14544 ++static void mptcp_v4_reqsk_destructor(struct request_sock *req)
14545 ++{
14546 ++ mptcp_reqsk_destructor(req);
14547 ++
14548 ++ tcp_v4_reqsk_destructor(req);
14549 ++}
14550 ++
14551 ++static int mptcp_v4_init_req(struct request_sock *req, struct sock *sk,
14552 ++ struct sk_buff *skb)
14553 ++{
14554 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14555 ++ mptcp_reqsk_init(req, skb);
14556 ++
14557 ++ return 0;
14558 ++}
14559 ++
14560 ++static int mptcp_v4_join_init_req(struct request_sock *req, struct sock *sk,
14561 ++ struct sk_buff *skb)
14562 ++{
14563 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
14564 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
14565 ++ union inet_addr addr;
14566 ++ int loc_id;
14567 ++ bool low_prio = false;
14568 ++
14569 ++	/* We need to do this as early as possible, because if we fail later
14570 ++	 * (e.g., in get_local_id), reqsk_free tries to remove the
14571 ++	 * request-socket from the htb in mptcp_hash_request_remove, as pprev
14572 ++ * may be different from NULL.
14573 ++ */
14574 ++ mtreq->hash_entry.pprev = NULL;
14575 ++
14576 ++ tcp_request_sock_ipv4_ops.init_req(req, sk, skb);
14577 ++
14578 ++ mtreq->mptcp_loc_nonce = mptcp_v4_get_nonce(ip_hdr(skb)->saddr,
14579 ++ ip_hdr(skb)->daddr,
14580 ++ tcp_hdr(skb)->source,
14581 ++ tcp_hdr(skb)->dest);
14582 ++ addr.ip = inet_rsk(req)->ir_loc_addr;
14583 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET, &addr, sock_net(sk), &low_prio);
14584 ++ if (loc_id == -1)
14585 ++ return -1;
14586 ++ mtreq->loc_id = loc_id;
14587 ++ mtreq->low_prio = low_prio;
14588 ++
14589 ++ mptcp_join_reqsk_init(mpcb, req, skb);
14590 ++
14591 ++ return 0;
14592 ++}
14593 ++
14594 ++/* Similar to tcp_request_sock_ops */
14595 ++struct request_sock_ops mptcp_request_sock_ops __read_mostly = {
14596 ++ .family = PF_INET,
14597 ++ .obj_size = sizeof(struct mptcp_request_sock),
14598 ++ .rtx_syn_ack = tcp_rtx_synack,
14599 ++ .send_ack = tcp_v4_reqsk_send_ack,
14600 ++ .destructor = mptcp_v4_reqsk_destructor,
14601 ++ .send_reset = tcp_v4_send_reset,
14602 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
14603 ++};
14604 ++
14605 ++static void mptcp_v4_reqsk_queue_hash_add(struct sock *meta_sk,
14606 ++ struct request_sock *req,
14607 ++ const unsigned long timeout)
14608 ++{
14609 ++ const u32 h1 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14610 ++ inet_rsk(req)->ir_rmt_port,
14611 ++ 0, MPTCP_HASH_SIZE);
14612 ++ /* We cannot call inet_csk_reqsk_queue_hash_add(), because we do not
14613 ++ * want to reset the keepalive-timer (responsible for retransmitting
14614 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
14615 ++ * overload the keepalive timer. Also, it's not a big deal, because the
14616 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
14617 ++ * if the third ACK gets lost, the client will handle the retransmission
14618 ++ * anyways. If our SYN/ACK gets lost, the client will retransmit the
14619 ++ * SYN.
14620 ++ */
14621 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
14622 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
14623 ++ const u32 h2 = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
14624 ++ inet_rsk(req)->ir_rmt_port,
14625 ++ lopt->hash_rnd, lopt->nr_table_entries);
14626 ++
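++	/* h2 indexes the meta-socket's own SYN queue, while h1 indexes the
++	 * global MPTCP request table that mptcp_v4_search_req() walks.
++	 */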
14627 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
14628 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
14629 ++ mptcp_reset_synack_timer(meta_sk, timeout);
14630 ++
14631 ++ rcu_read_lock();
14632 ++ spin_lock(&mptcp_reqsk_hlock);
14633 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
14634 ++ spin_unlock(&mptcp_reqsk_hlock);
14635 ++ rcu_read_unlock();
14636 ++}
14637 ++
14638 ++/* Similar to tcp_v4_conn_request */
14639 ++static int mptcp_v4_join_request(struct sock *meta_sk, struct sk_buff *skb)
14640 ++{
14641 ++ return tcp_conn_request(&mptcp_request_sock_ops,
14642 ++ &mptcp_join_request_sock_ipv4_ops,
14643 ++ meta_sk, skb);
14644 ++}
14645 ++
14646 ++/* We only process join requests here (either the SYN or the final ACK). */
14647 ++int mptcp_v4_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
14648 ++{
14649 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
14650 ++ struct sock *child, *rsk = NULL;
14651 ++ int ret;
14652 ++
14653 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
14654 ++ struct tcphdr *th = tcp_hdr(skb);
14655 ++ const struct iphdr *iph = ip_hdr(skb);
14656 ++ struct sock *sk;
14657 ++
14658 ++ sk = inet_lookup_established(sock_net(meta_sk), &tcp_hashinfo,
14659 ++ iph->saddr, th->source, iph->daddr,
14660 ++ th->dest, inet_iif(skb));
14661 ++
14662 ++ if (!sk) {
14663 ++ kfree_skb(skb);
14664 ++ return 0;
14665 ++ }
14666 ++ if (is_meta_sk(sk)) {
14667 ++			WARN("%s Did not find a sub-sk - found the meta instead!\n", __func__);
14668 ++ kfree_skb(skb);
14669 ++ sock_put(sk);
14670 ++ return 0;
14671 ++ }
14672 ++
14673 ++ if (sk->sk_state == TCP_TIME_WAIT) {
14674 ++ inet_twsk_put(inet_twsk(sk));
14675 ++ kfree_skb(skb);
14676 ++ return 0;
14677 ++ }
14678 ++
14679 ++ ret = tcp_v4_do_rcv(sk, skb);
14680 ++ sock_put(sk);
14681 ++
14682 ++ return ret;
14683 ++ }
14684 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
14685 ++
14686 ++ /* Has been removed from the tk-table. Thus, no new subflows.
14687 ++ *
14688 ++ * Check for close-state is necessary, because we may have been closed
14689 ++ * without passing by mptcp_close().
14690 ++ *
14691 ++ * When falling back, no new subflows are allowed either.
14692 ++ */
14693 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
14694 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
14695 ++ goto reset_and_discard;
14696 ++
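++	/* tcp_v4_hnd_req() returns the meta_sk itself when no pending request
++	 * matches this segment (a SYN then opens a new MP_JOIN request below),
++	 * or a new, locked subflow socket when the segment completes a pending
++	 * request (the final ACK of the MP_JOIN handshake).
++	 */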
14697 ++ child = tcp_v4_hnd_req(meta_sk, skb);
14698 ++
14699 ++ if (!child)
14700 ++ goto discard;
14701 ++
14702 ++ if (child != meta_sk) {
14703 ++ sock_rps_save_rxhash(child, skb);
14704 ++		/* We don't call tcp_child_process here, because we already
14705 ++		 * hold the meta-sk-lock and are sure that it is not owned
14706 ++ * by the user.
14707 ++ */
14708 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
14709 ++ bh_unlock_sock(child);
14710 ++ sock_put(child);
14711 ++ if (ret) {
14712 ++ rsk = child;
14713 ++ goto reset_and_discard;
14714 ++ }
14715 ++ } else {
14716 ++ if (tcp_hdr(skb)->syn) {
14717 ++ mptcp_v4_join_request(meta_sk, skb);
14718 ++ goto discard;
14719 ++ }
14720 ++ goto reset_and_discard;
14721 ++ }
14722 ++ return 0;
14723 ++
14724 ++reset_and_discard:
14725 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
14726 ++ const struct tcphdr *th = tcp_hdr(skb);
14727 ++ const struct iphdr *iph = ip_hdr(skb);
14728 ++ struct request_sock **prev, *req;
14729 ++ /* If we end up here, it means we should not have matched on the
14730 ++ * request-socket. But, because the request-sock queue is only
14731 ++ * destroyed in mptcp_close, the socket may actually already be
14732 ++ * in close-state (e.g., through shutdown()) while still having
14733 ++ * pending request sockets.
14734 ++ */
14735 ++ req = inet_csk_search_req(meta_sk, &prev, th->source,
14736 ++ iph->saddr, iph->daddr);
14737 ++ if (req) {
14738 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
14739 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
14740 ++ req);
14741 ++ reqsk_free(req);
14742 ++ }
14743 ++ }
14744 ++
14745 ++ tcp_v4_send_reset(rsk, skb);
14746 ++discard:
14747 ++ kfree_skb(skb);
14748 ++ return 0;
14749 ++}
14750 ++
14751 ++/* After this, the ref count of the meta_sk associated with the request_sock
14752 ++ * is incremented. Thus it is the responsibility of the caller
14753 ++ * to call sock_put() when the reference is not needed anymore.
14754 ++ */
14755 ++struct sock *mptcp_v4_search_req(const __be16 rport, const __be32 raddr,
14756 ++ const __be32 laddr, const struct net *net)
14757 ++{
14758 ++ const struct mptcp_request_sock *mtreq;
14759 ++ struct sock *meta_sk = NULL;
14760 ++ const struct hlist_nulls_node *node;
14761 ++ const u32 hash = inet_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
14762 ++
14763 ++ rcu_read_lock();
14764 ++begin:
14765 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
14766 ++ hash_entry) {
14767 ++ struct inet_request_sock *ireq = inet_rsk(rev_mptcp_rsk(mtreq));
14768 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
14769 ++
14770 ++ if (ireq->ir_rmt_port == rport &&
14771 ++ ireq->ir_rmt_addr == raddr &&
14772 ++ ireq->ir_loc_addr == laddr &&
14773 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET &&
14774 ++ net_eq(net, sock_net(meta_sk)))
14775 ++ goto found;
14776 ++ meta_sk = NULL;
14777 ++ }
14778 ++	/* A request-socket is destroyed by RCU, so it might have been recycled
14779 ++	 * and put into another hash-table list. After the lookup we may
14780 ++	 * therefore end up in a different list and may need to restart.
14781 ++ *
14782 ++ * See also the comment in __inet_lookup_established.
14783 ++ */
14784 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
14785 ++ goto begin;
14786 ++
14787 ++found:
14788 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
14789 ++ meta_sk = NULL;
14790 ++ rcu_read_unlock();
14791 ++
14792 ++ return meta_sk;
14793 ++}
14794 ++
14795 ++/* Create a new IPv4 subflow.
14796 ++ *
14797 ++ * We are in user-context and the meta-sock lock is held.
14798 ++ */
14799 ++int mptcp_init4_subsockets(struct sock *meta_sk, const struct mptcp_loc4 *loc,
14800 ++ struct mptcp_rem4 *rem)
14801 ++{
14802 ++ struct tcp_sock *tp;
14803 ++ struct sock *sk;
14804 ++ struct sockaddr_in loc_in, rem_in;
14805 ++ struct socket sock;
14806 ++ int ret;
14807 ++
14808 ++ /** First, create and prepare the new socket */
14809 ++
14810 ++ sock.type = meta_sk->sk_socket->type;
14811 ++ sock.state = SS_UNCONNECTED;
14812 ++ sock.wq = meta_sk->sk_socket->wq;
14813 ++ sock.file = meta_sk->sk_socket->file;
14814 ++ sock.ops = NULL;
14815 ++
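++	/* The on-stack struct socket is only a vehicle to drive inet_create(),
++	 * bind() and connect(); the resulting sk is attached to the meta-socket's
++	 * real struct socket at the end of this function.
++	 */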
14816 ++ ret = inet_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
14817 ++ if (unlikely(ret < 0)) {
14818 ++ mptcp_debug("%s inet_create failed ret: %d\n", __func__, ret);
14819 ++ return ret;
14820 ++ }
14821 ++
14822 ++ sk = sock.sk;
14823 ++ tp = tcp_sk(sk);
14824 ++
14825 ++ /* All subsockets need the MPTCP-lock-class */
14826 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
14827 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
14828 ++
14829 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc4_id, rem->rem4_id, GFP_KERNEL))
14830 ++ goto error;
14831 ++
14832 ++ tp->mptcp->slave_sk = 1;
14833 ++ tp->mptcp->low_prio = loc->low_prio;
14834 ++
14835 ++ /* Initializing the timer for an MPTCP subflow */
14836 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
14837 ++
14838 ++ /** Then, connect the socket to the peer */
14839 ++ loc_in.sin_family = AF_INET;
14840 ++ rem_in.sin_family = AF_INET;
14841 ++ loc_in.sin_port = 0;
14842 ++ if (rem->port)
14843 ++ rem_in.sin_port = rem->port;
14844 ++ else
14845 ++ rem_in.sin_port = inet_sk(meta_sk)->inet_dport;
14846 ++ loc_in.sin_addr = loc->addr;
14847 ++ rem_in.sin_addr = rem->addr;
14848 ++
14849 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in));
14850 ++ if (ret < 0) {
14851 ++ mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
14852 ++ __func__, ret);
14853 ++ goto error;
14854 ++ }
14855 ++
14856 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI4:%d dst_addr:%pI4:%d\n",
14857 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
14858 ++ tp->mptcp->path_index, &loc_in.sin_addr,
14859 ++ ntohs(loc_in.sin_port), &rem_in.sin_addr,
14860 ++ ntohs(rem_in.sin_port));
14861 ++
14862 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4)
14863 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v4(sk, rem->addr);
14864 ++
14865 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
14866 ++ sizeof(struct sockaddr_in), O_NONBLOCK);
14867 ++ if (ret < 0 && ret != -EINPROGRESS) {
14868 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
14869 ++ __func__, ret);
14870 ++ goto error;
14871 ++ }
14872 ++
14873 ++ sk_set_socket(sk, meta_sk->sk_socket);
14874 ++ sk->sk_wq = meta_sk->sk_wq;
14875 ++
14876 ++ return 0;
14877 ++
14878 ++error:
14879 ++ /* May happen if mptcp_add_sock fails first */
14880 ++ if (!mptcp(tp)) {
14881 ++ tcp_close(sk, 0);
14882 ++ } else {
14883 ++ local_bh_disable();
14884 ++ mptcp_sub_force_close(sk);
14885 ++ local_bh_enable();
14886 ++ }
14887 ++ return ret;
14888 ++}
14889 ++EXPORT_SYMBOL(mptcp_init4_subsockets);
14890 ++
14891 ++const struct inet_connection_sock_af_ops mptcp_v4_specific = {
14892 ++ .queue_xmit = ip_queue_xmit,
14893 ++ .send_check = tcp_v4_send_check,
14894 ++ .rebuild_header = inet_sk_rebuild_header,
14895 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
14896 ++ .conn_request = mptcp_conn_request,
14897 ++ .syn_recv_sock = tcp_v4_syn_recv_sock,
14898 ++ .net_header_len = sizeof(struct iphdr),
14899 ++ .setsockopt = ip_setsockopt,
14900 ++ .getsockopt = ip_getsockopt,
14901 ++ .addr2sockaddr = inet_csk_addr2sockaddr,
14902 ++ .sockaddr_len = sizeof(struct sockaddr_in),
14903 ++ .bind_conflict = inet_csk_bind_conflict,
14904 ++#ifdef CONFIG_COMPAT
14905 ++ .compat_setsockopt = compat_ip_setsockopt,
14906 ++ .compat_getsockopt = compat_ip_getsockopt,
14907 ++#endif
14908 ++};
14909 ++
14910 ++struct tcp_request_sock_ops mptcp_request_sock_ipv4_ops;
14911 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv4_ops;
14912 ++
14913 ++/* General initialization of IPv4 for MPTCP */
14914 ++int mptcp_pm_v4_init(void)
14915 ++{
14916 ++ int ret = 0;
14917 ++ struct request_sock_ops *ops = &mptcp_request_sock_ops;
14918 ++
14919 ++ mptcp_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14920 ++ mptcp_request_sock_ipv4_ops.init_req = mptcp_v4_init_req;
14921 ++
14922 ++ mptcp_join_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
14923 ++ mptcp_join_request_sock_ipv4_ops.init_req = mptcp_v4_join_init_req;
14924 ++ mptcp_join_request_sock_ipv4_ops.queue_hash_add = mptcp_v4_reqsk_queue_hash_add;
14925 ++
14926 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP");
14927 ++ if (ops->slab_name == NULL) {
14928 ++ ret = -ENOMEM;
14929 ++ goto out;
14930 ++ }
14931 ++
14932 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
14933 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
14934 ++ NULL);
14935 ++
14936 ++ if (ops->slab == NULL) {
14937 ++ ret = -ENOMEM;
14938 ++ goto err_reqsk_create;
14939 ++ }
14940 ++
14941 ++out:
14942 ++ return ret;
14943 ++
14944 ++err_reqsk_create:
14945 ++ kfree(ops->slab_name);
14946 ++ ops->slab_name = NULL;
14947 ++ goto out;
14948 ++}
14949 ++
14950 ++void mptcp_pm_v4_undo(void)
14951 ++{
14952 ++ kmem_cache_destroy(mptcp_request_sock_ops.slab);
14953 ++ kfree(mptcp_request_sock_ops.slab_name);
14954 ++}
14955 +diff --git a/net/mptcp/mptcp_ipv6.c b/net/mptcp/mptcp_ipv6.c
14956 +new file mode 100644
14957 +index 000000000000..1036973aa855
14958 +--- /dev/null
14959 ++++ b/net/mptcp/mptcp_ipv6.c
14960 +@@ -0,0 +1,518 @@
14961 ++/*
14962 ++ * MPTCP implementation - IPv6-specific functions
14963 ++ *
14964 ++ * Initial Design & Implementation:
14965 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
14966 ++ *
14967 ++ * Current Maintainer:
14968 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14969 ++ *
14970 ++ * Additional authors:
14971 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
14972 ++ * Gregory Detal <gregory.detal@×××××××××.be>
14973 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
14974 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
14975 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
14976 ++ * Andreas Ripke <ripke@××××××.eu>
14977 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
14978 ++ * Octavian Purdila <octavian.purdila@×××××.com>
14979 ++ * John Ronan <jronan@××××.org>
14980 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
14981 ++ * Brandon Heller <brandonh@××××××××.edu>
14982 ++ *
14983 ++ *
14984 ++ * This program is free software; you can redistribute it and/or
14985 ++ * modify it under the terms of the GNU General Public License
14986 ++ * as published by the Free Software Foundation; either version
14987 ++ * 2 of the License, or (at your option) any later version.
14988 ++ */
14989 ++
14990 ++#include <linux/export.h>
14991 ++#include <linux/in6.h>
14992 ++#include <linux/kernel.h>
14993 ++
14994 ++#include <net/addrconf.h>
14995 ++#include <net/flow.h>
14996 ++#include <net/inet6_connection_sock.h>
14997 ++#include <net/inet6_hashtables.h>
14998 ++#include <net/inet_common.h>
14999 ++#include <net/ipv6.h>
15000 ++#include <net/ip6_checksum.h>
15001 ++#include <net/ip6_route.h>
15002 ++#include <net/mptcp.h>
15003 ++#include <net/mptcp_v6.h>
15004 ++#include <net/tcp.h>
15005 ++#include <net/transp_v6.h>
15006 ++
15007 ++__u32 mptcp_v6_get_nonce(const __be32 *saddr, const __be32 *daddr,
15008 ++ __be16 sport, __be16 dport)
15009 ++{
15010 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15011 ++ u32 hash[MD5_DIGEST_WORDS];
15012 ++ u32 i;
15013 ++
15014 ++ memcpy(hash, saddr, 16);
15015 ++ for (i = 0; i < 4; i++)
15016 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15017 ++ secret[4] = mptcp_secret[4] +
15018 ++ (((__force u16)sport << 16) + (__force u16)dport);
15019 ++ secret[5] = mptcp_seed++;
15020 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15021 ++ secret[i] = mptcp_secret[i];
15022 ++
15023 ++ md5_transform(hash, secret);
15024 ++
15025 ++ return hash[0];
15026 ++}
15027 ++
15028 ++u64 mptcp_v6_get_key(const __be32 *saddr, const __be32 *daddr,
15029 ++ __be16 sport, __be16 dport)
15030 ++{
15031 ++ u32 secret[MD5_MESSAGE_BYTES / 4];
15032 ++ u32 hash[MD5_DIGEST_WORDS];
15033 ++ u32 i;
15034 ++
15035 ++ memcpy(hash, saddr, 16);
15036 ++ for (i = 0; i < 4; i++)
15037 ++ secret[i] = mptcp_secret[i] + (__force u32)daddr[i];
15038 ++ secret[4] = mptcp_secret[4] +
15039 ++ (((__force u16)sport << 16) + (__force u16)dport);
15040 ++ secret[5] = mptcp_seed++;
15041 ++ for (i = 6; i < MD5_MESSAGE_BYTES / 4; i++)
15042 ++ secret[i] = mptcp_secret[i];
15043 ++
15044 ++ md5_transform(hash, secret);
15045 ++
15046 ++ return *((u64 *)hash);
15047 ++}
15048 ++
15049 ++static void mptcp_v6_reqsk_destructor(struct request_sock *req)
15050 ++{
15051 ++ mptcp_reqsk_destructor(req);
15052 ++
15053 ++ tcp_v6_reqsk_destructor(req);
15054 ++}
15055 ++
15056 ++static int mptcp_v6_init_req(struct request_sock *req, struct sock *sk,
15057 ++ struct sk_buff *skb)
15058 ++{
15059 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15060 ++ mptcp_reqsk_init(req, skb);
15061 ++
15062 ++ return 0;
15063 ++}
15064 ++
15065 ++static int mptcp_v6_join_init_req(struct request_sock *req, struct sock *sk,
15066 ++ struct sk_buff *skb)
15067 ++{
15068 ++ struct mptcp_request_sock *mtreq = mptcp_rsk(req);
15069 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
15070 ++ union inet_addr addr;
15071 ++ int loc_id;
15072 ++ bool low_prio = false;
15073 ++
15074 ++	/* We need to do this as early as possible, because if we fail later
15075 ++ * (e.g., get_local_id), then reqsk_free tries to remove the
15076 ++ * request-socket from the htb in mptcp_hash_request_remove as pprev
15077 ++ * may be different from NULL.
15078 ++ */
15079 ++ mtreq->hash_entry.pprev = NULL;
15080 ++
15081 ++ tcp_request_sock_ipv6_ops.init_req(req, sk, skb);
15082 ++
15083 ++ mtreq->mptcp_loc_nonce = mptcp_v6_get_nonce(ipv6_hdr(skb)->saddr.s6_addr32,
15084 ++ ipv6_hdr(skb)->daddr.s6_addr32,
15085 ++ tcp_hdr(skb)->source,
15086 ++ tcp_hdr(skb)->dest);
15087 ++ addr.in6 = inet_rsk(req)->ir_v6_loc_addr;
15088 ++ loc_id = mpcb->pm_ops->get_local_id(AF_INET6, &addr, sock_net(sk), &low_prio);
15089 ++ if (loc_id == -1)
15090 ++ return -1;
15091 ++ mtreq->loc_id = loc_id;
15092 ++ mtreq->low_prio = low_prio;
15093 ++
15094 ++ mptcp_join_reqsk_init(mpcb, req, skb);
15095 ++
15096 ++ return 0;
15097 ++}
15098 ++
15099 ++/* Similar to tcp6_request_sock_ops */
15100 ++struct request_sock_ops mptcp6_request_sock_ops __read_mostly = {
15101 ++ .family = AF_INET6,
15102 ++ .obj_size = sizeof(struct mptcp_request_sock),
15103 ++ .rtx_syn_ack = tcp_v6_rtx_synack,
15104 ++ .send_ack = tcp_v6_reqsk_send_ack,
15105 ++ .destructor = mptcp_v6_reqsk_destructor,
15106 ++ .send_reset = tcp_v6_send_reset,
15107 ++ .syn_ack_timeout = tcp_syn_ack_timeout,
15108 ++};
15109 ++
15110 ++static void mptcp_v6_reqsk_queue_hash_add(struct sock *meta_sk,
15111 ++ struct request_sock *req,
15112 ++ const unsigned long timeout)
15113 ++{
15114 ++ const u32 h1 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15115 ++ inet_rsk(req)->ir_rmt_port,
15116 ++ 0, MPTCP_HASH_SIZE);
15117 ++ /* We cannot call inet6_csk_reqsk_queue_hash_add(), because we do not
15118 ++ * want to reset the keepalive-timer (responsible for retransmitting
15119 ++ * SYN/ACKs). We do not retransmit SYN/ACKs+MP_JOINs, because we cannot
15120 ++ * overload the keepalive timer. Also, it's not a big deal, because the
15121 ++ * third ACK of the MP_JOIN-handshake is sent in a reliable manner. So,
15122 ++ * if the third ACK gets lost, the client will handle the retransmission
15123 ++ * anyways. If our SYN/ACK gets lost, the client will retransmit the
15124 ++ * SYN.
15125 ++ */
15126 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
15127 ++ struct listen_sock *lopt = meta_icsk->icsk_accept_queue.listen_opt;
15128 ++ const u32 h2 = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
15129 ++ inet_rsk(req)->ir_rmt_port,
15130 ++ lopt->hash_rnd, lopt->nr_table_entries);
15131 ++
15132 ++ reqsk_queue_hash_req(&meta_icsk->icsk_accept_queue, h2, req, timeout);
15133 ++ if (reqsk_queue_added(&meta_icsk->icsk_accept_queue) == 0)
15134 ++ mptcp_reset_synack_timer(meta_sk, timeout);
15135 ++
15136 ++ rcu_read_lock();
15137 ++ spin_lock(&mptcp_reqsk_hlock);
15138 ++ hlist_nulls_add_head_rcu(&mptcp_rsk(req)->hash_entry, &mptcp_reqsk_htb[h1]);
15139 ++ spin_unlock(&mptcp_reqsk_hlock);
15140 ++ rcu_read_unlock();
15141 ++}
15142 ++
15143 ++static int mptcp_v6_join_request(struct sock *meta_sk, struct sk_buff *skb)
15144 ++{
15145 ++ return tcp_conn_request(&mptcp6_request_sock_ops,
15146 ++ &mptcp_join_request_sock_ipv6_ops,
15147 ++ meta_sk, skb);
15148 ++}
15149 ++
15150 ++int mptcp_v6_do_rcv(struct sock *meta_sk, struct sk_buff *skb)
15151 ++{
15152 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15153 ++ struct sock *child, *rsk = NULL;
15154 ++ int ret;
15155 ++
15156 ++ if (!(TCP_SKB_CB(skb)->mptcp_flags & MPTCPHDR_JOIN)) {
15157 ++ struct tcphdr *th = tcp_hdr(skb);
15158 ++ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
15159 ++ struct sock *sk;
15160 ++
15161 ++ sk = __inet6_lookup_established(sock_net(meta_sk),
15162 ++ &tcp_hashinfo,
15163 ++ &ip6h->saddr, th->source,
15164 ++ &ip6h->daddr, ntohs(th->dest),
15165 ++ inet6_iif(skb));
15166 ++
15167 ++ if (!sk) {
15168 ++ kfree_skb(skb);
15169 ++ return 0;
15170 ++ }
15171 ++ if (is_meta_sk(sk)) {
15172 ++ WARN("%s Did not find a sub-sk!\n", __func__);
15173 ++ kfree_skb(skb);
15174 ++ sock_put(sk);
15175 ++ return 0;
15176 ++ }
15177 ++
15178 ++ if (sk->sk_state == TCP_TIME_WAIT) {
15179 ++ inet_twsk_put(inet_twsk(sk));
15180 ++ kfree_skb(skb);
15181 ++ return 0;
15182 ++ }
15183 ++
15184 ++ ret = tcp_v6_do_rcv(sk, skb);
15185 ++ sock_put(sk);
15186 ++
15187 ++ return ret;
15188 ++ }
15189 ++ TCP_SKB_CB(skb)->mptcp_flags = 0;
15190 ++
15191 ++ /* Has been removed from the tk-table. Thus, no new subflows.
15192 ++ *
15193 ++ * Check for close-state is necessary, because we may have been closed
15194 ++ * without passing by mptcp_close().
15195 ++ *
15196 ++ * When falling back, no new subflows are allowed either.
15197 ++ */
15198 ++ if (meta_sk->sk_state == TCP_CLOSE || !tcp_sk(meta_sk)->inside_tk_table ||
15199 ++ mpcb->infinite_mapping_rcv || mpcb->send_infinite_mapping)
15200 ++ goto reset_and_discard;
15201 ++
15202 ++ child = tcp_v6_hnd_req(meta_sk, skb);
15203 ++
15204 ++ if (!child)
15205 ++ goto discard;
15206 ++
15207 ++ if (child != meta_sk) {
15208 ++ sock_rps_save_rxhash(child, skb);
15209 ++		/* We don't call tcp_child_process here, because we already
15210 ++		 * hold the meta-sk-lock and are sure that it is not owned
15211 ++ * by the user.
15212 ++ */
15213 ++ ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
15214 ++ bh_unlock_sock(child);
15215 ++ sock_put(child);
15216 ++ if (ret) {
15217 ++ rsk = child;
15218 ++ goto reset_and_discard;
15219 ++ }
15220 ++ } else {
15221 ++ if (tcp_hdr(skb)->syn) {
15222 ++ mptcp_v6_join_request(meta_sk, skb);
15223 ++ goto discard;
15224 ++ }
15225 ++ goto reset_and_discard;
15226 ++ }
15227 ++ return 0;
15228 ++
15229 ++reset_and_discard:
15230 ++ if (reqsk_queue_len(&inet_csk(meta_sk)->icsk_accept_queue)) {
15231 ++ const struct tcphdr *th = tcp_hdr(skb);
15232 ++ struct request_sock **prev, *req;
15233 ++ /* If we end up here, it means we should not have matched on the
15234 ++ * request-socket. But, because the request-sock queue is only
15235 ++ * destroyed in mptcp_close, the socket may actually already be
15236 ++ * in close-state (e.g., through shutdown()) while still having
15237 ++ * pending request sockets.
15238 ++ */
15239 ++ req = inet6_csk_search_req(meta_sk, &prev, th->source,
15240 ++ &ipv6_hdr(skb)->saddr,
15241 ++ &ipv6_hdr(skb)->daddr, inet6_iif(skb));
15242 ++ if (req) {
15243 ++ inet_csk_reqsk_queue_unlink(meta_sk, req, prev);
15244 ++ reqsk_queue_removed(&inet_csk(meta_sk)->icsk_accept_queue,
15245 ++ req);
15246 ++ reqsk_free(req);
15247 ++ }
15248 ++ }
15249 ++
15250 ++ tcp_v6_send_reset(rsk, skb);
15251 ++discard:
15252 ++ kfree_skb(skb);
15253 ++ return 0;
15254 ++}
15255 ++
15256 ++/* After this, the ref count of the meta_sk associated with the request_sock
15257 ++ * is incremented. Thus it is the responsibility of the caller
15258 ++ * to call sock_put() when the reference is not needed anymore.
15259 ++ */
15260 ++struct sock *mptcp_v6_search_req(const __be16 rport, const struct in6_addr *raddr,
15261 ++ const struct in6_addr *laddr, const struct net *net)
15262 ++{
15263 ++ const struct mptcp_request_sock *mtreq;
15264 ++ struct sock *meta_sk = NULL;
15265 ++ const struct hlist_nulls_node *node;
15266 ++ const u32 hash = inet6_synq_hash(raddr, rport, 0, MPTCP_HASH_SIZE);
15267 ++
15268 ++ rcu_read_lock();
15269 ++begin:
15270 ++ hlist_nulls_for_each_entry_rcu(mtreq, node, &mptcp_reqsk_htb[hash],
15271 ++ hash_entry) {
15272 ++ struct inet_request_sock *treq = inet_rsk(rev_mptcp_rsk(mtreq));
15273 ++ meta_sk = mtreq->mptcp_mpcb->meta_sk;
15274 ++
15275 ++ if (inet_rsk(rev_mptcp_rsk(mtreq))->ir_rmt_port == rport &&
15276 ++ rev_mptcp_rsk(mtreq)->rsk_ops->family == AF_INET6 &&
15277 ++ ipv6_addr_equal(&treq->ir_v6_rmt_addr, raddr) &&
15278 ++ ipv6_addr_equal(&treq->ir_v6_loc_addr, laddr) &&
15279 ++ net_eq(net, sock_net(meta_sk)))
15280 ++ goto found;
15281 ++ meta_sk = NULL;
15282 ++ }
15283 ++	/* A request-socket is destroyed by RCU, so it might have been recycled
15284 ++	 * and put into another hash-table list. After the lookup we may
15285 ++	 * therefore end up in a different list and may need to restart.
15286 ++ *
15287 ++ * See also the comment in __inet_lookup_established.
15288 ++ */
15289 ++ if (get_nulls_value(node) != hash + MPTCP_REQSK_NULLS_BASE)
15290 ++ goto begin;
15291 ++
15292 ++found:
15293 ++ if (meta_sk && unlikely(!atomic_inc_not_zero(&meta_sk->sk_refcnt)))
15294 ++ meta_sk = NULL;
15295 ++ rcu_read_unlock();
15296 ++
15297 ++ return meta_sk;
15298 ++}
15299 ++
15300 ++/* Create a new IPv6 subflow.
15301 ++ *
15302 ++ * We are in user-context and the meta-sock lock is held.
15303 ++ */
15304 ++int mptcp_init6_subsockets(struct sock *meta_sk, const struct mptcp_loc6 *loc,
15305 ++ struct mptcp_rem6 *rem)
15306 ++{
15307 ++ struct tcp_sock *tp;
15308 ++ struct sock *sk;
15309 ++ struct sockaddr_in6 loc_in, rem_in;
15310 ++ struct socket sock;
15311 ++ int ret;
15312 ++
15313 ++ /** First, create and prepare the new socket */
15314 ++
15315 ++ sock.type = meta_sk->sk_socket->type;
15316 ++ sock.state = SS_UNCONNECTED;
15317 ++ sock.wq = meta_sk->sk_socket->wq;
15318 ++ sock.file = meta_sk->sk_socket->file;
15319 ++ sock.ops = NULL;
15320 ++
15321 ++ ret = inet6_create(sock_net(meta_sk), &sock, IPPROTO_TCP, 1);
15322 ++ if (unlikely(ret < 0)) {
15323 ++ mptcp_debug("%s inet6_create failed ret: %d\n", __func__, ret);
15324 ++ return ret;
15325 ++ }
15326 ++
15327 ++ sk = sock.sk;
15328 ++ tp = tcp_sk(sk);
15329 ++
15330 ++ /* All subsockets need the MPTCP-lock-class */
15331 ++ lockdep_set_class_and_name(&(sk)->sk_lock.slock, &meta_slock_key, "slock-AF_INET-MPTCP");
15332 ++ lockdep_init_map(&(sk)->sk_lock.dep_map, "sk_lock-AF_INET-MPTCP", &meta_key, 0);
15333 ++
15334 ++ if (mptcp_add_sock(meta_sk, sk, loc->loc6_id, rem->rem6_id, GFP_KERNEL))
15335 ++ goto error;
15336 ++
15337 ++ tp->mptcp->slave_sk = 1;
15338 ++ tp->mptcp->low_prio = loc->low_prio;
15339 ++
15340 ++ /* Initializing the timer for an MPTCP subflow */
15341 ++ setup_timer(&tp->mptcp->mptcp_ack_timer, mptcp_ack_handler, (unsigned long)sk);
15342 ++
15343 ++ /** Then, connect the socket to the peer */
15344 ++ loc_in.sin6_family = AF_INET6;
15345 ++ rem_in.sin6_family = AF_INET6;
15346 ++ loc_in.sin6_port = 0;
15347 ++ if (rem->port)
15348 ++ rem_in.sin6_port = rem->port;
15349 ++ else
15350 ++ rem_in.sin6_port = inet_sk(meta_sk)->inet_dport;
15351 ++ loc_in.sin6_addr = loc->addr;
15352 ++ rem_in.sin6_addr = rem->addr;
15353 ++
15354 ++ ret = sock.ops->bind(&sock, (struct sockaddr *)&loc_in, sizeof(struct sockaddr_in6));
15355 ++ if (ret < 0) {
15356 ++		mptcp_debug("%s: MPTCP subsocket bind() failed, error %d\n",
15357 ++ __func__, ret);
15358 ++ goto error;
15359 ++ }
15360 ++
15361 ++ mptcp_debug("%s: token %#x pi %d src_addr:%pI6:%d dst_addr:%pI6:%d\n",
15362 ++ __func__, tcp_sk(meta_sk)->mpcb->mptcp_loc_token,
15363 ++ tp->mptcp->path_index, &loc_in.sin6_addr,
15364 ++ ntohs(loc_in.sin6_port), &rem_in.sin6_addr,
15365 ++ ntohs(rem_in.sin6_port));
15366 ++
15367 ++ if (tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6)
15368 ++ tcp_sk(meta_sk)->mpcb->pm_ops->init_subsocket_v6(sk, rem->addr);
15369 ++
15370 ++ ret = sock.ops->connect(&sock, (struct sockaddr *)&rem_in,
15371 ++ sizeof(struct sockaddr_in6), O_NONBLOCK);
15372 ++ if (ret < 0 && ret != -EINPROGRESS) {
15373 ++ mptcp_debug("%s: MPTCP subsocket connect() failed, error %d\n",
15374 ++ __func__, ret);
15375 ++ goto error;
15376 ++ }
15377 ++
15378 ++ sk_set_socket(sk, meta_sk->sk_socket);
15379 ++ sk->sk_wq = meta_sk->sk_wq;
15380 ++
15381 ++ return 0;
15382 ++
15383 ++error:
15384 ++ /* May happen if mptcp_add_sock fails first */
15385 ++ if (!mptcp(tp)) {
15386 ++ tcp_close(sk, 0);
15387 ++ } else {
15388 ++ local_bh_disable();
15389 ++ mptcp_sub_force_close(sk);
15390 ++ local_bh_enable();
15391 ++ }
15392 ++ return ret;
15393 ++}
15394 ++EXPORT_SYMBOL(mptcp_init6_subsockets);
15395 ++
15396 ++const struct inet_connection_sock_af_ops mptcp_v6_specific = {
15397 ++ .queue_xmit = inet6_csk_xmit,
15398 ++ .send_check = tcp_v6_send_check,
15399 ++ .rebuild_header = inet6_sk_rebuild_header,
15400 ++ .sk_rx_dst_set = inet6_sk_rx_dst_set,
15401 ++ .conn_request = mptcp_conn_request,
15402 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15403 ++ .net_header_len = sizeof(struct ipv6hdr),
15404 ++ .net_frag_header_len = sizeof(struct frag_hdr),
15405 ++ .setsockopt = ipv6_setsockopt,
15406 ++ .getsockopt = ipv6_getsockopt,
15407 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15408 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15409 ++ .bind_conflict = inet6_csk_bind_conflict,
15410 ++#ifdef CONFIG_COMPAT
15411 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15412 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15413 ++#endif
15414 ++};
15415 ++
15416 ++const struct inet_connection_sock_af_ops mptcp_v6_mapped = {
15417 ++ .queue_xmit = ip_queue_xmit,
15418 ++ .send_check = tcp_v4_send_check,
15419 ++ .rebuild_header = inet_sk_rebuild_header,
15420 ++ .sk_rx_dst_set = inet_sk_rx_dst_set,
15421 ++ .conn_request = mptcp_conn_request,
15422 ++ .syn_recv_sock = tcp_v6_syn_recv_sock,
15423 ++ .net_header_len = sizeof(struct iphdr),
15424 ++ .setsockopt = ipv6_setsockopt,
15425 ++ .getsockopt = ipv6_getsockopt,
15426 ++ .addr2sockaddr = inet6_csk_addr2sockaddr,
15427 ++ .sockaddr_len = sizeof(struct sockaddr_in6),
15428 ++ .bind_conflict = inet6_csk_bind_conflict,
15429 ++#ifdef CONFIG_COMPAT
15430 ++ .compat_setsockopt = compat_ipv6_setsockopt,
15431 ++ .compat_getsockopt = compat_ipv6_getsockopt,
15432 ++#endif
15433 ++};
15434 ++
15435 ++struct tcp_request_sock_ops mptcp_request_sock_ipv6_ops;
15436 ++struct tcp_request_sock_ops mptcp_join_request_sock_ipv6_ops;
15437 ++
15438 ++int mptcp_pm_v6_init(void)
15439 ++{
15440 ++ int ret = 0;
15441 ++ struct request_sock_ops *ops = &mptcp6_request_sock_ops;
15442 ++
15443 ++ mptcp_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15444 ++ mptcp_request_sock_ipv6_ops.init_req = mptcp_v6_init_req;
15445 ++
15446 ++ mptcp_join_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
15447 ++ mptcp_join_request_sock_ipv6_ops.init_req = mptcp_v6_join_init_req;
15448 ++ mptcp_join_request_sock_ipv6_ops.queue_hash_add = mptcp_v6_reqsk_queue_hash_add;
15449 ++
15450 ++ ops->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", "MPTCP6");
15451 ++ if (ops->slab_name == NULL) {
15452 ++ ret = -ENOMEM;
15453 ++ goto out;
15454 ++ }
15455 ++
15456 ++ ops->slab = kmem_cache_create(ops->slab_name, ops->obj_size, 0,
15457 ++ SLAB_DESTROY_BY_RCU|SLAB_HWCACHE_ALIGN,
15458 ++ NULL);
15459 ++
15460 ++ if (ops->slab == NULL) {
15461 ++ ret = -ENOMEM;
15462 ++ goto err_reqsk_create;
15463 ++ }
15464 ++
15465 ++out:
15466 ++ return ret;
15467 ++
15468 ++err_reqsk_create:
15469 ++ kfree(ops->slab_name);
15470 ++ ops->slab_name = NULL;
15471 ++ goto out;
15472 ++}
15473 ++
15474 ++void mptcp_pm_v6_undo(void)
15475 ++{
15476 ++ kmem_cache_destroy(mptcp6_request_sock_ops.slab);
15477 ++ kfree(mptcp6_request_sock_ops.slab_name);
15478 ++}
15479 +diff --git a/net/mptcp/mptcp_ndiffports.c b/net/mptcp/mptcp_ndiffports.c
15480 +new file mode 100644
15481 +index 000000000000..6f5087983175
15482 +--- /dev/null
15483 ++++ b/net/mptcp/mptcp_ndiffports.c
15484 +@@ -0,0 +1,161 @@
15485 ++#include <linux/module.h>
15486 ++
15487 ++#include <net/mptcp.h>
15488 ++#include <net/mptcp_v4.h>
15489 ++
15490 ++#if IS_ENABLED(CONFIG_IPV6)
15491 ++#include <net/mptcp_v6.h>
15492 ++#endif
15493 ++
15494 ++struct ndiffports_priv {
15495 ++ /* Worker struct for subflow establishment */
15496 ++ struct work_struct subflow_work;
15497 ++
15498 ++ struct mptcp_cb *mpcb;
15499 ++};
15500 ++
15501 ++static int num_subflows __read_mostly = 2;
15502 ++module_param(num_subflows, int, 0644);
15503 ++MODULE_PARM_DESC(num_subflows, "choose the number of subflows per MPTCP connection");
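++/* With permissions 0644 the parameter can presumably also be changed at
++ * run-time through /sys/module/<module name>/parameters/num_subflows.
++ */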
15504 ++
15505 ++/**
15506 ++ * Create all new subflows by calling mptcp_initX_subsockets.
15507 ++ *
15508 ++ * This function uses a goto next_subflow to release the lock between the
15509 ++ * creation of new subflows, giving other processes a chance to do some work
15510 ++ * on the socket and potentially finish the communication.
15511 ++ **/
15512 ++static void create_subflow_worker(struct work_struct *work)
15513 ++{
15514 ++ const struct ndiffports_priv *pm_priv = container_of(work,
15515 ++ struct ndiffports_priv,
15516 ++ subflow_work);
15517 ++ struct mptcp_cb *mpcb = pm_priv->mpcb;
15518 ++ struct sock *meta_sk = mpcb->meta_sk;
15519 ++ int iter = 0;
15520 ++
15521 ++next_subflow:
15522 ++ if (iter) {
15523 ++ release_sock(meta_sk);
15524 ++ mutex_unlock(&mpcb->mpcb_mutex);
15525 ++
15526 ++ cond_resched();
15527 ++ }
15528 ++ mutex_lock(&mpcb->mpcb_mutex);
15529 ++ lock_sock_nested(meta_sk, SINGLE_DEPTH_NESTING);
15530 ++
15531 ++ iter++;
15532 ++
15533 ++ if (sock_flag(meta_sk, SOCK_DEAD))
15534 ++ goto exit;
15535 ++
15536 ++ if (mpcb->master_sk &&
15537 ++ !tcp_sk(mpcb->master_sk)->mptcp->fully_established)
15538 ++ goto exit;
15539 ++
15540 ++ if (num_subflows > iter && num_subflows > mpcb->cnt_subflows) {
15541 ++ if (meta_sk->sk_family == AF_INET ||
15542 ++ mptcp_v6_is_v4_mapped(meta_sk)) {
15543 ++ struct mptcp_loc4 loc;
15544 ++ struct mptcp_rem4 rem;
15545 ++
15546 ++ loc.addr.s_addr = inet_sk(meta_sk)->inet_saddr;
15547 ++ loc.loc4_id = 0;
15548 ++ loc.low_prio = 0;
15549 ++
15550 ++ rem.addr.s_addr = inet_sk(meta_sk)->inet_daddr;
15551 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15552 ++ rem.rem4_id = 0; /* Default 0 */
15553 ++
15554 ++ mptcp_init4_subsockets(meta_sk, &loc, &rem);
15555 ++ } else {
15556 ++#if IS_ENABLED(CONFIG_IPV6)
15557 ++ struct mptcp_loc6 loc;
15558 ++ struct mptcp_rem6 rem;
15559 ++
15560 ++ loc.addr = inet6_sk(meta_sk)->saddr;
15561 ++ loc.loc6_id = 0;
15562 ++ loc.low_prio = 0;
15563 ++
15564 ++ rem.addr = meta_sk->sk_v6_daddr;
15565 ++ rem.port = inet_sk(meta_sk)->inet_dport;
15566 ++ rem.rem6_id = 0; /* Default 0 */
15567 ++
15568 ++ mptcp_init6_subsockets(meta_sk, &loc, &rem);
15569 ++#endif
15570 ++ }
15571 ++ goto next_subflow;
15572 ++ }
15573 ++
15574 ++exit:
15575 ++ release_sock(meta_sk);
15576 ++ mutex_unlock(&mpcb->mpcb_mutex);
15577 ++ sock_put(meta_sk);
15578 ++}
15579 ++
15580 ++static void ndiffports_new_session(const struct sock *meta_sk)
15581 ++{
15582 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15583 ++ struct ndiffports_priv *fmp = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15584 ++
15585 ++ /* Initialize workqueue-struct */
15586 ++ INIT_WORK(&fmp->subflow_work, create_subflow_worker);
15587 ++ fmp->mpcb = mpcb;
15588 ++}
15589 ++
15590 ++static void ndiffports_create_subflows(struct sock *meta_sk)
15591 ++{
15592 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
15593 ++ struct ndiffports_priv *pm_priv = (struct ndiffports_priv *)&mpcb->mptcp_pm[0];
15594 ++
15595 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv ||
15596 ++ mpcb->send_infinite_mapping ||
15597 ++ mpcb->server_side || sock_flag(meta_sk, SOCK_DEAD))
15598 ++ return;
15599 ++
15600 ++ if (!work_pending(&pm_priv->subflow_work)) {
15601 ++ sock_hold(meta_sk);
15602 ++ queue_work(mptcp_wq, &pm_priv->subflow_work);
15603 ++ }
15604 ++}
15605 ++
15606 ++static int ndiffports_get_local_id(sa_family_t family, union inet_addr *addr,
15607 ++ struct net *net, bool *low_prio)
15608 ++{
15609 ++ return 0;
15610 ++}
15611 ++
15612 ++static struct mptcp_pm_ops ndiffports __read_mostly = {
15613 ++ .new_session = ndiffports_new_session,
15614 ++ .fully_established = ndiffports_create_subflows,
15615 ++ .get_local_id = ndiffports_get_local_id,
15616 ++ .name = "ndiffports",
15617 ++ .owner = THIS_MODULE,
15618 ++};
15619 ++
15620 ++/* General initialization of MPTCP_PM */
15621 ++static int __init ndiffports_register(void)
15622 ++{
15623 ++ BUILD_BUG_ON(sizeof(struct ndiffports_priv) > MPTCP_PM_SIZE);
15624 ++
15625 ++ if (mptcp_register_path_manager(&ndiffports))
15626 ++ goto exit;
15627 ++
15628 ++ return 0;
15629 ++
15630 ++exit:
15631 ++ return -1;
15632 ++}
15633 ++
15634 ++static void ndiffports_unregister(void)
15635 ++{
15636 ++ mptcp_unregister_path_manager(&ndiffports);
15637 ++}
15638 ++
15639 ++module_init(ndiffports_register);
15640 ++module_exit(ndiffports_unregister);
15641 ++
15642 ++MODULE_AUTHOR("Christoph Paasch");
15643 ++MODULE_LICENSE("GPL");
15644 ++MODULE_DESCRIPTION("NDIFF-PORTS MPTCP");
15645 ++MODULE_VERSION("0.88");
15646 +diff --git a/net/mptcp/mptcp_ofo_queue.c b/net/mptcp/mptcp_ofo_queue.c
15647 +new file mode 100644
15648 +index 000000000000..ec4e98622637
15649 +--- /dev/null
15650 ++++ b/net/mptcp/mptcp_ofo_queue.c
15651 +@@ -0,0 +1,295 @@
15652 ++/*
15653 ++ * MPTCP implementation - Fast algorithm for MPTCP meta-reordering
15654 ++ *
15655 ++ * Initial Design & Implementation:
15656 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
15657 ++ *
15658 ++ * Current Maintainer & Author:
15659 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15660 ++ *
15661 ++ * Additional authors:
15662 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
15663 ++ * Gregory Detal <gregory.detal@×××××××××.be>
15664 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
15665 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
15666 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
15667 ++ * Andreas Ripke <ripke@××××××.eu>
15668 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
15669 ++ * Octavian Purdila <octavian.purdila@×××××.com>
15670 ++ * John Ronan <jronan@××××.org>
15671 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
15672 ++ * Brandon Heller <brandonh@××××××××.edu>
15673 ++ *
15674 ++ * This program is free software; you can redistribute it and/or
15675 ++ * modify it under the terms of the GNU General Public License
15676 ++ * as published by the Free Software Foundation; either version
15677 ++ * 2 of the License, or (at your option) any later version.
15678 ++ */
15679 ++
15680 ++#include <linux/skbuff.h>
15681 ++#include <linux/slab.h>
15682 ++#include <net/tcp.h>
15683 ++#include <net/mptcp.h>
15684 ++
15685 ++void mptcp_remove_shortcuts(const struct mptcp_cb *mpcb,
15686 ++ const struct sk_buff *skb)
15687 ++{
15688 ++ struct tcp_sock *tp;
15689 ++
15690 ++ mptcp_for_each_tp(mpcb, tp) {
15691 ++ if (tp->mptcp->shortcut_ofoqueue == skb) {
15692 ++ tp->mptcp->shortcut_ofoqueue = NULL;
15693 ++ return;
15694 ++ }
15695 ++ }
15696 ++}
15697 ++
15698 ++/* Does 'skb' fit after 'here' in the queue 'head'? If yes, we queue it
15699 ++ * and return 1 (or free it and return -1 if it was coalesced into 'here').
15700 ++ */
15701 ++static int mptcp_ofo_queue_after(struct sk_buff_head *head,
15702 ++ struct sk_buff *skb, struct sk_buff *here,
15703 ++ const struct tcp_sock *tp)
15704 ++{
15705 ++ struct sock *meta_sk = tp->meta_sk;
15706 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15707 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15708 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15709 ++
15710 ++ /* We want to queue skb after here, thus seq >= end_seq */
15711 ++ if (before(seq, TCP_SKB_CB(here)->end_seq))
15712 ++ return 0;
15713 ++
15714 ++ if (seq == TCP_SKB_CB(here)->end_seq) {
15715 ++ bool fragstolen = false;
15716 ++
15717 ++ if (!tcp_try_coalesce(meta_sk, here, skb, &fragstolen)) {
15718 ++ __skb_queue_after(&meta_tp->out_of_order_queue, here, skb);
15719 ++ return 1;
15720 ++ } else {
15721 ++ kfree_skb_partial(skb, fragstolen);
15722 ++ return -1;
15723 ++ }
15724 ++ }
15725 ++
15726 ++ /* If here is the last one, we can always queue it */
15727 ++ if (skb_queue_is_last(head, here)) {
15728 ++ __skb_queue_after(head, here, skb);
15729 ++ return 1;
15730 ++ } else {
15731 ++ struct sk_buff *skb1 = skb_queue_next(head, here);
15732 ++		/* It's not the last one, but does it fit between 'here' and
15733 ++		 * the one after 'here'? That is, does end_seq <= after_here->seq?
15734 ++		 */
15735 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) {
15736 ++ __skb_queue_after(head, here, skb);
15737 ++ return 1;
15738 ++ }
15739 ++ }
15740 ++
15741 ++ return 0;
15742 ++}
15743 ++
15744 ++static void try_shortcut(struct sk_buff *shortcut, struct sk_buff *skb,
15745 ++ struct sk_buff_head *head, struct tcp_sock *tp)
15746 ++{
15747 ++ struct sock *meta_sk = tp->meta_sk;
15748 ++ struct tcp_sock *tp_it, *meta_tp = tcp_sk(meta_sk);
15749 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
15750 ++ struct sk_buff *skb1, *best_shortcut = NULL;
15751 ++ u32 seq = TCP_SKB_CB(skb)->seq;
15752 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
15753 ++ u32 distance = 0xffffffff;
15754 ++
15755 ++ /* First, check the tp's shortcut */
15756 ++ if (!shortcut) {
15757 ++ if (skb_queue_empty(head)) {
15758 ++ __skb_queue_head(head, skb);
15759 ++ goto end;
15760 ++ }
15761 ++ } else {
15762 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15763 ++		/* Is the tp's shortcut a hit? If yes, we insert. */
15764 ++
15765 ++ if (ret) {
15766 ++ skb = (ret > 0) ? skb : NULL;
15767 ++ goto end;
15768 ++ }
15769 ++ }
15770 ++
15771 ++ /* Check the shortcuts of the other subsockets. */
15772 ++ mptcp_for_each_tp(mpcb, tp_it) {
15773 ++ shortcut = tp_it->mptcp->shortcut_ofoqueue;
15774 ++ /* Can we queue it here? If yes, do so! */
15775 ++ if (shortcut) {
15776 ++ int ret = mptcp_ofo_queue_after(head, skb, shortcut, tp);
15777 ++
15778 ++ if (ret) {
15779 ++ skb = (ret > 0) ? skb : NULL;
15780 ++ goto end;
15781 ++ }
15782 ++ }
15783 ++
15784 ++ /* Could not queue it, check if we are close.
15785 ++ * We are looking for a shortcut, close enough to seq to
15786 ++ * set skb1 prematurely and thus improve the subsequent lookup,
15787 ++ * which tries to find a skb1 so that skb1->seq <= seq.
15788 ++ *
15789 ++ * So, here we only take shortcuts, whose shortcut->seq > seq,
15790 ++ * and minimize the distance between shortcut->seq and seq and
15791 ++ * set best_shortcut to this one with the minimal distance.
15792 ++ *
15793 ++ * That way, the subsequent while-loop is shortest.
15794 ++ */
15795 ++ if (shortcut && after(TCP_SKB_CB(shortcut)->seq, seq)) {
15796 ++ /* Are we closer than the current best shortcut? */
15797 ++ if ((u32)(TCP_SKB_CB(shortcut)->seq - seq) < distance) {
15798 ++ distance = (u32)(TCP_SKB_CB(shortcut)->seq - seq);
15799 ++ best_shortcut = shortcut;
15800 ++ }
15801 ++ }
15802 ++ }
15803 ++
15804 ++ if (best_shortcut)
15805 ++ skb1 = best_shortcut;
15806 ++ else
15807 ++ skb1 = skb_peek_tail(head);
15808 ++
15809 ++ if (seq == TCP_SKB_CB(skb1)->end_seq) {
15810 ++ bool fragstolen = false;
15811 ++
15812 ++ if (!tcp_try_coalesce(meta_sk, skb1, skb, &fragstolen)) {
15813 ++ __skb_queue_after(&meta_tp->out_of_order_queue, skb1, skb);
15814 ++ } else {
15815 ++ kfree_skb_partial(skb, fragstolen);
15816 ++ skb = NULL;
15817 ++ }
15818 ++
15819 ++ goto end;
15820 ++ }
15821 ++
15822 ++ /* Find the insertion point, starting from best_shortcut if available.
15823 ++ *
15824 ++	 * Inspired by tcp_data_queue_ofo.
15825 ++ */
15826 ++ while (1) {
15827 ++ /* skb1->seq <= seq */
15828 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
15829 ++ break;
15830 ++ if (skb_queue_is_first(head, skb1)) {
15831 ++ skb1 = NULL;
15832 ++ break;
15833 ++ }
15834 ++ skb1 = skb_queue_prev(head, skb1);
15835 ++ }
15836 ++
15837 ++	/* Does skb overlap the previous one? */
15838 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
15839 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
15840 ++ /* All the bits are present. */
15841 ++ __kfree_skb(skb);
15842 ++ skb = NULL;
15843 ++ goto end;
15844 ++ }
15845 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
15846 ++ if (skb_queue_is_first(head, skb1))
15847 ++ skb1 = NULL;
15848 ++ else
15849 ++ skb1 = skb_queue_prev(head, skb1);
15850 ++ }
15851 ++ }
15852 ++ if (!skb1)
15853 ++ __skb_queue_head(head, skb);
15854 ++ else
15855 ++ __skb_queue_after(head, skb1, skb);
15856 ++
15857 ++ /* And clean segments covered by new one as whole. */
15858 ++ while (!skb_queue_is_last(head, skb)) {
15859 ++ skb1 = skb_queue_next(head, skb);
15860 ++
15861 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
15862 ++ break;
15863 ++
15864 ++ __skb_unlink(skb1, head);
15865 ++ mptcp_remove_shortcuts(mpcb, skb1);
15866 ++ __kfree_skb(skb1);
15867 ++ }
15868 ++
15869 ++end:
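++	/* Remember where the skb was queued: it becomes this subflow's shortcut
++	 * for the next insertion. skb is NULL if it was coalesced or dropped above.
++	 */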
15870 ++ if (skb) {
15871 ++ skb_set_owner_r(skb, meta_sk);
15872 ++ tp->mptcp->shortcut_ofoqueue = skb;
15873 ++ }
15874 ++
15875 ++ return;
15876 ++}
15877 ++
15878 ++/**
15879 ++ * @sk: the subflow that received this skb.
15880 ++ */
15881 ++void mptcp_add_meta_ofo_queue(const struct sock *meta_sk, struct sk_buff *skb,
15882 ++ struct sock *sk)
15883 ++{
15884 ++ struct tcp_sock *tp = tcp_sk(sk);
15885 ++
15886 ++ try_shortcut(tp->mptcp->shortcut_ofoqueue, skb,
15887 ++ &tcp_sk(meta_sk)->out_of_order_queue, tp);
15888 ++}
15889 ++
15890 ++bool mptcp_prune_ofo_queue(struct sock *sk)
15891 ++{
15892 ++ struct tcp_sock *tp = tcp_sk(sk);
15893 ++ bool res = false;
15894 ++
15895 ++ if (!skb_queue_empty(&tp->out_of_order_queue)) {
15896 ++ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
15897 ++ mptcp_purge_ofo_queue(tp);
15898 ++
15899 ++ /* No sack at the mptcp-level */
15900 ++ sk_mem_reclaim(sk);
15901 ++ res = true;
15902 ++ }
15903 ++
15904 ++ return res;
15905 ++}
15906 ++
15907 ++void mptcp_ofo_queue(struct sock *meta_sk)
15908 ++{
15909 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
15910 ++ struct sk_buff *skb;
15911 ++
15912 ++ while ((skb = skb_peek(&meta_tp->out_of_order_queue)) != NULL) {
15913 ++ u32 old_rcv_nxt = meta_tp->rcv_nxt;
15914 ++ if (after(TCP_SKB_CB(skb)->seq, meta_tp->rcv_nxt))
15915 ++ break;
15916 ++
15917 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->rcv_nxt)) {
15918 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15919 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15920 ++ __kfree_skb(skb);
15921 ++ continue;
15922 ++ }
15923 ++
15924 ++ __skb_unlink(skb, &meta_tp->out_of_order_queue);
15925 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15926 ++
15927 ++ __skb_queue_tail(&meta_sk->sk_receive_queue, skb);
15928 ++ meta_tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
15929 ++ mptcp_check_rcvseq_wrap(meta_tp, old_rcv_nxt);
15930 ++
15931 ++ if (tcp_hdr(skb)->fin)
15932 ++ mptcp_fin(meta_sk);
15933 ++ }
15934 ++}
15935 ++
15936 ++void mptcp_purge_ofo_queue(struct tcp_sock *meta_tp)
15937 ++{
15938 ++ struct sk_buff_head *head = &meta_tp->out_of_order_queue;
15939 ++ struct sk_buff *skb, *tmp;
15940 ++
15941 ++ skb_queue_walk_safe(head, skb, tmp) {
15942 ++ __skb_unlink(skb, head);
15943 ++ mptcp_remove_shortcuts(meta_tp->mpcb, skb);
15944 ++ kfree_skb(skb);
15945 ++ }
15946 ++}
15947 +diff --git a/net/mptcp/mptcp_olia.c b/net/mptcp/mptcp_olia.c
15948 +new file mode 100644
15949 +index 000000000000..53f5c43bb488
15950 +--- /dev/null
15951 ++++ b/net/mptcp/mptcp_olia.c
15952 +@@ -0,0 +1,311 @@
15953 ++/*
15954 ++ * MPTCP implementation - OPPORTUNISTIC LINKED INCREASES CONGESTION CONTROL:
15955 ++ *
15956 ++ * Algorithm design:
15957 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15958 ++ * Nicolas Gast <nicolas.gast@××××.ch>
15959 ++ * Jean-Yves Le Boudec <jean-yves.leboudec@××××.ch>
15960 ++ *
15961 ++ * Implementation:
15962 ++ * Ramin Khalili <ramin.khalili@××××.ch>
15963 ++ *
15964 ++ * Ported to the official MPTCP-kernel:
15965 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
15966 ++ *
15967 ++ * This program is free software; you can redistribute it and/or
15968 ++ * modify it under the terms of the GNU General Public License
15969 ++ * as published by the Free Software Foundation; either version
15970 ++ * 2 of the License, or (at your option) any later version.
15971 ++ */
15972 ++
15973 ++
15974 ++#include <net/tcp.h>
15975 ++#include <net/mptcp.h>
15976 ++
15977 ++#include <linux/module.h>
15978 ++
15979 ++static int scale = 10;
15980 ++
15981 ++struct mptcp_olia {
15982 ++ u32 mptcp_loss1;
15983 ++ u32 mptcp_loss2;
15984 ++ u32 mptcp_loss3;
15985 ++ int epsilon_num;
15986 ++ u32 epsilon_den;
15987 ++ int mptcp_snd_cwnd_cnt;
15988 ++};
15989 ++
15990 ++static inline int mptcp_olia_sk_can_send(const struct sock *sk)
15991 ++{
15992 ++ return mptcp_sk_can_send(sk) && tcp_sk(sk)->srtt_us;
15993 ++}
15994 ++
15995 ++static inline u64 mptcp_olia_scale(u64 val, int scale)
15996 ++{
15997 ++ return (u64) val << scale;
15998 ++}
15999 ++
16000 ++/* Take care of the artificial inflation of cwnd (see RFC 5681)
16001 ++ * during the fast-retransmit phase.
16002 ++ */
16003 ++static u32 mptcp_get_crt_cwnd(struct sock *sk)
16004 ++{
16005 ++ const struct inet_connection_sock *icsk = inet_csk(sk);
16006 ++
16007 ++ if (icsk->icsk_ca_state == TCP_CA_Recovery)
16008 ++ return tcp_sk(sk)->snd_ssthresh;
16009 ++ else
16010 ++ return tcp_sk(sk)->snd_cwnd;
16011 ++}
16012 ++
16013 ++/* Return the denominator of the first term of the increase term. */
16014 ++static u64 mptcp_get_rate(const struct mptcp_cb *mpcb , u32 path_rtt)
16015 ++{
16016 ++ struct sock *sk;
16017 ++ u64 rate = 1; /* We have to avoid a zero-rate because it is used as a divisor */
16018 ++
16019 ++ mptcp_for_each_sk(mpcb, sk) {
16020 ++ struct tcp_sock *tp = tcp_sk(sk);
16021 ++ u64 scaled_num;
16022 ++ u32 tmp_cwnd;
16023 ++
16024 ++ if (!mptcp_olia_sk_can_send(sk))
16025 ++ continue;
16026 ++
16027 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16028 ++ scaled_num = mptcp_olia_scale(tmp_cwnd, scale) * path_rtt;
16029 ++ rate += div_u64(scaled_num , tp->srtt_us);
16030 ++ }
16031 ++ rate *= rate;
16032 ++ return rate;
16033 ++}
16034 ++
16035 ++/* find the maximum cwnd, used to find set M */
16036 ++static u32 mptcp_get_max_cwnd(const struct mptcp_cb *mpcb)
16037 ++{
16038 ++ struct sock *sk;
16039 ++ u32 best_cwnd = 0;
16040 ++
16041 ++ mptcp_for_each_sk(mpcb, sk) {
16042 ++ u32 tmp_cwnd;
16043 ++
16044 ++ if (!mptcp_olia_sk_can_send(sk))
16045 ++ continue;
16046 ++
16047 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16048 ++ if (tmp_cwnd > best_cwnd)
16049 ++ best_cwnd = tmp_cwnd;
16050 ++ }
16051 ++ return best_cwnd;
16052 ++}
16053 ++
16054 ++static void mptcp_get_epsilon(const struct mptcp_cb *mpcb)
16055 ++{
16056 ++ struct mptcp_olia *ca;
16057 ++ struct tcp_sock *tp;
16058 ++ struct sock *sk;
16059 ++ u64 tmp_int, tmp_rtt, best_int = 0, best_rtt = 1;
16060 ++ u32 max_cwnd = 1, best_cwnd = 1, tmp_cwnd;
16061 ++ u8 M = 0, B_not_M = 0;
16062 ++
16063 ++ /* TODO - integrate this in the following loop - we just want to iterate once */
16064 ++
16065 ++ max_cwnd = mptcp_get_max_cwnd(mpcb);
16066 ++
16067 ++ /* find the best path */
16068 ++ mptcp_for_each_sk(mpcb, sk) {
16069 ++ tp = tcp_sk(sk);
16070 ++ ca = inet_csk_ca(sk);
16071 ++
16072 ++ if (!mptcp_olia_sk_can_send(sk))
16073 ++ continue;
16074 ++
16075 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16076 ++ /* TODO - check here and rename variables */
16077 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16078 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16079 ++
16080 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16081 ++ if ((u64)tmp_int * best_rtt >= (u64)best_int * tmp_rtt) {
16082 ++ best_rtt = tmp_rtt;
16083 ++ best_int = tmp_int;
16084 ++ best_cwnd = tmp_cwnd;
16085 ++ }
16086 ++ }
16087 ++
16088 ++ /* TODO - integrate this here in mptcp_get_max_cwnd and in the previous loop */
16089 ++ /* find the size of M and B_not_M */
16090 ++ mptcp_for_each_sk(mpcb, sk) {
16091 ++ tp = tcp_sk(sk);
16092 ++ ca = inet_csk_ca(sk);
16093 ++
16094 ++ if (!mptcp_olia_sk_can_send(sk))
16095 ++ continue;
16096 ++
16097 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16098 ++ if (tmp_cwnd == max_cwnd) {
16099 ++ M++;
16100 ++ } else {
16101 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16102 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16103 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16104 ++
16105 ++ if ((u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt)
16106 ++ B_not_M++;
16107 ++ }
16108 ++ }
16109 ++
16110 ++ /* check if the path is in M or B_not_M and set the value of epsilon accordingly */
16111 ++ mptcp_for_each_sk(mpcb, sk) {
16112 ++ tp = tcp_sk(sk);
16113 ++ ca = inet_csk_ca(sk);
16114 ++
16115 ++ if (!mptcp_olia_sk_can_send(sk))
16116 ++ continue;
16117 ++
16118 ++ if (B_not_M == 0) {
16119 ++ ca->epsilon_num = 0;
16120 ++ ca->epsilon_den = 1;
16121 ++ } else {
16122 ++ tmp_rtt = (u64)tp->srtt_us * tp->srtt_us;
16123 ++ tmp_int = max(ca->mptcp_loss3 - ca->mptcp_loss2,
16124 ++ ca->mptcp_loss2 - ca->mptcp_loss1);
16125 ++ tmp_cwnd = mptcp_get_crt_cwnd(sk);
16126 ++
16127 ++ if (tmp_cwnd < max_cwnd &&
16128 ++ (u64)tmp_int * best_rtt == (u64)best_int * tmp_rtt) {
16129 ++ ca->epsilon_num = 1;
16130 ++ ca->epsilon_den = mpcb->cnt_established * B_not_M;
16131 ++ } else if (tmp_cwnd == max_cwnd) {
16132 ++ ca->epsilon_num = -1;
16133 ++ ca->epsilon_den = mpcb->cnt_established * M;
16134 ++ } else {
16135 ++ ca->epsilon_num = 0;
16136 ++ ca->epsilon_den = 1;
16137 ++ }
16138 ++ }
16139 ++ }
16140 ++}
16141 ++
16142 ++/* setting the initial values */
16143 ++static void mptcp_olia_init(struct sock *sk)
16144 ++{
16145 ++ const struct tcp_sock *tp = tcp_sk(sk);
16146 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16147 ++
16148 ++ if (mptcp(tp)) {
16149 ++ ca->mptcp_loss1 = tp->snd_una;
16150 ++ ca->mptcp_loss2 = tp->snd_una;
16151 ++ ca->mptcp_loss3 = tp->snd_una;
16152 ++ ca->mptcp_snd_cwnd_cnt = 0;
16153 ++ ca->epsilon_num = 0;
16154 ++ ca->epsilon_den = 1;
16155 ++ }
16156 ++}
16157 ++
16158 ++/* updating inter-loss distance and ssthresh */
16159 ++static void mptcp_olia_set_state(struct sock *sk, u8 new_state)
16160 ++{
16161 ++ if (!mptcp(tcp_sk(sk)))
16162 ++ return;
16163 ++
16164 ++ if (new_state == TCP_CA_Loss ||
16165 ++ new_state == TCP_CA_Recovery || new_state == TCP_CA_CWR) {
16166 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16167 ++
16168 ++ if (ca->mptcp_loss3 != ca->mptcp_loss2 &&
16169 ++ !inet_csk(sk)->icsk_retransmits) {
16170 ++ ca->mptcp_loss1 = ca->mptcp_loss2;
16171 ++ ca->mptcp_loss2 = ca->mptcp_loss3;
16172 ++ }
16173 ++ }
16174 ++}
16175 ++
16176 ++/* main algorithm */
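++/* Per ACK on subflow r, with w_p the cwnd and rtt_p the smoothed RTT of each
++ * established subflow p, the computation below effectively amounts to
++ *
++ *	w_r += (w_r / rtt_r^2) / (sum_p w_p / rtt_p)^2  +  epsilon_r / w_r
++ *
++ * where epsilon_r is 1/(N * |B\M|) for the best paths that do not have the
++ * maximal window, -1/(N * |M|) for the paths with the maximal window, and 0
++ * otherwise (N = number of established subflows; all epsilons are 0 when
++ * B\M is empty).
++ */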
16177 ++static void mptcp_olia_cong_avoid(struct sock *sk, u32 ack, u32 acked)
16178 ++{
16179 ++ struct tcp_sock *tp = tcp_sk(sk);
16180 ++ struct mptcp_olia *ca = inet_csk_ca(sk);
16181 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16182 ++
16183 ++ u64 inc_num, inc_den, rate, cwnd_scaled;
16184 ++
16185 ++ if (!mptcp(tp)) {
16186 ++ tcp_reno_cong_avoid(sk, ack, acked);
16187 ++ return;
16188 ++ }
16189 ++
16190 ++ ca->mptcp_loss3 = tp->snd_una;
16191 ++
16192 ++ if (!tcp_is_cwnd_limited(sk))
16193 ++ return;
16194 ++
16195 ++ /* slow start if it is in the safe area */
16196 ++ if (tp->snd_cwnd <= tp->snd_ssthresh) {
16197 ++ tcp_slow_start(tp, acked);
16198 ++ return;
16199 ++ }
16200 ++
16201 ++ mptcp_get_epsilon(mpcb);
16202 ++ rate = mptcp_get_rate(mpcb, tp->srtt_us);
16203 ++ cwnd_scaled = mptcp_olia_scale(tp->snd_cwnd, scale);
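++	/* The '?:' keeps inc_den at least 1, so the divisions below never see a
++	 * zero denominator.
++	 */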
16204 ++ inc_den = ca->epsilon_den * tp->snd_cwnd * rate ? : 1;
16205 ++
16206 ++	/* Calculate the increase term; scaling is used to reduce the rounding effect. */
16207 ++ if (ca->epsilon_num == -1) {
16208 ++ if (ca->epsilon_den * cwnd_scaled * cwnd_scaled < rate) {
16209 ++ inc_num = rate - ca->epsilon_den *
16210 ++ cwnd_scaled * cwnd_scaled;
16211 ++ ca->mptcp_snd_cwnd_cnt -= div64_u64(
16212 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16213 ++ } else {
16214 ++ inc_num = ca->epsilon_den *
16215 ++ cwnd_scaled * cwnd_scaled - rate;
16216 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16217 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16218 ++ }
16219 ++ } else {
16220 ++ inc_num = ca->epsilon_num * rate +
16221 ++ ca->epsilon_den * cwnd_scaled * cwnd_scaled;
16222 ++ ca->mptcp_snd_cwnd_cnt += div64_u64(
16223 ++ mptcp_olia_scale(inc_num , scale) , inc_den);
16224 ++ }
16225 ++
16226 ++
16227 ++ if (ca->mptcp_snd_cwnd_cnt >= (1 << scale) - 1) {
16228 ++ if (tp->snd_cwnd < tp->snd_cwnd_clamp)
16229 ++ tp->snd_cwnd++;
16230 ++ ca->mptcp_snd_cwnd_cnt = 0;
16231 ++ } else if (ca->mptcp_snd_cwnd_cnt <= 0 - (1 << scale) + 1) {
16232 ++ tp->snd_cwnd = max((int) 1 , (int) tp->snd_cwnd - 1);
16233 ++ ca->mptcp_snd_cwnd_cnt = 0;
16234 ++ }
16235 ++}
16236 ++
16237 ++static struct tcp_congestion_ops mptcp_olia = {
16238 ++ .init = mptcp_olia_init,
16239 ++ .ssthresh = tcp_reno_ssthresh,
16240 ++ .cong_avoid = mptcp_olia_cong_avoid,
16241 ++ .set_state = mptcp_olia_set_state,
16242 ++ .owner = THIS_MODULE,
16243 ++ .name = "olia",
16244 ++};
16245 ++
16246 ++static int __init mptcp_olia_register(void)
16247 ++{
16248 ++ BUILD_BUG_ON(sizeof(struct mptcp_olia) > ICSK_CA_PRIV_SIZE);
16249 ++ return tcp_register_congestion_control(&mptcp_olia);
16250 ++}
16251 ++
16252 ++static void __exit mptcp_olia_unregister(void)
16253 ++{
16254 ++ tcp_unregister_congestion_control(&mptcp_olia);
16255 ++}
16256 ++
16257 ++module_init(mptcp_olia_register);
16258 ++module_exit(mptcp_olia_unregister);
16259 ++
16260 ++MODULE_AUTHOR("Ramin Khalili, Nicolas Gast, Jean-Yves Le Boudec");
16261 ++MODULE_LICENSE("GPL");
16262 ++MODULE_DESCRIPTION("MPTCP COUPLED CONGESTION CONTROL");
16263 ++MODULE_VERSION("0.1");
16264 +diff --git a/net/mptcp/mptcp_output.c b/net/mptcp/mptcp_output.c
16265 +new file mode 100644
16266 +index 000000000000..400ea254c078
16267 +--- /dev/null
16268 ++++ b/net/mptcp/mptcp_output.c
16269 +@@ -0,0 +1,1743 @@
16270 ++/*
16271 ++ * MPTCP implementation - Sending side
16272 ++ *
16273 ++ * Initial Design & Implementation:
16274 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
16275 ++ *
16276 ++ * Current Maintainer & Author:
16277 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
16278 ++ *
16279 ++ * Additional authors:
16280 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
16281 ++ * Gregory Detal <gregory.detal@×××××××××.be>
16282 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
16283 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
16284 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
16285 ++ * Andreas Ripke <ripke@××××××.eu>
16286 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
16287 ++ * Octavian Purdila <octavian.purdila@×××××.com>
16288 ++ * John Ronan <jronan@××××.org>
16289 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
16290 ++ * Brandon Heller <brandonh@××××××××.edu>
16291 ++ *
16292 ++ *
16293 ++ * This program is free software; you can redistribute it and/or
16294 ++ * modify it under the terms of the GNU General Public License
16295 ++ * as published by the Free Software Foundation; either version
16296 ++ * 2 of the License, or (at your option) any later version.
16297 ++ */
16298 ++
16299 ++#include <linux/kconfig.h>
16300 ++#include <linux/skbuff.h>
16301 ++#include <linux/tcp.h>
16302 ++
16303 ++#include <net/mptcp.h>
16304 ++#include <net/mptcp_v4.h>
16305 ++#include <net/mptcp_v6.h>
16306 ++#include <net/sock.h>
16307 ++
16308 ++static const int mptcp_dss_len = MPTCP_SUB_LEN_DSS_ALIGN +
16309 ++ MPTCP_SUB_LEN_ACK_ALIGN +
16310 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
16311 ++
16312 ++static inline int mptcp_sub_len_remove_addr(u16 bitfield)
16313 ++{
16314 ++ unsigned int c;
16315 ++ for (c = 0; bitfield; c++)
16316 ++ bitfield &= bitfield - 1;
16317 ++ return MPTCP_SUB_LEN_REMOVE_ADDR + c - 1;
16318 ++}
16319 ++
16320 ++int mptcp_sub_len_remove_addr_align(u16 bitfield)
16321 ++{
16322 ++ return ALIGN(mptcp_sub_len_remove_addr(bitfield), 4);
16323 ++}
16324 ++EXPORT_SYMBOL(mptcp_sub_len_remove_addr_align);
16325 ++
16326 ++/* get the data-seq and end-data-seq and store them again in the
16327 ++ * tcp_skb_cb
16328 ++ */
16329 ++static int mptcp_reconstruct_mapping(struct sk_buff *skb)
16330 ++{
16331 ++ const struct mp_dss *mpdss = (struct mp_dss *)TCP_SKB_CB(skb)->dss;
16332 ++ u32 *p32;
16333 ++ u16 *p16;
16334 ++
16335 ++ if (!mpdss->M)
16336 ++ return 1;
16337 ++
16338 ++ /* Move the pointer to the data-seq */
16339 ++ p32 = (u32 *)mpdss;
16340 ++ p32++;
16341 ++ if (mpdss->A) {
16342 ++ p32++;
16343 ++ if (mpdss->a)
16344 ++ p32++;
16345 ++ }
16346 ++
16347 ++ TCP_SKB_CB(skb)->seq = ntohl(*p32);
16348 ++
16349 ++ /* Get the data_len to calculate the end_data_seq */
16350 ++ p32++;
16351 ++ p32++;
16352 ++ p16 = (u16 *)p32;
16353 ++ TCP_SKB_CB(skb)->end_seq = ntohs(*p16) + TCP_SKB_CB(skb)->seq;
16354 ++
16355 ++ return 0;
16356 ++}
16357 ++
16358 ++static void mptcp_find_and_set_pathmask(const struct sock *meta_sk, struct sk_buff *skb)
16359 ++{
16360 ++ struct sk_buff *skb_it;
16361 ++
16362 ++ skb_it = tcp_write_queue_head(meta_sk);
16363 ++
16364 ++ tcp_for_write_queue_from(skb_it, meta_sk) {
16365 ++ if (skb_it == tcp_send_head(meta_sk))
16366 ++ break;
16367 ++
16368 ++ if (TCP_SKB_CB(skb_it)->seq == TCP_SKB_CB(skb)->seq) {
16369 ++ TCP_SKB_CB(skb)->path_mask = TCP_SKB_CB(skb_it)->path_mask;
16370 ++ break;
16371 ++ }
16372 ++ }
16373 ++}
16374 ++
16375 ++/* Reinject data from one TCP subflow to the meta_sk. If sk == NULL, we are
16376 ++ * coming from the meta-retransmit-timer
16377 ++ */
16378 ++static void __mptcp_reinject_data(struct sk_buff *orig_skb, struct sock *meta_sk,
16379 ++ struct sock *sk, int clone_it)
16380 ++{
16381 ++ struct sk_buff *skb, *skb1;
16382 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16383 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16384 ++ u32 seq, end_seq;
16385 ++
16386 ++ if (clone_it) {
16387 ++		/* pskb_copy is necessary here, because the TCP/IP headers
16388 ++		 * will be changed when the skb is reinjected on another
16389 ++		 * subflow.
16390 ++ */
16391 ++ skb = pskb_copy_for_clone(orig_skb, GFP_ATOMIC);
16392 ++ } else {
16393 ++ __skb_unlink(orig_skb, &sk->sk_write_queue);
16394 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
16395 ++ sk->sk_wmem_queued -= orig_skb->truesize;
16396 ++ sk_mem_uncharge(sk, orig_skb->truesize);
16397 ++ skb = orig_skb;
16398 ++ }
16399 ++ if (unlikely(!skb))
16400 ++ return;
16401 ++
16402 ++ if (sk && mptcp_reconstruct_mapping(skb)) {
16403 ++ __kfree_skb(skb);
16404 ++ return;
16405 ++ }
16406 ++
16407 ++ skb->sk = meta_sk;
16408 ++
16409 ++	/* If it has already reached the destination, we don't have to reinject it */
16410 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16411 ++ __kfree_skb(skb);
16412 ++ return;
16413 ++ }
16414 ++
16415 ++ /* Only reinject segments that are fully covered by the mapping */
16416 ++ if (skb->len + (mptcp_is_data_fin(skb) ? 1 : 0) !=
16417 ++ TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq) {
16418 ++ u32 seq = TCP_SKB_CB(skb)->seq;
16419 ++ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
16420 ++
16421 ++ __kfree_skb(skb);
16422 ++
16423 ++ /* Ok, now we have to look for the full mapping in the meta
16424 ++ * send-queue :S
16425 ++ */
16426 ++ tcp_for_write_queue(skb, meta_sk) {
16427 ++ /* Not yet at the mapping? */
16428 ++ if (before(TCP_SKB_CB(skb)->seq, seq))
16429 ++ continue;
16430 ++ /* We have passed by the mapping */
16431 ++ if (after(TCP_SKB_CB(skb)->end_seq, end_seq))
16432 ++ return;
16433 ++
16434 ++ __mptcp_reinject_data(skb, meta_sk, NULL, 1);
16435 ++ }
16436 ++ return;
16437 ++ }
16438 ++
16439 ++ /* Segment goes back to the MPTCP-layer. So, we need to zero the
16440 ++ * path_mask/dss.
16441 ++ */
16442 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16443 ++
16444 ++ /* We need to find out the path-mask from the meta-write-queue
16445 ++ * to properly select a subflow.
16446 ++ */
16447 ++ mptcp_find_and_set_pathmask(meta_sk, skb);
16448 ++
16449 ++ /* If it's empty, just add */
16450 ++ if (skb_queue_empty(&mpcb->reinject_queue)) {
16451 ++ skb_queue_head(&mpcb->reinject_queue, skb);
16452 ++ return;
16453 ++ }
16454 ++
16455 ++	/* Find the place to insert the skb - or even 'drop' it, if the
16456 ++	 * data is already covered by other skb's in the reinject-queue.
16457 ++ *
16458 ++ * This is inspired by code from tcp_data_queue.
16459 ++ */
16460 ++
16461 ++ skb1 = skb_peek_tail(&mpcb->reinject_queue);
16462 ++ seq = TCP_SKB_CB(skb)->seq;
16463 ++ while (1) {
16464 ++ if (!after(TCP_SKB_CB(skb1)->seq, seq))
16465 ++ break;
16466 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1)) {
16467 ++ skb1 = NULL;
16468 ++ break;
16469 ++ }
16470 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16471 ++ }
16472 ++
16473 ++	/* Does the skb overlap the previous one? */
16474 ++ end_seq = TCP_SKB_CB(skb)->end_seq;
16475 ++ if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
16476 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
16477 ++ /* All the bits are present. Don't reinject */
16478 ++ __kfree_skb(skb);
16479 ++ return;
16480 ++ }
16481 ++ if (seq == TCP_SKB_CB(skb1)->seq) {
16482 ++ if (skb_queue_is_first(&mpcb->reinject_queue, skb1))
16483 ++ skb1 = NULL;
16484 ++ else
16485 ++ skb1 = skb_queue_prev(&mpcb->reinject_queue, skb1);
16486 ++ }
16487 ++ }
16488 ++ if (!skb1)
16489 ++ __skb_queue_head(&mpcb->reinject_queue, skb);
16490 ++ else
16491 ++ __skb_queue_after(&mpcb->reinject_queue, skb1, skb);
16492 ++
16493 ++	/* And remove segments that are fully covered by the new one. */
16494 ++ while (!skb_queue_is_last(&mpcb->reinject_queue, skb)) {
16495 ++ skb1 = skb_queue_next(&mpcb->reinject_queue, skb);
16496 ++
16497 ++ if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
16498 ++ break;
16499 ++
16500 ++ __skb_unlink(skb1, &mpcb->reinject_queue);
16501 ++ __kfree_skb(skb1);
16502 ++ }
16503 ++ return;
16504 ++}
16505 ++
16506 ++/* Inserts data into the reinject queue */
16507 ++void mptcp_reinject_data(struct sock *sk, int clone_it)
16508 ++{
16509 ++ struct sk_buff *skb_it, *tmp;
16510 ++ struct tcp_sock *tp = tcp_sk(sk);
16511 ++ struct sock *meta_sk = tp->meta_sk;
16512 ++
16513 ++ /* It has already been closed - there is really no point in reinjecting */
16514 ++ if (meta_sk->sk_state == TCP_CLOSE)
16515 ++ return;
16516 ++
16517 ++ skb_queue_walk_safe(&sk->sk_write_queue, skb_it, tmp) {
16518 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb_it);
16519 ++		/* Subflow syn's and fin's are not reinjected.
16520 ++		 *
16521 ++		 * Neither are empty subflow-fins that carry a data-fin;
16522 ++		 * they are reinjected below (without the subflow-fin flag).
16523 ++ */
16524 ++ if (tcb->tcp_flags & TCPHDR_SYN ||
16525 ++ (tcb->tcp_flags & TCPHDR_FIN && !mptcp_is_data_fin(skb_it)) ||
16526 ++ (tcb->tcp_flags & TCPHDR_FIN && mptcp_is_data_fin(skb_it) && !skb_it->len))
16527 ++ continue;
16528 ++
16529 ++ __mptcp_reinject_data(skb_it, meta_sk, sk, clone_it);
16530 ++ }
16531 ++
16532 ++ skb_it = tcp_write_queue_tail(meta_sk);
16533 ++ /* If sk has sent the empty data-fin, we have to reinject it too. */
16534 ++ if (skb_it && mptcp_is_data_fin(skb_it) && skb_it->len == 0 &&
16535 ++ TCP_SKB_CB(skb_it)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index)) {
16536 ++ __mptcp_reinject_data(skb_it, meta_sk, NULL, 1);
16537 ++ }
16538 ++
16539 ++ mptcp_push_pending_frames(meta_sk);
16540 ++
16541 ++ tp->pf = 1;
16542 ++}
16543 ++EXPORT_SYMBOL(mptcp_reinject_data);
16544 ++
16545 ++static void mptcp_combine_dfin(const struct sk_buff *skb, const struct sock *meta_sk,
16546 ++ struct sock *subsk)
16547 ++{
16548 ++ const struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16549 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16550 ++ struct sock *sk_it;
16551 ++ int all_empty = 1, all_acked;
16552 ++
16553 ++ /* In infinite mapping we always try to combine */
16554 ++ if (mpcb->infinite_mapping_snd && tcp_close_state(subsk)) {
16555 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16556 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16557 ++ return;
16558 ++ }
16559 ++
16560 ++ /* Don't combine, if they didn't combine - otherwise we end up in
16561 ++ * TIME_WAIT, even if our app is smart enough to avoid it
16562 ++ */
16563 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN) {
16564 ++ if (!mpcb->dfin_combined)
16565 ++ return;
16566 ++ }
16567 ++
16568 ++ /* If no other subflow has data to send, we can combine */
16569 ++ mptcp_for_each_sk(mpcb, sk_it) {
16570 ++ if (!mptcp_sk_can_send(sk_it))
16571 ++ continue;
16572 ++
16573 ++ if (!tcp_write_queue_empty(sk_it))
16574 ++ all_empty = 0;
16575 ++ }
16576 ++
16577 ++ /* If all data has been DATA_ACKed, we can combine.
16578 ++ * -1, because the data_fin consumed one byte
16579 ++ */
16580 ++ all_acked = (meta_tp->snd_una == (meta_tp->write_seq - 1));
16581 ++
16582 ++ if ((all_empty || all_acked) && tcp_close_state(subsk)) {
16583 ++ subsk->sk_shutdown |= SEND_SHUTDOWN;
16584 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
16585 ++ }
16586 ++}
16587 ++
16588 ++static int mptcp_write_dss_mapping(const struct tcp_sock *tp, const struct sk_buff *skb,
16589 ++ __be32 *ptr)
16590 ++{
16591 ++ const struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16592 ++ __be32 *start = ptr;
16593 ++ __u16 data_len;
16594 ++
16595 ++ *ptr++ = htonl(tcb->seq); /* data_seq */
16596 ++
16597 ++ /* If it's a non-data DATA_FIN, we set subseq to 0 (draft v7) */
16598 ++ if (mptcp_is_data_fin(skb) && skb->len == 0)
16599 ++ *ptr++ = 0; /* subseq */
16600 ++ else
16601 ++ *ptr++ = htonl(tp->write_seq - tp->mptcp->snt_isn); /* subseq */
16602 ++
16603 ++ if (tcb->mptcp_flags & MPTCPHDR_INF)
16604 ++ data_len = 0;
16605 ++ else
16606 ++ data_len = tcb->end_seq - tcb->seq;
16607 ++
16608 ++ if (tp->mpcb->dss_csum && data_len) {
16609 ++ __be16 *p16 = (__be16 *)ptr;
16610 ++ __be32 hdseq = mptcp_get_highorder_sndbits(skb, tp->mpcb);
16611 ++ __wsum csum;
16612 ++
16613 ++ *ptr = htonl(((data_len) << 16) |
16614 ++ (TCPOPT_EOL << 8) |
16615 ++ (TCPOPT_EOL));
16616 ++ csum = csum_partial(ptr - 2, 12, skb->csum);
16617 ++ p16++;
16618 ++ *p16++ = csum_fold(csum_partial(&hdseq, sizeof(hdseq), csum));
16619 ++ } else {
16620 ++ *ptr++ = htonl(((data_len) << 16) |
16621 ++ (TCPOPT_NOP << 8) |
16622 ++ (TCPOPT_NOP));
16623 ++ }
16624 ++
16625 ++ return ptr - start;
16626 ++}
16627 ++
16628 ++static int mptcp_write_dss_data_ack(const struct tcp_sock *tp, const struct sk_buff *skb,
16629 ++ __be32 *ptr)
16630 ++{
16631 ++ struct mp_dss *mdss = (struct mp_dss *)ptr;
16632 ++ __be32 *start = ptr;
16633 ++
16634 ++ mdss->kind = TCPOPT_MPTCP;
16635 ++ mdss->sub = MPTCP_SUB_DSS;
16636 ++ mdss->rsv1 = 0;
16637 ++ mdss->rsv2 = 0;
16638 ++ mdss->F = mptcp_is_data_fin(skb) ? 1 : 0;
16639 ++ mdss->m = 0;
16640 ++ mdss->M = mptcp_is_data_seq(skb) ? 1 : 0;
16641 ++ mdss->a = 0;
16642 ++ mdss->A = 1;
16643 ++ mdss->len = mptcp_sub_len_dss(mdss, tp->mpcb->dss_csum);
16644 ++ ptr++;
16645 ++
16646 ++ *ptr++ = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16647 ++
16648 ++ return ptr - start;
16649 ++}
16650 ++
16651 ++/* RFC6824 states that once a particular subflow mapping has been sent
16652 ++ * out it must never be changed. However, packets may be split while
16653 ++ * they are in the retransmission queue (due to SACK or ACKs) and that
16654 ++ * arguably means that we would change the mapping (e.g. it splits it,
16655 ++ * or sends out a subset of the initial mapping).
16656 ++ *
16657 ++ * Furthermore, the skb checksum is not always preserved across splits
16658 ++ * (e.g. mptcp_fragment) which would mean that we need to recompute
16659 ++ * the DSS checksum in this case.
16660 ++ *
16661 ++ * To avoid this we save the initial DSS mapping which allows us to
16662 ++ * send the same DSS mapping even for fragmented retransmits.
16663 ++ */
16664 ++static void mptcp_save_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb)
16665 ++{
16666 ++ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
16667 ++ __be32 *ptr = (__be32 *)tcb->dss;
16668 ++
16669 ++ tcb->mptcp_flags |= MPTCPHDR_SEQ;
16670 ++
16671 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
16672 ++ ptr += mptcp_write_dss_mapping(tp, skb, ptr);
16673 ++}
16674 ++
16675 ++/* Write the saved DSS mapping to the header */
16676 ++static int mptcp_write_dss_data_seq(const struct tcp_sock *tp, struct sk_buff *skb,
16677 ++ __be32 *ptr)
16678 ++{
16679 ++ __be32 *start = ptr;
16680 ++
16681 ++ memcpy(ptr, TCP_SKB_CB(skb)->dss, mptcp_dss_len);
16682 ++
16683 ++ /* update the data_ack */
16684 ++ start[1] = htonl(mptcp_meta_tp(tp)->rcv_nxt);
16685 ++
16686 ++ /* dss is in a union with inet_skb_parm and
16687 ++ * the IP layer expects zeroed IPCB fields.
16688 ++ */
16689 ++ memset(TCP_SKB_CB(skb)->dss, 0 , mptcp_dss_len);
16690 ++
16691 ++ return mptcp_dss_len/sizeof(*ptr);
16692 ++}
16693 ++
16694 ++static bool mptcp_skb_entail(struct sock *sk, struct sk_buff *skb, int reinject)
16695 ++{
16696 ++ struct tcp_sock *tp = tcp_sk(sk);
16697 ++ const struct sock *meta_sk = mptcp_meta_sk(sk);
16698 ++ const struct mptcp_cb *mpcb = tp->mpcb;
16699 ++ struct tcp_skb_cb *tcb;
16700 ++ struct sk_buff *subskb = NULL;
16701 ++
16702 ++ if (!reinject)
16703 ++ TCP_SKB_CB(skb)->mptcp_flags |= (mpcb->snd_hiseq_index ?
16704 ++ MPTCPHDR_SEQ64_INDEX : 0);
16705 ++
16706 ++ subskb = pskb_copy_for_clone(skb, GFP_ATOMIC);
16707 ++ if (!subskb)
16708 ++ return false;
16709 ++
16710 ++	/* At the subflow-level we need to call tcp_init_tso_segs again. We
16711 ++	 * force this by setting gso_segs to 0. It has been set to 1 prior to
16712 ++ * the call to mptcp_skb_entail.
16713 ++ */
16714 ++ skb_shinfo(subskb)->gso_segs = 0;
16715 ++
16716 ++ TCP_SKB_CB(skb)->path_mask |= mptcp_pi_to_flag(tp->mptcp->path_index);
16717 ++
16718 ++ if (!(sk->sk_route_caps & NETIF_F_ALL_CSUM) &&
16719 ++ skb->ip_summed == CHECKSUM_PARTIAL) {
16720 ++ subskb->csum = skb->csum = skb_checksum(skb, 0, skb->len, 0);
16721 ++ subskb->ip_summed = skb->ip_summed = CHECKSUM_NONE;
16722 ++ }
16723 ++
16724 ++ tcb = TCP_SKB_CB(subskb);
16725 ++
16726 ++ if (tp->mpcb->send_infinite_mapping &&
16727 ++ !tp->mpcb->infinite_mapping_snd &&
16728 ++ !before(tcb->seq, mptcp_meta_tp(tp)->snd_nxt)) {
16729 ++ tp->mptcp->fully_established = 1;
16730 ++ tp->mpcb->infinite_mapping_snd = 1;
16731 ++ tp->mptcp->infinite_cutoff_seq = tp->write_seq;
16732 ++ tcb->mptcp_flags |= MPTCPHDR_INF;
16733 ++ }
16734 ++
16735 ++ if (mptcp_is_data_fin(subskb))
16736 ++ mptcp_combine_dfin(subskb, meta_sk, sk);
16737 ++
16738 ++ mptcp_save_dss_data_seq(tp, subskb);
16739 ++
16740 ++ tcb->seq = tp->write_seq;
16741 ++ tcb->sacked = 0; /* reset the sacked field: from the point of view
16742 ++ * of this subflow, we are sending a brand new
16743 ++ * segment
16744 ++ */
16745 ++ /* Take into account seg len */
16746 ++ tp->write_seq += subskb->len + ((tcb->tcp_flags & TCPHDR_FIN) ? 1 : 0);
16747 ++ tcb->end_seq = tp->write_seq;
16748 ++
16749 ++ /* If it's a non-payload DATA_FIN (also no subflow-fin), the
16750 ++ * segment is not part of the subflow but on a meta-only-level.
16751 ++ */
16752 ++ if (!mptcp_is_data_fin(subskb) || tcb->end_seq != tcb->seq) {
16753 ++ tcp_add_write_queue_tail(sk, subskb);
16754 ++ sk->sk_wmem_queued += subskb->truesize;
16755 ++ sk_mem_charge(sk, subskb->truesize);
16756 ++ } else {
16757 ++ int err;
16758 ++
16759 ++ /* Necessary to initialize for tcp_transmit_skb. mss of 1, as
16760 ++ * skb->len = 0 will force tso_segs to 1.
16761 ++ */
16762 ++ tcp_init_tso_segs(sk, subskb, 1);
16763 ++		/* Empty data-fins are sent immediately on the subflow */
16764 ++ TCP_SKB_CB(subskb)->when = tcp_time_stamp;
16765 ++ err = tcp_transmit_skb(sk, subskb, 1, GFP_ATOMIC);
16766 ++
16767 ++ /* It has not been queued, we can free it now. */
16768 ++ kfree_skb(subskb);
16769 ++
16770 ++ if (err)
16771 ++ return false;
16772 ++ }
16773 ++
16774 ++ if (!tp->mptcp->fully_established) {
16775 ++ tp->mptcp->second_packet = 1;
16776 ++ tp->mptcp->last_end_data_seq = TCP_SKB_CB(skb)->end_seq;
16777 ++ }
16778 ++
16779 ++ return true;
16780 ++}
16781 ++
16782 ++/* Fragment an skb and update the mptcp meta-data. Due to reinject, we
16783 ++ * might need to undo some operations done by tcp_fragment.
16784 ++ */
16785 ++static int mptcp_fragment(struct sock *meta_sk, struct sk_buff *skb, u32 len,
16786 ++ gfp_t gfp, int reinject)
16787 ++{
16788 ++ int ret, diff, old_factor;
16789 ++ struct sk_buff *buff;
16790 ++ u8 flags;
16791 ++
16792 ++ if (skb_headlen(skb) < len)
16793 ++ diff = skb->len - len;
16794 ++ else
16795 ++ diff = skb->data_len;
16796 ++ old_factor = tcp_skb_pcount(skb);
16797 ++
16798 ++ /* The mss_now in tcp_fragment is used to set the tso_segs of the skb.
16799 ++ * At the MPTCP-level we do not care about the absolute value. All we
16800 ++ * care about is that it is set to 1 for accurate packets_out
16801 ++ * accounting.
16802 ++ */
16803 ++ ret = tcp_fragment(meta_sk, skb, len, UINT_MAX, gfp);
16804 ++ if (ret)
16805 ++ return ret;
16806 ++
16807 ++ buff = skb->next;
16808 ++
16809 ++ flags = TCP_SKB_CB(skb)->mptcp_flags;
16810 ++ TCP_SKB_CB(skb)->mptcp_flags = flags & ~(MPTCPHDR_FIN);
16811 ++ TCP_SKB_CB(buff)->mptcp_flags = flags;
16812 ++ TCP_SKB_CB(buff)->path_mask = TCP_SKB_CB(skb)->path_mask;
16813 ++
16814 ++ /* If reinject == 1, the buff will be added to the reinject
16815 ++ * queue, which is currently not part of memory accounting. So
16816 ++ * undo the changes done by tcp_fragment and update the
16817 ++ * reinject queue. Also, undo changes to the packet counters.
16818 ++ */
16819 ++ if (reinject == 1) {
16820 ++ int undo = buff->truesize - diff;
16821 ++ meta_sk->sk_wmem_queued -= undo;
16822 ++ sk_mem_uncharge(meta_sk, undo);
16823 ++
16824 ++ tcp_sk(meta_sk)->mpcb->reinject_queue.qlen++;
16825 ++ meta_sk->sk_write_queue.qlen--;
16826 ++
16827 ++ if (!before(tcp_sk(meta_sk)->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
16828 ++ undo = old_factor - tcp_skb_pcount(skb) -
16829 ++ tcp_skb_pcount(buff);
16830 ++ if (undo)
16831 ++ tcp_adjust_pcount(meta_sk, skb, -undo);
16832 ++ }
16833 ++ }
16834 ++
16835 ++ return 0;
16836 ++}
16837 ++
16838 ++/* Inspired by tcp_write_wakeup */
16839 ++int mptcp_write_wakeup(struct sock *meta_sk)
16840 ++{
16841 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
16842 ++ struct sk_buff *skb;
16843 ++ struct sock *sk_it;
16844 ++ int ans = 0;
16845 ++
16846 ++ if (meta_sk->sk_state == TCP_CLOSE)
16847 ++ return -1;
16848 ++
16849 ++ skb = tcp_send_head(meta_sk);
16850 ++ if (skb &&
16851 ++ before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(meta_tp))) {
16852 ++ unsigned int mss;
16853 ++ unsigned int seg_size = tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq;
16854 ++ struct sock *subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, true);
16855 ++ struct tcp_sock *subtp;
16856 ++ if (!subsk)
16857 ++ goto window_probe;
16858 ++ subtp = tcp_sk(subsk);
16859 ++ mss = tcp_current_mss(subsk);
16860 ++
16861 ++ seg_size = min(tcp_wnd_end(meta_tp) - TCP_SKB_CB(skb)->seq,
16862 ++ tcp_wnd_end(subtp) - subtp->write_seq);
16863 ++
16864 ++ if (before(meta_tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
16865 ++ meta_tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
16866 ++
16867 ++		/* We are probing the opening of a window,
16868 ++		 * but the window size is != 0;
16869 ++		 * it must have been a result of SWS avoidance (sender side).
16870 ++ */
16871 ++ if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
16872 ++ skb->len > mss) {
16873 ++ seg_size = min(seg_size, mss);
16874 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16875 ++ if (mptcp_fragment(meta_sk, skb, seg_size,
16876 ++ GFP_ATOMIC, 0))
16877 ++ return -1;
16878 ++ } else if (!tcp_skb_pcount(skb)) {
16879 ++ /* see mptcp_write_xmit on why we use UINT_MAX */
16880 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16881 ++ }
16882 ++
16883 ++ TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
16884 ++ if (!mptcp_skb_entail(subsk, skb, 0))
16885 ++ return -1;
16886 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
16887 ++
16888 ++ mptcp_check_sndseq_wrap(meta_tp, TCP_SKB_CB(skb)->end_seq -
16889 ++ TCP_SKB_CB(skb)->seq);
16890 ++ tcp_event_new_data_sent(meta_sk, skb);
16891 ++
16892 ++ __tcp_push_pending_frames(subsk, mss, TCP_NAGLE_PUSH);
16893 ++
16894 ++ return 0;
16895 ++ } else {
16896 ++window_probe:
16897 ++ if (between(meta_tp->snd_up, meta_tp->snd_una + 1,
16898 ++ meta_tp->snd_una + 0xFFFF)) {
16899 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16900 ++ if (mptcp_sk_can_send_ack(sk_it))
16901 ++ tcp_xmit_probe_skb(sk_it, 1);
16902 ++ }
16903 ++ }
16904 ++
16905 ++ /* At least one of the tcp_xmit_probe_skb's has to succeed */
16906 ++ mptcp_for_each_sk(meta_tp->mpcb, sk_it) {
16907 ++ int ret;
16908 ++
16909 ++ if (!mptcp_sk_can_send_ack(sk_it))
16910 ++ continue;
16911 ++
16912 ++ ret = tcp_xmit_probe_skb(sk_it, 0);
16913 ++ if (unlikely(ret > 0))
16914 ++ ans = ret;
16915 ++ }
16916 ++ return ans;
16917 ++ }
16918 ++}
16919 ++
16920 ++bool mptcp_write_xmit(struct sock *meta_sk, unsigned int mss_now, int nonagle,
16921 ++ int push_one, gfp_t gfp)
16922 ++{
16923 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk), *subtp;
16924 ++ struct sock *subsk = NULL;
16925 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
16926 ++ struct sk_buff *skb;
16927 ++ unsigned int sent_pkts;
16928 ++ int reinject = 0;
16929 ++ unsigned int sublimit;
16930 ++
16931 ++ sent_pkts = 0;
16932 ++
16933 ++ while ((skb = mpcb->sched_ops->next_segment(meta_sk, &reinject, &subsk,
16934 ++ &sublimit))) {
16935 ++ unsigned int limit;
16936 ++
16937 ++ subtp = tcp_sk(subsk);
16938 ++ mss_now = tcp_current_mss(subsk);
16939 ++
16940 ++ if (reinject == 1) {
16941 ++ if (!after(TCP_SKB_CB(skb)->end_seq, meta_tp->snd_una)) {
16942 ++ /* Segment already reached the peer, take the next one */
16943 ++ __skb_unlink(skb, &mpcb->reinject_queue);
16944 ++ __kfree_skb(skb);
16945 ++ continue;
16946 ++ }
16947 ++ }
16948 ++
16949 ++ /* If the segment was cloned (e.g. a meta retransmission),
16950 ++ * the header must be expanded/copied so that there is no
16951 ++ * corruption of TSO information.
16952 ++ */
16953 ++ if (skb_unclone(skb, GFP_ATOMIC))
16954 ++ break;
16955 ++
16956 ++ if (unlikely(!tcp_snd_wnd_test(meta_tp, skb, mss_now)))
16957 ++ break;
16958 ++
16959 ++ /* Force tso_segs to 1 by using UINT_MAX.
16960 ++ * We actually don't care about the exact number of segments
16961 ++ * emitted on the subflow. We need just to set tso_segs, because
16962 ++ * we still need an accurate packets_out count in
16963 ++ * tcp_event_new_data_sent.
16964 ++ */
16965 ++ tcp_set_skb_tso_segs(meta_sk, skb, UINT_MAX);
16966 ++
16967 ++		/* Check for nagle, regardless of tso_segs. If the segment is
16968 ++ * actually larger than mss_now (TSO segment), then
16969 ++ * tcp_nagle_check will have partial == false and always trigger
16970 ++ * the transmission.
16971 ++ * tcp_write_xmit has a TSO-level nagle check which is not
16972 ++ * subject to the MPTCP-level. It is based on the properties of
16973 ++ * the subflow, not the MPTCP-level.
16974 ++ */
16975 ++ if (unlikely(!tcp_nagle_test(meta_tp, skb, mss_now,
16976 ++ (tcp_skb_is_last(meta_sk, skb) ?
16977 ++ nonagle : TCP_NAGLE_PUSH))))
16978 ++ break;
16979 ++
16980 ++ limit = mss_now;
16981 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
16982 ++ * tcp_write_xmit. Otherwise split-point would return 0.
16983 ++ */
16984 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
16985 ++ /* We limit the size of the skb so that it fits into the
16986 ++ * window. Call tcp_mss_split_point to avoid duplicating
16987 ++ * code.
16988 ++ * We really only care about fitting the skb into the
16989 ++ * window. That's why we use UINT_MAX. If the skb does
16990 ++ * not fit into the cwnd_quota or the NIC's max-segs
16991 ++ * limitation, it will be split by the subflow's
16992 ++ * tcp_write_xmit which does the appropriate call to
16993 ++ * tcp_mss_split_point.
16994 ++ */
16995 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
16996 ++ UINT_MAX / mss_now,
16997 ++ nonagle);
16998 ++
16999 ++ if (sublimit)
17000 ++ limit = min(limit, sublimit);
17001 ++
17002 ++ if (skb->len > limit &&
17003 ++ unlikely(mptcp_fragment(meta_sk, skb, limit, gfp, reinject)))
17004 ++ break;
17005 ++
17006 ++ if (!mptcp_skb_entail(subsk, skb, reinject))
17007 ++ break;
17008 ++ /* Nagle is handled at the MPTCP-layer, so
17009 ++ * always push on the subflow
17010 ++ */
17011 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17012 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17013 ++
17014 ++ if (!reinject) {
17015 ++ mptcp_check_sndseq_wrap(meta_tp,
17016 ++ TCP_SKB_CB(skb)->end_seq -
17017 ++ TCP_SKB_CB(skb)->seq);
17018 ++ tcp_event_new_data_sent(meta_sk, skb);
17019 ++ }
17020 ++
17021 ++ tcp_minshall_update(meta_tp, mss_now, skb);
17022 ++ sent_pkts += tcp_skb_pcount(skb);
17023 ++
17024 ++ if (reinject > 0) {
17025 ++ __skb_unlink(skb, &mpcb->reinject_queue);
17026 ++ kfree_skb(skb);
17027 ++ }
17028 ++
17029 ++ if (push_one)
17030 ++ break;
17031 ++ }
17032 ++
17033 ++ return !meta_tp->packets_out && tcp_send_head(meta_sk);
17034 ++}
17035 ++
17036 ++void mptcp_write_space(struct sock *sk)
17037 ++{
17038 ++ mptcp_push_pending_frames(mptcp_meta_sk(sk));
17039 ++}
17040 ++
17041 ++u32 __mptcp_select_window(struct sock *sk)
17042 ++{
17043 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17044 ++ struct tcp_sock *tp = tcp_sk(sk), *meta_tp = mptcp_meta_tp(tp);
17045 ++ int mss, free_space, full_space, window;
17046 ++
17047 ++ /* MSS for the peer's data. Previous versions used mss_clamp
17048 ++ * here. I don't know if the value based on our guesses
17049 ++ * of peer's MSS is better for the performance. It's more correct
17050 ++ * but may be worse for the performance because of rcv_mss
17051 ++ * fluctuations. --SAW 1998/11/1
17052 ++ */
17053 ++ mss = icsk->icsk_ack.rcv_mss;
17054 ++ free_space = tcp_space(sk);
17055 ++ full_space = min_t(int, meta_tp->window_clamp,
17056 ++ tcp_full_space(sk));
17057 ++
17058 ++ if (mss > full_space)
17059 ++ mss = full_space;
17060 ++
17061 ++ if (free_space < (full_space >> 1)) {
17062 ++ icsk->icsk_ack.quick = 0;
17063 ++
17064 ++ if (tcp_memory_pressure)
17065 ++ /* TODO this has to be adapted when we support different
17066 ++ * MSS's among the subflows.
17067 ++ */
17068 ++ meta_tp->rcv_ssthresh = min(meta_tp->rcv_ssthresh,
17069 ++ 4U * meta_tp->advmss);
17070 ++
17071 ++ if (free_space < mss)
17072 ++ return 0;
17073 ++ }
17074 ++
17075 ++ if (free_space > meta_tp->rcv_ssthresh)
17076 ++ free_space = meta_tp->rcv_ssthresh;
17077 ++
17078 ++ /* Don't do rounding if we are using window scaling, since the
17079 ++ * scaled window will not line up with the MSS boundary anyway.
17080 ++ */
17081 ++ window = meta_tp->rcv_wnd;
17082 ++ if (tp->rx_opt.rcv_wscale) {
17083 ++ window = free_space;
17084 ++
17085 ++ /* Advertise enough space so that it won't get scaled away.
17086 ++		 * Important case: prevent zero window announcement if
17087 ++ * 1<<rcv_wscale > mss.
17088 ++ */
17089 ++ if (((window >> tp->rx_opt.rcv_wscale) << tp->
17090 ++ rx_opt.rcv_wscale) != window)
17091 ++ window = (((window >> tp->rx_opt.rcv_wscale) + 1)
17092 ++ << tp->rx_opt.rcv_wscale);
17093 ++ } else {
17094 ++ /* Get the largest window that is a nice multiple of mss.
17095 ++ * Window clamp already applied above.
17096 ++ * If our current window offering is within 1 mss of the
17097 ++ * free space we just keep it. This prevents the divide
17098 ++ * and multiply from happening most of the time.
17099 ++ * We also don't do any window rounding when the free space
17100 ++ * is too small.
17101 ++ */
17102 ++ if (window <= free_space - mss || window > free_space)
17103 ++ window = (free_space / mss) * mss;
17104 ++ else if (mss == full_space &&
17105 ++ free_space > window + (full_space >> 1))
17106 ++ window = free_space;
17107 ++ }
17108 ++
17109 ++ return window;
17110 ++}
17111 ++
17112 ++void mptcp_syn_options(const struct sock *sk, struct tcp_out_options *opts,
17113 ++ unsigned *remaining)
17114 ++{
17115 ++ const struct tcp_sock *tp = tcp_sk(sk);
17116 ++
17117 ++ opts->options |= OPTION_MPTCP;
17118 ++ if (is_master_tp(tp)) {
17119 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYN;
17120 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17121 ++ opts->mp_capable.sender_key = tp->mptcp_loc_key;
17122 ++ opts->dss_csum = !!sysctl_mptcp_checksum;
17123 ++ } else {
17124 ++ const struct mptcp_cb *mpcb = tp->mpcb;
17125 ++
17126 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYN;
17127 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYN_ALIGN;
17128 ++ opts->mp_join_syns.token = mpcb->mptcp_rem_token;
17129 ++ opts->mp_join_syns.low_prio = tp->mptcp->low_prio;
17130 ++ opts->addr_id = tp->mptcp->loc_id;
17131 ++ opts->mp_join_syns.sender_nonce = tp->mptcp->mptcp_loc_nonce;
17132 ++ }
17133 ++}
17134 ++
17135 ++void mptcp_synack_options(struct request_sock *req,
17136 ++ struct tcp_out_options *opts, unsigned *remaining)
17137 ++{
17138 ++ struct mptcp_request_sock *mtreq;
17139 ++ mtreq = mptcp_rsk(req);
17140 ++
17141 ++ opts->options |= OPTION_MPTCP;
17142 ++ /* MPCB not yet set - thus it's a new MPTCP-session */
17143 ++ if (!mtreq->is_sub) {
17144 ++ opts->mptcp_options |= OPTION_MP_CAPABLE | OPTION_TYPE_SYNACK;
17145 ++ opts->mp_capable.sender_key = mtreq->mptcp_loc_key;
17146 ++ opts->dss_csum = !!sysctl_mptcp_checksum || mtreq->dss_csum;
17147 ++ *remaining -= MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN;
17148 ++ } else {
17149 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_SYNACK;
17150 ++ opts->mp_join_syns.sender_truncated_mac =
17151 ++ mtreq->mptcp_hash_tmac;
17152 ++ opts->mp_join_syns.sender_nonce = mtreq->mptcp_loc_nonce;
17153 ++ opts->mp_join_syns.low_prio = mtreq->low_prio;
17154 ++ opts->addr_id = mtreq->loc_id;
17155 ++ *remaining -= MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN;
17156 ++ }
17157 ++}
17158 ++
17159 ++void mptcp_established_options(struct sock *sk, struct sk_buff *skb,
17160 ++ struct tcp_out_options *opts, unsigned *size)
17161 ++{
17162 ++ struct tcp_sock *tp = tcp_sk(sk);
17163 ++ struct mptcp_cb *mpcb = tp->mpcb;
17164 ++ const struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
17165 ++
17166 ++ /* We are coming from tcp_current_mss with the meta_sk as an argument.
17167 ++ * It does not make sense to check for the options, because when the
17168 ++ * segment gets sent, another subflow will be chosen.
17169 ++ */
17170 ++ if (!skb && is_meta_sk(sk))
17171 ++ return;
17172 ++
17173 ++ /* In fallback mp_fail-mode, we have to repeat it until the fallback
17174 ++ * has been done by the sender
17175 ++ */
17176 ++ if (unlikely(tp->mptcp->send_mp_fail)) {
17177 ++ opts->options |= OPTION_MPTCP;
17178 ++ opts->mptcp_options |= OPTION_MP_FAIL;
17179 ++ *size += MPTCP_SUB_LEN_FAIL;
17180 ++ return;
17181 ++ }
17182 ++
17183 ++ if (unlikely(tp->send_mp_fclose)) {
17184 ++ opts->options |= OPTION_MPTCP;
17185 ++ opts->mptcp_options |= OPTION_MP_FCLOSE;
17186 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17187 ++ *size += MPTCP_SUB_LEN_FCLOSE_ALIGN;
17188 ++ return;
17189 ++ }
17190 ++
17191 ++ /* 1. If we are the sender of the infinite-mapping, we need the
17192 ++ * MPTCPHDR_INF-flag, because a retransmission of the
17193 ++	 * infinite-announcement still needs the mptcp-option.
17194 ++ *
17195 ++ * We need infinite_cutoff_seq, because retransmissions from before
17196 ++ * the infinite-cutoff-moment still need the MPTCP-signalling to stay
17197 ++ * consistent.
17198 ++ *
17199 ++ * 2. If we are the receiver of the infinite-mapping, we always skip
17200 ++ * mptcp-options, because acknowledgments from before the
17201 ++ * infinite-mapping point have already been sent out.
17202 ++ *
17203 ++ * I know, the whole infinite-mapping stuff is ugly...
17204 ++ *
17205 ++ * TODO: Handle wrapped data-sequence numbers
17206 ++ * (even if it's very unlikely)
17207 ++ */
17208 ++ if (unlikely(mpcb->infinite_mapping_snd) &&
17209 ++ ((mpcb->send_infinite_mapping && tcb &&
17210 ++ mptcp_is_data_seq(skb) &&
17211 ++ !(tcb->mptcp_flags & MPTCPHDR_INF) &&
17212 ++ !before(tcb->seq, tp->mptcp->infinite_cutoff_seq)) ||
17213 ++ !mpcb->send_infinite_mapping))
17214 ++ return;
17215 ++
17216 ++ if (unlikely(tp->mptcp->include_mpc)) {
17217 ++ opts->options |= OPTION_MPTCP;
17218 ++ opts->mptcp_options |= OPTION_MP_CAPABLE |
17219 ++ OPTION_TYPE_ACK;
17220 ++ *size += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN;
17221 ++ opts->mp_capable.sender_key = mpcb->mptcp_loc_key;
17222 ++ opts->mp_capable.receiver_key = mpcb->mptcp_rem_key;
17223 ++ opts->dss_csum = mpcb->dss_csum;
17224 ++
17225 ++ if (skb)
17226 ++ tp->mptcp->include_mpc = 0;
17227 ++ }
17228 ++ if (unlikely(tp->mptcp->pre_established)) {
17229 ++ opts->options |= OPTION_MPTCP;
17230 ++ opts->mptcp_options |= OPTION_MP_JOIN | OPTION_TYPE_ACK;
17231 ++ *size += MPTCP_SUB_LEN_JOIN_ACK_ALIGN;
17232 ++ }
17233 ++
17234 ++ if (!tp->mptcp->include_mpc && !tp->mptcp->pre_established) {
17235 ++ opts->options |= OPTION_MPTCP;
17236 ++ opts->mptcp_options |= OPTION_DATA_ACK;
17237 ++ /* If !skb, we come from tcp_current_mss and thus we always
17238 ++ * assume that the DSS-option will be set for the data-packet.
17239 ++ */
17240 ++ if (skb && !mptcp_is_data_seq(skb)) {
17241 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN;
17242 ++ } else {
17243 ++			/* It doesn't matter whether the csum is included or not. It will be
17244 ++ * either 10 or 12, and thus aligned = 12
17245 ++ */
17246 ++ *size += MPTCP_SUB_LEN_ACK_ALIGN +
17247 ++ MPTCP_SUB_LEN_SEQ_ALIGN;
17248 ++ }
17249 ++
17250 ++ *size += MPTCP_SUB_LEN_DSS_ALIGN;
17251 ++ }
17252 ++
17253 ++ if (unlikely(mpcb->addr_signal) && mpcb->pm_ops->addr_signal)
17254 ++ mpcb->pm_ops->addr_signal(sk, size, opts, skb);
17255 ++
17256 ++ if (unlikely(tp->mptcp->send_mp_prio) &&
17257 ++ MAX_TCP_OPTION_SPACE - *size >= MPTCP_SUB_LEN_PRIO_ALIGN) {
17258 ++ opts->options |= OPTION_MPTCP;
17259 ++ opts->mptcp_options |= OPTION_MP_PRIO;
17260 ++ if (skb)
17261 ++ tp->mptcp->send_mp_prio = 0;
17262 ++ *size += MPTCP_SUB_LEN_PRIO_ALIGN;
17263 ++ }
17264 ++
17265 ++ return;
17266 ++}
17267 ++
17268 ++u16 mptcp_select_window(struct sock *sk)
17269 ++{
17270 ++ u16 new_win = tcp_select_window(sk);
17271 ++ struct tcp_sock *tp = tcp_sk(sk);
17272 ++ struct tcp_sock *meta_tp = mptcp_meta_tp(tp);
17273 ++
17274 ++ meta_tp->rcv_wnd = tp->rcv_wnd;
17275 ++ meta_tp->rcv_wup = meta_tp->rcv_nxt;
17276 ++
17277 ++ return new_win;
17278 ++}
17279 ++
17280 ++void mptcp_options_write(__be32 *ptr, struct tcp_sock *tp,
17281 ++ const struct tcp_out_options *opts,
17282 ++ struct sk_buff *skb)
17283 ++{
17284 ++ if (unlikely(OPTION_MP_CAPABLE & opts->mptcp_options)) {
17285 ++ struct mp_capable *mpc = (struct mp_capable *)ptr;
17286 ++
17287 ++ mpc->kind = TCPOPT_MPTCP;
17288 ++
17289 ++ if ((OPTION_TYPE_SYN & opts->mptcp_options) ||
17290 ++ (OPTION_TYPE_SYNACK & opts->mptcp_options)) {
17291 ++ mpc->sender_key = opts->mp_capable.sender_key;
17292 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_SYN;
17293 ++ ptr += MPTCP_SUB_LEN_CAPABLE_SYN_ALIGN >> 2;
17294 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17295 ++ mpc->sender_key = opts->mp_capable.sender_key;
17296 ++ mpc->receiver_key = opts->mp_capable.receiver_key;
17297 ++ mpc->len = MPTCP_SUB_LEN_CAPABLE_ACK;
17298 ++ ptr += MPTCP_SUB_LEN_CAPABLE_ACK_ALIGN >> 2;
17299 ++ }
17300 ++
17301 ++ mpc->sub = MPTCP_SUB_CAPABLE;
17302 ++ mpc->ver = 0;
17303 ++ mpc->a = opts->dss_csum;
17304 ++ mpc->b = 0;
17305 ++ mpc->rsv = 0;
17306 ++ mpc->h = 1;
17307 ++ }
17308 ++
17309 ++ if (unlikely(OPTION_MP_JOIN & opts->mptcp_options)) {
17310 ++ struct mp_join *mpj = (struct mp_join *)ptr;
17311 ++
17312 ++ mpj->kind = TCPOPT_MPTCP;
17313 ++ mpj->sub = MPTCP_SUB_JOIN;
17314 ++ mpj->rsv = 0;
17315 ++
17316 ++ if (OPTION_TYPE_SYN & opts->mptcp_options) {
17317 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYN;
17318 ++ mpj->u.syn.token = opts->mp_join_syns.token;
17319 ++ mpj->u.syn.nonce = opts->mp_join_syns.sender_nonce;
17320 ++ mpj->b = opts->mp_join_syns.low_prio;
17321 ++ mpj->addr_id = opts->addr_id;
17322 ++ ptr += MPTCP_SUB_LEN_JOIN_SYN_ALIGN >> 2;
17323 ++ } else if (OPTION_TYPE_SYNACK & opts->mptcp_options) {
17324 ++ mpj->len = MPTCP_SUB_LEN_JOIN_SYNACK;
17325 ++ mpj->u.synack.mac =
17326 ++ opts->mp_join_syns.sender_truncated_mac;
17327 ++ mpj->u.synack.nonce = opts->mp_join_syns.sender_nonce;
17328 ++ mpj->b = opts->mp_join_syns.low_prio;
17329 ++ mpj->addr_id = opts->addr_id;
17330 ++ ptr += MPTCP_SUB_LEN_JOIN_SYNACK_ALIGN >> 2;
17331 ++ } else if (OPTION_TYPE_ACK & opts->mptcp_options) {
17332 ++ mpj->len = MPTCP_SUB_LEN_JOIN_ACK;
17333 ++ mpj->addr_id = 0; /* addr_id is rsv (RFC 6824, p. 21) */
17334 ++ memcpy(mpj->u.ack.mac, &tp->mptcp->sender_mac[0], 20);
17335 ++ ptr += MPTCP_SUB_LEN_JOIN_ACK_ALIGN >> 2;
17336 ++ }
17337 ++ }
17338 ++ if (unlikely(OPTION_ADD_ADDR & opts->mptcp_options)) {
17339 ++ struct mp_add_addr *mpadd = (struct mp_add_addr *)ptr;
17340 ++
17341 ++ mpadd->kind = TCPOPT_MPTCP;
17342 ++ if (opts->add_addr_v4) {
17343 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR4;
17344 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17345 ++ mpadd->ipver = 4;
17346 ++ mpadd->addr_id = opts->add_addr4.addr_id;
17347 ++ mpadd->u.v4.addr = opts->add_addr4.addr;
17348 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR4_ALIGN >> 2;
17349 ++ } else if (opts->add_addr_v6) {
17350 ++ mpadd->len = MPTCP_SUB_LEN_ADD_ADDR6;
17351 ++ mpadd->sub = MPTCP_SUB_ADD_ADDR;
17352 ++ mpadd->ipver = 6;
17353 ++ mpadd->addr_id = opts->add_addr6.addr_id;
17354 ++ memcpy(&mpadd->u.v6.addr, &opts->add_addr6.addr,
17355 ++ sizeof(mpadd->u.v6.addr));
17356 ++ ptr += MPTCP_SUB_LEN_ADD_ADDR6_ALIGN >> 2;
17357 ++ }
17358 ++ }
17359 ++ if (unlikely(OPTION_REMOVE_ADDR & opts->mptcp_options)) {
17360 ++ struct mp_remove_addr *mprem = (struct mp_remove_addr *)ptr;
17361 ++ u8 *addrs_id;
17362 ++ int id, len, len_align;
17363 ++
17364 ++ len = mptcp_sub_len_remove_addr(opts->remove_addrs);
17365 ++ len_align = mptcp_sub_len_remove_addr_align(opts->remove_addrs);
17366 ++
17367 ++ mprem->kind = TCPOPT_MPTCP;
17368 ++ mprem->len = len;
17369 ++ mprem->sub = MPTCP_SUB_REMOVE_ADDR;
17370 ++ mprem->rsv = 0;
17371 ++ addrs_id = &mprem->addrs_id;
17372 ++
17373 ++ mptcp_for_each_bit_set(opts->remove_addrs, id)
17374 ++ *(addrs_id++) = id;
17375 ++
17376 ++ /* Fill the rest with NOP's */
17377 ++ if (len_align > len) {
17378 ++ int i;
17379 ++ for (i = 0; i < len_align - len; i++)
17380 ++ *(addrs_id++) = TCPOPT_NOP;
17381 ++ }
17382 ++
17383 ++ ptr += len_align >> 2;
17384 ++ }
17385 ++ if (unlikely(OPTION_MP_FAIL & opts->mptcp_options)) {
17386 ++ struct mp_fail *mpfail = (struct mp_fail *)ptr;
17387 ++
17388 ++ mpfail->kind = TCPOPT_MPTCP;
17389 ++ mpfail->len = MPTCP_SUB_LEN_FAIL;
17390 ++ mpfail->sub = MPTCP_SUB_FAIL;
17391 ++ mpfail->rsv1 = 0;
17392 ++ mpfail->rsv2 = 0;
17393 ++ mpfail->data_seq = htonll(tp->mpcb->csum_cutoff_seq);
17394 ++
17395 ++ ptr += MPTCP_SUB_LEN_FAIL_ALIGN >> 2;
17396 ++ }
17397 ++ if (unlikely(OPTION_MP_FCLOSE & opts->mptcp_options)) {
17398 ++ struct mp_fclose *mpfclose = (struct mp_fclose *)ptr;
17399 ++
17400 ++ mpfclose->kind = TCPOPT_MPTCP;
17401 ++ mpfclose->len = MPTCP_SUB_LEN_FCLOSE;
17402 ++ mpfclose->sub = MPTCP_SUB_FCLOSE;
17403 ++ mpfclose->rsv1 = 0;
17404 ++ mpfclose->rsv2 = 0;
17405 ++ mpfclose->key = opts->mp_capable.receiver_key;
17406 ++
17407 ++ ptr += MPTCP_SUB_LEN_FCLOSE_ALIGN >> 2;
17408 ++ }
17409 ++
17410 ++ if (OPTION_DATA_ACK & opts->mptcp_options) {
17411 ++ if (!mptcp_is_data_seq(skb))
17412 ++ ptr += mptcp_write_dss_data_ack(tp, skb, ptr);
17413 ++ else
17414 ++ ptr += mptcp_write_dss_data_seq(tp, skb, ptr);
17415 ++ }
17416 ++ if (unlikely(OPTION_MP_PRIO & opts->mptcp_options)) {
17417 ++ struct mp_prio *mpprio = (struct mp_prio *)ptr;
17418 ++
17419 ++ mpprio->kind = TCPOPT_MPTCP;
17420 ++ mpprio->len = MPTCP_SUB_LEN_PRIO;
17421 ++ mpprio->sub = MPTCP_SUB_PRIO;
17422 ++ mpprio->rsv = 0;
17423 ++ mpprio->b = tp->mptcp->low_prio;
17424 ++ mpprio->addr_id = TCPOPT_NOP;
17425 ++
17426 ++ ptr += MPTCP_SUB_LEN_PRIO_ALIGN >> 2;
17427 ++ }
17428 ++}
17429 ++
17430 ++/* Sends the datafin */
17431 ++void mptcp_send_fin(struct sock *meta_sk)
17432 ++{
17433 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17434 ++ struct sk_buff *skb = tcp_write_queue_tail(meta_sk);
17435 ++ int mss_now;
17436 ++
17437 ++ if ((1 << meta_sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK))
17438 ++ meta_tp->mpcb->passive_close = 1;
17439 ++
17440 ++ /* Optimization, tack on the FIN if we have a queue of
17441 ++ * unsent frames. But be careful about outgoing SACKS
17442 ++ * and IP options.
17443 ++ */
17444 ++ mss_now = mptcp_current_mss(meta_sk);
17445 ++
17446 ++ if (tcp_send_head(meta_sk) != NULL) {
17447 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17448 ++ TCP_SKB_CB(skb)->end_seq++;
17449 ++ meta_tp->write_seq++;
17450 ++ } else {
17451 ++ /* Socket is locked, keep trying until memory is available. */
17452 ++ for (;;) {
17453 ++ skb = alloc_skb_fclone(MAX_TCP_HEADER,
17454 ++ meta_sk->sk_allocation);
17455 ++ if (skb)
17456 ++ break;
17457 ++ yield();
17458 ++ }
17459 ++ /* Reserve space for headers and prepare control bits. */
17460 ++ skb_reserve(skb, MAX_TCP_HEADER);
17461 ++
17462 ++ tcp_init_nondata_skb(skb, meta_tp->write_seq, TCPHDR_ACK);
17463 ++ TCP_SKB_CB(skb)->end_seq++;
17464 ++ TCP_SKB_CB(skb)->mptcp_flags |= MPTCPHDR_FIN;
17465 ++ tcp_queue_skb(meta_sk, skb);
17466 ++ }
17467 ++ __tcp_push_pending_frames(meta_sk, mss_now, TCP_NAGLE_OFF);
17468 ++}
17469 ++
17470 ++void mptcp_send_active_reset(struct sock *meta_sk, gfp_t priority)
17471 ++{
17472 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17473 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17474 ++ struct sock *sk = NULL, *sk_it = NULL, *tmpsk;
17475 ++
17476 ++ if (!mpcb->cnt_subflows)
17477 ++ return;
17478 ++
17479 ++ WARN_ON(meta_tp->send_mp_fclose);
17480 ++
17481 ++ /* First - select a socket */
17482 ++ sk = mptcp_select_ack_sock(meta_sk);
17483 ++
17484 ++ /* May happen if no subflow is in an appropriate state */
17485 ++ if (!sk)
17486 ++ return;
17487 ++
17488 ++ /* We are in infinite mode - just send a reset */
17489 ++ if (mpcb->infinite_mapping_snd || mpcb->infinite_mapping_rcv) {
17490 ++ sk->sk_err = ECONNRESET;
17491 ++ if (tcp_need_reset(sk->sk_state))
17492 ++ tcp_send_active_reset(sk, priority);
17493 ++ mptcp_sub_force_close(sk);
17494 ++ return;
17495 ++ }
17496 ++
17497 ++
17498 ++ tcp_sk(sk)->send_mp_fclose = 1;
17499 ++	/* Reset all other subflows */
17500 ++
17501 ++ /* tcp_done must be handled with bh disabled */
17502 ++ if (!in_serving_softirq())
17503 ++ local_bh_disable();
17504 ++
17505 ++ mptcp_for_each_sk_safe(mpcb, sk_it, tmpsk) {
17506 ++ if (tcp_sk(sk_it)->send_mp_fclose)
17507 ++ continue;
17508 ++
17509 ++ sk_it->sk_err = ECONNRESET;
17510 ++ if (tcp_need_reset(sk_it->sk_state))
17511 ++ tcp_send_active_reset(sk_it, GFP_ATOMIC);
17512 ++ mptcp_sub_force_close(sk_it);
17513 ++ }
17514 ++
17515 ++ if (!in_serving_softirq())
17516 ++ local_bh_enable();
17517 ++
17518 ++ tcp_send_ack(sk);
17519 ++ inet_csk_reset_keepalive_timer(sk, inet_csk(sk)->icsk_rto);
17520 ++
17521 ++ meta_tp->send_mp_fclose = 1;
17522 ++}
17523 ++
17524 ++static void mptcp_ack_retransmit_timer(struct sock *sk)
17525 ++{
17526 ++ struct sk_buff *skb;
17527 ++ struct tcp_sock *tp = tcp_sk(sk);
17528 ++ struct inet_connection_sock *icsk = inet_csk(sk);
17529 ++
17530 ++ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
17531 ++ goto out; /* Routing failure or similar */
17532 ++
17533 ++ if (!tp->retrans_stamp)
17534 ++ tp->retrans_stamp = tcp_time_stamp ? : 1;
17535 ++
17536 ++ if (tcp_write_timeout(sk)) {
17537 ++ tp->mptcp->pre_established = 0;
17538 ++ sk_stop_timer(sk, &tp->mptcp->mptcp_ack_timer);
17539 ++ tp->ops->send_active_reset(sk, GFP_ATOMIC);
17540 ++ goto out;
17541 ++ }
17542 ++
17543 ++ skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
17544 ++ if (skb == NULL) {
17545 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17546 ++ jiffies + icsk->icsk_rto);
17547 ++ return;
17548 ++ }
17549 ++
17550 ++ /* Reserve space for headers and prepare control bits */
17551 ++ skb_reserve(skb, MAX_TCP_HEADER);
17552 ++ tcp_init_nondata_skb(skb, tp->snd_una, TCPHDR_ACK);
17553 ++
17554 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17555 ++ if (tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC) > 0) {
17556 ++ /* Retransmission failed because of local congestion,
17557 ++ * do not backoff.
17558 ++ */
17559 ++ if (!icsk->icsk_retransmits)
17560 ++ icsk->icsk_retransmits = 1;
17561 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17562 ++ jiffies + icsk->icsk_rto);
17563 ++ return;
17564 ++ }
17565 ++
17566 ++
17567 ++ icsk->icsk_retransmits++;
17568 ++ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
17569 ++ sk_reset_timer(sk, &tp->mptcp->mptcp_ack_timer,
17570 ++ jiffies + icsk->icsk_rto);
17571 ++ if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
17572 ++ __sk_dst_reset(sk);
17573 ++
17574 ++out:;
17575 ++}
17576 ++
17577 ++void mptcp_ack_handler(unsigned long data)
17578 ++{
17579 ++ struct sock *sk = (struct sock *)data;
17580 ++ struct sock *meta_sk = mptcp_meta_sk(sk);
17581 ++
17582 ++ bh_lock_sock(meta_sk);
17583 ++ if (sock_owned_by_user(meta_sk)) {
17584 ++ /* Try again later */
17585 ++ sk_reset_timer(sk, &tcp_sk(sk)->mptcp->mptcp_ack_timer,
17586 ++ jiffies + (HZ / 20));
17587 ++ goto out_unlock;
17588 ++ }
17589 ++
17590 ++ if (sk->sk_state == TCP_CLOSE)
17591 ++ goto out_unlock;
17592 ++ if (!tcp_sk(sk)->mptcp->pre_established)
17593 ++ goto out_unlock;
17594 ++
17595 ++ mptcp_ack_retransmit_timer(sk);
17596 ++
17597 ++ sk_mem_reclaim(sk);
17598 ++
17599 ++out_unlock:
17600 ++ bh_unlock_sock(meta_sk);
17601 ++ sock_put(sk);
17602 ++}
17603 ++
17604 ++/* Similar to tcp_retransmit_skb
17605 ++ *
17606 ++ * The diff is that we handle the retransmission-stats (retrans_stamp) at the
17607 ++ * meta-level.
17608 ++ */
17609 ++int mptcp_retransmit_skb(struct sock *meta_sk, struct sk_buff *skb)
17610 ++{
17611 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17612 ++ struct sock *subsk;
17613 ++ unsigned int limit, mss_now;
17614 ++ int err = -1;
17615 ++
17616 ++	/* Do not send more than we queued. 1/4 is reserved for possible
17617 ++ * copying overhead: fragmentation, tunneling, mangling etc.
17618 ++ *
17619 ++ * This is a meta-retransmission thus we check on the meta-socket.
17620 ++ */
17621 ++ if (atomic_read(&meta_sk->sk_wmem_alloc) >
17622 ++ min(meta_sk->sk_wmem_queued + (meta_sk->sk_wmem_queued >> 2), meta_sk->sk_sndbuf)) {
17623 ++ return -EAGAIN;
17624 ++ }
17625 ++
17626 ++ /* We need to make sure that the retransmitted segment can be sent on a
17627 ++ * subflow right now. If it is too big, it needs to be fragmented.
17628 ++ */
17629 ++ subsk = meta_tp->mpcb->sched_ops->get_subflow(meta_sk, skb, false);
17630 ++ if (!subsk) {
17631 ++ /* We want to increase icsk_retransmits, thus return 0, so that
17632 ++ * mptcp_retransmit_timer enters the desired branch.
17633 ++ */
17634 ++ err = 0;
17635 ++ goto failed;
17636 ++ }
17637 ++ mss_now = tcp_current_mss(subsk);
17638 ++
17639 ++ /* If the segment was cloned (e.g. a meta retransmission), the header
17640 ++ * must be expanded/copied so that there is no corruption of TSO
17641 ++ * information.
17642 ++ */
17643 ++ if (skb_unclone(skb, GFP_ATOMIC)) {
17644 ++ err = -ENOMEM;
17645 ++ goto failed;
17646 ++ }
17647 ++
17648 ++ /* Must have been set by mptcp_write_xmit before */
17649 ++ BUG_ON(!tcp_skb_pcount(skb));
17650 ++
17651 ++ limit = mss_now;
17652 ++ /* skb->len > mss_now is the equivalent of tso_segs > 1 in
17653 ++ * tcp_write_xmit. Otherwise split-point would return 0.
17654 ++ */
17655 ++ if (skb->len > mss_now && !tcp_urg_mode(meta_tp))
17656 ++ limit = tcp_mss_split_point(meta_sk, skb, mss_now,
17657 ++ UINT_MAX / mss_now,
17658 ++ TCP_NAGLE_OFF);
17659 ++
17660 ++ if (skb->len > limit &&
17661 ++ unlikely(mptcp_fragment(meta_sk, skb, limit,
17662 ++ GFP_ATOMIC, 0)))
17663 ++ goto failed;
17664 ++
17665 ++ if (!mptcp_skb_entail(subsk, skb, -1))
17666 ++ goto failed;
17667 ++ TCP_SKB_CB(skb)->when = tcp_time_stamp;
17668 ++
17669 ++ /* Update global TCP statistics. */
17670 ++ TCP_INC_STATS(sock_net(meta_sk), TCP_MIB_RETRANSSEGS);
17671 ++
17672 ++ /* Diff to tcp_retransmit_skb */
17673 ++
17674 ++ /* Save stamp of the first retransmit. */
17675 ++ if (!meta_tp->retrans_stamp)
17676 ++ meta_tp->retrans_stamp = TCP_SKB_CB(skb)->when;
17677 ++
17678 ++ __tcp_push_pending_frames(subsk, mss_now, TCP_NAGLE_PUSH);
17679 ++
17680 ++ return 0;
17681 ++
17682 ++failed:
17683 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPRETRANSFAIL);
17684 ++ return err;
17685 ++}
17686 ++
17687 ++/* Similar to tcp_retransmit_timer
17688 ++ *
17689 ++ * The diff is that we have to handle retransmissions of the FAST_CLOSE-message
17690 ++ * and that we don't have an srtt estimation at the meta-level.
17691 ++ */
17692 ++void mptcp_retransmit_timer(struct sock *meta_sk)
17693 ++{
17694 ++ struct tcp_sock *meta_tp = tcp_sk(meta_sk);
17695 ++ struct mptcp_cb *mpcb = meta_tp->mpcb;
17696 ++ struct inet_connection_sock *meta_icsk = inet_csk(meta_sk);
17697 ++ int err;
17698 ++
17699 ++ /* In fallback, retransmission is handled at the subflow-level */
17700 ++ if (!meta_tp->packets_out || mpcb->infinite_mapping_snd ||
17701 ++ mpcb->send_infinite_mapping)
17702 ++ return;
17703 ++
17704 ++ WARN_ON(tcp_write_queue_empty(meta_sk));
17705 ++
17706 ++ if (!meta_tp->snd_wnd && !sock_flag(meta_sk, SOCK_DEAD) &&
17707 ++ !((1 << meta_sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
17708 ++ /* Receiver dastardly shrinks window. Our retransmits
17709 ++ * become zero probes, but we should not timeout this
17710 ++ * connection. If the socket is an orphan, time it out,
17711 ++ * we cannot allow such beasts to hang infinitely.
17712 ++ */
17713 ++ struct inet_sock *meta_inet = inet_sk(meta_sk);
17714 ++ if (meta_sk->sk_family == AF_INET) {
17715 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17716 ++ &meta_inet->inet_daddr,
17717 ++ ntohs(meta_inet->inet_dport),
17718 ++ meta_inet->inet_num, meta_tp->snd_una,
17719 ++ meta_tp->snd_nxt);
17720 ++ }
17721 ++#if IS_ENABLED(CONFIG_IPV6)
17722 ++ else if (meta_sk->sk_family == AF_INET6) {
17723 ++ LIMIT_NETDEBUG(KERN_DEBUG "MPTCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
17724 ++ &meta_sk->sk_v6_daddr,
17725 ++ ntohs(meta_inet->inet_dport),
17726 ++ meta_inet->inet_num, meta_tp->snd_una,
17727 ++ meta_tp->snd_nxt);
17728 ++ }
17729 ++#endif
17730 ++ if (tcp_time_stamp - meta_tp->rcv_tstamp > TCP_RTO_MAX) {
17731 ++ tcp_write_err(meta_sk);
17732 ++ return;
17733 ++ }
17734 ++
17735 ++ mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17736 ++ goto out_reset_timer;
17737 ++ }
17738 ++
17739 ++ if (tcp_write_timeout(meta_sk))
17740 ++ return;
17741 ++
17742 ++ if (meta_icsk->icsk_retransmits == 0)
17743 ++ NET_INC_STATS_BH(sock_net(meta_sk), LINUX_MIB_TCPTIMEOUTS);
17744 ++
17745 ++ meta_icsk->icsk_ca_state = TCP_CA_Loss;
17746 ++
17747 ++ err = mptcp_retransmit_skb(meta_sk, tcp_write_queue_head(meta_sk));
17748 ++ if (err > 0) {
17749 ++ /* Retransmission failed because of local congestion,
17750 ++ * do not backoff.
17751 ++ */
17752 ++ if (!meta_icsk->icsk_retransmits)
17753 ++ meta_icsk->icsk_retransmits = 1;
17754 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS,
17755 ++ min(meta_icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
17756 ++ TCP_RTO_MAX);
17757 ++ return;
17758 ++ }
17759 ++
17760 ++ /* Increase the timeout each time we retransmit. Note that
17761 ++ * we do not increase the rtt estimate. rto is initialized
17762 ++ * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
17763 ++ * that doubling rto each time is the least we can get away with.
17764 ++ * In KA9Q, Karn uses this for the first few times, and then
17765 ++ * goes to quadratic. netBSD doubles, but only goes up to *64,
17766 ++ * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
17767 ++ * defined in the protocol as the maximum possible RTT. I guess
17768 ++ * we'll have to use something other than TCP to talk to the
17769 ++ * University of Mars.
17770 ++ *
17771 ++ * PAWS allows us longer timeouts and large windows, so once
17772 ++ * implemented ftp to mars will work nicely. We will have to fix
17773 ++ * the 120 second clamps though!
17774 ++ */
17775 ++ meta_icsk->icsk_backoff++;
17776 ++ meta_icsk->icsk_retransmits++;
17777 ++
17778 ++out_reset_timer:
17779 ++ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
17780 ++ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
17781 ++ * might be increased if the stream oscillates between thin and thick,
17782 ++ * thus the old value might already be too high compared to the value
17783 ++ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
17784 ++ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
17785 ++	 * exponential backoff behaviour, to avoid continuing to hammer
17786 ++ * linear-timeout retransmissions into a black hole
17787 ++ */
17788 ++ if (meta_sk->sk_state == TCP_ESTABLISHED &&
17789 ++ (meta_tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
17790 ++ tcp_stream_is_thin(meta_tp) &&
17791 ++ meta_icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
17792 ++ meta_icsk->icsk_backoff = 0;
17793 ++ /* We cannot do the same as in tcp_write_timer because the
17794 ++ * srtt is not set here.
17795 ++ */
17796 ++ mptcp_set_rto(meta_sk);
17797 ++ } else {
17798 ++ /* Use normal (exponential) backoff */
17799 ++ meta_icsk->icsk_rto = min(meta_icsk->icsk_rto << 1, TCP_RTO_MAX);
17800 ++ }
17801 ++ inet_csk_reset_xmit_timer(meta_sk, ICSK_TIME_RETRANS, meta_icsk->icsk_rto, TCP_RTO_MAX);
17802 ++
17803 ++ return;
17804 ++}
17805 ++
17806 ++/* Adjust the values to the MPTCP level for the initial window of new subflows */
17807 ++void mptcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd,
17808 ++ __u32 *window_clamp, int wscale_ok,
17809 ++ __u8 *rcv_wscale, __u32 init_rcv_wnd,
17810 ++ const struct sock *sk)
17811 ++{
17812 ++ struct mptcp_cb *mpcb = tcp_sk(sk)->mpcb;
17813 ++
17814 ++ *window_clamp = mpcb->orig_window_clamp;
17815 ++ __space = tcp_win_from_space(mpcb->orig_sk_rcvbuf);
17816 ++
17817 ++ tcp_select_initial_window(__space, mss, rcv_wnd, window_clamp,
17818 ++ wscale_ok, rcv_wscale, init_rcv_wnd, sk);
17819 ++}
17820 ++
17821 ++static inline u64 mptcp_calc_rate(const struct sock *meta_sk, unsigned int mss,
17822 ++ unsigned int (*mss_cb)(struct sock *sk))
17823 ++{
17824 ++ struct sock *sk;
17825 ++ u64 rate = 0;
17826 ++
17827 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17828 ++ struct tcp_sock *tp = tcp_sk(sk);
17829 ++ int this_mss;
17830 ++ u64 this_rate;
17831 ++
17832 ++ if (!mptcp_sk_can_send(sk))
17833 ++ continue;
17834 ++
17835 ++		/* Do not consider subflows without an RTT estimation yet,
17836 ++ * otherwise this_rate >>> rate.
17837 ++ */
17838 ++ if (unlikely(!tp->srtt_us))
17839 ++ continue;
17840 ++
17841 ++ this_mss = mss_cb(sk);
17842 ++
17843 ++		/* If this_mss is smaller than mss, a segment will be split in
17844 ++		 * two (or more) when pushed on this subflow. For example, with
17845 ++		 * mss = 1428 and this_mss = 1420, two segments are generated: a
17846 ++		 * 1420-byte and an 8-byte segment. The latter introduces a large
17847 ++		 * overhead, as a single data segment now uses 2 slots in the
17848 ++		 * congestion window, roughly halving the potential throughput of
17849 ++		 * this subflow. Indeed, 1428 bytes are sent in those two slots,
17850 ++		 * while 2840 could have been sent if mss were 1420, reducing the
17851 ++		 * throughput by a factor of 2840 / 1428.
17852 ++ *
17853 ++		 * The following algorithm takes this overhead into account
17854 ++ * when computing the potential throughput that MPTCP can
17855 ++ * achieve when generating mss-byte segments.
17856 ++ *
17857 ++		 * The formula is the following:
17858 ++ * \sum_{\forall sub} ratio * \frac{mss * cwnd_sub}{rtt_sub}
17859 ++ * Where ratio is computed as follows:
17860 ++ * \frac{mss}{\ceil{mss / mss_sub} * mss_sub}
17861 ++ *
17862 ++ * ratio gives the reduction factor of the theoretical
17863 ++ * throughput a subflow can achieve if MPTCP uses a specific
17864 ++ * MSS value.
17865 ++ */
17866 ++ this_rate = div64_u64((u64)mss * mss * (USEC_PER_SEC << 3) *
17867 ++ max(tp->snd_cwnd, tp->packets_out),
17868 ++ (u64)tp->srtt_us *
17869 ++ DIV_ROUND_UP(mss, this_mss) * this_mss);
17870 ++ rate += this_rate;
17871 ++ }
17872 ++
17873 ++ return rate;
17874 ++}
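
To make the ratio used above concrete with the numbers from the comment: for mss = 1428 and mss_sub = 1420, ceil(1428 / 1420) = 2, so

    ratio = 1428 / (2 * 1420) = 1428 / 2840 ≈ 0.503,

i.e. such a subflow is credited with only about half of its nominal mss * cwnd / rtt throughput, while a subflow whose mss_sub is at least mss keeps ratio = 1.
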
17875 ++
17876 ++static unsigned int __mptcp_current_mss(const struct sock *meta_sk,
17877 ++ unsigned int (*mss_cb)(struct sock *sk))
17878 ++{
17879 ++ unsigned int mss = 0;
17880 ++ u64 rate = 0;
17881 ++ struct sock *sk;
17882 ++
17883 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17884 ++ int this_mss;
17885 ++ u64 this_rate;
17886 ++
17887 ++ if (!mptcp_sk_can_send(sk))
17888 ++ continue;
17889 ++
17890 ++ this_mss = mss_cb(sk);
17891 ++
17892 ++ /* Same mss values will produce the same throughput. */
17893 ++ if (this_mss == mss)
17894 ++ continue;
17895 ++
17896 ++		/* See whether using this mss value can theoretically
17897 ++		 * improve performance.
17898 ++ */
17899 ++ this_rate = mptcp_calc_rate(meta_sk, this_mss, mss_cb);
17900 ++ if (this_rate >= rate) {
17901 ++ mss = this_mss;
17902 ++ rate = this_rate;
17903 ++ }
17904 ++ }
17905 ++
17906 ++ return mss;
17907 ++}
17908 ++
17909 ++unsigned int mptcp_current_mss(struct sock *meta_sk)
17910 ++{
17911 ++ unsigned int mss = __mptcp_current_mss(meta_sk, tcp_current_mss);
17912 ++
17913 ++ /* If no subflow is available, we take a default-mss from the
17914 ++ * meta-socket.
17915 ++ */
17916 ++ return !mss ? tcp_current_mss(meta_sk) : mss;
17917 ++}
17918 ++
17919 ++static unsigned int mptcp_select_size_mss(struct sock *sk)
17920 ++{
17921 ++ return tcp_sk(sk)->mss_cache;
17922 ++}
17923 ++
17924 ++int mptcp_select_size(const struct sock *meta_sk, bool sg)
17925 ++{
17926 ++ unsigned int mss = __mptcp_current_mss(meta_sk, mptcp_select_size_mss);
17927 ++
17928 ++ if (sg) {
17929 ++ if (mptcp_sk_can_gso(meta_sk)) {
17930 ++ mss = SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
17931 ++ } else {
17932 ++ int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
17933 ++
17934 ++ if (mss >= pgbreak &&
17935 ++ mss <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
17936 ++ mss = pgbreak;
17937 ++ }
17938 ++ }
17939 ++
17940 ++ return !mss ? tcp_sk(meta_sk)->mss_cache : mss;
17941 ++}
17942 ++
17943 ++int mptcp_check_snd_buf(const struct tcp_sock *tp)
17944 ++{
17945 ++ const struct sock *sk;
17946 ++ u32 rtt_max = tp->srtt_us;
17947 ++ u64 bw_est;
17948 ++
17949 ++ if (!tp->srtt_us)
17950 ++ return tp->reordering + 1;
17951 ++
17952 ++ mptcp_for_each_sk(tp->mpcb, sk) {
17953 ++ if (!mptcp_sk_can_send(sk))
17954 ++ continue;
17955 ++
17956 ++ if (rtt_max < tcp_sk(sk)->srtt_us)
17957 ++ rtt_max = tcp_sk(sk)->srtt_us;
17958 ++ }
17959 ++
17960 ++ bw_est = div64_u64(((u64)tp->snd_cwnd * rtt_max) << 16,
17961 ++ (u64)tp->srtt_us);
17962 ++
17963 ++ return max_t(unsigned int, (u32)(bw_est >> 16),
17964 ++ tp->reordering + 1);
17965 ++}
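
As a rough illustration of mptcp_check_snd_buf (the numbers are hypothetical): with snd_cwnd = 10 and srtt_us = 40000 on this subflow, and a slowest subflow at srtt_us = 120000, bw_est >> 16 = 10 * 120000 / 40000 = 30, so the returned value sizes the send buffer for about 30 segments, enough to keep the connection busy for one RTT of the slowest path, and never for less than reordering + 1.
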
17966 ++
17967 ++unsigned int mptcp_xmit_size_goal(const struct sock *meta_sk, u32 mss_now,
17968 ++ int large_allowed)
17969 ++{
17970 ++ struct sock *sk;
17971 ++ u32 xmit_size_goal = 0;
17972 ++
17973 ++ if (large_allowed && mptcp_sk_can_gso(meta_sk)) {
17974 ++ mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
17975 ++ int this_size_goal;
17976 ++
17977 ++ if (!mptcp_sk_can_send(sk))
17978 ++ continue;
17979 ++
17980 ++ this_size_goal = tcp_xmit_size_goal(sk, mss_now, 1);
17981 ++ if (this_size_goal > xmit_size_goal)
17982 ++ xmit_size_goal = this_size_goal;
17983 ++ }
17984 ++ }
17985 ++
17986 ++ return max(xmit_size_goal, mss_now);
17987 ++}
17988 ++
17989 ++/* Similar to tcp_trim_head - but we correctly copy the DSS-option */
17990 ++int mptcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
17991 ++{
17992 ++ if (skb_cloned(skb)) {
17993 ++ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
17994 ++ return -ENOMEM;
17995 ++ }
17996 ++
17997 ++ __pskb_trim_head(skb, len);
17998 ++
17999 ++ TCP_SKB_CB(skb)->seq += len;
18000 ++ skb->ip_summed = CHECKSUM_PARTIAL;
18001 ++
18002 ++ skb->truesize -= len;
18003 ++ sk->sk_wmem_queued -= len;
18004 ++ sk_mem_uncharge(sk, len);
18005 ++ sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
18006 ++
18007 ++ /* Any change of skb->len requires recalculation of tso factor. */
18008 ++ if (tcp_skb_pcount(skb) > 1)
18009 ++ tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb));
18010 ++
18011 ++ return 0;
18012 ++}
18013 +diff --git a/net/mptcp/mptcp_pm.c b/net/mptcp/mptcp_pm.c
18014 +new file mode 100644
18015 +index 000000000000..9542f950729f
18016 +--- /dev/null
18017 ++++ b/net/mptcp/mptcp_pm.c
18018 +@@ -0,0 +1,169 @@
18019 ++/*
18020 ++ * MPTCP implementation - MPTCP-subflow-management
18021 ++ *
18022 ++ * Initial Design & Implementation:
18023 ++ * Sébastien Barré <sebastien.barre@×××××××××.be>
18024 ++ *
18025 ++ * Current Maintainer & Author:
18026 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
18027 ++ *
18028 ++ * Additional authors:
18029 ++ * Jaakko Korkeaniemi <jaakko.korkeaniemi@×××××.fi>
18030 ++ * Gregory Detal <gregory.detal@×××××××××.be>
18031 ++ * Fabien Duchêne <fabien.duchene@×××××××××.be>
18032 ++ * Andreas Seelinger <Andreas.Seelinger@×××××××××××.de>
18033 ++ * Lavkesh Lahngir <lavkesh51@×××××.com>
18034 ++ * Andreas Ripke <ripke@××××××.eu>
18035 ++ * Vlad Dogaru <vlad.dogaru@×××××.com>
18036 ++ * Octavian Purdila <octavian.purdila@×××××.com>
18037 ++ * John Ronan <jronan@××××.org>
18038 ++ * Catalin Nicutar <catalin.nicutar@×××××.com>
18039 ++ * Brandon Heller <brandonh@××××××××.edu>
18040 ++ *
18041 ++ *
18042 ++ * This program is free software; you can redistribute it and/or
18043 ++ * modify it under the terms of the GNU General Public License
18044 ++ * as published by the Free Software Foundation; either version
18045 ++ * 2 of the License, or (at your option) any later version.
18046 ++ */
18047 ++
18048 ++
18049 ++#include <linux/module.h>
18050 ++#include <net/mptcp.h>
18051 ++
18052 ++static DEFINE_SPINLOCK(mptcp_pm_list_lock);
18053 ++static LIST_HEAD(mptcp_pm_list);
18054 ++
18055 ++static int mptcp_default_id(sa_family_t family, union inet_addr *addr,
18056 ++ struct net *net, bool *low_prio)
18057 ++{
18058 ++ return 0;
18059 ++}
18060 ++
18061 ++struct mptcp_pm_ops mptcp_pm_default = {
18062 ++ .get_local_id = mptcp_default_id, /* We do not care */
18063 ++ .name = "default",
18064 ++ .owner = THIS_MODULE,
18065 ++};
18066 ++
18067 ++static struct mptcp_pm_ops *mptcp_pm_find(const char *name)
18068 ++{
18069 ++ struct mptcp_pm_ops *e;
18070 ++
18071 ++ list_for_each_entry_rcu(e, &mptcp_pm_list, list) {
18072 ++ if (strcmp(e->name, name) == 0)
18073 ++ return e;
18074 ++ }
18075 ++
18076 ++ return NULL;
18077 ++}
18078 ++
18079 ++int mptcp_register_path_manager(struct mptcp_pm_ops *pm)
18080 ++{
18081 ++ int ret = 0;
18082 ++
18083 ++ if (!pm->get_local_id)
18084 ++ return -EINVAL;
18085 ++
18086 ++ spin_lock(&mptcp_pm_list_lock);
18087 ++ if (mptcp_pm_find(pm->name)) {
18088 ++ pr_notice("%s already registered\n", pm->name);
18089 ++ ret = -EEXIST;
18090 ++ } else {
18091 ++ list_add_tail_rcu(&pm->list, &mptcp_pm_list);
18092 ++ pr_info("%s registered\n", pm->name);
18093 ++ }
18094 ++ spin_unlock(&mptcp_pm_list_lock);
18095 ++
18096 ++ return ret;
18097 ++}
18098 ++EXPORT_SYMBOL_GPL(mptcp_register_path_manager);
18099 ++
18100 ++void mptcp_unregister_path_manager(struct mptcp_pm_ops *pm)
18101 ++{
18102 ++ spin_lock(&mptcp_pm_list_lock);
18103 ++ list_del_rcu(&pm->list);
18104 ++ spin_unlock(&mptcp_pm_list_lock);
18105 ++}
18106 ++EXPORT_SYMBOL_GPL(mptcp_unregister_path_manager);
18107 ++
18108 ++void mptcp_get_default_path_manager(char *name)
18109 ++{
18110 ++ struct mptcp_pm_ops *pm;
18111 ++
18112 ++ BUG_ON(list_empty(&mptcp_pm_list));
18113 ++
18114 ++ rcu_read_lock();
18115 ++ pm = list_entry(mptcp_pm_list.next, struct mptcp_pm_ops, list);
18116 ++ strncpy(name, pm->name, MPTCP_PM_NAME_MAX);
18117 ++ rcu_read_unlock();
18118 ++}
18119 ++
18120 ++int mptcp_set_default_path_manager(const char *name)
18121 ++{
18122 ++ struct mptcp_pm_ops *pm;
18123 ++ int ret = -ENOENT;
18124 ++
18125 ++ spin_lock(&mptcp_pm_list_lock);
18126 ++ pm = mptcp_pm_find(name);
18127 ++#ifdef CONFIG_MODULES
18128 ++ if (!pm && capable(CAP_NET_ADMIN)) {
18129 ++ spin_unlock(&mptcp_pm_list_lock);
18130 ++
18131 ++ request_module("mptcp_%s", name);
18132 ++ spin_lock(&mptcp_pm_list_lock);
18133 ++ pm = mptcp_pm_find(name);
18134 ++ }
18135 ++#endif
18136 ++
18137 ++ if (pm) {
18138 ++ list_move(&pm->list, &mptcp_pm_list);
18139 ++ ret = 0;
18140 ++ } else {
18141 ++ pr_info("%s is not available\n", name);
18142 ++ }
18143 ++ spin_unlock(&mptcp_pm_list_lock);
18144 ++
18145 ++ return ret;
18146 ++}
18147 ++
18148 ++void mptcp_init_path_manager(struct mptcp_cb *mpcb)
18149 ++{
18150 ++ struct mptcp_pm_ops *pm;
18151 ++
18152 ++ rcu_read_lock();
18153 ++ list_for_each_entry_rcu(pm, &mptcp_pm_list, list) {
18154 ++ if (try_module_get(pm->owner)) {
18155 ++ mpcb->pm_ops = pm;
18156 ++ break;
18157 ++ }
18158 ++ }
18159 ++ rcu_read_unlock();
18160 ++}
18161 ++
18162 ++/* Manage refcounts on socket close. */
18163 ++void mptcp_cleanup_path_manager(struct mptcp_cb *mpcb)
18164 ++{
18165 ++ module_put(mpcb->pm_ops->owner);
18166 ++}
18167 ++
18168 ++/* Fallback to the default path-manager. */
18169 ++void mptcp_fallback_default(struct mptcp_cb *mpcb)
18170 ++{
18171 ++ struct mptcp_pm_ops *pm;
18172 ++
18173 ++ mptcp_cleanup_path_manager(mpcb);
18174 ++ pm = mptcp_pm_find("default");
18175 ++
18176 ++ /* Cannot fail - it's the default module */
18177 ++ try_module_get(pm->owner);
18178 ++ mpcb->pm_ops = pm;
18179 ++}
18180 ++EXPORT_SYMBOL_GPL(mptcp_fallback_default);
18181 ++
18182 ++/* Set default value from kernel configuration at bootup */
18183 ++static int __init mptcp_path_manager_default(void)
18184 ++{
18185 ++ return mptcp_set_default_path_manager(CONFIG_DEFAULT_MPTCP_PM);
18186 ++}
18187 ++late_initcall(mptcp_path_manager_default);
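
The registration API above is all a path manager needs. As a minimal, hypothetical sketch (not part of this patch; the "noop" names are made up), an out-of-tree module mirroring the built-in "default" manager could look like this:

#include <linux/module.h>
#include <net/mptcp.h>

/* Hypothetical path manager: like "default", it never announces
 * additional addresses or creates extra subflows.
 */
static int noop_get_local_id(sa_family_t family, union inet_addr *addr,
			     struct net *net, bool *low_prio)
{
	return 0;
}

static struct mptcp_pm_ops noop_pm = {
	.get_local_id	= noop_get_local_id,
	.name		= "noop",
	.owner		= THIS_MODULE,
};

static int __init noop_pm_init(void)
{
	return mptcp_register_path_manager(&noop_pm);
}

static void __exit noop_pm_exit(void)
{
	mptcp_unregister_path_manager(&noop_pm);
}

module_init(noop_pm_init);
module_exit(noop_pm_exit);
MODULE_LICENSE("GPL");

Once registered, it becomes selectable by name through mptcp_set_default_path_manager(), shown above, which also tries request_module("mptcp_noop") when the module is not yet loaded.
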
18188 +diff --git a/net/mptcp/mptcp_rr.c b/net/mptcp/mptcp_rr.c
18189 +new file mode 100644
18190 +index 000000000000..93278f684069
18191 +--- /dev/null
18192 ++++ b/net/mptcp/mptcp_rr.c
18193 +@@ -0,0 +1,301 @@
18194 ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
18195 ++
18196 ++#include <linux/module.h>
18197 ++#include <net/mptcp.h>
18198 ++
18199 ++static unsigned char num_segments __read_mostly = 1;
18200 ++module_param(num_segments, byte, 0644);
18201 ++MODULE_PARM_DESC(num_segments, "The number of consecutive segments that are part of a burst");
18202 ++
18203 ++static bool cwnd_limited __read_mostly = 1;
18204 ++module_param(cwnd_limited, bool, 0644);
18205 ++MODULE_PARM_DESC(cwnd_limited, "if set to 1, the scheduler tries to fill the congestion-window on all subflows");
18206 ++
18207 ++struct rrsched_priv {
18208 ++ unsigned char quota;
18209 ++};
18210 ++
18211 ++static struct rrsched_priv *rrsched_get_priv(const struct tcp_sock *tp)
18212 ++{
18213 ++ return (struct rrsched_priv *)&tp->mptcp->mptcp_sched[0];
18214 ++}
18215 ++
18216 ++/* Is the sub-socket sk available to send the skb? */
18217 ++static bool mptcp_rr_is_available(const struct sock *sk, const struct sk_buff *skb,
18218 ++ bool zero_wnd_test, bool cwnd_test)
18219 ++{
18220 ++ const struct tcp_sock *tp = tcp_sk(sk);
18221 ++ unsigned int space, in_flight;
18222 ++
18223 ++ /* Set of states for which we are allowed to send data */
18224 ++ if (!mptcp_sk_can_send(sk))
18225 ++ return false;
18226 ++
18227 ++ /* We do not send data on this subflow unless it is
18228 ++ * fully established, i.e. the 4th ack has been received.
18229 ++ */
18230 ++ if (tp->mptcp->pre_established)
18231 ++ return false;
18232 ++
18233 ++ if (tp->pf)
18234 ++ return false;
18235 ++
18236 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18237 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18238 ++ * the loss-state until something above high_seq has been acked.
18239 ++ * (see tcp_try_undo_recovery)
18240 ++ *
18241 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18242 ++ * as we have an RTO, we won't push data on the subflow.
18243 ++ * Thus, snd_una can never go beyond high_seq.
18244 ++ */
18245 ++ if (!tcp_is_reno(tp))
18246 ++ return false;
18247 ++ else if (tp->snd_una != tp->high_seq)
18248 ++ return false;
18249 ++ }
18250 ++
18251 ++ if (!tp->mptcp->fully_established) {
18252 ++ /* Make sure that we send in-order data */
18253 ++ if (skb && tp->mptcp->second_packet &&
18254 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18255 ++ return false;
18256 ++ }
18257 ++
18258 ++ if (!cwnd_test)
18259 ++ goto zero_wnd_test;
18260 ++
18261 ++ in_flight = tcp_packets_in_flight(tp);
18262 ++ /* Not even a single spot in the cwnd */
18263 ++ if (in_flight >= tp->snd_cwnd)
18264 ++ return false;
18265 ++
18266 ++ /* Now, check if what is queued in the subflow's send-queue
18267 ++ * already fills the cwnd.
18268 ++ */
18269 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18270 ++
18271 ++ if (tp->write_seq - tp->snd_nxt > space)
18272 ++ return false;
18273 ++
18274 ++zero_wnd_test:
18275 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18276 ++ return false;
18277 ++
18278 ++ return true;
18279 ++}
18280 ++
18281 ++/* Are we not allowed to reinject this skb on tp? */
18282 ++static int mptcp_rr_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18283 ++{
18284 ++ /* If the skb has already been enqueued in this sk, try to find
18285 ++ * another one.
18286 ++ */
18287 ++ return skb &&
18288 ++ /* Has the skb already been enqueued into this subsocket? */
18289 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18290 ++}
18291 ++
18292 ++/* We just look for any subflow that is available */
18293 ++static struct sock *rr_get_available_subflow(struct sock *meta_sk,
18294 ++ struct sk_buff *skb,
18295 ++ bool zero_wnd_test)
18296 ++{
18297 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18298 ++ struct sock *sk, *bestsk = NULL, *backupsk = NULL;
18299 ++
18300 ++ /* Answer data_fin on same subflow!!! */
18301 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18302 ++ skb && mptcp_is_data_fin(skb)) {
18303 ++ mptcp_for_each_sk(mpcb, sk) {
18304 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18305 ++ mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18306 ++ return sk;
18307 ++ }
18308 ++ }
18309 ++
18310 ++ /* First, find the best subflow */
18311 ++ mptcp_for_each_sk(mpcb, sk) {
18312 ++ struct tcp_sock *tp = tcp_sk(sk);
18313 ++
18314 ++ if (!mptcp_rr_is_available(sk, skb, zero_wnd_test, true))
18315 ++ continue;
18316 ++
18317 ++ if (mptcp_rr_dont_reinject_skb(tp, skb)) {
18318 ++ backupsk = sk;
18319 ++ continue;
18320 ++ }
18321 ++
18322 ++ bestsk = sk;
18323 ++ }
18324 ++
18325 ++ if (bestsk) {
18326 ++ sk = bestsk;
18327 ++ } else if (backupsk) {
18328 ++ /* It has been sent on all subflows once - let's give it a
18329 ++ * chance again by restarting its pathmask.
18330 ++ */
18331 ++ if (skb)
18332 ++ TCP_SKB_CB(skb)->path_mask = 0;
18333 ++ sk = backupsk;
18334 ++ }
18335 ++
18336 ++ return sk;
18337 ++}
18338 ++
18339 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18340 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18341 ++ * chooses the normal write queue).
18342 ++ * Sets *@reinject to 1 if the returned segment comes from the
18343 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18344 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18345 ++ * receive-buffer.
18346 ++ */
18347 ++static struct sk_buff *__mptcp_rr_next_segment(const struct sock *meta_sk, int *reinject)
18348 ++{
18349 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18350 ++ struct sk_buff *skb = NULL;
18351 ++
18352 ++ *reinject = 0;
18353 ++
18354 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18355 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18356 ++ return tcp_send_head(meta_sk);
18357 ++
18358 ++ skb = skb_peek(&mpcb->reinject_queue);
18359 ++
18360 ++ if (skb)
18361 ++ *reinject = 1;
18362 ++ else
18363 ++ skb = tcp_send_head(meta_sk);
18364 ++ return skb;
18365 ++}
18366 ++
18367 ++static struct sk_buff *mptcp_rr_next_segment(struct sock *meta_sk,
18368 ++ int *reinject,
18369 ++ struct sock **subsk,
18370 ++ unsigned int *limit)
18371 ++{
18372 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18373 ++ struct sock *sk_it, *choose_sk = NULL;
18374 ++ struct sk_buff *skb = __mptcp_rr_next_segment(meta_sk, reinject);
18375 ++ unsigned char split = num_segments;
18376 ++ unsigned char iter = 0, full_subs = 0;
18377 ++
18378 ++ /* As we set it, we have to reset it as well. */
18379 ++ *limit = 0;
18380 ++
18381 ++ if (!skb)
18382 ++ return NULL;
18383 ++
18384 ++ if (*reinject) {
18385 ++ *subsk = rr_get_available_subflow(meta_sk, skb, false);
18386 ++ if (!*subsk)
18387 ++ return NULL;
18388 ++
18389 ++ return skb;
18390 ++ }
18391 ++
18392 ++retry:
18393 ++
18394 ++	/* First, we look for a subflow that is currently being used */
18395 ++ mptcp_for_each_sk(mpcb, sk_it) {
18396 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18397 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18398 ++
18399 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18400 ++ continue;
18401 ++
18402 ++ iter++;
18403 ++
18404 ++ /* Is this subflow currently being used? */
18405 ++ if (rsp->quota > 0 && rsp->quota < num_segments) {
18406 ++ split = num_segments - rsp->quota;
18407 ++ choose_sk = sk_it;
18408 ++ goto found;
18409 ++ }
18410 ++
18411 ++ /* Or, it's totally unused */
18412 ++ if (!rsp->quota) {
18413 ++ split = num_segments;
18414 ++ choose_sk = sk_it;
18415 ++ }
18416 ++
18417 ++ /* Or, it must then be fully used */
18418 ++ if (rsp->quota == num_segments)
18419 ++ full_subs++;
18420 ++ }
18421 ++
18422 ++ /* All considered subflows have a full quota, and we considered at
18423 ++ * least one.
18424 ++ */
18425 ++ if (iter && iter == full_subs) {
18426 ++		/* So, we restart this round by setting the quota to 0 and
18427 ++		 * retrying to find a subflow.
18428 ++ */
18429 ++ mptcp_for_each_sk(mpcb, sk_it) {
18430 ++ struct tcp_sock *tp_it = tcp_sk(sk_it);
18431 ++ struct rrsched_priv *rsp = rrsched_get_priv(tp_it);
18432 ++
18433 ++ if (!mptcp_rr_is_available(sk_it, skb, false, cwnd_limited))
18434 ++ continue;
18435 ++
18436 ++ rsp->quota = 0;
18437 ++ }
18438 ++
18439 ++ goto retry;
18440 ++ }
18441 ++
18442 ++found:
18443 ++ if (choose_sk) {
18444 ++ unsigned int mss_now;
18445 ++ struct tcp_sock *choose_tp = tcp_sk(choose_sk);
18446 ++ struct rrsched_priv *rsp = rrsched_get_priv(choose_tp);
18447 ++
18448 ++ if (!mptcp_rr_is_available(choose_sk, skb, false, true))
18449 ++ return NULL;
18450 ++
18451 ++ *subsk = choose_sk;
18452 ++ mss_now = tcp_current_mss(*subsk);
18453 ++ *limit = split * mss_now;
18454 ++
18455 ++ if (skb->len > mss_now)
18456 ++ rsp->quota += DIV_ROUND_UP(skb->len, mss_now);
18457 ++ else
18458 ++ rsp->quota++;
18459 ++
18460 ++ return skb;
18461 ++ }
18462 ++
18463 ++ return NULL;
18464 ++}
18465 ++
18466 ++static struct mptcp_sched_ops mptcp_sched_rr = {
18467 ++ .get_subflow = rr_get_available_subflow,
18468 ++ .next_segment = mptcp_rr_next_segment,
18469 ++ .name = "roundrobin",
18470 ++ .owner = THIS_MODULE,
18471 ++};
18472 ++
18473 ++static int __init rr_register(void)
18474 ++{
18475 ++ BUILD_BUG_ON(sizeof(struct rrsched_priv) > MPTCP_SCHED_SIZE);
18476 ++
18477 ++ if (mptcp_register_scheduler(&mptcp_sched_rr))
18478 ++ return -1;
18479 ++
18480 ++ return 0;
18481 ++}
18482 ++
18483 ++static void rr_unregister(void)
18484 ++{
18485 ++ mptcp_unregister_scheduler(&mptcp_sched_rr);
18486 ++}
18487 ++
18488 ++module_init(rr_register);
18489 ++module_exit(rr_unregister);
18490 ++
18491 ++MODULE_AUTHOR("Christoph Paasch");
18492 ++MODULE_LICENSE("GPL");
18493 ++MODULE_DESCRIPTION("ROUNDROBIN MPTCP");
18494 ++MODULE_VERSION("0.89");
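
A brief worked trace of the quota logic above, with hypothetical values: num_segments = 3 and two sendable subflows whose quotas are both 0. One of them is chosen with split = 3, so *limit allows it up to 3 * mss_now bytes; a 4500-byte skb on a 1500-byte MSS bumps its quota by DIV_ROUND_UP(4500, 1500) = 3 in one go, marking it full. The next call picks the other subflow the same way, and once iter == full_subs (every considered subflow has quota == num_segments) all quotas are reset to 0 and the round starts over.
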
18495 +diff --git a/net/mptcp/mptcp_sched.c b/net/mptcp/mptcp_sched.c
18496 +new file mode 100644
18497 +index 000000000000..6c7ff4eceac1
18498 +--- /dev/null
18499 ++++ b/net/mptcp/mptcp_sched.c
18500 +@@ -0,0 +1,493 @@
18501 ++/* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
18502 ++
18503 ++#include <linux/module.h>
18504 ++#include <net/mptcp.h>
18505 ++
18506 ++static DEFINE_SPINLOCK(mptcp_sched_list_lock);
18507 ++static LIST_HEAD(mptcp_sched_list);
18508 ++
18509 ++struct defsched_priv {
18510 ++ u32 last_rbuf_opti;
18511 ++};
18512 ++
18513 ++static struct defsched_priv *defsched_get_priv(const struct tcp_sock *tp)
18514 ++{
18515 ++ return (struct defsched_priv *)&tp->mptcp->mptcp_sched[0];
18516 ++}
18517 ++
18518 ++/* Is the sub-socket sk available to send the skb? */
18519 ++static bool mptcp_is_available(struct sock *sk, const struct sk_buff *skb,
18520 ++ bool zero_wnd_test)
18521 ++{
18522 ++ const struct tcp_sock *tp = tcp_sk(sk);
18523 ++ unsigned int mss_now, space, in_flight;
18524 ++
18525 ++ /* Set of states for which we are allowed to send data */
18526 ++ if (!mptcp_sk_can_send(sk))
18527 ++ return false;
18528 ++
18529 ++ /* We do not send data on this subflow unless it is
18530 ++ * fully established, i.e. the 4th ack has been received.
18531 ++ */
18532 ++ if (tp->mptcp->pre_established)
18533 ++ return false;
18534 ++
18535 ++ if (tp->pf)
18536 ++ return false;
18537 ++
18538 ++ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
18539 ++ /* If SACK is disabled, and we got a loss, TCP does not exit
18540 ++ * the loss-state until something above high_seq has been acked.
18541 ++ * (see tcp_try_undo_recovery)
18542 ++ *
18543 ++ * high_seq is the snd_nxt at the moment of the RTO. As soon
18544 ++ * as we have an RTO, we won't push data on the subflow.
18545 ++ * Thus, snd_una can never go beyond high_seq.
18546 ++ */
18547 ++ if (!tcp_is_reno(tp))
18548 ++ return false;
18549 ++ else if (tp->snd_una != tp->high_seq)
18550 ++ return false;
18551 ++ }
18552 ++
18553 ++ if (!tp->mptcp->fully_established) {
18554 ++ /* Make sure that we send in-order data */
18555 ++ if (skb && tp->mptcp->second_packet &&
18556 ++ tp->mptcp->last_end_data_seq != TCP_SKB_CB(skb)->seq)
18557 ++ return false;
18558 ++ }
18559 ++
18560 ++ /* If TSQ is already throttling us, do not send on this subflow. When
18561 ++ * TSQ gets cleared the subflow becomes eligible again.
18562 ++ */
18563 ++ if (test_bit(TSQ_THROTTLED, &tp->tsq_flags))
18564 ++ return false;
18565 ++
18566 ++ in_flight = tcp_packets_in_flight(tp);
18567 ++ /* Not even a single spot in the cwnd */
18568 ++ if (in_flight >= tp->snd_cwnd)
18569 ++ return false;
18570 ++
18571 ++ /* Now, check if what is queued in the subflow's send-queue
18572 ++ * already fills the cwnd.
18573 ++ */
18574 ++ space = (tp->snd_cwnd - in_flight) * tp->mss_cache;
18575 ++
18576 ++ if (tp->write_seq - tp->snd_nxt > space)
18577 ++ return false;
18578 ++
18579 ++ if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
18580 ++ return false;
18581 ++
18582 ++ mss_now = tcp_current_mss(sk);
18583 ++
18584 ++ /* Don't send on this subflow if we bypass the allowed send-window at
18585 ++	 * the per-subflow level. Similar to tcp_snd_wnd_test, but with a manually
18586 ++	 * calculated end_seq (because at this point end_seq is still at
18587 ++ * the meta-level).
18588 ++ */
18589 ++ if (skb && !zero_wnd_test &&
18590 ++ after(tp->write_seq + min(skb->len, mss_now), tcp_wnd_end(tp)))
18591 ++ return false;
18592 ++
18593 ++ return true;
18594 ++}
18595 ++
18596 ++/* Are we not allowed to reinject this skb on tp? */
18597 ++static int mptcp_dont_reinject_skb(const struct tcp_sock *tp, const struct sk_buff *skb)
18598 ++{
18599 ++ /* If the skb has already been enqueued in this sk, try to find
18600 ++ * another one.
18601 ++ */
18602 ++ return skb &&
18603 ++ /* Has the skb already been enqueued into this subsocket? */
18604 ++ mptcp_pi_to_flag(tp->mptcp->path_index) & TCP_SKB_CB(skb)->path_mask;
18605 ++}
18606 ++
18607 ++/* This is the scheduler. This function decides on which flow to send
18608 ++ * a given MSS. If all subflows are found to be busy, NULL is returned.
18609 ++ * The flow is selected based on the shortest RTT.
18610 ++ * If all paths have full cong windows, we simply return NULL.
18611 ++ *
18612 ++ * Additionally, this function is aware of the backup-subflows.
18613 ++ */
18614 ++static struct sock *get_available_subflow(struct sock *meta_sk,
18615 ++ struct sk_buff *skb,
18616 ++ bool zero_wnd_test)
18617 ++{
18618 ++ struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18619 ++ struct sock *sk, *bestsk = NULL, *lowpriosk = NULL, *backupsk = NULL;
18620 ++ u32 min_time_to_peer = 0xffffffff, lowprio_min_time_to_peer = 0xffffffff;
18621 ++ int cnt_backups = 0;
18622 ++
18623 ++ /* if there is only one subflow, bypass the scheduling function */
18624 ++ if (mpcb->cnt_subflows == 1) {
18625 ++ bestsk = (struct sock *)mpcb->connection_list;
18626 ++ if (!mptcp_is_available(bestsk, skb, zero_wnd_test))
18627 ++ bestsk = NULL;
18628 ++ return bestsk;
18629 ++ }
18630 ++
18631 ++ /* Answer data_fin on same subflow!!! */
18632 ++ if (meta_sk->sk_shutdown & RCV_SHUTDOWN &&
18633 ++ skb && mptcp_is_data_fin(skb)) {
18634 ++ mptcp_for_each_sk(mpcb, sk) {
18635 ++ if (tcp_sk(sk)->mptcp->path_index == mpcb->dfin_path_index &&
18636 ++ mptcp_is_available(sk, skb, zero_wnd_test))
18637 ++ return sk;
18638 ++ }
18639 ++ }
18640 ++
18641 ++ /* First, find the best subflow */
18642 ++ mptcp_for_each_sk(mpcb, sk) {
18643 ++ struct tcp_sock *tp = tcp_sk(sk);
18644 ++
18645 ++ if (tp->mptcp->rcv_low_prio || tp->mptcp->low_prio)
18646 ++ cnt_backups++;
18647 ++
18648 ++ if ((tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18649 ++ tp->srtt_us < lowprio_min_time_to_peer) {
18650 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18651 ++ continue;
18652 ++
18653 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18654 ++ backupsk = sk;
18655 ++ continue;
18656 ++ }
18657 ++
18658 ++ lowprio_min_time_to_peer = tp->srtt_us;
18659 ++ lowpriosk = sk;
18660 ++ } else if (!(tp->mptcp->rcv_low_prio || tp->mptcp->low_prio) &&
18661 ++ tp->srtt_us < min_time_to_peer) {
18662 ++ if (!mptcp_is_available(sk, skb, zero_wnd_test))
18663 ++ continue;
18664 ++
18665 ++ if (mptcp_dont_reinject_skb(tp, skb)) {
18666 ++ backupsk = sk;
18667 ++ continue;
18668 ++ }
18669 ++
18670 ++ min_time_to_peer = tp->srtt_us;
18671 ++ bestsk = sk;
18672 ++ }
18673 ++ }
18674 ++
18675 ++ if (mpcb->cnt_established == cnt_backups && lowpriosk) {
18676 ++ sk = lowpriosk;
18677 ++ } else if (bestsk) {
18678 ++ sk = bestsk;
18679 ++ } else if (backupsk) {
18680 ++ /* It has been sent on all subflows once - let's give it a
18681 ++ * chance again by restarting its pathmask.
18682 ++ */
18683 ++ if (skb)
18684 ++ TCP_SKB_CB(skb)->path_mask = 0;
18685 ++ sk = backupsk;
18686 ++ }
18687 ++
18688 ++ return sk;
18689 ++}
18690 ++
18691 ++static struct sk_buff *mptcp_rcv_buf_optimization(struct sock *sk, int penal)
18692 ++{
18693 ++ struct sock *meta_sk;
18694 ++ const struct tcp_sock *tp = tcp_sk(sk);
18695 ++ struct tcp_sock *tp_it;
18696 ++ struct sk_buff *skb_head;
18697 ++ struct defsched_priv *dsp = defsched_get_priv(tp);
18698 ++
18699 ++ if (tp->mpcb->cnt_subflows == 1)
18700 ++ return NULL;
18701 ++
18702 ++ meta_sk = mptcp_meta_sk(sk);
18703 ++ skb_head = tcp_write_queue_head(meta_sk);
18704 ++
18705 ++ if (!skb_head || skb_head == tcp_send_head(meta_sk))
18706 ++ return NULL;
18707 ++
18708 ++	/* If penalization is optional (coming from mptcp_next_segment()) and
18709 ++	 * we are not send-buffer-limited, we do not penalize. The retransmission
18710 ++	 * is just an optimization to fix the idle-time due to the delay before
18711 ++ * we wake up the application.
18712 ++ */
18713 ++ if (!penal && sk_stream_memory_free(meta_sk))
18714 ++ goto retrans;
18715 ++
18716 ++ /* Only penalize again after an RTT has elapsed */
18717 ++ if (tcp_time_stamp - dsp->last_rbuf_opti < usecs_to_jiffies(tp->srtt_us >> 3))
18718 ++ goto retrans;
18719 ++
18720 ++ /* Half the cwnd of the slow flow */
18721 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18722 ++ if (tp_it != tp &&
18723 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18724 ++ if (tp->srtt_us < tp_it->srtt_us && inet_csk((struct sock *)tp_it)->icsk_ca_state == TCP_CA_Open) {
18725 ++ tp_it->snd_cwnd = max(tp_it->snd_cwnd >> 1U, 1U);
18726 ++ if (tp_it->snd_ssthresh != TCP_INFINITE_SSTHRESH)
18727 ++ tp_it->snd_ssthresh = max(tp_it->snd_ssthresh >> 1U, 2U);
18728 ++
18729 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18730 ++ }
18731 ++ break;
18732 ++ }
18733 ++ }
18734 ++
18735 ++retrans:
18736 ++
18737 ++ /* Segment not yet injected into this path? Take it!!! */
18738 ++ if (!(TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp->mptcp->path_index))) {
18739 ++ bool do_retrans = false;
18740 ++ mptcp_for_each_tp(tp->mpcb, tp_it) {
18741 ++ if (tp_it != tp &&
18742 ++ TCP_SKB_CB(skb_head)->path_mask & mptcp_pi_to_flag(tp_it->mptcp->path_index)) {
18743 ++ if (tp_it->snd_cwnd <= 4) {
18744 ++ do_retrans = true;
18745 ++ break;
18746 ++ }
18747 ++
18748 ++ if (4 * tp->srtt_us >= tp_it->srtt_us) {
18749 ++ do_retrans = false;
18750 ++ break;
18751 ++ } else {
18752 ++ do_retrans = true;
18753 ++ }
18754 ++ }
18755 ++ }
18756 ++
18757 ++ if (do_retrans && mptcp_is_available(sk, skb_head, false))
18758 ++ return skb_head;
18759 ++ }
18760 ++ return NULL;
18761 ++}
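
For example (hypothetical values): subflow A has an srtt of 10 ms and the head of the meta write-queue was sent only on subflow B, which has an srtt of 50 ms and is in TCP_CA_Open. When A calls mptcp_rcv_buf_optimization() with penal set, B's cwnd and ssthresh are halved (at most once per RTT, tracked via last_rbuf_opti), and because 4 * 10 ms < 50 ms the head is handed back so A can retransmit it, filling the hole at the meta level without waiting for B.
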
18762 ++
18763 ++/* Returns the next segment to be sent from the mptcp meta-queue.
18764 ++ * (chooses the reinject queue if any segment is waiting in it, otherwise,
18765 ++ * chooses the normal write queue).
18766 ++ * Sets *@reinject to 1 if the returned segment comes from the
18767 ++ * reinject queue. Sets it to 0 if it is the regular send-head of the meta-sk,
18768 ++ * and sets it to -1 if it is a meta-level retransmission to optimize the
18769 ++ * receive-buffer.
18770 ++ */
18771 ++static struct sk_buff *__mptcp_next_segment(struct sock *meta_sk, int *reinject)
18772 ++{
18773 ++ const struct mptcp_cb *mpcb = tcp_sk(meta_sk)->mpcb;
18774 ++ struct sk_buff *skb = NULL;
18775 ++
18776 ++ *reinject = 0;
18777 ++
18778 ++ /* If we are in fallback-mode, just take from the meta-send-queue */
18779 ++ if (mpcb->infinite_mapping_snd || mpcb->send_infinite_mapping)
18780 ++ return tcp_send_head(meta_sk);
18781 ++
18782 ++ skb = skb_peek(&mpcb->reinject_queue);
18783 ++
18784 ++ if (skb) {
18785 ++ *reinject = 1;
18786 ++ } else {
18787 ++ skb = tcp_send_head(meta_sk);
18788 ++
18789 ++ if (!skb && meta_sk->sk_socket &&
18790 ++ test_bit(SOCK_NOSPACE, &meta_sk->sk_socket->flags) &&
18791 ++ sk_stream_wspace(meta_sk) < sk_stream_min_wspace(meta_sk)) {
18792 ++ struct sock *subsk = get_available_subflow(meta_sk, NULL,
18793 ++ false);
18794 ++ if (!subsk)
18795 ++ return NULL;
18796 ++
18797 ++ skb = mptcp_rcv_buf_optimization(subsk, 0);
18798 ++ if (skb)
18799 ++ *reinject = -1;
18800 ++ }
18801 ++ }
18802 ++ return skb;
18803 ++}
18804 ++
18805 ++static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
18806 ++ int *reinject,
18807 ++ struct sock **subsk,
18808 ++ unsigned int *limit)
18809 ++{
18810 ++ struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
18811 ++ unsigned int mss_now;
18812 ++ struct tcp_sock *subtp;
18813 ++ u16 gso_max_segs;
18814 ++ u32 max_len, max_segs, window, needed;
18815 ++
18816 ++ /* As we set it, we have to reset it as well. */
18817 ++ *limit = 0;
18818 ++
18819 ++ if (!skb)
18820 ++ return NULL;
18821 ++
18822 ++ *subsk = get_available_subflow(meta_sk, skb, false);
18823 ++ if (!*subsk)
18824 ++ return NULL;
18825 ++
18826 ++ subtp = tcp_sk(*subsk);
18827 ++ mss_now = tcp_current_mss(*subsk);
18828 ++
18829 ++ if (!*reinject && unlikely(!tcp_snd_wnd_test(tcp_sk(meta_sk), skb, mss_now))) {
18830 ++ skb = mptcp_rcv_buf_optimization(*subsk, 1);
18831 ++ if (skb)
18832 ++ *reinject = -1;
18833 ++ else
18834 ++ return NULL;
18835 ++ }
18836 ++
18837 ++ /* No splitting required, as we will only send one single segment */
18838 ++ if (skb->len <= mss_now)
18839 ++ return skb;
18840 ++
18841 ++ /* The following is similar to tcp_mss_split_point, but
18842 ++	 * we do not care about Nagle, because we will anyway
18843 ++ * use TCP_NAGLE_PUSH, which overrides this.
18844 ++ *
18845 ++ * So, we first limit according to the cwnd/gso-size and then according
18846 ++ * to the subflow's window.
18847 ++ */
18848 ++
18849 ++ gso_max_segs = (*subsk)->sk_gso_max_segs;
18850 ++ if (!gso_max_segs) /* No gso supported on the subflow's NIC */
18851 ++ gso_max_segs = 1;
18852 ++ max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
18853 ++ if (!max_segs)
18854 ++ return NULL;
18855 ++
18856 ++ max_len = mss_now * max_segs;
18857 ++ window = tcp_wnd_end(subtp) - subtp->write_seq;
18858 ++
18859 ++ needed = min(skb->len, window);
18860 ++ if (max_len <= skb->len)
18861 ++		/* Take max_len, which is actually the cwnd/gso-size limit */
18862 ++ *limit = max_len;
18863 ++ else
18864 ++ /* Or, take the window */
18865 ++ *limit = needed;
18866 ++
18867 ++ return skb;
18868 ++}
18869 ++
18870 ++static void defsched_init(struct sock *sk)
18871 ++{
18872 ++ struct defsched_priv *dsp = defsched_get_priv(tcp_sk(sk));
18873 ++
18874 ++ dsp->last_rbuf_opti = tcp_time_stamp;
18875 ++}
18876 ++
18877 ++struct mptcp_sched_ops mptcp_sched_default = {
18878 ++ .get_subflow = get_available_subflow,
18879 ++ .next_segment = mptcp_next_segment,
18880 ++ .init = defsched_init,
18881 ++ .name = "default",
18882 ++ .owner = THIS_MODULE,
18883 ++};
18884 ++
18885 ++static struct mptcp_sched_ops *mptcp_sched_find(const char *name)
18886 ++{
18887 ++ struct mptcp_sched_ops *e;
18888 ++
18889 ++ list_for_each_entry_rcu(e, &mptcp_sched_list, list) {
18890 ++ if (strcmp(e->name, name) == 0)
18891 ++ return e;
18892 ++ }
18893 ++
18894 ++ return NULL;
18895 ++}
18896 ++
18897 ++int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
18898 ++{
18899 ++ int ret = 0;
18900 ++
18901 ++ if (!sched->get_subflow || !sched->next_segment)
18902 ++ return -EINVAL;
18903 ++
18904 ++ spin_lock(&mptcp_sched_list_lock);
18905 ++ if (mptcp_sched_find(sched->name)) {
18906 ++ pr_notice("%s already registered\n", sched->name);
18907 ++ ret = -EEXIST;
18908 ++ } else {
18909 ++ list_add_tail_rcu(&sched->list, &mptcp_sched_list);
18910 ++ pr_info("%s registered\n", sched->name);
18911 ++ }
18912 ++ spin_unlock(&mptcp_sched_list_lock);
18913 ++
18914 ++ return ret;
18915 ++}
18916 ++EXPORT_SYMBOL_GPL(mptcp_register_scheduler);
18917 ++
18918 ++void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
18919 ++{
18920 ++ spin_lock(&mptcp_sched_list_lock);
18921 ++ list_del_rcu(&sched->list);
18922 ++ spin_unlock(&mptcp_sched_list_lock);
18923 ++}
18924 ++EXPORT_SYMBOL_GPL(mptcp_unregister_scheduler);
18925 ++
18926 ++void mptcp_get_default_scheduler(char *name)
18927 ++{
18928 ++ struct mptcp_sched_ops *sched;
18929 ++
18930 ++ BUG_ON(list_empty(&mptcp_sched_list));
18931 ++
18932 ++ rcu_read_lock();
18933 ++ sched = list_entry(mptcp_sched_list.next, struct mptcp_sched_ops, list);
18934 ++ strncpy(name, sched->name, MPTCP_SCHED_NAME_MAX);
18935 ++ rcu_read_unlock();
18936 ++}
18937 ++
18938 ++int mptcp_set_default_scheduler(const char *name)
18939 ++{
18940 ++ struct mptcp_sched_ops *sched;
18941 ++ int ret = -ENOENT;
18942 ++
18943 ++ spin_lock(&mptcp_sched_list_lock);
18944 ++ sched = mptcp_sched_find(name);
18945 ++#ifdef CONFIG_MODULES
18946 ++ if (!sched && capable(CAP_NET_ADMIN)) {
18947 ++ spin_unlock(&mptcp_sched_list_lock);
18948 ++
18949 ++ request_module("mptcp_%s", name);
18950 ++ spin_lock(&mptcp_sched_list_lock);
18951 ++ sched = mptcp_sched_find(name);
18952 ++ }
18953 ++#endif
18954 ++
18955 ++ if (sched) {
18956 ++ list_move(&sched->list, &mptcp_sched_list);
18957 ++ ret = 0;
18958 ++ } else {
18959 ++ pr_info("%s is not available\n", name);
18960 ++ }
18961 ++ spin_unlock(&mptcp_sched_list_lock);
18962 ++
18963 ++ return ret;
18964 ++}
18965 ++
18966 ++void mptcp_init_scheduler(struct mptcp_cb *mpcb)
18967 ++{
18968 ++ struct mptcp_sched_ops *sched;
18969 ++
18970 ++ rcu_read_lock();
18971 ++ list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
18972 ++ if (try_module_get(sched->owner)) {
18973 ++ mpcb->sched_ops = sched;
18974 ++ break;
18975 ++ }
18976 ++ }
18977 ++ rcu_read_unlock();
18978 ++}
18979 ++
18980 ++/* Manage refcounts on socket close. */
18981 ++void mptcp_cleanup_scheduler(struct mptcp_cb *mpcb)
18982 ++{
18983 ++ module_put(mpcb->sched_ops->owner);
18984 ++}
18985 ++
18986 ++/* Set default value from kernel configuration at bootup */
18987 ++static int __init mptcp_scheduler_default(void)
18988 ++{
18989 ++ BUILD_BUG_ON(sizeof(struct defsched_priv) > MPTCP_SCHED_SIZE);
18990 ++
18991 ++ return mptcp_set_default_scheduler(CONFIG_DEFAULT_MPTCP_SCHED);
18992 ++}
18993 ++late_initcall(mptcp_scheduler_default);
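
For completeness, here is a minimal, hypothetical sketch of a third-party scheduler built on the ops interface above (none of these names exist in the patch); it simply always picks the first sendable subflow and hands out the meta send-head without splitting:

#include <linux/module.h>
#include <net/mptcp.h>

static struct sock *firstflow_get_subflow(struct sock *meta_sk,
					  struct sk_buff *skb,
					  bool zero_wnd_test)
{
	struct sock *sk;

	/* Pick the first subflow that is in a state allowing transmission */
	mptcp_for_each_sk(tcp_sk(meta_sk)->mpcb, sk) {
		if (mptcp_sk_can_send(sk))
			return sk;
	}
	return NULL;
}

static struct sk_buff *firstflow_next_segment(struct sock *meta_sk,
					      int *reinject,
					      struct sock **subsk,
					      unsigned int *limit)
{
	struct sk_buff *skb = tcp_send_head(meta_sk);

	*reinject = 0;
	*limit = 0;	/* no split limit, as in the default scheduler's
			 * "no splitting required" path above
			 */
	if (!skb)
		return NULL;

	*subsk = firstflow_get_subflow(meta_sk, skb, false);
	return *subsk ? skb : NULL;
}

static struct mptcp_sched_ops mptcp_sched_firstflow = {
	.get_subflow	= firstflow_get_subflow,
	.next_segment	= firstflow_next_segment,
	.name		= "firstflow",
	.owner		= THIS_MODULE,
};

static int __init firstflow_register(void)
{
	return mptcp_register_scheduler(&mptcp_sched_firstflow);
}

static void __exit firstflow_unregister(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_firstflow);
}

module_init(firstflow_register);
module_exit(firstflow_unregister);
MODULE_LICENSE("GPL");
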
18994 +diff --git a/net/mptcp/mptcp_wvegas.c b/net/mptcp/mptcp_wvegas.c
18995 +new file mode 100644
18996 +index 000000000000..29ca1d868d17
18997 +--- /dev/null
18998 ++++ b/net/mptcp/mptcp_wvegas.c
18999 +@@ -0,0 +1,268 @@
19000 ++/*
19001 ++ * MPTCP implementation - WEIGHTED VEGAS
19002 ++ *
19003 ++ * Algorithm design:
19004 ++ * Yu Cao <cyAnalyst@×××.com>
19005 ++ * Mingwei Xu <xmw@××××××××××××××××××××××.cn>
19006 ++ * Xiaoming Fu <fu@××××××××××××××××××.de>
19007 ++ *
19008 ++ * Implementation:
19009 ++ * Yu Cao <cyAnalyst@×××.com>
19010 ++ * Enhuan Dong <deh13@××××××××××××××××××.cn>
19011 ++ *
19012 ++ * Ported to the official MPTCP-kernel:
19013 ++ * Christoph Paasch <christoph.paasch@×××××××××.be>
19014 ++ *
19015 ++ * This program is free software; you can redistribute it and/or
19016 ++ * modify it under the terms of the GNU General Public License
19017 ++ * as published by the Free Software Foundation; either version
19018 ++ * 2 of the License, or (at your option) any later version.
19019 ++ */
19020 ++
19021 ++#include <linux/skbuff.h>
19022 ++#include <net/tcp.h>
19023 ++#include <net/mptcp.h>
19024 ++#include <linux/module.h>
19025 ++#include <linux/tcp.h>
19026 ++
19027 ++static int initial_alpha = 2;
19028 ++static int total_alpha = 10;
19029 ++static int gamma = 1;
19030 ++
19031 ++module_param(initial_alpha, int, 0644);
19032 ++MODULE_PARM_DESC(initial_alpha, "initial alpha for all subflows");
19033 ++module_param(total_alpha, int, 0644);
19034 ++MODULE_PARM_DESC(total_alpha, "total alpha for all subflows");
19035 ++module_param(gamma, int, 0644);
19036 ++MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
19037 ++
19038 ++#define MPTCP_WVEGAS_SCALE 16
19039 ++
19040 ++/* wVegas variables */
19041 ++struct wvegas {
19042 ++ u32 beg_snd_nxt; /* right edge during last RTT */
19043 ++	u8	doing_wvegas_now; /* if true, do wvegas for this RTT */
19044 ++
19045 ++ u16 cnt_rtt; /* # of RTTs measured within last RTT */
19046 ++ u32 sampled_rtt; /* cumulative RTTs measured within last RTT (in usec) */
19047 ++ u32 base_rtt; /* the min of all wVegas RTT measurements seen (in usec) */
19048 ++
19049 ++ u64 instant_rate; /* cwnd / srtt_us, unit: pkts/us * 2^16 */
19050 ++ u64 weight; /* the ratio of subflow's rate to the total rate, * 2^16 */
19051 ++ int alpha; /* alpha for each subflows */
19052 ++
19053 ++	u32 queue_delay;	/* queue delay */
19054 ++};
19055 ++
19056 ++
19057 ++static inline u64 mptcp_wvegas_scale(u32 val, int scale)
19058 ++{
19059 ++ return (u64) val << scale;
19060 ++}
19061 ++
19062 ++static void wvegas_enable(const struct sock *sk)
19063 ++{
19064 ++ const struct tcp_sock *tp = tcp_sk(sk);
19065 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19066 ++
19067 ++ wvegas->doing_wvegas_now = 1;
19068 ++
19069 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19070 ++
19071 ++ wvegas->cnt_rtt = 0;
19072 ++ wvegas->sampled_rtt = 0;
19073 ++
19074 ++ wvegas->instant_rate = 0;
19075 ++ wvegas->alpha = initial_alpha;
19076 ++ wvegas->weight = mptcp_wvegas_scale(1, MPTCP_WVEGAS_SCALE);
19077 ++
19078 ++ wvegas->queue_delay = 0;
19079 ++}
19080 ++
19081 ++static inline void wvegas_disable(const struct sock *sk)
19082 ++{
19083 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19084 ++
19085 ++ wvegas->doing_wvegas_now = 0;
19086 ++}
19087 ++
19088 ++static void mptcp_wvegas_init(struct sock *sk)
19089 ++{
19090 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19091 ++
19092 ++ wvegas->base_rtt = 0x7fffffff;
19093 ++ wvegas_enable(sk);
19094 ++}
19095 ++
19096 ++static inline u64 mptcp_wvegas_rate(u32 cwnd, u32 rtt_us)
19097 ++{
19098 ++ return div_u64(mptcp_wvegas_scale(cwnd, MPTCP_WVEGAS_SCALE), rtt_us);
19099 ++}
19100 ++
19101 ++static void mptcp_wvegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
19102 ++{
19103 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19104 ++ u32 vrtt;
19105 ++
19106 ++ if (rtt_us < 0)
19107 ++ return;
19108 ++
19109 ++ vrtt = rtt_us + 1;
19110 ++
19111 ++ if (vrtt < wvegas->base_rtt)
19112 ++ wvegas->base_rtt = vrtt;
19113 ++
19114 ++ wvegas->sampled_rtt += vrtt;
19115 ++ wvegas->cnt_rtt++;
19116 ++}
19117 ++
19118 ++static void mptcp_wvegas_state(struct sock *sk, u8 ca_state)
19119 ++{
19120 ++ if (ca_state == TCP_CA_Open)
19121 ++ wvegas_enable(sk);
19122 ++ else
19123 ++ wvegas_disable(sk);
19124 ++}
19125 ++
19126 ++static void mptcp_wvegas_cwnd_event(struct sock *sk, enum tcp_ca_event event)
19127 ++{
19128 ++ if (event == CA_EVENT_CWND_RESTART) {
19129 ++ mptcp_wvegas_init(sk);
19130 ++ } else if (event == CA_EVENT_LOSS) {
19131 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19132 ++ wvegas->instant_rate = 0;
19133 ++ }
19134 ++}
19135 ++
19136 ++static inline u32 mptcp_wvegas_ssthresh(const struct tcp_sock *tp)
19137 ++{
19138 ++ return min(tp->snd_ssthresh, tp->snd_cwnd - 1);
19139 ++}
19140 ++
19141 ++static u64 mptcp_wvegas_weight(const struct mptcp_cb *mpcb, const struct sock *sk)
19142 ++{
19143 ++ u64 total_rate = 0;
19144 ++ struct sock *sub_sk;
19145 ++ const struct wvegas *wvegas = inet_csk_ca(sk);
19146 ++
19147 ++ if (!mpcb)
19148 ++ return wvegas->weight;
19149 ++
19150 ++
19151 ++ mptcp_for_each_sk(mpcb, sub_sk) {
19152 ++ struct wvegas *sub_wvegas = inet_csk_ca(sub_sk);
19153 ++
19154 ++		/* sampled_rtt is initialized to 0 */
19155 ++ if (mptcp_sk_can_send(sub_sk) && (sub_wvegas->sampled_rtt > 0))
19156 ++ total_rate += sub_wvegas->instant_rate;
19157 ++ }
19158 ++
19159 ++ if (total_rate && wvegas->instant_rate)
19160 ++ return div64_u64(mptcp_wvegas_scale(wvegas->instant_rate, MPTCP_WVEGAS_SCALE), total_rate);
19161 ++ else
19162 ++ return wvegas->weight;
19163 ++}
19164 ++
19165 ++static void mptcp_wvegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
19166 ++{
19167 ++ struct tcp_sock *tp = tcp_sk(sk);
19168 ++ struct wvegas *wvegas = inet_csk_ca(sk);
19169 ++
19170 ++ if (!wvegas->doing_wvegas_now) {
19171 ++ tcp_reno_cong_avoid(sk, ack, acked);
19172 ++ return;
19173 ++ }
19174 ++
19175 ++ if (after(ack, wvegas->beg_snd_nxt)) {
19176 ++ wvegas->beg_snd_nxt = tp->snd_nxt;
19177 ++
19178 ++ if (wvegas->cnt_rtt <= 2) {
19179 ++ tcp_reno_cong_avoid(sk, ack, acked);
19180 ++ } else {
19181 ++ u32 rtt, diff, q_delay;
19182 ++ u64 target_cwnd;
19183 ++
19184 ++ rtt = wvegas->sampled_rtt / wvegas->cnt_rtt;
19185 ++ target_cwnd = div_u64(((u64)tp->snd_cwnd * wvegas->base_rtt), rtt);
19186 ++
19187 ++ diff = div_u64((u64)tp->snd_cwnd * (rtt - wvegas->base_rtt), rtt);
19188 ++
19189 ++ if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) {
19190 ++ tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
19191 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19192 ++
19193 ++ } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
19194 ++ tcp_slow_start(tp, acked);
19195 ++ } else {
19196 ++ if (diff >= wvegas->alpha) {
19197 ++ wvegas->instant_rate = mptcp_wvegas_rate(tp->snd_cwnd, rtt);
19198 ++ wvegas->weight = mptcp_wvegas_weight(tp->mpcb, sk);
19199 ++ wvegas->alpha = max(2U, (u32)((wvegas->weight * total_alpha) >> MPTCP_WVEGAS_SCALE));
19200 ++ }
19201 ++ if (diff > wvegas->alpha) {
19202 ++ tp->snd_cwnd--;
19203 ++ tp->snd_ssthresh = mptcp_wvegas_ssthresh(tp);
19204 ++ } else if (diff < wvegas->alpha) {
19205 ++ tp->snd_cwnd++;
19206 ++ }
19207 ++
19208 ++				/* Try to drain the link queue if needed */
19209 ++ q_delay = rtt - wvegas->base_rtt;
19210 ++ if ((wvegas->queue_delay == 0) || (wvegas->queue_delay > q_delay))
19211 ++ wvegas->queue_delay = q_delay;
19212 ++
19213 ++ if (q_delay >= 2 * wvegas->queue_delay) {
19214 ++ u32 backoff_factor = div_u64(mptcp_wvegas_scale(wvegas->base_rtt, MPTCP_WVEGAS_SCALE), 2 * rtt);
19215 ++ tp->snd_cwnd = ((u64)tp->snd_cwnd * backoff_factor) >> MPTCP_WVEGAS_SCALE;
19216 ++ wvegas->queue_delay = 0;
19217 ++ }
19218 ++ }
19219 ++
19220 ++ if (tp->snd_cwnd < 2)
19221 ++ tp->snd_cwnd = 2;
19222 ++ else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
19223 ++ tp->snd_cwnd = tp->snd_cwnd_clamp;
19224 ++
19225 ++ tp->snd_ssthresh = tcp_current_ssthresh(sk);
19226 ++ }
19227 ++
19228 ++ wvegas->cnt_rtt = 0;
19229 ++ wvegas->sampled_rtt = 0;
19230 ++ }
19231 ++ /* Use normal slow start */
19232 ++ else if (tp->snd_cwnd <= tp->snd_ssthresh)
19233 ++ tcp_slow_start(tp, acked);
19234 ++}
19235 ++
19236 ++
19237 ++static struct tcp_congestion_ops mptcp_wvegas __read_mostly = {
19238 ++ .init = mptcp_wvegas_init,
19239 ++ .ssthresh = tcp_reno_ssthresh,
19240 ++ .cong_avoid = mptcp_wvegas_cong_avoid,
19241 ++ .pkts_acked = mptcp_wvegas_pkts_acked,
19242 ++ .set_state = mptcp_wvegas_state,
19243 ++ .cwnd_event = mptcp_wvegas_cwnd_event,
19244 ++
19245 ++ .owner = THIS_MODULE,
19246 ++ .name = "wvegas",
19247 ++};
19248 ++
19249 ++static int __init mptcp_wvegas_register(void)
19250 ++{
19251 ++ BUILD_BUG_ON(sizeof(struct wvegas) > ICSK_CA_PRIV_SIZE);
19252 ++ tcp_register_congestion_control(&mptcp_wvegas);
19253 ++ return 0;
19254 ++}
19255 ++
19256 ++static void __exit mptcp_wvegas_unregister(void)
19257 ++{
19258 ++ tcp_unregister_congestion_control(&mptcp_wvegas);
19259 ++}
19260 ++
19261 ++module_init(mptcp_wvegas_register);
19262 ++module_exit(mptcp_wvegas_unregister);
19263 ++
19264 ++MODULE_AUTHOR("Yu Cao, Enhuan Dong");
19265 ++MODULE_LICENSE("GPL");
19266 ++MODULE_DESCRIPTION("MPTCP wVegas");
19267 ++MODULE_VERSION("0.1");
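
To see how the adjustment above behaves once a subflow is out of slow start, take hypothetical values: snd_cwnd = 20, base_rtt = 100 ms and a measured rtt = 125 ms give diff = 20 * (125 - 100) / 125 = 4, i.e. about four packets sitting in network queues. Since diff >= the initial alpha of 2, alpha is first re-derived from the subflow's weight; with two equally fast subflows the weight is about one half, so alpha becomes roughly total_alpha / 2 = 5. As diff = 4 < 5, the window grows by one, whereas a diff above the new alpha would shrink it, so each subflow is steered toward keeping about alpha packets queued and the per-subflow alphas sum to roughly total_alpha for the whole connection.
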
19268
19269 diff --git a/4567_distro-Gentoo-Kconfig.patch b/4567_distro-Gentoo-Kconfig.patch
19270 index 71dbf09..652e2a7 100644
19271 --- a/4567_distro-Gentoo-Kconfig.patch
19272 +++ b/4567_distro-Gentoo-Kconfig.patch
19273 @@ -1,15 +1,15 @@
19274 ---- a/Kconfig 2014-04-02 09:45:05.389224541 -0400
19275 -+++ b/Kconfig 2014-04-02 09:45:39.269224273 -0400
19276 +--- a/Kconfig 2014-04-02 09:45:05.389224541 -0400
19277 ++++ b/Kconfig 2014-04-02 09:45:39.269224273 -0400
19278 @@ -8,4 +8,6 @@ config SRCARCH
19279 - string
19280 - option env="SRCARCH"
19281 -
19282 + string
19283 + option env="SRCARCH"
19284 +
19285 +source "distro/Kconfig"
19286 +
19287 source "arch/$SRCARCH/Kconfig"
19288 ---- /dev/null 2014-09-22 14:19:24.316977284 -0400
19289 -+++ distro/Kconfig 2014-09-22 19:30:35.670959281 -0400
19290 -@@ -0,0 +1,109 @@
19291 +--- 1969-12-31 19:00:00.000000000 -0500
19292 ++++ b/distro/Kconfig 2014-04-02 09:57:03.539218861 -0400
19293 +@@ -0,0 +1,108 @@
19294 +menu "Gentoo Linux"
19295 +
19296 +config GENTOO_LINUX
19297 @@ -34,8 +34,6 @@
19298 + select DEVTMPFS
19299 + select TMPFS
19300 +
19301 -+ select FHANDLE
19302 -+
19303 + select MMU
19304 + select SHMEM
19305 +
19306 @@ -91,6 +89,7 @@
19307 + select CGROUPS
19308 + select EPOLL
19309 + select FANOTIFY
19310 ++ select FHANDLE
19311 + select INOTIFY_USER
19312 + select NET
19313 + select NET_NS